From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Dec 2008 20:12:29 +0100
Subject: performance counters: core code

Implement the core kernel bits of Performance Counters subsystem.

The Linux Performance Counter subsystem provides an abstraction of
performance counter hardware capabilities. It provides per task and per
CPU counters, and it provides event capabilities on top of those.

Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.

The special file descriptor is opened via the perf_counter_open()
system call:

 int
 perf_counter_open(u32 hw_event_type,
                   u32 hw_event_period,
                   u32 record_type,
                   pid_t pid,
                   int cpu);

The syscall returns the new fd. The fd can be used via the normal
VFS system calls: read() can be used to read the counter, fcntl()
can be used to set the blocking mode, etc.

Multiple counters can be kept open at a time, and the counters
can be poll()ed.

See more details in Documentation/perf-counters.txt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ce0d9da..52146c2 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/proc_fs.h>
 #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 	struct pt_regs *regs = get_irq_regs();
 	if (regs)
 		show_regs(regs);
+	perf_counter_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
 	.handler	= sysrq_handle_showregs,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 0000000..22c4469
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,171 @@
+/*
+ *  Performance counters:
+ *
+ *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <asm/atomic.h>
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+struct task_struct;
+
+/*
+ * Generalized hardware event types, used by the hw_event_type parameter
+ * of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	PERF_COUNT_CYCLES,
+	PERF_COUNT_INSTRUCTIONS,
+	PERF_COUNT_CACHE_REFERENCES,
+	PERF_COUNT_CACHE_MISSES,
+	PERF_COUNT_BRANCH_INSTRUCTIONS,
+	PERF_COUNT_BRANCH_MISSES,
+	/*
+	 * If this bit is set in the type, then trigger NMI sampling:
+	 */
+	PERF_COUNT_NMI			= (1 << 30),
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_record_type {
+	PERF_RECORD_SIMPLE,
+	PERF_RECORD_IRQ,
+	PERF_RECORD_GROUP,
+};
+
+/**
+ * struct hw_perf_counter - performance counter hardware details
+ */
+struct hw_perf_counter {
+	u64			config;
+	unsigned long		config_base;
+	unsigned long		counter_base;
+	int			nmi;
+	unsigned int		idx;
+	u64			prev_count;
+	s32			next_count;
+	u64			irq_period;
+};
+
+/*
+ * Hardcoded buffer length limit for now, for IRQ-fed events:
+ */
+#define PERF_DATA_BUFLEN	2048
+
+/**
+ * struct perf_data - performance counter IRQ data sampling ...
+ */
+struct perf_data {
+	int			len;
+	int			rd_idx;
+	int			overrun;
+	u8			data[PERF_DATA_BUFLEN];
+};
+
+/**
+ * struct perf_counter - performance counter kernel representation:
+ */
+struct perf_counter {
+	struct list_head		list;
+	int				active;
+#if BITS_PER_LONG == 64
+	atomic64_t			count;
+#else
+	atomic_t			count32[2];
+#endif
+	u64				__irq_period;
+
+	struct hw_perf_counter		hw;
+
+	struct perf_counter_context	*ctx;
+	struct task_struct		*task;
+
+	/*
+	 * Protect attach/detach:
+	 */
+	struct mutex			mutex;
+
+	int				oncpu;
+	int				cpu;
+
+	s32				hw_event_type;
+	enum perf_record_type		record_type;
+
+	/* read() / irq related data */
+	wait_queue_head_t		waitq;
+	/* optional: for NMIs */
+	int				wakeup_pending;
+	struct perf_data		*irqdata;
+	struct perf_data		*usrdata;
+	struct perf_data		data[2];
+};
+
+/**
+ * struct perf_counter_context - counter context structure
+ *
+ * Used as a container for task counters and CPU counters as well:
+ */
+struct perf_counter_context {
+#ifdef CONFIG_PERF_COUNTERS
+	/*
+	 * Protect the list of counters:
+	 */
+	spinlock_t		lock;
+	struct list_head	counters;
+	int			nr_counters;
+	int			nr_active;
+	struct task_struct	*task;
+#endif
+};
+
+/**
+ * struct perf_counter_cpu_context - per cpu counter context structure
+ */
+struct perf_cpu_context {
+	struct perf_counter_context	ctx;
+	struct perf_counter_context	*task_ctx;
+	int				active_oncpu;
+	int				max_pertask;
+};
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_counters;
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_tick(struct task_struct *task, int cpu);
+extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_print_debug(void);
+#else
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
+static inline void perf_counter_init_task(struct task_struct *task)	{ }
+static inline void perf_counter_notify(struct pt_regs *regs)		{ }
+static inline void perf_counter_print_debug(void)			{ }
+#endif
+
+#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d1..4c53027 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -1326,6 +1327,7 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
+	struct perf_counter_context perf_counter_ctx;
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;
@@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+				     void (*func) (void *info), void *info);
+
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 04fb47b..6cce728 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+		      u32 hw_event_period,
+		      u32 record_type,
+		      pid_t pid,
+		      int cpu);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index f763762..78bede2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -732,6 +732,35 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.
 
+config HAVE_PERF_COUNTERS
+	bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+	bool "Kernel Performance Counters"
+	depends on HAVE_PERF_COUNTERS
+	default y
+	help
+	  Enable kernel support for performance counter hardware.
+
+	  Performance counters are special hardware registers available
+	  on most modern CPUs. These registers count the number of certain
+	  types of hw events: such as instructions executed, cachemisses
+	  suffered, or branches mis-predicted - without slowing down the
+	  kernel or applications. These registers can also trigger interrupts
+	  when a threshold number of events have passed - and can thus be
+	  used to profile the code that runs on that CPU.
+
+	  The Linux Performance Counter subsystem provides an abstraction of
+	  these hardware capabilities, available via a system call. It
+	  provides per task and per CPU counters, and it provides event
+	  capabilities on top of those.
+
+	  Say Y if unsure.
+
+endmenu
+
 config VM_EVENT_COUNTERS
 	default y
 	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad00..1f184a1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a372a0..441fadf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	rt_mutex_init_task(p);
+	perf_counter_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 0000000..20508f0
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,943 @@
+/*
+ * Performance counter core code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/perf_counter.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+/*
+ * Mutex for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_MUTEX(perf_resource_mutex);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+
+int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
+{
+	return -EINVAL;
+}
+
+void __weak hw_perf_counter_enable(struct perf_counter *counter)	 { }
+void __weak hw_perf_counter_disable(struct perf_counter *counter)	 { }
+void __weak hw_perf_counter_read(struct perf_counter *counter)		 { }
+void __weak hw_perf_disable_all(void) { }
+void __weak hw_perf_enable_all(void) { }
+void __weak hw_perf_counter_setup(void) { }
+
+#if BITS_PER_LONG == 64
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 64 bit version - no complications.
+ */
+static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+	return (u64) atomic64_read(&counter->count);
+}
+
+#else
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 32 bit version.
+ */
+static u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+	u32 cntl, cnth;
+
+	local_irq_disable();
+	do {
+		cnth = atomic_read(&counter->count32[1]);
+		cntl = atomic_read(&counter->count32[0]);
+	} while (cnth != atomic_read(&counter->count32[1]));
+
+	local_irq_enable();
+
+	return cntl | ((u64) cnth) << 32;
+}
+
+#endif
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_remove_from_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	if (counter->active) {
+		hw_perf_counter_disable(counter);
+		counter->active = 0;
+		ctx->nr_active--;
+		cpuctx->active_oncpu--;
+		counter->task = NULL;
+	}
+	ctx->nr_counters--;
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level. NOP for non NMI based counters.
+	 */
+	hw_perf_disable_all();
+	list_del_init(&counter->list);
+	hw_perf_enable_all();
+
+	if (!ctx->task) {
+		/*
+		 * Allow more per task counters with respect to the
+		 * reservation:
+		 */
+		cpuctx->max_pertask =
+			min(perf_max_counters - ctx->nr_counters,
+			    perf_max_counters - perf_reserved_percpu);
+	}
+
+	spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with counter->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ */
+static void perf_remove_from_context(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu counters are removed via an smp call and
+		 * the removal is always sucessful.
+		 */
+		smp_call_function_single(counter->cpu,
+					 __perf_remove_from_context,
+					 counter, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_remove_from_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active we need to retry the smp call.
+	 */
+	if (ctx->nr_active && !list_empty(&counter->list)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can remove the counter safely, if it the call above did not
+	 * succeed.
+	 */
+	if (!list_empty(&counter->list)) {
+		ctx->nr_counters--;
+		list_del_init(&counter->list);
+		counter->task = NULL;
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to install and enable a preformance counter
+ */
+static void __perf_install_in_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	int cpu = smp_processor_id();
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level. NOP for non NMI based counters.
+	 */
+	hw_perf_disable_all();
+	list_add_tail(&counter->list, &ctx->counters);
+	hw_perf_enable_all();
+
+	ctx->nr_counters++;
+
+	if (cpuctx->active_oncpu < perf_max_counters) {
+		hw_perf_counter_enable(counter);
+		counter->active = 1;
+		counter->oncpu = cpu;
+		ctx->nr_active++;
+		cpuctx->active_oncpu++;
+	}
+
+	if (!ctx->task && cpuctx->max_pertask)
+		cpuctx->max_pertask--;
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+			struct perf_counter *counter,
+			int cpu)
+{
+	struct task_struct *task = ctx->task;
+
+	counter->ctx = ctx;
+	if (!task) {
+		/*
+		 * Per cpu counters are installed via an smp call and
+		 * the install is always sucessful.
+		 */
+		smp_call_function_single(cpu, __perf_install_in_context,
+					 counter, 1);
+		return;
+	}
+
+	counter->task = task;
+retry:
+	task_oncpu_function_call(task, __perf_install_in_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active and the counter has not been added
+	 * we need to retry the smp call.
+	 */
+	if (ctx->nr_active && list_empty(&counter->list)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can add the counter safely, if it the call above did not
+	 * succeed.
+	 */
+	if (list_empty(&counter->list)) {
+		list_add_tail(&counter->list, &ctx->counters);
+		ctx->nr_counters++;
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!cpuctx->task_ctx))
+		return;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (!ctx->nr_active)
+			break;
+		if (counter->active) {
+			hw_perf_counter_disable(counter);
+			counter->active = 0;
+			counter->oncpu = -1;
+			ctx->nr_active--;
+			cpuctx->active_oncpu--;
+		}
+	}
+	spin_unlock(&ctx->lock);
+	cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (ctx->nr_active == cpuctx->max_pertask)
+			break;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		hw_perf_counter_enable(counter);
+		counter->active = 1;
+		counter->oncpu = cpu;
+		ctx->nr_active++;
+		cpuctx->active_oncpu++;
+	}
+	spin_unlock(&ctx->lock);
+	cpuctx->task_ctx = ctx;
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	perf_counter_task_sched_out(curr, cpu);
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Rotate the first entry last:
+	 */
+	hw_perf_disable_all();
+	list_for_each_entry(counter, &ctx->counters, list) {
+		list_del(&counter->list);
+		list_add_tail(&counter->list, &ctx->counters);
+		break;
+	}
+	hw_perf_enable_all();
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *task)
+{
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counters);
+	ctx->nr_counters = 0;
+	ctx->task = task;
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __hw_perf_counter_read(void *info)
+{
+	hw_perf_counter_read(info);
+}
+
+static u64 perf_read_counter(struct perf_counter *counter)
+{
+	/*
+	 * If counter is enabled and currently active on a CPU, update the
+	 * value in the counter structure:
+	 */
+	if (counter->active) {
+		smp_call_function_single(counter->oncpu,
+					 __hw_perf_counter_read, counter, 1);
+	}
+
+	return perf_read_counter_safe(counter);
+}
+
+/*
+ * Cross CPU call to switch performance data pointers
+ */
+static void __perf_switch_irq_data(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_data *oldirqdata = counter->irqdata;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task) {
+		if (cpuctx->task_ctx != ctx)
+			return;
+		spin_lock(&ctx->lock);
+	}
+
+	/* Change the pointer NMI safe */
+	atomic_long_set((atomic_long_t *)&counter->irqdata,
+			(unsigned long) counter->usrdata);
+	counter->usrdata = oldirqdata;
+
+	if (ctx->task)
+		spin_unlock(&ctx->lock);
+}
+
+static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_data *oldirqdata = counter->irqdata;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		smp_call_function_single(counter->cpu,
+					 __perf_switch_irq_data,
+					 counter, 1);
+		return counter->usrdata;
+	}
+
+retry:
+	spin_lock_irq(&ctx->lock);
+	if (!counter->active) {
+		counter->irqdata = counter->usrdata;
+		counter->usrdata = oldirqdata;
+		spin_unlock_irq(&ctx->lock);
+		return oldirqdata;
+	}
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
+	/* Might have failed, because task was scheduled out */
+	if (counter->irqdata == oldirqdata)
+		goto retry;
+
+	return counter->usrdata;
+}
+
+static void put_context(struct perf_counter_context *ctx)
+{
+	if (ctx->task)
+		put_task_struct(ctx->task);
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+	struct task_struct *task;
+
+	/*
+	 * If cpu is not a wildcard then this is a percpu counter:
+	 */
+	if (cpu != -1) {
+		/* Must be root to operate on a CPU counter: */
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		if (cpu < 0 || cpu > num_possible_cpus())
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * We could be clever and allow to attach a counter to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_isset(cpu, cpu_online_map))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+
+		WARN_ON_ONCE(ctx->task);
+		return ctx;
+	}
+
+	rcu_read_lock();
+	if (!pid)
+		task = current;
+	else
+		task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	ctx = &task->perf_counter_ctx;
+	ctx->task = task;
+
+	/* Reuse ptrace permission checks for now. */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		put_context(ctx);
+		return ERR_PTR(-EACCES);
+	}
+
+	return ctx;
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_counter *counter = file->private_data;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	file->private_data = NULL;
+
+	mutex_lock(&counter->mutex);
+
+	perf_remove_from_context(counter);
+	put_context(ctx);
+
+	mutex_unlock(&counter->mutex);
+
+	kfree(counter);
+
+	return 0;
+}
+
+/*
+ * Read the performance counter - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+	u64 cntval;
+
+	if (count != sizeof(cntval))
+		return -EINVAL;
+
+	mutex_lock(&counter->mutex);
+	cntval = perf_read_counter(counter);
+	mutex_unlock(&counter->mutex);
+
+	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+}
+
+static ssize_t
+perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
+{
+	if (!usrdata->len)
+		return 0;
+
+	count = min(count, (size_t)usrdata->len);
+	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
+		return -EFAULT;
+
+	/* Adjust the counters */
+	usrdata->len -= count;
+	if (!usrdata->len)
+		usrdata->rd_idx = 0;
+	else
+		usrdata->rd_idx += count;
+
+	return count;
+}
+
+static ssize_t
+perf_read_irq_data(struct perf_counter	*counter,
+		   char __user		*buf,
+		   size_t		count,
+		   int			nonblocking)
+{
+	struct perf_data *irqdata, *usrdata;
+	DECLARE_WAITQUEUE(wait, current);
+	ssize_t res;
+
+	irqdata = counter->irqdata;
+	usrdata = counter->usrdata;
+
+	if (usrdata->len + irqdata->len >= count)
+		goto read_pending;
+
+	if (nonblocking)
+		return -EAGAIN;
+
+	spin_lock_irq(&counter->waitq.lock);
+	__add_wait_queue(&counter->waitq, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (usrdata->len + irqdata->len >= count)
+			break;
+
+		if (signal_pending(current))
+			break;
+
+		spin_unlock_irq(&counter->waitq.lock);
+		schedule();
+		spin_lock_irq(&counter->waitq.lock);
+	}
+	__remove_wait_queue(&counter->waitq, &wait);
+	__set_current_state(TASK_RUNNING);
+	spin_unlock_irq(&counter->waitq.lock);
+
+	if (usrdata->len + irqdata->len < count)
+		return -ERESTARTSYS;
+read_pending:
+	mutex_lock(&counter->mutex);
+
+	/* Drain pending data first: */
+	res = perf_copy_usrdata(usrdata, buf, count);
+	if (res < 0 || res == count)
+		goto out;
+
+	/* Switch irq buffer: */
+	usrdata = perf_switch_irq_data(counter);
+	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+		if (!res)
+			res = -EFAULT;
+	} else {
+		res = count;
+	}
+out:
+	mutex_unlock(&counter->mutex);
+
+	return res;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct perf_counter *counter = file->private_data;
+
+	switch (counter->record_type) {
+	case PERF_RECORD_SIMPLE:
+		return perf_read_hw(counter, buf, count);
+
+	case PERF_RECORD_IRQ:
+	case PERF_RECORD_GROUP:
+		return perf_read_irq_data(counter, buf, count,
+					  file->f_flags & O_NONBLOCK);
+	}
+	return -EINVAL;
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+	struct perf_counter *counter = file->private_data;
+	unsigned int events = 0;
+	unsigned long flags;
+
+	poll_wait(file, &counter->waitq, wait);
+
+	spin_lock_irqsave(&counter->waitq.lock, flags);
+	if (counter->usrdata->len || counter->irqdata->len)
+		events |= POLLIN;
+	spin_unlock_irqrestore(&counter->waitq.lock, flags);
+
+	return events;
+}
+
+static const struct file_operations perf_fops = {
+	.release		= perf_release,
+	.read			= perf_read,
+	.poll			= perf_poll,
+};
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
+{
+	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+
+	if (!counter)
+		return NULL;
+
+	mutex_init(&counter->mutex);
+	INIT_LIST_HEAD(&counter->list);
+	init_waitqueue_head(&counter->waitq);
+
+	counter->irqdata	= &counter->data[0];
+	counter->usrdata	= &counter->data[1];
+	counter->cpu		= cpu;
+	counter->record_type	= record_type;
+	counter->__irq_period	= hw_event_period;
+	counter->wakeup_pending = 0;
+
+	return counter;
+}
+
+/**
+ * sys_perf_task_open - open a performance counter associate it to a task
+ * @hw_event_type:	event type for monitoring/sampling...
+ * @pid:		target pid
+ */
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+		      u32 hw_event_period,
+		      u32 record_type,
+		      pid_t pid,
+		      int cpu)
+{
+	struct perf_counter_context *ctx;
+	struct perf_counter *counter;
+	int ret;
+
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = -ENOMEM;
+	counter = perf_counter_alloc(hw_event_period, cpu, record_type);
+	if (!counter)
+		goto err_put_context;
+
+	ret = hw_perf_counter_init(counter, hw_event_type);
+	if (ret)
+		goto err_free_put_context;
+
+	perf_install_in_context(ctx, counter, cpu);
+
+	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (ret < 0)
+		goto err_remove_free_put_context;
+
+	return ret;
+
+err_remove_free_put_context:
+	mutex_lock(&counter->mutex);
+	perf_remove_from_context(counter);
+	mutex_unlock(&counter->mutex);
+
+err_free_put_context:
+	kfree(counter);
+
+err_put_context:
+	put_context(ctx);
+
+	return ret;
+}
+
+static void __cpuinit perf_init_cpu(int cpu)
+{
+	struct perf_cpu_context *ctx;
+
+	ctx = &per_cpu(perf_cpu_context, cpu);
+	spin_lock_init(&ctx->ctx.lock);
+	INIT_LIST_HEAD(&ctx->ctx.counters);
+
+	mutex_lock(&perf_resource_mutex);
+	ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+	mutex_unlock(&perf_resource_mutex);
+	hw_perf_counter_setup();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_exit_cpu(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+	struct perf_counter *counter, *tmp;
+
+	list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
+		__perf_remove_from_context(counter);
+
+}
+static void perf_exit_cpu(int cpu)
+{
+	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+}
+#else
+static inline void perf_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action) {
+
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		perf_init_cpu(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		perf_exit_cpu(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+	.notifier_call		= perf_cpu_notify,
+};
+
+static int __init perf_counter_init(void)
+{
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	register_cpu_notifier(&perf_cpu_nb);
+
+	return 0;
+}
+early_initcall(perf_counter_init);
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+			const char *buf,
+			size_t count)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned long val;
+	int err, cpu, mpt;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > perf_max_counters)
+		return -EINVAL;
+
+	mutex_lock(&perf_resource_mutex);
+	perf_reserved_percpu = val;
+	for_each_online_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		spin_lock_irq(&cpuctx->ctx.lock);
+		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+			  perf_max_counters - perf_reserved_percpu);
+		cpuctx->max_pertask = mpt;
+		spin_unlock_irq(&cpuctx->ctx.lock);
+	}
+	mutex_unlock(&perf_resource_mutex);
+
+	return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+
+	mutex_lock(&perf_resource_mutex);
+	perf_overcommit = val;
+	mutex_unlock(&perf_resource_mutex);
+
+	return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+				reserve_percpu,
+				0644,
+				perf_show_reserve_percpu,
+				perf_set_reserve_percpu
+			);
+
+static SYSDEV_CLASS_ATTR(
+				overcommit,
+				0644,
+				perf_show_overcommit,
+				perf_set_overcommit
+			);
+
+static struct attribute *perfclass_attrs[] = {
+	&attr_reserve_percpu.attr,
+	&attr_overcommit.attr,
+	NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+	.attrs			= perfclass_attrs,
+	.name			= "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+				  &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
+
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb..254d56d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	fire_sched_out_preempt_notifiers(prev, next);
+	perf_counter_task_sched_out(prev, cpu_of(rq));
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -4296,6 +4319,7 @@ void scheduler_tick(void)
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
+	perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..4be8bbc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
-- 
cgit v0.10.2


From e7bc62b6b3aeaa8849f8383e0cfb7ca6c003adc6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Dec 2008 20:13:45 +0100
Subject: performance counters: documentation

Add more documentation about performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 0000000..19033a0
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,104 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events: such
+as instructions executed, cachemisses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events have passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, and
+it provides event capabilities on top of those.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+ int
+ perf_counter_open(u32 hw_event_type,
+                   u32 hw_event_period,
+                   u32 record_type,
+                   pid_t pid,
+                   int cpu);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'hw_event_type' is one of:
+
+ enum hw_event_types {
+	PERF_COUNT_CYCLES,
+	PERF_COUNT_INSTRUCTIONS,
+	PERF_COUNT_CACHE_REFERENCES,
+	PERF_COUNT_CACHE_MISSES,
+	PERF_COUNT_BRANCH_INSTRUCTIONS,
+	PERF_COUNT_BRANCH_MISSES,
+ };
+
+These are standardized types of events that work uniformly on all CPUs
+that implements Performance Counters support under Linux. If a CPU is
+not able to count branch-misses, then the system call will return
+-EINVAL.
+
+[ Note: more hw_event_types are supported as well, but they are CPU
+  specific and are enumerated via /sys on a per CPU basis. Raw hw event
+  types can be passed in as negative numbers. For example, to count
+  "External bus cycles while bus lock signal asserted" events on Intel
+  Core CPUs, pass in a -0x4064 event type value. ]
+
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
+'record_type' is the type of data that a read() will provide for the
+counter, and it can be one of:
+
+  enum perf_record_type {
+	PERF_RECORD_SIMPLE,
+	PERF_RECORD_IRQ,
+  };
+
+a "simple" counter is one that counts hardware events and allows
+them to be read out into a u64 count value. (read() returns 8 on
+a successful read of a simple counter.)
+
+An "irq" counter is one that will also provide an IRQ context information:
+the IP of the interrupted context. In this case read() will return
+the 8-byte counter value, plus the Instruction Pointer address of the
+interrupted context.
+
+The 'pid' parameter allows the counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a full
+CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.
+
+A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets schedule to. Per task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
+all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+
-- 
cgit v0.10.2


From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Dec 2008 10:39:53 +0100
Subject: performance counters: x86 support

Implement performance counters for x86 Intel CPUs.

It's simplified right now: the PERFMON CPU feature is assumed,
which is available in Core2 and later Intel CPUs.

The design is flexible to be extended to more CPU types as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4d4cb7..f2fdc18 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,6 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+	select HAVE_PERF_COUNTERS
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b..3c14ed0 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd4
 	.quad sys_eventfd2
 	.quad sys_epoll_create1
-	.quad sys_dup3			/* 330 */
+	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
+	.quad sys_perf_counter_open
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e..b3e475d 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
 	unsigned long idle_timestamp;
 	unsigned int __nmi_count;	/* arch dependent */
 	unsigned int apic_timer_irqs;	/* arch dependent */
+	unsigned int apic_perf_irqs;	/* arch dependent */
 	unsigned int irq0_irqs;
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6..aa93e53 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void error_interrupt(void);
+extern void perf_counter_interrupt(void);
+
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
index fa0fd06..71598a9 100644
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -1,22 +1,24 @@
 #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
 #define _ASM_X86_INTEL_ARCH_PERFMON_H
 
-#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
-#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL0_ENABLE	(1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16)
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
 union cpuid10_eax {
 	struct {
@@ -28,4 +30,12 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(int nmi)	{ }
+#endif
+
 #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb..b8d277f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
 #define LOCAL_TIMER_VECTOR	0xef
 
 /*
+ * Performance monitoring interrupt vector:
+ */
+#define LOCAL_PERF_VECTOR	0xee
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8..ad31e5d 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
  * a much simpler SMP time architecture:
  */
 #ifdef CONFIG_X86_LOCAL_APIC
+
 BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
+#ifdef CONFIG_PERF_COUNTERS
+BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+#endif
+
 #ifdef CONFIG_X86_MCE_P4THERMAL
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..90a8d9d 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
 	short isidle;
 	struct mm_struct *active_mm;
 	unsigned apic_timer_irqs;
+	unsigned apic_perf_irqs;
 	unsigned irq0_irqs;
 	unsigned irq_resched_count;
 	unsigned irq_call_count;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379..810bf26 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,6 +80,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -103,6 +104,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78..7e47658 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
 #define __NR_dup3		330
 #define __NR_pipe2		331
 #define __NR_inotify_init1	332
+#define __NR_perf_counter_open	333
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e..53025fe 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1			294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_perf_counter_open		295
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f9487..8ab8c18 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,6 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
+#include <asm/intel_arch_perfmon.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
@@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
+	perf_counters_lapic_init(0);
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec607..89e5336 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
@@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_X86_MCE)	+= mcheck/
-obj-$(CONFIG_MTRR)	+= mtrr/
-obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
+obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
 
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE)			+= mcheck/
+obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_CPU_FREQ)			+= cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0..4461011 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
@@ -750,6 +751,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
+	init_hw_perf_counters();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 0000000..82440cb
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,571 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+
+#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+
+static bool perf_counters_initialized __read_mostly;
+
+/*
+ * Number of (generic) HW counters:
+ */
+static int nr_hw_counters __read_mostly;
+static u32 perf_counter_mask __read_mostly;
+
+/* No support for fixed function counters yet */
+
+#define MAX_HW_COUNTERS		8
+
+struct cpu_hw_counters {
+	struct perf_counter	*counters[MAX_HW_COUNTERS];
+	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	int			enable_all;
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+const int intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
+  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+
+/*
+ * Setup the hardware configuration for a given hw_event_type
+ */
+int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (unlikely(!perf_counters_initialized))
+		return -EINVAL;
+
+	/*
+	 * Count user events, and generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * If privileged enough, count OS events too, and allow
+	 * NMI events as well:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN)) {
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+		if (hw_event_type & PERF_COUNT_NMI)
+			hwc->nmi = 1;
+	}
+
+	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+
+	hwc->irq_period = counter->__irq_period;
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	if (!hwc->irq_period)
+		hwc->irq_period = 0x7FFFFFFF;
+
+	hwc->next_count = -((s32) hwc->irq_period);
+
+	/*
+	 * Negative event types mean raw encoded event+umask values:
+	 */
+	if (hw_event_type < 0) {
+		counter->hw_event_type = -hw_event_type;
+		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	} else {
+		hw_event_type &= ~PERF_COUNT_NMI;
+		if (hw_event_type >= max_intel_perfmon_events)
+			return -EINVAL;
+		/*
+		 * The generic map:
+		 */
+		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+	}
+	hwc->config |= counter->hw_event_type;
+	counter->wakeup_pending = 0;
+
+	return 0;
+}
+
+static void __hw_perf_enable_all(void)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+}
+
+void hw_perf_enable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 1;
+	__hw_perf_enable_all();
+}
+
+void hw_perf_disable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 0;
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+}
+
+static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+
+	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+void hw_perf_counter_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	/* Try to get the previous counter again */
+	if (test_and_set_bit(idx, cpuc->used)) {
+		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		set_bit(idx, cpuc->used);
+		hwc->idx = idx;
+	}
+
+	perf_counters_lapic_init(hwc->nmi);
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	cpuc->counters[idx] = counter;
+	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	__hw_perf_counter_enable(hwc, idx);
+}
+
+#ifdef CONFIG_X86_64
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+	atomic64_set(&counter->count, val);
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic64_read(&counter->count);
+}
+#else
+/*
+ * Todo: add proper atomic64_t support to 32-bit x86:
+ */
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+	u32 *val32 = (void *)&val64;
+
+	atomic_set(counter->count32 + 0, *(val32 + 0));
+	atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic_read(counter->count32 + 0) |
+		(u64) atomic_read(counter->count32 + 1) << 32;
+}
+#endif
+
+static void __hw_perf_save_counter(struct perf_counter *counter,
+				   struct hw_perf_counter *hwc, int idx)
+{
+	s64 raw = -1;
+	s64 delta;
+	int err;
+
+	/*
+	 * Get the raw hw counter value:
+	 */
+	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
+	WARN_ON_ONCE(err);
+
+	/*
+	 * Rebase it to zero (it started counting at -irq_period),
+	 * to see the delta since ->prev_count:
+	 */
+	delta = (s64)hwc->irq_period + (s64)(s32)raw;
+
+	atomic64_counter_set(counter, hwc->prev_count + delta);
+
+	/*
+	 * Adjust the ->prev_count offset - if we went beyond
+	 * irq_period of units, then we got an IRQ and the counter
+	 * was set back to -irq_period:
+	 */
+	while (delta >= (s64)hwc->irq_period) {
+		hwc->prev_count += hwc->irq_period;
+		delta -= (s64)hwc->irq_period;
+	}
+
+	/*
+	 * Calculate the next raw counter value we'll write into
+	 * the counter at the next sched-in time:
+	 */
+	delta -= (s64)hwc->irq_period;
+
+	hwc->next_count = (s32)delta;
+}
+
+void perf_counter_print_debug(void)
+{
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	int cpu, err, idx;
+
+	local_irq_disable();
+
+	cpu = smp_processor_id();
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
+	WARN_ON_ONCE(err);
+
+	printk(KERN_INFO "\n");
+	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
+	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+
+	for (idx = 0; idx < nr_hw_counters; idx++) {
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+		WARN_ON_ONCE(err);
+
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
+		WARN_ON_ONCE(err);
+
+		next_count = per_cpu(prev_next_count[idx], cpu);
+
+		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+			cpu, idx, pmc_ctrl);
+		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
+			cpu, idx, next_count);
+	}
+	local_irq_enable();
+}
+
+void hw_perf_counter_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned int idx = hwc->idx;
+
+	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+
+	clear_bit(idx, cpuc->used);
+	cpuc->counters[idx] = NULL;
+	__hw_perf_save_counter(counter, hwc, idx);
+}
+
+void hw_perf_counter_read(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned long addr = hwc->counter_base + hwc->idx;
+	s64 offs, val = -1LL;
+	s32 val32;
+	int err;
+
+	/* Careful: NMI might modify the counter offset */
+	do {
+		offs = hwc->prev_count;
+		err = rdmsrl_safe(addr, &val);
+		WARN_ON_ONCE(err);
+	} while (offs != hwc->prev_count);
+
+	val32 = (s32) val;
+	val =  (s64)hwc->irq_period + (s64)val32;
+	atomic64_counter_set(counter, hwc->prev_count + val);
+}
+
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_save_and_restart(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+		__hw_perf_save_counter(counter, hwc, idx);
+		__hw_perf_counter_enable(hwc, idx);
+	}
+}
+
+static void
+perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+{
+	struct perf_counter_context *ctx = leader->ctx;
+	struct perf_counter *counter;
+	int bit;
+
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		    counter == leader)
+			continue;
+
+		if (counter->active) {
+			/*
+			 * When counter was not in the overflow mask, we have to
+			 * read it from hardware. We read it as well, when it
+			 * has not been read yet and clear the bit in the
+			 * status mask.
+			 */
+			bit = counter->hw.idx;
+			if (!test_bit(bit, (unsigned long *) overflown) ||
+			    test_bit(bit, (unsigned long *) status)) {
+				clear_bit(bit, (unsigned long *) status);
+				perf_save_and_restart(counter);
+			}
+		}
+		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, atomic64_counter_read(counter));
+	}
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+{
+	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status) {
+		ack_APIC_irq();
+		return;
+	}
+
+	/* Disable counters globally */
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	ack_APIC_irq();
+
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+again:
+	ack = status;
+	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!counter)
+			continue;
+
+		perf_save_and_restart(counter);
+
+		switch (counter->record_type) {
+		case PERF_RECORD_SIMPLE:
+			continue;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			break;
+		case PERF_RECORD_GROUP:
+			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    atomic64_counter_read(counter));
+			perf_handle_group(counter, &status, &ack);
+			break;
+		}
+		/*
+		 * From NMI context we cannot call into the scheduler to
+		 * do a task wakeup - but we mark these counters as
+		 * wakeup_pending and initate a wakeup callback:
+		 */
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+		} else {
+			wake_up(&counter->waitq);
+		}
+	}
+
+	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (status)
+		goto again;
+
+	/*
+	 * Do not reenable when global enable is off:
+	 */
+	if (cpuc->enable_all)
+		__hw_perf_enable_all();
+}
+
+void smp_perf_counter_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+#ifdef CONFIG_X86_64
+	add_pda(apic_perf_irqs, 1);
+#else
+	per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
+#endif
+	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	__smp_perf_counter_interrupt(regs, 0);
+
+	irq_exit();
+}
+
+/*
+ * This handler is triggered by NMI contexts:
+ */
+void perf_counter_notify(struct pt_regs *regs)
+{
+	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
+	int bit, cpu;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	for_each_bit(bit, cpuc->used, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		if (!counter)
+			continue;
+
+		if (counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+void __cpuinit perf_counters_lapic_init(int nmi)
+{
+	u32 apic_val;
+
+	if (!perf_counters_initialized)
+		return;
+	/*
+	 * Enable the performance counter vector in the APIC LVT:
+	 */
+	apic_val = apic_read(APIC_LVTERR);
+
+	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
+	if (nmi)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+	else
+		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	apic_write(APIC_LVTERR, apic_val);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (likely(cmd != DIE_NMI_IPI))
+		return NOTIFY_DONE;
+
+	regs = args->regs;
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	__smp_perf_counter_interrupt(regs, 1);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler
+};
+
+void __init init_hw_perf_counters(void)
+{
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired Event or not.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return;
+
+	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
+
+	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	nr_hw_counters = eax.split.num_counters;
+	if (nr_hw_counters > MAX_HW_COUNTERS) {
+		nr_hw_counters = MAX_HW_COUNTERS;
+		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+			nr_hw_counters, MAX_HW_COUNTERS);
+	}
+	perf_counter_mask = (1 << nr_hw_counters) - 1;
+	perf_max_counters = nr_hw_counters;
+
+	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+
+	perf_counters_lapic_init(0);
+	register_die_notifier(&perf_counter_nmi_notifier);
+
+	perf_counters_initialized = true;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3194636..fc013cf 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PERF_VECTOR \
+	perf_counter_interrupt smp_perf_counter_interrupt
+#endif
+
 /*
  * Exception entry points.
  */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc5..d92bc71 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
+	seq_printf(p, "CNT: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+	seq_printf(p, "  Performance counter interrupts\n");
 #endif
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
+	sum += irq_stats(cpu)->apic_perf_irqs;
 #endif
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63..6a33b5e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -160,6 +160,9 @@ void __init native_init_IRQ(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+# ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+# endif
 #endif
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3c..91d785c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -138,6 +138,11 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	/* Performance monitoring interrupt: */
+#ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+#endif
 }
 
 void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b1cc6da..dee553c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-
+#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
+	if (thread_info_flags & _TIF_PERF_COUNTERS) {
+		clear_thread_flag(TIF_PERF_COUNTERS);
+		perf_counter_notify(regs);
+	}
+
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395f..496726d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
 	.long sys_dup3			/* 330 */
 	.long sys_pipe2
 	.long sys_inotify_init1
+	.long sys_perf_counter_open
-- 
cgit v0.10.2


From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 14:20:16 +0100
Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters
 disabled

Impact: make perfcounter NMI and IRQ sequence more robust

Make __smp_perf_counter_interrupt() a bit more conservative: first disable
all counters, then read out the status. Most invocations are because there
are real events, so there's no performance impact.

Code flow gets a bit simpler as well this way.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82440cb..615e953 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc;
 	u64 ack, status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	if (!status) {
-		ack_APIC_irq();
-		return;
-	}
-
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status)
+		goto out;
+
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
@@ -440,7 +438,7 @@ again:
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (status)
 		goto again;
-
+out:
 	/*
 	 * Do not reenable when global enable is off:
 	 */
-- 
cgit v0.10.2


From 4c59e4676dc95f6f58a2cff5390b2699fa5b5549 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 19:38:33 +0100
Subject: perfcounters: select ANON_INODES

The perfcounters subsystem depends on CONFIG_ANON_INODES facilities,
so make sure it's selected.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/init/Kconfig b/init/Kconfig
index 78bede2..7d147a3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -741,6 +741,7 @@ config PERF_COUNTERS
 	bool "Kernel Performance Counters"
 	depends on HAVE_PERF_COUNTERS
 	default y
+	select ANON_INODES
 	help
 	  Enable kernel support for performance counter hardware.
 
-- 
cgit v0.10.2


From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 11:40:46 +0100
Subject: perfcounters, x86: simplify disable/enable of counters

Impact: fix spurious missed counter wakeups

In the case of NMI events, close a race window that can occur if an NMI
hits counter code that temporarily disables+enables a counter, and the NMI
leaks into the disabled section.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 615e953..7d528ff 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -136,14 +136,25 @@ void hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
+static inline void
+__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+{
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
 static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 {
 	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
 
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	wrmsr(hwc->config_base + idx,
+	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 void hw_perf_counter_enable(struct perf_counter *counter)
@@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	__hw_perf_counter_set_period(hwc, idx);
 	__hw_perf_counter_enable(hwc, idx);
 }
 
@@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 	}
 }
 
+/*
+ * NMI-safe enable method:
+ */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	u64 pmc_ctrl;
+	int err;
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+	WARN_ON_ONCE(err);
 
-	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-		__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_counter_set_period(hwc, idx);
+
+	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
 		__hw_perf_counter_enable(hwc, idx);
-	}
 }
 
 static void
-- 
cgit v0.10.2


From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:18:18 +0100
Subject: perfcounters, x86: clean up debug code

Impact: cleanup

Get rid of unused debug code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d528ff..919ec46 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 {
 	s64 raw = -1;
 	s64 delta;
-	int err;
 
 	/*
 	 * Get the raw hw counter value:
 	 */
-	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
-	WARN_ON_ONCE(err);
+	rdmsrl(hwc->counter_base + idx, raw);
 
 	/*
 	 * Rebase it to zero (it started counting at -irq_period),
@@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
-	int cpu, err, idx;
+	int cpu, idx;
+
+	if (!nr_hw_counters)
+		return;
 
 	local_irq_disable();
 
 	cpu = smp_processor_id();
 
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
@@ -273,11 +269,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
 	for (idx = 0; idx < nr_hw_counters; idx++) {
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-		WARN_ON_ONCE(err);
-
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
-		WARN_ON_ONCE(err);
+		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
+		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
 		next_count = per_cpu(prev_next_count[idx], cpu);
 
@@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter)
 	unsigned long addr = hwc->counter_base + hwc->idx;
 	s64 offs, val = -1LL;
 	s32 val32;
-	int err;
 
 	/* Careful: NMI might modify the counter offset */
 	do {
 		offs = hwc->prev_count;
-		err = rdmsrl_safe(addr, &val);
-		WARN_ON_ONCE(err);
+		rdmsrl(addr, val);
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
@@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
 	u64 pmc_ctrl;
-	int err;
 
-	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	__hw_perf_save_counter(counter, hwc, idx);
 	__hw_perf_counter_set_period(hwc, idx);
-- 
cgit v0.10.2


From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:23:59 +0100
Subject: perfcounters: consolidate global-disable codepaths

Impact: cleanup

Simplify global disable handling.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 919ec46..6a93d1f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[MAX_HW_COUNTERS];
 	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
-	int			enable_all;
 };
 
 /*
@@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	return 0;
 }
 
-static void __hw_perf_enable_all(void)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
-}
-
 void hw_perf_enable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 1;
-	__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
 void hw_perf_disable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 0;
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
@@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
+	u64 ack, status, saved_global;
 	struct cpu_hw_counters *cpuc;
-	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
@@ -445,10 +435,9 @@ again:
 		goto again;
 out:
 	/*
-	 * Do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off:
 	 */
-	if (cpuc->enable_all)
-		__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v0.10.2


From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Dec 2008 21:43:39 +0100
Subject: perf counters: protect them against CSTATE transitions

Impact: fix rare lost events problem

There are CPUs whose performance counters misbehave on CSTATE transitions,
so provide a way to just disable/enable them around deep idle methods.

(hw_perf_enable_all() is cheap on x86.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6a93d1f..0a7f3be 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -12,6 +12,7 @@
 #include <linux/notifier.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
+#include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
@@ -119,10 +120,21 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_disable_all(void)
+void hw_perf_restore_ctrl(u64 ctrl)
 {
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+
+u64 hw_perf_disable_all(void)
+{
+	u64 ctrl;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	return ctrl;
 }
+EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
 __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5f8d746..cca804e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,8 +270,11 @@ static atomic_t c3_cpu_count;
 /* Common C-state entry for C2, C3, .. */
 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 {
+	u64 pctrl;
+
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
+	pctrl = hw_perf_disable_all();
 	if (cstate->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cstate);
@@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
+	hw_perf_restore_ctrl(pctrl);
 	start_critical_timings();
 }
 #endif /* !CONFIG_CPU_IDLE */
@@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
  */
 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
+	u64 pctrl;
+
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
+	pctrl = hw_perf_disable_all();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
+	hw_perf_restore_ctrl(pctrl);
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 22c4469..5031b56 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *task);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
+extern void hw_perf_restore_ctrl(u64 ctrl);
+extern u64 hw_perf_disable_all(void);
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
+static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
+static inline u64 hw_perf_disable_all(void)		{ return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
-- 
cgit v0.10.2


From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Dec 2008 19:26:59 +0100
Subject: perf counters: clean up 'raw' type API

Impact: cleanup

Introduce a separate hw_event type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5031b56..daedd7d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -38,6 +38,7 @@ enum hw_event_types {
 	 * If this bit is set in the type, then trigger NMI sampling:
 	 */
 	PERF_COUNT_NMI			= (1 << 30),
+	PERF_COUNT_RAW			= (1 << 31),
 };
 
 /*
@@ -49,6 +50,12 @@ enum perf_record_type {
 	PERF_RECORD_GROUP,
 };
 
+struct perf_counter_event {
+	u32			hw_event_type;
+	u32			hw_event_period;
+	u64			hw_raw_ctrl;
+};
+
 /**
  * struct hw_perf_counter - performance counter hardware details
  */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 6cce728..3ecd73d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct perf_counter_event;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 asmlinkage int
-sys_perf_counter_open(u32 hw_event_type,
-		      u32 hw_event_period,
-		      u32 record_type,
-		      pid_t pid,
-		      int cpu);
+sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
+		      pid_t pid, int cpu, int masterfd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 20508f0..96c333a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
  * @pid:		target pid
  */
 asmlinkage int
-sys_perf_counter_open(u32 hw_event_type,
-		      u32 hw_event_period,
-		      u32 record_type,
-		      pid_t pid,
-		      int cpu)
+sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
+		      pid_t pid, int cpu, int masterfd)
 {
 	struct perf_counter_context *ctx;
+	struct perf_counter_event event;
 	struct perf_counter *counter;
 	int ret;
 
+	if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+		return -EFAULT;
+
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(hw_event_period, cpu, record_type);
+	counter = perf_counter_alloc(event.hw_event_period, cpu, record_type);
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter, hw_event_type);
+	ret = hw_perf_counter_init(counter, event.hw_event_type);
 	if (ret)
 		goto err_free_put_context;
 
-- 
cgit v0.10.2


From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Dec 2008 19:35:37 +0100
Subject: perf counters: expand use of counter->event

Impact: change syscall, cleanup

Make use of the new perf_counters event type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0a7f3be..30e7ebf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+int hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
 	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->__irq_period;
+	hwc->irq_period = counter->event.hw_event_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->next_count = -((s32) hwc->irq_period);
 
 	/*
-	 * Negative event types mean raw encoded event+umask values:
+	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event_type < 0) {
-		counter->hw_event_type = -hw_event_type;
-		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	hw_event_type &= ~PERF_COUNT_NMI;
+	if (hw_event_type == PERF_COUNT_RAW) {
+		hwc->config |= counter->event.hw_raw_ctrl;
 	} else {
-		hw_event_type &= ~PERF_COUNT_NMI;
 		if (hw_event_type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event_type];
 	}
-	hwc->config |= counter->hw_event_type;
 	counter->wakeup_pending = 0;
 
 	return 0;
@@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, counter->event.hw_event_type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -418,7 +417,8 @@ again:
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    counter->event.hw_event_type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index daedd7d..1f00176 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -96,8 +96,7 @@ struct perf_counter {
 #else
 	atomic_t			count32[2];
 #endif
-	u64				__irq_period;
-
+	struct perf_counter_event	event;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -111,7 +110,6 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	s32				hw_event_type;
 	enum perf_record_type		record_type;
 
 	/* read() / irq related data */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 96c333a..2557c67 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
  * Architecture provided APIs - weak aliases:
  */
 
-int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
+int __weak hw_perf_counter_init(struct perf_counter *counter)
 {
 	return -EINVAL;
 }
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
@@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
 	counter->usrdata	= &counter->data[1];
 	counter->cpu		= cpu;
 	counter->record_type	= record_type;
-	counter->__irq_period	= hw_event_period;
+	counter->event		= *event;
 	counter->wakeup_pending = 0;
 
 	return counter;
@@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(event.hw_event_period, cpu, record_type);
+	counter = perf_counter_alloc(&event, cpu, record_type);
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter, event.hw_event_type);
+	ret = hw_perf_counter_init(counter);
 	if (ret)
 		goto err_free_put_context;
 
-- 
cgit v0.10.2


From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 10 Dec 2008 12:33:23 +0100
Subject: perf counters: restructure the API

Impact: clean up new API

Thorough cleanup of the new perf counters API, we now get clean separation
of the various concepts:

 - introduce perf_counter_hw_event to separate out the event source details

 - move special type flags into separate attributes: PERF_COUNT_NMI,
   PERF_COUNT_RAW

 - extend the type to u64 and reserve it fully to the architecture in the
   raw type case.

And make use of all these changes in the core and x86 perfcounters code.

Also change the syscall signature to:

  asmlinkage int sys_perf_counter_open(

	struct perf_counter_hw_event	*hw_event_uptr		__user,
	pid_t				pid,
	int				cpu,
	int				group_fd);

( Note that group_fd is unused for now - it's reserved for the counter
  groups abstraction. )

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 30e7ebf..ef1936a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
  */
 int hw_perf_counter_init(struct perf_counter *counter)
 {
+	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
-	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi = 0;
 	if (capable(CAP_SYS_ADMIN)) {
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event_type & PERF_COUNT_NMI)
+		if (hw_event->nmi)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->event.hw_event_period;
+	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->irq_period)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count = -((s32) hwc->irq_period);
+	hwc->next_count	= -(s32)hwc->irq_period;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	hw_event_type &= ~PERF_COUNT_NMI;
-	if (hw_event_type == PERF_COUNT_RAW) {
-		hwc->config |= counter->event.hw_raw_ctrl;
+	if (hw_event->raw) {
+		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event_type >= max_intel_perfmon_events)
+		if (hw_event->type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event->type];
 	}
 	counter->wakeup_pending = 0;
 
@@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 	int bit;
 
 	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
 		    counter == leader)
 			continue;
 
@@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->event.hw_event_type);
+		perf_store_irq_data(leader, counter->hw_event.type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -410,7 +409,7 @@ again:
 
 		perf_save_and_restart(counter);
 
-		switch (counter->record_type) {
+		switch (counter->hw_event.record_type) {
 		case PERF_RECORD_SIMPLE:
 			continue;
 		case PERF_RECORD_IRQ:
@@ -418,7 +417,7 @@ again:
 			break;
 		case PERF_RECORD_GROUP:
 			perf_store_irq_data(counter,
-					    counter->event.hw_event_type);
+					    counter->hw_event.type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1f00176..a2b4852 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -24,65 +24,93 @@
 struct task_struct;
 
 /*
- * Generalized hardware event types, used by the hw_event_type parameter
- * of the sys_perf_counter_open() syscall:
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
  */
 enum hw_event_types {
-	PERF_COUNT_CYCLES,
-	PERF_COUNT_INSTRUCTIONS,
-	PERF_COUNT_CACHE_REFERENCES,
-	PERF_COUNT_CACHE_MISSES,
-	PERF_COUNT_BRANCH_INSTRUCTIONS,
-	PERF_COUNT_BRANCH_MISSES,
 	/*
-	 * If this bit is set in the type, then trigger NMI sampling:
+	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_NMI			= (1 << 30),
-	PERF_COUNT_RAW			= (1 << 31),
+	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 };
 
 /*
  * IRQ-notification data record type:
  */
-enum perf_record_type {
-	PERF_RECORD_SIMPLE,
-	PERF_RECORD_IRQ,
-	PERF_RECORD_GROUP,
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
 };
 
-struct perf_counter_event {
-	u32			hw_event_type;
-	u32			hw_event_period;
-	u64			hw_raw_ctrl;
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	u64			type;
+
+	u64			irq_period;
+	u32			record_type;
+
+	u32			disabled     :  1, /* off by default */
+				nmi	     :  1, /* NMI sampling   */
+				raw	     :  1, /* raw event type */
+				__reserved_1 : 29;
+
+	u64			__reserved_2;
 };
 
+/*
+ * Kernel-internal data types:
+ */
+
 /**
- * struct hw_perf_counter - performance counter hardware details
+ * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
-	u64			config;
-	unsigned long		config_base;
-	unsigned long		counter_base;
-	int			nmi;
-	unsigned int		idx;
-	u64			prev_count;
-	s32			next_count;
-	u64			irq_period;
+	u64				config;
+	unsigned long			config_base;
+	unsigned long			counter_base;
+	int				nmi;
+	unsigned int			idx;
+	u64				prev_count;
+	u64				irq_period;
+	s32				next_count;
 };
 
 /*
  * Hardcoded buffer length limit for now, for IRQ-fed events:
  */
-#define PERF_DATA_BUFLEN	2048
+#define PERF_DATA_BUFLEN		2048
 
 /**
  * struct perf_data - performance counter IRQ data sampling ...
  */
 struct perf_data {
-	int			len;
-	int			rd_idx;
-	int			overrun;
-	u8			data[PERF_DATA_BUFLEN];
+	int				len;
+	int				rd_idx;
+	int				overrun;
+	u8				data[PERF_DATA_BUFLEN];
 };
 
 /**
@@ -96,7 +124,7 @@ struct perf_counter {
 #else
 	atomic_t			count32[2];
 #endif
-	struct perf_counter_event	event;
+	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -110,8 +138,6 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	enum perf_record_type		record_type;
-
 	/* read() / irq related data */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3ecd73d..a549678 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,7 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
-struct perf_counter_event;
+struct perf_counter_hw_event;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-		      pid_t pid, int cpu, int masterfd);
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2557c67..0d323ce 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;
 
-	switch (counter->record_type) {
+	switch (counter->hw_event.record_type) {
 	case PERF_RECORD_SIMPLE:
 		return perf_read_hw(counter, buf, count);
 
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
@@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
 	INIT_LIST_HEAD(&counter->list);
 	init_waitqueue_head(&counter->waitq);
 
-	counter->irqdata	= &counter->data[0];
-	counter->usrdata	= &counter->data[1];
-	counter->cpu		= cpu;
-	counter->record_type	= record_type;
-	counter->event		= *event;
-	counter->wakeup_pending = 0;
+	counter->irqdata		= &counter->data[0];
+	counter->usrdata		= &counter->data[1];
+	counter->cpu			= cpu;
+	counter->hw_event		= *hw_event;
+	counter->wakeup_pending		= 0;
 
 	return counter;
 }
 
 /**
- * sys_perf_task_open - open a performance counter associate it to a task
- * @hw_event_type:	event type for monitoring/sampling...
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu
+ *
+ * @hw_event_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
+ * @cpu:		target cpu
+ * @group_fd:		group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-		      pid_t pid, int cpu, int masterfd)
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd)
+
 {
 	struct perf_counter_context *ctx;
-	struct perf_counter_event event;
+	struct perf_counter_hw_event hw_event;
 	struct perf_counter *counter;
 	int ret;
 
-	if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
 	ctx = find_get_context(pid, cpu);
@@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(&event, cpu, record_type);
+	counter = perf_counter_alloc(&hw_event, cpu);
 	if (!counter)
 		goto err_put_context;
 
-- 
cgit v0.10.2


From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 08:38:42 +0100
Subject: perf counters: add support for group counters

Impact: add group counters

This patch adds the "counter groups" abstraction.

Groups of counters behave much like normal 'single' counters, with a
few semantic and behavioral extensions on top of that.

A counter group is created by creating a new counter with the open()
syscall's group-leader group_fd file descriptor parameter pointing
to another, already existing counter.

Groups of counters are scheduled in and out in one atomic group, and
they are also roundrobin-scheduled atomically.

Counters that are member of a group can also record events with an
(atomic) extended timestamp that extends to all members of the group,
if the record type is set to PERF_RECORD_GROUP.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ef1936a..54b4ad0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter)
 }
 
 static void
-perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
-	struct perf_counter_context *ctx = leader->ctx;
-	struct perf_counter *counter;
+	struct perf_counter *counter, *group_leader = sibling->group_leader;
 	int bit;
 
-	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
-		    counter == leader)
-			continue;
+	/*
+	 * Store the counter's own timestamp first:
+	 */
+	perf_store_irq_data(sibling, sibling->hw_event.type);
+	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
-		if (counter->active) {
+	/*
+	 * Then store sibling timestamps (if any):
+	 */
+	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+		if (!counter->active) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
@@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event.type);
-		perf_store_irq_data(leader, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, atomic64_counter_read(counter));
 	}
 }
 
@@ -416,10 +420,6 @@ again:
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter,
-					    counter->hw_event.type);
-			perf_store_irq_data(counter,
-					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
 			break;
 		}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a2b4852..7af7d89 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -117,7 +117,10 @@ struct perf_data {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
-	struct list_head		list;
+	struct list_head		list_entry;
+	struct list_head		sibling_list;
+	struct perf_counter		*group_leader;
+
 	int				active;
 #if BITS_PER_LONG == 64
 	atomic64_t			count;
@@ -158,7 +161,8 @@ struct perf_counter_context {
 	 * Protect the list of counters:
 	 */
 	spinlock_t		lock;
-	struct list_head	counters;
+
+	struct list_head	counter_list;
 	int			nr_counters;
 	int			nr_active;
 	struct task_struct	*task;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0d323ce..fa59fe8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sysfs.h>
 #include <linux/ptrace.h>
@@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { }
  * Read the cached counter in counter safe against cross CPU / NMI
  * modifications. 64 bit version - no complications.
  */
-static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+static inline u64 perf_counter_read_safe(struct perf_counter *counter)
 {
 	return (u64) atomic64_read(&counter->count);
 }
@@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter)
  * Read the cached counter in counter safe against cross CPU / NMI
  * modifications. 32 bit version.
  */
-static u64 perf_read_counter_safe(struct perf_counter *counter)
+static u64 perf_counter_read_safe(struct perf_counter *counter)
 {
 	u32 cntl, cnth;
 
@@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter)
 
 #endif
 
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *group_leader = counter->group_leader;
+
+	/*
+	 * Depending on whether it is a standalone or sibling counter,
+	 * add it straight to the context's counter list, or to the group
+	 * leader's sibling list:
+	 */
+	if (counter->group_leader == counter)
+		list_add_tail(&counter->list_entry, &ctx->counter_list);
+	else
+		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+}
+
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *sibling, *tmp;
+
+	list_del_init(&counter->list_entry);
+
+	if (list_empty(&counter->sibling_list))
+		return;
+
+	/*
+	 * If this was a group counter with sibling counters then
+	 * upgrade the siblings to singleton counters by adding them
+	 * to the context list directly:
+	 */
+	list_for_each_entry_safe(sibling, tmp,
+				 &counter->sibling_list, list_entry) {
+
+		list_del_init(&sibling->list_entry);
+		list_add_tail(&sibling->list_entry, &ctx->counter_list);
+		WARN_ON_ONCE(!sibling->group_leader);
+		WARN_ON_ONCE(sibling->group_leader == sibling);
+		sibling->group_leader = sibling;
+	}
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
  * remove it from the context list.
  */
-static void __perf_remove_from_context(void *info)
+static void __perf_counter_remove_from_context(void *info)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
@@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	hw_perf_disable_all();
-	list_del_init(&counter->list);
+	list_del_counter(counter, ctx);
 	hw_perf_enable_all();
 
 	if (!ctx->task) {
@@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info)
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
  */
-static void perf_remove_from_context(struct perf_counter *counter)
+static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
@@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter)
 		 * the removal is always sucessful.
 		 */
 		smp_call_function_single(counter->cpu,
-					 __perf_remove_from_context,
+					 __perf_counter_remove_from_context,
 					 counter, 1);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_remove_from_context,
+	task_oncpu_function_call(task, __perf_counter_remove_from_context,
 				 counter);
 
 	spin_lock_irq(&ctx->lock);
 	/*
 	 * If the context is active we need to retry the smp call.
 	 */
-	if (ctx->nr_active && !list_empty(&counter->list)) {
+	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
 	 * The lock prevents that this context is scheduled in so we
-	 * can remove the counter safely, if it the call above did not
+	 * can remove the counter safely, if the call above did not
 	 * succeed.
 	 */
-	if (!list_empty(&counter->list)) {
+	if (!list_empty(&counter->list_entry)) {
 		ctx->nr_counters--;
-		list_del_init(&counter->list);
+		list_del_counter(counter, ctx);
 		counter->task = NULL;
 	}
 	spin_unlock_irq(&ctx->lock);
@@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	hw_perf_disable_all();
-	list_add_tail(&counter->list, &ctx->counters);
+	list_add_counter(counter, ctx);
 	hw_perf_enable_all();
 
 	ctx->nr_counters++;
@@ -268,7 +311,7 @@ retry:
 	 * If the context is active and the counter has not been added
 	 * we need to retry the smp call.
 	 */
-	if (ctx->nr_active && list_empty(&counter->list)) {
+	if (ctx->nr_active && list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -278,13 +321,45 @@ retry:
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list)) {
-		list_add_tail(&counter->list, &ctx->counters);
+	if (list_empty(&counter->list_entry)) {
+		list_add_counter(counter, ctx);
 		ctx->nr_counters++;
 	}
 	spin_unlock_irq(&ctx->lock);
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (!counter->active)
+		return;
+
+	hw_perf_counter_disable(counter);
+	counter->active	=  0;
+	counter->oncpu	= -1;
+
+	cpuctx->active_oncpu--;
+	ctx->nr_active--;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+		struct perf_cpu_context *cpuctx,
+		struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_out(counter, cpuctx, ctx);
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 		return;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->counters, list) {
-		if (!ctx->nr_active)
-			break;
-		if (counter->active) {
-			hw_perf_counter_disable(counter);
-			counter->active = 0;
-			counter->oncpu = -1;
-			ctx->nr_active--;
-			cpuctx->active_oncpu--;
-		}
+	if (ctx->nr_active) {
+		list_for_each_entry(counter, &ctx->counter_list, list_entry)
+			group_sched_out(counter, cpuctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
 	cpuctx->task_ctx = NULL;
 }
 
+static void
+counter_sched_in(struct perf_counter *counter,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_counter_context *ctx,
+		 int cpu)
+{
+	if (!counter->active)
+		return;
+
+	hw_perf_counter_enable(counter);
+	counter->active = 1;
+	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+
+	cpuctx->active_oncpu++;
+	ctx->nr_active++;
+}
+
+static void
+group_sched_in(struct perf_counter *group_counter,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx,
+	       int cpu)
+{
+	struct perf_counter *counter;
+
+	counter_sched_in(group_counter, cpuctx, ctx, cpu);
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_in(counter, cpuctx, ctx, cpu);
+}
+
 /*
  * Called from scheduler to add the counters of the current task
  * with interrupts disabled.
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 		return;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->counters, list) {
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (ctx->nr_active == cpuctx->max_pertask)
 			break;
+
+		/*
+		 * Listen to the 'cpu' scheduling filter constraint
+		 * of counters:
+		 */
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		hw_perf_counter_enable(counter);
-		counter->active = 1;
-		counter->oncpu = cpu;
-		ctx->nr_active++;
-		cpuctx->active_oncpu++;
+		group_sched_in(counter, cpuctx, ctx, cpu);
 	}
 	spin_unlock(&ctx->lock);
+
 	cpuctx->task_ctx = ctx;
 }
 
@@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	spin_lock(&ctx->lock);
 
 	/*
-	 * Rotate the first entry last:
+	 * Rotate the first entry last (works just fine for group counters too):
 	 */
 	hw_perf_disable_all();
-	list_for_each_entry(counter, &ctx->counters, list) {
-		list_del(&counter->list);
-		list_add_tail(&counter->list, &ctx->counters);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		list_del(&counter->list_entry);
+		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
 	hw_perf_enable_all();
@@ -387,16 +491,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	ctx->nr_counters	= 0;
+	ctx->task		= task;
+}
+/*
  * Initialize the perf_counter context in task_struct
  */
 void perf_counter_init_task(struct task_struct *task)
 {
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
-
-	spin_lock_init(&ctx->lock);
-	INIT_LIST_HEAD(&ctx->counters);
-	ctx->nr_counters = 0;
-	ctx->task = task;
+	__perf_counter_init_context(&task->perf_counter_ctx, task);
 }
 
 /*
@@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info)
 	hw_perf_counter_read(info);
 }
 
-static u64 perf_read_counter(struct perf_counter *counter)
+static u64 perf_counter_read(struct perf_counter *counter)
 {
 	/*
 	 * If counter is enabled and currently active on a CPU, update the
@@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter)
 					 __hw_perf_counter_read, counter, 1);
 	}
 
-	return perf_read_counter_safe(counter);
+	return perf_counter_read_safe(counter);
 }
 
 /*
@@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	mutex_lock(&counter->mutex);
 
-	perf_remove_from_context(counter);
+	perf_counter_remove_from_context(counter);
 	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
@@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return -EINVAL;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_read_counter(counter);
+	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
 
 	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
@@ -707,15 +818,25 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event,
+		   int cpu,
+		   struct perf_counter *group_leader)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
 	if (!counter)
 		return NULL;
 
+	/*
+	 * Single counters are their own group leaders, with an
+	 * empty sibling list:
+	 */
+	if (!group_leader)
+		group_leader = counter;
+
 	mutex_init(&counter->mutex);
-	INIT_LIST_HEAD(&counter->list);
+	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
 	counter->irqdata		= &counter->data[0];
@@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
+	counter->group_leader		= group_leader;
 
 	return counter;
 }
@@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open(
 	int				group_fd)
 
 {
-	struct perf_counter_context *ctx;
+	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
-	struct perf_counter *counter;
+	struct perf_counter_context *ctx;
+	struct file *group_file = NULL;
+	int fput_needed = 0;
 	int ret;
 
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
+	/*
+	 * Look up the group leader:
+	 */
+	group_leader = NULL;
+	if (group_fd != -1) {
+		ret = -EINVAL;
+		group_file = fget_light(group_fd, &fput_needed);
+		if (!group_file)
+			goto out_fput;
+		if (group_file->f_op != &perf_fops)
+			goto out_fput;
+
+		group_leader = group_file->private_data;
+		/*
+		 * Do not allow a recursive hierarchy:
+		 */
+		if (group_leader->group_leader)
+			goto out_fput;
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(&hw_event, cpu);
+	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
 		goto err_put_context;
 
@@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open(
 	if (ret < 0)
 		goto err_remove_free_put_context;
 
+out_fput:
+	fput_light(group_file, fput_needed);
+
 	return ret;
 
 err_remove_free_put_context:
 	mutex_lock(&counter->mutex);
-	perf_remove_from_context(counter);
+	perf_counter_remove_from_context(counter);
 	mutex_unlock(&counter->mutex);
 
 err_free_put_context:
@@ -783,40 +933,40 @@ err_free_put_context:
 err_put_context:
 	put_context(ctx);
 
-	return ret;
+	goto out_fput;
 }
 
-static void __cpuinit perf_init_cpu(int cpu)
+static void __cpuinit perf_counter_init_cpu(int cpu)
 {
-	struct perf_cpu_context *ctx;
+	struct perf_cpu_context *cpuctx;
 
-	ctx = &per_cpu(perf_cpu_context, cpu);
-	spin_lock_init(&ctx->ctx.lock);
-	INIT_LIST_HEAD(&ctx->ctx.counters);
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	__perf_counter_init_context(&cpuctx->ctx, NULL);
 
 	mutex_lock(&perf_resource_mutex);
-	ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
 	mutex_unlock(&perf_resource_mutex);
+
 	hw_perf_counter_setup();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_exit_cpu(void *info)
+static void __perf_counter_exit_cpu(void *info)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = &cpuctx->ctx;
 	struct perf_counter *counter, *tmp;
 
-	list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
-		__perf_remove_from_context(counter);
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+		__perf_counter_remove_from_context(counter);
 
 }
-static void perf_exit_cpu(int cpu)
+static void perf_counter_exit_cpu(int cpu)
 {
-	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
 }
 #else
-static inline void perf_exit_cpu(int cpu) { }
+static inline void perf_counter_exit_cpu(int cpu) { }
 #endif
 
 static int __cpuinit
@@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		perf_init_cpu(cpu);
+		perf_counter_init_cpu(cpu);
 		break;
 
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		perf_exit_cpu(cpu);
+		perf_counter_exit_cpu(cpu);
 		break;
 
 	default:
-- 
cgit v0.10.2


From ccff286d85098ba5438e22aa2ea807fc1e18cf2f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 11:26:29 +0100
Subject: perf counters: group counter, fixes

Impact: bugfix

Check that a group does not span outside the context of a CPU or a task.

Also, do not allow deep recursive hierarchies.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fa59fe8..278209c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -107,9 +107,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_del_init(&counter->list_entry);
 
-	if (list_empty(&counter->sibling_list))
-		return;
-
 	/*
 	 * If this was a group counter with sibling counters then
 	 * upgrade the siblings to singleton counters by adding them
@@ -395,9 +392,6 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (!counter->active)
-		return;
-
 	hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
@@ -876,32 +870,39 @@ asmlinkage int sys_perf_counter_open(
 		return -EFAULT;
 
 	/*
-	 * Look up the group leader:
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
 	if (group_fd != -1) {
 		ret = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
-			goto out_fput;
+			goto err_put_context;
 		if (group_file->f_op != &perf_fops)
-			goto out_fput;
+			goto err_put_context;
 
 		group_leader = group_file->private_data;
 		/*
-		 * Do not allow a recursive hierarchy:
+		 * Do not allow a recursive hierarchy (this new sibling
+		 * becoming part of another group-sibling):
+		 */
+		if (group_leader->group_leader != group_leader)
+			goto err_put_context;
+		/*
+		 * Do not allow to attach to a group in a different
+		 * task or CPU context:
 		 */
-		if (group_leader->group_leader)
-			goto out_fput;
+		if (group_leader->ctx != ctx)
+			goto err_put_context;
 	}
 
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	ret = -ENOMEM;
 	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
-- 
cgit v0.10.2


From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 12:46:46 +0100
Subject: perf counters: hw driver API

Impact: restructure code, introduce hw_ops driver abstraction

Introduce this abstraction to handle counter details:

 struct hw_perf_counter_ops {
	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
	void (*hw_perf_counter_read)	(struct perf_counter *counter);
 };

This will be useful to support assymetric hw details, and it will also
be useful to implement "software counters". (Counters that count kernel
managed sw events such as pagefaults, context-switches, wall-clock time
or task-local time.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 54b4ad0..718b635 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter)
+static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void)
 EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
-__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
 {
 	wrmsr(hwc->config_base + idx, hwc->config, 0);
 }
@@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
 }
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-void hw_perf_counter_enable(struct perf_counter *counter)
+static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(hwc, idx);
-	__hw_perf_counter_enable(hwc, idx);
+	__x86_perf_counter_enable(hwc, idx);
 }
 
 #ifdef CONFIG_X86_64
@@ -282,20 +282,20 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-void hw_perf_counter_disable(struct perf_counter *counter)
+static void x86_perf_counter_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
 	__hw_perf_save_counter(counter, hwc, idx);
 }
 
-void hw_perf_counter_read(struct perf_counter *counter)
+static void x86_perf_counter_read(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned long addr = hwc->counter_base + hwc->idx;
@@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__hw_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(hwc, idx);
 }
 
 static void
@@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void)
 
 	perf_counters_initialized = true;
 }
+
+static struct hw_perf_counter_ops x86_perf_counter_ops = {
+	.hw_perf_counter_enable		= x86_perf_counter_enable,
+	.hw_perf_counter_disable	= x86_perf_counter_disable,
+	.hw_perf_counter_read		= x86_perf_counter_read,
+};
+
+struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err;
+
+	err = __hw_perf_counter_init(counter);
+	if (err)
+		return NULL;
+
+	return &x86_perf_counter_ops;
+}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7af7d89..2738564 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -113,6 +113,17 @@ struct perf_data {
 	u8				data[PERF_DATA_BUFLEN];
 };
 
+struct perf_counter;
+
+/**
+ * struct hw_perf_counter_ops - performance counter hw ops
+ */
+struct hw_perf_counter_ops {
+	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
+	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
+	void (*hw_perf_counter_read)	(struct perf_counter *counter);
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -120,6 +131,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
+	struct hw_perf_counter_ops	*hw_ops;
 
 	int				active;
 #if BITS_PER_LONG == 64
@@ -185,6 +197,9 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
+extern struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter);
+
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 278209c..e6e41ca 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-
-int __weak hw_perf_counter_init(struct perf_counter *counter)
+extern __weak struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
 {
-	return -EINVAL;
+	return ERR_PTR(-EINVAL);
 }
 
-void __weak hw_perf_counter_enable(struct perf_counter *counter)	 { }
-void __weak hw_perf_counter_disable(struct perf_counter *counter)	 { }
-void __weak hw_perf_counter_read(struct perf_counter *counter)		 { }
-void __weak hw_perf_disable_all(void) { }
-void __weak hw_perf_enable_all(void) { }
-void __weak hw_perf_counter_setup(void) { }
+void __weak hw_perf_disable_all(void)	 { }
+void __weak hw_perf_enable_all(void)	 { }
+void __weak hw_perf_counter_setup(void)	 { }
 
 #if BITS_PER_LONG == 64
 
@@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock(&ctx->lock);
 
 	if (counter->active) {
-		hw_perf_counter_disable(counter);
+		counter->hw_ops->hw_perf_counter_disable(counter);
 		counter->active = 0;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
@@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
-		hw_perf_counter_enable(counter);
+		counter->hw_ops->hw_perf_counter_enable(counter);
 		counter->active = 1;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
@@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (!counter->active)
 		return;
 
-	hw_perf_counter_disable(counter);
+	counter->hw_ops->hw_perf_counter_disable(counter);
 	counter->active	=  0;
 	counter->oncpu	= -1;
 
@@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	hw_perf_counter_enable(counter);
+	counter->hw_ops->hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
@@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task)
  */
 static void __hw_perf_counter_read(void *info)
 {
-	hw_perf_counter_read(info);
+	struct perf_counter *counter = info;
+
+	counter->hw_ops->hw_perf_counter_read(counter);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
 		   struct perf_counter *group_leader)
 {
-	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	struct hw_perf_counter_ops *hw_ops;
+	struct perf_counter *counter;
 
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 	if (!counter)
 		return NULL;
 
@@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
+	counter->hw_ops			= NULL;
+
+	hw_ops = hw_perf_counter_init(counter);
+	if (!hw_ops) {
+		kfree(counter);
+		return NULL;
+	}
+	counter->hw_ops = hw_ops;
 
 	return counter;
 }
@@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open(
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter);
-	if (ret)
-		goto err_free_put_context;
-
 	perf_install_in_context(ctx, counter, cpu);
 
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
@@ -927,8 +932,6 @@ err_remove_free_put_context:
 	mutex_lock(&counter->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&counter->mutex);
-
-err_free_put_context:
 	kfree(counter);
 
 err_put_context:
-- 
cgit v0.10.2


From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:21:10 +0100
Subject: perf counters: implement PERF_COUNT_CPU_CLOCK

Impact: add new perf-counter type

The 'CPU clock' counter counts the amount of CPU clock time that is
elapsing, in nanoseconds. (regardless of how much of it the task is
spending on a CPU executing)

This counter type is a Linux kernel based abstraction, it is available
even if the hardware does not support native hardware performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 718b635..43c8e9a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 	__x86_perf_counter_enable(hwc, idx);
 }
 
-#ifdef CONFIG_X86_64
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-#else
-/*
- * Todo: add proper atomic64_t support to 32-bit x86:
- */
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-#endif
-
 static void __hw_perf_save_counter(struct perf_counter *counter,
 				   struct hw_perf_counter *hwc, int idx)
 {
@@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter)
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
-	val =  (s64)hwc->irq_period + (s64)val32;
+	val = (s64)hwc->irq_period + (s64)val32;
 	atomic64_counter_set(counter, hwc->prev_count + val);
 }
 
@@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
-static struct hw_perf_counter_ops x86_perf_counter_ops = {
+static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
 	.hw_perf_counter_read		= x86_perf_counter_read,
 };
 
-struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2738564..9a1713a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -131,7 +131,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
-	struct hw_perf_counter_ops	*hw_ops;
+	const struct hw_perf_counter_ops *hw_ops;
 
 	int				active;
 #if BITS_PER_LONG == 64
@@ -197,7 +197,7 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
-extern struct hw_perf_counter_ops *
+extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
@@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern void hw_perf_restore_ctrl(u64 ctrl);
 extern u64 hw_perf_disable_all(void);
+extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
+extern u64 atomic64_counter_read(struct perf_counter *counter);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
-static inline u64 hw_perf_disable_all(void)		{ return 0; }
+static inline u64 hw_perf_disable_all(void)		      { return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e6e41ca..506286e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-extern __weak struct hw_perf_counter_ops *
+extern __weak const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
 	return ERR_PTR(-EINVAL);
 }
 
-void __weak hw_perf_disable_all(void)	 { }
-void __weak hw_perf_enable_all(void)	 { }
-void __weak hw_perf_counter_setup(void)	 { }
+u64 __weak hw_perf_disable_all(void)		{ return 0; }
+void __weak hw_perf_restore_ctrl(u64 ctrl)	{ }
+void __weak hw_perf_counter_setup(void)		{ }
 
 #if BITS_PER_LONG == 64
 
@@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter)
 	return (u64) atomic64_read(&counter->count);
 }
 
+void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+	atomic64_set(&counter->count, val);
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic64_read(&counter->count);
+}
+
 #else
 
 /*
@@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter)
 	return cntl | ((u64) cnth) << 32;
 }
 
+void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+	u32 *val32 = (void *)&val64;
+
+	atomic_set(counter->count32 + 0, *(val32 + 0));
+	atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic_read(counter->count32 + 0) |
+		(u64) atomic_read(counter->count32 + 1) << 32;
+}
+
 #endif
 
 static void
@@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_del_counter(counter, ctx);
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	if (!ctx->task) {
 		/*
@@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	int cpu = smp_processor_id();
+	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_add_counter(counter, ctx);
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	ctx->nr_counters++;
 
@@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	u64 perf_flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
@@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_del(&counter->list_entry);
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	spin_unlock(&ctx->lock);
 
@@ -807,6 +834,42 @@ static const struct file_operations perf_fops = {
 	.poll			= perf_poll,
 };
 
+static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+	int cpu = raw_smp_processor_id();
+
+	atomic64_counter_set(counter, cpu_clock(cpu));
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
+	.hw_perf_counter_enable		= cpu_clock_perf_counter_enable,
+	.hw_perf_counter_disable	= cpu_clock_perf_counter_disable,
+	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
+};
+
+static const struct hw_perf_counter_ops *
+sw_perf_counter_init(struct perf_counter *counter)
+{
+	const struct hw_perf_counter_ops *hw_ops = NULL;
+
+	switch (counter->hw_event.type) {
+	case PERF_COUNT_CPU_CLOCK:
+		hw_ops = &perf_ops_cpu_clock;
+		break;
+	default:
+		break;
+	}
+	return hw_ops;
+}
+
 /*
  * Allocate and initialize a counter structure
  */
@@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
 		   struct perf_counter *group_leader)
 {
-	struct hw_perf_counter_ops *hw_ops;
+	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
 
 	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
-	hw_ops = hw_perf_counter_init(counter);
+	hw_ops = NULL;
+	if (!hw_event->raw && hw_event->type < 0)
+		hw_ops = sw_perf_counter_init(counter);
+	if (!hw_ops) {
+		hw_ops = hw_perf_counter_init(counter);
+	}
+
 	if (!hw_ops) {
 		kfree(counter);
 		return NULL;
@@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open(
 			goto err_put_context;
 	}
 
-	ret = -ENOMEM;
+	ret = -EINVAL;
 	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
 		goto err_put_context;
-- 
cgit v0.10.2


From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:45:51 +0100
Subject: perf counters: consolidate hw_perf save/restore APIs

Impact: cleanup

Rename them to better match up the usual IRQ disable/enable APIs:

 hw_perf_disable_all()  => hw_perf_save_disable()
 hw_perf_restore_ctrl() => hw_perf_restore()

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 43c8e9a..3e1dbeb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -118,13 +118,13 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore_ctrl(u64 ctrl)
+void hw_perf_restore(u64 ctrl)
 {
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
-EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-u64 hw_perf_disable_all(void)
+u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
@@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	return ctrl;
 }
-EXPORT_SYMBOL_GPL(hw_perf_disable_all);
+EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static inline void
 __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index cca804e..a3e66a3 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,11 +270,11 @@ static atomic_t c3_cpu_count;
 /* Common C-state entry for C2, C3, .. */
 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 {
-	u64 pctrl;
+	u64 perf_flags;
 
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	pctrl = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	if (cstate->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cstate);
@@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore_ctrl(pctrl);
+	hw_perf_restore(perf_flags);
 	start_critical_timings();
 }
 #endif /* !CONFIG_CPU_IDLE */
@@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	pctrl = hw_perf_disable_all();
+	pctrl = hw_perf_save_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore_ctrl(pctrl);
+	hw_perf_restore(pctrl);
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9a1713a..68f6e3a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -67,7 +67,7 @@ enum perf_counter_record_type {
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	u64			type;
+	s64			type;
 
 	u64			irq_period;
 	u32			record_type;
@@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *task);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
-extern void hw_perf_restore_ctrl(u64 ctrl);
-extern u64 hw_perf_disable_all(void);
+extern u64 hw_perf_save_disable(void);
+extern void hw_perf_restore(u64 ctrl);
 extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
 extern u64 atomic64_counter_read(struct perf_counter *counter);
 
@@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
-static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
-static inline u64 hw_perf_disable_all(void)		      { return 0; }
+static inline void hw_perf_restore(u64 ctrl)			{ }
+static inline u64 hw_perf_save_disable(void)		      { return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 506286e..0e93fea 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter)
 	return ERR_PTR(-EINVAL);
 }
 
-u64 __weak hw_perf_disable_all(void)		{ return 0; }
-void __weak hw_perf_restore_ctrl(u64 ctrl)	{ }
+u64 __weak hw_perf_save_disable(void)		{ return 0; }
+void __weak hw_perf_restore(u64 ctrl)	{ }
 void __weak hw_perf_counter_setup(void)		{ }
 
 #if BITS_PER_LONG == 64
@@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_del_counter(counter, ctx);
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	if (!ctx->task) {
 		/*
@@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_add_counter(counter, ctx);
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	ctx->nr_counters++;
 
@@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_del(&counter->list_entry);
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
 
-- 
cgit v0.10.2


From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 14:03:20 +0100
Subject: perf counters: implement PERF_COUNT_TASK_CLOCK

Impact: add new perf-counter type

The 'task clock' counter counts the amount of time a task is executing,
in nanoseconds. It stops ticking when a task is scheduled out either due
to it blocking, sleeping or it being preempted.

This counter type is a Linux kernel based abstraction, it is available
even if the hardware does not support native hardware performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 68f6e3a..30c0ec8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -50,8 +50,11 @@ enum hw_event_types {
 	 */
 	PERF_COUNT_CPU_CLOCK		= -1,
 	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	/*
+	 * Future software events:
+	 */
+	/* PERF_COUNT_PAGE_FAULTS	= -3,
+	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0e93fea..a0fe847 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
 };
 
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+	atomic64_counter_set(counter, current->se.sum_exec_runtime);
+}
+
+static const struct hw_perf_counter_ops perf_ops_task_clock = {
+	.hw_perf_counter_enable		= task_clock_perf_counter_enable,
+	.hw_perf_counter_disable	= task_clock_perf_counter_disable,
+	.hw_perf_counter_read		= task_clock_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 		break;
+	case PERF_COUNT_TASK_CLOCK:
+		hw_ops = &perf_ops_task_clock;
+		break;
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 14:59:31 +0100
Subject: perf counters: add prctl interface to disable/enable counters

Add a way for self-monitoring tasks to disable/enable counters summarily,
via a prctl:

	PR_TASK_PERF_COUNTERS_DISABLE		31
	PR_TASK_PERF_COUNTERS_ENABLE		32

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 30c0ec8..97d86c2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
 extern u64 atomic64_counter_read(struct perf_counter *counter);
+extern int perf_counter_task_disable(void);
+extern int perf_counter_task_enable(void);
 
 #else
 static inline void
@@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
+static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
+static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e..b00df4c 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
 #define PR_SET_TIMERSLACK 29
 #define PR_GET_TIMERSLACK 30
 
+#define PR_TASK_PERF_COUNTERS_DISABLE		31
+#define PR_TASK_PERF_COUNTERS_ENABLE		32
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a0fe847..4e679b9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
+	if (counter->active == -1)
+		return;
+
 	counter->hw_ops->hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
@@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = ctx;
 }
 
+int perf_counter_task_disable(void)
+{
+	struct task_struct *curr = current;
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+	u64 perf_flags;
+	int cpu;
+
+	if (likely(!ctx->nr_counters))
+		return 0;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+
+	perf_counter_task_sched_out(curr, cpu);
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Disable all the counters:
+	 */
+	perf_flags = hw_perf_save_disable();
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		WARN_ON_ONCE(counter->active == 1);
+		counter->active = -1;
+	}
+	hw_perf_restore(perf_flags);
+
+	spin_unlock(&ctx->lock);
+
+	local_irq_enable();
+
+	return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+	struct task_struct *curr = current;
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+	u64 perf_flags;
+	int cpu;
+
+	if (likely(!ctx->nr_counters))
+		return 0;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Disable all the counters:
+	 */
+	perf_flags = hw_perf_save_disable();
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->active != -1)
+			continue;
+		counter->active = 0;
+	}
+	hw_perf_restore(perf_flags);
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(curr, cpu);
+
+	local_irq_enable();
+
+	return 0;
+}
+
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
@@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd)
-
+asmlinkage int
+sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
+		      pid_t pid, int cpu, int group_fd)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8..0f66633 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_TASK_PERF_COUNTERS_DISABLE:
+			error = perf_counter_task_disable();
+			break;
+		case PR_TASK_PERF_COUNTERS_ENABLE:
+			error = perf_counter_task_enable();
+			break;
 		case PR_GET_TIMERSLACK:
 			error = current->timer_slack_ns;
 			break;
-- 
cgit v0.10.2


From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 15:17:03 +0100
Subject: perf counters: clean up state transitions

Impact: cleanup

Introduce a proper enum for the 3 states of a counter:

	PERF_COUNTER_STATE_OFF		= -1
	PERF_COUNTER_STATE_INACTIVE	=  0
	PERF_COUNTER_STATE_ACTIVE	=  1

and rename counter->active to counter->state and propagate the
changes everywhere.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3e1dbeb..4854cca 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Then store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (!counter->active) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 97d86c2..8cb095f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,6 +128,15 @@ struct hw_perf_counter_ops {
 };
 
 /**
+ * enum perf_counter_active_state - the states of a counter
+ */
+enum perf_counter_active_state {
+	PERF_COUNTER_STATE_OFF		= -1,
+	PERF_COUNTER_STATE_INACTIVE	=  0,
+	PERF_COUNTER_STATE_ACTIVE	=  1,
+};
+
+/**
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
@@ -136,7 +145,7 @@ struct perf_counter {
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
-	int				active;
+	enum perf_counter_active_state	state;
 #if BITS_PER_LONG == 64
 	atomic64_t			count;
 #else
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4e679b9..559130b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info)
 
 	spin_lock(&ctx->lock);
 
-	if (counter->active) {
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->hw_perf_counter_disable(counter);
-		counter->active = 0;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
 		counter->task = NULL;
@@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info)
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
 		counter->hw_ops->hw_perf_counter_enable(counter);
-		counter->active = 1;
+		counter->state = PERF_COUNTER_STATE_ACTIVE;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
@@ -328,7 +328,6 @@ retry:
 
 	spin_lock_irq(&ctx->lock);
 	/*
-	 * If the context is active and the counter has not been added
 	 * we need to retry the smp call.
 	 */
 	if (ctx->nr_active && list_empty(&counter->list_entry)) {
@@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_counter_context *ctx)
 {
-	if (!counter->active)
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
 	counter->hw_ops->hw_perf_counter_disable(counter);
-	counter->active	=  0;
-	counter->oncpu	= -1;
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->oncpu = -1;
 
 	cpuctx->active_oncpu--;
 	ctx->nr_active--;
@@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->active == -1)
+	if (counter->state == PERF_COUNTER_STATE_OFF)
 		return;
 
 	counter->hw_ops->hw_perf_counter_enable(counter);
-	counter->active = 1;
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
 	cpuctx->active_oncpu++;
@@ -506,8 +505,8 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		WARN_ON_ONCE(counter->active == 1);
-		counter->active = -1;
+		WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -540,9 +539,9 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->active != -1)
+		if (counter->state != PERF_COUNTER_STATE_OFF)
 			continue;
-		counter->active = 0;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 * If counter is enabled and currently active on a CPU, update the
 	 * value in the counter structure:
 	 */
-	if (counter->active) {
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __hw_perf_counter_read, counter, 1);
 	}
@@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
 
 retry:
 	spin_lock_irq(&ctx->lock);
-	if (!counter->active) {
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 		counter->irqdata = counter->usrdata;
 		counter->usrdata = oldirqdata;
 		spin_unlock_irq(&ctx->lock);
-- 
cgit v0.10.2


From 447557ac7ce120306b4a31d6003faef39cb1bf14 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 20:40:18 +0100
Subject: perf counters: update docs

Impact: update docs

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
index 19033a0..fddd321 100644
--- a/Documentation/perf-counters.txt
+++ b/Documentation/perf-counters.txt
@@ -10,8 +10,8 @@ trigger interrupts when a threshold number of events have passed - and can
 thus be used to profile the code that runs on that CPU.
 
 The Linux Performance Counter subsystem provides an abstraction of these
-hardware capabilities. It provides per task and per CPU counters, and
-it provides event capabilities on top of those.
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.
 
 Performance counters are accessed via special file descriptors.
 There's one file descriptor per virtual counter used.
@@ -19,12 +19,8 @@ There's one file descriptor per virtual counter used.
 The special file descriptor is opened via the perf_counter_open()
 system call:
 
- int
- perf_counter_open(u32 hw_event_type,
-                   u32 hw_event_period,
-                   u32 record_type,
-                   pid_t pid,
-                   int cpu);
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+			     pid_t pid, int cpu, int group_fd);
 
 The syscall returns the new fd. The fd can be used via the normal
 VFS system calls: read() can be used to read the counter, fcntl()
@@ -33,39 +29,78 @@ can be used to set the blocking mode, etc.
 Multiple counters can be kept open at a time, and the counters
 can be poll()ed.
 
-When creating a new counter fd, 'hw_event_type' is one of:
-
- enum hw_event_types {
-	PERF_COUNT_CYCLES,
-	PERF_COUNT_INSTRUCTIONS,
-	PERF_COUNT_CACHE_REFERENCES,
-	PERF_COUNT_CACHE_MISSES,
-	PERF_COUNT_BRANCH_INSTRUCTIONS,
-	PERF_COUNT_BRANCH_MISSES,
- };
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	s64			type;
+
+	u64			irq_period;
+	u32			record_type;
+
+	u32			disabled     :  1, /* off by default */
+				nmi	     :  1, /* NMI sampling   */
+				raw	     :  1, /* raw event type */
+				__reserved_1 : 29;
+
+	u64			__reserved_2;
+};
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	/*
+	 * Future software events:
+	 */
+	/* PERF_COUNT_PAGE_FAULTS	= -3,
+	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+};
 
 These are standardized types of events that work uniformly on all CPUs
 that implements Performance Counters support under Linux. If a CPU is
 not able to count branch-misses, then the system call will return
 -EINVAL.
 
-[ Note: more hw_event_types are supported as well, but they are CPU
-  specific and are enumerated via /sys on a per CPU basis. Raw hw event
-  types can be passed in as negative numbers. For example, to count
-  "External bus cycles while bus lock signal asserted" events on Intel
-  Core CPUs, pass in a -0x4064 event type value. ]
-
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
+More hw_event_types are supported as well, but they are CPU
+specific and are enumerated via /sys on a per CPU basis. Raw hw event
+types can be passed in under hw_event.type if hw_event.raw is 1.
+For example, to count "External bus cycles while bus lock signal asserted"
+events on Intel Core CPUs, pass in a 0x4064 event type value and set
+hw_event.raw to 1.
 
 'record_type' is the type of data that a read() will provide for the
 counter, and it can be one of:
 
-  enum perf_record_type {
-	PERF_RECORD_SIMPLE,
-	PERF_RECORD_IRQ,
-  };
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
 
 a "simple" counter is one that counts hardware events and allows
 them to be read out into a u64 count value. (read() returns 8 on
@@ -76,6 +111,10 @@ the IP of the interrupted context. In this case read() will return
 the 8-byte counter value, plus the Instruction Pointer address of the
 interrupted context.
 
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
 The 'pid' parameter allows the counter to be specific to a task:
 
  pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -92,7 +131,7 @@ CPU:
  cpu >= 0: the counter is restricted to a specific CPU
  cpu == -1: the counter counts on all CPUs
 
-Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
 
 A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
 events of that task and 'follows' that task to whatever CPU the task
@@ -102,3 +141,7 @@ their own tasks.
 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
 all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
 
+Group counters are created by passing in a group_fd of another counter.
+Groups are scheduled at once and can be used with PERF_RECORD_GROUP
+to record multi-dimensional timestamps.
+
-- 
cgit v0.10.2


From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 20:22:35 +0100
Subject: x86: implement atomic64_t on 32-bit

Impact: new API

Implement the atomic64_t APIs on 32-bit as well. Will be used by
the performance counters code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6..9927e01 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+/* An 64bit atomic type */
+
+typedef struct {
+	unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)	{ (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)		((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+	asm volatile(
+
+		LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+		     :		"=A" (old)
+
+		     : [ptr]	"D" (ptr),
+				"A" (old),
+				"b" (ll_low(new)),
+				"c" (ll_high(new))
+
+		     : "memory");
+
+	return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+		 unsigned long long new_val)
+{
+	return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	unsigned long long old_val;
+
+	do {
+		old_val = atomic_read(ptr);
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+	unsigned long long curr_val;
+
+	do {
+		curr_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+	return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val, new_val;
+
+	do {
+		old_val = atomic_read(ptr);
+		new_val = old_val + delta;
+
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+	return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+	return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+	return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+	return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+	atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+	atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+	long long old_val = atomic64_add_return(delta, ptr);
+
+	return old_val < 0;
+}
+
 #include <asm-generic/atomic.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
-- 
cgit v0.10.2


From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Dec 2008 09:00:03 +0100
Subject: perfcounters: restructure x86 counter math

Impact: restructure code

Change counter math from absolute values to clear delta logic.

We try to extract elapsed deltas from the raw hw counter - and put
that into the generic counter.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc18..fe94490 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8d..5afae13 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -54,6 +54,48 @@ const int intel_perfmon_event_map[] =
 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
 /*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	u64 prev_raw_count, new_raw_count, delta;
+
+	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count, so we do that by clipping the delta to 32 bits:
+	 */
+	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	WARN_ON_ONCE((int)delta < 0);
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
+/*
  * Setup the hardware configuration for a given hw_event_type
  */
 static int __hw_perf_counter_init(struct perf_counter *counter)
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if (!hwc->irq_period)
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count	= -(s32)hwc->irq_period;
+	atomic64_set(&hwc->period_left, hwc->irq_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore(u64 ctrl)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+void hw_perf_restore(u64 ctrl)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
 static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	int err;
+
+	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+	WARN_ON_ONCE(err);
 }
 
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+			     struct hw_perf_counter *hwc, int idx)
 {
-	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+	s32 left = atomic64_read(&hwc->period_left);
+	s32 period = hwc->irq_period;
+
+	WARN_ON_ONCE(period <= 0);
+
+	/*
+	 * If we are way outside a reasoable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+	}
 
-	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	WARN_ON_ONCE(left <= 0);
+
+	per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw counter starts counting from this counter offset,
+	 * mark it to be able to extra future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+	wrmsr(hwc->counter_base + idx, -left, 0);
 }
 
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
 static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
-	__hw_perf_counter_set_period(hwc, idx);
-	__x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
-				   struct hw_perf_counter *hwc, int idx)
-{
-	s64 raw = -1;
-	s64 delta;
-
-	/*
-	 * Get the raw hw counter value:
-	 */
-	rdmsrl(hwc->counter_base + idx, raw);
-
-	/*
-	 * Rebase it to zero (it started counting at -irq_period),
-	 * to see the delta since ->prev_count:
-	 */
-	delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
-	atomic64_counter_set(counter, hwc->prev_count + delta);
-
-	/*
-	 * Adjust the ->prev_count offset - if we went beyond
-	 * irq_period of units, then we got an IRQ and the counter
-	 * was set back to -irq_period:
-	 */
-	while (delta >= (s64)hwc->irq_period) {
-		hwc->prev_count += hwc->irq_period;
-		delta -= (s64)hwc->irq_period;
-	}
-
-	/*
-	 * Calculate the next raw counter value we'll write into
-	 * the counter at the next sched-in time:
-	 */
-	delta -= (s64)hwc->irq_period;
-
-	hwc->next_count = (s32)delta;
+	__hw_perf_counter_set_period(counter, hwc, idx);
+	__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
 	if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
-		next_count = per_cpu(prev_next_count[idx], cpu);
+		prev_left = per_cpu(prev_left[idx], cpu);
 
 		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
 		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
-			cpu, idx, next_count);
+		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
 	}
 	local_irq_enable();
 }
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
-	__hw_perf_save_counter(counter, hwc, idx);
-}
 
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned long addr = hwc->counter_base + hwc->idx;
-	s64 offs, val = -1LL;
-	s32 val32;
-
-	/* Careful: NMI might modify the counter offset */
-	do {
-		offs = hwc->prev_count;
-		rdmsrl(addr, val);
-	} while (offs != hwc->prev_count);
-
-	val32 = (s32) val;
-	val = (s64)hwc->irq_period + (s64)val32;
-	atomic64_counter_set(counter, hwc->prev_count + val);
+	/*
+	 * Drain the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
 }
 
 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 }
 
 /*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
  */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
 
 	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
-	__hw_perf_save_counter(counter, hwc, idx);
-	__hw_perf_counter_set_period(hwc, idx);
+	x86_perf_counter_update(counter, hwc, idx);
+	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 static void
 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
-	int bit;
-
-	/*
-	 * Store the counter's own timestamp first:
-	 */
-	perf_store_irq_data(sibling, sibling->hw_event.type);
-	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
 	/*
-	 * Then store sibling timestamps (if any):
+	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-			/*
-			 * When counter was not in the overflow mask, we have to
-			 * read it from hardware. We read it as well, when it
-			 * has not been read yet and clear the bit in the
-			 * status mask.
-			 */
-			bit = counter->hw.idx;
-			if (!test_bit(bit, (unsigned long *) overflown) ||
-			    test_bit(bit, (unsigned long *) status)) {
-				clear_bit(bit, (unsigned long *) status);
-				perf_save_and_restart(counter);
-			}
-		}
+		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
-		perf_store_irq_data(sibling, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
 
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8cb095f..7246028 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -91,14 +91,16 @@ struct perf_counter_hw_event {
  * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	u64				config;
 	unsigned long			config_base;
 	unsigned long			counter_base;
 	int				nmi;
 	unsigned int			idx;
-	u64				prev_count;
+	atomic64_t			prev_count;
 	u64				irq_period;
-	s32				next_count;
+	atomic64_t			period_left;
+#endif
 };
 
 /*
@@ -140,17 +142,15 @@ enum perf_counter_active_state {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
 	enum perf_counter_active_state	state;
-#if BITS_PER_LONG == 64
 	atomic64_t			count;
-#else
-	atomic_t			count32[2];
-#endif
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -172,6 +172,7 @@ struct perf_counter {
 	struct perf_data		*irqdata;
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
+#endif
 };
 
 /**
@@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
-extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
-extern u64 atomic64_counter_read(struct perf_counter *counter);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 559130b..416861c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)	{ }
+void __weak hw_perf_restore(u64 ctrl)		{ }
 void __weak hw_perf_counter_setup(void)		{ }
 
-#if BITS_PER_LONG == 64
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 64 bit version - no complications.
- */
-static inline u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	return (u64) atomic64_read(&counter->count);
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-
-#else
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 32 bit version.
- */
-static u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	u32 cntl, cnth;
-
-	local_irq_disable();
-	do {
-		cnth = atomic_read(&counter->count32[1]);
-		cntl = atomic_read(&counter->count32[0]);
-	} while (cnth != atomic_read(&counter->count32[1]));
-
-	local_irq_enable();
-
-	return cntl | ((u64) cnth) << 32;
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-
-#endif
-
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
-		counter->hw_ops->hw_perf_counter_enable(counter);
 		counter->state = PERF_COUNTER_STATE_ACTIVE;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
+		counter->hw_ops->hw_perf_counter_enable(counter);
 	}
 
 	if (!ctx->task && cpuctx->max_pertask)
@@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 					 __hw_perf_counter_read, counter, 1);
 	}
 
-	return perf_counter_read_safe(counter);
+	return atomic64_read(&counter->count);
 }
 
 /*
@@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
 
-	atomic64_counter_set(counter, cpu_clock(cpu));
+	atomic64_set(&counter->count, cpu_clock(cpu));
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	atomic64_counter_set(counter, current->se.sum_exec_runtime);
+	atomic64_set(&counter->count, current->se.sum_exec_runtime);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
cgit v0.10.2


From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 12 Dec 2008 13:49:45 +0100
Subject: perfcounters: implement "counter inheritance"

Impact: implement new performance feature

Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.

Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).

The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.

The timec.c command utilizes counter inheritance:

  http://redhat.com/~mingo/perfcounters/timec.c

Sample output:

   $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

           163516953 instructions
                2295 cache-misses
             2855182 branch-misses

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7246028..e5d25bf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@ struct perf_counter_hw_event {
 	u64			irq_period;
 	u32			record_type;
 
-	u32			disabled     :  1, /* off by default */
-				nmi	     :  1, /* NMI sampling   */
-				raw	     :  1, /* raw event type */
-				__reserved_1 : 29;
+	u32			disabled     :  1, /* off by default      */
+				nmi	     :  1, /* NMI sampling        */
+				raw	     :  1, /* raw event type      */
+				inherit	     :  1, /* children inherit it */
+				__reserved_1 : 28;
 
 	u64			__reserved_2;
 };
@@ -138,6 +139,8 @@ enum perf_counter_active_state {
 	PERF_COUNTER_STATE_ACTIVE	=  1,
 };
 
+struct file;
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -156,7 +159,10 @@ struct perf_counter {
 
 	struct perf_counter_context	*ctx;
 	struct task_struct		*task;
+	struct file			*filp;
 
+	unsigned int			nr_inherited;
+	struct perf_counter		*parent;
 	/*
 	 * Protect attach/detach:
 	 */
@@ -210,13 +216,16 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void);
 
 #else
 static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace)   { }
+static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline void perf_counter_init_task(struct task_struct *task)	{ }
+static inline void perf_counter_init_task(struct task_struct *child)	{ }
+static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..d336c90 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
 	/*
-	 * This must happen late, after the PID is not
-	 * hashed anymore:
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
 	 */
+	perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861c..f5e81dd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 		list_del_init(&sibling->list_entry);
 		list_add_tail(&sibling->list_entry, &ctx->counter_list);
-		WARN_ON_ONCE(!sibling->group_leader);
-		WARN_ON_ONCE(sibling->group_leader == sibling);
 		sibling->group_leader = sibling;
 	}
 }
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	int cpu = smp_processor_id();
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry)
 		counter->state = PERF_COUNTER_STATE_OFF;
-	}
+
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 }
 
 /*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	spin_lock_init(&ctx->lock);
-	INIT_LIST_HEAD(&ctx->counter_list);
-	ctx->nr_counters	= 0;
-	ctx->task		= task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
-	__perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
  * Cross CPU call to read the hardware counter
  */
 static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 
-		WARN_ON_ONCE(ctx->task);
 		return ctx;
 	}
 
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
-		   struct perf_counter *group_leader)
+		   struct perf_counter *group_leader,
+		   gfp_t gfpflags)
 {
 	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
 
-	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	counter = kzalloc(sizeof(*counter), gfpflags);
 	if (!counter)
 		return NULL;
 
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops) {
+	if (!hw_ops)
 		hw_ops = hw_perf_counter_init(counter);
-	}
 
 	if (!hw_ops) {
 		kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
 	struct perf_counter_context *ctx;
+	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
 	int fput_needed = 0;
+	int fput_needed2 = 0;
 	int ret;
 
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
-	perf_install_in_context(ctx, counter, cpu);
-
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
 	if (ret < 0)
-		goto err_remove_free_put_context;
+		goto err_free_put_context;
+
+	counter_file = fget_light(ret, &fput_needed2);
+	if (!counter_file)
+		goto err_free_put_context;
+
+	counter->filp = counter_file;
+	perf_install_in_context(ctx, counter, cpu);
+
+	fput_light(counter_file, fput_needed2);
 
 out_fput:
 	fput_light(group_file, fput_needed);
 
 	return ret;
 
-err_remove_free_put_context:
-	mutex_lock(&counter->mutex);
-	perf_counter_remove_from_context(counter);
-	mutex_unlock(&counter->mutex);
+err_free_put_context:
 	kfree(counter);
 
 err_put_context:
@@ -1044,6 +1028,186 @@ err_put_context:
 	goto out_fput;
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *child_counter;
+
+	child_counter = perf_counter_alloc(&parent_counter->hw_event,
+					    parent_counter->cpu, NULL,
+					    GFP_ATOMIC);
+	if (!child_counter)
+		return -ENOMEM;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	child_counter->ctx = child_ctx;
+	child_counter->task = child;
+	list_add_counter(child_counter, child_ctx);
+	child_ctx->nr_counters++;
+
+	child_counter->parent = parent_counter;
+	parent_counter->nr_inherited++;
+	/*
+	 * inherit into child's child as well:
+	 */
+	child_counter->hw_event.inherit = 1;
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child counter exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_counter->filp->f_count);
+
+	return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+			 struct perf_counter *child_counter,
+			 struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *parent_counter;
+	u64 parent_val, child_val;
+	u64 perf_flags;
+
+	/*
+	 * Disable and unlink this counter.
+	 *
+	 * Be careful about zapping the list - IRQ/NMI context
+	 * could still be processing it:
+	 */
+	local_irq_disable();
+	perf_flags = hw_perf_save_disable();
+
+	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+	list_del_init(&child_counter->list_entry);
+
+	hw_perf_restore(perf_flags);
+	local_irq_enable();
+
+	parent_counter = child_counter->parent;
+	/*
+	 * It can happen that parent exits first, and has counters
+	 * that are still around due to the child reference. These
+	 * counters need to be zapped - but otherwise linger.
+	 */
+	if (!parent_counter)
+		return;
+
+	parent_val = atomic64_read(&parent_counter->count);
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+
+	fput(parent_counter->filp);
+
+	kfree(child_counter);
+}
+
+/*
+ * When a child task exist, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+	struct perf_counter *child_counter, *tmp;
+	struct perf_counter_context *child_ctx;
+
+	child_ctx = &child->perf_counter_ctx;
+
+	if (likely(!child_ctx->nr_counters))
+		return;
+
+	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+				 list_entry)
+		__perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter *counter, *parent_counter;
+	struct task_struct *parent = current;
+	unsigned long flags;
+
+	child_ctx  =  &child->perf_counter_ctx;
+	parent_ctx = &parent->perf_counter_ctx;
+
+	__perf_counter_init_context(child_ctx, child);
+
+	/*
+	 * This is executed from the parent task context, so inherit
+	 * counters that have been marked for cloning:
+	 */
+
+	if (likely(!parent_ctx->nr_counters))
+		return;
+
+	/*
+	 * Lock the parent list. No need to lock the child - not PID
+	 * hashed yet and not running, so nobody can access it.
+	 */
+	spin_lock_irqsave(&parent_ctx->lock, flags);
+
+	/*
+	 * We dont have to disable NMIs - we are only looking at
+	 * the list, not manipulating it:
+	 */
+	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+		if (!counter->hw_event.inherit || counter->group_leader != counter)
+			continue;
+
+		/*
+		 * Instead of creating recursive hierarchies of counters,
+		 * we link inheritd counters back to the original parent,
+		 * which has a filp for sure, which we use as the reference
+		 * count:
+		 */
+		parent_counter = counter;
+		if (counter->parent)
+			parent_counter = counter->parent;
+
+		if (inherit_counter(parent_counter, parent,
+				  parent_ctx, child, child_ctx))
+			break;
+	}
+
+	spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
 static void __cpuinit perf_counter_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
-- 
cgit v0.10.2


From 8cb391e8786c8072367f0aeb90551903fef074ba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:22:31 +0100
Subject: perfcounters: fix task clock counter

Impact: bugfix

Update the task clock counter to the new math.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f5e81dd..1f81cde 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -850,17 +850,36 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static void task_clock_perf_counter_update(struct perf_counter *counter)
 {
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = current->se.sum_exec_runtime;
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
 }
 
-static void task_clock_perf_counter_disable(struct perf_counter *counter)
+static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
+	task_clock_perf_counter_update(counter);
 }
 
-static void task_clock_perf_counter_read(struct perf_counter *counter)
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->count, current->se.sum_exec_runtime);
+	task_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
cgit v0.10.2


From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:28:33 +0100
Subject: perfcounters: add context switch counter

Impact: add new feature, new sw counter

Add a counter that counts the number of context-switches a task
is doing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e5d25bf..d2a1656 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -53,8 +53,8 @@ enum hw_event_types {
 	/*
 	 * Future software events:
 	 */
-	/* PERF_COUNT_PAGE_FAULTS	= -3,
-	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1f81cde..0928709 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.hw_perf_counter_read		= task_clock_perf_counter_read,
 };
 
+static u64 get_context_switches(void)
+{
+	struct task_struct *curr = current;
+
+	return curr->nvcsw + curr->nivcsw;
+}
+
+static void context_switches_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_context_switches();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void context_switches_perf_counter_read(struct perf_counter *counter)
+{
+	context_switches_perf_counter_update(counter);
+}
+
+static void context_switches_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * ->nvcsw + curr->nivcsw is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void context_switches_perf_counter_disable(struct perf_counter *counter)
+{
+	context_switches_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_context_switches = {
+	.hw_perf_counter_enable		= context_switches_perf_counter_enable,
+	.hw_perf_counter_disable	= context_switches_perf_counter_disable,
+	.hw_perf_counter_read		= context_switches_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_TASK_CLOCK:
 		hw_ops = &perf_ops_task_clock;
 		break;
+	case PERF_COUNT_CONTEXT_SWITCHES:
+		hw_ops = &perf_ops_context_switches;
+		break;
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:34:15 +0100
Subject: perfcounters: add task migrations counter

Impact: add new feature, new sw counter

Add a counter that counts the number of cross-CPU migrations a
task is suffering.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d2a1656..f30486f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -42,6 +42,8 @@ enum hw_event_types {
 	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
 	PERF_COUNT_BRANCH_MISSES	=  5,
 
+	PERF_HW_EVENTS_MAX		=  6,
+
 	/*
 	 * Special "software" counters provided by the kernel, even if
 	 * the hardware does not support performance counters. These
@@ -50,11 +52,11 @@ enum hw_event_types {
 	 */
 	PERF_COUNT_CPU_CLOCK		= -1,
 	PERF_COUNT_TASK_CLOCK		= -2,
-	/*
-	 * Future software events:
-	 */
 	PERF_COUNT_PAGE_FAULTS		= -3,
 	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
 };
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c53027..2e15be8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1014,6 +1014,8 @@ struct sched_entity {
 	u64			last_wakeup;
 	u64			avg_overlap;
 
+	u64			nr_migrations;
+
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
@@ -1029,7 +1031,6 @@ struct sched_entity {
 	u64			exec_max;
 	u64			slice_max;
 
-	u64			nr_migrations;
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0928709..fb11e35 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.hw_perf_counter_read		= context_switches_perf_counter_read,
 };
 
+static inline u64 get_cpu_migrations(void)
+{
+	return current->se.nr_migrations;
+}
+
+static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_cpu_migrations();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+{
+	cpu_migrations_perf_counter_update(counter);
+}
+
+static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * se.nr_migrations is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
+{
+	cpu_migrations_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
+	.hw_perf_counter_enable		= cpu_migrations_perf_counter_enable,
+	.hw_perf_counter_disable	= cpu_migrations_perf_counter_disable,
+	.hw_perf_counter_read		= cpu_migrations_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		hw_ops = &perf_ops_context_switches;
 		break;
+	case PERF_COUNT_CPU_MIGRATIONS:
+		hw_ops = &perf_ops_cpu_migrations;
+		break;
 	default:
 		break;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c3f410..382cfdb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 
-- 
cgit v0.10.2


From e06c61a879910869aa5bf3f8f634abfee1a7bebc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 14:44:31 +0100
Subject: perfcounters: add nr-of-faults counter

Impact: add new feature, new sw counter

Add a counter that counts the number of pagefaults a task
is experiencing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fb11e35..59c52f9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.hw_perf_counter_read		= task_clock_perf_counter_read,
 };
 
+static u64 get_page_faults(void)
+{
+	struct task_struct *curr = current;
+
+	return curr->maj_flt + curr->min_flt;
+}
+
+static void page_faults_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_page_faults();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void page_faults_perf_counter_read(struct perf_counter *counter)
+{
+	page_faults_perf_counter_update(counter);
+}
+
+static void page_faults_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * page-faults is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void page_faults_perf_counter_disable(struct perf_counter *counter)
+{
+	page_faults_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_page_faults = {
+	.hw_perf_counter_enable		= page_faults_perf_counter_enable,
+	.hw_perf_counter_disable	= page_faults_perf_counter_disable,
+	.hw_perf_counter_read		= page_faults_perf_counter_read,
+};
+
 static u64 get_context_switches(void)
 {
 	struct task_struct *curr = current;
@@ -994,6 +1042,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_TASK_CLOCK:
 		hw_ops = &perf_ops_task_clock;
 		break;
+	case PERF_COUNT_PAGE_FAULTS:
+		hw_ops = &perf_ops_page_faults;
+		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		hw_ops = &perf_ops_context_switches;
 		break;
-- 
cgit v0.10.2


From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 18:36:30 +0100
Subject: perfcounters: fix non-intel-perfmon CPUs

Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5afae13..6d30f60 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 void hw_perf_enable_all(void)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
@@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+
 	return ctrl;
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 void hw_perf_restore(u64 ctrl)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
-- 
cgit v0.10.2


From 088e2852c858159d47f71ee8da38e0fb1b21f806 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 20:21:00 +0100
Subject: perfcounters, x86: fix sw counters on non-PMC CPUs

Make perf_max_counters default to at least 1 - this allows the sw
counters to be used.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59c52f9..539fa82 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -25,7 +25,7 @@
  */
 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 
-int perf_max_counters __read_mostly;
+int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
-- 
cgit v0.10.2


From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 21:58:46 +0100
Subject: perfcounters: fix lapic initialization

Fix non-working NMI sampling in certain bootup scenarios.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6d30f60..8a154bd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
 
+	perf_counters_initialized = true;
+
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-
-	perf_counters_initialized = true;
 }
 
 static void x86_perf_counter_read(struct perf_counter *counter)
-- 
cgit v0.10.2


From 0cc0c027d4e028632933f1be2dc4cd730358183b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 23:20:36 +0100
Subject: perfcounters: release CPU context when exiting task counters

If counters are exiting via do_exit() not via filp close, then
the CPU context needs to be released - otherwise future percpu
counter creations might fail.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 539fa82..16396e9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1273,8 +1273,19 @@ __perf_counter_exit_task(struct task_struct *child,
 	local_irq_disable();
 	perf_flags = hw_perf_save_disable();
 
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = &__get_cpu_var(perf_cpu_context);
+
 		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+		child_counter->oncpu = -1;
+
+		cpuctx->active_oncpu--;
+		child_ctx->nr_active--;
+	}
+
 	list_del_init(&child_counter->list_entry);
 
 	hw_perf_restore(perf_flags);
@@ -1539,4 +1550,3 @@ static int __init perf_counter_sysfs_init(void)
 				  &perfclass_attr_group);
 }
 device_initcall(perf_counter_sysfs_init);
-
-- 
cgit v0.10.2


From f65cb45cba63f249458b669aa67069eabc37b2f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 16 Dec 2008 13:40:44 +0100
Subject: perfcounters: flush on setuid exec

Pavel Machek pointed out that performance counters should be flushed
when crossing protection domains on setuid execution.

Reported-by: Pavel Machek <pavel@suse.cz>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/exec.c b/fs/exec.c
index ec5df9a..d5165d8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -1017,6 +1018,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_counter_exit_task(current);
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
-- 
cgit v0.10.2


From a86ed50859d65a08beec9474df97b88438a996df Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 00:43:10 +0100
Subject: perfcounters: use hw_event.disable flag

Impact: implement default-off counters

Make sure that counters that are created with counter.hw_event.disabled=1,
get created in disabled state.

They can be enabled via:

        prctl(PR_TASK_PERF_COUNTERS_ENABLE);

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 16396e9..5431e79 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1093,6 +1093,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
+	if (hw_event->disabled)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-- 
cgit v0.10.2


From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001
From: Jaswinder Singh <jaswinder@infradead.org>
Date: Fri, 19 Dec 2008 22:37:58 +0530
Subject: x86: perf_counter.c intel_perfmon_event_map and
 max_intel_perfmon_events should be static

Impact: cleanup, avoid sparse warnings, reduce kernel size a bit

Fixes these sparse warnings:
 arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh <jaswinder@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8a154bd..bdbdb56 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,7 +41,7 @@ struct cpu_hw_counters {
  */
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
-const int intel_perfmon_event_map[] =
+static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CYCLES]			= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
 /*
  * Propagate counter elapsed time into the generic counter.
-- 
cgit v0.10.2


From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:04:16 +0100
Subject: perfcounters: remove warnings

Impact: remove debug checks

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index bdbdb56..89fad5d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter,
 {
 	u64 prev_raw_count, new_raw_count, delta;
 
-	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
 	/*
 	 * Careful: an NMI might modify the previous counter value.
 	 *
@@ -89,7 +88,6 @@ again:
 	 * of the count, so we do that by clipping the delta to 32 bits:
 	 */
 	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
-	WARN_ON_ONCE((int)delta < 0);
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
@@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	int err;
 
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
-	WARN_ON_ONCE(err);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
@@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	s32 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
 
-	WARN_ON_ONCE(period <= 0);
-
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
 	 */
@@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 		atomic64_set(&hwc->period_left, left);
 	}
 
-	WARN_ON_ONCE(left <= 0);
-
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f30486f..d038450 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -218,8 +218,6 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
-extern void
-perf_counter_show(struct perf_counter *counter, char *str, int trace);
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
@@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void);
 
 #else
 static inline void
-perf_counter_show(struct perf_counter *counter, char *str, int trace)   { }
-static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5431e79..aab6c12 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
-- 
cgit v0.10.2


From 7995888fcb0246543ee8027bf2835a250ba8c925 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 08:54:56 +0100
Subject: perfcounters: tweak group scheduling

Impact: schedule in groups atomically

If there are multiple groups in a task, make sure they are scheduled
in and out atomically.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index aab6c12..f8a4d9a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -367,21 +367,26 @@ counter_sched_in(struct perf_counter *counter,
 	ctx->nr_active++;
 }
 
-static void
+static int
 group_sched_in(struct perf_counter *group_counter,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx,
 	       int cpu)
 {
 	struct perf_counter *counter;
+	int was_group = 0;
 
 	counter_sched_in(group_counter, cpuctx, ctx, cpu);
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 		counter_sched_in(counter, cpuctx, ctx, cpu);
+		was_group = 1;
+	}
+
+	return was_group;
 }
 
 /*
@@ -416,7 +421,12 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		group_sched_in(counter, cpuctx, ctx, cpu);
+		/*
+		 * If we scheduled in a group atomically and
+		 * exclusively, break out:
+		 */
+		if (group_sched_in(counter, cpuctx, ctx, cpu))
+			break;
 	}
 	spin_unlock(&ctx->lock);
 
-- 
cgit v0.10.2


From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:02:19 +0100
Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h

Impact: rename include file

We'll be providing an asm/perf_counter.h to the generic perfcounter code,
so use the already existing x86 file for this purpose and rename it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index 71598a9..0000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
-
-union cpuid10_eax {
-	struct {
-		unsigned int version_id:8;
-		unsigned int num_counters:8;
-		unsigned int bit_width:8;
-		unsigned int mask_length:8;
-	} split;
-	unsigned int full;
-};
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(int nmi);
-#else
-static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(int nmi)	{ }
-#endif
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 0000000..9dadce1
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
+
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(int nmi)	{ }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 0579ec1..4f859ac 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,7 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4461011..ad331b4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,7 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 89fad5d..a4a3a09 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,7 +16,7 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b..d6f5b9f 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
 #include <linux/kprobes.h>
 
 #include <asm/apic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c7..07c9145 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
-- 
cgit v0.10.2


From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:09:13 +0100
Subject: x86, perfcounters: prepare for fixed-mode PMCs

Impact: refactor the x86 code for fixed-mode PMCs

Extend the data structures and rename the existing facilities
to allow for a 'generic' versus 'fixed' counter distinction.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 9dadce1..dd5a4a5 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -1,6 +1,13 @@
 #ifndef _ASM_X86_PERF_COUNTER_H
 #define _ASM_X86_PERF_COUNTER_H
 
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_FIXED					3
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -20,6 +27,10 @@
 
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a4a3a09..fc3af86 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
-/* No support for fixed function counters yet */
-
-#define MAX_HW_COUNTERS		8
-
 struct cpu_hw_counters {
-	struct perf_counter	*counters[MAX_HW_COUNTERS];
-	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
+
+	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
+	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
 };
 
 /*
@@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl)
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline void
-__x86_perf_counter_disable(struct perf_counter *counter,
+__pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
@@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static void
-__x86_perf_counter_enable(struct perf_counter *counter,
+__pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
@@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter,
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void x86_perf_counter_enable(struct perf_counter *counter)
+static void pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->counters[idx] = counter;
+	cpuc->generic[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__x86_perf_counter_enable(counter, hwc, idx);
+	__pmc_generic_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
@@ -301,16 +300,16 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void x86_perf_counter_disable(struct perf_counter *counter)
+static void pmc_generic_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
+	cpuc->generic[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(counter, hwc, idx);
+		__pmc_generic_enable(counter, hwc, idx);
 }
 
 static void
@@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -412,7 +411,7 @@ again:
 		}
 		/*
 		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these counters as
+		 * do a task wakeup - but we mark these generic as
 		 * wakeup_pending and initate a wakeup callback:
 		 */
 		if (nmi) {
@@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		if (!counter)
 			continue;
@@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > MAX_HW_COUNTERS) {
-		nr_hw_counters = MAX_HW_COUNTERS;
+	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
+		nr_hw_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, MAX_HW_COUNTERS);
+			nr_hw_counters, X86_PMC_MAX_GENERIC);
 	}
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
@@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_perf_counter_read(struct perf_counter *counter)
+static void pmc_generic_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= x86_perf_counter_enable,
-	.hw_perf_counter_disable	= x86_perf_counter_disable,
-	.hw_perf_counter_read		= x86_perf_counter_read,
+	.hw_perf_counter_enable		= pmc_generic_enable,
+	.hw_perf_counter_disable	= pmc_generic_disable,
+	.hw_perf_counter_read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d038450..984da54 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,6 +14,7 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
+#include <asm/perf_counter.h>
 
 #include <linux/list.h>
 #include <linux/mutex.h>
-- 
cgit v0.10.2


From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 10:51:15 +0100
Subject: perfcounters: add fixed-mode PMC enumeration

Enumerate fixed-mode PMCs based on CPUID, and feed that into the
perfcounter code.

Does not use fixed-mode PMCs yet.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index dd5a4a5..945a315 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -41,6 +41,29 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+union cpuid10_edx {
+	struct {
+		unsigned int num_counters_fixed:4;
+		unsigned int reserved:28;
+	} split;
+	unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc3af86..2fca50c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
+static int nr_hw_counters_fixed __read_mostly;
+
 struct cpu_hw_counters {
 	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
@@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 void __init init_hw_perf_counters(void)
 {
 	union cpuid10_eax eax;
-	unsigned int unused;
 	unsigned int ebx;
+	unsigned int unused;
+	union cpuid10_edx edx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void)
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
 
-	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
 	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
 		nr_hw_counters = X86_PMC_MAX_GENERIC;
@@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
 
-	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
+	nr_hw_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+	}
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
 
 	perf_counters_initialized = true;
 
-- 
cgit v0.10.2


From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 13:09:20 +0100
Subject: x86, perfcounters: refactor code for fixed-function PMCs

Impact: clean up

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 945a315..13745de 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -8,6 +8,10 @@
 #define X86_PMC_MAX_GENERIC					8
 #define X86_PMC_MAX_FIXED					3
 
+#define X86_PMC_IDX_GENERIC				        0
+#define X86_PMC_IDX_FIXED				       32
+#define X86_PMC_IDX_MAX					       64
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -54,6 +58,15 @@ union cpuid10_edx {
  * Fixed-purpose performance counters:
  */
 
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
 
@@ -63,7 +76,6 @@ union cpuid10_edx {
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 
-
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2fca50c..358af52 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly;
 /*
  * Number of (generic) HW counters:
  */
-static int nr_hw_counters __read_mostly;
-static u32 perf_counter_mask __read_mostly;
+static int nr_counters_generic __read_mostly;
+static u64 perf_counter_mask __read_mostly;
 
-static int nr_hw_counters_fixed __read_mostly;
+static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
-	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
-
-	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
-	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
+	struct perf_counter	*counters[X86_PMC_IDX_MAX];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
 /*
@@ -159,7 +156,7 @@ void hw_perf_enable_all(void)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
 }
 
 u64 hw_perf_save_disable(void)
@@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void)
 		return 0;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
@@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+static int fixed_mode_idx(struct hw_perf_counter *hwc)
+{
+	return -1;
+}
+
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
@@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->generic[idx] = counter;
+	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -270,7 +272,7 @@ void perf_counter_print_debug(void)
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
-	if (!nr_hw_counters)
+	if (!nr_counters_generic)
 		return;
 
 	local_irq_disable();
@@ -286,7 +288,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
-	for (idx = 0; idx < nr_hw_counters; idx++) {
+	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
@@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->generic[idx] = NULL;
+	cpuc->counters[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -424,7 +426,7 @@ again:
 		}
 	}
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -436,7 +438,7 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off:
 	 */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
@@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		if (!counter)
 			continue;
@@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void)
 
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
-	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
-		nr_hw_counters = X86_PMC_MAX_GENERIC;
+	nr_counters_generic = eax.split.num_counters;
+	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
+		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, X86_PMC_MAX_GENERIC);
+			nr_counters_generic, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_hw_counters) - 1;
-	perf_max_counters = nr_hw_counters;
+	perf_counter_mask = (1 << nr_counters_generic) - 1;
+	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
-	nr_hw_counters_fixed = edx.split.num_counters_fixed;
-	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+
+	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
+	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
cgit v0.10.2


From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:20:28 +0100
Subject: perfcounters: hw ops rename

Impact: rename field names

Shorten them.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 358af52..b675571 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= pmc_generic_enable,
-	.hw_perf_counter_disable	= pmc_generic_disable,
-	.hw_perf_counter_read		= pmc_generic_read,
+	.enable		= pmc_generic_enable,
+	.disable	= pmc_generic_disable,
+	.read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 984da54..48f76d2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,9 +128,9 @@ struct perf_counter;
  * struct hw_perf_counter_ops - performance counter hw ops
  */
 struct hw_perf_counter_ops {
-	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
-	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
-	void (*hw_perf_counter_read)	(struct perf_counter *counter);
+	void (*enable)			(struct perf_counter *counter);
+	void (*disable)			(struct perf_counter *counter);
+	void (*read)			(struct perf_counter *counter);
 };
 
 /**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f8a4d9a..961d651 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock_irqsave(&ctx->lock, flags);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->hw_ops->hw_perf_counter_disable(counter);
+		counter->hw_ops->disable(counter);
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
@@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info)
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
-		counter->hw_ops->hw_perf_counter_enable(counter);
+		counter->hw_ops->enable(counter);
 	}
 
 	if (!ctx->task && cpuctx->max_pertask)
@@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
-	counter->hw_ops->hw_perf_counter_disable(counter);
+	counter->hw_ops->disable(counter);
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->oncpu = -1;
 
@@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter,
  *
  * We stop each counter and update the counter value in counter->count.
  *
- * This does not protect us against NMI, but hw_perf_counter_disable()
+ * This does not protect us against NMI, but disable()
  * sets the disabled bit in the control field of counter _before_
  * accessing the counter control register. If a NMI hits, then it will
  * not restart the counter.
@@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter,
 	if (counter->state == PERF_COUNTER_STATE_OFF)
 		return;
 
-	counter->hw_ops->hw_perf_counter_enable(counter);
+	counter->hw_ops->enable(counter);
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
@@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter,
  *
  * We restore the counter value and then enable it.
  *
- * This does not protect us against NMI, but hw_perf_counter_enable()
+ * This does not protect us against NMI, but enable()
  * sets the enabled bit in the control field of counter _before_
  * accessing the counter control register. If a NMI hits, then it will
  * keep the counter running.
@@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 /*
  * Cross CPU call to read the hardware counter
  */
-static void __hw_perf_counter_read(void *info)
+static void __read(void *info)
 {
 	struct perf_counter *counter = info;
 
-	counter->hw_ops->hw_perf_counter_read(counter);
+	counter->hw_ops->read(counter);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
-					 __hw_perf_counter_read, counter, 1);
+					 __read, counter, 1);
 	}
 
 	return atomic64_read(&counter->count);
@@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
-	.hw_perf_counter_enable		= cpu_clock_perf_counter_enable,
-	.hw_perf_counter_disable	= cpu_clock_perf_counter_disable,
-	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
+	.enable		= cpu_clock_perf_counter_enable,
+	.disable	= cpu_clock_perf_counter_disable,
+	.read		= cpu_clock_perf_counter_read,
 };
 
 static void task_clock_perf_counter_update(struct perf_counter *counter)
@@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-	.hw_perf_counter_enable		= task_clock_perf_counter_enable,
-	.hw_perf_counter_disable	= task_clock_perf_counter_disable,
-	.hw_perf_counter_read		= task_clock_perf_counter_read,
+	.enable		= task_clock_perf_counter_enable,
+	.disable	= task_clock_perf_counter_disable,
+	.read		= task_clock_perf_counter_read,
 };
 
 static u64 get_page_faults(void)
@@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.hw_perf_counter_enable		= page_faults_perf_counter_enable,
-	.hw_perf_counter_disable	= page_faults_perf_counter_disable,
-	.hw_perf_counter_read		= page_faults_perf_counter_read,
+	.enable		= page_faults_perf_counter_enable,
+	.disable	= page_faults_perf_counter_disable,
+	.read		= page_faults_perf_counter_read,
 };
 
 static u64 get_context_switches(void)
@@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_context_switches = {
-	.hw_perf_counter_enable		= context_switches_perf_counter_enable,
-	.hw_perf_counter_disable	= context_switches_perf_counter_disable,
-	.hw_perf_counter_read		= context_switches_perf_counter_read,
+	.enable		= context_switches_perf_counter_enable,
+	.disable	= context_switches_perf_counter_disable,
+	.read		= context_switches_perf_counter_read,
 };
 
 static inline u64 get_cpu_migrations(void)
@@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
-	.hw_perf_counter_enable		= cpu_migrations_perf_counter_enable,
-	.hw_perf_counter_disable	= cpu_migrations_perf_counter_disable,
-	.hw_perf_counter_read		= cpu_migrations_perf_counter_read,
+	.enable		= cpu_migrations_perf_counter_enable,
+	.disable	= cpu_migrations_perf_counter_disable,
+	.read		= cpu_migrations_perf_counter_read,
 };
 
 static const struct hw_perf_counter_ops *
@@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+		child_counter->hw_ops->disable(child_counter);
 		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
 		child_counter->oncpu = -1;
 
-- 
cgit v0.10.2


From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:10:57 +0100
Subject: perfcounters: fix task clock counter

Impact: fix per task clock counter precision

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145ca..1b2e324 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 	return sum;
 }
 
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
+extern void curr_rq_lock_irq_save(unsigned long *flags);
+extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
+
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/kernel/exit.c b/kernel/exit.c
index d336c90..244edfd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
@@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	/*
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
+	 */
+	perf_counter_exit_task(tsk);
+
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 961d651..f1110ac 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
 
 /*
@@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->disable(counter);
@@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 
@@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info)
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 /*
@@ -438,15 +443,19 @@ int perf_counter_task_disable(void)
 	struct task_struct *curr = current;
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	unsigned long flags;
 	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
+	/* force the update of the task clock: */
+	__task_delta_exec(curr, 1);
+
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -463,7 +472,7 @@ int perf_counter_task_disable(void)
 
 	spin_unlock(&ctx->lock);
 
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	return 0;
 }
@@ -473,15 +482,19 @@ int perf_counter_task_enable(void)
 	struct task_struct *curr = current;
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	unsigned long flags;
 	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
+	/* force the update of the task clock: */
+	__task_delta_exec(curr, 1);
+
 	spin_lock(&ctx->lock);
 
 	/*
@@ -493,6 +506,7 @@ int perf_counter_task_enable(void)
 		if (counter->state != PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -500,7 +514,7 @@ int perf_counter_task_enable(void)
 
 	perf_counter_task_sched_in(curr, cpu);
 
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	return 0;
 }
@@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	unsigned long flags;
 
+	curr_rq_lock_irq_save(&flags);
 	counter->hw_ops->read(counter);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.read		= cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+/*
+ * Called from within the scheduler:
+ */
+static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 {
-	u64 prev, now;
+	struct task_struct *curr = counter->task;
+	u64 delta;
+
+	WARN_ON_ONCE(counter->task != current);
+
+	delta = __task_delta_exec(curr, update);
+
+	return curr->se.sum_exec_runtime + delta;
+}
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+	u64 prev;
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = current->se.sum_exec_runtime;
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter);
+	u64 now = task_clock_perf_counter_val(counter, 1);
+
+	task_clock_perf_counter_update(counter, now);
 }
 
 static void task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+	u64 now = task_clock_perf_counter_val(counter, 0);
+
+	atomic64_set(&counter->hw.prev_count, now);
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter);
+	u64 now = task_clock_perf_counter_val(counter, 0);
+
+	task_clock_perf_counter_update(counter, now);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 	u64 parent_val, child_val;
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	 * Be careful about zapping the list - IRQ/NMI context
 	 * could still be processing it:
 	 */
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	perf_flags = hw_perf_save_disable();
 
 	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
@@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	list_del_init(&child_counter->list_entry);
 
 	hw_perf_restore(perf_flags);
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	parent_counter = child_counter->parent;
 	/*
diff --git a/kernel/sched.c b/kernel/sched.c
index 382cfdb..4d84ff4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_save(*flags);
+	rq = cpu_rq(smp_processor_id());
+	spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+	__releases(rq->lock)
+{
+	struct rq *rq;
+
+	rq = cpu_rq(smp_processor_id());
+	spin_unlock(&rq->lock);
+	local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
@@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	fire_sched_out_preempt_notifiers(prev, next);
-	perf_counter_task_sched_out(prev, cpu_of(rq));
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -4093,6 +4112,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+	s64 delta_exec;
+	struct rq *rq;
+
+	rq = task_rq(p);
+	WARN_ON_ONCE(!runqueue_is_locked());
+	WARN_ON_ONCE(!task_current(rq, p));
+
+	if (update)
+		update_rq_clock(rq);
+
+	delta_exec = rq->clock - p->se.exec_start;
+
+	WARN_ON_ONCE(delta_exec < 0);
+
+	return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4316,13 +4358,13 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
-	perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4512,6 +4554,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
-- 
cgit v0.10.2


From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 10:20:42 +0100
Subject: perfcounters: pull inherited counters

Change counter inheritance from a 'push' to a 'pull' model: instead of
child tasks pushing their final counts to the parent, reuse the wait4
infrastructure to pull counters as child tasks are exit-processed,
much like how cutime/cstime is collected.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd890..54fa2fa 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -113,6 +113,14 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET  CAP_INIT_EFF_SET
 #endif
 
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)					\
+	.perf_counter_ctx.counter_list =				\
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +188,7 @@ extern struct group_info init_groups;
 	INIT_IDS							\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
+	INIT_PERF_COUNTERS(tsk)						\
 }
 
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 244edfd..101b7ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
+#ifdef CONFIG_PERF_COUNTERS
+	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+#endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
@@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
-
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
@@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-- 
cgit v0.10.2


From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 15:07:49 +0100
Subject: perfcounters: fix init context lock

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 54fa2fa..467cff5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -116,7 +116,9 @@ extern struct group_info init_groups;
 #ifdef CONFIG_PERF_COUNTERS
 # define INIT_PERF_COUNTERS(tsk)					\
 	.perf_counter_ctx.counter_list =				\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
+	.perf_counter_ctx.lock =					\
+		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 #else
 # define INIT_PERF_COUNTERS(tsk)
 #endif
-- 
cgit v0.10.2


From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 13:50:42 +0100
Subject: perfcounters: enable lowlevel pmc code to schedule counters

Allow lowlevel ->enable() op to return an error if a counter can not be
added. This can be used to handle counter constraints.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b675571..74090a3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void pmc_generic_enable(struct perf_counter *counter)
+static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+		if (idx == nr_counters_generic)
+			return -EAGAIN;
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
+
+	return 0;
 }
 
 void perf_counter_print_debug(void)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 48f76d2..53af11d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,7 +128,7 @@ struct perf_counter;
  * struct hw_perf_counter_ops - performance counter hw ops
  */
 struct hw_perf_counter_ops {
-	void (*enable)			(struct perf_counter *counter);
+	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f1110ac..2e73929 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = NULL;
 }
 
-static void
+static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
 	if (counter->state == PERF_COUNTER_STATE_OFF)
-		return;
+		return 0;
+
+	if (counter->hw_ops->enable(counter))
+		return -EAGAIN;
 
-	counter->hw_ops->enable(counter);
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
 	cpuctx->active_oncpu++;
 	ctx->nr_active++;
+
+	return 0;
 }
 
 static int
@@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter,
 	       struct perf_counter_context *ctx,
 	       int cpu)
 {
-	struct perf_counter *counter;
-	int was_group = 0;
+	struct perf_counter *counter, *partial_group;
+	int ret = 0;
 
-	counter_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+		return -EAGAIN;
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter_sched_in(counter, cpuctx, ctx, cpu);
-		was_group = 1;
+		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+			partial_group = counter;
+			goto group_error;
+		}
+		ret = -EAGAIN;
 	}
 
-	return was_group;
+	return ret;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter == partial_group)
+			break;
+		counter_sched_out(counter, cpuctx, ctx);
+	}
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	return -EAGAIN;
 }
 
 /*
@@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 
 	spin_lock(&ctx->lock);
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (ctx->nr_active == cpuctx->max_pertask)
-			break;
-
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
@@ -856,8 +875,9 @@ static const struct file_operations perf_fops = {
 	.poll			= perf_poll,
 };
 
-static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
+	return 0;
 }
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
@@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 	task_clock_perf_counter_update(counter, now);
 }
 
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	u64 now = task_clock_perf_counter_val(counter, 0);
 
 	atomic64_set(&counter->hw.prev_count, now);
+
+	return 0;
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
@@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 	page_faults_perf_counter_update(counter);
 }
 
-static void page_faults_perf_counter_enable(struct perf_counter *counter)
+static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * page-faults is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void page_faults_perf_counter_disable(struct perf_counter *counter)
@@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 	context_switches_perf_counter_update(counter);
 }
 
-static void context_switches_perf_counter_enable(struct perf_counter *counter)
+static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * ->nvcsw + curr->nivcsw is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void context_switches_perf_counter_disable(struct perf_counter *counter)
@@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 	cpu_migrations_perf_counter_update(counter);
 }
 
-static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * se.nr_migrations is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-- 
cgit v0.10.2


From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:28:12 +0100
Subject: x86, perfcounters: print out the ->used bitmask

Impact: extend debug printouts

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 74090a3..f3359c2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		if (idx == nr_counters_generic)
 			return -EAGAIN;
+
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
 	if (!nr_counters_generic)
@@ -282,6 +284,7 @@ void perf_counter_print_debug(void)
 	local_irq_disable();
 
 	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -291,6 +294,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-- 
cgit v0.10.2


From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:29:25 +0100
Subject: perfcounters: remove ->nr_inherited

Impact: remove dead code

nr_inherited was not maintained correctly (not decremented) - and also
not used - remove it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 53af11d..1ea08e9 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -164,7 +164,6 @@ struct perf_counter {
 	struct task_struct		*task;
 	struct file			*filp;
 
-	unsigned int			nr_inherited;
 	struct perf_counter		*parent;
 	/*
 	 * Protect attach/detach:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2e73929..48e1dbc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	child_ctx->nr_counters++;
 
 	child_counter->parent = parent_counter;
-	parent_counter->nr_inherited++;
 	/*
 	 * inherit into child's child as well:
 	 */
-- 
cgit v0.10.2


From 235c7fc7c500e4fd1700c4ad01b5612bcdc1b449 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 14:43:25 +0100
Subject: perfcounters: generalize the counter scheduler

Impact: clean up and refactor code

refactor the counter scheduler: separate out in/out functions and
introduce a counter-rotation function as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 48e1dbc..d7a79f3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock(&ctx->lock);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->hw_ops->disable(counter);
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->hw_ops->disable(counter);
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
 		counter->task = NULL;
+		counter->oncpu = -1;
 	}
 	ctx->nr_counters--;
 
@@ -192,8 +193,36 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
+static int
+counter_sched_in(struct perf_counter *counter,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_counter_context *ctx,
+		 int cpu)
+{
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	/*
+	 * The new state must be visible before we turn it on in the hardware:
+	 */
+	smp_wmb();
+
+	if (counter->hw_ops->enable(counter)) {
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->oncpu = -1;
+		return -EAGAIN;
+	}
+
+	cpuctx->active_oncpu++;
+	ctx->nr_active++;
+
+	return 0;
+}
+
 /*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
 {
@@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	perf_flags = hw_perf_save_disable();
-	list_add_counter(counter, ctx);
-	hw_perf_restore(perf_flags);
 
+	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	if (cpuctx->active_oncpu < perf_max_counters) {
-		counter->state = PERF_COUNTER_STATE_ACTIVE;
-		counter->oncpu = cpu;
-		ctx->nr_active++;
-		cpuctx->active_oncpu++;
-		counter->hw_ops->enable(counter);
-	}
+	counter_sched_in(counter, cpuctx, ctx, cpu);
 
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
+	hw_perf_restore(perf_flags);
+
 	spin_unlock(&ctx->lock);
 	curr_rq_unlock_irq_restore(&flags);
 }
@@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
-	counter->hw_ops->disable(counter);
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
 	cpuctx->active_oncpu--;
@@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter,
 		counter_sched_out(counter, cpuctx, ctx);
 }
 
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+			      struct perf_cpu_context *cpuctx)
+{
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	spin_lock(&ctx->lock);
+	if (ctx->nr_active) {
+		list_for_each_entry(counter, &ctx->counter_list, list_entry)
+			group_sched_out(counter, cpuctx, ctx);
+	}
+	spin_unlock(&ctx->lock);
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &task->perf_counter_ctx;
-	struct perf_counter *counter;
 
 	if (likely(!cpuctx->task_ctx))
 		return;
 
-	spin_lock(&ctx->lock);
-	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->counter_list, list_entry)
-			group_sched_out(counter, cpuctx, ctx);
-	}
-	spin_unlock(&ctx->lock);
+	__perf_counter_sched_out(ctx, cpuctx);
+
 	cpuctx->task_ctx = NULL;
 }
 
-static int
-counter_sched_in(struct perf_counter *counter,
-		 struct perf_cpu_context *cpuctx,
-		 struct perf_counter_context *ctx,
-		 int cpu)
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	if (counter->hw_ops->enable(counter))
-		return -EAGAIN;
-
-	counter->state = PERF_COUNTER_STATE_ACTIVE;
-	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
-
-	cpuctx->active_oncpu++;
-	ctx->nr_active++;
-
-	return 0;
+	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
 static int
@@ -416,21 +435,10 @@ group_error:
 	return -EAGAIN;
 }
 
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+			struct perf_cpu_context *cpuctx, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
 	struct perf_counter *counter;
 
 	if (likely(!ctx->nr_counters))
@@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 			break;
 	}
 	spin_unlock(&ctx->lock);
+}
 
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
 
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
 int perf_counter_task_disable(void)
 {
 	struct task_struct *curr = current;
@@ -514,6 +547,8 @@ int perf_counter_task_enable(void)
 	/* force the update of the task clock: */
 	__task_delta_exec(curr, 1);
 
+	perf_counter_task_sched_out(curr, cpu);
+
 	spin_lock(&ctx->lock);
 
 	/*
@@ -538,19 +573,18 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
 {
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	u64 perf_flags;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx->nr_counters)
 		return;
 
-	perf_counter_task_sched_out(curr, cpu);
-
 	spin_lock(&ctx->lock);
-
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
@@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	const int rotate_percpu = 0;
+
+	if (rotate_percpu)
+		perf_counter_cpu_sched_out(cpuctx);
+	perf_counter_task_sched_out(curr, cpu);
 
+	if (rotate_percpu)
+		rotate_ctx(&cpuctx->ctx);
+	rotate_ctx(ctx);
+
+	if (rotate_percpu)
+		perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
 }
 
@@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 	struct task_struct *curr = counter->task;
 	u64 delta;
 
-	WARN_ON_ONCE(counter->task != current);
-
 	delta = __task_delta_exec(curr, update);
 
 	return curr->se.sum_exec_runtime + delta;
@@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
@@ -1331,35 +1381,49 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 	u64 parent_val, child_val;
-	unsigned long flags;
-	u64 perf_flags;
 
 	/*
-	 * Disable and unlink this counter.
-	 *
-	 * Be careful about zapping the list - IRQ/NMI context
-	 * could still be processing it:
+	 * If we do not self-reap then we have to wait for the
+	 * child task to unschedule (it will happen for sure),
+	 * so that its counter is at its final count. (This
+	 * condition triggers rarely - child tasks usually get
+	 * off their CPU before the parent has a chance to
+	 * get this far into the reaping action)
 	 */
-	curr_rq_lock_irq_save(&flags);
-	perf_flags = hw_perf_save_disable();
-
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+	if (child != current) {
+		wait_task_inactive(child, 0);
+		list_del_init(&child_counter->list_entry);
+	} else {
 		struct perf_cpu_context *cpuctx;
+		unsigned long flags;
+		u64 perf_flags;
+
+		/*
+		 * Disable and unlink this counter.
+		 *
+		 * Be careful about zapping the list - IRQ/NMI context
+		 * could still be processing it:
+		 */
+		curr_rq_lock_irq_save(&flags);
+		perf_flags = hw_perf_save_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		child_counter->hw_ops->disable(child_counter);
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-		child_counter->oncpu = -1;
+		if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+			child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+			child_counter->hw_ops->disable(child_counter);
+			cpuctx->active_oncpu--;
+			child_ctx->nr_active--;
+			child_counter->oncpu = -1;
+		}
 
-		cpuctx->active_oncpu--;
-		child_ctx->nr_active--;
-	}
+		list_del_init(&child_counter->list_entry);
 
-	list_del_init(&child_counter->list_entry);
+		child_ctx->nr_counters--;
 
-	hw_perf_restore(perf_flags);
-	curr_rq_unlock_irq_restore(&flags);
+		hw_perf_restore(perf_flags);
+		curr_rq_unlock_irq_restore(&flags);
+	}
 
 	parent_counter = child_counter->parent;
 	/*
-- 
cgit v0.10.2


From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:17:29 +0100
Subject: perfcounters: add PERF_COUNT_BUS_CYCLES

Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref
on x86. (which is a good enough approximation)

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f3359c2..86b2fdd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 static const int intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
   [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
   [PERF_COUNT_CACHE_MISSES]		= 0x412e,
   [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
 static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1ea08e9..ec77d16 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -36,14 +36,15 @@ enum hw_event_types {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_CPU_CYCLES		=  0,
 	PERF_COUNT_INSTRUCTIONS		=  1,
 	PERF_COUNT_CACHE_REFERENCES	=  2,
 	PERF_COUNT_CACHE_MISSES		=  3,
 	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
 	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
 
-	PERF_HW_EVENTS_MAX		=  6,
+	PERF_HW_EVENTS_MAX		=  7,
 
 	/*
 	 * Special "software" counters provided by the kernel, even if
-- 
cgit v0.10.2


From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 22 Dec 2008 11:10:42 +0100
Subject: x86, perfcounters: add support for fixed-function pmcs

Impact: extend performance counter support on x86 Intel CPUs

Modern Intel CPUs have 3 "fixed-function" performance counters, which
count these hardware events:

    Instr_Retired.Any
    CPU_CLK_Unhalted.Core
    CPU_CLK_Unhalted.Ref

Add support for them to the performance counters subsystem.

Their use is transparent to user-space: the counter scheduler is
extended to automatically recognize the cases where a fixed-function
PMC can be utilized instead of a generic PMC. In such cases the
generic PMC is kept available for more counters.

The above fixed-function events map to these generic counter hw events:

        PERF_COUNT_INSTRUCTIONS
        PERF_COUNT_CPU_CYCLES
        PERF_COUNT_BUS_CYCLES

(The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed
 frequency.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 13745de..2e08ed7 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -23,6 +23,11 @@
 #define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
 #define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK				    0xffff
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
@@ -69,12 +74,15 @@ union cpuid10_edx {
 
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
 
 /* CPU_CLK_Unhalted.Core: */
 #define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
 
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 86b2fdd..da46eca 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly;
  */
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
+static u64 counter_value_mask __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
-
 	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
@@ -184,15 +182,33 @@ void hw_perf_restore(u64 ctrl)
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline void
+__pmc_fixed_disable(struct perf_counter *counter,
+		    struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+	int err;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
 
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_disable(counter, hwc, idx);
+
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -202,8 +218,9 @@ static void
 __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
-	s32 left = atomic64_read(&hwc->period_left);
+	s64 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
+	int err;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	 * The hw counter starts counting from this counter offset,
 	 * mark it to be able to extra future deltas:
 	 */
-	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsr(hwc->counter_base + idx, -left, 0);
+	err = checking_wrmsrl(hwc->counter_base + idx,
+			     (u64)(-left) & counter_value_mask);
+}
+
+static inline void
+__pmc_fixed_enable(struct perf_counter *counter,
+		   struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+	int err;
+
+	/*
+	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
+	 * and enable ring-0 counting if allowed:
+	 */
+	bits = 0x8ULL | 0x2ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void
 __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_enable(counter, hwc, idx);
+
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-static int fixed_mode_idx(struct hw_perf_counter *hwc)
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
+	unsigned int event;
+
+	if (unlikely(hwc->nmi))
+		return -1;
+
+	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+		return X86_PMC_IDX_FIXED_CPU_CYCLES;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+		return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
 	return -1;
 }
 
@@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
+	int idx;
 
-	/* Try to get the previous counter again */
-	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-		if (idx == nr_counters_generic)
-			return -EAGAIN;
+	idx = fixed_mode_idx(counter, hwc);
+	if (idx >= 0) {
+		/*
+		 * Try to get the fixed counter, if that is already taken
+		 * then try to get a generic counter:
+		 */
+		if (test_and_set_bit(idx, cpuc->used))
+			goto try_generic;
 
-		set_bit(idx, cpuc->used);
+		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+		/*
+		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+		 */
+		hwc->counter_base =
+			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
 		hwc->idx = idx;
+	} else {
+		idx = hwc->idx;
+		/* Try to get the previous generic counter again */
+		if (test_and_set_bit(idx, cpuc->used)) {
+try_generic:
+			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+			if (idx == nr_counters_generic)
+				return -EAGAIN;
+
+			set_bit(idx, cpuc->used);
+			hwc->idx = idx;
+		}
+		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
+		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
+	/*
+	 * Make it visible before enabling the hw:
+	 */
+	smp_wmb();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
@@ -290,11 +377,13 @@ void perf_counter_print_debug(void)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -303,13 +392,19 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
+	for (idx = 0; idx < nr_counters_fixed; idx++) {
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+	}
 	local_irq_enable();
 }
 
@@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter)
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
+	/*
+	 * Make sure the cleared pointer becomes visible before we
+	 * (potentially) free the counter:
+	 */
+	smp_wmb();
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
-	u64 pmc_ctrl;
-
-	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	x86_perf_counter_update(counter, hwc, idx);
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
-	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
@@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
@@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
@@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_fixed = edx.split.num_counters_fixed;
-- 
cgit v0.10.2


From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 25 Dec 2008 09:02:11 +0100
Subject: perfcounters: include asm/perf_counter.h only if
 CONFIG_PERF_COUNTERS=y

Impact: build fix on ia64

KOSAKI Motohiro reported that -tip doesnt build on ia64 because
asm/perf_counter.h only exists on x86 for now. Fix it.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ec77d16..cc3a75a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,7 +14,10 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
-#include <asm/perf_counter.h>
+
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
 
 #include <linux/list.h>
 #include <linux/mutex.h>
-- 
cgit v0.10.2


From 01ea1ccaa24dea3552e103be13b7897211607a8b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 21:05:06 -0800
Subject: perf_counter: more barrier in blank weak function

Impact: fix panic possible panic

Some versions of GCC inline the weak global function if it's empty.
Add a barrier() to work it around.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d7a79f3..37f77169 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,8 +45,8 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)		{ }
-void __weak hw_perf_counter_setup(void)		{ }
+void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
+void __weak hw_perf_counter_setup(void)		{ barrier(); }
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-- 
cgit v0.10.2


From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Sat, 27 Dec 2008 19:15:43 +0530
Subject: x86: perf_counter remove unwanted hw_perf_enable_all

Impact: clean, reduce kernel size a bit, avoid sparse warnings

Fixes sparse warnings:

 arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da46eca..9376771 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-void hw_perf_enable_all(void)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
-}
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -200,12 +192,10 @@ static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	int err;
-
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_disable(counter, hwc, idx);
-
-	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		__pmc_fixed_disable(counter, hwc, idx);
+	else
+		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_enable(counter, hwc, idx);
-
-	wrmsr(hwc->config_base + idx,
-	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		__pmc_fixed_enable(counter, hwc, idx);
+	else
+		wrmsr(hwc->config_base + idx,
+		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 static int
-- 
cgit v0.10.2


From ff6f05416ece2caec1a7a1f8180d6598e0ab9272 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:19:25 +1100
Subject: perf_counter: Fix return value from dummy hw_perf_counter_init

Impact: fix oops-causing bug

Currently, if you try to use perf_counters on an architecture that has
no hardware support, and you select an event that doesn't map to any of
the defined software counters, you get an oops rather than an error.
This is because the dummy hw_perf_counter_init returns ERR_PTR(-EINVAL)
but the caller (perf_counter_alloc) only tests for NULL.

This makes the dummy hw_perf_counter_init return NULL instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 37f77169..4be1a8d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -41,7 +41,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
 extern __weak const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
-	return ERR_PTR(-EINVAL);
+	return NULL;
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-- 
cgit v0.10.2


From 9abf8a08bc8f18a3b125f834f00e2e71b49c15d2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:26:43 +1100
Subject: perf_counter: Fix the cpu_clock software counter

Impact: bug fix

Currently if you do (e.g.) timec -e -1 ls, it will report 0 for the
value of the cpu_clock counter.  The reason is that the core assumes
that a counter's count field is up-to-date when the counter is inactive,
and doesn't call the counter's read function.  However, the cpu_clock
counter code only updates the count in the read function.

This fixes it by making both the read and disable functions update the
count.  It also makes the counter ignore time passing while the counter
is disabled, by making the enable function update the hw.prev_count field.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4be1a8d..b7a027a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -928,18 +928,32 @@ static const struct file_operations perf_fops = {
 
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
 	return 0;
 }
 
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+	int cpu = raw_smp_processor_id();
+	s64 prev;
+	u64 now;
+
+	now = cpu_clock(cpu);
+	prev = atomic64_read(&counter->hw.prev_count);
+	atomic64_set(&counter->hw.prev_count, now);
+	atomic64_add(now - prev, &counter->count);
+}
+
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
+	cpu_clock_perf_counter_update(counter);
 }
 
 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
-	int cpu = raw_smp_processor_id();
-
-	atomic64_set(&counter->count, cpu_clock(cpu));
+	cpu_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
-- 
cgit v0.10.2


From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:43:42 +1100
Subject: perf_counter: Add optional hw_perf_group_sched_in arch function

Impact: extend perf_counter infrastructure

This adds an optional hw_perf_group_sched_in() arch function that enables
a whole group of counters in one go.  It returns 1 if it added the group
successfully, 0 if it did nothing (and therefore the core needs to add
the counters individually), or a negative number if an error occurred.
It should add all the counters and enable any software counters in the
group, or else add none of them and return an error.

There are a couple of related changes/improvements in the group handling
here:

* As an optimization, group_sched_out() and group_sched_in() now check the
  state of the group leader, and do nothing if the leader is not active
  or disabled.

* We now call hw_perf_save_disable/hw_perf_restore around the complete
  set of counter enable/disable calls in __perf_counter_sched_in/out,
  to give the arch code the opportunity to defer updating the hardware
  state until the hw_perf_restore call if it wants.

* We no longer stop adding groups after we get to a group that has more
  than one counter.  We will ultimately add an option for a group to be
  exclusive.  The current code doesn't really implement exclusive groups
  anyway, since a group could end up going on with other counters that
  get added before it.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index cc3a75a..b21d1ea 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu);
 
 #else
 static inline void
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b7a027a..9ad11e4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter)
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
 void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
 void __weak hw_perf_counter_setup(void)		{ barrier(); }
+int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	return 0;
+}
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter,
 {
 	struct perf_counter *counter;
 
+	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
 	counter_sched_out(group_counter, cpuctx, ctx);
 
 	/*
@@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
 	struct perf_counter *counter;
+	u64 flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
 
 	spin_lock(&ctx->lock);
+	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
+	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
 }
 
@@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter,
 	       int cpu)
 {
 	struct perf_counter *counter, *partial_group;
-	int ret = 0;
+	int ret;
+
+	if (group_counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
 
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
@@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter,
 			partial_group = counter;
 			goto group_error;
 		}
-		ret = -EAGAIN;
 	}
 
-	return ret;
+	return 0;
 
 group_error:
 	/*
@@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
+	u64 flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
 
 	spin_lock(&ctx->lock);
+	flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
@@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			continue;
 
 		/*
-		 * If we scheduled in a group atomically and
-		 * exclusively, break out:
+		 * If we scheduled in a group atomically and exclusively,
+		 * or if this group can't go on, break out:
 		 */
 		if (group_sched_in(counter, cpuctx, ctx, cpu))
 			break;
 	}
+	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
 }
 
-- 
cgit v0.10.2


From 4eb96fcfe07b7f2a05577e57533840f8e26bea53 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 17:24:34 +1100
Subject: perf_counter: Add dummy perf_counter_print_debug function

Impact: minimize requirements on architectures

Currently, an architecture just enabling CONFIG_PERF_COUNTERS but not
providing any extra functions will fail to build with
perf_counter_print_debug being undefined, since we don't provide an
empty dummy definition like we do with the hw_perf_* functions.

This provides an empty dummy perf_counter_print_debug() to make it
easier for architectures to turn on CONFIG_PERF_COUNTERS.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 9ad11e4..4c0dccb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -54,6 +54,8 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	return 0;
 }
 
+void __weak perf_counter_print_debug(void)	{ }
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
-- 
cgit v0.10.2


From d662ed26734473d4cb5f3d78cebfec8f9126e97c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 17:01:53 +1100
Subject: powerpc/perf_counter: Add perf_counter system call on powerpc

... with an empty/dummy asm/perf_counter.h so it builds.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 0000000..59530ae
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,10 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 803def23..da300c4 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
+SYSCALL(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c7..7cef5af 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
+#define __NR_perf_counter_open	319
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		319
+#define __NR_syscalls		320
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 3d0c776..94dd1fb 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
+	select HAVE_PERF_COUNTERS
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
-- 
cgit v0.10.2


From 93a6d3ce6962044fe9badf528fed46b455d58292 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:52:19 +1100
Subject: powerpc: Provide a way to defer perf counter work until interrupts
 are enabled

Because 64-bit powerpc uses lazy (soft) interrupt disabling, it is
possible for a performance monitor exception to come in when the
kernel thinks interrupts are disabled (i.e. when they are
soft-disabled but hard-enabled).  In such a situation the performance
monitor exception handler might have some processing to do (such as
process wakeups) which can't be done in what is effectively an NMI
handler.

This provides a way to defer that work until interrupts get enabled,
either in raw_local_irq_restore() or by returning from an interrupt
handler to code that had interrupts enabled.  We have a per-processor
flag that indicates that there is work pending to do when interrupts
subsequently get re-enabled.  This flag is checked in the interrupt
return path and in raw_local_irq_restore(), and if it is set,
perf_counter_do_pending() is called to do the pending work.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index f75a5fc..e10f151 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct hw_interrupt_type;
 
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long get_perf_counter_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, perf_counter_pending)));
+	return x;
+}
+
+static inline void set_perf_counter_pending(int x)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (x),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long get_perf_counter_pending(void)
+{
+	return 0;
+}
+
+static inline void set_perf_counter_pending(int x) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3ae..6ef0557 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
+	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 661d07d..cea4629 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -127,6 +127,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 383ed6e..f30b4e5 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
+#ifdef CONFIG_PERF_COUNTERS
+	/* check paca->perf_counter_pending if we're enabling ints */
+	lbz	r3,PACAPERFPEND(r13)
+	and.	r3,r3,r5
+	beq	27f
+	bl	.perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
 	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ac222d0..4efb886 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
+#ifdef CONFIG_PERF_COUNTERS
+notrace void __weak perf_counter_do_pending(void)
+{
+	set_perf_counter_pending(0);
+}
+#endif
+
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
+	if (get_perf_counter_pending())
+		perf_counter_do_pending();
+
 	/*
 	 * if (get_paca()->hard_enabled) return;
 	 * But again we need to take care that gcc gets hard_enabled directly
-- 
cgit v0.10.2


From 4574910e5087085a1f330ff8373cee4503f5c77c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 20:21:55 +1100
Subject: powerpc/perf_counter: Add generic support for POWER-family PMU
 hardware

This provides the architecture-specific functions needed to access
PMU hardware on the 64-bit PowerPC processors.  It has been designed
for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will
hopefully also suit other 64-bit PowerPC machines (although probably
not Cell given how different it is in this area).  This doesn't
include back-ends for any specific processors.

This implements a system which allows back-ends to express the
constraints that their hardware has on what events can be counted
simultaneously.  The constraints are expressed as a 64-bit mask +
64-bit value for each event, and the encoding is capable of
expressing the constraints arising from having a set of multiplexers
feeding an event bus, with some events being available through
multiple multiplexer settings, such as we get on POWER4 and PPC970.
Furthermore, the back-end can supply alternative event codes for
each event, and the constraint checking code will try all possible
combinations of alternative event codes to try to find a combination
that will fit.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 59530ae..9d7ff6d 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -8,3 +8,65 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS		8
+#define MAX_EVENT_ALTERNATIVES	8
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+	int	n_counter;
+	int	max_alternatives;
+	u64	add_fields;
+	u64	test_adder;
+	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+				unsigned int hwc[], u64 mmcr[]);
+	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	n_generic;
+	int	*generic_events;
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1308a86..fde190b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 0000000..c7d4c29
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,754 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+
+struct cpu_hw_counters {
+	int n_counters;
+	int n_percpu;
+	int disabled;
+	int n_added;
+	struct perf_counter *counter[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	u64 mmcr[3];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+	unsigned long val;
+
+	switch (idx) {
+	case 1:
+		val = mfspr(SPRN_PMC1);
+		break;
+	case 2:
+		val = mfspr(SPRN_PMC2);
+		break;
+	case 3:
+		val = mfspr(SPRN_PMC3);
+		break;
+	case 4:
+		val = mfspr(SPRN_PMC4);
+		break;
+	case 5:
+		val = mfspr(SPRN_PMC5);
+		break;
+	case 6:
+		val = mfspr(SPRN_PMC6);
+		break;
+	case 7:
+		val = mfspr(SPRN_PMC7);
+		break;
+	case 8:
+		val = mfspr(SPRN_PMC8);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+		val = 0;
+	}
+	return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+	switch (idx) {
+	case 1:
+		mtspr(SPRN_PMC1, val);
+		break;
+	case 2:
+		mtspr(SPRN_PMC2, val);
+		break;
+	case 3:
+		mtspr(SPRN_PMC3, val);
+		break;
+	case 4:
+		mtspr(SPRN_PMC4, val);
+		break;
+	case 5:
+		mtspr(SPRN_PMC5, val);
+		break;
+	case 6:
+		mtspr(SPRN_PMC6, val);
+		break;
+	case 7:
+		mtspr(SPRN_PMC7, val);
+		break;
+	case 8:
+		mtspr(SPRN_PMC8, val);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+	}
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(unsigned int event[], int n_ev)
+{
+	u64 mask, value, nv;
+	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+	int i, j;
+	u64 addf = ppmu->add_fields;
+	u64 tadd = ppmu->test_adder;
+
+	if (n_ev > ppmu->n_counter)
+		return -1;
+
+	/* First see if the events will go on as-is */
+	for (i = 0; i < n_ev; ++i) {
+		alternatives[i][0] = event[i];
+		if (ppmu->get_constraint(event[i], &amasks[i][0],
+					 &avalues[i][0]))
+			return -1;
+		choice[i] = 0;
+	}
+	value = mask = 0;
+	for (i = 0; i < n_ev; ++i) {
+		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+		if ((((nv + tadd) ^ value) & mask) != 0 ||
+		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+			break;
+		value = nv;
+		mask |= amasks[i][0];
+	}
+	if (i == n_ev)
+		return 0;	/* all OK */
+
+	/* doesn't work, gather alternatives... */
+	if (!ppmu->get_alternatives)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		for (j = 1; j < n_alt[i]; ++j)
+			ppmu->get_constraint(alternatives[i][j],
+					     &amasks[i][j], &avalues[i][j]);
+	}
+
+	/* enumerate all possibilities and see if any will work */
+	i = 0;
+	j = -1;
+	value = mask = nv = 0;
+	while (i < n_ev) {
+		if (j >= 0) {
+			/* we're backtracking, restore context */
+			value = svalues[i];
+			mask = smasks[i];
+			j = choice[i];
+		}
+		/*
+		 * See if any alternative k for event i,
+		 * where k > j, will satisfy the constraints.
+		 */
+		while (++j < n_alt[i]) {
+			nv = (value | avalues[i][j]) +
+				(value & avalues[i][j] & addf);
+			if ((((nv + tadd) ^ value) & mask) == 0 &&
+			    (((nv + tadd) ^ avalues[i][j])
+			     & amasks[i][j]) == 0)
+				break;
+		}
+		if (j >= n_alt[i]) {
+			/*
+			 * No feasible alternative, backtrack
+			 * to event i-1 and continue enumerating its
+			 * alternatives from where we got up to.
+			 */
+			if (--i < 0)
+				return -1;
+		} else {
+			/*
+			 * Found a feasible alternative for event i,
+			 * remember where we got up to with this event,
+			 * go on to the next event, and start with
+			 * the first alternative for it.
+			 */
+			choice[i] = j;
+			svalues[i] = value;
+			smasks[i] = mask;
+			value = nv;
+			mask |= amasks[i][j];
+			++i;
+			j = -1;
+		}
+	}
+
+	/* OK, we have a feasible combination, tell the caller the solution */
+	for (i = 0; i < n_ev; ++i)
+		event[i] = alternatives[i][choice[i]];
+	return 0;
+}
+
+static void power_perf_read(struct perf_counter *counter)
+{
+	long val, delta, prev;
+
+	if (!counter->hw.idx)
+		return;
+	/*
+	 * Performance monitor interrupts come even when interrupts
+	 * are soft-disabled, as long as interrupts are hard-enabled.
+	 * Therefore we treat them like NMIs.
+	 */
+	do {
+		prev = atomic64_read(&counter->hw.prev_count);
+		barrier();
+		val = read_pmc(counter->hw.idx);
+	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+	/* The counters are only 32 bits wide */
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+u64 hw_perf_save_disable(void)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+	ret = cpuhw->disabled;
+	if (!ret) {
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
+
+		/*
+		 * Set the 'freeze counters' bit.
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the counters
+		 * before we return.
+		 */
+		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		mb();
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_restore(u64 disable)
+{
+	struct perf_counter *counter;
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	long i;
+	unsigned long val;
+	s64 left;
+	unsigned int hwc_index[MAX_HWCOUNTERS];
+
+	if (disable)
+		return;
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	cpuhw->disabled = 0;
+
+	/*
+	 * If we didn't change anything, or only removed counters,
+	 * no need to recalculate MMCR* settings and reset the PMCs.
+	 * Just reenable the PMU with the current MMCR* settings
+	 * (possibly updated for removal of counters).
+	 */
+	if (!cpuhw->n_added) {
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		goto out;
+	}
+
+	/*
+	 * Compute MMCR* values for the new set of counters
+	 */
+	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+			       cpuhw->mmcr)) {
+		/* shouldn't ever get here */
+		printk(KERN_ERR "oops compute_mmcr failed\n");
+		goto out;
+	}
+
+	/*
+	 * Write the new configuration to MMCR* with the freeze
+	 * bit set and set the hardware counters to their initial values.
+	 * Then unfreeze the counters.
+	 */
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+				| MMCR0_FC);
+
+	/*
+	 * Read off any pre-existing counters that need to move
+	 * to another PMC.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+			power_perf_read(counter);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+		}
+	}
+
+	/*
+	 * Initialize the PMCs for all the new and moved counters.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx)
+			continue;
+		val = 0;
+		if (counter->hw_event.irq_period) {
+			left = atomic64_read(&counter->hw.period_left);
+			if (left < 0x80000000L)
+				val = 0x80000000L - left;
+		}
+		atomic64_set(&counter->hw.prev_count, val);
+		counter->hw.idx = hwc_index[i] + 1;
+		write_pmc(counter->hw.idx, val);
+	}
+	mb();
+	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+			  struct perf_counter *ctrs[], unsigned int *events)
+{
+	int n = 0;
+	struct perf_counter *counter;
+
+	if (!is_software_counter(group)) {
+		if (n >= max_count)
+			return -1;
+		ctrs[n] = group;
+		events[n++] = group->hw.config;
+	}
+	list_for_each_entry(counter, &group->sibling_list, list_entry) {
+		if (!is_software_counter(counter) &&
+		    counter->state != PERF_COUNTER_STATE_OFF) {
+			if (n >= max_count)
+				return -1;
+			ctrs[n] = counter;
+			events[n++] = counter->hw.config;
+		}
+	}
+	return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;
+	if (is_software_counter(counter))
+		counter->hw_ops->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i, n, n0;
+	struct perf_counter *sub;
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	n = collect_events(group_leader, ppmu->n_counter - n0,
+			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+	if (n < 0)
+		return -EAGAIN;
+	if (power_check_constraints(cpuhw->events, n + n0))
+		return -EAGAIN;
+	cpuhw->n_counters = n0 + n;
+	cpuhw->n_added += n;
+
+	/*
+	 * OK, this group can go on; update counter states etc.,
+	 * and enable any software counters
+	 */
+	for (i = n0; i < n0 + n; ++i)
+		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	n = 1;
+	counter_sched_in(group_leader, cpu);
+	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+		if (sub->state != PERF_COUNTER_STATE_OFF) {
+			counter_sched_in(sub, cpu);
+			++n;
+		}
+	}
+	cpuctx->active_oncpu += n;
+	ctx->nr_active += n;
+
+	return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_restore to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_perf_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	u64 pmudis;
+	int n0;
+	int ret = -EAGAIN;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	/*
+	 * Add the counter to the list (if there is room)
+	 * and check whether the total set is still feasible.
+	 */
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	if (n0 >= ppmu->n_counter)
+		goto out;
+	cpuhw->counter[n0] = counter;
+	cpuhw->events[n0] = counter->hw.config;
+	if (power_check_constraints(cpuhw->events, n0 + 1))
+		goto out;
+
+	counter->hw.config = cpuhw->events[n0];
+	++cpuhw->n_counters;
+	++cpuhw->n_added;
+
+	ret = 0;
+ out:
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_perf_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i;
+	u64 pmudis;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	power_perf_read(counter);
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		if (counter == cpuhw->counter[i]) {
+			while (++i < cpuhw->n_counters)
+				cpuhw->counter[i-1] = cpuhw->counter[i];
+			--cpuhw->n_counters;
+			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+			break;
+		}
+	}
+	if (cpuhw->n_counters == 0) {
+		/* disable exceptions if no counters are running */
+		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+	}
+
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+}
+
+struct hw_perf_counter_ops power_perf_ops = {
+	.enable = power_perf_enable,
+	.disable = power_perf_disable,
+	.read = power_perf_read
+};
+
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
+{
+	unsigned long ev;
+	struct perf_counter *ctrs[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	int n;
+
+	if (!ppmu)
+		return NULL;
+	if ((s64)counter->hw_event.irq_period < 0)
+		return NULL;
+	ev = counter->hw_event.type;
+	if (!counter->hw_event.raw) {
+		if (ev >= ppmu->n_generic ||
+		    ppmu->generic_events[ev] == 0)
+			return NULL;
+		ev = ppmu->generic_events[ev];
+	}
+	counter->hw.config_base = ev;
+	counter->hw.idx = 0;
+
+	/*
+	 * If this is in a group, check if it can go on with all the
+	 * other hardware counters in the group.  We assume the counter
+	 * hasn't been linked into its leader's sibling list at this point.
+	 */
+	n = 0;
+	if (counter->group_leader != counter) {
+		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+				   ctrs, events);
+		if (n < 0)
+			return NULL;
+	}
+	events[n++] = ev;
+	if (power_check_constraints(events, n))
+		return NULL;
+
+	counter->hw.config = events[n - 1];
+	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	return &power_perf_ops;
+}
+
+/*
+ * Handle wakeups.
+ */
+void perf_counter_do_pending(void)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+
+	set_perf_counter_pending(0);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter && counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+}
+
+/*
+ * Record data for an irq counter.
+ * This function was lifted from the x86 code; maybe it should
+ * go in the core?
+ */
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+/*
+ * Record all the values of the counters in a group
+ */
+static void perf_handle_group(struct perf_counter *counter)
+{
+	struct perf_counter *leader, *sub;
+
+	leader = counter->group_leader;
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		if (sub != counter)
+			sub->hw_ops->read(sub);
+		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, atomic64_read(&sub->count));
+	}
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+			       struct pt_regs *regs)
+{
+	s64 prev, delta, left;
+	int record = 0;
+
+	/* we don't have to worry about interrupts here */
+	prev = atomic64_read(&counter->hw.prev_count);
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+
+	/*
+	 * See if the total period for this counter has expired,
+	 * and update for the next period.
+	 */
+	val = 0;
+	left = atomic64_read(&counter->hw.period_left) - delta;
+	if (counter->hw_event.irq_period) {
+		if (left <= 0) {
+			left += counter->hw_event.irq_period;
+			if (left <= 0)
+				left = counter->hw_event.irq_period;
+			record = 1;
+		}
+		if (left < 0x80000000L)
+			val = 0x80000000L - left;
+	}
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+
+	/*
+	 * Finally record data if requested.
+	 */
+	if (record) {
+		switch (counter->hw_event.record_type) {
+		case PERF_RECORD_SIMPLE:
+			break;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			counter->wakeup_pending = 1;
+			break;
+		case PERF_RECORD_GROUP:
+			perf_handle_group(counter);
+			counter->wakeup_pending = 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+	long val;
+	int need_wakeup = 0, found = 0;
+
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		val = read_pmc(counter->hw.idx);
+		if ((int)val < 0) {
+			/* counter has overflowed */
+			found = 1;
+			record_and_restart(counter, val, regs);
+			if (counter->wakeup_pending)
+				need_wakeup = 1;
+		}
+	}
+
+	/*
+	 * In case we didn't find and reset the counter that caused
+	 * the interrupt, scan all counters and reset any that are
+	 * negative, to avoid getting continual interrupts.
+	 * Any that we processed in the previous loop will not be negative.
+	 */
+	if (!found) {
+		for (i = 0; i < ppmu->n_counter; ++i) {
+			val = read_pmc(i + 1);
+			if ((int)val < 0)
+				write_pmc(i + 1, 0);
+		}
+	}
+
+	/*
+	 * Reset MMCR0 to its normal value.  This will set PMXE and
+	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+	 * and thus allow interrupts to occur again.
+	 * XXX might want to use MSR.PM to keep the counters frozen until
+	 * we get back out of this interrupt.
+	 */
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+	/*
+	 * If we need a wakeup, check whether interrupts were soft-enabled
+	 * when we took the interrupt.  If they were, we can wake stuff up
+	 * immediately; otherwise we'll have to set a flag and do the
+	 * wakeup when interrupts get soft-enabled.
+	 */
+	if (need_wakeup) {
+		if (regs->softe) {
+			irq_enter();
+			perf_counter_do_pending();
+			irq_exit();
+		} else {
+			set_perf_counter_pending(1);
+		}
+	}
+}
+
+static int init_perf_counters(void)
+{
+	if (reserve_pmc_hardware(perf_counter_interrupt)) {
+		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+arch_initcall(init_perf_counters);
-- 
cgit v0.10.2


From 16b067993dee3dfde61b20027e0b168dc06201ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 10 Jan 2009 16:34:07 +1100
Subject: powerpc/perf_counter: Add support for PPC970 family

This adds the back-end for the PMU on the PPC970 family.

The PPC970 allows events from the ISU to be selected in two different
ways.  Rather than use alternative event codes to express this, we
instead use a single encoding for ISU events and express the
resulting constraint (that you can't select events from all three
of FPU/IFU/VPU, ISU and IDU/STS at the same time, since they all come
in through only 2 multiplexers) using a NAND constraint field, and
work out which multiplexer is used for ISU events at compute_mmcr
time.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fde190b..45798f6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c7d4c29..5561ecb 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -741,13 +741,26 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+extern struct power_pmu ppc970_pmu;
+
 static int init_perf_counters(void)
 {
+	unsigned long pvr;
+
 	if (reserve_pmc_hardware(perf_counter_interrupt)) {
 		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
 		return -EBUSY;
 	}
 
+	/* XXX should get this from cputable */
+	pvr = mfspr(SPRN_PVR);
+	switch (PVR_VER(pvr)) {
+	case PV_970:
+	case PV_970FX:
+	case PV_970MP:
+		ppmu = &ppc970_pmu;
+		break;
+	}
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 0000000..c325658
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE		0
+#define PM_FPU		1
+#define PM_VPU		2
+#define PM_ISU		3
+#define PM_IFU		4
+#define PM_IDU		5
+#define PM_STS		6
+#define PM_LSU0		7
+#define PM_LSU1U	8
+#define PM_LSU1L	9
+#define PM_LASTUNIT	9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * T0 - TTM0 constraint
+ *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ *     43: UC3 error 0x0800_0000_0000
+ *     42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ *     41: ISU events needed 0x0200_0000_0000
+ *     40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ *     15: P1 error 0x8000
+ *     14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ *     0-13: Count of events needing PMC2..PMC8
+ */
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+	[PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+	[PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit) {
+		if (unit > PM_LASTUNIT)
+			return -1;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	alt[0] = event;
+
+	/* 2 alternatives for LSU empty */
+	if (event == 0x2002 || event == 0x3002) {
+		alt[1] = event ^ 0x1000;
+		return 2;
+	}
+		
+	return 1;
+}
+
+static int p970_compute_mmcr(unsigned int event[], int n_ev,
+			     unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+	unsigned char ttmuse[2];
+	unsigned char pmcsel[8];
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (unit) {
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+		unitmap[PM_ISU] = 2 | 4;	/* move ISU to TTM1 */
+	/* Set TTM[01]SEL fields. */
+	ttmuse[0] = ttmuse[1] = 0;
+	for (i = PM_FPU; i <= PM_STS; ++i) {
+		if (!unituse[i])
+			continue;
+		ttm = unitmap[i];
+		++ttmuse[(ttm >> 2) & 1];
+		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+	}
+	/* Check only one unit per TTMx */
+	if (ttmuse[0] > 1 || ttmuse[1] > 1)
+		return -1;
+
+	/* Set byte lane select fields and TTM3SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit <= PM_STS)
+			ttm = (unitmap[unit] >> 2) & 1;
+		else if (unit == PM_LSU0)
+			ttm = 2;
+		else {
+			ttm = 3;
+			if (unit == PM_LSU1L && byte >= 2)
+				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	memset(pmcsel, 0x8, sizeof(pmcsel));	/* 8 means don't count */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			else
+				psel |= 8;
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+		}
+		pmcsel[pmc] = psel;
+		hwc[i] = pmc;
+	}
+	for (pmc = 0; pmc < 2; ++pmc)
+		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+	for (; pmc < 8; ++pmc)
+		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	int shift, i;
+
+	if (pmc <= 1) {
+		shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+		i = 0;
+	} else {
+		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+		i = 1;
+	}
+	/*
+	 * Setting the PMCxSEL field to 0x08 disables PMC x.
+	 */
+	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 1,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+};
+
+struct power_pmu ppc970_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 2,
+	.add_fields = 0x001100005555ull,
+	.test_adder = 0x013300000000ull,
+	.compute_mmcr = p970_compute_mmcr,
+	.get_constraint = p970_get_constraint,
+	.get_alternatives = p970_get_alternatives,
+	.disable_pmc = p970_disable_pmc,
+	.n_generic = ARRAY_SIZE(ppc970_generic_events),
+	.generic_events = ppc970_generic_events,
+};
-- 
cgit v0.10.2


From f78628374a13bc150db77c6e02d4f2c0a7f932ef Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 21:05:35 +1100
Subject: powerpc/perf_counter: Add support for POWER6

This adds the back-end for the PMU on the POWER6 processor.
Fortunately, the event selection hardware is somewhat simpler on
POWER6 than on other POWER family processors, so the constraints
fit into only 32 bits.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 45798f6..0ebf4d0 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5561ecb..df3fe05 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -742,6 +742,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
 {
@@ -760,6 +761,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case 0x3e:
+		ppmu = &power6_pmu;
+		break;
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 0000000..b1f61f3
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0x7
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* Unit event comes (TTMxSEL encoding) */
+#define PM_UNIT_MSK	0xf
+#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV		0x8000	/* Load lookahead match value */
+#define PM_LLA		0x4000	/* Load lookahead match enable */
+#define PM_BYTE_SH	12	/* Byte of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_SUBUNIT_SH	8	/* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK	7
+#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK	0xff	/* PMCxSEL value */
+#define PM_BUSEVENT_MSK	0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH	45
+#define MMCR1_NESTSEL_MSK	0x7
+#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA		((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	int i;
+	unsigned int pmc, ev, b, u, s, psel;
+	unsigned int ttmset = 0;
+	unsigned int pmc_inuse = 0;
+
+	if (n_ev > 4)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;	/* collision! */
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+	for (i = 0; i < n_ev; ++i) {
+		ev = event[i];
+		pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			--pmc;
+		} else {
+			/* can go on any PMC; find a free one */
+			for (pmc = 0; pmc < 4; ++pmc)
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			pmc_inuse |= 1 << pmc;
+		}
+		hwc[i] = pmc;
+		psel = ev & PM_PMCSEL_MSK;
+		if (ev & PM_BUSEVENT_MSK) {
+			/* this event uses the event bus */
+			b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+			u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+			/* check for conflict on this byte of event bus */
+			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+				return -1;
+			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+			ttmset |= 1 << b;
+			if (u == 5) {
+				/* Nest events have a further mux */
+				s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+				if ((ttmset & 0x10) &&
+				    MMCR1_NESTSEL(mmcr1) != s)
+					return -1;
+				ttmset |= 0x10;
+				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+			}
+			if (0x30 <= psel && psel <= 0x3d) {
+				/* these need the PMCx_ADDR_SEL bits */
+				if (b >= 2)
+					mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+			}
+			/* bus select values are different for PMC3/4 */
+			if (pmc >= 2 && (psel & 0x90) == 0x80)
+				psel ^= 0x20;
+		}
+		if (ev & PM_LLA) {
+			mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+			if (ev & PM_LLAV)
+				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+		}
+		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+	}
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0xe)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ *	0-1	add field: number of uses of PMC1 (max 1)
+ *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
+ *	8-10	select field: nest (subunit) event selector
+ *	16-19	select field: unit on byte 0 of event bus
+ *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ */
+static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, sh;
+	unsigned int mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		sh = byte * 4;
+		mask |= PM_UNIT_MSKS << sh;
+		value |= (event & PM_UNIT_MSKS) << sh;
+		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+			mask |= PM_SUBUNIT_MSKS;
+			value |= event & PM_SUBUNIT_MSKS;
+		}
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	4	/* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
+	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
+	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
+	{ 0x10000e, 0x400010 },			/* PM_PURR */
+	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
+	{ 0x10001a, 0x200010 },			/* PM_MRK_INST_DISP */
+	{ 0x100026, 0x3000f8 },			/* PM_TB_BIT_TRANS */
+	{ 0x100054, 0x2000f0 },			/* PM_ST_FIN */
+	{ 0x100056, 0x2000fc },			/* PM_L1_ICACHE_MISS */
+	{ 0x1000f0, 0x40000a },			/* PM_INST_IMC_MATCH_CMPL */
+	{ 0x1000f8, 0x200008 },			/* PM_GCT_EMPTY_CYC */
+	{ 0x1000fc, 0x400006 },			/* PM_LSU_DERAT_MISS_CYC */
+	{ 0x20000e, 0x400007 },			/* PM_LSU_DERAT_MISS */
+	{ 0x200012, 0x300012 },			/* PM_INST_DISP */
+	{ 0x2000f2, 0x3000f2 },			/* PM_INST_DISP */
+	{ 0x2000f8, 0x300010 },			/* PM_EXT_INT */
+	{ 0x2000fe, 0x300056 },			/* PM_DATA_FROM_L2MISS */
+	{ 0x2d0030, 0x30001a },			/* PM_MRK_FPU_FIN */
+	{ 0x30000a, 0x400018 },			/* PM_MRK_INST_FIN */
+	{ 0x3000f6, 0x40000e },			/* PM_L1_DCACHE_RELOAD_VALID */
+	{ 0x3000fe, 0x400056 },			/* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(unsigned int event)
+{
+	int i, j;
+	unsigned int alt;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			return -1;
+		for (j = 0; j < MAX_ALT; ++j) {
+			alt = event_alternatives[i][j];
+			if (!alt || event < alt)
+				break;
+			if (event == alt)
+				return i;
+		}
+	}
+	return -1;
+}
+
+static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j;
+	unsigned int aevent, psel, pmc;
+	unsigned int nalt = 1;
+
+	alt[0] = event;
+
+	/* check the alternatives table */
+	i = find_alternatives_list(event);
+	if (i >= 0) {
+		/* copy out alternatives from list */
+		for (j = 0; j < MAX_ALT; ++j) {
+			aevent = event_alternatives[i][j];
+			if (!aevent)
+				break;
+			if (aevent != event)
+				alt[nalt++] = aevent;
+		}
+
+	} else {
+		/* Check for alternative ways of computing sum events */
+		/* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+		psel = event & (PM_PMCSEL_MSK & ~1);	/* ignore edge bit */
+		pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc && (psel == 0x32 || psel == 0x34))
+			alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+				((5 - pmc) << PM_PMC_SH);
+
+		/* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+		if (pmc && (psel == 0x38 || psel == 0x3a))
+			alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+	}
+
+	return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/* Set PMCxSEL to 0 to disable PMCx */
+	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+};
+
+struct power_pmu power6_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x55,
+	.test_adder = 0,
+	.compute_mmcr = p6_compute_mmcr,
+	.get_constraint = p6_get_constraint,
+	.get_alternatives = p6_get_alternatives,
+	.disable_pmc = p6_disable_pmc,
+	.n_generic = ARRAY_SIZE(power6_generic_events),
+	.generic_events = power6_generic_events,
+};
-- 
cgit v0.10.2


From dd0e6ba22ea21bcc2c420b385a170593c58f4c08 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 12 Jan 2009 15:11:00 +1100
Subject: perf_counter: Always schedule all software counters in

Software counters aren't subject to the limitations imposed by the
fixed number of hardware counter registers, so there is no reason not
to enable them all in __perf_counter_sched_in.  Previously we used to
break out of the loop when we got to a group that wouldn't fit on the
PMU; with this we continue through the list but only schedule in
software counters (or groups containing only software counters) from
there on.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c0dccb..3aef306 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -455,12 +455,37 @@ group_error:
 	return -EAGAIN;
 }
 
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
 	u64 flags;
+	int can_add_hw = 1;
 
 	if (likely(!ctx->nr_counters))
 		return;
@@ -477,10 +502,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 		/*
 		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, break out:
+		 * or if this group can't go on, don't add any more
+		 * hardware counters.
 		 */
-		if (group_sched_in(counter, cpuctx, ctx, cpu))
-			break;
+		if (can_add_hw || is_software_only_group(counter))
+			if (group_sched_in(counter, cpuctx, ctx, cpu))
+				can_add_hw = 0;
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
-- 
cgit v0.10.2


From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 13:44:19 +1100
Subject: powerpc/perf_counter: Make sure PMU gets enabled properly

This makes sure that we call the platform-specific ppc_md.enable_pmcs
function on each CPU before we try to use the PMU on that CPU.  If the
CPU goes off-line and then on-line, we need to do the enable_pmcs call
again, so we use the hw_perf_counter_setup hook to ensure that.  It gets
called as each CPU comes online, but it isn't called on the CPU that is
coming up, so this adds the CPU number as an argument to it (there were
no non-empty instances of hw_perf_counter_setup before).

This also arranges to set the pmcregs_in_use field of the lppaca (data
structure shared with the hypervisor) on each CPU when we are using the
PMU and clear it when we are not.  This allows the hypervisor to optimize
partition switches by not saving/restoring the PMU registers when we
aren't using the PMU.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df3fe05..85ad259 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -15,6 +15,7 @@
 #include <linux/hardirq.h>
 #include <asm/reg.h>
 #include <asm/pmc.h>
+#include <asm/machdep.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -24,6 +25,7 @@ struct cpu_hw_counters {
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	u64 mmcr[3];
+	u8 pmcs_enabled;
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -262,6 +264,15 @@ u64 hw_perf_save_disable(void)
 		cpuhw->n_added = 0;
 
 		/*
+		 * Check if we ever enabled the PMU on this cpu.
+		 */
+		if (!cpuhw->pmcs_enabled) {
+			if (ppc_md.enable_pmcs)
+				ppc_md.enable_pmcs();
+			cpuhw->pmcs_enabled = 1;
+		}
+
+		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
 		 * executed and the PMU has frozen the counters
@@ -305,6 +316,8 @@ void hw_perf_restore(u64 disable)
 		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		if (cpuhw->n_counters == 0)
+			get_lppaca()->pmcregs_in_use = 0;
 		goto out;
 	}
 
@@ -323,6 +336,7 @@ void hw_perf_restore(u64 disable)
 	 * bit set and set the hardware counters to their initial values.
 	 * Then unfreeze the counters.
 	 */
+	get_lppaca()->pmcregs_in_use = 1;
 	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
@@ -741,6 +755,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+void hw_perf_counter_setup(int cpu)
+{
+	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+	memset(cpuhw, 0, sizeof(*cpuhw));
+	cpuhw->mmcr[0] = MMCR0_FC;
+}
+
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power6_pmu;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3aef306..52f2f52 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,7 +46,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
 void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
-void __weak hw_perf_counter_setup(void)		{ barrier(); }
+void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu)
@@ -1598,7 +1598,7 @@ static void __cpuinit perf_counter_init_cpu(int cpu)
 	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
 	mutex_unlock(&perf_resource_mutex);
 
-	hw_perf_counter_setup();
+	hw_perf_counter_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
cgit v0.10.2


From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 21:00:30 +1100
Subject: perf_counter: Add support for pinned and exclusive counter groups

Impact: New perf_counter features

A pinned counter group is one that the user wants to have on the CPU
whenever possible, i.e. whenever the associated task is running, for
a per-task group, or always for a per-cpu group.  If the system
cannot satisfy that, it puts the group into an error state where
it is not scheduled any more and reads from it return EOF (i.e. 0
bytes read).  The group can be released from error state and made
readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE).  When we
have finer-grained enable/disable controls on counters we'll be able
to reset the error state on individual groups.

An exclusive group is one that the user wants to be the only group
using the CPU performance monitor hardware whenever it is on.  The
counter group scheduler will not schedule an exclusive group if there
are already other groups on the CPU and will not schedule other groups
onto the CPU if there is an exclusive group scheduled (that statement
does not apply to groups containing only software counters, which can
always go on and which do not prevent an exclusive group from going on).
With an exclusive group, we will be able to let users program PMU
registers at a low level without the concern that those settings will
perturb other measurements.

Along the way this reorganizes things a little:
- is_software_counter() is moved to perf_counter.h.
- cpuctx->active_oncpu now records the number of hardware counters on
  the CPU, i.e. it now excludes software counters.  Nothing was reading
  cpuctx->active_oncpu before, so this change is harmless.
- A new cpuctx->exclusive field records whether we currently have an
  exclusive group on the CPU.
- counter_sched_out moves higher up in perf_counter.c and gets called
  from __perf_counter_remove_from_context and __perf_counter_exit_task,
  where we used to have essentially the same code.
- __perf_counter_sched_in now goes through the counter list twice, doing
  the pinned counters in the first loop and the non-pinned counters in
  the second loop, in order to give the pinned counters the best chance
  to be scheduled in.

Note that only a group leader can be exclusive or pinned, and that
attribute applies to the whole group.  This avoids some awkwardness in
some corner cases (e.g. where a group leader is closed and the other
group members get added to the context list).  If we want to relax that
restriction later, we can, and it is easier to relax a restriction than
to apply a new one.

This doesn't yet handle the case where a pinned counter is inherited
and goes into error state in the child - the error state is not
propagated up to the parent when the child exits, and arguably it
should.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 85ad259..5b02113 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -36,14 +36,6 @@ void perf_counter_print_debug(void)
 }
 
 /*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
-/*
  * Read one performance monitor counter (PMC).
  */
 static unsigned long read_pmc(int idx)
@@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	 */
 	for (i = n0; i < n0 + n; ++i)
 		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	cpuctx->active_oncpu += n;
 	n = 1;
 	counter_sched_in(group_leader, cpu);
 	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
@@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			++n;
 		}
 	}
-	cpuctx->active_oncpu += n;
 	ctx->nr_active += n;
 
 	return 1;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b21d1ea..7ab8e5f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -86,7 +86,10 @@ struct perf_counter_hw_event {
 				nmi	     :  1, /* NMI sampling        */
 				raw	     :  1, /* raw event type      */
 				inherit	     :  1, /* children inherit it */
-				__reserved_1 : 28;
+				pinned	     :  1, /* must always be on PMU */
+				exclusive    :  1, /* only counter on PMU */
+
+				__reserved_1 : 26;
 
 	u64			__reserved_2;
 };
@@ -141,6 +144,7 @@ struct hw_perf_counter_ops {
  * enum perf_counter_active_state - the states of a counter
  */
 enum perf_counter_active_state {
+	PERF_COUNTER_STATE_ERROR	= -2,
 	PERF_COUNTER_STATE_OFF		= -1,
 	PERF_COUNTER_STATE_INACTIVE	=  0,
 	PERF_COUNTER_STATE_ACTIVE	=  1,
@@ -214,6 +218,7 @@ struct perf_cpu_context {
 	struct perf_counter_context	*task_ctx;
 	int				active_oncpu;
 	int				max_pertask;
+	int				exclusive;
 };
 
 /*
@@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
 
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52f2f52..faf671b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
+	counter->oncpu = -1;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->hw_ops->disable(counter);
-		ctx->nr_active--;
-		cpuctx->active_oncpu--;
-		counter->task = NULL;
-		counter->oncpu = -1;
-	}
+	counter_sched_out(counter, cpuctx, ctx);
+
+	counter->task = NULL;
 	ctx->nr_counters--;
 
 	/*
@@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state <= PERF_COUNTER_STATE_OFF)
 		return 0;
 
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
@@ -223,13 +237,64 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	cpuctx->active_oncpu++;
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
+	if (counter->hw_event.exclusive)
+		cpuctx->exclusive = 1;
+
 	return 0;
 }
 
 /*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software counters can always go on.
+	 */
+	if (is_software_only_group(counter))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * counters can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * counters on the CPU, it can't go on.
+	 */
+	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
+/*
  * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
@@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info)
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
+	int err;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info)
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	counter_sched_in(counter, cpuctx, ctx, cpu);
+	/*
+	 * An exclusive counter can't go on if there are already active
+	 * hardware counters, and no hardware counter can go on if there
+	 * is already an exclusive counter on.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
+	    !group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+	if (err && counter->hw_event.pinned)
+		counter->state = PERF_COUNTER_STATE_ERROR;
 
-	if (!ctx->task && cpuctx->max_pertask)
+	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
 	hw_perf_restore(perf_flags);
@@ -327,22 +405,6 @@ retry:
 }
 
 static void
-counter_sched_out(struct perf_counter *counter,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_counter_context *ctx)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->hw_ops->disable(counter);
-	counter->oncpu = -1;
-
-	cpuctx->active_oncpu--;
-	ctx->nr_active--;
-}
-
-static void
 group_sched_out(struct perf_counter *group_counter,
 		struct perf_cpu_context *cpuctx,
 		struct perf_counter_context *ctx)
@@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter,
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -455,30 +520,6 @@ group_error:
 	return -EAGAIN;
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	if (!is_software_counter(leader))
-		return 0;
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
-		if (!is_software_counter(counter))
-			return 0;
-	return 1;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
@@ -492,22 +533,49 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    !counter->hw_event.pinned)
+			continue;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (group_can_go_on(counter, cpuctx, 1))
+			group_sched_in(counter, cpuctx, ctx, cpu);
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			counter->state = PERF_COUNTER_STATE_ERROR;
+	}
+
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		/*
+		 * Ignore counters in OFF or ERROR state, and
+		 * ignore pinned counters since we did them already.
+		 */
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    counter->hw_event.pinned)
+			continue;
+
+		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
 		 */
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		/*
-		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, don't add any more
-		 * hardware counters.
-		 */
-		if (can_add_hw || is_software_only_group(counter))
+		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 			if (group_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		}
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
@@ -567,8 +635,10 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry)
-		counter->state = PERF_COUNTER_STATE_OFF;
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ERROR)
+			counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	hw_perf_restore(perf_flags);
 
@@ -607,7 +677,7 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_OFF)
+		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->hw_event.disabled = 0;
@@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (count != sizeof(cntval))
 		return -EINVAL;
 
+	/*
+	 * Return end-of-file for a read on a counter that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		return 0;
+
 	mutex_lock(&counter->mutex);
 	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
@@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter	*counter,
 {
 	struct perf_data *irqdata, *usrdata;
 	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res;
+	ssize_t res, res2;
 
 	irqdata = counter->irqdata;
 	usrdata = counter->usrdata;
@@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter	*counter,
 		if (signal_pending(current))
 			break;
 
+		if (counter->state == PERF_COUNTER_STATE_ERROR)
+			break;
+
 		spin_unlock_irq(&counter->waitq.lock);
 		schedule();
 		spin_lock_irq(&counter->waitq.lock);
@@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter	*counter,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock_irq(&counter->waitq.lock);
 
-	if (usrdata->len + irqdata->len < count)
+	if (usrdata->len + irqdata->len < count &&
+	    counter->state != PERF_COUNTER_STATE_ERROR)
 		return -ERESTARTSYS;
 read_pending:
 	mutex_lock(&counter->mutex);
@@ -925,11 +1007,12 @@ read_pending:
 
 	/* Switch irq buffer: */
 	usrdata = perf_switch_irq_data(counter);
-	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+	if (res2 < 0) {
 		if (!res)
 			res = -EFAULT;
 	} else {
-		res = count;
+		res += res2;
 	}
 out:
 	mutex_unlock(&counter->mutex);
@@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		 */
 		if (group_leader->ctx != ctx)
 			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (hw_event.exclusive || hw_event.pinned)
+			goto err_put_context;
 	}
 
 	ret = -EINVAL;
@@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
-			child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-			child_counter->hw_ops->disable(child_counter);
-			cpuctx->active_oncpu--;
-			child_ctx->nr_active--;
-			child_counter->oncpu = -1;
-		}
+		counter_sched_out(child_counter, cpuctx, child_ctx);
 
 		list_del_init(&child_counter->list_entry);
 
-- 
cgit v0.10.2


From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 17 Jan 2009 18:10:22 +1100
Subject: perf_counter: Add counter enable/disable ioctls

Impact: New perf_counter features

This primarily adds a way for perf_counter users to enable and disable
counters and groups.  Enabling or disabling a counter or group also
enables or disables all of the child counters that have been cloned
from it to monitor children of the task monitored by the top-level
counter.  The userspace interface to enable/disable counters is via
ioctl on the counter file descriptor.

Along the way this extends the code that handles child counters to
handle child counter groups properly.  A group with multiple counters
will be cloned to child tasks if and only if the group leader has the
hw_event.inherit bit set - if it is set the whole group is cloned as a
group in the child task.

In order to be able to enable or disable all child counters of a given
top-level counter, we need a way to find them all.  Hence I have added
a child_list field to struct perf_counter, which is the head of the
list of children for a top-level counter, or the link in that list for
a child counter.  That list is protected by the perf_counter.mutex
field.

This also adds a mutex to the perf_counter_context struct.  Previously
the list of counters was protected just by the lock field in the
context, which meant that perf_counter_init_task had to take that lock
and then take whatever lock/mutex protects the top-level counter's
child_list.  But the counter enable/disable functions need to take
that lock in order to traverse the list, then for each counter take
the lock in that counter's context in order to change the counter's
state safely, which will lead to a deadlock.

To solve this, we now have both a mutex and a spinlock in the context,
and taking either is sufficient to ensure the list of counters can't
change - you have to take both before changing the list.  Now
perf_counter_init_task takes the mutex instead of the lock (which
incidentally means that inherit_counter can use GFP_KERNEL instead of
GFP_ATOMIC) and thus avoids the possible deadlock.  Similarly the new
enable/disable functions can take the mutex while traversing the list
of child counters without incurring a possible deadlock when the
counter manipulation code locks the context for a child counter.

We also had an misfeature that the first counter added to a context
would possibly not go on until the next sched-in, because we were
using ctx->nr_active to detect if the context was running on a CPU.
But nr_active is the number of active counters, and if that was zero
(because the context didn't have any counters yet) it would look like
the context wasn't running on a cpu and so the retry code in
__perf_install_in_context wouldn't retry.  So this adds an 'is_active'
field that is set when the context is on a CPU, even if it has no
counters.  The is_active field is only used for task contexts, not for
per-cpu contexts.

If we enable a subsidiary counter in a group that is active on a CPU,
and the arch code can't enable the counter, then we have to pull the
whole group off the CPU.  We do this with group_sched_out, which gets
moved up in the file so it comes before all its callers.  This also
adds similar logic to __perf_install_in_context so that the "all on,
or none" invariant of groups is preserved when adding a new counter to
a group.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7ab8e5f..33ba9fe 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,6 +14,7 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
+#include <asm/ioctl.h>
 
 #ifdef CONFIG_PERF_COUNTERS
 # include <asm/perf_counter.h>
@@ -95,6 +96,12 @@ struct perf_counter_hw_event {
 };
 
 /*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+/*
  * Kernel-internal data types:
  */
 
@@ -173,8 +180,10 @@ struct perf_counter {
 	struct file			*filp;
 
 	struct perf_counter		*parent;
+	struct list_head		child_list;
+
 	/*
-	 * Protect attach/detach:
+	 * Protect attach/detach and child_list:
 	 */
 	struct mutex			mutex;
 
@@ -199,13 +208,21 @@ struct perf_counter {
 struct perf_counter_context {
 #ifdef CONFIG_PERF_COUNTERS
 	/*
-	 * Protect the list of counters:
+	 * Protect the states of the counters in the list,
+	 * nr_active, and the list:
 	 */
 	spinlock_t		lock;
+	/*
+	 * Protect the list of counters.  Locking either mutex or lock
+	 * is sufficient to ensure the list doesn't change; to change
+	 * the list you need to lock both the mutex and the spinlock.
+	 */
+	struct mutex		mutex;
 
 	struct list_head	counter_list;
 	int			nr_counters;
 	int			nr_active;
+	int			is_active;
 	struct task_struct	*task;
 #endif
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index faf671b..1ac18da 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter,
 		cpuctx->exclusive = 0;
 }
 
+static void
+group_sched_out(struct perf_counter *group_counter,
+		struct perf_cpu_context *cpuctx,
+		struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex held.
+ * Must be called with counter->mutex and ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -215,6 +237,99 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
+
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
+
+	/*
+	 * If the counter is on, turn it off.
+	 * If it is in error state, leave it in error state.
+	 */
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		if (counter == counter->group_leader)
+			group_sched_out(counter, cpuctx, ctx);
+		else
+			counter_sched_out(counter, cpuctx, ctx);
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
+
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Disable a counter.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Disable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_disable,
+					 counter, 1);
+		return;
+	}
+
+ retry:
+	task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the counter is still active, we need to retry the cross-call.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Disable a counter and all its children.
+ */
+static void perf_counter_disable_family(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+
+	perf_counter_disable(counter);
+
+	/*
+	 * Lock the mutex to protect the list of children
+	 */
+	mutex_lock(&counter->mutex);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_disable(child);
+	mutex_unlock(&counter->mutex);
+}
+
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
@@ -328,22 +444,39 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	/*
+	 * Don't put the counter on if it is disabled or if
+	 * it is in a group and the group isn't on.
+	 */
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+		goto unlock;
+
+	/*
 	 * An exclusive counter can't go on if there are already active
 	 * hardware counters, and no hardware counter can go on if there
 	 * is already an exclusive counter on.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
-	    !group_can_go_on(counter, cpuctx, 1))
+	if (!group_can_go_on(counter, cpuctx, 1))
 		err = -EEXIST;
 	else
 		err = counter_sched_in(counter, cpuctx, ctx, cpu);
 
-	if (err && counter->hw_event.pinned)
-		counter->state = PERF_COUNTER_STATE_ERROR;
+	if (err) {
+		/*
+		 * This counter couldn't go on.  If it is in a group
+		 * then we have to pull the whole group off.
+		 * If the counter group is pinned then put it in error state.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->hw_event.pinned)
+			leader->state = PERF_COUNTER_STATE_ERROR;
+	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
+ unlock:
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
@@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info)
  * If the counter is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_counter_context *ctx,
@@ -387,7 +522,7 @@ retry:
 	/*
 	 * we need to retry the smp call.
 	 */
-	if (ctx->nr_active && list_empty(&counter->list_entry)) {
+	if (ctx->is_active && list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -404,26 +539,131 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
-static void
-group_sched_out(struct perf_counter *group_counter,
-		struct perf_cpu_context *cpuctx,
-		struct perf_counter_context *ctx)
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
 {
-	struct perf_counter *counter;
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
+	unsigned long flags;
+	int err;
 
-	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	counter_sched_out(group_counter, cpuctx, ctx);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto unlock;
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
 
 	/*
-	 * Schedule out siblings (if any):
+	 * If the counter is in a group and isn't the group leader,
+	 * then don't put it on unless the group is on.
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
-		counter_sched_out(counter, cpuctx, ctx);
+	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+		goto unlock;
 
-	if (group_counter->hw_event.exclusive)
-		cpuctx->exclusive = 0;
+	if (!group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx,
+				       smp_processor_id());
+
+	if (err) {
+		/*
+		 * If this counter can't go on and it's part of a
+		 * group, then the whole group has to come off.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->hw_event.pinned)
+			leader->state = PERF_COUNTER_STATE_ERROR;
+	}
+
+ unlock:
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Enable a counter.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Enable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_enable,
+					 counter, 1);
+		return;
+	}
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto out;
+
+	/*
+	 * If the counter is in error state, clear that first.
+	 * That way, if we see the counter in error state below, we
+	 * know that it has gone back into error state, as distinct
+	 * from the task having been scheduled away before the
+	 * cross-call arrived.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+	spin_lock_irq(&ctx->lock);
+
+	/*
+	 * If the context is active and the counter is still off,
+	 * we need to retry the cross-call.
+	 */
+	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+		goto retry;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+ out:
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Enable a counter and all its children.
+ */
+static void perf_counter_enable_family(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+
+	perf_counter_enable(counter);
+
+	/*
+	 * Lock the mutex to protect the list of children
+	 */
+	mutex_lock(&counter->mutex);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_enable(child);
+	mutex_unlock(&counter->mutex);
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	struct perf_counter *counter;
 	u64 flags;
 
+	spin_lock(&ctx->lock);
+	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
-		return;
+		goto out;
 
-	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
 	hw_perf_restore(flags);
+ out:
 	spin_unlock(&ctx->lock);
 }
 
@@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	u64 flags;
 	int can_add_hw = 1;
 
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
 	if (likely(!ctx->nr_counters))
-		return;
+		goto out;
 
-	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
 
 	/*
@@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		}
 	}
 	hw_perf_restore(flags);
+ out:
 	spin_unlock(&ctx->lock);
 }
 
@@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	mutex_lock(&ctx->mutex);
 	mutex_lock(&counter->mutex);
 
 	perf_counter_remove_from_context(counter);
 	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
+	mutex_unlock(&ctx->mutex);
 
 	kfree(counter);
 
@@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	return events;
 }
 
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct perf_counter *counter = file->private_data;
+	int err = 0;
+
+	switch (cmd) {
+	case PERF_COUNTER_IOC_ENABLE:
+		perf_counter_enable_family(counter);
+		break;
+	case PERF_COUNTER_IOC_DISABLE:
+		perf_counter_disable_family(counter);
+		break;
+	default:
+		err = -ENOTTY;
+	}
+	return err;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
 };
 
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
@@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
+	INIT_LIST_HEAD(&counter->child_list);
+
 	counter->irqdata		= &counter->data[0];
 	counter->usrdata		= &counter->data[1];
 	counter->cpu			= cpu;
@@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	mutex_unlock(&ctx->mutex);
 
 	fput_light(counter_file, fput_needed2);
 
@@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 {
 	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	ctx->task = task;
 }
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 /*
  * inherit a counter from parent task to child task:
  */
-static int
+static struct perf_counter *
 inherit_counter(struct perf_counter *parent_counter,
 	      struct task_struct *parent,
 	      struct perf_counter_context *parent_ctx,
 	      struct task_struct *child,
+	      struct perf_counter *group_leader,
 	      struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *child_counter;
 
+	/*
+	 * Instead of creating recursive hierarchies of counters,
+	 * we link inherited counters back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_counter->parent)
+		parent_counter = parent_counter->parent;
+
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					    parent_counter->cpu, NULL,
-					    GFP_ATOMIC);
+					    parent_counter->cpu, group_leader,
+					    GFP_KERNEL);
 	if (!child_counter)
-		return -ENOMEM;
+		return NULL;
 
 	/*
 	 * Link it up in the child's context:
@@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter,
 	 */
 	atomic_long_inc(&parent_counter->filp->f_count);
 
+	/*
+	 * Link this into the parent counter's child list
+	 */
+	mutex_lock(&parent_counter->mutex);
+	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+
+	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en,dis}able_family.
+	 */
+	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+	else
+		child_counter->state = PERF_COUNTER_STATE_OFF;
+
+	mutex_unlock(&parent_counter->mutex);
+
+	return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *leader;
+	struct perf_counter *sub;
+
+	leader = inherit_counter(parent_counter, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (!leader)
+		return -ENOMEM;
+	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+		if (!inherit_counter(sub, parent, parent_ctx,
+				     child, leader, child_ctx))
+			return -ENOMEM;
+	}
 	return 0;
 }
 
+static void sync_child_counter(struct perf_counter *child_counter,
+			       struct perf_counter *parent_counter)
+{
+	u64 parent_val, child_val;
+
+	parent_val = atomic64_read(&parent_counter->count);
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+
+	/*
+	 * Remove this counter from the parent's list
+	 */
+	mutex_lock(&parent_counter->mutex);
+	list_del_init(&child_counter->child_list);
+	mutex_unlock(&parent_counter->mutex);
+
+	/*
+	 * Release the parent counter, if this was the last
+	 * reference to it.
+	 */
+	fput(parent_counter->filp);
+}
+
 static void
 __perf_counter_exit_task(struct task_struct *child,
 			 struct perf_counter *child_counter,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
-	u64 parent_val, child_val;
+	struct perf_counter *sub, *tmp;
 
 	/*
 	 * If we do not self-reap then we have to wait for the
@@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		counter_sched_out(child_counter, cpuctx, child_ctx);
+		group_sched_out(child_counter, cpuctx, child_ctx);
 
 		list_del_init(&child_counter->list_entry);
 
@@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child,
 	 * that are still around due to the child reference. These
 	 * counters need to be zapped - but otherwise linger.
 	 */
-	if (!parent_counter)
-		return;
-
-	parent_val = atomic64_read(&parent_counter->count);
-	child_val = atomic64_read(&child_counter->count);
-
-	/*
-	 * Add back the child's count to the parent's count:
-	 */
-	atomic64_add(child_val, &parent_counter->count);
-
-	fput(parent_counter->filp);
+	if (parent_counter) {
+		sync_child_counter(child_counter, parent_counter);
+		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
+					 list_entry) {
+			if (sub->parent)
+				sync_child_counter(sub, sub->parent);
+			kfree(sub);
+		}
+	}
 
 	kfree(child_counter);
 }
 
 /*
- * When a child task exist, feed back counter values to parent counters.
+ * When a child task exits, feed back counter values to parent counters.
  *
- * Note: we are running in child context, but the PID is not hashed
+ * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
  */
 void perf_counter_exit_task(struct task_struct *child)
@@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child)
 void perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
-	struct perf_counter *counter, *parent_counter;
+	struct perf_counter *counter;
 	struct task_struct *parent = current;
-	unsigned long flags;
 
 	child_ctx  =  &child->perf_counter_ctx;
 	parent_ctx = &parent->perf_counter_ctx;
@@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child)
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
 	 */
-	spin_lock_irqsave(&parent_ctx->lock, flags);
+	mutex_lock(&parent_ctx->mutex);
 
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
 	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
-		if (!counter->hw_event.inherit || counter->group_leader != counter)
+		if (!counter->hw_event.inherit)
 			continue;
 
-		/*
-		 * Instead of creating recursive hierarchies of counters,
-		 * we link inheritd counters back to the original parent,
-		 * which has a filp for sure, which we use as the reference
-		 * count:
-		 */
-		parent_counter = counter;
-		if (counter->parent)
-			parent_counter = counter->parent;
-
-		if (inherit_counter(parent_counter, parent,
+		if (inherit_group(counter, parent,
 				  parent_ctx, child, child_ctx))
 			break;
 	}
 
-	spin_unlock_irqrestore(&parent_ctx->lock, flags);
+	mutex_unlock(&parent_ctx->mutex);
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
@@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info)
 
 	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
 		__perf_counter_remove_from_context(counter);
-
 }
 static void perf_counter_exit_cpu(int cpu)
 {
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+	mutex_unlock(&ctx->mutex);
 }
 #else
 static inline void perf_counter_exit_cpu(int cpu) { }
-- 
cgit v0.10.2


From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 10:13:01 +0100
Subject: perfcounters: throttle on too high IRQ rates

Starting kerneltop with only -c 100 seems to be a bad idea, it can
easily lock the system due to perfcounter IRQ overload.

So add throttling: if a new IRQ arrives in a shorter than
PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them
from the next timer tick.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 7b434e5..849c230 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
+
+	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9376771..1a040b1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	u64			last_interrupt;
+	u64			global_enable;
+	int			throttled;
 };
 
 /*
@@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, saved_global;
-	struct cpu_hw_counters *cpuc;
+	u64 ack, status, now;
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 
 	/* Disable counters globally */
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
+	now = sched_clock();
+	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
+		cpuc->throttled = 1;
+	cpuc->last_interrupt = now;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
@@ -533,9 +539,29 @@ again:
 		goto again;
 out:
 	/*
-	 * Restore - do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	if (!cpuc->throttled)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+}
+
+void perf_counter_unthrottle(void)
+{
+	struct cpu_hw_counters *cpuc;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	if (cpuc->throttled) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		cpuc->throttled = 0;
+	}
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 33ba9fe..91f1ca4 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
+extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern int perf_counter_task_disable(void);
@@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
+#define PERFMON_MIN_PERIOD_NS 10000
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
+static inline void perf_counter_unthrottle(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
-- 
cgit v0.10.2


From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:36:16 +0100
Subject: perfcounters: ratelimit performance counter interrupts

Ratelimit performance counter interrupts to 100KHz per CPU.

This replaces the irq-delta-time based method.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1a040b1..a56d4cf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	u64			last_interrupt;
+	unsigned long		interrupts;
 	u64			global_enable;
-	int			throttled;
 };
 
 /*
@@ -471,13 +470,18 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 }
 
 /*
+ * Maximum interrupt frequency of 100KHz per CPU
+ */
+#define PERFMON_MAX_INTERRUPTS 100000/HZ
+
+/*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, now;
+	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
@@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	now = sched_clock();
-	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
-		cpuc->throttled = 1;
-	cpuc->last_interrupt = now;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
 		goto out;
@@ -541,13 +540,14 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (!cpuc->throttled)
+	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
+	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -556,12 +556,15 @@ void perf_counter_unthrottle(void)
 		return;
 
 	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
-	if (cpuc->throttled) {
+	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-		cpuc->throttled = 0;
 	}
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
+	if (unlikely(cpuc->global_enable && !global_enable))
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 91f1ca4..f55381f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
-#define PERFMON_MIN_PERIOD_NS 10000
-
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
-- 
cgit v0.10.2


From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:16:53 +0100
Subject: perfcounters fix section mismatch warning in
 perf_counter.c::perf_counters_lapic_init()

Fix:

WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init()
The function pmc_generic_enable() references
the function __cpuinit perf_counters_lapic_init().
This is often because pmc_generic_enable lacks a __cpuinit
annotation or the annotation of perf_counters_lapic_init is wrong.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a56d4cf..46c436c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	local_irq_restore(flags);
 }
 
-void __cpuinit perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-- 
cgit v0.10.2


From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 25 Jan 2009 02:38:09 -0800
Subject: x86: make irqinit_32.c more like irqinit_64.c, v2

Impact: cleanup

1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit
2. move the apic_intr_init() call before set gate with interrupt[i]
3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector.

[ v2: should use !test_bit() instead of test_bit() with 32bit ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index c56496f..ddf3eb7 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
 {
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-	}
-
-
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -170,8 +150,13 @@ void __init native_init_IRQ(void)
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+}
 
+static void __init apic_intr_init(void)
+{
 #ifdef CONFIG_X86_LOCAL_APIC
+	smp_intr_init();
+
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -181,12 +166,37 @@ void __init native_init_IRQ(void)
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 # endif
-#endif
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+# ifdef CONFIG_X86_MCE_P4THERMAL
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+# endif
 #endif
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	apic_intr_init();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 6a71bfc..16e1fc6 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -162,6 +162,9 @@ void __init native_init_IRQ(void)
 	int i;
 
 	init_ISA_irqs();
+
+	apic_intr_init();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
@@ -169,12 +172,10 @@ void __init native_init_IRQ(void)
 	 */
 	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
+		if (!test_bit(vector, used_vectors))
 			set_intr_gate(vector, interrupt[i]);
 	}
 
-	apic_intr_init();
-
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ed5aee5..d36a502 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -979,8 +979,13 @@ void __init trap_init(void)
 #endif
 	set_intr_gate(19, &simd_coprocessor_error);
 
+	/* Reserve all the builtin and the syscall vector: */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+		set_bit(i, used_vectors);
+
 #ifdef CONFIG_IA32_EMULATION
 	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -997,17 +1002,9 @@ void __init trap_init(void)
 	}
 
 	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
-	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
cgit v0.10.2


From 65d370862f64973611a271ced61864b5f9bb6fc0 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 29 Jan 2009 14:06:52 +0100
Subject: perfcounters: fix refcounting bug

don't kfree in use counters.

Running...

	while true; do perfstat -e 1 -c true; done

...on all cores for a while doesn't seem to be eating ram, and my oops
is gone.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1ac18da..f27a7e9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1934,7 +1934,8 @@ __perf_counter_exit_task(struct task_struct *child,
 		}
 	}
 
-	kfree(child_counter);
+	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
+		kfree(child_counter);
 }
 
 /*
-- 
cgit v0.10.2


From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 1 Feb 2009 22:07:39 +0530
Subject: x86: irqinit_32.c fix compilation warning

Fix:

  arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index ddf3eb7..520e6c1 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -154,9 +154,9 @@ static void __init smp_intr_init(void)
 
 static void __init apic_intr_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	smp_intr_init();
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
-- 
cgit v0.10.2


From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 4 Feb 2009 17:11:34 +0100
Subject: perfcounters: fix "perf counters kill oprofile" bug

With oprofile as a module, and unloaded by profiling script,
both oprofile and kerneltop work fine.. unless you leave kerneltop
running when you start profiling, then you may see badness.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46c436c..8bb2133 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-	.notifier_call		= perf_counter_nmi_handler
+	.notifier_call		= perf_counter_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
 };
 
 void __init init_hw_perf_counters(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864a..c638685 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
 
 	switch (val) {
 	case DIE_NMI:
-		if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
-			ret = NOTIFY_STOP;
+	case DIE_NMI_IPI:
+		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		ret = NOTIFY_STOP;
 		break;
 	default:
 		break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
 static struct notifier_block profile_exceptions_nb = {
 	.notifier_call = profile_exceptions_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = 2
 };
 
 static int nmi_setup(void)
-- 
cgit v0.10.2


From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Feb 2009 15:23:08 +0100
Subject: perfcounters: fix "perf counters kills oprofile" bug, v2

Impact: fix kernel crash

Both oprofile and perfcounters register an NMI die handler, but only one
can handle the NMI.  Conveniently, oprofile unregisters it's notifier
when not actively in use, so setting it's notifier priority higher than
perfcounter's allows oprofile to borrow the NMI for the duration of it's
run.  Tested/works both as module and built-in.

While testing, I found that if kerneltop was generating NMIs at very
high frequency, the kernel may panic when oprofile registered it's
handler.  This turned out to be because oprofile registers it's handler
before reset_value has been allocated, so if an NMI comes in while it's
still setting up, kabOom.  Rather than try more invasive changes, I
followed the lead of other places in op_model_ppro.c, and simply
returned in that highly unlikely event.  (debug warnings attached)

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 07c9145..85eb626 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 	u64 val;
 	int i;
 
+	/*
+	 * This can happen if perf counters are in use when
+	 * we steal the die notifier NMI.
+	 */
+	if (unlikely(!reset_value))
+		goto out;
+
 	for (i = 0 ; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 		}
 	}
 
+out:
 	/* Only P6 based Pentium M need to re-unmask the apic vector but it
 	 * doesn't hurt other P6 variant */
 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-- 
cgit v0.10.2


From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 9 Feb 2009 22:42:47 +1100
Subject: perf_counters: make software counters work as per-cpu counters

Impact: kernel crash fix

Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software
counter as a per-cpu counter would reliably crash the system, because
it calls __task_delta_exec with a null pointer.  The page fault,
context switch and cpu migration counters also won't function
correctly as per-cpu counters since they reference the current task.

This fixes the problem by redirecting the task_clock counter to the
cpu_clock counter when used as a per-cpu counter, and by implementing
per-cpu page fault, context switch and cpu migration counters.

Along the way, this:

- Initializes counter->ctx earlier, in perf_counter_alloc, so that
  sw_perf_counter_init can use it
- Adds code to kernel/sched.c to count task migrations into each
  cpu, in rq->nr_migrations_in
- Exports the per-cpu context switch and task migration counts
  via new functions added to kernel/sched.c
- Makes sure that if sw_perf_counter_init fails, we don't try to
  initialize the counter as a hardware counter.  Since the user has
  passed a negative, non-raw event type, they clearly don't intend
  for it to be interpreted as a hardware event.

Reported-by: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b85b10a..1e5f700 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern u64 cpu_nr_switches(int cpu);
+extern u64 cpu_nr_migrations(int cpu);
 
 struct seq_file;
 struct cfs_rq;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f27a7e9..544193c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -20,6 +20,8 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
-	counter->ctx = ctx;
 	if (!task) {
 		/*
 		 * Per cpu counters are installed via an smp call and
@@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults()	0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->maj_flt + curr->min_flt;
+	if (curr)
+		return curr->maj_flt + curr->min_flt;
+	return cpu_page_faults();
 }
 
 static void page_faults_perf_counter_update(struct perf_counter *counter)
@@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults();
+	now = get_page_faults(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * page-faults is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read		= page_faults_perf_counter_read,
 };
 
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->nvcsw + curr->nivcsw;
+	if (curr)
+		return curr->nvcsw + curr->nivcsw;
+	return cpu_nr_switches(smp_processor_id());
 }
 
 static void context_switches_perf_counter_update(struct perf_counter *counter)
@@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches();
+	now = get_context_switches(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * ->nvcsw + curr->nivcsw is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
 	return 0;
 }
 
@@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read		= context_switches_perf_counter_read,
 };
 
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
-	return current->se.nr_migrations;
+	struct task_struct *curr = counter->ctx->task;
+
+	if (curr)
+		return curr->se.nr_migrations;
+	return cpu_nr_migrations(smp_processor_id());
 }
 
 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
@@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations();
+	now = get_cpu_migrations(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * se.nr_migrations is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
 	return 0;
 }
 
@@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter)
 		hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		hw_ops = &perf_ops_task_clock;
+		/*
+		 * If the user instantiates this as a per-cpu counter,
+		 * use the cpu_clock counter instead.
+		 */
+		if (counter->ctx->task)
+			hw_ops = &perf_ops_task_clock;
+		else
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 		hw_ops = &perf_ops_page_faults;
@@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
+		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
@@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
+	counter->ctx			= ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops)
+	else
 		hw_ops = hw_perf_counter_init(counter);
 
 	if (!hw_ops) {
@@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+				     GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
@@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter,
 		parent_counter = parent_counter->parent;
 
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					    parent_counter->cpu, group_leader,
-					    GFP_KERNEL);
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
 	if (!child_counter)
 		return NULL;
 
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->ctx = child_ctx;
 	child_counter->task = child;
 	list_add_counter(child_counter, child_ctx);
 	child_ctx->nr_counters++;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8db1a4c..173768f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -558,6 +558,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2811,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+	return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
-- 
cgit v0.10.2


From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 9 Feb 2009 07:38:50 +0100
Subject: perf_counters: account NMI interrupts

I noticed that kerneltop interrupts were accounted as NMI, but not their
perf counter origin.

Account NMI performance counter interrupts.

Signed-off-by: Mike Galbraith  <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 arch/x86/kernel/cpu/perf_counter.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8bb2133..9901e46 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		goto out;
 
 again:
+	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
@@ -570,7 +571,6 @@ void perf_counter_unthrottle(void)
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
-	inc_irq_stat(apic_perf_irqs);
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	__smp_perf_counter_interrupt(regs, 0);
 
-- 
cgit v0.10.2


From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 11 Feb 2009 14:35:35 +1100
Subject: perf_counters: allow users to count user, kernel and/or hypervisor
 events

Impact: new perf_counter feature

This extends the perf_counter_hw_event struct with bits that specify
that events in user, kernel and/or hypervisor mode should not be
counted (i.e. should be excluded), and adds code to program the PMU
mode selection bits accordingly on x86 and powerpc.

For software counters, we don't currently have the infrastructure to
distinguish which mode an event occurs in, so we currently fail the
counter initialization if the setting of the hw_event.exclude_* bits
would require us to distinguish.  Context switches and CPU migrations
are currently considered to occur in kernel mode.

On x86, this changes the previous policy that only root can count
kernel events.  Now non-root users can count kernel events or exclude
them.  Non-root users still can't use NMI events, though.  On x86 we
don't appear to have any way to control whether hypervisor events are
counted or not, so hw_event.exclude_hv is ignored.

On powerpc, the selection of whether to count events in user, kernel
and/or hypervisor mode is PMU-wide, not per-counter, so this adds a
check that the hw_event.exclude_* settings are the same as other events
on the PMU.  Counters being added to a group have to have the same
settings as the other hardware counters in the group.  Counters and
groups can only be enabled in hw_perf_group_sched_in or power_perf_enable
if they have the same settings as any other counters already on the
PMU.  If we are not running on a hypervisor, the exclude_hv setting
is ignored (by forcing it to 0) since we can't ever get any
hypervisor events.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5b02113..bd6ba85 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -16,6 +16,7 @@
 #include <asm/reg.h>
 #include <asm/pmc.h>
 #include <asm/machdep.h>
+#include <asm/firmware.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	return 0;
 }
 
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+{
+	int eu, ek, eh;
+	int i, n;
+	struct perf_counter *counter;
+
+	n = n_prev + n_new;
+	if (n <= 1)
+		return 0;
+
+	eu = ctrs[0]->hw_event.exclude_user;
+	ek = ctrs[0]->hw_event.exclude_kernel;
+	eh = ctrs[0]->hw_event.exclude_hv;
+	if (n_prev == 0)
+		n_prev = 1;
+	for (i = n_prev; i < n; ++i) {
+		counter = ctrs[i];
+		if (counter->hw_event.exclude_user != eu ||
+		    counter->hw_event.exclude_kernel != ek ||
+		    counter->hw_event.exclude_hv != eh)
+			return -EAGAIN;
+	}
+	return 0;
+}
+
 static void power_perf_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
@@ -324,6 +355,20 @@ void hw_perf_restore(u64 disable)
 	}
 
 	/*
+	 * Add in MMCR0 freeze bits corresponding to the
+	 * hw_event.exclude_* bits for the first counter.
+	 * We have already checked that all counters have the
+	 * same values for these bits as the first counter.
+	 */
+	counter = cpuhw->counter[0];
+	if (counter->hw_event.exclude_user)
+		cpuhw->mmcr[0] |= MMCR0_FCP;
+	if (counter->hw_event.exclude_kernel)
+		cpuhw->mmcr[0] |= MMCR0_FCS;
+	if (counter->hw_event.exclude_hv)
+		cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+	/*
 	 * Write the new configuration to MMCR* with the freeze
 	 * bit set and set the hardware counters to their initial values.
 	 * Then unfreeze the counters.
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			   &cpuhw->counter[n0], &cpuhw->events[n0]);
 	if (n < 0)
 		return -EAGAIN;
+	if (check_excludes(cpuhw->counter, n0, n))
+		return -EAGAIN;
 	if (power_check_constraints(cpuhw->events, n + n0))
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
@@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
+	if (check_excludes(cpuhw->counter, n0, 1))
+		goto out;
 	if (power_check_constraints(cpuhw->events, n0 + 1))
 		goto out;
 
@@ -555,6 +604,17 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->hw.idx = 0;
 
 	/*
+	 * If we are not running on a hypervisor, force the
+	 * exclude_hv bit to 0 so that we don't care what
+	 * the user set it to.  This also means that we don't
+	 * set the MMCR0_FCHV bit, which unconditionally freezes
+	 * the counters on the PPC970 variants used in Apple G5
+	 * machines (since MSR.HV is always 1 on those machines).
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		counter->hw_event.exclude_hv = 0;
+	
+	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
 	 * hasn't been linked into its leader's sibling list at this point.
@@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		if (n < 0)
 			return NULL;
 	}
-	events[n++] = ev;
-	if (power_check_constraints(events, n))
+	events[n] = ev;
+	if (check_excludes(ctrs, n, 1))
+		return NULL;
+	if (power_check_constraints(events, n + 1))
 		return NULL;
 
-	counter->hw.config = events[n - 1];
+	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 	return &power_perf_ops;
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9901e46..383d4c6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -EINVAL;
 
 	/*
-	 * Count user events, and generate PMC IRQs:
+	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
-	 * If privileged enough, count OS events too, and allow
-	 * NMI events as well:
+	 * Count user and OS events unless requested not to.
 	 */
-	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN)) {
+	if (!hw_event->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event->nmi)
-			hwc->nmi = 1;
-	}
+
+	/*
+	 * If privileged enough, allow NMI events:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
 	/*
@@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	int err;
 
 	/*
-	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
-	 * and enable ring-0 counting if allowed:
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
 	 */
-	bits = 0x8ULL | 0x2ULL;
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
 	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
 		bits |= 0x1;
 	bits <<= (idx * 4);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f55381f..c83f51d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -83,14 +83,17 @@ struct perf_counter_hw_event {
 	u64			irq_period;
 	u32			record_type;
 
-	u32			disabled     :  1, /* off by default      */
-				nmi	     :  1, /* NMI sampling        */
-				raw	     :  1, /* raw event type      */
-				inherit	     :  1, /* children inherit it */
-				pinned	     :  1, /* must always be on PMU */
-				exclusive    :  1, /* only counter on PMU */
-
-				__reserved_1 : 26;
+	u32			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+
+				__reserved_1 : 23;
 
 	u64			__reserved_2;
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 544193c..89d5e3f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter)
 {
 	const struct hw_perf_counter_ops *hw_ops = NULL;
 
+	/*
+	 * Software counters (currently) can't in general distinguish
+	 * between user, kernel and hypervisor events.
+	 * However, context switches and cpu migrations are considered
+	 * to be kernel events, and page faults are never hypervisor
+	 * events.
+	 */
 	switch (counter->hw_event.type) {
 	case PERF_COUNT_CPU_CLOCK:
-		hw_ops = &perf_ops_cpu_clock;
+		if (!(counter->hw_event.exclude_user ||
+		      counter->hw_event.exclude_kernel ||
+		      counter->hw_event.exclude_hv))
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
+		if (counter->hw_event.exclude_user ||
+		    counter->hw_event.exclude_kernel ||
+		    counter->hw_event.exclude_hv)
+			break;
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
-		hw_ops = &perf_ops_page_faults;
+		if (!(counter->hw_event.exclude_user ||
+		      counter->hw_event.exclude_kernel))
+			hw_ops = &perf_ops_page_faults;
 		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		hw_ops = &perf_ops_context_switches;
+		if (!counter->hw_event.exclude_kernel)
+			hw_ops = &perf_ops_context_switches;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
-		hw_ops = &perf_ops_cpu_migrations;
+		if (!counter->hw_event.exclude_kernel)
+			hw_ops = &perf_ops_cpu_migrations;
 		break;
 	default:
 		break;
-- 
cgit v0.10.2


From 5af759176cc767e7426f89764bde4996ebaaf419 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 11 Feb 2009 10:53:37 +0100
Subject: perfcounters: fix use after free in perf_release()

running...

  while true; do
    foo -d 1 -f 1 -c 100000 & sleep 1
    kerneltop -d 1 -f 1 -e 1 -c 25000 -p `pidof foo`
  done

  while true; do
    killall foo; killall kerneltop; sleep 2
  done

...in two shells with SLUB_DEBUG enabled produces flood of:
BUG task_struct: Poison overwritten.

Fix the use-after-free bug in perf_release().

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 89d5e3f..e0576c3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1145,12 +1145,12 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_lock(&counter->mutex);
 
 	perf_counter_remove_from_context(counter);
-	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
 	kfree(counter);
+	put_context(ctx);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 4bcf349a0f90d1e69eb35c6df0fa285c886c1cd6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 11 Feb 2009 13:53:19 +0100
Subject: perfcounters: fix refcounting bug, take 2

Only free child_counter if it has a parent; if it doesn't, then it
has a file pointing to it and we'll free it in perf_release.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e0576c3..fcefb0a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1958,14 +1958,13 @@ __perf_counter_exit_task(struct task_struct *child,
 		sync_child_counter(child_counter, parent_counter);
 		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
 					 list_entry) {
-			if (sub->parent)
+			if (sub->parent) {
 				sync_child_counter(sub, sub->parent);
-			kfree(sub);
+				kfree(sub);
+			}
 		}
-	}
-
-	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
 		kfree(child_counter);
+	}
 }
 
 /*
-- 
cgit v0.10.2


From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 13 Feb 2009 22:10:34 +1100
Subject: perfcounters: make context switch and migration software counters
 work again

Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the
context switch and migration software counters to report zero always.
With that commit, the software counters only count events that occur
between sched-in and sched-out for a task.  This is necessary for the
counter enable/disable prctls and ioctls to work.  However, the
context switch and migration counts are incremented after sched-out
for one task and before sched-in for the next.  Since the increment
doesn't occur while a task is scheduled in (as far as the software
counters are concerned) it doesn't count towards any counter.

Thus the context switch and migration counters need to count events
that occur at any time, provided the counter is enabled, not just
those that occur while the task is scheduled in (from the perf_counter
subsystem's point of view).  The problem though is that the software
counter code can't tell the difference between being enabled and being
scheduled in, and between being disabled and being scheduled out,
since we use the one pair of enable/disable entry points for both.
That is, the high-level disable operation simply arranges for the
counter to not be scheduled in any more, and the high-level enable
operation arranges for it to be scheduled in again.

One way to solve this would be to have sched_in/out operations in the
hw_perf_counter_ops struct as well as enable/disable.  However, this
takes a simpler approach: it adds a 'prev_state' field to the
perf_counter struct that allows a counter's enable method to know
whether the counter was previously disabled or just inactive
(scheduled out), and therefore whether the enable method is being
called as a result of a high-level enable or a schedule-in operation.

This then allows the context switch, migration and page fault counters
to reset their hw.prev_count value in their enable functions only if
they are called as a result of a high-level enable operation.
Although page faults would normally only occur while the counter is
scheduled in, this changes the page fault counter code too in case
there are ever circumstances where page faults get counted against a
task while its counters are not scheduled in.

Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c83f51d..32cd1ac 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -173,6 +173,7 @@ struct perf_counter {
 	const struct hw_perf_counter_ops *hw_ops;
 
 	enum perf_counter_active_state	state;
+	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
 	struct perf_counter_hw_event	hw_event;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fcefb0a..ad62965 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info)
 
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
+	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
@@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter,
 	if (ret)
 		return ret < 0 ? ret : 0;
 
+	group_counter->prev_state = group_counter->state;
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
 
@@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		counter->prev_state = counter->state;
 		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 			partial_group = counter;
 			goto group_error;
@@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	u64 now = task_clock_perf_counter_val(counter, 0);
-
-	atomic64_set(&counter->hw.prev_count, now);
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     task_clock_perf_counter_val(counter, 0));
 
 	return 0;
 }
@@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     get_context_switches(counter));
 	return 0;
 }
 
@@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     get_cpu_migrations(counter));
 	return 0;
 }
 
-- 
cgit v0.10.2


From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Feb 2009 01:08:17 +0100
Subject: perfcounters: remove duplicate definition of LOCAL_PERF_VECTOR

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index b66b518..b07278c 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,11 +112,6 @@
 #define LOCAL_PERF_VECTOR		0xee
 
 /*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR	0xee
-
-/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
-- 
cgit v0.10.2


From 37a25424252b6cff4dd4b1937ab6a1dbfcadabcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Feb 2009 15:32:23 +0100
Subject: perfcounters: fix acpi_idle_do_entry() workaround

Fix merge error in drivers/acpi/processor_idle.c. This
resulted in non-working perfcounters on certain Nehalem
systems.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 259f6e8..08def2f 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -826,12 +826,9 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
 	u64 perf_flags;
 
-	u64 pctrl;
-
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
 	perf_flags = hw_perf_save_disable();
-	pctrl = hw_perf_save_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -847,7 +844,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
 	hw_perf_restore(perf_flags);
-	hw_perf_restore(pctrl);
 	start_critical_timings();
 }
 
-- 
cgit v0.10.2


From d095cd46dac104e4d2a4967c7c19b55a12f78240 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Feb 2009 23:01:28 +1100
Subject: perfcounters/powerpc: Make exclude_kernel bit work on Apple G5
 processors

Currently, setting hw_event.exclude_kernel does nothing on the PPC970
variants used in Apple G5 machines, because they have the HV (hypervisor)
bit in the MSR forced to 1, so as far as the PMU is concerned, the
kernel runs in hypervisor mode.  Thus we have to use the MMCR0_FCHV
(freeze counters in hypervisor mode) bit rather than the MMCR0_FCS
(freeze counters in supervisor mode) bit.

This checks the MSR.HV bit at startup, and if it is set, we set the
freeze_counters_kernel variable to MMCR0_FCHV (it was initialized to
MMCR0_FCS).  We then use that whenever we need to exclude kernel events.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd6ba85..6e27913 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -32,6 +32,15 @@ DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 struct power_pmu *ppmu;
 
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
 void perf_counter_print_debug(void)
 {
 }
@@ -364,7 +373,7 @@ void hw_perf_restore(u64 disable)
 	if (counter->hw_event.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
 	if (counter->hw_event.exclude_kernel)
-		cpuhw->mmcr[0] |= MMCR0_FCS;
+		cpuhw->mmcr[0] |= freeze_counters_kernel;
 	if (counter->hw_event.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
@@ -606,10 +615,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * If we are not running on a hypervisor, force the
 	 * exclude_hv bit to 0 so that we don't care what
-	 * the user set it to.  This also means that we don't
-	 * set the MMCR0_FCHV bit, which unconditionally freezes
-	 * the counters on the PPC970 variants used in Apple G5
-	 * machines (since MSR.HV is always 1 on those machines).
+	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
@@ -841,6 +847,13 @@ static int init_perf_counters(void)
 		ppmu = &power6_pmu;
 		break;
 	}
+
+	/*
+	 * Use FCHV to ignore kernel events if MSR.HV is set.
+	 */
+	if (mfmsr() & MSR_HV)
+		freeze_counters_kernel = MMCR0_FCHV;
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 742bd95ba96e19b3f7196c3a0834ebc17c8ba006 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 24 Feb 2009 11:33:56 +1100
Subject: perfcounters/powerpc: Add support for POWER5 processors

This adds the back-end for the PMU on the POWER5 processor.  This knows
how to use the fixed-function PMC5 and PMC6 (instructions completed and
run cycles).  Unlike POWER6, PMC5/6 obey the freeze conditions and can
generate interrupts, so their use doesn't impose any extra restrictions.

POWER5+ is different and is not supported by this patch.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 7c941ec..b4c6f46 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
+				   power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6e27913..112332d 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -843,6 +844,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case PV_POWER5:
+		ppmu = &power5_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 0000000..379ed10
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,475 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><>
+ *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     37: UC3 error 0x20_0000_0000
+ *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ *     35: ISU0 events needed 0x08_0000_0000
+ *     34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ *     33: PS1 error 0x2_0000_0000
+ *     31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ *     30: PS2 error 0x4000_0000
+ *     28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ *     0-11: Count of events needing PMC1..PMC6
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
+	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
+	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
+	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
+	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
+	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc <= 4)
+			grp = (pmc - 1) >> 1;
+		else if (event != 0x500009 && event != 0x600005)
+			return -1;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2; bytes 1 and 3 on PMC3/4.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2 field */
+		mask  |= 0x200000000ull;
+		value |= 0x080000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4 field */
+		mask  |= 0x40000000ull;
+		value |= 0x10000000ull;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
+	{ 0x100009, 0x200009, 0x500009 },	/* PM_INST_CMPL */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+	return -1;
+}
+
+static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5_compute_mmcr(unsigned int event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 6)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2 vs 3/4 use */
+			if (pmc <= 4)
+				++pmc_grp_use[(pmc - 1) >> 1];
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (isbus) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 2) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else if (pmc <= 4) {
+			/* Direct event */
+			--pmc;
+			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000090000555ull,
+	.test_adder = 0x3000490000000ull,
+	.compute_mmcr = power5_compute_mmcr,
+	.get_constraint = power5_get_constraint,
+	.get_alternatives = power5_get_alternatives,
+	.disable_pmc = power5_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5_generic_events),
+	.generic_events = power5_generic_events,
+};
-- 
cgit v0.10.2


From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 26 Feb 2009 22:43:46 +1100
Subject: perfcounters: fix a few minor cleanliness issues

This fixes three issues noticed by Arnd Bergmann:

- Add #ifdef __KERNEL__ and move some things around in perf_counter.h
  to make sure only the bits that userspace needs are exported to
  userspace.

- Use __u64, __s64, __u32 types in the structs exported to userspace
  rather than u64, s64, u32.

- Make the sys_perf_counter_open syscall available to the SPUs on
  Cell platforms.

And one issue that I noticed in looking at the code again:

- Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get
  the proper handling of int arguments on ppc64 (and some other 64-bit
  architectures).

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 4c8095f..d312eec 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
-SYSCALL(perf_counter_open)
+SYSCALL_SPU(perf_counter_open)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 32cd1ac..186efaf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -13,20 +13,8 @@
 #ifndef _LINUX_PERF_COUNTER_H
 #define _LINUX_PERF_COUNTER_H
 
-#include <asm/atomic.h>
-#include <asm/ioctl.h>
-
-#ifdef CONFIG_PERF_COUNTERS
-# include <asm/perf_counter.h>
-#endif
-
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/spinlock.h>
-
-struct task_struct;
+#include <linux/types.h>
+#include <linux/ioctl.h>
 
 /*
  * User-space ABI bits:
@@ -78,12 +66,12 @@ enum perf_counter_record_type {
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	s64			type;
+	__s64			type;
 
-	u64			irq_period;
-	u32			record_type;
+	__u64			irq_period;
+	__u32			record_type;
 
-	u32			disabled       :  1, /* off by default        */
+	__u32			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
 				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
@@ -95,7 +83,7 @@ struct perf_counter_hw_event {
 
 				__reserved_1 : 23;
 
-	u64			__reserved_2;
+	__u64			__reserved_2;
 };
 
 /*
@@ -104,10 +92,24 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
 
+#ifdef __KERNEL__
 /*
- * Kernel-internal data types:
+ * Kernel-internal data types and definitions:
  */
 
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 #endif
 
+#endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 88255d32..28ef2be 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *);
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd);
+asmlinkage long sys_perf_counter_open(
+		const struct perf_counter_hw_event __user *hw_event_uptr,
+		pid_t pid, int cpu, int group_fd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ad62965..16b14ba 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
-		      pid_t pid, int cpu, int group_fd)
+SYSCALL_DEFINE4(perf_counter_open,
+		const struct perf_counter_hw_event __user *, hw_event_uptr,
+		pid_t, pid, int, cpu, int, group_fd)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
-- 
cgit v0.10.2


From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 18:09:09 +0530
Subject: x86: prepare perf_counter to add more cpus

Introduced  struct pmc_x86_ops to add more cpus.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 383d4c6..a3c8852 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -3,6 +3,7 @@
  *
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *  Copyright(C) 2009 Jaswinder Singh Rajput
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -38,10 +39,24 @@ struct cpu_hw_counters {
 };
 
 /*
- * Intel PerfMon v3. Used on Core2 and later.
+ * struct pmc_x86_ops - performance counter x86 ops
  */
+struct pmc_x86_ops {
+	u64 (*save_disable_all)		(void);
+	void (*restore_all)		(u64 ctrl);
+	unsigned eventsel;
+	unsigned perfctr;
+	int (*event_map)		(int event);
+	int max_events;
+};
+
+static struct pmc_x86_ops *pmc_ops;
+
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
 static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
@@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static int pmc_intel_event_map(int event)
+{
+	return intel_perfmon_event_map[event];
+}
 
 /*
  * Propagate counter elapsed time into the generic counter.
@@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (hw_event->raw) {
 		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event->type >= max_intel_perfmon_events)
+		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event->type];
+		hwc->config |= pmc_ops->event_map(hw_event->type);
 	}
 	counter->wakeup_pending = 0;
 
 	return 0;
 }
 
-u64 hw_perf_save_disable(void)
+static u64 pmc_intel_save_disable_all(void)
 {
 	u64 ctrl;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
+
+u64 hw_perf_save_disable(void)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->save_disable_all();
+}
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+static void pmc_intel_restore_all(u64 ctrl)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	pmc_ops->restore_all(ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -339,8 +367,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
-		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+		hwc->config_base  = pmc_ops->eventsel;
+		hwc->counter_base = pmc_ops->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -386,8 +414,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
+		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
+		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-void __init init_hw_perf_counters(void)
+static struct pmc_x86_ops pmc_intel_ops = {
+	.save_disable_all	= pmc_intel_save_disable_all,
+	.restore_all		= pmc_intel_restore_all,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= pmc_intel_event_map,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+};
+
+static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
 	unsigned int ebx;
 	unsigned int unused;
 	union cpuid10_edx edx;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return;
+		return NULL;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
 	nr_counters_generic = eax.split.num_counters;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+
+	return &pmc_intel_ops;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		pmc_ops = pmc_intel_init();
+		break;
+	}
+	if (!pmc_ops)
+		return;
+
+	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
 
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
-
-	nr_counters_fixed = edx.split.num_counters_fixed;
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-- 
cgit v0.10.2


From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 20:15:14 +0530
Subject: x86: AMD Support for perf_counter

Supported basic performance counter for AMD K7 and later:

$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null

 Performance counter stats for 'ls':

      12.298610  task clock ticks     (msecs)

        3298477  CPU cycles           (events)
        1406354  instructions         (events)
         749035  cache references     (events)
          16939  cache misses         (events)
         100589  branches             (events)
          11159  branch misses        (events)
       7.627540  cpu clock ticks      (msecs)
      12.298610  task clock ticks     (msecs)
            500  pagefaults           (events)
              6  context switches     (events)
              3  CPU migrations       (events)

 Wall-clock time elapsed:     8.672290 msecs

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5..edcde52 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
+	/* Enable Performance counter for K7 and later */
+	if (c->x86 > 6 && c->x86 <= 0x11)
+		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a3c8852..266618a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -74,6 +74,24 @@ static int pmc_intel_event_map(int event)
 }
 
 /*
+ * AMD Performance Monitor K7 and later.
+ */
+static const int amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
+  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+static int pmc_amd_event_map(int event)
+{
+	return amd_perfmon_event_map[event];
+}
+
+/*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
  * Returns the delta events processed.
@@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-		hwc->irq_period = 0x7FFFFFFF;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
+			hwc->irq_period = 0x7FFFFFFF;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
+static u64 pmc_amd_save_disable_all(void)
+{
+	int idx;
+	u64 val, ctrl = 0;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			ctrl |= (1 << idx);
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+
+	return ctrl;
+}
+
 u64 hw_perf_save_disable(void)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
+static void pmc_amd_restore_all(u64 ctrl)
+{
+	u64 val;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		if (ctrl & (1 << idx)) {
+			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
+	}
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -1;
+
 	if (unlikely(hwc->nmi))
 		return -1;
 
@@ -401,6 +453,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -411,6 +464,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+	}
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -588,6 +642,9 @@ void perf_counter_unthrottle(void)
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return;
+
 	if (unlikely(!perf_counters_initialized))
 		return;
 
@@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = {
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
+static struct pmc_x86_ops pmc_amd_ops = {
+	.save_disable_all	= pmc_amd_save_disable_all,
+	.restore_all		= pmc_amd_restore_all,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= pmc_amd_event_map,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+};
+
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
@@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	return &pmc_intel_ops;
 }
 
+static struct pmc_x86_ops *pmc_amd_init(void)
+{
+	nr_counters_generic = 4;
+	nr_counters_fixed = 0;
+
+	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+
+	return &pmc_amd_ops;
+}
+
 void __init init_hw_perf_counters(void)
 {
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
 		break;
+	case X86_VENDOR_AMD:
+		pmc_ops = pmc_amd_init();
+		break;
 	}
 	if (!pmc_ops)
 		return;
-- 
cgit v0.10.2


From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:37:49 +0530
Subject: x86: decent declarations in perf_counter.c

Impact: cleanup

making decent declrations for struct pmc_x86_ops and
fix checkpatch error:
 ERROR: Macros with complex values should be enclosed in parenthesis

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 266618a..a1f3646 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -42,12 +42,12 @@ struct cpu_hw_counters {
  * struct pmc_x86_ops - performance counter x86 ops
  */
 struct pmc_x86_ops {
-	u64 (*save_disable_all)		(void);
-	void (*restore_all)		(u64 ctrl);
-	unsigned eventsel;
-	unsigned perfctr;
-	int (*event_map)		(int event);
-	int max_events;
+	u64		(*save_disable_all)(void);
+	void		(*restore_all)(u64 ctrl);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	int		(*event_map)(int event);
+	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
@@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
-#define PERFMON_MAX_INTERRUPTS 100000/HZ
+#define PERFMON_MAX_INTERRUPTS (100000/HZ)
 
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
-- 
cgit v0.10.2


From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:45:39 +0530
Subject: x86: use pr_info in perf_counter.c

Impact: cleanup

using pr_info in perf_counter.c fixes various 80 characters warnings and
also indenting for conditional statement

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a1f3646..3b65f19 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -454,18 +454,18 @@ void perf_counter_print_debug(void)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-	printk(KERN_INFO "\n");
-	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
-	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
-	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+		pr_info("\n");
+		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
@@ -473,17 +473,17 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
 	for (idx = 0; idx < nr_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
-		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
 	local_irq_enable();
@@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
-	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+	pr_info("Intel Performance Monitoring support detected.\n");
+	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... bit width:       %d\n", eax.split.bit_width);
+	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 
-	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+	pr_info("AMD Performance Monitoring support detected.\n");
 
 	return &pmc_amd_ops;
 }
@@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void)
 	if (!pmc_ops)
 		return;
 
-	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
+	pr_info("... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", counter_value_mask);
 
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
 
 	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
+	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
cgit v0.10.2


From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 4 Mar 2009 20:36:51 +1100
Subject: perfcounters: provide expansion room in the ABI

Impact: ABI change

This expands several fields in the perf_counter_hw_event struct and adds
a "flags" argument to the perf_counter_open system call, in order that
features can be added in future without ABI changes.

In particular the record_type field is expanded to 64 bits, and the
space for flag bits has been expanded from 32 to 64 bits.

This also adds some new fields:

* read_format (64 bits) is intended to provide a way to specify what
  userspace wants to get back when it does a read() on a simple
  (non-interrupting) counter;

* exclude_idle (1 bit) provides a way for userspace to ask that events
  that occur when the cpu is idle be excluded;

* extra_config_len will provide a way for userspace to supply an
  arbitrary amount of extra machine-specific PMU configuration data
  immediately following the perf_counter_hw_event struct, to allow
  sophisticated users to program things such as instruction matching
  CAMs and address range registers;

* __reserved_3 and __reserved_4 provide space for future expansion.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 186efaf..c42455a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -69,9 +69,10 @@ struct perf_counter_hw_event {
 	__s64			type;
 
 	__u64			irq_period;
-	__u32			record_type;
+	__u64			record_type;
+	__u64			read_format;
 
-	__u32			disabled       :  1, /* off by default        */
+	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
 				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
@@ -80,10 +81,15 @@ struct perf_counter_hw_event {
 				exclude_user   :  1, /* don't count user      */
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1 : 23;
+				__reserved_1 : 55;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
 
 	__u64			__reserved_2;
+	__u64			__reserved_3;
 };
 
 /*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 28ef2be..ab1d772 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 asmlinkage long sys_perf_counter_open(
 		const struct perf_counter_hw_event __user *hw_event_uptr,
-		pid_t pid, int cpu, int group_fd);
+		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 16b14ba..b2e8389 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 }
 
 /**
- * sys_perf_task_open - open a performance counter, associate it to a task/cpu
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
  * @hw_event_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-SYSCALL_DEFINE4(perf_counter_open,
+SYSCALL_DEFINE5(perf_counter_open,
 		const struct perf_counter_hw_event __user *, hw_event_uptr,
-		pid_t, pid, int, cpu, int, group_fd)
+		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
@@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open,
 	int fput_needed2 = 0;
 	int ret;
 
+	/* for future expandability... */
+	if (flags)
+		return -EINVAL;
+
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
-- 
cgit v0.10.2


From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 12:33:16 +0100
Subject: perfcounters: fix reserved bits sizing

The sum of bits is 65 currently not 64 - so reduce the # of reserved bits
from 55 to 54.

Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c42455a..dde5645 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -83,7 +83,7 @@ struct perf_counter_hw_event {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1 : 55;
+				__reserved_1   : 54;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
-- 
cgit v0.10.2


From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Mar 2009 18:08:27 +0100
Subject: perfcounters: IRQ and NMI support on AMD CPUs

The below completes the K7+ performance counter support:

 - IRQ support
 - NMI support

KernelTop output works now as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1236273633.5187.286.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3b65f19..6ebe9ab 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 static u64 counter_value_mask __read_mostly;
+static int counter_value_bits __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -35,7 +36,9 @@ struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			global_enable;
+	u64			throttle_ctrl;
+	u64			active_mask;
+	int			enabled;
 };
 
 /*
@@ -43,21 +46,28 @@ struct cpu_hw_counters {
  */
 struct pmc_x86_ops {
 	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64 ctrl);
+	void		(*restore_all)(u64);
+	u64		(*get_status)(u64);
+	void		(*ack_status)(u64);
+	void		(*enable)(int, u64);
+	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
-	int		(*event_map)(int event);
+	u64		(*event_map)(int);
+	u64		(*raw_event)(u64);
 	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
 
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+	.enabled = 1,
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
-static const int intel_perfmon_event_map[] =
+static const u64 intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static int pmc_intel_event_map(int event)
+static u64 pmc_intel_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
+static u64 pmc_intel_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+
+#define CORE_EVNTSEL_MASK 		\
+	(CORE_EVNTSEL_EVENT_MASK |	\
+	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_COUNTER_MASK)
+
+	return event & CORE_EVNTSEL_MASK;
+}
+
 /*
  * AMD Performance Monitor K7 and later.
  */
-static const int amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x0076,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static int pmc_amd_event_map(int event)
+static u64 pmc_amd_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
+static u64 pmc_amd_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_COUNTER_MASK)
+
+	return event & K7_EVNTSEL_MASK;
+}
+
 /*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
@@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (hw_event->raw) {
-		hwc->config |= hw_event->type;
+		hwc->config |= pmc_ops->raw_event(hw_event->type);
 	} else {
 		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
@@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void)
 
 static u64 pmc_amd_save_disable_all(void)
 {
-	int idx;
-	u64 val, ctrl = 0;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	int enabled, idx;
+
+	enabled = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
+		u64 val;
+
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-			ctrl |= (1 << idx);
-		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
 	}
 
-	return ctrl;
+	return enabled;
 }
 
 u64 hw_perf_save_disable(void)
@@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void)
 
 	return pmc_ops->save_disable_all();
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static void pmc_intel_restore_all(u64 ctrl)
@@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl)
 
 static void pmc_amd_restore_all(u64 ctrl)
 {
-	u64 val;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
+	cpuc->enabled = ctrl;
+	barrier();
+	if (!ctrl)
+		return;
+
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (ctrl & (1 << idx)) {
+		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+			u64 val;
+
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl)
 
 	pmc_ops->restore_all(ctrl);
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
+static u64 pmc_intel_get_status(u64 mask)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static u64 pmc_amd_get_status(u64 mask)
+{
+	u64 status = 0;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		s64 val;
+
+		if (!(mask & (1 << idx)))
+			continue;
+
+		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
+		val <<= (64 - counter_value_bits);
+		if (val >= 0)
+			status |= (1 << idx);
+	}
+
+	return status;
+}
+
+static u64 hw_perf_get_status(u64 mask)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->get_status(mask);
+}
+
+static void pmc_intel_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void pmc_amd_ack_status(u64 ack)
+{
+}
+
+static void hw_perf_ack_status(u64 ack)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->ack_status(ack);
+}
+
+static void pmc_intel_enable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
+			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static void pmc_amd_enable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	if (cpuc->enabled)
+		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+}
+
+static void hw_perf_enable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->enable(idx, config);
+}
+
+static void pmc_intel_disable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
+}
+
+static void pmc_amd_disable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+
+}
+
+static void hw_perf_disable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->disable(idx, config);
+}
+
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
 		    struct hw_perf_counter *hwc, unsigned int __idx)
@@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
 	else
-		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		hw_perf_disable(idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
 	else
-		wrmsr(hwc->config_base + idx,
-		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		hw_perf_enable(idx, hwc->config);
 }
 
 static int
@@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	int ret = 0;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-
-	/* Disable counters globally */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-	ack_APIC_irq();
+	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
+	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -618,12 +773,12 @@ again:
 		}
 	}
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+	hw_perf_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -631,32 +786,27 @@ out:
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
+
+	return ret;
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
-	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return;
-
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
 			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
 	}
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
-	if (unlikely(cpuc->global_enable && !global_enable))
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 	cpuc->interrupts = 0;
 }
 
@@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	ack_APIC_irq();
 	__smp_perf_counter_interrupt(regs, 0);
-
 	irq_exit();
 }
 
@@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
+	int ret;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
 
-	if (likely(cmd != DIE_NMI_IPI))
+	default:
 		return NOTIFY_DONE;
+	}
 
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	__smp_perf_counter_interrupt(regs, 1);
+	ret = __smp_perf_counter_interrupt(regs, 1);
 
-	return NOTIFY_STOP;
+	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
@@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct pmc_x86_ops pmc_intel_ops = {
 	.save_disable_all	= pmc_intel_save_disable_all,
 	.restore_all		= pmc_intel_restore_all,
+	.get_status		= pmc_intel_get_status,
+	.ack_status		= pmc_intel_ack_status,
+	.enable			= pmc_intel_enable,
+	.disable		= pmc_intel_disable,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= pmc_intel_event_map,
+	.raw_event		= pmc_intel_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
 static struct pmc_x86_ops pmc_amd_ops = {
 	.save_disable_all	= pmc_amd_save_disable_all,
 	.restore_all		= pmc_amd_restore_all,
+	.get_status		= pmc_amd_get_status,
+	.ack_status		= pmc_amd_ack_status,
+	.enable			= pmc_amd_enable,
+	.disable		= pmc_amd_disable,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= pmc_amd_event_map,
+	.raw_event		= pmc_amd_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
@@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
+	u64 old;
+	int bits;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
+	counter_value_mask = ~0ULL;
+
+	rdmsrl(MSR_K7_PERFCTR0, old);
+	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	/*
+	 * read the truncated mask
+	 */
+	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	wrmsrl(MSR_K7_PERFCTR0, old);
+
+	bits = 32 + fls(counter_value_mask >> 32);
+	if (bits == 32)
+		bits = fls((u32)counter_value_mask);
+	counter_value_bits = bits;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
cgit v0.10.2


From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2009 20:34:21 +0100
Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix

The BKGD suggests that counter width on AMD CPUs is 48 for all
existing models (it certainly is for mine).

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6ebe9ab..f585371 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-	counter_value_mask = ~0ULL;
-
-	rdmsrl(MSR_K7_PERFCTR0, old);
-	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	/*
-	 * read the truncated mask
-	 */
-	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	wrmsrl(MSR_K7_PERFCTR0, old);
-
-	bits = 32 + fls(counter_value_mask >> 32);
-	if (bits == 32)
-		bits = fls((u32)counter_value_mask);
-	counter_value_bits = bits;
+	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
+	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
cgit v0.10.2


From 86028598de16538f02519141756ccf4accfc29a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 5 Mar 2009 14:05:57 +1100
Subject: perfcounters/powerpc: fix oops with multiple counters in a group

Impact: fix oops-causing bug

This fixes a bug in the powerpc hw_perf_counter_init where the code
didn't initialize ctrs[n] before passing the ctrs array to check_excludes,
leading to possible oopses and other incorrect behaviour.  This fixes it
by initializing ctrs[n] correctly.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 112332d..4fec112 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -633,6 +633,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 			return NULL;
 	}
 	events[n] = ev;
+	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
 		return NULL;
 	if (power_check_constraints(events, n + 1))
-- 
cgit v0.10.2


From aabbaa6036fd847c583f585c6bae82b5a033e6c7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:27:10 +1100
Subject: perfcounters/powerpc: add support for POWER5+ processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER5+ processors (i.e. GS,
including GS DD3 aka POWER5++).  This doesn't use the fixed-function
PMC5 and PMC6 since they don't respect the freeze conditions and don't
generate interrupts, as on POWER6.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b4c6f46..49851e0 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,8 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
-				   power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4fec112..162f398 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -826,6 +826,7 @@ void hw_perf_counter_setup(int cpu)
 
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -848,6 +849,9 @@ static int init_perf_counters(void)
 	case PV_POWER5:
 		ppmu = &power5_pmu;
 		break;
+	case PV_POWER5p:
+		ppmu = &power5p_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 0000000..cec21ea
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,452 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     33: UC3 error 0x02_0000_0000
+ *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ *     31: ISU0 events needed 0x01_8000_0000
+ *     30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ *     20-23: Byte 0 event source 0x00f0_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *
+ * P4
+ *     7: P1 error 0x80
+ *     6-7: Count of events needing PMC4
+ *
+ * P1..P3
+ *     0-6: Count of events needing PMC1..PMC3
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
+	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
+	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
+	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
+	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
+	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (20 - 4 * byte);
+		value |= (u64)unit << (20 - 4 * byte);
+	}
+	mask  |= 0x8000000000000ull;
+	value |= 0x1000000000000ull;
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */
+	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
+	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
+	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.  This also handles
+ * alternative PCMSEL values for add events.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+
+	/* new decode alternatives for power5+ */
+	if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+		return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+	if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+		return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+	/* alternative add event encodings */
+	if (pp == 0x10 || pp == 0x28)
+		return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+			(altpmc << PM_PMC_SH);
+
+	return -1;
+}
+
+static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+				unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 4)
+		return -1;
+
+	/* First pass to count resource use */
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 4)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (isbus && (byte & 2) &&
+			    (psel == 8 || psel == 0x10 || psel == 0x28))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+			/* select alternate byte lane */
+			psel |= 0x10;
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5p_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000000000055ull,
+	.test_adder = 0x3000040000000ull,
+	.compute_mmcr = power5p_compute_mmcr,
+	.get_constraint = power5p_get_constraint,
+	.get_alternatives = power5p_get_alternatives,
+	.disable_pmc = power5p_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5p_generic_events),
+	.generic_events = power5p_generic_events,
+};
-- 
cgit v0.10.2


From 880860e392d92c457e8116cdee39ec4d109174ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:30:52 +1100
Subject: perfcounters/powerpc: add support for POWER4 processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER4 and POWER4+ processors
(GP and GQ).  This is quite similar to the PPC970, with 8 PMCs, but has
fewer events than the PPC970.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 49851e0..8e5e2c7 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
 				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 162f398..0e33d27 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
 
+extern struct power_pmu power4_pmu;
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
@@ -841,6 +842,10 @@ static int init_perf_counters(void)
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
+	case PV_POWER4:
+	case PV_POWER4p:
+		ppmu = &power4_pmu;
+		break;
 	case PV_970:
 	case PV_970FX:
 	case PV_970MP:
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 0000000..1407b19
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_LOWER_SH	6
+#define PM_LOWER_MSK	1
+#define PM_LOWER_MSKS	0x40
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU		1
+#define PM_ISU1		2
+#define PM_IFU		3
+#define PM_IDU0		4
+#define PM_ISU1_ALT	6
+#define PM_ISU2		7
+#define PM_IFU_ALT	8
+#define PM_LSU0		9
+#define PM_LSU1		0xc
+#define PM_GPS		0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTC0SEL_SH	61
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTC1SEL_SH	58
+#define MMCR1_TTM2SEL_SH	56
+#define MMCR1_TTC2SEL_SH	55
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTC3SEL_SH	52
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_DEBUG0SEL_SH	43
+#define MMCR1_DEBUG1SEL_SH	42
+#define MMCR1_DEBUG2SEL_SH	41
+#define MMCR1_DEBUG3SEL_SH	40
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2	/* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH	17	/* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ * 	  \SMPL	        ||\TTC3SEL
+ * 		        |\TTC_IFU_SEL
+ * 		        \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ *     55: UC1 error 0x0080_0000_0000_0000
+ *     54: FPU events needed 0x0040_0000_0000_0000
+ *     53: ISU1 events needed 0x0020_0000_0000_0000
+ *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ *     51: UC2 error 0x0008_0000_0000_0000
+ *     50: FPU events needed 0x0004_0000_0000_0000
+ *     49: IFU events needed 0x0002_0000_0000_0000
+ *     48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ *     47: UC3 error 0x8000_0000_0000
+ *     46: LSU0 events needed 0x4000_0000_0000
+ *     45: IFU events needed 0x2000_0000_0000
+ *     44: IDU0|ISU2 events needed 0x1000_0000_0000
+ *     43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ *     42: 0 = IDU0 events needed
+ *     	   1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ *     41: 0 = IFU.U events needed
+ *     	   1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ *     40: 0 = LSU1.U events needed
+ *     	   1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *     	   1 = FPU
+ * 	   2 = ISU1
+ * 	   3 = IFU
+ * 	   4 = IDU0
+ * 	   7 = ISU2
+ * 	   9 = LSU0
+ * 	   c = LSU1
+ * 	   f = GPS
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ *     15: P8 error 0x8000
+ *     14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ *     0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2).  However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+	u64	value, mask;
+	int	unit;
+	int	lowerbit;
+} p4_unitinfo[16] = {
+	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_ISU1_ALT] =
+		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IFU_ALT] =
+		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3),			/* PMC3: PM_MRK_ST_CMPL_INT */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4),			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 6)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_LSU1:
+		if (event & PM_LOWER_MSKS)
+			mask = 1 << 28;		/* byte 7 bit 4 */
+		else
+			mask = 6 << 24;		/* byte 3 bits 1 and 2 */
+		break;
+	case PM_LSU0:
+		/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+		mask = 0x083dff00;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, lower, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	if (unit) {
+		lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+
+		if (!p4_unitinfo[unit].unit)
+			return -1;
+		mask  |= p4_unitinfo[unit].mask;
+		value |= p4_unitinfo[unit].value;
+		sh = p4_unitinfo[unit].lowerbit;
+		if (sh > 1)
+			value |= (u64)lower << sh;
+		else if (lower != sh)
+			return -1;
+		unit = p4_unitinfo[unit].unit;
+
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+
+	/* Marked instruction events need sample_enable set */
+	if (p4_marked_instr_event(event)) {
+		mask  |= 1ull << 56;
+		value |= 1ull << 56;
+	}
+
+	/* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+	if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+		mask  |= 1ull << 56;
+
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static unsigned int ppc_inst_cmpl[] = {
+	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, na;
+
+	alt[0] = event;
+	na = 1;
+
+	/* 2 possibilities for PM_GRP_DISP_REJECT */
+	if (event == 0x8003 || event == 0x0224) {
+		alt[1] = event ^ (0x8003 ^ 0x0224);
+		return 2;
+	}
+
+	/* 2 possibilities for PM_ST_MISS_L1 */
+	if (event == 0x0c13 || event == 0x0c23) {
+		alt[1] = event ^ (0x0c13 ^ 0x0c23);
+		return 2;
+	}
+
+	/* several possibilities for PM_INST_CMPL */
+	for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+		if (event == ppc_inst_cmpl[i]) {
+			for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+				if (j != i)
+					alt[na++] = ppc_inst_cmpl[j];
+			break;
+		}
+	}
+
+	return na;
+}
+
+static int p4_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel, lower;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned int unitlower = 0;
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+		if (unit) {
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (unit == 6 || unit == 8)
+				/* map alt ISU1/IFU codes: 6->2, 8->3 */
+				unit = (unit >> 1) - 1;
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			lower <<= unit;
+			if (unituse[unit] && lower != (unitlower & lower))
+				return -1;
+			unituse[unit] = 1;
+			unitlower |= lower;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
+	 * Each TTMx can only select one unit, but since
+	 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+	 * we have some choices.
+	 */
+	if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+		unituse[6] = 1;		/* Move 2 to 6 */
+		unituse[2] = 0;
+	}
+	if (unituse[3] & (unituse[1] | unituse[2])) {
+		unituse[8] = 1;		/* Move 3 to 8 */
+		unituse[3] = 0;
+		unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+	}
+	/* Check only one unit per TTMx */
+	if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+	    unituse[4] + unituse[6] + unituse[7] > 1 ||
+	    unituse[8] + unituse[9] > 1 ||
+	    (unituse[5] | unituse[10] | unituse[11] |
+	     unituse[13] | unituse[14]))
+		return -1;
+
+	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
+	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+	/* Set TTCxSEL fields. */
+	if (unitlower & 0xe)
+		mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+	if (unitlower & 0xf0)
+		mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+	if (unitlower & 0xf00)
+		mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+	if (unitlower & 0x7000)
+		mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+	/* Set byte lane select fields. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == 0xf) {
+			/* special case for GPS */
+			mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+		} else {
+			if (!unituse[unit])
+				ttm = unit - 1;		/* 2->1, 3->2 */
+			else
+				ttm = unit >> 2;
+			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+		}
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or 00xxx direct event (off or cycles) */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+			else if (psel == 6 && byte == 3)
+				/* seem to need to set sample_enable here */
+				mmcra |= MMCRA_SAMPLE_ENABLE;
+			psel |= 8;
+		}
+		if (pmc <= 1)
+			mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+		else
+			mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+		if (pmc == 7)	/* PMC8 */
+			mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+		hwc[i] = pmc;
+		if (p4_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+	}
+
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/*
+	 * Setting the PMCxSEL field to 0 disables PMC x.
+	 * (Note that pmc is 0-based here, not 1-based.)
+	 */
+	if (pmc <= 1) {
+		mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+	} else {
+		mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+		if (pmc == 7)
+			mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+	}
+}
+
+static int p4_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+};
+
+struct power_pmu power4_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 5,
+	.add_fields = 0x0000001100005555ull,
+	.test_adder = 0x0011083300000000ull,
+	.compute_mmcr = p4_compute_mmcr,
+	.get_constraint = p4_get_constraint,
+	.get_alternatives = p4_get_alternatives,
+	.disable_pmc = p4_disable_pmc,
+	.n_generic = ARRAY_SIZE(p4_generic_events),
+	.generic_events = p4_generic_events,
+};
-- 
cgit v0.10.2


From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 8 Mar 2009 11:34:19 +0100
Subject: x86: perf_counter cleanup

Use and actual unsigned long bitmap instead of casting our way around.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
LKML-Reference: <1236508459.22914.3645.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f585371..1df4210 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -37,7 +37,7 @@ struct cpu_hw_counters {
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	u64			active_mask;
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+		if (test_bit(idx, cpuc->active_mask)) {
 			u64 val;
 
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	set_bit(idx, cpuc->active_mask);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	clear_bit(idx, cpuc->active_mask);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
cgit v0.10.2


From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 8 Mar 2009 17:09:49 +0530
Subject: x86: perf_counter cleanup

Remove unused variables and duplicate header file.

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1df4210..155bc3c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -17,7 +17,6 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
@@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
-	u64 old;
-	int bits;
-
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-- 
cgit v0.10.2


From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:42:18 +0100
Subject: x86, bts: detect size of DS fields

Impact: more robust DS feature enumeration

Detect the size of the pointer-type fields in the DS area
configuration via the DTES64 features rather than based on
the cpuid.

Rename a variable to denote that size to reflect that it only
covers the pointer-type fields.

Add more boot-time diagnostics giving the detected size and
the sizes of BTS and PEBS records.

Use the size of the BTS/PEBS record to indicate that the
respective feature is not available (if the record size is zero).

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3..6e5ec67 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -39,7 +39,7 @@ struct ds_configuration {
 	/* the size of one pointer-typed field in the DS structure and
 	   in the BTS and PEBS buffers in bytes;
 	   this covers the first 8 DS fields related to buffer management. */
-	unsigned char  sizeof_field;
+	unsigned char  sizeof_ptr_field;
 	/* the size of a BTS/PEBS record in bytes */
 	unsigned char  sizeof_rec[2];
 	/* a series of bit-masks to control various features indexed
@@ -142,14 +142,14 @@ enum ds_qualifier {
 static inline unsigned long ds_get(const unsigned char *base,
 				   enum ds_qualifier qual, enum ds_field field)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	return *(unsigned long *)base;
 }
 
 static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 			  enum ds_field field, unsigned long value)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	(*(unsigned long *)base) = value;
 }
 
@@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  * Later architectures use 64bit pointers throughout, whereas earlier
  * architectures use 32bit pointers in 32bit mode.
  *
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
  * - the field size stored in the DS configuration
  * - the relative field position
  *
@@ -441,13 +441,13 @@ enum bts_field {
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
 {
-	base += (ds_cfg.sizeof_field * field);
+	base += (ds_cfg.sizeof_ptr_field * field);
 	return *(unsigned long *)base;
 }
 
 static inline void bts_set(char *base, enum bts_field field, unsigned long val)
 {
-	base += (ds_cfg.sizeof_field * field);;
+	base += (ds_cfg.sizeof_ptr_field * field);;
 	(*(unsigned long *)base) = val;
 }
 
@@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	struct ds_context *context;
 	int error;
 
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.sizeof_rec[qual])
+		goto out;
+
 	error = -EINVAL;
 	if (!base)
 		goto out;
@@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.ctl[dsf_bts])
-		goto out;
-
 	/* buffer overflow notification is not yet implemented */
 	error = -EOPNOTSUPP;
 	if (ovfl)
@@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 
 	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 	tracer->trace.reset_value =
-		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+		*(u64 *)(tracer->ds.context->ds +
+			 (ds_cfg.sizeof_ptr_field * 8));
 
 	return &tracer->trace;
 }
@@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 	if (!tracer)
 		return -EINVAL;
 
-	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+	*(u64 *)(tracer->ds.context->ds +
+		 (ds_cfg.sizeof_ptr_field * 8)) = value;
 
 	return 0;
 }
@@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = {
 	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
 	.ctl[dsf_bts_kernel]	= (1 << 5),
 	.ctl[dsf_bts_user]	= (1 << 6),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
 };
 static const struct ds_configuration ds_cfg_pentium_m = {
 	.name = "Pentium M",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
 };
 static const struct ds_configuration ds_cfg_core2_atom = {
 	.name = "Core 2/Atom",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
 	.ctl[dsf_bts_kernel]	= (1 << 9),
 	.ctl[dsf_bts_user]	= (1 << 10),
-
-	.sizeof_field		= 8,
-	.sizeof_rec[ds_bts]	= 8 * 3,
-	.sizeof_rec[ds_pebs]	= 8 * 18,
 };
 
 static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+	     struct cpuinfo_x86 *cpu)
 {
+	unsigned long nr_pebs_fields = 0;
+
+	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+	nr_pebs_fields = 10;
+#else
+	nr_pebs_fields = 18;
+#endif
+
 	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
-	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+	ds_cfg.sizeof_ptr_field =
+		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
 
-	if (!cpu_has_bts) {
-		ds_cfg.ctl[dsf_bts] = 0;
+	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
+	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
+
+	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+		ds_cfg.sizeof_rec[ds_bts] = 0;
 		printk(KERN_INFO "[ds] bts not available\n");
 	}
-	if (!cpu_has_pebs)
+	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+		ds_cfg.sizeof_rec[ds_pebs] = 0;
 		printk(KERN_INFO "[ds] pebs not available\n");
+	}
+
+	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+	       8 * ds_cfg.sizeof_ptr_field);
+	printk("bts/pebs record: %u/%u bytes\n",
+	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
 	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
 }
@@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		switch (c->x86_model) {
 		case 0x9:
 		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m);
+			ds_configure(&ds_cfg_pentium_m, c);
 			break;
 		case 0xf:
 		case 0x17: /* Core2 */
 		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom);
+			ds_configure(&ds_cfg_core2_atom, c);
 			break;
 		case 0x1a: /* i7 */
 		default:
@@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst);
+			ds_configure(&ds_cfg_netburst, c);
 			break;
 		default:
 			/* sorry, don't know about them */
-- 
cgit v0.10.2


From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:45:07 +0100
Subject: x86, bts: add selftest for BTS

Perform a selftest of branch trace store when a cpu is initialized.

WARN and disable branch trace store support if the selftest fails.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fdb45df..dfd74ab 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -175,6 +175,15 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
+config X86_DS_SELFTEST
+    bool "DS selftest"
+    default y
+    depends on DEBUG_KERNEL
+    depends on X86_DS
+	---help---
+	  Perform Debug Store selftests at boot time.
+	  If in doubt, say "N".
+
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 339ce35..a0c9e13 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
 obj-$(CONFIG_X86_DS)		+= ds.o
+obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 6e5ec67..51c936c 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -29,6 +29,7 @@
 #include <linux/mm.h>
 #include <linux/kernel.h>
 
+#include "ds_selftest.h"
 
 /*
  * The configuration for a particular DS hardware implementation.
@@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg,
 		printk(KERN_INFO "[ds] pebs not available\n");
 	}
 
+	if (ds_cfg.sizeof_rec[ds_bts]) {
+		int error;
+
+		error = ds_selftest_bts();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling bts.\n");
+			ds_cfg.sizeof_rec[ds_bts] = 0;
+		}
+	}
+
+	if (ds_cfg.sizeof_rec[ds_pebs]) {
+		int error;
+
+		error = ds_selftest_pebs();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling pebs.\n");
+			ds_cfg.sizeof_rec[ds_pebs] = 0;
+		}
+	}
+
 	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
 	       8 * ds_cfg.sizeof_ptr_field);
 	printk("bts/pebs record: %u/%u bytes\n",
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 0000000..8c46fbf
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,241 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/ds.h>
+
+
+#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+	int error = 0;
+
+	if (!trace) {
+		printk(KERN_CONT "failed to access trace...");
+		/* Bail out. Other tests are pointless. */
+		return -1;
+	}
+
+	if (!trace->read) {
+		printk(KERN_CONT "bts read not available...");
+		error = -1;
+	}
+
+	/* Do some sanity checks on the trace configuration. */
+	if (!trace->ds.n) {
+		printk(KERN_CONT "empty bts buffer...");
+		error = -1;
+	}
+	if (!trace->ds.size) {
+		printk(KERN_CONT "bad bts trace setup...");
+		error = -1;
+	}
+	if (trace->ds.end !=
+	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+		printk(KERN_CONT "bad bts buffer setup...");
+		error = -1;
+	}
+	if ((trace->ds.top < trace->ds.begin) ||
+	    (trace->ds.end <= trace->ds.top)) {
+		printk(KERN_CONT "bts top out of bounds...");
+		error = -1;
+	}
+
+	return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+				const struct bts_trace *trace,
+				const void *from, const void *to)
+{
+	const unsigned char *at;
+
+	/*
+	 * Check a few things which do not belong to this test.
+	 * They should be covered by other tests.
+	 */
+	if (!trace)
+		return -1;
+
+	if (!trace->read)
+		return -1;
+
+	if (to < from)
+		return -1;
+
+	if (from < trace->ds.begin)
+		return -1;
+
+	if (trace->ds.end < to)
+		return -1;
+
+	if (!trace->ds.size)
+		return -1;
+
+	/* Now to the test itself. */
+	for (at = from; (void *)at < to; at += trace->ds.size) {
+		struct bts_struct bts;
+		size_t index;
+		int error;
+
+		if (((void *)at - trace->ds.begin) % trace->ds.size) {
+			printk(KERN_CONT
+			       "read from non-integer index...");
+			return -1;
+		}
+		index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+		memset(&bts, 0, sizeof(bts));
+		error = trace->read(tracer, at, &bts);
+		if (error < 0) {
+			printk(KERN_CONT
+			       "error reading bts trace at [%lu] (0x%p)...",
+			       index, at);
+			return error;
+		}
+
+		switch (bts.qualifier) {
+		case BTS_BRANCH:
+			break;
+		default:
+			printk(KERN_CONT
+			       "unexpected bts entry %llu at [%lu] (0x%p)...",
+			       bts.qualifier, index, at);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int ds_selftest_bts(void)
+{
+	const struct bts_trace *trace;
+	struct bts_tracer *tracer;
+	int error = 0;
+	void *top;
+	unsigned char buffer[DS_SELFTEST_BUFFER_SIZE];
+
+	printk(KERN_INFO "[ds] bts selftest...");
+
+	tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE,
+				NULL, (size_t)-1, BTS_KERNEL);
+	if (IS_ERR(tracer)) {
+		error = PTR_ERR(tracer);
+		tracer = NULL;
+
+		printk(KERN_CONT
+		       "initialization failed (err: %d)...", error);
+		goto out;
+	}
+
+	/* The return should already give us enough trace. */
+	ds_suspend_bts(tracer);
+
+	/* Let's see if we can access the trace. */
+	trace = ds_read_bts(tracer);
+
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	/* If everything went well, we should have a few trace entries. */
+	if (trace->ds.top == trace->ds.begin) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning, but continue.
+		 */
+		printk(KERN_CONT "no trace/overflow...");
+	}
+
+	/* Let's try to read the trace we collected. */
+	error = ds_selftest_bts_read(tracer, trace,
+				     trace->ds.begin, trace->ds.top);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * Let's read the trace again.
+	 * Since we suspended tracing, we should get the same result.
+	 */
+	top = trace->ds.top;
+
+	trace = ds_read_bts(tracer);
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	if (top != trace->ds.top) {
+		printk(KERN_CONT "suspend not working...");
+		error = -1;
+		goto out;
+	}
+
+	/* Let's collect some more trace - see if resume is working. */
+	ds_resume_bts(tracer);
+	ds_suspend_bts(tracer);
+
+	trace = ds_read_bts(tracer);
+
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	if (trace->ds.top == top) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning and check the full trace.
+		 */
+		printk(KERN_CONT
+		       "no resume progress/overflow...");
+
+		error = ds_selftest_bts_read(tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else if (trace->ds.top < top) {
+		/*
+		 * We had a buffer overflow - the entire buffer should
+		 * contain trace records.
+		 */
+		error = ds_selftest_bts_read(tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else {
+		/*
+		 * It is quite likely that the buffer did not overflow.
+		 * Let's just check the delta trace.
+		 */
+		error = ds_selftest_bts_read(tracer, trace,
+					     top, trace->ds.top);
+	}
+	if (error < 0)
+		goto out;
+
+	error = 0;
+
+	/* The final test: release the tracer while tracing is suspended. */
+ out:
+	ds_release_bts(tracer);
+
+	printk(KERN_CONT "%s.\n", (error ? "failed" : "passed"));
+
+	return error;
+}
+
+int ds_selftest_pebs(void)
+{
+	return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 0000000..0e6e19d
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif /* CONFIG_X86_DS_SELFTEST */
-- 
cgit v0.10.2


From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:46:42 +0100
Subject: x86, bts: correct comment style in ds.c

Correct the comment style in ds.c.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 51c936c..d9cab71 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -35,25 +35,22 @@
  * The configuration for a particular DS hardware implementation.
  */
 struct ds_configuration {
-	/* the name of the configuration */
+	/* The name of the configuration. */
 	const char *name;
-	/* the size of one pointer-typed field in the DS structure and
-	   in the BTS and PEBS buffers in bytes;
-	   this covers the first 8 DS fields related to buffer management. */
+	/* The size of pointer-typed fields in DS, BTS, and PEBS. */
 	unsigned char  sizeof_ptr_field;
-	/* the size of a BTS/PEBS record in bytes */
+	/* The size of a BTS/PEBS record in bytes. */
 	unsigned char  sizeof_rec[2];
-	/* a series of bit-masks to control various features indexed
-	 * by enum ds_feature */
+	/* Control bit-masks indexed by enum ds_feature. */
 	unsigned long ctl[dsf_ctl_max];
 };
 static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
 
 #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
-#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+#define MAX_SIZEOF_DS (12 * 8)	/* Maximal size of a DS configuration. */
+#define MAX_SIZEOF_BTS (3 * 8)	/* Maximal size of a BTS record. */
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment. */
 
 #define BTS_CONTROL \
  (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
@@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  * to identify tracers.
  */
 struct ds_tracer {
-	/* the DS context (partially) owned by this tracer */
+	/* The DS context (partially) owned by this tracer. */
 	struct ds_context *context;
-	/* the buffer provided on ds_request() and its size in bytes */
+	/* The buffer provided on ds_request() and its size in bytes. */
 	void *buffer;
 	size_t size;
 };
 
 struct bts_tracer {
-	/* the common DS part */
+	/* The common DS part. */
 	struct ds_tracer ds;
-	/* the trace including the DS configuration */
+	/* The trace including the DS configuration. */
 	struct bts_trace trace;
-	/* buffer overflow notification function */
+	/* Buffer overflow notification function. */
 	bts_ovfl_callback_t ovfl;
 };
 
 struct pebs_tracer {
-	/* the common DS part */
+	/* The common DS part. */
 	struct ds_tracer ds;
-	/* the trace including the DS configuration */
+	/* The trace including the DS configuration. */
 	struct pebs_trace trace;
-	/* buffer overflow notification function */
+	/* Buffer overflow notification function. */
 	pebs_ovfl_callback_t ovfl;
 };
 
@@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+	/* The DS configuration; goes into MSR_IA32_DS_AREA. */
 	unsigned char ds[MAX_SIZEOF_DS];
-	/* the owner of the BTS and PEBS configuration, respectively */
+	/* The owner of the BTS and PEBS configuration, respectively. */
 	struct bts_tracer *bts_master;
 	struct pebs_tracer *pebs_master;
-	/* use count */
+	/* Use count. */
 	unsigned long count;
-	/* a pointer to the context location inside the thread_struct
-	 * or the per_cpu context array */
+	/* Pointer to the context pointer field. */
 	struct ds_context **this;
-	/* a pointer to the task owning this context, or NULL, if the
-	 * context is owned by a cpu */
+	/* The traced task; NULL for current cpu. */
 	struct task_struct *task;
 };
 
@@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		unsigned long write_size, adj_write_size;
 
 		/*
-		 * write as much as possible without producing an
+		 * Write as much as possible without producing an
 		 * overflow interrupt.
 		 *
-		 * interrupt_threshold must either be
+		 * Interrupt_threshold must either be
 		 * - bigger than absolute_maximum or
 		 * - point to a record between buffer_base and absolute_maximum
 		 *
-		 * index points to a valid record.
+		 * Index points to a valid record.
 		 */
 		base   = ds_get(context->ds, qual, ds_buffer_base);
 		index  = ds_get(context->ds, qual, ds_index);
@@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 
 		write_end = min(end, int_th);
 
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
+		/*
+		 * If we are already beyond the interrupt threshold,
+		 * we fill the entire buffer.
+		 */
 		if (write_end <= index)
 			write_end = end;
 
@@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
 
-		/* zero out trailing bytes */
+		/* Zero out trailing bytes. */
 		memset((char *)index + write_size, 0,
 		       adj_write_size - write_size);
 		index += adj_write_size;
@@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 			     unsigned int flags) {
 	unsigned long buffer, adj;
 
-	/* adjust the buffer address and size to meet alignment
+	/*
+	 * Adjust the buffer address and size to meet alignment
 	 * constraints:
 	 * - buffer is double-word aligned
 	 * - size is multiple of record size
@@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 	trace->begin = (void *)buffer;
 	trace->top = trace->begin;
 	trace->end = (void *)(buffer + size);
-	/* The value for 'no threshold' is -1, which will set the
+	/*
+	 * The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
 	trace->ith = (void *)(buffer + size - ith);
@@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* we require some space to do alignment adjustments below */
+	/* We require some space to do alignment adjustments below. */
 	error = -EINVAL;
 	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
 		goto out;
@@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	/* buffer overflow notification is not yet implemented */
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
 	if (ovfl)
 		goto out;
@@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	/* buffer overflow notification is not yet implemented */
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
 	if (ovfl)
 		goto out;
@@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x1c: /* Atom */
 			ds_configure(&ds_cfg_core2_atom, c);
 			break;
-		case 0x1a: /* i7 */
+		case 0x1a: /* Core i7 */
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
@@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 			ds_configure(&ds_cfg_netburst, c);
 			break;
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
 	default:
-		/* sorry, don't know about them */
+		/* Sorry, don't know about them. */
 		break;
 	}
 }
-- 
cgit v0.10.2


From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:48:52 +0100
Subject: x86, hw-branch-tracer: keep resources on stop

Distinguish init/reset and start/stop:

init/reset will allocate and release bts tracing resources
stop/start will suspend and resume bts tracing

Return an error on init() if no cpu can be traced.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c..a99a04c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -19,7 +19,7 @@
 #include "trace_output.h"
 
 
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
 
 /*
  * The tracer lock protects the below per-cpu tracer array.
@@ -33,53 +33,68 @@
  */
 static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
 /*
- * Start tracing on the current cpu.
+ * Initialize the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(void *arg)
 {
 	if (this_tracer)
 		ds_release_bts(this_tracer);
 
-	this_tracer =
-		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-			       /* ovfl = */ NULL, /* th = */ (size_t)-1,
-			       BTS_KERNEL);
+	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
+				     NULL, (size_t)-1, BTS_KERNEL);
 	if (IS_ERR(this_tracer)) {
 		this_tracer = NULL;
 		return;
 	}
 }
 
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
+	int cpu, avail;
+
 	spin_lock(&bts_tracer_lock);
 
-	on_each_cpu(bts_trace_start_cpu, NULL, 1);
-	trace_hw_branches_enabled = 1;
+	hw_branch_trace = tr;
+
+	on_each_cpu(bts_trace_init_cpu, NULL, 1);
+
+	/* Check on how many cpus we could enable tracing */
+	avail = 0;
+	for_each_online_cpu(cpu)
+		if (per_cpu(tracer, cpu))
+			avail++;
+
+	trace_hw_branches_enabled = (avail ? 1 : 0);
+	trace_hw_branches_suspended = 0;
 
 	spin_unlock(&bts_tracer_lock);
+
+
+	/* If we could not enable tracing on a single cpu, we fail. */
+	return avail ? 0 : -EOPNOTSUPP;
 }
 
 /*
- * Stop tracing on the current cpu.
+ * Release the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_release_cpu(void *arg)
 {
 	if (this_tracer) {
 		ds_release_bts(this_tracer);
@@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg)
 	}
 }
 
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_reset(struct trace_array *tr)
 {
 	spin_lock(&bts_tracer_lock);
 
+	on_each_cpu(bts_trace_release_cpu, NULL, 1);
 	trace_hw_branches_enabled = 0;
-	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Resume tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_resume_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_resume_bts(this_tracer);
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Suspend tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_suspend_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_suspend_bts(this_tracer);
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	trace_hw_branches_suspended = 1;
 
 	spin_unlock(&bts_tracer_lock);
 }
@@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
+
+		if (trace_hw_branches_suspended)
+			smp_call_function_single(cpu, bts_trace_suspend_cpu,
+						 NULL, 1);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
 		break;
 	}
 
@@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
-{
-	hw_branch_trace = tr;
-
-	bts_trace_start(tr);
-
-	return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	bts_trace_stop(tr);
-}
-
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m, "# CPU#        TO  <-  FROM\n");
@@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
  */
 static void trace_bts_cpu(void *arg)
 {
-	struct trace_array *tr = (struct trace_array *) arg;
+	struct trace_array *tr = (struct trace_array *)arg;
 	const struct bts_trace *trace;
 	unsigned char *at;
 
@@ -276,7 +326,8 @@ void trace_hw_branch_oops(void)
 {
 	spin_lock(&bts_tracer_lock);
 
-	trace_bts_cpu(hw_branch_trace);
+	if (trace_hw_branches_enabled)
+		trace_bts_cpu(hw_branch_trace);
 
 	spin_unlock(&bts_tracer_lock);
 }
-- 
cgit v0.10.2


From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:50:27 +0100
Subject: x86, hw-branch-tracer: add selftest

Add a selftest for the hw-branch-tracer.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 56ce34d..e7fbc82 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+					      struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index a99a04c..4ca8270 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly =
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
 	.open		= trace_bts_prepare,
-	.close		= trace_bts_close
+	.close		= trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
 };
 
 __init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f907a2b..3c7b797 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_HW_BRANCHES:
 		return 1;
 	}
 	return 0;
@@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+				   struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	struct trace_iterator iter;
+	struct tracer tracer;
+
+	if (!trace->open) {
+		printk(KERN_CONT "missing open function...");
+		return -1;
+	}
+
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/*
+	 * The hw-branch tracer needs to collect the trace from the various
+	 * cpu trace buffers - before tracing is stopped.
+	 */
+	memset(&iter, 0, sizeof(iter));
+	memcpy(&tracer, trace, sizeof(tracer));
+
+	iter.trace = &tracer;
+	iter.tr = tr;
+	iter.pos = -1;
+	mutex_init(&iter.mutex);
+
+	trace->open(&iter);
+
+	mutex_destroy(&iter.mutex);
+
+	tracing_stop();
+
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	if (!ret && !count) {
+		printk(KERN_CONT "no entries found..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
-- 
cgit v0.10.2


From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:54:40 +0100
Subject: x86, bts: cleanups

Impact: cleanup, no code changed

Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index d9cab71..7363e01 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,43 +19,52 @@
  * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/kernel.h>
+
+#include <asm/ds.h>
 
 #include "ds_selftest.h"
 
 /*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
  */
 struct ds_configuration {
-	/* The name of the configuration. */
-	const char *name;
-	/* The size of pointer-typed fields in DS, BTS, and PEBS. */
-	unsigned char  sizeof_ptr_field;
-	/* The size of a BTS/PEBS record in bytes. */
-	unsigned char  sizeof_rec[2];
-	/* Control bit-masks indexed by enum ds_feature. */
-	unsigned long ctl[dsf_ctl_max];
+	/* The name of the configuration: */
+	const char		*name;
+
+	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
+	unsigned char		sizeof_ptr_field;
+
+	/* The size of a BTS/PEBS record in bytes: */
+	unsigned char		sizeof_rec[2];
+
+	/* Control bit-masks indexed by enum ds_feature: */
+	unsigned long		ctl[dsf_ctl_max];
 };
 static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
 
 #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
-#define MAX_SIZEOF_DS (12 * 8)	/* Maximal size of a DS configuration. */
-#define MAX_SIZEOF_BTS (3 * 8)	/* Maximal size of a BTS record. */
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment. */
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS		(12 * 8)
 
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
-  ds_cfg.ctl[dsf_bts_overflow])
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS		(3 * 8)
 
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT		(1 << 3)
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL				  \
+	( ds_cfg.ctl[dsf_bts]			| \
+	  ds_cfg.ctl[dsf_bts_kernel]		| \
+	  ds_cfg.ctl[dsf_bts_user]		| \
+	  ds_cfg.ctl[dsf_bts_overflow] )
 
 /*
  * A BTS or PEBS tracer.
@@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  */
 struct ds_tracer {
 	/* The DS context (partially) owned by this tracer. */
-	struct ds_context *context;
+	struct ds_context	*context;
 	/* The buffer provided on ds_request() and its size in bytes. */
-	void *buffer;
-	size_t size;
+	void			*buffer;
+	size_t			size;
 };
 
 struct bts_tracer {
-	/* The common DS part. */
-	struct ds_tracer ds;
-	/* The trace including the DS configuration. */
-	struct bts_trace trace;
-	/* Buffer overflow notification function. */
-	bts_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct bts_trace	trace;
+
+	/* Buffer overflow notification function: */
+	bts_ovfl_callback_t	ovfl;
 };
 
 struct pebs_tracer {
-	/* The common DS part. */
-	struct ds_tracer ds;
-	/* The trace including the DS configuration. */
-	struct pebs_trace trace;
-	/* Buffer overflow notification function. */
-	pebs_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct pebs_trace	trace;
+
+	/* Buffer overflow notification function: */
+	pebs_ovfl_callback_t	ovfl;
 };
 
 /*
@@ -95,6 +108,7 @@ struct pebs_tracer {
  *
  * The DS configuration consists of the following fields; different
  * architetures vary in the size of those fields.
+ *
  * - double-word aligned base linear address of the BTS buffer
  * - write pointer into the BTS buffer
  * - end linear address of the BTS buffer (one byte beyond the end of
@@ -133,19 +147,20 @@ enum ds_field {
 };
 
 enum ds_qualifier {
-	ds_bts  = 0,
+	ds_bts = 0,
 	ds_pebs
 };
 
-static inline unsigned long ds_get(const unsigned char *base,
-				   enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
 {
 	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	return *(unsigned long *)base;
 }
 
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
-			  enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+       unsigned long value)
 {
 	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	(*(unsigned long *)base) = value;
@@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
  */
 static DEFINE_SPINLOCK(ds_lock);
 
-
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
  * We distinguish the two based on the task_struct pointer, where a
@@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-	/* The DS configuration; goes into MSR_IA32_DS_AREA. */
-	unsigned char ds[MAX_SIZEOF_DS];
-	/* The owner of the BTS and PEBS configuration, respectively. */
-	struct bts_tracer *bts_master;
-	struct pebs_tracer *pebs_master;
-	/* Use count. */
+	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
+	unsigned char		ds[MAX_SIZEOF_DS];
+
+	/* The owner of the BTS and PEBS configuration, respectively: */
+	struct bts_tracer	*bts_master;
+	struct pebs_tracer	*pebs_master;
+
+	/* Use count: */
 	unsigned long count;
-	/* Pointer to the context pointer field. */
-	struct ds_context **this;
-	/* The traced task; NULL for current cpu. */
-	struct task_struct *task;
+
+	/* Pointer to the context pointer field: */
+	struct ds_context	**this;
+
+	/* The traced task; NULL for current cpu: */
+	struct task_struct	*task;
 };
 
 static DEFINE_PER_CPU(struct ds_context *, system_context_array);
@@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
  * The remainder of any partially written record is zeroed out.
  *
  * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
  */
 static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		    const void *record, size_t size)
@@ -429,12 +447,12 @@ enum bts_field {
 	bts_to,
 	bts_flags,
 
-	bts_qual = bts_from,
-	bts_jiffies = bts_to,
-	bts_pid = bts_flags,
+	bts_qual		= bts_from,
+	bts_jiffies		= bts_to,
+	bts_pid			= bts_flags,
 
-	bts_qual_mask = (bts_qual_max - 1),
-	bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+	bts_qual_mask		= (bts_qual_max - 1),
+	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
 };
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
@@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
  *
  * return: bytes read/written on success; -Eerrno, otherwise
  */
-static int bts_read(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 {
 	if (!tracer)
 		return -EINVAL;
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
index 0e6e19d..2ba8745 100644
--- a/arch/x86/kernel/ds_selftest.h
+++ b/arch/x86/kernel/ds_selftest.h
@@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void);
 #else
 static inline int ds_selftest_bts(void) { return 0; }
 static inline int ds_selftest_pebs(void) { return 0; }
-#endif /* CONFIG_X86_DS_SELFTEST */
+#endif
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 4ca8270..8b2109a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,5 +1,5 @@
 /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
@@ -15,8 +15,8 @@
 
 #include <asm/ds.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 
 #define BTS_BUFFER_SIZE (1 << 13)
@@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m)
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 {
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
 	trace_assign_type(it, entry);
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 3c7b797..b910912 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 #else
 # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
 #endif /* CONFIG_DYNAMIC_FTRACE */
+
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -698,10 +699,10 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	unsigned long count;
-	int ret;
 	struct trace_iterator iter;
 	struct tracer tracer;
+	unsigned long count;
+	int ret;
 
 	if (!trace->open) {
 		printk(KERN_CONT "missing open function...");
-- 
cgit v0.10.2


From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 12:02:08 +0100
Subject: x86, bts: detect size of DS fields, fix

Impact: build fix

One usage site was missed in the sizeof_field -> sizeof_ptr_field
rename.

Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 7363e01..5fd5333 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg,
 	printk("bts/pebs record: %u/%u bytes\n",
 	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field));
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-- 
cgit v0.10.2


From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Wed, 18 Mar 2009 19:27:00 +0100
Subject: x86, bts: use atomic memory allocation

Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL.

Hw-branch-tracer calls ds_request_bts() within on_each_cpu().

Use atomic memory allocation to allow it to be used in that context.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 5fd5333..b1d6e1f 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 	struct ds_context *new_context = NULL;
 	unsigned long irq;
 
-	/* Chances are small that we already have a context. */
-	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
+	/*
+	 * Chances are small that we already have a context.
+	 *
+	 * Contexts for per-cpu tracing are allocated using
+	 * smp_call_function(). We must not sleep.
+	 */
+	new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC);
 	if (!new_context)
 		return NULL;
 
@@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	/*
+	 * Per-cpu tracing is typically requested using smp_call_function().
+	 * We must not sleep.
+	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
 		goto out;
 	tracer->ovfl = ovfl;
@@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	/*
+	 * Per-cpu tracing is typically requested using smp_call_function().
+	 * We must not sleep.
+	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
 		goto out;
 	tracer->ovfl = ovfl;
-- 
cgit v0.10.2


From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 13:38:36 -0400
Subject: tracing: add handler to trace_stat

Currently, if a trace_stat user wants a handle to some private data,
the trace_stat infrastructure does not supply a way to do that.

This patch passes the trace_stat structure to the start function of
the trace_stat code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ad8c22e..e6e3291 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_annotated_branch_profile;
 }
@@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m)
 	return 0;
 }
 
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_branch_profile;
 }
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f71b85b..f8f48d8 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 	if (!ts->stat_cmp)
 		ts->stat_cmp = dummy_cmp;
 
-	stat = ts->stat_start();
+	stat = ts->stat_start(ts);
 	if (!stat)
 		goto exit;
 
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274c..f3546a2 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
 	/* The name of your stat file */
 	const char		*name;
 	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
+	void			*(*stat_start)(struct tracer_stat *trace);
 	void			*(*stat_next)(void *prev, int idx);
 	/* Compare two entries for stats sorting */
 	int			(*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 9ab035b..ee533c2 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 	return ret;
 }
 
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
 {
 	int cpu;
 	void *ret = NULL;
-- 
cgit v0.10.2


From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 20 Mar 2009 12:50:56 -0400
Subject: tracing: add function profiler

Impact: new profiling feature

This patch adds a function profiler. In debugfs/tracing/ two new
files are created.

  function_profile_enabled  - to enable or disable profiling

  trace_stat/functions   - the profiled functions.

For example:

  echo 1 > /debugfs/tracing/function_profile_enabled
  ./hackbench 50
  echo 0 > /debugfs/tracing/function_profile_enabled

yields:

  cat /debugfs/tracing/trace_stat/functions

  Function                               Hit
  --------                               ---
  _spin_lock                        10106442
  _spin_unlock                      10097492
  kfree                              6013704
  _spin_unlock_irqrestore            4423941
  _spin_lock_irqsave                 4406825
  __phys_addr                        4181686
  __slab_free                        4038222
  dput                               4030130
  path_put                           4023387
  unroll_tree_refs                   4019532
[...]

The most hit functions are listed first. Functions that are not
hit are not listed.

This feature depends on and uses dynamic function tracing. When the
function profiling is disabled, no overhead occurs. But it still
takes up around 300KB to hold the data, thus it is not recomended
to keep it enabled for systems low on memory.

When a '1' is echoed into the function_profile_enabled file, the
counters for is function is reset back to zero. Thus you can see what
functions are hit most by different programs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 015a3d2..0456c3a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -153,6 +153,10 @@ struct dyn_ftrace {
 		unsigned long		flags;
 		struct dyn_ftrace	*newlist;
 	};
+#ifdef CONFIG_FUNCTION_PROFILER
+	unsigned long			counter;
+	struct hlist_node		node;
+#endif
 	struct dyn_arch_ftrace		arch;
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8a4d729..95e9ad5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
@@ -376,6 +377,24 @@ config DYNAMIC_FTRACE
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 and modifies the code to jump over the call to ftrace.
 
+config FUNCTION_PROFILER
+	bool "Kernel function profiler"
+	depends on DYNAMIC_FTRACE
+	default n
+	help
+	 This option enables the kernel function profiler. When the dynamic
+	 function tracing is enabled, a counter is added into the function
+	 records used by the dynamic function tracer. A file is created in
+	 debugfs called function_profile_enabled which defaults to zero.
+	 When a 1 is echoed into this file profiling begins, and when a
+	 zero is entered, profiling stops. A file in the trace_stats
+	 directory called functions, that show the list of functions that
+	 have been hit and their counters.
+
+	 This takes up around 320K more memory.
+
+	 If in doubt, say N
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b8722b..11f364c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -34,6 +34,7 @@
 #include <asm/ftrace.h>
 
 #include "trace.h"
+#include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
 	do {					\
@@ -261,7 +262,6 @@ struct ftrace_func_probe {
 	struct rcu_head		rcu;
 };
 
-
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records;
 		}				\
 	}
 
+#ifdef CONFIG_FUNCTION_PROFILER
+static struct hlist_head *ftrace_profile_hash;
+static int ftrace_profile_bits;
+static int ftrace_profile_enabled;
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static void *
+function_stat_next(void *v, int idx)
+{
+	struct dyn_ftrace *rec = v;
+	struct ftrace_page *pg;
+
+	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+	rec++;
+	if ((void *)rec >= (void *)&pg->records[pg->index]) {
+		pg = pg->next;
+		if (!pg)
+			return NULL;
+		rec = &pg->records[0];
+	}
+
+	if (rec->flags & FTRACE_FL_FREE ||
+	    rec->flags & FTRACE_FL_FAILED ||
+	    !(rec->flags & FTRACE_FL_CONVERTED) ||
+	    /* ignore non hit functions */
+	    !rec->counter)
+		goto again;
+
+	return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+	return function_stat_next(&ftrace_pages_start->records[0], 0);
+}
+
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct dyn_ftrace *a = p1;
+	struct dyn_ftrace *b = p2;
+
+	if (a->counter < b->counter)
+		return -1;
+	if (a->counter > b->counter)
+		return 1;
+	else
+		return 0;
+}
+
+static int function_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "  Function                               Hit\n"
+		      "  --------                               ---\n");
+	return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
+	return 0;
+}
+
+static struct tracer_stat function_stats = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
+static void ftrace_profile_init(int nr_funcs)
+{
+	unsigned long addr;
+	int order;
+	int size;
+
+	/*
+	 * We are profiling all functions, lets make it 1/4th of the
+	 * number of functions that are in core kernel. So we have to
+	 * iterate 4 times.
+	 */
+	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
+	order = get_order(order);
+	size = 1 << (PAGE_SHIFT + order);
+
+	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+
+	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!addr) {
+		pr_warning("Could not allocate function profiler hash\n");
+		return;
+	}
+
+	ftrace_profile_hash = (void *)addr;
+
+	/*
+	 * struct hlist_head should be a pointer of 4 or 8 bytes.
+	 * And a simple bit manipulation can be done, but if for
+	 * some reason struct hlist_head is not a mulitple of 2,
+	 * then we play it safe, and simply count. This function
+	 * is done once at boot up, so it is not that critical in
+	 * performance.
+	 */
+
+	size--;
+	size /= sizeof(struct hlist_head);
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	pr_info("Function profiler has %d hash buckets\n",
+		1 << ftrace_profile_bits);
+
+	return;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void ftrace_profile_reset(void)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+
+	do_for_each_ftrace_rec(pg, rec) {
+		rec->counter = 0;
+	} while_for_each_ftrace_rec();
+}
+
+static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long flags;
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return NULL;
+
+	key = hash_long(ip, ftrace_profile_bits);
+	hhd = &ftrace_profile_hash[key];
+
+	if (hlist_empty(hhd))
+		return NULL;
+
+	local_irq_save(flags);
+	hlist_for_each_entry_rcu(rec, n, hhd, node) {
+		if (rec->ip == ip)
+			goto out;
+	}
+	rec = NULL;
+ out:
+	local_irq_restore(flags);
+
+	return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct dyn_ftrace *rec;
+	unsigned long flags;
+
+	if (!ftrace_profile_enabled)
+		return;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(ip);
+	if (!rec)
+		goto out;
+
+	rec->counter++;
+ out:
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+	.func = function_profile_call,
+};
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	if (!ftrace_profile_hash) {
+		pr_info("Can not enable hash due to earlier problems\n");
+		return -ENODEV;
+	}
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&ftrace_profile_lock);
+	if (ftrace_profile_enabled ^ val) {
+		if (val) {
+			ftrace_profile_reset();
+			register_ftrace_function(&ftrace_profile_ops);
+			ftrace_profile_enabled = 1;
+		} else {
+			ftrace_profile_enabled = 0;
+			unregister_ftrace_function(&ftrace_profile_ops);
+		}
+	}
+	mutex_unlock(&ftrace_profile_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static const struct file_operations ftrace_profile_fops = {
+	.open		= tracing_open_generic,
+	.read		= ftrace_profile_read,
+	.write		= ftrace_profile_write,
+};
+
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+	struct dentry *entry;
+	int ret;
+
+	ret = register_stat_tracer(&function_stats);
+	if (ret) {
+		pr_warning("Warning: could not register "
+			   "function stats\n");
+		return;
+	}
+
+	entry = debugfs_create_file("function_profile_enabled", 0644,
+				    d_tracer, NULL, &ftrace_profile_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'function_profile_enabled' entry\n");
+}
+
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+	mutex_lock(&ftrace_profile_lock);
+	hlist_del(&rec->node);
+	mutex_unlock(&ftrace_profile_lock);
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static void ftrace_profile_init(int nr_funcs)
+{
+}
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+}
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE))
+		    !(rec->flags & FTRACE_FL_FREE)) {
 			ftrace_free_rec(rec);
+			ftrace_profile_release(rec);
+		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
+	ftrace_add_profile(rec);
+
 	return rec;
 }
 
@@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 
@@ -2225,6 +2532,8 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
+	ftrace_profile_init(count);
+
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
-- 
cgit v0.10.2


From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 17:12:36 -0400
Subject: tracing: move function profiler data out of function struct

Impact: reduce size of memory in function profiler

The function profiler originally introduces its counters into the
function records itself. There is 20 thousand different functions on
a normal system, and that is adding 20 thousand counters for profiling
event when not needed.

A normal run of the profiler yields only a couple of thousand functions
executed, depending on what is being profiled. This means we have around
18 thousand useless counters.

This patch rectifies this by moving the data out of the function
records used by dynamic ftrace. Data is preallocated to hold the functions
when the profiling begins. Checks are made during profiling to see if
more recorcds should be allocated, and they are allocated if it is safe
to do so.

This also removes the dependency from using dynamic ftrace, and also
removes the overhead by having it enabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0456c3a..015a3d2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -153,10 +153,6 @@ struct dyn_ftrace {
 		unsigned long		flags;
 		struct dyn_ftrace	*newlist;
 	};
-#ifdef CONFIG_FUNCTION_PROFILER
-	unsigned long			counter;
-	struct hlist_node		node;
-#endif
 	struct dyn_arch_ftrace		arch;
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 95e9ad5..8a41360 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -379,20 +379,16 @@ config DYNAMIC_FTRACE
 
 config FUNCTION_PROFILER
 	bool "Kernel function profiler"
-	depends on DYNAMIC_FTRACE
+	depends on FUNCTION_TRACER
 	default n
 	help
-	 This option enables the kernel function profiler. When the dynamic
-	 function tracing is enabled, a counter is added into the function
-	 records used by the dynamic function tracer. A file is created in
-	 debugfs called function_profile_enabled which defaults to zero.
+	 This option enables the kernel function profiler. A file is created
+	 in debugfs called function_profile_enabled which defaults to zero.
 	 When a 1 is echoed into this file profiling begins, and when a
 	 zero is entered, profiling stops. A file in the trace_stats
 	 directory called functions, that show the list of functions that
 	 have been hit and their counters.
 
-	 This takes up around 320K more memory.
-
 	 If in doubt, say N
 
 config FTRACE_MCOUNT_RECORD
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 11f364c..24dac44 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void)
 #endif
 }
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_probe {
-	struct hlist_node	node;
-	struct ftrace_probe_ops	*ops;
-	unsigned long		flags;
-	unsigned long		ip;
-	void			*data;
-	struct rcu_head		rcu;
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+	struct hlist_node		node;
+	unsigned long			ip;
+	unsigned long			counter;
 };
 
-enum {
-	FTRACE_ENABLE_CALLS		= (1 << 0),
-	FTRACE_DISABLE_CALLS		= (1 << 1),
-	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_ENABLE_MCOUNT		= (1 << 3),
-	FTRACE_DISABLE_MCOUNT		= (1 << 4),
-	FTRACE_START_FUNC_RET		= (1 << 5),
-	FTRACE_STOP_FUNC_RET		= (1 << 6),
+struct ftrace_profile_page {
+	struct ftrace_profile_page	*next;
+	unsigned long			index;
+	struct ftrace_profile		records[];
 };
 
-static int ftrace_filtered;
+#define PROFILE_RECORDS_SIZE						\
+	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
-static struct dyn_ftrace *ftrace_new_addrs;
+#define PROFILES_PER_PAGE					\
+	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static DEFINE_MUTEX(ftrace_regex_lock);
-
-struct ftrace_page {
-	struct ftrace_page	*next;
-	int			index;
-	struct dyn_ftrace	records[];
-};
+/* TODO: make these percpu, to prevent cache line bouncing */
+static struct ftrace_profile_page *profile_pages_start;
+static struct ftrace_profile_page *profile_pages;
 
-#define ENTRIES_PER_PAGE \
-  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
-
-/* estimate from running different kernels */
-#define NR_TO_INIT		10000
-
-static struct ftrace_page	*ftrace_pages_start;
-static struct ftrace_page	*ftrace_pages;
-
-static struct dyn_ftrace *ftrace_free_records;
-
-/*
- * This is a double for. Do not use 'break' to break out of the loop,
- * you must use a goto.
- */
-#define do_for_each_ftrace_rec(pg, rec)					\
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
-		int _____i;						\
-		for (_____i = 0; _____i < pg->index; _____i++) {	\
-			rec = &pg->records[_____i];
-
-#define while_for_each_ftrace_rec()		\
-		}				\
-	}
-
-#ifdef CONFIG_FUNCTION_PROFILER
 static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
+static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static raw_spinlock_t ftrace_profile_rec_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
 static void *
 function_stat_next(void *v, int idx)
 {
-	struct dyn_ftrace *rec = v;
-	struct ftrace_page *pg;
+	struct ftrace_profile *rec = v;
+	struct ftrace_profile_page *pg;
 
-	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
 
  again:
 	rec++;
@@ -330,27 +291,22 @@ function_stat_next(void *v, int idx)
 		if (!pg)
 			return NULL;
 		rec = &pg->records[0];
+		if (!rec->counter)
+			goto again;
 	}
 
-	if (rec->flags & FTRACE_FL_FREE ||
-	    rec->flags & FTRACE_FL_FAILED ||
-	    !(rec->flags & FTRACE_FL_CONVERTED) ||
-	    /* ignore non hit functions */
-	    !rec->counter)
-		goto again;
-
 	return rec;
 }
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&ftrace_pages_start->records[0], 0);
+	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
 static int function_stat_cmp(void *p1, void *p2)
 {
-	struct dyn_ftrace *a = p1;
-	struct dyn_ftrace *b = p2;
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
 
 	if (a->counter < b->counter)
 		return -1;
@@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m)
 
 static int function_stat_show(struct seq_file *m, void *v)
 {
-	struct dyn_ftrace *rec = v;
+	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -387,115 +343,191 @@ static struct tracer_stat function_stats = {
 	.stat_show = function_stat_show
 };
 
-static void ftrace_profile_init(int nr_funcs)
+static void ftrace_profile_reset(void)
 {
-	unsigned long addr;
-	int order;
-	int size;
+	struct ftrace_profile_page *pg;
 
-	/*
-	 * We are profiling all functions, lets make it 1/4th of the
-	 * number of functions that are in core kernel. So we have to
-	 * iterate 4 times.
-	 */
-	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
-	order = get_order(order);
-	size = 1 << (PAGE_SHIFT + order);
-
-	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+	pg = profile_pages = profile_pages_start;
 
-	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!addr) {
-		pr_warning("Could not allocate function profiler hash\n");
-		return;
+	while (pg) {
+		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+		pg->index = 0;
+		pg = pg->next;
 	}
 
-	ftrace_profile_hash = (void *)addr;
+	memset(ftrace_profile_hash, 0,
+	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
 
-	/*
-	 * struct hlist_head should be a pointer of 4 or 8 bytes.
-	 * And a simple bit manipulation can be done, but if for
-	 * some reason struct hlist_head is not a mulitple of 2,
-	 * then we play it safe, and simply count. This function
-	 * is done once at boot up, so it is not that critical in
-	 * performance.
-	 */
+int ftrace_profile_pages_init(void)
+{
+	struct ftrace_profile_page *pg;
+	int i;
 
-	size--;
-	size /= sizeof(struct hlist_head);
+	/* If we already allocated, do nothing */
+	if (profile_pages)
+		return 0;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!profile_pages)
+		return -ENOMEM;
 
-	pr_info("Function profiler has %d hash buckets\n",
-		1 << ftrace_profile_bits);
+	pg = profile_pages_start = profile_pages;
 
-	return;
+	/* allocate 10 more pages to start */
+	for (i = 0; i < 10; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+		/*
+		 * We only care about allocating profile_pages, if
+		 * we failed to allocate here, hopefully we will allocate
+		 * later.
+		 */
+		if (!pg->next)
+			break;
+		pg = pg->next;
+	}
+
+	return 0;
 }
 
-static ssize_t
-ftrace_profile_read(struct file *filp, char __user *ubuf,
-		     size_t cnt, loff_t *ppos)
+static int ftrace_profile_init(void)
 {
-	char buf[64];
-	int r;
+	int size;
 
-	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
+	if (ftrace_profile_hash) {
+		/* If the profile is already created, simply reset it */
+		ftrace_profile_reset();
+		return 0;
+	}
 
-static void ftrace_profile_reset(void)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
+	/*
+	 * We are profiling all functions, but usually only a few thousand
+	 * functions are hit. We'll make a hash of 1024 items.
+	 */
+	size = FTRACE_PROFILE_HASH_SIZE;
 
-	do_for_each_ftrace_rec(pg, rec) {
-		rec->counter = 0;
-	} while_for_each_ftrace_rec();
+	ftrace_profile_hash =
+		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+	if (!ftrace_profile_hash)
+		return -ENOMEM;
+
+	size--;
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	/* Preallocate a few pages */
+	if (ftrace_profile_pages_init() < 0) {
+		kfree(ftrace_profile_hash);
+		ftrace_profile_hash = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
-static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+/* interrupts must be disabled */
+static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
 	struct hlist_node *n;
-	unsigned long flags;
 	unsigned long key;
 
-	if (!ftrace_profile_hash)
-		return NULL;
-
 	key = hash_long(ip, ftrace_profile_bits);
 	hhd = &ftrace_profile_hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
 
-	local_irq_save(flags);
 	hlist_for_each_entry_rcu(rec, n, hhd, node) {
 		if (rec->ip == ip)
-			goto out;
+			return rec;
+	}
+
+	return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile *rec)
+{
+	unsigned long key;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+/* Interrupts must be disabled calling this */
+static struct ftrace_profile *
+ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+{
+	struct ftrace_profile *rec = NULL;
+
+	/* prevent recursion */
+	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+		goto out;
+
+	__raw_spin_lock(&ftrace_profile_rec_lock);
+
+	/* Try to always keep another page available */
+	if (!profile_pages->next && alloc_safe)
+		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+
+	/*
+	 * Try to find the function again since another
+	 * task on another CPU could have added it
+	 */
+	rec = ftrace_find_profiled_func(ip);
+	if (rec)
+		goto out_unlock;
+
+	if (profile_pages->index == PROFILES_PER_PAGE) {
+		if (!profile_pages->next)
+			goto out_unlock;
+		profile_pages = profile_pages->next;
 	}
-	rec = NULL;
+
+	rec = &profile_pages->records[profile_pages->index++];
+	rec->ip = ip;
+	ftrace_add_profile(rec);
+
+ out_unlock:
+	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	local_irq_restore(flags);
+	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
 
 	return rec;
 }
 
+/*
+ * If we are not in an interrupt, or softirq and
+ * and interrupts are disabled and preemption is not enabled
+ * (not in a spinlock) then it should be safe to allocate memory.
+ */
+static bool ftrace_safe_to_allocate(void)
+{
+	return !in_interrupt() && irqs_disabled() && !preempt_count();
+}
+
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	unsigned long flags;
+	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
+	alloc_safe = ftrace_safe_to_allocate();
+
 	local_irq_save(flags);
 	rec = ftrace_find_profiled_func(ip);
-	if (!rec)
-		goto out;
+	if (!rec) {
+		rec = ftrace_profile_alloc(ip, alloc_safe);
+		if (!rec)
+			goto out;
+	}
 
 	rec->counter++;
  out:
@@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	char buf[64];
 	int ret;
 
-	if (!ftrace_profile_hash) {
-		pr_info("Can not enable hash due to earlier problems\n");
-		return -ENODEV;
-	}
-
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
@@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	mutex_lock(&ftrace_profile_lock);
 	if (ftrace_profile_enabled ^ val) {
 		if (val) {
-			ftrace_profile_reset();
+			ret = ftrace_profile_init();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
+
 			register_ftrace_function(&ftrace_profile_ops);
 			ftrace_profile_enabled = 1;
 		} else {
@@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			unregister_ftrace_function(&ftrace_profile_ops);
 		}
 	}
+ out:
 	mutex_unlock(&ftrace_profile_lock);
 
 	filp->f_pos += cnt;
@@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static const struct file_operations ftrace_profile_fops = {
 	.open		= tracing_open_generic,
 	.read		= ftrace_profile_read,
@@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
 			   "'function_profile_enabled' entry\n");
 }
 
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-	unsigned long key;
-
-	if (!ftrace_profile_hash)
-		return;
-
-	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
-}
-
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-	mutex_lock(&ftrace_profile_lock);
-	hlist_del(&rec->node);
-	mutex_unlock(&ftrace_profile_lock);
-}
-
 #else /* CONFIG_FUNCTION_PROFILER */
-static void ftrace_profile_init(int nr_funcs)
-{
-}
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-}
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
 }
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-}
 #endif /* CONFIG_FUNCTION_PROFILER */
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
+#endif
+
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_probe {
+	struct hlist_node	node;
+	struct ftrace_probe_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+	FTRACE_START_FUNC_RET		= (1 << 5),
+	FTRACE_STOP_FUNC_RET		= (1 << 6),
+};
+
+static int ftrace_filtered;
+
+static struct dyn_ftrace *ftrace_new_addrs;
+
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec)					\
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
+		int _____i;						\
+		for (_____i = 0; _____i < pg->index; _____i++) {	\
+			rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec()		\
+		}				\
+	}
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE)) {
+		    !(rec->flags & FTRACE_FL_FREE))
 			ftrace_free_rec(rec);
-			ftrace_profile_release(rec);
-		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
-	ftrace_add_profile(rec);
-
 	return rec;
 }
 
@@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-	ftrace_profile_debugfs(d_tracer);
-
 	return 0;
 }
 
@@ -2532,8 +2611,6 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
-	ftrace_profile_init(count);
-
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
@@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_pid' entry\n");
+
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 fs_initcall(ftrace_init_debugfs);
-- 
cgit v0.10.2


From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 23:12:58 -0400
Subject: tracing: adding function timings to function profiler

If the function graph trace is enabled, the function profiler will
use it to take the timing of the functions.

 cat /debug/tracing/trace_stat/functions

  Function                               Hit    Time
  --------                               ---    ----
  mwait_idle                             127    183028.4 us
  schedule                                26    151997.7 us
  __schedule                              31    151975.1 us
  sys_wait4                                2    74080.53 us
  do_wait                                  2    74077.80 us
  sys_newlstat                           138    39929.16 us
  do_path_lookup                         179    39845.79 us
  vfs_lstat_fd                           138    39761.97 us
  user_path_at                           153    39469.58 us
  path_walk                              179    39435.76 us
  __link_path_walk                       189    39143.73 us
[...]

Note the times are skewed due to the function graph tracer not taking
into account schedules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 24dac44..a9ccd71 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,7 +33,7 @@
 
 #include <asm/ftrace.h>
 
-#include "trace.h"
+#include "trace_output.h"
 #include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
@@ -246,6 +246,9 @@ struct ftrace_profile {
 	struct hlist_node		node;
 	unsigned long			ip;
 	unsigned long			counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	unsigned long long		time;
+#endif
 };
 
 struct ftrace_profile_page {
@@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace)
 	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
+
+	if (a->time < b->time)
+		return -1;
+	if (a->time > b->time)
+		return 1;
+	else
+		return 0;
+}
+#else
+/* not function graph compares against hits */
 static int function_stat_cmp(void *p1, void *p2)
 {
 	struct ftrace_profile *a = p1;
@@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2)
 	else
 		return 0;
 }
+#endif
 
 static int function_stat_headers(struct seq_file *m)
 {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "  Function                               Hit    Time\n"
+		      "  --------                               ---    ----\n");
+#else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
+#endif
 	return 0;
 }
 
@@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v)
 {
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	static struct trace_seq s;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+#endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "    ");
+	trace_print_seq(m, &s);
+	mutex_unlock(&mutex);
+#endif
+	seq_putc(m, '\n');
 
-	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
 	return 0;
 }
 
@@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+	function_profile_call(trace->func, 0);
+	return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+	unsigned long flags;
+	struct ftrace_profile *rec;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(trace->func);
+	if (rec)
+		rec->time += trace->rettime - trace->calltime;
+	local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_graph(&profile_graph_return,
+				     &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_graph();
+}
+#else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
 	.func = function_profile_call,
 };
 
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static ssize_t
 ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 				goto out;
 			}
 
-			register_ftrace_function(&ftrace_profile_ops);
+			ret = register_ftrace_profiler();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
-			unregister_ftrace_function(&ftrace_profile_ops);
+			unregister_ftrace_profiler();
 		}
 	}
  out:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 67c6a21..821bf49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	return cnt;
 }
 
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-	s->buffer[len] = 0;
-	seq_puts(m, s->buffer);
-
-	trace_seq_init(s);
-}
-
 /**
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * @tr: tracer
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d7410bb..c66ca3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -605,6 +605,8 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
@@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
 	return 1;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
-
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e..85bba0f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 	return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
 	/* log10(ULONG_MAX) + '\0' */
@@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_print_graph_duration(duration, s);
+	if (ret != TRACE_TYPE_HANDLED)
+		return ret;
 
 	ret = trace_seq_printf(s, "|  ");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
 
+	return TRACE_TYPE_HANDLED;
 }
 
 /* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19261fd..a3b6e3f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_init(s);
+}
+
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 35c422f..1eac297 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
-- 
cgit v0.10.2


From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 20:50:39 -0400
Subject: tracing: make the function profiler per cpu

Impact: speed enhancement

By making the function profiler record in per cpu data we not only
get better readings, avoid races, we also do not have to take any
locks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a9ccd71..ed1fc50 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -257,28 +257,28 @@ struct ftrace_profile_page {
 	struct ftrace_profile		records[];
 };
 
+struct ftrace_profile_stat {
+	atomic_t			disabled;
+	struct hlist_head		*hash;
+	struct ftrace_profile_page	*pages;
+	struct ftrace_profile_page	*start;
+	struct tracer_stat		stat;
+};
+
 #define PROFILE_RECORDS_SIZE						\
 	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-/* TODO: make these percpu, to prevent cache line bouncing */
-static struct ftrace_profile_page *profile_pages_start;
-static struct ftrace_profile_page *profile_pages;
-
-static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
-static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
 
 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
 
-static raw_spinlock_t ftrace_profile_rec_lock =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
 static void *
 function_stat_next(void *v, int idx)
 {
@@ -303,7 +303,13 @@ function_stat_next(void *v, int idx)
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&profile_pages_start->records[0], 0);
+	struct ftrace_profile_stat *stat =
+		container_of(trace, struct ftrace_profile_stat, stat);
+
+	if (!stat || !stat->start)
+		return NULL;
+
+	return function_stat_next(&stat->start->records[0], 0);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct tracer_stat function_stats = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
-};
-
-static void ftrace_profile_reset(void)
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 
-	pg = profile_pages = profile_pages_start;
+	pg = stat->pages = stat->start;
 
 	while (pg) {
 		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
@@ -395,24 +392,24 @@ static void ftrace_profile_reset(void)
 		pg = pg->next;
 	}
 
-	memset(ftrace_profile_hash, 0,
+	memset(stat->hash, 0,
 	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
 }
 
-int ftrace_profile_pages_init(void)
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 	int i;
 
 	/* If we already allocated, do nothing */
-	if (profile_pages)
+	if (stat->pages)
 		return 0;
 
-	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!profile_pages)
+	stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!stat->pages)
 		return -ENOMEM;
 
-	pg = profile_pages_start = profile_pages;
+	pg = stat->start = stat->pages;
 
 	/* allocate 10 more pages to start */
 	for (i = 0; i < 10; i++) {
@@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void)
 	return 0;
 }
 
-static int ftrace_profile_init(void)
+static int ftrace_profile_init_cpu(int cpu)
 {
+	struct ftrace_profile_stat *stat;
 	int size;
 
-	if (ftrace_profile_hash) {
+	stat = &per_cpu(ftrace_profile_stats, cpu);
+
+	if (stat->hash) {
 		/* If the profile is already created, simply reset it */
-		ftrace_profile_reset();
+		ftrace_profile_reset(stat);
 		return 0;
 	}
 
@@ -446,29 +446,45 @@ static int ftrace_profile_init(void)
 	 */
 	size = FTRACE_PROFILE_HASH_SIZE;
 
-	ftrace_profile_hash =
-		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+	stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
 
-	if (!ftrace_profile_hash)
+	if (!stat->hash)
 		return -ENOMEM;
 
-	size--;
+	if (!ftrace_profile_bits) {
+		size--;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+		for (; size; size >>= 1)
+			ftrace_profile_bits++;
+	}
 
 	/* Preallocate a few pages */
-	if (ftrace_profile_pages_init() < 0) {
-		kfree(ftrace_profile_hash);
-		ftrace_profile_hash = NULL;
+	if (ftrace_profile_pages_init(stat) < 0) {
+		kfree(stat->hash);
+		stat->hash = NULL;
 		return -ENOMEM;
 	}
 
 	return 0;
 }
 
+static int ftrace_profile_init(void)
+{
+	int cpu;
+	int ret = 0;
+
+	for_each_online_cpu(cpu) {
+		ret = ftrace_profile_init_cpu(cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 /* interrupts must be disabled */
-static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
@@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	unsigned long key;
 
 	key = hash_long(ip, ftrace_profile_bits);
-	hhd = &ftrace_profile_hash[key];
+	hhd = &stat->hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
@@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	return NULL;
 }
 
-static void ftrace_add_profile(struct ftrace_profile *rec)
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+			       struct ftrace_profile *rec)
 {
 	unsigned long key;
 
 	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
 /* Interrupts must be disabled calling this */
 static struct ftrace_profile *
-ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat,
+		     unsigned long ip, bool alloc_safe)
 {
 	struct ftrace_profile *rec = NULL;
 
 	/* prevent recursion */
-	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	__raw_spin_lock(&ftrace_profile_rec_lock);
-
 	/* Try to always keep another page available */
-	if (!profile_pages->next && alloc_safe)
-		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+	if (!stat->pages->next && alloc_safe)
+		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
 
 	/*
 	 * Try to find the function again since another
 	 * task on another CPU could have added it
 	 */
-	rec = ftrace_find_profiled_func(ip);
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
-		goto out_unlock;
+		goto out;
 
-	if (profile_pages->index == PROFILES_PER_PAGE) {
-		if (!profile_pages->next)
-			goto out_unlock;
-		profile_pages = profile_pages->next;
+	if (stat->pages->index == PROFILES_PER_PAGE) {
+		if (!stat->pages->next)
+			goto out;
+		stat->pages = stat->pages->next;
 	}
 
-	rec = &profile_pages->records[profile_pages->index++];
+	rec = &stat->pages->records[stat->pages->index++];
 	rec->ip = ip;
-	ftrace_add_profile(rec);
+	ftrace_add_profile(stat, rec);
 
- out_unlock:
-	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
+	atomic_dec(&stat->disabled);
 
 	return rec;
 }
@@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void)
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	bool alloc_safe;
@@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	alloc_safe = ftrace_safe_to_allocate();
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(ip);
+
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
 		if (!rec)
 			goto out;
 	}
@@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
-	unsigned long flags;
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
+	unsigned long flags;
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(trace->func);
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
 		rec->time += trace->rettime - trace->calltime;
+ out:
 	local_irq_restore(flags);
 }
 
@@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = {
 	.write		= ftrace_profile_write,
 };
 
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
+	struct ftrace_profile_stat *stat;
 	struct dentry *entry;
+	char *name;
 	int ret;
+	int cpu;
 
-	ret = register_stat_tracer(&function_stats);
-	if (ret) {
-		pr_warning("Warning: could not register "
-			   "function stats\n");
-		return;
+	for_each_possible_cpu(cpu) {
+		stat = &per_cpu(ftrace_profile_stats, cpu);
+
+		/* allocate enough for function name + cpu number */
+		name = kmalloc(32, GFP_KERNEL);
+		if (!name) {
+			/*
+			 * The files created are permanent, if something happens
+			 * we still do not free memory.
+			 */
+			kfree(stat);
+			WARN(1,
+			     "Could not allocate stat file for cpu %d\n",
+			     cpu);
+			return;
+		}
+		stat->stat = function_stats;
+		snprintf(name, 32, "function%d", cpu);
+		stat->stat.name = name;
+		ret = register_stat_tracer(&stat->stat);
+		if (ret) {
+			WARN(1,
+			     "Could not register function stat for cpu %d\n",
+			     cpu);
+			kfree(name);
+			return;
+		}
 	}
 
 	entry = debugfs_create_file("function_profile_enabled", 0644,
-- 
cgit v0.10.2


From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 23:17:58 -0400
Subject: function-graph: add option to calculate graph time or not

graph time is the time that a function is executing another function.
Thus if function A calls B, if graph-time is set, then the time for
A includes B. This is the default behavior. But if graph-time is off,
then the time spent executing B is subtracted from A.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 015a3d2..9e0a8d2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -365,6 +365,7 @@ struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
 	unsigned long long calltime;
+	unsigned long long subtime;
 };
 
 /*
@@ -376,8 +377,6 @@ extern void return_to_handler(void);
 
 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
-extern void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
 
 /*
  * Sometimes we don't want to trace a function with the function
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ed1fc50..71e5fae 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
 	struct ftrace_profile_stat *stat;
+	unsigned long long calltime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 
@@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	if (!stat->hash)
 		goto out;
 
+	calltime = trace->rettime - trace->calltime;
+
+	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+		int index;
+
+		index = trace->depth;
+
+		/* Append this call time to the parent time to subtract */
+		if (index)
+			current->ret_stack[index - 1].subtime += calltime;
+
+		if (current->ret_stack[index].subtime < calltime)
+			calltime -= current->ret_stack[index].subtime;
+		else
+			calltime = 0;
+	}
+
 	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
-		rec->time += trace->rettime - trace->calltime;
+		rec->time += calltime;
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 821bf49..5d1a16c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+	TRACE_ITER_GRAPH_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +318,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"global-clock",
 	"sleep-time",
+	"graph-time",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c66ca3b..e3429a8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -685,6 +685,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 	TRACE_ITER_SLEEP_TIME		= 0x100000,
+	TRACE_ITER_GRAPH_TIME		= 0x200000,
 };
 
 /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 85bba0f..10f6ad7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
+	current->ret_stack[index].subtime = 0;
 	*depth = index;
 
 	return 0;
 }
 
 /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
 	int index;
@@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
 }
 
 /*
@@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void)
 	ftrace_pop_return_trace(&trace, &ret);
 	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
+	barrier();
+	current->curr_ret_stack--;
 
 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
-- 
cgit v0.10.2


From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 13:26:41 -0400
Subject: tracing: clean up tracing profiler

Ingo Molnar suggested clean ups for the profiling code. This patch
makes those updates.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 71e5fae..a141d84 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
-	.func = ftrace_stub,
+	.func		= ftrace_stub,
 };
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -271,8 +271,10 @@ struct ftrace_profile_stat {
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static int ftrace_profile_bits;
-static int ftrace_profile_enabled;
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
 static DEFINE_MUTEX(ftrace_profile_lock);
 
 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
@@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void)
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
-	.func = function_profile_call,
+	.func		= function_profile_call,
 };
 
 static int register_ftrace_profiler(void)
@@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
 	unsigned long val;
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int ret;
 
 	if (cnt >= sizeof(buf))
@@ -719,7 +721,7 @@ static ssize_t
 ftrace_profile_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int r;
 
 	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
@@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = {
 
 /* used to initialize the real stat files */
 static struct tracer_stat function_stats __initdata = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
+	.name		= "functions",
+	.stat_start	= function_stat_start,
+	.stat_next	= function_stat_next,
+	.stat_cmp	= function_stat_cmp,
+	.stat_headers	= function_stat_headers,
+	.stat_show	= function_stat_show
 };
 
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
@@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-	.func = function_trace_probe_call,
+	.func		= function_trace_probe_call,
 };
 
 static int ftrace_probe_registered;
-- 
cgit v0.10.2


From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 20:06:34 -0400
Subject: tracing: remove on the fly allocator from function profiler

Impact: safer code

The on the fly allocator for the function profiler was to save
memory. But at the expense of stability. Although it survived several
tests, allocating from the function tracer is just too risky, just
to save space.

This patch removes the allocator and simply allocates enough entries
at start up.

Each function gets a profiling structure of 40 bytes. With an average
of 20K functions, and this is for each CPU, we have 800K per online
CPU. This is not too bad, at least for non-embedded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a141d84..4d90c91 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
+	int functions;
+	int pages;
 	int i;
 
 	/* If we already allocated, do nothing */
@@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 	if (!stat->pages)
 		return -ENOMEM;
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	functions = ftrace_update_tot_cnt;
+#else
+	/*
+	 * We do not know the number of functions that exist because
+	 * dynamic tracing is what counts them. With past experience
+	 * we have around 20K functions. That should be more than enough.
+	 * It is highly unlikely we will execute every function in
+	 * the kernel.
+	 */
+	functions = 20000;
+#endif
+
 	pg = stat->start = stat->pages;
 
-	/* allocate 10 more pages to start */
-	for (i = 0; i < 10; i++) {
+	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+	for (i = 0; i < pages; i++) {
 		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
-		/*
-		 * We only care about allocating profile_pages, if
-		 * we failed to allocate here, hopefully we will allocate
-		 * later.
-		 */
 		if (!pg->next)
-			break;
+			goto out_free;
 		pg = pg->next;
 	}
 
 	return 0;
+
+ out_free:
+	pg = stat->start;
+	while (pg) {
+		unsigned long tmp = (unsigned long)pg;
+
+		pg = pg->next;
+		free_page(tmp);
+	}
+
+	free_page((unsigned long)stat->pages);
+	stat->pages = NULL;
+	stat->start = NULL;
+
+	return -ENOMEM;
 }
 
 static int ftrace_profile_init_cpu(int cpu)
@@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu)
 			ftrace_profile_bits++;
 	}
 
-	/* Preallocate a few pages */
+	/* Preallocate the function profiling pages */
 	if (ftrace_profile_pages_init(stat) < 0) {
 		kfree(stat->hash);
 		stat->hash = NULL;
@@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
 	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
-/* Interrupts must be disabled calling this */
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
 static struct ftrace_profile *
-ftrace_profile_alloc(struct ftrace_profile_stat *stat,
-		     unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec = NULL;
 
-	/* prevent recursion */
+	/* prevent recursion (from NMIs) */
 	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	/* Try to always keep another page available */
-	if (!stat->pages->next && alloc_safe)
-		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
-
 	/*
-	 * Try to find the function again since another
-	 * task on another CPU could have added it
+	 * Try to find the function again since an NMI
+	 * could have added it
 	 */
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
@@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat,
 	return rec;
 }
 
-/*
- * If we are not in an interrupt, or softirq and
- * and interrupts are disabled and preemption is not enabled
- * (not in a spinlock) then it should be safe to allocate memory.
- */
-static bool ftrace_safe_to_allocate(void)
-{
-	return !in_interrupt() && irqs_disabled() && !preempt_count();
-}
-
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
-	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
-	alloc_safe = ftrace_safe_to_allocate();
-
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
@@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip);
 		if (!rec)
 			goto out;
 	}
-- 
cgit v0.10.2


From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 21:00:47 -0400
Subject: tracing: add average time in function to function profiler

Show the average time in the function (Time / Hit)

  Function                               Hit    Time            Avg
  --------                               ---    ----            ---
  mwait_idle                              51    140326.6 us     2751.503 us
  smp_apic_timer_interrupt                47    3517.735 us     74.845 us
  schedule                                10    2738.754 us     273.875 us
  __schedule                              10    2732.857 us     273.285 us
  hrtimer_interrupt                       47    1896.104 us     40.342 us
  irq_exit                                56    1711.833 us     30.568 us
  __run_hrtimer                           47    1315.589 us     27.991 us
  tick_sched_timer                        47    1138.690 us     24.227 us
  do_softirq                              56    1116.829 us     19.943 us
  __do_softirq                            56    1066.932 us     19.052 us
  do_IRQ                                   9    926.153 us      102.905 us

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d90c91..c7f4a4b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2)
 static int function_stat_headers(struct seq_file *m)
 {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	seq_printf(m, "  Function                               Hit    Time\n"
-		      "  --------                               ---    ----\n");
+	seq_printf(m, "  Function                               "
+		   "Hit    Time            Avg\n"
+		      "  --------                               "
+		   "---    ----            ---\n");
 #else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
@@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v)
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	static struct trace_seq s;
 	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);
-	trace_seq_init(&s);
-	trace_print_graph_duration(rec->time, &s);
+	static struct trace_seq s;
+	unsigned long long avg;
 #endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	seq_printf(m, "    ");
+	avg = rec->time;
+	do_div(avg, rec->counter);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+	trace_seq_puts(&s, "    ");
+	trace_print_graph_duration(avg, &s);
 	trace_print_seq(m, &s);
 	mutex_unlock(&mutex);
 #endif
-- 
cgit v0.10.2


From a8a93f3f03b7a8008d720e8d91798efe599d416c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 12 Feb 2009 13:45:34 -0800
Subject: mm: disable preemption in apply_to_pte_range

Impact: bugfix

Lazy mmu mode needs preemption disabled, so if we're apply to
init_mm (which doesn't require any pte locks), then explicitly
disable preemption.  (Do it unconditionally after checking we've
successfully done the allocation to simplify the error handling.)

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/mm/memory.c b/mm/memory.c
index baa999e..b80cc31 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1718,6 +1718,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	preempt_disable();
 	arch_enter_lazy_mmu_mode();
 
 	token = pmd_pgtable(*pmd);
@@ -1729,6 +1730,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
+	preempt_enable();
 
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
-- 
cgit v0.10.2


From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:05:19 -0800
Subject: x86/paravirt: remove lazy mode in interrupts

Impact: simplification, robustness

Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE
when in an interrupt.  This prevents interrupt code from
accidentally inheriting an outer lazy state, and instead
does everything synchronously.  Outer batched operations
are left deferred.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358..8ab250a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void)
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 {
+	if (in_interrupt())
+		return PARAVIRT_LAZY_NONE;
+
 	return __get_cpu_var(paravirt_lazy_mode);
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b727..cfbb4a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -225,12 +225,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	if (!pmd_present(*pmd_k))
 		return NULL;
 
-	if (!pmd_present(*pmd)) {
+	if (!pmd_present(*pmd))
 		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else {
+	else
 		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-	}
 
 	return pmd_k;
 }
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c..e81dfa4 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -87,7 +87,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	BUG_ON(!pte_none(*(kmap_pte-idx)));
 	set_pte(kmap_pte-idx, mk_pte(page, prot));
-	arch_flush_lazy_mmu_mode();
 
 	return (void *)vaddr;
 }
@@ -117,7 +116,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
 	}
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d4..b6a61f3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -74,7 +74,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
 	if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
 		kpte_clear_flush(kmap_pte-idx, vaddr);
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c42949..9015e5e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -824,13 +824,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 
 	vm_unmap_aliases();
 
-	/*
-	 * If we're called with lazy mmu updates enabled, the
-	 * in-memory pte state may be stale.  Flush pending updates to
-	 * bring them up to date.
-	 */
-	arch_flush_lazy_mmu_mode();
-
 	cpa.vaddr = addr;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
@@ -873,13 +866,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	} else
 		cpa_flush_all(cache);
 
-	/*
-	 * If we've been called with lazy mmu updates enabled, then
-	 * make sure that everything gets flushed out before we
-	 * return.
-	 */
-	arch_flush_lazy_mmu_mode();
-
 out:
 	return ret;
 }
-- 
cgit v0.10.2


From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:24:03 -0800
Subject: x86/pvops: replace arch_enter_lazy_cpu_mode with
 arch_start_context_switch

Impact: simplification, prepare for later changes

Make lazy cpu mode more specific to context switching, so that
it makes sense to do more context-switch specific things in
the callbacks.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5c..7b28aba 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1420,19 +1420,17 @@ void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
 void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
 
-#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define  __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(void)
 {
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
 }
 
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(void)
 {
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
 }
 
-void arch_flush_lazy_cpu_mode(void);
-
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
 {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8ab250a..5eea954 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_enable();
 }
 
-void arch_flush_lazy_cpu_mode(void)
-{
-	preempt_disable();
-
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-		WARN_ON(preempt_count() == 1);
-		arch_leave_lazy_cpu_mode();
-		arch_enter_lazy_cpu_mode();
-	}
-
-	preempt_enable();
-}
-
 struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 14014d7..57e49a8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_leave_lazy_cpu_mode();
+	arch_end_context_switch();
 
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index abb7e6a..7115e60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_leave_lazy_cpu_mode();
+	arch_end_context_switch();
 
 	/*
 	 * Switch FS and GS.
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cb6afa4..6b98f87 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1119,10 +1119,8 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
 		load_cr3(swapper_pg_dir);
-		arch_flush_lazy_cpu_mode();
-	}
 }
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1135,7 +1133,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
-		arch_flush_lazy_cpu_mode();
 	}
 
 	/* Get the "official" set of cpus referring to our pagetable. */
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index e16fdb1..235e34a 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -73,8 +73,8 @@ static inline int pte_file(pte_t pte) { return 0; }
 #define pgtable_cache_init()		do {} while (0)
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
 #define arch_leave_lazy_mmu_mode()	do {} while (0)
-#define arch_enter_lazy_cpu_mode()	do {} while (0)
-#define arch_leave_lazy_cpu_mode()	do {} while (0)
+
+#define arch_start_context_switch()	do {} while (0)
 
 #else /* !CONFIG_MMU */
 /*****************************************************************************/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e6d0ca..922f036 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif
 
 /*
- * A facility to provide batching of the reload of page tables with the
- * actual context switch code for paravirtualized guests.  By convention,
- * only one of the lazy modes (CPU, MMU) should be active at any given
- * time, entry should never be nested, and entry and exits should always
- * be paired.  This is for sanity of maintaining and reasoning about the
- * kernel code.
+ * A facility to provide batching of the reload of page tables and
+ * other process state with the actual context switch code for
+ * paravirtualized guests.  By convention, only one of the batched
+ * update (lazy) modes (CPU, MMU) should be active at any given time,
+ * entry should never be nested, and entry and exits should always be
+ * paired.  This is for sanity of maintaining and reasoning about the
+ * kernel code.  In this case, the exit (end of the context switch) is
+ * in architecture-specific code, and so doesn't need a generic
+ * definition.
  */
-#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-#define arch_enter_lazy_cpu_mode()	do {} while (0)
-#define arch_leave_lazy_cpu_mode()	do {} while (0)
-#define arch_flush_lazy_cpu_mode()	do {} while (0)
+#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
+#define arch_start_context_switch()	do {} while (0)
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
diff --git a/kernel/sched.c b/kernel/sched.c
index 5757e03..7530fdd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch();
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
-- 
cgit v0.10.2


From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:46:21 -0800
Subject: x86/paravirt: flush pending mmu updates on context switch

Impact: allow preemption during lazy mmu updates

If we're in lazy mmu mode when context switching, leave
lazy mmu mode, but remember the task's state in
TIF_LAZY_MMU_UPDATES.  When we resume the task, check this
flag and re-enter lazy mmu mode if its set.

This sets things up for allowing lazy mmu mode while preemptible,
though that won't actually be active until the next change.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7b28aba..58d2481 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1418,7 +1418,6 @@ void paravirt_enter_lazy_cpu(void);
 void paravirt_leave_lazy_cpu(void);
 void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
 
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
 static inline void arch_start_context_switch(void)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index df9d5f7..2f34d64 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,6 +94,7 @@ struct thread_info {
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
+#define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -115,6 +116,7 @@ struct thread_info {
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca9..5d7f6e7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void)
 	struct kvm_para_state *state = kvm_para_state();
 
 	mmu_queue_flush(state);
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
+	paravirt_leave_lazy_mmu();
 	state->mode = paravirt_get_lazy_mode();
 }
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5eea954..430a0e3 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode)
 	__get_cpu_var(paravirt_lazy_mode) = mode;
 }
 
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
 	BUG_ON(preemptible());
@@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void)
 
 void paravirt_leave_lazy_mmu(void)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+	leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
 void paravirt_enter_lazy_cpu(void)
 {
+	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+		arch_leave_lazy_mmu_mode();
+		set_thread_flag(TIF_LAZY_MMU_UPDATES);
+	}
 	enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
 void paravirt_leave_lazy_cpu(void)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+	leave_lazy(PARAVIRT_LAZY_CPU);
+
+	if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES))
+		arch_enter_lazy_mmu_mode();
 }
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90..950929c 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void)
 	vmi_ops.set_lazy_mode(2);
 }
 
+static void vmi_leave_lazy_cpu(void)
+{
+	vmi_ops.set_lazy_mode(0);
+	paravirt_leave_lazy_cpu();
+}
+
 static void vmi_enter_lazy_mmu(void)
 {
 	paravirt_enter_lazy_mmu();
 	vmi_ops.set_lazy_mode(1);
 }
 
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	vmi_ops.set_lazy_mode(0);
+	paravirt_leave_lazy_mmu();
 }
 
 static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -718,12 +724,12 @@ static inline int __init activate_vmi(void)
 
 	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu,
 		  set_lazy_mode, SetLazyMode);
 
 	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
 		  set_lazy_mode, SetLazyMode);
 
 	/* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4dda..41a5562 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -147,10 +147,16 @@ static void lazy_hcall(unsigned long call,
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	paravirt_leave_lazy_mmu();
+}
+
+static void lguest_leave_lazy_cpu_mode(void)
+{
+	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	paravirt_leave_lazy_cpu();
 }
 
 /*G:033
@@ -1026,7 +1032,7 @@ __init void lguest_init(void)
 	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
 	pv_cpu_ops.wbinvd = lguest_wbinvd;
 	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode;
 
 	/* pagetable management */
 	pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1039,7 +1045,7 @@ __init void lguest_init(void)
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 82cd39a..f586e63 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-void xen_leave_lazy(void)
+static void xen_leave_lazy_cpu(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
+	paravirt_leave_lazy_cpu();
 }
 
 static unsigned long xen_store_tr(void)
@@ -819,7 +819,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_cpu,
-		.leave = xen_leave_lazy,
+		.leave = xen_leave_lazy_cpu,
 	},
 };
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6b98f87..f5f8faa 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1816,6 +1816,11 @@ __init void xen_post_allocator_init(void)
 	xen_mark_init_mm_pinned();
 }
 
+static void xen_leave_lazy_mmu(void)
+{
+	xen_mc_flush();
+	paravirt_leave_lazy_mmu();
+}
 
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pagetable_setup_start = xen_pagetable_setup_start,
@@ -1891,7 +1896,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_mmu,
-		.leave = xen_leave_lazy,
+		.leave = xen_leave_lazy_mmu,
 	},
 
 	.set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2f5ef26..f897cdf 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 
-void xen_leave_lazy(void);
 void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
-- 
cgit v0.10.2


From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 11:18:57 -0800
Subject: x86/paravirt: finish change from lazy cpu to context switch start/end

Impact: fix lazy context switch API

Pass the previous and next tasks into the context switch start
end calls, so that the called functions can properly access the
task state (esp in end_context_switch, in which the next task
is not yet completely current).

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 58d2481..dfdee0c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
 struct tss_struct;
 struct mm_struct;
 struct desc_struct;
+struct task_struct;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
 
 	void (*swapgs)(void);
 
-	struct pv_lazy_ops lazy_mode;
+	void (*start_context_switch)(struct task_struct *prev);
+	void (*end_context_switch)(struct task_struct *next);
 };
 
 struct pv_irq_ops {
@@ -1414,20 +1416,21 @@ enum paravirt_lazy_mode {
 };
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
 void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
 
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
-static inline void arch_start_context_switch(void)
+static inline void arch_start_context_switch(struct task_struct *prev)
 {
-	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+	PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
 }
 
-static inline void arch_end_context_switch(void)
+static inline void arch_end_context_switch(struct task_struct *next)
 {
-	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+	PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
 }
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d0812e1..24e4283 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -83,6 +83,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
 #define pte_val(x)	native_pte_val(x)
 #define __pte(x)	native_make_pte(x)
 
+#define arch_end_context_switch(prev)	do {} while(0)
+
 #endif	/* CONFIG_PARAVIRT */
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 430a0e3..cf14375 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void)
 	leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
 {
 	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
 		arch_leave_lazy_mmu_mode();
-		set_thread_flag(TIF_LAZY_MMU_UPDATES);
+		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
 	}
 	enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
 {
 	leave_lazy(PARAVIRT_LAZY_CPU);
 
-	if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES))
+	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
 		arch_enter_lazy_mmu_mode();
 }
 
@@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.set_iopl_mask = native_set_iopl_mask,
 	.io_delay = native_io_delay,
 
-	.lazy_mode = {
-		.enter = paravirt_nop,
-		.leave = paravirt_nop,
-	},
+	.start_context_switch = paravirt_nop,
+	.end_context_switch = paravirt_nop,
 };
 
 struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 57e49a8..d766c76 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_end_context_switch();
+	arch_end_context_switch(next_p);
 
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7115e60..e8a9aaf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_end_context_switch();
+	arch_end_context_switch(next_p);
 
 	/*
 	 * Switch FS and GS.
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 950929c..55a5d69 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
 {
-	paravirt_enter_lazy_cpu();
+	paravirt_start_context_switch(prev);
 	vmi_ops.set_lazy_mode(2);
 }
 
-static void vmi_leave_lazy_cpu(void)
+static void vmi_end_context_switch(struct task_struct *next)
 {
 	vmi_ops.set_lazy_mode(0);
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 static void vmi_enter_lazy_mmu(void)
@@ -722,9 +722,9 @@ static inline int __init activate_vmi(void)
 	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
 	para_fill(pv_cpu_ops.io_delay, IODelay);
 
-	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+	para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu,
+	para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
 		  set_lazy_mode, SetLazyMode);
 
 	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 41a5562..5287081 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -153,10 +153,10 @@ static void lguest_leave_lazy_mmu_mode(void)
 	paravirt_leave_lazy_mmu();
 }
 
-static void lguest_leave_lazy_cpu_mode(void)
+static void lguest_end_context_switch(struct task_struct *next)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 /*G:033
@@ -1031,8 +1031,8 @@ __init void lguest_init(void)
 	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
 	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
 	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode;
+	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
 
 	/* pagetable management */
 	pv_mmu_ops.write_cr3 = lguest_write_cr3;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f586e63..70b355d3a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-static void xen_leave_lazy_cpu(void)
+static void xen_end_context_switch(struct task_struct *next)
 {
 	xen_mc_flush();
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 static unsigned long xen_store_tr(void)
@@ -817,10 +817,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	/* Xen takes care of %gs when switching to usermode for us */
 	.swapgs = paravirt_nop,
 
-	.lazy_mode = {
-		.enter = paravirt_enter_lazy_cpu,
-		.leave = xen_leave_lazy_cpu,
-	},
+	.start_context_switch = paravirt_start_context_switch,
+	.end_context_switch = xen_end_context_switch,
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index 235e34a..0988704 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -74,7 +74,7 @@ static inline int pte_file(pte_t pte) { return 0; }
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
 #define arch_leave_lazy_mmu_mode()	do {} while (0)
 
-#define arch_start_context_switch()	do {} while (0)
+#define arch_start_context_switch(prev)	do {} while (0)
 
 #else /* !CONFIG_MMU */
 /*****************************************************************************/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 922f036..e410f60 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -291,7 +291,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
  * definition.
  */
 #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
-#define arch_start_context_switch()	do {} while (0)
+#define arch_start_context_switch(prev)	do {} while (0)
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
diff --git a/kernel/sched.c b/kernel/sched.c
index 7530fdd..133762a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_start_context_switch();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
-- 
cgit v0.10.2


From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:53:19 -0800
Subject: x86/paravirt: allow preemption with lazy mmu mode

Impact: remove obsolete checks, simplification

Lift restrictions on preemption with lazy mmu mode, as it is now allowed.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index cf14375..bf2e86e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-	BUG_ON(preemptible());
 
 	__get_cpu_var(paravirt_lazy_mode) = mode;
 }
@@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode)
 static void leave_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
-	BUG_ON(preemptible());
 
 	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
 }
@@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void)
 
 void paravirt_start_context_switch(struct task_struct *prev)
 {
+	BUG_ON(preemptible());
+
 	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
 		arch_leave_lazy_mmu_mode();
 		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
@@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev)
 
 void paravirt_end_context_switch(struct task_struct *next)
 {
+	BUG_ON(preemptible());
+
 	leave_lazy(PARAVIRT_LAZY_CPU);
 
 	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
@@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_disable();
 
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-		WARN_ON(preempt_count() == 1);
 		arch_leave_lazy_mmu_mode();
 		arch_enter_lazy_mmu_mode();
 	}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f5f8faa..3f2d0fe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -419,10 +419,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	/* updates to init_mm may be done without lock */
-	if (mm == &init_mm)
-		preempt_disable();
-
 	ADD_STATS(set_pte_at, 1);
 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 	ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -443,9 +439,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	}
 	xen_set_pte(ptep, pteval);
 
-out:
-	if (mm == &init_mm)
-		preempt_enable();
+out:	return;
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
-- 
cgit v0.10.2


From 252a6bf2a3a7e7add56b17d48aecf3f3ef213103 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 00:11:28 -0800
Subject: mm: allow preemption in apply_to_pte_range

Impact: allow preemption in apply_to_pte_range updates to init_mm

Preemption is now allowed for lazy mmu mode, so don't disable
it for the inner loop of apply_to_pte_range.  This only applies
when doing updates to init_mm; user pagetables are still modified
under the pte lock, so preemption is disabled anyway.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/mm/memory.c b/mm/memory.c
index b80cc31..baa999e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1718,7 +1718,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
-	preempt_disable();
 	arch_enter_lazy_mmu_mode();
 
 	token = pmd_pgtable(*pmd);
@@ -1730,7 +1729,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
-	preempt_enable();
 
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
-- 
cgit v0.10.2


From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 00:18:50 -0800
Subject: x86/paravirt: use percpu_ rather than __get_cpu_var

Impact: minor optimisation

percpu_read/write is a slightly more direct way of getting
to percpu data.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bf2e86e..254e8aa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
 
-	__get_cpu_var(paravirt_lazy_mode) = mode;
+	percpu_write(paravirt_lazy_mode, mode);
 }
 
 static void leave_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
+	BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
 
-	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 	if (in_interrupt())
 		return PARAVIRT_LAZY_NONE;
 
-	return __get_cpu_var(paravirt_lazy_mode);
+	return percpu_read(paravirt_lazy_mode);
 }
 
 void arch_flush_lazy_mmu_mode(void)
-- 
cgit v0.10.2


From 5caecb9432428241d0c641897f07ff4003f1b55f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 20 Feb 2009 23:01:26 -0800
Subject: xen: disable preempt for leave_lazy_mmu

xen_mc_flush() requires preemption to be disabled for its own sanity,
so disable it while we're flushing.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f2d0fe..0e57238 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1812,8 +1812,10 @@ __init void xen_post_allocator_init(void)
 
 static void xen_leave_lazy_mmu(void)
 {
+	preempt_disable();
 	xen_mc_flush();
 	paravirt_leave_lazy_mmu();
+	preempt_enable();
 }
 
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
-- 
cgit v0.10.2


From 59d7187142bbe9b404a403ed0f874d3227305f26 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 26 Feb 2009 15:48:33 -0800
Subject: xen: separate p2m allocation from setting

When doing very early p2m setting, we need to separate setting
from allocation, so split things up accordingly.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0e57238..e0a55b7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -233,47 +233,74 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+/* install a  new p2m_top page */
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 {
-	unsigned long *p;
+	unsigned topidx = p2m_top_index(pfn);
+	unsigned long **pfnp, *mfnp;
 	unsigned i;
 
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
+	pfnp = &p2m_top[topidx];
+	mfnp = &p2m_top_mfn[topidx];
 
 	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 		p[i] = INVALID_P2M_ENTRY;
 
-	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
-		free_page((unsigned long)p);
-	else
+	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 		*mfnp = virt_to_mfn(p);
+		return true;
+	}
+
+	return false;
 }
 
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+static void alloc_p2m(unsigned long pfn)
 {
-	unsigned topidx, idx;
+	unsigned long *p;
 
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return;
-	}
+	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(p == NULL);
+
+	if (!install_p2mtop_page(pfn, p))
+		free_page((unsigned long)p);
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	unsigned topidx, idx;
 
 	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
-		return;
+		return true;
 	}
 
 	topidx = p2m_top_index(pfn);
 	if (p2m_top[topidx] == p2m_missing) {
-		/* no need to allocate a page to store an invalid entry */
 		if (mfn == INVALID_P2M_ENTRY)
-			return;
-		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+			return true;
+		return false;
 	}
 
 	idx = p2m_index(pfn);
 	p2m_top[topidx][idx] = mfn;
+
+	return true;
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+
+	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
+		alloc_p2m(pfn);
+
+		if (!__set_phys_to_machine(pfn, mfn))
+			BUG();
+	}
 }
 
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 24d1b44..da73026 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -11,6 +11,9 @@ enum pt_level {
 };
 
 
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
+
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 
-- 
cgit v0.10.2


From a2bcd4731f77cb77ae4b5e4a3d7f5471cf346c33 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 29 Mar 2009 23:47:48 +0200
Subject: x86/mm: further cleanups of fault.c's include file section

Impact: cleanup

Eliminate more than 20 unnecessary #include lines in fault.c

Also fix include file dependency bug in asm/traps.h. (this was
masked before, by implicit inclusion)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <tip-56aea8468746e673a4bf50b6a13d97b2d1cbe1e8@git.kernel.org>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d53425..37fb07a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_TRAPS_H
 
 #include <asm/debugreg.h>
+#include <asm/siginfo.h>			/* TRAP_TRACE, ... */
 
 #ifdef CONFIG_X86_32
 #define dotraplinkage
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b727..24a36a6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,16 @@
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h>		/* STACK_END_MAGIC		*/
+#include <linux/sched.h>		/* test_thread_flag(), ...	*/
+#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
+#include <linux/module.h>		/* search_exception_table	*/
+#include <linux/bootmem.h>		/* max_low_pfn			*/
+#include <linux/kprobes.h>		/* __kprobes, ...		*/
+#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
+
+#include <asm/traps.h>			/* dotraplinkage, ...		*/
+#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 
 /*
  * Page fault error code bits:
-- 
cgit v0.10.2


From 7571a60446030d2576d881438447e86a0755a83b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Feb 2009 15:34:59 -0800
Subject: xen: split construction of p2m mfn tables from registration

Build the p2m_mfn_list_list early with the rest of the p2m table, but
register it later when the real shared_info structure is in place.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e0a55b7..67d2ab4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn)
 }
 
 /* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
+static void __init xen_build_mfn_list_list(void)
 {
 	unsigned pfn, idx;
 
@@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void)
 		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 	}
+}
 
+void xen_setup_mfn_list_list(void)
+{
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void)
 
 		p2m_top[topidx] = &mfn_list[pfn];
 	}
+
+	xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
-- 
cgit v0.10.2


From 6ed6bf428aff64fe37cdc54b239d598fee6016f1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 13:02:18 -0800
Subject: xen: clean up xen_load_gdt

Makes the logic a bit clearer.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 70b355d3a..5776dc2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -301,10 +301,21 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = arbitrary_virt_to_mfn((void *)va);
+		int level;
+		pte_t *ptep = lookup_address(va, &level);
+		unsigned long pfn, mfn;
+		void *virt;
+
+		BUG_ON(ptep == NULL);
+
+		pfn = pte_pfn(*ptep);
+		mfn = pfn_to_mfn(pfn);
+		virt = __va(PFN_PHYS(pfn));
+
+		frames[f] = mfn;
 
 		make_lowmem_page_readonly((void *)va);
-		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
+		make_lowmem_page_readonly(virt);
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-- 
cgit v0.10.2


From 3ce5fa7ebff74b6a4dc5fdcdc22e6979f5a4ff85 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 15:26:00 -0800
Subject: xen: make xen_load_gdt simpler

Remove use of multicall machinery which is unused (gdt loading
is never performance critical).  This removes the implicit use
of percpu variables, which simplifies understanding how
the percpu code's use of load_gdt interacts with this code.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5776dc2..48b399b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -284,12 +284,11 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 
 static void xen_load_gdt(const struct desc_ptr *dtr)
 {
-	unsigned long *frames;
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
 	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+	unsigned long frames[pages];
 	int f;
-	struct multicall_space mcs;
 
 	/* A GDT can be up to 64k in size, which corresponds to 8192
 	   8-byte entries, or 16 4k pages.. */
@@ -297,9 +296,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	BUG_ON(size > 65536);
 	BUG_ON(va & ~PAGE_MASK);
 
-	mcs = xen_mc_entry(sizeof(*frames) * pages);
-	frames = mcs.args;
-
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
 		int level;
 		pte_t *ptep = lookup_address(va, &level);
@@ -314,13 +310,15 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
+		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
+		       f, (void *)va, mfn, pfn, virt);
+
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
 
-	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+		BUG();
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
-- 
cgit v0.10.2


From b4b7e58590d0e94ed78bd6be1aa163caba7b6c74 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:34:27 -0800
Subject: xen: remove xen_load_gdt debug

Don't need the noise.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 48b399b..75b7a0f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -310,9 +310,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
-		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
-		       f, (void *)va, mfn, pfn, virt);
-
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
-- 
cgit v0.10.2


From e9e2d1ffcfdb38bed11a3064aa74bea9ee38ed80 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Thu, 5 Mar 2009 20:13:57 +0100
Subject: NULL noise: arch/x86/xen/smp.c

Fix this sparse warnings:
  arch/x86/xen/smp.c:316:52: warning: Using plain integer as NULL pointer
  arch/x86/xen/smp.c:421:60: warning: Using plain integer as NULL pointer

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 8d47056..304d832 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	BUG_ON(rc);
 
 	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
-		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 		barrier();
 	}
 
@@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 	/* Make sure other vcpus get a chance to run if they need to. */
 	for_each_cpu(cpu, mask) {
 		if (xen_vcpu_stolen(cpu)) {
-			HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+			HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 			break;
 		}
 	}
-- 
cgit v0.10.2


From e826fe1ba1563a9272345da8e3279a930ac160a7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 17:09:27 -0800
Subject: xen: mask XSAVE from cpuid

Xen leaves XSAVE set in cpuid, but doesn't allow cr4.OSXSAVE
to be set.  This confuses the kernel and it ends up crashing on
an xsetbv instruction.

At boot time, try to set cr4.OSXSAVE, and mask XSAVE out of
cpuid it we can't.  This will produce a spurious error from Xen,
but allows us to support XSAVE if/when Xen does.

This also factors out the cpuid mask decisions to boot time.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75b7a0f..da33e0c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -168,21 +168,23 @@ static void __init xen_banner(void)
 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		      unsigned int *cx, unsigned int *dx)
 {
+	unsigned maskecx = ~0;
 	unsigned maskedx = ~0;
 
 	/*
 	 * Mask out inconvenient features, to try and disable as many
 	 * unsupported kernel subsystems as possible.
 	 */
-	if (*ax == 1)
-		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
-			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
-			    (1 << X86_FEATURE_MCE)  |  /* disable MCE */
-			    (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+	if (*ax == 1) {
+		maskecx = cpuid_leaf1_ecx_mask;
+		maskedx = cpuid_leaf1_edx_mask;
+	}
 
 	asm(XEN_EMULATE_PREFIX "cpuid"
 		: "=a" (*ax),
@@ -190,9 +192,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		  "=c" (*cx),
 		  "=d" (*dx)
 		: "0" (*ax), "2" (*cx));
+
+	*cx &= maskecx;
 	*dx &= maskedx;
 }
 
+static __init void xen_init_cpuid_mask(void)
+{
+	unsigned int ax, bx, cx, dx;
+
+	cpuid_leaf1_edx_mask =
+		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
+		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+	if (!xen_initial_domain())
+		cpuid_leaf1_edx_mask &=
+			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
+			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
+
+	ax = 1;
+	xen_cpuid(&ax, &bx, &cx, &dx);
+
+	/* cpuid claims we support xsave; try enabling it to see what happens */
+	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
+		unsigned long cr4;
+
+		set_in_cr4(X86_CR4_OSXSAVE);
+		
+		cr4 = read_cr4();
+
+		if ((cr4 & X86_CR4_OSXSAVE) == 0)
+			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+
+		clear_in_cr4(X86_CR4_OSXSAVE);
+	}
+}
+
 static void xen_set_debugreg(int reg, unsigned long val)
 {
 	HYPERVISOR_set_debugreg(reg, val);
@@ -901,6 +937,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_init_irq_ops();
 
+	xen_init_cpuid_mask();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * set up the basic apic ops.
-- 
cgit v0.10.2


From 68509cdcde6583ee1a9542899d1270449c7d5903 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 8 Mar 2009 03:59:04 -0700
Subject: x86-64: remove PGE from must-have feature list

PGE may not be available when running paravirtualized, so test the cpuid
bit before using it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index d5cd6c5..a4737dd 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -50,7 +50,7 @@
 #ifdef CONFIG_X86_64
 #define NEED_PSE	0
 #define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
-#define NEED_PGE	(1<<(X86_FEATURE_PGE & 31))
+#define NEED_PGE	0
 #define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
-- 
cgit v0.10.2


From 1e7449730853e7c9ae9a2458b2ced7ba12559a0e Mon Sep 17 00:00:00 2001
From: Alex Nixon <alex.nixon@citrix.com>
Date: Mon, 9 Feb 2009 12:05:46 -0800
Subject: Xen: Add virt_to_pfn helper function

Signed-off-by: Alex Nixon <alex.nixon@citrix.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a918dd..018a0a4 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 
 /* VIRT <-> MACHINE conversion */
 #define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define virt_to_pfn(v)          (PFN_DOWN(__pa(v)))
+#define virt_to_mfn(v)		(pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
 static inline unsigned long pte_mfn(pte_t pte)
-- 
cgit v0.10.2


From 5f241e65f2be4661a33e1937e1c829252a80b2b8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 16 Mar 2009 17:08:48 -0700
Subject: x86-64: non-paravirt systems always has PSE and PGE

A paravirtualized system may not have PSE or PGE available to
guests, so they are not required features.  However, without
paravirt we can assume that any x86-64 implementation will have
them available.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dd..64cf2d2 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
 #endif
 
 #ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
 #define NEED_PSE	0
-#define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
 #define NEED_PGE	0
+#else
+#define NEED_PSE	(1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE	(1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
 #define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
-- 
cgit v0.10.2


From 4185f35404dc96f8525298c7c548aee419f3b3f4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Mar 2009 13:30:55 -0700
Subject: xen/mmu: some early pagetable cleanups

1. make sure early-allocated ptes are pinned, so they can be later
   unpinned
2. don't pin pmd+pud, just make them RO
3. scatter some __inits around

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 67d2ab4..df87c80 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1013,7 +1013,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
 	return 0;
 }
 
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
 {
 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
@@ -1461,6 +1461,15 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 }
 #endif
 
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1469,22 +1478,29 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 	BUG_ON(mem_map);	/* should only be used early */
 #endif
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+	BUG_ON(mem_map);	/* should only be used early */
+#endif
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
 /* Early release_pte assumes that all pts are pinned, since there's
    only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static __init void xen_release_pte_init(unsigned long pfn)
 {
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static __init void xen_release_pmd_init(unsigned long pfn)
 {
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
 /* This needs to make sure the new pte page is pinned iff its being
@@ -1873,9 +1889,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
-	.alloc_pmd = xen_alloc_pte_init,
+	.alloc_pmd = xen_alloc_pmd_init,
 	.alloc_pmd_clone = paravirt_nop,
-	.release_pmd = xen_release_pte_init,
+	.release_pmd = xen_release_pmd_init,
 
 #ifdef CONFIG_HIGHPTE
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
@@ -1914,8 +1930,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_pgd = xen_set_pgd_hyper,
 
-	.alloc_pud = xen_alloc_pte_init,
-	.release_pud = xen_release_pte_init,
+	.alloc_pud = xen_alloc_pmd_init,
+	.release_pud = xen_release_pmd_init,
 #endif	/* PAGETABLE_LEVELS == 4 */
 
 	.activate_mm = xen_activate_mm,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f897cdf..5c50a10 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -56,8 +56,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
 bool xen_vcpu_stolen(int vcpu);
 
-void xen_mark_init_mm_pinned(void);
-
 void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
-- 
cgit v0.10.2


From 8de07bbdede03598801cf33ab23dcbcd28a918d2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 17:36:57 -0800
Subject: xen/mmu: weaken flush_tlb_other test

Impact: fixes crashing bug

There's no particular problem with getting an empty cpu mask,
so just shortcut-return if we get one.

Avoids crash reported by Christophe Saout <christophe@saout.de>

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index df87c80..e425a32 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1293,8 +1293,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	BUG_ON(cpumask_empty(cpus));
-	BUG_ON(!mm);
+	if (cpumask_empty(cpus))
+		return;		/* nothing to do */
 
 	mcs = xen_mc_entry(sizeof(*args));
 	args = mcs.args;
-- 
cgit v0.10.2


From 1e6fcf840e11ceff8a656a678c6e4b0560a98e08 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 25 Mar 2009 17:46:42 +0000
Subject: xen: resume interrupts before system devices.

Impact: bugfix Xen domain restore

Otherwise the first timer interrupt after resume is missed and we never
get another.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 3ccd348..b703dd2 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -68,15 +68,15 @@ static int xen_suspend(void *data)
 	gnttab_resume();
 	xen_mm_unpin_all();
 
-	sysdev_resume();
-	device_power_up(PMSG_RESUME);
-
 	if (!*cancelled) {
 		xen_irq_resume();
 		xen_console_resume();
 		xen_timer_resume();
 	}
 
+	sysdev_resume();
+	device_power_up(PMSG_RESUME);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 707ebbc81c61eb480d8a51ca61e355e240df1d32 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Mar 2009 11:29:02 -0700
Subject: xen: set _PAGE_NX in __supported_pte_mask before pagetable
 construction

Some 64-bit machines don't support the NX flag in ptes.
Check for NX before constructing the kernel pagetables.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index da33e0c..80f4c53 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/proto.h>
 #include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
@@ -912,7 +913,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 	.emergency_restart = xen_emergency_restart,
 };
 
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -980,6 +980,11 @@ asmlinkage void __init xen_start_kernel(void)
 	if (!xen_initial_domain())
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
+#ifdef CONFIG_X86_64
+	/* Work out if we support NX */
+	check_efer();
+#endif
+
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-- 
cgit v0.10.2


From 6d02c42698f99eccb290ac53d4f10ca883b9f90c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 29 Mar 2009 22:57:15 -0700
Subject: xen: clean up gate trap/interrupt constants

Use GATE_INTERRUPT/TRAP rather than 0xe/f.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 80f4c53..12a3159 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -428,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
-	if (val->type != 0xf && val->type != 0xe)
+	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
@@ -436,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-	if (val->type == 0xe)
-		info->flags |= 4;
+	if (val->type == GATE_INTERRUPT)
+		info->flags |= 1 << 2;
 
 	return 1;
 }
-- 
cgit v0.10.2


From d4c045364d3107603187f21a56ec231e74d26441 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:20:31 -0800
Subject: xen: add irq_from_evtchn

Given an evtchn, return the corresponding irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 30963af..1cd2a0e 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
 	return info_for_irq(irq)->evtchn;
 }
 
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+	return evtchn_to_irq[evtchn];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
 static enum ipi_vector ipi_from_irq(unsigned irq)
 {
 	struct irq_info *info = info_for_irq(irq);
diff --git a/include/xen/events.h b/include/xen/events.h
index 0d5f1ad..e68d59a 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
    irq will be disabled so it won't deliver an interrupt. */
 void xen_poll_irq(int irq);
 
+/* Determine the IRQ which is bound to an event channel */
+unsigned irq_from_evtchn(unsigned int evtchn);
+
 #endif	/* _XEN_EVENTS_H */
-- 
cgit v0.10.2


From f7116284c734f3a47180cd9c907944a1837ccb3c Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:21:19 -0800
Subject: xen: add /dev/xen/evtchn driver

This driver is used by application which wish to receive notifications
from the hypervisor or other guests via Xen's event channel
mechanism. In particular it is used by the xenstore daemon in domain
0.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 526187c..1bbb910 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
 	  secure, but slightly less efficient.
 	  If in doubt, say yes.
 
+config XEN_DEV_EVTCHN
+	tristate "Xen /dev/xen/evtchn device"
+	depends on XEN
+	default y
+	help
+	  The evtchn driver allows a userspace process to triger event
+	  channels and to receive notification of an event channel
+	  firing.
+	  If in doubt, say yes.
+
 config XENFS
 	tristate "Xen filesystem"
 	depends on XEN
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc..1567639 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,5 @@ obj-y	+= xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XENFS)		+= xenfs/
\ No newline at end of file
+obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o
+obj-$(CONFIG_XENFS)		+= xenfs/
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 0000000..517b9ee
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,494 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <asm/xen/hypervisor.h>
+
+struct per_user_data {
+	/* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+	evtchn_port_t *ring;
+	unsigned int ring_cons, ring_prod, ring_overflow;
+	struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+	/* Processes wait on this queue when ring is empty. */
+	wait_queue_head_t evtchn_wait;
+	struct fasync_struct *evtchn_async_queue;
+	const char *name;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static DEFINE_SPINLOCK(port_user_lock);
+
+irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+	unsigned int port = (unsigned long)data;
+	struct per_user_data *u;
+
+	spin_lock(&port_user_lock);
+
+	u = port_user[port];
+
+	disable_irq_nosync(irq);
+
+	if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+		wmb(); /* Ensure ring contents visible */
+		if (u->ring_cons == u->ring_prod++) {
+			wake_up_interruptible(&u->evtchn_wait);
+			kill_fasync(&u->evtchn_async_queue,
+				    SIGIO, POLL_IN);
+		}
+	} else {
+		u->ring_overflow = 1;
+	}
+
+	spin_unlock(&port_user_lock);
+
+	return IRQ_HANDLED;
+}
+
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	int rc;
+	unsigned int c, p, bytes1 = 0, bytes2 = 0;
+	struct per_user_data *u = file->private_data;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	if (count == 0)
+		return 0;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	for (;;) {
+		mutex_lock(&u->ring_cons_mutex);
+
+		rc = -EFBIG;
+		if (u->ring_overflow)
+			goto unlock_out;
+
+		c = u->ring_cons;
+		p = u->ring_prod;
+		if (c != p)
+			break;
+
+		mutex_unlock(&u->ring_cons_mutex);
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(u->evtchn_wait,
+					      u->ring_cons != u->ring_prod);
+		if (rc)
+			return rc;
+	}
+
+	/* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+	if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+		bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+			sizeof(evtchn_port_t);
+		bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+	} else {
+		bytes1 = (p - c) * sizeof(evtchn_port_t);
+		bytes2 = 0;
+	}
+
+	/* Truncate chunks according to caller's maximum byte count. */
+	if (bytes1 > count) {
+		bytes1 = count;
+		bytes2 = 0;
+	} else if ((bytes1 + bytes2) > count) {
+		bytes2 = count - bytes1;
+	}
+
+	rc = -EFAULT;
+	rmb(); /* Ensure that we see the port before we copy it. */
+	if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+	    ((bytes2 != 0) &&
+	     copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+		goto unlock_out;
+
+	u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+	rc = bytes1 + bytes2;
+
+ unlock_out:
+	mutex_unlock(&u->ring_cons_mutex);
+	return rc;
+}
+
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int rc, i;
+	evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	struct per_user_data *u = file->private_data;
+
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	rc = 0;
+	if (count == 0)
+		goto out;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	rc = -EFAULT;
+	if (copy_from_user(kbuf, buf, count) != 0)
+		goto out;
+
+	spin_lock_irq(&port_user_lock);
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+			enable_irq(irq_from_evtchn(kbuf[i]));
+	spin_unlock_irq(&port_user_lock);
+
+	rc = count;
+
+ out:
+	free_page((unsigned long)kbuf);
+	return rc;
+}
+
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+	int irq;
+	int rc = 0;
+
+	spin_lock_irq(&port_user_lock);
+
+	BUG_ON(port_user[port] != NULL);
+
+	irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+					u->name, (void *)(unsigned long)port);
+	if (rc < 0)
+		goto fail;
+
+	port_user[port] = u;
+
+fail:
+	spin_unlock_irq(&port_user_lock);
+	return rc;
+}
+
+static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+{
+	int irq = irq_from_evtchn(port);
+
+	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+	port_user[port] = NULL;
+}
+
+static long evtchn_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	int rc;
+	struct per_user_data *u = file->private_data;
+	void __user *uarg = (void __user *) arg;
+
+	switch (cmd) {
+	case IOCTL_EVTCHN_BIND_VIRQ: {
+		struct ioctl_evtchn_bind_virq bind;
+		struct evtchn_bind_virq bind_virq;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_virq.virq = bind.virq;
+		bind_virq.vcpu = 0;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						 &bind_virq);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_virq.port);
+		if (rc == 0)
+			rc = bind_virq.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+		struct ioctl_evtchn_bind_interdomain bind;
+		struct evtchn_bind_interdomain bind_interdomain;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_interdomain.remote_dom  = bind.remote_domain;
+		bind_interdomain.remote_port = bind.remote_port;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+						 &bind_interdomain);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+		if (rc == 0)
+			rc = bind_interdomain.local_port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+		struct ioctl_evtchn_bind_unbound_port bind;
+		struct evtchn_alloc_unbound alloc_unbound;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		alloc_unbound.dom        = DOMID_SELF;
+		alloc_unbound.remote_dom = bind.remote_domain;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+						 &alloc_unbound);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, alloc_unbound.port);
+		if (rc == 0)
+			rc = alloc_unbound.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_UNBIND: {
+		struct ioctl_evtchn_unbind unbind;
+
+		rc = -EFAULT;
+		if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+			break;
+
+		rc = -EINVAL;
+		if (unbind.port >= NR_EVENT_CHANNELS)
+			break;
+
+		spin_lock_irq(&port_user_lock);
+
+		rc = -ENOTCONN;
+		if (port_user[unbind.port] != u) {
+			spin_unlock_irq(&port_user_lock);
+			break;
+		}
+
+		evtchn_unbind_from_user(u, unbind.port);
+
+		spin_unlock_irq(&port_user_lock);
+
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_NOTIFY: {
+		struct ioctl_evtchn_notify notify;
+
+		rc = -EFAULT;
+		if (copy_from_user(&notify, uarg, sizeof(notify)))
+			break;
+
+		if (notify.port >= NR_EVENT_CHANNELS) {
+			rc = -EINVAL;
+		} else if (port_user[notify.port] != u) {
+			rc = -ENOTCONN;
+		} else {
+			notify_remote_via_evtchn(notify.port);
+			rc = 0;
+		}
+		break;
+	}
+
+	case IOCTL_EVTCHN_RESET: {
+		/* Initialise the ring to empty. Clear errors. */
+		mutex_lock(&u->ring_cons_mutex);
+		spin_lock_irq(&port_user_lock);
+		u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+		spin_unlock_irq(&port_user_lock);
+		mutex_unlock(&u->ring_cons_mutex);
+		rc = 0;
+		break;
+	}
+
+	default:
+		rc = -ENOSYS;
+		break;
+	}
+
+	return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask = POLLOUT | POLLWRNORM;
+	struct per_user_data *u = file->private_data;
+
+	poll_wait(file, &u->evtchn_wait, wait);
+	if (u->ring_cons != u->ring_prod)
+		mask |= POLLIN | POLLRDNORM;
+	if (u->ring_overflow)
+		mask = POLLERR;
+	return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+	struct per_user_data *u = filp->private_data;
+	return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u;
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+	if (u->name == NULL) {
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&u->evtchn_wait);
+
+	u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	if (u->ring == NULL) {
+		kfree(u->name);
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	mutex_init(&u->ring_cons_mutex);
+
+	filp->private_data = u;
+
+	return 0;
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+	int i;
+	struct per_user_data *u = filp->private_data;
+
+	spin_lock_irq(&port_user_lock);
+
+	free_page((unsigned long)u->ring);
+
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		if (port_user[i] != u)
+			continue;
+
+		evtchn_unbind_from_user(port_user[i], i);
+	}
+
+	spin_unlock_irq(&port_user_lock);
+
+	kfree(u->name);
+	kfree(u);
+
+	return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+	.owner   = THIS_MODULE,
+	.read    = evtchn_read,
+	.write   = evtchn_write,
+	.unlocked_ioctl = evtchn_ioctl,
+	.poll    = evtchn_poll,
+	.fasync  = evtchn_fasync,
+	.open    = evtchn_open,
+	.release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "evtchn",
+	.fops         = &evtchn_fops,
+};
+static int __init evtchn_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	spin_lock_init(&port_user_lock);
+	memset(port_user, 0, sizeof(port_user));
+
+	/* Create '/dev/misc/evtchn'. */
+	err = misc_register(&evtchn_miscdev);
+	if (err != 0) {
+		printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+		return err;
+	}
+
+	printk(KERN_INFO "Event-channel device installed.\n");
+
+	return 0;
+}
+
+static void __exit evtchn_cleanup(void)
+{
+	misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h
new file mode 100644
index 0000000..14e833ee4
--- /dev/null
+++ b/include/xen/evtchn.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ				\
+	_IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+	unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN			\
+	_IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+	unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT			\
+	_IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+	unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND				\
+	_IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+	unsigned int port;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY				\
+	_IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+	unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET				\
+	_IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
-- 
cgit v0.10.2


From c5cfef0f79cacc3aa438fc28f4747f0d10c54d0d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:21:19 -0800
Subject: xen: export ioctl headers to userspace

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/include/Kbuild b/include/Kbuild
index d8c3e3c..fe36acc 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -8,3 +8,4 @@ header-y += mtd/
 header-y += rdma/
 header-y += video/
 header-y += drm/
+header-y += xen/
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
new file mode 100644
index 0000000..4e65c16
--- /dev/null
+++ b/include/xen/Kbuild
@@ -0,0 +1 @@
+header-y += evtchn.h
-- 
cgit v0.10.2


From 0a4666b539a0e896ec4e8396a034a479e3573125 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 12 Feb 2009 13:03:24 -0800
Subject: xen/dev-evtchn: clean up locking in evtchn

Define a new per_user_data mutex to serialize bind/unbind operations
to prevent them from racing with each other.  Fix error returns
and don't do a bind while holding a spinlock.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 517b9ee..af03195 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -54,6 +54,8 @@
 #include <asm/xen/hypervisor.h>
 
 struct per_user_data {
+	struct mutex bind_mutex; /* serialize bind/unbind operations */
+
 	/* Notification ring, accessed via /dev/xen/evtchn. */
 #define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
 #define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
@@ -69,7 +71,7 @@ struct per_user_data {
 
 /* Who's bound to each port? */
 static struct per_user_data *port_user[NR_EVENT_CHANNELS];
-static DEFINE_SPINLOCK(port_user_lock);
+static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
 
 irqreturn_t evtchn_interrupt(int irq, void *data)
 {
@@ -210,22 +212,24 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 
 static int evtchn_bind_to_user(struct per_user_data *u, int port)
 {
-	int irq;
 	int rc = 0;
 
-	spin_lock_irq(&port_user_lock);
-
+	/*
+	 * Ports are never reused, so every caller should pass in a
+	 * unique port.
+	 *
+	 * (Locking not necessary because we haven't registered the
+	 * interrupt handler yet, and our caller has already
+	 * serialized bind operations.)
+	 */
 	BUG_ON(port_user[port] != NULL);
-
-	irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
-					u->name, (void *)(unsigned long)port);
-	if (rc < 0)
-		goto fail;
-
 	port_user[port] = u;
 
-fail:
-	spin_unlock_irq(&port_user_lock);
+	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+				       u->name, (void *)(unsigned long)port);
+	if (rc >= 0)
+		rc = 0;
+
 	return rc;
 }
 
@@ -234,6 +238,10 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
 	int irq = irq_from_evtchn(port);
 
 	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+	/* make sure we unbind the irq handler before clearing the port */
+	barrier();
+
 	port_user[port] = NULL;
 }
 
@@ -244,6 +252,9 @@ static long evtchn_ioctl(struct file *file,
 	struct per_user_data *u = file->private_data;
 	void __user *uarg = (void __user *) arg;
 
+	/* Prevent bind from racing with unbind */
+	mutex_lock(&u->bind_mutex);
+
 	switch (cmd) {
 	case IOCTL_EVTCHN_BIND_VIRQ: {
 		struct ioctl_evtchn_bind_virq bind;
@@ -368,6 +379,7 @@ static long evtchn_ioctl(struct file *file,
 		rc = -ENOSYS;
 		break;
 	}
+	mutex_unlock(&u->bind_mutex);
 
 	return rc;
 }
@@ -414,6 +426,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
 		return -ENOMEM;
 	}
 
+	mutex_init(&u->bind_mutex);
 	mutex_init(&u->ring_cons_mutex);
 
 	filp->private_data = u;
-- 
cgit v0.10.2


From a1ce1be578365a4da7e7d7db4812539d2d5da763 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Mon, 9 Feb 2009 12:05:50 -0800
Subject: xen: remove suspend_cancel hook

Remove suspend_cancel hook from xenbus_driver, in preparation for using
the device model for suspending.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 773d1cf..bd20361 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -689,27 +689,6 @@ static int suspend_dev(struct device *dev, void *data)
 	return 0;
 }
 
-static int suspend_cancel_dev(struct device *dev, void *data)
-{
-	int err = 0;
-	struct xenbus_driver *drv;
-	struct xenbus_device *xdev;
-
-	DPRINTK("");
-
-	if (dev->driver == NULL)
-		return 0;
-	drv = to_xenbus_driver(dev->driver);
-	xdev = container_of(dev, struct xenbus_device, dev);
-	if (drv->suspend_cancel)
-		err = drv->suspend_cancel(xdev);
-	if (err)
-		printk(KERN_WARNING
-		       "xenbus: suspend_cancel %s failed: %i\n",
-		       dev_name(dev), err);
-	return 0;
-}
-
 static int resume_dev(struct device *dev, void *data)
 {
 	int err;
@@ -777,8 +756,6 @@ EXPORT_SYMBOL_GPL(xenbus_resume);
 void xenbus_suspend_cancel(void)
 {
 	xs_suspend_cancel();
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
-	xenbus_backend_resume(suspend_cancel_dev);
 }
 EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
 
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index f87f961..0836772 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -92,7 +92,6 @@ struct xenbus_driver {
 				 enum xenbus_state backend_state);
 	int (*remove)(struct xenbus_device *dev);
 	int (*suspend)(struct xenbus_device *dev);
-	int (*suspend_cancel)(struct xenbus_device *dev);
 	int (*resume)(struct xenbus_device *dev);
 	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
 	struct device_driver driver;
-- 
cgit v0.10.2


From de5b31bd47de7e6f41be2e271318dbc8f1af354d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Mon, 9 Feb 2009 12:05:50 -0800
Subject: xen: use device model for suspending xenbus devices

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index b703dd2..5269bb4 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -104,9 +104,8 @@ static void do_suspend(void)
 		goto out;
 	}
 
-	printk("suspending xenbus...\n");
-	/* XXX use normal device tree? */
-	xenbus_suspend();
+	printk(KERN_DEBUG "suspending xenstore...\n");
+	xs_suspend();
 
 	err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
 	if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
 
 	if (!cancelled) {
 		xen_arch_resume();
-		xenbus_resume();
+		xs_resume();
 	} else
-		xenbus_suspend_cancel();
+		xs_suspend_cancel();
 
 	device_resume(PMSG_RESUME);
 
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index bd20361..4649213 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
 
 static void xenbus_dev_shutdown(struct device *_dev);
 
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+static int xenbus_dev_resume(struct device *dev);
+
 /* If something in array of ids matches this device, return it. */
 static const struct xenbus_device_id *
 match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
 		.remove    = xenbus_dev_remove,
 		.shutdown  = xenbus_dev_shutdown,
 		.dev_attrs = xenbus_dev_attrs,
+
+		.suspend   = xenbus_dev_suspend,
+		.resume    = xenbus_dev_resume,
 	},
 };
 
@@ -669,7 +675,7 @@ static struct xenbus_watch fe_watch = {
 	.callback = frontend_changed,
 };
 
-static int suspend_dev(struct device *dev, void *data)
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
 {
 	int err = 0;
 	struct xenbus_driver *drv;
@@ -682,14 +688,14 @@ static int suspend_dev(struct device *dev, void *data)
 	drv = to_xenbus_driver(dev->driver);
 	xdev = container_of(dev, struct xenbus_device, dev);
 	if (drv->suspend)
-		err = drv->suspend(xdev);
+		err = drv->suspend(xdev, state);
 	if (err)
 		printk(KERN_WARNING
 		       "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
 	return 0;
 }
 
-static int resume_dev(struct device *dev, void *data)
+static int xenbus_dev_resume(struct device *dev)
 {
 	int err;
 	struct xenbus_driver *drv;
@@ -734,31 +740,6 @@ static int resume_dev(struct device *dev, void *data)
 	return 0;
 }
 
-void xenbus_suspend(void)
-{
-	DPRINTK("");
-
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
-	xenbus_backend_suspend(suspend_dev);
-	xs_suspend();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend);
-
-void xenbus_resume(void)
-{
-	xb_init_comms();
-	xs_resume();
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
-	xenbus_backend_resume(resume_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_resume);
-
-void xenbus_suspend_cancel(void)
-{
-	xs_suspend_cancel();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
-
 /* A flag to determine if xenstored is 'ready' (i.e. has started) */
 int xenstored_ready = 0;
 
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index e325eab..eab33f1 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -673,6 +673,8 @@ void xs_resume(void)
 	struct xenbus_watch *watch;
 	char token[sizeof(watch) * 2 + 1];
 
+	xb_init_comms();
+
 	mutex_unlock(&xs_state.response_mutex);
 	mutex_unlock(&xs_state.request_mutex);
 	up_write(&xs_state.transaction_mutex);
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 0836772..b9763ba 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,7 +91,7 @@ struct xenbus_driver {
 	void (*otherend_changed)(struct xenbus_device *dev,
 				 enum xenbus_state backend_state);
 	int (*remove)(struct xenbus_device *dev);
-	int (*suspend)(struct xenbus_device *dev);
+	int (*suspend)(struct xenbus_device *dev, pm_message_t state);
 	int (*resume)(struct xenbus_device *dev);
 	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
 	struct device_driver driver;
-- 
cgit v0.10.2


From c6a960ce8858f20036cc3afc3b9422670d0d9021 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 9 Feb 2009 12:05:53 -0800
Subject: xen/xenbus: export xenbus_dev_changed

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 4649213..d42e25d 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -660,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
 
 	kfree(root);
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
 
 static void frontend_changed(struct xenbus_watch *watch,
 			     const char **vec, unsigned int len)
-- 
cgit v0.10.2


From cff7e81b3dd7c25cd2248cd7a04c5764552d5d55 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 10 Mar 2009 14:39:59 -0700
Subject: xen: add /sys/hypervisor support

Adds support for Xen info under /sys/hypervisor.  Taken from Novell 2.6.27
backport tree.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 526187c..88bca1c 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -41,3 +41,13 @@ config XEN_COMPAT_XENFS
          a xen platform.
          If in doubt, say yes.
 
+config XEN_SYS_HYPERVISOR
+       bool "Create xen entries under /sys/hypervisor"
+       depends on XEN && SYSFS
+       select SYS_HYPERVISOR
+       default y
+       help
+         Create entries under /sys/hypervisor describing the Xen
+	 hypervisor environment.  When running native or in another
+	 virtual environment, /sys/hypervisor will still be present,
+	 but will have no xen contents.
\ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc..f3603a3 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,5 @@ obj-y	+= xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XENFS)		+= xenfs/
\ No newline at end of file
+obj-$(CONFIG_XENFS)		+= xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 0000000..cb29d1c
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,475 @@
+/*
+ *  copyright (c) 2006 IBM Corporation
+ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+	ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+	void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	if (version)
+		return sprintf(buffer, "%d\n", version >> 16);
+	return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	if (version)
+		return sprintf(buffer, "%d\n", version & 0xff);
+	return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *extra;
+
+	extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+	if (extra) {
+		ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", extra);
+		kfree(extra);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+	&major_attr.attr,
+	&minor_attr.attr,
+	&extra_attr.attr,
+	NULL
+};
+
+static struct attribute_group version_group = {
+	.name = "version",
+	.attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	char *vm, *val;
+	int ret;
+	extern int xenstored_ready;
+
+	if (!xenstored_ready)
+		return -EBUSY;
+
+	vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+	if (IS_ERR(vm))
+		return PTR_ERR(vm);
+	val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+	kfree(vm);
+	if (IS_ERR(val))
+		return PTR_ERR(val);
+	ret = sprintf(buffer, "%s\n", val);
+	kfree(val);
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compiler);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compile_by);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compile_date);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+	&compiler_attr.attr,
+	&compiled_by_attr.attr,
+	&compile_date_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+	.name = "compilation",
+	.attrs = xen_compile_attrs,
+};
+
+int __init static xen_compilation_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *caps;
+
+	caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+	if (caps) {
+		ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", caps);
+		kfree(caps);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *cset;
+
+	cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+	if (cset) {
+		ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", cset);
+		kfree(cset);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_platform_parameters *parms;
+
+	parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+	if (parms) {
+		ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+					     parms);
+		if (!ret)
+			ret = sprintf(buffer, "%lx\n", parms->virt_start);
+		kfree(parms);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret;
+
+	ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+	if (ret > 0)
+		ret = sprintf(buffer, "%x\n", ret);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+/* eventually there will be several more features to export */
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_feature_info *info;
+
+	info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
+	if (info) {
+		info->submap_idx = index;
+		ret = HYPERVISOR_xen_version(XENVER_get_features, info);
+		if (!ret)
+			ret = sprintf(buffer, "%d\n", info->submap);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+}
+
+HYPERVISOR_ATTR_RO(writable_pt);
+
+static struct attribute *xen_properties_attrs[] = {
+	&capabilities_attr.attr,
+	&changeset_attr.attr,
+	&virtual_start_attr.attr,
+	&pagesize_attr.attr,
+	&writable_pt_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_properties_group = {
+	.name = "properties",
+	.attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+#ifdef CONFIG_KEXEC
+
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
+{
+	return sprintf(page, "%lx %zx\n",
+		paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_sysfs_vmcoreinfo_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj,
+				 &vmcoreinfo_attr.attr);
+}
+
+static void xen_sysfs_vmcoreinfo_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+#endif
+
+static int __init hyper_sysfs_init(void)
+{
+	int ret;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	ret = xen_sysfs_type_init();
+	if (ret)
+		goto out;
+	ret = xen_sysfs_version_init();
+	if (ret)
+		goto version_out;
+	ret = xen_compilation_init();
+	if (ret)
+		goto comp_out;
+	ret = xen_sysfs_uuid_init();
+	if (ret)
+		goto uuid_out;
+	ret = xen_properties_init();
+	if (ret)
+		goto prop_out;
+#ifdef CONFIG_KEXEC
+	if (vmcoreinfo_size_xen != 0) {
+		ret = xen_sysfs_vmcoreinfo_init();
+		if (ret)
+			goto vmcoreinfo_out;
+	}
+#endif
+
+	goto out;
+
+#ifdef CONFIG_KEXEC
+vmcoreinfo_out:
+#endif
+	xen_properties_destroy();
+prop_out:
+	xen_sysfs_uuid_destroy();
+uuid_out:
+	xen_compilation_destroy();
+comp_out:
+	xen_sysfs_version_destroy();
+version_out:
+	xen_sysfs_type_destroy();
+out:
+	return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+#ifdef CONFIG_KEXEC
+	if (vmcoreinfo_size_xen != 0)
+		xen_sysfs_vmcoreinfo_destroy();
+#endif
+	xen_properties_destroy();
+	xen_compilation_destroy();
+	xen_sysfs_uuid_destroy();
+	xen_sysfs_version_destroy();
+	xen_sysfs_type_destroy();
+
+}
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buffer)
+{
+	struct hyp_sysfs_attr *hyp_attr;
+	hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+	if (hyp_attr->show)
+		return hyp_attr->show(hyp_attr, buffer);
+	return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buffer,
+			       size_t len)
+{
+	struct hyp_sysfs_attr *hyp_attr;
+	hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+	if (hyp_attr->store)
+		return hyp_attr->store(hyp_attr, buffer, len);
+	return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+	.show = hyp_sysfs_show,
+	.store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+	.sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+	return 0;
+}
+device_initcall(hypervisor_subsys_init);
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
index 453235e..e8b6519 100644
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -57,4 +57,7 @@ struct xen_feature_info {
 /* Declares the features reported by XENVER_get_features. */
 #include "features.h"
 
+/* arg == NULL; returns host memory page size. */
+#define XENVER_pagesize 7
+
 #endif /* __XEN_PUBLIC_VERSION_H__ */
-- 
cgit v0.10.2


From a649b720614d5675dc402bef75a92576143fede7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 10 Mar 2009 17:17:41 -0700
Subject: xen/sys/hypervisor: change writable_pt to features

/sys/hypervisor/properties/writable_pt was misnamed.  Rename to features,
expressed as a bit array in hex.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index cb29d1c..1267d6f 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -293,37 +293,48 @@ static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
 
 HYPERVISOR_ATTR_RO(pagesize);
 
-/* eventually there will be several more features to export */
 static ssize_t xen_feature_show(int index, char *buffer)
 {
-	int ret = -ENOMEM;
-	struct xen_feature_info *info;
+	ssize_t ret;
+	struct xen_feature_info info;
 
-	info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
-	if (info) {
-		info->submap_idx = index;
-		ret = HYPERVISOR_xen_version(XENVER_get_features, info);
-		if (!ret)
-			ret = sprintf(buffer, "%d\n", info->submap);
-		kfree(info);
-	}
+	info.submap_idx = index;
+	ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+	if (!ret)
+		ret = sprintf(buffer, "%08x", info.submap);
 
 	return ret;
 }
 
-static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
 {
-	return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+	ssize_t len;
+	int i;
+
+	len = 0;
+	for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
+		int ret = xen_feature_show(i, buffer + len);
+		if (ret < 0) {
+			if (len == 0)
+				len = ret;
+			break;
+		}
+		len += ret;
+	}
+	if (len > 0)
+		buffer[len++] = '\n';
+
+	return len;
 }
 
-HYPERVISOR_ATTR_RO(writable_pt);
+HYPERVISOR_ATTR_RO(features);
 
 static struct attribute *xen_properties_attrs[] = {
 	&capabilities_attr.attr,
 	&changeset_attr.attr,
 	&virtual_start_attr.attr,
 	&pagesize_attr.attr,
-	&writable_pt_attr.attr,
+	&features_attr.attr,
 	NULL
 };
 
-- 
cgit v0.10.2


From f0783708bf63a2827863cf2be57c08a24843e6bd Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 11 Mar 2009 10:19:54 +0000
Subject: xen: drop kexec bits from /sys/hypervisor since kexec isn't
 implemented yet

I needed this to compile since there is no kexec yet in pvops kernel
  CC      drivers/xen/sys-hypervisor.o
drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_init':
drivers/xen/sys-hypervisor.c:405: error: 'vmcoreinfo_size_xen' undeclared (first use in this function)
drivers/xen/sys-hypervisor.c:405: error: (Each undeclared identifier is reported only once
drivers/xen/sys-hypervisor.c:405: error: for each function it appears in.)
drivers/xen/sys-hypervisor.c:406: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_init'
drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_exit':
drivers/xen/sys-hypervisor.c:433: error: 'vmcoreinfo_size_xen' undeclared (first use in this function)
drivers/xen/sys-hypervisor.c:434: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_destroy'

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 1267d6f..88a60e0 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -353,32 +353,6 @@ static void xen_properties_destroy(void)
 	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
-#ifdef CONFIG_KEXEC
-
-extern size_t vmcoreinfo_size_xen;
-extern unsigned long paddr_vmcoreinfo_xen;
-
-static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
-{
-	return sprintf(page, "%lx %zx\n",
-		paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
-}
-
-HYPERVISOR_ATTR_RO(vmcoreinfo);
-
-static int __init xen_sysfs_vmcoreinfo_init(void)
-{
-	return sysfs_create_file(hypervisor_kobj,
-				 &vmcoreinfo_attr.attr);
-}
-
-static void xen_sysfs_vmcoreinfo_destroy(void)
-{
-	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
-}
-
-#endif
-
 static int __init hyper_sysfs_init(void)
 {
 	int ret;
@@ -401,20 +375,9 @@ static int __init hyper_sysfs_init(void)
 	ret = xen_properties_init();
 	if (ret)
 		goto prop_out;
-#ifdef CONFIG_KEXEC
-	if (vmcoreinfo_size_xen != 0) {
-		ret = xen_sysfs_vmcoreinfo_init();
-		if (ret)
-			goto vmcoreinfo_out;
-	}
-#endif
 
 	goto out;
 
-#ifdef CONFIG_KEXEC
-vmcoreinfo_out:
-#endif
-	xen_properties_destroy();
 prop_out:
 	xen_sysfs_uuid_destroy();
 uuid_out:
@@ -429,10 +392,6 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
-#ifdef CONFIG_KEXEC
-	if (vmcoreinfo_size_xen != 0)
-		xen_sysfs_vmcoreinfo_destroy();
-#endif
 	xen_properties_destroy();
 	xen_compilation_destroy();
 	xen_sysfs_uuid_destroy();
-- 
cgit v0.10.2


From 818fd20673df82031e604bb784d836f1fc2e2451 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 6 Feb 2009 18:46:47 -0800
Subject: xen: add "capabilities" file

The xenfs capabilities file allows usermode to determine what
capabilities the domain has.  The only one at present is "control_d"
in a privileged domain.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 515741a..6559e0c 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -20,10 +20,27 @@
 MODULE_DESCRIPTION("Xen filesystem");
 MODULE_LICENSE("GPL");
 
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+				 size_t size, loff_t *off)
+{
+	char *tmp = "";
+
+	if (xen_initial_domain())
+		tmp = "control_d\n";
+
+	return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
+}
+
+static const struct file_operations capabilities_file_ops = {
+	.read = capabilities_read,
+};
+
 static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr xenfs_files[] = {
-		[2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+		[1] = {},
+		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+		{ "capabilities", &capabilities_file_ops, S_IRUGO },
 		{""},
 	};
 
-- 
cgit v0.10.2


From 8a6f83afd0c5355db6d11394a798e94950306239 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Wed, 1 Apr 2009 10:07:57 +0900
Subject: Permissive domain in userspace object manager

This patch enables applications to handle permissive domain correctly.

Since the v2.6.26 kernel, SELinux has supported an idea of permissive
domain which allows certain processes to work as if permissive mode,
even if the global setting is enforcing mode.
However, we don't have an application program interface to inform
what domains are permissive one, and what domains are not.
It means applications focuses on SELinux (XACE/SELinux, SE-PostgreSQL
and so on) cannot handle permissive domain correctly.

This patch add the sixth field (flags) on the reply of the /selinux/access
interface which is used to make an access control decision from userspace.
If the first bit of the flags field is positive, it means the required
access control decision is on permissive domain, so application should
allow any required actions, as the kernel doing.

This patch also has a side benefit. The av_decision.flags is set at
context_struct_compute_av(). It enables to check required permissions
without read_lock(&policy_rwlock).

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Eric Paris <eparis@redhat.com>
--
 security/selinux/avc.c              |    2 +-
 security/selinux/include/security.h |    4 +++-
 security/selinux/selinuxfs.c        |    4 ++--
 security/selinux/ss/services.c      |   30 +++++-------------------------
 4 files changed, 11 insertions(+), 29 deletions(-)
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 7f9b5fa..b2ab608 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -927,7 +927,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid,
 	if (denied) {
 		if (flags & AVC_STRICT)
 			rc = -EACCES;
-		else if (!selinux_enforcing || security_permissive_sid(ssid))
+		else if (!selinux_enforcing || (avd->flags & AVD_FLAGS_PERMISSIVE))
 			avc_update_node(AVC_CALLBACK_GRANT, requested, ssid,
 					tsid, tclass, avd->seqno);
 		else
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 5c3434f..a7be3f0 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -91,9 +91,11 @@ struct av_decision {
 	u32 auditallow;
 	u32 auditdeny;
 	u32 seqno;
+	u32 flags;
 };
 
-int security_permissive_sid(u32 sid);
+/* definitions of av_decision.flags */
+#define AVD_FLAGS_PERMISSIVE	0x0001
 
 int security_compute_av(u32 ssid, u32 tsid,
 	u16 tclass, u32 requested,
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 2d5136e..8d4007f 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -527,10 +527,10 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size)
 		goto out2;
 
 	length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT,
-			  "%x %x %x %x %u",
+			  "%x %x %x %x %u %x",
 			  avd.allowed, 0xffffffff,
 			  avd.auditallow, avd.auditdeny,
-			  avd.seqno);
+			  avd.seqno, avd.flags);
 out2:
 	kfree(tcon);
 out:
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index deeec6c..500e6f7 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -410,6 +410,7 @@ static int context_struct_compute_av(struct context *scontext,
 	avd->auditallow = 0;
 	avd->auditdeny = 0xffffffff;
 	avd->seqno = latest_granting;
+	avd->flags = 0;
 
 	/*
 	 * Check for all the invalid cases.
@@ -528,31 +529,6 @@ inval_class:
 	return 0;
 }
 
-/*
- * Given a sid find if the type has the permissive flag set
- */
-int security_permissive_sid(u32 sid)
-{
-	struct context *context;
-	u32 type;
-	int rc;
-
-	read_lock(&policy_rwlock);
-
-	context = sidtab_search(&sidtab, sid);
-	BUG_ON(!context);
-
-	type = context->type;
-	/*
-	 * we are intentionally using type here, not type-1, the 0th bit may
-	 * someday indicate that we are globally setting permissive in policy.
-	 */
-	rc = ebitmap_get_bit(&policydb.permissive_map, type);
-
-	read_unlock(&policy_rwlock);
-	return rc;
-}
-
 static int security_validtrans_handle_fail(struct context *ocontext,
 					   struct context *ncontext,
 					   struct context *tcontext,
@@ -767,6 +743,10 @@ int security_compute_av(u32 ssid,
 
 	rc = context_struct_compute_av(scontext, tcontext, tclass,
 				       requested, avd);
+
+	/* permissive domain? */
+	if (ebitmap_get_bit(&policydb.permissive_map, scontext->type))
+	    avd->flags |= AVD_FLAGS_PERMISSIVE;
 out:
 	read_unlock(&policy_rwlock);
 	return rc;
-- 
cgit v0.10.2


From 53152f957d4a5dfd537d17c823afeb1a2c03753e Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Thu, 2 Apr 2009 13:24:28 +0100
Subject: xen: honour VCPU availability on boot

If a VM is booted with offline VCPUs then unplug them during boot. Determining
the availability of a VCPU requires access to XenStore which is not available
at the point smp_prepare_cpus() is called, therefore we bring up all VCPUS
initially and unplug the offline ones as soon as XenStore becomes available.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 974f56d..411cb1f 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -21,29 +21,41 @@ static void disable_hotplug_cpu(int cpu)
 	cpu_clear(cpu, cpu_present_map);
 }
 
-static void vcpu_hotplug(unsigned int cpu)
+static int vcpu_online(unsigned int cpu)
 {
 	int err;
 	char dir[32], state[32];
 
-	if (!cpu_possible(cpu))
-		return;
-
 	sprintf(dir, "cpu/%u", cpu);
 	err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
 	if (err != 1) {
 		printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
-		return;
+		return err;
 	}
 
-	if (strcmp(state, "online") == 0) {
+	if (strcmp(state, "online") == 0)
+		return 1;
+	else if (strcmp(state, "offline") == 0)
+		return 0;
+
+	printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu);
+	return -EINVAL;
+}
+static void vcpu_hotplug(unsigned int cpu)
+{
+	if (!cpu_possible(cpu))
+		return;
+
+	switch (vcpu_online(cpu)) {
+	case 1:
 		enable_hotplug_cpu(cpu);
-	} else if (strcmp(state, "offline") == 0) {
+		break;
+	case 0:
 		(void)cpu_down(cpu);
 		disable_hotplug_cpu(cpu);
-	} else {
-		printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
-		       state, cpu);
+		break;
+	default:
+		break;
 	}
 }
 
@@ -64,12 +76,20 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
 static int setup_cpu_watcher(struct notifier_block *notifier,
 			      unsigned long event, void *data)
 {
+	int cpu;
 	static struct xenbus_watch cpu_watch = {
 		.node = "cpu",
 		.callback = handle_vcpu_hotplug_event};
 
 	(void)register_xenbus_watch(&cpu_watch);
 
+	for_each_possible_cpu(cpu) {
+		if (vcpu_online(cpu) == 0) {
+			(void)cpu_down(cpu);
+			cpu_clear(cpu, cpu_present_map);
+		}
+	}
+
 	return NOTIFY_DONE;
 }
 
-- 
cgit v0.10.2


From 3d43321b7015387cfebbe26436d0e9d299162ea1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@ubuntu.com>
Date: Thu, 2 Apr 2009 15:49:29 -0700
Subject: modules: sysctl to block module loading

Implement a sysctl file that disables module-loading system-wide since
there is no longer a viable way to remove CAP_SYS_MODULE after the system
bounding capability set was removed in 2.6.25.

Value can only be set to "1", and is tested only if standard capability
checks allow CAP_SYS_MODULE.  Given existing /dev/mem protections, this
should allow administrators a one-way method to block module loading
after initial boot-time module loading has finished.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a4ccdd1..02b1349 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -30,6 +30,7 @@ show up in /proc/sys/kernel:
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
+- modules_disabled
 - msgmax
 - msgmnb
 - msgmni
@@ -179,6 +180,16 @@ kernel stack.
 
 ==============================================================
 
+modules_disabled:
+
+A toggle value indicating if modules are allowed to be loaded
+in an otherwise modular kernel.  This toggle defaults to off
+(0), but can be set true (1).  Once true, modules can be
+neither loaded nor unloaded, and the toggle cannot be set back
+to false.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
diff --git a/kernel/module.c b/kernel/module.c
index f77ac32..eeb3f7b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -778,6 +778,9 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
@@ -785,7 +788,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	char name[MODULE_NAME_LEN];
 	int ret, forced = 0;
 
-	if (!capable(CAP_SYS_MODULE))
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
 		return -EPERM;
 
 	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -2349,7 +2352,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	int ret = 0;
 
 	/* Must have permission */
-	if (!capable(CAP_SYS_MODULE))
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
 		return -EPERM;
 
 	/* Only one module load at a time, please */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44f..2fb4246 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -113,6 +113,7 @@ static int ngroups_max = NGROUPS_MAX;
 
 #ifdef CONFIG_MODULES
 extern char modprobe_path[];
+extern int modules_disabled;
 #endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
@@ -533,6 +534,17 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "modules_disabled",
+		.data		= &modules_disabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	{
-- 
cgit v0.10.2


From b5f22a59c0356655a501190959db9f7f5dd07e3f Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 2 Apr 2009 18:47:14 -0500
Subject: don't raise all privs on setuid-root file with fE set (v2)

Distributions face a backward compatibility problem with starting to use
file capabilities.  For instance, removing setuid root from ping and
doing setcap cap_net_raw=pe means that booting with an older kernel
or one compiled without file capabilities means ping won't work for
non-root users.

In order to replace the setuid root bit on a capability-unaware
program, one has to set the effective, or legacy, file capability,
which makes the capability effective immediately.  This patch
uses the legacy bit as a queue to not automatically add full
privilege to a setuid-root program.

So, with this patch, an ordinary setuid-root program will run with
privilege.  But if /bin/ping has both setuid-root and cap_net_raw in
fP and fE, then ping (when run by non-root user) will not run
with only cap_net_raw.

Changelog:
	Apr 2 2009: Print a message once when such a binary is loaded,
		as per James Morris' suggestion.
	Apr 2 2009: Fix the condition to only catch uid!=0 && euid==0.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/commoncap.c b/security/commoncap.c
index 7cd61a5..97ac1f1 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -28,6 +28,28 @@
 #include <linux/prctl.h>
 #include <linux/securebits.h>
 
+/*
+ * If a non-root user executes a setuid-root binary in
+ * !secure(SECURE_NOROOT) mode, then we raise capabilities.
+ * However if fE is also set, then the intent is for only
+ * the file capabilities to be applied, and the setuid-root
+ * bit is left on either to change the uid (plausible) or
+ * to get full privilege on a kernel without file capabilities
+ * support.  So in that case we do not raise capabilities.
+ *
+ * Warn if that happens, once per boot.
+ */
+static void warn_setuid_and_fcaps_mixed(char *fname)
+{
+	static int warned;
+	if (!warned) {
+		printk(KERN_INFO "warning: `%s' has both setuid-root and"
+			" effective capabilities. Therefore not raising all"
+			" capabilities.\n", fname);
+		warned = 1;
+	}
+}
+
 int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
 	NETLINK_CB(skb).eff_cap = current_cap();
@@ -464,6 +486,15 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 
 	if (!issecure(SECURE_NOROOT)) {
 		/*
+		 * If the legacy file capability is set, then don't set privs
+		 * for a setuid root binary run by a non-root user.  Do set it
+		 * for a root user just to cause least surprise to an admin.
+		 */
+		if (effective && new->uid != 0 && new->euid == 0) {
+			warn_setuid_and_fcaps_mixed(bprm->filename);
+			goto skip;
+		}
+		/*
 		 * To support inheritance of root-permissions and suid-root
 		 * executables under compatibility mode, we override the
 		 * capability sets for the file.
@@ -478,6 +509,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 		if (new->euid == 0)
 			effective = true;
 	}
+skip:
 
 	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
 	 * credentials unless they have the appropriate permit
-- 
cgit v0.10.2


From 31c9a24ec82926fcae49483e53566d231e705057 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 21:06:25 -0700
Subject: RCU: make treercu be default

Impact: switch default config from CLASSIC_RCU to TREE_RCU

Given that I have not gotten any complaints or bug reports on treercu
recently, this patch makes it be the default.  There are a number of
other defconfig files that explicitly call out CLASSIC_RCU, but which
have comment headers saying not to edit them.  Probably holdovers from
one of the flavors of "make config", but...

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: niv@us.ibm.com
Cc: manfred@colorfullife.com
Cc: peterz@infradead.org
LKML-Reference: <20090403040625.GA9473@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/init/Kconfig b/init/Kconfig
index 14c483d..26ac877 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -302,7 +302,7 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default CLASSIC_RCU
+	default TREE_RCU
 
 config CLASSIC_RCU
 	bool "Classic RCU"
-- 
cgit v0.10.2


From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:28 +0100
Subject: perf_counter: x86: fix 32-bit irq_period assumption

No need to assume the irq_period is 32bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 155bc3c..1cedc34 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s32 period = hwc->irq_period;
+	s64 period = hwc->irq_period;
 	int err;
 
 	/*
-- 
cgit v0.10.2


From 755642322aa66fbc5421a35fd3e1733f73e20083 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:29 +0100
Subject: perf_counter: use list_move_tail()

Instead of del/add use a move list-op.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b2e8389..0fe22c9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -89,8 +89,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	list_for_each_entry_safe(sibling, tmp,
 				 &counter->sibling_list, list_entry) {
 
-		list_del_init(&sibling->list_entry);
-		list_add_tail(&sibling->list_entry, &ctx->counter_list);
+		list_move_tail(&sibling->list_entry, &ctx->counter_list);
 		sibling->group_leader = sibling;
 	}
 }
@@ -959,8 +958,7 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 	 */
 	perf_flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		list_del(&counter->list_entry);
-		list_add_tail(&counter->list_entry, &ctx->counter_list);
+		list_move_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
 	hw_perf_restore(perf_flags);
-- 
cgit v0.10.2


From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:30 +0100
Subject: perf_counter: add comment to barrier

We need to ensure the enabled=0 write happens before we
start disabling the actual counters, so that a pcm_amd_enable()
will not enable one underneath us.

I think the race is impossible anyway, we always balance the
ops within any one context and perform enable() with IRQs disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1cedc34..a2e3b76 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void)
 
 	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
+	/*
+	 * ensure we write the disable before we start disabling the
+	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 */
 	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-- 
cgit v0.10.2


From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:31 +0100
Subject: perf_counter: x86: use ULL postfix for 64bit constants

Fix a build warning on 32bit machines by explicitly marking the
constants as 64-bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a2e3b76..22dab06 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event)
 
 static u64 pmc_intel_raw_event(u64 event)
 {
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
-#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
@@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event)
 
 static u64 pmc_amd_raw_event(u64 event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
-#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
-- 
cgit v0.10.2


From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:32 +0100
Subject: perf_counter: software counter event infrastructure

Provide generic software counter infrastructure that supports
software events.

This will be used to allow sample based profiling based on software
events such as pagefaults. The current infrastructure can only
provide a count of such events, no place information.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index dde5645..3fefc3b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -126,6 +126,7 @@ struct hw_perf_counter {
 	unsigned long			counter_base;
 	int				nmi;
 	unsigned int			idx;
+	atomic64_t			count; /* software */
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
@@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
+extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
-static inline void hw_perf_restore(u64 ctrl)			{ }
+static inline void hw_perf_restore(u64 ctrl)				{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
+
+static inline void perf_swcounter_event(enum hw_event_types event, u64 nr,
+					int nmi, struct pt_regs *regs)	{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0fe22c9..eeb1b46 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = {
 	.compat_ioctl		= perf_ioctl,
 };
 
+/*
+ * Generic software counter infrastructure
+ */
+
+static void perf_swcounter_update(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 prev, now;
+	s64 delta;
+
+again:
+	prev = atomic64_read(&hwc->prev_count);
+	now = atomic64_read(&hwc->count);
+	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
+		goto again;
+
+	delta = now - prev;
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
+static void perf_swcounter_set_period(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->irq_period;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_add(period, &hwc->period_left);
+	}
+
+	atomic64_set(&hwc->prev_count, -left);
+	atomic64_set(&hwc->count, -left);
+}
+
+static void perf_swcounter_save_and_restart(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+	perf_swcounter_set_period(counter);
+}
+
+static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_swcounter_handle_group(struct perf_counter *sibling)
+{
+	struct perf_counter *counter, *group_leader = sibling->group_leader;
+
+	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+		perf_swcounter_update(counter);
+		perf_swcounter_store_irq(sibling, counter->hw_event.type);
+		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
+	}
+}
+
+static void perf_swcounter_interrupt(struct perf_counter *counter,
+				     int nmi, struct pt_regs *regs)
+{
+	perf_swcounter_save_and_restart(counter);
+
+	switch (counter->hw_event.record_type) {
+	case PERF_RECORD_SIMPLE:
+		break;
+
+	case PERF_RECORD_IRQ:
+		perf_swcounter_store_irq(counter, instruction_pointer(regs));
+		break;
+
+	case PERF_RECORD_GROUP:
+		perf_swcounter_handle_group(counter);
+		break;
+	}
+
+	if (nmi) {
+		counter->wakeup_pending = 1;
+		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+	} else
+		wake_up(&counter->waitq);
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+				enum hw_event_types event,
+				struct pt_regs *regs)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return 0;
+
+	if (counter->hw_event.raw)
+		return 0;
+
+	if (counter->hw_event.type != event)
+		return 0;
+
+	if (counter->hw_event.exclude_user && user_mode(regs))
+		return 0;
+
+	if (counter->hw_event.exclude_kernel && !user_mode(regs))
+		return 0;
+
+	return 1;
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+				     enum hw_event_types event, u64 nr,
+				     int nmi, struct pt_regs *regs)
+{
+	struct perf_counter *counter;
+	unsigned long flags;
+	int neg;
+
+	if (list_empty(&ctx->counter_list))
+		return;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	/*
+	 * XXX: make counter_list RCU safe
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (perf_swcounter_match(counter, event, regs)) {
+			neg = atomic64_add_negative(nr, &counter->hw.count);
+			if (counter->hw.irq_period && !neg)
+				perf_swcounter_interrupt(counter, nmi, regs);
+		}
+	}
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+void perf_swcounter_event(enum hw_event_types event, u64 nr,
+			  int nmi, struct pt_regs *regs)
+{
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+
+	perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
+	if (cpuctx->task_ctx)
+		perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
+
+	put_cpu_var(perf_cpu_context);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+	perf_swcounter_set_period(counter);
+	return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
@@ -1365,6 +1544,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 };
 
 /*
+ * Software counter: task time clock
+ */
+
+/*
  * Called from within the scheduler:
  */
 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
@@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
+/*
+ * Software counter: page faults
+ */
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 #define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
 #else
@@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read		= page_faults_perf_counter_read,
 };
 
+/*
+ * Software counter: context switches
+ */
+
 static u64 get_context_switches(struct perf_counter *counter)
 {
 	struct task_struct *curr = counter->ctx->task;
@@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read		= context_switches_perf_counter_read,
 };
 
+/*
+ * Software counter: cpu migrations
+ */
+
 static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
 	struct task_struct *curr = counter->ctx->task;
@@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
+	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	const struct hw_perf_counter_ops *hw_ops = NULL;
+	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
 	 * Software counters (currently) can't in general distinguish
@@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter)
 	default:
 		break;
 	}
+
+	if (hw_ops)
+		hwc->irq_period = hw_event->irq_period;
+
 	return hw_ops;
 }
 
-- 
cgit v0.10.2


From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:33 +0100
Subject: perf_counter: provide pagefault software events

We use the generic software counter infrastructure to provide
page fault events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7699394..eda5b0c 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b727..c872575 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
 #include <linux/tty.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/perf_counter.h>
 
 #include <asm-generic/sections.h>
 
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index eeb1b46..1773c5d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1607,57 +1607,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
  * Software counter: page faults
  */
 
-#ifdef CONFIG_VM_EVENT_COUNTERS
-#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
-#else
-#define cpu_page_faults()	0
-#endif
-
-static u64 get_page_faults(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->maj_flt + curr->min_flt;
-	return cpu_page_faults();
-}
-
-static void page_faults_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void page_faults_perf_counter_read(struct perf_counter *counter)
-{
-	page_faults_perf_counter_update(counter);
-}
-
-static int page_faults_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
-	return 0;
-}
-
-static void page_faults_perf_counter_disable(struct perf_counter *counter)
-{
-	page_faults_perf_counter_update(counter);
-}
-
 static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.enable		= page_faults_perf_counter_enable,
-	.disable	= page_faults_perf_counter_disable,
-	.read		= page_faults_perf_counter_read,
+	.enable		= perf_swcounter_enable,
+	.disable	= perf_swcounter_disable,
+	.read		= perf_swcounter_read,
 };
 
 /*
-- 
cgit v0.10.2


From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:34 +0100
Subject: perf_counter: provide major/minor page fault software events

Provide separate sw counters for major and minor page faults.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index eda5b0c..17bbf6f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -312,6 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -319,8 +320,10 @@ good_area:
 			preempt_enable();
 		}
 #endif
-	} else
+	} else {
 		current->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 	up_read(&mm->mmap_sem);
 	return 0;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c872575..f2d3324 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1140,10 +1140,13 @@ good_area:
 		return;
 	}
 
-	if (fault & VM_FAULT_MAJOR)
+	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-	else
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+	} else {
 		tsk->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 
 	check_v8086_mode(regs, address, tsk);
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3fefc3b..4b14a8e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -49,8 +49,10 @@ enum hw_event_types {
 	PERF_COUNT_PAGE_FAULTS		= -3,
 	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 	PERF_COUNT_CPU_MIGRATIONS	= -5,
+	PERF_COUNT_PAGE_FAULTS_MIN	= -6,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
 
-	PERF_SW_EVENTS_MIN		= -6,
+	PERF_SW_EVENTS_MIN		= -8,
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1773c5d..68950a3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter)
 	perf_swcounter_update(counter);
 }
 
+static const struct hw_perf_counter_ops perf_ops_generic = {
+	.enable		= perf_swcounter_enable,
+	.disable	= perf_swcounter_disable,
+	.read		= perf_swcounter_read,
+};
+
 /*
  * Software counter: cpu wall time clock
  */
@@ -1604,16 +1610,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 };
 
 /*
- * Software counter: page faults
- */
-
-static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.enable		= perf_swcounter_enable,
-	.disable	= perf_swcounter_disable,
-	.read		= perf_swcounter_read,
-};
-
-/*
  * Software counter: context switches
  */
 
@@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
-		if (!(counter->hw_event.exclude_user ||
-		      counter->hw_event.exclude_kernel))
-			hw_ops = &perf_ops_page_faults;
+	case PERF_COUNT_PAGE_FAULTS_MIN:
+	case PERF_COUNT_PAGE_FAULTS_MAJ:
+		hw_ops = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		if (!counter->hw_event.exclude_kernel)
-- 
cgit v0.10.2


From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:35 +0100
Subject: perf_counter: hrtimer based sampling for software time events

Use hrtimers to profile timer based sampling for the software time
counters.

This allows platforms without hardware counter support to still
perform sample based profiling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4b14a8e..dfb4c7c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -114,6 +114,7 @@ struct perf_counter_hw_event {
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/spinlock.h>
+#include <linux/hrtimer.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -123,12 +124,19 @@ struct task_struct;
  */
 struct hw_perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
-	u64				config;
-	unsigned long			config_base;
-	unsigned long			counter_base;
-	int				nmi;
-	unsigned int			idx;
-	atomic64_t			count; /* software */
+	union {
+		struct { /* hardware */
+			u64				config;
+			unsigned long			config_base;
+			unsigned long			counter_base;
+			int				nmi;
+			unsigned int			idx;
+		};
+		union { /* software */
+			atomic64_t			count;
+			struct hrtimer			hrtimer;
+		};
+	};
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 68950a3..f9330d5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
 
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		perf_swcounter_update(counter);
+		counter->hw_ops->read(counter);
 		perf_swcounter_store_irq(sibling, counter->hw_event.type);
 		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
 	}
@@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 static void perf_swcounter_interrupt(struct perf_counter *counter,
 				     int nmi, struct pt_regs *regs)
 {
-	perf_swcounter_save_and_restart(counter);
-
 	switch (counter->hw_event.record_type) {
 	case PERF_RECORD_SIMPLE:
 		break;
@@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter,
 		wake_up(&counter->waitq);
 }
 
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	struct perf_counter *counter;
+	struct pt_regs *regs;
+
+	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->hw_ops->read(counter);
+
+	regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->hw_event.exclude_kernel || !regs) &&
+			!counter->hw_event.exclude_user)
+		regs = task_pt_regs(current);
+
+	if (regs)
+		perf_swcounter_interrupt(counter, 0, regs);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+
+	return HRTIMER_RESTART;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct pt_regs *regs)
+{
+	perf_swcounter_save_and_restart(counter);
+	perf_swcounter_interrupt(counter, nmi, regs);
+}
+
 static int perf_swcounter_match(struct perf_counter *counter,
 				enum hw_event_types event,
 				struct pt_regs *regs)
@@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct pt_regs *regs)
+{
+	int neg = atomic64_add_negative(nr, &counter->hw.count);
+	if (counter->hw.irq_period && !neg)
+		perf_swcounter_overflow(counter, nmi, regs);
+}
+
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum hw_event_types event, u64 nr,
 				     int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 	unsigned long flags;
-	int neg;
 
 	if (list_empty(&ctx->counter_list))
 		return;
@@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	 * XXX: make counter_list RCU safe
 	 */
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (perf_swcounter_match(counter, event, regs)) {
-			neg = atomic64_add_negative(nr, &counter->hw.count);
-			if (counter->hw.irq_period && !neg)
-				perf_swcounter_interrupt(counter, nmi, regs);
-		}
+		if (perf_swcounter_match(counter, event, regs))
+			perf_swcounter_add(counter, nr, nmi, regs);
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = {
  * Software counter: cpu wall time clock
  */
 
-static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
-{
-	int cpu = raw_smp_processor_id();
-
-	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
-	return 0;
-}
-
 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
@@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 	atomic64_add(now - prev, &counter->count);
 }
 
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	if (hwc->irq_period) {
+		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		hwc->hrtimer.function = perf_swcounter_hrtimer;
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(hwc->irq_period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
+	hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
 
@@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now
 	atomic64_add(delta, &counter->count);
 }
 
-static void task_clock_perf_counter_read(struct perf_counter *counter)
-{
-	u64 now = task_clock_perf_counter_val(counter, 1);
-
-	task_clock_perf_counter_update(counter, now);
-}
-
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     task_clock_perf_counter_val(counter, 0));
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	if (hwc->irq_period) {
+		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		hwc->hrtimer.function = perf_swcounter_hrtimer;
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(hwc->irq_period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
 
 	return 0;
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	u64 now = task_clock_perf_counter_val(counter, 0);
+	hrtimer_cancel(&counter->hw.hrtimer);
+	task_clock_perf_counter_update(counter,
+			task_clock_perf_counter_val(counter, 0));
+}
 
-	task_clock_perf_counter_update(counter, now);
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+	task_clock_perf_counter_update(counter,
+			task_clock_perf_counter_val(counter, 1));
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 */
 	switch (counter->hw_event.type) {
 	case PERF_COUNT_CPU_CLOCK:
-		if (!(counter->hw_event.exclude_user ||
-		      counter->hw_event.exclude_kernel ||
-		      counter->hw_event.exclude_hv))
-			hw_ops = &perf_ops_cpu_clock;
+		hw_ops = &perf_ops_cpu_clock;
+
+		if (hw_event->irq_period && hw_event->irq_period < 10000)
+			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		if (counter->hw_event.exclude_user ||
-		    counter->hw_event.exclude_kernel ||
-		    counter->hw_event.exclude_hv)
-			break;
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_task_clock;
 		else
 			hw_ops = &perf_ops_cpu_clock;
+
+		if (hw_event->irq_period && hw_event->irq_period < 10000)
+			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
-- 
cgit v0.10.2


From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:36 +0100
Subject: perf_counter: add an event_list

I noticed that the counter_list only includes top-level counters, thus
perf_swcounter_event() will miss sw-counters in groups.

Since perf_swcounter_event() also wants an RCU safe list, create a new
event_list that includes all counters and uses RCU list ops and use call_rcu
to free the counter structure.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index dfb4c7c..08c11a6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -187,6 +187,7 @@ struct file;
 struct perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
 	struct list_head		list_entry;
+	struct list_head		event_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
@@ -220,6 +221,8 @@ struct perf_counter {
 	struct perf_data		*irqdata;
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
+
+	struct rcu_head			rcu_head;
 #endif
 };
 
@@ -243,6 +246,7 @@ struct perf_counter_context {
 	struct mutex		mutex;
 
 	struct list_head	counter_list;
+	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f9330d5..8d6ecfa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -22,6 +22,7 @@
 #include <linux/perf_counter.h>
 #include <linux/mm.h>
 #include <linux/vmstat.h>
+#include <linux/rculist.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+
+	list_add_rcu(&counter->event_entry, &ctx->event_list);
 }
 
 static void
@@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	struct perf_counter *sibling, *tmp;
 
 	list_del_init(&counter->list_entry);
+	list_del_rcu(&counter->event_entry);
 
 	/*
 	 * If this was a group counter with sibling counters then
@@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	return ctx;
 }
 
+static void free_counter_rcu(struct rcu_head *head)
+{
+	struct perf_counter *counter;
+
+	counter = container_of(head, struct perf_counter, rcu_head);
+	kfree(counter);
+}
+
 /*
  * Called when the last reference to the file is gone.
  */
@@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	kfree(counter);
+	call_rcu(&counter->rcu_head, free_counter_rcu);
 	put_context(ctx);
 
 	return 0;
@@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
-	unsigned long flags;
 
-	if (list_empty(&ctx->counter_list))
+	if (list_empty(&ctx->event_list))
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
-
-	/*
-	 * XXX: make counter_list RCU safe
-	 */
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 		if (perf_swcounter_match(counter, event, regs))
 			perf_swcounter_add(counter, nr, nmi, regs);
 	}
-
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	rcu_read_unlock();
 }
 
 void perf_swcounter_event(enum hw_event_types event, u64 nr,
@@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	mutex_init(&counter->mutex);
 	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->event_entry);
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
@@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->event_list);
 	ctx->task = task;
 }
 
-- 
cgit v0.10.2


From 039fc91e064b81c2820ff16c304be5aba35fd126 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 16:43:47 +0100
Subject: perf_counter: fix hrtimer sampling

Impact: fix deadlock with perfstat

Fix for the perfstat fubar..

We cannot unconditionally call hrtimer_cancel() without ever having done
hrtimer_init() on the thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <1236959027.22447.149.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d6ecfa..d6cc222 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1571,9 +1571,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	int cpu = raw_smp_processor_id();
 
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
-		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		hwc->hrtimer.function = perf_swcounter_hrtimer;
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(hwc->irq_period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -1635,9 +1635,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 
 	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
-		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		hwc->hrtimer.function = perf_swcounter_hrtimer;
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(hwc->irq_period), 0,
 				HRTIMER_MODE_REL, 0);
-- 
cgit v0.10.2


From 4e193bd4dfdc983d12969b51439b4a1fbaf2daad Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Sat, 14 Mar 2009 14:29:25 +0100
Subject: perf_counter: include missing header

Impact: build fix

In order to compile a kernel with performance counter patches,
<asm/irq_regs.h> has to be included to provide the declaration of
struct pt_regs *get_irq_regs(void);

[ This bug was masked by unrelated x86 header file changes in the
  x86 tree, but occurs in the tip:perfcounters/core standalone
  tree. ]

Signed-off-by: Tim Blechmann <tim@klingt.org>
Orig-LKML-Reference: <20090314142925.49c29c17@thinkpad>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d6cc222..0018c5e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -24,6 +24,8 @@
 #include <linux/vmstat.h>
 #include <linux/rculist.h>
 
+#include <asm/irq_regs.h>
+
 /*
  * Each CPU has a list of per CPU counters:
  */
-- 
cgit v0.10.2


From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Mar 2009 08:59:21 +0100
Subject: perf_counter: fix crash on perfmon v1 systems

Impact: fix boot crash on Intel Perfmon Version 1 systems

Intel Perfmon v1 does not support the global MSRs, nor does
it offer the generalized MSR ranges. So support v2 and later
CPUs only.

Also mark pmc_ops as read-mostly - to avoid false cacheline
sharing.

Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 22dab06..6cba9d4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -57,12 +57,14 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops;
+static struct pmc_x86_ops *pmc_ops __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
+static __read_mostly int intel_perfmon_version;
+
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -613,7 +615,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+	if (intel_perfmon_version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = {
 
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
+	union cpuid10_edx edx;
 	union cpuid10_eax eax;
-	unsigned int ebx;
 	unsigned int unused;
-	union cpuid10_edx edx;
+	unsigned int ebx;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
+	intel_perfmon_version = eax.split.version_id;
+	if (intel_perfmon_version < 2)
+		return NULL;
+
 	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-- 
cgit v0.10.2


From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 16 Mar 2009 21:00:00 +1100
Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc
 build

Impact: build fix for powerpc

Commit bd753921015e7905 ("perf_counter: software counter event
infrastructure") introduced a use of TIF_PERF_COUNTERS into the core
perfcounter code.  This breaks the build on powerpc because we use
a flag in a per-cpu area to signal wakeups on powerpc rather than
a thread_info flag, because the thread_info flags have to be
manipulated with atomic operations and are thus slower than per-cpu
flags.

This fixes the by changing the core to use an abstracted
set_perf_counter_pending() function, which is defined on x86 to set
the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag
(paca->perf_counter_pending).  It changes the previous powerpc
definition of set_perf_counter_pending to not take an argument and
adds a clear_perf_counter_pending, so as to simplify the definition
on x86.

On x86, set_perf_counter_pending() is defined as a macro.  Defining
it as a static inline in arch/x86/include/asm/perf_counters.h causes
compile failures because <asm/perf_counters.h> gets included early in
<linux/sched.h>, and the definitions of set_tsk_thread_flag etc. are
therefore not available in <asm/perf_counters.h>.  (On powerpc this
problem is avoided by defining set_perf_counter_pending etc. in
<asm/hw_irq.h>.)

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b43076f..cb32d57 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void)
 	return x;
 }
 
-static inline void set_perf_counter_pending(int x)
+static inline void set_perf_counter_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
-		"r" (x),
+		"r" (1),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
 
@@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void)
 	return 0;
 }
 
-static inline void set_perf_counter_pending(int x) {}
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
 static inline void perf_counter_do_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0d2e37c..469e963 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-#ifdef CONFIG_PERF_COUNTERS
-notrace void __weak perf_counter_do_pending(void)
-{
-	set_perf_counter_pending(0);
-}
-#endif
-
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending())
+	if (get_perf_counter_pending()) {
+		clear_perf_counter_pending();
 		perf_counter_do_pending();
+	}
 
 	/*
 	 * if (get_paca()->hard_enabled) return;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e33d27..5008762 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -653,7 +653,6 @@ void perf_counter_do_pending(void)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 
-	set_perf_counter_pending(0);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter && counter->wakeup_pending) {
@@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			perf_counter_do_pending();
 			irq_exit();
 		} else {
-			set_perf_counter_pending(1);
+			set_perf_counter_pending();
 		}
 	}
 }
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 2e08ed7..1662043 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
+#define set_perf_counter_pending()	\
+		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0018c5e..b39456a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1433,7 +1433,7 @@ static void perf_swcounter_interrupt(struct perf_counter *counter,
 
 	if (nmi) {
 		counter->wakeup_pending = 1;
-		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+		set_perf_counter_pending();
 	} else
 		wake_up(&counter->waitq);
 }
-- 
cgit v0.10.2


From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:11 +0100
Subject: perf_counter: fix uninitialized usage of event_list

Impact: fix boot crash

When doing the generic context switch event I ran into some early
boot hangs, which were caused by inf func recursion (event, fault,
event, fault).

I eventually tracked it down to event_list not being initialized
at the time of the first event. Fix this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.195392657@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 219748d..ca226a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -124,6 +124,8 @@ extern struct cred init_cred;
 # define INIT_PERF_COUNTERS(tsk)					\
 	.perf_counter_ctx.counter_list =				\
 		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
+	.perf_counter_ctx.event_list =					\
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list),	\
 	.perf_counter_ctx.lock =					\
 		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 #else
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b39456a..4c4e9eb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 {
 	struct perf_counter *counter;
 
-	if (list_empty(&ctx->event_list))
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
 		return;
 
 	rcu_read_lock();
-- 
cgit v0.10.2


From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:12 +0100
Subject: perf_counter: generic context switch event

Impact: cleanup

Use the generic software events for context switches.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.283522645@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75b2fc5..7ed41f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -138,7 +138,6 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
-extern u64 cpu_nr_switches(int cpu);
 extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c4e9eb..99d5930 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct pt_regs *regs;
 
 	if (likely(!cpuctx->task_ctx))
 		return;
 
+	regs = task_pt_regs(task);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -1668,58 +1671,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 };
 
 /*
- * Software counter: context switches
- */
-
-static u64 get_context_switches(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->nvcsw + curr->nivcsw;
-	return cpu_nr_switches(smp_processor_id());
-}
-
-static void context_switches_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void context_switches_perf_counter_read(struct perf_counter *counter)
-{
-	context_switches_perf_counter_update(counter);
-}
-
-static int context_switches_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     get_context_switches(counter));
-	return 0;
-}
-
-static void context_switches_perf_counter_disable(struct perf_counter *counter)
-{
-	context_switches_perf_counter_update(counter);
-}
-
-static const struct hw_perf_counter_ops perf_ops_context_switches = {
-	.enable		= context_switches_perf_counter_enable,
-	.disable	= context_switches_perf_counter_disable,
-	.read		= context_switches_perf_counter_read,
-};
-
-/*
  * Software counter: cpu migrations
  */
 
@@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
-		hw_ops = &perf_ops_generic;
-		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		if (!counter->hw_event.exclude_kernel)
-			hw_ops = &perf_ops_context_switches;
+		hw_ops = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
 		if (!counter->hw_event.exclude_kernel)
diff --git a/kernel/sched.c b/kernel/sched.c
index 39e7086..f76e3c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2900,14 +2900,8 @@ unsigned long nr_active(void)
 
 /*
  * Externally visible per-cpu scheduler statistics:
- * cpu_nr_switches(cpu) - number of context switches on that cpu
  * cpu_nr_migrations(cpu) - number of migrations into that cpu
  */
-u64 cpu_nr_switches(int cpu)
-{
-	return cpu_rq(cpu)->nr_switches;
-}
-
 u64 cpu_nr_migrations(int cpu)
 {
 	return cpu_rq(cpu)->nr_migrations_in;
-- 
cgit v0.10.2


From f16009527595ee562308653bc3d0039166d2ab15 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:16 +0100
Subject: perf_counter: fix up counter free paths

Impact: fix crash during perfcounters use

I found another counter free path, create a free_counter() call to
accomodate generic tear-down.

Fixes an RCU bug.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.652078652@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 99d5930..97f891f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1150,6 +1150,11 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void free_counter(struct perf_counter *counter)
+{
+	call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
 /*
  * Called when the last reference to the file is gone.
  */
@@ -1168,7 +1173,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	call_rcu(&counter->rcu_head, free_counter_rcu);
+	free_counter(counter);
 	put_context(ctx);
 
 	return 0;
@@ -2128,10 +2133,10 @@ __perf_counter_exit_task(struct task_struct *child,
 					 list_entry) {
 			if (sub->parent) {
 				sync_child_counter(sub, sub->parent);
-				kfree(sub);
+				free_counter(sub);
 			}
 		}
-		kfree(child_counter);
+		free_counter(child_counter);
 	}
 }
 
-- 
cgit v0.10.2


From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:17 +0100
Subject: perf_counter: hook up the tracepoint events

Impact: new perfcounters feature

Enable usage of tracepoints as perf counter events.

tracepoint event ids can be found in /debug/tracing/event/*/*/id
and (for now) are represented as -65536+id in the type field.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.744044174@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 08c11a6..065984c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -53,6 +53,8 @@ enum hw_event_types {
 	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
 
 	PERF_SW_EVENTS_MIN		= -8,
+
+	PERF_TP_EVENTS_MIN		= -65536
 };
 
 /*
@@ -222,6 +224,7 @@ struct perf_counter {
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
 
+	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
 #endif
 };
diff --git a/init/Kconfig b/init/Kconfig
index 38a2ecd..4f64714 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -947,6 +947,11 @@ config PERF_COUNTERS
 
 	  Say Y if unsure.
 
+config EVENT_PROFILE
+	bool "Tracepoint profile sources"
+	depends on PERF_COUNTERS && EVENT_TRACER
+	default y
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 97f891f..0bbe3e45 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head)
 
 static void free_counter(struct perf_counter *counter)
 {
+	if (counter->destroy)
+		counter->destroy(counter);
+
 	call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 	.read		= cpu_migrations_perf_counter_read,
 };
 
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id)
+{
+	perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1,
+			task_pt_regs(current));
+}
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+
+	ftrace_profile_disable(event_id);
+}
+
+static const struct hw_perf_counter_ops *
+tp_perf_counter_init(struct perf_counter *counter)
+{
+	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+	int ret;
+
+	ret = ftrace_profile_enable(event_id);
+	if (ret)
+		return NULL;
+
+	counter->destroy = tp_perf_counter_destroy;
+
+	return &perf_ops_generic;
+}
+#else
+static const struct hw_perf_counter_ops *
+tp_perf_counter_init(struct perf_counter *counter)
+{
+	return NULL;
+}
+#endif
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_migrations;
 		break;
 	default:
+		hw_ops = tp_perf_counter_init(counter);
 		break;
 	}
 
-- 
cgit v0.10.2


From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:18 +0100
Subject: perf_counter: revamp syscall input ABI

Impact: modify ABI

The hardware/software classification in hw_event->type became a little
strained due to the addition of tracepoint tracing.

Instead split up the field and provide a type field to explicitly specify
the counter type, while using the event_id field to specify which event to
use.

Raw counters still work as before, only the raw config now goes into
raw_event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.836807573@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5008762..26f69dc 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.type;
+	ev = counter->hw_event.event_id;
 	if (!counter->hw_event.raw) {
 		if (ev >= ppmu->n_generic ||
 		    ppmu->generic_events[ev] == 0)
@@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter)
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, sub->hw_event.event_config);
 		perf_store_irq_data(counter, atomic64_read(&sub->count));
 	}
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cba9d4..d844ae4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw) {
-		hwc->config |= pmc_ops->raw_event(hw_event->type);
+	if (hw_event->raw_type) {
+		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
 	} else {
-		if (hw_event->type >= pmc_ops->max_events)
+		if (hw_event->event_id >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->type);
+		hwc->config |= pmc_ops->event_map(hw_event->event_id);
 	}
 	counter->wakeup_pending = 0;
 
@@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, counter->hw_event.event_config);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 065984c..8f93949 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -21,56 +21,81 @@
  */
 
 /*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
+ * hw_event.type
  */
-enum hw_event_types {
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+
 	/*
-	 * Common hardware events, generalized by the kernel:
+	 * available TYPE space, raw is the max value.
 	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
 
-	PERF_HW_EVENTS_MAX		=  7,
+	PERF_TYPE_RAW			= 128,
+};
 
+/*
+ * Generalized performance counter event types, used by the hw_event.event_id
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_ids {
 	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
+	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-	PERF_COUNT_PAGE_FAULTS_MIN	= -6,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
-
-	PERF_SW_EVENTS_MIN		= -8,
+	PERF_COUNT_CPU_CYCLES		= 0,
+	PERF_COUNT_INSTRUCTIONS		= 1,
+	PERF_COUNT_CACHE_REFERENCES	= 2,
+	PERF_COUNT_CACHE_MISSES		= 3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_BRANCH_MISSES	= 5,
+	PERF_COUNT_BUS_CYCLES		= 6,
+
+	PERF_HW_EVENTS_MAX		= 7,
+};
 
-	PERF_TP_EVENTS_MIN		= -65536
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum sw_event_ids {
+	PERF_COUNT_CPU_CLOCK		= 0,
+	PERF_COUNT_TASK_CLOCK		= 1,
+	PERF_COUNT_PAGE_FAULTS		= 2,
+	PERF_COUNT_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
+
+	PERF_SW_EVENTS_MAX		= 7,
 };
 
 /*
  * IRQ-notification data record type:
  */
 enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
+	PERF_RECORD_SIMPLE		= 0,
+	PERF_RECORD_IRQ			= 1,
+	PERF_RECORD_GROUP		= 2,
 };
 
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	__s64			type;
+	union {
+		struct {
+			__u64			event_id	: 56,
+						type		:  8;
+		};
+		struct {
+			__u64			raw_event_id	: 63,
+						raw_type	:  1;
+		};
+		__u64		event_config;
+	};
 
 	__u64			irq_period;
 	__u64			record_type;
@@ -78,7 +103,6 @@ struct perf_counter_hw_event {
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
 				pinned	       :  1, /* must always be on PMU */
 				exclusive      :  1, /* only group on PMU     */
@@ -87,7 +111,7 @@ struct perf_counter_hw_event {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1   : 54;
+				__reserved_1   : 55;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
+	return !counter->hw_event.raw_type &&
+		counter->hw_event.type != PERF_TYPE_HARDWARE;
 }
 
-extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *);
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
 
 #else
 static inline void
@@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
-static inline void perf_swcounter_event(enum hw_event_types event, u64 nr,
+static inline void perf_swcounter_event(u32 event, u64 nr,
 					int nmi, struct pt_regs *regs)	{ }
 #endif
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0bbe3e45..68a56a6 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	atomic64_set(&hwc->count, -left);
 }
 
-static void perf_swcounter_save_and_restart(struct perf_counter *counter)
-{
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-}
-
 static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
 {
 	struct perf_data *irqdata = counter->irqdata;
@@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 		counter->hw_ops->read(counter);
-		perf_swcounter_store_irq(sibling, counter->hw_event.type);
+		perf_swcounter_store_irq(sibling, counter->hw_event.event_config);
 		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
 	}
 }
@@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs)
 {
-	perf_swcounter_save_and_restart(counter);
+	perf_swcounter_update(counter);
+	perf_swcounter_set_period(counter);
 	perf_swcounter_interrupt(counter, nmi, regs);
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-				enum hw_event_types event,
-				struct pt_regs *regs)
+				enum perf_event_types type,
+				u32 event, struct pt_regs *regs)
 {
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return 0;
 
-	if (counter->hw_event.raw)
+	if (counter->hw_event.raw_type)
+		return 0;
+
+	if (counter->hw_event.type != type)
 		return 0;
 
-	if (counter->hw_event.type != event)
+	if (counter->hw_event.event_id != event)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum hw_event_types event, u64 nr,
-				     int nmi, struct pt_regs *regs)
+				     enum perf_event_types type, u32 event,
+				     u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 
@@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, event, regs))
+		if (perf_swcounter_match(counter, type, event, regs))
 			perf_swcounter_add(counter, nr, nmi, regs);
 	}
 	rcu_read_unlock();
 }
 
-void perf_swcounter_event(enum hw_event_types event, u64 nr,
-			  int nmi, struct pt_regs *regs)
+static void __perf_swcounter_event(enum perf_event_types type, u32 event,
+				   u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 
-	perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
-	if (cpuctx->task_ctx)
-		perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
+	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
+	if (cpuctx->task_ctx) {
+		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
+				nr, nmi, regs);
+	}
 
 	put_cpu_var(perf_cpu_context);
 }
 
+void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
+{
+	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
+}
+
 static void perf_swcounter_read(struct perf_counter *counter)
 {
 	perf_swcounter_update(counter);
@@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
-	perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1,
-			task_pt_regs(current));
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
+
+	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
 }
 
 extern int ftrace_profile_enable(int);
@@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
-
-	ftrace_profile_disable(event_id);
+	ftrace_profile_disable(counter->hw_event.event_id);
 }
 
 static const struct hw_perf_counter_ops *
 tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+	int event_id = counter->hw_event.event_id;
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
+	counter->hw.irq_period = counter->hw_event.irq_period;
 
 	return &perf_ops_generic;
 }
@@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (counter->hw_event.type) {
+	switch (counter->hw_event.event_id) {
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 
@@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter)
 		if (!counter->hw_event.exclude_kernel)
 			hw_ops = &perf_ops_cpu_migrations;
 		break;
-	default:
-		hw_ops = tp_perf_counter_init(counter);
-		break;
 	}
 
 	if (hw_ops)
@@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		counter->state = PERF_COUNTER_STATE_OFF;
 
 	hw_ops = NULL;
-	if (!hw_event->raw && hw_event->type < 0)
-		hw_ops = sw_perf_counter_init(counter);
-	else
+
+	if (hw_event->raw_type)
+		hw_ops = hw_perf_counter_init(counter);
+	else switch (hw_event->type) {
+	case PERF_TYPE_HARDWARE:
 		hw_ops = hw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_SOFTWARE:
+		hw_ops = sw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_TRACEPOINT:
+		hw_ops = tp_perf_counter_init(counter);
+		break;
+	}
 
 	if (!hw_ops) {
 		kfree(counter);
-- 
cgit v0.10.2


From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:19 +0100
Subject: perf_counter: unify irq output code

Impact: cleanup

Having 3 slightly different copies of the same code around does nobody
any good. First step in revamping the output format.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.929962222@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 26f69dc..88b72eb 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -663,41 +663,6 @@ void perf_counter_do_pending(void)
 }
 
 /*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
-	struct perf_counter *leader, *sub;
-
-	leader = counter->group_leader;
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		if (sub != counter)
-			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.event_config);
-		perf_store_irq_data(counter, atomic64_read(&sub->count));
-	}
-}
-
-/*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
  * here so there is no possibility of being interrupted.
@@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record) {
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			break;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			counter->wakeup_pending = 1;
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter);
-			counter->wakeup_pending = 1;
-			break;
-		}
-	}
+	if (record)
+		perf_counter_output(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d844ae4..902282d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 }
 
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
 /*
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
@@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
-static void
-perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
-{
-	struct perf_counter *counter, *group_leader = sibling->group_leader;
-
-	/*
-	 * Store sibling timestamps (if any):
-	 */
-	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-
-		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.event_config);
-		perf_store_irq_data(sibling, atomic64_read(&counter->count));
-	}
-}
-
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
@@ -754,28 +724,7 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			continue;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter, &status, &ack);
-			break;
-		}
-		/*
-		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these generic as
-		 * wakeup_pending and initate a wakeup callback:
-		 */
-		if (nmi) {
-			counter->wakeup_pending = 1;
-			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
-		} else {
-			wake_up(&counter->waitq);
-		}
+		perf_counter_output(counter, nmi, regs);
 	}
 
 	hw_perf_ack_status(ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8f93949..a4b76c0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
 
+extern void perf_counter_output(struct perf_counter *counter,
+				int nmi, struct pt_regs *regs);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 68a56a6..f054b8c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1354,6 +1354,60 @@ static const struct file_operations perf_fops = {
 };
 
 /*
+ * Output
+ */
+
+static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_counter_handle_group(struct perf_counter *counter)
+{
+	struct perf_counter *leader, *sub;
+
+	leader = counter->group_leader;
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		if (sub != counter)
+			sub->hw_ops->read(sub);
+		perf_counter_store_irq(counter, sub->hw_event.event_config);
+		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+	}
+}
+
+void perf_counter_output(struct perf_counter *counter,
+			 int nmi, struct pt_regs *regs)
+{
+	switch (counter->hw_event.record_type) {
+	case PERF_RECORD_SIMPLE:
+		return;
+
+	case PERF_RECORD_IRQ:
+		perf_counter_store_irq(counter, instruction_pointer(regs));
+		break;
+
+	case PERF_RECORD_GROUP:
+		perf_counter_handle_group(counter);
+		break;
+	}
+
+	if (nmi) {
+		counter->wakeup_pending = 1;
+		set_perf_counter_pending();
+	} else
+		wake_up(&counter->waitq);
+}
+
+/*
  * Generic software counter infrastructure
  */
 
@@ -1395,54 +1449,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	atomic64_set(&hwc->count, -left);
 }
 
-static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
-static void perf_swcounter_handle_group(struct perf_counter *sibling)
-{
-	struct perf_counter *counter, *group_leader = sibling->group_leader;
-
-	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		counter->hw_ops->read(counter);
-		perf_swcounter_store_irq(sibling, counter->hw_event.event_config);
-		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
-	}
-}
-
-static void perf_swcounter_interrupt(struct perf_counter *counter,
-				     int nmi, struct pt_regs *regs)
-{
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		break;
-
-	case PERF_RECORD_IRQ:
-		perf_swcounter_store_irq(counter, instruction_pointer(regs));
-		break;
-
-	case PERF_RECORD_GROUP:
-		perf_swcounter_handle_group(counter);
-		break;
-	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
-}
-
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	struct perf_counter *counter;
@@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs)
-		perf_swcounter_interrupt(counter, 0, regs);
+		perf_counter_output(counter, 0, regs);
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
 
@@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	perf_swcounter_interrupt(counter, nmi, regs);
+	perf_counter_output(counter, nmi, regs);
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
cgit v0.10.2


From db4fb5acf20295063d1d5105e67724eb51440207 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 19 Mar 2009 20:26:20 +0100
Subject: perf_counter: powerpc: clean up perc_counter_interrupt

Impact: cleanup

This updates the powerpc perf_counter_interrupt following on from the
"perf_counter: unify irq output code" patch.  Since we now use the
generic perf_counter_output code, which sets the perf_counter_pending
flag directly, we no longer need the need_wakeup variable.

This removes need_wakeup and makes perf_counter_interrupt use
get_perf_counter_pending() instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194234.024464535@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 88b72eb..830ca9c 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -723,8 +723,6 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			/* counter has overflowed */
 			found = 1;
 			record_and_restart(counter, val, regs);
-			if (counter->wakeup_pending)
-				need_wakeup = 1;
 		}
 	}
 
@@ -754,17 +752,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	/*
 	 * If we need a wakeup, check whether interrupts were soft-enabled
 	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have to set a flag and do the
-	 * wakeup when interrupts get soft-enabled.
+	 * immediately; otherwise we'll have do the wakeup when interrupts
+	 * get soft-enabled.
 	 */
-	if (need_wakeup) {
-		if (regs->softe) {
-			irq_enter();
-			perf_counter_do_pending();
-			irq_exit();
-		} else {
-			set_perf_counter_pending();
-		}
+	if (get_perf_counter_pending() && regs->softe) {
+		irq_enter();
+		clear_perf_counter_pending();
+		perf_counter_do_pending();
+		irq_exit();
 	}
 }
 
-- 
cgit v0.10.2


From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 21 Mar 2009 15:31:47 +1100
Subject: perf_counter: fix type/event_id layout on big-endian systems

Impact: build fix for powerpc

Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI")
expanded the hw_event.type field into a union of structs containing
bitfields.  In particular it introduced a type field and a raw_type
field, with the intention that the 1-bit raw_type field should
overlay the most-significant bit of the 8-bit type field, and in fact
perf_counter_alloc() now assumes that (or at least, assumes that
raw_type doesn't overlay any of the bits that are 1 in the values of
PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}).

Unfortunately this is not true on big-endian systems such as PowerPC,
where bitfields are laid out from left to right, i.e. from most
significant bit to least significant.  This means that setting
hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1.

This fixes it by making the layout depend on whether or not
__BIG_ENDIAN_BITFIELD is defined.  It's a bit ugly, but that's what
we get for using bitfields in a user/kernel ABI.

Also, that commit didn't fix up some places in arch/powerpc/kernel/
perf_counter.c where hw_event.raw and hw_event.event_id were used.
This fixes them too.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 830ca9c..6413d9c 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.event_id;
-	if (!counter->hw_event.raw) {
-		if (ev >= ppmu->n_generic ||
-		    ppmu->generic_events[ev] == 0)
+	if (!counter->hw_event.raw_type) {
+		ev = counter->hw_event.event_id;
+		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
+	} else {
+		ev = counter->hw_event.raw_event_id;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a4b76c0..98f5990 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -15,6 +15,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <asm/byteorder.h>
 
 /*
  * User-space ABI bits:
@@ -86,6 +87,7 @@ enum perf_counter_record_type {
  */
 struct perf_counter_hw_event {
 	union {
+#ifndef __BIG_ENDIAN_BITFIELD
 		struct {
 			__u64			event_id	: 56,
 						type		:  8;
@@ -94,6 +96,16 @@ struct perf_counter_hw_event {
 			__u64			raw_event_id	: 63,
 						raw_type	:  1;
 		};
+#else
+		struct {
+			__u64			type		:  8,
+						event_id	: 56;
+		};
+		struct {
+			__u64			raw_type	:  1,
+						raw_event_id	: 63;
+		};
+#endif /* __BIT_ENDIAN_BITFIELD */
 		__u64		event_config;
 	};
 
-- 
cgit v0.10.2


From 6f9f791eb53b56097cd311a1a5517a8e89bdaf35 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:26:52 +0100
Subject: perf_counter: create Documentation/perf_counter/ and move
 perfcounters.txt there

We'll have more files in that directory, prepare for that.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
deleted file mode 100644
index fddd321..0000000
--- a/Documentation/perf-counters.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-
-Performance Counters for Linux
-------------------------------
-
-Performance counters are special hardware registers available on most modern
-CPUs. These registers count the number of certain types of hw events: such
-as instructions executed, cachemisses suffered, or branches mis-predicted -
-without slowing down the kernel or applications. These registers can also
-trigger interrupts when a threshold number of events have passed - and can
-thus be used to profile the code that runs on that CPU.
-
-The Linux Performance Counter subsystem provides an abstraction of these
-hardware capabilities. It provides per task and per CPU counters, counter
-groups, and it provides event capabilities on top of those.
-
-Performance counters are accessed via special file descriptors.
-There's one file descriptor per virtual counter used.
-
-The special file descriptor is opened via the perf_counter_open()
-system call:
-
-   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
-			     pid_t pid, int cpu, int group_fd);
-
-The syscall returns the new fd. The fd can be used via the normal
-VFS system calls: read() can be used to read the counter, fcntl()
-can be used to set the blocking mode, etc.
-
-Multiple counters can be kept open at a time, and the counters
-can be poll()ed.
-
-When creating a new counter fd, 'perf_counter_hw_event' is:
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	s64			type;
-
-	u64			irq_period;
-	u32			record_type;
-
-	u32			disabled     :  1, /* off by default */
-				nmi	     :  1, /* NMI sampling   */
-				raw	     :  1, /* raw event type */
-				__reserved_1 : 29;
-
-	u64			__reserved_2;
-};
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	/*
-	 * Future software events:
-	 */
-	/* PERF_COUNT_PAGE_FAULTS	= -3,
-	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
-};
-
-These are standardized types of events that work uniformly on all CPUs
-that implements Performance Counters support under Linux. If a CPU is
-not able to count branch-misses, then the system call will return
--EINVAL.
-
-More hw_event_types are supported as well, but they are CPU
-specific and are enumerated via /sys on a per CPU basis. Raw hw event
-types can be passed in under hw_event.type if hw_event.raw is 1.
-For example, to count "External bus cycles while bus lock signal asserted"
-events on Intel Core CPUs, pass in a 0x4064 event type value and set
-hw_event.raw to 1.
-
-'record_type' is the type of data that a read() will provide for the
-counter, and it can be one of:
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-a "simple" counter is one that counts hardware events and allows
-them to be read out into a u64 count value. (read() returns 8 on
-a successful read of a simple counter.)
-
-An "irq" counter is one that will also provide an IRQ context information:
-the IP of the interrupted context. In this case read() will return
-the 8-byte counter value, plus the Instruction Pointer address of the
-interrupted context.
-
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
-
-The 'pid' parameter allows the counter to be specific to a task:
-
- pid == 0: if the pid parameter is zero, the counter is attached to the
- current task.
-
- pid > 0: the counter is attached to a specific task (if the current task
- has sufficient privilege to do so)
-
- pid < 0: all tasks are counted (per cpu counters)
-
-The 'cpu' parameter allows a counter to be made specific to a full
-CPU:
-
- cpu >= 0: the counter is restricted to a specific CPU
- cpu == -1: the counter counts on all CPUs
-
-(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
-
-A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
-events of that task and 'follows' that task to whatever CPU the task
-gets schedule to. Per task counters can be created by any user, for
-their own tasks.
-
-A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
-all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
-
-Group counters are created by passing in a group_fd of another counter.
-Groups are scheduled at once and can be used with PERF_RECORD_GROUP
-to record multi-dimensional timestamps.
-
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
new file mode 100644
index 0000000..fddd321
--- /dev/null
+++ b/Documentation/perf_counter/design.txt
@@ -0,0 +1,147 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events: such
+as instructions executed, cachemisses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events have passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+			     pid_t pid, int cpu, int group_fd);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	s64			type;
+
+	u64			irq_period;
+	u32			record_type;
+
+	u32			disabled     :  1, /* off by default */
+				nmi	     :  1, /* NMI sampling   */
+				raw	     :  1, /* raw event type */
+				__reserved_1 : 29;
+
+	u64			__reserved_2;
+};
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	/*
+	 * Future software events:
+	 */
+	/* PERF_COUNT_PAGE_FAULTS	= -3,
+	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+};
+
+These are standardized types of events that work uniformly on all CPUs
+that implements Performance Counters support under Linux. If a CPU is
+not able to count branch-misses, then the system call will return
+-EINVAL.
+
+More hw_event_types are supported as well, but they are CPU
+specific and are enumerated via /sys on a per CPU basis. Raw hw event
+types can be passed in under hw_event.type if hw_event.raw is 1.
+For example, to count "External bus cycles while bus lock signal asserted"
+events on Intel Core CPUs, pass in a 0x4064 event type value and set
+hw_event.raw to 1.
+
+'record_type' is the type of data that a read() will provide for the
+counter, and it can be one of:
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+a "simple" counter is one that counts hardware events and allows
+them to be read out into a u64 count value. (read() returns 8 on
+a successful read of a simple counter.)
+
+An "irq" counter is one that will also provide an IRQ context information:
+the IP of the interrupted context. In this case read() will return
+the 8-byte counter value, plus the Instruction Pointer address of the
+interrupted context.
+
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
+The 'pid' parameter allows the counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a full
+CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
+
+A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets schedule to. Per task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
+all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+
+Group counters are created by passing in a group_fd of another counter.
+Groups are scheduled at once and can be used with PERF_RECORD_GROUP
+to record multi-dimensional timestamps.
+
-- 
cgit v0.10.2


From e0143bad9dbf2a8fad4c5430562bceba196b66ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:29:59 +0100
Subject: perf_counter: add sample user-space to Documentation/perf_counter/

Initial version of kerneltop.c and perfstat.c.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 0000000..b457497
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,12 @@
+BINS = kerneltop perfstat
+
+all: $(BINS)
+
+kerneltop: kerneltop.c perfcounters.h
+	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
+
+perfstat: kerneltop
+	ln -sf kerneltop perfstat
+
+clean:
+	rm $(BINS)
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
new file mode 100644
index 0000000..cf0e30b
--- /dev/null
+++ b/Documentation/perf_counter/kerneltop.c
@@ -0,0 +1,956 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+
+  Started by Ingo Molnar <mingo@redhat.com>
+
+  Improvements and fixes by:
+
+    Arjan van de Ven <arjan@linux.intel.com>
+    Yanmin Zhang <yanmin.zhang@intel.com>
+    Mike Galbraith <efault@gmx.de>
+
+  Released under the GPL v2. (and only v2, not any later version)
+
+ */
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+
+#include <glib.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+
+#include <linux/unistd.h>
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
+const char *event_types [] = {
+	"CPU cycles",
+	"instructions",
+	"cache-refs",
+	"cache-misses",
+	"branches",
+	"branch-misses",
+	"bus cycles"
+};
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE           31
+#define PR_TASK_PERF_COUNTERS_ENABLE            32
+
+#define MAX_COUNTERS		8
+
+static int			nr_counters			= -1;
+
+static __u64			count_filter		       = 100;
+
+#define MAX_NR_CPUS		256
+
+static int			event_count[MAX_COUNTERS];
+static unsigned long		event_id[MAX_COUNTERS];
+static int			event_raw[MAX_COUNTERS];
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static int			group				=  0;
+
+static char			*vmlinux;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+struct source_line {
+	uint64_t		EIP;
+	unsigned long		count;
+	char			*line;
+};
+
+static GList			*lines;
+
+static void display_help(void)
+{
+	printf(
+	"Usage: kerneltop [<options>]\n\n"
+	"KernelTop Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+	printf(
+	" -e EID    --event_id=EID     # event type ID                    [default:  0]\n"
+	"                                   0: CPU cycles\n"
+	"                                   1: instructions\n"
+	"                                   2: cache accesses\n"
+	"                                   3: cache misses\n"
+	"                                   4: branch instructions\n"
+	"                                   5: branch prediction misses\n"
+	"                                   6: bus cycles\n\n"
+	"                                rNNN: raw PMU events (eventsel+umask)\n\n"
+	" -c CNT    --count=CNT        # event period to sample\n\n"
+	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
+	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
+	" -f CNT --filter=CNT          # min-event-count filter          [default: 100]\n\n"
+	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use:\n"
+	" -z        --zero             # zero counts after display\n"
+	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	"\n");
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event_id",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			if (nr_counters == -1)
+				nr_counters = 0;
+			event_count[nr_counters]	=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e':
+			nr_counters++;
+			if (nr_counters == MAX_COUNTERS) {
+				error = 1;
+				break;
+			}
+			if (*optarg == 'r') {
+				event_raw[nr_counters] = 1;
+				++optarg;
+			}
+			event_id[nr_counters] = strtol(optarg, NULL, 16);
+			break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	nr_counters++;
+	if (nr_counters < 1)
+		nr_counters = 1;
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			event_count[counter] = default_count[event_id[counter]];
+		else
+			event_count[counter] = 100000;
+	}
+}
+
+static uint64_t			min_ip;
+static uint64_t			max_ip = -1ll;
+
+struct sym_entry {
+	unsigned long long	addr;
+	char			*sym;
+	unsigned long		count[MAX_COUNTERS];
+	int			skip;
+	GList			*source;
+};
+
+#define MAX_SYMS		100000
+
+static int sym_table_count;
+
+struct sym_entry		*sym_filter_entry;
+
+static struct sym_entry		sym_table[MAX_SYMS];
+
+static void show_details(struct sym_entry *sym);
+
+/*
+ * Ordering weight: count-1 * count-1 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight;
+	int counter;
+
+	weight = sym->count[0];
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static int compare(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym_weight(sym1) < sym_weight(sym2);
+}
+
+static time_t			last_refresh;
+static long			events;
+static long			userspace_events;
+static const char		CONSOLE_CLEAR[] = "[H[2J";
+
+static struct sym_entry		tmp[MAX_SYMS];
+
+static void print_sym_table(void)
+{
+	int i, printed;
+	int counter;
+	float events_per_sec = events/delay_secs;
+	float kevents_per_sec = (events-userspace_events)/delay_secs;
+
+	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
+	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
+
+	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
+		events_per_sec,
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
+		nmi ? "NMI" : "IRQ");
+
+	if (nr_counters == 1)
+		printf("%d ", event_count[0]);
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			printf( "%s", event_types[event_id[counter]]);
+		else
+			printf( "raw:%04lx", event_id[counter]);
+	}
+
+	printf( "], ");
+
+	if (tid != -1)
+		printf(" (tid: %d", tid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (tid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             events");
+	else
+		printf("  weight     events");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     ______   ________________   _______________\n\n"
+	);
+
+	printed = 0;
+	for (i = 0; i < sym_table_count; i++) {
+		int count;
+
+		if (nr_counters == 1) {
+			if (printed <= 18 &&
+					tmp[i].count[0] >= count_filter) {
+				printf("%19.2f - %016llx : %s\n",
+				  sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
+				printed++;
+			}
+		} else {
+			if (printed <= 18 &&
+					tmp[i].count[0] >= count_filter) {
+				printf("%8.1f %10ld - %016llx : %s\n",
+				  sym_weight(tmp + i),
+				  tmp[i].count[0],
+				  tmp[i].addr, tmp[i].sym);
+				printed++;
+			}
+		}
+		/*
+		 * Add decay to the counts:
+		 */
+		for (count = 0; count < nr_counters; count++)
+			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+	}
+
+	if (sym_filter_entry)
+		show_details(sym_filter_entry);
+
+	last_refresh = time(NULL);
+
+	{
+		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+
+		if (poll(&stdin_poll, 1, 0) == 1) {
+			printf("key pressed - exiting.\n");
+			exit(0);
+		}
+	}
+}
+
+static int read_symbol(FILE *in, struct sym_entry *s)
+{
+	static int filter_match = 0;
+	char *sym, stype;
+	char str[500];
+	int rc, pos;
+
+	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
+	if (rc == EOF)
+		return -1;
+
+	assert(rc == 3);
+
+	/* skip until end of line: */
+	pos = strlen(str);
+	do {
+		rc = fgetc(in);
+		if (rc == '\n' || rc == EOF || pos >= 499)
+			break;
+		str[pos] = rc;
+		pos++;
+	} while (1);
+	str[pos] = 0;
+
+	sym = str;
+
+	/* Filter out known duplicates and non-text symbols. */
+	if (!strcmp(sym, "_text"))
+		return 1;
+	if (!min_ip && !strcmp(sym, "_stext"))
+		return 1;
+	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
+		return 1;
+	if (stype != 'T' && stype != 't')
+		return 1;
+	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
+		return 1;
+	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+		return 1;
+
+	s->sym = malloc(strlen(str));
+	assert(s->sym);
+
+	strcpy((char *)s->sym, str);
+	s->skip = 0;
+
+	/* Tag events to be skipped. */
+	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
+		s->skip = 1;
+	if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = s->addr;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			printf("hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			printf("symbol filter start: %016lx\n", filter_start);
+			printf("                end: %016lx\n", filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+		filter_match = 1;
+		filter_start = s->addr;
+	}
+
+	return 0;
+}
+
+int compare_addr(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym1->addr > sym2->addr;
+}
+
+static void sort_symbol_table(void)
+{
+	int i, dups;
+
+	do {
+		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
+		for (i = 0, dups = 0; i < sym_table_count; i++) {
+			if (sym_table[i].addr == sym_table[i+1].addr) {
+				sym_table[i+1].addr = -1ll;
+				dups++;
+			}
+		}
+		sym_table_count -= dups;
+	} while(dups);
+}
+
+static void parse_symbols(void)
+{
+	struct sym_entry *last;
+
+	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+
+	if (!kallsyms) {
+		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
+		exit(-1);
+	}
+
+	while (!feof(kallsyms)) {
+		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
+			sym_table_count++;
+			assert(sym_table_count <= MAX_SYMS);
+		}
+	}
+
+	sort_symbol_table();
+	min_ip = sym_table[0].addr;
+	max_ip = sym_table[sym_table_count-1].addr;
+	last = sym_table + sym_table_count++;
+
+	last->addr = -1ll;
+	last->sym = "<end>";
+
+	if (filter_end) {
+		int count;
+		for (count=0; count < sym_table_count; count ++) {
+			if (!strcmp(sym_table[count].sym, sym_filter)) {
+				sym_filter_entry = &sym_table[count];
+				break;
+			}
+		}
+	}
+	if (dump_symtab) {
+		int i;
+
+		for (i = 0; i < sym_table_count; i++)
+			fprintf(stderr, "%llx %s\n",
+				sym_table[i].addr, sym_table[i].sym);
+	}
+}
+
+
+static void parse_vmlinux(char *filename)
+{
+	FILE *file;
+	char command[PATH_MAX*2];
+	if (!filename)
+		return;
+
+	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		struct source_line *src;
+		size_t dummy = 0;
+		char *c;
+
+		src = malloc(sizeof(struct source_line));
+		assert(src != NULL);	
+		memset(src, 0, sizeof(struct source_line));
+
+		if (getline(&src->line, &dummy, file) < 0)
+			break;
+		if (!src->line)
+			break;
+
+		c = strchr(src->line, '\n');
+		if (c)
+			*c = 0;
+
+		lines = g_list_prepend(lines, src);
+
+		if (strlen(src->line)>8 && src->line[8] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+		if (strlen(src->line)>8 && src->line[16] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+	}
+	pclose(file);
+	lines = g_list_reverse(lines);
+}
+
+static void record_precise_ip(uint64_t ip)
+{
+	struct source_line *line;
+	GList *item;
+
+	item = g_list_first(lines);
+	while (item) {
+		line = item->data;
+		if (line->EIP == ip)
+			line->count++;
+		if (line->EIP > ip)
+			break;
+		item = g_list_next(item);
+	}
+}
+
+static void lookup_sym_in_vmlinux(struct sym_entry *sym)
+{
+	struct source_line *line;
+	GList *item;
+	char pattern[PATH_MAX];
+	sprintf(pattern, "<%s>:", sym->sym);
+
+	item = g_list_first(lines);
+	while (item) {
+		line = item->data;
+		if (strstr(line->line, pattern)) {
+			sym->source = item;
+			break;
+		}
+		item = g_list_next(item);
+	}
+}
+
+void show_lines(GList *item_queue, int item_queue_count)
+{
+	int i;
+	struct source_line *line;
+
+	for (i = 0; i < item_queue_count; i++) {
+		line = item_queue->data;
+		printf("%8li\t%s\n", line->count, line->line);
+		item_queue = g_list_next(item_queue);
+	}
+}
+
+#define TRACE_COUNT     3
+
+static void show_details(struct sym_entry *sym)
+{
+	struct source_line *line;
+	GList *item;
+	int displayed = 0;
+	GList *item_queue = NULL;
+	int item_queue_count = 0;
+
+	if (!sym->source)
+		lookup_sym_in_vmlinux(sym);
+	if (!sym->source)
+		return;
+
+	printf("Showing details for %s\n", sym->sym);
+
+	item = sym->source;
+	while (item) {
+		line = item->data;
+		if (displayed && strstr(line->line, ">:"))
+			break;
+
+		if (!item_queue_count)
+			item_queue = item;
+		item_queue_count ++;
+
+		if (line->count >= count_filter) {
+			show_lines(item_queue, item_queue_count);
+			item_queue_count = 0;
+			item_queue = NULL;
+		} else if (item_queue_count > TRACE_COUNT) {
+			item_queue = g_list_next(item_queue);
+			item_queue_count --;
+		}
+
+		line->count = 0;
+		displayed++;
+		if (displayed > 300)
+			break;
+		item = g_list_next(item);
+	}
+}
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(uint64_t ip, int counter)
+{
+	int left_idx, middle_idx, right_idx, idx;
+	unsigned long left, middle, right;
+
+	record_precise_ip(ip);
+
+	left_idx = 0;
+	right_idx = sym_table_count-1;
+	assert(ip <= max_ip && ip >= min_ip);
+
+	while (left_idx + 1 < right_idx) {
+		middle_idx = (left_idx + right_idx) / 2;
+
+		left   = sym_table[  left_idx].addr;
+		middle = sym_table[middle_idx].addr;
+		right  = sym_table[ right_idx].addr;
+
+		if (!(left <= middle && middle <= right)) {
+			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
+			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		}
+		assert(left <= middle && middle <= right);
+		if (!(left <= ip && ip <= right)) {
+			printf(" left: %016lx\n", left);
+			printf("   ip: %016lx\n", ip);
+			printf("right: %016lx\n", right);
+		}
+		assert(left <= ip && ip <= right);
+		/*
+		 * [ left .... target .... middle .... right ]
+		 *   => right := middle
+		 */
+		if (ip < middle) {
+			right_idx = middle_idx;
+			continue;
+		}
+		/*
+		 * [ left .... middle ... target ... right ]
+		 *   => left := middle
+		 */
+		left_idx = middle_idx;
+	}
+
+	idx = left_idx;
+
+	if (!sym_table[idx].skip)
+		sym_table[idx].count[counter]++;
+	else events--;
+}
+
+static void process_event(uint64_t ip, int counter)
+{
+	events++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_events++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+int main(int argc, char *argv[])
+{
+	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	int fd[MAX_NR_CPUS][MAX_COUNTERS];
+	int i, counter, group_fd;
+	unsigned int cpu;
+	uint64_t ip;
+	ssize_t res;
+	int ret;
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (tid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	assert(nr_cpus <= MAX_NR_CPUS);
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			cpu	= profile_cpu;
+			if (tid == -1 && profile_cpu == -1)
+				cpu = i;
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.type		= event_id[counter];
+			hw_event.raw		= event_raw[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IRQ;
+			hw_event.nmi		= nmi;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+			if (fd[i][counter] < 0) {
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(-fd[i][counter]));
+				if (fd[i][counter] == -1)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[i][counter].fd = fd[i][counter];
+			event_array[i][counter].events = POLLIN;
+		}
+	}
+
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+	last_refresh = time(NULL);
+
+	while (1) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++) {
+				res = read(fd[i][counter], (char *) &ip, sizeof(ip));
+				if (res > 0) {
+					assert(res == sizeof(ip));
+
+					process_event(ip, counter);
+				}
+			}
+		}
+
+		if (time(NULL) >= last_refresh + delay_secs) {
+			print_sym_table();
+			events = userspace_events = 0;
+		}
+
+		if (hits == events)
+			ret = poll(event_array[0], nr_cpus, 1000);
+		hits = events;
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
new file mode 100644
index 0000000..9a5808f
--- /dev/null
+++ b/Documentation/perf_counter/perfstat.c
@@ -0,0 +1,521 @@
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+ *
+ *        It summarizes the counter events of all tasks (and child tasks),
+ *        covering all CPUs that the command (or workload) executes on.
+ *        It only counts the per-task events of the workload started,
+ *        independent of how many other tasks run on those CPUs.
+ *
+ * Build with:       cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c
+ *
+ * Sample output:
+ *
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Released under the GPLv2 (not later).
+ *
+ * Percpu counter support by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
+ * Symbolic event options by: Wu Fengguang <fengguang.wu@intel.com>
+ */
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+
+#include <linux/unistd.h>
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
+
+static char *hw_event_names [] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names [] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols [] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+#define MAX_COUNTERS					64
+#define MAX_NR_CPUS					256
+
+static int			nr_counters		= 0;
+static int			nr_cpus			= 0;
+
+static int			event_id[MAX_COUNTERS]	=
+					 { -2, -5, -4, -3, 0, 1, 2, 3};
+
+static int			event_raw[MAX_COUNTERS];
+
+static int			system_wide		= 0;
+
+static void display_help(void)
+{
+	unsigned int i;
+	int e;
+
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+	printf(
+	" -e EVENT     --event=EVENT        #  symbolic-name        abbreviations");
+
+	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
+		if (e != event_symbols[i].event) {
+			e = event_symbols[i].event;
+			printf(
+	"\n                                  %2d: %-20s", e, event_symbols[i].symbol);
+		} else
+			printf(" %s", event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                                rNNN: raw event type\n\n"
+	" -s                                # system-wide collection\n\n"
+	" -c <cmd..>   --command=<cmd..>    # command+arguments to be timed.\n"
+	"\n");
+	exit(0);
+}
+
+static int type_valid(int type)
+{
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
+
+	return 1;
+}
+
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
+
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
+
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static void parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	nr_counters++;
+	if (nr_counters == MAX_COUNTERS)
+		display_help();
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			display_help();
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+}
+
+static void process_options(int argc, char *argv[])
+{
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"event",	required_argument,	NULL, 'e'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"command",	no_argument,		NULL, 'c'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:e:c:s",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			break;
+		case 's':
+			system_wide = 1;
+			break;
+		case 'e':
+			parse_events(optarg);
+			break;
+		default:
+			break;
+		}
+	}
+	if (optind == argc)
+		goto err;
+
+	if (!nr_counters)
+		nr_counters = 8;
+	else
+		nr_counters++;
+	return;
+
+err:
+	display_help();
+}
+
+char fault_here[1000000];
+
+#define PR_TASK_PERF_COUNTERS_DISABLE           31
+#define PR_TASK_PERF_COUNTERS_ENABLE            32
+
+static int fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static void create_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.type		= event_id[counter];
+	hw_event.raw		= event_raw[counter];
+	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.nmi		= 0;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+			
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
+int main(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	process_options(argc, argv);
+
+	if (system_wide) {
+		nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+		assert(nr_cpus <= MAX_NR_CPUS);
+		assert(nr_cpus >= 0);
+	} else
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu;
+		__u64 count, single_count;
+
+		count = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+					(char *) &single_count, sizeof(single_count));
+			assert(res == sizeof(single_count));
+			count += single_count;
+		}
+
+		if (!event_raw[counter] &&
+		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
+		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)\n",
+				count, event_name(counter));
+		}
+		if (!counter)
+			fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
-- 
cgit v0.10.2


From cea92ce5b07078cd62c4712d51390b09a43dba2e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:02 +0800
Subject: perf_counter tools: Merge common code into perfcounters.h

kerneltop's MAX_COUNTERS is increased from 8 to 64(the value used by perfstat).

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index cf0e30b..fe70a2c 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -65,126 +65,7 @@
 
 #include <linux/unistd.h>
 
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
+#include "perfcounters.h"
 
 const char *event_types [] = {
 	"CPU cycles",
@@ -205,21 +86,10 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE           31
-#define PR_TASK_PERF_COUNTERS_ENABLE            32
-
-#define MAX_COUNTERS		8
-
 static int			nr_counters			= -1;
 
 static __u64			count_filter		       = 100;
 
-#define MAX_NR_CPUS		256
-
 static int			event_count[MAX_COUNTERS];
 static unsigned long		event_id[MAX_COUNTERS];
 static int			event_raw[MAX_COUNTERS];
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
new file mode 100644
index 0000000..8c1559b
--- /dev/null
+++ b/Documentation/perf_counter/perfcounters.h
@@ -0,0 +1,137 @@
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE	31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index 9a5808f..a3d4a7a 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -52,133 +52,7 @@
 
 #include <linux/unistd.h>
 
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#endif
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
-
+#include "perfcounters.h"
 
 static char *hw_event_names [] = {
 	"CPU cycles",
@@ -224,9 +98,6 @@ static struct event_symbol event_symbols [] = {
 	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
 };
 
-#define MAX_COUNTERS					64
-#define MAX_NR_CPUS					256
-
 static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
@@ -388,9 +259,6 @@ err:
 
 char fault_here[1000000];
 
-#define PR_TASK_PERF_COUNTERS_DISABLE           31
-#define PR_TASK_PERF_COUNTERS_ENABLE            32
-
 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static void create_counter(int counter)
-- 
cgit v0.10.2


From f49012fad4ed2231c7380c0b1901122242b3eab0 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:03 +0800
Subject: perf_counter tools: Move perfstat supporting code into perfcounters.h

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 8c1559b..0f3764a 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -16,6 +16,14 @@
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
 /*
  * Pick up some kernel type conventions:
  */
@@ -135,3 +143,125 @@ asmlinkage int sys_perf_counter_open(
 	return ret;
 }
 
+static char *hw_event_names [] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names [] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols [] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+static int type_valid(int type)
+{
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
+
+	return 1;
+}
+
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
+
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
+
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static void parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	nr_counters++;
+	if (nr_counters == MAX_COUNTERS)
+		display_help();
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			display_help();
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+}
+
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index a3d4a7a..3364dcb 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -54,50 +54,6 @@
 
 #include "perfcounters.h"
 
-static char *hw_event_names [] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names [] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-};
-
-struct event_symbol {
-	int event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols [] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
-};
-
 static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
@@ -137,84 +93,6 @@ static void display_help(void)
 	exit(0);
 }
 
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
-static char *event_name(int ctr)
-{
-	int type = event_id[ctr];
-	static char buf[32];
-
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
-		return buf;
-	}
-	if (!type_valid(type))
-		return "unknown";
-
-	if (type >= 0)
-		return hw_event_names[type];
-
-	return sw_event_names[-type-1];
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static int match_event_symbols(char *str)
-{
-	unsigned int i;
-
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return PERF_HW_EVENTS_MAX;
-}
-
-static void parse_events(char *str)
-{
-	int type, raw;
-
-again:
-	nr_counters++;
-	if (nr_counters == MAX_COUNTERS)
-		display_help();
-
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			display_help();
-	}
-
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-}
-
 static void process_options(int argc, char *argv[])
 {
 	for (;;) {
@@ -296,14 +174,6 @@ static void create_counter(int counter)
 }
 
 
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
-
 int main(int argc, char *argv[])
 {
 	unsigned long long t0, t1;
-- 
cgit v0.10.2


From 95bb3be1b3ca4a71cc168787b675d5b7852fc6be Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:04 +0800
Subject: perf_counter tools: support symbolic event names in kerneltop

- kerneltop: --event_id => --event
- kerneltop: can accept SW event types now
- perfstat: it used to implicitly add event -2(task-clock),
	    the new code no longer does this. Shall we?

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index fe70a2c..edc5b09 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -86,13 +86,9 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-static int			nr_counters			= -1;
-
 static __u64			count_filter		       = 100;
 
 static int			event_count[MAX_COUNTERS];
-static unsigned long		event_id[MAX_COUNTERS];
-static int			event_raw[MAX_COUNTERS];
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -125,7 +121,7 @@ static void display_help(void)
 	"KernelTop Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
 	printf(
-	" -e EID    --event_id=EID     # event type ID                    [default:  0]\n"
+	" -e EID    --event=EID        # event type ID                    [default:  0]\n"
 	"                                   0: CPU cycles\n"
 	"                                   1: instructions\n"
 	"                                   2: cache accesses\n"
@@ -160,7 +156,7 @@ static void process_options(int argc, char *argv[])
 			{"cpu",		required_argument,	NULL, 'C'},
 			{"delay",	required_argument,	NULL, 'd'},
 			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event_id",	required_argument,	NULL, 'e'},
+			{"event",	required_argument,	NULL, 'e'},
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
@@ -178,8 +174,6 @@ static void process_options(int argc, char *argv[])
 
 		switch (c) {
 		case 'c':
-			if (nr_counters == -1)
-				nr_counters = 0;
 			event_count[nr_counters]	=   atoi(optarg); break;
 		case 'C':
 			/* CPU and PID are mutually exclusive */
@@ -192,18 +186,7 @@ static void process_options(int argc, char *argv[])
 		case 'd': delay_secs			=   atoi(optarg); break;
 		case 'D': dump_symtab			=              1; break;
 
-		case 'e':
-			nr_counters++;
-			if (nr_counters == MAX_COUNTERS) {
-				error = 1;
-				break;
-			}
-			if (*optarg == 'r') {
-				event_raw[nr_counters] = 1;
-				++optarg;
-			}
-			event_id[nr_counters] = strtol(optarg, NULL, 16);
-			break;
+		case 'e': error				= parse_events(optarg); break;
 
 		case 'f': count_filter			=   atoi(optarg); break;
 		case 'g': group				=   atoi(optarg); break;
@@ -226,9 +209,10 @@ static void process_options(int argc, char *argv[])
 	if (error)
 		display_help();
 
-	nr_counters++;
-	if (nr_counters < 1)
+	if (!nr_counters) {
 		nr_counters = 1;
+		event_id[0] = 0;
+	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 0f3764a..99a90d8 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -143,6 +143,10 @@ asmlinkage int sys_perf_counter_open(
 	return ret;
 }
 
+static int			nr_counters			= 0;
+static long			event_id[MAX_COUNTERS]		= { -2, -5, -4, -3, 0, 1, 2, 3};
+static int			event_raw[MAX_COUNTERS];
+
 static char *hw_event_names [] = {
 	"CPU cycles",
 	"instructions",
@@ -235,14 +239,13 @@ static int match_event_symbols(char *str)
 	return PERF_HW_EVENTS_MAX;
 }
 
-static void parse_events(char *str)
+static int parse_events(char *str)
 {
 	int type, raw;
 
 again:
-	nr_counters++;
 	if (nr_counters == MAX_COUNTERS)
-		display_help();
+		return -1;
 
 	raw = 0;
 	if (*str == 'r') {
@@ -252,16 +255,18 @@ again:
 	} else {
 		type = match_event_symbols(str);
 		if (!type_valid(type))
-			display_help();
+			return -1;
 	}
 
 	event_id[nr_counters] = type;
 	event_raw[nr_counters] = raw;
+	nr_counters++;
 
 	str = strstr(str, ",");
 	if (str) {
 		str++;
 		goto again;
 	}
-}
 
+	return 0;
+}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index 3364dcb..fd59446 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -54,14 +54,8 @@
 
 #include "perfcounters.h"
 
-static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
-static int			event_id[MAX_COUNTERS]	=
-					 { -2, -5, -4, -3, 0, 1, 2, 3};
-
-static int			event_raw[MAX_COUNTERS];
-
 static int			system_wide		= 0;
 
 static void display_help(void)
@@ -127,8 +121,6 @@ static void process_options(int argc, char *argv[])
 
 	if (!nr_counters)
 		nr_counters = 8;
-	else
-		nr_counters++;
 	return;
 
 err:
-- 
cgit v0.10.2


From e3908612d93dff9d7746d85d37c42593254bf282 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:05 +0800
Subject: perf_counter tools: Reuse event_name() in kerneltop

- can handle sw counters now
- the outputs will look slightly different

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index edc5b09..cba5cb0 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -67,16 +67,6 @@
 
 #include "perfcounters.h"
 
-const char *event_types [] = {
-	"CPU cycles",
-	"instructions",
-	"cache-refs",
-	"cache-misses",
-	"branches",
-	"branch-misses",
-	"bus cycles"
-};
-
 const unsigned int default_count[] = {
 	1000000,
 	1000000,
@@ -304,10 +294,7 @@ static void print_sym_table(void)
 		if (counter)
 			printf("/");
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			printf( "%s", event_types[event_id[counter]]);
-		else
-			printf( "raw:%04lx", event_id[counter]);
+		printf("%s", event_name(counter));
 	}
 
 	printf( "], ");
-- 
cgit v0.10.2


From f7524bda8be8be98db356d6a83ac1da451ecdb2e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:06 +0800
Subject: perf_counter tools: move remaining code into kerneltop.c

- perfstat.c can be safely removed now
- perfstat: -s => -a for system wide accounting
- kerneltop: add -S/--stat for perfstat mode
- minor adjustments to kerneltop --help, perfstat --help

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index cba5cb0..9db65a4 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+     cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
 
    Sample output:
 
@@ -26,18 +26,40 @@
               12.00 - ffffffff804ffb7f : __ip_local_out
               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
                8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
 
-  Started by Ingo Molnar <mingo@redhat.com>
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
 
-  Improvements and fixes by:
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
 
-    Arjan van de Ven <arjan@linux.intel.com>
-    Yanmin Zhang <yanmin.zhang@intel.com>
-    Mike Galbraith <efault@gmx.de>
+   Sample output:
 
-  Released under the GPL v2. (and only v2, not any later version)
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
 
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
  */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
 #define _GNU_SOURCE
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -67,18 +89,22 @@
 
 #include "perfcounters.h"
 
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
 
-static __u64			count_filter		       = 100;
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define DEF_PERFSTAT_EVENTS		{ -2, -5, -4, -3, 0, 1, 2, 3}
+
+static int			run_perfstat			=  0;
+static int			system_wide			=  0;
 
+static int			nr_counters			=  0;
+static long			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
+static int			event_raw[MAX_COUNTERS];
 static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static __u64			count_filter		       = 100;
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -96,125 +122,335 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
+static GList			*lines;
+
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
 	char			*line;
 };
 
-static GList			*lines;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	int e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
+		if (e != event_symbols[i].event) {
+			e = event_symbols[i].event;
+			printf(
+	"\n                             %2d: %-20s", e, event_symbols[i].symbol);
+		} else
+			printf(" %s", event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_perfstat_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
 
 static void display_help(void)
 {
+	if (run_perfstat)
+		return display_perfstat_help();
+
 	printf(
-	"Usage: kerneltop [<options>]\n\n"
+	"Usage: kerneltop [<options>]\n"
+	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
 	"KernelTop Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
+
+	display_events_help();
+
 	printf(
-	" -e EID    --event=EID        # event type ID                    [default:  0]\n"
-	"                                   0: CPU cycles\n"
-	"                                   1: instructions\n"
-	"                                   2: cache accesses\n"
-	"                                   3: cache misses\n"
-	"                                   4: branch instructions\n"
-	"                                   5: branch prediction misses\n"
-	"                                   6: bus cycles\n\n"
-	"                                rNNN: raw PMU events (eventsel+umask)\n\n"
+	" -S        --stat             # perfstat COMMAND\n"
+	" -a                           # system-wide collection (for perfstat)\n\n"
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT --filter=CNT          # min-event-count filter          [default: 100]\n\n"
+	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use:\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	"\n");
+	);
 
 	exit(0);
 }
 
-static void process_options(int argc, char *argv[])
+static int type_valid(int type)
 {
-	int error = 0, counter;
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
 
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
+	return 1;
+}
 
-		switch (c) {
-		case 'c':
-			event_count[nr_counters]	=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
 
-		case 'e': error				= parse_events(optarg); break;
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
 
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static int parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			return -1;
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.type		= event_id[counter];
+	hw_event.raw		= event_raw[counter];
+	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.nmi		= 0;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
 			}
-			tid				=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		default: error = 1; break;
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
 		}
 	}
-	if (error)
-		display_help();
+}
 
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
 	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
+		int cpu;
+		__u64 count, single_count;
+
+		count = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+					(char *) &single_count, sizeof(single_count));
+			assert(res == sizeof(single_count));
+			count += single_count;
+		}
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			event_count[counter] = default_count[event_id[counter]];
-		else
-			event_count[counter] = 100000;
+		if (!event_raw[counter] &&
+		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
+		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)\n",
+				count, event_name(counter));
+		}
+		if (!counter)
+			fprintf(stderr, "\n");
 	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
 }
 
+/*
+ * Symbols
+ */
+
 static uint64_t			min_ip;
 static uint64_t			max_ip = -1ll;
 
@@ -507,6 +743,9 @@ static void parse_symbols(void)
 	}
 }
 
+/*
+ * Source lines
+ */
 
 static void parse_vmlinux(char *filename)
 {
@@ -527,7 +766,7 @@ static void parse_vmlinux(char *filename)
 		char *c;
 
 		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);	
+		assert(src != NULL);
 		memset(src, 0, sizeof(struct source_line));
 
 		if (getline(&src->line, &dummy, file) < 0)
@@ -706,11 +945,100 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	if (strstr(argv[0], "perfstat"))
+		run_perfstat = 1;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': event_count[nr_counters]	=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'S': run_perfstat			=	       1; break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		if (run_perfstat)
+			nr_counters = 8;
+		else {
+			nr_counters = 1;
+			event_id[0] = 0;
+		}
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			event_count[counter] = default_count[event_id[counter]];
+		else
+			event_count[counter] = 100000;
+	}
+}
+
 int main(int argc, char *argv[])
 {
 	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int fd[MAX_NR_CPUS][MAX_COUNTERS];
 	int i, counter, group_fd;
 	unsigned int cpu;
 	uint64_t ip;
@@ -720,11 +1048,15 @@ int main(int argc, char *argv[])
 	process_options(argc, argv);
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (run_perfstat)
+		return do_perfstat(argc, argv);
+
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
-	assert(nr_cpus <= MAX_NR_CPUS);
-
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 99a90d8..32e24b9 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -11,9 +11,6 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE	31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 #define rdclock()					\
@@ -110,6 +107,7 @@ struct perf_counter_hw_event {
 	__u64			__reserved_3;
 };
 
+
 #ifdef __x86_64__
 # define __NR_perf_counter_open	295
 #endif
@@ -142,131 +140,3 @@ asmlinkage int sys_perf_counter_open(
 #endif
 	return ret;
 }
-
-static int			nr_counters			= 0;
-static long			event_id[MAX_COUNTERS]		= { -2, -5, -4, -3, 0, 1, 2, 3};
-static int			event_raw[MAX_COUNTERS];
-
-static char *hw_event_names [] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names [] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-};
-
-struct event_symbol {
-	int event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols [] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
-};
-
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
-static char *event_name(int ctr)
-{
-	int type = event_id[ctr];
-	static char buf[32];
-
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
-		return buf;
-	}
-	if (!type_valid(type))
-		return "unknown";
-
-	if (type >= 0)
-		return hw_event_names[type];
-
-	return sw_event_names[-type-1];
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static int match_event_symbols(char *str)
-{
-	unsigned int i;
-
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return PERF_HW_EVENTS_MAX;
-}
-
-static int parse_events(char *str)
-{
-	int type, raw;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			return -1;
-	}
-
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-- 
cgit v0.10.2


From ef45fa9e6c1694d3e8063f39749097a6e496b12c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:07 +0800
Subject: perf_counter tools: fix comment for sym_weight()

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 9db65a4..7bf2a51 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -473,7 +473,7 @@ static struct sym_entry		sym_table[MAX_SYMS];
 static void show_details(struct sym_entry *sym);
 
 /*
- * Ordering weight: count-1 * count-1 * ... / count-n
+ * Ordering weight: count-1 * count-2 * ... / count-n
  */
 static double sym_weight(const struct sym_entry *sym)
 {
-- 
cgit v0.10.2


From 3ab8d792b1348eaabfe550ba60902d781d160dd4 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:08 +0800
Subject: perf_counter tools: fix event_id type

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7bf2a51..7bfb0f0 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -99,7 +99,7 @@ static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
-static long			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
+static __s64			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
 static int			event_raw[MAX_COUNTERS];
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
@@ -261,11 +261,11 @@ static int type_valid(int type)
 
 static char *event_name(int ctr)
 {
-	int type = event_id[ctr];
+	__s64 type = event_id[ctr];
 	static char buf[32];
 
 	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
+		sprintf(buf, "raw 0x%llx", (long long)type);
 		return buf;
 	}
 	if (!type_valid(type))
@@ -299,7 +299,8 @@ static int match_event_symbols(char *str)
 
 static int parse_events(char *str)
 {
-	int type, raw;
+	__s64 type;
+	int raw;
 
 again:
 	if (nr_counters == MAX_COUNTERS)
-- 
cgit v0.10.2


From dda7c02f33833bfa9412ba6f0e410b0a18b42c88 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:09 +0800
Subject: perf_counter tools: cut down default count for cpu-cycles

In my system, it takes kerneltop dozens of minutes to
show up usable numbers. Make the default count 100 times
smaller fixed this long startup latency.

I'm not sure if it's the right solution though.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7bfb0f0..0bd3c13 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -132,7 +132,7 @@ struct source_line {
 
 
 const unsigned int default_count[] = {
-	1000000,
+	  10000,
 	1000000,
 	  10000,
 	  10000,
-- 
cgit v0.10.2


From af9522cf133e9be6da8525a46a9ed7e7659f0e1a Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:10 +0800
Subject: perf_counter tools: when no command is feed to perfstat, display help
 and exit

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 0bd3c13..81a68aa 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -387,6 +387,9 @@ int do_perfstat(int argc, char *argv[])
 	argc -= optind;
 	argv += optind;
 
+	if (!argc)
+		display_help();
+
 	/*
 	 * Enable counters and exec the command:
 	 */
-- 
cgit v0.10.2


From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:06 +0100
Subject: perf_counter: remove the event config bitfields

Since the bitfields turned into a bit of a mess, remove them and rely on
good old masks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.059499915@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6413d9c..d056515 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	if (!counter->hw_event.raw_type) {
-		ev = counter->hw_event.event_id;
+	if (!perf_event_raw(&counter->hw_event)) {
+		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = counter->hw_event.raw_event_id;
+		ev = perf_event_config(&counter->hw_event);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 902282d..3f95b0c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw_type) {
-		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
+	if (perf_event_raw(hw_event)) {
+		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
 	} else {
-		if (hw_event->event_id >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->event_id);
+		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 	counter->wakeup_pending = 0;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 98f5990..56099e5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -82,32 +82,37 @@ enum perf_counter_record_type {
 	PERF_RECORD_GROUP		= 2,
 };
 
+#define __PERF_COUNTER_MASK(name) 			\
+	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
+	 PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW_BITS		1
+#define PERF_COUNTER_RAW_SHIFT		63
+#define PERF_COUNTER_RAW_MASK		__PERF_COUNTER_MASK(RAW)
+
+#define PERF_COUNTER_CONFIG_BITS	63
+#define PERF_COUNTER_CONFIG_SHIFT	0
+#define PERF_COUNTER_CONFIG_MASK	__PERF_COUNTER_MASK(CONFIG)
+
+#define PERF_COUNTER_TYPE_BITS		7
+#define PERF_COUNTER_TYPE_SHIFT		56
+#define PERF_COUNTER_TYPE_MASK		__PERF_COUNTER_MASK(TYPE)
+
+#define PERF_COUNTER_EVENT_BITS		56
+#define PERF_COUNTER_EVENT_SHIFT	0
+#define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
+
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	union {
-#ifndef __BIG_ENDIAN_BITFIELD
-		struct {
-			__u64			event_id	: 56,
-						type		:  8;
-		};
-		struct {
-			__u64			raw_event_id	: 63,
-						raw_type	:  1;
-		};
-#else
-		struct {
-			__u64			type		:  8,
-						event_id	: 56;
-		};
-		struct {
-			__u64			raw_type	:  1,
-						raw_event_id	: 63;
-		};
-#endif /* __BIT_ENDIAN_BITFIELD */
-		__u64		event_config;
-	};
+	/*
+	 * The MSB of the config word signifies if the rest contains cpu
+	 * specific (raw) counter configuration data, if unset, the next
+	 * 7 bits are an event type and the rest of the bits are the event
+	 * identifier.
+	 */
+	__u64			config;
 
 	__u64			irq_period;
 	__u64			record_type;
@@ -157,6 +162,27 @@ struct perf_counter_hw_event {
 
 struct task_struct;
 
+static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_RAW_MASK;
+}
+
+static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+}
+
+static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+{
+	return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+		PERF_COUNTER_TYPE_SHIFT;
+}
+
+static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_EVENT_MASK;
+}
+
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !counter->hw_event.raw_type &&
-		counter->hw_event.type != PERF_TYPE_HARDWARE;
+	return !perf_event_raw(&counter->hw_event) &&
+		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f054b8c..ca14fc4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter)
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.event_config);
+		perf_counter_store_irq(counter, sub->hw_event.config);
 		perf_counter_store_irq(counter, atomic64_read(&sub->count));
 	}
 }
@@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return 0;
 
-	if (counter->hw_event.raw_type)
+	if (perf_event_raw(&counter->hw_event))
 		return 0;
 
-	if (counter->hw_event.type != type)
+	if (perf_event_type(&counter->hw_event) != type)
 		return 0;
 
-	if (counter->hw_event.event_id != event)
+	if (perf_event_id(&counter->hw_event) != event)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(counter->hw_event.event_id);
+	ftrace_profile_disable(perf_event_id(&counter->hw_event));
 }
 
 static const struct hw_perf_counter_ops *
 tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.event_id;
+	int event_id = perf_event_id(&counter->hw_event);
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (counter->hw_event.event_id) {
+	switch (perf_event_id(&counter->hw_event)) {
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 
@@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	hw_ops = NULL;
 
-	if (hw_event->raw_type)
+	if (perf_event_raw(hw_event)) {
 		hw_ops = hw_perf_counter_init(counter);
-	else switch (hw_event->type) {
+		goto done;
+	}
+
+	switch (perf_event_type(hw_event)) {
 	case PERF_TYPE_HARDWARE:
 		hw_ops = hw_perf_counter_init(counter);
 		break;
@@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		kfree(counter);
 		return NULL;
 	}
+done:
 	counter->hw_ops = hw_ops;
 
 	return counter;
-- 
cgit v0.10.2


From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:07 +0100
Subject: perf_counter: avoid recursion

Tracepoint events like lock_acquire and software counters like
pagefaults can recurse into the perf counter code again, avoid that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.152096433@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 56099e5..18dc17d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -328,6 +328,13 @@ struct perf_cpu_context {
 	int				active_oncpu;
 	int				max_pertask;
 	int				exclusive;
+
+	/*
+	 * Recursion avoidance:
+	 *
+	 * task, softirq, irq, nmi context
+	 */
+	int			recursion[4];
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ca14fc4..ce34bff 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/rculist.h>
+#include <linux/hardirq.h>
 
 #include <asm/irq_regs.h>
 
@@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	rcu_read_unlock();
 }
 
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+	if (in_nmi())
+		return &cpuctx->recursion[3];
+
+	if (in_irq())
+		return &cpuctx->recursion[2];
+
+	if (in_softirq())
+		return &cpuctx->recursion[1];
+
+	return &cpuctx->recursion[0];
+}
+
 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 				   u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int *recursion = perf_swcounter_recursion_context(cpuctx);
+
+	if (*recursion)
+		goto out;
+
+	(*recursion)++;
+	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
 	if (cpuctx->task_ctx) {
@@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 				nr, nmi, regs);
 	}
 
+	barrier();
+	(*recursion)--;
+
+out:
 	put_cpu_var(perf_cpu_context);
 }
 
-- 
cgit v0.10.2


From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Mar 2009 18:22:08 +0100
Subject: perf_counter: add an mmap method to allow userspace to read hardware
 counters

Impact: new feature giving performance improvement

This adds the ability for userspace to do an mmap on a hardware counter
fd and get access to a read-only page that contains the information
needed to translate a hardware counter value to the full 64-bit
counter value that would be returned by a read on the fd.  This is
useful on architectures that allow user programs to read the hardware
counters, such as PowerPC.

The mmap will only succeed if the counter is a hardware counter
monitoring the current process.

On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter
and translate it to the full 64-bit value in about 30ns using the
mmapped page, compared to about 830ns for the read syscall on the
counter, so this does give a significant performance improvement.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.297057964@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d056515..e434928 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
+		if (counter->user_page)
+			perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
+			if (counter->user_page)
+				perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
+	if (counter->user_page)
+		perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 18dc17d..40b324e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -143,6 +143,17 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
 
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+	__u32	version;		/* version number of this structure */
+	__u32	compat_version;		/* lowest version this is compat with */
+	__u32	lock;			/* seqlock for synchronization */
+	__u32	index;			/* hardware counter identifier */
+	__s64	offset;			/* add to hardware counter value */
+};
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -278,6 +289,9 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
+	/* pointer to page shared with userspace via mmap */
+	unsigned long			user_page;
+
 	/* read() / irq related data */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
@@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void);
 extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 extern void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ce34bff..d9cfd90 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
+	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);
 
@@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_counter_mmap_page *userpg;
+
+	if (!counter->user_page)
+		return;
+	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+
+	++userpg->lock;
+	smp_wmb();
+	userpg->index = counter->hw.idx;
+	userpg->offset = atomic64_read(&counter->count);
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+	smp_wmb();
+	++userpg->lock;
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (!counter->user_page)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = virt_to_page(counter->user_page);
+	get_page(vmf->page);
+	return 0;
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+	.fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = file->private_data;
+	unsigned long userpg;
+
+	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * For now, restrict to the case of a hardware counter
+	 * on the current task.
+	 */
+	if (is_software_counter(counter) || counter->task != current)
+		return -EINVAL;
+
+	userpg = counter->user_page;
+	if (!userpg) {
+		userpg = get_zeroed_page(GFP_KERNEL);
+		mutex_lock(&counter->mutex);
+		if (counter->user_page) {
+			free_page(userpg);
+			userpg = counter->user_page;
+		} else {
+			counter->user_page = userpg;
+		}
+		mutex_unlock(&counter->mutex);
+		if (!userpg)
+			return -ENOMEM;
+	}
+
+	perf_counter_update_userpage(counter);
+
+	vma->vm_flags &= ~VM_MAYWRITE;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &perf_mmap_vmops;
+	return 0;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
 	.unlocked_ioctl		= perf_ioctl,
 	.compat_ioctl		= perf_ioctl,
+	.mmap			= perf_mmap,
 };
 
 /*
-- 
cgit v0.10.2


From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 23 Mar 2009 18:22:09 +0100
Subject: mutex: add atomic_dec_and_mutex_lock()

Much like the atomic_dec_and_lock() function in which we take an hold a
spin_lock if we drop the atomic to 0 this function takes and holds the
mutex if we dec the atomic to 0.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.410913479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7..93054fc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+
 #endif
-- 
cgit v0.10.2


From b09d2501ed3d294619cbfbcf828ad39324d0e548 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 17:21:56 -0700
Subject: mutex: drop "inline" from mutex_lock() inside kernel/mutex.c

Impact: build fix

mutex_lock() is was defined inline in kernel/mutex.c, but wasn't
declared so not in <linux/mutex.h>.  This didn't cause a problem until
checkin 3a2d367d9aabac486ac4444c6c7ec7a1dab16267 added the
atomic_dec_and_mutex_lock() inline in between declaration and
definion.

This broke building with CONFIG_ALLOW_WARNINGS=n, e.g. make
allnoconfig.

Either from the source code nor the allnoconfig binary output I cannot
find any internal references to mutex_lock() in kernel/mutex.c, so
presumably this "inline" is now-useless legacy.

Cc: Eric Paris <eparis@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <tip-3a2d367d9aabac486ac4444c6c7ec7a1dab16267@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781..fd95eaa 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
  *
  * This function is similar to (but not equivalent to) down().
  */
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
 {
 	might_sleep();
 	/*
-- 
cgit v0.10.2


From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:10 +0100
Subject: perf_counter: new output ABI - part 1

Impact: Rework the perfcounter output ABI

use sys_read() only for instant data and provide mmap() output for all
async overflow data.

The first mmap() determines the size of the output buffer. The mmap()
size must be a PAGE_SIZE multiple of 1+pages, where pages must be a
power of 2 or 0. Further mmap()s of the same fd must have the same
size. Once all maps are gone, you can again mmap() with a new size.

In case of 0 extra pages there is no data output and the first page
only contains meta data.

When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, its useless, since
we'll start overwriting it the instant we report a full page.

Future work will focus on the output format (currently maintained)
where we'll likey want each entry denoted by a header which includes a
type and length.

Further future work will allow to splice() the fd, also containing the
async overflow data -- splice() would be mutually exclusive with
mmap() of the data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index e434928..d48596a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
-		if (counter->user_page)
-			perf_counter_update_userpage(counter);
+		perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
-			if (counter->user_page)
-				perf_counter_update_userpage(counter);
+			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
-	if (counter->user_page)
-		perf_counter_update_userpage(counter);
+	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 40b324e..2b5e66d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -152,6 +152,8 @@ struct perf_counter_mmap_page {
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
+
+	__u32   data_head;		/* head in the data section */
 };
 
 #ifdef __KERNEL__
@@ -218,21 +220,6 @@ struct hw_perf_counter {
 #endif
 };
 
-/*
- * Hardcoded buffer length limit for now, for IRQ-fed events:
- */
-#define PERF_DATA_BUFLEN		2048
-
-/**
- * struct perf_data - performance counter IRQ data sampling ...
- */
-struct perf_data {
-	int				len;
-	int				rd_idx;
-	int				overrun;
-	u8				data[PERF_DATA_BUFLEN];
-};
-
 struct perf_counter;
 
 /**
@@ -256,6 +243,14 @@ enum perf_counter_active_state {
 
 struct file;
 
+struct perf_mmap_data {
+	struct rcu_head			rcu_head;
+	int				nr_pages;
+	atomic_t			head;
+	struct perf_counter_mmap_page   *user_page;
+	void 				*data_pages[0];
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -289,16 +284,15 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	/* pointer to page shared with userspace via mmap */
-	unsigned long			user_page;
+	/* mmap bits */
+	struct mutex			mmap_mutex;
+	atomic_t			mmap_count;
+	struct perf_mmap_data		*data;
 
-	/* read() / irq related data */
+	/* poll related */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
 	int				wakeup_pending;
-	struct perf_data		*irqdata;
-	struct perf_data		*usrdata;
-	struct perf_data		data[2];
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d9cfd90..0dfe910 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,7 +4,8 @@
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ *
+ *  For licensing details see kernel-base/COPYING
  */
 
 #include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }
 
-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task) {
-		if (cpuctx->task_ctx != ctx)
-			return;
-		spin_lock(&ctx->lock);
-	}
-
-	/* Change the pointer NMI safe */
-	atomic_long_set((atomic_long_t *)&counter->irqdata,
-			(unsigned long) counter->usrdata);
-	counter->usrdata = oldirqdata;
-
-	if (ctx->task)
-		spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		smp_call_function_single(counter->cpu,
-					 __perf_switch_irq_data,
-					 counter, 1);
-		return counter->usrdata;
-	}
-
-retry:
-	spin_lock_irq(&ctx->lock);
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-		counter->irqdata = counter->usrdata;
-		counter->usrdata = oldirqdata;
-		spin_unlock_irq(&ctx->lock);
-		return oldirqdata;
-	}
-	spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
-	/* Might have failed, because task was scheduled out */
-	if (counter->irqdata == oldirqdata)
-		goto retry;
-
-	return counter->usrdata;
-}
-
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);
 
@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
 	u64 cntval;
 
-	if (count != sizeof(cntval))
+	if (count < sizeof(cntval))
 		return -EINVAL;
 
 	/*
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 }
 
 static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
-	if (!usrdata->len)
-		return 0;
-
-	count = min(count, (size_t)usrdata->len);
-	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
-		return -EFAULT;
-
-	/* Adjust the counters */
-	usrdata->len -= count;
-	if (!usrdata->len)
-		usrdata->rd_idx = 0;
-	else
-		usrdata->rd_idx += count;
-
-	return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter	*counter,
-		   char __user		*buf,
-		   size_t		count,
-		   int			nonblocking)
-{
-	struct perf_data *irqdata, *usrdata;
-	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res, res2;
-
-	irqdata = counter->irqdata;
-	usrdata = counter->usrdata;
-
-	if (usrdata->len + irqdata->len >= count)
-		goto read_pending;
-
-	if (nonblocking)
-		return -EAGAIN;
-
-	spin_lock_irq(&counter->waitq.lock);
-	__add_wait_queue(&counter->waitq, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (usrdata->len + irqdata->len >= count)
-			break;
-
-		if (signal_pending(current))
-			break;
-
-		if (counter->state == PERF_COUNTER_STATE_ERROR)
-			break;
-
-		spin_unlock_irq(&counter->waitq.lock);
-		schedule();
-		spin_lock_irq(&counter->waitq.lock);
-	}
-	__remove_wait_queue(&counter->waitq, &wait);
-	__set_current_state(TASK_RUNNING);
-	spin_unlock_irq(&counter->waitq.lock);
-
-	if (usrdata->len + irqdata->len < count &&
-	    counter->state != PERF_COUNTER_STATE_ERROR)
-		return -ERESTARTSYS;
-read_pending:
-	mutex_lock(&counter->mutex);
-
-	/* Drain pending data first: */
-	res = perf_copy_usrdata(usrdata, buf, count);
-	if (res < 0 || res == count)
-		goto out;
-
-	/* Switch irq buffer: */
-	usrdata = perf_switch_irq_data(counter);
-	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
-	if (res2 < 0) {
-		if (!res)
-			res = -EFAULT;
-	} else {
-		res += res2;
-	}
-out:
-	mutex_unlock(&counter->mutex);
-
-	return res;
-}
-
-static ssize_t
 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;
 
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return perf_read_hw(counter, buf, count);
-
-	case PERF_RECORD_IRQ:
-	case PERF_RECORD_GROUP:
-		return perf_read_irq_data(counter, buf, count,
-					  file->f_flags & O_NONBLOCK);
-	}
-	return -EINVAL;
+	return perf_read_hw(counter, buf, count);
 }
 
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = 0;
-	unsigned long flags;
+	unsigned int events = POLLIN;
 
 	poll_wait(file, &counter->waitq, wait);
 
-	spin_lock_irqsave(&counter->waitq.lock, flags);
-	if (counter->usrdata->len || counter->irqdata->len)
-		events |= POLLIN;
-	spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
 	return events;
 }
 
@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+					   struct perf_mmap_data *data)
 {
-	struct perf_counter_mmap_page *userpg;
-
-	if (!counter->user_page)
-		return;
-	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+	struct perf_counter_mmap_page *userpg = data->user_page;
 
+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
 	++userpg->lock;
 	smp_wmb();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
+	preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		__perf_counter_update_userpage(counter, data);
+	rcu_read_unlock();
 }
 
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;
 
-	if (!counter->user_page)
-		return VM_FAULT_SIGBUS;
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;
 
-	vmf->page = virt_to_page(counter->user_page);
+		if ((unsigned)nr > data->nr_pages)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
 	get_page(vmf->page);
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+
+	rcu_assign_pointer(counter->data, data);
+
 	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data = container_of(rcu_head,
+			struct perf_mmap_data, rcu_head);
+	int i;
+
+	free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data = counter->data;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	rcu_assign_pointer(counter->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+				      &counter->mmap_mutex)) {
+		perf_mmap_data_free(counter);
+		mutex_unlock(&counter->mmap_mutex);
+	}
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
+	.open = perf_mmap_open,
+	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned long userpg;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	unsigned long locked, lock_limit;
+	int ret = 0;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (nr_pages == 0 || !is_power_of_2(nr_pages))
 		return -EINVAL;
 
-	/*
-	 * For now, restrict to the case of a hardware counter
-	 * on the current task.
-	 */
-	if (is_software_counter(counter) || counter->task != current)
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;
 
-	userpg = counter->user_page;
-	if (!userpg) {
-		userpg = get_zeroed_page(GFP_KERNEL);
-		mutex_lock(&counter->mutex);
-		if (counter->user_page) {
-			free_page(userpg);
-			userpg = counter->user_page;
-		} else {
-			counter->user_page = userpg;
-		}
-		mutex_unlock(&counter->mutex);
-		if (!userpg)
-			return -ENOMEM;
-	}
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	locked = vma_size >>  PAGE_SHIFT;
+	locked += vma->vm_mm->locked_vm;
 
-	perf_counter_update_userpage(counter);
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		return -EPERM;
+
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count))
+		goto out;
+
+	WARN_ON(counter->data);
+	ret = perf_mmap_data_alloc(counter, nr_pages);
+	if (!ret)
+		atomic_set(&counter->mmap_count, 1);
+out:
+	mutex_unlock(&counter->mmap_mutex);
 
 	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
-	return 0;
+
+	return ret;
 }
 
 static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = {
  * Output
  */
 
-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
 {
-	struct perf_data *irqdata = counter->irqdata;
+	struct perf_mmap_data *data;
+	unsigned int offset, head, nr;
+	unsigned int len;
+	int ret, wakeup;
 
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+	rcu_read_lock();
+	ret = -ENOSPC;
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	if (!data->nr_pages)
+		goto out;
+
+	ret = -EINVAL;
+	if (size > PAGE_SIZE)
+		goto out;
+
+	do {
+		offset = head = atomic_read(&data->head);
+		head += sizeof(u64);
+	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
-		*p = data;
-		irqdata->len += sizeof(u64);
+	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+	offset &= PAGE_SIZE - 1;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, size);
+	memcpy(data->data_pages[nr] + offset, buf, len);
+	size -= len;
+
+	if (size) {
+		nr = (nr + 1) & (data->nr_pages - 1);
+		memcpy(data->data_pages[nr], buf + len, size);
+	}
+
+	/*
+	 * generate a poll() wakeup for every page boundary crossed
+	 */
+	if (wakeup) {
+		__perf_counter_update_userpage(counter, data);
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_perf_counter_pending();
+		} else
+			wake_up(&counter->waitq);
 	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+			       int nmi, struct pt_regs *regs)
+{
+	u64 entry;
+
+	entry = instruction_pointer(regs);
+
+	perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+	u64 event;
+	u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
 {
 	struct perf_counter *leader, *sub;
 
 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		struct group_entry entry;
+
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.config);
-		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+		entry.event = sub->hw_event.config;
+		entry.counter = atomic64_read(&sub->count);
+
+		perf_output_write(counter, nmi, &entry, sizeof(entry));
 	}
 }
 
@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter,
 		return;
 
 	case PERF_RECORD_IRQ:
-		perf_counter_store_irq(counter, instruction_pointer(regs));
+		perf_output_simple(counter, nmi, regs);
 		break;
 
 	case PERF_RECORD_GROUP:
-		perf_counter_handle_group(counter);
+		perf_output_group(counter, nmi);
 		break;
 	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
 }
 
 /*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
+	mutex_init(&counter->mmap_mutex);
+
 	INIT_LIST_HEAD(&counter->child_list);
 
-	counter->irqdata		= &counter->data[0];
-	counter->usrdata		= &counter->data[1];
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
-- 
cgit v0.10.2


From 803d4f3980f6e220b27311a283aab0a4d68b6709 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:11 +0100
Subject: perf_counter tools: update to new syscall ABI

update the kerneltop userspace to work with the latest syscall ABI

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.559643732@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 81a68aa..a72c9bd 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -87,20 +87,90 @@
 
 #include <linux/unistd.h>
 
-#include "perfcounters.h"
+#include "include/linux/perf_counter.h"
 
 
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int            __u32;
+typedef unsigned long long      __u64;
+typedef long long               __s64;
+
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open 295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        int ret;
+
+        ret = syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+        if (ret < 0 && ret > -4096) {
+                errno = -ret;
+                ret = -1;
+        }
+#endif
+        return ret;
+}
+
 #define MAX_COUNTERS			64
 #define MAX_NR_CPUS			256
 
-#define DEF_PERFSTAT_EVENTS		{ -2, -5, -4, -3, 0, 1, 2, 3}
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
 static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
-static __s64			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
-static int			event_raw[MAX_COUNTERS];
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
@@ -156,49 +226,63 @@ static char *sw_event_names[] = {
 	"pagefaults",
 	"context switches",
 	"CPU migrations",
+	"minor faults",
+	"major faults",
 };
 
 struct event_symbol {
-	int event;
+	__u64 event;
 	char *symbol;
 };
 
 static struct event_symbol event_symbols[] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
 };
 
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
 static void display_events_help(void)
 {
 	unsigned int i;
-	int e;
+	__u64 e;
 
 	printf(
 	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
 
-	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
-		if (e != event_symbols[i].event) {
-			e = event_symbols[i].event;
-			printf(
-	"\n                             %2d: %-20s", e, event_symbols[i].symbol);
-		} else
-			printf(" %s", event_symbols[i].symbol);
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
 	}
 
 	printf("\n"
@@ -249,44 +333,51 @@ static void display_help(void)
 	exit(0);
 }
 
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
 static char *event_name(int ctr)
 {
-	__s64 type = event_id[ctr];
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
 	static char buf[32];
 
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%llx", (long long)type);
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
 		return buf;
 	}
-	if (!type_valid(type))
-		return "unknown";
 
-	if (type >= 0)
-		return hw_event_names[type];
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
 
-	return sw_event_names[-type-1];
+	default:
+		break;
+	}
+
+	return "unknown";
 }
 
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static int match_event_symbols(char *str)
+static __u64 match_event_symbols(char *str)
 {
+	__u64 config, id;
+	int type;
 	unsigned int i;
 
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -294,31 +385,22 @@ static int match_event_symbols(char *str)
 			return event_symbols[i].event;
 	}
 
-	return PERF_HW_EVENTS_MAX;
+	return ~0ULL;
 }
 
 static int parse_events(char *str)
 {
-	__s64 type;
-	int raw;
+	__u64 config;
 
 again:
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			return -1;
-	}
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
 
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
+	event_id[nr_counters] = config;
 	nr_counters++;
 
 	str = strstr(str, ",");
@@ -342,8 +424,7 @@ static void create_perfstat_counter(int counter)
 	struct perf_counter_hw_event hw_event;
 
 	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.type		= event_id[counter];
-	hw_event.raw		= event_raw[counter];
+	hw_event.config		= event_id[counter];
 	hw_event.record_type	= PERF_RECORD_SIMPLE;
 	hw_event.nmi		= 0;
 
@@ -428,7 +509,7 @@ int do_perfstat(int argc, char *argv[])
 			count += single_count;
 		}
 
-		if (!event_raw[counter] &&
+		if (!PERF_COUNTER_RAW(event_id[counter]) &&
 		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
 		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
 
@@ -911,7 +992,7 @@ static void record_ip(uint64_t ip, int counter)
 		assert(left <= middle && middle <= right);
 		if (!(left <= ip && ip <= right)) {
 			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", ip);
+			printf("   ip: %016llx\n", ip);
 			printf("right: %016lx\n", right);
 		}
 		assert(left <= ip && ip <= right);
@@ -983,7 +1064,7 @@ static void process_options(int argc, char *argv[])
 
 		switch (c) {
 		case 'a': system_wide			=	       1; break;
-		case 'c': event_count[nr_counters]	=   atoi(optarg); break;
+		case 'c': default_interval		=   atoi(optarg); break;
 		case 'C':
 			/* CPU and PID are mutually exclusive */
 			if (tid != -1) {
@@ -1032,10 +1113,7 @@ static void process_options(int argc, char *argv[])
 		if (event_count[counter])
 			continue;
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			event_count[counter] = default_count[event_id[counter]];
-		else
-			event_count[counter] = 100000;
+		event_count[counter] = default_interval;
 	}
 }
 
@@ -1070,12 +1148,13 @@ int main(int argc, char *argv[])
 				cpu = i;
 
 			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.type		= event_id[counter];
-			hw_event.raw		= event_raw[counter];
+			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 
+			printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
+
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
-- 
cgit v0.10.2


From bcbcb37cdb67d8100acfa66df40c4d636c28c4d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:12 +0100
Subject: perf_counter tools: use mmap() output

update kerneltop to use the mmap() output to gather overflow information

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index a72c9bd..80b7905 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -84,6 +84,7 @@
 #include <sys/prctl.h>
 #include <sys/wait.h>
 #include <sys/uio.h>
+#include <sys/mman.h>
 
 #include <linux/unistd.h>
 
@@ -119,17 +120,25 @@ typedef long long               __s64;
 
 
 #ifdef __x86_64__
-# define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-# define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __powerpc__
 #define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
 #endif
 
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+
 asmlinkage int sys_perf_counter_open(
         struct perf_counter_hw_event    *hw_event_uptr          __user,
         pid_t                           pid,
@@ -181,6 +190,7 @@ static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
+static unsigned int		page_size;
 
 static char			*vmlinux;
 
@@ -1117,16 +1127,68 @@ static void process_options(int argc, char *argv[])
 	}
 }
 
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	unsigned int seq, head;
+
+repeat:
+	rmb();
+	seq = pc->lock;
+
+	if (unlikely(seq & 1)) {
+		cpu_relax();
+		goto repeat;
+	}
+
+	head = pc->data_head;
+
+	rmb();
+	if (pc->lock != seq)
+		goto repeat;
+
+	return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+
+	if (head - old > md->mask) {
+		printf("ERROR: failed to keep up with mmap data\n");
+		exit(-1);
+	}
+
+	for (; old != head;) {
+		__u64 *ptr = (__u64 *)&data[old & md->mask];
+		old += sizeof(__u64);
+
+		process_event(*ptr, md->counter);
+	}
+
+	md->prev = old;
+}
+
 int main(int argc, char *argv[])
 {
 	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
 	int i, counter, group_fd;
 	unsigned int cpu;
-	uint64_t ip;
-	ssize_t res;
 	int ret;
 
+	page_size = sysconf(_SC_PAGE_SIZE);
+
 	process_options(argc, argv);
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -1153,8 +1215,6 @@ int main(int argc, char *argv[])
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 
-			printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
-
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
@@ -1174,6 +1234,17 @@ int main(int argc, char *argv[])
 
 			event_array[i][counter].fd = fd[i][counter];
 			event_array[i][counter].events = POLLIN;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = 2*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
 		}
 	}
 
@@ -1188,14 +1259,8 @@ int main(int argc, char *argv[])
 		int hits = events;
 
 		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++) {
-				res = read(fd[i][counter], (char *) &ip, sizeof(ip));
-				if (res > 0) {
-					assert(res == sizeof(ip));
-
-					process_event(ip, counter);
-				}
-			}
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
 		}
 
 		if (time(NULL) >= last_refresh + delay_secs) {
-- 
cgit v0.10.2


From 383c5f8cd7d253be0017d8d5a39cbb78aaf70206 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:49:25 +0100
Subject: perf_counter tools: tidy up in-kernel dependencies

Remove now unified perfstat.c and perf_counter.h, and link to the
in-kernel perf_counter.h.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index b457497..666da95 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -2,7 +2,7 @@ BINS = kerneltop perfstat
 
 all: $(BINS)
 
-kerneltop: kerneltop.c perfcounters.h
+kerneltop: kerneltop.c ../../include/linux/perf_counter.h
 	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
 
 perfstat: kerneltop
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 80b7905..25e80bc 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -88,7 +88,7 @@
 
 #include <linux/unistd.h>
 
-#include "include/linux/perf_counter.h"
+#include "../../include/linux/perf_counter.h"
 
 
 /*
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
deleted file mode 100644
index 32e24b9..0000000
--- a/Documentation/perf_counter/perfcounters.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE	31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#endif
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
deleted file mode 100644
index fd59446..0000000
--- a/Documentation/perf_counter/perfstat.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
- *
- *        It summarizes the counter events of all tasks (and child tasks),
- *        covering all CPUs that the command (or workload) executes on.
- *        It only counts the per-task events of the workload started,
- *        independent of how many other tasks run on those CPUs.
- *
- * Build with:       cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c
- *
- * Sample output:
- *
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
-
- *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- *
- * Released under the GPLv2 (not later).
- *
- * Percpu counter support by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
- * Symbolic event options by: Wu Fengguang <fengguang.wu@intel.com>
- */
-#define _GNU_SOURCE
-
-#include <assert.h>
-#include <getopt.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <time.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/prctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-
-#include <linux/unistd.h>
-
-#include "perfcounters.h"
-
-static int			nr_cpus			= 0;
-
-static int			system_wide		= 0;
-
-static void display_help(void)
-{
-	unsigned int i;
-	int e;
-
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-	printf(
-	" -e EVENT     --event=EVENT        #  symbolic-name        abbreviations");
-
-	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
-		if (e != event_symbols[i].event) {
-			e = event_symbols[i].event;
-			printf(
-	"\n                                  %2d: %-20s", e, event_symbols[i].symbol);
-		} else
-			printf(" %s", event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                                rNNN: raw event type\n\n"
-	" -s                                # system-wide collection\n\n"
-	" -c <cmd..>   --command=<cmd..>    # command+arguments to be timed.\n"
-	"\n");
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"event",	required_argument,	NULL, 'e'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"command",	no_argument,		NULL, 'c'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:e:c:s",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'c':
-			break;
-		case 's':
-			system_wide = 1;
-			break;
-		case 'e':
-			parse_events(optarg);
-			break;
-		default:
-			break;
-		}
-	}
-	if (optind == argc)
-		goto err;
-
-	if (!nr_counters)
-		nr_counters = 8;
-	return;
-
-err:
-	display_help();
-}
-
-char fault_here[1000000];
-
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static void create_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.type		= event_id[counter];
-	hw_event.raw		= event_raw[counter];
-	hw_event.record_type	= PERF_RECORD_SIMPLE;
-	hw_event.nmi		= 0;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-			
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-
-int main(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	process_options(argc, argv);
-
-	if (system_wide) {
-		nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-		assert(nr_cpus <= MAX_NR_CPUS);
-		assert(nr_cpus >= 0);
-	} else
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu;
-		__u64 count, single_count;
-
-		count = 0;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-					(char *) &single_count, sizeof(single_count));
-			assert(res == sizeof(single_count));
-			count += single_count;
-		}
-
-		if (!event_raw[counter] &&
-		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
-		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)\n",
-				count, event_name(counter));
-		}
-		if (!counter)
-			fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-- 
cgit v0.10.2


From 193e8df1b465f206f8a286202680702e1c153367 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 22:23:16 +0100
Subject: perf_counter tools: fix build warning in kerneltop.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix:

 kerneltop.c: In function ‘record_ip’:
 kerneltop.c:1005: warning: format ‘%016llx’ expects type ‘long long unsigned int’, but argument 2 has type ‘uint64_t’

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 25e80bc..8f9a303 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1002,7 +1002,7 @@ static void record_ip(uint64_t ip, int counter)
 		assert(left <= middle && middle <= right);
 		if (!(left <= ip && ip <= right)) {
 			printf(" left: %016lx\n", left);
-			printf("   ip: %016llx\n", ip);
+			printf("   ip: %016lx\n", (unsigned long)ip);
 			printf("right: %016lx\n", right);
 		}
 		assert(left <= ip && ip <= right);
-- 
cgit v0.10.2


From 81cdbe0509542324ad7d3282ab67c2b6716df663 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 22:29:50 +0100
Subject: perf_counter tools: increase cpu-cycles again

Commit b7368fdd7d decreased the CPU cycles interval 100-fold, but
this is causig kerneltop failures on my Nehalem box:

 aldebaran:/home/mingo/linux/linux/Documentation/perf_counter>
 ./kerneltop
 KernelTop refresh period: 2 seconds
 ERROR: failed to keep up with mmap data

10,000 cycles is way too short.

What we should do instead on mostly-idle systems is some sort of
read/poll timeout, so that we display something every 2 seconds
for sure.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 8f9a303..2ab29b5 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -212,7 +212,7 @@ struct source_line {
 
 
 const unsigned int default_count[] = {
-	  10000,
+	1000000,
 	1000000,
 	  10000,
 	  10000,
-- 
cgit v0.10.2


From cbe46555dc4de6403cd757139d42289b5f21abb9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 24 Mar 2009 16:52:34 +1100
Subject: perf_counter tools: remove glib dependency and fix bugs in
 kerneltop.c

The glib dependency in kerneltop.c is only for a little bit of list
manipulation, and I find it inconvenient.  This adds a 'next' field to
struct source_line, which lets us link them together into a list.  The
code to do the linking ourselves turns out to be no longer or more
difficult than using glib.

This also fixes a few other problems:

- We need to #include <limits.h> to get PATH_MAX on powerpc.

- We need to #include <linux/types.h> rather than have our own
  definitions of __u64 and __s64; on powerpc the installed headers
  define them to be unsigned long and long respectively, and if we
  have our own, different definition here that causes a compile error.

- This takes out the x86 setting of errno from -ret in
  sys_perf_counter_open.  My experiments on x86 indicate that the
  glibc syscall() does this for us already.

- We had two CPU migration counters in the default set, which seems
  unnecessary; I changed one of them to a context switch counter.

- In perfstat mode we were printing CPU cycles and instructions as
  milliseconds, and the cpu clock and task clock counters as events.
  This fixes that.

- In perfstat mode we were still printing a blank line after the first
  counter, which was a holdover from when a task clock counter was
  automatically included as the first counter.  This removes the blank
  line.

- On a test machine here, parse_symbols() and parse_vmlinux() were
  taking long enough (almost 0.5 seconds) for the mmap buffer to
  overflow before we got to the first mmap_read() call, so this moves
  them before we open all the counters.

- The error message if sys_perf_counter_open fails needs to use errno,
  not -fd[i][counter].

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 666da95..194b662 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -3,7 +3,7 @@ BINS = kerneltop perfstat
 all: $(BINS)
 
 kerneltop: kerneltop.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
+	cc -O6 -Wall -lrt -o $@ $<
 
 perfstat: kerneltop
 	ln -sf kerneltop perfstat
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 2ab29b5..ea13e4e 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
 
    Sample output:
 
@@ -56,6 +56,7 @@
   *   Yanmin Zhang <yanmin.zhang@intel.com>
   *   Wu Fengguang <fengguang.wu@intel.com>
   *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
   *
   * Released under the GPL v2. (and only v2, not any later version)
   */
@@ -68,6 +69,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
@@ -76,8 +78,6 @@
 #include <ctype.h>
 #include <time.h>
 
-#include <glib.h>
-
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <sys/poll.h>
@@ -87,6 +87,7 @@
 #include <sys/mman.h>
 
 #include <linux/unistd.h>
+#include <linux/types.h>
 
 #include "../../include/linux/perf_counter.h"
 
@@ -114,11 +115,6 @@
 #define __user
 #define asmlinkage
 
-typedef unsigned int            __u32;
-typedef unsigned long long      __u64;
-typedef long long               __s64;
-
-
 #ifdef __x86_64__
 #define __NR_perf_counter_open 295
 #define rmb()		asm volatile("lfence" ::: "memory")
@@ -146,17 +142,8 @@ asmlinkage int sys_perf_counter_open(
         int                             group_fd,
         unsigned long                   flags)
 {
-        int ret;
-
-        ret = syscall(
+        return syscall(
                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-        if (ret < 0 && ret > -4096) {
-                errno = -ret;
-                ret = -1;
-        }
-#endif
-        return ret;
 }
 
 #define MAX_COUNTERS			64
@@ -170,7 +157,7 @@ static int			system_wide			=  0;
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
 
@@ -202,14 +189,15 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static GList			*lines;
-
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
 	char			*line;
+	struct source_line	*next;
 };
 
+static struct source_line	*lines;
+static struct source_line	**lines_tail;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -519,9 +507,8 @@ int do_perfstat(int argc, char *argv[])
 			count += single_count;
 		}
 
-		if (!PERF_COUNTER_RAW(event_id[counter]) &&
-		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
-		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
 
 			double msecs = (double)count / 1000000;
 
@@ -531,8 +518,6 @@ int do_perfstat(int argc, char *argv[])
 			fprintf(stderr, " %14Ld  %-20s (events)\n",
 				count, event_name(counter));
 		}
-		if (!counter)
-			fprintf(stderr, "\n");
 	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
@@ -554,7 +539,7 @@ struct sym_entry {
 	char			*sym;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
-	GList			*source;
+	struct source_line	*source;
 };
 
 #define MAX_SYMS		100000
@@ -855,6 +840,7 @@ static void parse_vmlinux(char *filename)
 	if (!file)
 		return;
 
+	lines_tail = &lines;
 	while (!feof(file)) {
 		struct source_line *src;
 		size_t dummy = 0;
@@ -873,7 +859,9 @@ static void parse_vmlinux(char *filename)
 		if (c)
 			*c = 0;
 
-		lines = g_list_prepend(lines, src);
+		src->next = NULL;
+		*lines_tail = src;
+		lines_tail = &src->next;
 
 		if (strlen(src->line)>8 && src->line[8] == ':')
 			src->EIP = strtoull(src->line, NULL, 16);
@@ -881,52 +869,43 @@ static void parse_vmlinux(char *filename)
 			src->EIP = strtoull(src->line, NULL, 16);
 	}
 	pclose(file);
-	lines = g_list_reverse(lines);
 }
 
 static void record_precise_ip(uint64_t ip)
 {
 	struct source_line *line;
-	GList *item;
 
-	item = g_list_first(lines);
-	while (item) {
-		line = item->data;
+	for (line = lines; line; line = line->next) {
 		if (line->EIP == ip)
 			line->count++;
 		if (line->EIP > ip)
 			break;
-		item = g_list_next(item);
 	}
 }
 
 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
 {
 	struct source_line *line;
-	GList *item;
 	char pattern[PATH_MAX];
 	sprintf(pattern, "<%s>:", sym->sym);
 
-	item = g_list_first(lines);
-	while (item) {
-		line = item->data;
+	for (line = lines; line; line = line->next) {
 		if (strstr(line->line, pattern)) {
-			sym->source = item;
+			sym->source = line;
 			break;
 		}
-		item = g_list_next(item);
 	}
 }
 
-void show_lines(GList *item_queue, int item_queue_count)
+static void show_lines(struct source_line *line_queue, int line_queue_count)
 {
 	int i;
 	struct source_line *line;
 
-	for (i = 0; i < item_queue_count; i++) {
-		line = item_queue->data;
+	line = line_queue;
+	for (i = 0; i < line_queue_count; i++) {
 		printf("%8li\t%s\n", line->count, line->line);
-		item_queue = g_list_next(item_queue);
+		line = line->next;
 	}
 }
 
@@ -935,10 +914,9 @@ void show_lines(GList *item_queue, int item_queue_count)
 static void show_details(struct sym_entry *sym)
 {
 	struct source_line *line;
-	GList *item;
+	struct source_line *line_queue = NULL;
 	int displayed = 0;
-	GList *item_queue = NULL;
-	int item_queue_count = 0;
+	int line_queue_count = 0;
 
 	if (!sym->source)
 		lookup_sym_in_vmlinux(sym);
@@ -947,30 +925,29 @@ static void show_details(struct sym_entry *sym)
 
 	printf("Showing details for %s\n", sym->sym);
 
-	item = sym->source;
-	while (item) {
-		line = item->data;
+	line = sym->source;
+	while (line) {
 		if (displayed && strstr(line->line, ">:"))
 			break;
 
-		if (!item_queue_count)
-			item_queue = item;
-		item_queue_count ++;
+		if (!line_queue_count)
+			line_queue = line;
+		line_queue_count ++;
 
 		if (line->count >= count_filter) {
-			show_lines(item_queue, item_queue_count);
-			item_queue_count = 0;
-			item_queue = NULL;
-		} else if (item_queue_count > TRACE_COUNT) {
-			item_queue = g_list_next(item_queue);
-			item_queue_count --;
+			show_lines(line_queue, line_queue_count);
+			line_queue_count = 0;
+			line_queue = NULL;
+		} else if (line_queue_count > TRACE_COUNT) {
+			line_queue = line_queue->next;
+			line_queue_count --;
 		}
 
 		line->count = 0;
 		displayed++;
 		if (displayed > 300)
 			break;
-		item = g_list_next(item);
+		line = line->next;
 	}
 }
 
@@ -1201,6 +1178,10 @@ int main(int argc, char *argv[])
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
@@ -1216,15 +1197,16 @@ int main(int argc, char *argv[])
 			hw_event.nmi		= nmi;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
+				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(-fd[i][counter]));
-				if (fd[i][counter] == -1)
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
 					printf("Are you root?\n");
 				exit(-1);
 			}
 			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 
 			/*
 			 * First counter acts as the group leader:
@@ -1248,10 +1230,6 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
 	printf("KernelTop refresh period: %d seconds\n", delay_secs);
 	last_refresh = time(NULL);
 
-- 
cgit v0.10.2


From 0fd112e41cd6f6d4779cbe327c3632d087e31476 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 24 Mar 2009 10:50:24 +0100
Subject: perf_counter tools: remove glib dependency and fix bugs in
 kerneltop.c, fix poll()

Paul Mackerras wrote:

> I noticed the poll stuff is bogus - we have a 2D array of struct
> pollfds (MAX_NR_CPUS x MAX_COUNTERS), we fill in a sub-array (with the
> rest being uninitialized, since the array is on the stack) and then
> pass the first nr_cpus elements to poll.  Not what we really meant, I
> suspect. :)  Not even if we only have one counter, since it's the
> counter dimension that varies fastest.

This should fix the most obvious poll fubar.. not enough to fix the
full problem though..

Reported-by: Paul Mackerras <paulus@samba.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index ea13e4e..7ebde7a 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1157,10 +1157,10 @@ static void mmap_read(struct mmap_data *md)
 
 int main(int argc, char *argv[])
 {
-	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int i, counter, group_fd;
+	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
 	int ret;
 
@@ -1214,8 +1214,9 @@ int main(int argc, char *argv[])
 			if (group && group_fd == -1)
 				group_fd = fd[i][counter];
 
-			event_array[i][counter].fd = fd[i][counter];
-			event_array[i][counter].events = POLLIN;
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
 
 			mmap_array[i][counter].counter = counter;
 			mmap_array[i][counter].prev = 0;
@@ -1247,7 +1248,7 @@ int main(int argc, char *argv[])
 		}
 
 		if (hits == events)
-			ret = poll(event_array[0], nr_cpus, 1000);
+			ret = poll(event_array, nr_poll, 1000);
 		hits = events;
 	}
 
-- 
cgit v0.10.2


From f66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Mar 2009 10:29:36 +1100
Subject: perf_counter: update documentation

Impact: documentation fix

This updates the perfcounter documentation to reflect recent changes.

Signed-off-by: Paul Mackerras <paulus@samba.org>

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index fddd321..aaf105c 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU.
 
 The Linux Performance Counter subsystem provides an abstraction of these
 hardware capabilities. It provides per task and per CPU counters, counter
-groups, and it provides event capabilities on top of those.
+groups, and it provides event capabilities on top of those.  It
+provides "virtual" 64-bit counters, regardless of the width of the
+underlying hardware counters.
 
 Performance counters are accessed via special file descriptors.
 There's one file descriptor per virtual counter used.
@@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open()
 system call:
 
    int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
-			     pid_t pid, int cpu, int group_fd);
+			     pid_t pid, int cpu, int group_fd,
+			     unsigned long flags);
 
 The syscall returns the new fd. The fd can be used via the normal
 VFS system calls: read() can be used to read the counter, fcntl()
@@ -32,90 +35,180 @@ can be poll()ed.
 When creating a new counter fd, 'perf_counter_hw_event' is:
 
 /*
- * Hardware event to monitor via a performance monitoring counter:
+ * Event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	s64			type;
+	__u64			event_config;
 
-	u64			irq_period;
-	u32			record_type;
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
 
-	u32			disabled     :  1, /* off by default */
-				nmi	     :  1, /* NMI sampling   */
-				raw	     :  1, /* raw event type */
-				__reserved_1 : 29;
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
 
-	u64			__reserved_2;
+				__reserved_1   : 55;
+
+	__u32			extra_config_len;
+
+	__u32			__reserved_4;
+	__u64			__reserved_2;
+	__u64			__reserved_3;
 };
 
+The 'event_config' field specifies what the counter should count.  It
+is divided into 3 bit-fields:
+
+raw_type: 1 bit (most significant bit)		0x8000_0000_0000_0000
+type:	  7 bits (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_0000_0000_0000
+
+If 'raw_type' is 1, then the counter will count a hardware event
+specified by the remaining 63 bits of event_config.  The encoding is
+machine-specific.
+
+If 'raw_type' is 0, then the 'type' field says what kind of counter
+this is, with the following encoding:
+
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+};
+
+A counter of PERF_TYPE_HARDWARE will count the hardware event
+specified by 'event_id':
+
 /*
- * Generalized performance counter event types, used by the hw_event.type
+ * Generalized performance counter event types, used by the hw_event.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum hw_event_types {
+enum hw_event_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	/*
-	 * Future software events:
-	 */
-	/* PERF_COUNT_PAGE_FAULTS	= -3,
-	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+	PERF_COUNT_CPU_CYCLES		= 0,
+	PERF_COUNT_INSTRUCTIONS		= 1,
+	PERF_COUNT_CACHE_REFERENCES	= 2,
+	PERF_COUNT_CACHE_MISSES		= 3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_BRANCH_MISSES	= 5,
+	PERF_COUNT_BUS_CYCLES		= 6,
 };
 
-These are standardized types of events that work uniformly on all CPUs
-that implements Performance Counters support under Linux. If a CPU is
-not able to count branch-misses, then the system call will return
--EINVAL.
+These are standardized types of events that work relatively uniformly
+on all CPUs that implement Performance Counters support under Linux,
+although there may be variations (e.g., different CPUs might count
+cache references and misses at different levels of the cache hierarchy).
+If a CPU is not able to count the selected event, then the system call
+will return -EINVAL.
 
-More hw_event_types are supported as well, but they are CPU
-specific and are enumerated via /sys on a per CPU basis. Raw hw event
-types can be passed in under hw_event.type if hw_event.raw is 1.
-For example, to count "External bus cycles while bus lock signal asserted"
-events on Intel Core CPUs, pass in a 0x4064 event type value and set
-hw_event.raw to 1.
+More hw_event_types are supported as well, but they are CPU-specific
+and accessed as raw events.  For example, to count "External bus
+cycles while bus lock signal asserted" events on Intel Core CPUs, pass
+in a 0x4064 event_id value and set hw_event.raw_type to 1.
 
-'record_type' is the type of data that a read() will provide for the
-counter, and it can be one of:
+A counter of type PERF_TYPE_SOFTWARE will count one of the available
+software events, selected by 'event_id':
 
 /*
- * IRQ-notification data record type:
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
  */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
+enum sw_event_ids {
+	PERF_COUNT_CPU_CLOCK		= 0,
+	PERF_COUNT_TASK_CLOCK		= 1,
+	PERF_COUNT_PAGE_FAULTS		= 2,
+	PERF_COUNT_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 };
 
-a "simple" counter is one that counts hardware events and allows
-them to be read out into a u64 count value. (read() returns 8 on
-a successful read of a simple counter.)
+Counters come in two flavours: counting counters and sampling
+counters.  A "counting" counter is one that is used for counting the
+number of events that occur, and is characterised by having
+irq_period = 0 and record_type = PERF_RECORD_SIMPLE.  A read() on a
+counting counter simply returns the current value of the counter as
+an 8-byte number.
 
-An "irq" counter is one that will also provide an IRQ context information:
-the IP of the interrupted context. In this case read() will return
-the 8-byte counter value, plus the Instruction Pointer address of the
-interrupted context.
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0 and record_type != PERF_RECORD_SIMPLE.  The
+record_type controls what data is recorded on each interrupt, and the
+available values are currently:
 
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		= 0,
+	PERF_RECORD_IRQ			= 1,
+	PERF_RECORD_GROUP		= 2,
+};
 
-The 'pid' parameter allows the counter to be specific to a task:
+A record_type value of PERF_RECORD_IRQ will record the instruction
+pointer (IP) at which the interrupt occurred.  A record_type value of
+PERF_RECORD_GROUP will record the event_config and counter value of
+all of the other counters in the group, and should only be used on a
+group leader (see below).  Currently these two values are mutually
+exclusive, but record_type will become a bit-mask in future and
+support other values.
+
+A sampling counter has an event queue, into which an event is placed
+on each interrupt.  A read() on a sampling counter will read the next
+event from the event queue.  If the queue is empty, the read() will
+either block or return an EAGAIN error, depending on whether the fd
+has been set to non-blocking mode or not.
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'nmi' bit specifies, for hardware events, whether the counter
+should be set up to request non-maskable interrupts (NMIs) or normal
+interrupts.  This bit is ignored if the user doesn't have
+CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
+generate NMIs from hardware counters.
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendents, not to any existing descendents at the
+time the counter is created (nor to any new descendents of existing
+descendents).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In future, this will allow sophisticated monitoring programs to supply
+extra configuration information via 'extra_config_len' to exploit
+advanced features of the CPU's Performance Monitor Unit (PMU) that are
+not otherwise accessible and that might disrupt other hardware
+counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+
+The 'pid' parameter to the perf_counter_open() system call allows the
+counter to be specific to a task:
 
  pid == 0: if the pid parameter is zero, the counter is attached to the
  current task.
@@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task:
 
  pid < 0: all tasks are counted (per cpu counters)
 
-The 'cpu' parameter allows a counter to be made specific to a full
-CPU:
+The 'cpu' parameter allows a counter to be made specific to a CPU:
 
  cpu >= 0: the counter is restricted to a specific CPU
  cpu == -1: the counter counts on all CPUs
@@ -141,7 +233,51 @@ their own tasks.
 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
 all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
 
-Group counters are created by passing in a group_fd of another counter.
-Groups are scheduled at once and can be used with PERF_RECORD_GROUP
-to record multi-dimensional timestamps.
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter - disabling an
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
 
-- 
cgit v0.10.2


From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 Mar 2009 13:18:16 +0100
Subject: perf_counter: fix perf_poll()

Impact: fix kerneltop 100% CPU usage

Only return a poll event when there's actually been one, poll_wait()
doesn't actually wait for the waitq you pass it, it only enqueues
you on it.

Only once all FDs have been iterated and none of thm returned a
poll-event will it schedule().

Also make it return POLL_HUP when there's not mmap() area to read from.

Further, fix a silly bug in the write code.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Orig-LKML-Reference: <1237897096.24918.181.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b5e66d..48212c1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -246,6 +246,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;
+	atomic_t			wakeup;
 	atomic_t			head;
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0dfe910..affe227 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = POLLIN;
+	struct perf_mmap_data *data;
+	unsigned int events;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		events = atomic_xchg(&data->wakeup, 0);
+	else
+		events = POLL_HUP;
+	rcu_read_unlock();
 
 	poll_wait(file, &counter->waitq, wait);
 
@@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 
 	do {
 		offset = head = atomic_read(&data->head);
-		head += sizeof(u64);
+		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
 	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	 * generate a poll() wakeup for every page boundary crossed
 	 */
 	if (wakeup) {
+		atomic_xchg(&data->wakeup, POLL_IN);
 		__perf_counter_update_userpage(counter, data);
 		if (nmi) {
 			counter->wakeup_pending = 1;
-- 
cgit v0.10.2


From b9cacc7bf193df16532bfa7d7ca77fe50fc3c2e6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:22 +0100
Subject: perf_counter: more elaborate write API

Provide a begin, copy, end interface to the output buffer.

begin() reserves the space,
 copy() copies the data over, considering page boundaries,
  end() finalizes the event and does the wakeup.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.740550870@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index affe227..0422fd9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/file.h>
@@ -16,15 +17,14 @@
 #include <linux/sysfs.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/rculist.h>
-#include <linux/hardirq.h>
 
 #include <asm/irq_regs.h>
 
@@ -1411,16 +1411,20 @@ static const struct file_operations perf_fops = {
  * Output
  */
 
-static int perf_output_write(struct perf_counter *counter, int nmi,
-			     void *buf, ssize_t size)
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned int		offset;
+	int			wakeup;
+};
+
+static int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size)
 {
 	struct perf_mmap_data *data;
-	unsigned int offset, head, nr;
-	unsigned int len;
-	int ret, wakeup;
+	unsigned int offset, head;
 
 	rcu_read_lock();
-	ret = -ENOSPC;
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -1428,45 +1432,82 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	if (!data->nr_pages)
 		goto out;
 
-	ret = -EINVAL;
-	if (size > PAGE_SIZE)
-		goto out;
-
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
+	handle->counter	= counter;
+	handle->data	= data;
+	handle->offset	= offset;
+	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
-	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
-	offset &= PAGE_SIZE - 1;
+	return 0;
 
-	len = min_t(unsigned int, PAGE_SIZE - offset, size);
-	memcpy(data->data_pages[nr] + offset, buf, len);
-	size -= len;
+out:
+	rcu_read_unlock();
 
-	if (size) {
-		nr = (nr + 1) & (data->nr_pages - 1);
-		memcpy(data->data_pages[nr], buf + len, size);
-	}
+	return -ENOSPC;
+}
 
-	/*
-	 * generate a poll() wakeup for every page boundary crossed
-	 */
-	if (wakeup) {
-		atomic_xchg(&data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(counter, data);
+static void perf_output_copy(struct perf_output_handle *handle,
+			     void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	    -= size;
+		buf	    += size;
+		offset	    += size;
+	} while (len);
+
+	handle->offset = offset;
+}
+
+static void perf_output_end(struct perf_output_handle *handle, int nmi)
+{
+	if (handle->wakeup) {
+		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(handle->counter, handle->data);
 		if (nmi) {
-			counter->wakeup_pending = 1;
+			handle->counter->wakeup_pending = 1;
 			set_perf_counter_pending();
 		} else
-			wake_up(&counter->waitq);
+			wake_up(&handle->counter->waitq);
 	}
-	ret = 0;
-out:
 	rcu_read_unlock();
+}
+
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
+{
+	struct perf_output_handle handle;
+	int ret;
 
+	ret = perf_output_begin(&handle, counter, size);
+	if (ret)
+		goto out;
+
+	perf_output_copy(&handle, buf, size);
+	perf_output_end(&handle, nmi);
+
+out:
 	return ret;
 }
 
-- 
cgit v0.10.2


From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:23 +0100
Subject: perf_counter: output objects

Provide a {type,size} header for each output entry.

This should provide extensible output, and the ability to mix multiple streams.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.831607932@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 48212c1..c256635 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -156,6 +156,16 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+struct perf_event_header {
+	__u32	type;
+	__u32	size;
+};
+
+enum perf_event_type {
+	PERF_EVENT_IP		= 0,
+	PERF_EVENT_GROUP	= 1,
+};
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -260,6 +270,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	int 				nr_siblings;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0422fd9..d76e311 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 */
 	if (counter->group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
-	else
+	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+		group_leader->nr_siblings++;
+	}
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 }
@@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
 
+	if (counter->group_leader != counter)
+		counter->group_leader->nr_siblings--;
+
 	/*
 	 * If this was a group counter with sibling counters then
 	 * upgrade the siblings to singleton counters by adding them
@@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader)
 
 	if (!is_software_counter(leader))
 		return 0;
+
 	list_for_each_entry(counter, &leader->sibling_list, list_entry)
 		if (!is_software_counter(counter))
 			return 0;
+
 	return 1;
 }
 
@@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	handle->offset = offset;
 }
 
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
@@ -1514,34 +1524,53 @@ out:
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
-	u64 entry;
+	struct {
+		struct perf_event_header header;
+		u64 ip;
+	} event;
 
-	entry = instruction_pointer(regs);
+	event.header.type = PERF_EVENT_IP;
+	event.header.size = sizeof(event);
+	event.ip = instruction_pointer(regs);
 
-	perf_output_write(counter, nmi, &entry, sizeof(entry));
+	perf_output_write(counter, nmi, &event, sizeof(event));
 }
 
-struct group_entry {
-	u64 event;
-	u64 counter;
-};
-
 static void perf_output_group(struct perf_counter *counter, int nmi)
 {
+	struct perf_output_handle handle;
+	struct perf_event_header header;
 	struct perf_counter *leader, *sub;
+	unsigned int size;
+	struct {
+		u64 event;
+		u64 counter;
+	} entry;
+	int ret;
+
+	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+
+	ret = perf_output_begin(&handle, counter, size);
+	if (ret)
+		return;
+
+	header.type = PERF_EVENT_GROUP;
+	header.size = size;
+
+	perf_output_put(&handle, header);
 
 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		struct group_entry entry;
-
 		if (sub != counter)
 			sub->hw_ops->read(sub);
 
 		entry.event = sub->hw_event.config;
 		entry.counter = atomic64_read(&sub->count);
 
-		perf_output_write(counter, nmi, &entry, sizeof(entry));
+		perf_output_put(&handle, entry);
 	}
+
+	perf_output_end(&handle, nmi);
 }
 
 void perf_counter_output(struct perf_counter *counter,
-- 
cgit v0.10.2


From 63e35b25d6b5c3136d22ef249dbbf96716aa08bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:24 +0100
Subject: perf_counter: sanity check on the output API

Ensure we never write more than we said we would.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.921433024@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d76e311..7669afe 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1422,6 +1422,7 @@ struct perf_output_handle {
 	struct perf_counter	*counter;
 	struct perf_mmap_data	*data;
 	unsigned int		offset;
+	unsigned int		head;
 	int			wakeup;
 };
 
@@ -1447,6 +1448,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	handle->counter	= counter;
 	handle->data	= data;
 	handle->offset	= offset;
+	handle->head	= head;
 	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
 	return 0;
@@ -1485,6 +1487,8 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	} while (len);
 
 	handle->offset = offset;
+
+	WARN_ON_ONCE(handle->offset > handle->head);
 }
 
 #define perf_output_put(handle, x) \
-- 
cgit v0.10.2


From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:25 +0100
Subject: perf_counter: optionally provide the pid/tid of the sampled task

Allow cpu wide counters to profile userspace by providing what process
the sample belongs to.

This raises the first issue with the output type, lots of these
options: group, tid, callchain, etc.. are non-exclusive and could be
combined, suggesting a bitfield.

However, things like the mmap() data stream doesn't fit in that.

How to split the type field...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.013775235@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c256635..7fdbdf8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -127,8 +127,9 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
+				include_tid    :  1, /* include the tid */
 
-				__reserved_1   : 55;
+				__reserved_1   : 54;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -164,6 +165,8 @@ struct perf_event_header {
 enum perf_event_type {
 	PERF_EVENT_IP		= 0,
 	PERF_EVENT_GROUP	= 1,
+
+	__PERF_EVENT_TID	= 0x100,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7669afe..f3e1b27 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1528,16 +1528,30 @@ out:
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
+	unsigned int size;
 	struct {
 		struct perf_event_header header;
 		u64 ip;
+		u32 pid, tid;
 	} event;
 
 	event.header.type = PERF_EVENT_IP;
-	event.header.size = sizeof(event);
 	event.ip = instruction_pointer(regs);
 
-	perf_output_write(counter, nmi, &event, sizeof(event));
+	size = sizeof(event);
+
+	if (counter->hw_event.include_tid) {
+		/* namespace issues */
+		event.pid = current->group_leader->pid;
+		event.tid = current->pid;
+
+		event.header.type |= __PERF_EVENT_TID;
+	} else
+		size -= sizeof(u64);
+
+	event.header.size = size;
+
+	perf_output_write(counter, nmi, &event, size);
 }
 
 static void perf_output_group(struct perf_counter *counter, int nmi)
-- 
cgit v0.10.2


From 4c4ba21d2c3659e4c0421533939b58a8fd9f06c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:26 +0100
Subject: perf_counter: kerneltop: mmap_pages argument

provide a knob to set the number of mmap data pages.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.104545398@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7ebde7a..3e45bf6 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -178,6 +178,7 @@ static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
+static unsigned int		mmap_pages			=  4;
 
 static char			*vmlinux;
 
@@ -326,6 +327,7 @@ static void display_help(void)
 	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	);
 
 	exit(0);
@@ -732,7 +734,9 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	/* Tag events to be skipped. */
 	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
 		s->skip = 1;
-	if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("mwait_idle", s->sym))
 		s->skip = 1;
 
 	if (filter_match == 1) {
@@ -1042,9 +1046,10 @@ static void process_options(int argc, char *argv[])
 			{"symbol",	required_argument,	NULL, 's'},
 			{"stat",	no_argument,		NULL, 'S'},
 			{"zero",	no_argument,		NULL, 'z'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1081,6 +1086,7 @@ static void process_options(int argc, char *argv[])
 		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
 		default: error = 1; break;
 		}
 	}
@@ -1134,17 +1140,30 @@ repeat:
 	return head;
 }
 
+struct timeval last_read, this_read;
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 
+	gettimeofday(&this_read, NULL);
+
 	if (head - old > md->mask) {
-		printf("ERROR: failed to keep up with mmap data\n");
-		exit(-1);
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data.  Last read %lu msecs ago.\n", msecs);
+
+		old = head;
 	}
 
+	last_read = this_read;
+
 	for (; old != head;) {
 		__u64 *ptr = (__u64 *)&data[old & md->mask];
 		old += sizeof(__u64);
@@ -1220,8 +1239,8 @@ int main(int argc, char *argv[])
 
 			mmap_array[i][counter].counter = counter;
 			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = 2*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 					PROT_READ, MAP_SHARED, fd[i][counter], 0);
 			if (mmap_array[i][counter].base == MAP_FAILED) {
 				printf("kerneltop error: failed to mmap with %d (%s)\n",
-- 
cgit v0.10.2


From 00f0ad73ac90e3fba8b4cbe4cf21b2fb9a56cb72 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:27 +0100
Subject: perf_counter: kerneltop: output event support

Teach kerneltop about the new output ABI.

XXX: anybody fancy integrating the PID/TID data into the output?

Bump the mmap_data pages a little because we bloated the output and
have to be more careful about overruns with structured data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.192910290@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 3e45bf6..fda1438 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -134,6 +134,11 @@
 #endif
 
 #define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
 
 asmlinkage int sys_perf_counter_open(
         struct perf_counter_hw_event    *hw_event_uptr          __user,
@@ -178,7 +183,7 @@ static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=  4;
+static unsigned int		mmap_pages			=  16;
 
 static char			*vmlinux;
 
@@ -1147,28 +1152,75 @@ static void mmap_read(struct mmap_data *md)
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
+	int diff;
 
 	gettimeofday(&this_read, NULL);
 
-	if (head - old > md->mask) {
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
 		struct timeval iv;
 		unsigned long msecs;
 
 		timersub(&this_read, &last_read, &iv);
 		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
 
-		fprintf(stderr, "WARNING: failed to keep up with mmap data.  Last read %lu msecs ago.\n", msecs);
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
 
+		/*
+		 * head points to a known good entry, start there.
+		 */
 		old = head;
 	}
 
 	last_read = this_read;
 
 	for (; old != head;) {
-		__u64 *ptr = (__u64 *)&data[old & md->mask];
-		old += sizeof(__u64);
+		struct event_struct {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, tid;
+		} *event = (struct event_struct *)&data[old & md->mask];
+		struct event_struct event_copy;
+
+		unsigned int size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = sizeof(*event), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
 
-		process_event(*ptr, md->counter);
+		old += size;
+
+		switch (event->header.type) {
+		case PERF_EVENT_IP:
+		case PERF_EVENT_IP | __PERF_EVENT_TID:
+			process_event(event->ip, md->counter);
+			break;
+		}
 	}
 
 	md->prev = old;
@@ -1214,6 +1266,7 @@ int main(int argc, char *argv[])
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
+			hw_event.include_tid	= 1;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
cgit v0.10.2


From 7730d8655880f41f2ea519aca2ca6a1413dfd2c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:48:31 +0100
Subject: perf_counter: allow and require one-page mmap on counting counters

A brainfart stopped single page mmap()s working. The rest of the code
should be perfectly fine with not having any data pages.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <1237981712.7972.812.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f3e1b27..95e0257 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1369,7 +1369,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	vma_size = vma->vm_end - vma->vm_start;
 	nr_pages = (vma_size / PAGE_SIZE) - 1;
 
-	if (nr_pages == 0 || !is_power_of_2(nr_pages))
+	/*
+	 * If we have data pages ensure they're a power-of-two number, so we
+	 * can do bitmasks instead of modulo.
+	 */
+	if (nr_pages != 0 && !is_power_of_2(nr_pages))
 		return -EINVAL;
 
 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
-- 
cgit v0.10.2


From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 25 Mar 2009 22:46:58 +1100
Subject: perf_counter: record time running and time enabled for each counter

Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU,
the kernel will multiplex the counters on to the hardware using
round-robin scheduling.  That isn't too bad for sampling counters, but
for counting counters it means that the value read from a counter
represents some unknown fraction of the true count of events that
occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter
is enabled for, and how long it is actually on the cpu and counting
events.  These times are recorded in nanoseconds using the task clock
for per-task counters and the cpu clock for per-cpu counters.

These values can be supplied to userspace on a read from the counter.
Userspace requests that they be supplied after the counter value by
setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or
PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field
when creating the counter.  (There is no way to change the read format
after the counter is created, though it would be possible to add some
way to do that.)

Using this information it is possible for userspace to scale the count
it reads from the counter to get an estimate of the true count:

true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never
got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will
be generally needed for interpreting the count values from counting
counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for
each counter: total_time_enabled and total_time_running are used when
the counter is in state OFF or ERROR and for reporting back to
userspace.  When the counter is in state INACTIVE or ACTIVE, it is the
tstamp_enabled, tstamp_running and tstamp_stopped values that are
relevant, and total_time_enabled and total_time_running are determined
from them.  (tstamp_stopped is only used in INACTIVE state.)  The
reason for doing it like this is that it means that only counters
being enabled or disabled at sched-in and sched-out time need to be
updated.  There are no new loops that iterate over all counters to
update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and
child_total_time_enabled fields that get added in when reporting the
totals to userspace.  They are separate fields so that they can be
atomic.  We don't want to use atomics for total_time_running,
total_time_enabled etc., because then we would have to use atomic
sequences to update them, which are slower than regular arithmetic and
memory accesses.

It is possible to measure total_time_running by adding a task_clock
counter to each group of counters, and total_time_enabled can be
measured approximately with a top-level task_clock counter (though
inaccuracies will creep in if you need to disable and enable groups
since it is not possible in general to disable/enable the top-level
task_clock counter simultaneously with another group).  However, that
adds extra overhead - I measured around 15% increase in the context
switch latency reported by lat_ctx (from lmbench) when a task_clock
counter was added to each of 2 groups, and around 25% increase when a
task_clock counter was added to each of 4 groups.  (In both cases a
top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information
with no overhead that I could measure (in fact in some cases I
measured lower times with this code, but the differences were all less
than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d48596a..df007fe 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
+	counter->tstamp_running += counter->ctx->time_now -
+		counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7fdbdf8..6bf67ce 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -103,6 +103,16 @@ enum perf_counter_record_type {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1,
+	PERF_FORMAT_TOTAL_TIME_RUNNING	=  2,
+};
+
+/*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
@@ -281,6 +291,32 @@ struct perf_counter {
 	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
+	/*
+	 * These are the total time in nanoseconds that the counter
+	 * has been enabled (i.e. eligible to run, and the task has
+	 * been scheduled in, if this is a per-task counter)
+	 * and running (scheduled onto the CPU), respectively.
+	 *
+	 * They are computed from tstamp_enabled, tstamp_running and
+	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+	 */
+	u64				total_time_enabled;
+	u64				total_time_running;
+
+	/*
+	 * These are timestamps used for computing total_time_enabled
+	 * and total_time_running when the counter is in INACTIVE or
+	 * ACTIVE state, measured in nanoseconds from an arbitrary point
+	 * in time.
+	 * tstamp_enabled: the notional time when the counter was enabled
+	 * tstamp_running: the notional time when the counter was scheduled on
+	 * tstamp_stopped: in INACTIVE state, the notional time when the
+	 *	counter was scheduled off.
+	 */
+	u64				tstamp_enabled;
+	u64				tstamp_running;
+	u64				tstamp_stopped;
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -292,6 +328,13 @@ struct perf_counter {
 	struct list_head		child_list;
 
 	/*
+	 * These accumulate total time (in nanoseconds) that children
+	 * counters have been enabled and running, respectively.
+	 */
+	atomic64_t			child_total_time_enabled;
+	atomic64_t			child_total_time_running;
+
+	/*
 	 * Protect attach/detach and child_list:
 	 */
 	struct mutex			mutex;
@@ -339,6 +382,16 @@ struct perf_counter_context {
 	int			nr_active;
 	int			is_active;
 	struct task_struct	*task;
+
+	/*
+	 * time_now is the current time in nanoseconds since an arbitrary
+	 * point in the past.  For per-task counters, this is based on the
+	 * task clock, and for per-cpu counters it is based on the cpu clock.
+	 * time_lost is an offset from the task/cpu clock, used to make it
+	 * appear that time only passes while the context is scheduled in.
+	 */
+	u64			time_now;
+	u64			time_lost;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 95e0257..3b862a7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_stopped = ctx->time_now;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -252,6 +253,60 @@ retry:
 }
 
 /*
+ * Get the current time for this context.
+ * If this is a task context, we use the task's task clock,
+ * or for a per-cpu context, we use the cpu clock.
+ */
+static u64 get_context_time(struct perf_counter_context *ctx, int update)
+{
+	struct task_struct *curr = ctx->task;
+
+	if (!curr)
+		return cpu_clock(smp_processor_id());
+
+	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx, int update)
+{
+	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	u64 run_end;
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		counter->total_time_enabled = ctx->time_now -
+			counter->tstamp_enabled;
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			run_end = counter->tstamp_stopped;
+		else
+			run_end = ctx->time_now;
+		counter->total_time_running = run_end - counter->tstamp_running;
+	}
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	update_counter_times(leader);
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		update_counter_times(counter);
+}
+
+/*
  * Cross CPU call to disable a performance counter
  */
 static void __perf_counter_disable(void *info)
@@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		update_context_time(ctx, 1);
+		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	spin_unlock_irq(&ctx->lock);
 }
@@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
+	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter,
 	return can_add_hw;
 }
 
+static void add_counter_to_ctx(struct perf_counter *counter,
+			       struct perf_counter_context *ctx)
+{
+	list_add_counter(counter, ctx);
+	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	counter->tstamp_enabled = ctx->time_now;
+	counter->tstamp_running = ctx->time_now;
+	counter->tstamp_stopped = ctx->time_now;
+}
+
 /*
  * Cross CPU call to install and enable a performance counter
  */
@@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	add_counter_to_ctx(counter, ctx);
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
@@ -548,10 +621,8 @@ retry:
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list_entry)) {
-		list_add_counter(counter, ctx);
-		ctx->nr_counters++;
-	}
+	if (list_empty(&counter->list_entry))
+		add_counter_to_ctx(counter, ctx);
 	spin_unlock_irq(&ctx->lock);
 }
 
@@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
  unlock:
@@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
+	}
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
+	update_context_time(ctx, 0);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
+	/*
+	 * Add any time since the last sched_out to the lost time
+	 * so it doesn't get included in the total_time_enabled and
+	 * total_time_running measures for counters in the context.
+	 */
+	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+
 	flags = hw_perf_save_disable();
 
 	/*
@@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -902,8 +990,10 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR)
+		if (counter->state != PERF_COUNTER_STATE_ERROR) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_OFF;
+		}
 	}
 
 	hw_perf_restore(perf_flags);
@@ -946,6 +1036,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
 	curr_rq_lock_irq_save(&flags);
+	if (ctx->is_active)
+		update_context_time(ctx, 1);
 	counter->hw_ops->read(counter);
+	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
 }
 
@@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __read, counter, 1);
+	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 	}
 
 	return atomic64_read(&counter->count);
@@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 cntval;
-
-	if (count < sizeof(cntval))
-		return -EINVAL;
+	u64 values[3];
+	int n;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return 0;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_counter_read(counter);
+	values[0] = perf_counter_read(counter);
+	n = 1;
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
 	mutex_unlock(&counter->mutex);
 
-	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+	if (count < n * sizeof(u64))
+		return -EINVAL;
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
 }
 
 static ssize_t
@@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	 * Link it up in the child's context:
 	 */
 	child_counter->task = child;
-	list_add_counter(child_counter, child_ctx);
-	child_ctx->nr_counters++;
+	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
 	/*
@@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	 * Add back the child's count to the parent's count:
 	 */
 	atomic64_add(child_val, &parent_counter->count);
+	atomic64_add(child_counter->total_time_enabled,
+		     &parent_counter->child_total_time_enabled);
+	atomic64_add(child_counter->total_time_running,
+		     &parent_counter->child_total_time_running);
 
 	/*
 	 * Remove this counter from the parent's list
@@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	if (child != current) {
 		wait_task_inactive(child, 0);
 		list_del_init(&child_counter->list_entry);
+		update_counter_times(child_counter);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
 		group_sched_out(child_counter, cpuctx, child_ctx);
+		update_counter_times(child_counter);
 
 		list_del_init(&child_counter->list_entry);
 
-- 
cgit v0.10.2


From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:02 +0200
Subject: perf_counter: unify and fix delayed counter wakeup

While going over the wakeup code I noticed delayed wakeups only work
for hardware counters but basically all software counters rely on
them.

This patch unifies and generalizes the delayed wakeup to fix this
issue.

Since we're dealing with NMI context bits here, use a cmpxchg() based
single link list implementation to track counters that have pending
wakeups.

[ This should really be generic code for delayed wakeups, but since we
  cannot use cmpxchg()/xchg() in generic code, I've let it live in the
  perf_counter code. -- Eric Dumazet could use it to aggregate the
  network wakeups. ]

Furthermore, the x86 method of using TIF flags was flawed in that its
quite possible to end up setting the bit on the idle task, loosing the
wakeup.

The powerpc method uses per-cpu storage and does appear to be
sufficient.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index cb32d57..20a44d0 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	unsigned long x;
 
@@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void);
 
 #else
 
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	return 0;
 }
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 469e963..2cd471f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending()) {
+	if (test_perf_counter_pending()) {
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
 	}
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df007fe..cde720f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -650,24 +650,6 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 /*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-	int i;
-	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-	struct perf_counter *counter;
-
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter && counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-}
-
-/*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
  * here so there is no possibility of being interrupted.
@@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 	long val;
-	int need_wakeup = 0, found = 0;
+	int found = 0;
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * immediately; otherwise we'll have do the wakeup when interrupts
 	 * get soft-enabled.
 	 */
-	if (get_perf_counter_pending() && regs->softe) {
+	if (test_perf_counter_pending() && regs->softe) {
 		irq_enter();
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 1662043..e2b0e66 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,8 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	\
-		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+#define set_perf_counter_pending()	do { } while (0)
+#define clear_perf_counter_pending()	do { } while (0)
+#define test_perf_counter_pending()	(0)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ffd5d2..8820a73 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,7 +83,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,7 +106,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -141,7 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f95b0c..7aab177 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		 */
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
-	counter->wakeup_pending = 0;
 
 	return 0;
 }
@@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/*
- * This handler is triggered by NMI contexts:
- */
-void perf_counter_notify(struct pt_regs *regs)
-{
-	struct cpu_hw_counters *cpuc;
-	unsigned long flags;
-	int bit, cpu;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
-		struct perf_counter *counter = cpuc->counters[bit];
-
-		if (!counter)
-			continue;
-
-		if (counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-
-	local_irq_restore(flags);
-}
-
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 611615a..0a813b1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
-	if (thread_info_flags & _TIF_PERF_COUNTERS) {
-		clear_thread_flag(TIF_PERF_COUNTERS);
-		perf_counter_notify(regs);
-	}
-
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bf67ce..0d83322 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -275,6 +275,10 @@ struct perf_mmap_data {
 	void 				*data_pages[0];
 };
 
+struct perf_wakeup_entry {
+	struct perf_wakeup_entry *next;
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -350,7 +354,7 @@ struct perf_counter {
 	/* poll related */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
-	int				wakeup_pending;
+	struct perf_wakeup_entry	wakeup;
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
@@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
@@ -461,7 +465,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
-static inline void perf_counter_notify(struct pt_regs *regs)		{ }
+static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)				{ }
@@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
-static inline void perf_swcounter_event(u32 event, u64 nr,
-					int nmi, struct pt_regs *regs)	{ }
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7..f70ff80 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void perf_pending_sync(struct perf_counter *counter);
+
 static void free_counter(struct perf_counter *counter)
 {
+	perf_pending_sync(counter);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1529,6 +1533,118 @@ static const struct file_operations perf_fops = {
 };
 
 /*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data) {
+		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(counter, data);
+	}
+	rcu_read_unlock();
+
+	wake_up_all(&counter->waitq);
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_counter *counter)
+{
+	struct perf_wakeup_entry **head;
+	struct perf_wakeup_entry *prev, *next;
+
+	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	head = &get_cpu_var(perf_wakeup_head);
+
+	do {
+		prev = counter->wakeup.next = *head;
+		next = &counter->wakeup;
+	} while (cmpxchg(head, prev, next) != prev);
+
+	set_perf_counter_pending();
+
+	put_cpu_var(perf_wakeup_head);
+}
+
+static int __perf_pending_run(void)
+{
+	struct perf_wakeup_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		struct perf_counter *counter = container_of(list,
+				struct perf_counter, wakeup);
+
+		list = list->next;
+
+		counter->wakeup.next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		perf_counter_wakeup(counter);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+	/*
+	 * If we flush on whatever cpu we run, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see perf_pending_handle()
+	 */
+	smp_rmb();
+	return counter->wakeup.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+	wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+	__perf_pending_run();
+}
+
+/*
  * Output
  */
 
@@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
-		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(handle->counter, handle->data);
-		if (nmi) {
-			handle->counter->wakeup_pending = 1;
-			set_perf_counter_pending();
-		} else
-			wake_up(&handle->counter->waitq);
+		if (nmi)
+			perf_pending_queue(handle->counter);
+		else
+			perf_counter_wakeup(handle->counter);
 	}
 	rcu_read_unlock();
 }
@@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
-	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 	counter->ctx			= ctx;
diff --git a/kernel/timer.c b/kernel/timer.c
index b455556..672ca25 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+	perf_counter_do_pending();
+
 	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
-- 
cgit v0.10.2


From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:03 +0200
Subject: perf_counter: fix update_userpage()

It just occured to me it is possible to have multiple contending
updates of the userpage (mmap information vs overflow vs counter).
This would break the seqlock logic.

It appear the arch code uses this from NMI context, so we cannot
possibly serialize its use, therefore separate the data_head update
from it and let it return to its original use.

The arch code needs to make sure there are no contending callers by
disabling the counter before using it -- powerpc appears to do this
nicely.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.241410660@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0d83322..8ac1885 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -160,10 +160,45 @@ struct perf_counter_hw_event {
 struct perf_counter_mmap_page {
 	__u32	version;		/* version number of this structure */
 	__u32	compat_version;		/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw counters in user-space.
+	 *
+	 * The index and offset should be read atomically using the seqlock:
+	 *
+	 *   __u32 seq, index;
+	 *   __s64 offset;
+	 *
+	 * again:
+	 *   rmb();
+	 *   seq = pc->lock;
+	 *
+	 *   if (unlikely(seq & 1)) {
+	 *     cpu_relax();
+	 *     goto again;
+	 *   }
+	 *
+	 *   index = pc->index;
+	 *   offset = pc->offset;
+	 *
+	 *   rmb();
+	 *   if (pc->lock != seq)
+	 *     goto again;
+	 *
+	 * After this, index contains architecture specific counter index + 1,
+	 * so that 0 means unavailable, offset contains the value to be added
+	 * to the result of the raw timer read to obtain this counter's value.
+	 */
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
 
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading this value should issue an rmb(), on SMP capable
+	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 */
 	__u32   data_head;		/* head in the data section */
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f70ff80..c95e923 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static void __perf_counter_update_userpage(struct perf_counter *counter,
-					   struct perf_mmap_data *data)
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
 {
-	struct perf_counter_mmap_page *userpg = data->user_page;
+	struct perf_mmap_data *data;
+	struct perf_counter_mmap_page *userpg;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	userpg = data->user_page;
 
 	/*
 	 * Disable preemption so as to not let the corresponding user-space
@@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter,
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
-	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
 	preempt_enable();
-}
-
-void perf_counter_update_userpage(struct perf_counter *counter)
-{
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data)
-		__perf_counter_update_userpage(counter, data);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	data = rcu_dereference(counter->data);
 	if (data) {
 		(void)atomic_xchg(&data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(counter, data);
+		/*
+		 * Ensure all data writes are issued before updating the
+		 * user-space data head information. The matching rmb()
+		 * will be in userspace after reading this value.
+		 */
+		smp_wmb();
+		data->user_page->data_head = atomic_read(&data->head);
 	}
 	rcu_read_unlock();
 
-- 
cgit v0.10.2


From 195564390210977954fe4ef45b39cdee34f41b59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:04 +0200
Subject: perf_counter: kerneltop: simplify data_head read

Now that the kernel side changed, match up again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.327144324@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index fda1438..2779c57 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1125,22 +1125,10 @@ struct mmap_data {
 static unsigned int mmap_read_head(struct mmap_data *md)
 {
 	struct perf_counter_mmap_page *pc = md->base;
-	unsigned int seq, head;
-
-repeat:
-	rmb();
-	seq = pc->lock;
-
-	if (unlikely(seq & 1)) {
-		cpu_relax();
-		goto repeat;
-	}
+	int head;
 
 	head = pc->data_head;
-
 	rmb();
-	if (pc->lock != seq)
-		goto repeat;
 
 	return head;
 }
-- 
cgit v0.10.2


From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:05 +0200
Subject: perf_counter: executable mmap() information

Currently the profiling information returns userspace IPs but no way
to correlate them to userspace code. Userspace could look into
/proc/$pid/maps but that might not be current or even present anymore
at the time of analyzing the IPs.

Therefore provide means to track the mmap information and provide it
in the output stream.

XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <20090330171023.417259499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8ac1885..037a811 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -137,9 +137,11 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
-				include_tid    :  1, /* include the tid */
+				include_tid    :  1, /* include the tid       */
+				mmap           :  1, /* include mmap data     */
+				munmap         :  1, /* include munmap data   */
 
-				__reserved_1   : 54;
+				__reserved_1   : 52;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -211,6 +213,9 @@ enum perf_event_type {
 	PERF_EVENT_IP		= 0,
 	PERF_EVENT_GROUP	= 1,
 
+	PERF_EVENT_MMAP		= 2,
+	PERF_EVENT_MUNMAP	= 3,
+
 	__PERF_EVENT_TID	= 0x100,
 };
 
@@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter)
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
 
+extern void perf_counter_mmap(unsigned long addr, unsigned long len,
+			      unsigned long pgoff, struct file *file);
+
+extern void perf_counter_munmap(unsigned long addr, unsigned long len,
+				unsigned long pgoff, struct file *file);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 static inline void
 perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
 
+
+static inline void
+perf_counter_mmap(unsigned long addr, unsigned long len,
+		  unsigned long pgoff, struct file *file)		{ }
+
+static inline void
+perf_counter_munmap(unsigned long addr, unsigned long len,
+		    unsigned long pgoff, struct file *file) 		{ }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c95e923..f35e89e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -25,6 +25,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/dcache.h>
 
 #include <asm/irq_regs.h>
 
@@ -1844,6 +1845,150 @@ void perf_counter_output(struct perf_counter *counter,
 }
 
 /*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+	struct file	*file;
+	char		*file_name;
+	int		file_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+		u64				start;
+		u64				len;
+		u64				pgoff;
+	} event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+				     struct perf_mmap_event *mmap_event)
+{
+	struct perf_output_handle handle;
+	int size = mmap_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, mmap_event->event);
+	perf_output_copy(&handle, mmap_event->file_name,
+				   mmap_event->file_size);
+	perf_output_end(&handle, 0);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+				   struct perf_mmap_event *mmap_event)
+{
+	if (counter->hw_event.mmap &&
+	    mmap_event->event.header.type == PERF_EVENT_MMAP)
+		return 1;
+
+	if (counter->hw_event.munmap &&
+	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+				  struct perf_mmap_event *mmap_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_mmap_match(counter, mmap_event))
+			perf_counter_mmap_output(counter, mmap_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct file *file = mmap_event->file;
+	unsigned int size;
+	char tmp[16];
+	char *buf = NULL;
+	char *name;
+
+	if (file) {
+		buf = kzalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = strncpy(tmp, "//enomem", sizeof(tmp));
+			goto got_name;
+		}
+		name = dentry_path(file->f_dentry, buf, PATH_MAX);
+		if (IS_ERR(name)) {
+			name = strncpy(tmp, "//toolong", sizeof(tmp));
+			goto got_name;
+		}
+	} else {
+		name = strncpy(tmp, "//anon", sizeof(tmp));
+		goto got_name;
+	}
+
+got_name:
+	size = ALIGN(strlen(name), sizeof(u64));
+
+	mmap_event->file_name = name;
+	mmap_event->file_size = size;
+
+	mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+	put_cpu_var(perf_cpu_context);
+
+	perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
+
+	kfree(buf);
+}
+
+void perf_counter_mmap(unsigned long addr, unsigned long len,
+		       unsigned long pgoff, struct file *file)
+{
+	struct perf_mmap_event mmap_event = {
+		.file   = file,
+		.event  = {
+			.header = { .type = PERF_EVENT_MMAP, },
+			.pid	= current->group_leader->pid,
+			.tid	= current->pid,
+			.start  = addr,
+			.len    = len,
+			.pgoff  = pgoff,
+		},
+	};
+
+	perf_counter_mmap_event(&mmap_event);
+}
+
+void perf_counter_munmap(unsigned long addr, unsigned long len,
+			 unsigned long pgoff, struct file *file)
+{
+	struct perf_mmap_event mmap_event = {
+		.file   = file,
+		.event  = {
+			.header = { .type = PERF_EVENT_MUNMAP, },
+			.pid	= current->group_leader->pid,
+			.tid	= current->pid,
+			.start  = addr,
+			.len    = len,
+			.pgoff  = pgoff,
+		},
+	};
+
+	perf_counter_mmap_event(&mmap_event);
+}
+
+/*
  * Generic software counter infrastructure
  */
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a38411..1df63f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1223,6 +1224,9 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
+	if (vm_flags & VM_EXEC)
+		perf_counter_mmap(addr, len, pgoff, file);
+
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 	do {
 		long nrpages = vma_pages(vma);
 
+		if (vma->vm_flags & VM_EXEC) {
+			perf_counter_munmap(vma->vm_start,
+					nrpages << PAGE_SHIFT,
+					vma->vm_pgoff, vma->vm_file);
+		}
+
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
-- 
cgit v0.10.2


From 3c1ba6fafecaed295017881f8863a18602f32c1d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:06 +0200
Subject: perf_counter: kerneltop: parse the mmap data stream

frob the kerneltop code to print the mmap data in the stream

Better use would be collecting the IPs per PID and mapping them onto
the provided userspace code.. TODO

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.501902515@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 2779c57..995111d 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -184,6 +184,8 @@ static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
+static int			use_mmap			= 0;
+static int			use_munmap			= 0;
 
 static char			*vmlinux;
 
@@ -333,6 +335,8 @@ static void display_help(void)
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -M        --mmap_info        # print mmap info stream\n"
+	" -U        --munmap_info      # print munmap info stream\n"
 	);
 
 	exit(0);
@@ -1052,9 +1056,11 @@ static void process_options(int argc, char *argv[])
 			{"stat",	no_argument,		NULL, 'S'},
 			{"zero",	no_argument,		NULL, 'z'},
 			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1092,6 +1098,8 @@ static void process_options(int argc, char *argv[])
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'M': use_mmap			=              1; break;
+		case 'U': use_munmap			=              1; break;
 		default: error = 1; break;
 		}
 	}
@@ -1172,12 +1180,29 @@ static void mmap_read(struct mmap_data *md)
 	last_read = this_read;
 
 	for (; old != head;) {
-		struct event_struct {
+		struct ip_event {
 			struct perf_event_header header;
 			__u64 ip;
 			__u32 pid, tid;
-		} *event = (struct event_struct *)&data[old & md->mask];
-		struct event_struct event_copy;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, tid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
 
 		unsigned int size = event->header.size;
 
@@ -1187,7 +1212,7 @@ static void mmap_read(struct mmap_data *md)
 		 */
 		if ((old & md->mask) + size != ((old + size) & md->mask)) {
 			unsigned int offset = old;
-			unsigned int len = sizeof(*event), cpy;
+			unsigned int len = min(sizeof(*event), size), cpy;
 			void *dst = &event_copy;
 
 			do {
@@ -1206,7 +1231,18 @@ static void mmap_read(struct mmap_data *md)
 		switch (event->header.type) {
 		case PERF_EVENT_IP:
 		case PERF_EVENT_IP | __PERF_EVENT_TID:
-			process_event(event->ip, md->counter);
+			process_event(event->ip.ip, md->counter);
+			break;
+
+		case PERF_EVENT_MMAP:
+		case PERF_EVENT_MUNMAP:
+			printf("%s: %Lu %Lu %Lu %s\n",
+					event->header.type == PERF_EVENT_MMAP
+					  ? "mmap" : "munmap",
+					event->mmap.start,
+					event->mmap.len,
+					event->mmap.pgoff,
+					event->mmap.filename);
 			break;
 		}
 	}
@@ -1255,6 +1291,8 @@ int main(int argc, char *argv[])
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 			hw_event.include_tid	= 1;
+			hw_event.mmap		= use_mmap;
+			hw_event.munmap		= use_munmap;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
cgit v0.10.2


From 7595d63b3a9ce65d14c4fbd0e7de448a343d7215 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:07 +0200
Subject: perf_counter: powerpc: only reserve PMU hardware when we need it

Impact: cooperate with oprofile

At present, on PowerPC, if you have perf_counters compiled in, oprofile
doesn't work.  There is code to allow the PMU to be shared between
competing subsystems, such as perf_counters and oprofile, but currently
the perf_counter subsystem reserves the PMU for itself at boot time,
and never releases it.

This makes perf_counter play nicely with oprofile.  Now we keep a count
of how many perf_counter instances are counting hardware events, and
reserve the PMU when that count becomes non-zero, and release the PMU
when that count becomes zero.  This means that it is possible to have
perf_counters compiled in and still use oprofile, as long as there are
no hardware perf_counters active.  This also means that if oprofile is
active, sys_perf_counter_open will fail if the hw_event specifies a
hardware event.

To avoid races with other tasks creating and destroying perf_counters,
we use a mutex.  We use atomic_inc_not_zero and atomic_add_unless to
avoid having to take the mutex unless there is a possibility of the
count going between 0 and 1.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.627912475@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index cde720f..560dd1e 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -41,6 +41,8 @@ struct power_pmu *ppmu;
  */
 static unsigned int freeze_counters_kernel = MMCR0_FCS;
 
+static void perf_counter_interrupt(struct pt_regs *regs);
+
 void perf_counter_print_debug(void)
 {
 }
@@ -594,6 +596,24 @@ struct hw_perf_counter_ops power_perf_ops = {
 	.read = power_perf_read
 };
 
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (!atomic_add_unless(&num_counters, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_counters) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
@@ -601,6 +621,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	int n;
+	int err;
 
 	if (!ppmu)
 		return NULL;
@@ -646,6 +667,27 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+
+	/*
+	 * See if we need to reserve the PMU.
+	 * If no counters are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 &&
+		    reserve_pmc_hardware(perf_counter_interrupt))
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	counter->destroy = hw_perf_counter_destroy;
+
+	if (err)
+		return NULL;
 	return &power_perf_ops;
 }
 
@@ -769,11 +811,6 @@ static int init_perf_counters(void)
 {
 	unsigned long pvr;
 
-	if (reserve_pmc_hardware(perf_counter_interrupt)) {
-		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
-		return -EBUSY;
-	}
-
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
-- 
cgit v0.10.2


From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:08 +0200
Subject: perf_counter: make it possible for hw_perf_counter_init to return
 error codes

Impact: better error reporting

At present, if hw_perf_counter_init encounters an error, all it can do
is return NULL, which causes sys_perf_counter_open to return an EINVAL
error to userspace.  This isn't very informative for userspace; it means
that userspace can't tell the difference between "sorry, oprofile is
already using the PMU" and "we don't support this CPU" and "this CPU
doesn't support the requested generic hardware event".

This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let
hw_perf_counter_init return an error code on error rather than just NULL
if it wishes.  If it does so, that error code will be returned from
sys_perf_counter_open to userspace.  If it returns NULL, an EINVAL
error will be returned to userspace, as before.

This also adapts the powerpc hw_perf_counter_init to make use of this
to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate.  It would
be good to add extra error numbers in future to allow userspace to
distinguish the various errors that are currently reported as EINVAL,
i.e. irq_period < 0, too many events in a group, conflict between
exclude_* settings in a group, and PMU resource conflict in a group.

[ v2: fix a bug pointed out by Corey Ashford where error returns from
      hw_perf_counter_init were not handled correctly in the case of
      raw hardware events.]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.682428180@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 560dd1e..0a4d14f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -624,13 +624,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 	int err;
 
 	if (!ppmu)
-		return NULL;
+		return ERR_PTR(-ENXIO);
 	if ((s64)counter->hw_event.irq_period < 0)
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return NULL;
+			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
 		ev = perf_event_config(&counter->hw_event);
@@ -656,14 +656,14 @@ hw_perf_counter_init(struct perf_counter *counter)
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
 				   ctrs, events);
 		if (n < 0)
-			return NULL;
+			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (power_check_constraints(events, n + 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
@@ -687,7 +687,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->destroy = hw_perf_counter_destroy;
 
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 	return &power_perf_ops;
 }
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f35e89e..d07b452 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2453,10 +2453,11 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
 	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
+	long err;
 
 	counter = kzalloc(sizeof(*counter), gfpflags);
 	if (!counter)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Single counters are their own group leaders, with an
@@ -2505,12 +2506,18 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		hw_ops = tp_perf_counter_init(counter);
 		break;
 	}
+done:
+	err = 0;
+	if (!hw_ops)
+		err = -EINVAL;
+	else if (IS_ERR(hw_ops))
+		err = PTR_ERR(hw_ops);
 
-	if (!hw_ops) {
+	if (err) {
 		kfree(counter);
-		return NULL;
+		return ERR_PTR(err);
 	}
-done:
+
 	counter->hw_ops = hw_ops;
 
 	return counter;
@@ -2583,10 +2590,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 			goto err_put_context;
 	}
 
-	ret = -EINVAL;
 	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
 				     GFP_KERNEL);
-	if (!counter)
+	ret = PTR_ERR(counter);
+	if (IS_ERR(counter))
 		goto err_put_context;
 
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
@@ -2658,8 +2665,8 @@ inherit_counter(struct perf_counter *parent_counter,
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
 					   parent_counter->cpu, child_ctx,
 					   group_leader, GFP_KERNEL);
-	if (!child_counter)
-		return NULL;
+	if (IS_ERR(child_counter))
+		return child_counter;
 
 	/*
 	 * Link it up in the child's context:
@@ -2710,15 +2717,17 @@ static int inherit_group(struct perf_counter *parent_counter,
 {
 	struct perf_counter *leader;
 	struct perf_counter *sub;
+	struct perf_counter *child_ctr;
 
 	leader = inherit_counter(parent_counter, parent, parent_ctx,
 				 child, NULL, child_ctx);
-	if (!leader)
-		return -ENOMEM;
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
 	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
-		if (!inherit_counter(sub, parent, parent_ctx,
-				     child, leader, child_ctx))
-			return -ENOMEM;
+		child_ctr = inherit_counter(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
 	}
 	return 0;
 }
-- 
cgit v0.10.2


From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:09 +0200
Subject: perf_counter: x86: proper error propagation for the x86
 hw_perf_counter_init()

Now that Paul cleaned up the error propagation paths, pass down the
x86 error as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.792822360@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7aab177..b8885cc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	err = __hw_perf_counter_init(counter);
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 
 	return &x86_perf_counter_ops;
 }
-- 
cgit v0.10.2


From 31f004df8d14212f0a8a2fb12a8ed44a3d80e2fb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:10 +0200
Subject: perf_counter tools: optionally scale counter values in perfstat mode

Impact: new functionality

This adds add an option to the perfstat mode of kerneltop to scale the
reported counter values according to the fraction of time that each
counter gets to count.  This is invoked with the -l option (I used 'l'
because s, c, a and e were all taken already.)  This uses the new
PERF_RECORD_TOTAL_TIME_{ENABLED,RUNNING} read format options.

With this, we get output like this:

$ ./perfstat -l -e 0:0,0:1,0:2,0:3,0:4,0:5 ./spin

 Performance counter stats for './spin':

     4016072055  CPU cycles           (events)  (scaled from 66.53%)
     2005887318  instructions         (events)  (scaled from 66.53%)
        1762849  cache references     (events)  (scaled from 66.69%)
         165229  cache misses         (events)  (scaled from 66.85%)
     1001298009  branches             (events)  (scaled from 66.78%)
          41566  branch misses        (events)  (scaled from 66.61%)

 Wall-clock time elapsed:  2438.227446 msecs

This also lets us detect when a counter is zero because the counter
never got to go on the CPU at all.  In that case we print <not counted>
rather than 0.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.871484899@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 995111d..c0ca015 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -197,6 +197,8 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
+static int			scale;
+
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
@@ -305,6 +307,7 @@ static void display_perfstat_help(void)
 	display_events_help();
 
 	printf(
+	" -l                           # scale counter values\n"
 	" -a                           # system-wide collection\n");
 	exit(0);
 }
@@ -328,6 +331,7 @@ static void display_help(void)
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -l                           # show scale factor for RR events\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
 	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
@@ -436,6 +440,9 @@ static void create_perfstat_counter(int counter)
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= PERF_RECORD_SIMPLE;
 	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
@@ -507,28 +514,53 @@ int do_perfstat(int argc, char *argv[])
 	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu;
-		__u64 count, single_count;
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
 
-		count = 0;
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
 			res = read(fd[cpu][counter],
-					(char *) &single_count, sizeof(single_count));
-			assert(res == sizeof(single_count));
-			count += single_count;
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
 		}
 
 		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
 		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
 
-			double msecs = (double)count / 1000000;
+			double msecs = (double)count[0] / 1000000;
 
-			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
 				msecs, event_name(counter));
 		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)\n",
-				count, event_name(counter));
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
 		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
 	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
@@ -1049,6 +1081,7 @@ static void process_options(int argc, char *argv[])
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
+			{"scale",	no_argument,		NULL, 'l'},
 			{"nmi",		required_argument,	NULL, 'n'},
 			{"pid",		required_argument,	NULL, 'p'},
 			{"vmlinux",	required_argument,	NULL, 'x'},
@@ -1060,7 +1093,7 @@ static void process_options(int argc, char *argv[])
 			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1084,6 +1117,7 @@ static void process_options(int argc, char *argv[])
 		case 'f': count_filter			=   atoi(optarg); break;
 		case 'g': group				=   atoi(optarg); break;
 		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
 		case 'n': nmi				=   atoi(optarg); break;
 		case 'p':
 			/* CPU and PID are mutually exclusive */
-- 
cgit v0.10.2


From 78d613eb129fc4edf0e2cabfcc6a4c5285482d21 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:11 +0200
Subject: perf_counter: small cleanup of the output routines

Move the nmi argument to the _begin() function, so that _end() only needs the
handle. This allows the _begin() function to generate a wakeup on event loss.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.959404268@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d07b452..4471e7e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1663,10 +1663,20 @@ struct perf_output_handle {
 	unsigned int		offset;
 	unsigned int		head;
 	int			wakeup;
+	int			nmi;
 };
 
+static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+{
+	if (handle->nmi)
+		perf_pending_queue(handle->counter);
+	else
+		perf_counter_wakeup(handle->counter);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size)
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
@@ -1676,15 +1686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
+	handle->counter	= counter;
+	handle->nmi	= nmi;
+
 	if (!data->nr_pages)
-		goto out;
+		goto fail;
 
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->counter	= counter;
 	handle->data	= data;
 	handle->offset	= offset;
 	handle->head	= head;
@@ -1692,6 +1704,8 @@ static int perf_output_begin(struct perf_output_handle *handle,
 
 	return 0;
 
+fail:
+	__perf_output_wakeup(handle);
 out:
 	rcu_read_unlock();
 
@@ -1733,14 +1747,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 #define perf_output_put(handle, x) \
 	perf_output_copy((handle), &(x), sizeof(x))
 
-static void perf_output_end(struct perf_output_handle *handle, int nmi)
+static void perf_output_end(struct perf_output_handle *handle)
 {
-	if (handle->wakeup) {
-		if (nmi)
-			perf_pending_queue(handle->counter);
-		else
-			perf_counter_wakeup(handle->counter);
-	}
+	if (handle->wakeup)
+		__perf_output_wakeup(handle);
 	rcu_read_unlock();
 }
 
@@ -1750,12 +1760,12 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	struct perf_output_handle handle;
 	int ret;
 
-	ret = perf_output_begin(&handle, counter, size);
+	ret = perf_output_begin(&handle, counter, size, nmi);
 	if (ret)
 		goto out;
 
 	perf_output_copy(&handle, buf, size);
-	perf_output_end(&handle, nmi);
+	perf_output_end(&handle);
 
 out:
 	return ret;
@@ -1804,7 +1814,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi)
 
 	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
 
-	ret = perf_output_begin(&handle, counter, size);
+	ret = perf_output_begin(&handle, counter, size, nmi);
 	if (ret)
 		return;
 
@@ -1824,7 +1834,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi)
 		perf_output_put(&handle, entry);
 	}
 
-	perf_output_end(&handle, nmi);
+	perf_output_end(&handle);
 }
 
 void perf_counter_output(struct perf_counter *counter,
@@ -1869,7 +1879,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 {
 	struct perf_output_handle handle;
 	int size = mmap_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size);
+	int ret = perf_output_begin(&handle, counter, size, 0);
 
 	if (ret)
 		return;
@@ -1877,7 +1887,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 	perf_output_put(&handle, mmap_event->event);
 	perf_output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
-	perf_output_end(&handle, 0);
+	perf_output_end(&handle);
 }
 
 static int perf_counter_mmap_match(struct perf_counter *counter,
-- 
cgit v0.10.2


From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:12 +0200
Subject: perf_counter: re-arrange the perf_event_type

Breaks ABI yet again :-)

Change the event type so that [0, 2^31-1] are regular event types, but
[2^31, 2^32-1] forms a bitmask for overflow events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.047961770@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 037a811..edf5bfb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -210,13 +210,15 @@ struct perf_event_header {
 };
 
 enum perf_event_type {
-	PERF_EVENT_IP		= 0,
+
 	PERF_EVENT_GROUP	= 1,
 
 	PERF_EVENT_MMAP		= 2,
 	PERF_EVENT_MUNMAP	= 3,
 
-	__PERF_EVENT_TID	= 0x100,
+	PERF_EVENT_OVERFLOW	= 1UL << 31,
+	__PERF_EVENT_IP		= 1UL << 30,
+	__PERF_EVENT_TID	= 1UL << 29,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4471e7e..d93e9dd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static int perf_output_write(struct perf_counter *counter, int nmi,
-			     void *buf, ssize_t size)
-{
-	struct perf_output_handle handle;
-	int ret;
-
-	ret = perf_output_begin(&handle, counter, size, nmi);
-	if (ret)
-		goto out;
-
-	perf_output_copy(&handle, buf, size);
-	perf_output_end(&handle);
-
-out:
-	return ret;
-}
-
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
-	unsigned int size;
+	int ret;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	u64 ip;
 	struct {
-		struct perf_event_header header;
-		u64 ip;
 		u32 pid, tid;
-	} event;
+	} tid_entry;
 
-	event.header.type = PERF_EVENT_IP;
-	event.ip = instruction_pointer(regs);
+	header.type = PERF_EVENT_OVERFLOW;
+	header.size = sizeof(header);
 
-	size = sizeof(event);
+	ip = instruction_pointer(regs);
+	header.type |= __PERF_EVENT_IP;
+	header.size += sizeof(ip);
 
 	if (counter->hw_event.include_tid) {
 		/* namespace issues */
-		event.pid = current->group_leader->pid;
-		event.tid = current->pid;
+		tid_entry.pid = current->group_leader->pid;
+		tid_entry.tid = current->pid;
 
-		event.header.type |= __PERF_EVENT_TID;
-	} else
-		size -= sizeof(u64);
+		header.type |= __PERF_EVENT_TID;
+		header.size += sizeof(tid_entry);
+	}
 
-	event.header.size = size;
+	ret = perf_output_begin(&handle, counter, header.size, nmi);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, header);
+	perf_output_put(&handle, ip);
+
+	if (counter->hw_event.include_tid)
+		perf_output_put(&handle, tid_entry);
 
-	perf_output_write(counter, nmi, &event, size);
+	perf_output_end(&handle);
 }
 
 static void perf_output_group(struct perf_counter *counter, int nmi)
-- 
cgit v0.10.2


From 023c54c42288416b4f43c67bfd5049a76926fad6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:13 +0200
Subject: perf_counter tools: kerneltop: update event_types

Go along with the new perf_event_type ABI.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.133985461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index c0ca015..430810d 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1263,8 +1263,8 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		switch (event->header.type) {
-		case PERF_EVENT_IP:
-		case PERF_EVENT_IP | __PERF_EVENT_TID:
+		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
+		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
 			process_event(event->ip.ip, md->counter);
 			break;
 
-- 
cgit v0.10.2


From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:14 +0200
Subject: perf_counter: provide generic callchain bits

Provide the generic callchain support bits. If hw_event->callchain is
set the arch specific perf_callchain() function is called upon to
provide a perf_callchain_entry structure filled with the current
callchain.

If it does so, it is added to the overflow output event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.254266860@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index edf5bfb..43083af 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -140,8 +140,9 @@ struct perf_counter_hw_event {
 				include_tid    :  1, /* include the tid       */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
+				callchain      :  1, /* add callchain data    */
 
-				__reserved_1   : 52;
+				__reserved_1   : 51;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -219,6 +220,7 @@ enum perf_event_type {
 	PERF_EVENT_OVERFLOW	= 1UL << 31,
 	__PERF_EVENT_IP		= 1UL << 30,
 	__PERF_EVENT_TID	= 1UL << 29,
+	__PERF_EVENT_CALLCHAIN  = 1UL << 28,
 };
 
 #ifdef __KERNEL__
@@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
+#define MAX_STACK_DEPTH		255
+
+struct perf_callchain_entry {
+	u64	nr;
+	u64	ip[MAX_STACK_DEPTH];
+};
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d93e9dd..860cdc2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1654,6 +1654,17 @@ void perf_counter_do_pending(void)
 }
 
 /*
+ * Callchain support -- arch specific
+ */
+
+struct perf_callchain_entry *
+__attribute__((weak))
+perf_callchain(struct pt_regs *regs)
+{
+	return NULL;
+}
+
+/*
  * Output
  */
 
@@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter,
 	struct {
 		u32 pid, tid;
 	} tid_entry;
+	struct perf_callchain_entry *callchain = NULL;
+	int callchain_size = 0;
 
 	header.type = PERF_EVENT_OVERFLOW;
 	header.size = sizeof(header);
@@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
+	if (counter->hw_event.callchain) {
+		callchain = perf_callchain(regs);
+
+		if (callchain) {
+			callchain_size = (1 + callchain->nr) * sizeof(u64);
+
+			header.type |= __PERF_EVENT_CALLCHAIN;
+			header.size += callchain_size;
+		}
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi);
 	if (ret)
 		return;
@@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter,
 	if (counter->hw_event.include_tid)
 		perf_output_put(&handle, tid_entry);
 
+	if (callchain)
+		perf_output_copy(&handle, callchain, callchain_size);
+
 	perf_output_end(&handle);
 }
 
-- 
cgit v0.10.2


From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:15 +0200
Subject: perf_counter: x86: callchain support

Provide the x86 perf_callchain() implementation.

Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: Soeren Sandmann Pedersen <sandmann@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Orig-LKML-Reference: <20090330171024.341993293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b8885cc..e16dfaf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,8 +16,10 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
+#include <linux/uaccess.h>
 
 #include <asm/apic.h>
+#include <asm/stacktrace.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	return &x86_perf_counter_ops;
 }
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+	if (entry->nr < MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	unsigned long bp;
+	char *stack;
+
+	callchain_store(entry, instruction_pointer(regs));
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = frame_pointer(regs);
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+}
+
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+
+	regs = (struct pt_regs *)current->thread.sp0 - 1;
+	fp   = (void __user *)regs->bp;
+
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < MAX_STACK_DEPTH) {
+		frame.next_fp	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < user_stack_pointer(regs))
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_fp;
+	}
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(nmi_entry);
+	else
+		entry = &__get_cpu_var(irq_entry);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
-- 
cgit v0.10.2


From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:16 +0200
Subject: perf_counter: pmc arbitration

Follow the example set by powerpc and try to play nice with oprofile
and the nmi watchdog.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.459968444@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16dfaf..2a946a1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -20,6 +20,7 @@
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
+#include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -172,6 +173,65 @@ again:
 	atomic64_sub(delta, &hwc->period_left);
 }
 
+static atomic_t num_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		disable_lapic_nmi_watchdog();
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+			goto perfctr_fail;
+	}
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+			goto eventsel_fail;
+	}
+
+	return true;
+
+eventsel_fail:
+	for (i--; i >= 0; i--)
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+
+	i = nr_counters_generic;
+
+perfctr_fail:
+	for (i--; i >= 0; i--)
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+
+	return false;
+}
+
+static void release_pmc_hardware(void)
+{
+	int i;
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
+	int err;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
+	err = 0;
+	if (atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	if (err)
+		return err;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 
+	counter->destroy = hw_perf_counter_destroy;
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 9dd499889bdb12ac0e412ccdd718fe0d348258f2 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 27 Mar 2009 12:13:43 +0100
Subject: perf_counter tools: kerneltop: add real-time data acquisition thread

Decouple kerneltop display from event acquisition by introducing
a separate data acquisition thread. This fixes annnoying kerneltop
display refresh jitter and missed events.

Also add a -r <prio> option, to switch the data acquisition thread
to real-time priority.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 430810d..33b4fcf 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -77,6 +77,8 @@
 #include <errno.h>
 #include <ctype.h>
 #include <time.h>
+#include <sched.h>
+#include <pthread.h>
 
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
@@ -181,6 +183,7 @@ static int			tid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
+static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
@@ -334,6 +337,7 @@ static void display_help(void)
 	" -l                           # show scale factor for RR events\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
 	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
+	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
 	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
@@ -620,7 +624,6 @@ static int compare(const void *__sym1, const void *__sym2)
 	return sym_weight(sym1) < sym_weight(sym2);
 }
 
-static time_t			last_refresh;
 static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
@@ -634,6 +637,7 @@ static void print_sym_table(void)
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 
+	events = userspace_events = 0;
 	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
 	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
@@ -714,8 +718,6 @@ static void print_sym_table(void)
 	if (sym_filter_entry)
 		show_details(sym_filter_entry);
 
-	last_refresh = time(NULL);
-
 	{
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 
@@ -726,6 +728,16 @@ static void print_sym_table(void)
 	}
 }
 
+static void *display_thread(void *arg)
+{
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+
+	while (!sleep(delay_secs))
+		print_sym_table();
+
+	return NULL;
+}
+
 static int read_symbol(FILE *in, struct sym_entry *s)
 {
 	static int filter_match = 0;
@@ -1081,19 +1093,20 @@ static void process_options(int argc, char *argv[])
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
-			{"scale",	no_argument,		NULL, 'l'},
 			{"nmi",		required_argument,	NULL, 'n'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
 			{"pid",		required_argument,	NULL, 'p'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
 			{"symbol",	required_argument,	NULL, 's'},
 			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
 			{"zero",	no_argument,		NULL, 'z'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1127,6 +1140,7 @@ static void process_options(int argc, char *argv[])
 				profile_cpu = -1;
 			}
 			tid				=   atoi(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': sym_filter			= strdup(optarg); break;
 		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
@@ -1289,6 +1303,7 @@ int main(int argc, char *argv[])
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
+	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
 	int ret;
@@ -1363,8 +1378,20 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
-	last_refresh = time(NULL);
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
 
 	while (1) {
 		int hits = events;
@@ -1374,14 +1401,8 @@ int main(int argc, char *argv[])
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (time(NULL) >= last_refresh + delay_secs) {
-			print_sym_table();
-			events = userspace_events = 0;
-		}
-
 		if (hits == events)
-			ret = poll(event_array, nr_poll, 1000);
-		hits = events;
+			ret = poll(event_array, nr_poll, 100);
 	}
 
 	return 0;
-- 
cgit v0.10.2


From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:11:59 +0200
Subject: perf_counter: move the event overflow output bits to record_type

Per suggestion from Paul, move the event overflow bits to record_type
and sanitize the enums a bit.

Breaks the ABI -- again ;-)

Suggested-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.151921176@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 43083af..06a6fba 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,15 +73,6 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		= 0,
-	PERF_RECORD_IRQ			= 1,
-	PERF_RECORD_GROUP		= 2,
-};
-
 #define __PERF_COUNTER_MASK(name) 			\
 	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
 	 PERF_COUNTER_##name##_SHIFT)
@@ -103,6 +94,17 @@ enum perf_counter_record_type {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+	PERF_RECORD_IP		= 1U << 0,
+	PERF_RECORD_TID		= 1U << 1,
+	PERF_RECORD_GROUP	= 1U << 2,
+	PERF_RECORD_CALLCHAIN	= 1U << 3,
+};
+
+/*
  * Bits that can be set in hw_event.read_format to request that
  * reads on the counter should return the indicated quantities,
  * in increasing order of bit value, after the counter value.
@@ -125,8 +127,8 @@ struct perf_counter_hw_event {
 	__u64			config;
 
 	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
+	__u32			record_type;
+	__u32			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
@@ -137,12 +139,10 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
-				include_tid    :  1, /* include the tid       */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
-				callchain      :  1, /* add callchain data    */
 
-				__reserved_1   : 51;
+				__reserved_1   : 53;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -212,15 +212,21 @@ struct perf_event_header {
 
 enum perf_event_type {
 
-	PERF_EVENT_GROUP	= 1,
-
-	PERF_EVENT_MMAP		= 2,
-	PERF_EVENT_MUNMAP	= 3,
+	PERF_EVENT_MMAP			= 1,
+	PERF_EVENT_MUNMAP		= 2,
 
-	PERF_EVENT_OVERFLOW	= 1UL << 31,
-	__PERF_EVENT_IP		= 1UL << 30,
-	__PERF_EVENT_TID	= 1UL << 29,
-	__PERF_EVENT_CALLCHAIN  = 1UL << 28,
+	/*
+	 * Half the event type space is reserved for the counter overflow
+	 * bitfields, as found in hw_event.record_type.
+	 *
+	 * These events will have types of the form:
+	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 */
+	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
+	__PERF_EVENT_IP			= PERF_RECORD_IP,
+	__PERF_EVENT_TID		= PERF_RECORD_TID,
+	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
+	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 860cdc2..995063d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static void perf_output_simple(struct perf_counter *counter,
-			       int nmi, struct pt_regs *regs)
+void perf_counter_output(struct perf_counter *counter,
+			 int nmi, struct pt_regs *regs)
 {
 	int ret;
+	u64 record_type = counter->hw_event.record_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
 	struct {
 		u32 pid, tid;
 	} tid_entry;
+	struct {
+		u64 event;
+		u64 counter;
+	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 
-	header.type = PERF_EVENT_OVERFLOW;
+	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
 
-	ip = instruction_pointer(regs);
-	header.type |= __PERF_EVENT_IP;
-	header.size += sizeof(ip);
+	if (record_type & PERF_RECORD_IP) {
+		ip = instruction_pointer(regs);
+		header.type |= __PERF_EVENT_IP;
+		header.size += sizeof(ip);
+	}
 
-	if (counter->hw_event.include_tid) {
+	if (record_type & PERF_RECORD_TID) {
 		/* namespace issues */
 		tid_entry.pid = current->group_leader->pid;
 		tid_entry.tid = current->pid;
@@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
-	if (counter->hw_event.callchain) {
+	if (record_type & PERF_RECORD_GROUP) {
+		header.type |= __PERF_EVENT_GROUP;
+		header.size += sizeof(u64) +
+			counter->nr_siblings * sizeof(group_entry);
+	}
+
+	if (record_type & PERF_RECORD_CALLCHAIN) {
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
@@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter,
 		return;
 
 	perf_output_put(&handle, header);
-	perf_output_put(&handle, ip);
 
-	if (counter->hw_event.include_tid)
-		perf_output_put(&handle, tid_entry);
+	if (record_type & PERF_RECORD_IP)
+		perf_output_put(&handle, ip);
 
-	if (callchain)
-		perf_output_copy(&handle, callchain, callchain_size);
-
-	perf_output_end(&handle);
-}
-
-static void perf_output_group(struct perf_counter *counter, int nmi)
-{
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_counter *leader, *sub;
-	unsigned int size;
-	struct {
-		u64 event;
-		u64 counter;
-	} entry;
-	int ret;
-
-	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+	if (record_type & PERF_RECORD_TID)
+		perf_output_put(&handle, tid_entry);
 
-	ret = perf_output_begin(&handle, counter, size, nmi);
-	if (ret)
-		return;
+	if (record_type & PERF_RECORD_GROUP) {
+		struct perf_counter *leader, *sub;
+		u64 nr = counter->nr_siblings;
 
-	header.type = PERF_EVENT_GROUP;
-	header.size = size;
+		perf_output_put(&handle, nr);
 
-	perf_output_put(&handle, header);
+		leader = counter->group_leader;
+		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+			if (sub != counter)
+				sub->hw_ops->read(sub);
 
-	leader = counter->group_leader;
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		if (sub != counter)
-			sub->hw_ops->read(sub);
+			group_entry.event = sub->hw_event.config;
+			group_entry.counter = atomic64_read(&sub->count);
 
-		entry.event = sub->hw_event.config;
-		entry.counter = atomic64_read(&sub->count);
-
-		perf_output_put(&handle, entry);
+			perf_output_put(&handle, group_entry);
+		}
 	}
 
-	perf_output_end(&handle);
-}
-
-void perf_counter_output(struct perf_counter *counter,
-			 int nmi, struct pt_regs *regs)
-{
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return;
-
-	case PERF_RECORD_IRQ:
-		perf_output_simple(counter, nmi, regs);
-		break;
+	if (callchain)
+		perf_output_copy(&handle, callchain, callchain_size);
 
-	case PERF_RECORD_GROUP:
-		perf_output_group(counter, nmi);
-		break;
-	}
+	perf_output_end(&handle);
 }
 
 /*
-- 
cgit v0.10.2


From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:01 +0200
Subject: perf_counter: per event wakeups

By request, provide a way to request a wakeup every 'n' events instead
of every page of output.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.323309784@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 06a6fba..5428ba1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -145,7 +145,7 @@ struct perf_counter_hw_event {
 				__reserved_1   : 53;
 
 	__u32			extra_config_len;
-	__u32			__reserved_4;
+	__u32			wakeup_events;	/* wakeup every n events */
 
 	__u64			__reserved_2;
 	__u64			__reserved_3;
@@ -321,6 +321,7 @@ struct perf_mmap_data {
 	int				nr_pages;
 	atomic_t			wakeup;
 	atomic_t			head;
+	atomic_t			events;
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 995063d..9bcab10 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-	if (handle->wakeup)
+	int wakeup_events = handle->counter->hw_event.wakeup_events;
+
+	if (wakeup_events) {
+		int events = atomic_inc_return(&handle->data->events);
+		if (events >= wakeup_events) {
+			atomic_sub(wakeup_events, &handle->data->events);
+			__perf_output_wakeup(handle);
+		}
+	} else if (handle->wakeup)
 		__perf_output_wakeup(handle);
 	rcu_read_unlock();
 }
-- 
cgit v0.10.2


From 3df70fd623bb109e0079e697c0276d220a4b7908 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:02 +0200
Subject: perf_counter: kerneltop: update to new ABI

Update to reflect the new record_type ABI changes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.407283141@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 33b4fcf..4f8d791 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -442,7 +442,7 @@ static void create_perfstat_counter(int counter)
 
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
-	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.record_type	= 0;
 	hw_event.nmi		= 0;
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -1277,8 +1277,8 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		switch (event->header.type) {
-		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
-		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
+		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
+		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
 			process_event(event->ip.ip, md->counter);
 			break;
 
@@ -1337,9 +1337,8 @@ int main(int argc, char *argv[])
 			memset(&hw_event, 0, sizeof(hw_event));
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IRQ;
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
 			hw_event.nmi		= nmi;
-			hw_event.include_tid	= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 
-- 
cgit v0.10.2


From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:03 +0200
Subject: perf_counter: add more context information

Put in counts to tell which ips belong to what context.

  -----
   | |  hv
   | --
nr | |  kernel
   | --
   | |  user
  -----

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.493101305@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2a946a1..c74e20d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	unsigned long bp;
 	char *stack;
+	int nr = entry->nr;
 
 	callchain_store(entry, instruction_pointer(regs));
 
@@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 #endif
 
 	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+	entry->kernel = entry->nr - nr;
 }
 
 
@@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	struct stack_frame frame;
 	const void __user *fp;
+	int nr = entry->nr;
 
 	regs = (struct pt_regs *)current->thread.sp0 - 1;
 	fp   = (void __user *)regs->bp;
@@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		callchain_store(entry, frame.return_address);
 		fp = frame.next_fp;
 	}
+
+	entry->user = entry->nr - nr;
 }
 
 static void
@@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
+	entry->hv = 0;
+	entry->kernel = 0;
+	entry->user = 0;
 
 	perf_do_callchain(regs, entry);
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5428ba1..90cce0c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
-#define MAX_STACK_DEPTH		255
+#define MAX_STACK_DEPTH		254
 
 struct perf_callchain_entry {
-	u64	nr;
+	u32	nr, hv, kernel, user;
 	u64	ip[MAX_STACK_DEPTH];
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 9bcab10..f105a6e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter,
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
-			callchain_size = (1 + callchain->nr) * sizeof(u64);
+			callchain_size = (2 + callchain->nr) * sizeof(u64);
 
 			header.type |= __PERF_EVENT_CALLCHAIN;
 			header.size += callchain_size;
-- 
cgit v0.10.2


From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:04 +0200
Subject: perf_counter: update mmap() counter read

Paul noted that we don't need SMP barriers for the mmap() counter read
because its always on the same cpu (otherwise you can't access the hw
counter anyway).

So remove the SMP barriers and replace them with regular compiler
barriers.

Further, update the comment to include a race free method of reading
said hardware counter. The primary change is putting the pmc_read
inside the seq-loop, otherwise we can still race and read rubbish.

Noticed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.577951445@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 90cce0c..f2b914d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -167,30 +167,28 @@ struct perf_counter_mmap_page {
 	/*
 	 * Bits needed to read the hw counters in user-space.
 	 *
-	 * The index and offset should be read atomically using the seqlock:
-	 *
-	 *   __u32 seq, index;
-	 *   __s64 offset;
+	 *   u32 seq;
+	 *   s64 count;
 	 *
 	 * again:
-	 *   rmb();
 	 *   seq = pc->lock;
-	 *
 	 *   if (unlikely(seq & 1)) {
 	 *     cpu_relax();
 	 *     goto again;
 	 *   }
 	 *
-	 *   index = pc->index;
-	 *   offset = pc->offset;
+	 *   if (pc->index) {
+	 *     count = pmc_read(pc->index - 1);
+	 *     count += pc->offset;
+	 *   } else
+	 *     goto regular_read;
 	 *
-	 *   rmb();
+	 *   barrier();
 	 *   if (pc->lock != seq)
 	 *     goto again;
 	 *
-	 * After this, index contains architecture specific counter index + 1,
-	 * so that 0 means unavailable, offset contains the value to be added
-	 * to the result of the raw timer read to obtain this counter's value.
+	 * NOTE: for obvious reason this only works on self-monitoring
+	 *       processes.
 	 */
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f105a6e..2a5d4f5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter)
 	 */
 	preempt_disable();
 	++userpg->lock;
-	smp_wmb();
+	barrier();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
-	smp_wmb();
+	barrier();
 	++userpg->lock;
 	preempt_enable();
 unlock:
-- 
cgit v0.10.2


From ca5f9524d61f54b1f618293ab92fc6b49cac864d Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:33 -0700
Subject: futex: separate futex_wait_queue_me() logic from futex_wait()

Refactor futex_wait() in preparation for futex_wait_requeue_pi(). In
order to reuse a good chunk of the futex_wait() code for the upcoming
futex_wait_requeue_pi() function, this patch breaks out the
queue-to-wakeup section of futex_wait() into futex_wait_queue_me().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a02..ebb48d6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1115,24 +1115,87 @@ handle_fault:
 
 static long futex_wait_restart(struct restart_block *restart);
 
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb:		the futex hash bucket, must be locked by the caller
+ * @q:		the futex_q to queue up on
+ * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
+ * @wait:	the wait_queue to add to the futex_q after queueing in the hb
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+				struct hrtimer_sleeper *timeout,
+				wait_queue_t *wait)
+{
+	queue_me(q, hb);
+
+	/*
+	 * There might have been scheduling since the queue_me(), as we
+	 * cannot hold a spinlock across the get_user() in case it
+	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+	 * queueing ourselves into the futex hash.  This code thus has to
+	 * rely on the futex_wake() code removing us from hash when it
+	 * wakes us up.
+	 */
+
+	/* add_wait_queue is the barrier after __set_current_state. */
+	__set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Add current as the futex_q waiter.  We don't remove ourselves from
+	 * the wait_queue because we are the only user of it.
+	 */
+	add_wait_queue(&q->waiter, wait);
+
+	/* Arm the timer */
+	if (timeout) {
+		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
+
+	/*
+	 * !plist_node_empty() is safe here without any lock.
+	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+	 */
+	if (likely(!plist_node_empty(&q->list))) {
+		/*
+		 * If the timer has already expired, current will already be
+		 * flagged for rescheduling. Only call schedule if there
+		 * is no timeout, or if it has yet to expire.
+		 */
+		if (!timeout || timeout->task)
+			schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
 static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
-	struct task_struct *curr = current;
+	struct hrtimer_sleeper timeout, *to = NULL;
+	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
-	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
 	u32 uval;
 	int ret;
-	struct hrtimer_sleeper t;
-	int rem = 0;
 
 	if (!bitset)
 		return -EINVAL;
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
+
+	if (abs_time) {
+		to = &timeout;
+
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
@@ -1178,75 +1241,22 @@ retry_private:
 		goto retry;
 	}
 	ret = -EWOULDBLOCK;
+
+	/* Only actually queue if *uaddr contained val.  */
 	if (unlikely(uval != val)) {
 		queue_unlock(&q, hb);
 		goto out_put_key;
 	}
 
-	/* Only actually queue if *uaddr contained val.  */
-	queue_me(&q, hb);
-
-	/*
-	 * There might have been scheduling since the queue_me(), as we
-	 * cannot hold a spinlock across the get_user() in case it
-	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-	 * queueing ourselves into the futex hash.  This code thus has to
-	 * rely on the futex_wake() code removing us from hash when it
-	 * wakes us up.
-	 */
-
-	/* add_wait_queue is the barrier after __set_current_state. */
-	__set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&q.waiter, &wait);
-	/*
-	 * !plist_node_empty() is safe here without any lock.
-	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
-	 */
-	if (likely(!plist_node_empty(&q.list))) {
-		if (!abs_time)
-			schedule();
-		else {
-			hrtimer_init_on_stack(&t.timer,
-					      clockrt ? CLOCK_REALTIME :
-					      CLOCK_MONOTONIC,
-					      HRTIMER_MODE_ABS);
-			hrtimer_init_sleeper(&t, current);
-			hrtimer_set_expires_range_ns(&t.timer, *abs_time,
-						     current->timer_slack_ns);
-
-			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-			if (!hrtimer_active(&t.timer))
-				t.task = NULL;
-
-			/*
-			 * the timer could have already expired, in which
-			 * case current would be flagged for rescheduling.
-			 * Don't bother calling schedule.
-			 */
-			if (likely(t.task))
-				schedule();
-
-			hrtimer_cancel(&t.timer);
-
-			/* Flag if a timeout occured */
-			rem = (t.task == NULL);
-
-			destroy_hrtimer_on_stack(&t.timer);
-		}
-	}
-	__set_current_state(TASK_RUNNING);
-
-	/*
-	 * NOTE: we don't remove ourselves from the waitqueue because
-	 * we are the only user of it.
-	 */
+	/* queue_me and wait for wakeup, timeout, or a signal. */
+	futex_wait_queue_me(hb, &q, to, &wait);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	ret = 0;
 	if (!unqueue_me(&q))
 		goto out_put_key;
 	ret = -ETIMEDOUT;
-	if (rem)
+	if (to && !to->task)
 		goto out_put_key;
 
 	/*
@@ -1275,6 +1285,10 @@ retry_private:
 out_put_key:
 	put_futex_key(fshared, &q.key);
 out:
+	if (to) {
+		hrtimer_cancel(&to->timer);
+		destroy_hrtimer_on_stack(&to->timer);
+	}
 	return ret;
 }
 
-- 
cgit v0.10.2


From 4b1c486b3587d2abf50bee4a05eb488cd4045f2c Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:42 -0700
Subject: futex: add helper to find the top prio waiter of a futex

Improve legibility by wrapping finding the top waiter in a function.
This will be used by the follow-on patches for enabling requeue pi.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index ebb48d6..421fb5e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -276,6 +276,25 @@ void put_futex_key(int fshared, union futex_key *key)
 	drop_futex_key_refs(key);
 }
 
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:     the hash bucket the futex_q's reside in
+ * @key:    the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+					union futex_key *key)
+{
+	struct futex_q *this;
+
+	plist_for_each_entry(this, &hb->chain, list) {
+		if (match_futex(&this->key, key))
+			return this;
+	}
+	return NULL;
+}
+
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 {
 	u32 curval;
-- 
cgit v0.10.2


From 1a52084d0919c2799258737c21fb328a9de159b5 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:52 -0700
Subject: futex: split out atomic logic from futex_lock_pi()

Refactor the atomic portion of futex_lock_pi() into futex_lock_pi_atomic().

This logic will be needed by requeue_pi, so modularize it to reduce
code duplication.  The only significant change is passing of the task
to try and take the lock for.  This simplifies the -EDEADLK test as if
the lock is owned by task t, it's a deadlock, regardless of if we are
doing requeue pi or not.  This patch updates the corresponding comment
accordingly.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 421fb5e..986b16e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -556,6 +556,127 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	return 0;
 }
 
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr:	the pi futex user address
+ * @hb:		the pi futex hash bucket
+ * @key:	the futex key associated with uaddr and hb
+ * @ps:		the pi_state pointer where we store the result of the lookup
+ * @task:	the task to perform the atomic lock work for.  This will be
+ * 		"current" except in the case of requeue pi.
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+				union futex_key *key,
+				struct futex_pi_state **ps,
+				struct task_struct *task)
+{
+	int lock_taken, ret, ownerdied = 0;
+	u32 uval, newval, curval;
+
+retry:
+	ret = lock_taken = 0;
+
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = task_pid_vnr(task);
+
+	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+	if (unlikely(curval == -EFAULT))
+		return -EFAULT;
+
+	/*
+	 * Detect deadlocks.
+	 */
+	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+		return -EDEADLK;
+
+	/*
+	 * Surprise - we got the lock. Just return to userspace:
+	 */
+	if (unlikely(!curval))
+		return 1;
+
+	uval = curval;
+
+	/*
+	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+	 * to wake at the next unlock.
+	 */
+	newval = curval | FUTEX_WAITERS;
+
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED. We take over the futex in this
+	 * case. We also do an unconditional take over, when the owner
+	 * of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED bit */
+		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+		ownerdied = 0;
+		lock_taken = 1;
+	}
+
+	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+	if (unlikely(curval == -EFAULT))
+		return -EFAULT;
+	if (unlikely(curval != uval))
+		goto retry;
+
+	/*
+	 * We took the lock due to owner died take over.
+	 */
+	if (unlikely(lock_taken))
+		return 1;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, key, ps);
+
+	if (unlikely(ret)) {
+		switch (ret) {
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
+				return -EFAULT;
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
+				goto retry;
+			}
+		default:
+			break;
+		}
+	}
+
+	return ret;
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -1340,9 +1461,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
-	u32 uval, newval, curval;
+	u32 uval;
 	struct futex_q q;
-	int ret, lock_taken, ownerdied = 0;
+	int ret;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1365,81 +1486,15 @@ retry:
 retry_private:
 	hb = queue_lock(&q);
 
-retry_locked:
-	ret = lock_taken = 0;
-
-	/*
-	 * To avoid races, we attempt to take the lock here again
-	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
-	 * the locks. It will most likely not succeed.
-	 */
-	newval = task_pid_vnr(current);
-
-	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-
-	/*
-	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
-	 * situation and we return success to user space.
-	 */
-	if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
-		ret = -EDEADLK;
-		goto out_unlock_put_key;
-	}
-
-	/*
-	 * Surprise - we got the lock. Just return to userspace:
-	 */
-	if (unlikely(!curval))
-		goto out_unlock_put_key;
-
-	uval = curval;
-
-	/*
-	 * Set the WAITERS flag, so the owner will know it has someone
-	 * to wake at next unlock
-	 */
-	newval = curval | FUTEX_WAITERS;
-
-	/*
-	 * There are two cases, where a futex might have no owner (the
-	 * owner TID is 0): OWNER_DIED. We take over the futex in this
-	 * case. We also do an unconditional take over, when the owner
-	 * of the futex died.
-	 *
-	 * This is safe as we are protected by the hash bucket lock !
-	 */
-	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-		/* Keep the OWNER_DIED bit */
-		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
-		ownerdied = 0;
-		lock_taken = 1;
-	}
-
-	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-	if (unlikely(curval != uval))
-		goto retry_locked;
-
-	/*
-	 * We took the lock due to owner died take over.
-	 */
-	if (unlikely(lock_taken))
-		goto out_unlock_put_key;
-
-	/*
-	 * We dont have the lock. Look up the PI state (or create it if
-	 * we are the first waiter):
-	 */
-	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current);
 	if (unlikely(ret)) {
 		switch (ret) {
-
+		case 1:
+			/* We got the lock. */
+			ret = 0;
+			goto out_unlock_put_key;
+		case -EFAULT:
+			goto uaddr_faulted;
 		case -EAGAIN:
 			/*
 			 * Task is exiting and we just wait for the
@@ -1449,25 +1504,6 @@ retry_locked:
 			put_futex_key(fshared, &q.key);
 			cond_resched();
 			goto retry;
-
-		case -ESRCH:
-			/*
-			 * No owner found for this futex. Check if the
-			 * OWNER_DIED bit is set to figure out whether
-			 * this is a robust futex or not.
-			 */
-			if (get_futex_value_locked(&curval, uaddr))
-				goto uaddr_faulted;
-
-			/*
-			 * We simply start over in case of a robust
-			 * futex. The code above will take the futex
-			 * and return happy.
-			 */
-			if (curval & FUTEX_OWNER_DIED) {
-				ownerdied = 1;
-				goto retry_locked;
-			}
 		default:
 			goto out_unlock_put_key;
 		}
-- 
cgit v0.10.2


From dd9739980b50c8cde33e1f8eb08b7e0140bcd61e Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:02 -0700
Subject: futex: split out fixup owner logic from futex_lock_pi()

Refactor the post lock acquisition logic from futex_lock_pi(). This
code will be reused in futex_wait_requeue_pi().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 986b16e..af831fb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1256,6 +1256,79 @@ handle_fault:
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr:	user address of the futex
+ * @fshared:	whether the futex is shared (1) or not (0)
+ * @q:		futex_q (contains pi_state and access to the rt_mutex)
+ * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ *  1 - success, lock taken
+ *  0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+		       int locked)
+{
+	struct task_struct *owner;
+	int ret = 0;
+
+	if (locked) {
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case:
+		 */
+		if (q->pi_state->owner != current)
+			ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+		goto out;
+	}
+
+	/*
+	 * Catch the rare case, where the lock was released when we were on the
+	 * way back before we locked the hash bucket.
+	 */
+	if (q->pi_state->owner == current) {
+		/*
+		 * Try to get the rt_mutex now. This might fail as some other
+		 * task acquired the rt_mutex after we removed ourself from the
+		 * rt_mutex waiters list.
+		 */
+		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+			locked = 1;
+			goto out;
+		}
+
+		/*
+		 * pi_state is incorrect, some other task did a lock steal and
+		 * we returned due to timeout or signal without taking the
+		 * rt_mutex. Too late. We can access the rt_mutex_owner without
+		 * locking, as the other task is now blocked on the hash bucket
+		 * lock. Fix the state up.
+		 */
+		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+		goto out;
+	}
+
+	/*
+	 * Paranoia check. If we did not take the lock, then we should not be
+	 * the owner, nor the pending owner, of the rt_mutex.
+	 */
+	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+				"pi-state %p\n", ret,
+				q->pi_state->pi_mutex.owner,
+				q->pi_state->owner);
+
+out:
+	return ret ? ret : locked;
+}
+
+/**
  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
  * @hb:		the futex hash bucket, must be locked by the caller
  * @q:		the futex_q to queue up on
@@ -1459,11 +1532,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
 	u32 uval;
 	struct futex_q q;
-	int ret;
+	int res, ret;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1527,71 +1599,21 @@ retry_private:
 	}
 
 	spin_lock(q.lock_ptr);
-
-	if (!ret) {
-		/*
-		 * Got the lock. We might not be the anticipated owner
-		 * if we did a lock-steal - fix up the PI-state in
-		 * that case:
-		 */
-		if (q.pi_state->owner != curr)
-			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
-	} else {
-		/*
-		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked the
-		 * hash bucket.
-		 */
-		if (q.pi_state->owner == curr) {
-			/*
-			 * Try to get the rt_mutex now. This might
-			 * fail as some other task acquired the
-			 * rt_mutex after we removed ourself from the
-			 * rt_mutex waiters list.
-			 */
-			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-				ret = 0;
-			else {
-				/*
-				 * pi_state is incorrect, some other
-				 * task did a lock steal and we
-				 * returned due to timeout or signal
-				 * without taking the rt_mutex. Too
-				 * late. We can access the
-				 * rt_mutex_owner without locking, as
-				 * the other task is now blocked on
-				 * the hash bucket lock. Fix the state
-				 * up.
-				 */
-				struct task_struct *owner;
-				int res;
-
-				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-				res = fixup_pi_state_owner(uaddr, &q, owner,
-							   fshared);
-
-				/* propagate -EFAULT, if the fixup failed */
-				if (res)
-					ret = res;
-			}
-		} else {
-			/*
-			 * Paranoia check. If we did not take the lock
-			 * in the trylock above, then we should not be
-			 * the owner of the rtmutex, neither the real
-			 * nor the pending one:
-			 */
-			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
-				printk(KERN_ERR "futex_lock_pi: ret = %d "
-				       "pi-mutex: %p pi-state %p\n", ret,
-				       q.pi_state->pi_mutex.owner,
-				       q.pi_state->owner);
-		}
-	}
+	/*
+	 * Fixup the pi_state owner and possibly acquire the lock if we
+	 * haven't already.
+	 */
+	res = fixup_owner(uaddr, fshared, &q, !ret);
+	/*
+	 * If fixup_owner() returned an error, proprogate that.  If it acquired
+	 * the lock, clear our -ETIMEDOUT or -EINTR.
+	 */
+	if (res)
+		ret = (res < 0) ? res : 0;
 
 	/*
-	 * If fixup_pi_state_owner() faulted and was unable to handle the
-	 * fault, unlock it and return the fault to userspace.
+	 * If fixup_owner() faulted and was unable to handle the fault, unlock
+	 * it and return the fault to userspace.
 	 */
 	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
 		rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1599,9 +1621,7 @@ retry_private:
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
-	if (to)
-		destroy_hrtimer_on_stack(&to->timer);
-	return ret != -EINTR ? ret : -ERESTARTNOINTR;
+	goto out;
 
 out_unlock_put_key:
 	queue_unlock(&q, hb);
@@ -1611,7 +1631,7 @@ out_put_key:
 out:
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
-	return ret;
+	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
 	/*
-- 
cgit v0.10.2


From 8dac456a681bd94272ff50ecb31be6b669382c2b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:12 -0700
Subject: rt_mutex: add proxy lock routines

This patch is a prerequisite for futex requeue_pi. It basically splits
rt_mutex_slowlock() right down the middle, just before the first call
to schedule(). It further adds helper functions which make use of the
split and provide the rt-mutex preliminaries for futex requeue_pi.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb9..fec77e7 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
  * assigned pending owner [which might not have taken the
  * lock yet]:
  */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+				    struct task_struct *task)
 {
 	struct task_struct *pendowner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
 	if (!rt_mutex_owner_pending(lock))
 		return 0;
 
-	if (pendowner == current)
+	if (pendowner == task)
 		return 1;
 
 	spin_lock_irqsave(&pendowner->pi_lock, flags);
-	if (current->prio >= pendowner->prio) {
+	if (task->prio >= pendowner->prio) {
 		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
 		return 0;
 	}
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
 	 * We are going to steal the lock and a waiter was
 	 * enqueued on the pending owners pi_waiters queue. So
 	 * we have to enqueue this waiter into
-	 * current->pi_waiters list. This covers the case,
-	 * where current is boosted because it holds another
+	 * task->pi_waiters list. This covers the case,
+	 * where task is boosted because it holds another
 	 * lock and gets unboosted because the booster is
 	 * interrupted, so we would delay a waiter with higher
-	 * priority as current->normal_prio.
+	 * priority as task->normal_prio.
 	 *
 	 * Note: in the rare case of a SCHED_OTHER task changing
 	 * its priority and thus stealing the lock, next->task
-	 * might be current:
+	 * might be task:
 	 */
-	if (likely(next->task != current)) {
-		spin_lock_irqsave(&current->pi_lock, flags);
-		plist_add(&next->pi_list_entry, &current->pi_waiters);
-		__rt_mutex_adjust_prio(current);
-		spin_unlock_irqrestore(&current->pi_lock, flags);
+	if (likely(next->task != task)) {
+		spin_lock_irqsave(&task->pi_lock, flags);
+		plist_add(&next->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+		spin_unlock_irqrestore(&task->pi_lock, flags);
 	}
 	return 1;
 }
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
 	 */
 	mark_rt_mutex_waiters(lock);
 
-	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
 		return 0;
 
 	/* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 				   struct rt_mutex_waiter *waiter,
+				   struct task_struct *task,
 				   int detect_deadlock)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	unsigned long flags;
 	int chain_walk = 0, res;
 
-	spin_lock_irqsave(&current->pi_lock, flags);
-	__rt_mutex_adjust_prio(current);
-	waiter->task = current;
+	spin_lock_irqsave(&task->pi_lock, flags);
+	__rt_mutex_adjust_prio(task);
+	waiter->task = task;
 	waiter->lock = lock;
-	plist_node_init(&waiter->list_entry, current->prio);
-	plist_node_init(&waiter->pi_list_entry, current->prio);
+	plist_node_init(&waiter->list_entry, task->prio);
+	plist_node_init(&waiter->pi_list_entry, task->prio);
 
 	/* Get the top priority waiter on the lock */
 	if (rt_mutex_has_waiters(lock))
 		top_waiter = rt_mutex_top_waiter(lock);
 	plist_add(&waiter->list_entry, &lock->wait_list);
 
-	current->pi_blocked_on = waiter;
+	task->pi_blocked_on = waiter;
 
-	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	spin_unlock(&lock->wait_lock);
 
 	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-					 current);
+					 task);
 
 	spin_lock(&lock->wait_lock);
 
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
 
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock:		 the rt_mutex to take
+ * @state:		 the state the task should block in (TASK_INTERRUPTIBLE
+ * 			 or TASK_UNINTERRUPTIBLE)
+ * @timeout:		 the pre-initialized and started timer, or NULL for none
+ * @waiter:		 the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:	 passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
  */
 static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
-		  struct hrtimer_sleeper *timeout,
-		  int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		    struct hrtimer_sleeper *timeout,
+		    struct rt_mutex_waiter *waiter,
+		    int detect_deadlock)
 {
-	struct rt_mutex_waiter waiter;
 	int ret = 0;
 
-	debug_rt_mutex_init_waiter(&waiter);
-	waiter.task = NULL;
-
-	spin_lock(&lock->wait_lock);
-
-	/* Try to acquire the lock again: */
-	if (try_to_take_rt_mutex(lock)) {
-		spin_unlock(&lock->wait_lock);
-		return 0;
-	}
-
-	set_current_state(state);
-
-	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
-		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
-
 	for (;;) {
 		/* Try to acquire the lock: */
 		if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		}
 
 		/*
-		 * waiter.task is NULL the first time we come here and
+		 * waiter->task is NULL the first time we come here and
 		 * when we have been woken up by the previous owner
 		 * but the lock got stolen by a higher prio task.
 		 */
-		if (!waiter.task) {
-			ret = task_blocks_on_rt_mutex(lock, &waiter,
+		if (!waiter->task) {
+			ret = task_blocks_on_rt_mutex(lock, waiter, current,
 						      detect_deadlock);
 			/*
 			 * If we got woken up by the owner then start loop
 			 * all over without going into schedule to try
 			 * to get the lock now:
 			 */
-			if (unlikely(!waiter.task)) {
+			if (unlikely(!waiter->task)) {
 				/*
 				 * Reset the return value. We might
 				 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		spin_unlock(&lock->wait_lock);
 
-		debug_rt_mutex_print_deadlock(&waiter);
+		debug_rt_mutex_print_deadlock(waiter);
 
-		if (waiter.task)
+		if (waiter->task)
 			schedule_rt_mutex(lock);
 
 		spin_lock(&lock->wait_lock);
 		set_current_state(state);
 	}
 
+	return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		  struct hrtimer_sleeper *timeout,
+		  int detect_deadlock)
+{
+	struct rt_mutex_waiter waiter;
+	int ret = 0;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+
+	spin_lock(&lock->wait_lock);
+
+	/* Try to acquire the lock again: */
+	if (try_to_take_rt_mutex(lock)) {
+		spin_unlock(&lock->wait_lock);
+		return 0;
+	}
+
+	set_current_state(state);
+
+	/* Setup the timer, when timeout != NULL */
+	if (unlikely(timeout)) {
+		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
+
+	ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+				  detect_deadlock);
+
 	set_current_state(TASK_RUNNING);
 
 	if (unlikely(waiter.task))
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 }
 
 /**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:		the rt_mutex to take
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @task:		the task to prepare
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+			      struct rt_mutex_waiter *waiter,
+			      struct task_struct *task, int detect_deadlock)
+{
+	int ret;
+
+	spin_lock(&lock->wait_lock);
+
+	mark_rt_mutex_waiters(lock);
+
+	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+		/* We got the lock for task. */
+		debug_rt_mutex_lock(lock);
+
+		rt_mutex_set_owner(lock, task, 0);
+
+		rt_mutex_deadlock_account_lock(lock, task);
+		return 1;
+	}
+
+	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+
+	if (ret && !waiter->task) {
+		/*
+		 * Reset the return value. We might have
+		 * returned with -EDEADLK and the owner
+		 * released the lock while we were walking the
+		 * pi chain.  Let the waiter sort it out.
+		 */
+		ret = 0;
+	}
+	spin_unlock(&lock->wait_lock);
+
+	debug_rt_mutex_print_deadlock(waiter);
+
+	return ret;
+}
+
+/**
  * rt_mutex_next_owner - return the next owner of the lock
  *
  * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 
 	return rt_mutex_top_waiter(lock)->task;
 }
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:		the rt_mutex we were woken on
+ * @to:			the timeout, null if none. hrtimer should already have
+ * 			been started.
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+			       struct hrtimer_sleeper *to,
+			       struct rt_mutex_waiter *waiter,
+			       int detect_deadlock)
+{
+	int ret;
+
+	spin_lock(&lock->wait_lock);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+				  detect_deadlock);
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(waiter->task))
+		remove_waiter(lock, waiter);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+	 * have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/*
+	 * Readjust priority, when we did not get the lock. We might have been
+	 * the pending owner and boosted. Since we did not take the lock, the
+	 * PI boost has to go.
+	 */
+	if (unlikely(ret))
+		rt_mutex_adjust_prio(current);
+
+	return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5..97a2f81 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+				     struct rt_mutex_waiter *waiter,
+				     struct task_struct *task,
+				     int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+				      struct hrtimer_sleeper *to,
+				      struct rt_mutex_waiter *waiter,
+				      int detect_deadlock);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
-- 
cgit v0.10.2


From a72188d8a64ebe74722f1cf7ffac41b41ffdba21 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:22 -0700
Subject: futex: add FUTEX_HAS_TIMEOUT flag to restart.futex.flags

Currently restart is only used if there is a timeout. The requeue_pi
functionality requires restarting to futex_lock_pi() on signal after
wakeup in futex_wait_requeue_pi() regardless of if there was a timeout
or not. Using 0 for the timeout value is confusing as that could
indicate an expired timer. The flag makes this explicit. While the
check is not technically needed in futex_wait_restart(), doing so
makes the code consistent with and will avoid confusion should the
need arise to restart wait without a timeout.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index af831fb..6b597cf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1252,6 +1252,7 @@ handle_fault:
  */
 #define FLAGS_SHARED		0x01
 #define FLAGS_CLOCKRT		0x02
+#define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
 
@@ -1486,7 +1487,7 @@ retry_private:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = 0;
+	restart->futex.flags = FLAGS_HAS_TIMEOUT;
 
 	if (fshared)
 		restart->futex.flags |= FLAGS_SHARED;
@@ -1510,13 +1511,16 @@ static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
 	int fshared = 0;
-	ktime_t t;
+	ktime_t t, *tp = NULL;
 
-	t.tv64 = restart->futex.time;
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+		t.tv64 = restart->futex.time;
+		tp = &t;
+	}
 	restart->fn = do_no_restart_syscall;
 	if (restart->futex.flags & FLAGS_SHARED)
 		fshared = 1;
-	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+	return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
 				restart->futex.bitset,
 				restart->futex.flags & FLAGS_CLOCKRT);
 }
-- 
cgit v0.10.2


From 9121e4783cd5c7e2a407763f3b61c2d573891133 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:31 -0700
Subject: futex: distangle futex_requeue()

futex_requeue() is getting a bit long-winded, and will be getting more
so after the requeue_pi patch. Factor out the actual requeueing into a
nicely contained inline function to reduce function length and improve
legibility.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 6b597cf..e76942e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -940,6 +940,34 @@ out:
 	return ret;
 }
 
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q:		the futex_q to requeue
+ * @hb1:	the source hash_bucket
+ * @hb2:	the target hash_bucket
+ * @key2:	the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+		   struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+	/*
+	 * If key1 and key2 hash to the same bucket, no need to
+	 * requeue.
+	 */
+	if (likely(&hb1->chain != &hb2->chain)) {
+		plist_del(&q->list, &hb1->chain);
+		plist_add(&q->list, &hb2->chain);
+		q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+		q->list.plist.lock = &hb2->lock;
+#endif
+	}
+	get_futex_key_refs(key2);
+	q->key = *key2;
+}
+
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
@@ -999,20 +1027,7 @@ retry_private:
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			/*
-			 * If key1 and key2 hash to the same bucket, no need to
-			 * requeue.
-			 */
-			if (likely(head1 != &hb2->chain)) {
-				plist_del(&this->list, &hb1->chain);
-				plist_add(&this->list, &hb2->chain);
-				this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-				this->list.plist.lock = &hb2->lock;
-#endif
-			}
-			this->key = key2;
-			get_futex_key_refs(&key2);
+			requeue_futex(this, hb1, hb2, &key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
-- 
cgit v0.10.2


From f801073f87aa22ddf0e9146355fec3993163790f Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:40 -0700
Subject: futex: split out futex value validation code

Refactor the code to validate the expected futex value in order to
reuse it with the requeue_pi code.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index e76942e..dbe857a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1398,42 +1398,29 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	__set_current_state(TASK_RUNNING);
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr:	the futex userspace address
+ * @val:	the expected value
+ * @fshared:	whether the futex is shared (1) or not (0)
+ * @q:		the associated futex_q
+ * @hb:		storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket.  Get the futex value and
+ * compare it with the expected value.  Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ *  0 - uaddr contains val and hb has been locked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
-	struct hrtimer_sleeper timeout, *to = NULL;
-	DECLARE_WAITQUEUE(wait, current);
-	struct restart_block *restart;
-	struct futex_hash_bucket *hb;
-	struct futex_q q;
 	u32 uval;
 	int ret;
 
-	if (!bitset)
-		return -EINVAL;
-
-	q.pi_state = NULL;
-	q.bitset = bitset;
-
-	if (abs_time) {
-		to = &timeout;
-
-		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
-		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
-	}
-
-retry:
-	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
-	if (unlikely(ret != 0))
-		goto out;
-
-retry_private:
-	hb = queue_lock(&q);
-
 	/*
 	 * Access the page AFTER the hash-bucket is locked.
 	 * Order is important:
@@ -1450,33 +1437,74 @@ retry_private:
 	 * A consequence is that futex_wait() can return zero and absorb
 	 * a wakeup when *uaddr != val on entry to the syscall.  This is
 	 * rare, but normal.
-	 *
-	 * For shared futexes, we hold the mmap semaphore, so the mapping
-	 * cannot have changed since we looked it up in get_futex_key.
 	 */
+retry:
+	q->key = FUTEX_KEY_INIT;
+	ret = get_futex_key(uaddr, fshared, &q->key);
+	if (unlikely(ret != 0))
+		goto out;
+
+retry_private:
+	*hb = queue_lock(q);
+
 	ret = get_futex_value_locked(&uval, uaddr);
 
-	if (unlikely(ret)) {
-		queue_unlock(&q, hb);
+	if (ret) {
+		queue_unlock(q, *hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
-			goto out_put_key;
+			goto out;
 
 		if (!fshared)
 			goto retry_private;
 
-		put_futex_key(fshared, &q.key);
+		put_futex_key(fshared, &q->key);
 		goto retry;
 	}
-	ret = -EWOULDBLOCK;
 
-	/* Only actually queue if *uaddr contained val.  */
-	if (unlikely(uval != val)) {
-		queue_unlock(&q, hb);
-		goto out_put_key;
+	if (uval != val) {
+		queue_unlock(q, *hb);
+		ret = -EWOULDBLOCK;
 	}
 
+out:
+	if (ret)
+		put_futex_key(fshared, &q->key);
+	return ret;
+}
+
+static int futex_wait(u32 __user *uaddr, int fshared,
+		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	DECLARE_WAITQUEUE(wait, current);
+	struct restart_block *restart;
+	struct futex_hash_bucket *hb;
+	struct futex_q q;
+	int ret;
+
+	if (!bitset)
+		return -EINVAL;
+
+	q.pi_state = NULL;
+	q.bitset = bitset;
+
+	if (abs_time) {
+		to = &timeout;
+
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
+	/* Prepare to wait on uaddr. */
+	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	if (ret)
+		goto out;
+
 	/* queue_me and wait for wakeup, timeout, or a signal. */
 	futex_wait_queue_me(hb, &q, to, &wait);
 
-- 
cgit v0.10.2


From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:49 -0700
Subject: futex: add requeue_pi functionality

PI Futexes and their underlying rt_mutex cannot be left ownerless if
there are pending waiters as this will break the PI boosting logic, so
the standard requeue commands aren't sufficient.  The new commands
properly manage pi futex ownership by ensuring a futex with waiters
has an owner at all times.  This will allow glibc to properly handle
pi mutexes with pthread_condvars.

The approach taken here is to create two new futex op codes:

FUTEX_WAIT_REQUEUE_PI:
Tasks will use this op code to wait on a futex (such as a non-pi waitqueue)
and wake after they have been requeued to a pi futex.  Prior to returning to
userspace, they will acquire this pi futex (and the underlying rt_mutex).

futex_wait_requeue_pi() is the result of a high speed collision between
futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being
done by futex_proxy_trylock_atomic() on behalf of the top_waiter).

FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI):
This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI,
regardless of how many tasks the caller intends to wake or requeue.
pthread_cond_broadcast() should call this with nr_wake=1 and
nr_requeue=INT_MAX.  pthread_cond_signal() should call this with nr_wake=1 and
nr_requeue=0.  The reason being we need both callers to get the benefit of the
futex_proxy_trylock_atomic() routine.  futex_requeue() also enqueues the
top_waiter on the rt_mutex via rt_mutex_start_proxy_lock().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3bf5bb5..b05519c 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -23,6 +23,9 @@ union ktime;
 #define FUTEX_TRYLOCK_PI	8
 #define FUTEX_WAIT_BITSET	9
 #define FUTEX_WAKE_BITSET	10
+#define FUTEX_WAIT_REQUEUE_PI	11
+#define FUTEX_REQUEUE_PI	12
+#define FUTEX_CMP_REQUEUE_PI	13
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
@@ -38,6 +41,11 @@ union ktime;
 #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAIT_BITSET_PRIVATE	(FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE	(FUTEX_WAIT_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
+#define FUTEX_REQUEUE_PI_PRIVATE	(FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE	(FUTEX_CMP_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e6b820f..a8cc4e1 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -21,13 +21,14 @@ struct restart_block {
 		struct {
 			unsigned long arg0, arg1, arg2, arg3;
 		};
-		/* For futex_wait */
+		/* For futex_wait and futex_wait_requeue_pi */
 		struct {
 			u32 *uaddr;
 			u32 val;
 			u32 flags;
 			u32 bitset;
 			u64 time;
+			u32 *uaddr2;
 		} futex;
 		/* For nanosleep */
 		struct {
diff --git a/kernel/futex.c b/kernel/futex.c
index dbe857a..185c981 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
  *  PRIVATE futexes by Eric Dumazet
  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  *
+ *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ *  Copyright (C) IBM Corporation, 2009
+ *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -109,6 +113,9 @@ struct futex_q {
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
 
+	/* rt_waiter storage for requeue_pi: */
+	struct rt_mutex_waiter *rt_waiter;
+
 	/* Bitset for the optional bitmasked wakeup */
 	u32 bitset;
 };
@@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 
 	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
-			if (this->pi_state) {
+			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
 				break;
 			}
@@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	q->key = *key2;
 }
 
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * q:	the futex_q
+ * key:	the key of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+{
+	drop_futex_key_refs(&q->key);
+	get_futex_key_refs(key);
+	q->key = *key;
+
+	WARN_ON(plist_node_empty(&q->list));
+	plist_del(&q->list, &q->list.plist);
+
+	WARN_ON(!q->rt_waiter);
+	q->rt_waiter = NULL;
+
+	wake_up(&q->waiter);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:	the user address of the to futex
+ * @hb1:	the from futex hash bucket, must be locked by the caller
+ * @hb2:	the to futex hash bucket, must be locked by the caller
+ * @key1:	the from futex key
+ * @key2:	the to futex key
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomicly
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+				 struct futex_hash_bucket *hb1,
+				 struct futex_hash_bucket *hb2,
+				 union futex_key *key1, union futex_key *key2,
+				 struct futex_pi_state **ps)
+{
+	struct futex_q *top_waiter;
+	u32 curval;
+	int ret;
+
+	if (get_futex_value_locked(&curval, pifutex))
+		return -EFAULT;
+
+	top_waiter = futex_top_waiter(hb1, key1);
+
+	/* There are no waiters, nothing for us to do. */
+	if (!top_waiter)
+		return 0;
+
+	/*
+	 * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
+	 * The pi_state is returned in ps in contended cases.
+	 */
+	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
+	if (ret == 1)
+		requeue_pi_wake_futex(top_waiter, key2);
+
+	return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * uaddr1:	source futex user address
+ * uaddr2:	target futex user address
+ * nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
+ * nr_requeue:	number of waiters to requeue (0-INT_MAX)
+ * requeue_pi:	if we are attempting to requeue from a non-pi futex to a
+ * 		pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
  */
 static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval)
+			 int nr_wake, int nr_requeue, u32 *cmpval,
+			 int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+	int drop_count = 0, task_count = 0, ret;
+	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head1;
 	struct futex_q *this, *next;
-	int ret, drop_count = 0;
+	u32 curval2;
+
+	if (requeue_pi) {
+		/*
+		 * requeue_pi requires a pi_state, try to allocate it now
+		 * without any locks in case it fails.
+		 */
+		if (refill_pi_state_cache())
+			return -ENOMEM;
+		/*
+		 * requeue_pi must wake as many tasks as it can, up to nr_wake
+		 * + nr_requeue, since it acquires the rt_mutex prior to
+		 * returning to userspace, so as to not leave the rt_mutex with
+		 * waiters and no owner.  However, second and third wake-ups
+		 * cannot be predicted as they involve race conditions with the
+		 * first wake and a fault while looking up the pi_state.  Both
+		 * pthread_cond_signal() and pthread_cond_broadcast() should
+		 * use nr_wake=1.
+		 */
+		if (nr_wake != 1)
+			return -EINVAL;
+	}
 
 retry:
+	if (pi_state != NULL) {
+		/*
+		 * We will have to lookup the pi_state again, so free this one
+		 * to keep the accounting correct.
+		 */
+		free_pi_state(pi_state);
+		pi_state = NULL;
+	}
+
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
@@ -1020,19 +1145,94 @@ retry_private:
 		}
 	}
 
+	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+		/* Attempt to acquire uaddr2 and wake the top_waiter. */
+		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+						 &key2, &pi_state);
+
+		/*
+		 * At this point the top_waiter has either taken uaddr2 or is
+		 * waiting on it.  If the former, then the pi_state will not
+		 * exist yet, look it up one more time to ensure we have a
+		 * reference to it.
+		 */
+		if (ret == 1) {
+			WARN_ON(pi_state);
+			task_count++;
+			ret = get_futex_value_locked(&curval2, uaddr2);
+			if (!ret)
+				ret = lookup_pi_state(curval2, hb2, &key2,
+						      &pi_state);
+		}
+
+		switch (ret) {
+		case 0:
+			break;
+		case -EFAULT:
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+			ret = get_user(curval2, uaddr2);
+			if (!ret)
+				goto retry;
+			goto out;
+		case -EAGAIN:
+			/* The owner was exiting, try again. */
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+			cond_resched();
+			goto retry;
+		default:
+			goto out_unlock;
+		}
+	}
+
 	head1 = &hb1->chain;
 	plist_for_each_entry_safe(this, next, head1, list) {
-		if (!match_futex (&this->key, &key1))
+		if (task_count - nr_wake >= nr_requeue)
+			break;
+
+		if (!match_futex(&this->key, &key1))
 			continue;
-		if (++ret <= nr_wake) {
+
+		WARN_ON(!requeue_pi && this->rt_waiter);
+		WARN_ON(requeue_pi && !this->rt_waiter);
+
+		/*
+		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+		 * lock, we already woke the top_waiter.  If not, it will be
+		 * woken by futex_unlock_pi().
+		 */
+		if (++task_count <= nr_wake && !requeue_pi) {
 			wake_futex(this);
-		} else {
-			requeue_futex(this, hb1, hb2, &key2);
-			drop_count++;
+			continue;
+		}
 
-			if (ret - nr_wake >= nr_requeue)
-				break;
+		/*
+		 * Requeue nr_requeue waiters and possibly one more in the case
+		 * of requeue_pi if we couldn't acquire the lock atomically.
+		 */
+		if (requeue_pi) {
+			/* Prepare the waiter to take the rt_mutex. */
+			atomic_inc(&pi_state->refcount);
+			this->pi_state = pi_state;
+			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+							this->rt_waiter,
+							this->task, 1);
+			if (ret == 1) {
+				/* We got the lock. */
+				requeue_pi_wake_futex(this, &key2);
+				continue;
+			} else if (ret) {
+				/* -EDEADLK */
+				this->pi_state = NULL;
+				free_pi_state(pi_state);
+				goto out_unlock;
+			}
 		}
+		requeue_futex(this, hb1, hb2, &key2);
+		drop_count++;
 	}
 
 out_unlock:
@@ -1047,7 +1247,9 @@ out_put_keys:
 out_put_key1:
 	put_futex_key(fshared, &key1);
 out:
-	return ret;
+	if (pi_state != NULL)
+		free_pi_state(pi_state);
+	return ret ? ret : task_count;
 }
 
 /* The key must be already stored in q->key. */
@@ -1270,6 +1472,7 @@ handle_fault:
 #define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
+static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
+	q.rt_waiter = NULL;
 
 	if (abs_time) {
 		to = &timeout;
@@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	}
 
 	q.pi_state = NULL;
+	q.rt_waiter = NULL;
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
@@ -1701,6 +1906,20 @@ uaddr_faulted:
 	goto retry;
 }
 
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+	ktime_t t, *tp = NULL;
+	int fshared = restart->futex.flags & FLAGS_SHARED;
+
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+		t.tv64 = restart->futex.time;
+		tp = &t;
+	}
+	restart->fn = do_no_restart_syscall;
+
+	return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
+}
 
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
@@ -1803,6 +2022,253 @@ pi_faulted:
 	return ret;
 }
 
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb:		the hash_bucket futex_q was original enqueued on
+ * @q:		the futex_q woken while waiting to be requeued
+ * @key2:	the futex_key of the requeue target futex
+ * @timeout:	the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex.  If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller.  Must be
+ * called with the hb lock held.
+ *
+ * Returns
+ *  0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+				   struct futex_q *q, union futex_key *key2,
+				   struct hrtimer_sleeper *timeout)
+{
+	int ret = 0;
+
+	/*
+	 * With the hb lock held, we avoid races while we process the wakeup.
+	 * We only need to hold hb (and not hb2) to ensure atomicity as the
+	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+	 * It can't be requeued from uaddr2 to something else since we don't
+	 * support a PI aware source futex for requeue.
+	 */
+	if (!match_futex(&q->key, key2)) {
+		WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+		/*
+		 * We were woken prior to requeue by a timeout or a signal.
+		 * Unqueue the futex_q and determine which it was.
+		 */
+		plist_del(&q->list, &q->list.plist);
+		drop_futex_key_refs(&q->key);
+
+		if (timeout && !timeout->task)
+			ret = -ETIMEDOUT;
+		else {
+			/*
+			 * We expect signal_pending(current), but another
+			 * thread may have handled it for us already.
+			 */
+			/* FIXME: ERESTARTSYS or ERESTARTNOINTR?  Do we care if
+			 * the user specified SA_RESTART or not? */
+			ret = -ERESTARTSYS;
+		}
+	}
+	return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr:	the futex we initialyl wait on (non-pi)
+ * @fshared:	whether the futexes are shared (1) or not (0).  They must be
+ * 		the same type, no requeueing from private to shared, etc.
+ * @val:	the expected value of uaddr
+ * @abs_time:	absolute timeout
+ * @bitset:	32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2:	the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ *  0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+				 u32 val, ktime_t *abs_time, u32 bitset,
+				 int clockrt, u32 __user *uaddr2)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct rt_mutex_waiter rt_waiter;
+	struct rt_mutex *pi_mutex = NULL;
+	DECLARE_WAITQUEUE(wait, current);
+	struct restart_block *restart;
+	struct futex_hash_bucket *hb;
+	union futex_key key2;
+	struct futex_q q;
+	int res, ret;
+	u32 uval;
+
+	if (!bitset)
+		return -EINVAL;
+
+	if (abs_time) {
+		to = &timeout;
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
+	/*
+	 * The waiter is allocated on our stack, manipulated by the requeue
+	 * code while we sleep on uaddr.
+	 */
+	debug_rt_mutex_init_waiter(&rt_waiter);
+	rt_waiter.task = NULL;
+
+	q.pi_state = NULL;
+	q.bitset = bitset;
+	q.rt_waiter = &rt_waiter;
+
+	key2 = FUTEX_KEY_INIT;
+	ret = get_futex_key(uaddr2, fshared, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	/* Prepare to wait on uaddr. */
+	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	if (ret) {
+		put_futex_key(fshared, &key2);
+		goto out;
+	}
+
+	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
+	futex_wait_queue_me(hb, &q, to, &wait);
+
+	spin_lock(&hb->lock);
+	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+	spin_unlock(&hb->lock);
+	if (ret)
+		goto out_put_keys;
+
+	/*
+	 * In order for us to be here, we know our q.key == key2, and since
+	 * we took the hb->lock above, we also know that futex_requeue() has
+	 * completed and we no longer have to concern ourselves with a wakeup
+	 * race with the atomic proxy lock acquition by the requeue code.
+	 */
+
+	/* Check if the requeue code acquired the second futex for us. */
+	if (!q.rt_waiter) {
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case.
+		 */
+		if (q.pi_state && (q.pi_state->owner != current)) {
+			spin_lock(q.lock_ptr);
+			ret = fixup_pi_state_owner(uaddr2, &q, current,
+						   fshared);
+			spin_unlock(q.lock_ptr);
+		}
+	} else {
+		/*
+		 * We have been woken up by futex_unlock_pi(), a timeout, or a
+		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
+		 * the pi_state.
+		 */
+		WARN_ON(!&q.pi_state);
+		pi_mutex = &q.pi_state->pi_mutex;
+		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+		debug_rt_mutex_free_waiter(&rt_waiter);
+
+		spin_lock(q.lock_ptr);
+		/*
+		 * Fixup the pi_state owner and possibly acquire the lock if we
+		 * haven't already.
+		 */
+		res = fixup_owner(uaddr2, fshared, &q, !ret);
+		/*
+		 * If fixup_owner() returned an error, proprogate that.  If it
+		 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+		 */
+		if (res)
+			ret = (res < 0) ? res : 0;
+
+		/* Unqueue and drop the lock. */
+		unqueue_me_pi(&q);
+	}
+
+	/*
+	 * If fixup_pi_state_owner() faulted and was unable to handle the
+	 * fault, unlock the rt_mutex and return the fault to userspace.
+	 */
+	if (ret == -EFAULT) {
+		if (rt_mutex_owner(pi_mutex) == current)
+			rt_mutex_unlock(pi_mutex);
+	} else if (ret == -EINTR) {
+		ret = -EFAULT;
+		if (get_user(uval, uaddr2))
+			goto out_put_keys;
+
+		/*
+		 * We've already been requeued, so restart by calling
+		 * futex_lock_pi() directly, rather then returning to this
+		 * function.
+		 */
+		ret = -ERESTART_RESTARTBLOCK;
+		restart = &current_thread_info()->restart_block;
+		restart->fn = futex_lock_pi_restart;
+		restart->futex.uaddr = (u32 *)uaddr2;
+		restart->futex.val = uval;
+		restart->futex.flags = 0;
+		if (abs_time) {
+			restart->futex.flags |= FLAGS_HAS_TIMEOUT;
+			restart->futex.time = abs_time->tv64;
+		}
+
+		if (fshared)
+			restart->futex.flags |= FLAGS_SHARED;
+		if (clockrt)
+			restart->futex.flags |= FLAGS_CLOCKRT;
+	}
+
+out_put_keys:
+	put_futex_key(fshared, &q.key);
+	put_futex_key(fshared, &key2);
+
+out:
+	if (to) {
+		hrtimer_cancel(&to->timer);
+		destroy_hrtimer_on_stack(&to->timer);
+	}
+	return ret;
+}
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		fshared = 1;
 
 	clockrt = op & FUTEX_CLOCK_REALTIME;
-	if (clockrt && cmd != FUTEX_WAIT_BITSET)
+	if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
 		return -ENOSYS;
 
 	switch (cmd) {
@@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		ret = futex_wake(uaddr, fshared, val, val3);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
 		break;
 	case FUTEX_CMP_REQUEUE:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+				    0);
 		break;
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		if (futex_cmpxchg_enabled)
 			ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
 		break;
+	case FUTEX_WAIT_REQUEUE_PI:
+		val3 = FUTEX_BITSET_MATCH_ANY;
+		ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+					    clockrt, uaddr2);
+		break;
+	case FUTEX_REQUEUE_PI:
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
+		break;
+	case FUTEX_CMP_REQUEUE_PI:
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+				    1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	int cmd = op & FUTEX_CMD_MASK;
 
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-		      cmd == FUTEX_WAIT_BITSET)) {
+		      cmd == FUTEX_WAIT_BITSET ||
+		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
@@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 		tp = &t;
 	}
 	/*
-	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+	 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
 	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
 	 */
 	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
 	    cmd == FUTEX_WAKE_OP)
 		val2 = (u32) (unsigned long) utime;
 
-- 
cgit v0.10.2


From 7ba5779533819fc061b4afafcb4a609d55f37057 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 6 Apr 2009 20:49:14 +0900
Subject: tomoyo: remove "undelete domain" command.

Since TOMOYO's policy management tools does not use the "undelete domain"
command, we decided to remove that command.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 92cea65..a0affd9 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -1252,15 +1252,12 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head)
 	struct tomoyo_domain_info *domain = head->write_var1;
 	bool is_delete = false;
 	bool is_select = false;
-	bool is_undelete = false;
 	unsigned int profile;
 
 	if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_DELETE))
 		is_delete = true;
 	else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_SELECT))
 		is_select = true;
-	else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_UNDELETE))
-		is_undelete = true;
 	if (is_select && tomoyo_is_select_one(head, data))
 		return 0;
 	/* Don't allow updating policies by non manager programs. */
@@ -1274,9 +1271,7 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head)
 			down_read(&tomoyo_domain_list_lock);
 			domain = tomoyo_find_domain(data);
 			up_read(&tomoyo_domain_list_lock);
-		} else if (is_undelete)
-			domain = tomoyo_undelete_domain(data);
-		else
+		} else
 			domain = tomoyo_find_or_assign_new_domain(data, 0);
 		head->write_var1 = domain;
 		return 0;
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 26a76d6..e77e6a6 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -88,10 +88,7 @@ struct tomoyo_domain_info {
 	/* Name of this domain. Never NULL.          */
 	const struct tomoyo_path_info *domainname;
 	u8 profile;        /* Profile number to use. */
-	u8 is_deleted;     /* Delete flag.
-			      0 = active.
-			      1 = deleted but undeletable.
-			      255 = deleted and no longer undeletable. */
+	bool is_deleted;   /* Delete flag.           */
 	bool quota_warned; /* Quota warnning flag.   */
 	/* DOMAIN_FLAGS_*. Use tomoyo_set_domain_flag() to modify. */
 	u8 flags;
@@ -144,7 +141,6 @@ struct tomoyo_double_path_acl_record {
 #define TOMOYO_KEYWORD_NO_INITIALIZE_DOMAIN      "no_initialize_domain "
 #define TOMOYO_KEYWORD_NO_KEEP_DOMAIN            "no_keep_domain "
 #define TOMOYO_KEYWORD_SELECT                    "select "
-#define TOMOYO_KEYWORD_UNDELETE                  "undelete "
 #define TOMOYO_KEYWORD_USE_PROFILE               "use_profile "
 #define TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ  "ignore_global_allow_read"
 /* A domain definition starts with <kernel>. */
@@ -267,8 +263,6 @@ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
 struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 							    domainname,
 							    const u8 profile);
-/* Undelete a domain. */
-struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname);
 /* Check mode for specified functionality. */
 unsigned int tomoyo_check_flags(const struct tomoyo_domain_info *domain,
 				const u8 index);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 093a756..2f2b449 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -551,9 +551,7 @@ int tomoyo_write_alias_policy(char *data, const bool is_delete)
 	return tomoyo_update_alias_entry(data, cp, is_delete);
 }
 
-/* Domain create/delete/undelete handler. */
-
-/* #define TOMOYO_DEBUG_DOMAIN_UNDELETE */
+/* Domain create/delete handler. */
 
 /**
  * tomoyo_delete_domain - Delete a domain.
@@ -571,41 +569,15 @@ int tomoyo_delete_domain(char *domainname)
 	tomoyo_fill_path_info(&name);
 	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-	printk(KERN_DEBUG "tomoyo_delete_domain %s\n", domainname);
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(domain->domainname, &name))
-			continue;
-		printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted);
-	}
-#endif
 	/* Is there an active domain? */
 	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		struct tomoyo_domain_info *domain2;
 		/* Never delete tomoyo_kernel_domain */
 		if (domain == &tomoyo_kernel_domain)
 			continue;
 		if (domain->is_deleted ||
 		    tomoyo_pathcmp(domain->domainname, &name))
 			continue;
-		/* Mark already deleted domains as non undeletable. */
-		list_for_each_entry(domain2, &tomoyo_domain_list, list) {
-			if (!domain2->is_deleted ||
-			    tomoyo_pathcmp(domain2->domainname, &name))
-				continue;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-			if (domain2->is_deleted != 255)
-				printk(KERN_DEBUG
-				       "Marked %p as non undeletable\n",
-				       domain2);
-#endif
-			domain2->is_deleted = 255;
-		}
-		/* Delete and mark active domain as undeletable. */
-		domain->is_deleted = 1;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "Marked %p as undeletable\n", domain);
-#endif
+		domain->is_deleted = true;
 		break;
 	}
 	up_write(&tomoyo_domain_list_lock);
@@ -614,58 +586,6 @@ int tomoyo_delete_domain(char *domainname)
 }
 
 /**
- * tomoyo_undelete_domain - Undelete a domain.
- *
- * @domainname: The name of domain.
- *
- * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise.
- */
-struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname)
-{
-	struct tomoyo_domain_info *domain;
-	struct tomoyo_domain_info *candidate_domain = NULL;
-	struct tomoyo_path_info name;
-
-	name.name = domainname;
-	tomoyo_fill_path_info(&name);
-	/***** EXCLUSIVE SECTION START *****/
-	down_write(&tomoyo_domain_list_lock);
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-	printk(KERN_DEBUG "tomoyo_undelete_domain %s\n", domainname);
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(domain->domainname, &name))
-			continue;
-		printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted);
-	}
-#endif
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(&name, domain->domainname))
-			continue;
-		if (!domain->is_deleted) {
-			/* This domain is active. I can't undelete. */
-			candidate_domain = NULL;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-			printk(KERN_DEBUG "%p is active. I can't undelete.\n",
-			       domain);
-#endif
-			break;
-		}
-		/* Is this domain undeletable? */
-		if (domain->is_deleted == 1)
-			candidate_domain = domain;
-	}
-	if (candidate_domain) {
-		candidate_domain->is_deleted = 0;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "%p was undeleted.\n", candidate_domain);
-#endif
-	}
-	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
-	return candidate_domain;
-}
-
-/**
  * tomoyo_find_or_assign_new_domain - Create a domain.
  *
  * @domainname: The name of domain.
@@ -711,10 +631,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		/***** CRITICAL SECTION END *****/
 		if (flag)
 			continue;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "Reusing %p %s\n", domain,
-		       domain->domainname->name);
-#endif
 		list_for_each_entry(ptr, &domain->acl_info_list, list) {
 			ptr->type |= TOMOYO_ACL_DELETED;
 		}
@@ -722,7 +638,7 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		domain->profile = profile;
 		domain->quota_warned = false;
 		mb(); /* Avoid out-of-order execution. */
-		domain->is_deleted = 0;
+		domain->is_deleted = false;
 		goto out;
 	}
 	/* No memory reusable. Create using new memory. */
-- 
cgit v0.10.2


From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:44:59 +0200
Subject: perf_counter: update mmap() counter read, take 2

Update the userspace read method.

Paul noted that:
 - userspace cannot observe ->lock & 1 on the same cpu.
 - we need a barrier() between reading ->lock and ->index
   to ensure we read them in that prticular order.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.368446033@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f2b914d..e22ab47 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -170,22 +170,18 @@ struct perf_counter_mmap_page {
 	 *   u32 seq;
 	 *   s64 count;
 	 *
-	 * again:
-	 *   seq = pc->lock;
-	 *   if (unlikely(seq & 1)) {
-	 *     cpu_relax();
-	 *     goto again;
-	 *   }
+	 *   do {
+	 *     seq = pc->lock;
 	 *
-	 *   if (pc->index) {
-	 *     count = pmc_read(pc->index - 1);
-	 *     count += pc->offset;
-	 *   } else
-	 *     goto regular_read;
+	 *     barrier()
+	 *     if (pc->index) {
+	 *       count = pmc_read(pc->index - 1);
+	 *       count += pc->offset;
+	 *     } else
+	 *       goto regular_read;
 	 *
-	 *   barrier();
-	 *   if (pc->lock != seq)
-	 *     goto again;
+	 *     barrier();
+	 *   } while (pc->lock != seq);
 	 *
 	 * NOTE: for obvious reason this only works on self-monitoring
 	 *       processes.
-- 
cgit v0.10.2


From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:00 +0200
Subject: perf_counter: add more context information

Change the callchain context entries to u16, so as to gain some space.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.457320003@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e22ab47..f9d5cf0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
-#define MAX_STACK_DEPTH		254
+#define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
-	u32	nr, hv, kernel, user;
+	u16	nr, hv, kernel, user;
 	u64	ip[MAX_STACK_DEPTH];
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2a5d4f5..727624d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void)
  * Callchain support -- arch specific
  */
 
-struct perf_callchain_entry *
-__attribute__((weak))
-perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
 	return NULL;
 }
@@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter,
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
-			callchain_size = (2 + callchain->nr) * sizeof(u64);
+			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
 			header.type |= __PERF_EVENT_CALLCHAIN;
 			header.size += callchain_size;
-- 
cgit v0.10.2


From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:01 +0200
Subject: perf_counter: SIGIO support

Provide support for fcntl() I/O availability signals.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.579788800@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f9d5cf0..8d5d11b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -238,6 +238,7 @@ enum perf_event_type {
 #include <linux/rcupdate.h>
 #include <linux/spinlock.h>
 #include <linux/hrtimer.h>
+#include <linux/fs.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -398,6 +399,7 @@ struct perf_counter {
 
 	/* poll related */
 	wait_queue_head_t		waitq;
+	struct fasync_struct		*fasync;
 	/* optional: for NMIs */
 	struct perf_wakeup_entry	wakeup;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 727624d..c58cc64 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1526,6 +1526,22 @@ out:
 	return ret;
 }
 
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+	struct perf_counter *counter = filp->private_data;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int retval;
+
+	mutex_lock(&inode->i_mutex);
+	retval = fasync_helper(fd, filp, on, &counter->fasync);
+	mutex_unlock(&inode->i_mutex);
+
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
@@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = {
 	.unlocked_ioctl		= perf_ioctl,
 	.compat_ioctl		= perf_ioctl,
 	.mmap			= perf_mmap,
+	.fasync			= perf_fasync,
 };
 
 /*
@@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (data) {
-		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		atomic_set(&data->wakeup, POLL_IN);
 		/*
 		 * Ensure all data writes are issued before updating the
 		 * user-space data head information. The matching rmb()
@@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_unlock();
 
 	wake_up_all(&counter->waitq);
+	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
 /*
-- 
cgit v0.10.2


From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:02 +0200
Subject: perf_counter: generalize pending infrastructure

Prepare the pending infrastructure to do more than wakeups.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.634732847@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8d5d11b..977fb15 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -321,8 +321,9 @@ struct perf_mmap_data {
 	void 				*data_pages[0];
 };
 
-struct perf_wakeup_entry {
-	struct perf_wakeup_entry *next;
+struct perf_pending_entry {
+	struct perf_pending_entry *next;
+	void (*func)(struct perf_pending_entry *);
 };
 
 /**
@@ -401,7 +402,7 @@ struct perf_counter {
 	wait_queue_head_t		waitq;
 	struct fasync_struct		*fasync;
 	/* optional: for NMIs */
-	struct perf_wakeup_entry	wakeup;
+	struct perf_pending_entry	pending;
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c58cc64..0a2ade2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
+static void perf_pending_wakeup(struct perf_pending_entry *entry)
+{
+	struct perf_counter *counter = container_of(entry,
+			struct perf_counter, pending);
+
+	perf_counter_wakeup(counter);
+}
+
 /*
  * Pending wakeups
  *
@@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter)
  * single linked list and use cmpxchg() to add entries lockless.
  */
 
-#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
 
-static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
 	PENDING_TAIL,
 };
 
-static void perf_pending_queue(struct perf_counter *counter)
+static void perf_pending_queue(struct perf_pending_entry *entry,
+			       void (*func)(struct perf_pending_entry *))
 {
-	struct perf_wakeup_entry **head;
-	struct perf_wakeup_entry *prev, *next;
+	struct perf_pending_entry **head;
 
-	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
 		return;
 
-	head = &get_cpu_var(perf_wakeup_head);
+	entry->func = func;
+
+	head = &get_cpu_var(perf_pending_head);
 
 	do {
-		prev = counter->wakeup.next = *head;
-		next = &counter->wakeup;
-	} while (cmpxchg(head, prev, next) != prev);
+		entry->next = *head;
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
 
 	set_perf_counter_pending();
 
-	put_cpu_var(perf_wakeup_head);
+	put_cpu_var(perf_pending_head);
 }
 
 static int __perf_pending_run(void)
 {
-	struct perf_wakeup_entry *list;
+	struct perf_pending_entry *list;
 	int nr = 0;
 
-	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
 	while (list != PENDING_TAIL) {
-		struct perf_counter *counter = container_of(list,
-				struct perf_counter, wakeup);
+		void (*func)(struct perf_pending_entry *);
+		struct perf_pending_entry *entry = list;
 
 		list = list->next;
 
-		counter->wakeup.next = NULL;
+		func = entry->func;
+		entry->next = NULL;
 		/*
 		 * Ensure we observe the unqueue before we issue the wakeup,
 		 * so that we won't be waiting forever.
@@ -1636,7 +1646,7 @@ static int __perf_pending_run(void)
 		 */
 		smp_wmb();
 
-		perf_counter_wakeup(counter);
+		func(entry);
 		nr++;
 	}
 
@@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter)
 	 * so that we do not miss the wakeup. -- see perf_pending_handle()
 	 */
 	smp_rmb();
-	return counter->wakeup.next == NULL;
+	return counter->pending.next == NULL;
 }
 
 static void perf_pending_sync(struct perf_counter *counter)
@@ -1695,9 +1705,10 @@ struct perf_output_handle {
 
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 {
-	if (handle->nmi)
-		perf_pending_queue(handle->counter);
-	else
+	if (handle->nmi) {
+		perf_pending_queue(&handle->counter->pending,
+				   perf_pending_wakeup);
+	} else
 		perf_counter_wakeup(handle->counter);
 }
 
-- 
cgit v0.10.2


From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:03 +0200
Subject: perf_counter: x86: self-IPI for pending work

Implement set_perf_counter_pending() with a self-IPI so that it will
run ASAP in a usable context.

For now use a second IRQ vector, because the primary vector pokes
the apic in funny ways that seem to confuse things.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.724626696@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bed..fe24d28 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
 BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 2545442..f5ebe2a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 	unsigned int generic_irqs;	/* arch dependent */
 	unsigned int apic_perf_irqs;
+	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ae80f64..7309c0a 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void perf_counter_interrupt(void);
+extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79b..545bb81 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
 #define GENERIC_INTERRUPT_VECTOR	0xed
 
 /*
+ * Performance monitoring pending work vector:
+ */
+#define LOCAL_PENDING_VECTOR		0xec
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index e2b0e66..d08dd52 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,7 +84,8 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	do { } while (0)
+extern void set_perf_counter_pending(void);
+
 #define clear_perf_counter_pending()	do { } while (0)
 #define test_perf_counter_pending()	(0)
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c74e20d..4384158 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_counter_do_pending();
+	irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3f129d9..1d46cba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 #ifdef CONFIG_PERF_COUNTERS
 apicinterrupt LOCAL_PERF_VECTOR \
 	perf_counter_interrupt smp_perf_counter_interrupt
+apicinterrupt LOCAL_PENDING_VECTOR \
+	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9c27543..d465487 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
+	seq_printf(p, "PND: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+	seq_printf(p, "  Performance pending work\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "PLT: ");
@@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
+	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 925d87c..3190a6b 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -166,6 +166,7 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
 # ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 665e2ab..53ceb26 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -156,6 +156,7 @@ static void __init apic_intr_init(void)
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
 
-- 
cgit v0.10.2


From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:04 +0200
Subject: perf_counter: theres more to overflow than writing events

Prepare for more generic overflow handling. The new perf_counter_overflow()
method will handle the generic bits of the counter overflow, and can return
a !0 return value, in which case the counter should be (soft) disabled, so
that it won't count until it's properly disabled.

XXX: do powerpc and swcounter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.812109629@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0a4d14f..f88c35d 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_output(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 4384158..1116a41 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,8 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-		perf_counter_output(counter, nmi, regs);
+		if (perf_counter_overflow(counter, nmi, regs))
+			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 977fb15..ca2d4df 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_counter_context *ctx, int cpu);
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
-extern void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs);
+extern int perf_counter_overflow(struct perf_counter *counter,
+				 int nmi, struct pt_regs *regs);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0a2ade2..195e976 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-void perf_counter_output(struct perf_counter *counter,
-			 int nmi, struct pt_regs *regs)
+static void perf_counter_output(struct perf_counter *counter,
+				int nmi, struct pt_regs *regs)
 {
 	int ret;
 	u64 record_type = counter->hw_event.record_type;
@@ -2034,6 +2034,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
+ * Generic counter overflow handling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter,
+			  int nmi, struct pt_regs *regs)
+{
+	perf_counter_output(counter, nmi, regs);
+	return 0;
+}
+
+/*
  * Generic software counter infrastructure
  */
 
@@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
+	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_counter *counter;
 	struct pt_regs *regs;
 
@@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			!counter->hw_event.exclude_user)
 		regs = task_pt_regs(current);
 
-	if (regs)
-		perf_counter_output(counter, 0, regs);
+	if (regs) {
+		if (perf_counter_overflow(counter, 0, regs))
+			ret = HRTIMER_NORESTART;
+	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
 
-	return HRTIMER_RESTART;
+	return ret;
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
@@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	perf_counter_output(counter, nmi, regs);
+	if (perf_counter_overflow(counter, nmi, regs))
+		/* soft-disable the counter */
+		;
+
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
cgit v0.10.2


From ebb3c4c4cb81d64cc041356915ec015e2c57092a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:05 +0200
Subject: perf_counter: fix the mlock accounting

Reading through the code I saw I forgot the finish the mlock accounting.
Do so now.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.899767331@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 195e976..c841563 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1461,13 +1461,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
+		vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
 	}
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
-	.open = perf_mmap_open,
+	.open  = perf_mmap_open,
 	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };
@@ -1499,24 +1500,32 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
-	locked = vma_size >>  PAGE_SHIFT;
-	locked += vma->vm_mm->locked_vm;
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count)) {
+		if (nr_pages != counter->data->nr_pages)
+			ret = -EINVAL;
+		goto unlock;
+	}
+
+	locked = vma->vm_mm->locked_vm;
+	locked += nr_pages + 1;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
 
-	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
-		return -EPERM;
-
-	mutex_lock(&counter->mmap_mutex);
-	if (atomic_inc_not_zero(&counter->mmap_count))
-		goto out;
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -EPERM;
+		goto unlock;
+	}
 
 	WARN_ON(counter->data);
 	ret = perf_mmap_data_alloc(counter, nr_pages);
-	if (!ret)
-		atomic_set(&counter->mmap_count, 1);
-out:
+	if (ret)
+		goto unlock;
+
+	atomic_set(&counter->mmap_count, 1);
+	vma->vm_mm->locked_vm += nr_pages + 1;
+unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
 	vma->vm_flags &= ~VM_MAYWRITE;
-- 
cgit v0.10.2


From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:06 +0200
Subject: perf_counter: PERF_RECORD_TIME

By popular request, provide means to log a timestamp along with the
counter overflow event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.024173282@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ca2d4df..928a7fa 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -102,6 +102,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_TID		= 1U << 1,
 	PERF_RECORD_GROUP	= 1U << 2,
 	PERF_RECORD_CALLCHAIN	= 1U << 3,
+	PERF_RECORD_TIME	= 1U << 4,
 };
 
 /*
@@ -221,6 +222,7 @@ enum perf_event_type {
 	__PERF_EVENT_TID		= PERF_RECORD_TID,
 	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
 	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
+	__PERF_EVENT_TIME		= PERF_RECORD_TIME,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c841563..19990d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter,
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
+	u64 time;
 
 	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
@@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter,
 		}
 	}
 
+	if (record_type & PERF_RECORD_TIME) {
+		/*
+		 * Maybe do better on x86 and provide cpu_clock_nmi()
+		 */
+		time = sched_clock();
+
+		header.type |= __PERF_EVENT_TIME;
+		header.size += sizeof(u64);
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi);
 	if (ret)
 		return;
@@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (callchain)
 		perf_output_copy(&handle, callchain, callchain_size);
 
+	if (record_type & PERF_RECORD_TIME)
+		perf_output_put(&handle, time);
+
 	perf_output_end(&handle);
 }
 
-- 
cgit v0.10.2


From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:07 +0200
Subject: perf_counter: counter overflow limit

Provide means to auto-disable the counter after 'n' overflow events.

Create the counter with hw_event.disabled = 1, and then issue an
ioctl(fd, PREF_COUNTER_IOC_REFRESH, n); to set the limit and enable
the counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.083139737@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 928a7fa..ef4dcbf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -155,8 +155,9 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
 
 /*
  * Structure of the page that can be mapped via mmap
@@ -403,9 +404,14 @@ struct perf_counter {
 	/* poll related */
 	wait_queue_head_t		waitq;
 	struct fasync_struct		*fasync;
-	/* optional: for NMIs */
+
+	/* delayed work for NMIs and such */
+	int				pending_wakeup;
+	int				pending_disable;
 	struct perf_pending_entry	pending;
 
+	atomic_t			event_limit;
+
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 19990d1..c05e103 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+	atomic_add(refresh, &counter->event_limit);
+	perf_counter_enable(counter);
+}
+
 /*
  * Enable a counter and all its children.
  */
@@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_DISABLE:
 		perf_counter_disable_family(counter);
 		break;
+	case PERF_COUNTER_IOC_REFRESH:
+		perf_counter_refresh(counter, arg);
+		break;
 	default:
 		err = -ENOTTY;
 	}
@@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
-static void perf_pending_wakeup(struct perf_pending_entry *entry)
-{
-	struct perf_counter *counter = container_of(entry,
-			struct perf_counter, pending);
-
-	perf_counter_wakeup(counter);
-}
-
 /*
  * Pending wakeups
  *
@@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry)
  * single linked list and use cmpxchg() to add entries lockless.
  */
 
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+	struct perf_counter *counter = container_of(entry,
+			struct perf_counter, pending);
+
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		perf_counter_disable(counter);
+	}
+
+	if (counter->pending_wakeup) {
+		counter->pending_wakeup = 0;
+		perf_counter_wakeup(counter);
+	}
+}
+
 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
 
 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
@@ -1715,8 +1732,9 @@ struct perf_output_handle {
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 {
 	if (handle->nmi) {
+		handle->counter->pending_wakeup = 1;
 		perf_pending_queue(&handle->counter->pending,
-				   perf_pending_wakeup);
+				   perf_pending_counter);
 	} else
 		perf_counter_wakeup(handle->counter);
 }
@@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 int perf_counter_overflow(struct perf_counter *counter,
 			  int nmi, struct pt_regs *regs)
 {
+	int events = atomic_read(&counter->event_limit);
+	int ret = 0;
+
+	if (events && atomic_dec_and_test(&counter->event_limit)) {
+		ret = 1;
+		if (nmi) {
+			counter->pending_disable = 1;
+			perf_pending_queue(&counter->pending,
+					   perf_pending_counter);
+		} else
+			perf_counter_disable(counter);
+	}
+
 	perf_counter_output(counter, nmi, regs);
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v0.10.2


From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:08 +0200
Subject: perf_counter: comment the perf_event_type stuff

Describe the event format.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.211174347@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ef4dcbf..81220188 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -208,6 +208,20 @@ struct perf_event_header {
 
 enum perf_event_type {
 
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u32				pid, tid;
+	 * 	u64				addr;
+	 * 	u64				len;
+	 * 	u64				pgoff;
+	 * 	char				filename[];
+	 * };
+	 */
 	PERF_EVENT_MMAP			= 1,
 	PERF_EVENT_MUNMAP		= 2,
 
@@ -217,6 +231,24 @@ enum perf_event_type {
 	 *
 	 * These events will have types of the form:
 	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	{ u64			ip;	  } && __PERF_EVENT_IP
+	 * 	{ u32			pid, tid; } && __PERF_EVENT_TID
+	 *
+	 * 	{ u64			nr;
+	 * 	  { u64 event, val; } 	cnt[nr];  } && __PERF_EVENT_GROUP
+	 *
+	 * 	{ u16			nr,
+	 * 				hv,
+	 * 				kernel,
+	 * 				user;
+	 * 	  u64			ips[nr];  } && __PERF_EVENT_CALLCHAIN
+	 *
+	 * 	{ u64			time;     } && __PERF_EVENT_TIME
+	 * };
 	 */
 	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
 	__PERF_EVENT_IP			= PERF_RECORD_IP,
-- 
cgit v0.10.2


From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:09 +0200
Subject: perf_counter: change event definition

Currently the definition of an event is slightly ambiguous. We have
wakeup events, for poll() and SIGIO, which are either generated
when a record crosses a page boundary (hw_events.wakeup_events == 0),
or every wakeup_events new records.

Now a record can be either a counter overflow record, or a number of
different things, like the mmap PROT_EXEC region notifications.

Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only
considers counter overflows.

This patch changes then wakeup_events and SIGIO notification to only
consider overflow events. Furthermore it changes the SIGIO notification
to report SIGHUP when the event limit is reached and the counter will
be disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.266679874@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81220188..0f5a400 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -439,6 +439,7 @@ struct perf_counter {
 
 	/* delayed work for NMIs and such */
 	int				pending_wakeup;
+	int				pending_kill;
 	int				pending_disable;
 	struct perf_pending_entry	pending;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c05e103..8c8eaf0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_unlock();
 
 	wake_up_all(&counter->waitq);
-	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
+
+	if (counter->pending_kill) {
+		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+		counter->pending_kill = 0;
+	}
 }
 
 /*
@@ -1727,6 +1731,7 @@ struct perf_output_handle {
 	unsigned int		head;
 	int			wakeup;
 	int			nmi;
+	int			overflow;
 };
 
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
@@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
-			     int nmi)
+			     int nmi, int overflow)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
@@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
-	handle->counter	= counter;
-	handle->nmi	= nmi;
+	handle->counter	 = counter;
+	handle->nmi	 = nmi;
+	handle->overflow = overflow;
 
 	if (!data->nr_pages)
 		goto fail;
@@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 {
 	int wakeup_events = handle->counter->hw_event.wakeup_events;
 
-	if (wakeup_events) {
+	if (handle->overflow && wakeup_events) {
 		int events = atomic_inc_return(&handle->data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &handle->data->events);
@@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
-	ret = perf_output_begin(&handle, counter, header.size, nmi);
+	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
 
@@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 {
 	struct perf_output_handle handle;
 	int size = mmap_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size, 0);
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
 
 	if (ret)
 		return;
@@ -2084,8 +2090,10 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	counter->pending_kill = POLL_IN;
 	if (events && atomic_dec_and_test(&counter->event_limit)) {
 		ret = 1;
+		counter->pending_kill = POLL_HUP;
 		if (nmi) {
 			counter->pending_disable = 1;
 			perf_pending_queue(&counter->pending,
-- 
cgit v0.10.2


From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:10 +0200
Subject: perf_counter: rework context time

Since perf_counter_context is switched along with tasks, we can
maintain the context time without using the task runtime clock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.353552838@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0f5a400..7f5d353 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -477,14 +477,10 @@ struct perf_counter_context {
 	struct task_struct	*task;
 
 	/*
-	 * time_now is the current time in nanoseconds since an arbitrary
-	 * point in the past.  For per-task counters, this is based on the
-	 * task clock, and for per-cpu counters it is based on the cpu clock.
-	 * time_lost is an offset from the task/cpu clock, used to make it
-	 * appear that time only passes while the context is scheduled in.
+	 * Context clock, runs when context enabled.
 	 */
-	u64			time_now;
-	u64			time_lost;
+	u64			time;
+	u64			timestamp;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8c8eaf0..84d85ab 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_stopped = ctx->time_now;
+	counter->tstamp_stopped = ctx->time;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -253,27 +253,20 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Get the current time for this context.
- * If this is a task context, we use the task's task clock,
- * or for a per-cpu context, we use the cpu clock.
- */
-static u64 get_context_time(struct perf_counter_context *ctx, int update)
+static inline u64 perf_clock(void)
 {
-	struct task_struct *curr = ctx->task;
-
-	if (!curr)
-		return cpu_clock(smp_processor_id());
-
-	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+	return cpu_clock(smp_processor_id());
 }
 
 /*
  * Update the record of the current time in a context.
  */
-static void update_context_time(struct perf_counter_context *ctx, int update)
+static void update_context_time(struct perf_counter_context *ctx)
 {
-	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
 }
 
 /*
@@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-		counter->total_time_enabled = ctx->time_now -
-			counter->tstamp_enabled;
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
-			run_end = counter->tstamp_stopped;
-		else
-			run_end = ctx->time_now;
-		counter->total_time_running = run_end - counter->tstamp_running;
-	}
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+		return;
+
+	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		run_end = counter->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	counter->total_time_running = run_end - counter->tstamp_running;
 }
 
 /*
@@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-		update_context_time(ctx, 1);
+		update_context_time(ctx);
 		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
@@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+	counter->tstamp_running += ctx->time - counter->tstamp_stopped;
 
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
@@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 	counter->prev_state = PERF_COUNTER_STATE_OFF;
-	counter->tstamp_enabled = ctx->time_now;
-	counter->tstamp_running = ctx->time_now;
-	counter->tstamp_stopped = ctx->time_now;
+	counter->tstamp_enabled = ctx->time;
+	counter->tstamp_running = ctx->time;
+	counter->tstamp_stopped = ctx->time;
 }
 
 /*
@@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
-	update_context_time(ctx, 1);
+	update_context_time(ctx);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
-	update_context_time(ctx, 1);
+	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled = ctx->time_now -
-			counter->total_time_enabled;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
-	update_context_time(ctx, 0);
+	update_context_time(ctx);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
-	/*
-	 * Add any time since the last sched_out to the lost time
-	 * so it doesn't get included in the total_time_enabled and
-	 * total_time_running measures for counters in the context.
-	 */
-	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+	ctx->timestamp = perf_clock();
 
 	flags = hw_perf_save_disable();
 
@@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled = ctx->time_now -
-			counter->total_time_enabled;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1113,7 +1103,7 @@ static void __read(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	if (ctx->is_active)
-		update_context_time(ctx, 1);
+		update_context_time(ctx);
 	counter->hw_ops->read(counter);
 	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
-- 
cgit v0.10.2


From a39d6f2556c4a19f58f538c6aa28bf8faca4fcb8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:11 +0200
Subject: perf_counter: rework the task clock software counter

Rework the task clock software counter to use the context time instead
of the task runtime clock, this removes the last such user.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.445450972@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 84d85ab..56b7eb5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -974,9 +974,6 @@ int perf_counter_task_disable(void)
 	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
-	/* force the update of the task clock: */
-	__task_delta_exec(curr, 1);
-
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -1017,9 +1014,6 @@ int perf_counter_task_enable(void)
 	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
-	/* force the update of the task clock: */
-	__task_delta_exec(curr, 1);
-
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -2347,38 +2341,28 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
  * Software counter: task time clock
  */
 
-/*
- * Called from within the scheduler:
- */
-static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
-{
-	struct task_struct *curr = counter->task;
-	u64 delta;
-
-	delta = __task_delta_exec(curr, update);
-
-	return curr->se.sum_exec_runtime + delta;
-}
-
-static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+static void task_clock_perf_counter_update(struct perf_counter *counter)
 {
-	u64 prev;
+	u64 prev, now;
 	s64 delta;
 
-	prev = atomic64_read(&counter->hw.prev_count);
-
-	atomic64_set(&counter->hw.prev_count, now);
+	update_context_time(counter->ctx);
+	now = counter->ctx->time;
 
+	prev = atomic64_xchg(&counter->hw.prev_count, now);
 	delta = now - prev;
-
 	atomic64_add(delta, &counter->count);
 }
 
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	u64 now;
+
+	update_context_time(counter->ctx);
+	now = counter->ctx->time;
 
-	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	atomic64_set(&hwc->prev_count, now);
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
@@ -2393,14 +2377,12 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
 	hrtimer_cancel(&counter->hw.hrtimer);
-	task_clock_perf_counter_update(counter,
-			task_clock_perf_counter_val(counter, 0));
+	task_clock_perf_counter_update(counter);
 }
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter,
-			task_clock_perf_counter_val(counter, 1));
+	task_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
cgit v0.10.2


From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:12 +0200
Subject: perf_counter: remove rq->lock usage

Now that all the task runtime clock users are gone, remove the ugly
rq->lock usage from perf counters, which solves the nasty deadlock
seen when a software task clock counter was read from an NMI overflow
context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.531137582@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b6d2887..080d1fd 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 /*
  * Lock/unlock the current runqueue - to extract task statistics:
  */
-extern void curr_rq_lock_irq_save(unsigned long *flags);
-extern void curr_rq_unlock_irq_restore(unsigned long *flags);
 extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 56b7eb5..f4f7596 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	counter_sched_out(counter, cpuctx, ctx);
 
@@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	/*
 	 * If the counter is on, turn it off.
@@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info)
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 	update_context_time(ctx);
 
 	/*
@@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info)
  unlock:
 	hw_perf_restore(perf_flags);
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
@@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info)
 	}
 
  unlock:
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -971,7 +963,7 @@ int perf_counter_task_disable(void)
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 
 	perf_counter_task_sched_out(curr, cpu);
@@ -992,9 +984,7 @@ int perf_counter_task_disable(void)
 
 	hw_perf_restore(perf_flags);
 
-	spin_unlock(&ctx->lock);
-
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 
 	return 0;
 }
@@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void)
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 
 	perf_counter_task_sched_out(curr, cpu);
@@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void)
 
 	perf_counter_task_sched_in(curr, cpu);
 
-	curr_rq_unlock_irq_restore(&flags);
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -1095,12 +1085,12 @@ static void __read(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
 	counter->hw_ops->read(counter);
 	update_counter_times(counter);
-	curr_rq_unlock_irq_restore(&flags);
+	local_irq_restore(flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		 * Be careful about zapping the list - IRQ/NMI context
 		 * could still be processing it:
 		 */
-		curr_rq_lock_irq_save(&flags);
+		local_irq_save(flags);
 		perf_flags = hw_perf_save_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		child_ctx->nr_counters--;
 
 		hw_perf_restore(perf_flags);
-		curr_rq_unlock_irq_restore(&flags);
+		local_irq_restore(flags);
 	}
 
 	parent_counter = child_counter->parent;
diff --git a/kernel/sched.c b/kernel/sched.c
index f76e3c0..0de2f81 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
-void curr_rq_lock_irq_save(unsigned long *flags)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	local_irq_save(*flags);
-	rq = cpu_rq(smp_processor_id());
-	spin_lock(&rq->lock);
-}
-
-void curr_rq_unlock_irq_restore(unsigned long *flags)
-	__releases(rq->lock)
-{
-	struct rq *rq;
-
-	rq = cpu_rq(smp_processor_id());
-	spin_unlock(&rq->lock);
-	local_irq_restore(*flags);
-}
-
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-- 
cgit v0.10.2


From bce379bf358738ab8efc8cda2594a299ac685085 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:13 +0200
Subject: perf_counter: minimize context time updates

Push the update_context_time() calls up the stack so that we get less
invokations and thereby a less noisy output:

before:

 # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null

 Performance counter stats for 'ls':

      10.163691  cpu clock ticks      (msecs)  (scaled from 98.94%)
      10.215360  task clock ticks     (msecs)  (scaled from 98.18%)
      10.185549  task clock ticks     (msecs)  (scaled from 98.53%)
      10.183581  task clock ticks     (msecs)  (scaled from 98.71%)

 Wall-clock time elapsed:    11.912858 msecs

after:

 # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null

 Performance counter stats for 'ls':

       9.316630  cpu clock ticks      (msecs)
       9.280789  task clock ticks     (msecs)
       9.280789  task clock ticks     (msecs)
       9.280789  task clock ticks     (msecs)

 Wall-clock time elapsed:     9.574872 msecs

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.618876874@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f4f7596..863703b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -319,6 +319,8 @@ static void __perf_counter_disable(void *info)
 
 	spin_lock_irqsave(&ctx->lock, flags);
 
+	update_context_time(ctx);
+
 	/*
 	 * If the counter is on, turn it off.
 	 * If it is in error state, leave it in error state.
@@ -797,6 +799,8 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	if (likely(!cpuctx->task_ctx))
 		return;
 
+	update_context_time(ctx);
+
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 	__perf_counter_sched_out(ctx, cpuctx);
@@ -2336,7 +2340,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 	u64 prev, now;
 	s64 delta;
 
-	update_context_time(counter->ctx);
 	now = counter->ctx->time;
 
 	prev = atomic64_xchg(&counter->hw.prev_count, now);
@@ -2349,7 +2352,6 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	u64 now;
 
-	update_context_time(counter->ctx);
 	now = counter->ctx->time;
 
 	atomic64_set(&hwc->prev_count, now);
@@ -2372,6 +2374,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
+	update_context_time(counter->ctx);
 	task_clock_perf_counter_update(counter);
 }
 
-- 
cgit v0.10.2


From 6278af660ff83fbafb18e53fc2747eb2ee6780fa Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 2 Apr 2009 10:40:28 +0200
Subject: perf_counter tools: kerneltop: display per function percentage along
 with event count

------------------------------------------------------------------------------
 KernelTop:   90551 irqs/sec  kernel:15.0% [NMI, 100000 CPU cycles],  (all, 4 CPUs)
------------------------------------------------------------------------------

             events    pcnt         RIP          kernel function
  ______     ______   _____   ________________   _______________

           16871.00 - 19.1% - ffffffff80328e20 : clear_page_c
            8810.00 -  9.9% - ffffffff8048ce80 : page_fault
            4746.00 -  5.4% - ffffffff8048cae2 : _spin_lock
            4428.00 -  5.0% - ffffffff80328e70 : copy_page_c
            3340.00 -  3.8% - ffffffff80329090 : copy_user_generic_string!
            2679.00 -  3.0% - ffffffff8028a16b : get_page_from_freelist
            2254.00 -  2.5% - ffffffff80296f19 : unmap_vmas
            2082.00 -  2.4% - ffffffff80297e19 : handle_mm_fault
            1754.00 -  2.0% - ffffffff80288dc8 : __rmqueue_smallest
            1553.00 -  1.8% - ffffffff8048ca58 : _spin_lock_irqsave
            1400.00 -  1.6% - ffffffff8028cdc8 : release_pages
            1337.00 -  1.5% - ffffffff80285400 : find_get_page
            1335.00 -  1.5% - ffffffff80225a23 : do_page_fault
            1299.00 -  1.5% - ffffffff802ba8e7 : __d_lookup
            1174.00 -  1.3% - ffffffff802b38f3 : __link_path_walk
            1155.00 -  1.3% - ffffffff802843e1 : perf_swcounter_ctx_event!
            1137.00 -  1.3% - ffffffff8028d118 : ____pagevec_lru_add
             963.00 -  1.1% - ffffffff802a670b : kmem_cache_alloc
             885.00 -  1.0% - ffffffff8024bc61 : __wake_up_bit

Display per function percentage along with event count.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 4f8d791..15f3a5f 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -636,16 +636,20 @@ static void print_sym_table(void)
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
+	float sum_kevents = 0.0;
 
 	events = userspace_events = 0;
 	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
 	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
+	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
+		sum_kevents += tmp[i].count[0];
+
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
 		nmi ? "NMI" : "IRQ");
@@ -679,34 +683,31 @@ static void print_sym_table(void)
 	printf("------------------------------------------------------------------------------\n\n");
 
 	if (nr_counters == 1)
-		printf("             events");
+		printf("             events    pcnt");
 	else
-		printf("  weight     events");
+		printf("  weight     events    pcnt");
 
 	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   ________________   _______________\n\n"
+	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	printed = 0;
-	for (i = 0; i < sym_table_count; i++) {
+	for (i = 0, printed = 0; i < sym_table_count; i++) {
+		float pcnt;
 		int count;
 
-		if (nr_counters == 1) {
-			if (printed <= 18 &&
-					tmp[i].count[0] >= count_filter) {
-				printf("%19.2f - %016llx : %s\n",
-				  sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
-				printed++;
-			}
-		} else {
-			if (printed <= 18 &&
-					tmp[i].count[0] >= count_filter) {
-				printf("%8.1f %10ld - %016llx : %s\n",
-				  sym_weight(tmp + i),
-				  tmp[i].count[0],
-				  tmp[i].addr, tmp[i].sym);
-				printed++;
-			}
+		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
+			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+			if (nr_counters == 1)
+				printf("%19.2f - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					pcnt, tmp[i].addr, tmp[i].sym);
+			else
+				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					tmp[i].count[0],
+					pcnt, tmp[i].addr, tmp[i].sym);
+			printed++;
 		}
 		/*
 		 * Add decay to the counts:
-- 
cgit v0.10.2


From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Apr 2009 11:30:17 +0200
Subject: x86, perfcounters: add atomic64_xchg()

Complete atomic64_t support on the 32-bit side by adding atomic64_xch().

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090406094518.445450972@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 977250e..aff9f1f 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -291,19 +291,37 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
 }
 
 /**
- * atomic64_set - set atomic64 variable
+ * atomic64_xchg - xchg atomic64 variable
  * @ptr:      pointer to type atomic64_t
  * @new_val:  value to assign
+ * @old_val:  old value that was there
  *
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
  */
-static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
 {
 	unsigned long long old_val;
 
 	do {
 		old_val = atomic_read(ptr);
 	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	atomic64_xchg(ptr, new_val);
 }
 
 /**
-- 
cgit v0.10.2


From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:33 +0200
Subject: x86, bts: fix race when bts tracer is removed

When the bts tracer is removed while the traced task is running,
the write to clear the bts tracer pointer races with context switch code.

Read the tracer once during a context switch.

When a new tracer is installed, the bts tracer is set in the ds context
before the tracer is initialized in order to claim the context for that
tracer.

This may result in write accesses using an uninitialized trace configuration
when scheduling timestamps have been requested.

Store active tracing flags separately and only set active flags after
the tracing configuration has been initialized.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144548.881338000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index b1d6e1f..c730155 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -89,6 +89,9 @@ struct bts_tracer {
 
 	/* Buffer overflow notification function: */
 	bts_ovfl_callback_t	ovfl;
+
+	/* Active flags affecting trace collection. */
+	unsigned int		flags;
 };
 
 struct pebs_tracer {
@@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer)
 	if (!tracer)
 		return;
 
+	tracer->flags = 0;
+
 	task = tracer->ds.context->task;
 
 	if (!task || (task == current))
@@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer)
 	if (!tracer)
 		return;
 
+	tracer->flags = tracer->trace.ds.flags;
+
 	task = tracer->ds.context->task;
 
 	control = ds_cfg.ctl[dsf_bts];
@@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	}
 }
 
+static inline void ds_take_timestamp(struct ds_context *context,
+				     enum bts_qualifier qualifier,
+				     struct task_struct *task)
+{
+	struct bts_tracer *tracer = context->bts_master;
+	struct bts_struct ts;
+
+	/* Prevent compilers from reading the tracer pointer twice. */
+	barrier();
+
+	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+		return;
+
+	memset(&ts, 0, sizeof(ts));
+	ts.qualifier			= qualifier;
+	ts.variant.timestamp.jiffies	= jiffies_64;
+	ts.variant.timestamp.pid	= task->pid;
+
+	bts_write(tracer, &ts);
+}
+
 /*
  * Change the DS configuration from tracing prev to tracing next.
  */
 void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 {
-	struct ds_context *prev_ctx = prev->thread.ds_ctx;
-	struct ds_context *next_ctx = next->thread.ds_ctx;
+	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
+	struct ds_context *next_ctx	= next->thread.ds_ctx;
+	unsigned long debugctlmsr	= next->thread.debugctlmsr;
+
+	/* Make sure all data is read before we start. */
+	barrier();
 
 	if (prev_ctx) {
 		update_debugctlmsr(0);
 
-		if (prev_ctx->bts_master &&
-		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_departs,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = prev->pid
-			};
-			bts_write(prev_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
 	}
 
 	if (next_ctx) {
-		if (next_ctx->bts_master &&
-		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_arrives,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = next->pid
-			};
-			bts_write(next_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(next_ctx, bts_task_arrives, next);
 
 		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
 	}
 
-	update_debugctlmsr(next->thread.debugctlmsr);
+	update_debugctlmsr(debugctlmsr);
 }
 
 void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
-- 
cgit v0.10.2


From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:34 +0200
Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to
 sched.h

Add a function to wait until some other task has been
switched out at least once.

This differs from wait_task_inactive() subtly, in that the
latter will wait until the task has left the CPU.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: markus.t.metzger@gmail.com
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144549.794157000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b94f354..a5b9a83 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
+extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
+static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5..f91bc81 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2003,6 +2003,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch -	wait for a thread to complete at least one
+ *				context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw	= p->nvcsw;
+	nivcsw	= p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
-- 
cgit v0.10.2


From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:35 +0200
Subject: mm, x86, ptrace, bts: defer branch trace stopping

When a ptraced task is unlinked, we need to stop branch tracing for
that task.

Since the unlink is called with interrupts disabled, and we need
interrupts enabled to stop branch tracing, we defer the work.

Collect all branch tracing related stuff in a branch tracing context.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144550.712401000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 34c5237..2483807 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -458,10 +458,6 @@ struct thread_struct {
 /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
 	struct ds_context	*ds_ctx;
 #endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
-	unsigned int	bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
 };
 
 static inline unsigned long native_get_debugreg(int regno)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fe9345c..7c21d1e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,7 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/ftrace.h>
+#include <linux/workqueue.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target,
 }
 
 #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+	/* The branch trace handle. */
+	struct bts_tracer	*tracer;
+
+	/* The buffer used to store the branch trace and its size. */
+	void			*buffer;
+	unsigned int		size;
+
+	/* The mm that paid for the above buffer. */
+	struct mm_struct	*mm;
+
+	/* The task this context belongs to. */
+	struct task_struct	*task;
+
+	/* The signal to send on a bts buffer overflow. */
+	unsigned int		bts_ovfl_signal;
+
+	/* The work struct to destroy a context. */
+	struct work_struct	work;
+};
+
+static inline void alloc_bts_buffer(struct bts_context *context,
+				    unsigned int size)
+{
+	void *buffer;
+
+	buffer = alloc_locked_buffer(size);
+	if (buffer) {
+		context->buffer = buffer;
+		context->size = size;
+		context->mm = get_task_mm(current);
+	}
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+	if (!context->buffer)
+		return;
+
+	kfree(context->buffer);
+	context->buffer = NULL;
+
+	refund_locked_buffer_memory(context->mm, context->size);
+	context->size = 0;
+
+	mmput(context->mm);
+	context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+	struct bts_context *context;
+
+	context = container_of(w, struct bts_context, work);
+
+	ds_release_bts(context->tracer);
+	put_task_struct(context->task);
+	free_bts_buffer(context);
+	kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+	INIT_WORK(&context->work, free_bts_context_work);
+	schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (context) {
+		context->task = task;
+		task->bts = context;
+
+		get_task_struct(task);
+	}
+
+	return context;
+}
+
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	struct bts_struct bts;
 	const unsigned char *at;
 	int error;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	at = trace->ds.top - ((index + 1) * trace->ds.size);
 	if ((void *)at < trace->ds.begin)
@@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 	if (!trace->read)
 		return -EOPNOTSUPP;
 
-	error = trace->read(child->bts, at, &bts);
+	error = trace->read(context->tracer, at, &bts);
 	if (error < 0)
 		return error;
 
@@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    struct bts_struct __user *out)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const unsigned char *at;
 	int error, drained = 0;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	if (!trace->read)
 		return -EOPNOTSUPP;
@@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child,
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     out++, drained++, at += trace->ds.size) {
 		struct bts_struct bts;
-		int error;
 
-		error = trace->read(child->bts, at, &bts);
+		error = trace->read(context->tracer, at, &bts);
 		if (error < 0)
 			return error;
 
@@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-	error = ds_reset_bts(child->bts);
+	error = ds_reset_bts(context->tracer);
 	if (error < 0)
 		return error;
 
 	return drained;
 }
 
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-	child->bts_buffer = alloc_locked_buffer(size);
-	if (!child->bts_buffer)
-		return -ENOMEM;
-
-	child->bts_size = size;
-
-	return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-	free_locked_buffer(child->bts_buffer, child->bts_size);
-	child->bts_buffer = NULL;
-	child->bts_size = 0;
-}
-
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
 {
+	struct bts_context *context;
 	struct ptrace_bts_config cfg;
 	unsigned int flags = 0;
 
@@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
 		return -EFAULT;
 
-	if (child->bts) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-	}
+	context = child->bts;
+	if (!context)
+		context = alloc_bts_context(child);
+	if (!context)
+		return -ENOMEM;
 
 	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 		if (!cfg.signal)
 			return -EINVAL;
 
-		child->thread.bts_ovfl_signal = cfg.signal;
 		return -EOPNOTSUPP;
+		context->bts_ovfl_signal = cfg.signal;
 	}
 
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-	    (cfg.size != child->bts_size)) {
-		int error;
+	ds_release_bts(context->tracer);
+	context->tracer = NULL;
 
-		ptrace_bts_free_buffer(child);
+	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+		free_bts_buffer(context);
+		if (!cfg.size)
+			return 0;
 
-		error = ptrace_bts_allocate_buffer(child, cfg.size);
-		if (error < 0)
-			return error;
+		alloc_bts_buffer(context, cfg.size);
+		if (!context->buffer)
+			return -ENOMEM;
 	}
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		flags |= BTS_TIMESTAMPS;
 
-	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-				    /* ovfl = */ NULL, /* th = */ (size_t)-1,
-				    flags);
-	if (IS_ERR(child->bts)) {
-		int error = PTR_ERR(child->bts);
-
-		ptrace_bts_free_buffer(child);
-		child->bts = NULL;
+	context->tracer = ds_request_bts(child, context->buffer, context->size,
+					 NULL, (size_t)-1, flags);
+	if (unlikely(IS_ERR(context->tracer))) {
+		int error = PTR_ERR(context->tracer);
 
+		free_bts_buffer(context);
+		context->tracer = NULL;
 		return error;
 	}
 
@@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	struct ptrace_bts_config cfg;
 
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	trace = ds_read_bts(child->bts);
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	memset(&cfg, 0, sizeof(cfg));
-	cfg.size = trace->ds.end - trace->ds.begin;
-	cfg.signal = child->thread.bts_ovfl_signal;
-	cfg.bts_size = sizeof(struct bts_struct);
+	cfg.size	= trace->ds.end - trace->ds.begin;
+	cfg.signal	= context->bts_ovfl_signal;
+	cfg.bts_size	= sizeof(struct bts_struct);
 
 	if (cfg.signal)
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child,
 
 static int ptrace_bts_clear(struct task_struct *child)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-	return ds_reset_bts(child->bts);
+	return ds_reset_bts(context->tracer);
 }
 
 static int ptrace_bts_size(struct task_struct *child)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static void ptrace_bts_fork(struct task_struct *tsk)
+static inline void ptrace_bts_fork(struct task_struct *tsk)
 {
 	tsk->bts = NULL;
-	tsk->bts_buffer = NULL;
-	tsk->bts_size = 0;
-	tsk->thread.bts_ovfl_signal = 0;
 }
 
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+static inline void ptrace_bts_untrace(struct task_struct *child)
 {
 	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
+		free_bts_context(child->bts);
 		child->bts = NULL;
-
-		/* We cannot update total_vm and locked_vm since
-		   child's mm is already gone. But we can reclaim the
-		   memory. */
-		kfree(child->bts_buffer);
-		child->bts_buffer = NULL;
-		child->bts_size = 0;
 	}
 }
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
-	/*
-	 * Ptrace_detach() races with ptrace_untrace() in case
-	 * the child dies and is reaped by another thread.
-	 *
-	 * We only do the memory accounting at this point and
-	 * leave the buffer deallocation and the bts tracer
-	 * release to ptrace_bts_untrace() which will be called
-	 * later on with tasklist_lock held.
-	 */
-	release_locked_buffer(child->bts_buffer, child->bts_size);
-}
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
 static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
@@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-	ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..64d8ed2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/prio_tree.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
 extern void free_locked_buffer(void *buffer, size_t size);
-extern void release_locked_buffer(void *buffer, size_t size);
+extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5b9a83..52b8cd0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -96,8 +96,8 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
-struct bts_tracer;
 struct fs_struct;
+struct bts_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1210,12 +1210,7 @@ struct task_struct {
 	 * This is the tracer handle for the ptrace BTS extension.
 	 * This field actually belongs to the ptracer task.
 	 */
-	struct bts_tracer *bts;
-	/*
-	 * The buffer to hold the BTS data.
-	 */
-	void *bts_buffer;
-	size_t bts_size;
+	struct bts_context *bts;
 #endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e05..749383b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size)
 	return buffer;
 }
 
-void release_locked_buffer(void *buffer, size_t size)
+void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&mm->mmap_sem);
 
-	current->mm->total_vm  -= pgsz;
-	current->mm->locked_vm -= pgsz;
+	mm->total_vm  -= pgsz;
+	mm->locked_vm -= pgsz;
 
-	up_write(&current->mm->mmap_sem);
+	up_write(&mm->mmap_sem);
 }
 
 void free_locked_buffer(void *buffer, size_t size)
 {
-	release_locked_buffer(buffer, size);
-
+	refund_locked_buffer_memory(current->mm, size);
 	kfree(buffer);
 }
-- 
cgit v0.10.2


From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:36 +0200
Subject: x86, bts: wait until traced task has been scheduled out

In order to stop branch tracing for a running task, we need to first
clear the branch tracing control bits before we may free the tracing
buffer.

If the traced task is running, the cpu might still trace that task
after the branch trace control bits have cleared.

Wait until the traced task has been scheduled out before proceeding.

A similar problem affects the task debug store context. We first remove
the context, then we need to wait until the task has been scheduled
out before we can free the context memory.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144551.919636000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index c730155..5cd137a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 
 static inline void ds_put_context(struct ds_context *context)
 {
+	struct task_struct *task;
 	unsigned long irq;
 
 	if (!context)
@@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context)
 
 	*(context->this) = NULL;
 
-	if (context->task)
-		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+	task = context->task;
+
+	if (task)
+		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-	if (!context->task || (context->task == current))
+	if (!task || (task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
+	/* The context might still be in use for context switching. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
 	kfree(context);
 }
 
@@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 void ds_release_bts(struct bts_tracer *tracer)
 {
+	struct task_struct *task;
+
 	if (!tracer)
 		return;
 
+	task = tracer->ds.context->task;
+
 	ds_suspend_bts(tracer);
 
 	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 	tracer->ds.context->bts_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
+	/* Make sure tracing stopped and the tracer is not in use. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
+	put_tracer(task);
 	ds_put_context(tracer->ds.context);
 
 	kfree(tracer);
-- 
cgit v0.10.2


From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:37 +0200
Subject: x86, bts: fix race between per-task and per-cpu branch tracing

Per-task branch tracing installs a debug store context with the traced
task. This immediately results in the branch trace control bits to be
cleared for the next context switch of that task, if not set before.

Either per-cpu or per-task tracing are allowed at the same time.

An active per-cpu tracing would be disabled even if the per-task tracing
request is rejected and the task debug store context removed.

Check the tracing type (per-cpu or per-task) before installing a task
debug store context.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144552.856000000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 5cd137a..f03f117 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock);
  */
 static atomic_t tracers = ATOMIC_INIT(0);
 
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
 {
-	if (task)
+	int error;
+
+	spin_lock_irq(&ds_lock);
+
+	if (task) {
+		error = -EPERM;
+		if (atomic_read(&tracers) < 0)
+			goto out;
 		atomic_inc(&tracers);
-	else
+	} else {
+		error = -EPERM;
+		if (atomic_read(&tracers) > 0)
+			goto out;
 		atomic_dec(&tracers);
+	}
+
+	error = 0;
+out:
+	spin_unlock_irq(&ds_lock);
+	return error;
 }
 
 static inline void put_tracer(struct task_struct *task)
@@ -209,14 +225,6 @@ static inline void put_tracer(struct task_struct *task)
 		atomic_inc(&tracers);
 }
 
-static inline int check_tracer(struct task_struct *task)
-{
-	return task ?
-		(atomic_read(&tracers) >= 0) :
-		(atomic_read(&tracers) <= 0);
-}
-
-
 /*
  * The DS context is either attached to a thread or to a cpu:
  * - in the former case, the thread_struct contains a pointer to the
@@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	error = get_tracer(task);
+	if (error < 0)
+		goto out;
+
 	/*
 	 * Per-cpu tracing is typically requested using smp_call_function().
 	 * We must not sleep.
@@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
@@ -696,13 +708,8 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	spin_lock_irqsave(&ds_lock, irq);
 
 	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
-
-	error = -EPERM;
 	if (tracer->ds.context->bts_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->bts_master = tracer;
 
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
@@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	error = get_tracer(task);
+	if (error < 0)
+		goto out;
+
 	/*
 	 * Per-cpu tracing is typically requested using smp_call_function().
 	 * We must not sleep.
@@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
@@ -759,13 +770,8 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	spin_lock_irqsave(&ds_lock, irq);
 
 	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
-
-	error = -EPERM;
 	if (tracer->ds.context->pebs_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->pebs_master = tracer;
 
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
@@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer)
 	if (task && (task != current))
 		wait_task_context_switch(task);
 
-	put_tracer(task);
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
@@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer)
 
 void ds_release_pebs(struct pebs_tracer *tracer)
 {
+	struct task_struct *task;
+
 	if (!tracer)
 		return;
 
+	task = tracer->ds.context->task;
+
 	ds_suspend_pebs(tracer);
 
 	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
 	tracer->ds.context->pebs_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
-- 
cgit v0.10.2


From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:38 +0200
Subject: x86, bts: use trace_clock_global() for timestamps

Rename the bts_struct timestamp field to event.

Use trace_clock_global() for time measurement.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144553.773216000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672b..772f141 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -170,9 +170,9 @@ struct bts_struct {
 		} lbr;
 		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
 		struct {
-			__u64 jiffies;
+			__u64 clock;
 			pid_t pid;
-		} timestamp;
+		} event;
 	} variant;
 };
 
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index f03f117..2071b99 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -25,6 +25,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/trace_clock.h>
 
 #include <asm/ds.h>
 
@@ -471,7 +472,7 @@ enum bts_field {
 	bts_flags,
 
 	bts_qual		= bts_from,
-	bts_jiffies		= bts_to,
+	bts_clock		= bts_to,
 	bts_pid			= bts_flags,
 
 	bts_qual_mask		= (bts_qual_max - 1),
@@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 	memset(out, 0, sizeof(*out));
 	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
 		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
-		out->variant.timestamp.pid = bts_get(at, bts_pid);
+		out->variant.event.clock = bts_get(at, bts_clock);
+		out->variant.event.pid = bts_get(at, bts_pid);
 	} else {
 		out->qualifier = bts_branch;
 		out->variant.lbr.from = bts_get(at, bts_from);
@@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
 	case bts_task_arrives:
 	case bts_task_departs:
 		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
-		bts_set(raw, bts_pid, in->variant.timestamp.pid);
+		bts_set(raw, bts_clock, in->variant.event.clock);
+		bts_set(raw, bts_pid, in->variant.event.pid);
 		break;
 	default:
 		return -EINVAL;
@@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context,
 		return;
 
 	memset(&ts, 0, sizeof(ts));
-	ts.qualifier			= qualifier;
-	ts.variant.timestamp.jiffies	= jiffies_64;
-	ts.variant.timestamp.pid	= task->pid;
+	ts.qualifier		= qualifier;
+	ts.variant.event.clock	= trace_clock_global();
+	ts.variant.event.pid	= task->pid;
 
 	bts_write(tracer, &ts);
 }
-- 
cgit v0.10.2


From 35bb7600c17762bb129588c1877d2717fe325289 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:39 +0200
Subject: x86, debugctlmsr: add _on_cpu variants to debugctlmsr functions

Add functions to get and set the debugctlmsr on different cpus.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144554.738772000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2483807..1efeb49 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -785,6 +785,21 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+	u64 debugctlmsr = 0;
+	u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return 0;
+#endif
+	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+	debugctlmsr = val1 | ((u64)val2 << 32);
+
+	return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -794,6 +809,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
+static inline void update_debugctlmsr_on_cpu(int cpu,
+					     unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return;
+#endif
+	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+		     (u32)((u64)debugctlmsr),
+		     (u32)((u64)debugctlmsr >> 32));
+}
+
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
-- 
cgit v0.10.2


From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:40 +0200
Subject: x86, bts, hw-branch-tracer: add _noirq variants to the debug store
 interface

The hw-branch-tracer uses debug store functions from an on_each_cpu()
context, which is simply wrong since the functions may sleep.

Add _noirq variants for most functions, which  may be called with
interrupts disabled.

Separate per-cpu and per-task tracing and allow per-cpu tracing to be
controlled from any cpu.

Make the hw-branch-tracer use the new debug store interface, synchronize
with hotplug cpu event using get/put_online_cpus(), and remove the
unnecessary spinlock.

Make the ptrace bts and the ds selftest code use the new interface.

Defer the ds selftest.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144555.658136000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 772f141..413e127 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
  * - buffer allocation (memory accounting)
  *
  *
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
 #ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
  * The interrupt threshold is independent from the overflow callback
  * to allow users to use their own overflow interrupt handling mechanism.
  *
- * task: the task to request recording for;
- *       NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu:  the cpu to request recording for
  * base: the base pointer for the (non-pageable) buffer;
  * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
  *     -1 if no interrupt threshold is requested.
  * flags: a bit-mask of the above flags
  */
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl,
-					   size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+					      void *base, size_t size,
+					      bts_ovfl_callback_t ovfl,
+					      size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+					     bts_ovfl_callback_t ovfl,
+					     size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+						void *base, size_t size,
+						pebs_ovfl_callback_t ovfl,
+						size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+					       void *base, size_t size,
+					       pebs_ovfl_callback_t ovfl,
+					       size_t th, unsigned int flags);
 
 /*
  * Release BTS or PEBS resources
  * Suspend and resume BTS or PEBS tracing
  *
+ * Must be called with irq's enabled.
+ *
  * tracer: the tracer handle returned from ds_request_~()
  */
 extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
 extern void ds_suspend_pebs(struct pebs_tracer *tracer);
 extern void ds_resume_pebs(struct pebs_tracer *tracer);
 
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
 
 /*
  * The raw DS buffer state as it is used for BTS and PEBS recording.
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2071b99..21a3852 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -245,60 +245,50 @@ struct ds_context {
 	struct pebs_tracer	*pebs_master;
 
 	/* Use count: */
-	unsigned long count;
+	unsigned long		count;
 
 	/* Pointer to the context pointer field: */
 	struct ds_context	**this;
 
-	/* The traced task; NULL for current cpu: */
+	/* The traced task; NULL for cpu tracing: */
 	struct task_struct	*task;
-};
 
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+	/* The traced cpu; only valid if task is NULL: */
+	int			cpu;
+};
 
-#define system_context per_cpu(system_context_array, smp_processor_id())
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
 
 
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
 {
 	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &system_context);
+		(task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
 	struct ds_context *context = NULL;
 	struct ds_context *new_context = NULL;
-	unsigned long irq;
 
-	/*
-	 * Chances are small that we already have a context.
-	 *
-	 * Contexts for per-cpu tracing are allocated using
-	 * smp_call_function(). We must not sleep.
-	 */
-	new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC);
+	/* Chances are small that we already have a context. */
+	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
 	if (!new_context)
 		return NULL;
 
-	spin_lock_irqsave(&ds_lock, irq);
+	spin_lock_irq(&ds_lock);
 
 	context = *p_context;
-	if (!context) {
+	if (likely(!context)) {
 		context = new_context;
 
 		context->this = p_context;
 		context->task = task;
+		context->cpu = cpu;
 		context->count = 0;
 
-		if (task)
-			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-		if (!task || (task == current))
-			wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
 		*p_context = context;
 	}
 
 	context->count++;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
 	if (context != new_context)
 		kfree(new_context);
@@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 	return context;
 }
 
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
 {
 	struct task_struct *task;
 	unsigned long irq;
@@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context)
 	if (task)
 		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-	if (!task || (task == current))
-		wrmsrl(MSR_IA32_DS_AREA, 0);
+	/*
+	 * We leave the (now dangling) pointer to the DS configuration in
+	 * the DS_AREA msr. This is as good or as bad as replacing it with
+	 * NULL - the hardware would crash if we enabled tracing.
+	 *
+	 * This saves us some problems with having to write an msr on a
+	 * different cpu while preventing others from doing the same for the
+	 * next context for that same cpu.
+	 */
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
@@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context)
 	kfree(context);
 }
 
+static void ds_install_ds_area(struct ds_context *context)
+{
+	unsigned long ds;
+
+	ds = (unsigned long)context->ds;
+
+	/*
+	 * There is a race between the bts master and the pebs master.
+	 *
+	 * The thread/cpu access is synchronized via get/put_cpu() for
+	 * task tracing and via wrmsr_on_cpu for cpu tracing.
+	 *
+	 * If bts and pebs are collected for the same task or same cpu,
+	 * the same confiuration is written twice.
+	 */
+	if (context->task) {
+		get_cpu();
+		if (context->task == current)
+			wrmsrl(MSR_IA32_DS_AREA, ds);
+		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+		put_cpu();
+	} else
+		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+			     (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
 
 /*
  * Call the tracer's callback on a buffer overflow.
@@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 	 * The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
+	ith *= ds_cfg.sizeof_rec[qual];
 	trace->ith = (void *)(buffer + size - ith);
 
 	trace->flags = flags;
@@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 
 static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 		      enum ds_qualifier qual, struct task_struct *task,
-		      void *base, size_t size, size_t th, unsigned int flags)
+		      int cpu, void *base, size_t size, size_t th)
 {
 	struct ds_context *context;
 	int error;
@@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* We require some space to do alignment adjustments below. */
+	/* We need space for alignment adjustments in ds_init_ds_trace(). */
 	error = -EINVAL;
 	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
 		goto out;
@@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	tracer->size = size;
 
 	error = -ENOMEM;
-	context = ds_get_context(task);
+	context = ds_get_context(task, cpu);
 	if (!context)
 		goto out;
 	tracer->context = context;
 
-	ds_init_ds_trace(trace, qual, base, size, th, flags);
+	/*
+	 * Defer any tracer-specific initialization work for the context until
+	 * context ownership has been clarified.
+	 */
 
 	error = 0;
  out:
 	return error;
 }
 
-struct bts_tracer *ds_request_bts(struct task_struct *task,
-				  void *base, size_t size,
-				  bts_ovfl_callback_t ovfl, size_t th,
-				  unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+					 void *base, size_t size,
+					 bts_ovfl_callback_t ovfl, size_t th,
+					 unsigned int flags)
 {
 	struct bts_tracer *tracer;
-	unsigned long irq;
 	int error;
 
 	/* Buffer overflow notification is not yet implemented. */
@@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (error < 0)
 		goto out;
 
-	/*
-	 * Per-cpu tracing is typically requested using smp_call_function().
-	 * We must not sleep.
-	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
 		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, base, size, th, flags);
+			   ds_bts, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-
-	spin_lock_irqsave(&ds_lock, irq);
+	/* Claim the bts part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->bts_master)
 		goto out_unlock;
 	tracer->ds.context->bts_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the bts part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_install_ds_area(tracer->ds.context);
 
 	tracer->trace.read  = bts_read;
 	tracer->trace.write = bts_write;
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	/* Start tracing. */
 	ds_resume_bts(tracer);
 
 	return tracer;
 
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
@@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	return ERR_PTR(error);
 }
 
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-				    void *base, size_t size,
-				    pebs_ovfl_callback_t ovfl, size_t th,
-				    unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+				       void *base, size_t size,
+				       bts_ovfl_callback_t ovfl,
+				       size_t th, unsigned int flags)
+{
+	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+				      bts_ovfl_callback_t ovfl,
+				      size_t th, unsigned int flags)
+{
+	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+					   void *base, size_t size,
+					   pebs_ovfl_callback_t ovfl, size_t th,
+					   unsigned int flags)
 {
 	struct pebs_tracer *tracer;
-	unsigned long irq;
 	int error;
 
 	/* Buffer overflow notification is not yet implemented. */
@@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (error < 0)
 		goto out;
 
-	/*
-	 * Per-cpu tracing is typically requested using smp_call_function().
-	 * We must not sleep.
-	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
 		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, base, size, th, flags);
+			   ds_pebs, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-	spin_lock_irqsave(&ds_lock, irq);
+	/* Claim the pebs part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->pebs_master)
 		goto out_unlock;
 	tracer->ds.context->pebs_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the pebs part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
 	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	ds_install_ds_area(tracer->ds.context);
+
+	/* Start tracing. */
 	ds_resume_pebs(tracer);
 
 	return tracer;
 
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
@@ -793,16 +842,26 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	return ERR_PTR(error);
 }
 
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+					 void *base, size_t size,
+					 pebs_ovfl_callback_t ovfl,
+					 size_t th, unsigned int flags)
 {
-	struct task_struct *task;
+	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
 
-	if (!tracer)
-		return;
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+					pebs_ovfl_callback_t ovfl,
+					size_t th, unsigned int flags)
+{
+	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
 
-	task = tracer->ds.context->task;
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
 
-	ds_suspend_bts(tracer);
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 	tracer->ds.context->bts_master = NULL;
@@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer)
 	kfree(tracer);
 }
 
+void ds_release_bts(struct bts_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_bts(tracer);
+	ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_bts_noirq(tracer);
+	ds_free_bts(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+				    unsigned long debugctlmsr)
+{
+	task->thread.debugctlmsr = debugctlmsr;
+
+	get_cpu();
+	if (task == current)
+		update_debugctlmsr(debugctlmsr);
+
+	if (task->thread.debugctlmsr)
+		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	else
+		clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	put_cpu();
+}
+
 void ds_suspend_bts(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
 
 	if (!tracer)
 		return;
@@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer)
 	tracer->flags = 0;
 
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
 
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+	WARN_ON(!task && irqs_disabled());
 
-	if (task) {
-		task->thread.debugctlmsr &= ~BTS_CONTROL;
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr &= ~BTS_CONTROL;
 
-		if (!task->thread.debugctlmsr)
-			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
 }
 
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
-	unsigned long control;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
 
 	if (!tracer)
-		return;
+		return 0;
 
-	tracer->flags = tracer->trace.ds.flags;
+	tracer->flags = 0;
 
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr &= ~BTS_CONTROL;
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+	unsigned long control;
 
 	control = ds_cfg.ctl[dsf_bts];
 	if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer)
 	if (!(tracer->trace.ds.flags & BTS_USER))
 		control |= ds_cfg.ctl[dsf_bts_user];
 
-	if (task) {
-		task->thread.debugctlmsr |= control;
-		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
-
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() | control);
+	return control;
 }
 
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
 
 	if (!tracer)
 		return;
 
+	tracer->flags = tracer->trace.ds.flags;
+
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
 
-	ds_suspend_pebs(tracer);
+	WARN_ON(!task && irqs_disabled());
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
+
+	if (!tracer)
+		return 0;
+
+	tracer->flags = tracer->trace.ds.flags;
+
+	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
 	tracer->ds.context->pebs_master = NULL;
@@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer)
 	kfree(tracer);
 }
 
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_pebs(tracer);
+	ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_pebs_noirq(tracer);
+	ds_free_pebs(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
 void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 void ds_resume_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
@@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg,
 		printk(KERN_INFO "[ds] pebs not available\n");
 	}
 
-	if (ds_cfg.sizeof_rec[ds_bts]) {
-		int error;
-
-		error = ds_selftest_bts();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling bts.\n");
-			ds_cfg.sizeof_rec[ds_bts] = 0;
-		}
-	}
-
-	if (ds_cfg.sizeof_rec[ds_pebs]) {
-		int error;
-
-		error = ds_selftest_pebs();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling pebs.\n");
-			ds_cfg.sizeof_rec[ds_pebs] = 0;
-		}
-	}
-
 	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
 	       8 * ds_cfg.sizeof_ptr_field);
 	printk("bts/pebs record: %u/%u bytes\n",
@@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
 void ds_exit_thread(struct task_struct *tsk)
 {
 }
+
+static __init int ds_selftest(void)
+{
+	if (ds_cfg.sizeof_rec[ds_bts]) {
+		int error;
+
+		error = ds_selftest_bts();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling bts.\n");
+			ds_cfg.sizeof_rec[ds_bts] = 0;
+		}
+	}
+
+	if (ds_cfg.sizeof_rec[ds_pebs]) {
+		int error;
+
+		error = ds_selftest_pebs();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling pebs.\n");
+			ds_cfg.sizeof_rec[ds_pebs] = 0;
+		}
+	}
+
+	return 0;
+}
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 8c46fbf..e5a263c 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -10,11 +10,12 @@
 
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/smp.h>
 
 #include <asm/ds.h>
 
 
-#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
 
 
 static int ds_selftest_bts_consistency(const struct bts_trace *trace)
@@ -125,12 +126,12 @@ int ds_selftest_bts(void)
 	struct bts_tracer *tracer;
 	int error = 0;
 	void *top;
-	unsigned char buffer[DS_SELFTEST_BUFFER_SIZE];
+	unsigned char buffer[BUFFER_SIZE];
 
 	printk(KERN_INFO "[ds] bts selftest...");
 
-	tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE,
-				NULL, (size_t)-1, BTS_KERNEL);
+	tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE,
+				    NULL, (size_t)-1, BTS_KERNEL);
 	if (IS_ERR(tracer)) {
 		error = PTR_ERR(tracer);
 		tracer = NULL;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7c21d1e..adbb243 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		flags |= BTS_TIMESTAMPS;
 
-	context->tracer = ds_request_bts(child, context->buffer, context->size,
-					 NULL, (size_t)-1, flags);
+	context->tracer =
+		ds_request_bts_task(child, context->buffer, context->size,
+				    NULL, (size_t)-1, flags);
 	if (unlikely(IS_ERR(context->tracer))) {
 		int error = PTR_ERR(context->tracer);
 
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 8b2109a..50565d8 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -4,7 +4,6 @@
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  */
-#include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
@@ -21,168 +20,113 @@
 
 #define BTS_BUFFER_SIZE (1 << 13)
 
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
 
 static int trace_hw_branches_enabled __read_mostly;
 static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
-/*
- * Initialize the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_init_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
 {
-	if (this_tracer)
-		ds_release_bts(this_tracer);
+	per_cpu(tracer, cpu) =
+		ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
 
-	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
-				     NULL, (size_t)-1, BTS_KERNEL);
-	if (IS_ERR(this_tracer)) {
-		this_tracer = NULL;
-		return;
-	}
+	if (IS_ERR(per_cpu(tracer, cpu)))
+		per_cpu(tracer, cpu) = NULL;
 }
 
 static int bts_trace_init(struct trace_array *tr)
 {
-	int cpu, avail;
-
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
 	hw_branch_trace = tr;
+	trace_hw_branches_enabled = 0;
 
-	on_each_cpu(bts_trace_init_cpu, NULL, 1);
-
-	/* Check on how many cpus we could enable tracing */
-	avail = 0;
-	for_each_online_cpu(cpu)
-		if (per_cpu(tracer, cpu))
-			avail++;
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		bts_trace_init_cpu(cpu);
 
-	trace_hw_branches_enabled = (avail ? 1 : 0);
+		if (likely(per_cpu(tracer, cpu)))
+			trace_hw_branches_enabled = 1;
+	}
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-
+	put_online_cpus();
 
 	/* If we could not enable tracing on a single cpu, we fail. */
-	return avail ? 0 : -EOPNOTSUPP;
-}
-
-/*
- * Release the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_release_cpu(void *arg)
-{
-	if (this_tracer) {
-		ds_release_bts(this_tracer);
-		this_tracer = NULL;
-	}
+	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
 }
 
 static void bts_trace_reset(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_release_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
+	}
 	trace_hw_branches_enabled = 0;
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Resume tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_resume_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_resume_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Suspend tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_suspend_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_suspend_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 1;
-
-	spin_unlock(&bts_tracer_lock);
+	put_online_cpus();
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 				     unsigned long action, void *hcpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-
-	spin_lock(&bts_tracer_lock);
-
-	if (!trace_hw_branches_enabled)
-		goto out;
+	int cpu = (long)hcpu;
 
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
-
-		if (trace_hw_branches_suspended)
-			smp_call_function_single(cpu, bts_trace_suspend_cpu,
-						 NULL, 1);
+		/* The notification is sent with interrupts enabled. */
+		if (trace_hw_branches_enabled) {
+			bts_trace_init_cpu(cpu);
+
+			if (trace_hw_branches_suspended &&
+			    likely(per_cpu(tracer, cpu)))
+				ds_suspend_bts(per_cpu(tracer, cpu));
+		}
 		break;
+
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
-		break;
+		/* The notification is sent with interrupts enabled. */
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
 	}
 
- out:
-	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }
 
@@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
  */
 static void trace_bts_cpu(void *arg)
 {
@@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg)
 	if (unlikely(!this_tracer))
 		return;
 
-	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
-		goto out;
+		return;
 
 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
@@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg)
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
 		trace_bts_at(trace, at);
-
-out:
-	ds_resume_bts(this_tracer);
 }
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
+	/*
+	 * We need to collect the trace on the respective cpu since ftrace
+	 * implicitly adds the record for the current cpu.
+	 * Once that is more flexible, we could collect the data from any cpu.
+	 */
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	spin_unlock(&bts_tracer_lock);
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
+	put_online_cpus();
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-	spin_lock(&bts_tracer_lock);
-
-	if (trace_hw_branches_enabled)
+	if (this_tracer) {
+		ds_suspend_bts_noirq(this_tracer);
 		trace_bts_cpu(hw_branch_trace);
-
-	spin_unlock(&bts_tracer_lock);
+		ds_resume_bts_noirq(this_tracer);
+	}
 }
 
 struct tracer bts_tracer __read_mostly =
-- 
cgit v0.10.2


From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:41 +0200
Subject: x86, hw-branch-tracer: allocate selftest iterator on heap

Allocate the trace_iterator for the hw-branch-tracer selftest on the heap.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144556.578777000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 499d01c..00dd648 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -757,7 +757,7 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	struct trace_iterator iter;
+	struct trace_iterator *iter;
 	struct tracer tracer;
 	unsigned long count;
 	int ret;
@@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
 	 * The hw-branch tracer needs to collect the trace from the various
 	 * cpu trace buffers - before tracing is stopped.
 	 */
-	memset(&iter, 0, sizeof(iter));
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
 	memcpy(&tracer, trace, sizeof(tracer));
 
-	iter.trace = &tracer;
-	iter.tr = tr;
-	iter.pos = -1;
-	mutex_init(&iter.mutex);
+	iter->trace = &tracer;
+	iter->tr = tr;
+	iter->pos = -1;
+	mutex_init(&iter->mutex);
 
-	trace->open(&iter);
+	trace->open(iter);
 
-	mutex_destroy(&iter.mutex);
+	mutex_destroy(&iter->mutex);
+	kfree(iter);
 
 	tracing_stop();
 
-- 
cgit v0.10.2


From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:42 +0200
Subject: x86, ds: fix compiler warning

Size_t is defined differently on i386 and x86_64.
Change type to avoid compiler warning.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144557.523964000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index e5a263c..e1ba510 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer,
 	/* Now to the test itself. */
 	for (at = from; (void *)at < to; at += trace->ds.size) {
 		struct bts_struct bts;
-		size_t index;
+		unsigned long index;
 		int error;
 
 		if (((void *)at - trace->ds.begin) % trace->ds.size) {
-- 
cgit v0.10.2


From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:43 +0200
Subject: x86, ds: fix bounds check in ds selftest

Fix a bad bounds check in the debug store selftest.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144558.450027000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index e1ba510..cccc19a 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace)
 		printk(KERN_CONT "bad bts buffer setup...");
 		error = -1;
 	}
+	/*
+	 * We allow top in [begin; end], since its not clear when the
+	 * overflow adjustment happens: after the increment or before the
+	 * write.
+	 */
 	if ((trace->ds.top < trace->ds.begin) ||
-	    (trace->ds.end <= trace->ds.top)) {
+	    (trace->ds.end < trace->ds.top)) {
 		printk(KERN_CONT "bts top out of bounds...");
 		error = -1;
 	}
-- 
cgit v0.10.2


From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:44 +0200
Subject: x86, ds: selftest each cpu

Perform debug store selftests on each cpu.

Cover both the normal and the _noirq variant of the debug store interface.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144559.394583000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index cccc19a..599a963 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -11,13 +11,21 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/smp.h>
+#include <linux/cpu.h>
 
 #include <asm/ds.h>
 
 
-#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+#define BUFFER_SIZE	521 /* Intentionally chose an odd size. */
 
 
+struct ds_selftest_bts_conf {
+	struct bts_tracer *tracer;
+	int error;
+	int (*suspend)(struct bts_tracer *);
+	int (*resume)(struct bts_tracer *);
+};
+
 static int ds_selftest_bts_consistency(const struct bts_trace *trace)
 {
 	int error = 0;
@@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer,
 	return 0;
 }
 
-int ds_selftest_bts(void)
+static void ds_selftest_bts_cpu(void *arg)
 {
+	struct ds_selftest_bts_conf *conf = arg;
 	const struct bts_trace *trace;
-	struct bts_tracer *tracer;
-	int error = 0;
 	void *top;
-	unsigned char buffer[BUFFER_SIZE];
 
-	printk(KERN_INFO "[ds] bts selftest...");
-
-	tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE,
-				    NULL, (size_t)-1, BTS_KERNEL);
-	if (IS_ERR(tracer)) {
-		error = PTR_ERR(tracer);
-		tracer = NULL;
+	if (IS_ERR(conf->tracer)) {
+		conf->error = PTR_ERR(conf->tracer);
+		conf->tracer = NULL;
 
 		printk(KERN_CONT
-		       "initialization failed (err: %d)...", error);
-		goto out;
+		       "initialization failed (err: %d)...", conf->error);
+		return;
 	}
 
-	/* The return should already give us enough trace. */
-	ds_suspend_bts(tracer);
+	/* We should meanwhile have enough trace. */
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
 
 	/* Let's see if we can access the trace. */
-	trace = ds_read_bts(tracer);
+	trace = ds_read_bts(conf->tracer);
 
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	/* If everything went well, we should have a few trace entries. */
 	if (trace->ds.top == trace->ds.begin) {
@@ -168,10 +172,11 @@ int ds_selftest_bts(void)
 	}
 
 	/* Let's try to read the trace we collected. */
-	error = ds_selftest_bts_read(tracer, trace,
+	conf->error =
+		ds_selftest_bts_read(conf->tracer, trace,
 				     trace->ds.begin, trace->ds.top);
-	if (error < 0)
-		goto out;
+	if (conf->error < 0)
+		return;
 
 	/*
 	 * Let's read the trace again.
@@ -179,26 +184,31 @@ int ds_selftest_bts(void)
 	 */
 	top = trace->ds.top;
 
-	trace = ds_read_bts(tracer);
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	trace = ds_read_bts(conf->tracer);
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	if (top != trace->ds.top) {
 		printk(KERN_CONT "suspend not working...");
-		error = -1;
-		goto out;
+		conf->error = -1;
+		return;
 	}
 
 	/* Let's collect some more trace - see if resume is working. */
-	ds_resume_bts(tracer);
-	ds_suspend_bts(tracer);
+	conf->error = conf->resume(conf->tracer);
+	if (conf->error < 0)
+		return;
+
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
 
-	trace = ds_read_bts(tracer);
+	trace = ds_read_bts(conf->tracer);
 
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	if (trace->ds.top == top) {
 		/*
@@ -210,35 +220,113 @@ int ds_selftest_bts(void)
 		printk(KERN_CONT
 		       "no resume progress/overflow...");
 
-		error = ds_selftest_bts_read(tracer, trace,
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
 					     trace->ds.begin, trace->ds.end);
 	} else if (trace->ds.top < top) {
 		/*
 		 * We had a buffer overflow - the entire buffer should
 		 * contain trace records.
 		 */
-		error = ds_selftest_bts_read(tracer, trace,
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
 					     trace->ds.begin, trace->ds.end);
 	} else {
 		/*
 		 * It is quite likely that the buffer did not overflow.
 		 * Let's just check the delta trace.
 		 */
-		error = ds_selftest_bts_read(tracer, trace,
-					     top, trace->ds.top);
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace, top,
+					     trace->ds.top);
 	}
-	if (error < 0)
-		goto out;
+	if (conf->error < 0)
+		return;
 
-	error = 0;
+	conf->error = 0;
+}
 
-	/* The final test: release the tracer while tracing is suspended. */
- out:
-	ds_release_bts(tracer);
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_suspend_bts(tracer);
+	return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_resume_bts(tracer);
+	return 0;
+}
 
-	printk(KERN_CONT "%s.\n", (error ? "failed" : "passed"));
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+	(void)ds_release_bts_noirq(tracer);
+}
 
-	return error;
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+					     struct bts_tracer *tracer)
+{
+	int error = -EPERM;
+
+	/* Try to release the tracer on the wrong cpu. */
+	get_cpu();
+	if (cpu != smp_processor_id()) {
+		error = ds_release_bts_noirq(tracer);
+		if (error != -EPERM)
+			printk(KERN_CONT "release on wrong cpu...");
+	}
+	put_cpu();
+
+	return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+	struct ds_selftest_bts_conf conf;
+	unsigned char buffer[BUFFER_SIZE];
+	int cpu;
+
+	printk(KERN_INFO "[ds] bts selftest...");
+	conf.error = 0;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		conf.suspend = ds_suspend_bts_wrap;
+		conf.resume = ds_resume_bts_wrap;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		ds_selftest_bts_cpu(&conf);
+		ds_release_bts(conf.tracer);
+		if (conf.error < 0)
+			goto out;
+
+		conf.suspend = ds_suspend_bts_noirq;
+		conf.resume = ds_resume_bts_noirq;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+		if (conf.error >= 0) {
+			conf.error =
+				ds_selftest_bts_bad_release_noirq(cpu,
+								  conf.tracer);
+			/* We must not release the tracer twice. */
+			if (conf.error < 0)
+				conf.tracer = NULL;
+		}
+		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+					 conf.tracer, 1);
+		if (conf.error < 0)
+			goto out;
+	}
+
+	conf.error = 0;
+ out:
+	put_online_cpus();
+	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+	return conf.error;
 }
 
 int ds_selftest_pebs(void)
-- 
cgit v0.10.2


From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:45 +0200
Subject: x86, ds: add task tracing selftest

Add selftests to cover per-task branch tracing.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144600.329346000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 599a963..a40b253 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu,
 	return error ? 0 : -1;
 }
 
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		ds_release_bts(tracer);
+		error = 0;
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "cpu/task tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		error = 0;
+		ds_release_bts(tracer);
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "task/cpu tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
 int ds_selftest_bts(void)
 {
 	struct ds_selftest_bts_conf conf;
 	unsigned char buffer[BUFFER_SIZE];
+	unsigned long irq;
 	int cpu;
 
 	printk(KERN_INFO "[ds] bts selftest...");
@@ -297,6 +338,8 @@ int ds_selftest_bts(void)
 			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
 					   NULL, (size_t)-1, BTS_KERNEL);
 		ds_selftest_bts_cpu(&conf);
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
 		ds_release_bts(conf.tracer);
 		if (conf.error < 0)
 			goto out;
@@ -315,12 +358,40 @@ int ds_selftest_bts(void)
 			if (conf.error < 0)
 				conf.tracer = NULL;
 		}
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
 		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
 					 conf.tracer, 1);
 		if (conf.error < 0)
 			goto out;
 	}
 
+	conf.suspend = ds_suspend_bts_wrap;
+	conf.resume = ds_resume_bts_wrap;
+	conf.tracer =
+		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+				    NULL, (size_t)-1, BTS_KERNEL);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts(conf.tracer);
+	if (conf.error < 0)
+		goto out;
+
+	conf.suspend = ds_suspend_bts_noirq;
+	conf.resume = ds_resume_bts_noirq;
+	conf.tracer =
+		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
+	local_irq_save(irq);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts_noirq(conf.tracer);
+	local_irq_restore(irq);
+	if (conf.error < 0)
+		goto out;
+
 	conf.error = 0;
  out:
 	put_online_cpus();
-- 
cgit v0.10.2


From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:46 +0200
Subject: x86, ds: add leakage warning

Add a warning in case a debug store context is not removed before
the task it is attached to is freed.

Remove the old warning at thread exit. It is too early.

Declare the debug store context field in thread_struct unconditionally.

Remove ds_copy_thread() and ds_exit_thread() and do the work directly
in process*.c.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144601.254472000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 413e127..149e520 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -285,21 +285,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
  */
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
 #else /* CONFIG_X86_DS */
 
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void ds_switch_to(struct task_struct *prev,
 				struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
-				  struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1efeb49..7c39de7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -454,10 +454,8 @@ struct thread_struct {
 	unsigned		io_bitmap_max;
 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
 	unsigned long	debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+	/* Debug Store context; see asm/ds.h */
 	struct ds_context	*ds_ctx;
-#endif /* CONFIG_X86_DS */
 };
 
 static inline unsigned long native_get_debugreg(int regno)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 21a3852..71cab3b 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 	update_debugctlmsr(debugctlmsr);
 }
 
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
-{
-	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
-	tsk->thread.ds_ctx = NULL;
-}
-
-void ds_exit_thread(struct task_struct *tsk)
-{
-}
-
 static __init int ds_selftest(void)
 {
 	if (ds_cfg.sizeof_rec[ds_bts]) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca98915..fb5dfb8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/ds.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
+
+	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -83,8 +86,6 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
-
-	ds_exit_thread(current);
 }
 
 void flush_thread(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84..b5e4bfe 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		p->thread.io_bitmap_max = 0;
 	}
 
-	ds_copy_thread(p, current);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41..5a1a1de 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 			goto out;
 	}
 
-	ds_copy_thread(p, me);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
-- 
cgit v0.10.2


From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:47 +0200
Subject: x86, ds: use single debug store cpu configuration

Use a single configuration for all cpus.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144602.191165000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 71cab3b..443f415 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -47,9 +47,8 @@ struct ds_configuration {
 	/* Control bit-masks indexed by enum ds_feature: */
 	unsigned long		ctl[dsf_ctl_max];
 };
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
 
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
 /* Maximal size of a DS configuration: */
 #define MAX_SIZEOF_DS		(12 * 8)
@@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg,
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 {
+	/* Only configure the first cpu. Others are identical. */
+	if (ds_cfg.name)
+		return;
+
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
-- 
cgit v0.10.2


From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:48 +0200
Subject: x86, ptrace: add bts context unconditionally

Add the ptrace bts context field to task_struct unconditionally.

Initialize the field directly in copy_process().
Remove all the unneeded functionality used to initialize that field.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144603.292754000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index e304b66..5cdd19f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
-			    unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
 
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index adbb243..b32a8ee 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child)
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static inline void ptrace_bts_fork(struct task_struct *tsk)
-{
-	tsk->bts = NULL;
-}
-
 /*
  * Called from __ptrace_unlink() after the child has been moved back
  * to its original parent.
  */
-static inline void ptrace_bts_untrace(struct task_struct *child)
+void ptrace_bts_untrace(struct task_struct *child)
 {
 	if (unlikely(child->bts)) {
 		free_bts_context(child->bts);
 		child->bts = NULL;
 	}
 }
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-	ptrace_bts_untrace(child);
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 67c1565..59e133d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void exit_ptrace(struct task_struct *tracer);
-extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
@@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_untrace(task)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_fork
-/*
- * Do machine-specific work to initialize a new task.
- *
- * This is called from copy_process().
- */
-#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52b8cd0..451186a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1205,13 +1205,11 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-#ifdef CONFIG_X86_PTRACE_BTS
 	/*
 	 * This is the tracer handle for the ptrace BTS extension.
 	 * This field actually belongs to the ptracer task.
 	 */
 	struct bts_context *bts;
-#endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8..69bde7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-	if (unlikely(current->ptrace))
-		ptrace_fork(p, clone_flags);
+
+	p->bts = NULL;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec..321127d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,16 +27,6 @@
 
 
 /*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	arch_ptrace_fork(child, clone_flags);
-}
-
-/*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
  *
-- 
cgit v0.10.2


From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:49 +0200
Subject: x86, ds: dont use TIF_DEBUGCTLMSR

Debug store already uses TIF_DS_AREA_MSR to trigger debug store context
switch handling. No need to use TIF_DEBUGCTLMSR, as well.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144604.256645000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 443f415..cab2832 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task,
 	get_cpu();
 	if (task == current)
 		update_debugctlmsr(debugctlmsr);
-
-	if (task->thread.debugctlmsr)
-		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	else
-		clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
 	put_cpu();
 }
 
-- 
cgit v0.10.2


From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:50 +0200
Subject: x86, ds: fix bad ds_reset_pebs()

Ds_reset_pebs() passed the wrong qualifier to a shared function resulting
in a reset of bts, rather than pebs.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144605.206510000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index cab2832..ebfb0fd 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 
 	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
 	       (unsigned long)tracer->trace.ds.top);
 
 	return 0;
-- 
cgit v0.10.2


From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:51 +0200
Subject: x86, ds: allow small debug store buffers

Check the buffer size more precisely to allow buffers for exactly
one element provided the base address is already properly aligned.

Add a debug store selftest.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144606.139137000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ebfb0fd..4e05157 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 {
 	struct ds_context *context;
 	int error;
+	size_t req_size;
 
 	error = -EOPNOTSUPP;
 	if (!ds_cfg.sizeof_rec[qual])
@@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* We need space for alignment adjustments in ds_init_ds_trace(). */
+	req_size = ds_cfg.sizeof_rec[qual];
+	/* We might need space for alignment adjustments. */
+	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+		req_size += DS_ALIGNMENT;
+
 	error = -EINVAL;
-	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+	if (size < req_size)
 		goto out;
 
 	if (th != (size_t)-1) {
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index a40b253..5f104a0 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -16,8 +16,8 @@
 #include <asm/ds.h>
 
 
-#define BUFFER_SIZE	521 /* Intentionally chose an odd size. */
-
+#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
 
 struct ds_selftest_bts_conf {
 	struct bts_tracer *tracer;
@@ -381,7 +381,7 @@ int ds_selftest_bts(void)
 	conf.suspend = ds_suspend_bts_noirq;
 	conf.resume = ds_resume_bts_noirq;
 	conf.tracer =
-		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+		ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE,
 				   NULL, (size_t)-1, BTS_KERNEL);
 	local_irq_save(irq);
 	ds_selftest_bts_cpu(&conf);
-- 
cgit v0.10.2


From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:52 +0200
Subject: x86, ds: support Core i7

Add debug store support for Core i7.

Core i7 adds a reset value for each performance counter and a new
PEBS record format.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144607.088997000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 149e520..70dac19 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -234,8 +234,12 @@ struct bts_trace {
 struct pebs_trace {
 	struct ds_trace ds;
 
-	/* the PEBS reset value */
-	unsigned long long reset_value;
+	/* the number of valid counters in the below array */
+	unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+	/* the counter reset value */
+	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
 };
 
 
@@ -270,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
  * Returns 0 on success; -Eerrno on error
  *
  * tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+			     unsigned int counter, u64 value);
 
 /*
  * Initialization
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 4e05157..48bfe13 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -44,6 +44,9 @@ struct ds_configuration {
 	/* The size of a BTS/PEBS record in bytes: */
 	unsigned char		sizeof_rec[2];
 
+	/* The number of pebs counter reset values in the DS structure. */
+	unsigned char		nr_counter_reset;
+
 	/* Control bit-masks indexed by enum ds_feature: */
 	unsigned long		ctl[dsf_ctl_max];
 };
@@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly;
 
 
 /* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS		(12 * 8)
+#define MAX_SIZEOF_DS		0x80
 
 /* Maximal size of a BTS record: */
 #define MAX_SIZEOF_BTS		(3 * 8)
@@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly;
 /* BTS and PEBS buffer alignment: */
 #define DS_ALIGNMENT		(1 << 3)
 
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS	8
+
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE	8
+
 /* Mask of control bits in the DS MSR register: */
 #define BTS_CONTROL				  \
 	( ds_cfg.ctl[dsf_bts]			| \
@@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 		return NULL;
 
 	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	tracer->trace.reset_value =
-		*(u64 *)(tracer->ds.context->ds +
-			 (ds_cfg.sizeof_ptr_field * 8));
+
+	tracer->trace.counters = ds_cfg.nr_counter_reset;
+	memcpy(tracer->trace.counter_reset,
+	       tracer->ds.context->ds +
+	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
 
 	return &tracer->trace;
 }
@@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 	return 0;
 }
 
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+		      unsigned int counter, u64 value)
 {
 	if (!tracer)
 		return -EINVAL;
 
+	if (ds_cfg.nr_counter_reset < counter)
+		return -EINVAL;
+
 	*(u64 *)(tracer->ds.context->ds +
-		 (ds_cfg.sizeof_ptr_field * 8)) = value;
+		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
 
 	return 0;
 }
@@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = {
 	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
 	.ctl[dsf_bts_kernel]	= (1 << 5),
 	.ctl[dsf_bts_user]	= (1 << 6),
+	.nr_counter_reset	= 1,
 };
 static const struct ds_configuration ds_cfg_pentium_m = {
 	.name = "Pentium M",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.nr_counter_reset	= 1,
 };
 static const struct ds_configuration ds_cfg_core2_atom = {
 	.name = "Core 2/Atom",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
 	.ctl[dsf_bts_kernel]	= (1 << 9),
 	.ctl[dsf_bts_user]	= (1 << 10),
+	.nr_counter_reset	= 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+	.name = "Core i7",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.ctl[dsf_bts_kernel]	= (1 << 9),
+	.ctl[dsf_bts_user]	= (1 << 10),
+	.nr_counter_reset	= 4,
 };
 
 static void
@@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg,
 	nr_pebs_fields = 18;
 #endif
 
+	/*
+	 * Starting with version 2, architectural performance
+	 * monitoring supports a format specifier.
+	 */
+	if ((cpuid_eax(0xa) & 0xff) > 1) {
+		unsigned long perf_capabilities, format;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+		format = (perf_capabilities >> 8) & 0xf;
+
+		switch (format) {
+		case 0:
+			nr_pebs_fields = 18;
+			break;
+		case 1:
+			nr_pebs_fields = 22;
+			break;
+		default:
+			printk(KERN_INFO
+			       "[ds] unknown PEBS format: %lu\n", format);
+			nr_pebs_fields = 0;
+			break;
+		}
+	}
+
 	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
@@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg,
 	printk("bts/pebs record: %u/%u bytes\n",
 	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field));
+	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 			ds_configure(&ds_cfg_core2_atom, c);
 			break;
 		case 0x1a: /* Core i7 */
+			ds_configure(&ds_cfg_core_i7, c);
+			break;
 		default:
 			/* Sorry, don't know about them. */
 			break;
-- 
cgit v0.10.2


From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 14:55:44 +0800
Subject: tracing: use macros to denote usec and nsec per second

Impact: cleanup

Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49CC7870.9000309@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4..a29ef23 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/time.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	call = &field->boot_call;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
 			(unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	init_ret = &field->boot_ret;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
 			"returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcd..d53b45e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/time.h>
+
 #include <asm/atomic.h>
 
 #include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
 	struct mmiotrace_rw *rw;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret = 1;
 
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
 	struct mmiotrace_map *m;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret;
 
-- 
cgit v0.10.2


From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Mar 2009 00:25:38 +0100
Subject: tracing/ftrace: factorize the tracing files creation

Impact: cleanup

Most of the tracing files creation follow the same pattern:

ret = debugfs_create_file(...)
if (!ret)
	pr_warning("Couldn't create ... entry\n")

Unify it!

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 678e3d6..6ea5a1a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = {
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("available_filter_functions", 0444,
-				    d_tracer, NULL, &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_filter_functions' entry\n");
+	trace_create_file("available_filter_functions", 0444,
+			d_tracer, NULL, &ftrace_avail_fops);
 
-	entry = debugfs_create_file("failures", 0444,
-				    d_tracer, NULL, &ftrace_failures_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'failures' entry\n");
+	trace_create_file("failures", 0444,
+			d_tracer, NULL, &ftrace_failures_fops);
 
-	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-				    NULL, &ftrace_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_filter' entry\n");
+	trace_create_file("set_ftrace_filter", 0644, d_tracer,
+			NULL, &ftrace_filter_fops);
 
-	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+	trace_create_file("set_ftrace_notrace", 0644, d_tracer,
 				    NULL, &ftrace_notrace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_notrace' entry\n");
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0444, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 	return 0;
@@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = {
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void)
 
 	ftrace_init_dyn_debugfs(d_tracer);
 
-	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-				    NULL, &ftrace_pid_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_pid' entry\n");
+	trace_create_file("set_ftrace_pid", 0644, d_tracer,
+			    NULL, &ftrace_pid_fops);
 
 	ftrace_profile_debugfs(d_tracer);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf4..74a1180 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = {
 static __init int rb_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-				    &ring_buffer_flags, &rb_simple_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_on' entry\n");
+	trace_create_file("tracing_on", 0644, d_tracer,
+			    &ring_buffer_flags, &rb_simple_fops);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 32653c8..0615751 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry, *d_cpu;
+	struct dentry *d_cpu;
 	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
 	char cpu_dir[7];
 
@@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu)
 	}
 
 	/* per cpu trace_pipe */
-	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-				(void *) cpu, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+	trace_create_file("trace_pipe", 0444, d_cpu,
+			(void *) cpu, &tracing_pipe_fops);
 
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0644, d_cpu,
-				(void *) cpu, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
+	trace_create_file("trace", 0644, d_cpu,
+			(void *) cpu, &tracing_fops);
 
-	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-				    (void *) cpu, &tracing_buffers_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+	trace_create_file("trace_pipe_raw", 0444, d_cpu,
+			(void *) cpu, &tracing_buffers_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = {
 	.write = trace_options_core_write,
 };
 
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file(name, mode, parent, data, fops);
+	if (!ret)
+		pr_warning("Could not create debugfs '%s' entry\n", name);
+
+	return ret;
+}
+
+
 static struct dentry *trace_options_init_dentry(void)
 {
 	struct dentry *d_tracer;
@@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
 			 struct tracer_opt *opt)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
@@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
 	topt->flags = flags;
 	topt->opt = opt;
 
-	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
 				    &trace_options_fops);
 
-	topt->entry = entry;
-
 }
 
 static struct trace_option_dentry *
@@ -3861,123 +3868,81 @@ static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return NULL;
 
-	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+	return trace_create_file(option, 0644, t_options, (void *)index,
 				    &trace_options_core_fops);
-
-	return entry;
 }
 
 static __init void create_trace_options_dir(void)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 	int i;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return;
 
-	for (i = 0; trace_options[i]; i++) {
-		entry = create_trace_option_core_file(trace_options[i], i);
-		if (!entry)
-			pr_warning("Could not create debugfs %s entry\n",
-				   trace_options[i]);
-	}
+	for (i = 0; trace_options[i]; i++)
+		create_trace_option_core_file(trace_options[i], i);
 }
 
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 	int cpu;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-				    &global_trace, &tracing_ctrl_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+	trace_create_file("tracing_enabled", 0644, d_tracer,
+			&global_trace, &tracing_ctrl_fops);
 
-	entry = debugfs_create_file("trace_options", 0644, d_tracer,
-				    NULL, &tracing_iter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_options' entry\n");
+	trace_create_file("trace_options", 0644, d_tracer,
+			NULL, &tracing_iter_fops);
 
-	create_trace_options_dir();
+	trace_create_file("tracing_cpumask", 0644, d_tracer,
+			NULL, &tracing_cpumask_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+	trace_create_file("available_tracers", 0444, d_tracer,
+			&global_trace, &show_traces_fops);
+
+	trace_create_file("current_tracer", 0444, d_tracer,
+			&global_trace, &set_tracer_fops);
+
+	trace_create_file("tracing_max_latency", 0644, d_tracer,
+			&tracing_max_latency, &tracing_max_lat_fops);
+
+	trace_create_file("tracing_thresh", 0644, d_tracer,
+			&tracing_thresh, &tracing_max_lat_fops);
 
-	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-				    NULL, &tracing_cpumask_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-	entry = debugfs_create_file("trace", 0644, d_tracer,
-				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
-
-	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-				    &global_trace, &show_traces_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-				    &global_trace, &set_tracer_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-				    &tracing_max_latency,
-				    &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_max_latency' entry\n");
-
-	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-				    &tracing_thresh, &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_thresh' entry\n");
-	entry = debugfs_create_file("README", 0644, d_tracer,
-				    NULL, &tracing_readme_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'README' entry\n");
-
-	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+	trace_create_file("README", 0644, d_tracer,
+			NULL, &tracing_readme_fops);
+
+	trace_create_file("trace_pipe", 0444, d_tracer,
 			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_pipe' entry\n");
-
-	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-				    &global_trace, &tracing_entries_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'buffer_size_kb' entry\n");
-
-	entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-				    NULL, &tracing_mark_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_marker' entry\n");
+
+	trace_create_file("buffer_size_kb", 0644, d_tracer,
+			&global_trace, &tracing_entries_fops);
+
+	trace_create_file("trace_marker", 0220, d_tracer,
+			NULL, &tracing_mark_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-				    &ftrace_update_tot_cnt,
-				    &tracing_dyn_info_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'dyn_ftrace_total_info' entry\n");
+	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
 
+	create_trace_options_dir();
+
 	for_each_tracing_cpu(cpu)
 		tracing_init_debugfs_percpu(cpu);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 47aa6d0..f76a8f86 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -470,6 +470,12 @@ void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops);
+
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba99..199de9c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id)
 			return event->profile_disable(event);
 	}
 }
-
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556..9bece96 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
 static __init int init_trace_printk_function_export(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
 		return 0;
 
-	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+	trace_create_file("printk_formats", 0444, d_tracer,
 				    NULL, &ftrace_formats_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'printk_formats' entry\n");
 
 	return 0;
 }
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65..1796f00 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
 static __init int stack_trace_init(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
-				    &max_stack_size, &stack_max_size_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+	trace_create_file("stack_max_size", 0644, d_tracer,
+			&max_stack_size, &stack_max_size_fops);
 
-	entry = debugfs_create_file("stack_trace", 0444, d_tracer,
-				    NULL, &stack_trace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_trace' entry\n");
+	trace_create_file("stack_trace", 0444, d_tracer,
+			NULL, &stack_trace_fops);
 
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c..e04b76c 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
 
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("sysprof_sample_period", 0644,
+	trace_create_file("sysprof_sample_period", 0644,
 			d_tracer, NULL, &sysprof_sample_fops);
-	if (entry)
-		return;
-	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
-- 
cgit v0.10.2


From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Apr 2009 15:24:12 -0400
Subject: function-graph: use int instead of atomic for ftrace_graph_active

Impact: cleanup

The variable ftrace_graph_active is only modified under the
ftrace_lock mutex, thus an atomic is not necessary for modification.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6ea5a1a..8e6a0b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
 static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	mutex_lock(&ftrace_lock);
 
 	/* we currently allow only one tracer registered at a time */
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-	atomic_inc(&ftrace_graph_active);
+	ftrace_graph_active++;
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_graph_active);
+		ftrace_graph_active--;
 		goto out;
 	}
 
@@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (!unlikely(atomic_read(&ftrace_graph_active)))
+	if (unlikely(!ftrace_graph_active))
 		goto out;
 
-	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_active--;
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-- 
cgit v0.10.2


From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:26:14 +0800
Subject: ftrace: clean up enable logic for sched_switch

Unify sched_switch and sched_wakeup's action to following logic:
Do record_cmdline when start_cmdline_record() is called.
Start tracing events when the tracer is started.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D1C596.5050203@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea..9d8cccd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!sched_ref || sched_stopped)
+	if (unlikely(!sched_ref))
 		return;
 
 	tracing_record_cmdline(prev);
 	tracing_record_cmdline(next);
 
-	if (!tracer_enabled)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
 	pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	unsigned long flags;
 	int cpu, pc;
 
-	if (!likely(tracer_enabled))
+	if (unlikely(!sched_ref))
 		return;
 
-	pc = preempt_count();
 	tracing_record_cmdline(current);
 
-	if (sched_stopped)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
+	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-- 
cgit v0.10.2


From 9c83633ad38138855181af6936e8ac570ef7e2cb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 7 Apr 2009 14:48:16 +0300
Subject: missing unlock in jfs_quota_write()

We should unlock &inode->i_mutex on the error path.  This bug was
in ext2_quota_write().  I sent a patch to them today as well.

Found by smatch (http://repo.or.cz/w/smatch.git).  Compile tested.

regards,
dan carpenter

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>

diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf..d9b0e92 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
-- 
cgit v0.10.2


From 169aafbc8d3f05431b5cfeb60294a12b8ef2bcee Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 7 Apr 2009 13:37:26 -0700
Subject: lguest: update lazy mmu changes to match lguest's use of kvm
 hypercalls

Duplicate hcall -> kvm_hypercall0 convertion from "lguest: use KVM
hypercalls".

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Matias Zabaljauregui <zabaljauregui at gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5ab2397..cfb2d68 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -168,7 +168,7 @@ static void lazy_hcall3(unsigned long call,
  * issue the do-nothing hypercall to flush any stored calls. */
 static void lguest_leave_lazy_mmu_mode(void)
 {
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
 	paravirt_leave_lazy_mmu();
 }
 
-- 
cgit v0.10.2


From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 10:47:17 +0200
Subject: mm, x86, ptrace, bts: defer branch trace stopping, cleanup

Andrew Morton noticed that mm.h needlessly includes sched.h - remove it.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64d8ed2..776b641 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,7 +13,6 @@
 #include <linux/prio_tree.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
-#include <linux/sched.h>
 
 struct mempolicy;
 struct anon_vma;
-- 
cgit v0.10.2


From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 10:56:54 +0200
Subject: mm, x86, ptrace, bts: defer branch trace stopping, remove dead code

Remove the unused free_locked_buffer() API.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 776b641..a3963ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
-extern void free_locked_buffer(void *buffer, size_t size);
 extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 749383b..28be15e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
 
 	up_write(&mm->mmap_sem);
 }
-
-void free_locked_buffer(void *buffer, size_t size)
-{
-	refund_locked_buffer_memory(current->mm, size);
-	kfree(buffer);
-}
-- 
cgit v0.10.2


From dc66270b51a62b1a6888d5309229e638a305c47b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:10 +1000
Subject: perf_counter: fix powerpc build

Commit 4af4998b ("perf_counter: rework context time") changed struct
perf_counter_context to have a 'time' field instead of a 'time_now'
field, but neglected to fix the place in the powerpc perf_counter.c
where the time_now field was accessed.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31922.411398.147810@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f88c35d..0e56513 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -457,8 +457,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
-	counter->tstamp_running += counter->ctx->time_now -
-		counter->tstamp_stopped;
+	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
-- 
cgit v0.10.2


From f708223d49ac39f5af1643985056206c98033f5b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:18 +1000
Subject: perf_counter: powerpc: set sample enable bit for marked instruction
 events

Impact: enable access to hardware feature

POWER processors have the ability to "mark" a subset of the instructions
and provide more detailed information on what happens to the marked
instructions as they flow through the pipeline.  This marking is
enabled by the "sample enable" bit in MMCRA, and there are
synchronization requirements around setting and clearing the bit.

This adds logic to the processor-specific back-ends so that they know
which events relate to marked instructions and set the sampling enable
bit if any event that we want to put on the PMU is a marked instruction
event.  It also adds logic to the generic powerpc code to do the
necessary synchronization if that bit is set.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31930.1024.228867@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e56513..0697ade 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -307,6 +307,15 @@ u64 hw_perf_save_disable(void)
 		}
 
 		/*
+		 * Disable instruction sampling if it was enabled
+		 */
+		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+			mtspr(SPRN_MMCRA,
+			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			mb();
+		}
+
+		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
 		 * executed and the PMU has frozen the counters
@@ -347,12 +356,11 @@ void hw_perf_restore(u64 disable)
 	 * (possibly updated for removal of counters).
 	 */
 	if (!cpuhw->n_added) {
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 		if (cpuhw->n_counters == 0)
 			get_lppaca()->pmcregs_in_use = 0;
-		goto out;
+		goto out_enable;
 	}
 
 	/*
@@ -385,7 +393,7 @@ void hw_perf_restore(u64 disable)
 	 * Then unfreeze the counters.
 	 */
 	get_lppaca()->pmcregs_in_use = 1;
-	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
 				| MMCR0_FC);
@@ -421,10 +429,20 @@ void hw_perf_restore(u64 disable)
 		write_pmc(counter->hw.idx, val);
 		perf_counter_update_userpage(counter);
 	}
-	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+	mb();
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
+	/*
+	 * Enable instruction sampling if necessary
+	 */
+	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		mb();
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	}
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index cec21ea..1222c8e 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -1,5 +1,5 @@
 /*
- * Performance counter support for POWER5 (not POWER5++) processors.
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
  *
  * Copyright 2009 Paul Mackerras, IBM Corporation.
  *
@@ -281,10 +281,107 @@ static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x48) == 0x40) {
+		bit = psel & 7;
+	} else if (psel == 0x28) {
+		bit = pmc - 1;
+	} else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+		bit = 4;
+	}
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+		mask = 0x5f11c000;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm;
 	int i, isbus, bit, grsel;
@@ -404,6 +501,8 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5p_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
 			/* select alternate byte lane */
 			psel |= 0x10;
@@ -419,7 +518,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 379ed10..116c4bb 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -290,10 +290,102 @@ static int power5_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x58) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK))
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+		mask = 0x5f00c0aa;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm, grp;
 	int i, isbus, bit, grsel;
@@ -430,6 +522,8 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if (pmc <= 3)
 			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
 		hwc[i] = pmc;
@@ -442,7 +536,7 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index b1f61f3..fce1fc2 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -49,12 +49,134 @@
 #define MMCR1_PMCSEL_MSK	0xff
 
 /*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ *   0 = direct marked event,
+ *   1 = byte decode event,
+ *   4 = add/and event (PMC1 -> bits 0 & 4),
+ *   5 = add/and event (PMC1 -> bits 1 & 5),
+ *   6 = add/and event (PMC1 -> bits 2 & 6),
+ *   7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+	0,	/* 00 */
+	0,	/* 02 */
+	0,	/* 04 */
+	0x07,	/* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0x04,	/* 08 PM_MRK_DFU_FIN */
+	0x06,	/* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+	0,	/* 0c */
+	0,	/* 0e */
+	0x02,	/* 10 PM_MRK_INST_DISP */
+	0x08,	/* 12 PM_MRK_LSU_DERAT_MISS */
+	0,	/* 14 */
+	0,	/* 16 */
+	0x0c,	/* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+	0x0f,	/* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x01,	/* 1c PM_MRK_INST_ISSUED */
+	0,	/* 1e */
+	0,	/* 20 */
+	0,	/* 22 */
+	0,	/* 24 */
+	0,	/* 26 */
+	0x15,	/* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+	0,	/* 2a */
+	0,	/* 2c */
+	0,	/* 2e */
+	0x4f,	/* 30 */
+	0x7f,	/* 32 */
+	0x4f,	/* 34 */
+	0x5f,	/* 36 */
+	0x6f,	/* 38 */
+	0x4f,	/* 3a */
+	0,	/* 3c */
+	0x08,	/* 3e PM_MRK_INST_TIMEO */
+	0x1f,	/* 40 */
+	0x1f,	/* 42 */
+	0x1f,	/* 44 */
+	0x1f,	/* 46 */
+	0x1f,	/* 48 */
+	0x1f,	/* 4a */
+	0x1f,	/* 4c */
+	0x1f,	/* 4e */
+	0,	/* 50 */
+	0x05,	/* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+	0x1c,	/* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+	0x02,	/* 56 PM_MRK_LD_MISS_L1 */
+	0,	/* 58 */
+	0,	/* 5a */
+	0,	/* 5c */
+	0,	/* 5e */
+};
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+	0x01000000,	/* direct events set 1: byte 3 bit 0 */
+	0x00010000,	/* direct events set 2: byte 2 bit 0 */
+	0, 0, 0, 0,	/* IDU, IFU, nest: nothing */
+	0x00000088,	/* VMX set 1: byte 0 bits 3, 7 */
+	0x000000c0,	/* VMX set 2: byte 0 bits 4-7 */
+	0x04010000,	/* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+	0xff010000u,	/* LSU set 2: byte 2 bit 0, all of byte 3 */
+	0,		/* LSU set 3 */
+	0x00000010,	/* VMX set 3: byte 0 bit 4 */
+	0,		/* BFP set 1 */
+	0x00000022,	/* BFP set 2: byte 0 bits 1, 5 */
+	0, 0
+};
+	
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, ptype;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = (event & PM_PMCSEL_MSK) >> 1;	/* drop edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		ptype = direct_event_is_marked[psel];
+		if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+			return 0;
+		ptype >>= 4;
+		if (ptype == 0)
+			return 1;
+		if (ptype == 1)
+			bit = 0;
+		else
+			bit = ptype ^ (pmc - 1);
+	} else if ((psel & 0x48) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = marked_bus_events[unit];
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
 static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	int i;
 	unsigned int pmc, ev, b, u, s, psel;
 	unsigned int ttmset = 0;
@@ -116,6 +238,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			if (ev & PM_LLAV)
 				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
 		}
+		if (power6_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
@@ -124,7 +248,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0xe)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index c325658..aed8ccd 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -19,6 +19,8 @@
 #define PM_PMC_MSK	0xf
 #define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
 #define PM_UNIT_MSK	0xf
+#define PM_SPCSEL_SH	6
+#define PM_SPCSEL_MSK	3
 #define PM_BYTE_SH	4	/* Byte number of event bus to use */
 #define PM_BYTE_MSK	3
 #define PM_PMCSEL_MSK	0xf
@@ -88,8 +90,11 @@ static short mmcr1_adder_bits[8] = {
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
- *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ *     48-49: SPCSEL value 0x3_0000_0000_0000
  *
  * T0 - TTM0 constraint
  *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
@@ -126,6 +131,57 @@ static short mmcr1_adder_bits[8] = {
  *     0-13: Count of events needing PMC2..PMC8
  */
 
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3) | (1<<5),	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4)			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 7 || psel == 13)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_VPU:
+		mask = 0x4c;		/* byte 0 bits 2,3,6 */
+	case PM_LSU0:
+		/* byte 2 bits 0,2,3,4,6; all of byte 1 */
+		mask = 0x085dff00;
+	case PM_LSU1L:
+		mask = 0x50 << 24;	/* byte 3 bits 4,6 */
+		break;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 /* Masks and values for using events from the various units */
 static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
@@ -138,7 +194,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 
 static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, unit, sh;
+	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
 	int grp = -1;
 
@@ -177,6 +233,11 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 		mask  |= 0x800000000ull;
 		value |= 0x100000000ull;
 	}
+	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+	if (spcsel) {
+		mask  |= 3ull << 48;
+		value |= (u64)spcsel << 48;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
@@ -209,6 +270,7 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char ttmuse[2];
 	unsigned char pmcsel[8];
 	int i;
+	int spcsel;
 
 	if (n_ev > 8)
 		return -1;
@@ -316,6 +378,10 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		pmcsel[pmc] = psel;
 		hwc[i] = pmc;
+		spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+		mmcr1 |= spcsel;
+		if (p970_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 	}
 	for (pmc = 0; pmc < 2; ++pmc)
 		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
-- 
cgit v0.10.2


From bab5bc9e857638880facef76e4b4c3fa807f8c73 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Tue, 7 Apr 2009 23:23:50 -0700
Subject: futex: fixup unlocked requeue pi case

Thomas's testing caught a problem when the requeue target futex is
unowned and multiple tasks are requeued to it. This patch ensures
the FUTEX_WAITERS bit gets set if futex_requeue() will requeue one
or more tasks in addition to the one acquiring the lock.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 185c981..041bf3a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -565,12 +565,14 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 
 /**
  * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
- * @uaddr:	the pi futex user address
- * @hb:		the pi futex hash bucket
- * @key:	the futex key associated with uaddr and hb
- * @ps:		the pi_state pointer where we store the result of the lookup
- * @task:	the task to perform the atomic lock work for.  This will be
- * 		"current" except in the case of requeue pi.
+ * @uaddr:		the pi futex user address
+ * @hb:			the pi futex hash bucket
+ * @key:		the futex key associated with uaddr and hb
+ * @ps:			the pi_state pointer where we store the result of the
+ *			lookup
+ * @task:		the task to perform the atomic lock work for.  This will
+ *			be "current" except in the case of requeue pi.
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Returns:
  *  0 - ready to wait
@@ -582,7 +584,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				union futex_key *key,
 				struct futex_pi_state **ps,
-				struct task_struct *task)
+				struct task_struct *task, int set_waiters)
 {
 	int lock_taken, ret, ownerdied = 0;
 	u32 uval, newval, curval;
@@ -596,6 +598,8 @@ retry:
 	 * the locks. It will most likely not succeed.
 	 */
 	newval = task_pid_vnr(task);
+	if (set_waiters)
+		newval |= FUTEX_WAITERS;
 
 	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
 
@@ -1004,14 +1008,18 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
 
 /**
  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex:	the user address of the to futex
- * @hb1:	the from futex hash bucket, must be locked by the caller
- * @hb2:	the to futex hash bucket, must be locked by the caller
- * @key1:	the from futex key
- * @key2:	the to futex key
+ * @pifutex:		the user address of the to futex
+ * @hb1:		the from futex hash bucket, must be locked by the caller
+ * @hb2:		the to futex hash bucket, must be locked by the caller
+ * @key1:		the from futex key
+ * @key2:		the to futex key
+ * @ps:			address to store the pi_state pointer
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed.  hb1 and hb2 must be held by the caller.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
  *
  * Returns:
  *  0 - failed to acquire the lock atomicly
@@ -1022,15 +1030,23 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 				 struct futex_hash_bucket *hb1,
 				 struct futex_hash_bucket *hb2,
 				 union futex_key *key1, union futex_key *key2,
-				 struct futex_pi_state **ps)
+				 struct futex_pi_state **ps, int set_waiters)
 {
-	struct futex_q *top_waiter;
+	struct futex_q *top_waiter = NULL;
 	u32 curval;
 	int ret;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
 
+	/*
+	 * Find the top_waiter and determine if there are additional waiters.
+	 * If the caller intends to requeue more than 1 waiter to pifutex,
+	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+	 * as we have means to handle the possible fault.  If not, don't set
+	 * the bit unecessarily as it will force the subsequent unlock to enter
+	 * the kernel.
+	 */
 	top_waiter = futex_top_waiter(hb1, key1);
 
 	/* There are no waiters, nothing for us to do. */
@@ -1038,10 +1054,12 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 		return 0;
 
 	/*
-	 * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
-	 * The pi_state is returned in ps in contended cases.
+	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+	 * the contended case or if set_waiters is 1.  The pi_state is returned
+	 * in ps in contended cases.
 	 */
-	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
+	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+				   set_waiters);
 	if (ret == 1)
 		requeue_pi_wake_futex(top_waiter, key2);
 
@@ -1146,9 +1164,14 @@ retry_private:
 	}
 
 	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
-		/* Attempt to acquire uaddr2 and wake the top_waiter. */
+		/*
+		 * Attempt to acquire uaddr2 and wake the top waiter. If we
+		 * intend to requeue waiters, force setting the FUTEX_WAITERS
+		 * bit.  We force this here where we are able to easily handle
+		 * faults rather in the requeue loop below.
+		 */
 		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-						 &key2, &pi_state);
+						 &key2, &pi_state, nr_requeue);
 
 		/*
 		 * At this point the top_waiter has either taken uaddr2 or is
@@ -1810,7 +1833,7 @@ retry:
 retry_private:
 	hb = queue_lock(&q);
 
-	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current);
+	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
 	if (unlikely(ret)) {
 		switch (ret) {
 		case 1:
-- 
cgit v0.10.2


From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:31:58 +0200
Subject: x86_32: introduce restore_fpu_checking()

Impact: cleanup, prepare FPU code unificaton

Like on x86_64, return an error from restore_fpu and kill the task
if it fails.

Also rename restore_fpu to restore_fpu_checking which allows ifdefs
to be removed in math_state_restore().

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e51..09a2d6d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -185,12 +185,10 @@ static inline void tolerant_fwait(void)
 	asm volatile("fnclex ; fwait");
 }
 
-static inline void restore_fpu(struct task_struct *tsk)
+static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
-		xrstor_checking(&tsk->thread.xstate->xsave);
-		return;
-	}
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		return xrstor_checking(&tsk->thread.xstate->xsave);
 	/*
 	 * The "nop" is needed to make the instructions the same
 	 * length.
@@ -200,6 +198,7 @@ static inline void restore_fpu(struct task_struct *tsk)
 		"fxrstor %1",
 		X86_FEATURE_FXSR,
 		"m" (tsk->thread.xstate->fxsave));
+	return 0;
 }
 
 /* We need a safe address that is cheap to find and that is already
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d2883..d696145 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
 	}
 
 	clts();				/* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
-	restore_fpu(tsk);
-#else
 	/*
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
 		force_sig(SIGSEGV, tsk);
 		return;
 	}
-#endif
+
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
 	tsk->fpu_counter++;
 }
-- 
cgit v0.10.2


From 34ba476a01e128aad51e02f9be854584e9ec73cf Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:31:59 +0200
Subject: x86: unify restore_fpu_checking

Impact: cleanup

On x86_32, separate f*rstor to an inline function which makes
restore_fpu_checking the same on both platforms -> move it
outside the ifdefs.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-2-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 09a2d6d..7a6f21d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 	return err;
 }
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-	if (task_thread_info(tsk)->status & TS_XSAVE)
-		return xrstor_checking(&tsk->thread.xstate->xsave);
-	else
-		return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
 /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
    is pending. Clear the x87 state here by setting it to fixed
    values. The kernel data segment can be sometimes 0 and sometimes
@@ -185,10 +177,9 @@ static inline void tolerant_fwait(void)
 	asm volatile("fnclex ; fwait");
 }
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
-		return xrstor_checking(&tsk->thread.xstate->xsave);
 	/*
 	 * The "nop" is needed to make the instructions the same
 	 * length.
@@ -197,7 +188,8 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 		"nop ; frstor %1",
 		"fxrstor %1",
 		X86_FEATURE_FXSR,
-		"m" (tsk->thread.xstate->fxsave));
+		"m" (*fx));
+
 	return 0;
 }
 
@@ -261,6 +253,14 @@ end:
 
 #endif	/* CONFIG_X86_64 */
 
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		return xrstor_checking(&tsk->thread.xstate->xsave);
+	else
+		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
 /*
  * Signal frame handlers...
  */
-- 
cgit v0.10.2


From 4ecf458492c2d97b3f9d850a5f92d79792e0a7e7 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:32:00 +0200
Subject: x86_64: fix incorrect comments

Impact: cleanup

The comments which fxrstor_checking and fxsave_uset refer to is
now in fxsave. Change the comments appropriately.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-3-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 7a6f21d..63d1850 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
 		     : [fx] "r" (fx), "m" (*fx), "0" (0));
 #else
 		     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -112,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
 		     : [fx] "r" (fx), "0" (0));
 #else
 		     : [fx] "cdaSDb" (fx), "0" (0));
-- 
cgit v0.10.2


From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Oct 2008 14:38:08 +0200
Subject: x86 early quirks: eliminate unused function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

this warning:

  arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used

triggers because ati_ixp4x0_rev() is only used in the
ACPI && X86_IO_APIC case.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd9..ebdb85c 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
 }
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 {
 	u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 	d &= 0xff;
 	return d;
 }
+#endif
 
 static void __init ati_bugs(int num, int slot, int func)
 {
-- 
cgit v0.10.2


From cdc1cb0d4445f39561a65204d26f89365f917550 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 3 Apr 2009 17:15:14 -0700
Subject: x86: make wakeup_secondary_cpu_via_init static

Impact: cleanup

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49D6A692.6040400@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef..bddf2cc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-int __devinit
+static int __devinit
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
-- 
cgit v0.10.2


From 02421f98ec55c3ff118f358740ff640f096c7ad6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 3 Apr 2009 17:15:53 -0700
Subject: x86: consistent about warm_reset_vector for UN_NON_UNIQUE_APIC

Impact: cleanup

didn't set it for UV_NON_UNIQUE_APIC, so don't restore it

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49D6A6B9.6060501@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bddf2cc..bf8ad63 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -822,10 +822,12 @@ do_rest:
 	/* mark "stuck" area as not stuck */
 	*((volatile unsigned long *)trampoline_base) = 0;
 
-	/*
-	 * Cleanup possible dangling ends...
-	 */
-	smpboot_restore_warm_reset_vector();
+	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+		/*
+		 * Cleanup possible dangling ends...
+		 */
+		smpboot_restore_warm_reset_vector();
+	}
 
 	return boot_error;
 }
-- 
cgit v0.10.2


From ceb5ac3264686e75e6951de6a18d4baa9bdecb92 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:15 -0500
Subject: swiotlb: comment corrections

Impact: cleanup

swiotlb_map/unmap_single are now swiotlb_map/unmap_page;
trivially change all the comments to reference new names.

Also, there were some comments that should have been
referring to just plain old map_single, not swiotlb_map_single;
fix those as well.

Also change a use of the word "pointer", when what is
referred to is actually a dma/physical address.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-2-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2b0b5a7..170cf56 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -60,8 +60,8 @@ enum dma_sync_target {
 int swiotlb_force;
 
 /*
- * Used to do a quick range check in swiotlb_unmap_single and
- * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * Used to do a quick range check in unmap_single and
+ * sync_single_*, to see if the memory was in fact allocated by this
  * API.
  */
 static char *io_tlb_start, *io_tlb_end;
@@ -560,7 +560,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 				   size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
-		 * Fall back on swiotlb_map_single().
 		 */
 		free_pages((unsigned long) ret, order);
 		ret = NULL;
@@ -568,9 +567,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	if (!ret) {
 		/*
 		 * We are either out of memory or the device can't DMA
-		 * to GFP_DMA memory; fall back on
-		 * swiotlb_map_single(), which will grab memory from
-		 * the lowest available address range.
+		 * to GFP_DMA memory; fall back on map_single(), which
+		 * will grab memory from the lowest available address range.
 		 */
 		ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
 		if (!ret)
@@ -634,7 +632,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
  * physical address to use is returned.
  *
  * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
  */
 dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 			    unsigned long offset, size_t size,
@@ -648,7 +646,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 
 	BUG_ON(dir == DMA_NONE);
 	/*
-	 * If the pointer passed in happens to be in the device's DMA window,
+	 * If the address happens to be in the device's DMA window,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
@@ -679,7 +677,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
 /*
  * Unmap a single streaming mode DMA translation.  The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_single call.  All
+ * match what was provided for in a previous swiotlb_map_page call.  All
  * other usages are undefined.
  *
  * After this call, reads by the cpu to the buffer are guaranteed to see
@@ -703,7 +701,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  * Make physical memory consistent for a single streaming mode DMA translation
  * after a transfer.
  *
- * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
  * using the cpu, yet do not wish to teardown the dma mapping, you must
  * call this function before doing so.  At the next point you give the dma
  * address back to the card, you must first perform a
@@ -777,7 +775,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
 
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_single
+ * This is the scatter-gather version of the above swiotlb_map_page
  * interface.  Here the scatter gather list elements are each tagged with the
  * appropriate dma address and length.  They are obtained via
  * sg_dma_{address,length}(SG).
@@ -788,7 +786,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
  *       The routine returns the number of addr/length pairs actually
  *       used, at most nents.
  *
- * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
  * same here.
  */
 int
@@ -836,7 +834,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
 
 /*
  * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_single() above.
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
  */
 void
 swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
-- 
cgit v0.10.2


From 67131ad0514d7105b55003a0506209cf1bba3f00 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:16 -0500
Subject: swiotlb: fix compile warning

Squash a build warning seen on 32-bit powerpc caused by
calling min() with 2 different types. Use min_t() instead.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-3-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 170cf56..4fd6a76 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -341,7 +341,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
 		unsigned long flags;
 
 		while (size) {
-			sz = min(PAGE_SIZE - offset, size);
+			sz = min_t(size_t, PAGE_SIZE - offset, size);
 
 			local_irq_save(flags);
 			buffer = kmap_atomic(pfn_to_page(pfn),
-- 
cgit v0.10.2


From dd6b02fe427f30520d0adc94aa52352367227873 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:17 -0500
Subject: swiotlb: map_page fix for highmem systems

The current code calls virt_to_phys() on address that might
be in highmem, which is bad.  This wasn't needed, anyway, because
we already have the physical address we need.

Get rid of the now-unused virtual address as well.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-4-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4fd6a76..e8a47c8 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -640,7 +640,6 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 			    struct dma_attrs *attrs)
 {
 	phys_addr_t phys = page_to_phys(page) + offset;
-	void *ptr = page_address(page) + offset;
 	dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);
 	void *map;
 
@@ -651,7 +650,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	 * buffering it.
 	 */
 	if (!address_needs_mapping(dev, dev_addr, size) &&
-	    !range_needs_mapping(virt_to_phys(ptr), size))
+	    !range_needs_mapping(phys, size))
 		return dev_addr;
 
 	/*
-- 
cgit v0.10.2


From ef5722f698bde01cfec2b98fff733a48663ebf55 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:18 -0500
Subject: swiotlb: allow arch override of address_needs_mapping

Some architectures require additional checking to determine
if a device can dma to an address and need to provide their
own address_needs_mapping..

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-5-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index e8a47c8..d81afab 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -145,6 +145,12 @@ static void *swiotlb_bus_to_virt(dma_addr_t address)
 	return phys_to_virt(swiotlb_bus_to_phys(address));
 }
 
+int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
+					       dma_addr_t addr, size_t size)
+{
+	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+}
+
 int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
 {
 	return 0;
@@ -309,10 +315,10 @@ cleanup1:
 	return -ENOMEM;
 }
 
-static int
+static inline int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 {
-	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+	return swiotlb_arch_address_needs_mapping(hwdev, addr, size);
 }
 
 static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
-- 
cgit v0.10.2


From 7fcebbd2d984eac3fdd6da2f4453e7c43d32de89 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:19 -0500
Subject: swiotlb: rename unmap_single to do_unmap_single

Previously, swiotlb_unmap_page and swiotlb_unmap_sg were
duplicating very similar code.  Refactor that code into a
new unmap_single and unmap_single use do_unmap_single.

Note that the swiotlb_unmap_sg code was previously doing
a complicated comparison to determine if an addresses needed
to be unmapped where a simple is_swiotlb_buffer() call
would have sufficed.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-6-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d81afab..2bde54a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -482,7 +482,7 @@ found:
  * dma_addr is the kernel virtual address of the bounce buffer to unmap.
  */
 static void
-unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 {
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -591,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		       (unsigned long long)dev_addr);
 
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-		unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+		do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
 		return NULL;
 	}
 	*dma_handle = dev_addr;
@@ -608,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 		free_pages((unsigned long) vaddr, get_order(size));
 	else
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-		unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+		do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
 }
 EXPORT_SYMBOL(swiotlb_free_coherent);
 
@@ -688,17 +688,29 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  * After this call, reads by the cpu to the buffer are guaranteed to see
  * whatever the device wrote there.
  */
-void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
-			size_t size, enum dma_data_direction dir,
-			struct dma_attrs *attrs)
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+			 size_t size, int dir)
 {
 	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
-		unmap_single(hwdev, dma_addr, size, dir);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+
+	if (is_swiotlb_buffer(dma_addr)) {
+		do_unmap_single(hwdev, dma_addr, size, dir);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(dma_addr, size);
+}
+
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+			size_t size, enum dma_data_direction dir,
+			struct dma_attrs *attrs)
+{
+	unmap_single(hwdev, dev_addr, size, dir);
 }
 EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
 
@@ -850,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 
 	BUG_ON(dir == DMA_NONE);
 
-	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
-				     sg->dma_length, dir);
-		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-	}
+	for_each_sg(sgl, sg, nelems, i)
+		unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 
-- 
cgit v0.10.2


From 380d687833aee098c4a2c3b35beaefe1c1f48d01 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:20 -0500
Subject: swiotlb: use swiotlb_sync_single instead of duplicating code

Right now both swiotlb_sync_single_range and swiotlb_sync_sg
were duplicating the code in swiotlb_sync_single.  Just call it
instead.  Also rearrange the sync_single code for readability.

Note that the swiotlb_sync_sg code was previously doing
a complicated comparison to determine if an addresses needed
to be unmapped where a simple is_swiotlb_buffer() call
would have sufficed.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-7-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2bde54a..d912f06 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -731,10 +731,16 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
+
+	if (is_swiotlb_buffer(dma_addr)) {
 		sync_single(hwdev, dma_addr, size, dir, target);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(dma_addr, size);
 }
 
 void
@@ -761,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
 			  unsigned long offset, size_t size,
 			  int dir, int target)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
-
-	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
-		sync_single(hwdev, dma_addr, size, dir, target);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+	swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
 }
 
 void
@@ -890,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
-	BUG_ON(dir == DMA_NONE);
-
-	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
+	for_each_sg(sgl, sg, nelems, i)
+		swiotlb_sync_single(hwdev, sg->dma_address,
 				    sg->dma_length, dir, target);
-		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-	}
 }
 
 void
-- 
cgit v0.10.2


From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:21 -0500
Subject: swiotlb: change swiotlb_bus_to[phys,virt] prototypes

Add a hwdev argument that is needed on some architectures
in order to access a per-device offset that is taken into
account when producing a physical address (also needed to
get from bus address to virtual address because the physical
address is an intermediate step).

Also make swiotlb_bus_to_virt weak so architectures can
override it.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9..887388a 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 	return paddr;
 }
 
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
 	return baddr;
 }
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ac9ff54..cb1a663 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
 extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
 				      phys_addr_t address);
-extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
+				       dma_addr_t address);
 
 extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d912f06..bffe6d7 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 	return paddr;
 }
 
-phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
 	return baddr;
 }
@@ -140,9 +140,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 	return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
 }
 
-static void *swiotlb_bus_to_virt(dma_addr_t address)
+void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
 {
-	return phys_to_virt(swiotlb_bus_to_phys(address));
+	return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
 }
 
 int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
@@ -691,7 +691,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
 static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 			 size_t size, int dir)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 
@@ -728,7 +728,7 @@ static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 		    size_t size, int dir, int target)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 
-- 
cgit v0.10.2


From a4e94ef0dd391eae05bdeacd12b8da3510957a97 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 17:07:05 +0800
Subject: printk: add support of hh length modifier for printk

Impact: new feature, extend vsprintf format strings

hh is used as length modifier for signed char or unsigned char.
It is supported by glibc, we add kernel support now.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: torvalds@linux-foundation.org
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49CC9739.30107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 7536ace..b56f6d0 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -408,6 +408,8 @@ enum format_type {
 	FORMAT_TYPE_LONG_LONG,
 	FORMAT_TYPE_ULONG,
 	FORMAT_TYPE_LONG,
+	FORMAT_TYPE_UBYTE,
+	FORMAT_TYPE_BYTE,
 	FORMAT_TYPE_USHORT,
 	FORMAT_TYPE_SHORT,
 	FORMAT_TYPE_UINT,
@@ -853,11 +855,15 @@ qualifier:
 	spec->qualifier = -1;
 	if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
 	    *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
-		spec->qualifier = *fmt;
-		++fmt;
-		if (spec->qualifier == 'l' && *fmt == 'l') {
-			spec->qualifier = 'L';
-			++fmt;
+		spec->qualifier = *fmt++;
+		if (unlikely(spec->qualifier == *fmt)) {
+			if (spec->qualifier == 'l') {
+				spec->qualifier = 'L';
+				++fmt;
+			} else if (spec->qualifier == 'h') {
+				spec->qualifier = 'H';
+				++fmt;
+			}
 		}
 	}
 
@@ -919,6 +925,11 @@ qualifier:
 		spec->type = FORMAT_TYPE_SIZE_T;
 	} else if (spec->qualifier == 't') {
 		spec->type = FORMAT_TYPE_PTRDIFF;
+	} else if (spec->qualifier == 'H') {
+		if (spec->flags & SIGN)
+			spec->type = FORMAT_TYPE_BYTE;
+		else
+			spec->type = FORMAT_TYPE_UBYTE;
 	} else if (spec->qualifier == 'h') {
 		if (spec->flags & SIGN)
 			spec->type = FORMAT_TYPE_SHORT;
@@ -1087,6 +1098,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 			case FORMAT_TYPE_PTRDIFF:
 				num = va_arg(args, ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+				num = (unsigned char) va_arg(args, int);
+				break;
+			case FORMAT_TYPE_BYTE:
+				num = (signed char) va_arg(args, int);
+				break;
 			case FORMAT_TYPE_USHORT:
 				num = (unsigned short) va_arg(args, int);
 				break;
@@ -1363,6 +1380,10 @@ do {									\
 			case FORMAT_TYPE_PTRDIFF:
 				save_arg(ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+			case FORMAT_TYPE_BYTE:
+				save_arg(char);
+				break;
 			case FORMAT_TYPE_USHORT:
 			case FORMAT_TYPE_SHORT:
 				save_arg(short);
@@ -1538,6 +1559,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
 			case FORMAT_TYPE_PTRDIFF:
 				num = get_arg(ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+				num = get_arg(unsigned char);
+				break;
+			case FORMAT_TYPE_BYTE:
+				num = get_arg(signed char);
+				break;
 			case FORMAT_TYPE_USHORT:
 				num = get_arg(unsigned short);
 				break;
-- 
cgit v0.10.2


From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 25 Mar 2009 10:50:34 +0900
Subject: x86: smarten /proc/interrupts output for new counters

Now /proc/interrupts of tip tree has new counters:

  CNT: Performance counter interrupts

Format change of output, as like that by commit:

  commit 7a81d9a7da03d2f27840d659f97ef140d032f609
  x86: smarten /proc/interrupts output

should be applied to these new counters too.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d465487..dccaaa8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
-	seq_printf(p, "CNT: ");
+	seq_printf(p, "%*s: ", prec, "CNT");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-- 
cgit v0.10.2


From ae9e6bc9f74f8247cbca50a6a93c80e0d686fa19 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Apr 2009 13:19:54 +0900
Subject: percpu: don't put the first chunk in reverse-map rbtree

Impact: both first chunks don't use rbtree, no functional change

There can be two first chunks - reserved and dynamic with the former
one being optional.  Dynamic first chunk was linked on reverse-mapping
rbtree while the reserved one was mapped manually using the start
address and reserved offset limit.

This patch makes both first chunks to be looked up manually without
using the rbtree.  This is to help getting rid of the rbtree.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: rusty@rustcorp.com.au
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: ralf@linux-mips.org
Cc: davem@davemloft.net
Cc: cooloney@kernel.org
Cc: kyle@mcmartin.ca
Cc: matthew@wil.cx
Cc: grundler@parisc-linux.org
Cc: takata@linux-m32r.org
Cc: benh@kernel.crashing.org
Cc: rth@twiddle.net
Cc: ink@jurassic.park.msu.ru
Cc: heiko.carstens@de.ibm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Christoph Lameter <cl@linux.com>
LKML-Reference: <49D43CEA.3040609@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8f..bf1bf1f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -110,9 +110,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* optional reserved chunk, only accessible for reserved allocations */
+/*
+ * The first chunk which always exists.  Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk.  This chunk reserves part of the first
+ * chunk and serves it for reserved allocations.  The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
 static struct pcpu_chunk *pcpu_reserved_chunk;
-/* offset limit of the reserved chunk */
 static int pcpu_reserved_chunk_limit;
 
 /*
@@ -297,15 +309,16 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  */
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
+	void *first_start = pcpu_first_chunk->vm->addr;
 	struct rb_node *n, *parent;
 	struct pcpu_chunk *chunk;
 
-	/* is it in the reserved chunk? */
-	if (pcpu_reserved_chunk) {
-		void *start = pcpu_reserved_chunk->vm->addr;
-
-		if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+	/* is it in the first chunk? */
+	if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+		/* is it in the reserved area? */
+		if (addr < first_start + pcpu_reserved_chunk_limit)
 			return pcpu_reserved_chunk;
+		return pcpu_first_chunk;
 	}
 
 	/* nah... search the regular ones */
@@ -1147,7 +1160,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
-		pcpu_reserved_chunk = schunk;	/* not for dynamic alloc */
+		pcpu_reserved_chunk = schunk;
+		pcpu_reserved_chunk_limit = static_size + reserved_size;
 	} else {
 		schunk->free_size = dyn_size;
 		dyn_size = 0;			/* dynamic area covered */
@@ -1158,8 +1172,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
-	pcpu_reserved_chunk_limit = static_size + schunk->free_size;
-
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
 		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1238,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	}
 
 	/* link the first chunk in */
-	if (!dchunk) {
-		pcpu_chunk_relocate(schunk, -1);
-		pcpu_chunk_addr_insert(schunk);
-	} else {
-		pcpu_chunk_relocate(dchunk, -1);
-		pcpu_chunk_addr_insert(dchunk);
-	}
+	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
-- 
cgit v0.10.2


From e1b9aa3f47242e757c776a3771bb6613e675bf9c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 2 Apr 2009 13:21:44 +0900
Subject: percpu: remove rbtree and use page->index instead

Impact: use page->index for addr to chunk mapping instead of dedicated rbtree

The rbtree is used to determine the chunk from the virtual address.
However, we can already determine the page struct from a virtual
address and there are several unused fields in page struct used by
vmalloc.  Use the index field to store a pointer to the chunk. Then
there is no need anymore for an rbtree.

tj: * s/(set|get)_chunk/pcpu_\1_page_chunk/

    * Drop inline from the above two functions and moved them upwards
      so that they are with other simple helpers.

    * Initial pages might not (actually most of the time don't) live
      in the vmalloc area.  With the previous patch to manually
      reverse-map both first chunks, this is no longer an issue.
      Removed pcpu_set_chunk() call on initial pages.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: rusty@rustcorp.com.au
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: ralf@linux-mips.org
Cc: davem@davemloft.net
Cc: cooloney@kernel.org
Cc: kyle@mcmartin.ca
Cc: matthew@wil.cx
Cc: grundler@parisc-linux.org
Cc: takata@linux-m32r.org
Cc: benh@kernel.crashing.org
Cc: rth@twiddle.net
Cc: ink@jurassic.park.msu.ru
Cc: heiko.carstens@de.ibm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <49D43D58.4050102@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/mm/percpu.c b/mm/percpu.c
index bf1bf1f..c0b2c1a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers UNIT_SIZE apart.
+ * percpu base registers pcpu_unit_size apart.
  *
  * There are usually many small percpu allocations many of them as
  * small as 4 bytes.  The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
  * region and negative allocated.  Allocation inside a chunk is done
  * by scanning this map sequentially and serving the first matching
  * entry.  This is mostly copied from the percpu_modalloc() allocator.
- * Chunks are also linked into a rb tree to ease address to chunk
- * mapping during free.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
  *
  * To use this allocator, arch code should do the followings.
  *
@@ -61,7 +61,6 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/pfn.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
 
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
-	struct rb_node		rb_node;	/* key is chunk->vm->addr */
 	int			free_size;	/* free bytes in the chunk */
 	int			contig_hint;	/* max contiguous size hint */
 	struct vm_struct	*vm;		/* mapped vmalloc region */
@@ -133,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
  * protects allocation/reclaim paths, chunks and chunk->page arrays.
  * The latter is a spinlock and protects the index data structures -
- * chunk slots, rbtree, chunks and area maps in chunks.
+ * chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -152,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
 static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
-static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
 /* reclaim work to release fully free chunks, scheduled from free path */
 static void pcpu_reclaim(struct work_struct *work);
@@ -203,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 }
 
+/* set the pointer to a chunk in a page struct */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+	page->index = (unsigned long)pcpu;
+}
+
+/* obtain pointer to a chunk from a page struct */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+	return (struct pcpu_chunk *)page->index;
+}
+
 /**
  * pcpu_mem_alloc - allocate memory
  * @size: bytes to allocate
@@ -269,40 +278,9 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 	}
 }
 
-static struct rb_node **pcpu_chunk_rb_search(void *addr,
-					     struct rb_node **parentp)
-{
-	struct rb_node **p = &pcpu_addr_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct pcpu_chunk *chunk;
-
-	while (*p) {
-		parent = *p;
-		chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
-
-		if (addr < chunk->vm->addr)
-			p = &(*p)->rb_left;
-		else if (addr > chunk->vm->addr)
-			p = &(*p)->rb_right;
-		else
-			break;
-	}
-
-	if (parentp)
-		*parentp = parent;
-	return p;
-}
-
 /**
- * pcpu_chunk_addr_search - search for chunk containing specified address
- * @addr: address to search for
- *
- * Look for chunk which might contain @addr.  More specifically, it
- * searchs for the chunk with the highest start address which isn't
- * beyond @addr.
- *
- * CONTEXT:
- * pcpu_lock.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
  *
  * RETURNS:
  * The address of the found chunk.
@@ -310,8 +288,6 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
 	void *first_start = pcpu_first_chunk->vm->addr;
-	struct rb_node *n, *parent;
-	struct pcpu_chunk *chunk;
 
 	/* is it in the first chunk? */
 	if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
@@ -321,42 +297,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 		return pcpu_first_chunk;
 	}
 
-	/* nah... search the regular ones */
-	n = *pcpu_chunk_rb_search(addr, &parent);
-	if (!n) {
-		/* no exactly matching chunk, the parent is the closest */
-		n = parent;
-		BUG_ON(!n);
-	}
-	chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-
-	if (addr < chunk->vm->addr) {
-		/* the parent was the next one, look for the previous one */
-		n = rb_prev(n);
-		BUG_ON(!n);
-		chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-	}
-
-	return chunk;
-}
-
-/**
- * pcpu_chunk_addr_insert - insert chunk into address rb tree
- * @new: chunk to insert
- *
- * Insert @new into address rb tree.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
-{
-	struct rb_node **p, *parent;
-
-	p = pcpu_chunk_rb_search(new->vm->addr, &parent);
-	BUG_ON(*p);
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, &pcpu_addr_root);
+	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
 /**
@@ -768,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 						  alloc_mask, 0);
 			if (!*pagep)
 				goto err;
+			pcpu_set_page_chunk(*pagep, chunk);
 		}
 	}
 
@@ -892,7 +834,6 @@ restart:
 
 	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
-	pcpu_chunk_addr_insert(chunk);
 	goto restart;
 
 area_found:
@@ -981,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
 		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
 			continue;
 
-		rb_erase(&chunk->rb_node, &pcpu_addr_root);
 		list_move(&chunk->list, &todo);
 	}
 
-- 
cgit v0.10.2


From e30e08f65c7ef6c230424264f09c3d53f117f58b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:25 +0200
Subject: perf_counter: fix NMI race in task clock

We should not be updating ctx->time from NMI context, work around that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.681326666@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 863703b..84a3908 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -319,8 +319,6 @@ static void __perf_counter_disable(void *info)
 
 	spin_lock_irqsave(&ctx->lock, flags);
 
-	update_context_time(ctx);
-
 	/*
 	 * If the counter is on, turn it off.
 	 * If it is in error state, leave it in error state.
@@ -2335,13 +2333,11 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
  * Software counter: task time clock
  */
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
 {
-	u64 prev, now;
+	u64 prev;
 	s64 delta;
 
-	now = counter->ctx->time;
-
 	prev = atomic64_xchg(&counter->hw.prev_count, now);
 	delta = now - prev;
 	atomic64_add(delta, &counter->count);
@@ -2369,13 +2365,24 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
 	hrtimer_cancel(&counter->hw.hrtimer);
-	task_clock_perf_counter_update(counter);
+	task_clock_perf_counter_update(counter, counter->ctx->time);
+
 }
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	update_context_time(counter->ctx);
-	task_clock_perf_counter_update(counter);
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(counter->ctx);
+		time = counter->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - counter->ctx->timestamp;
+		time = counter->ctx->time + delta;
+	}
+
+	task_clock_perf_counter_update(counter, time);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
cgit v0.10.2


From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:26 +0200
Subject: perf_counter: provide misc bits in the event header

Limit the size of each record to 64k (or should we count in multiples
of u64 and have a 512K limit?), this gives 16 bits or spare room in the
header, which we can use for misc bits, so as to not have to grow the
record with u64 every time we have a few bits to report.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.769271806@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7f5d353..5bd8817 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -201,9 +201,13 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+#define PERF_EVENT_MISC_KERNEL	(1 << 0)
+#define PERF_EVENT_MISC_USER	(1 << 1)
+
 struct perf_event_header {
 	__u32	type;
-	__u32	size;
+	__u16	misc;
+	__u16	size;
 };
 
 enum perf_event_type {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 84a3908..4af98f9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
 
+	header.misc = user_mode(regs) ?
+		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+
 	if (record_type & PERF_RECORD_IP) {
 		ip = instruction_pointer(regs);
 		header.type |= __PERF_EVENT_IP;
-- 
cgit v0.10.2


From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:27 +0200
Subject: perf_counter: use misc field to widen type

Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that
we can have the full 32bit for PERF_RECORD_ bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.891867663@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5bd8817..4809ae1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -201,8 +201,9 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
-#define PERF_EVENT_MISC_KERNEL	(1 << 0)
-#define PERF_EVENT_MISC_USER	(1 << 1)
+#define PERF_EVENT_MISC_KERNEL		(1 << 0)
+#define PERF_EVENT_MISC_USER		(1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
 
 struct perf_event_header {
 	__u32	type;
@@ -230,36 +231,27 @@ enum perf_event_type {
 	PERF_EVENT_MUNMAP		= 2,
 
 	/*
-	 * Half the event type space is reserved for the counter overflow
-	 * bitfields, as found in hw_event.record_type.
-	 *
-	 * These events will have types of the form:
-	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+	 * will be PERF_RECORD_*
 	 *
 	 * struct {
 	 * 	struct perf_event_header	header;
 	 *
-	 * 	{ u64			ip;	  } && __PERF_EVENT_IP
-	 * 	{ u32			pid, tid; } && __PERF_EVENT_TID
+	 * 	{ u64			ip;	  } && PERF_RECORD_IP
+	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 *
 	 * 	{ u64			nr;
-	 * 	  { u64 event, val; } 	cnt[nr];  } && __PERF_EVENT_GROUP
+	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
 	 * 	{ u16			nr,
 	 * 				hv,
 	 * 				kernel,
 	 * 				user;
-	 * 	  u64			ips[nr];  } && __PERF_EVENT_CALLCHAIN
+	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
 	 *
-	 * 	{ u64			time;     } && __PERF_EVENT_TIME
+	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * };
 	 */
-	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
-	__PERF_EVENT_IP			= PERF_RECORD_IP,
-	__PERF_EVENT_TID		= PERF_RECORD_TID,
-	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
-	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
-	__PERF_EVENT_TIME		= PERF_RECORD_TIME,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4af98f9..bf12df6 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter,
 	int callchain_size = 0;
 	u64 time;
 
-	header.type = PERF_EVENT_COUNTER_OVERFLOW;
+	header.type = 0;
 	header.size = sizeof(header);
 
-	header.misc = user_mode(regs) ?
+	header.misc = PERF_EVENT_MISC_OVERFLOW;
+	header.misc |= user_mode(regs) ?
 		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
 
 	if (record_type & PERF_RECORD_IP) {
 		ip = instruction_pointer(regs);
-		header.type |= __PERF_EVENT_IP;
+		header.type |= PERF_RECORD_IP;
 		header.size += sizeof(ip);
 	}
 
@@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter,
 		tid_entry.pid = current->group_leader->pid;
 		tid_entry.tid = current->pid;
 
-		header.type |= __PERF_EVENT_TID;
+		header.type |= PERF_RECORD_TID;
 		header.size += sizeof(tid_entry);
 	}
 
 	if (record_type & PERF_RECORD_GROUP) {
-		header.type |= __PERF_EVENT_GROUP;
+		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
@@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
-			header.type |= __PERF_EVENT_CALLCHAIN;
+			header.type |= PERF_RECORD_CALLCHAIN;
 			header.size += callchain_size;
 		}
 	}
@@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		 */
 		time = sched_clock();
 
-		header.type |= __PERF_EVENT_TIME;
+		header.type |= PERF_RECORD_TIME;
 		header.size += sizeof(u64);
 	}
 
-- 
cgit v0.10.2


From 808382b33bb4c60df6379ec2db39f332cc56b82a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:28 +0200
Subject: perf_counter: kerneltop: keep up with ABI changes

Update kerneltop to use PERF_EVENT_MISC_OVERFLOW

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.947197470@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 15f3a5f..042c1b8 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1277,22 +1277,22 @@ static void mmap_read(struct mmap_data *md)
 
 		old += size;
 
-		switch (event->header.type) {
-		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
-		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
-			process_event(event->ip.ip, md->counter);
-			break;
-
-		case PERF_EVENT_MMAP:
-		case PERF_EVENT_MUNMAP:
-			printf("%s: %Lu %Lu %Lu %s\n",
-					event->header.type == PERF_EVENT_MMAP
-					  ? "mmap" : "munmap",
-					event->mmap.start,
-					event->mmap.len,
-					event->mmap.pgoff,
-					event->mmap.filename);
-			break;
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_RECORD_IP)
+				process_event(event->ip.ip, md->counter);
+		} else {
+			switch (event->header.type) {
+				case PERF_EVENT_MMAP:
+				case PERF_EVENT_MUNMAP:
+					printf("%s: %Lu %Lu %Lu %s\n",
+							event->header.type == PERF_EVENT_MMAP
+							? "mmap" : "munmap",
+							event->mmap.start,
+							event->mmap.len,
+							event->mmap.pgoff,
+							event->mmap.filename);
+					break;
+			}
 		}
 	}
 
-- 
cgit v0.10.2


From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:29 +0200
Subject: perf_counter: add some comments

Add a few comments because I was forgetting what field what for what
functionality.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.036984214@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4809ae1..8bf764fc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -344,10 +344,12 @@ struct file;
 
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
-	int				nr_pages;
-	atomic_t			wakeup;
-	atomic_t			head;
-	atomic_t			events;
+	int				nr_pages;	/* nr of data pages  */
+
+	atomic_t			wakeup;		/* POLL_ for wakeups */
+	atomic_t			head;		/* write position    */
+	atomic_t			events;		/* event limit       */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
-- 
cgit v0.10.2


From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:30 +0200
Subject: perf_counter: track task-comm data

Similar to the mmap data stream, add one that tracks the task COMM field,
so that the userspace reporting knows what to call a task.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.127422406@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/exec.c b/fs/exec.c
index e015c0b..bf47ed0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	perf_counter_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8bf764fc..a70a55f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -142,8 +142,9 @@ struct perf_counter_hw_event {
 				exclude_idle   :  1, /* don't count when idle */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
+				comm	       :  1, /* include comm data     */
 
-				__reserved_1   : 53;
+				__reserved_1   : 52;
 
 	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
@@ -231,6 +232,16 @@ enum perf_event_type {
 	PERF_EVENT_MUNMAP		= 2,
 
 	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u32				pid, tid;
+	 * 	char				comm[];
+	 * };
+	 */
+	PERF_EVENT_COMM			= 3,
+
+	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
 	 *
@@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
+extern void perf_counter_comm(struct task_struct *tsk);
+
 #define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
@@ -583,6 +596,7 @@ static inline void
 perf_counter_munmap(unsigned long addr, unsigned long len,
 		    unsigned long pgoff, struct file *file) 		{ }
 
+static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index bf12df6..2d4aebb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1917,6 +1917,99 @@ static void perf_counter_output(struct perf_counter *counter,
 }
 
 /*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+	struct task_struct 	*task;
+	char 			*comm;
+	int			comm_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+	} event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+				     struct perf_comm_event *comm_event)
+{
+	struct perf_output_handle handle;
+	int size = comm_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, comm_event->event);
+	perf_output_copy(&handle, comm_event->comm,
+				   comm_event->comm_size);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter,
+				   struct perf_comm_event *comm_event)
+{
+	if (counter->hw_event.comm &&
+	    comm_event->event.header.type == PERF_EVENT_COMM)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+				  struct perf_comm_event *comm_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_comm_match(counter, comm_event))
+			perf_counter_comm_output(counter, comm_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned int size;
+	char *comm = comm_event->task->comm;
+
+	size = ALIGN(strlen(comm), sizeof(u64));
+
+	comm_event->comm = comm;
+	comm_event->comm_size = size;
+
+	comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+	put_cpu_var(perf_cpu_context);
+
+	perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+	struct perf_comm_event comm_event = {
+		.task	= task,
+		.event  = {
+			.header = { .type = PERF_EVENT_COMM, },
+			.pid	= task->group_leader->pid,
+			.tid	= task->pid,
+		},
+	};
+
+	perf_counter_comm_event(&comm_event);
+}
+
+/*
  * mmap tracking
  */
 
-- 
cgit v0.10.2


From de9ac07bbf8f51e0ce40e5428c3a8f627bd237c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:31 +0200
Subject: perf_counter: some simple userspace profiling

# perf-record make -j4 kernel/
 # perf-report | tail -15

  0.39              cc1 [kernel] lock_acquired
  0.42              cc1 [kernel] lock_acquire
  0.51              cc1 [ user ] /lib64/libc-2.8.90.so: _int_free
  0.51               as [kernel] clear_page_c
  0.53              cc1 [ user ] /lib64/libc-2.8.90.so: memcpy
  0.56              cc1 [ user ] /lib64/libc-2.8.90.so: _IO_vfprintf
  0.63              cc1 [kernel] lock_release
  0.67              cc1 [ user ] /lib64/libc-2.8.90.so: strlen
  0.68              cc1 [kernel] debug_smp_processor_id
  1.38              cc1 [ user ] /lib64/libc-2.8.90.so: _int_malloc
  1.55              cc1 [ user ] /lib64/libc-2.8.90.so: memset
  1.77              cc1 [kernel] __lock_acquire
  1.88              cc1 [kernel] clear_page_c
  3.61               as [ user ] /usr/bin/as: <unknown>
 59.16              cc1 [ user ] /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: <unknown>

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
LKML-Reference: <20090408130409.220518450@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 194b662..1dd37ee 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -1,10 +1,16 @@
-BINS = kerneltop perfstat
+BINS = kerneltop perfstat perf-record perf-report
 
 all: $(BINS)
 
 kerneltop: kerneltop.c ../../include/linux/perf_counter.h
 	cc -O6 -Wall -lrt -o $@ $<
 
+perf-record: perf-record.c ../../include/linux/perf_counter.h
+	cc -O6 -Wall -lrt -o $@ $<
+
+perf-report: perf-report.cc ../../include/linux/perf_counter.h
+	g++ -O6 -Wall -lrt -o $@ $<
+
 perfstat: kerneltop
 	ln -sf kerneltop perfstat
 
diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/perf-record.c
new file mode 100644
index 0000000..614de7c
--- /dev/null
+++ b/Documentation/perf_counter/perf-record.c
@@ -0,0 +1,530 @@
+
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        return syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= { };
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			nr_cpus				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 16;
+static int			output;
+static char 			*output_name			= "output.perf";
+static int			group				= 0;
+static unsigned int		realtime_prio			=  0;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-record [<options>]\n"
+	"perf-record Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -c CNT    --count=CNT          # event period to sample\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -o file   --output=<file>      # output file\n"
+	" -r prio   --realtime=<prio>    # use RT prio\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"output",	required_argument,	NULL, 'o'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'e': error				= parse_events(optarg); break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'o': output_name			= strdup(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+static long events;
+static struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	if (old != head)
+		events++;
+
+	size = head - old;
+
+	if ((old & md->mask) + size != (head & md->mask)) {
+		buf = &data[old & md->mask];
+		size = md->mask + 1 - (old & md->mask);
+		old += size;
+		while (size) {
+			int ret = write(output, buf, size);
+			if (ret < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+			size -= ret;
+			buf += ret;
+		}
+	}
+
+	buf = &data[old & md->mask];
+	size = head - old;
+	old += size;
+	while (size) {
+		int ret = write(output, buf, size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		size -= ret;
+		buf += ret;
+	}
+
+	md->prev = old;
+}
+
+static volatile int done = 0;
+
+static void sigchld_handler(int sig)
+{
+	if (sig == SIGCHLD)
+		done = 1;
+}
+
+int main(int argc, char *argv[])
+{
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	int i, counter, group_fd, nr_poll = 0;
+	pid_t pid;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
+	if (output < 0) {
+		perror("failed to create output file");
+		exit(-1);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.config		= event_id[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+			hw_event.nmi		= 1;
+			hw_event.mmap		= 1;
+			hw_event.comm		= 1;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	signal(SIGCHLD, sigchld_handler);
+
+	pid = fork();
+	if (pid < 0)
+		perror("failed to fork");
+
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	/*
+	 * TODO: store the current /proc/$/maps information somewhere
+	 */
+
+	while (!done) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == events)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
new file mode 100644
index 0000000..09da0ba
--- /dev/null
+++ b/Documentation/perf_counter/perf-report.cc
@@ -0,0 +1,472 @@
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <getopt.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+#include <set>
+#include <map>
+#include <string>
+
+
+static char 		const *input_name = "output.perf";
+static int		input;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+typedef union event_union {
+	struct perf_event_header header;
+	struct ip_event ip;
+	struct mmap_event mmap;
+	struct comm_event comm;
+} event_t;
+
+struct section {
+	uint64_t start;
+	uint64_t end;
+
+	uint64_t offset;
+
+	std::string name;
+
+	section() { };
+
+	section(uint64_t stab) : end(stab) { };
+
+	section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
+		start(start), end(start + size), offset(offset), name(name)
+	{ };
+
+	bool operator < (const struct section &s) const {
+		return end < s.end;
+	};
+};
+
+typedef std::set<struct section> sections_t;
+
+struct symbol {
+	uint64_t start;
+	uint64_t end;
+
+	std::string name;
+
+	symbol() { };
+
+	symbol(uint64_t ip) : start(ip) { }
+
+	symbol(uint64_t start, uint64_t len, std::string name) :
+		start(start), end(start + len), name(name)
+	{ };
+
+	bool operator < (const struct symbol &s) const {
+		return start < s.start;
+	};
+};
+
+typedef std::set<struct symbol> symbols_t;
+
+struct dso {
+	sections_t sections;
+	symbols_t syms;
+};
+
+static std::map<std::string, struct dso> dsos;
+
+static void load_dso_sections(std::string dso_name)
+{
+	struct dso &dso = dsos[dso_name];
+
+	std::string cmd = "readelf -DSW " + dso_name;
+
+	FILE *file = popen(cmd.c_str(), "r");
+	if (!file) {
+		perror("failed to open pipe");
+		exit(-1);
+	}
+
+	char *line = NULL;
+	size_t n = 0;
+
+	while (!feof(file)) {
+		uint64_t addr, off, size;
+		char name[32];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+		if (sscanf(line, "  [%*2d] %16s %*14s %Lx %Lx %Lx",
+					name, &addr, &off, &size) == 4) {
+
+			dso.sections.insert(section(addr, size, addr - off, name));
+		}
+#if 0
+		/*
+		 * for reading readelf symbols (-s), however these don't seem
+		 * to include nearly everything, so use nm for that.
+		 */
+		if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
+			   &start, &size, &section, sym) == 4) {
+
+			start -= dso.section_offsets[section];
+
+			dso.syms.insert(symbol(start, size, std::string(sym)));
+		}
+#endif
+	}
+	pclose(file);
+}
+
+static void load_dso_symbols(std::string dso_name, std::string args)
+{
+	struct dso &dso = dsos[dso_name];
+
+	std::string cmd = "nm -nSC " + args + " " + dso_name;
+
+	FILE *file = popen(cmd.c_str(), "r");
+	if (!file) {
+		perror("failed to open pipe");
+		exit(-1);
+	}
+
+	char *line = NULL;
+	size_t n = 0;
+
+	while (!feof(file)) {
+		uint64_t start, size;
+		char c;
+		char sym[1024];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+
+		if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
+			sections_t::const_iterator si =
+				dso.sections.upper_bound(section(start));
+			if (si == dso.sections.end()) {
+				printf("symbol in unknown section: %s\n", sym);
+				continue;
+			}
+
+			start -= si->offset;
+
+			dso.syms.insert(symbol(start, size, sym));
+		}
+	}
+	pclose(file);
+}
+
+static void load_dso(std::string dso_name)
+{
+	load_dso_sections(dso_name);
+	load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
+	load_dso_symbols(dso_name, "");   /* regular ones */
+}
+
+void load_kallsyms(void)
+{
+	struct dso &dso = dsos["[kernel]"];
+
+	FILE *file = fopen("/proc/kallsyms", "r");
+	if (!file) {
+		perror("failed to open kallsyms");
+		exit(-1);
+	}
+
+	char *line;
+	size_t n;
+
+	while (!feof(file)) {
+		uint64_t start;
+		char c;
+		char sym[1024];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+		if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
+			dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
+	}
+	fclose(file);
+}
+
+struct map {
+	uint64_t start;
+	uint64_t end;
+	uint64_t pgoff;
+
+	std::string dso;
+
+	map() { };
+
+	map(uint64_t ip) : end(ip) { }
+
+	map(mmap_event *mmap) {
+		start = mmap->start;
+		end = mmap->start + mmap->len;
+		pgoff = mmap->pgoff;
+
+		dso = std::string(mmap->filename);
+
+		if (dsos.find(dso) == dsos.end())
+			load_dso(dso);
+	};
+
+	bool operator < (const struct map &m) const {
+		return end < m.end;
+	};
+};
+
+typedef std::set<struct map> maps_t;
+
+static std::map<int, maps_t> maps;
+
+static std::map<int, std::string> comms;
+
+static std::map<std::string, int> hist;
+static std::multimap<int, std::string> rev_hist;
+
+static std::string resolve_comm(int pid)
+{
+	std::string comm = "<unknown>";
+	std::map<int, std::string>::const_iterator ci = comms.find(pid);
+	if (ci != comms.end())
+		comm = ci->second;
+
+	return comm;
+}
+
+static std::string resolve_user_symbol(int pid, uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	maps_t &m = maps[pid];
+	maps_t::const_iterator mi = m.upper_bound(map(ip));
+	if (mi == m.end())
+		return sym;
+
+	ip -= mi->start + mi->pgoff;
+
+	symbols_t &s = dsos[mi->dso].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	sym = mi->dso + ": <unknown>";
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = mi->dso + ": " + si->name;
+#if 0
+	else if (si->start <= ip)
+		sym = mi->dso + ": ?" + si->name;
+#endif
+
+	return sym;
+}
+
+static std::string resolve_kernel_symbol(uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	symbols_t &s = dsos["[kernel]"].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = si->name;
+
+	return sym;
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-report [<options>]\n"
+	" -i file   --input=<file>      # input file\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"input",	required_argument,	NULL, 'i'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:i:",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'i': input_name			= strdup(optarg); break;
+		default: error = 1; break;
+		}
+	}
+
+	if (error)
+		display_help();
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret;
+	unsigned long total = 0;
+
+	page_size = getpagesize();
+
+	process_options(argc, argv);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	load_kallsyms();
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+
+		munmap(buf, page_size * mmap_window);
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+	head += event->header.size;
+
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+		std::string comm, sym, level;
+		char output[1024];
+
+		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			level = "[kernel]";
+			sym = resolve_kernel_symbol(event->ip.ip);
+		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			level = "[ user ]";
+			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
+		} else {
+			level = "[  hv  ]";
+		}
+		comm = resolve_comm(event->ip.pid);
+
+		snprintf(output, sizeof(output), "%16s %s %s",
+				comm.c_str(), level.c_str(), sym.c_str());
+		hist[output]++;
+
+		total++;
+
+	} else switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		maps[event->mmap.pid].insert(map(&event->mmap));
+		break;
+
+	case PERF_EVENT_COMM:
+		comms[event->comm.pid] = std::string(event->comm.comm);
+		break;
+	}
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	close(input);
+
+	std::map<std::string, int>::iterator hi = hist.begin();
+
+	while (hi != hist.end()) {
+		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
+		hist.erase(hi++);
+	}
+
+	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+
+	while (ri != rev_hist.end()) {
+		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
+		ri++;
+	}
+
+	return 0;
+}
+
-- 
cgit v0.10.2


From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:32 +0200
Subject: perf_counter: move PERF_RECORD_TIME

Move PERF_RECORD_TIME so that all the fixed length items come before
the variable length ones.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.307926436@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a70a55f..8bd1be5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -100,9 +100,9 @@ enum sw_event_ids {
 enum perf_counter_record_format {
 	PERF_RECORD_IP		= 1U << 0,
 	PERF_RECORD_TID		= 1U << 1,
-	PERF_RECORD_GROUP	= 1U << 2,
-	PERF_RECORD_CALLCHAIN	= 1U << 3,
-	PERF_RECORD_TIME	= 1U << 4,
+	PERF_RECORD_TIME	= 1U << 2,
+	PERF_RECORD_GROUP	= 1U << 3,
+	PERF_RECORD_CALLCHAIN	= 1U << 4,
 };
 
 /*
@@ -250,6 +250,7 @@ enum perf_event_type {
 	 *
 	 * 	{ u64			ip;	  } && PERF_RECORD_IP
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
+	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
@@ -259,8 +260,6 @@ enum perf_event_type {
 	 * 				kernel,
 	 * 				user;
 	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
-	 *
-	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * };
 	 */
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2d4aebb..4dc8600d2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
+	if (record_type & PERF_RECORD_TIME) {
+		/*
+		 * Maybe do better on x86 and provide cpu_clock_nmi()
+		 */
+		time = sched_clock();
+
+		header.type |= PERF_RECORD_TIME;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter,
 		}
 	}
 
-	if (record_type & PERF_RECORD_TIME) {
-		/*
-		 * Maybe do better on x86 and provide cpu_clock_nmi()
-		 */
-		time = sched_clock();
-
-		header.type |= PERF_RECORD_TIME;
-		header.size += sizeof(u64);
-	}
-
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_TID)
 		perf_output_put(&handle, tid_entry);
 
+	if (record_type & PERF_RECORD_TIME)
+		perf_output_put(&handle, time);
+
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (callchain)
 		perf_output_copy(&handle, callchain, callchain_size);
 
-	if (record_type & PERF_RECORD_TIME)
-		perf_output_put(&handle, time);
-
 	perf_output_end(&handle);
 }
 
-- 
cgit v0.10.2


From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:33 +0200
Subject: perf_counter: allow for data addresses to be recorded

Paul suggested we allow for data addresses to be recorded along with
the traditional IPs as power can provide these.

For now, only the software pagefault events provide data addresses,
but in the future power might as well for some events.

x86 doesn't seem capable of providing this atm.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.394816925@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0697ade..c9d019f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs, 0);
 }
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 17bbf6f..ac0e112 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,8 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -322,7 +323,8 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1116a41..0fcbaab 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,7 @@ again:
 			continue;
 
 		perf_save_and_restart(counter);
-		if (perf_counter_overflow(counter, nmi, regs))
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f2d3324..6f9df2b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,10 +1142,12 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 
 	check_v8086_mode(regs, address, tsk);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8bd1be5..c22363a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -101,8 +101,9 @@ enum perf_counter_record_format {
 	PERF_RECORD_IP		= 1U << 0,
 	PERF_RECORD_TID		= 1U << 1,
 	PERF_RECORD_TIME	= 1U << 2,
-	PERF_RECORD_GROUP	= 1U << 3,
-	PERF_RECORD_CALLCHAIN	= 1U << 4,
+	PERF_RECORD_ADDR	= 1U << 3,
+	PERF_RECORD_GROUP	= 1U << 4,
+	PERF_RECORD_CALLCHAIN	= 1U << 5,
 };
 
 /*
@@ -251,6 +252,7 @@ enum perf_event_type {
 	 * 	{ u64			ip;	  } && PERF_RECORD_IP
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
+	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
@@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 extern int perf_counter_overflow(struct perf_counter *counter,
-				 int nmi, struct pt_regs *regs);
+				 int nmi, struct pt_regs *regs, u64 addr);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
@@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter)
 		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
 }
 
-extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 
 extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 			      unsigned long pgoff, struct file *file);
@@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
 static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
-
+perf_swcounter_event(u32 event, u64 nr, int nmi,
+		     struct pt_regs *regs, u64 addr)			{ }
 
 static inline void
 perf_counter_mmap(unsigned long addr, unsigned long len,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4dc8600d2..321c57e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	update_context_time(ctx);
 
 	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 }
 
 static void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs)
+				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
 	u64 record_type = counter->hw_event.record_type;
@@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_ADDR) {
+		header.type |= PERF_RECORD_ADDR;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_TIME)
 		perf_output_put(&handle, time);
 
+	if (record_type & PERF_RECORD_ADDR)
+		perf_output_put(&handle, addr);
+
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
  */
 
 int perf_counter_overflow(struct perf_counter *counter,
-			  int nmi, struct pt_regs *regs)
+			  int nmi, struct pt_regs *regs, u64 addr)
 {
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
@@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct perf_counter *counter,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, regs);
+	perf_counter_output(counter, nmi, regs, addr);
 	return ret;
 }
 
@@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs) {
-		if (perf_counter_overflow(counter, 0, regs))
+		if (perf_counter_overflow(counter, 0, regs, 0))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct pt_regs *regs)
+				    int nmi, struct pt_regs *regs, u64 addr)
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, regs))
+	if (perf_counter_overflow(counter, nmi, regs, addr))
 		/* soft-disable the counter */
 		;
 
@@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter,
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct pt_regs *regs)
+			       int nmi, struct pt_regs *regs, u64 addr)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 	if (counter->hw.irq_period && !neg)
-		perf_swcounter_overflow(counter, nmi, regs);
+		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_event_types type, u32 event,
-				     u64 nr, int nmi, struct pt_regs *regs)
+				     u64 nr, int nmi, struct pt_regs *regs,
+				     u64 addr)
 {
 	struct perf_counter *counter;
 
@@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 		if (perf_swcounter_match(counter, type, event, regs))
-			perf_swcounter_add(counter, nr, nmi, regs);
+			perf_swcounter_add(counter, nr, nmi, regs, addr);
 	}
 	rcu_read_unlock();
 }
@@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 }
 
 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
-				   u64 nr, int nmi, struct pt_regs *regs)
+				   u64 nr, int nmi, struct pt_regs *regs,
+				   u64 addr)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 	(*recursion)++;
 	barrier();
 
-	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
+	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+				 nr, nmi, regs, addr);
 	if (cpuctx->task_ctx) {
 		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
-				nr, nmi, regs);
+					 nr, nmi, regs, addr);
 	}
 
 	barrier();
@@ -2349,9 +2360,10 @@ out:
 	put_cpu_var(perf_cpu_context);
 }
 
-void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
+void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
+	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id)
 	if (!regs)
 		regs = task_pt_regs(current);
 
-	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
+	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
 }
 
 extern int ftrace_profile_enable(int);
-- 
cgit v0.10.2


From e85abf8f432bb2a13733ab7609fbb8e1500af51d Mon Sep 17 00:00:00 2001
From: Gary Hade <garyhade@us.ibm.com>
Date: Wed, 8 Apr 2009 14:07:25 -0700
Subject: x86: consolidate SMP code in io_apic.c

Impact: Cleanup

Reorganizes the code in arch/x86/kernel/io_apic.c by
combining two '#ifdef CONFIG_SMP' regions.  In addition
to making the code easier to understand the first
'#ifdef CONFIG_SMP' region is moved to a location later
in the file which will reduce the need for function
forward declarations when the code subsequently revised.

The only changes other than relocating code to a different
position in the file were the removal of the assign_irq_vector()
forward declaration which was no longer needed and some line
length reduction formatting changes.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: lcm@us.ibm.com
LKML-Reference: <20090408210725.GC11159@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 767fe7e..7c9d045 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -518,120 +518,6 @@ static void ioapic_mask_entry(int apic, int pin)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
-	cpumask_var_t cleanup_mask;
-
-	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-		unsigned int i;
-		cfg->move_cleanup_count = 0;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			cfg->move_cleanup_count++;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
-	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
-		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-		free_cpumask_var(cleanup_mask);
-	}
-	cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	entry = cfg->irq_2_pin;
-	for (;;) {
-		unsigned int reg;
-
-		if (!entry)
-			break;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		/*
-		 * With interrupt-remapping, destination information comes
-		 * from interrupt-remapping table entry.
-		 */
-		if (!irq_remapped(irq))
-			io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-		if (!entry->next)
-			break;
-		entry = entry->next;
-	}
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg;
-	unsigned int irq;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return BAD_APICID;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-	if (assign_irq_vector(irq, cfg, mask))
-		return BAD_APICID;
-
-	/* check that before desc->addinity get updated */
-	set_extra_move_desc(desc, mask);
-
-	cpumask_copy(desc->affinity, mask);
-
-	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int dest;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	dest = set_desc_affinity(desc, mask);
-	if (dest != BAD_APICID) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, cfg);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -2360,6 +2246,115 @@ static int ioapic_retrigger_irq(unsigned int irq)
  */
 
 #ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	cpumask_var_t cleanup_mask;
+
+	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+		unsigned int i;
+		cfg->move_cleanup_count = 0;
+		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+			cfg->move_cleanup_count++;
+		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+	} else {
+		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		free_cpumask_var(cleanup_mask);
+	}
+	cfg->move_in_progress = 0;
+}
+
+static void
+__target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+		unsigned int reg;
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		/*
+		 * With interrupt-remapping, destination information comes
+		 * from interrupt-remapping table entry.
+		 */
+		if (!irq_remapped(irq))
+			io_apic_write(apic, 0x11 + pin*2, dest);
+		reg = io_apic_read(apic, 0x10 + pin*2);
+		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+		reg |= vector;
+		io_apic_modify(apic, 0x10 + pin*2, reg);
+		if (!entry->next)
+			break;
+		entry = entry->next;
+	}
+}
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+	struct irq_cfg *cfg;
+	unsigned int irq;
+
+	if (!cpumask_intersects(mask, cpu_online_mask))
+		return BAD_APICID;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
+		return BAD_APICID;
+
+	/* check that before desc->addinity get updated */
+	set_extra_move_desc(desc, mask);
+
+	cpumask_copy(desc->affinity, mask);
+
+	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int dest;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	dest = set_desc_affinity(desc, mask);
+	if (dest != BAD_APICID) {
+		/* Only the high 8 bits are valid. */
+		dest = SET_APIC_LOGICAL_ID(dest);
+		__target_IO_APIC_irq(irq, dest, cfg);
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
 
 #ifdef CONFIG_INTR_REMAP
 
-- 
cgit v0.10.2


From 6c0b324435ff49fb3c68fe808a93853d81c7fb97 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 9 Apr 2009 09:27:37 +1000
Subject: perf_counter: add MAINTAINERS entry

This adds an entry in MAINTAINERS for the perf_counter subsystem.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18909.13033.345975.434902@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/MAINTAINERS b/MAINTAINERS
index c3b2159..16fb45a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3483,6 +3483,16 @@ M:	balbir@linux.vnet.ibm.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
+PERFORMANCE COUNTER SUBSYSTEM
+P:	Peter Zijlstra
+M:	a.p.zijlstra@chello.nl
+P:	Paul Mackerras
+M:	paulus@samba.org
+P:	Ingo Molnar
+M:	mingo@elte.hu
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+
 PERSONALITY HANDLING
 P:	Christoph Hellwig
 M:	hch@infradead.org
-- 
cgit v0.10.2


From ca8f2d7f019a8547f39ddb9ed0144932f12807f2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 9 Apr 2009 14:42:56 +1000
Subject: perf_counter: powerpc: add nmi_enter/nmi_exit calls

Impact: fix potential deadlocks on powerpc

Now that the core is using in_nmi() (added in e30e08f6, "perf_counter:
fix NMI race in task clock"), we need the powerpc perf_counter_interrupt
to call nmi_enter() and nmi_exit() in those cases where the interrupt
happens when interrupts are soft-disabled.

If interrupts were soft-enabled, we can treat it as a regular interrupt
and do irq_enter/irq_exit around the whole routine. This lets us get rid
of the test_perf_counter_pending() call at the end of
perf_counter_interrupt, thus simplifying things a little.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18909.31952.873098.336615@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c9d019f..bd76d0f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -714,7 +714,7 @@ hw_perf_counter_init(struct perf_counter *counter)
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_counter *counter, long val,
-			       struct pt_regs *regs)
+			       struct pt_regs *regs, int nmi)
 {
 	s64 prev, delta, left;
 	int record = 0;
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs, 0);
+		perf_counter_overflow(counter, nmi, regs, 0);
 }
 
 /*
@@ -762,6 +762,17 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct perf_counter *counter;
 	long val;
 	int found = 0;
+	int nmi;
+
+	/*
+	 * If interrupts were soft-disabled when this PMU interrupt
+	 * occurred, treat it as an NMI.
+	 */
+	nmi = !regs->softe;
+	if (nmi)
+		nmi_enter();
+	else
+		irq_enter();
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -769,7 +780,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		if ((int)val < 0) {
 			/* counter has overflowed */
 			found = 1;
-			record_and_restart(counter, val, regs);
+			record_and_restart(counter, val, regs, nmi);
 		}
 	}
 
@@ -796,18 +807,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
-	/*
-	 * If we need a wakeup, check whether interrupts were soft-enabled
-	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have do the wakeup when interrupts
-	 * get soft-enabled.
-	 */
-	if (test_perf_counter_pending() && regs->softe) {
-		irq_enter();
-		clear_perf_counter_pending();
-		perf_counter_do_pending();
+	if (nmi)
+		nmi_exit();
+	else
 		irq_exit();
-	}
 }
 
 void hw_perf_counter_setup(int cpu)
-- 
cgit v0.10.2


From d2de688891909b148efe83a6fc9520a9cd6015f0 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 9 Apr 2009 15:17:22 +1000
Subject: sparc64: extend TI_RESTART_BLOCK space by 8 bytes

Impact: build fix

Today's linux-next build (sparc64 defconfig) failed like this:

  arch/sparc/kernel/built-in.o: In function `trap_init':
  (.init.text+0x4): undefined reference to `thread_info_offsets_are_bolixed_dave'

Caused by commit 52400ba946759af28442dee6265c5c0180ac7122 ("futex: add
requeue_pi functionality") (from the tip-core tree) which changed the
size of struct restart_block.

Shift TI_KUNA_REGS and TI_KUNA_INSN up by 8 bytes to make space for the
larger restart block.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090409151722.c8eabb56.sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 639ac80..6586572 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,8 +102,8 @@ struct thread_info {
 #define TI_KERN_CNTD1	0x00000488
 #define TI_PCR		0x00000490
 #define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS	0x000004c0
-#define TI_KUNA_INSN	0x000004c8
+#define TI_KUNA_REGS	0x000004c8
+#define TI_KUNA_INSN	0x000004d0
 #define TI_FPREGS	0x00000500
 
 /* We embed this in the uppermost byte of thread_info->flags */
-- 
cgit v0.10.2


From 002f128b473fb82f454654be5081b0919ee01ab2 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 8 Apr 2009 15:29:43 -0700
Subject: sched: remove redundant hierarchy walk in check_preempt_wakeup

Impact: micro-optimization

Under group scheduling we traverse up until we are at common siblings
to make the wakeup comparison on.

At this point however, they should have the same parent so continuing
to check up the tree is redundant.

Signed-off-by: Paul Turner <pjt@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <alpine.DEB.1.00.0904081520320.30317@kitami.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f21..5f9650e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
 	find_matching_se(&se, &pse);
 
-	while (se) {
-		BUG_ON(!pse);
+	BUG_ON(!pse);
 
-		if (wakeup_preempt_entity(se, pse) == 1) {
-			resched_task(curr);
-			break;
-		}
-
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
+	if (wakeup_preempt_entity(se, pse) == 1)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
-- 
cgit v0.10.2


From 888fcee066a2f4abd0d0bc9418c0535f9b01e6e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Apr 2009 09:48:22 +0200
Subject: perf_counter: fix off task->comm by one

strlen() does not include the \0.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 321c57e..b07195b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1989,7 +1989,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	unsigned int size;
 	char *comm = comm_event->task->comm;
 
-	size = ALIGN(strlen(comm), sizeof(u64));
+	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
 	comm_event->comm_size = size;
@@ -2109,7 +2109,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	}
 
 got_name:
-	size = ALIGN(strlen(name), sizeof(u64));
+	size = ALIGN(strlen(name)+1, sizeof(u64));
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
-- 
cgit v0.10.2


From b3828ebb3901adfe989d8d4157ed28247aeec132 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Apr 2009 09:50:04 +0200
Subject: perf_counter tools: include PID in perf-report output, tweak
 user/kernel printut

It's handier than an <unknown> entry.
Also replace the kernel/user column with a more compact version:

  0.52              cc1  [k]  page_fault
  0.57               :0  [k]   _spin_lock
  0.59            :7506  [.]  <unknown>
  0.69               as  [.]  /usr/bin/as: <unknown>
  0.76              cc1  [.]  /lib64/libc-2.8.so: _int_free
  0.92              cc1  [k]  clear_page_c
  1.00            :7465  [.]  <unknown>
  1.43              cc1  [.]  /lib64/libc-2.8.so: memset
  1.86              cc1  [.]  /lib64/libc-2.8.so: _int_malloc
 70.33              cc1  [.]  /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: <unknown>

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 09da0ba..1727317 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -277,10 +277,17 @@ static std::multimap<int, std::string> rev_hist;
 
 static std::string resolve_comm(int pid)
 {
-	std::string comm = "<unknown>";
+	std::string comm;
+
 	std::map<int, std::string>::const_iterator ci = comms.find(pid);
-	if (ci != comms.end())
+	if (ci != comms.end()) {
 		comm = ci->second;
+	} else {
+		char pid_str[30];
+
+		sprintf(pid_str, ":%d", pid);
+		comm = pid_str;
+	}
 
 	return comm;
 }
@@ -422,13 +429,13 @@ more:
 		char output[1024];
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			level = "[kernel]";
+			level = " [k] ";
 			sym = resolve_kernel_symbol(event->ip.ip);
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			level = "[ user ]";
+			level = " [.] ";
 			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
 		} else {
-			level = "[  hv  ]";
+			level = " [H] ";
 		}
 		comm = resolve_comm(event->ip.pid);
 
-- 
cgit v0.10.2


From 9ee318a7825929bc3734110b83ae8e20e53d9de3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:44 +0200
Subject: perf_counter: optimize mmap/comm tracking

Impact: performance optimization

The mmap/comm tracking code does quite a lot of work before it discovers
there's no interest in it, avoid that by keeping a counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.427173196@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b07195b..76376ec 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -38,6 +38,10 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_mmap_tracking __read_mostly;
+static atomic_t nr_munmap_tracking __read_mostly;
+static atomic_t nr_comm_tracking __read_mostly;
+
 /*
  * Mutex for (sysadmin-configurable) counter reservations:
  */
@@ -1186,6 +1190,13 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
+	if (counter->hw_event.mmap)
+		atomic_dec(&nr_mmap_tracking);
+	if (counter->hw_event.munmap)
+		atomic_dec(&nr_munmap_tracking);
+	if (counter->hw_event.comm)
+		atomic_dec(&nr_comm_tracking);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -2005,7 +2016,12 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 void perf_counter_comm(struct task_struct *task)
 {
-	struct perf_comm_event comm_event = {
+	struct perf_comm_event comm_event;
+
+	if (!atomic_read(&nr_comm_tracking))
+		return;
+       
+	comm_event = (struct perf_comm_event){
 		.task	= task,
 		.event  = {
 			.header = { .type = PERF_EVENT_COMM, },
@@ -2128,7 +2144,12 @@ got_name:
 void perf_counter_mmap(unsigned long addr, unsigned long len,
 		       unsigned long pgoff, struct file *file)
 {
-	struct perf_mmap_event mmap_event = {
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_mmap_tracking))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
@@ -2146,7 +2167,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 void perf_counter_munmap(unsigned long addr, unsigned long len,
 			 unsigned long pgoff, struct file *file)
 {
-	struct perf_mmap_event mmap_event = {
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_munmap_tracking))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MUNMAP, },
@@ -2725,6 +2751,13 @@ done:
 
 	counter->hw_ops = hw_ops;
 
+	if (counter->hw_event.mmap)
+		atomic_inc(&nr_mmap_tracking);
+	if (counter->hw_event.munmap)
+		atomic_inc(&nr_munmap_tracking);
+	if (counter->hw_event.comm)
+		atomic_inc(&nr_comm_tracking);
+
 	return counter;
 }
 
-- 
cgit v0.10.2


From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:45 +0200
Subject: perf_counter: sysctl for system wide perf counters

Impact: add sysctl for paranoid/relaxed perfcounters policy

Allow the use of system wide perf counters to everybody, but provide
a sysctl to disable it for the paranoid security minded.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.514046352@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c22363a..9814328 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -568,6 +568,8 @@ struct perf_callchain_entry {
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
+extern int sysctl_perf_counter_priv;
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 76376ec..7efb7eb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
+int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+
 /*
  * Mutex for (sysadmin-configurable) counter reservations:
  */
@@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (!capable(CAP_SYS_ADMIN))
+		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4286b62..8ba4578 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
+#ifdef CONFIG_PERF_COUNTERS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_privileged",
+		.data		= &sysctl_perf_counter_priv,
+		.maxlen		= sizeof(sysctl_perf_counter_priv),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v0.10.2


From d3d21c412d8525eb2e208d990ab5eee5fb0fe03d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:46 +0200
Subject: perf_counter: log full path names

Impact: fix perf-report output for /home mounted binaries, etc.

dentry_path() only provide path-names up to the mount root, which is
unsuited for out purpose, use d_path() instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.601794134@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7efb7eb..7f9521c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2116,7 +2116,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			name = strncpy(tmp, "//enomem", sizeof(tmp));
 			goto got_name;
 		}
-		name = dentry_path(file->f_dentry, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX);
 		if (IS_ERR(name)) {
 			name = strncpy(tmp, "//toolong", sizeof(tmp));
 			goto got_name;
-- 
cgit v0.10.2


From e7c064889606aab3569669078c69b87b2c527e72 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 23:48:41 -0800
Subject: xen: add FIX_TEXT_POKE to fixmap

FIX_TEXT_POKE[01] are used to map kernel addresses, so they're mapping
pfns, not mfns.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 77b242c..a96f5b9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1812,6 +1812,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
 #ifdef CONFIG_X86_LOCAL_APIC
 	case FIX_APIC_BASE:	/* maps dummy local APIC */
 #endif
+	case FIX_TEXT_POKE0:
+	case FIX_TEXT_POKE1:
+		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;
 
-- 
cgit v0.10.2


From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:08:28 -0700
Subject: x86, setup: "glove box" BIOS calls -- infrastructure

Impact: new interfaces (not yet used)

For all the platforms out there, there is an infinite number of buggy
BIOSes.  This adds infrastructure to treat BIOS interrupts more like
toxic waste and "glove box" them -- we switch out the register set,
perform the BIOS interrupt, and then restore the previous state.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rjw@sisk.pl>

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e..658bc52 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets		:= vmlinux.bin setup.bin setup.elf bzImage
 targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
-setup-y		+= a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
 setup-y		+= header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y		+= printf.o string.o tty.o video.o video-mode.o version.o
+setup-y		+= printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y		+= version.o
 setup-$(CONFIG_X86_APM_BOOT) += apm.o
 
 # The link order of the video-*.o modules can matter.  In particular,
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 0000000..22b4b3e
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
+ * touching memory they shouldn't be.
+ */
+
+	.code16
+	.text
+	.globl	intcall
+	.type	intcall, @function
+intcall:
+	/* Self-modify the INT instruction.  Ugly, but works. */
+	cmpb	%al, 3f
+	je	1f
+	movb	%al, 3f
+	jmp	1f		/* Synchronize pipeline */
+1:
+	/* Save state */
+	pushfl
+	pushw	%fs
+	pushw	%gs
+	pushal
+
+	/* Copy input state to stack frame */
+	subw	$44, %sp
+	movw	%dx, %si
+	movw	%sp, %di
+	movw	$11, %cx
+	rep; movsd
+
+	/* Pop full state from the stack */
+	popal
+	popw	%gs
+	popw	%fs
+	popw	%es
+	popw	%ds
+	popfl
+
+	/* Actual INT */
+	.byte	0xcd		/* INT opcode */
+3:	.byte	0
+
+	/* Push full state to the stack */
+	pushfl
+	pushw	%ds
+	pushw	%es
+	pushw	%fs
+	pushw	%gs
+	pushal
+
+	/* Re-establish C environment invariants */
+	cld
+	movzwl	%sp, %esp
+	movw	%cs, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+
+	/* Copy output state from stack frame */
+	movw	68(%esp), %di	/* Original %cx == 3rd argument */
+	andw	%di, %di
+	jz	4f
+	movw	%sp, %si
+	movw	$11, %cx
+	rep; movsd
+4:	addw	$44, %sp
+
+	/* Restore state and return */
+	popal
+	popw	%gs
+	popw	%fs
+	popfl
+	retl
+	.size	intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e..98239d2 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
 #include <asm/setup.h>
 #include "bitops.h"
 #include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
 /* apm.c */
 int query_apm_bios(void);
 
+/* bioscall.c */
+struct biosregs {
+	union {
+		struct {
+			u32 edi;
+			u32 esi;
+			u32 ebp;
+			u32 _esp;
+			u32 ebx;
+			u32 edx;
+			u32 ecx;
+			u32 eax;
+			u32 _fsgs;
+			u32 _dses;
+			u32 eflags;
+		};
+		struct {
+			u16 di, hdi;
+			u16 si, hsi;
+			u16 bp, hbp;
+			u16 _sp, _hsp;
+			u16 bx, hbx;
+			u16 dx, hdx;
+			u16 cx, hcx;
+			u16 ax, hax;
+			u16 gs, fs;
+			u16 es, ds;
+			u16 flags, hflags;
+		};
+		struct {
+			u8 dil, dih, edi2, edi3;
+			u8 sil, sih, esi2, esi3;
+			u8 bpl, bph, ebp2, ebp3;
+			u8 _spl, _sph, _esp2, _esp3;
+			u8 bl, bh, ebx2, ebx3;
+			u8 dl, dh, edx2, edx3;
+			u8 cl, ch, ecx2, ecx3;
+			u8 al, ah, eax2, eax3;
+		};
+	};
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
 int vsprintf(char *buf, const char *fmt, va_list args);
 int printf(const char *fmt, ...);
 
+/* regs.c */
+void initregs(struct biosregs *regs);
+
 /* string.c */
 int strcmp(const char *str1, const char *str2);
 size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c..486d97f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -221,7 +221,7 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 
 # End of setup header #####################################################
 
-	.section ".inittext", "ax"
+	.section ".entrytext", "ax"
 start_of_setup:
 #ifdef SAFE_RESET_DISK_CONTROLLER
 # Reset the disk controller.
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 0000000..958019b
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+	memset(reg, 0, sizeof *reg);
+	reg->eflags |= X86_EFLAGS_CF;
+	reg->ds = ds();
+	reg->es = ds();
+	reg->fs = fs();
+	reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2d..0f6ec455 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
 
 	. = 497;
 	.header		: { *(.header) }
+	.entrytext	: { *(.entrytext) }
 	.inittext	: { *(.inittext) }
 	.initdata	: { *(.initdata) }
+	__end_init = .;
+
 	.text		: { *(.text) }
 	.text32		: { *(.text32) }
 
@@ -52,4 +55,7 @@ SECTIONS
 
 	. = ASSERT(_end <= 0x8000, "Setup too big!");
 	. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+	/* Necessary for the very-old-loader check to work... */
+	. = ASSERT(__end_init <= 5*512, "init sections too big!");
+
 }
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0..167bc16 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
 always		:= wakeup.bin
 targets		:= wakeup.elf wakeup.lds
 
-wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
 
 # The link order of the video-*.o modules can matter.  In particular,
 # video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 0000000..f51eb0b
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 0000000..6206033
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
-- 
cgit v0.10.2


From df7699c56421c0476704f24a43409ac8c505f3d2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:13:46 -0700
Subject: x86, setup: "glove box" BIOS interrupts in the core boot code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the core boot code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8..64a31a6 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
 
 static void enable_a20_bios(void)
 {
-	asm volatile("pushfl; int $0x15; popfl"
-		     : : "a" ((u16)0x2401));
+	struct biosregs ireg;
+
+	initregs(&ireg);
+	ireg.ax = 0x2401;
+	intcall(0x15, &ireg, NULL);
 }
 
 static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415..140172b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
  */
 static void keyboard_set_repeat(void)
 {
-	u16 ax = 0x0305;
-	u16 bx = 0;
-	asm volatile("int $0x16"
-		     : "+a" (ax), "+b" (bx)
-		     : : "ecx", "edx", "esi", "edi");
+	struct biosregs ireg;
+	initregs(&ireg);
+	ireg.ax = 0x0305;
+	intcall(0x16, &ireg, NULL);
 }
 
 /*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+	struct biosregs ireg, oreg;
+
 	/* Some older BIOSes apparently crash on this call, so filter
 	   it from machines too old to have SpeedStep at all. */
 	if (cpu.level < 6)
 		return;
 
-	asm("int $0x15"
-	    : "=a" (boot_params.ist_info.signature),
-	      "=b" (boot_params.ist_info.command),
-	      "=c" (boot_params.ist_info.event),
-	      "=d" (boot_params.ist_info.perf_level)
-	    : "a" (0x0000e980),	 /* IST Support */
-	      "d" (0x47534943)); /* Request value */
+	initregs(&ireg);
+	ireg.ax  = 0xe980;	 /* IST Support */
+	ireg.edx = 0x47534943;	 /* Request value */
+	intcall(0x15, &ireg, &oreg);
+
+	boot_params.ist_info.signature  = oreg.eax;
+	boot_params.ist_info.command    = oreg.ebx;
+	boot_params.ist_info.event      = oreg.ecx;
+	boot_params.ist_info.perf_level = oreg.edx;
 }
 
 /*
@@ -93,13 +97,12 @@ static void query_ist(void)
 static void set_bios_mode(void)
 {
 #ifdef CONFIG_X86_64
-	u32 eax, ebx;
+	struct biosregs ireg;
 
-	eax = 0xec00;
-	ebx = 2;
-	asm volatile("int $0x15"
-		     : "+a" (eax), "+b" (ebx)
-		     : : "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ax = 0xec00;
+	ireg.bx = 2;
+	intcall(0x15, &ireg, NULL);
 #endif
 }
 
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 5054c2d..d989de8 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -25,12 +25,16 @@ struct e820_ext_entry {
 static int detect_memory_e820(void)
 {
 	int count = 0;
-	u32 next = 0;
-	u32 size, id, edi;
-	u8 err;
+	struct biosregs ireg, oreg;
 	struct e820entry *desc = boot_params.e820_map;
 	static struct e820_ext_entry buf; /* static so it is zeroed */
 
+	initregs(&ireg);
+	ireg.ax  = 0xe820;
+	ireg.cx  = sizeof buf;
+	ireg.edx = SMAP;
+	ireg.di  = (size_t)&buf;
+
 	/*
 	 * Set this here so that if the BIOS doesn't change this field
 	 * but still doesn't change %ecx, we're still okay...
@@ -38,22 +42,13 @@ static int detect_memory_e820(void)
 	buf.ext_flags = 1;
 
 	do {
-		size = sizeof buf;
-
-		/* Important: %edx and %esi are clobbered by some BIOSes,
-		   so they must be either used for the error output
-		   or explicitly marked clobbered.  Given that, assume there
-		   is something out there clobbering %ebp and %edi, too. */
-		asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
-		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-		      "=D" (edi), "+m" (buf)
-		    : "D" (&buf), "d" (SMAP), "a" (0xe820)
-		    : "esi");
+		intcall(0x15, &ireg, &oreg);
+		ireg.ebx = oreg.ebx; /* for next iteration... */
 
 		/* BIOSes which terminate the chain with CF = 1 as opposed
 		   to %ebx = 0 don't always report the SMAP signature on
 		   the final, failing, probe. */
-		if (err)
+		if (oreg.eflags & X86_EFLAGS_CF)
 			break;
 
 		/* Some BIOSes stop returning SMAP in the middle of
@@ -61,7 +56,7 @@ static int detect_memory_e820(void)
 		   screwed up the map at that point, we might have a
 		   partial map, the full map, or complete garbage, so
 		   just return failure. */
-		if (id != SMAP) {
+		if (oreg.eax != SMAP) {
 			count = 0;
 			break;
 		}
@@ -69,58 +64,62 @@ static int detect_memory_e820(void)
 		/* ACPI 3.0 added the extended flags support.  If bit 0
 		   in the extended flags is zero, we're supposed to simply
 		   ignore the entry -- a backwards incompatible change! */
-		if (size > 20 && !(buf.ext_flags & 1))
+		if (oreg.cx > 20 && !(buf.ext_flags & 1))
 			continue;
 
 		*desc++ = buf.std;
 		count++;
-	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
+	} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }
 
 static int detect_memory_e801(void)
 {
-	u16 ax, bx, cx, dx;
-	u8 err;
+	struct biosregs ireg, oreg;
 
-	bx = cx = dx = 0;
-	ax = 0xe801;
-	asm("stc; int $0x15; setc %0"
-	    : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+	initregs(&ireg);
+	ireg.ax = 0xe801;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err)
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;
 
 	/* Do we really need to do this? */
-	if (cx || dx) {
-		ax = cx;
-		bx = dx;
+	if (oreg.cx || oreg.dx) {
+		oreg.ax = oreg.cx;
+		oreg.bx = oreg.dx;
 	}
 
-	if (ax > 15*1024)
+	if (oreg.ax > 15*1024) {
 		return -1;	/* Bogus! */
-
-	/* This ignores memory above 16MB if we have a memory hole
-	   there.  If someone actually finds a machine with a memory
-	   hole at 16MB and no support for 0E820h they should probably
-	   generate a fake e820 map. */
-	boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+	} else if (oreg.ax == 15*1024) {
+		boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+	} else {
+		/*
+		 * This ignores memory above 16MB if we have a memory
+		 * hole there.  If someone actually finds a machine
+		 * with a memory hole at 16MB and no support for
+		 * 0E820h they should probably generate a fake e820
+		 * map.
+		 */
+		boot_params.alt_mem_k = oreg.ax;
+	}
 
 	return 0;
 }
 
 static int detect_memory_88(void)
 {
-	u16 ax;
-	u8 err;
+	struct biosregs ireg, oreg;
 
-	ax = 0x8800;
-	asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+	initregs(&ireg);
+	ireg.ah = 0x88;
+	intcall(0x15, &ireg, &oreg);
 
-	boot_params.screen_info.ext_mem_k = ax;
+	boot_params.screen_info.ext_mem_k = oreg.ax;
 
-	return -err;
+	return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 int detect_memory(void)
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b2..01ec69c 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
 
 void __attribute__((section(".inittext"))) putchar(int ch)
 {
-	unsigned char c = ch;
+	struct biosregs ireg;
 
-	if (c == '\n')
+	if (ch == '\n')
 		putchar('\r');	/* \n -> \r\n */
 
-	/* int $0x10 is known to have bugs involving touching registers
-	   it shouldn't.  Be extra conservative... */
-	asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
-		     : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+	initregs(&ireg);
+	ireg.bx = 0x0007;
+	ireg.cx = 0x0001;
+	ireg.ah = 0x0e;
+	ireg.al = ch;
+	intcall(0x10, &ireg, NULL);
 }
 
 void __attribute__((section(".inittext"))) puts(const char *str)
 {
-	int n = 0;
-	while (*str) {
+	while (*str)
 		putchar(*str++);
-		n++;
-	}
 }
 
 /*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
 
 static u8 gettime(void)
 {
-	u16 ax = 0x0200;
-	u16 cx, dx;
+	struct biosregs ireg, oreg;
 
-	asm volatile("int $0x1a"
-		     : "+a" (ax), "=c" (cx), "=d" (dx)
-		     : : "ebx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x02;
+	intcall(0x1a, &ireg, &oreg);
 
-	return dx >> 8;
+	return oreg.dh;
 }
 
 /*
@@ -64,19 +63,24 @@ static u8 gettime(void)
  */
 int getchar(void)
 {
-	u16 ax = 0;
-	asm volatile("int $0x16" : "+a" (ax));
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+	/* ireg.ah = 0x00; */
+	intcall(0x16, &ireg, &oreg);
 
-	return ax & 0xff;
+	return oreg.al;
 }
 
 static int kbd_pending(void)
 {
-	u8 pending;
-	asm volatile("int $0x16; setnz %0"
-		     : "=qm" (pending)
-		     : "a" (0x0100));
-	return pending;
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+	ireg.ah = 0x01;
+	intcall(0x16, &ireg, &oreg);
+
+	return !(oreg.eflags & X86_EFLAGS_ZF);
 }
 
 void kbd_flush(void)
-- 
cgit v0.10.2


From d54ea252e4c92357226992cf65d94616a96e6fce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:14:26 -0700
Subject: x86, setup: "glove box" BIOS interrupts in the APM code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the APM code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>

diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033..ee27483 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   Original APM BIOS checking by Stephen Rothwell, May 1994
  *   (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
 
 int query_apm_bios(void)
 {
-	u16 ax, bx, cx, dx, di;
-	u32 ebx, esi;
-	u8 err;
+	struct biosregs ireg, oreg;
 
 	/* APM BIOS installation check */
-	ax = 0x5300;
-	bx = cx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-		     : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-		     : : "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x53;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err)
+	if (oreg.flags & X86_EFLAGS_CF)
 		return -1;		/* No APM BIOS */
 
-	if (bx != 0x504d)	/* "PM" signature */
+	if (oreg.bx != 0x504d)		/* "PM" signature */
 		return -1;
 
-	if (!(cx & 0x02))		/* 32 bits supported? */
+	if (!(oreg.cx & 0x02))		/* 32 bits supported? */
 		return -1;
 
 	/* Disconnect first, just in case */
-	ax = 0x5304;
-	bx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-		     : "+a" (ax), "+b" (bx)
-		     : : "ecx", "edx", "esi", "edi");
-
-	/* Paranoia */
-	ebx = esi = 0;
-	cx = dx = di = 0;
+	ireg.al = 0x04;
+	intcall(0x15, &ireg, NULL);
 
 	/* 32-bit connect */
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
-		     : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
-		       "+S" (esi), "+D" (di), "=m" (err)
-		     : "a" (0x5303));
-
-	boot_params.apm_bios_info.cseg = ax;
-	boot_params.apm_bios_info.offset = ebx;
-	boot_params.apm_bios_info.cseg_16 = cx;
-	boot_params.apm_bios_info.dseg = dx;
-	boot_params.apm_bios_info.cseg_len = (u16)esi;
-	boot_params.apm_bios_info.cseg_16_len = esi >> 16;
-	boot_params.apm_bios_info.dseg_len = di;
-
-	if (err)
+	ireg.al = 0x03;
+	intcall(0x15, &ireg, &oreg);
+
+	boot_params.apm_bios_info.cseg        = oreg.ax;
+	boot_params.apm_bios_info.offset      = oreg.ebx;
+	boot_params.apm_bios_info.cseg_16     = oreg.cx;
+	boot_params.apm_bios_info.dseg        = oreg.dx;
+	boot_params.apm_bios_info.cseg_len    = oreg.si;
+	boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+	boot_params.apm_bios_info.dseg_len    = oreg.di;
+
+	if (oreg.flags & X86_EFLAGS_CF)
 		return -1;
 
 	/* Redo the installation check as the 32-bit connect;
 	   some BIOSes return different flags this way... */
 
-	ax = 0x5300;
-	bx = cx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-		     : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-		     : : "esi", "edi");
+	ireg.al = 0x00;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err || bx != 0x504d) {
+	if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
 		/* Failure with 32-bit connect, try to disconect and ignore */
-		ax = 0x5304;
-		bx = 0;
-		asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-			     : "+a" (ax), "+b" (bx)
-			     : : "ecx", "edx", "esi", "edi");
+		ireg.al = 0x04;
+		intcall(0x15, &ireg, NULL);
 		return -1;
 	}
 
-	boot_params.apm_bios_info.version = ax;
-	boot_params.apm_bios_info.flags = cx;
+	boot_params.apm_bios_info.version = oreg.ax;
+	boot_params.apm_bios_info.flags   = oreg.cx;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 3435d3476c5ed955d56a6216ed2d156847b3a575 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:17:17 -0700
Subject: x86, setup: "glove box" BIOS interrupts in the EDD code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the EDD code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3..c501a5b 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
  */
 static int read_mbr(u8 devno, void *buf)
 {
-	u16 ax, bx, cx, dx;
+	struct biosregs ireg, oreg;
 
-	ax = 0x0201;		/* Legacy Read, one sector */
-	cx = 0x0001;		/* Sector 0-0-1 */
-	dx = devno;
-	bx = (size_t)buf;
-	asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
-		     : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
-		     : : "esi", "edi", "memory");
+	initregs(&ireg);
+	ireg.ax = 0x0201;		/* Legacy Read, one sector */
+	ireg.cx = 0x0001;		/* Sector 0-0-1 */
+	ireg.dl = devno;
+	ireg.bx = (size_t)buf;
 
-	return -(u8)ax;		/* 0 or -1 */
+	intcall(0x13, &ireg, &oreg);
+
+	return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
 
 static int get_edd_info(u8 devno, struct edd_info *ei)
 {
-	u16 ax, bx, cx, dx, di;
+	struct biosregs ireg, oreg;
 
 	memset(ei, 0, sizeof *ei);
 
 	/* Check Extensions Present */
 
-	ax = 0x4100;
-	bx = EDDMAGIC1;
-	dx = devno;
-	asm("pushfl; stc; int $0x13; setc %%al; popfl"
-	    : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
-	    : : "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x41;
+	ireg.bx = EDDMAGIC1;
+	ireg.dl = devno;
+	intcall(0x13, &ireg, &oreg);
 
-	if ((u8)ax)
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;	/* No extended information */
 
-	if (bx != EDDMAGIC2)
+	if (oreg.bx != EDDMAGIC2)
 		return -1;
 
 	ei->device  = devno;
-	ei->version = ax >> 8;	/* EDD version number */
-	ei->interface_support = cx; /* EDD functionality subsets */
+	ei->version = oreg.ah;		 /* EDD version number */
+	ei->interface_support = oreg.cx; /* EDD functionality subsets */
 
 	/* Extended Get Device Parameters */
 
 	ei->params.length = sizeof(ei->params);
-	ax = 0x4800;
-	dx = devno;
-	asm("pushfl; int $0x13; popfl"
-	    : "+a" (ax), "+d" (dx), "=m" (ei->params)
-	    : "S" (&ei->params)
-	    : "ebx", "ecx", "edi");
+	ireg.ah = 0x48;
+	ireg.si = (size_t)&ei->params;
+	intcall(0x13, &ireg, &oreg);
 
 	/* Get legacy CHS parameters */
 
 	/* Ralf Brown recommends setting ES:DI to 0:0 */
-	ax = 0x0800;
-	dx = devno;
-	di = 0;
-	asm("pushw %%es; "
-	    "movw %%di,%%es; "
-	    "pushfl; stc; int $0x13; setc %%al; popfl; "
-	    "popw %%es"
-	    : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
-	    : : "esi");
-
-	if ((u8)ax == 0) {
-		ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
-		ei->legacy_max_head = dx >> 8;
-		ei->legacy_sectors_per_track = cx & 0x3f;
+	ireg.ah = 0x08;
+	ireg.es = 0;
+	intcall(0x13, &ireg, &oreg);
+
+	if (!(oreg.eflags & X86_EFLAGS_CF)) {
+		ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+		ei->legacy_max_head = oreg.dh;
+		ei->legacy_sectors_per_track = oreg.cl & 0x3f;
 	}
 
 	return 0;
-- 
cgit v0.10.2


From 0a706db320768f8f6e43bbf73b58d2aabdc93354 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:19:00 -0700
Subject: x86, setup: "glove box" BIOS interrupts in the MCA code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the MCA code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>

diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae..a95a531 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
 
 int query_mca(void)
 {
-	u8 err;
-	u16 es, bx, len;
-
-	asm("pushw %%es ; "
-	    "int $0x15 ; "
-	    "setc %0 ; "
-	    "movw %%es, %1 ; "
-	    "popw %%es"
-	    : "=acd" (err), "=acdSD" (es), "=b" (bx)
-	    : "a" (0xc000));
-
-	if (err)
+	struct biosregs ireg, oreg;
+	u16 len;
+
+	initregs(&ireg);
+	ireg.ah = 0xc0;
+	intcall(0x15, &ireg, &oreg);
+
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;	/* No MCA present */
 
-	set_fs(es);
-	len = rdfs16(bx);
+	set_fs(oreg.es);
+	len = rdfs16(oreg.bx);
 
 	if (len > sizeof(boot_params.sys_desc_table))
 		len = sizeof(boot_params.sys_desc_table);
 
-	copy_from_fs(&boot_params.sys_desc_table, bx, len);
+	copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
 	return 0;
 }
-- 
cgit v0.10.2


From cf06de7b9cdd3efee7a59dced1977b3c21d43732 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:20:11 -0700
Subject: x86, setup: "glove box" BIOS interrupts in the video code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the video code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rjw@sisk.pl>

diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c..d660be4 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
 
 static int set_bios_mode(u8 mode)
 {
-	u16 ax;
+	struct biosregs ireg, oreg;
 	u8 new_mode;
 
-	ax = mode;		/* AH=0x00 Set Video Mode */
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.al = mode;		/* AH=0x00 Set Video Mode */
+	intcall(0x10, &ireg, NULL);
 
-	ax = 0x0f00;		/* Get Current Video Mode */
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+
+	ireg.ah = 0x0f;		/* Get Current Video Mode */
+	intcall(0x10, &ireg, &oreg);
 
 	do_restore = 1;		/* Assume video contents were lost */
-	new_mode = ax & 0x7f;	/* Not all BIOSes are clean with the top bit */
+
+	/* Not all BIOSes are clean with the top bit */
+	new_mode = ireg.al & 0x7f;
 
 	if (new_mode == mode)
 		return 0;	/* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
 		/* Mode setting failed, but we didn't end up where we
 		   started.  That's bad.  Try to revert to the original
 		   video mode. */
-		ax = boot_params.screen_info.orig_video_mode;
-		asm volatile(INT10
-			     : "+a" (ax)
-			     : : "ebx", "ecx", "edx", "esi", "edi");
+		ireg.ax = boot_params.screen_info.orig_video_mode;
+		intcall(0x10, &ireg, NULL);
 	}
 #endif
 	return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8c..c700147 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
 static int vesa_probe(void)
 {
 #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
-	u16 ax, cx, di;
+	struct biosregs ireg, oreg;
 	u16 mode;
 	addr_t mode_ptr;
 	struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
 
 	video_vesa.modes = GET_HEAP(struct mode_info, 0);
 
-	ax = 0x4f00;
-	di = (size_t)&vginfo;
-	asm(INT10
-	    : "+a" (ax), "+D" (di), "=m" (vginfo)
-	    : : "ebx", "ecx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f00;
+	ireg.di = (size_t)&vginfo;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f ||
+	if (ireg.ax != 0x004f ||
 	    vginfo.signature != VESA_MAGIC ||
 	    vginfo.version < 0x0102)
 		return 0;	/* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
 
 		memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-		ax = 0x4f01;
-		cx = mode;
-		di = (size_t)&vminfo;
-		asm(INT10
-		    : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-		    : : "ebx", "edx", "esi");
+		ireg.ax = 0x4f01;
+		ireg.cx = mode;
+		ireg.di = (size_t)&vminfo;
+		intcall(0x10, &ireg, &oreg);
 
-		if (ax != 0x004f)
+		if (ireg.ax != 0x004f)
 			continue;
 
 		if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
 
 static int vesa_set_mode(struct mode_info *mode)
 {
-	u16 ax, bx, cx, di;
+	struct biosregs ireg, oreg;
 	int is_graphic;
 	u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
 
 	memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-	ax = 0x4f01;
-	cx = vesa_mode;
-	di = (size_t)&vminfo;
-	asm(INT10
-	    : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-	    : : "ebx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f01;
+	ireg.cx = vesa_mode;
+	ireg.di = (size_t)&vminfo;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return -1;
 
 	if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
 	}
 
 
-	ax = 0x4f02;
-	bx = vesa_mode;
-	di = 0;
-	asm volatile(INT10
-		     : "+a" (ax), "+b" (bx), "+D" (di)
-		     : : "ecx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f02;
+	ireg.bx = vesa_mode;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return -1;
 
 	graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
 /* Switch DAC to 8-bit mode */
 static void vesa_dac_set_8bits(void)
 {
+	struct biosregs ireg, oreg;
 	u8 dac_size = 6;
 
 	/* If possible, switch the DAC to 8-bit mode */
 	if (vginfo.capabilities & 1) {
-		u16 ax, bx;
-
-		ax = 0x4f08;
-		bx = 0x0800;
-		asm volatile(INT10
-			     : "+a" (ax), "+b" (bx)
-			     : : "ecx", "edx", "esi", "edi");
-
-		if (ax == 0x004f)
-			dac_size = bx >> 8;
+		initregs(&ireg);
+		ireg.ax = 0x4f08;
+		ireg.bh = 0x08;
+		intcall(0x10, &ireg, &oreg);
+		if (oreg.ax == 0x004f)
+			dac_size = oreg.bh;
 	}
 
 	/* Set the color sizes to the DAC size, and offsets to 0 */
-	boot_params.screen_info.red_size = dac_size;
+	boot_params.screen_info.red_size   = dac_size;
 	boot_params.screen_info.green_size = dac_size;
-	boot_params.screen_info.blue_size = dac_size;
-	boot_params.screen_info.rsvd_size = dac_size;
+	boot_params.screen_info.blue_size  = dac_size;
+	boot_params.screen_info.rsvd_size  = dac_size;
 
-	boot_params.screen_info.red_pos = 0;
-	boot_params.screen_info.green_pos = 0;
-	boot_params.screen_info.blue_pos = 0;
-	boot_params.screen_info.rsvd_pos = 0;
+	boot_params.screen_info.red_pos    = 0;
+	boot_params.screen_info.green_pos  = 0;
+	boot_params.screen_info.blue_pos   = 0;
+	boot_params.screen_info.rsvd_pos   = 0;
 }
 
 /* Save the VESA protected mode info */
 static void vesa_store_pm_info(void)
 {
-	u16 ax, bx, di, es;
+	struct biosregs ireg, oreg;
 
-	ax = 0x4f0a;
-	bx = di = 0;
-	asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
-	    : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
-	    : : "ecx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f0a;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return;
 
-	boot_params.screen_info.vesapm_seg = es;
-	boot_params.screen_info.vesapm_off = di;
+	boot_params.screen_info.vesapm_seg = oreg.es;
+	boot_params.screen_info.vesapm_off = oreg.di;
 }
 
 /*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
 void vesa_store_edid(void)
 {
 #ifdef CONFIG_FIRMWARE_EDID
-	u16 ax, bx, cx, dx, di;
+	struct biosregs ireg, oreg;
 
 	/* Apparently used as a nonsense token... */
 	memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
 	if (vginfo.version < 0x0200)
 		return;		/* EDID requires VBE 2.0+ */
 
-	ax = 0x4f15;		/* VBE DDC */
-	bx = 0x0000;		/* Report DDC capabilities */
-	cx = 0;			/* Controller 0 */
-	di = 0;			/* ES:DI must be 0 by spec */
-
-	/* Note: The VBE DDC spec is different from the main VESA spec;
-	   we genuinely have to assume all registers are destroyed here. */
-
-	asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
-	    : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
-	    : : "esi", "edx");
+	initregs(&ireg);
+	ireg.ax = 0x4f15;		/* VBE DDC */
+	/* ireg.bx = 0x0000; */		/* Report DDC capabilities */
+	/* ireg.cx = 0;	*/		/* Controller 0 */
+	ireg.es = 0;			/* ES:DI must be 0 by spec */
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return;		/* No EDID */
 
 	/* BH = time in seconds to transfer EDD information */
 	/* BL = DDC level supported */
 
-	ax = 0x4f15;		/* VBE DDC */
-	bx = 0x0001;		/* Read EDID */
-	cx = 0;			/* Controller 0 */
-	dx = 0;			/* EDID block number */
-	di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
-	asm(INT10
-	    : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
-	      "+c" (cx), "+D" (di)
-	    : : "esi");
+	ireg.ax = 0x4f15;		/* VBE DDC */
+	ireg.bx = 0x0001;		/* Read EDID */
+	/* ireg.cx = 0; */		/* Controller 0 */
+	/* ireg.dx = 0;	*/		/* EDID block number */
+	ireg.es = ds();
+	ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+	intcall(0x10, &ireg, &oreg);
 #endif /* CONFIG_FIRMWARE_EDID */
 }
 
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a..8f8d827 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
 /* Set basic 80x25 mode */
 static u8 vga_set_basic_mode(void)
 {
+	struct biosregs ireg, oreg;
 	u16 ax;
 	u8 rows;
 	u8 mode;
 
+	initregs(&ireg);
+
 #ifdef CONFIG_VIDEO_400_HACK
 	if (adapter >= ADAPTER_VGA) {
-		asm volatile(INT10
-			     : : "a" (0x1202), "b" (0x0030)
-			     : "ecx", "edx", "esi", "edi");
+		ireg.ax = 0x1202;
+		ireg.bx = 0x0030;
+		intcall(0x10, &ireg, NULL);
 	}
 #endif
 
 	ax = 0x0f00;
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
-
-	mode = (u8)ax;
+	intcall(0x10, &ireg, &oreg);
+	mode = oreg.al;
 
 	set_fs(0);
 	rows = rdfs8(0x484);	/* rows minus one */
 
 #ifndef CONFIG_VIDEO_400_HACK
-	if ((ax == 0x5003 || ax == 0x5007) &&
+	if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
 	    (rows == 0 || rows == 24))
 		return mode;
 #endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
 		mode = 3;
 
 	/* Set the mode */
-	ax = mode;
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+	ireg.ax = mode;		/* AH=0: set mode */
+	intcall(0x10, &ireg, NULL);
 	do_restore = 1;
 	return mode;
 }
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
 static void vga_set_8font(void)
 {
 	/* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 8x8 font */
-	asm volatile(INT10 : : "a" (0x1112), "b" (0));
+	ireg.ax = 0x1112;
+	/* ireg.bl = 0; */
+	intcall(0x10, &ireg, NULL);
 
 	/* Use alternate print screen */
-	asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+	ireg.ax = 0x1200;
+	ireg.bl = 0x20;
+	intcall(0x10, &ireg, NULL);
 
 	/* Turn off cursor emulation */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x34;
+	intcall(0x10, &ireg, NULL);
 
 	/* Cursor is scan lines 6-7 */
-	asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+	ireg.ax = 0x0100;
+	ireg.cx = 0x0607;
+	intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_14font(void)
 {
 	/* Set 9x14 font - 80x28 on VGA */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 9x14 font */
-	asm volatile(INT10 : : "a" (0x1111), "b" (0));
+	ireg.ax = 0x1111;
+	/* ireg.bl = 0; */
+	intcall(0x10, &ireg, NULL);
 
 	/* Turn off cursor emulation */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x34;
+	intcall(0x10, &ireg, NULL);
 
 	/* Cursor is scan lines 11-12 */
-	asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+	ireg.ax = 0x0100;
+	ireg.cx = 0x0b0c;
+	intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_80x43(void)
 {
 	/* Set 80x43 mode on VGA (not EGA) */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 350 scans */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x30;
+	intcall(0x10, &ireg, NULL);
 
 	/* Reset video mode */
-	asm volatile(INT10 : : "a" (0x0003));
+	ireg.ax = 0x0003;
+	intcall(0x10, &ireg, NULL);
 
 	vga_set_8font();
 }
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
  */
 static int vga_probe(void)
 {
-	u16 ega_bx;
-
 	static const char *card_name[] = {
 		"CGA/MDA/HGC", "EGA", "VGA"
 	};
@@ -240,26 +263,26 @@ static int vga_probe(void)
 		sizeof(ega_modes)/sizeof(struct mode_info),
 		sizeof(vga_modes)/sizeof(struct mode_info),
 	};
-	u8 vga_flag;
 
-	asm(INT10
-	    : "=b" (ega_bx)
-	    : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
-	    : "ecx", "edx", "esi", "edi");
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+
+	ireg.ax = 0x1200;
+	ireg.bl = 0x10;		/* Check EGA/VGA */
+	intcall(0x10, &ireg, &oreg);
 
 #ifndef _WAKEUP
-	boot_params.screen_info.orig_video_ega_bx = ega_bx;
+	boot_params.screen_info.orig_video_ega_bx = oreg.bx;
 #endif
 
 	/* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
-	if ((u8)ega_bx != 0x10) {
+	if (oreg.bl != 0x10) {
 		/* EGA/VGA */
-		asm(INT10
-		    : "=a" (vga_flag)
-		    : "a" (0x1a00)
-		    : "ebx", "ecx", "edx", "esi", "edi");
+		ireg.ax = 0x1a00;
+		intcall(0x10, &ireg, &oreg);
 
-		if (vga_flag == 0x1a) {
+		if (oreg.al == 0x1a) {
 			adapter = ADAPTER_VGA;
 #ifndef _WAKEUP
 			boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1..bad728b 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
 
 static void store_cursor_position(void)
 {
-	u16 curpos;
-	u16 ax, bx;
+	struct biosregs ireg, oreg;
 
-	ax = 0x0300;
-	bx = 0;
-	asm(INT10
-	    : "=d" (curpos), "+a" (ax), "+b" (bx)
-	    : : "ecx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x03;
+	intcall(0x10, &ireg, &oreg);
 
-	boot_params.screen_info.orig_x = curpos;
-	boot_params.screen_info.orig_y = curpos >> 8;
+	boot_params.screen_info.orig_x = oreg.dl;
+	boot_params.screen_info.orig_y = oreg.dh;
 }
 
 static void store_video_mode(void)
 {
-	u16 ax, page;
+	struct biosregs ireg, oreg;
 
 	/* N.B.: the saving of the video page here is a bit silly,
 	   since we pretty much assume page 0 everywhere. */
-	ax = 0x0f00;
-	asm(INT10
-	    : "+a" (ax), "=b" (page)
-	    : : "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x0f;
+	intcall(0x10, &ireg, &oreg);
 
 	/* Not all BIOSes are clean with respect to the top bit */
-	boot_params.screen_info.orig_video_mode = ax & 0x7f;
-	boot_params.screen_info.orig_video_page = page >> 8;
+	boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+	boot_params.screen_info.orig_video_page = oreg.bh;
 }
 
 /*
@@ -257,7 +254,7 @@ static void restore_screen(void)
 	int y;
 	addr_t dst = 0;
 	u16 *src = saved.data;
-	u16 ax, bx, dx;
+	struct biosregs ireg;
 
 	if (graphic_mode)
 		return;		/* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
 	}
 
 	/* Restore cursor position */
-	ax = 0x0200;		/* Set cursor position */
-	bx = 0;			/* Page number (<< 8) */
-	dx = (saved.cury << 8)+saved.curx;
-	asm volatile(INT10
-		     : "+a" (ax), "+b" (bx), "+d" (dx)
-		     : : "ecx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x02;		/* Set cursor position */
+	ireg.dh = saved.cury;
+	ireg.dl = saved.curx;
+	intcall(0x10, &ireg, NULL);
 }
 #else
 #define save_screen()		((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d..5bb174a 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y;	/* Don't query the BIOS for cols/rows */
 extern int do_restore;		/* Restore screen contents */
 extern int graphic_mode;	/* Graphics mode with linear frame buffer */
 
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling.  There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
 /* Accessing VGA indexed registers */
 static inline u8 in_idx(u16 port, u8 index)
 {
-- 
cgit v0.10.2


From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 6 Apr 2009 01:49:33 +0200
Subject: tracing/lockdep: report the time waited for a lock

While trying to optimize the new lock on reiserfs to replace
the bkl, I find the lock tracing very useful though it lacks
something important for performance (and latency) instrumentation:
the time a task waits for a lock.

That's what this patch implements:

  bash-4816  [000]   202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key
  bash-4816  [000]   202.652819: lock_acquired: &rq->lock (0.000 us)
 <...>-4787  [000]   202.652825: lock_acquired: &rq->lock (0.000 us)
 <...>-4787  [000]   202.652829: lock_acquired: &rq->lock (0.000 us)
  bash-4816  [000]   202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us)

As shown above, the "lock acquired" field is followed by the time
it has been waiting for the lock. Usually, a lock contended entry
is followed by a near lock_acquired entry with a non-zero time waited.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
index adccfcd..863f1e4 100644
--- a/include/trace/lockdep_event_types.h
+++ b/include/trace/lockdep_event_types.h
@@ -32,11 +32,24 @@ TRACE_FORMAT(lock_contended,
 	TP_FMT("%s", lock->name)
 	);
 
-TRACE_FORMAT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__entry->name = lock->name;
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
 
 #endif
 #endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b0f0118..c4582a6 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3061,6 +3061,8 @@ found_it:
 	put_lock_stats(stats);
 }
 
+DEFINE_TRACE(lock_acquired);
+
 static void
 __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
@@ -3099,6 +3101,8 @@ found_it:
 		hlock->holdtime_stamp = now;
 	}
 
+	trace_lock_acquired(lock, ip, waittime);
+
 	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
 		if (hlock->read)
@@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-DEFINE_TRACE(lock_acquired);
-
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
-	trace_lock_acquired(lock, ip);
-
 	if (unlikely(!lock_stat))
 		return;
 
-- 
cgit v0.10.2


From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 14:30:04 -0400
Subject: x86, function-graph: only save return values on x86_64

Impact: speed up

The return to handler portion of the function graph tracer should only
need to save the return values. The caller already saved off the
registers that the callee can modify. The returning function already
saved the registers it modified. When we call our own trace function
it too will save the registers that the callee must restore.

There's no reason to save off anything more that the registers used
to return the values.

Note, I did a complete kernel build with this modification and the
function graph tracer running on x86_64.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec3..1ac9986 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
 GLOBAL(return_to_handler)
 	subq  $80, %rsp
 
+	/* Save the return values */
 	movq %rax, (%rsp)
-	movq %rcx, 8(%rsp)
-	movq %rdx, 16(%rsp)
-	movq %rsi, 24(%rsp)
-	movq %rdi, 32(%rsp)
-	movq %r8, 40(%rsp)
-	movq %r9, 48(%rsp)
-	movq %r10, 56(%rsp)
-	movq %r11, 64(%rsp)
+	movq %rdx, 8(%rsp)
 
 	call ftrace_return_to_handler
 
 	movq %rax, 72(%rsp)
-	movq 64(%rsp), %r11
-	movq 56(%rsp), %r10
-	movq 48(%rsp), %r9
-	movq 40(%rsp), %r8
-	movq 32(%rsp), %rdi
-	movq 24(%rsp), %rsi
-	movq 16(%rsp), %rdx
-	movq 8(%rsp), %rcx
+	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $72, %rsp
 	retq
-- 
cgit v0.10.2


From 5cb3d1d9d34ac04bcaa2034139345b2a5fea54c1 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 9 Apr 2009 14:08:18 +0800
Subject: tracing, net, skb tracepoint: make skb tracepoint use the
 TRACE_EVENT() macro

TRACE_EVENT is a more generic way to define a tracepoint.
Doing so adds these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DD90D2.5020604@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/skb.h b/include/trace/skb.h
index b66206d..d2de717 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -4,8 +4,6 @@
 #include <linux/skbuff.h>
 #include <linux/tracepoint.h>
 
-DECLARE_TRACE(kfree_skb,
-	TP_PROTO(struct sk_buff *skb, void *location),
-	TP_ARGS(skb, location));
+#include <trace/skb_event_types.h>
 
 #endif
diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h
new file mode 100644
index 0000000..4a1c504
--- /dev/null
+++ b/include/trace/skb_event_types.h
@@ -0,0 +1,38 @@
+
+/* use <trace/skb.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index df56f56..33b6bfc 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -3,3 +3,4 @@
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
+#include <trace/skb_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index fd13750..0e2aa80 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -3,3 +3,4 @@
 #include <trace/sched.h>
 #include <trace/irq.h>
 #include <trace/lockdep.h>
+#include <trace/skb.h>
-- 
cgit v0.10.2


From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:05:10 +0200
Subject: x86: cacheinfo: use L3 cache index disable feature only for CPUs that
 support it

AMD family 0x11 CPU doesn't support the feature.

Some AMD family 0x10 CPUs do not support it or have an erratum, see
erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322
Rev. 3.40 February 2009".

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
CC: Mark Langsdorf <mark.langsdorf@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409130510.GG31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda9..7240126 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
 	if (index < 3)
 		return;
+
+	if (boot_cpu_data.x86 == 0x11)
+		return;
+
+	/* see erratum #382 */
+	if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+		return;
+
 	this_leaf->can_disable = 1;
 }
 
-- 
cgit v0.10.2


From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:07:29 +0200
Subject: x86: cacheinfo: correct return value when cache_disable feature is
 not active

Impact: bug fix

If user writes to "cache_disable" attribute on a CPU that does not support
this feature, the process hangs due to an invalid return value in
store_cache_disable().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409130729.GH31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7240126..1ab46e0 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
 	unsigned int ret, index, val;
 
 	if (!this_leaf->can_disable)
-		return 0;
+		return -EINVAL;
 
 	if (strlen(buf) > 15)
 		return -EINVAL;
-- 
cgit v0.10.2


From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:16:17 +0200
Subject: x86: cacheinfo: use cached K8 NB_MISC devices instead of scanning for
 it

Impact: avoid code duplication

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409131617.GI31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc5..c23b3d17 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,12 @@ extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
 extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
+#ifdef CONFIG_K8_NB
+#define node_to_k8_nb_misc(node) \
+	(node < num_k8_northbridges) ? k8_northbridges[node] : NULL
+#else
+#define node_to_k8_nb_misc(node) NULL
+#endif
+
+
 #endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ab46e0..0cde071 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
 
 #include <asm/processor.h>
 #include <asm/smp.h>
+#include <asm/k8.h>
 
 #define LVL_1_INST	1
 #define LVL_1_DATA	2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
 	unsigned long can_disable;
 };
 
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
-	{}
-};
-#endif
-
 unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
 #define to_object(k)	container_of(k, struct _index_kobject, kobj)
 #define to_attr(a)	container_of(a, struct _cache_attr, attr)
 
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
-	struct pci_dev *dev = NULL;
-	int i;
-
-	for (i = 0; i <= node; i++) {
-		do {
-			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-			if (!dev)
-				break;
-		} while (!pci_match_id(&k8_nb_id[0], dev));
-		if (!dev)
-			break;
-	}
-	return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
-	return NULL;
-}
-#endif
-
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 {
 	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
@@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 	if (!this_leaf->can_disable)
 		return sprintf(buf, "Feature not enabled\n");
 
-	dev = get_k8_northbridge(node);
+	dev = node_to_k8_nb_misc(node);
 	if (!dev) {
 		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
@@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
 		return -EINVAL;
 
 	val |= 0xc0000000;
-	dev = get_k8_northbridge(node);
+	dev = node_to_k8_nb_misc(node);
 	if (!dev) {
 		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
-- 
cgit v0.10.2


From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:18:49 +0200
Subject: x86: cacheinfo: replace sysfs interface for cache_disable feature

Impact: replace sysfs attribute

Current interface violates against "one-value-per-sysfs-attribute
rule". This patch replaces current attribute with two attributes --
one for each L3 Cache Index Disable register.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409131849.GJ31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 0cde071..fc28291 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -697,73 +697,69 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
 #define to_object(k)	container_of(k, struct _index_kobject, kobj)
 #define to_attr(a)	container_of(a, struct _cache_attr, attr)
 
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+				  unsigned int index)
 {
-	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-	int node = cpu_to_node(cpumask_first(mask));
-	struct pci_dev *dev = NULL;
-	ssize_t ret = 0;
-	int i;
+	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+	int node = cpu_to_node(cpu);
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	unsigned int reg = 0;
 
 	if (!this_leaf->can_disable)
-		return sprintf(buf, "Feature not enabled\n");
-
-	dev = node_to_k8_nb_misc(node);
-	if (!dev) {
-		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
-	}
 
-	for (i = 0; i < 2; i++) {
-		unsigned int reg;
+	if (!dev)
+		return -EINVAL;
 
-		pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	return sprintf(buf, "%x\n", reg);
+}
 
-		ret += sprintf(buf, "%sEntry: %d\n", buf, i);
-		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
-			buf,
-			reg & 0x80000000 ? "Disabled" : "Allowed",
-			reg & 0x40000000 ? "Disabled" : "Allowed");
-		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
-			buf, (reg & 0x30000) >> 16, reg & 0xfff);
-	}
-	return ret;
+#define SHOW_CACHE_DISABLE(index)					\
+static ssize_t								\
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)  	\
+{									\
+	return show_cache_disable(this_leaf, buf, index);		\
 }
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
 
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
-		    size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+	const char *buf, size_t count, unsigned int index)
 {
-	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-	int node = cpu_to_node(cpumask_first(mask));
-	struct pci_dev *dev = NULL;
-	unsigned int ret, index, val;
+	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+	int node = cpu_to_node(cpu);
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	unsigned long val = 0;
 
 	if (!this_leaf->can_disable)
 		return -EINVAL;
 
-	if (strlen(buf) > 15)
-		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 
-	ret = sscanf(buf, "%x %x", &index, &val);
-	if (ret != 2)
-		return -EINVAL;
-	if (index > 1)
+	if (!dev)
 		return -EINVAL;
 
-	val |= 0xc0000000;
-	dev = node_to_k8_nb_misc(node);
-	if (!dev) {
-		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+	if (strict_strtoul(buf, 10, &val) < 0)
 		return -EINVAL;
-	}
 
+	val |= 0xc0000000;
 	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
 	wbinvd();
 	pci_write_config_dword(dev, 0x1BC + index * 4, val);
+	return count;
+}
 
-	return 1;
+#define STORE_CACHE_DISABLE(index)					\
+static ssize_t								\
+store_cache_disable_##index(struct _cpuid4_info *this_leaf,	     	\
+			    const char *buf, size_t count)		\
+{									\
+	return store_cache_disable(this_leaf, buf, count, index);	\
 }
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
 
 struct _cache_attr {
 	struct attribute attr;
@@ -785,7 +781,10 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+		show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+		show_cache_disable_1, store_cache_disable_1);
 
 static struct attribute * default_attrs[] = {
 	&type.attr,
@@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = {
 	&size.attr,
 	&shared_cpu_map.attr,
 	&shared_cpu_list.attr,
-	&cache_disable.attr,
+	&cache_disable_0.attr,
+	&cache_disable_1.attr,
 	NULL
 };
 
-- 
cgit v0.10.2


From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:24:06 +0200
Subject: x86: cacheinfo: disable L3 ECC scrubbing when L3 cache index is
 disabled

(Use correct mask to zero out bits 24-28 by Andreas)

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409132406.GK31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc28291..d46a849 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	int node = cpu_to_node(cpu);
 	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
+	unsigned int scrubber = 0;
 
 	if (!this_leaf->can_disable)
 		return -EINVAL;
@@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 		return -EINVAL;
 
 	val |= 0xc0000000;
+
+	pci_read_config_dword(dev, 0x58, &scrubber);
+	scrubber &= ~0x1f000000;
+	pci_write_config_dword(dev, 0x58, scrubber);
+
 	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
 	wbinvd();
 	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-- 
cgit v0.10.2


From 2fad2d9bb8310889f3261035b594b4e068b6eb8b Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:31:53 +0200
Subject: x86/docs: add description for cache_disable sysfs interface

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409133153.GL31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644
index 0000000..175bb4f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
+What:      /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
+Date:      August 2008
+KernelVersion:	2.6.27
+Contact:	mark.langsdorf@amd.com
+Description:	These files exist in every cpu's cache index directories.
+		There are currently 2 cache_disable_# files in each
+		directory.  Reading from these files on a supported
+		processor will return that cache disable index value
+		for that processor and node.  Writing to one of these
+		files will cause the specificed cache index to be disabled.
+
+		Currently, only AMD Family 10h Processors support cache index
+		disable, and only for their L3 caches.  See the BIOS and
+		Kernel Developer's Guide at
+		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
+		for formatting information and other details on the
+		cache index disable.
+Users:    joachim.deguara@amd.com
-- 
cgit v0.10.2


From f465145235313c451164bdfa9037ac254bf00c9a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:18 +0300
Subject: x86: move x86_quirk_pre_intr_init() to irqinit_32.c

Impact: cleanup

In preparation for unifying irqinit_{32,64}.c, make
x86_quirk_pre_intr_init() local to irqinit_32.c.

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c..58d7091 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
 extern void mask_8259A(void);
 extern void unmask_8259A(void);
 
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
 #endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada..4093d1e 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
 	int (*setup_ioapic_ids)(void);
 };
 
-extern void x86_quirk_pre_intr_init(void);
 extern void x86_quirk_intr_init(void);
 
 extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8..0c0dedc 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -53,7 +53,7 @@ static struct irqaction fpu_irq = {
 	.name = "fpu",
 };
 
-void __init init_ISA_irqs(void)
+static void __init init_ISA_irqs(void)
 {
 	int i;
 
@@ -121,6 +121,24 @@ int vector_used_by_percpu_irq(unsigned int vector)
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+
 void __init native_init_IRQ(void)
 {
 	int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b415843..523bb69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -997,24 +997,6 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_32
 
 /**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-
-/**
  * x86_quirk_intr_init - post gate setup interrupt initialisation
  *
  * Description:
-- 
cgit v0.10.2


From 7371d9fcb88dc9185be9719f64744a339c537a92 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:19 +0300
Subject: x86: move init_ISA_irqs() in irqinit_32.c to match ordering in
 irqinit_64.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 0c0dedc..c5cb769 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -53,30 +53,6 @@ static struct irqaction fpu_irq = {
 	.name = "fpu",
 };
 
-static void __init init_ISA_irqs(void)
-{
-	int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		desc->status = IRQ_DISABLED;
-		desc->action = NULL;
-		desc->depth = 1;
-
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
@@ -118,6 +94,30 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
+static void __init init_ISA_irqs(void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
+
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
-- 
cgit v0.10.2


From 36290d87f5abf260a543e5b711be4ceed03e6b1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:20 +0300
Subject: x86: introduce smp_intr_init() in irqinit_32.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index c5cb769..df0aad5 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -121,6 +121,38 @@ static void __init init_ISA_irqs(void)
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
+static void __init smp_intr_init(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* IPI for single call function */
+	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+				 call_function_single_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+#endif
+}
+
 /**
  * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
  *
@@ -158,34 +190,7 @@ void __init native_init_IRQ(void)
 	}
 
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for single call function */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				 call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
+	smp_intr_init();
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
-- 
cgit v0.10.2


From 22813c45228160b07244a7c4ed7580388ac0f33d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:21 +0300
Subject: x86: introduce apic_intr_init() in irqinit_32.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index df0aad5..9ba68c4 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -171,25 +171,8 @@ static void __init x86_quirk_pre_intr_init(void)
 	init_ISA_irqs();
 }
 
-void __init native_init_IRQ(void)
+static void __init apic_intr_init(void)
 {
-	int i;
-
-	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-	}
-
-
 	smp_intr_init();
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -208,6 +191,27 @@ void __init native_init_IRQ(void)
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+}
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* Execute any quirks before the call gates are initialised: */
+	x86_quirk_pre_intr_init();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+	}
+
+	apic_intr_init();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
-- 
cgit v0.10.2


From d3496c85cae22fb7713af6ed542a6aeae8ee4210 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:22 +0300
Subject: x86: use identical loop constructs in 32-bit and 64-bit
 native_init_IRQ()

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9ba68c4..1029a18 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -205,7 +205,7 @@ void __init native_init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 		/* SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd1053..1c8858b 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -159,15 +159,16 @@ void __init native_init_IRQ(void)
 	int i;
 
 	init_ISA_irqs();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != IA32_SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 	}
 
 	apic_intr_init();
-- 
cgit v0.10.2


From b0096bb0b640d0a7713618b3472fd0f4adf30a96 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:23 +0300
Subject: x86: unify smp_intr_init() in irqinit_{32,64}.h

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1029a18..ef2528d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -123,7 +123,8 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
 static void __init smp_intr_init(void)
 {
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
@@ -143,14 +144,15 @@ static void __init smp_intr_init(void)
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 
-	/* IPI for single call function */
+	/* IPI for generic single function call */
 	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				 call_function_single_interrupt);
+			call_function_single_interrupt);
 
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+#endif /* CONFIG_SMP */
 }
 
 /**
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1c8858b..9e7c57d 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -107,6 +107,7 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 static void __init smp_intr_init(void)
 {
 #ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
@@ -134,6 +135,7 @@ static void __init smp_intr_init(void)
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+#endif /* CONFIG_SMP */
 }
 
 static void __init apic_intr_init(void)
-- 
cgit v0.10.2


From 598c73d250ffb112715aa48fb325d79e255be23b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:24 +0300
Subject: x86: unify init_ISA_irqs() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index ef2528d..4488b71 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -98,7 +98,7 @@ static void __init init_ISA_irqs(void)
 {
 	int i;
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	init_bsp_APIC();
 #endif
 	init_8259A(0);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 9e7c57d..61c9a92 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -84,9 +84,14 @@ static void __init init_ISA_irqs(void)
 {
 	int i;
 
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	init_bsp_APIC();
+#endif
 	init_8259A(0);
 
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
 	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
@@ -94,11 +99,8 @@ static void __init init_ISA_irqs(void)
 		desc->action = NULL;
 		desc->depth = 1;
 
-		/*
-		 * 16 old-style INTA-cycle interrupts:
-		 */
 		set_irq_chip_and_handler_name(i, &i8259A_chip,
-						      handle_level_irq, "XT");
+					      handle_level_irq, "XT");
 	}
 }
 
-- 
cgit v0.10.2


From 320fd99672a44ece6d1cd0d838ba31c8ebbf5979 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:25 +0300
Subject: x86: unify native_init_IRQ() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 4488b71..a780de3 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -22,7 +22,7 @@
 #include <asm/i8259.h>
 #include <asm/traps.h>
 
-
+#ifdef CONFIG_X86_32
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
  * as the irq is unreliable, and exception 16 works correctly
@@ -52,6 +52,7 @@ static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
 	.name = "fpu",
 };
+#endif
 
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
@@ -155,24 +156,6 @@ static void __init smp_intr_init(void)
 #endif /* CONFIG_SMP */
 }
 
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-
 static void __init apic_intr_init(void)
 {
 	smp_intr_init();
@@ -195,12 +178,36 @@ static void __init apic_intr_init(void)
 #endif
 }
 
+#ifdef CONFIG_X86_32
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+#endif
+
 void __init native_init_IRQ(void)
 {
 	int i;
 
+#ifdef CONFIG_X86_32
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
+#else
+	init_ISA_irqs();
+#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
@@ -208,9 +215,15 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+#ifdef CONFIG_X86_32
 		/* SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#else
+		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != IA32_SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#endif
 	}
 
 	apic_intr_init();
@@ -218,6 +231,7 @@ void __init native_init_IRQ(void)
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Call quirks after call gates are initialised (usually add in
 	 * the architecture specific gates):
@@ -232,4 +246,5 @@ void __init native_init_IRQ(void)
 		setup_irq(FPU_IRQ, &fpu_irq);
 
 	irq_ctx_init(smp_processor_id());
+#endif
 }
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 61c9a92..ed50e35 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -39,14 +39,46 @@
  * (these are usually mapped into the 0x30-0xff vector range)
  */
 
+#ifdef CONFIG_X86_32
 /*
- * IRQ2 is cascade interrupt to second interrupt controller
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	outb(0, 0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
  */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.name = "fpu",
+};
+#endif
 
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
 static struct irqaction irq2 = {
 	.handler = no_action,
 	.name = "cascade",
 };
+
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[0 ... IRQ0_VECTOR - 1] = -1,
 	[IRQ0_VECTOR] = 0,
@@ -158,11 +190,36 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 }
 
+#ifdef CONFIG_X86_32
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+#endif
+
 void __init native_init_IRQ(void)
 {
 	int i;
 
+#ifdef CONFIG_X86_32
+	/* Execute any quirks before the call gates are initialised: */
+	x86_quirk_pre_intr_init();
+#else
 	init_ISA_irqs();
+#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
@@ -170,13 +227,36 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+#ifdef CONFIG_X86_32
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#else
 		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#endif
 	}
 
 	apic_intr_init();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Call quirks after call gates are initialised (usually add in
+	 * the architecture specific gates):
+	 */
+	x86_quirk_intr_init();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+#endif
 }
-- 
cgit v0.10.2


From 778838600eb6973bdb6fd11e7f91b43cea4d6f45 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:26 +0300
Subject: x86: unify trivial differences in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index a780de3..72ce942 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -1,20 +1,24 @@
+#include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
+#include <linux/timex.h>
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
+#include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/delay.h>
 
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
+#include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
@@ -22,6 +26,22 @@
 #include <asm/i8259.h>
 #include <asm/traps.h>
 
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
 #ifdef CONFIG_X86_32
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ed50e35..687b6c3 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -17,11 +17,14 @@
 
 #include <asm/atomic.h>
 #include <asm/system.h>
+#include <asm/timer.h>
 #include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
+#include <asm/setup.h>
 #include <asm/i8259.h>
+#include <asm/traps.h>
 
 /*
  * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -136,6 +139,7 @@ static void __init init_ISA_irqs(void)
 	}
 }
 
+/* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
 static void __init smp_intr_init(void)
-- 
cgit v0.10.2


From ab19c25abd14db28d7454f00805ea59f22ed6057 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:27 +0300
Subject: x86: unify apic_intr_init() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 72ce942..f3be5e9 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -180,7 +180,12 @@ static void __init apic_intr_init(void)
 {
 	smp_intr_init();
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_64
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -192,10 +197,12 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 #endif
 
+#ifdef CONFIG_X86_32
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+#endif
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 687b6c3..f3be5e9 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -180,9 +180,12 @@ static void __init apic_intr_init(void)
 {
 	smp_intr_init();
 
+#ifdef CONFIG_X86_64
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
 
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -192,6 +195,14 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+	/* thermal monitor LVT interrupt */
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#endif
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v0.10.2


From 31cb45ef2600d47191d51253ec94b5e3f689260d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:28 +0300
Subject: x86: unify irqinit_{32,64}.c into irqinit.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 145cce7..16e3acf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o	:= $(nostackp)
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y			+= setup.o i8259.o irqinit_$(BITS).o
+obj-y			+= setup.o i8259.o irqinit.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
new file mode 100644
index 0000000..f3be5e9
--- /dev/null
+++ b/arch/x86/kernel/irqinit.c
@@ -0,0 +1,277 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/traps.h>
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
+#ifdef CONFIG_X86_32
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	outb(0, 0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.name = "fpu",
+};
+#endif
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.name = "cascade",
+};
+
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+	[0 ... IRQ0_VECTOR - 1] = -1,
+	[IRQ0_VECTOR] = 0,
+	[IRQ1_VECTOR] = 1,
+	[IRQ2_VECTOR] = 2,
+	[IRQ3_VECTOR] = 3,
+	[IRQ4_VECTOR] = 4,
+	[IRQ5_VECTOR] = 5,
+	[IRQ6_VECTOR] = 6,
+	[IRQ7_VECTOR] = 7,
+	[IRQ8_VECTOR] = 8,
+	[IRQ9_VECTOR] = 9,
+	[IRQ10_VECTOR] = 10,
+	[IRQ11_VECTOR] = 11,
+	[IRQ12_VECTOR] = 12,
+	[IRQ13_VECTOR] = 13,
+	[IRQ14_VECTOR] = 14,
+	[IRQ15_VECTOR] = 15,
+	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		if (per_cpu(vector_irq, cpu)[vector] != -1)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void __init init_ISA_irqs(void)
+{
+	int i;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
+
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* IPI for generic single function call */
+	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+			call_function_single_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+#endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+	smp_intr_init();
+
+#ifdef CONFIG_X86_64
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+	/* self generated IPI for local APIC timer */
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+	/* thermal monitor LVT interrupt */
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#endif
+}
+
+#ifdef CONFIG_X86_32
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+#endif
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+#ifdef CONFIG_X86_32
+	/* Execute any quirks before the call gates are initialised: */
+	x86_quirk_pre_intr_init();
+#else
+	init_ISA_irqs();
+#endif
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+#ifdef CONFIG_X86_32
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#else
+		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != IA32_SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#endif
+	}
+
+	apic_intr_init();
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Call quirks after call gates are initialised (usually add in
+	 * the architecture specific gates):
+	 */
+	x86_quirk_intr_init();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+#endif
+}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
deleted file mode 100644
index f3be5e9..0000000
--- a/arch/x86/kernel/irqinit_32.c
+++ /dev/null
@@ -1,277 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/timer.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-#include <asm/i8259.h>
-#include <asm/traps.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	outb(0, 0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.name = "fpu",
-};
-#endif
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.name = "cascade",
-};
-
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] != -1)
-			return 1;
-	}
-
-	return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
-	int i;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		desc->status = IRQ_DISABLED;
-		desc->action = NULL;
-		desc->depth = 1;
-
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for generic single function call */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-			call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
-	smp_intr_init();
-
-#ifdef CONFIG_X86_64
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* generic IPI for platform specific use */
-	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_32
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-	/* thermal monitor LVT interrupt */
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#endif
-}
-
-#ifdef CONFIG_X86_32
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-#endif
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-#ifdef CONFIG_X86_32
-	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
-#else
-	init_ISA_irqs();
-#endif
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-#ifdef CONFIG_X86_32
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#else
-		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#endif
-	}
-
-	apic_intr_init();
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Call quirks after call gates are initialised (usually add in
-	 * the architecture specific gates):
-	 */
-	x86_quirk_intr_init();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-#endif
-}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index f3be5e9..0000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,277 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/timer.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-#include <asm/i8259.h>
-#include <asm/traps.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	outb(0, 0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.name = "fpu",
-};
-#endif
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.name = "cascade",
-};
-
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] != -1)
-			return 1;
-	}
-
-	return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
-	int i;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		desc->status = IRQ_DISABLED;
-		desc->action = NULL;
-		desc->depth = 1;
-
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for generic single function call */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-			call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
-	smp_intr_init();
-
-#ifdef CONFIG_X86_64
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* generic IPI for platform specific use */
-	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_32
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-	/* thermal monitor LVT interrupt */
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#endif
-}
-
-#ifdef CONFIG_X86_32
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-#endif
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-#ifdef CONFIG_X86_32
-	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
-#else
-	init_ISA_irqs();
-#endif
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-#ifdef CONFIG_X86_32
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#else
-		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#endif
-	}
-
-	apic_intr_init();
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Call quirks after call gates are initialised (usually add in
-	 * the architecture specific gates):
-	 */
-	x86_quirk_intr_init();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-#endif
-}
-- 
cgit v0.10.2


From ac3048dfd4740becf8d768844cf47ebee363c9f8 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:29 +0300
Subject: x86: define IA32_SYSCALL_VECTOR on 32-bit to reduce ifdefs

Impact: cleanup

We can remove some #ifdefs if we define IA32_SYSCALL_VECTOR on 32-bit.

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79b..910b5a3 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,6 +34,7 @@
 
 #ifdef CONFIG_X86_32
 # define SYSCALL_VECTOR			0x80
+# define IA32_SYSCALL_VECTOR		0x80
 #else
 # define IA32_SYSCALL_VECTOR		0x80
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f3be5e9..f2c60a5 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -242,15 +242,9 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-#ifdef CONFIG_X86_32
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#else
 		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#endif
 	}
 
 	apic_intr_init();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d2883..2310700 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -969,11 +969,8 @@ void __init trap_init(void)
 	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 		set_bit(i, used_vectors);
 
-#ifdef CONFIG_X86_64
 	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
-	set_bit(SYSCALL_VECTOR, used_vectors);
-#endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
cgit v0.10.2


From abdb5a5713330e17dfe91ab0d3e29c4744d95162 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:30 +0300
Subject: x86: remove some ifdefs from native_init_IRQ()

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f2c60a5..6269772 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -205,7 +205,6 @@ static void __init apic_intr_init(void)
 #endif
 }
 
-#ifdef CONFIG_X86_32
 /**
  * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
  *
@@ -217,24 +216,21 @@ static void __init apic_intr_init(void)
  **/
 static void __init x86_quirk_pre_intr_init(void)
 {
+#ifdef CONFIG_X86_32
 	if (x86_quirks->arch_pre_intr_init) {
 		if (x86_quirks->arch_pre_intr_init())
 			return;
 	}
+#endif
 	init_ISA_irqs();
 }
-#endif
 
 void __init native_init_IRQ(void)
 {
 	int i;
 
-#ifdef CONFIG_X86_32
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
-#else
-	init_ISA_irqs();
-#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
-- 
cgit v0.10.2


From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:47:10 +0200
Subject: x86: cacheinfo: complete L2/L3 Cache and TLB associativity field
 definitions

See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409134710.GA8026@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index d46a849..789efe2 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -200,10 +200,17 @@ union l3_cache {
 };
 
 static const unsigned short __cpuinitconst assocs[] = {
-	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
-	[8] = 16, [0xa] = 32, [0xb] = 48,
+	[1] = 1,
+	[2] = 2,
+	[4] = 4,
+	[6] = 8,
+	[8] = 16,
+	[0xa] = 32,
+	[0xb] = 48,
 	[0xc] = 64,
-	[0xf] = 0xffff // ??
+	[0xd] = 96,
+	[0xe] = 128,
+	[0xf] = 0xffff /* fully associative - no way to show this currently */
 };
 
 static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 	eax->split.type = types[leaf];
 	eax->split.level = levels[leaf];
 	if (leaf == 3)
-		eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+		eax->split.num_threads_sharing =
+			current_cpu_data.x86_max_cores - 1;
 	else
 		eax->split.num_threads_sharing = 0;
 	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-- 
cgit v0.10.2


From 47f16ca7631f9c6bad8e6d968cfb1433029b09ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 10 Apr 2009 14:58:05 +0200
Subject: x86, irqinit: preempt merge conflicts

To make the topic merge life easier for tip:perfcounters/core,
include two (inactive in this topic) IRQ vector initializations
here.

Also fix build bug - missing kprobes.h inclusion.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6269772..b424c32 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -7,6 +7,7 @@
 #include <linux/timex.h>
 #include <linux/slab.h>
 #include <linux/random.h>
+#include <linux/kprobes.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
@@ -195,6 +196,13 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	/* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
 #endif
 
 #ifdef CONFIG_X86_32
-- 
cgit v0.10.2


From a5a2a0c7fa039c59619bc908b3b1ed24734d442a Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 10 Apr 2009 09:50:05 -0700
Subject: futex: fix futex_wait_setup key handling

If the get_futex_key() call were to fail, the existing code would
try and put_futex_key() prior to returning.  This patch makes sure
we only put_futex_key() if get_futex_key() succeeded.

Reported-by: Clark Williams <williams@redhat.com>
Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090410165005.14342.16973.stgit@Aeon>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 041bf3a..6d2daa4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1668,7 +1668,7 @@ retry:
 	q->key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q->key);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 retry_private:
 	*hb = queue_lock(q);
-- 
cgit v0.10.2


From cf9972a921470b0a2da7906104bcd540b20e33bf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 11 Apr 2009 22:24:05 -0700
Subject: x86, setup: fix comment in the "glove box" code

Impact: Comment change only

The glove box is about avoiding problems with *registers* being
touched, not *memory*.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 22b4b3e..5077937 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -10,7 +10,7 @@
 
 /*
  * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
- * touching memory they shouldn't be.
+ * touching registers they shouldn't be.
  */
 
 	.code16
-- 
cgit v0.10.2


From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 11 Apr 2009 12:55:26 +0530
Subject: x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static

Impact: reduce kernel size a bit, address sparse warning

Addresses the problem pointed out by this sparse warning:
  arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2..8e4cbb2 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-- 
cgit v0.10.2


From 56c49951747f250d8398582509e02ae5ce1d36d1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 11 Apr 2009 15:51:19 -0400
Subject: tracing: Add documentation for the power tracer

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <1239479479-2603-4-git-send-email-tytso@mit.edu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
new file mode 100644
index 0000000..cd805e1
--- /dev/null
+++ b/Documentation/trace/power.txt
@@ -0,0 +1,17 @@
+The power tracer collects detailed information about C-state and P-state
+transitions, instead of just looking at the high-level "average"
+information.
+
+There is a helper script found in scrips/tracing/power.pl in the kernel
+sources which can be used to parse this information and create a
+Scalable Vector Graphics (SVG) picture from the trace data.
+
+To use this tracer:
+
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	echo power > /sys/kernel/debug/tracing/current_tracer
+	echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+	sleep 1
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	cat /sys/kernel/debug/tracing/trace | \
+		perl scripts/tracing/power.pl > out.sv
-- 
cgit v0.10.2


From abd41443ac76d3e9c29a8c1d9e9a3312306cc55e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 11 Apr 2009 15:51:18 -0400
Subject: tracing: Document the event tracing system

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1239479479-2603-3-git-send-email-tytso@mit.edu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
new file mode 100644
index 0000000..abdee66
--- /dev/null
+++ b/Documentation/trace/events.txt
@@ -0,0 +1,135 @@
+			     Event Tracing
+
+		Documentation written by Theodore Ts'o
+
+Introduction
+============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used
+without creating custom kernel modules to register probe functions
+using the event tracing infrastructure.
+
+Not all tracepoints can be traced using the event tracing system;
+the kernel developer must provide code snippets which define how the
+tracing information is saved into the tracing buffer, and how the
+the tracing information should be printed.
+
+Using Event Tracing
+===================
+
+The events which are available for tracing can be found in the file
+/sys/kernel/debug/tracing/available_events.
+
+To enable a particular event, such as 'sched_wakeup', simply echo it
+to /sys/debug/tracing/set_event. For example:
+
+	# echo sched_wakeup > /sys/kernel/debug/tracing/set_event
+
+[ Note: events can also be enabled/disabled via the 'enabled' toggle
+  found in the /sys/kernel/tracing/events/ hierarchy of directories. ]
+
+To disable an event, echo the event name to the set_event file prefixed
+with an exclamation point:
+
+	# echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
+
+To disable events, echo an empty line to the set_event file:
+
+	# echo > /sys/kernel/debug/tracing/set_event
+
+The events are organized into subsystems, such as ext4, irq, sched,
+etc., and a full event name looks like this: <subsystem>:<event>.  The
+subsystem name is optional, but it is displayed in the available_events
+file.  All of the events in a subsystem can be specified via the syntax
+"<subsystem>:*"; for example, to enable all irq events, you can use the
+command:
+
+	# echo 'irq:*' > /sys/kernel/debug/tracing/set_event
+
+Defining an event-enabled tracepoint
+------------------------------------
+
+A kernel developer which wishes to define an event-enabled tracepoint
+must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE.
+This is done via two header files in include/trace.  For example, to
+event-enable the jbd2 subsystem, we must create two files,
+include/trace/jbd2.h and include/trace/jbd2_event_types.h.  The
+include/trace/jbd2.h file should be included by kernel source files that
+will have a tracepoint inserted, and might look like this:
+
+#ifndef _TRACE_JBD2_H
+#define _TRACE_JBD2_H
+
+#include <linux/jbd2.h>
+#include <linux/tracepoint.h>
+
+#include <trace/jbd2_event_types.h>
+
+#endif
+
+In a file that utilizes a jbd2 tracepoint, this header file would be
+included.  Note that you still have to use DEFINE_TRACE().  So for
+example, if fs/jbd2/commit.c planned to use the jbd2_start_commit
+tracepoint, it would have the following near the beginning of the file:
+
+#include <trace/jbd2.h>
+
+DEFINE_TRACE(jbd2_start_commit);
+
+Then in the function that would call the tracepoint, it would call the
+tracepoint function.  (For more information, please see the tracepoint
+documentation in Documentation/trace/tracepoints.txt):
+
+	trace_jbd2_start_commit(journal, commit_transaction);
+
+The code snippets which allow jbd2_start_commit to be an event-enabled
+tracepoint are placed in the file include/trace/jbd2_event_types.h:
+
+/* use <trace/jbd2.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM jbd2
+
+#include <linux/jbd2.h>
+
+TRACE_EVENT(jbd2_start_commit,
+	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
+	TP_ARGS(journal, commit_transaction),
+	TP_STRUCT__entry(
+		__array(	char,	devname, BDEVNAME_SIZE+24 )
+		__field(	int,	transaction		  )
+	),
+	TP_fast_assign(
+		memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24);
+		__entry->transaction	= commit_transaction->t_tid;
+	),
+	TP_printk("dev %s transaction %d",
+		  __entry->devname, __entry->transaction)
+);
+
+The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE.  The new
+arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and
+TP_printk.
+
+TP_STRUCT__entry defines the data structure which will be stored in the
+trace buffer.  Normally, fields in __entry will be arrays or simple
+types.  It is possible to place data structures in __entry --- however,
+pointers in the data structure can not be trusted, since they will be
+accessed sometime later by TP_printk, and if the data structure contains
+fields that will not or cannot be used by TP_printk, this will waste
+space in the trace buffer.  In general, data structures should be
+avoided, unless they do only contain non-pointer types and all of the
+fields will be used by TP_printk.
+
+TP_fast_assign defines the code snippet which saves information into the
+__entry data structure, using the passed-in arguments defined in
+TP_PROTO and TP_ARGS.
+
+Finally, TP_printk will print the __entry data structure.  At the time
+when the code snippet defined by TP_printk is executed, it will not have
+access to the TP_ARGS arguments; it can only use the information saved
+in the __entry data structure.
-- 
cgit v0.10.2


From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 11 Apr 2009 00:03:10 +0530
Subject: x86: clean up declarations and variables

Impact: cleanup, no code changed

 - syscalls.h       update declarations due to unifications
 - irq.c            declare smp_generic_interrupt() before it gets used
 - process.c        declare sys_fork() and sys_vfork() before they get used
 - tsc.c            rename tsc_khz shadowed variable
 - apic/probe_32.c  declare apic_default before it gets used
 - apic/nmi.c       prev_nmi_count should be unsigned
 - apic/io_apic.c   declare smp_irq_move_cleanup_interrupt() before it gets used
 - mm/init.c        declare direct_gbpages and free_initrd_mem before they get used

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83..5773660 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -478,6 +478,9 @@ static inline unsigned int read_apic_id(void)
 extern void default_setup_apic_routing(void);
 
 #ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
 /*
  * Set up the logical destination ID.
  *
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea4..be9ae41 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -78,7 +78,11 @@ extern void eisa_set_level_irq(unsigned int irq);
 /* SMP */
 extern void smp_apic_timer_interrupt(struct pt_regs *);
 extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
 extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
 #ifdef CONFIG_SMP
 extern void smp_reschedule_interrupt(struct pt_regs *);
 extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d1..3f8d09d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -503,6 +503,8 @@ static inline int pgd_none(pgd_t pgd)
 
 #ifndef __ASSEMBLY__
 
+extern int direct_gbpages;
+
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
 {
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6..abde308 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
 
 extern void paging_init(void);
 
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
 #define pte_ERROR(e)					\
 	printk("%s:%d: bad pte %p(%016lx).\n",		\
 	       __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 #define update_mmu_cache(vma, address, pte) do { } while (0)
 
-extern int direct_gbpages;
-
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408..372b76e 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
 /*
  * syscalls.h - Linux syscall interfaces (arch-specific)
  *
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
  *
  * This file is released under the GPLv2.
  * See the file COPYING for more details.
@@ -12,50 +12,55 @@
 
 #include <linux/compiler.h>
 #include <linux/linkage.h>
-#include <linux/types.h>
 #include <linux/signal.h>
+#include <linux/types.h>
 
 /* Common in X86_32 and X86_64 */
 /* kernel/ioport.c */
 asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
 /* kernel/ldt.c */
 asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
 
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
 /* kernel/tls.c */
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
 /* X86_32 only */
 #ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
 /* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);
 
-/* kernel/signal_32.c */
+/* kernel/signal.c */
 asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
 asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
 			     struct old_sigaction __user *);
 int sys_sigaltstack(struct pt_regs *);
 unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
 
 /* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
 asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
 			  unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
 asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
 asmlinkage int old_select(struct sel_arg_struct __user *);
 asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
 asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
 asmlinkage int sys_olduname(struct oldold_utsname __user *);
 
 /* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
 #else /* CONFIG_X86_32 */
 
 /* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
 /* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
 asmlinkage long sys_clone(unsigned long, unsigned long,
 			  void __user *, void __user *,
 			  struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
 asmlinkage long sys_execve(char __user *, char __user * __user *,
 			   char __user * __user *,
 			   struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
 asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
 				struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
 
 /* kernel/sys_x86_64.c */
+struct new_utsname;
+
 asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
 			 unsigned long, unsigned long, unsigned long);
-struct new_utsname;
 asmlinkage long sys_uname(struct new_utsname __user *);
 
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 767fe7e..870c92d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
+#include <asm/hw_irq.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_irq.h>
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd624..0205631 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
 {
 	printk(KERN_CONT "\n");
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2a..440a8bc 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
 extern struct apic apic_bigsmp;
 extern struct apic apic_es7000;
 extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
 
 struct apic *apic = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3aaf7b9..2188267 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/hw_irq.h>
 
 atomic_t irq_err_count;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca98915..3e21e38 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -11,6 +11,7 @@
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
+#include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a567eb..a8dc0d0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, ref1, ref2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+	unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
 	int hpet = is_hpet_enabled(), i, loopmin;
 
-	tsc_khz = get_hypervisor_tsc_freq();
-	if (tsc_khz) {
+	hv_tsc_khz = get_hypervisor_tsc_freq();
+	if (hv_tsc_khz) {
 		printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
-		return tsc_khz;
+		return hv_tsc_khz;
 	}
 
 	local_irq_save(flags);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fd3da1d..40924e4 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
 #include <linux/ioport.h>
 #include <linux/swap.h>
 
-- 
cgit v0.10.2


From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 14:26:18 +0800
Subject: tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace
 part and tracepoint part

Impact: refactor code for future changes

Current kmemtrace.h is used both as header file of kmemtrace and kmem's
tracepoints definition.

Tracepoints' definition file may be used by other code, and should only have
definition of tracepoint.

We can separate include/trace/kmemtrace.h into 2 files:

  include/linux/kmemtrace.h: header file for kmemtrace
  include/trace/kmem.h:      definition of kmem tracepoints

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
new file mode 100644
index 0000000..15c45a2
--- /dev/null
+++ b/include/linux/kmemtrace.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <trace/kmem.h>
+
+#ifdef CONFIG_KMEMTRACE
+extern void kmemtrace_init(void);
+#else
+static inline void kmemtrace_init(void)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 5ac9b0b..713f841 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,7 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 /* Size description struct for general caches. */
 struct cache_sizes {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5046f90..be5d40c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,7 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
new file mode 100644
index 0000000..24d2519
--- /dev/null
+++ b/include/trace/kmem.h
@@ -0,0 +1,44 @@
+#ifndef _TRACE_KMEM_H
+#define _TRACE_KMEM_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+DECLARE_TRACE(kmalloc,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
+DECLARE_TRACE(kmem_cache_alloc,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
+DECLARE_TRACE(kmalloc_node,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags,
+		      int node),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
+DECLARE_TRACE(kmem_cache_alloc_node,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags,
+		      int node),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
+DECLARE_TRACE(kfree,
+	      TP_PROTO(unsigned long call_site, const void *ptr),
+	      TP_ARGS(call_site, ptr));
+DECLARE_TRACE(kmem_cache_free,
+	      TP_PROTO(unsigned long call_site, const void *ptr),
+	      TP_ARGS(call_site, ptr));
+
+#endif /* _TRACE_KMEM_H */
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h
deleted file mode 100644
index 28ee69f..0000000
--- a/include/trace/kmemtrace.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <linux/tracepoint.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-DECLARE_TRACE(kmalloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/init/main.c b/init/main.c
index 3585f07..eece40c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@
 #include <linux/idr.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/kmemtrace.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -71,7 +72,6 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#include <trace/kmemtrace.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d..7a0aa0e 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 #include "trace_output.h"
 #include "trace.h"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f76a8f86..34b94c3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,7 +9,7 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <trace/power.h>
 
 enum trace_type {
diff --git a/mm/slab.c b/mm/slab.c
index 9a90b00..f85831d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
-#include	<trace/kmemtrace.h>
+#include	<linux/kmemtrace.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
diff --git a/mm/slob.c b/mm/slob.c
index a2d4ab3..494f05f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,7 +65,7 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <asm/atomic.h>
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index 7ab54ec..ea9e716 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,7 +16,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
-- 
cgit v0.10.2


From fc182a4330fc22ea1b68fa3d5064dd85a73a4c4a Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 14:27:38 +0800
Subject: tracing, kmemtrace: Make kmem tracepoints use TRACE_EVENT macro

TRACE_EVENT is a more generic way to define tracepoints.
Doing so adds these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DEE6DA.80600@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index 24d2519..46efc24 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -1,44 +1,9 @@
 #ifndef _TRACE_KMEM_H
 #define _TRACE_KMEM_H
 
-#include <linux/tracepoint.h>
 #include <linux/types.h>
+#include <linux/tracepoint.h>
 
-DECLARE_TRACE(kmalloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
+#include <trace/kmem_event_types.h>
 
 #endif /* _TRACE_KMEM_H */
diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h
new file mode 100644
index 0000000..4ff420f
--- /dev/null
+++ b/include/trace/kmem_event_types.h
@@ -0,0 +1,193 @@
+
+/* use <trace/kmem.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index 33b6bfc..552a50e 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -4,3 +4,4 @@
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
 #include <trace/skb_event_types.h>
+#include <trace/kmem_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 0e2aa80..13d6b85 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -4,3 +4,4 @@
 #include <trace/irq.h>
 #include <trace/lockdep.h>
 #include <trace/skb.h>
+#include <trace/kmem.h>
-- 
cgit v0.10.2


From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:18:53 +0800
Subject: blktrace: fix output of unknown events

Not all events are pc (packet command) events. An event is a pc
event only if it has BLK_TC_PC bit set.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D3236D.3090705@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d..e45e1af 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+		ret = trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
 		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
-- 
cgit v0.10.2


From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:19:19 +0800
Subject: blktrace: fix output of BLK_TC_PC events

BLK_TC_PC events should be treated differently with BLK_TC_FS events.

Before this patch:

 # echo 1 > /sys/block/sda/sda1/trace/enable
 # echo pc > /sys/block/sda/sda1/trace/act_mask
 # echo blk > /debugfs/tracing/current_tracer
 # (generate some BLK_TC_PC events)
 # cat trace
        bash-2184  [000]  1774.275413:   8,7    I   N [bash]
        bash-2184  [000]  1774.275435:   8,7    D   N [bash]
        bash-2184  [000]  1774.275540:   8,7    I   R [bash]
        bash-2184  [000]  1774.275547:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275580:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275648:   8,7    I   R [bash]
        bash-2184  [000]  1774.275653:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275682:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275739:   8,7    I   R [bash]
        bash-2184  [000]  1774.275744:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275771:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275804:   8,7    I   R [bash]
        bash-2184  [000]  1774.275808:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275836:   8,7    C   N 0 [0]

After this patch:

 # cat trace
        bash-2263  [000]   366.782149:   8,7    I   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782323:   8,7    D   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782557:   8,7    I   R 8 (25 00 ..) [bash]
        bash-2263  [000]   366.782560:   8,7    D   R 8 (25 00 ..) [bash]
 ksoftirqd/0-4     [000]   366.782582:   8,7    C   N (25 00 ..) [0]
        bash-2263  [000]   366.782648:   8,7    I   R 8 (5a 00 3f 00) [bash]
        bash-2263  [000]   366.782650:   8,7    D   R 8 (5a 00 3f 00) [bash]
 ksoftirqd/0-4     [000]   366.782669:   8,7    C   N (5a 00 3f 00) [0]
        bash-2263  [000]   366.782710:   8,7    I   R 8 (5a 00 08 00) [bash]
        bash-2263  [000]   366.782713:   8,7    D   R 8 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.782730:   8,7    C   N (5a 00 08 00) [0]
        bash-2263  [000]   366.783375:   8,7    I   R 36 (5a 00 08 00) [bash]
        bash-2263  [000]   366.783379:   8,7    D   R 36 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.783404:   8,7    C   N (5a 00 08 00) [0]

This is what we do with PC events in user-space blktrace.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D32387.9040106@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e45e1af..2b98195 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
 	return te_blk_io_trace(ent) + 1;
 }
 
+static inline u32 t_action(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes;
+}
+
 static inline u32 t_sec(const struct trace_entry *ent)
 {
 	return te_blk_io_trace(ent)->bytes >> 9;
@@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *pdu_buf;
+	int pdu_len;
+	int i, end, ret;
+
+	pdu_buf = pdu_start(ent);
+	pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+	if (!pdu_len)
+		return 1;
+
+	/* find the last zero that needs to be printed */
+	for (end = pdu_len - 1; end >= 0; end--)
+		if (pdu_buf[end])
+			break;
+	end++;
+
+	if (!trace_seq_putc(s, '('))
+		return 0;
+
+	for (i = 0; i < pdu_len; i++) {
+
+		ret = trace_seq_printf(s, "%s%02x",
+				       i == 0 ? "" : " ", pdu_buf[i]);
+		if (!ret)
+			return ret;
+
+		/*
+		 * stop when the rest is just zeroes and indicate so
+		 * with a ".." appended
+		 */
+		if (i == end && end != pdu_len - 1)
+			return trace_seq_puts(s, " ..) ");
+	}
+
+	return trace_seq_puts(s, ") ");
+}
+
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%s]\n",
-					t_sector(ent), t_sec(ent), cmd);
-	return trace_seq_printf(s, "[%s]\n", cmd);
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+		if (!ret)
+			return 0;
+		ret = blk_log_dump_pdu(s, ent);
+		if (!ret)
+			return 0;
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%s]\n",
+						t_sector(ent), t_sec(ent), cmd);
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	}
 }
 
 static int blk_log_with_error(struct trace_seq *s,
 			      const struct trace_entry *ent)
 {
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-					t_sec(ent), t_error(ent));
-	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = blk_log_dump_pdu(s, ent);
+		if (ret)
+			return trace_seq_printf(s, "[%d]\n", t_error(ent));
+		return 0;
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%d]\n",
+						t_sector(ent),
+						t_sec(ent), t_error(ent));
+		return trace_seq_printf(s, "%llu [%d]\n",
+					t_sector(ent), t_error(ent));
+	}
 }
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
-- 
cgit v0.10.2


From 3fa89ca7ba5ba50b3924a11f6604b4bdce5f7842 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 12 Apr 2009 20:37:25 +0530
Subject: x86: vdso/vma.c declare vdso_enabled and arch_setup_additional_pages
 before they get used

Impact: cleanup, address sparse warnings

Addresses the problem pointed out by these sparse warning:
  arch/x86/vdso/vma.c:19:28: warning: symbol 'vdso_enabled' was not declared. Should it be static?
  arch/x86/vdso/vma.c:101:5: warning: symbol 'arch_setup_additional_pages' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1239548845.4170.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf..cac0833 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/random.h>
+#include <linux/elf.h>
 #include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
-- 
cgit v0.10.2


From edea7148a87c099e5d5d4838285cc27e459588b7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:39 +0400
Subject: x86: irq.c - tiny cleanup

Impact: cleanup, robustization

 1) guard ack_bad_irq with printk_ratelimit since there is no
    guarantee we will not be flooded one day

 2) use pr_emerg() helper

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.277579847@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3aaf7b9..6603492 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -24,7 +24,8 @@ void (*generic_interrupt_extension)(void) = NULL;
  */
 void ack_bad_irq(unsigned int irq)
 {
-	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+	if (printk_ratelimit())
+		pr_err("unexpected IRQ trap at vector %02x\n", irq);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
@@ -178,7 +179,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->irq_thermal_count;
 # ifdef CONFIG_X86_64
 	sum += irq_stats(cpu)->irq_threshold_count;
-#endif
+# endif
 #endif
 	return sum;
 }
@@ -219,8 +220,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 #endif
 
 		if (printk_ratelimit())
-			printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
-			       __func__, smp_processor_id(), vector, irq);
+			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+				__func__, smp_processor_id(), vector, irq);
 	}
 
 	irq_exit();
-- 
cgit v0.10.2


From c0eaa4536f08b98fbcfa7fce5b7b0de1bebcb0e1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:40 +0400
Subject: x86: apic - introduce imcr_ helpers

Impact: cleanup

Distinguish port writting magic into helpers with comments.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.535921550@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 098ec84..c3be10f5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -98,6 +98,29 @@ early_param("lapic", parse_lapic);
 /* Local APIC was disabled by the BIOS and enabled by the kernel */
 static int enabled_via_apicbase;
 
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline imcr_pic_to_apic(void)
+{
+	/* select IMCR register */
+	outb(0x70, 0x22);
+	/* NMI and 8259 INTR go through APIC */
+	outb(0x01, 0x23);
+}
+
+static inline imcr_apic_to_pic(void)
+{
+	/* select IMCR register */
+	outb(0x70, 0x22);
+	/* NMI and 8259 INTR go directly to BSP */
+	outb(0x00, 0x23);
+}
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1727,8 +1750,7 @@ void __init connect_bsp_APIC(void)
 		 */
 		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
 				"enabling APIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x01, 0x23);
+		imcr_pic_to_apic();
 	}
 #endif
 	if (apic->enable_apic_mode)
@@ -1756,8 +1778,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 		 */
 		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
 				"entering PIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x00, 0x23);
+		imcr_apic_to_pic();
 		return;
 	}
 #endif
-- 
cgit v0.10.2


From 08306ce61d6848e6fbf74fa4cc693c3fb29e943f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:41 +0400
Subject: x86: apic - introduce dummy apic operations

Impact: refactor, speed up and robustize code

In case if apic was disabled by kernel option
or by hardware limits we can use dummy operations
in apic->write to simplify the ack_APIC_irq() code.

At the lame time the patch fixes the missed EOI in
do_IRQ function (which has place if kernel is compiled
as X86-32 and interrupt without handler happens where
apic was not asked to be disabled via kernel option).

Note that native_apic_write_dummy() consists of
WARN_ON_ONCE to catch any buggy writes on enabled
APICs. Could be removed after some time of testing.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.724788431@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83..2bd5a46 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -212,6 +212,7 @@ static inline void ack_x2APIC_irq(void)
 }
 #endif
 
+extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -252,7 +253,7 @@ static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok		1
 static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c3be10f5..9b849d4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -232,6 +232,24 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+void native_apic_write_dummy(u32 reg, u32 v)
+{
+	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+/*
+ * right after this call apic->write doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+	apic->write = native_apic_write_dummy;
+}
+
 void native_apic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -1582,6 +1600,12 @@ void __init init_apic_mappings(void)
 	 */
 	if (boot_cpu_physical_apicid == -1U)
 		boot_cpu_physical_apicid = read_apic_id();
+
+	/* lets check if we may to NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+	}
 }
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 6603492..fd57bf3 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -27,7 +27,6 @@ void ack_bad_irq(unsigned int irq)
 	if (printk_ratelimit())
 		pr_err("unexpected IRQ trap at vector %02x\n", irq);
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * Currently unexpected vectors happen only on SMP and APIC.
 	 * We _must_ ack these because every local APIC has only N
@@ -37,9 +36,7 @@ void ack_bad_irq(unsigned int irq)
 	 * completely.
 	 * But only ack when the APIC is enabled -AK
 	 */
-	if (cpu_has_apic)
-		ack_APIC_irq();
-#endif
+	ack_APIC_irq();
 }
 
 #define irq_stats(x)		(&per_cpu(irq_stat, x))
@@ -214,10 +211,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	irq = __get_cpu_var(vector_irq)[vector];
 
 	if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
-		if (!disable_apic)
-			ack_APIC_irq();
-#endif
+		ack_APIC_irq();
 
 		if (printk_ratelimit())
 			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
-- 
cgit v0.10.2


From b9b34f24b23ba9e79e07c0980e7fff16af2a67d1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:42 +0400
Subject: x86: smp.c - align smp_ops assignments

Impact: cleanup

It's a bit hard to parse by eyes without
them being aligned.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.924175574@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea..f6db48c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 }
 
 struct smp_ops smp_ops = {
-	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
-	.smp_prepare_cpus = native_smp_prepare_cpus,
-	.smp_cpus_done = native_smp_cpus_done,
+	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
+	.smp_prepare_cpus	= native_smp_prepare_cpus,
+	.smp_cpus_done		= native_smp_cpus_done,
 
-	.smp_send_stop = native_smp_send_stop,
-	.smp_send_reschedule = native_smp_send_reschedule,
+	.smp_send_stop		= native_smp_send_stop,
+	.smp_send_reschedule	= native_smp_send_reschedule,
 
-	.cpu_up = native_cpu_up,
-	.cpu_die = native_cpu_die,
-	.cpu_disable = native_cpu_disable,
-	.play_dead = native_play_dead,
+	.cpu_up			= native_cpu_up,
+	.cpu_die		= native_cpu_die,
+	.cpu_disable		= native_cpu_disable,
+	.play_dead		= native_play_dead,
 
-	.send_call_func_ipi = native_send_call_func_ipi,
+	.send_call_func_ipi	= native_send_call_func_ipi,
 	.send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
 EXPORT_SYMBOL_GPL(smp_ops);
-- 
cgit v0.10.2


From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Mon, 13 Apr 2009 20:24:50 +0100
Subject: perf_counter: fix alignment in /proc/interrupts

Trivial fix on columns alignment in /proc/interrupts file.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090413192449.GA3920@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index dccaaa8..849cfab 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-	seq_printf(p, "PND: ");
+	seq_printf(p, "%*s: ", prec, "PND");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
 	seq_printf(p, "  Performance pending work\n");
-- 
cgit v0.10.2


From 5cda395f4a262788d8ed79ac8a26a2b821e5f751 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Mon, 13 Apr 2009 17:39:24 +0200
Subject: x86: fix function definitions after: x86: apic - introduce imcr_
 helpers

The patch "introduce imcr_ helpers" introduced good comments, but
also a few new compile warnings. This fixes the function definitions
to have a 'void' return type.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090413153924.GA20287@mailshack.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 9b849d4..4b48ff9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -106,7 +106,7 @@ static int enabled_via_apicbase;
  * the BIOS or the operating system must switch out of
  * PIC Mode by changing the IMCR.
  */
-static inline imcr_pic_to_apic(void)
+static inline void imcr_pic_to_apic(void)
 {
 	/* select IMCR register */
 	outb(0x70, 0x22);
@@ -114,7 +114,7 @@ static inline imcr_pic_to_apic(void)
 	outb(0x01, 0x23);
 }
 
-static inline imcr_apic_to_pic(void)
+static inline void imcr_apic_to_pic(void)
 {
 	/* select IMCR register */
 	outb(0x70, 0x22);
-- 
cgit v0.10.2


From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:48:49 -0500
Subject: tracing/filters: add run-time field descriptions to
 TRACE_EVENT_FORMAT events

This patch adds run-time field descriptions to all the event formats
exported using TRACE_EVENT_FORMAT.  It also hooks up all the tracers
that use them (i.e. the tracers in the 'ftrace subsystem') so they can
also have their output filtered by the event-filtering mechanism.

When I was testing this, there were a couple of things that fooled me
into thinking the filters weren't working, when actually they were -
I'll mention them here so others don't make the same mistakes (and file
bug reports. ;-)

One is that some of the tracers trace multiple events e.g. the
sched_switch tracer uses the context_switch and wakeup events, and if
you don't set filters on all of the traced events, the unfiltered output
from the events without filters on them can make it look like the
filtering as a whole isn't working properly, when actually it is doing
what it was asked to do - it just wasn't asked to do the right thing.

The other is that for the really high-volume tracers e.g. the function
tracer, the volume of filtered events can be so high that it pushes the
unfiltered events out of the ring buffer before they can be read so e.g.
cat'ing the trace file repeatedly shows either no output, or once in
awhile some output but that isn't there the next time you read the
trace, which isn't what you normally expect when reading the trace file.
If you read from the trace_pipe file though, you can catch them before
they disappear.

Changes from v1:

As suggested by Frederic Weisbecker:

- get rid of externs in functions
- added unlikely() to filter_check_discard()

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 7a0aa0e..9419ad1 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
+	struct ftrace_event_call *call = &event_kmem_alloc;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_alloc_entry *entry;
 	struct ring_buffer_event *event;
@@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
@@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
+	struct ftrace_event_call *call = &event_kmem_free;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_free_entry *entry;
 	struct ring_buffer_event *event;
@@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4865459..962e617 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -898,6 +898,7 @@ trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
+	struct ftrace_event_call *call = &event_function;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
 
@@ -912,6 +913,9 @@ trace_function(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
+
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 }
 
@@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
 
@@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
@@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
 
@@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
@@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 				 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
@@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
+	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
@@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_context_switch;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
+
+	filter_check_discard(call, entry, event);
+
 	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
@@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
@@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
@@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 34b94c3f..e773728 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
+static inline void
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer_event *event)
+{
+	if (unlikely(call->preds) && !filter_match_preds(call, rec))
+		ring_buffer_event_discard(event);
+}
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
@@ -897,4 +912,9 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+	extern struct ftrace_event_call event_##call;
+#include "trace_event_types.h"
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e6e3291..c95c25d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
+	struct ftrace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
@@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee..95b147a 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
-		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+				    TRACE_FUNC_SIZE+1, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+				    TRACE_FUNC_SIZE+1, file)
 		TRACE_FIELD(char, correct, correct)
 	),
 	TP_RAW_FMT("%u:%s:%s (%u)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d2..be9299a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -680,6 +680,7 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+
 	return system->entry;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be41..470ad94 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	}
 
 	events_for_each(call) {
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (!strcmp(call->system, system->name))
@@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	events_for_each(call) {
 		int err;
 
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (strcmp(call->system, system->name))
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7..1c94b87 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 int									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c3..f4e4661 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -30,7 +30,7 @@
 
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
@@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_ENTRY	entry
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int ftrace_define_fields_##call(void);					\
+static int ftrace_raw_init_event_##call(void);				\
 									\
-static struct ftrace_event_call __used					\
+struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+};									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	return 0;							\
+}									\
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct args field;						\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
 }
+
 #include "trace_event_types.h"
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c..e6b275b 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 
 void trace_hw_branch(u64 from, u64 to)
 {
+	struct ftrace_event_call *call = &event_hw_branch;
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
@@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bae791e..8ce7d7d 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
 
 static void probe_power_end(struct power_trace *it)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
 static void probe_power_mark(struct power_trace *it, unsigned int type,
 				unsigned int level)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
-- 
cgit v0.10.2


From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:49:16 -0500
Subject: tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro

Frederic Weisbecker suggested that the trace_special event shouldn't be
filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro
that allows an event format to be exported without having a filter
attached, and removes filtering from the trace_special event.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 962e617..c209d21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
-	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e773728..3cf856f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -915,6 +915,8 @@ do {									\
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
 #include "trace_event_types.h"
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 95b147a..cfcecc4 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, arg1, arg1)
 		TRACE_FIELD(unsigned long, arg2, arg2)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index f4e4661..77c494f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
 #include "trace_event_types.h"
 
 #undef TRACE_ZERO_CHAR
@@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void)				\
 	return 0;							\
 }									\
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+									\
+struct ftrace_event_call __used						\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.show_format		= ftrace_format_##call,			\
+};
+
 #include "trace_event_types.h"
 
 #undef TRACE_FIELD
@@ -150,4 +179,8 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)
+
 #include "trace_event_types.h"
-- 
cgit v0.10.2


From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 00:09:41 -0400
Subject: ring-buffer: add ring_buffer_discard_commit

The ring_buffer_discard_commit is similar to ring_buffer_event_discard
but it can only be done on an event that has yet to be commited.
Unpredictable results can happen otherwise.

The main difference between ring_buffer_discard_commit and
ring_buffer_event_discard is that ring_buffer_discard_commit will try
to free the data in the ring buffer if nothing has addded data
after the reserved event. If something did, then it acts almost the
same as ring_buffer_event_discard followed by a
ring_buffer_unlock_commit.

Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit
can be called on an event, not both.

This commit also exports both discard functions to be usable by
GPL modules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e1b7b21..f0aa486 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -68,9 +68,38 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }
 
+/*
+ * ring_buffer_event_discard can discard any event in the ring buffer.
+ *   it is up to the caller to protect against a reader from
+ *   consuming it or a writer from wrapping and replacing it.
+ *
+ * No external protection is needed if this is called before
+ * the event is commited. But in that case it would be better to
+ * use ring_buffer_discard_commit.
+ *
+ * Note, if an event that has not been committed is discarded
+ * with ring_buffer_event_discard, it must still be committed.
+ */
 void ring_buffer_event_discard(struct ring_buffer_event *event);
 
 /*
+ * ring_buffer_discard_commit will remove an event that has not
+ *   ben committed yet. If this is used, then ring_buffer_unlock_commit
+ *   must not be called on the discarded event. This function
+ *   will try to remove the event from the ring buffer completely
+ *   if another event has not been written after it.
+ *
+ * Example use:
+ *
+ *  if (some_condition)
+ *    ring_buffer_discard_commit(buffer, event);
+ *  else
+ *    ring_buffer_unlock_commit(buffer, event);
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+
+/*
  * size is in bytes for each per CPU buffer.
  */
 struct ring_buffer *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 74a1180..f935bd5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event)
 	event->time_delta = 0;
 }
 
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
-}
-
 static unsigned
 rb_event_data_length(struct ring_buffer_event *event)
 {
@@ -1571,6 +1550,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 /**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+	int cpu;
+
+	/* The event is discarded regardless */
+	ring_buffer_event_discard(event);
+
+	/*
+	 * This must only be called if the event has not been
+	 * committed yet. Thus we can assume that preemption
+	 * is still disabled.
+	 */
+	RB_WARN_ON(buffer, !preempt_count());
+
+	cpu = smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			goto out;
+	}
+
+	/*
+	 * The commit is still visible by the reader, so we
+	 * must increment entries.
+	 */
+	cpu_buffer->entries++;
+ out:
+	/*
+	 * If a write came in and pushed the tail page
+	 * we still need to update the commit pointer
+	 * if we were the commit.
+	 */
+	if (rb_is_commit(cpu_buffer, event))
+		rb_set_commit_to_write(cpu_buffer);
+
+	/*
+	 * Only the last preempt count needs to restore preemption.
+	 */
+	if (preempt_count() == 1)
+		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+	else
+		preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
+/**
  * ring_buffer_write - write data to the buffer without reserving
  * @buffer: The ring buffer to write to.
  * @length: The length of the data being written (excluding the event header)
-- 
cgit v0.10.2


From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 01:16:59 -0400
Subject: tracing/filters: use ring_buffer_discard_commit for discarded events

The ring_buffer_discard_commit makes better usage of the ring_buffer
when an event has been discarded. It tries to remove it completely if
possible.

This patch converts the trace event filtering to use
ring_buffer_discard_commit instead of the ring_buffer_event_discard.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c209d21..d880ab2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+	ring_buffer_discard_commit(global_trace.buffer, event);
 }
 
 void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3cf856f..dfefffd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 9d2fa78..d2f34bf 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto)				\
 	assign;								\
 									\
 	if (call->preds && !filter_match_preds(call, entry))		\
-		ring_buffer_event_discard(event);			\
-									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+		trace_current_buffer_discard_commit(event);		\
+	else								\
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 									\
 }									\
 									\
-- 
cgit v0.10.2


From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:14:01 -0500
Subject: tracing/infrastructure: separate event tracer from event support

Add a new config option, CONFIG_EVENT_TRACING that gets selected
when CONFIG_TRACING is selected and adds everything needed by the stuff
in trace_export - basically all the event tracing support needed by e.g.
bprint, minus the actual events, which are only included if
CONFIG_EVENT_TRACER is selected.

So CONFIG_EVENT_TRACER can be used to turn on or off the generated events
(what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns
on or off the base event tracing support used by both the event tracer and
the other things such as bprint that can't be configured out.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178441.10295.34.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7fa660f..7e9b1e9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -61,7 +61,7 @@
 #define BRANCH_PROFILE()
 #endif
 
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()	VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
 			*(_ftrace_events)				\
 			VMLINUX_SYMBOL(__stop_ftrace_events) = .;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 23b96eb..644606e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        default y
 
+config EVENT_TRACING
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -56,6 +59,7 @@ config TRACING
 	select TRACEPOINTS
 	select NOP_TRACER
 	select BINARY_PRINTF
+	select EVENT_TRACING
 
 #
 # Minimum requirements an architecture has to meet for us to
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f51..3ad367e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 
 libftrace-y := ftrace.o
-- 
cgit v0.10.2


From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:15:54 -0500
Subject: tracing/filters: use ring_buffer_discard_commit() in
 filter_check_discard()

This patch changes filter_check_discard() to make use of the new
ring_buffer_discard_commit() function and modifies the current users to
call the old commit function in the non-discard case.

It also introduces a version of filter_check_discard() that uses the
global trace buffer (filter_current_check_discard()) for those cases.

v2 changes:

- fix compile error noticed by Ingo Molnar

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178554.10295.36.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 9419ad1..86cdf67 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d880ab2..c0047fc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,12 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+				 struct ring_buffer_event *event)
+{
+	return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+
 cycle_t ftrace_now(int cpu)
 {
 	u64 ts;
@@ -919,9 +925,8 @@ trace_function(struct trace_array *tr,
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
 }
@@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 
-	filter_check_discard(call, entry, event);
-
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
@@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dfefffd..9729d14 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
 
-static inline void
+static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec))
-		ring_buffer_event_discard(event);
+	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+		ring_buffer_discard_commit(buffer, event);
+		return 1;
+	}
+
+	return 0;
 }
 
 #define __common_field(type, item)					\
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c95c25d..8e64e60 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index d2f34bf..b2b2982 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	if (call->preds && !filter_match_preds(call, entry))		\
-		trace_current_buffer_discard_commit(event);		\
-	else								\
+	if (!filter_current_check_discard(call, entry, event))		\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e6b275b..8683d50 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8ce7d7d..810a5b7 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
-- 
cgit v0.10.2


From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 13 Apr 2009 03:17:50 -0500
Subject: tracing/filters: allow on-the-fly filter switching

This patch allows event filters to be safely removed or switched
on-the-fly while avoiding the use of rcu or the suspension of tracing of
previous versions.

It does it by adding a new filter_pred_none() predicate function which
does nothing and by never deallocating either the predicates or any of
the filter_pred members used in matching; the predicate lists are
allocated and initialized during ftrace_event_calls initialization.

Whenever a filter is removed or replaced, the filter_pred_* functions
currently in use by the affected ftrace_event_call are immediately
switched over to to the filter_pred_none() function, while the rest of
the filter_pred members are left intact, allowing any currently
executing filter_pred_* functions to finish up, using the values they're
currently using.

In the case of filter replacement, the new predicate values are copied
into the old predicates after the above step, and the filter_pred_none()
functions are replaced by the filter_pred_* functions for the new
filter.  In this case, it is possible though very unlikely that a
previous filter_pred_* is still running even after the
filter_pred_none() switch and the switch to the new filter_pred_*.  In
that case, however, because nothing has been deallocated in the
filter_pred, the worst that can happen is that the old filter_pred_*
function sees the new values and as a result produces either a false
positive or a false negative, depending on the values it finds.

So one downside to this method is that rarely, it can produce a bad
match during the filter switch, but it should be possible to live with
that, IMHO.

The other downside is that at least in this patch the predicate lists
are always pre-allocated, taking up memory from the start.  They could
probably be allocated on first-use, and de-allocated when tracing is
completely stopped - if this patch makes sense, I could create another
one to do that later on.

Oh, and it also places a restriction on the size of __arrays in events,
currently set to 128, since they can't be larger than the now embedded
str_val arrays in the filter_pred struct.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239610670.6660.49.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9729d14..b05b6ac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -813,6 +813,7 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
+	int			n_preds;
 	struct filter_pred	**preds;
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -826,6 +827,7 @@ struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
+	int			n_preds;
 	struct filter_pred	**preds;
 };
 
@@ -834,7 +836,8 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED 8
+#define MAX_FILTER_PRED		8
+#define MAX_FILTER_STR_VAL	128
 
 struct filter_pred;
 
@@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
 struct filter_pred {
 	filter_pred_fn_t fn;
 	u64 val;
-	char *str_val;
+	char str_val[MAX_FILTER_STR_VAL];
 	int str_len;
 	char *field_name;
 	int offset;
@@ -855,13 +858,14 @@ struct filter_pred {
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
+extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
+extern void filter_disable_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 789e14e..ead68ac 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, s);
+	filter_print_preds(call->preds, call->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 
 	if (pred->clear) {
-		filter_free_preds(call);
+		filter_disable_preds(call);
 		filter_free_pred(pred);
 		return cnt;
 	}
@@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return err;
 	}
 
+	filter_free_pred(pred);
+
 	*ppos += cnt;
 
 	return cnt;
@@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, s);
+	filter_print_preds(system->preds, system->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
+	system->n_preds = 0;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9f8ecca..de42dad 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
+static int filter_pred_none(struct filter_pred *pred, void *event)
+{
+	return 0;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i]) {
-			pred = call->preds[i];
-			if (and_failed && !pred->or)
-				continue;
-			matched = pred->fn(pred, rec);
-			if (!matched && !pred->or) {
-				and_failed = 1;
-				continue;
-			} else if (matched && pred->or)
-				return 1;
-		} else
-			break;
+	for (i = 0; i < call->n_preds; i++) {
+		pred = call->preds[i];
+		if (and_failed && !pred->or)
+			continue;
+		matched = pred->fn(pred, rec);
+		if (!matched && !pred->or) {
+			and_failed = 1;
+			continue;
+		} else if (matched && pred->or)
+			return 1;
 	}
 
 	if (and_failed)
@@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 	return 1;
 }
 
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void filter_print_preds(struct filter_pred **preds, int n_preds,
+			struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
 	int i;
 
-	if (!preds) {
+	if (!n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (preds[i]) {
-			pred = preds[i];
-			field_name = pred->field_name;
-			if (i)
-				trace_seq_printf(s, pred->or ? "|| " : "&& ");
-			trace_seq_printf(s, "%s ", field_name);
-			trace_seq_printf(s, pred->not ? "!= " : "== ");
-			if (pred->str_val)
-				trace_seq_printf(s, "%s\n", pred->str_val);
-			else
-				trace_seq_printf(s, "%llu\n", pred->val);
-		} else
-			break;
+	for (i = 0; i < n_preds; i++) {
+		pred = preds[i];
+		field_name = pred->field_name;
+		if (i)
+			trace_seq_printf(s, pred->or ? "|| " : "&& ");
+		trace_seq_printf(s, "%s ", field_name);
+		trace_seq_printf(s, pred->not ? "!= " : "== ");
+		if (pred->str_len)
+			trace_seq_printf(s, "%s\n", pred->str_val);
+		else
+			trace_seq_printf(s, "%llu\n", pred->val);
 	}
 }
 
@@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred)
 		return;
 
 	kfree(pred->field_name);
-	kfree(pred->str_val);
 	kfree(pred);
 }
 
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
+{
+	kfree(pred->field_name);
+	pred->field_name = NULL;
+	pred->str_len = 0;
+}
+
+static int filter_set_pred(struct filter_pred *dest,
+			   struct filter_pred *src,
+			   filter_pred_fn_t fn)
+{
+	*dest = *src;
+	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+	if (!dest->field_name)
+		return -ENOMEM;
+	dest->fn = fn;
+
+	return 0;
+}
+
+void filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
-	if (call->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	call->n_preds = 0;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++)
+		call->preds[i]->fn = filter_pred_none;
+}
+
+int init_preds(struct ftrace_event_call *call)
+{
+	struct filter_pred *pred;
+	int i;
+
+	call->n_preds = 0;
+
+	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!call->preds)
+		return -ENOMEM;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+		if (!pred)
+			goto oom;
+		pred->fn = filter_pred_none;
+		call->preds[i] = pred;
+	}
+
+	return 0;
+
+oom:
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i])
 			filter_free_pred(call->preds[i]);
-		kfree(call->preds);
-		call->preds = NULL;
 	}
+	kfree(call->preds);
+	call->preds = NULL;
+
+	return -ENOMEM;
 }
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
@@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	struct ftrace_event_call *call = __start_ftrace_events;
 	int i;
 
-	if (system->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	if (system->n_preds) {
+		for (i = 0; i < system->n_preds; i++)
 			filter_free_pred(system->preds[i]);
 		kfree(system->preds);
 		system->preds = NULL;
+		system->n_preds = 0;
 	}
 
 	events_for_each(call) {
@@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_free_preds(call);
+			filter_disable_preds(call);
 	}
 }
 
 static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+			     struct filter_pred *pred,
+			     filter_pred_fn_t fn)
 {
-	int i;
+	int idx, err;
 
-	if (call->preds && !pred->compound)
-		filter_free_preds(call);
+	if (call->n_preds && !pred->compound)
+		filter_disable_preds(call);
 
-	if (!call->preds) {
-		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-				      GFP_KERNEL);
-		if (!call->preds)
-			return -ENOMEM;
-	}
+	if (call->n_preds == MAX_FILTER_PRED)
+		return -ENOSPC;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!call->preds[i]) {
-			call->preds[i] = pred;
-			return 0;
-		}
-	}
+	idx = call->n_preds;
+	filter_clear_pred(call->preds[idx]);
+	err = filter_set_pred(call->preds[idx], pred, fn);
+	if (err)
+		return err;
+
+	call->n_preds++;
 
-	return -ENOSPC;
+	return 0;
 }
 
 static int is_string_field(const char *type)
@@ -229,98 +277,66 @@ static int is_string_field(const char *type)
 int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
+	filter_pred_fn_t fn;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
 		return -EINVAL;
 
+	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_val)
+		if (!pred->str_len)
 			return -EINVAL;
-		pred->fn = filter_pred_string;
+		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred);
+		return __filter_add_pred(call, pred, fn);
 	} else {
-		if (pred->str_val)
+		if (pred->str_len)
 			return -EINVAL;
 	}
 
 	switch (field->size) {
 	case 8:
-		pred->fn = filter_pred_64;
+		fn = filter_pred_64;
 		break;
 	case 4:
-		pred->fn = filter_pred_32;
+		fn = filter_pred_32;
 		break;
 	case 2:
-		pred->fn = filter_pred_16;
+		fn = filter_pred_16;
 		break;
 	case 1:
-		pred->fn = filter_pred_8;
+		fn = filter_pred_8;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred);
-}
-
-static struct filter_pred *copy_pred(struct filter_pred *pred)
-{
-	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-	if (!new_pred)
-		return NULL;
-
-	memcpy(new_pred, pred, sizeof(*pred));
-
-	if (pred->field_name) {
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-		if (!new_pred->field_name) {
-			kfree(new_pred);
-			return NULL;
-		}
-	}
-
-	if (pred->str_val) {
-		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		if (!new_pred->str_val) {
-			filter_free_pred(new_pred);
-			return NULL;
-		}
-	}
-
-	return new_pred;
+	return __filter_add_pred(call, pred, fn);
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
-	struct filter_pred *event_pred;
-	int i;
 
-	if (system->preds && !pred->compound)
+	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
 
-	if (!system->preds) {
+	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 		if (!system->preds)
 			return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!system->preds[i]) {
-			system->preds[i] = pred;
-			break;
-		}
-	}
-
-	if (i == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
+	system->preds[system->n_preds] = pred;
+
 	events_for_each(call) {
 		int err;
 
@@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (!find_event_field(call, pred->field_name))
 			continue;
 
-		event_pred = copy_pred(pred);
-		if (!event_pred)
-			goto oom;
-
-		err = filter_add_pred(call, event_pred);
-		if (err)
-			filter_free_pred(event_pred);
-		if (err == -ENOMEM)
-			goto oom;
+		err = filter_add_pred(call, pred);
+		if (err == -ENOMEM) {
+			system->preds[system->n_preds] = NULL;
+			return err;
+		}
 	}
 
-	return 0;
+	system->n_preds++;
 
-oom:
-	system->preds[i] = NULL;
-	return -ENOMEM;
+	return 0;
 }
 
 int filter_parse(char **pbuf, struct filter_pred *pred)
@@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		}
 	}
 
-	if (!val_str) {
+	if (!val_str || !strlen(val_str)
+	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
 		pred->field_name = NULL;
 		return -EINVAL;
 	}
@@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 	if (!pred->field_name)
 		return -ENOMEM;
 
+	pred->str_len = 0;
 	pred->val = simple_strtoull(val_str, &tmp, 0);
 	if (tmp == val_str) {
-		pred->str_val = kstrdup(val_str, GFP_KERNEL);
-		if (!pred->str_val)
-			return -ENOMEM;
+		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
+		pred->str_len = strlen(val_str);
+		pred->str_val[pred->str_len] = '\0';
 	} else if (*tmp != '\0')
 		return -EINVAL;
 
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 02fb710..59cfd7d 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 
 #undef __array
 #define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
 				 sizeof(field.item));			\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index b2b2982..5bb1b7f 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void)				\
 		return -ENODEV;						\
 	event_##call.id = id;						\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 77c494f..48fc02f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static int ftrace_raw_init_event_##call(void)				\
 {									\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 
-- 
cgit v0.10.2


From 6e837fb152410e571a81aaadbd9884f0bc46a55e Mon Sep 17 00:00:00 2001
From: Etienne Basset <etienne.basset@numericable.fr>
Date: Wed, 8 Apr 2009 20:39:40 +0200
Subject: smack: implement logging V3

This patch creates auditing functions usable by LSM to audit security
events. It provides standard dumping of FS, NET, task etc ... events
(code borrowed from SELinux)
and provides 2 callbacks to define LSM specific auditing, which should be
flexible enough to convert SELinux too.

Signed-off-by: Etienne Basset <etienne.basset@numericable.fr>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
cked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
new file mode 100644
index 0000000..e461b2c
--- /dev/null
+++ b/include/linux/lsm_audit.h
@@ -0,0 +1,111 @@
+/*
+ * Common LSM logging functions
+ * Heavily borrowed from selinux/avc.h
+ *
+ * Author : Etienne BASSET  <etienne.basset@ensta.org>
+ *
+ * All credits to : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * All BUGS to : Etienne BASSET  <etienne.basset@ensta.org>
+ */
+#ifndef _LSM_COMMON_LOGGING_
+#define _LSM_COMMON_LOGGING_
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kdev_t.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/audit.h>
+#include <linux/in6.h>
+#include <linux/path.h>
+#include <linux/key.h>
+#include <linux/skbuff.h>
+#include <asm/system.h>
+
+
+/* Auxiliary data to use in generating the audit record. */
+struct common_audit_data {
+	char    type;
+#define LSM_AUDIT_DATA_FS      1
+#define LSM_AUDIT_DATA_NET     2
+#define LSM_AUDIT_DATA_CAP     3
+#define LSM_AUDIT_DATA_IPC     4
+#define LSM_AUDIT_DATA_TASK    5
+#define LSM_AUDIT_DATA_KEY     6
+	struct task_struct *tsk;
+	union 	{
+		struct {
+			struct path path;
+			struct inode *inode;
+		} fs;
+		struct {
+			int netif;
+			struct sock *sk;
+			u16 family;
+			__be16 dport;
+			__be16 sport;
+			union {
+				struct {
+					__be32 daddr;
+					__be32 saddr;
+				} v4;
+				struct {
+					struct in6_addr daddr;
+					struct in6_addr saddr;
+				} v6;
+			} fam;
+		} net;
+		int cap;
+		int ipc_id;
+		struct task_struct *tsk;
+#ifdef CONFIG_KEYS
+		struct {
+			key_serial_t key;
+			char *key_desc;
+		} key_struct;
+#endif
+	} u;
+	const char *function;
+	/* this union contains LSM specific data */
+	union {
+		/* SMACK data */
+		struct smack_audit_data {
+			char *subject;
+			char *object;
+			char *request;
+			int result;
+		} smack_audit_data;
+		/* SELinux data */
+		struct {
+			u32 ssid;
+			u32 tsid;
+			u16 tclass;
+			u32 requested;
+			u32 audited;
+			struct av_decision *avd;
+			int result;
+		} selinux_audit_data;
+	} lsm_priv;
+	/* these callback will be implemented by a specific LSM */
+	void (*lsm_pre_audit)(struct audit_buffer *, void *);
+	void (*lsm_post_audit)(struct audit_buffer *, void *);
+};
+
+#define v4info fam.v4
+#define v6info fam.v6
+
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto);
+
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto);
+
+/* Initialize an LSM audit data structure. */
+#define COMMON_AUDIT_DATA_INIT(_d, _t) \
+	{ memset((_d), 0, sizeof(struct common_audit_data)); \
+	 (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; }
+
+void common_lsm_audit(struct common_audit_data *a);
+
+#endif
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
new file mode 100644
index 0000000..94b8684
--- /dev/null
+++ b/security/lsm_audit.c
@@ -0,0 +1,386 @@
+/*
+ * common LSM auditing functions
+ *
+ * Based on code written for SELinux by :
+ *			Stephen Smalley, <sds@epoch.ncsc.mil>
+ * 			James Morris <jmorris@redhat.com>
+ * Author : Etienne Basset, <etienne.basset@ensta.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <linux/un.h>
+#include <net/af_unix.h>
+#include <linux/audit.h>
+#include <linux/ipv6.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/dccp.h>
+#include <linux/sctp.h>
+#include <linux/lsm_audit.h>
+
+/**
+ * ipv4_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto)
+{
+	int ret = 0;
+	struct iphdr *ih;
+
+	ih = ip_hdr(skb);
+	if (ih == NULL)
+		return -EINVAL;
+
+	ad->u.net.v4info.saddr = ih->saddr;
+	ad->u.net.v4info.daddr = ih->daddr;
+
+	if (proto)
+		*proto = ih->protocol;
+	/* non initial fragment */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		return 0;
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr *th = tcp_hdr(skb);
+		if (th == NULL)
+			break;
+
+		ad->u.net.sport = th->source;
+		ad->u.net.dport = th->dest;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *uh = udp_hdr(skb);
+		if (uh == NULL)
+			break;
+
+		ad->u.net.sport = uh->source;
+		ad->u.net.dport = uh->dest;
+		break;
+	}
+	case IPPROTO_DCCP: {
+		struct dccp_hdr *dh = dccp_hdr(skb);
+		if (dh == NULL)
+			break;
+
+		ad->u.net.sport = dh->dccph_sport;
+		ad->u.net.dport = dh->dccph_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		struct sctphdr *sh = sctp_hdr(skb);
+		if (sh == NULL)
+			break;
+		ad->u.net.sport = sh->source;
+		ad->u.net.dport = sh->dest;
+		break;
+	}
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * ipv6_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto)
+{
+	int offset, ret = 0;
+	struct ipv6hdr *ip6;
+	u8 nexthdr;
+
+	ip6 = ipv6_hdr(skb);
+	if (ip6 == NULL)
+		return -EINVAL;
+	ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr);
+	ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr);
+	ret = 0;
+	/* IPv6 can have several extension header before the Transport header
+	 * skip them */
+	offset = skb_network_offset(skb);
+	offset += sizeof(*ip6);
+	nexthdr = ip6->nexthdr;
+	offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+	if (offset < 0)
+		return 0;
+	if (proto)
+		*proto = nexthdr;
+	switch (nexthdr) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph, *th;
+
+		th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+		if (th == NULL)
+			break;
+
+		ad->u.net.sport = th->source;
+		ad->u.net.dport = th->dest;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr _udph, *uh;
+
+		uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+		if (uh == NULL)
+			break;
+
+		ad->u.net.sport = uh->source;
+		ad->u.net.dport = uh->dest;
+		break;
+	}
+	case IPPROTO_DCCP: {
+		struct dccp_hdr _dccph, *dh;
+
+		dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
+		if (dh == NULL)
+			break;
+
+		ad->u.net.sport = dh->dccph_sport;
+		ad->u.net.dport = dh->dccph_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		struct sctphdr _sctph, *sh;
+
+		sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
+		if (sh == NULL)
+			break;
+		ad->u.net.sport = sh->source;
+		ad->u.net.dport = sh->dest;
+		break;
+	}
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+#endif
+
+
+static inline void print_ipv6_addr(struct audit_buffer *ab,
+				   struct in6_addr *addr, __be16 port,
+				   char *name1, char *name2)
+{
+	if (!ipv6_addr_any(addr))
+		audit_log_format(ab, " %s=%pI6", name1, addr);
+	if (port)
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
+				   __be16 port, char *name1, char *name2)
+{
+	if (addr)
+		audit_log_format(ab, " %s=%pI4", name1, &addr);
+	if (port)
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+/**
+ * dump_common_audit_data - helper to dump common audit data
+ * @a : common audit data
+ *
+ */
+static void dump_common_audit_data(struct audit_buffer *ab,
+				   struct common_audit_data *a)
+{
+	struct inode *inode = NULL;
+	struct task_struct *tsk = current;
+
+	if (a->tsk)
+		tsk = a->tsk;
+	if (tsk && tsk->pid) {
+		audit_log_format(ab, " pid=%d comm=", tsk->pid);
+		audit_log_untrustedstring(ab, tsk->comm);
+	}
+
+	switch (a->type) {
+	case LSM_AUDIT_DATA_IPC:
+		audit_log_format(ab, " key=%d ", a->u.ipc_id);
+		break;
+	case LSM_AUDIT_DATA_CAP:
+		audit_log_format(ab, " capability=%d ", a->u.cap);
+		break;
+	case LSM_AUDIT_DATA_FS:
+		if (a->u.fs.path.dentry) {
+			struct dentry *dentry = a->u.fs.path.dentry;
+			if (a->u.fs.path.mnt) {
+				audit_log_d_path(ab, "path=", &a->u.fs.path);
+			} else {
+				audit_log_format(ab, " name=");
+				audit_log_untrustedstring(ab,
+						 dentry->d_name.name);
+			}
+			inode = dentry->d_inode;
+		} else if (a->u.fs.inode) {
+			struct dentry *dentry;
+			inode = a->u.fs.inode;
+			dentry = d_find_alias(inode);
+			if (dentry) {
+				audit_log_format(ab, " name=");
+				audit_log_untrustedstring(ab,
+						 dentry->d_name.name);
+				dput(dentry);
+			}
+		}
+		if (inode)
+			audit_log_format(ab, " dev=%s ino=%lu",
+					inode->i_sb->s_id,
+					inode->i_ino);
+		break;
+	case LSM_AUDIT_DATA_TASK:
+		tsk = a->u.tsk;
+		if (tsk && tsk->pid) {
+			audit_log_format(ab, " pid=%d comm=", tsk->pid);
+			audit_log_untrustedstring(ab, tsk->comm);
+		}
+		break;
+	case LSM_AUDIT_DATA_NET:
+		if (a->u.net.sk) {
+			struct sock *sk = a->u.net.sk;
+			struct unix_sock *u;
+			int len = 0;
+			char *p = NULL;
+
+			switch (sk->sk_family) {
+			case AF_INET: {
+				struct inet_sock *inet = inet_sk(sk);
+
+				print_ipv4_addr(ab, inet->rcv_saddr,
+						inet->sport,
+						"laddr", "lport");
+				print_ipv4_addr(ab, inet->daddr,
+						inet->dport,
+						"faddr", "fport");
+				break;
+			}
+			case AF_INET6: {
+				struct inet_sock *inet = inet_sk(sk);
+				struct ipv6_pinfo *inet6 = inet6_sk(sk);
+
+				print_ipv6_addr(ab, &inet6->rcv_saddr,
+						inet->sport,
+						"laddr", "lport");
+				print_ipv6_addr(ab, &inet6->daddr,
+						inet->dport,
+						"faddr", "fport");
+				break;
+			}
+			case AF_UNIX:
+				u = unix_sk(sk);
+				if (u->dentry) {
+					struct path path = {
+						.dentry = u->dentry,
+						.mnt = u->mnt
+					};
+					audit_log_d_path(ab, "path=", &path);
+					break;
+				}
+				if (!u->addr)
+					break;
+				len = u->addr->len-sizeof(short);
+				p = &u->addr->name->sun_path[0];
+				audit_log_format(ab, " path=");
+				if (*p)
+					audit_log_untrustedstring(ab, p);
+				else
+					audit_log_n_hex(ab, p, len);
+				break;
+			}
+		}
+
+		switch (a->u.net.family) {
+		case AF_INET:
+			print_ipv4_addr(ab, a->u.net.v4info.saddr,
+					a->u.net.sport,
+					"saddr", "src");
+			print_ipv4_addr(ab, a->u.net.v4info.daddr,
+					a->u.net.dport,
+					"daddr", "dest");
+			break;
+		case AF_INET6:
+			print_ipv6_addr(ab, &a->u.net.v6info.saddr,
+					a->u.net.sport,
+					"saddr", "src");
+			print_ipv6_addr(ab, &a->u.net.v6info.daddr,
+					a->u.net.dport,
+					"daddr", "dest");
+			break;
+		}
+		if (a->u.net.netif > 0) {
+			struct net_device *dev;
+
+			/* NOTE: we always use init's namespace */
+			dev = dev_get_by_index(&init_net, a->u.net.netif);
+			if (dev) {
+				audit_log_format(ab, " netif=%s", dev->name);
+				dev_put(dev);
+			}
+		}
+		break;
+#ifdef CONFIG_KEYS
+	case LSM_AUDIT_DATA_KEY:
+		audit_log_format(ab, " key_serial=%u", a->u.key_struct.key);
+		if (a->u.key_struct.key_desc) {
+			audit_log_format(ab, " key_desc=");
+			audit_log_untrustedstring(ab, a->u.key_struct.key_desc);
+		}
+		break;
+#endif
+	} /* switch (a->type) */
+}
+
+/**
+ * common_lsm_audit - generic LSM auditing function
+ * @a:  auxiliary audit data
+ *
+ * setup the audit buffer for common security information
+ * uses callback to print LSM specific information
+ */
+void common_lsm_audit(struct common_audit_data *a)
+{
+	struct audit_buffer *ab;
+
+	if (a == NULL)
+		return;
+	/* we use GFP_ATOMIC so we won't sleep */
+	ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC);
+
+	if (ab == NULL)
+		return;
+
+	if (a->lsm_pre_audit)
+		a->lsm_pre_audit(ab, a);
+
+	dump_common_audit_data(ab, a);
+
+	if (a->lsm_post_audit)
+		a->lsm_post_audit(ab, a);
+
+	audit_log_end(ab);
+}
-- 
cgit v0.10.2


From ecfcc53fef3c357574bb6143dce6631e6d56295c Mon Sep 17 00:00:00 2001
From: Etienne Basset <etienne.basset@numericable.fr>
Date: Wed, 8 Apr 2009 20:40:06 +0200
Subject: smack: implement logging V3

the following patch, add logging of Smack security decisions.
This is of course very useful to understand what your current smack policy does.
As suggested by Casey, it also now forbids labels with ', " or \

It introduces a '/smack/logging' switch :
0: no logging
1: log denied (default)
2: log accepted
3: log denied&accepted

Signed-off-by: Etienne Basset <etienne.basset@numericable.fr>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 629c92e..34614b4 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything
 other than a letter or digit, are reserved for use by the Smack development
 team. Smack labels are unstructured, case sensitive, and the only operation
 ever performed on them is comparison for equality. Smack labels cannot
-contain unprintable characters or the "/" (slash) character. Smack labels
-cannot begin with a '-', which is reserved for special options.
+contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
+(quote) and '"' (double-quote) characters.
+Smack labels cannot begin with a '-', which is reserved for special options.
 
 There are some predefined labels:
 
@@ -523,3 +524,18 @@ Smack supports some mount options:
 
 These mount options apply to all file system types.
 
+Smack auditing
+
+If you want Smack auditing of security events, you need to set CONFIG_AUDIT
+in your kernel configuration.
+By default, all denied events will be audited. You can change this behavior by
+writing a single character to the /smack/logging file :
+0 : no logging
+1 : log denied (default)
+2 : log accepted
+3 : log denied & accepted
+
+Events are logged as 'key=value' pairs, for each event you at least will get
+the subjet, the object, the rights requested, the action, the kernel function
+that triggered the event, plus other pairs depending on the type of event
+audited.
diff --git a/security/Makefile b/security/Makefile
index fa77021..c67557c 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -16,6 +16,9 @@ obj-$(CONFIG_SECURITYFS)		+= inode.o
 # Must precede capability.o in order to stack properly.
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/built-in.o
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/built-in.o
+ifeq ($(CONFIG_AUDIT),y)
+obj-$(CONFIG_SECURITY_SMACK)		+= lsm_audit.o
+endif
 obj-$(CONFIG_SECURITY_TOMOYO)		+= tomoyo/built-in.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)		+= root_plug.o
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
diff --git a/security/smack/smack.h b/security/smack/smack.h
index 42ef313..243bec1 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -20,6 +20,7 @@
 #include <net/netlabel.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
+#include <linux/lsm_audit.h>
 
 /*
  * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is
@@ -179,6 +180,20 @@ struct smack_known {
 #define MAY_NOT		0
 
 /*
+ * Number of access types used by Smack (rwxa)
+ */
+#define SMK_NUM_ACCESS_TYPE 4
+
+/*
+ * Smack audit data; is empty if CONFIG_AUDIT not set
+ * to save some stack
+ */
+struct smk_audit_info {
+#ifdef CONFIG_AUDIT
+	struct common_audit_data a;
+#endif
+};
+/*
  * These functions are in smack_lsm.c
  */
 struct inode_smack *new_inode_smack(char *);
@@ -186,8 +201,8 @@ struct inode_smack *new_inode_smack(char *);
 /*
  * These functions are in smack_access.c
  */
-int smk_access(char *, char *, int);
-int smk_curacc(char *, u32);
+int smk_access(char *, char *, int, struct smk_audit_info *);
+int smk_curacc(char *, u32, struct smk_audit_info *);
 int smack_to_cipso(const char *, struct smack_cipso *);
 void smack_from_cipso(u32, char *, char *);
 char *smack_from_secid(const u32);
@@ -237,4 +252,93 @@ static inline char *smk_of_inode(const struct inode *isp)
 	return sip->smk_inode;
 }
 
+/*
+ * logging functions
+ */
+#define SMACK_AUDIT_DENIED 0x1
+#define SMACK_AUDIT_ACCEPT 0x2
+extern int log_policy;
+
+void smack_log(char *subject_label, char *object_label,
+		int request,
+		int result, struct smk_audit_info *auditdata);
+
+#ifdef CONFIG_AUDIT
+
+/*
+ * some inline functions to set up audit data
+ * they do nothing if CONFIG_AUDIT is not set
+ *
+ */
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+			       char type)
+{
+	memset(a, 0, sizeof(*a));
+	a->a.type = type;
+	a->a.function = func;
+}
+
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+					 struct task_struct *t)
+{
+	a->a.u.tsk = t;
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+						    struct dentry *d)
+{
+	a->a.u.fs.path.dentry = d;
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+						 struct vfsmount *m)
+{
+	a->a.u.fs.path.mnt = m;
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+					      struct inode *i)
+{
+	a->a.u.fs.inode = i;
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+					     struct path p)
+{
+	a->a.u.fs.path = p;
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+					    struct sock *sk)
+{
+	a->a.u.net.sk = sk;
+}
+
+#else /* no AUDIT */
+
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+			       char type)
+{
+}
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+					 struct task_struct *t)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+						    struct dentry *d)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+						 struct vfsmount *m)
+{
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+					      struct inode *i)
+{
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+					     struct path p)
+{
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+					    struct sock *sk)
+{
+}
+#endif
+
 #endif  /* _SECURITY_SMACK_H */
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index ac0a270..513dc1a 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -59,11 +59,18 @@ LIST_HEAD(smack_known_list);
  */
 static u32 smack_next_secid = 10;
 
+/*
+ * what events do we log
+ * can be overwritten at run-time by /smack/logging
+ */
+int log_policy = SMACK_AUDIT_DENIED;
+
 /**
  * smk_access - determine if a subject has a specific access to an object
  * @subject_label: a pointer to the subject's Smack label
  * @object_label: a pointer to the object's Smack label
  * @request: the access requested, in "MAY" format
+ * @a : a pointer to the audit data
  *
  * This function looks up the subject/object pair in the
  * access rule list and returns 0 if the access is permitted,
@@ -78,10 +85,12 @@ static u32 smack_next_secid = 10;
  * will be on the list, so checking the pointers may be a worthwhile
  * optimization.
  */
-int smk_access(char *subject_label, char *object_label, int request)
+int smk_access(char *subject_label, char *object_label, int request,
+	       struct smk_audit_info *a)
 {
 	u32 may = MAY_NOT;
 	struct smack_rule *srp;
+	int rc = 0;
 
 	/*
 	 * Hardcoded comparisons.
@@ -89,8 +98,10 @@ int smk_access(char *subject_label, char *object_label, int request)
 	 * A star subject can't access any object.
 	 */
 	if (subject_label == smack_known_star.smk_known ||
-	    strcmp(subject_label, smack_known_star.smk_known) == 0)
-		return -EACCES;
+	    strcmp(subject_label, smack_known_star.smk_known) == 0) {
+		rc = -EACCES;
+		goto out_audit;
+	}
 	/*
 	 * An internet object can be accessed by any subject.
 	 * Tasks cannot be assigned the internet label.
@@ -100,20 +111,20 @@ int smk_access(char *subject_label, char *object_label, int request)
 	    subject_label == smack_known_web.smk_known ||
 	    strcmp(object_label, smack_known_web.smk_known) == 0 ||
 	    strcmp(subject_label, smack_known_web.smk_known) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * A star object can be accessed by any subject.
 	 */
 	if (object_label == smack_known_star.smk_known ||
 	    strcmp(object_label, smack_known_star.smk_known) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * An object can be accessed in any way by a subject
 	 * with the same label.
 	 */
 	if (subject_label == object_label ||
 	    strcmp(subject_label, object_label) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * A hat subject can read any object.
 	 * A floor object can be read by any subject.
@@ -121,10 +132,10 @@ int smk_access(char *subject_label, char *object_label, int request)
 	if ((request & MAY_ANYREAD) == request) {
 		if (object_label == smack_known_floor.smk_known ||
 		    strcmp(object_label, smack_known_floor.smk_known) == 0)
-			return 0;
+			goto out_audit;
 		if (subject_label == smack_known_hat.smk_known ||
 		    strcmp(subject_label, smack_known_hat.smk_known) == 0)
-			return 0;
+			goto out_audit;
 	}
 	/*
 	 * Beyond here an explicit relationship is required.
@@ -148,28 +159,36 @@ int smk_access(char *subject_label, char *object_label, int request)
 	 * This is a bit map operation.
 	 */
 	if ((request & may) == request)
-		return 0;
-
-	return -EACCES;
+		goto out_audit;
+
+	rc = -EACCES;
+out_audit:
+#ifdef CONFIG_AUDIT
+	if (a)
+		smack_log(subject_label, object_label, request, rc, a);
+#endif
+	return rc;
 }
 
 /**
  * smk_curacc - determine if current has a specific access to an object
  * @obj_label: a pointer to the object's Smack label
  * @mode: the access requested, in "MAY" format
+ * @a : common audit data
  *
  * This function checks the current subject label/object label pair
  * in the access rule list and returns 0 if the access is permitted,
  * non zero otherwise. It allows that current may have the capability
  * to override the rules.
  */
-int smk_curacc(char *obj_label, u32 mode)
+int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a)
 {
 	int rc;
+	char *sp = current_security();
 
-	rc = smk_access(current_security(), obj_label, mode);
+	rc = smk_access(sp, obj_label, mode, NULL);
 	if (rc == 0)
-		return 0;
+		goto out_audit;
 
 	/*
 	 * Return if a specific label has been designated as the
@@ -177,14 +196,105 @@ int smk_curacc(char *obj_label, u32 mode)
 	 * have that label.
 	 */
 	if (smack_onlycap != NULL && smack_onlycap != current->cred->security)
-		return rc;
+		goto out_audit;
 
 	if (capable(CAP_MAC_OVERRIDE))
 		return 0;
 
+out_audit:
+#ifdef CONFIG_AUDIT
+	if (a)
+		smack_log(sp, obj_label, mode, rc, a);
+#endif
 	return rc;
 }
 
+#ifdef CONFIG_AUDIT
+/**
+ * smack_str_from_perm : helper to transalate an int to a
+ * readable string
+ * @string : the string to fill
+ * @access : the int
+ *
+ */
+static inline void smack_str_from_perm(char *string, int access)
+{
+	int i = 0;
+	if (access & MAY_READ)
+		string[i++] = 'r';
+	if (access & MAY_WRITE)
+		string[i++] = 'w';
+	if (access & MAY_EXEC)
+		string[i++] = 'x';
+	if (access & MAY_APPEND)
+		string[i++] = 'a';
+	string[i] = '\0';
+}
+/**
+ * smack_log_callback - SMACK specific information
+ * will be called by generic audit code
+ * @ab : the audit_buffer
+ * @a  : audit_data
+ *
+ */
+static void smack_log_callback(struct audit_buffer *ab, void *a)
+{
+	struct common_audit_data *ad = a;
+	struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data;
+	audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function,
+			 sad->result ? "denied" : "granted");
+	audit_log_format(ab, " subject=");
+	audit_log_untrustedstring(ab, sad->subject);
+	audit_log_format(ab, " object=");
+	audit_log_untrustedstring(ab, sad->object);
+	audit_log_format(ab, " requested=%s", sad->request);
+}
+
+/**
+ *  smack_log - Audit the granting or denial of permissions.
+ *  @subject_label : smack label of the requester
+ *  @object_label  : smack label of the object being accessed
+ *  @request: requested permissions
+ *  @result: result from smk_access
+ *  @a:  auxiliary audit data
+ *
+ * Audit the granting or denial of permissions in accordance
+ * with the policy.
+ */
+void smack_log(char *subject_label, char *object_label, int request,
+	       int result, struct smk_audit_info *ad)
+{
+	char request_buffer[SMK_NUM_ACCESS_TYPE + 1];
+	struct smack_audit_data *sad;
+	struct common_audit_data *a = &ad->a;
+
+	/* check if we have to log the current event */
+	if (result != 0 && (log_policy & SMACK_AUDIT_DENIED) == 0)
+		return;
+	if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0)
+		return;
+
+	if (a->function == NULL)
+		a->function = "unknown";
+
+	/* end preparing the audit data */
+	sad = &a->lsm_priv.smack_audit_data;
+	smack_str_from_perm(request_buffer, request);
+	sad->subject = subject_label;
+	sad->object  = object_label;
+	sad->request = request_buffer;
+	sad->result  = result;
+	a->lsm_pre_audit = smack_log_callback;
+
+	common_lsm_audit(a);
+}
+#else /* #ifdef CONFIG_AUDIT */
+void smack_log(char *subject_label, char *object_label, int request,
+               int result, struct smk_audit_info *ad)
+{
+}
+#endif
+
 static DEFINE_MUTEX(smack_known_lock);
 
 /**
@@ -209,7 +319,8 @@ struct smack_known *smk_import_entry(const char *string, int len)
 		if (found)
 			smack[i] = '\0';
 		else if (i >= len || string[i] > '~' || string[i] <= ' ' ||
-			 string[i] == '/') {
+			 string[i] == '/' || string[i] == '"' ||
+			 string[i] == '\\' || string[i] == '\'') {
 			smack[i] = '\0';
 			found = 1;
 		} else
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 9215149..f557767 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -30,7 +30,6 @@
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <linux/audit.h>
-
 #include "smack.h"
 
 #define task_security(task)	(task_cred_xxx((task), security))
@@ -103,14 +102,24 @@ struct inode_smack *new_inode_smack(char *smack)
 static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 {
 	int rc;
+	struct smk_audit_info ad;
+	char *sp, *tsp;
 
 	rc = cap_ptrace_may_access(ctp, mode);
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE);
+	sp = current_security();
+	tsp = task_security(ctp);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, ctp);
+
+	/* we won't log here, because rc can be overriden */
+	rc = smk_access(sp, tsp, MAY_READWRITE, NULL);
 	if (rc != 0 && capable(CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smack_log(sp, tsp, MAY_READWRITE, rc, &ad);
 	return rc;
 }
 
@@ -125,14 +134,24 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 static int smack_ptrace_traceme(struct task_struct *ptp)
 {
 	int rc;
+	struct smk_audit_info ad;
+	char *sp, *tsp;
 
 	rc = cap_ptrace_traceme(ptp);
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, ptp);
+
+	sp = current_security();
+	tsp = task_security(ptp);
+	/* we won't log here, because rc can be overriden */
+	rc = smk_access(tsp, sp, MAY_READWRITE, NULL);
 	if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smack_log(tsp, sp, MAY_READWRITE, rc, &ad);
 	return rc;
 }
 
@@ -327,8 +346,14 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 static int smack_sb_statfs(struct dentry *dentry)
 {
 	struct superblock_smack *sbp = dentry->d_sb->s_security;
+	int rc;
+	struct smk_audit_info ad;
 
-	return smk_curacc(sbp->smk_floor, MAY_READ);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
+	rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad);
+	return rc;
 }
 
 /**
@@ -346,8 +371,12 @@ static int smack_sb_mount(char *dev_name, struct path *path,
 			  char *type, unsigned long flags, void *data)
 {
 	struct superblock_smack *sbp = path->mnt->mnt_sb->s_security;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, *path);
 
-	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+	return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /**
@@ -361,10 +390,14 @@ static int smack_sb_mount(char *dev_name, struct path *path,
 static int smack_sb_umount(struct vfsmount *mnt, int flags)
 {
 	struct superblock_smack *sbp;
+	struct smk_audit_info ad;
 
-	sbp = mnt->mnt_sb->s_security;
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, mnt->mnt_mountpoint);
+	smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
 
-	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+	sbp = mnt->mnt_sb->s_security;
+	return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /*
@@ -441,15 +474,20 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir,
 static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 			    struct dentry *new_dentry)
 {
-	int rc;
 	char *isp;
+	struct smk_audit_info ad;
+	int rc;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
 	isp = smk_of_inode(old_dentry->d_inode);
-	rc = smk_curacc(isp, MAY_WRITE);
+	rc = smk_curacc(isp, MAY_WRITE, &ad);
 
 	if (rc == 0 && new_dentry->d_inode != NULL) {
 		isp = smk_of_inode(new_dentry->d_inode);
-		rc = smk_curacc(isp, MAY_WRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+		rc = smk_curacc(isp, MAY_WRITE, &ad);
 	}
 
 	return rc;
@@ -466,18 +504,24 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *ip = dentry->d_inode;
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	/*
 	 * You need write access to the thing you're unlinking
 	 */
-	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE);
-	if (rc == 0)
+	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE, &ad);
+	if (rc == 0) {
 		/*
 		 * You also need write access to the containing directory
 		 */
-		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
-
+		smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+		smk_ad_setfield_u_fs_inode(&ad, dir);
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+	}
 	return rc;
 }
 
@@ -491,17 +535,24 @@ static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
  */
 static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	/*
 	 * You need write access to the thing you're removing
 	 */
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
-	if (rc == 0)
+	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
+	if (rc == 0) {
 		/*
 		 * You also need write access to the containing directory
 		 */
-		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+		smk_ad_setfield_u_fs_inode(&ad, dir);
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+	}
 
 	return rc;
 }
@@ -525,15 +576,19 @@ static int smack_inode_rename(struct inode *old_inode,
 {
 	int rc;
 	char *isp;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
 	isp = smk_of_inode(old_dentry->d_inode);
-	rc = smk_curacc(isp, MAY_READWRITE);
+	rc = smk_curacc(isp, MAY_READWRITE, &ad);
 
 	if (rc == 0 && new_dentry->d_inode != NULL) {
 		isp = smk_of_inode(new_dentry->d_inode);
-		rc = smk_curacc(isp, MAY_READWRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+		rc = smk_curacc(isp, MAY_READWRITE, &ad);
 	}
-
 	return rc;
 }
 
@@ -548,13 +603,15 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
+	struct smk_audit_info ad;
 	/*
 	 * No permission to check. Existence test. Yup, it's there.
 	 */
 	if (mask == 0)
 		return 0;
-
-	return smk_curacc(smk_of_inode(inode), mask);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_inode(&ad, inode);
+	return smk_curacc(smk_of_inode(inode), mask, &ad);
 }
 
 /**
@@ -566,13 +623,16 @@ static int smack_inode_permission(struct inode *inode, int mask)
  */
 static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 {
+	struct smk_audit_info ad;
 	/*
 	 * Need to allow for clearing the setuid bit.
 	 */
 	if (iattr->ia_valid & ATTR_FORCE)
 		return 0;
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 }
 
 /**
@@ -584,7 +644,12 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
  */
 static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 {
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+	smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /**
@@ -602,6 +667,7 @@ static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 static int smack_inode_setxattr(struct dentry *dentry, const char *name,
 				const void *value, size_t size, int flags)
 {
+	struct smk_audit_info ad;
 	int rc = 0;
 
 	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -615,8 +681,11 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name,
 	} else
 		rc = cap_inode_setxattr(dentry, name, value, size, flags);
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	if (rc == 0)
-		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
 	return rc;
 }
@@ -671,7 +740,12 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
  */
 static int smack_inode_getxattr(struct dentry *dentry, const char *name)
 {
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /*
@@ -685,6 +759,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
  */
 static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 {
+	struct smk_audit_info ad;
 	int rc = 0;
 
 	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -695,8 +770,10 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 	} else
 		rc = cap_inode_removexattr(dentry, name);
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 	if (rc == 0)
-		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
 	return rc;
 }
@@ -855,12 +932,16 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
 	int rc = 0;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
 
 	if (_IOC_DIR(cmd) & _IOC_WRITE)
-		rc = smk_curacc(file->f_security, MAY_WRITE);
+		rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
 
 	if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ))
-		rc = smk_curacc(file->f_security, MAY_READ);
+		rc = smk_curacc(file->f_security, MAY_READ, &ad);
 
 	return rc;
 }
@@ -874,7 +955,11 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
  */
 static int smack_file_lock(struct file *file, unsigned int cmd)
 {
-	return smk_curacc(file->f_security, MAY_WRITE);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, file->f_path.dentry);
+	return smk_curacc(file->f_security, MAY_WRITE, &ad);
 }
 
 /**
@@ -888,8 +973,12 @@ static int smack_file_lock(struct file *file, unsigned int cmd)
 static int smack_file_fcntl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
+
 	switch (cmd) {
 	case F_DUPFD:
 	case F_GETFD:
@@ -897,7 +986,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 	case F_GETLK:
 	case F_GETOWN:
 	case F_GETSIG:
-		rc = smk_curacc(file->f_security, MAY_READ);
+		rc = smk_curacc(file->f_security, MAY_READ, &ad);
 		break;
 	case F_SETFD:
 	case F_SETFL:
@@ -905,10 +994,10 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 	case F_SETLKW:
 	case F_SETOWN:
 	case F_SETSIG:
-		rc = smk_curacc(file->f_security, MAY_WRITE);
+		rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
 		break;
 	default:
-		rc = smk_curacc(file->f_security, MAY_READWRITE);
+		rc = smk_curacc(file->f_security, MAY_READWRITE, &ad);
 	}
 
 	return rc;
@@ -943,14 +1032,21 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 {
 	struct file *file;
 	int rc;
+	char *tsp = tsk->cred->security;
+	struct smk_audit_info ad;
 
 	/*
 	 * struct fown_struct is never outside the context of a struct file
 	 */
 	file = container_of(fown, struct file, f_owner);
-	rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE);
+	/* we don't log here as rc can be overriden */
+	rc = smk_access(file->f_security, tsp, MAY_WRITE, NULL);
 	if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, tsk);
+	smack_log(file->f_security, tsp, MAY_WRITE, rc, &ad);
 	return rc;
 }
 
@@ -963,7 +1059,10 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 static int smack_file_receive(struct file *file)
 {
 	int may = 0;
+	struct smk_audit_info ad;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
 	/*
 	 * This code relies on bitmasks.
 	 */
@@ -972,7 +1071,7 @@ static int smack_file_receive(struct file *file)
 	if (file->f_mode & FMODE_WRITE)
 		may |= MAY_WRITE;
 
-	return smk_curacc(file->f_security, may);
+	return smk_curacc(file->f_security, may, &ad);
 }
 
 /*
@@ -1052,6 +1151,22 @@ static int smack_kernel_create_files_as(struct cred *new,
 }
 
 /**
+ * smk_curacc_on_task - helper to log task related access
+ * @p: the task object
+ * @access : the access requested
+ *
+ * Return 0 if access is permitted
+ */
+static int smk_curacc_on_task(struct task_struct *p, int access)
+{
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
+	return smk_curacc(task_security(p), access, &ad);
+}
+
+/**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
  * @pgid: unused
@@ -1060,7 +1175,7 @@ static int smack_kernel_create_files_as(struct cred *new,
  */
 static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-	return smk_curacc(task_security(p), MAY_WRITE);
+	return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1071,7 +1186,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
  */
 static int smack_task_getpgid(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1082,7 +1197,7 @@ static int smack_task_getpgid(struct task_struct *p)
  */
 static int smack_task_getsid(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1110,7 +1225,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
 
 	rc = cap_task_setnice(p, nice);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1127,7 +1242,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
 
 	rc = cap_task_setioprio(p, ioprio);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1139,7 +1254,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
  */
 static int smack_task_getioprio(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1157,7 +1272,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
 
 	rc = cap_task_setscheduler(p, policy, lp);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1169,7 +1284,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
  */
 static int smack_task_getscheduler(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1180,7 +1295,7 @@ static int smack_task_getscheduler(struct task_struct *p)
  */
 static int smack_task_movememory(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_WRITE);
+	return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1198,18 +1313,23 @@ static int smack_task_movememory(struct task_struct *p)
 static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 			   int sig, u32 secid)
 {
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
 	/*
 	 * Sending a signal requires that the sender
 	 * can write the receiver.
 	 */
 	if (secid == 0)
-		return smk_curacc(task_security(p), MAY_WRITE);
+		return smk_curacc(task_security(p), MAY_WRITE, &ad);
 	/*
 	 * If the secid isn't 0 we're dealing with some USB IO
 	 * specific behavior. This is not clean. For one thing
 	 * we can't take privilege into account.
 	 */
-	return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE);
+	return smk_access(smack_from_secid(secid), task_security(p),
+			  MAY_WRITE, &ad);
 }
 
 /**
@@ -1220,11 +1340,15 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
  */
 static int smack_task_wait(struct task_struct *p)
 {
+	struct smk_audit_info ad;
+	char *sp = current_security();
+	char *tsp = task_security(p);
 	int rc;
 
-	rc = smk_access(current_security(), task_security(p), MAY_WRITE);
+	/* we don't log here, we can be overriden */
+	rc = smk_access(sp, tsp, MAY_WRITE, NULL);
 	if (rc == 0)
-		return 0;
+		goto out_log;
 
 	/*
 	 * Allow the operation to succeed if either task
@@ -1238,8 +1362,12 @@ static int smack_task_wait(struct task_struct *p)
 	 * the smack value.
 	 */
 	if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE))
-		return 0;
-
+		rc = 0;
+	/* we log only if we didn't get overriden */
+ out_log:
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
+	smack_log(sp, tsp, MAY_WRITE, rc, &ad);
 	return rc;
 }
 
@@ -1455,12 +1583,19 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
 	int sk_lbl;
 	char *hostsp;
 	struct socket_smack *ssp = sk->sk_security;
+	struct smk_audit_info ad;
 
 	rcu_read_lock();
 	hostsp = smack_host_label(sap);
 	if (hostsp != NULL) {
 		sk_lbl = SMACK_UNLABELED_SOCKET;
-		rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE);
+#ifdef CONFIG_AUDIT
+		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+		ad.a.u.net.family = sap->sin_family;
+		ad.a.u.net.dport = sap->sin_port;
+		ad.a.u.net.v4info.daddr = sap->sin_addr.s_addr;
+#endif
+		rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE, &ad);
 	} else {
 		sk_lbl = SMACK_CIPSO_SOCKET;
 		rc = 0;
@@ -1656,6 +1791,25 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
 }
 
 /**
+ * smk_curacc_shm : check if current has access on shm
+ * @shp : the object
+ * @access : access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_shm(struct shmid_kernel *shp, int access)
+{
+	char *ssp = smack_of_shm(shp);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = shp->shm_perm.id;
+#endif
+	return smk_curacc(ssp, access, &ad);
+}
+
+/**
  * smack_shm_associate - Smack access check for shm
  * @shp: the object
  * @shmflg: access requested
@@ -1664,11 +1818,10 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
  */
 static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
 {
-	char *ssp = smack_of_shm(shp);
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1680,7 +1833,6 @@ static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
  */
 static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 {
-	char *ssp;
 	int may;
 
 	switch (cmd) {
@@ -1703,9 +1855,7 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 	default:
 		return -EINVAL;
 	}
-
-	ssp = smack_of_shm(shp);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1719,11 +1869,10 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr,
 			   int shmflg)
 {
-	char *ssp = smack_of_shm(shp);
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1765,6 +1914,25 @@ static void smack_sem_free_security(struct sem_array *sma)
 }
 
 /**
+ * smk_curacc_sem : check if current has access on sem
+ * @sma : the object
+ * @access : access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_sem(struct sem_array *sma, int access)
+{
+	char *ssp = smack_of_sem(sma);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = sma->sem_perm.id;
+#endif
+	return smk_curacc(ssp, access, &ad);
+}
+
+/**
  * smack_sem_associate - Smack access check for sem
  * @sma: the object
  * @semflg: access requested
@@ -1773,11 +1941,10 @@ static void smack_sem_free_security(struct sem_array *sma)
  */
 static int smack_sem_associate(struct sem_array *sma, int semflg)
 {
-	char *ssp = smack_of_sem(sma);
 	int may;
 
 	may = smack_flags_to_may(semflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1789,7 +1956,6 @@ static int smack_sem_associate(struct sem_array *sma, int semflg)
  */
 static int smack_sem_semctl(struct sem_array *sma, int cmd)
 {
-	char *ssp;
 	int may;
 
 	switch (cmd) {
@@ -1818,8 +1984,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
 		return -EINVAL;
 	}
 
-	ssp = smack_of_sem(sma);
-	return smk_curacc(ssp, may);
+	return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1836,9 +2001,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
 static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops,
 			   unsigned nsops, int alter)
 {
-	char *ssp = smack_of_sem(sma);
-
-	return smk_curacc(ssp, MAY_READWRITE);
+	return smk_curacc_sem(sma, MAY_READWRITE);
 }
 
 /**
@@ -1880,6 +2043,25 @@ static char *smack_of_msq(struct msg_queue *msq)
 }
 
 /**
+ * smk_curacc_msq : helper to check if current has access on msq
+ * @msq : the msq
+ * @access : access requested
+ *
+ * return 0 if current has access, error otherwise
+ */
+static int smk_curacc_msq(struct msg_queue *msq, int access)
+{
+	char *msp = smack_of_msq(msq);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = msq->q_perm.id;
+#endif
+	return smk_curacc(msp, access, &ad);
+}
+
+/**
  * smack_msg_queue_associate - Smack access check for msg_queue
  * @msq: the object
  * @msqflg: access requested
@@ -1888,11 +2070,10 @@ static char *smack_of_msq(struct msg_queue *msq)
  */
 static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
 {
-	char *msp = smack_of_msq(msq);
 	int may;
 
 	may = smack_flags_to_may(msqflg);
-	return smk_curacc(msp, may);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1904,7 +2085,6 @@ static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
  */
 static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 {
-	char *msp;
 	int may;
 
 	switch (cmd) {
@@ -1926,8 +2106,7 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 		return -EINVAL;
 	}
 
-	msp = smack_of_msq(msq);
-	return smk_curacc(msp, may);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1941,11 +2120,10 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 				  int msqflg)
 {
-	char *msp = smack_of_msq(msq);
-	int rc;
+	int may;
 
-	rc = smack_flags_to_may(msqflg);
-	return smk_curacc(msp, rc);
+	may = smack_flags_to_may(msqflg);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1961,9 +2139,7 @@ static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 			struct task_struct *target, long type, int mode)
 {
-	char *msp = smack_of_msq(msq);
-
-	return smk_curacc(msp, MAY_READWRITE);
+	return smk_curacc_msq(msq, MAY_READWRITE);
 }
 
 /**
@@ -1976,10 +2152,14 @@ static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 {
 	char *isp = ipp->security;
-	int may;
+	int may = smack_flags_to_may(flag);
+	struct smk_audit_info ad;
 
-	may = smack_flags_to_may(flag);
-	return smk_curacc(isp, may);
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = ipp->id;
+#endif
+	return smk_curacc(isp, may, &ad);
 }
 
 /**
@@ -2238,8 +2418,12 @@ static int smack_unix_stream_connect(struct socket *sock,
 {
 	struct inode *sp = SOCK_INODE(sock);
 	struct inode *op = SOCK_INODE(other);
+	struct smk_audit_info ad;
 
-	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	smk_ad_setfield_u_net_sk(&ad, other->sk);
+	return smk_access(smk_of_inode(sp), smk_of_inode(op),
+				 MAY_READWRITE, &ad);
 }
 
 /**
@@ -2254,8 +2438,11 @@ static int smack_unix_may_send(struct socket *sock, struct socket *other)
 {
 	struct inode *sp = SOCK_INODE(sock);
 	struct inode *op = SOCK_INODE(other);
+	struct smk_audit_info ad;
 
-	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	smk_ad_setfield_u_net_sk(&ad, other->sk);
+	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE, &ad);
 }
 
 /**
@@ -2370,7 +2557,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	char smack[SMK_LABELLEN];
 	char *csp;
 	int rc;
-
+	struct smk_audit_info ad;
 	if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
 		return 0;
 
@@ -2388,13 +2575,19 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	ad.a.u.net.family = sk->sk_family;
+	ad.a.u.net.netif = skb->iif;
+	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
 	/*
 	 * Receiving a packet requires that the other end
 	 * be able to write here. Read access is not required.
 	 * This is the simplist possible security model
 	 * for networking.
 	 */
-	rc = smk_access(csp, ssp->smk_in, MAY_WRITE);
+	rc = smk_access(csp, ssp->smk_in, MAY_WRITE, &ad);
 	if (rc != 0)
 		netlbl_skbuff_err(skb, rc, 0);
 	return rc;
@@ -2523,6 +2716,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 	struct iphdr *hdr;
 	char smack[SMK_LABELLEN];
 	int rc;
+	struct smk_audit_info ad;
 
 	/* handle mapped IPv4 packets arriving via IPv6 sockets */
 	if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
@@ -2536,11 +2730,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 		strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN);
 	netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	ad.a.u.net.family = family;
+	ad.a.u.net.netif = skb->iif;
+	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
 	/*
 	 * Receiving a packet requires that the other end be able to write
 	 * here. Read access is not required.
 	 */
-	rc = smk_access(smack, ssp->smk_in, MAY_WRITE);
+	rc = smk_access(smack, ssp->smk_in, MAY_WRITE, &ad);
 	if (rc != 0)
 		return rc;
 
@@ -2642,6 +2842,7 @@ static int smack_key_permission(key_ref_t key_ref,
 				const struct cred *cred, key_perm_t perm)
 {
 	struct key *keyp;
+	struct smk_audit_info ad;
 
 	keyp = key_ref_to_ptr(key_ref);
 	if (keyp == NULL)
@@ -2657,8 +2858,13 @@ static int smack_key_permission(key_ref_t key_ref,
 	 */
 	if (cred->security == NULL)
 		return -EACCES;
-
-	return smk_access(cred->security, keyp->security, MAY_READWRITE);
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+	ad.a.u.key_struct.key = keyp->serial;
+	ad.a.u.key_struct.key_desc = keyp->description;
+#endif
+	return smk_access(cred->security, keyp->security,
+				 MAY_READWRITE, &ad);
 }
 #endif /* CONFIG_KEYS */
 
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index e03a7e1..904af34 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -41,6 +41,7 @@ enum smk_inos {
 	SMK_AMBIENT	= 7,	/* internet ambient label */
 	SMK_NETLBLADDR	= 8,	/* single label hosts */
 	SMK_ONLYCAP	= 9,	/* the only "capable" label */
+	SMK_LOGGING	= 10,	/* logging */
 };
 
 /*
@@ -1192,6 +1193,69 @@ static const struct file_operations smk_onlycap_ops = {
 };
 
 /**
+ * smk_read_logging - read() for /smack/logging
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @cn: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_logging(struct file *filp, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[32];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d\n", log_policy);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+	return rc;
+}
+
+/**
+ * smk_write_logging - write() for /smack/logging
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_logging(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[32];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+	if (i < 0 || i > 3)
+		return -EINVAL;
+	log_policy = i;
+	return count;
+}
+
+
+
+static const struct file_operations smk_logging_ops = {
+	.read		= smk_read_logging,
+	.write		= smk_write_logging,
+};
+/**
  * smk_fill_super - fill the /smackfs superblock
  * @sb: the empty superblock
  * @data: unused
@@ -1221,6 +1285,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
 			{"netlabel", &smk_netlbladdr_ops, S_IRUGO|S_IWUSR},
 		[SMK_ONLYCAP]	=
 			{"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR},
+		[SMK_LOGGING]	=
+			{"logging", &smk_logging_ops, S_IRUGO|S_IWUSR},
 		/* last one */ {""}
 	};
 
-- 
cgit v0.10.2


From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 21:31:17 -0700
Subject: rcu: Add __rcu_pending tracing to hierarchical RCU

Add tracing to __rcu_pending() to provide information on why RCU
processing was kicked off.  This is helpful for debugging hierarchical
RCU, and might also be helpful in learning how hierarchical RCU operates.

Located-by: Anton Blanchard <anton@au1.ibm.com>
Tested-by: Anton Blanchard <anton@au1.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: anton@samba.org
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: manfred@colorfullife.com
Cc: cl@linux-foundation.org
Cc: josht@linux.vnet.ibm.com
Cc: schamp@sgi.com
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: ego@in.ibm.com
Cc: laijs@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: peterz@infradead.org
Cc: penberg@cs.helsinki.fi
Cc: andi@firstfloor.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <1239683479943-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 58b2aa5..5a51538 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -161,8 +161,15 @@ struct rcu_data {
 	unsigned long offline_fqs;	/* Kicked due to being offline. */
 	unsigned long resched_ipi;	/* Sent a resched IPI. */
 
-	/* 5) For future __rcu_pending statistics. */
+	/* 5) __rcu_pending() statistics. */
 	long n_rcu_pending;		/* rcu_pending() calls since boot. */
+	long n_rp_qs_pending;
+	long n_rp_cb_ready;
+	long n_rp_cpu_needs_gp;
+	long n_rp_gp_completed;
+	long n_rp_gp_started;
+	long n_rp_need_fqs;
+	long n_rp_need_nothing;
 
 	int cpu;
 };
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372f..0dccfbb 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending)
+	if (rdp->qs_pending) {
+		rdp->n_rp_qs_pending++;
 		return 1;
+	}
 
 	/* Does this CPU have callbacks ready to invoke? */
-	if (cpu_has_callbacks_ready_to_invoke(rdp))
+	if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+		rdp->n_rp_cb_ready++;
 		return 1;
+	}
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
-	if (cpu_needs_another_gp(rsp, rdp))
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		rdp->n_rp_cpu_needs_gp++;
 		return 1;
+	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+		rdp->n_rp_gp_completed++;
 		return 1;
+	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+		rdp->n_rp_gp_started++;
 		return 1;
+	}
 
 	/* Has an RCU GP gone long enough to send resched IPIs &c? */
 	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
-	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+		rdp->n_rp_need_fqs++;
 		return 1;
+	}
 
 	/* nothing to do */
+	rdp->n_rp_need_nothing++;
 	return 0;
 }
 
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875b..fe1dcdb 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
 	.release = single_release,
 };
 
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+	seq_printf(m, "%3d%cnp=%ld "
+		   "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		   rdp->n_rcu_pending,
+		   rdp->n_rp_qs_pending,
+		   rdp->n_rp_cb_ready,
+		   rdp->n_rp_cpu_needs_gp,
+		   rdp->n_rp_gp_completed,
+		   rdp->n_rp_gp_started,
+		   rdp->n_rp_need_fqs,
+		   rdp->n_rp_need_nothing);
+}
+
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+	int cpu;
+	struct rcu_data *rdp;
+
+	for_each_possible_cpu(cpu) {
+		rdp = rsp->rda[cpu];
+		if (rdp->beenonline)
+			print_one_rcu_pending(m, rdp);
+	}
+}
+
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	print_rcu_pendings(m, &rcu_state);
+	seq_puts(m, "rcu_bh:\n");
+	print_rcu_pendings(m, &rcu_bh_state);
+	return 0;
+}
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcu_pending, NULL);
+}
+
+static struct file_operations rcu_pending_fops = {
+	.owner = THIS_MODULE,
+	.open = rcu_pending_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir;
+static struct dentry *datadir;
+static struct dentry *datadir_csv;
+static struct dentry *gpdir;
+static struct dentry *hierdir;
+static struct dentry *rcu_pendingdir;
+
 static int __init rcuclassic_trace_init(void)
 {
 	rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
 						NULL, &rcuhier_fops);
 	if (!hierdir)
 		goto free_out;
+
+	rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+						NULL, &rcu_pending_fops);
+	if (!rcu_pendingdir)
+		goto free_out;
 	return 0;
 free_out:
 	if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
 	debugfs_remove(datadir_csv);
 	debugfs_remove(gpdir);
 	debugfs_remove(hierdir);
+	debugfs_remove(rcu_pendingdir);
 	debugfs_remove(rcudir);
 }
 
-- 
cgit v0.10.2


From 6fd9b3a40b82081d9e6490b0d7cd656e9a78a134 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 21:31:18 -0700
Subject: rcu: Update RCU tracing documentation for __rcu_pending

This patch updates the RCU documentation to reflect the changes in
tracing made in the previous patch in the set.

Located-by: Anton Blanchard <anton@au1.ibm.com>
Tested-by: Anton Blanchard <anton@au1.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: anton@samba.org
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: manfred@colorfullife.com
Cc: cl@linux-foundation.org
Cc: josht@linux.vnet.ibm.com
Cc: schamp@sgi.com
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: ego@in.ibm.com
Cc: laijs@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: peterz@infradead.org
Cc: penberg@cs.helsinki.fi
Cc: andi@firstfloor.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <12396834792865-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 0688482..02cced1 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
 The output of "cat rcu/rcudata" looks as follows:
 
 rcu:
-  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
-  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
-  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
-  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
-  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
-  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
-  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
-  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu:
+  0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
+  1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
+  2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
+  3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
+  4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
+  5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
+  6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
+  7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
 rcu_bh:
-  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
-  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
-  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
-  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
-  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
-  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
+  2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
+  4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
 
 The first section lists the rcu_data structures for rcu, the second for
 rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o	"pqc" indicates which grace period the last-observed quiescent
 o	"qp" indicates that RCU still expects a quiescent state from
 	this CPU.
 
-o	"rpfq" is the number of rcu_pending() calls on this CPU required
-	to induce this CPU to invoke force_quiescent_state().
-
-o	"rp" is low-order four hex digits of the count of how many times
-	rcu_pending() has been invoked on this CPU.
-
 o	"dt" is the current value of the dyntick counter that is incremented
 	when entering or leaving dynticks idle state, either by the
 	scheduler or by irq.  The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o	"b" is the batch limit for this CPU.  If more than this number
 	of RCU callbacks is ready to invoke, then the remainder will
 	be deferred.
 
+There is also an rcu/rcudata.csv file with the same information in
+comma-separated-variable spreadsheet format.
+
 
 The output of "cat rcu/rcugp" looks as follows:
 
@@ -411,3 +409,63 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
 		For example, the first entry at the lowest level shows
 		"^0", indicating that it corresponds to bit zero in
 		the first entry at the middle level.
+
+
+The output of "cat rcu/rcu_pending" looks as follows:
+
+rcu:
+  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+rcu_bh:
+  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+
+As always, this is once again split into "rcu" and "rcu_bh" portions.
+The fields are as follows:
+
+o	"np" is the number of times that __rcu_pending() has been invoked
+	for the corresponding flavor of RCU.
+
+o	"qsp" is the number of times that the RCU was waiting for a
+	quiescent state from this CPU.
+
+o	"cbr" is the number of times that this CPU had RCU callbacks
+	that had passed through a grace period, and were thus ready
+	to be invoked.
+
+o	"cng" is the number of times that this CPU needed another
+	grace period while RCU was idle.
+
+o	"gpc" is the number of times that an old grace period had
+	completed, but this CPU was not yet aware of it.
+
+o	"gps" is the number of times that a new grace period had started,
+	but this CPU was not yet aware of it.
+
+o	"nf" is the number of times that this CPU suspected that the
+	current grace period had run for too long, and thus needed to
+	be forced.
+
+	Please note that "forcing" consists of sending resched IPIs
+	to holdout CPUs.  If that CPU really still is in an old RCU
+	read-side critical section, then we really do have to wait for it.
+	The assumption behing "forcing" is that the CPU is not still in
+	an old RCU read-side critical section, but has not yet responded
+	for some other reason.
+
+o	"nn" is the number of times that this CPU needed nothing.  Alert
+	readers will note that the rcu "nn" number for a given CPU very
+	closely matches the rcu_bh "np" number for that same CPU.  This
+	is due to short-circuit evaluation in rcu_pending().
-- 
cgit v0.10.2


From 66aa230e437d89ca56224135f617e2d8e391a3ef Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 12:54:29 +0530
Subject: x86: page_types.h unification of declarations

Impact: unification of declarations, cleanup

Unification of declarations:
 moved init_memory_mapping, initmem_init and free_initmem from
page_XX_types.h to page_types.h

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1239693869.3033.31.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae..6f1b733 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
 extern int sysctl_legacy_va_layout;
 
 extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
-					 unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
 extern void setup_bootmem_allocator(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b..3f58718 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -71,12 +71,6 @@ extern unsigned long __phys_addr(unsigned long);
 
 #define vmemmap ((struct page *)VMEMMAP_START)
 
-extern unsigned long init_memory_mapping(unsigned long start,
-					 unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
 extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
 
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37..6473f5c 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+extern unsigned long init_memory_mapping(unsigned long start,
+					 unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif	/* _ASM_X86_PAGE_DEFS_H */
-- 
cgit v0.10.2


From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 13:18:28 +0530
Subject: x86: avoid multiple declaration of kstack_depth_to_print

Impact: cleanup

asm/stacktrace.h is more appropriate so removing other 2 declarations.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
LKML-Reference: <1239695308.3033.34.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d53425..9aa3ab2 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,7 +74,6 @@ static inline int get_si_code(unsigned long condition)
 }
 
 extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
 
 void math_error(void __user *);
 void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b..81086c2 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *sp, unsigned long bp, char *log_lvl);
 
 extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
 
 /* The form of the top of the frame on the stack */
 struct stack_frame {
-- 
cgit v0.10.2


From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 14 Apr 2009 10:25:30 +0530
Subject: sched: Nominate idle load balancer from a semi-idle package.

Currently the nomination of idle-load balancer is done by choosing the first
idle cpu in the nohz.cpu_mask. This may not be power-efficient, since
such an idle cpu could come from a completely idle core/package thereby
preventing the whole core/package from being in a low-power state.

For eg, consider a quad-core dual package system. The cpu numbering need
not be sequential and can something like [0, 2, 4, 6] and [1, 3, 5, 7].
With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep
as fewer Packages/Cores active. But the current idle load balancer logic
goes against this by choosing the first_cpu in the nohz.cpu_mask and not
taking the system topology into consideration.

Improve the algorithm to nominate the idle load balancer from a semi idle
cores/packages thereby increasing the probability of the cores/packages being
in deeper sleep states for longer duration.

The algorithm is activated only when sched_mc/smt_power_savings != 0.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508..b0fefa3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return first_cpu(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -9051,6 +9159,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
-- 
cgit v0.10.2


From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 14 Apr 2009 10:25:35 +0530
Subject: sched: Nominate a power-efficient ilb in select_nohz_balancer()

The CPU that first goes idle becomes the idle-load-balancer and remains
that until either it picks up a task or till all the CPUs of the system
goes idle.

Optimize this further to allow it to relinquish it's post
once all it's siblings in the power-aware sched_domain go idle, thereby
allowing the whole package-core to go idle. While relinquising the post,
nominate another an idle-load balancer from a semi-idle core/package.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index b0fefa3..36d213b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
-- 
cgit v0.10.2


From 6424fb38667fffbbb1b90be0ffd9a0c540db6a4b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 13 Apr 2009 23:51:46 -0700
Subject: x86: remove (null) in /sys kernel_page_tables

Impact: cleanup

%p prints out 0x000000000000000 as (null)
so use %lx instead.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49E43282.1090607@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cb..a725b7f 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		   st->current_address >= st->marker[1].start_address) {
 		const char *unit = units;
 		unsigned long delta;
+		int width = sizeof(unsigned long) * 2;
 
 		/*
 		 * Now print the actual finished series
 		 */
-		seq_printf(m, "0x%p-0x%p   ",
-			   (void *)st->start_address,
-			   (void *)st->current_address);
+		seq_printf(m, "0x%0*lx-0x%0*lx   ",
+			   width, st->start_address,
+			   width, st->current_address);
 
 		delta = (st->current_address - st->start_address) >> 10;
 		while (!(delta & 1023) && unit[1]) {
-- 
cgit v0.10.2


From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 14 Apr 2009 12:12:29 +0900
Subject: x86: calgary: remove IOMMU_DEBUG

CONFIG_IOMMU_DEBUG has depends on CONFIG_GART_IOMMU:

config IOMMU_DEBUG
	bool "Enable IOMMU debugging"
	depends on GART_IOMMU && DEBUG_KERNEL
	depends on X86_64

So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code,
which does the extra checking of the bitmap space management.

And Calgary uses the iommu helper for the bitmap space management now
so it would be better to have the extra checking feature in the iommu
helper rather than Calgary code (if necessary).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: alexisb@us.ibm.com
LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e..971a3be 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
 
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	unsigned long idx = start;
-
-	BUG_ON(start >= end);
-
-	while (idx < end) {
-		if (!!test_bit(idx, bitmap) != expected)
-			return idx;
-		++idx;
-	}
-
-	/* all bits have the expected value */
-	return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
 static inline int translation_enabled(struct iommu_table *tbl)
 {
 	/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 {
 	unsigned long index;
 	unsigned long end;
-	unsigned long badbit;
 	unsigned long flags;
 
 	index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 0, index, end);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: entry already allocated at "
-			       "0x%lx tbl %p dma 0x%lx npages %u\n",
-			       badbit, tbl, start_addr, npages);
-	}
-
 	iommu_area_reserve(tbl->it_map, index, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned int npages)
 {
 	unsigned long entry;
-	unsigned long badbit;
 	unsigned long badend;
 	unsigned long flags;
 
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-			       badbit, tbl, dma_addr, entry, npages);
-	}
-
 	iommu_area_free(tbl->it_map, entry, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
 		iommu_detected = 1;
 		calgary_detected = 1;
 		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-		       debugging ? "enabled" : "disabled");
+		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+		       specified_table_size);
 
 		/* swiotlb for devices that aren't behind the Calgary. */
 		if (max_pfn > MAX_DMA32_PFN)
-- 
cgit v0.10.2


From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 08:54:16 -0400
Subject: tracing: consolidate trace and trace_event headers

Impact: clean up

Neil Horman (et. al.) criticized the way the trace events were broken up
into two files. The reason for that was that ftrace needed to separate out
the declarations from where the #include <linux/tracepoint.h> was used.
It then dawned on me that the tracepoint.h header only needs to define the
TRACE_EVENT macro if it is not already defined.

The solution is simply to test if TRACE_EVENT is defined, and if it is not
then the linux/tracepoint.h header can define it. This change consolidates
all the <traces>.h and <traces>_event_types.h into the <traces>.h file.

Reported-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Theodore Tso <tytso@mit.edu>
Reported-by: Jiaying Zhang <jiayingz@google.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d35a7ee..4353f3f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -31,6 +31,8 @@ struct tracepoint {
 					 * Keep in sync with vmlinux.lds.h.
 					 */
 
+#ifndef DECLARE_TRACE
+
 #define TP_PROTO(args...)	args
 #define TP_ARGS(args...)		args
 
@@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
 { }
 #endif /* CONFIG_TRACEPOINTS */
+#endif /* DECLARE_TRACE */
 
 /*
  * Connect a probe to a tracepoint.
@@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void)
 }
 
 #define PARAMS(args...) args
+
+#ifndef TRACE_FORMAT
 #define TRACE_FORMAT(name, proto, args, fmt)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
 
-
+#ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
  *
@@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
 
 #endif
diff --git a/include/trace/irq.h b/include/trace/irq.h
index ff5d449..04ab4c6 100644
--- a/include/trace/irq.h
+++ b/include/trace/irq.h
@@ -1,9 +1,54 @@
-#ifndef _TRACE_IRQ_H
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_IRQ_H
 
-#include <linux/interrupt.h>
 #include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+/*
+ * Tracepoint for entry of interrupt handler:
+ */
+TRACE_FORMAT(irq_handler_entry,
+	TP_PROTO(int irq, struct irqaction *action),
+	TP_ARGS(irq, action),
+	TP_FMT("irq=%d handler=%s", irq, action->name)
+	);
+
+/*
+ * Tracepoint for return of an interrupt handler:
+ */
+TRACE_EVENT(irq_handler_exit,
+
+	TP_PROTO(int irq, struct irqaction *action, int ret),
+
+	TP_ARGS(irq, action, ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq	)
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->irq	= irq;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("irq=%d return=%s",
+		  __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+TRACE_FORMAT(softirq_entry,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
 
-#include <trace/irq_event_types.h>
+TRACE_FORMAT(softirq_exit,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
 
 #endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
deleted file mode 100644
index 85964eb..0000000
--- a/include/trace/irq_event_types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-
-/* use <trace/irq.h> instead */
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-	TP_PROTO(int irq, struct irqaction *action),
-	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-	TP_PROTO(int irq, struct irqaction *action, int ret),
-
-	TP_ARGS(irq, action, ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	irq	)
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->irq	= irq;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("irq=%d return=%s",
-		  __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-TRACE_FORMAT(softirq_exit,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index 46efc24..d7d1218 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -1,9 +1,192 @@
-#ifndef _TRACE_KMEM_H
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_KMEM_H
 
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
-#include <trace/kmem_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
 
-#endif /* _TRACE_KMEM_H */
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+#endif
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 5ca67df..8ee7900 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -1,9 +1,57 @@
-#ifndef _TRACE_LOCKDEP_H
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_LOCKDEP_H
 
 #include <linux/lockdep.h>
 #include <linux/tracepoint.h>
 
-#include <trace/lockdep_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_FORMAT(lock_acquire,
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+	TP_FMT("%s%s%s", trylock ? "try " : "",
+		read ? "read " : "", lock->name)
+	);
+
+TRACE_FORMAT(lock_release,
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_ARGS(lock, nested, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_FORMAT(lock_contended,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+	TP_ARGS(lock, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__entry->name = lock->name;
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
 
 #endif
+#endif
+
+#endif /* _TRACE_LOCKDEP_H */
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
deleted file mode 100644
index 863f1e4..0000000
--- a/include/trace/lockdep_event_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-		int trylock, int read, int check,
-		struct lockdep_map *next_lock, unsigned long ip),
-	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
-
-TRACE_FORMAT(lock_release,
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-
-	TP_ARGS(lock, ip, waittime),
-
-	TP_STRUCT__entry(
-		__field(const char *, name)
-		__field(unsigned long, wait_usec)
-		__field(unsigned long, wait_nsec_rem)
-	),
-	TP_fast_assign(
-		__entry->name = lock->name;
-		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
-		__entry->wait_usec = (unsigned long) waittime;
-	),
-	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
-				       __entry->wait_nsec_rem)
-);
-
-#endif
-#endif
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/sched.h b/include/trace/sched.h
index 4e372a1..5b1cf4a 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -1,9 +1,336 @@
-#ifndef _TRACE_SCHED_H
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_SCHED_H
 
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-#include <trace/sched_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sched
 
-#endif
+/*
+ * Tracepoint for calling kthread_stop, performed to end a kthread:
+ */
+TRACE_EVENT(sched_kthread_stop,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid	= t->pid;
+	),
+
+	TP_printk("task %s:%d", __entry->comm, __entry->pid)
+);
+
+/*
+ * Tracepoint for the return value of the kthread stopping:
+ */
+TRACE_EVENT(sched_kthread_stop_ret,
+
+	TP_PROTO(int ret),
+
+	TP_ARGS(ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->ret	= ret;
+	),
+
+	TP_printk("ret %d", __entry->ret)
+);
+
+/*
+ * Tracepoint for waiting on task to unschedule:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wait_task,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p),
+
+	TP_ARGS(rq, p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->prio	= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for waking up a task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for waking up a new task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup_new,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for task switches, performed by the scheduler:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_switch,
+
+	TP_PROTO(struct rq *rq, struct task_struct *prev,
+		 struct task_struct *next),
+
+	TP_ARGS(rq, prev, next),
+
+	TP_STRUCT__entry(
+		__array(	char,	prev_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	prev_pid			)
+		__field(	int,	prev_prio			)
+		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	next_pid			)
+		__field(	int,	next_prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_pid	= prev->pid;
+		__entry->prev_prio	= prev->prio;
+		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
+		__entry->next_pid	= next->pid;
+		__entry->next_prio	= next->prio;
+	),
+
+	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->next_comm, __entry->next_pid, __entry->next_prio)
+);
+
+/*
+ * Tracepoint for a task being migrated:
+ */
+TRACE_EVENT(sched_migrate_task,
+
+	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+
+	TP_ARGS(p, orig_cpu, dest_cpu),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	orig_cpu		)
+		__field(	int,	dest_cpu		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->orig_cpu	= orig_cpu;
+		__entry->dest_cpu	= dest_cpu;
+	),
+
+	TP_printk("task %s:%d [%d] from: %d  to: %d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->orig_cpu, __entry->dest_cpu)
+);
+
+/*
+ * Tracepoint for freeing a task:
+ */
+TRACE_EVENT(sched_process_free,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a task exiting:
+ */
+TRACE_EVENT(sched_process_exit,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a waiting task:
+ */
+TRACE_EVENT(sched_process_wait,
+
+	TP_PROTO(struct pid *pid),
+
+	TP_ARGS(pid),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+		__entry->pid		= pid_nr(pid);
+		__entry->prio		= current->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for do_fork:
+ */
+TRACE_EVENT(sched_process_fork,
+
+	TP_PROTO(struct task_struct *parent, struct task_struct *child),
+
+	TP_ARGS(parent, child),
+
+	TP_STRUCT__entry(
+		__array(	char,	parent_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	parent_pid			)
+		__array(	char,	child_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	child_pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
+		__entry->parent_pid	= parent->pid;
+		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
+		__entry->child_pid	= child->pid;
+	),
+
+	TP_printk("parent %s:%d  child %s:%d",
+		__entry->parent_comm, __entry->parent_pid,
+		__entry->child_comm, __entry->child_pid)
+);
+
+/*
+ * Tracepoint for sending a signal:
+ */
+TRACE_EVENT(sched_signal_send,
+
+	TP_PROTO(int sig, struct task_struct *p),
+
+	TP_ARGS(sig, p),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig			)
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->sig	= sig;
+	),
+
+	TP_printk("sig: %d  task %s:%d",
+		  __entry->sig, __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SCHED_H */
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
deleted file mode 100644
index 63547dc..0000000
--- a/include/trace/sched_event_types.h
+++ /dev/null
@@ -1,337 +0,0 @@
-
-/* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM sched
-
-/*
- * Tracepoint for calling kthread_stop, performed to end a kthread:
- */
-TRACE_EVENT(sched_kthread_stop,
-
-	TP_PROTO(struct task_struct *t),
-
-	TP_ARGS(t),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
-		__entry->pid	= t->pid;
-	),
-
-	TP_printk("task %s:%d", __entry->comm, __entry->pid)
-);
-
-/*
- * Tracepoint for the return value of the kthread stopping:
- */
-TRACE_EVENT(sched_kthread_stop_ret,
-
-	TP_PROTO(int ret),
-
-	TP_ARGS(ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->ret	= ret;
-	),
-
-	TP_printk("ret %d", __entry->ret)
-);
-
-/*
- * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wait_task,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p),
-
-	TP_ARGS(rq, p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->prio	= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup_new,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_switch,
-
-	TP_PROTO(struct rq *rq, struct task_struct *prev,
-		 struct task_struct *next),
-
-	TP_ARGS(rq, prev, next),
-
-	TP_STRUCT__entry(
-		__array(	char,	prev_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	prev_pid			)
-		__field(	int,	prev_prio			)
-		__array(	char,	next_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	next_pid			)
-		__field(	int,	next_prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-		__entry->prev_pid	= prev->pid;
-		__entry->prev_prio	= prev->prio;
-		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-		__entry->next_pid	= next->pid;
-		__entry->next_prio	= next->prio;
-	),
-
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->next_comm, __entry->next_pid, __entry->next_prio)
-);
-
-/*
- * Tracepoint for a task being migrated:
- */
-TRACE_EVENT(sched_migrate_task,
-
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-
-	TP_ARGS(p, orig_cpu, dest_cpu),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	orig_cpu		)
-		__field(	int,	dest_cpu		)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
-		__entry->dest_cpu	= dest_cpu;
-	),
-
-	TP_printk("task %s:%d [%d] from: %d  to: %d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->orig_cpu, __entry->dest_cpu)
-);
-
-/*
- * Tracepoint for freeing a task:
- */
-TRACE_EVENT(sched_process_free,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a task exiting:
- */
-TRACE_EVENT(sched_process_exit,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a waiting task:
- */
-TRACE_EVENT(sched_process_wait,
-
-	TP_PROTO(struct pid *pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-		__entry->pid		= pid_nr(pid);
-		__entry->prio		= current->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for do_fork:
- */
-TRACE_EVENT(sched_process_fork,
-
-	TP_PROTO(struct task_struct *parent, struct task_struct *child),
-
-	TP_ARGS(parent, child),
-
-	TP_STRUCT__entry(
-		__array(	char,	parent_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	parent_pid			)
-		__array(	char,	child_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	child_pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
-		__entry->parent_pid	= parent->pid;
-		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
-		__entry->child_pid	= child->pid;
-	),
-
-	TP_printk("parent %s:%d  child %s:%d",
-		__entry->parent_comm, __entry->parent_pid,
-		__entry->child_comm, __entry->child_pid)
-);
-
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-	TP_PROTO(int sig, struct task_struct *p),
-
-	TP_ARGS(sig, p),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig			)
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->sig	= sig;
-	),
-
-	TP_printk("sig: %d  task %s:%d",
-		  __entry->sig, __entry->comm, __entry->pid)
-);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/skb.h b/include/trace/skb.h
index d2de717..e6fd281 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -1,9 +1,37 @@
-#ifndef _TRACE_SKB_H_
-#define _TRACE_SKB_H_
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
 
 #include <linux/skbuff.h>
 #include <linux/tracepoint.h>
 
-#include <trace/skb_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
 
-#endif
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h
deleted file mode 100644
index 4a1c504..0000000
--- a/include/trace/skb_event_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-
-/* use <trace/skb.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM skb
-
-/*
- * Tracepoint for free an sk_buff:
- */
-TRACE_EVENT(kfree_skb,
-
-	TP_PROTO(struct sk_buff *skb, void *location),
-
-	TP_ARGS(skb, location),
-
-	TP_STRUCT__entry(
-		__field(	void *,		skbaddr		)
-		__field(	unsigned short,	protocol	)
-		__field(	void *,		location	)
-	),
-
-	TP_fast_assign(
-		__entry->skbaddr = skb;
-		if (skb) {
-			__entry->protocol = ntohs(skb->protocol);
-		}
-		__entry->location = location;
-	),
-
-	TP_printk("skbaddr=%p protocol=%u location=%p",
-		__entry->skbaddr, __entry->protocol, __entry->location)
-);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
deleted file mode 100644
index 552a50e..0000000
--- a/include/trace/trace_event_types.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* trace/<type>_event_types.h here */
-
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
-#include <trace/lockdep_event_types.h>
-#include <trace/skb_event_types.h>
-#include <trace/kmem_event_types.h>
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 246f2aa..5a35a91 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -8,6 +8,7 @@
 
 #include "trace_output.h"
 
+#define TRACE_HEADER_MULTI_READ
 #include "trace_events_stage_1.h"
 #include "trace_events_stage_2.h"
 #include "trace_events_stage_3.h"
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 38985f9..475f46a 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -1,7 +1,7 @@
 /*
  * Stage 1 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * struct ftrace_raw_<call> {
  *	struct trace_entry		ent;
@@ -36,4 +36,4 @@
 	};							\
 	static struct ftrace_event_call event_##name
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 59cfd7d..aa4a67a 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -1,7 +1,7 @@
 /*
  * Stage 2 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
  * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
@@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	return TRACE_TYPE_HANDLED;					\
 }
 	
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 /*
  * Setup the showing format of trace point.
@@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef __field
 #define __field(type, item)						\
@@ -167,4 +167,4 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 5bb1b7f..45c04e1 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -1,7 +1,7 @@
 /*
  * Stage 3 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * static void ftrace_event_<call>(proto)
  * {
@@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef _TRACE_PROFILE
 #undef _TRACE_PROFILE_INIT
-- 
cgit v0.10.2


From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 Apr 2009 16:53:05 +0200
Subject: wait: don't use __wake_up_common()

'777c6c5 wait: prevent exclusive waiter starvation' made
__wake_up_common() global to be used from abort_exclusive_wait().

It was needed to do a wake-up with the waitqueue lock held while
passing down a key to the wake-up function.

Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and
__wake_up_sync_key()' there is an appropriate wrapper for this case:
__wake_up_locked_key().

Use it here and make __wake_up_common() private to the scheduler
again.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5d631c1..67d4e89 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/kernel/sched.c b/kernel/sched.c
index 36d213b..92b4b56 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc..ea7c3b4 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_common(q, mode, 1, 0, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
-- 
cgit v0.10.2


From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 14 Apr 2009 11:24:36 +0200
Subject: tracing: make the trace clocks available generally

Jeremy Fitzhardinge reported this build failure:

 LD	 .tmp_vmlinux1
 arch/x86/kernel/built-in.o: In function `ds_take_timestamp':
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'

Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y.

Expose the trace clock code to CONFIG_X86_DS as well.

[ Unfortunately librarizing doesnt work well - ancient architectures
  with no raw_local_irq_save() primitive break the build. ]

Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
LKML-Reference: <49E4413F.7070700@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dff..c8e1be5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f51..ecc671e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
-- 
cgit v0.10.2


From 72c6a9870f901045f2464c3dc6ee8914bfdc07aa Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 14 Apr 2009 17:33:57 +0200
Subject: rculist.h: introduce list_entry_rcu() and list_first_entry_rcu()

I've run into the situation where I need to use list_first_entry with
rcu-guarded list. This patch introduces this.

Also simplify list_for_each_entry_rcu() to use new list_entry_rcu()
instead of list_entry().

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: dipankar@in.ibm.com
LKML-Reference: <20090414153356.GC3999@psychotron.englab.brq.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e649bd3..5710f43 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	at->prev = last;
 }
 
+/**
+ * list_entry_rcu - get the struct for this entry
+ * @ptr:        the &struct list_head pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_entry_rcu(ptr, type, member) \
+	container_of(rcu_dereference(ptr), type, member)
+
+/**
+ * list_first_entry_rcu - get the first element from a list
+ * @ptr:        the list head to take the element from.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_first_entry_rcu(ptr, type, member) \
+	list_entry_rcu((ptr)->next, type, member)
+
 #define __list_for_each_rcu(pos, head) \
 	for (pos = rcu_dereference((head)->next); \
 		pos != (head); \
@@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_entry_rcu(pos, head, member) \
-	for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \
+	for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
 		prefetch(pos->member.next), &pos->member != (head); \
-		pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member))
+		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 
 /**
-- 
cgit v0.10.2


From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 09:36:00 -0400
Subject: tracing: create automated trace defines

This patch lowers the number of places a developer must modify to add
new tracepoints. The current method to add a new tracepoint
into an existing system is to write the trace point macro in the
trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or
DECLARE_TRACE, then they must add the same named item into the C file
with the macro DEFINE_TRACE(name) and then add the trace point.

This change cuts out the needing to add the DEFINE_TRACE(name).
Every file that uses the tracepoint must still include the trace/<type>.h
file, but the one C file must also add a define before the including
of that file.

 #define CREATE_TRACE_POINTS
 #include <trace/mytrace.h>

This will cause the trace/mytrace.h file to also produce the C code
necessary to implement the trace point.

Note, if more than one trace/<type>.h is used to create the C code
it is best to list them all together.

 #define CREATE_TRACE_POINTS
 #include <trace/foo.h>
 #include <trace/bar.h>
 #include <trace/fido.h>

Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with
the cleaner solution of the define above the includes over my first
design to have the C code include a "special" header.

This patch converts sched, irq and lockdep and skb to use this new
method.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
new file mode 100644
index 0000000..de9dc7d
--- /dev/null
+++ b/include/trace/define_trace.h
@@ -0,0 +1,75 @@
+/*
+ * Trace files that want to automate creationg of all tracepoints defined
+ * in their file should include this file. The following are macros that the
+ * trace file may define:
+ *
+ * TRACE_SYSTEM defines the system the tracepoint is for
+ *
+ * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h
+ *     This macro may be defined to tell define_trace.h what file to include.
+ *     Note, leave off the ".h".
+ *
+ * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace
+ *     then this macro can define the path to use. Note, the path is relative to
+ *     define_trace.h, not the file including it. Full path names for out of tree
+ *     modules must be used.
+ */
+
+#ifdef CREATE_TRACE_POINTS
+
+/* Prevent recursion */
+#undef CREATE_TRACE_POINTS
+
+#include <linux/stringify.h>
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	DEFINE_TRACE(name)
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(name, proto, args, print)	\
+	DEFINE_TRACE(name)
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)	\
+	DEFINE_TRACE(name)
+
+#undef TRACE_INCLUDE
+#undef __TRACE_INCLUDE
+
+#ifndef TRACE_INCLUDE_FILE
+# define TRACE_INCLUDE_FILE TRACE_SYSTEM
+# define UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifndef TRACE_INCLUDE_PATH
+# define __TRACE_INCLUDE(system) <trace/system.h>
+# define UNDEF_TRACE_INCLUDE_FILE
+#else
+# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
+#endif
+
+# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)
+
+/* Let the trace headers be reread */
+#define TRACE_HEADER_MULTI_READ
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef TRACE_HEADER_MULTI_READ
+
+/* Only undef what we defined in this file */
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+/* We may be processing more files */
+#define CREATE_TRACE_POINTS
+
+#endif /* CREATE_TRACE_POINTS */
diff --git a/include/trace/irq.h b/include/trace/irq.h
index 04ab4c6..75e3468 100644
--- a/include/trace/irq.h
+++ b/include/trace/irq.h
@@ -51,4 +51,7 @@ TRACE_FORMAT(softirq_exit,
 	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
 	);
 
-#endif
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index d7d1218..c22c42f 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -188,5 +188,7 @@ TRACE_EVENT(kmem_cache_free,
 
 	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
 );
+#endif /* _TRACE_KMEM_H */
 
-#endif
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 8ee7900..4d301e7 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -55,3 +55,6 @@ TRACE_EVENT(lock_acquired,
 #endif
 
 #endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/sched.h b/include/trace/sched.h
index 5b1cf4a..ffa1cab 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -334,3 +334,6 @@ TRACE_EVENT(sched_signal_send,
 );
 
 #endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/skb.h b/include/trace/skb.h
index e6fd281..1e8fabb 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -35,3 +35,6 @@ TRACE_EVENT(kfree_skb,
 );
 
 #endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3..2fe9d2c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,10 +56,6 @@
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd..4bebf26 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
-DEFINE_TRACE(sched_process_fork);
-
 int nr_processes(void)
 {
 	int cpu;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d82142b..983d8be 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,9 +17,11 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <trace/irq.h>
 #include <linux/bootmem.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/irq.h>
+
 #include "internals.h"
 
 /*
@@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf85..e1c7692 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c4582a6..257f21a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
-#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/lockdep.h>
+
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
-DEFINE_TRACE(lock_acquire);
-
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-DEFINE_TRACE(lock_release);
-
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
@@ -3061,8 +3059,6 @@ found_it:
 	put_lock_stats(stats);
 }
 
-DEFINE_TRACE(lock_acquired);
-
 static void
 __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
@@ -3118,8 +3114,6 @@ found_it:
 	lock->ip = ip;
 }
 
-DEFINE_TRACE(lock_contended);
-
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508..e6d4518 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -72,13 +72,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
diff --git a/kernel/signal.c b/kernel/signal.c
index d803473..1d5703f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,8 +41,6 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-DEFINE_TRACE(sched_signal_send);
-
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2fecefa..a2d9b45 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
diff --git a/mm/util.c b/mm/util.c
index 2599e83..0e74a22 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
-#include <linux/tracepoint.h>
 #include <asm/uaccess.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/kmem.h>
+
 /**
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
@@ -239,13 +241,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 /* Tracepoints definitions. */
-DEFINE_TRACE(kmalloc);
-DEFINE_TRACE(kmem_cache_alloc);
-DEFINE_TRACE(kmalloc_node);
-DEFINE_TRACE(kmem_cache_alloc_node);
-DEFINE_TRACE(kfree);
-DEFINE_TRACE(kmem_cache_free);
-
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..8017720 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -19,11 +19,11 @@
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
-#include <trace/skb.h>
 
 #include <asm/unaligned.h>
 #include <asm/bitops.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/skb.h>
 
-DEFINE_TRACE(kfree_skb);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
-- 
cgit v0.10.2


From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 11 Apr 2009 12:59:57 -0400
Subject: tracing: make trace_seq operations available for core kernel

In the process to make TRACE_EVENT macro work for modules, the trace_seq
operations must be available for core kernel code.

These operations are quite useful and can be used for other implementations.

The main idea is that we create a trace_seq handle that acts very much
like the seq_file handle.

	struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL);

	trace_seq_init(s);
	trace_seq_printf(s, "some data %d\n", variable);

	printk("%s", s->buffer);

The main use is to allow a top level function call several other functions
that may store printf like data into the buffer. Then at the end, the top
level function can process all the data with any method it would like to.
It could be passed to userspace, output via printk or even use seq_file:

	trace_seq_to_user(s, ubuf, cnt);
	seq_puts(m, s->buffer);

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
new file mode 100644
index 0000000..28051da
--- /dev/null
+++ b/include/linux/trace_seq.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_TRACE_SEQ_H
+#define _LINUX_TRACE_SEQ_H
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE.
+ */
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+/*
+ * Currently only defined when tracing is enabled.
+ */
+#ifdef CONFIG_TRACING
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+
+#else /* CONFIG_TRACING */
+static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)))
+{
+	return 0;
+}
+static inline int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+	return 0;
+}
+
+static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+}
+static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt)
+{
+	return 0;
+}
+static inline int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	return 0;
+}
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c);
+{
+	return 0;
+}
+static inline int
+trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+	return 0;
+}
+static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				       size_t len)
+{
+	return 0;
+}
+static inline void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	return NULL;
+}
+static inline int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	return 0;
+}
+#endif /* CONFIG_TRACING */
+
+#endif /* _LINUX_TRACE_SEQ_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b05b6ac..1882846 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,6 +12,8 @@
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
 
+#include <linux/trace_seq.h>
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -423,19 +425,6 @@ struct tracer {
 	struct tracer_stat	*stats;
 };
 
-struct trace_seq {
-	unsigned char		buffer[PAGE_SIZE];
-	unsigned int		len;
-	unsigned int		readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 
 #define TRACE_PIPE_ALL_CPU	-1
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9163021..5c7cbfb 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,6 +1,7 @@
 #ifndef __TRACE_EVENTS_H
 #define __TRACE_EVENTS_H
 
+#include <linux/trace_seq.h>
 #include "trace.h"
 
 typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
@@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
-extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
-
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-				 size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
-				size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
 				 struct trace_seq *s, unsigned long sym_flags);
 extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-- 
cgit v0.10.2


From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 11:20:49 -0400
Subject: tracing/events: move declarations from trace directory to core
 include

In preparation to allowing trace events to happen in modules, we need
to move some of the local declarations in the kernel/trace directory
into include/linux.

This patch simply moves the declarations and performs no context changes.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
new file mode 100644
index 0000000..496b76d
--- /dev/null
+++ b/include/linux/ftrace_event.h
@@ -0,0 +1,146 @@
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/trace_seq.h>
+#include <linux/ring_buffer.h>
+
+
+struct trace_array;
+struct tracer;
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	unsigned char		type;
+	unsigned char		flags;
+	unsigned char		preempt_count;
+	int			pid;
+	int			tgid;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+	u64			ts;
+
+	unsigned long		iter_flags;
+	loff_t			pos;
+	long			idx;
+
+	cpumask_var_t		started;
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+					      int flags);
+struct trace_event {
+	struct hlist_node	node;
+	int			type;
+	trace_print_func	trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
+	TRACE_TYPE_HANDLED	= 1,
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
+};
+
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+struct ftrace_event_call {
+	char			*name;
+	char			*system;
+	struct dentry		*dir;
+	int			enabled;
+	int			(*regfunc)(void);
+	void			(*unregfunc)(void);
+	int			id;
+	int			(*raw_init)(void);
+	int			(*show_format)(struct trace_seq *s);
+	int			(*define_fields)(void);
+	struct list_head	fields;
+	int			n_preds;
+	struct filter_pred	**preds;
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_t	profile_count;
+	int		(*profile_enable)(struct ftrace_event_call *);
+	void		(*profile_disable)(struct ftrace_event_call *);
+#endif
+};
+
+#define MAX_FILTER_PRED		8
+#define MAX_FILTER_STR_VAL	128
+
+extern int init_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
+
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+			      char *name, int offset, int size);
+
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
+#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1882846..6bcdf4a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
 #include <trace/power.h>
 
 #include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -44,20 +45,6 @@ enum trace_type {
 };
 
 /*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-	unsigned char		type;
-	unsigned char		flags;
-	unsigned char		preempt_count;
-	int			pid;
-	int			tgid;
-};
-
-/*
  * Function trace entry - function address and parent function addres:
  */
 struct ftrace_entry {
@@ -265,8 +252,6 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 };
 
-struct trace_iterator;
-
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void);
 		__ftrace_bad_type();					\
 	} while (0)
 
-/* Return values for print_line callback */
-enum print_line_t {
-	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
-	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
-	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
-};
-
-
 /*
  * An option specific to a tracer. This is a boolean value.
  * The bit is the bit index that sets its value on the
@@ -428,31 +404,6 @@ struct tracer {
 
 #define TRACE_PIPE_ALL_CPU	-1
 
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-	struct trace_array	*tr;
-	struct tracer		*trace;
-	void			*private;
-	int			cpu_file;
-	struct mutex		mutex;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
-
-	/* The below is zeroed out in pipe_read */
-	struct trace_seq	seq;
-	struct trace_entry	*ent;
-	int			cpu;
-	u64			ts;
-
-	unsigned long		iter_flags;
-	loff_t			pos;
-	long			idx;
-
-	cpumask_var_t		started;
-};
-
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
@@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-				  unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
-
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
@@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *prev,
 				struct task_struct *next,
 				unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
@@ -790,28 +731,6 @@ struct ftrace_event_field {
 	int			size;
 };
 
-struct ftrace_event_call {
-	char			*name;
-	char			*system;
-	struct dentry		*dir;
-	int			enabled;
-	int			(*regfunc)(void);
-	void			(*unregfunc)(void);
-	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct trace_seq *s);
-	int			(*define_fields)(void);
-	struct list_head	fields;
-	int			n_preds;
-	struct filter_pred	**preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-	atomic_t	profile_count;
-	int		(*profile_enable)(struct ftrace_event_call *);
-	void		(*profile_disable)(struct ftrace_event_call *);
-#endif
-};
-
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
@@ -825,9 +744,6 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED		8
-#define MAX_FILTER_STR_VAL	128
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -845,9 +761,6 @@ struct filter_pred {
 	int clear;
 };
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size);
-extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
 extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
@@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
-extern int filter_current_check_discard(struct ftrace_event_call *call,
-					void *rec,
-					struct ring_buffer_event *event);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
@@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)				\
-do {									\
-	__trace_printk_check_format(fmt, ##args);			\
-	tracing_record_cmdline(current);				\
-	if (__builtin_constant_p(fmt)) {				\
-		static const char *trace_printk_fmt			\
-		  __attribute__((section("__trace_printk_fmt"))) =	\
-			__builtin_constant_p(fmt) ? fmt : NULL;		\
-									\
-		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
-	} else								\
-		__trace_printk(ip, fmt, ##args);			\
-} while (0)
-
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 5c7cbfb..6e220a8 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -4,18 +4,6 @@
 #include <linux/trace_seq.h>
 #include "trace.h"
 
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-					      int flags);
-
-struct trace_event {
-	struct hlist_node	node;
-	int			type;
-	trace_print_func	trace;
-	trace_print_func	raw;
-	trace_print_func	hex;
-	trace_print_func	binary;
-};
-
 extern enum print_line_t
 trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
@@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
 extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
-- 
cgit v0.10.2


From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 12:25:37 -0400
Subject: tracing/events: move the ftrace event tracing code to core

This patch moves the ftrace creation into include/trace/ftrace.h and
simplifies the work of developers in adding new tracepoints.
Just the act of creating the trace points in include/trace and including
define_trace.h will create the events in the debugfs/tracing/events
directory.

This patch removes the need of include/trace/trace_events.h

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index de9dc7d..980eb66 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -56,6 +56,10 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+#ifdef CONFIG_EVENT_TRACER
+#include <trace/ftrace.h>
+#endif
+
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
new file mode 100644
index 0000000..955b967
--- /dev/null
+++ b/include/trace/ftrace.h
@@ -0,0 +1,492 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *	struct trace_entry		ent;
+ *	<type>				<item>;
+ *	<type2>				<item2>[<len>];
+ *	[...]
+ * };
+ *
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#include <linux/ftrace_event.h>
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef __array
+#define __array(type, item, len)	type	item[len];
+
+#undef __field
+#define __field(type, item)		type	item;
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	struct ftrace_raw_##name {				\
+		struct trace_entry	ent;			\
+		tstruct						\
+	};							\
+	static struct ftrace_event_call event_##name
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 2 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *	struct trace_seq *s = &iter->seq;
+ *	struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *	struct trace_entry *entry;
+ *	int ret;
+ *
+ *	entry = iter->ent;
+ *
+ *	if (entry->type != event_<call>.id) {
+ *		WARN_ON_ONCE(1);
+ *		return TRACE_TYPE_UNHANDLED;
+ *	}
+ *
+ *	field = (typeof(field))entry;
+ *
+ *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	if (!ret)
+ *		return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *	return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef __entry
+#define __entry field
+
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+enum print_line_t							\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##call *field;				\
+	struct trace_entry *entry;					\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	ret = trace_seq_printf(s, #call ": " print);			\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+	
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __entry
+#define __entry REC
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ *	int ret;
+ *
+ *	ret = register_trace_<call>(ftrace_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to  <call>");
+ *	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *	unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ * For those macros defined with TRACE_FORMAT:
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ *	.name			= "<call>",
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ *	struct ring_buffer_event *event;
+ *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *	unsigned long irq_flags;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *				  sizeof(struct ftrace_raw_<call>),
+ *				  irq_flags, pc);
+ *	if (!event)
+ *		return;
+ *	entry	= ring_buffer_event_data(event);
+ *
+ *	<assign>;  <-- Here we assign the entries by the __field and
+ *			__array macros.
+ *
+ *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ *	int ret;
+ *
+ *	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to <call>");
+ *	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *	unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ *	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ *	int id;
+ *
+ *	id = register_ftrace_event(&ftrace_event_type_<call>);
+ *	if (!id)
+ *		return -ENODEV;
+ *	event_<call>.id = id;
+ *	return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ *	.name			= "<call>",
+ *	.system			= "<system>",
+ *	.raw_init		= ftrace_raw_init_event_<call>,
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.show_format		= ftrace_format_<call>,
+ * }
+ *
+ */
+
+#undef TP_FMT
+#define TP_FMT(fmt, args...)	fmt "\n", ##args
+
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)				\
+static void ftrace_profile_##call(proto)				\
+{									\
+	extern void perf_tpcounter_event(int);				\
+	perf_tpcounter_event(event_##call.id);				\
+}									\
+									\
+static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
+{									\
+	int ret = 0;							\
+									\
+	if (!atomic_inc_return(&call->profile_count))			\
+		ret = register_trace_##call(ftrace_profile_##call);	\
+									\
+	return ret;							\
+}									\
+									\
+static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
+{									\
+	if (atomic_add_negative(-1, &call->profile_count))		\
+		unregister_trace_##call(ftrace_profile_##call);		\
+}
+
+#define _TRACE_PROFILE_INIT(call)					\
+	.profile_count = ATOMIC_INIT(-1),				\
+	.profile_enable = ftrace_profile_enable_##call,			\
+	.profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
+#define _TRACE_FORMAT(call, proto, args, fmt)				\
+static void ftrace_event_##call(proto)					\
+{									\
+	event_trace_printk(_RET_IP_, #call ": " fmt);			\
+}									\
+									\
+static int ftrace_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_event_##call);		\
+	if (ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call "\n");			\
+	return ret;							\
+}									\
+									\
+static void ftrace_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_event_##call);			\
+}									\
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static int ftrace_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(NULL);				\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	return 0;							\
+}
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)				\
+_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_init_event_##call,		\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+	_TRACE_PROFILE_INIT(call)					\
+}
+
+#undef __entry
+#define __entry entry
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static void ftrace_raw_event_##call(proto)				\
+{									\
+	struct ftrace_event_call *call = &event_##call;			\
+	struct ring_buffer_event *event;				\
+	struct ftrace_raw_##call *entry;				\
+	unsigned long irq_flags;					\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	event = trace_current_buffer_lock_reserve(event_##call.id,	\
+				  sizeof(struct ftrace_raw_##call),	\
+				  irq_flags, pc);			\
+	if (!event)							\
+		return;							\
+	entry	= ring_buffer_event_data(event);			\
+									\
+	assign;								\
+									\
+	if (!filter_current_check_discard(call, entry, event))		\
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
+}									\
+									\
+static int ftrace_raw_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_raw_event_##call);		\
+	if (ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call "\n");			\
+	return ret;							\
+}									\
+									\
+static void ftrace_raw_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_raw_event_##call);		\
+}									\
+									\
+static struct trace_event ftrace_event_type_##call = {			\
+	.trace			= ftrace_raw_output_##call,		\
+};									\
+									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(&ftrace_event_type_##call);		\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
+	return 0;							\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.regfunc		= ftrace_raw_reg_event_##call,		\
+	.unregfunc		= ftrace_raw_unreg_event_##call,	\
+	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+	_TRACE_PROFILE_INIT(call)					\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
deleted file mode 100644
index 13d6b85..0000000
--- a/include/trace/trace_events.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* trace/<type>.h here */
-
-#include <trace/sched.h>
-#include <trace/irq.h>
-#include <trace/lockdep.h>
-#include <trace/skb.h>
-#include <trace/kmem.h>
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3ad367e..fb9d7f9 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 5a35a91..0000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#define TRACE_HEADER_MULTI_READ
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 475f46a..0000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *	struct trace_entry		ent;
- *	<type>				<item>;
- *	<type2>				<item2>[<len>];
- *	[...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)	type	item[len];
-
-#undef __field
-#define __field(type, item)		type	item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	struct ftrace_raw_##name {				\
-		struct trace_entry	ent;			\
-		tstruct						\
-	};							\
-	static struct ftrace_event_call event_##name
-
-#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index aa4a67a..0000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *	struct trace_seq *s = &iter->seq;
- *	struct ftrace_raw_<call> *field; <-- defined in stage 1
- *	struct trace_entry *entry;
- *	int ret;
- *
- *	entry = iter->ent;
- *
- *	if (entry->type != event_<call>.id) {
- *		WARN_ON_ONCE(1);
- *		return TRACE_TYPE_UNHANDLED;
- *	}
- *
- *	field = (typeof(field))entry;
- *
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
- *	if (!ret)
- *		return TRACE_TYPE_PARTIAL_LINE;
- *
- *	return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-enum print_line_t							\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
-{									\
-	struct trace_seq *s = &iter->seq;				\
-	struct ftrace_raw_##call *field;				\
-	struct trace_entry *entry;					\
-	int ret;							\
-									\
-	entry = iter->ent;						\
-									\
-	if (entry->type != event_##call.id) {				\
-		WARN_ON_ONCE(1);					\
-		return TRACE_TYPE_UNHANDLED;				\
-	}								\
-									\
-	field = (typeof(field))entry;					\
-									\
-	ret = trace_seq_printf(s, #call ": " print);			\
-	if (!ret)							\
-		return TRACE_TYPE_PARTIAL_LINE;				\
-									\
-	return TRACE_TYPE_HANDLED;					\
-}
-	
-#include <trace/trace_events.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
-
-#undef __field
-#define __field(type, item)						\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef __array
-#define __array(type, item, len)					\
-	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
-	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-int									\
-ftrace_define_fields_##call(void)					\
-{									\
-	struct ftrace_raw_##call field;					\
-	struct ftrace_event_call *event_call = &event_##call;		\
-	int ret;							\
-									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
-									\
-	tstruct;							\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 45c04e1..0000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to  <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- *	struct ring_buffer_event *event;
- *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
- *	unsigned long irq_flags;
- *	int pc;
- *
- *	local_save_flags(irq_flags);
- *	pc = preempt_count();
- *
- *	event = trace_current_buffer_lock_reserve(event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
- *				  irq_flags, pc);
- *	if (!event)
- *		return;
- *	entry	= ring_buffer_event_data(event);
- *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
- *
- *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *	unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- *	.trace			= ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- *	int id;
- *
- *	id = register_ftrace_event(&ftrace_event_type_<call>);
- *	if (!id)
- *		return -ENODEV;
- *	event_<call>.id = id;
- *	return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.system			= "<system>",
- *	.raw_init		= ftrace_raw_init_event_<call>,
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- *	.show_format		= ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...)	fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)				\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int);				\
-	perf_tpcounter_event(event_##call.id);				\
-}									\
-									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&call->profile_count))			\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
-}									\
-									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{									\
-	if (atomic_add_negative(-1, &call->profile_count))		\
-		unregister_trace_##call(ftrace_profile_##call);		\
-}
-
-#define _TRACE_PROFILE_INIT(call)					\
-	.profile_count = ATOMIC_INIT(-1),				\
-	.profile_enable = ftrace_profile_enable_##call,			\
-	.profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, #call ": " fmt);			\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static int ftrace_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(NULL);				\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	return 0;							\
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_init_event_##call,		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static void ftrace_raw_event_##call(proto)				\
-{									\
-	struct ftrace_event_call *call = &event_##call;			\
-	struct ring_buffer_event *event;				\
-	struct ftrace_raw_##call *entry;				\
-	unsigned long irq_flags;					\
-	int pc;								\
-									\
-	local_save_flags(irq_flags);					\
-	pc = preempt_count();						\
-									\
-	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call),	\
-				  irq_flags, pc);			\
-	if (!event)							\
-		return;							\
-	entry	= ring_buffer_event_data(event);			\
-									\
-	assign;								\
-									\
-	if (!filter_current_check_discard(call, entry, event))		\
-		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-}									\
-									\
-static int ftrace_raw_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_raw_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_raw_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_raw_event_##call);		\
-}									\
-									\
-static struct trace_event ftrace_event_type_##call = {			\
-	.trace			= ftrace_raw_output_##call,		\
-};									\
-									\
-static int ftrace_raw_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(&ftrace_event_type_##call);		\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	INIT_LIST_HEAD(&event_##call.fields);				\
-	init_preds(&event_##call);					\
-	return 0;							\
-}									\
-									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_raw_init_event_##call,		\
-	.regfunc		= ftrace_raw_reg_event_##call,		\
-	.unregfunc		= ftrace_raw_unreg_event_##call,	\
-	.show_format		= ftrace_format_##call,			\
-	.define_fields		= ftrace_define_fields_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
-#include <trace/trace_events.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
-- 
cgit v0.10.2


From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 13:52:20 -0400
Subject: tracing/events: convert event call sites to use a link list

Impact: makes it possible to define events in modules

The events are created by reading down the section that they are linked
in by the macros. But this is not scalable to modules. This patch converts
the manipulations to use a global link list, and on boot up it adds
the items in the section to the list.

This change will allow modules to add their tracing events to the list as
well.

Note, this change alone does not permit modules to use the TRACE_EVENT macros,
but the change is needed for them to eventually do so.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 496b76d..1781085 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 struct ftrace_event_call {
+	struct list_head	list;
 	char			*name;
 	char			*system;
 	struct dentry		*dir;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6bcdf4a..8817c18 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,11 +739,6 @@ struct event_subsystem {
 	struct filter_pred	**preds;
 };
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
+extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 199de9c..7bf2ad6 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_enable(event);
 	}
@@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_disable(event);
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ead68ac..5c66aaf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,8 @@
 
 static DEFINE_MUTEX(event_mutex);
 
+LIST_HEAD(ftrace_events);
+
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size)
 {
@@ -54,16 +56,14 @@ err:
 
 static void ftrace_clear_events(void)
 {
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+	struct ftrace_event_call *call;
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		call++;
 	}
 }
 
@@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 
 static int ftrace_set_clr_event(char *buf, int set)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	char *event = NULL, *sub = NULL, *match;
 	int ret = -EINVAL;
 
@@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	}
 
 	mutex_lock(&event_mutex);
-	for_each_event(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (!call->name || !call->regfunc)
 			continue;
@@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next = call;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
 	for (;;) {
-		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		if (list == &ftrace_events)
 			return NULL;
 
+		call = list_entry(list, struct ftrace_event_call, list);
+
 		/*
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
@@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		if (call->regfunc)
 			break;
 
-		call++;
-		next = call;
+		list = list->next;
 	}
 
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
  retry:
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+	if (list == &ftrace_events)
 		return NULL;
 
+	call = list_entry(list, struct ftrace_event_call, list);
+
 	if (!call->enabled) {
-		call++;
+		list = list->next;
 		goto retry;
 	}
 
-	next = call;
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 	if (!ret) {
 		struct seq_file *m = file->private_data;
 
-		m->private = __start_ftrace_events;
+		m->private = ftrace_events.next;
 	}
 	return ret;
 }
@@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	return 0;
 }
 
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
+#define for_each_event(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 static __init int event_trace_init(void)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
@@ -830,6 +840,7 @@ static __init int event_trace_init(void)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+		list_add(&call->list, &ftrace_events);
 		event_create_dir(call, d_events);
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index de42dad..d30b06b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -223,7 +223,7 @@ oom:
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	int i;
 
 	if (system->n_preds) {
@@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 		system->n_preds = 0;
 	}
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
@@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 
 	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
@@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 	system->preds[system->n_preds] = pred;
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
 
 		if (!call->define_fields)
-- 
cgit v0.10.2


From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 18:12:50 -0400
Subject: tracing/events: add export symbols for trace events in modules

Impact: let modules add trace events

The trace event code requires some functions to be exported to allow
modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the
necessary functions.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0047fc..2d69b26 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
 {
 	return filter_check_discard(call, rec, global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
 
 cycle_t ftrace_now(int cpu)
 {
@@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
+EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
@@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void
 trace_function(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5c66aaf..8b9e621 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -53,6 +53,7 @@ err:
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(trace_define_field);
 
 static void ftrace_clear_events(void)
 {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d30b06b..f8e5eab 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(filter_match_preds);
 
 void filter_print_preds(struct filter_pred **preds, int n_preds,
 			struct trace_seq *s)
@@ -220,6 +221,7 @@ oom:
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(init_preds);
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0e70fb0..83a8abb 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(trace_seq_printf);
 
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
@@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_ftrace_event);
 
 /**
  * unregister_ftrace_event - remove a no longer used event
@@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
 
 /*
  * Standard events
-- 
cgit v0.10.2


From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 14:53:50 -0400
Subject: tracing/events: add support for modules to TRACE_EVENT

Impact: allow modules to add TRACE_EVENTS on load

This patch adds the final hooks to allow modules to use the TRACE_EVENT
macro. A notifier and a data structure are used to link the TRACE_EVENTs
defined in the module to connect them with the ftrace event tracing system.

It also adds the necessary automated clean ups to the trace events when a
module is removed.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 1781085..75f3ac0 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -7,6 +7,7 @@
 
 struct trace_array;
 struct tracer;
+struct dentry;
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -87,6 +88,7 @@ struct ftrace_event_call {
 	char			*name;
 	char			*system;
 	struct dentry		*dir;
+	struct trace_event	*event;
 	int			enabled;
 	int			(*regfunc)(void);
 	void			(*unregfunc)(void);
@@ -97,6 +99,7 @@ struct ftrace_event_call {
 	struct list_head	fields;
 	int			n_preds;
 	struct filter_pred	**preds;
+	void			*mod;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
diff --git a/include/linux/module.h b/include/linux/module.h
index 627ac08..6155fa4 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -337,6 +337,10 @@ struct module
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	struct ftrace_event_call *trace_events;
+	unsigned int num_trace_events;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 28051da..15ca2c7 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_TRACE_SEQ_H
 #define _LINUX_TRACE_SEQ_H
 
+#include <linux/fs.h>
+
 /*
  * Trace sequences are used to allow a function to call several other functions
  * to create a string of data to use (up to a max of PAGE_SIZE.
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 955b967..60c5323 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -477,6 +477,7 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.event			= &ftrace_event_type_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
diff --git a/kernel/module.c b/kernel/module.c
index e797812..a039470 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
 */
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
@@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod,
 					sizeof(*mod->tracepoints),
 					&mod->num_tracepoints);
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+					 "_ftrace_events",
+					 sizeof(*mod->trace_events),
+					 &mod->num_trace_events);
+#endif
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8b9e621..a4b1777 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 		return d_events;
 	}
 
-	system->name = name;
+	system->name = kstrdup(name, GFP_KERNEL);
+	if (!system->name) {
+		debugfs_remove(system->entry);
+		kfree(system);
+		return d_events;
+	}
+
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
@@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * then the system would be called "TRACE_SYSTEM".
 	 */
-	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 
 	if (call->raw_init) {
@@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 	}
 
-	if (call->regfunc) {
-		entry = debugfs_create_file("enable", 0644, call->dir, call,
-					    &ftrace_enable_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/enable' entry\n", call->name);
-	}
+	if (call->regfunc)
+		entry = trace_create_file("enable", 0644, call->dir, call,
+					  &ftrace_enable_fops);
 
-	if (call->id) {
-		entry = debugfs_create_file("id", 0444, call->dir, call,
-				&ftrace_event_id_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs '%s/id' entry\n",
-					call->name);
-	}
+	if (call->id)
+		entry = trace_create_file("id", 0444, call->dir, call,
+					  &ftrace_event_id_fops);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 			return ret;
 		}
-		entry = debugfs_create_file("filter", 0644, call->dir, call,
-					    &ftrace_event_filter_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/filter' entry\n", call->name);
+		entry = trace_create_file("filter", 0644, call->dir, call,
+					  &ftrace_event_filter_fops);
 	}
 
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
 
-	entry = debugfs_create_file("format", 0444, call->dir, call,
-				    &ftrace_event_format_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/format' entry\n", call->name);
+	entry = trace_create_file("format", 0444, call->dir, call,
+				  &ftrace_event_format_fops);
+
+	return 0;
+}
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+static void trace_module_add_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *start, *end;
+	struct dentry *d_events;
+
+	start = mod->trace_events;
+	end = mod->trace_events + mod->num_trace_events;
+
+	if (start == end)
+		return;
+
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return;
+
+	for_each_event(call, start, end) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+		call->mod = mod;
+		list_add(&call->list, &ftrace_events);
+		event_create_dir(call, d_events);
+	}
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *p;
+
+	list_for_each_entry_safe(call, p, &ftrace_events, list) {
+		if (call->mod == mod) {
+			if (call->enabled) {
+				call->enabled = 0;
+				call->unregfunc();
+			}
+			if (call->event)
+				unregister_ftrace_event(call->event);
+			debugfs_remove_recursive(call->dir);
+			list_del(&call->list);
+		}
+	}
+}
+
+int trace_module_notify(struct notifier_block *self,
+			unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	mutex_lock(&event_mutex);
+	switch (val) {
+	case MODULE_STATE_COMING:
+		trace_module_add_events(mod);
+		break;
+	case MODULE_STATE_GOING:
+		trace_module_remove_events(mod);
+		break;
+	}
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
 
+struct notifier_block trace_module_nb = {
+	.notifier_call = trace_module_notify,
+	.priority = 0,
+};
+
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 static __init int event_trace_init(void)
 {
 	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
+	int ret;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -837,7 +897,7 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
-	for_each_event(call) {
+	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
@@ -845,6 +905,10 @@ static __init int event_trace_init(void)
 		event_create_dir(call, d_events);
 	}
 
+	ret = register_module_notifier(&trace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace events module notifier\n");
+
 	return 0;
 }
 fs_initcall(event_trace_init);
-- 
cgit v0.10.2


From 19e4529ee7345079eeacc8e40cf69a304a64dc23 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 14 Apr 2009 17:27:18 +1000
Subject: modules: Fix up build when CONFIG_MODULE_UNLOAD=n.

Commit 3d43321b7015387cfebbe26436d0e9d299162ea1 ("modules: sysctl to
block module loading") introduces a modules_disabled variable that is
only defined if CONFIG_MODULE_UNLOAD is enabled, despite being used in
other places. This moves it up and fixes up the build.

  CC      kernel/module.o
kernel/module.c: In function 'sys_init_module':
kernel/module.c:2401: error: 'modules_disabled' undeclared (first use in this function)
kernel/module.c:2401: error: (Each undeclared identifier is reported only once
kernel/module.c:2401: error: for each function it appears in.)
make[1]: *** [kernel/module.o] Error 1
make: *** [kernel/module.o] Error 2

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/kernel/module.c b/kernel/module.c
index eeb3f7b..ee7ab61 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -71,6 +71,9 @@
 static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
 /* Waiting for a module to finish initializing? */
 static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
@@ -778,9 +781,6 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
-- 
cgit v0.10.2


From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 18:22:32 -0400
Subject: tracing/events: fix compile for modules disabled

Impact: compile fix

The addition of TRACE_EVENT for modules breaks the build for when
modules are disabled. This code fixes that.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a4b1777..6591d83 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     (unsigned long)event < (unsigned long)end;		\
 	     event++)
 
+#ifdef CONFIG_MODULES
 static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_event_call *call, *start, *end;
@@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod)
 	}
 }
 
-int trace_module_notify(struct notifier_block *self,
-			unsigned long val, void *data)
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
 {
 	struct module *mod = data;
 
@@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self,
 
 	return 0;
 }
+#else
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
 
 struct notifier_block trace_module_nb = {
 	.notifier_call = trace_module_notify,
-- 
cgit v0.10.2


From ecda8ae02a08ef065ff387f5cb2a2d4999da2408 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 18:49:38 -0400
Subject: tracing/events: fix lockdep system name

Impact: fix compile error of lockdep event tracer

Ingo Molnar pointed out that the system name for the lockdep tracer was "lock"
which is used to include the event trace file name. It should be "lockdep"

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 4d301e7..45e326b 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -5,7 +5,7 @@
 #include <linux/tracepoint.h>
 
 #undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
+#define TRACE_SYSTEM lockdep
 
 #ifdef CONFIG_LOCKDEP
 
-- 
cgit v0.10.2


From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 19:39:12 -0400
Subject: tracing/events: move trace point headers into include/trace/events

Impact: clean up

Create a sub directory in include/trace called events to keep the
trace point headers in their own separate directory. Only headers that
declare trace points should be defined in this directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
index 15c45a2..b616d39 100644
--- a/include/linux/kmemtrace.h
+++ b/include/linux/kmemtrace.h
@@ -9,7 +9,7 @@
 
 #ifdef __KERNEL__
 
-#include <trace/kmem.h>
+#include <trace/events/kmem.h>
 
 #ifdef CONFIG_KMEMTRACE
 extern void kmemtrace_init(void);
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 980eb66..1886941 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -43,7 +43,7 @@
 #endif
 
 #ifndef TRACE_INCLUDE_PATH
-# define __TRACE_INCLUDE(system) <trace/system.h>
+# define __TRACE_INCLUDE(system) <trace/events/system.h>
 # define UNDEF_TRACE_INCLUDE_FILE
 #else
 # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
new file mode 100644
index 0000000..75e3468
--- /dev/null
+++ b/include/trace/events/irq.h
@@ -0,0 +1,57 @@
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+/*
+ * Tracepoint for entry of interrupt handler:
+ */
+TRACE_FORMAT(irq_handler_entry,
+	TP_PROTO(int irq, struct irqaction *action),
+	TP_ARGS(irq, action),
+	TP_FMT("irq=%d handler=%s", irq, action->name)
+	);
+
+/*
+ * Tracepoint for return of an interrupt handler:
+ */
+TRACE_EVENT(irq_handler_exit,
+
+	TP_PROTO(int irq, struct irqaction *action, int ret),
+
+	TP_ARGS(irq, action, ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq	)
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->irq	= irq;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("irq=%d return=%s",
+		  __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+TRACE_FORMAT(softirq_entry,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
+TRACE_FORMAT(softirq_exit,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
new file mode 100644
index 0000000..c22c42f
--- /dev/null
+++ b/include/trace/events/kmem.h
@@ -0,0 +1,194 @@
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KMEM_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+#endif /* _TRACE_KMEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
new file mode 100644
index 0000000..45e326b
--- /dev/null
+++ b/include/trace/events/lockdep.h
@@ -0,0 +1,60 @@
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKDEP_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockdep
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_FORMAT(lock_acquire,
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+	TP_FMT("%s%s%s", trylock ? "try " : "",
+		read ? "read " : "", lock->name)
+	);
+
+TRACE_FORMAT(lock_release,
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_ARGS(lock, nested, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_FORMAT(lock_contended,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+	TP_ARGS(lock, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__entry->name = lock->name;
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
+
+#endif
+#endif
+
+#endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
new file mode 100644
index 0000000..ffa1cab
--- /dev/null
+++ b/include/trace/events/sched.h
@@ -0,0 +1,339 @@
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sched
+
+/*
+ * Tracepoint for calling kthread_stop, performed to end a kthread:
+ */
+TRACE_EVENT(sched_kthread_stop,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid	= t->pid;
+	),
+
+	TP_printk("task %s:%d", __entry->comm, __entry->pid)
+);
+
+/*
+ * Tracepoint for the return value of the kthread stopping:
+ */
+TRACE_EVENT(sched_kthread_stop_ret,
+
+	TP_PROTO(int ret),
+
+	TP_ARGS(ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->ret	= ret;
+	),
+
+	TP_printk("ret %d", __entry->ret)
+);
+
+/*
+ * Tracepoint for waiting on task to unschedule:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wait_task,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p),
+
+	TP_ARGS(rq, p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->prio	= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for waking up a task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for waking up a new task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup_new,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for task switches, performed by the scheduler:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_switch,
+
+	TP_PROTO(struct rq *rq, struct task_struct *prev,
+		 struct task_struct *next),
+
+	TP_ARGS(rq, prev, next),
+
+	TP_STRUCT__entry(
+		__array(	char,	prev_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	prev_pid			)
+		__field(	int,	prev_prio			)
+		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	next_pid			)
+		__field(	int,	next_prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_pid	= prev->pid;
+		__entry->prev_prio	= prev->prio;
+		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
+		__entry->next_pid	= next->pid;
+		__entry->next_prio	= next->prio;
+	),
+
+	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->next_comm, __entry->next_pid, __entry->next_prio)
+);
+
+/*
+ * Tracepoint for a task being migrated:
+ */
+TRACE_EVENT(sched_migrate_task,
+
+	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+
+	TP_ARGS(p, orig_cpu, dest_cpu),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	orig_cpu		)
+		__field(	int,	dest_cpu		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->orig_cpu	= orig_cpu;
+		__entry->dest_cpu	= dest_cpu;
+	),
+
+	TP_printk("task %s:%d [%d] from: %d  to: %d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->orig_cpu, __entry->dest_cpu)
+);
+
+/*
+ * Tracepoint for freeing a task:
+ */
+TRACE_EVENT(sched_process_free,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a task exiting:
+ */
+TRACE_EVENT(sched_process_exit,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a waiting task:
+ */
+TRACE_EVENT(sched_process_wait,
+
+	TP_PROTO(struct pid *pid),
+
+	TP_ARGS(pid),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+		__entry->pid		= pid_nr(pid);
+		__entry->prio		= current->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for do_fork:
+ */
+TRACE_EVENT(sched_process_fork,
+
+	TP_PROTO(struct task_struct *parent, struct task_struct *child),
+
+	TP_ARGS(parent, child),
+
+	TP_STRUCT__entry(
+		__array(	char,	parent_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	parent_pid			)
+		__array(	char,	child_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	child_pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
+		__entry->parent_pid	= parent->pid;
+		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
+		__entry->child_pid	= child->pid;
+	),
+
+	TP_printk("parent %s:%d  child %s:%d",
+		__entry->parent_comm, __entry->parent_pid,
+		__entry->child_comm, __entry->child_pid)
+);
+
+/*
+ * Tracepoint for sending a signal:
+ */
+TRACE_EVENT(sched_signal_send,
+
+	TP_PROTO(int sig, struct task_struct *p),
+
+	TP_ARGS(sig, p),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig			)
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->sig	= sig;
+	),
+
+	TP_printk("sig: %d  task %s:%d",
+		  __entry->sig, __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
new file mode 100644
index 0000000..1e8fabb
--- /dev/null
+++ b/include/trace/events/skb.h
@@ -0,0 +1,40 @@
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/irq.h b/include/trace/irq.h
deleted file mode 100644
index 75e3468..0000000
--- a/include/trace/irq.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_IRQ_H
-
-#include <linux/tracepoint.h>
-#include <linux/interrupt.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-	TP_PROTO(int irq, struct irqaction *action),
-	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-	TP_PROTO(int irq, struct irqaction *action, int ret),
-
-	TP_ARGS(irq, action, ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	irq	)
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->irq	= irq;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("irq=%d return=%s",
-		  __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-TRACE_FORMAT(softirq_exit,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-#endif /*  _TRACE_IRQ_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
deleted file mode 100644
index c22c42f..0000000
--- a/include/trace/kmem.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KMEM_H
-
-#include <linux/types.h>
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kmem
-
-TRACE_EVENT(kmalloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmem_cache_alloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmalloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kmem_cache_alloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kfree,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-
-TRACE_EVENT(kmem_cache_free,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-#endif /* _TRACE_KMEM_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
deleted file mode 100644
index 45e326b..0000000
--- a/include/trace/lockdep.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_LOCKDEP_H
-
-#include <linux/lockdep.h>
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lockdep
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-		int trylock, int read, int check,
-		struct lockdep_map *next_lock, unsigned long ip),
-	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
-
-TRACE_FORMAT(lock_release,
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-
-	TP_ARGS(lock, ip, waittime),
-
-	TP_STRUCT__entry(
-		__field(const char *, name)
-		__field(unsigned long, wait_usec)
-		__field(unsigned long, wait_nsec_rem)
-	),
-	TP_fast_assign(
-		__entry->name = lock->name;
-		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
-		__entry->wait_usec = (unsigned long) waittime;
-	),
-	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
-				       __entry->wait_nsec_rem)
-);
-
-#endif
-#endif
-
-#endif /* _TRACE_LOCKDEP_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/sched.h b/include/trace/sched.h
deleted file mode 100644
index ffa1cab..0000000
--- a/include/trace/sched.h
+++ /dev/null
@@ -1,339 +0,0 @@
-#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_SCHED_H
-
-#include <linux/sched.h>
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM sched
-
-/*
- * Tracepoint for calling kthread_stop, performed to end a kthread:
- */
-TRACE_EVENT(sched_kthread_stop,
-
-	TP_PROTO(struct task_struct *t),
-
-	TP_ARGS(t),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
-		__entry->pid	= t->pid;
-	),
-
-	TP_printk("task %s:%d", __entry->comm, __entry->pid)
-);
-
-/*
- * Tracepoint for the return value of the kthread stopping:
- */
-TRACE_EVENT(sched_kthread_stop_ret,
-
-	TP_PROTO(int ret),
-
-	TP_ARGS(ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->ret	= ret;
-	),
-
-	TP_printk("ret %d", __entry->ret)
-);
-
-/*
- * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wait_task,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p),
-
-	TP_ARGS(rq, p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->prio	= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup_new,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_switch,
-
-	TP_PROTO(struct rq *rq, struct task_struct *prev,
-		 struct task_struct *next),
-
-	TP_ARGS(rq, prev, next),
-
-	TP_STRUCT__entry(
-		__array(	char,	prev_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	prev_pid			)
-		__field(	int,	prev_prio			)
-		__array(	char,	next_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	next_pid			)
-		__field(	int,	next_prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-		__entry->prev_pid	= prev->pid;
-		__entry->prev_prio	= prev->prio;
-		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-		__entry->next_pid	= next->pid;
-		__entry->next_prio	= next->prio;
-	),
-
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->next_comm, __entry->next_pid, __entry->next_prio)
-);
-
-/*
- * Tracepoint for a task being migrated:
- */
-TRACE_EVENT(sched_migrate_task,
-
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-
-	TP_ARGS(p, orig_cpu, dest_cpu),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	orig_cpu		)
-		__field(	int,	dest_cpu		)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
-		__entry->dest_cpu	= dest_cpu;
-	),
-
-	TP_printk("task %s:%d [%d] from: %d  to: %d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->orig_cpu, __entry->dest_cpu)
-);
-
-/*
- * Tracepoint for freeing a task:
- */
-TRACE_EVENT(sched_process_free,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a task exiting:
- */
-TRACE_EVENT(sched_process_exit,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a waiting task:
- */
-TRACE_EVENT(sched_process_wait,
-
-	TP_PROTO(struct pid *pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-		__entry->pid		= pid_nr(pid);
-		__entry->prio		= current->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for do_fork:
- */
-TRACE_EVENT(sched_process_fork,
-
-	TP_PROTO(struct task_struct *parent, struct task_struct *child),
-
-	TP_ARGS(parent, child),
-
-	TP_STRUCT__entry(
-		__array(	char,	parent_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	parent_pid			)
-		__array(	char,	child_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	child_pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
-		__entry->parent_pid	= parent->pid;
-		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
-		__entry->child_pid	= child->pid;
-	),
-
-	TP_printk("parent %s:%d  child %s:%d",
-		__entry->parent_comm, __entry->parent_pid,
-		__entry->child_comm, __entry->child_pid)
-);
-
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-	TP_PROTO(int sig, struct task_struct *p),
-
-	TP_ARGS(sig, p),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig			)
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->sig	= sig;
-	),
-
-	TP_printk("sig: %d  task %s:%d",
-		  __entry->sig, __entry->comm, __entry->pid)
-);
-
-#endif /* _TRACE_SCHED_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/skb.h b/include/trace/skb.h
deleted file mode 100644
index 1e8fabb..0000000
--- a/include/trace/skb.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_SKB_H
-
-#include <linux/skbuff.h>
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM skb
-
-/*
- * Tracepoint for free an sk_buff:
- */
-TRACE_EVENT(kfree_skb,
-
-	TP_PROTO(struct sk_buff *skb, void *location),
-
-	TP_ARGS(skb, location),
-
-	TP_STRUCT__entry(
-		__field(	void *,		skbaddr		)
-		__field(	unsigned short,	protocol	)
-		__field(	void *,		location	)
-	),
-
-	TP_fast_assign(
-		__entry->skbaddr = skb;
-		if (skb) {
-			__entry->protocol = ntohs(skb->protocol);
-		}
-		__entry->location = location;
-	),
-
-	TP_printk("skbaddr=%p protocol=%u location=%p",
-		__entry->skbaddr, __entry->protocol, __entry->location)
-);
-
-#endif /* _TRACE_SKB_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 2fe9d2c..cab535c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 4bebf26..085f73e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,7 +61,6 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
-#include <trace/sched.h>
 #include <linux/magic.h>
 
 #include <asm/pgtable.h>
@@ -71,6 +70,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/sched.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 983d8be..37c6363 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -20,7 +20,7 @@
 #include <linux/bootmem.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/irq.h>
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e1c7692..41c88fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,7 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #define KTHREAD_NICE_LEVEL (-5)
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 257f21a..47b201e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -48,7 +48,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/lockdep.h>
+#include <trace/events/lockdep.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index e6d4518..9f7ffd0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -79,7 +79,7 @@
 #include "sched_cpupri.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d5703f..94ec0a4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a2d9b45..7ab9dfd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
-#include <trace/irq.h>
+#include <trace/events/irq.h>
 
 #include <asm/irq.h>
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8e6a0b5..a234889 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,7 +29,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9d8cccd..a98106d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f..b8b13c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
diff --git a/mm/util.c b/mm/util.c
index 0e74a22..6794a33 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,7 +7,7 @@
 #include <asm/uaccess.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/kmem.h>
+#include <trace/events/kmem.h>
 
 /**
  * kstrdup - allocate space for and copy an existing string
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9fd0dc3..b75b6ce 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -23,7 +23,7 @@
 #include <linux/bitops.h>
 #include <net/genetlink.h>
 
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include <asm/unaligned.h>
 
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 8017720..499a67e 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -24,6 +24,6 @@
 #include <asm/bitops.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce6356c..12806b8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,7 +65,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include "kmap_skb.h"
 
-- 
cgit v0.10.2


From 9cfe06f8cd5c8c3ad6ab323973e87dde670642b8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 21:37:03 -0400
Subject: tracing/events: add trace-events-sample

This patch adds a sample to the samples directory on how to create
and use TRACE_EVENT trace points.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/samples/Kconfig b/samples/Kconfig
index 4b02f5a..93f41c0 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -19,6 +19,13 @@ config SAMPLE_TRACEPOINTS
 	help
 	  This build tracepoints example modules.
 
+config SAMPLE_TRACE_EVENTS
+	tristate "Build trace_events examples"
+	depends on EVENT_TRACING
+	default m
+	help
+	  This build trace event example modules.
+
 config SAMPLE_KOBJECT
 	tristate "Build kobject examples"
 	help
diff --git a/samples/Makefile b/samples/Makefile
index 10eaca8..13e4b47 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,3 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ trace_events/
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
new file mode 100644
index 0000000..06c6dea
--- /dev/null
+++ b/samples/trace_events/Makefile
@@ -0,0 +1,8 @@
+# builds the trace events example kernel modules;
+# then to use one (as root):  insmod <module_name.ko>
+
+PWD := $(shell pwd)
+
+CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
new file mode 100644
index 0000000..f33b3ba
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.c
@@ -0,0 +1,56 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+
+/*
+ * Any file that uses trace points, must include the header.
+ * But only one file, must include the header by defining
+ * CREATE_TRACE_POINTS first.  This will make the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace-events-sample.h"
+
+
+static void simple_thread_func(int cnt)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(HZ);
+	trace_foo_bar("hello", cnt);
+
+	if (!(cnt % 10))
+		/* It is really important that I say "hi!" */
+		printk(KERN_EMERG "hi!\n");
+}
+
+static int simple_thread(void *arg)
+{
+	int cnt = 0;
+
+	while (!kthread_should_stop())
+		simple_thread_func(cnt++);
+
+	return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+static int __init trace_event_init(void)
+{
+	simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
+	if (IS_ERR(simple_tsk))
+		return -1;
+
+	return 0;
+}
+
+static void __exit trace_event_exit(void)
+{
+	kthread_stop(simple_tsk);
+}
+
+module_init(trace_event_init);
+module_exit(trace_event_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-events-sample");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
new file mode 100644
index 0000000..eab4644
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.h
@@ -0,0 +1,124 @@
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ *  || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_SAMPLE_H
+
+/*
+ * All trace headers should include tracepoint.h, until we finally
+ * make it into a standard header.
+ */
+#include <linux/tracepoint.h>
+
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /debugfs/tracing/events/<system>
+ *
+ * The define_trace.h belowe will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ *
+ * If you want a different system than file name, you can override
+ * the header name by defining TRACE_INCLUDE_FILE
+ *
+ * If this file was called, goofy.h, then we would define:
+ *
+ * #define TRACE_INCLUDE_FILE goofy
+ *
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM trace-events-sample
+
+/*
+ * The TRACE_EVENT macro is broken up into 5 parts.
+ *
+ * name: name of the trace point. This is also how to enable the tracepoint.
+ *   A function called trace_foo_bar() will be created.
+ *
+ * proto: the prototype of the function trace_foo_bar()
+ *   Here it is trace_foo_bar(char *foo, int bar).
+ *
+ * args:  must match the arguments in the prototype.
+ *    Here it is simply "foo, bar".
+ *
+ * struct:  This defines the way the data will be stored in the ring buffer.
+ *    There are currently two types of elements. __field and __array.
+ *    a __field is broken up into (type, name). Where type can be any
+ *    type but an array.
+ *    For an array. there are three fields. (type, name, size). The
+ *    type of elements in the array, the name of the field and the size
+ *    of the array.
+ *
+ *    __array( char, foo, 10) is the same as saying   char foo[10].
+ *
+ * fast_assign: This is a C like function that is used to store the items
+ *    into the ring buffer.
+ *
+ * printk: This is a way to print out the data in pretty print. This is
+ *    useful if the system crashes and you are logging via a serial line,
+ *    the data can be printed to the console using this "printk" method.
+ *
+ * Note, that for both the assign and the printk, __entry is the handler
+ * to the data structure in the ring buffer, and is defined by the
+ * TP_STRUCT__entry.
+ */
+TRACE_EVENT(foo_bar,
+
+	TP_PROTO(char *foo, int bar),
+
+	TP_ARGS(foo, bar),
+
+	TP_STRUCT__entry(
+		__array(	char,	foo,    10		)
+		__field(	int,	bar			)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->foo, foo, 10);
+		__entry->bar	= bar;
+	),
+
+	TP_printk("foo %s %d", __entry->foo, __entry->bar)
+);
+#endif
+
+/***** NOTICE! The #if protection ends here. *****/
+
+
+/*
+ * There are several ways I could have done this. If I left out the
+ * TRACE_INCLUDE_PATH, then it would default to the kernel source
+ * include/trace/events directory.
+ *
+ * I could specify a path from the define_trace.h file back to this
+ * file.
+ *
+ * #define TRACE_INCLUDE_PATH ../../samples/trace_events
+ *
+ * But I chose to simply make it use the current directory and then in
+ * the Makefile I added:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+ *
+ * This will make sure the current path is part of the include
+ * structure for our file so that we can find it.
+ *
+ * I could have made only the top level directory the include:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)
+ *
+ * And then let the path to this directory be the TRACE_INCLUDE_PATH:
+ *
+ * #define TRACE_INCLUDE_PATH samples/trace_events
+ *
+ * But then if something defines "samples" or "trace_events" then we
+ * could risk that being converted too, and give us an unexpected
+ * result.
+ */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
-- 
cgit v0.10.2


From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 14 Apr 2009 20:17:16 +0200
Subject: rculist: use list_entry_rcu in places where it's appropriate

Use previously introduced list_entry_rcu instead of an open-coded
list_entry + rcu_dereference combination.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: dipankar@in.ibm.com
LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..886df41 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,6 +77,7 @@ struct sched_param {
 #include <linux/proportions.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
 #include <linux/time.h>
@@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-#define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+#define next_task(p) \
+	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
@@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
 
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
-	return list_entry(rcu_dereference(p->thread_group.next),
-			  struct task_struct, thread_group);
+	return list_entry_rcu(p->thread_group.next,
+			      struct task_struct, thread_group);
 }
 
 static inline int thread_group_empty(struct task_struct *p)
diff --git a/ipc/sem.c b/ipc/sem.c
index 16a2189..87c2b64 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk)
 		int i;
 
 		rcu_read_lock();
-		un = list_entry(rcu_dereference(ulp->list_proc.next),
-					struct sem_undo, list_proc);
+		un = list_entry_rcu(ulp->list_proc.next,
+				    struct sem_undo, list_proc);
 		if (&un->list_proc == &ulp->list_proc)
 			semid = -1;
 		 else
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index ffbe259..510186f 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -84,8 +84,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos)
 	 * against concurrent list-extension
 	 */
 	rcu_read_lock();
-	qe = list_entry(rcu_dereference(qe->later.next),
-			struct ima_queue_entry, later);
+	qe = list_entry_rcu(qe->later.next,
+			    struct ima_queue_entry, later);
 	rcu_read_unlock();
 	(*pos)++;
 
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index e03a7e1..11d2cb1 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
 		return;
 	}
 
-	m = list_entry(rcu_dereference(smk_netlbladdr_list.next),
-			 struct smk_netlbladdr, list);
+	m = list_entry_rcu(smk_netlbladdr_list.next,
+			   struct smk_netlbladdr, list);
 
 	/* the comparison '>' is a bit hacky, but works */
 	if (new->smk_mask.s_addr > m->smk_mask.s_addr) {
@@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
 			list_add_rcu(&new->list, &m->list);
 			return;
 		}
-		m_next = list_entry(rcu_dereference(m->list.next),
-				 struct smk_netlbladdr, list);
+		m_next = list_entry_rcu(m->list.next,
+					struct smk_netlbladdr, list);
 		if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) {
 			list_add_rcu(&new->list, &m->list);
 			return;
-- 
cgit v0.10.2


From b206525ad1f653b7da35f5827be93770d28eae11 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 23:04:37 +0530
Subject: x86: k8 convert node_to_k8_nb_misc() from a macro to an inline
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Converting node_to_k8_nb_misc() from a macro to an inline function
makes compiler see the 'node' parameter in the !CONFIG_K8_NB too,
which eliminates these compiler warnings:

  arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘show_cache_disable’:
  arch/x86/kernel/cpu/intel_cacheinfo.c:712: warning: unused variable ‘node’
  arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘store_cache_disable’:
  arch/x86/kernel/cpu/intel_cacheinfo.c:739: warning: unused variable ‘node’

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1239730477.2966.26.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c23b3d17..c2d1f3b 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -13,10 +13,15 @@ extern void k8_flush_garts(void);
 extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_K8_NB
-#define node_to_k8_nb_misc(node) \
-	(node < num_k8_northbridges) ? k8_northbridges[node] : NULL
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
 #else
-#define node_to_k8_nb_misc(node) NULL
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+	return NULL;
+}
 #endif
 
 
-- 
cgit v0.10.2


From e6a1a89d572c31b62d6dcf11a371c7323852d9b2 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 15 Apr 2009 18:22:41 +0900
Subject: dma-debug: add dma_debug_resize_entries() to adjust the number of
 dma_debug_entries

We use a static value for the number of dma_debug_entries. It can be
overwritten by a kernel command line option.

Some IOMMUs (e.g. GART) can't set an appropriate value by a kernel
command line option because they can't know such value until they
finish initializing up their hardware.

This patch adds dma_debug_resize_entries() enables IOMMUs to adjust
the number of dma_debug_entries anytime.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: fujita.tomonori@lab.ntt.co.jp
Cc: akpm@linux-foundation.org
LKML-Reference: <20090415182234R.fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 28d53cb..171ad8a 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
 
 extern void dma_debug_init(u32 num_entries);
 
+extern int dma_debug_resize_entries(u32 num_entries);
+
 extern void debug_dma_map_page(struct device *dev, struct page *page,
 			       size_t offset, size_t size,
 			       int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
 {
 }
 
+static inline int dma_debug_resize_entries(u32 num_entries)
+{
+	return 0;
+}
+
 static inline void debug_dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
 				      int direction, dma_addr_t dma_addr,
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d3da7ed..5d61019 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -85,6 +85,7 @@ static u32 show_num_errors = 1;
 
 static u32 num_free_entries;
 static u32 min_free_entries;
+static u32 nr_total_entries;
 
 /* number of preallocated entries requested by kernel cmdline */
 static u32 req_entries;
@@ -257,6 +258,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	put_hash_bucket(bucket, &flags);
 }
 
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+	struct dma_debug_entry *entry;
+
+	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+	list_del(&entry->list);
+	memset(entry, 0, sizeof(*entry));
+
+	num_free_entries -= 1;
+	if (num_free_entries < min_free_entries)
+		min_free_entries = num_free_entries;
+
+	return entry;
+}
+
 /* struct dma_entry allocator
  *
  * The next two functions implement the allocator for
@@ -276,9 +292,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 		goto out;
 	}
 
-	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
-	list_del(&entry->list);
-	memset(entry, 0, sizeof(*entry));
+	entry = __dma_entry_alloc();
 
 #ifdef CONFIG_STACKTRACE
 	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +300,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	entry->stacktrace.skip = 2;
 	save_stack_trace(&entry->stacktrace);
 #endif
-	num_free_entries -= 1;
-	if (num_free_entries < min_free_entries)
-		min_free_entries = num_free_entries;
 
 out:
 	spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +321,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 }
 
+int dma_debug_resize_entries(u32 num_entries)
+{
+	int i, delta, ret = 0;
+	unsigned long flags;
+	struct dma_debug_entry *entry;
+	LIST_HEAD(tmp);
+
+	spin_lock_irqsave(&free_entries_lock, flags);
+
+	if (nr_total_entries < num_entries) {
+		delta = num_entries - nr_total_entries;
+
+		spin_unlock_irqrestore(&free_entries_lock, flags);
+
+		for (i = 0; i < delta; i++) {
+			entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+			if (!entry)
+				break;
+
+			list_add_tail(&entry->list, &tmp);
+		}
+
+		spin_lock_irqsave(&free_entries_lock, flags);
+
+		list_splice(&tmp, &free_entries);
+		nr_total_entries += i;
+		num_free_entries += i;
+	} else {
+		delta = nr_total_entries - num_entries;
+
+		for (i = 0; i < delta && !list_empty(&free_entries); i++) {
+			entry = __dma_entry_alloc();
+			kfree(entry);
+		}
+
+		nr_total_entries -= i;
+	}
+
+	if (nr_total_entries != num_entries)
+		ret = 1;
+
+	spin_unlock_irqrestore(&free_entries_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_debug_resize_entries);
+
 /*
  * DMA-API debugging init code
  *
@@ -490,6 +548,8 @@ void dma_debug_init(u32 num_entries)
 		return;
 	}
 
+	nr_total_entries = num_free_entries;
+
 	printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
 }
 
-- 
cgit v0.10.2


From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 14 Apr 2009 09:43:19 +0900
Subject: x86 gart: reimplement IOMMU_LEAK feature by using DMA_API_DEBUG

IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when
IOMMU entries is full, which might be useful to find a bad driver that
eats IOMMU entries.

DMA_API_DEBUG provides the similar feature, debug_dma_dump_mappings,
and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK
feature doesn't say who uses IOMMU entries so it's hard to find a bad
driver.

This patch reimplements the GART's IOMMU_LEAK feature by using
DMA_API_DEBUG.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e7..5865712 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -161,8 +161,7 @@ config IOMMU_DEBUG
 
 config IOMMU_LEAK
 	bool "IOMMU leak tracing"
-	depends on DEBUG_KERNEL
-	depends on IOMMU_DEBUG
+	depends on IOMMU_DEBUG && DMA_API_DEBUG
 	---help---
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58..1e8920d 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
 }
 
 #ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x)							\
-	do {								\
-		if (iommu_leak_tab)					\
-			iommu_leak_tab[x] = __builtin_return_address(0);\
-	} while (0)
-
-#define CLEAR_LEAK(x)							\
-	do {								\
-		if (iommu_leak_tab)					\
-			iommu_leak_tab[x] = NULL;			\
-	} while (0)
-
 /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
 static int leak_trace;
 static int iommu_leak_pages = 20;
 
 static void dump_leak(void)
 {
-	int i;
 	static int dump;
 
-	if (dump || !iommu_leak_tab)
+	if (dump)
 		return;
 	dump = 1;
-	show_stack(NULL, NULL);
 
-	/* Very crude. dump some from the end of the table too */
-	printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
-	       iommu_leak_pages);
-	for (i = 0; i < iommu_leak_pages; i += 2) {
-		printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
-				0);
-		printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
-	}
-	printk(KERN_DEBUG "\n");
+	show_stack(NULL, NULL);
+	debug_dma_dump_mappings(NULL);
 }
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
 #endif
 
 static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-		SET_LEAK(iommu_page + i);
 		phys_mem += PAGE_SIZE;
 	}
 	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-		CLEAR_LEAK(iommu_page + i);
 	}
 	free_iommu(iommu_page, npages);
 }
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
 		pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
 		while (pages--) {
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
-			SET_LEAK(iommu_page);
 			addr += PAGE_SIZE;
 			iommu_page++;
 		}
@@ -801,11 +771,12 @@ void __init gart_iommu_init(void)
 
 #ifdef CONFIG_IOMMU_LEAK
 	if (leak_trace) {
-		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-				  get_order(iommu_pages*sizeof(void *)));
-		if (!iommu_leak_tab)
+		int ret;
+
+		ret = dma_debug_resize_entries(iommu_pages);
+		if (ret)
 			printk(KERN_DEBUG
-			       "PCI-DMA: Cannot allocate leak trace area\n");
+			       "PCI-DMA: Cannot trace all the entries\n");
 	}
 #endif
 
-- 
cgit v0.10.2


From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 15 Apr 2009 09:59:10 +0800
Subject: sched: use group_first_cpu() instead of
 cpumask_first(sched_group_cpus())

Impact: cleanup

This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu()
for maintainability.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 92b4b56..7601cee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j).sd;
-			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
-- 
cgit v0.10.2


From 77857dc07247ed5fa700a197c96ef842d8dbebdf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 15 Apr 2009 11:57:01 -0700
Subject: x86: use used_vectors in init_IRQ()

Impact: fix crash with many devices

I found this crash:

[  552.616646] general protection fault: 0403 [#1] SMP
[  552.620013] last sysfs file:
/sys/devices/pci0000:00/0000:00:02.0/usb1/1-1/1-1:1.0/host13/target13:0:0/13:0:0:0/block/sr0/size
[  552.620013] CPU 0
[  552.620013] Modules linked in:
[  552.620013] Pid: 0, comm: swapper Not tainted 2.6.30-rc1-tip-01931-g8fcafd8-dirty #28 Sun Fire X4440
[  552.620013] RIP: 0010:[<ffffffff8023bada>]  [<ffffffff8023bada>] default_idle+0x7d/0xda
[  552.620013] RSP: 0018:ffffffff81345e68  EFLAGS: 00010246
[  552.620013] RAX: 0000000000000000 RBX: ffffffff8133d870 RCX: ffffc20000000000
[  552.620013] RDX: 00000000001d0620 RSI: ffffffff8023bad8 RDI: ffffffff802a3169
[  552.620013] RBP: ffffffff81345e98 R08: 0000000000000000 R09: ffffffff812244a0
[  552.620013] R10: ffffffff81345dc8 R11: 7ebe1b6fa0bcac50 R12: 4ec4ec4ec4ec4ec5
[  552.620013] R13: ffffffff813a54d0 R14: ffffffff813a7a40 R15: 0000000000000000
[  552.620013] FS:  00000000006d1880(0000) GS:ffffc20000000000(0000) knlGS:0000000000000000
[  552.620013] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[  552.620013] CR2: 00007fec9d936a50 CR3: 000000007d1a9000 CR4: 00000000000006e0
[  552.620013] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  552.620013] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  552.620013] Process swapper (pid: 0, threadinfo ffffffff81344000,task ffffffff812244a0)
[  552.620013] Stack:
[  552.620013]  0000000000000000 ffffc20000000000 00000000001d0620 7ebe1b6fa0bcac50
[  552.620013]  ffffffff8133d870 4ec4ec4ec4ec4ec5 ffffffff81345ec8 ffffffff8023bd84
[  552.620013]  4ec4ec4ec4ec4ec5 ffffffff813a54d0 7ebe1b6fa0bcac50 ffffffff8133d870
[  552.620013] Call Trace:
[  552.620013]  [<ffffffff8023bd84>] c1e_idle+0x109/0x124
[  552.620013]  [<ffffffff8023314b>] cpu_idle+0xb8/0x101
[  552.620013]  [<ffffffff80c16d6a>] rest_init+0x7e/0x94
[  552.620013]  [<ffffffff81357efc>] start_kernel+0x3dc/0x3fd
[  552.620013]  [<ffffffff813572a9>] x86_64_start_reservations+0xb9/0xd4
[  552.620013]  [<ffffffff813573b2>] x86_64_start_kernel+0xee/0x109
[  552.620013] Code: 48 8b 04 25 f8 b4 00 00 83 a0 3c e0 ff ff fb 0f ae f0 65 48 8b 04 25 f8 b4 00 00 f6 80 38 e0 ff ff 08 75 09 e8 71 76 06 00 fb f4 <eb> 06 e8 68 76 06 00 fb 65 48 8b 04 25 f8 b4 00 00 83 88 3c e0
[  552.620013] RIP  [<ffffffff8023bada>] default_idle+0x7d/0xda
[  552.620013]  RSP <ffffffff81345e68>
[  552.828646] ---[ end trace 4cbfc5c01382af7f ]---

Joerg Roedel said
	"The 0403 error code means that there was an external interrupt with vector
	0x80. Yinghai, my theory is that the kernel on this machine has no 32bit
	emulation compiled in, right? In this case the selector points to a zero entry
	which may cause the #gpf right after the hlt.
	But I have no idea where the external int 0x80 comes from"

it turns out that we could use 0x80 for external device on 64-bit
when 32-bit emulation is disabled.

But we forgot to set the gate for it.

try to set gate for it by checking used_vectors.

Also move apic_intr_init() early to avoid setting
that gate two times.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <49E62DFD.6010904@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b424c32..2e08b10 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -240,19 +240,19 @@ void __init native_init_IRQ(void)
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
 
+	apic_intr_init();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != IA32_SYSCALL_VECTOR)
+		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+		if (!test_bit(i, used_vectors))
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 	}
 
-	apic_intr_init();
-
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 
-- 
cgit v0.10.2


From ff7b1b4f000cea84f071c1b6aa2918b2119d66f1 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 15 Apr 2009 16:55:05 +0100
Subject: perfcounters: export perf_tpcounter_event

Needed for modular tracepoint support.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7f9521c..0939609 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2590,6 +2590,7 @@ void perf_tpcounter_event(int event_id)
 
 	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
 }
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
 extern int ftrace_profile_enable(int);
 extern void ftrace_profile_disable(int);
-- 
cgit v0.10.2


From 0207a2efb43d81e29e23662b5d035945688a103f Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Thu, 16 Apr 2009 14:40:56 +0900
Subject: sh: Add support for SH7724 (SH-Mobile R2R) CPU subtype.

This implements initial support for the SH-Mobile R2R CPU.
Based on Rev 0.11 of the initial SH7724 hardware manual.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e7390dd..505d1ac 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -347,6 +347,15 @@ config CPU_SUBTYPE_SH7723
 	help
 	  Select SH7723 if you have an SH-MobileR2 CPU.
 
+config CPU_SUBTYPE_SH7724
+	bool "Support SH7724 processor"
+	select CPU_SH4A
+	select CPU_SHX2
+	select ARCH_SPARSEMEM_ENABLE
+	select SYS_SUPPORTS_CMT
+	help
+	  Select SH7724 if you have an SH-MobileR2R CPU.
+
 config CPU_SUBTYPE_SH7763
 	bool "Support SH7763 processor"
 	select CPU_SH4A
@@ -495,6 +504,7 @@ config SH_PCLK_FREQ
 			      CPU_SUBTYPE_SH7203 || CPU_SUBTYPE_SH7206 || \
 			      CPU_SUBTYPE_SH7263 || CPU_SUBTYPE_MXG    || \
 			      CPU_SUBTYPE_SH7786
+	default "41666666" if CPU_SUBTYPE_SH7724
 	default "60000000" if CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 	default "66000000" if CPU_SUBTYPE_SH4_202
 	default "50000000"
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index 1fd58b4..005c962 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -32,7 +32,7 @@ enum cpu_type {
 
 	/* SH-4A types */
 	CPU_SH7763, CPU_SH7770, CPU_SH7780, CPU_SH7781, CPU_SH7785, CPU_SH7786,
-	CPU_SH7723, CPU_SHX3,
+	CPU_SH7723, CPU_SH7724, CPU_SHX3,
 
 	/* SH4AL-DSP types */
 	CPU_SH7343, CPU_SH7722, CPU_SH7366,
diff --git a/arch/sh/include/cpu-sh4/cpu/freq.h b/arch/sh/include/cpu-sh4/cpu/freq.h
index 749d1c4..ccf1d99 100644
--- a/arch/sh/include/cpu-sh4/cpu/freq.h
+++ b/arch/sh/include/cpu-sh4/cpu/freq.h
@@ -25,6 +25,24 @@
 #elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \
       defined(CONFIG_CPU_SUBTYPE_SH7780)
 #define	FRQCR			0xffc80000
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define FRQCRA			0xa4150000
+#define FRQCRB			0xa4150004
+#define VCLKCR			0xa4150048
+
+#define FCLKACR			0xa4150008
+#define FCLKBCR			0xa415000c
+#define FRQCR			FRQCRA
+#define SCLKACR			FCLKACR
+#define SCLKBCR			FCLKBCR
+#define FCLKACR			0xa4150008
+#define FCLKBCR			0xa415000c
+#define IrDACLKCR		0xa4150018
+
+#define MSTPCR0			0xa4150030
+#define MSTPCR1			0xa4150034
+#define MSTPCR2			0xa4150038
+
 #elif defined(CONFIG_CPU_SUBTYPE_SH7785)
 #define FRQCR0			0xffc80000
 #define FRQCR1			0xffc80004
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h
new file mode 100644
index 0000000..34605c9e3
--- /dev/null
+++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h
@@ -0,0 +1,255 @@
+#ifndef __ASM_SH7724_H__
+#define __ASM_SH7724_H__
+
+enum {
+	/* PTA */
+	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
+	GPIO_PTA3, GPIO_PTA2, GPIO_PTA1, GPIO_PTA0,
+
+	/* PTB */
+	GPIO_PTB7, GPIO_PTB6, GPIO_PTB5, GPIO_PTB4,
+	GPIO_PTB3, GPIO_PTB2, GPIO_PTB1, GPIO_PTB0,
+
+	/* PTC */
+	GPIO_PTC7, GPIO_PTC6, GPIO_PTC5, GPIO_PTC4,
+	GPIO_PTC3, GPIO_PTC2, GPIO_PTC1, GPIO_PTC0,
+
+	/* PTD */
+	GPIO_PTD7, GPIO_PTD6, GPIO_PTD5, GPIO_PTD4,
+	GPIO_PTD3, GPIO_PTD2, GPIO_PTD1, GPIO_PTD0,
+
+	/* PTE */
+	GPIO_PTE7, GPIO_PTE6, GPIO_PTE5, GPIO_PTE4,
+	GPIO_PTE3, GPIO_PTE2, GPIO_PTE1, GPIO_PTE0,
+
+	/* PTF */
+	GPIO_PTF7, GPIO_PTF6, GPIO_PTF5, GPIO_PTF4,
+	GPIO_PTF3, GPIO_PTF2, GPIO_PTF1, GPIO_PTF0,
+
+	/* PTG */
+			      GPIO_PTG5, GPIO_PTG4,
+	GPIO_PTG3, GPIO_PTG2, GPIO_PTG1, GPIO_PTG0,
+
+	/* PTH */
+	GPIO_PTH7, GPIO_PTH6, GPIO_PTH5, GPIO_PTH4,
+	GPIO_PTH3, GPIO_PTH2, GPIO_PTH1, GPIO_PTH0,
+
+	/* PTJ */
+	GPIO_PTJ7, GPIO_PTJ6, GPIO_PTJ5,
+	GPIO_PTJ3, GPIO_PTJ2, GPIO_PTJ1, GPIO_PTJ0,
+
+	/* PTK */
+	GPIO_PTK7, GPIO_PTK6, GPIO_PTK5, GPIO_PTK4,
+	GPIO_PTK3, GPIO_PTK2, GPIO_PTK1, GPIO_PTK0,
+
+	/* PTL */
+	GPIO_PTL7, GPIO_PTL6, GPIO_PTL5, GPIO_PTL4,
+	GPIO_PTL3, GPIO_PTL2, GPIO_PTL1, GPIO_PTL0,
+
+	/* PTM */
+	GPIO_PTM7, GPIO_PTM6, GPIO_PTM5, GPIO_PTM4,
+	GPIO_PTM3, GPIO_PTM2, GPIO_PTM1, GPIO_PTM0,
+
+	/* PTN */
+	GPIO_PTN7, GPIO_PTN6, GPIO_PTN5, GPIO_PTN4,
+	GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0,
+
+	/* PTQ */
+	GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4,
+	GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0,
+
+	/* PTR */
+	GPIO_PTR7, GPIO_PTR6, GPIO_PTR5, GPIO_PTR4,
+	GPIO_PTR3, GPIO_PTR2, GPIO_PTR1, GPIO_PTR0,
+
+	/* PTS */
+		   GPIO_PTS6, GPIO_PTS5, GPIO_PTS4,
+	GPIO_PTS3, GPIO_PTS2, GPIO_PTS1, GPIO_PTS0,
+
+	/* PTT */
+	GPIO_PTT7, GPIO_PTT6, GPIO_PTT5, GPIO_PTT4,
+	GPIO_PTT3, GPIO_PTT2, GPIO_PTT1, GPIO_PTT0,
+
+	/* PTU */
+	GPIO_PTU7, GPIO_PTU6, GPIO_PTU5, GPIO_PTU4,
+	GPIO_PTU3, GPIO_PTU2, GPIO_PTU1, GPIO_PTU0,
+
+	/* PTV */
+	GPIO_PTV7, GPIO_PTV6, GPIO_PTV5, GPIO_PTV4,
+	GPIO_PTV3, GPIO_PTV2, GPIO_PTV1, GPIO_PTV0,
+
+	/* PTW */
+	GPIO_PTW7, GPIO_PTW6, GPIO_PTW5, GPIO_PTW4,
+	GPIO_PTW3, GPIO_PTW2, GPIO_PTW1, GPIO_PTW0,
+
+	/* PTX */
+	GPIO_PTX7, GPIO_PTX6, GPIO_PTX5, GPIO_PTX4,
+	GPIO_PTX3, GPIO_PTX2, GPIO_PTX1, GPIO_PTX0,
+
+	/* PTY */
+	GPIO_PTY7, GPIO_PTY6, GPIO_PTY5, GPIO_PTY4,
+	GPIO_PTY3, GPIO_PTY2, GPIO_PTY1, GPIO_PTY0,
+
+	/* PTZ */
+	GPIO_PTZ7, GPIO_PTZ6, GPIO_PTZ5, GPIO_PTZ4,
+	GPIO_PTZ3, GPIO_PTZ2, GPIO_PTZ1, GPIO_PTZ0,
+
+	/* BSC (PTA/PTB/PTJ/PTQ/PTR/PTT) */
+	GPIO_FN_D31, GPIO_FN_D30, GPIO_FN_D29, GPIO_FN_D28,
+	GPIO_FN_D27, GPIO_FN_D26, GPIO_FN_D25, GPIO_FN_D24,
+	GPIO_FN_D23, GPIO_FN_D22, GPIO_FN_D21, GPIO_FN_D20,
+	GPIO_FN_D19, GPIO_FN_D18, GPIO_FN_D17, GPIO_FN_D16,
+	GPIO_FN_D15, GPIO_FN_D14, GPIO_FN_D13, GPIO_FN_D12,
+	GPIO_FN_D11, GPIO_FN_D10, GPIO_FN_D9,  GPIO_FN_D8,
+	GPIO_FN_D7,  GPIO_FN_D6,  GPIO_FN_D5,  GPIO_FN_D4,
+	GPIO_FN_D3,  GPIO_FN_D2,  GPIO_FN_D1,  GPIO_FN_D0,
+	GPIO_FN_A25, GPIO_FN_A24, GPIO_FN_A23, GPIO_FN_A22,
+	GPIO_FN_CS6B_CE1B,  GPIO_FN_CS6A_CE2B,
+	GPIO_FN_CS5B_CE1A,  GPIO_FN_CS5A_CE2A,
+	GPIO_FN_WE3_ICIOWR, GPIO_FN_WE2_ICIORD,
+	GPIO_FN_IOIS16,     GPIO_FN_WAIT,
+	GPIO_FN_BS,
+
+	/* KEYSC (PTA/PTB)*/
+	GPIO_FN_KEYOUT5_IN5, GPIO_FN_KEYOUT4_IN6, GPIO_FN_KEYIN4,
+	GPIO_FN_KEYIN3,  GPIO_FN_KEYIN2,  GPIO_FN_KEYIN1,  GPIO_FN_KEYIN0,
+	GPIO_FN_KEYOUT3, GPIO_FN_KEYOUT2, GPIO_FN_KEYOUT1, GPIO_FN_KEYOUT0,
+
+	/* ATAPI (PTA/PTB/PTK/PTR/PTS/PTW) */
+	GPIO_FN_IDED15, GPIO_FN_IDED14, GPIO_FN_IDED13, GPIO_FN_IDED12,
+	GPIO_FN_IDED11, GPIO_FN_IDED10, GPIO_FN_IDED9,  GPIO_FN_IDED8,
+	GPIO_FN_IDED7,  GPIO_FN_IDED6,  GPIO_FN_IDED5,  GPIO_FN_IDED4,
+	GPIO_FN_IDED3,  GPIO_FN_IDED2,  GPIO_FN_IDED1,  GPIO_FN_IDED0,
+	GPIO_FN_IDEA2,     GPIO_FN_IDEA1,     GPIO_FN_IDEA0,  GPIO_FN_IDEIOWR,
+	GPIO_FN_IODREQ,    GPIO_FN_IDECS0,    GPIO_FN_IDECS1, GPIO_FN_IDEIORD,
+	GPIO_FN_DIRECTION, GPIO_FN_EXBUF_ENB, GPIO_FN_IDERST, GPIO_FN_IODACK,
+	GPIO_FN_IDEINT,    GPIO_FN_IDEIORDY,
+
+	/* TPU (PTB/PTR/PTS) */
+	GPIO_FN_TPUTO3, GPIO_FN_TPUTO2, GPIO_FN_TPUTO1, GPIO_FN_TPUTO0,
+	GPIO_FN_TPUTI3, GPIO_FN_TPUTI2,
+
+	/* LCDC (PTC/PTD/PTE/PTF/PTM/PTR) */
+	GPIO_FN_LCDD23, GPIO_FN_LCDD22, GPIO_FN_LCDD21, GPIO_FN_LCDD20,
+	GPIO_FN_LCDD19, GPIO_FN_LCDD18, GPIO_FN_LCDD17, GPIO_FN_LCDD16,
+	GPIO_FN_LCDD15, GPIO_FN_LCDD14, GPIO_FN_LCDD13, GPIO_FN_LCDD12,
+	GPIO_FN_LCDD11, GPIO_FN_LCDD10, GPIO_FN_LCDD9,  GPIO_FN_LCDD8,
+	GPIO_FN_LCDD7,  GPIO_FN_LCDD6,  GPIO_FN_LCDD5,  GPIO_FN_LCDD4,
+	GPIO_FN_LCDD3,  GPIO_FN_LCDD2,  GPIO_FN_LCDD1,  GPIO_FN_LCDD0,
+	GPIO_FN_LCDVSYN,  GPIO_FN_LCDDISP,  GPIO_FN_LCDRS,  GPIO_FN_LCDHSYN,
+	GPIO_FN_LCDCS,    GPIO_FN_LCDDON,   GPIO_FN_LCDDCK, GPIO_FN_LCDWR,
+	GPIO_FN_LCDVEPWC, GPIO_FN_LCDVCPWC, GPIO_FN_LCDRD,  GPIO_FN_LCDLCLK,
+
+	/* SCIF0 (PTF/PTM) */
+	GPIO_FN_SCIF0_TXD, GPIO_FN_SCIF0_RXD, GPIO_FN_SCIF0_SCK,
+
+	/* SCIF1 (PTL) */
+	GPIO_FN_SCIF1_SCK, GPIO_FN_SCIF1_RXD, GPIO_FN_SCIF1_TXD,
+
+	/* SCIF2 (PTE/PTF/PTN) with LCDC, VOU */
+	GPIO_FN_SCIF2_L_TXD, GPIO_FN_SCIF2_L_SCK, GPIO_FN_SCIF2_L_RXD,
+	GPIO_FN_SCIF2_V_TXD, GPIO_FN_SCIF2_V_SCK, GPIO_FN_SCIF2_V_RXD,
+
+	/* SCIF3 (PTL/PTN/PTZ) with VOU, IRQ */
+	GPIO_FN_SCIF3_V_SCK, GPIO_FN_SCIF3_V_RXD, GPIO_FN_SCIF3_V_TXD,
+	GPIO_FN_SCIF3_V_CTS, GPIO_FN_SCIF3_V_RTS,
+	GPIO_FN_SCIF3_I_SCK, GPIO_FN_SCIF3_I_RXD, GPIO_FN_SCIF3_I_TXD,
+	GPIO_FN_SCIF3_I_CTS, GPIO_FN_SCIF3_I_RTS,
+
+	/* SCIF4 (PTE) */
+	GPIO_FN_SCIF4_SCK, GPIO_FN_SCIF4_RXD, GPIO_FN_SCIF4_TXD,
+
+	/* SCIF5 (PTS) */
+	GPIO_FN_SCIF5_SCK, GPIO_FN_SCIF5_RXD, GPIO_FN_SCIF5_TXD,
+
+	/* FSI (PTE/PTU/PTV) */
+	GPIO_FN_FSIMCKB,   GPIO_FN_FSIMCKA,    GPIO_FN_FSIOASD,
+	GPIO_FN_FSIIABCK,  GPIO_FN_FSIIALRCK,  GPIO_FN_FSIOABCK,
+	GPIO_FN_FSIOALRCK, GPIO_FN_CLKAUDIOAO, GPIO_FN_FSIIBSD,
+	GPIO_FN_FSIOBSD,   GPIO_FN_FSIIBBCK,   GPIO_FN_FSIIBLRCK,
+	GPIO_FN_FSIOBBCK,  GPIO_FN_FSIOBLRCK,  GPIO_FN_CLKAUDIOBO,
+	GPIO_FN_FSIIASD,
+
+	/* AUD (PTG) */
+	GPIO_FN_AUDCK,   GPIO_FN_AUDSYNC, GPIO_FN_AUDATA3,
+	GPIO_FN_AUDATA2, GPIO_FN_AUDATA1, GPIO_FN_AUDATA0,
+
+	/* VIO (PTS) (common?) */
+	GPIO_FN_VIO_CKO,
+
+	/* VIO0 (PTH/PTK) */
+	GPIO_FN_VIO0_D15, GPIO_FN_VIO0_D14, GPIO_FN_VIO0_D13, GPIO_FN_VIO0_D12,
+	GPIO_FN_VIO0_D11, GPIO_FN_VIO0_D10, GPIO_FN_VIO0_D9,  GPIO_FN_VIO0_D8,
+	GPIO_FN_VIO0_D7,  GPIO_FN_VIO0_D6,  GPIO_FN_VIO0_D5,  GPIO_FN_VIO0_D4,
+	GPIO_FN_VIO0_D3,  GPIO_FN_VIO0_D2,  GPIO_FN_VIO0_D1,  GPIO_FN_VIO0_D0,
+	GPIO_FN_VIO0_VD,  GPIO_FN_VIO0_CLK,
+	GPIO_FN_VIO0_FLD, GPIO_FN_VIO0_HD,
+
+	/* VIO1 (PTK/PTS) */
+	GPIO_FN_VIO1_D7,  GPIO_FN_VIO1_D6, GPIO_FN_VIO1_D5, GPIO_FN_VIO1_D4,
+	GPIO_FN_VIO1_D3,  GPIO_FN_VIO1_D2, GPIO_FN_VIO1_D1, GPIO_FN_VIO1_D0,
+	GPIO_FN_VIO1_FLD, GPIO_FN_VIO1_HD, GPIO_FN_VIO1_VD, GPIO_FN_VIO1_CLK,
+
+	/* Eth  (PTL/PTN/PTX) */
+	GPIO_FN_RMII_RXD0,    GPIO_FN_RMII_RXD1,
+	GPIO_FN_RMII_TXD0,    GPIO_FN_RMII_TXD1,
+	GPIO_FN_RMII_REF_CLK, GPIO_FN_RMII_TX_EN,
+	GPIO_FN_RMII_RX_ER,   GPIO_FN_RMII_CRS_DV,
+	GPIO_FN_LNKSTA,       GPIO_FN_MDIO,
+	GPIO_FN_MDC,
+
+	/* System (PTJ) */
+	GPIO_FN_PDSTATUS, GPIO_FN_STATUS2, GPIO_FN_STATUS0,
+
+	/* VOU (PTL/PTM/PTN*/
+	GPIO_FN_DV_D15,  GPIO_FN_DV_D14, GPIO_FN_DV_D13,   GPIO_FN_DV_D12,
+	GPIO_FN_DV_D11,  GPIO_FN_DV_D10, GPIO_FN_DV_D9,    GPIO_FN_DV_D8,
+	GPIO_FN_DV_D7,   GPIO_FN_DV_D6,  GPIO_FN_DV_D5,    GPIO_FN_DV_D4,
+	GPIO_FN_DV_D3,   GPIO_FN_DV_D2,  GPIO_FN_DV_D1,    GPIO_FN_DV_D0,
+	GPIO_FN_DV_CLKI, GPIO_FN_DV_CLK, GPIO_FN_DV_VSYNC, GPIO_FN_DV_HSYNC,
+
+	/* MSIOF0 (PTL/PTM) */
+	GPIO_FN_MSIOF0_RXD,   GPIO_FN_MSIOF0_TXD,
+	GPIO_FN_MSIOF0_MCK,   GPIO_FN_MSIOF0_TSCK,
+	GPIO_FN_MSIOF0_SS1,   GPIO_FN_MSIOF0_SS2,
+	GPIO_FN_MSIOF0_TSYNC, GPIO_FN_MSIOF0_RSCK,
+	GPIO_FN_MSIOF0_RSYNC,
+
+	/* MSIOF1 (PTV) */
+	GPIO_FN_MSIOF1_RXD,   GPIO_FN_MSIOF1_TXD,
+	GPIO_FN_MSIOF1_MCK,   GPIO_FN_MSIOF1_TSCK,
+	GPIO_FN_MSIOF1_SS1,   GPIO_FN_MSIOF1_SS2,
+	GPIO_FN_MSIOF1_TSYNC, GPIO_FN_MSIOF1_RSCK,
+	GPIO_FN_MSIOF1_RSYNC,
+
+	/* DMAC (PTU/PTX) */
+	GPIO_FN_DMAC_DACK0, GPIO_FN_DMAC_DREQ0,
+	GPIO_FN_DMAC_DACK1, GPIO_FN_DMAC_DREQ1,
+
+	/* SDHI0 (PTY) */
+	GPIO_FN_SDHI0CD, GPIO_FN_SDHI0WP, GPIO_FN_SDHI0CMD, GPIO_FN_SDHI0CLK,
+	GPIO_FN_SDHI0D3, GPIO_FN_SDHI0D2, GPIO_FN_SDHI0D1,  GPIO_FN_SDHI0D0,
+
+	/* SDHI1 (PTW) */
+	GPIO_FN_SDHI1CD, GPIO_FN_SDHI1WP, GPIO_FN_SDHI1CMD, GPIO_FN_SDHI1CLK,
+	GPIO_FN_SDHI1D3, GPIO_FN_SDHI1D2, GPIO_FN_SDHI1D1,  GPIO_FN_SDHI1D0,
+
+	/* MMC (PTW/PTX)*/
+	GPIO_FN_MMC_D7,  GPIO_FN_MMC_D6,  GPIO_FN_MMC_D5, GPIO_FN_MMC_D4,
+	GPIO_FN_MMC_D3,  GPIO_FN_MMC_D2,  GPIO_FN_MMC_D1, GPIO_FN_MMC_D0,
+	GPIO_FN_MMC_CLK, GPIO_FN_MMC_CMD,
+
+	/* IrDA (PTX) */
+	GPIO_FN_IRDA_OUT, GPIO_FN_IRDA_IN,
+
+	/* TSIF (PTX) */
+	GPIO_FN_TSIF_TS0_SDAT, GPIO_FN_TSIF_TS0_SCK,
+	GPIO_FN_TSIF_TS0_SDEN, GPIO_FN_TSIF_TS0_SPSYNC,
+
+	/* IRQ (PTZ) */
+	GPIO_FN_INTC_IRQ7, GPIO_FN_INTC_IRQ6, GPIO_FN_INTC_IRQ5,
+	GPIO_FN_INTC_IRQ4, GPIO_FN_INTC_IRQ3, GPIO_FN_INTC_IRQ2,
+	GPIO_FN_INTC_IRQ1, GPIO_FN_INTC_IRQ0,
+};
+
+#endif /* __ASM_SH7724_H__ */
diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 91e3677..973ff83 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -156,6 +156,12 @@ int __init detect_cpu_and_cache_system(void)
 			break;
 		}
 		break;
+	case 0x300b:
+		boot_cpu_data.type = CPU_SH7724;
+		boot_cpu_data.icache.ways = 4;
+		boot_cpu_data.dcache.ways = 4;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU;
+		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
 		boot_cpu_data.type = CPU_SHX3;
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 1a92361..afd6fba4 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7786)	+= setup-sh7786.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7343)	+= setup-sh7343.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7722)	+= setup-sh7722.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7723)	+= setup-sh7723.o
+obj-$(CONFIG_CPU_SUBTYPE_SH7724)	+= setup-sh7724.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7366)	+= setup-sh7366.o
 obj-$(CONFIG_CPU_SUBTYPE_SHX3)		+= setup-shx3.o
 
@@ -26,12 +27,14 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
 # Pinmux setup
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7722)	:= pinmux-sh7722.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7723)	:= pinmux-sh7723.o
+pinmux-$(CONFIG_CPU_SUBTYPE_SH7724)	:= pinmux-sh7724.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7785)	:= pinmux-sh7785.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7786)	:= pinmux-sh7786.o
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 0e174af..1ccdfc5 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -130,6 +130,12 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * is quite simple..
  */
 
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define STCPLL(frqcr) ((((frqcr >> 24) & 0x3f) + 1) * 2)
+#else
+#define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
+#endif
+
 /*
  * Instead of having two separate multipliers/divisors set, like this:
  *
@@ -139,13 +145,17 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * I created the divisors2 array, which is used to calculate rate like
  *   rate = parent * 2 / divisors2[ divisor ];
 */
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
+#else
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
+#endif
 
 static void master_clk_recalc(struct clk *clk)
 {
 	unsigned frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = CONFIG_SH_PCLK_FREQ * (((frqcr >> 24) & 0x1f) + 1);
+	clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
 }
 
 static void master_clk_init(struct clk *clk)
@@ -161,13 +171,30 @@ static void module_clk_recalc(struct clk *clk)
 {
 	unsigned long frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = clk->parent->rate / (((frqcr >> 24) & 0x1f) + 1);
+	clk->rate = clk->parent->rate / STCPLL(frqcr);
 }
 
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define MASTERDIVS	{ 12, 16, 24, 30, 32, 36, 48 }
+#define STCMASK		0x3f
+#define DIVCALC(div)	(div/2-1)
+#define FRQCRKICK	0x80000000
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#define MASTERDIVS	{ 6, 8, 12, 16 }
+#define STCMASK		0x1f
+#define DIVCALC(div)	(div-1)
+#define FRQCRKICK	0x00000000
+#else
+#define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
+#define STCMASK		0x1f
+#define DIVCALC(div)	(div-1)
+#define FRQCRKICK	0x00000000
+#endif
+
 static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 {
 	int div = rate / clk->rate;
-	int master_divs[] = { 2, 3, 4, 6, 8, 16 };
+	int master_divs[] = MASTERDIVS;
 	int index;
 	unsigned long frqcr;
 
@@ -180,8 +207,9 @@ static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 	div = master_divs[index - 1];
 
 	frqcr = ctrl_inl(FRQCR);
-	frqcr &= ~(0xF << 24);
-	frqcr |= ( (div-1) << 24);
+	frqcr &= ~(STCMASK << 24);
+	frqcr |= (DIVCALC(div) << 24);
+	frqcr |= FRQCRKICK;
 	ctrl_outl(frqcr, FRQCR);
 
 	return 0;
@@ -377,6 +405,7 @@ static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
 	/* clear FRQCR bits */
 	frqcr &= ~(ctx.mask << ctx.shift);
 	frqcr |= div << ctx.shift;
+	frqcr |= FRQCRKICK;
 
 	/* ...and perform actual change */
 	ctrl_outl(frqcr, FRQCR);
@@ -542,8 +571,8 @@ static struct clk sh7722_r_clock = {
 	.flags = CLK_RATE_PROPAGATES,
 };
 
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
-
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
+    !defined(CONFIG_CPU_SUBTYPE_SH7724)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -560,15 +589,16 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
+#endif /* CONFIG_CPU_SUBTYPE_SH7343, SH7724 */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722) ||\
+    defined(CONFIG_CPU_SUBTYPE_SH7724)
 static struct clk sh7722_irda_clock = {
 	.name = "irda_clk",
 	.arch_flags = IrDACLKCR,
 	.ops = &sh7722_siu_clk_ops,
 };
 #endif
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
 static struct clk sh7722_video_clock = {
 	.name = "video_clk",
@@ -715,6 +745,61 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1),
 	MSTPCR("lcdc0", "bus_clk", 2, 0),
 #endif
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+	/* See Datasheet : Overview -> Block Diagram */
+	MSTPCR("tlb0", "cpu_clk", 0, 31),
+	MSTPCR("ic0", "cpu_clk", 0, 30),
+	MSTPCR("oc0", "cpu_clk", 0, 29),
+	MSTPCR("rs0", "bus_clk", 0, 28),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27),
+	MSTPCR("l2c0", "sh_clk", 0, 26),
+	MSTPCR("fpu0", "cpu_clk", 0, 24),
+	MSTPCR("intc0", "peripheral_clk", 0, 22),
+	MSTPCR("dmac0", "bus_clk", 0, 21),
+	MSTPCR("sh0", "sh_clk", 0, 20),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19),
+	MSTPCR("ubc0", "cpu_clk", 0, 17),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15),
+	MSTPCR("cmt0", "r_clk", 0, 14),
+	MSTPCR("rwdt0", "r_clk", 0, 13),
+	MSTPCR("dmac1", "bus_clk", 0, 12),
+	MSTPCR("tmu1", "peripheral_clk", 0, 10),
+	MSTPCR("scif0", "peripheral_clk", 0, 9),
+	MSTPCR("scif1", "peripheral_clk", 0, 8),
+	MSTPCR("scif2", "peripheral_clk", 0, 7),
+	MSTPCR("scif3", "bus_clk", 0, 6),
+	MSTPCR("scif4", "bus_clk", 0, 5),
+	MSTPCR("scif5", "bus_clk", 0, 4),
+	MSTPCR("msiof0", "bus_clk", 0, 2),
+	MSTPCR("msiof1", "bus_clk", 0, 1),
+	MSTPCR("keysc0", "r_clk", 1, 12),
+	MSTPCR("rtc0", "r_clk", 1, 11),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8),
+	MSTPCR("mmc0", "bus_clk", 2, 29),
+	MSTPCR("eth0", "bus_clk", 2, 28),
+	MSTPCR("atapi0", "bus_clk", 2, 26),
+	MSTPCR("tpu0", "bus_clk", 2, 25),
+	MSTPCR("irda0", "peripheral_clk", 2, 24),
+	MSTPCR("tsif0", "bus_clk", 2, 22),
+	MSTPCR("usb1", "bus_clk", 2, 21),
+	MSTPCR("usb0", "bus_clk", 2, 20),
+	MSTPCR("2dg0", "bus_clk", 2, 19),
+	MSTPCR("sdhi0", "bus_clk", 2, 18),
+	MSTPCR("sdhi1", "bus_clk", 2, 17),
+	MSTPCR("veu1", "bus_clk", 2, 15),
+	MSTPCR("ceu1", "bus_clk", 2, 13),
+	MSTPCR("beu1", "bus_clk", 2, 12),
+	MSTPCR("2ddmac0", "sh_clk", 2, 10),
+	MSTPCR("spu0", "bus_clk", 2, 9),
+	MSTPCR("jpu0", "bus_clk", 2, 6),
+	MSTPCR("vou0", "bus_clk", 2, 5),
+	MSTPCR("beu0", "bus_clk", 2, 4),
+	MSTPCR("ceu0", "bus_clk", 2, 3),
+	MSTPCR("veu0", "bus_clk", 2, 2),
+	MSTPCR("vpu0", "bus_clk", 2, 1),
+	MSTPCR("lcdc0", "bus_clk", 2, 0),
+#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
 	MSTPCR("uram0", "umem_clk", 0, 28),
 	MSTPCR("xymem0", "bus_clk", 0, 26),
@@ -786,12 +871,15 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
+    !defined(CONFIG_CPU_SUBTYPE_SH7724)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	&sh7722_irda_clock,
 #endif
+/* 7724 should support FSI clock */
+#if defined(CONFIG_CPU_SUBTYPE_SH7722) || \
+    defined(CONFIG_CPU_SUBTYPE_SH7724)
+	&sh7722_irda_clock,
 #endif
 	&sh7722_video_clock,
 };
diff --git a/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c
new file mode 100644
index 0000000..1af0f95
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c
@@ -0,0 +1,2230 @@
+/*
+ * SH7724 Pinmux
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on SH7723 Pinmux
+ *  Copyright (C) 2008  Magnus Damm
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <cpu/sh7724.h>
+
+enum {
+	PINMUX_RESERVED = 0,
+
+	PINMUX_DATA_BEGIN,
+	PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA,
+	PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA,
+	PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA,
+	PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA,
+	PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA,
+	PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA,
+	PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA,
+	PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA,
+	PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA,
+	PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA,
+	PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA,
+	PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA,
+			      PTG5_DATA, PTG4_DATA,
+	PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA,
+	PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA,
+	PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA,
+	PTJ7_DATA, PTJ6_DATA, PTJ5_DATA,
+	PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA,
+	PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA,
+	PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA,
+	PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA,
+	PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA,
+	PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA,
+	PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA,
+	PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA,
+	PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA,
+	PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA,
+	PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA,
+	PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA,
+	PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA,
+		   PTS6_DATA, PTS5_DATA, PTS4_DATA,
+	PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA,
+	PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA,
+	PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA,
+	PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA,
+	PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA,
+	PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA,
+	PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA,
+	PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA,
+	PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA,
+	PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA,
+	PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA,
+	PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA,
+	PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA,
+	PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA,
+	PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA,
+	PINMUX_DATA_END,
+
+	PINMUX_INPUT_BEGIN,
+	PTA7_IN, PTA6_IN, PTA5_IN, PTA4_IN,
+	PTA3_IN, PTA2_IN, PTA1_IN, PTA0_IN,
+	PTB7_IN, PTB6_IN, PTB5_IN, PTB4_IN,
+	PTB3_IN, PTB2_IN, PTB1_IN, PTB0_IN,
+	PTC7_IN, PTC6_IN, PTC5_IN, PTC4_IN,
+	PTC3_IN, PTC2_IN, PTC1_IN, PTC0_IN,
+	PTD7_IN, PTD6_IN, PTD5_IN, PTD4_IN,
+	PTD3_IN, PTD2_IN, PTD1_IN, PTD0_IN,
+	PTE7_IN, PTE6_IN, PTE5_IN, PTE4_IN,
+	PTE3_IN, PTE2_IN, PTE1_IN, PTE0_IN,
+	PTF7_IN, PTF6_IN, PTF5_IN, PTF4_IN,
+	PTF3_IN, PTF2_IN, PTF1_IN, PTF0_IN,
+	PTH7_IN, PTH6_IN, PTH5_IN, PTH4_IN,
+	PTH3_IN, PTH2_IN, PTH1_IN, PTH0_IN,
+	PTJ3_IN, PTJ2_IN, PTJ1_IN, PTJ0_IN,
+	PTK7_IN, PTK6_IN, PTK5_IN, PTK4_IN,
+	PTK3_IN, PTK2_IN, PTK1_IN, PTK0_IN,
+	PTL7_IN, PTL6_IN, PTL5_IN, PTL4_IN,
+	PTL3_IN, PTL2_IN, PTL1_IN, PTL0_IN,
+	PTM7_IN, PTM6_IN, PTM5_IN, PTM4_IN,
+	PTM3_IN, PTM2_IN, PTM1_IN, PTM0_IN,
+	PTN7_IN, PTN6_IN, PTN5_IN, PTN4_IN,
+	PTN3_IN, PTN2_IN, PTN1_IN, PTN0_IN,
+	PTQ7_IN, PTQ6_IN, PTQ5_IN, PTQ4_IN,
+	PTQ3_IN, PTQ2_IN, PTQ1_IN, PTQ0_IN,
+	PTR7_IN, PTR6_IN, PTR5_IN, PTR4_IN,
+	PTR3_IN, PTR2_IN, PTR1_IN, PTR0_IN,
+		 PTS6_IN, PTS5_IN, PTS4_IN,
+	PTS3_IN, PTS2_IN, PTS1_IN, PTS0_IN,
+	PTT7_IN, PTT6_IN, PTT5_IN, PTT4_IN,
+	PTT3_IN, PTT2_IN, PTT1_IN, PTT0_IN,
+	PTU7_IN, PTU6_IN, PTU5_IN, PTU4_IN,
+	PTU3_IN, PTU2_IN, PTU1_IN, PTU0_IN,
+	PTV7_IN, PTV6_IN, PTV5_IN, PTV4_IN,
+	PTV3_IN, PTV2_IN, PTV1_IN, PTV0_IN,
+	PTW7_IN, PTW6_IN, PTW5_IN, PTW4_IN,
+	PTW3_IN, PTW2_IN, PTW1_IN, PTW0_IN,
+	PTX7_IN, PTX6_IN, PTX5_IN, PTX4_IN,
+	PTX3_IN, PTX2_IN, PTX1_IN, PTX0_IN,
+	PTY7_IN, PTY6_IN, PTY5_IN, PTY4_IN,
+	PTY3_IN, PTY2_IN, PTY1_IN, PTY0_IN,
+	PTZ7_IN, PTZ6_IN, PTZ5_IN, PTZ4_IN,
+	PTZ3_IN, PTZ2_IN, PTZ1_IN, PTZ0_IN,
+	PINMUX_INPUT_END,
+
+	PINMUX_INPUT_PULLUP_BEGIN,
+	PTA7_IN_PU, PTA6_IN_PU, PTA5_IN_PU, PTA4_IN_PU,
+	PTA3_IN_PU, PTA2_IN_PU, PTA1_IN_PU, PTA0_IN_PU,
+	PTB7_IN_PU, PTB6_IN_PU, PTB5_IN_PU, PTB4_IN_PU,
+	PTB3_IN_PU, PTB2_IN_PU, PTB1_IN_PU, PTB0_IN_PU,
+	PTC7_IN_PU, PTC6_IN_PU, PTC5_IN_PU, PTC4_IN_PU,
+	PTC3_IN_PU, PTC2_IN_PU, PTC1_IN_PU, PTC0_IN_PU,
+	PTD7_IN_PU, PTD6_IN_PU, PTD5_IN_PU, PTD4_IN_PU,
+	PTD3_IN_PU, PTD2_IN_PU, PTD1_IN_PU, PTD0_IN_PU,
+	PTE7_IN_PU, PTE6_IN_PU, PTE5_IN_PU, PTE4_IN_PU,
+	PTE3_IN_PU, PTE2_IN_PU, PTE1_IN_PU, PTE0_IN_PU,
+	PTF7_IN_PU, PTF6_IN_PU, PTF5_IN_PU, PTF4_IN_PU,
+	PTF3_IN_PU, PTF2_IN_PU, PTF1_IN_PU, PTF0_IN_PU,
+	PTH7_IN_PU, PTH6_IN_PU, PTH5_IN_PU, PTH4_IN_PU,
+	PTH3_IN_PU, PTH2_IN_PU, PTH1_IN_PU, PTH0_IN_PU,
+	PTJ3_IN_PU, PTJ2_IN_PU, PTJ1_IN_PU, PTJ0_IN_PU,
+	PTK7_IN_PU, PTK6_IN_PU, PTK5_IN_PU, PTK4_IN_PU,
+	PTK3_IN_PU, PTK2_IN_PU, PTK1_IN_PU, PTK0_IN_PU,
+	PTL7_IN_PU, PTL6_IN_PU, PTL5_IN_PU, PTL4_IN_PU,
+	PTL3_IN_PU, PTL2_IN_PU, PTL1_IN_PU, PTL0_IN_PU,
+	PTM7_IN_PU, PTM6_IN_PU, PTM5_IN_PU, PTM4_IN_PU,
+	PTM3_IN_PU, PTM2_IN_PU, PTM1_IN_PU, PTM0_IN_PU,
+	PTN7_IN_PU, PTN6_IN_PU, PTN5_IN_PU, PTN4_IN_PU,
+	PTN3_IN_PU, PTN2_IN_PU, PTN1_IN_PU, PTN0_IN_PU,
+	PTQ7_IN_PU, PTQ6_IN_PU, PTQ5_IN_PU, PTQ4_IN_PU,
+	PTQ3_IN_PU, PTQ2_IN_PU, PTQ1_IN_PU, PTQ0_IN_PU,
+	PTR7_IN_PU, PTR6_IN_PU, PTR5_IN_PU, PTR4_IN_PU,
+	PTR3_IN_PU, PTR2_IN_PU, PTR1_IN_PU, PTR0_IN_PU,
+		    PTS6_IN_PU, PTS5_IN_PU, PTS4_IN_PU,
+	PTS3_IN_PU, PTS2_IN_PU, PTS1_IN_PU, PTS0_IN_PU,
+	PTT7_IN_PU, PTT6_IN_PU, PTT5_IN_PU, PTT4_IN_PU,
+	PTT3_IN_PU, PTT2_IN_PU, PTT1_IN_PU, PTT0_IN_PU,
+	PTU7_IN_PU, PTU6_IN_PU, PTU5_IN_PU, PTU4_IN_PU,
+	PTU3_IN_PU, PTU2_IN_PU, PTU1_IN_PU, PTU0_IN_PU,
+	PTV7_IN_PU, PTV6_IN_PU, PTV5_IN_PU, PTV4_IN_PU,
+	PTV3_IN_PU, PTV2_IN_PU, PTV1_IN_PU, PTV0_IN_PU,
+	PTW7_IN_PU, PTW6_IN_PU, PTW5_IN_PU, PTW4_IN_PU,
+	PTW3_IN_PU, PTW2_IN_PU, PTW1_IN_PU, PTW0_IN_PU,
+	PTX7_IN_PU, PTX6_IN_PU, PTX5_IN_PU, PTX4_IN_PU,
+	PTX3_IN_PU, PTX2_IN_PU, PTX1_IN_PU, PTX0_IN_PU,
+	PTY7_IN_PU, PTY6_IN_PU, PTY5_IN_PU, PTY4_IN_PU,
+	PTY3_IN_PU, PTY2_IN_PU, PTY1_IN_PU, PTY0_IN_PU,
+	PTZ7_IN_PU, PTZ6_IN_PU, PTZ5_IN_PU, PTZ4_IN_PU,
+	PTZ3_IN_PU, PTZ2_IN_PU, PTZ1_IN_PU, PTZ0_IN_PU,
+	PINMUX_INPUT_PULLUP_END,
+
+	PINMUX_OUTPUT_BEGIN,
+	PTA7_OUT, PTA6_OUT, PTA5_OUT, PTA4_OUT,
+	PTA3_OUT, PTA2_OUT, PTA1_OUT, PTA0_OUT,
+	PTB7_OUT, PTB6_OUT, PTB5_OUT, PTB4_OUT,
+	PTB3_OUT, PTB2_OUT, PTB1_OUT, PTB0_OUT,
+	PTC7_OUT, PTC6_OUT, PTC5_OUT, PTC4_OUT,
+	PTC3_OUT, PTC2_OUT, PTC1_OUT, PTC0_OUT,
+	PTD7_OUT, PTD6_OUT, PTD5_OUT, PTD4_OUT,
+	PTD3_OUT, PTD2_OUT, PTD1_OUT, PTD0_OUT,
+	PTE7_OUT, PTE6_OUT, PTE5_OUT, PTE4_OUT,
+	PTE3_OUT, PTE2_OUT, PTE1_OUT, PTE0_OUT,
+	PTF7_OUT, PTF6_OUT, PTF5_OUT, PTF4_OUT,
+	PTF3_OUT, PTF2_OUT, PTF1_OUT, PTF0_OUT,
+			    PTG5_OUT, PTG4_OUT,
+	PTG3_OUT, PTG2_OUT, PTG1_OUT, PTG0_OUT,
+	PTH7_OUT, PTH6_OUT, PTH5_OUT, PTH4_OUT,
+	PTH3_OUT, PTH2_OUT, PTH1_OUT, PTH0_OUT,
+	PTJ7_OUT, PTJ6_OUT, PTJ5_OUT,
+	PTJ3_OUT, PTJ2_OUT, PTJ1_OUT, PTJ0_OUT,
+	PTK7_OUT, PTK6_OUT, PTK5_OUT, PTK4_OUT,
+	PTK3_OUT, PTK2_OUT, PTK1_OUT, PTK0_OUT,
+	PTL7_OUT, PTL6_OUT, PTL5_OUT, PTL4_OUT,
+	PTL3_OUT, PTL2_OUT, PTL1_OUT, PTL0_OUT,
+	PTM7_OUT, PTM6_OUT, PTM5_OUT, PTM4_OUT,
+	PTM3_OUT, PTM2_OUT, PTM1_OUT, PTM0_OUT,
+	PTN7_OUT, PTN6_OUT, PTN5_OUT, PTN4_OUT,
+	PTN3_OUT, PTN2_OUT, PTN1_OUT, PTN0_OUT,
+	PTQ7_OUT, PTQ6_OUT, PTQ5_OUT, PTQ4_OUT,
+	PTQ3_OUT, PTQ2_OUT, PTQ1_OUT, PTQ0_OUT,
+	PTR7_OUT, PTR6_OUT, PTR5_OUT, PTR4_OUT,
+			    PTR1_OUT, PTR0_OUT,
+		  PTS6_OUT, PTS5_OUT, PTS4_OUT,
+	PTS3_OUT, PTS2_OUT, PTS1_OUT, PTS0_OUT,
+	PTT7_OUT, PTT6_OUT, PTT5_OUT, PTT4_OUT,
+	PTT3_OUT, PTT2_OUT, PTT1_OUT, PTT0_OUT,
+	PTU7_OUT, PTU6_OUT, PTU5_OUT, PTU4_OUT,
+	PTU3_OUT, PTU2_OUT, PTU1_OUT, PTU0_OUT,
+	PTV7_OUT, PTV6_OUT, PTV5_OUT, PTV4_OUT,
+	PTV3_OUT, PTV2_OUT, PTV1_OUT, PTV0_OUT,
+	PTW7_OUT, PTW6_OUT, PTW5_OUT, PTW4_OUT,
+	PTW3_OUT, PTW2_OUT, PTW1_OUT, PTW0_OUT,
+	PTX7_OUT, PTX6_OUT, PTX5_OUT, PTX4_OUT,
+	PTX3_OUT, PTX2_OUT, PTX1_OUT, PTX0_OUT,
+	PTY7_OUT, PTY6_OUT, PTY5_OUT, PTY4_OUT,
+	PTY3_OUT, PTY2_OUT, PTY1_OUT, PTY0_OUT,
+	PTZ7_OUT, PTZ6_OUT, PTZ5_OUT, PTZ4_OUT,
+	PTZ3_OUT, PTZ2_OUT, PTZ1_OUT, PTZ0_OUT,
+	PINMUX_OUTPUT_END,
+
+	PINMUX_FUNCTION_BEGIN,
+	PTA7_FN, PTA6_FN, PTA5_FN, PTA4_FN,
+	PTA3_FN, PTA2_FN, PTA1_FN, PTA0_FN,
+	PTB7_FN, PTB6_FN, PTB5_FN, PTB4_FN,
+	PTB3_FN, PTB2_FN, PTB1_FN, PTB0_FN,
+	PTC7_FN, PTC6_FN, PTC5_FN, PTC4_FN,
+	PTC3_FN, PTC2_FN, PTC1_FN, PTC0_FN,
+	PTD7_FN, PTD6_FN, PTD5_FN, PTD4_FN,
+	PTD3_FN, PTD2_FN, PTD1_FN, PTD0_FN,
+	PTE7_FN, PTE6_FN, PTE5_FN, PTE4_FN,
+	PTE3_FN, PTE2_FN, PTE1_FN, PTE0_FN,
+	PTF7_FN, PTF6_FN, PTF5_FN, PTF4_FN,
+	PTF3_FN, PTF2_FN, PTF1_FN, PTF0_FN,
+			  PTG5_FN, PTG4_FN,
+	PTG3_FN, PTG2_FN, PTG1_FN, PTG0_FN,
+	PTH7_FN, PTH6_FN, PTH5_FN, PTH4_FN,
+	PTH3_FN, PTH2_FN, PTH1_FN, PTH0_FN,
+	PTJ7_FN, PTJ6_FN, PTJ5_FN,
+	PTJ3_FN, PTJ2_FN, PTJ1_FN, PTJ0_FN,
+	PTK7_FN, PTK6_FN, PTK5_FN, PTK4_FN,
+	PTK3_FN, PTK2_FN, PTK1_FN, PTK0_FN,
+	PTL7_FN, PTL6_FN, PTL5_FN, PTL4_FN,
+	PTL3_FN, PTL2_FN, PTL1_FN, PTL0_FN,
+	PTM7_FN, PTM6_FN, PTM5_FN, PTM4_FN,
+	PTM3_FN, PTM2_FN, PTM1_FN, PTM0_FN,
+	PTN7_FN, PTN6_FN, PTN5_FN, PTN4_FN,
+	PTN3_FN, PTN2_FN, PTN1_FN, PTN0_FN,
+	PTQ7_FN, PTQ6_FN, PTQ5_FN, PTQ4_FN,
+	PTQ3_FN, PTQ2_FN, PTQ1_FN, PTQ0_FN,
+	PTR7_FN, PTR6_FN, PTR5_FN, PTR4_FN,
+	PTR3_FN, PTR2_FN, PTR1_FN, PTR0_FN,
+		 PTS6_FN, PTS5_FN, PTS4_FN,
+	PTS3_FN, PTS2_FN, PTS1_FN, PTS0_FN,
+	PTT7_FN, PTT6_FN, PTT5_FN, PTT4_FN,
+	PTT3_FN, PTT2_FN, PTT1_FN, PTT0_FN,
+	PTU7_FN, PTU6_FN, PTU5_FN, PTU4_FN,
+	PTU3_FN, PTU2_FN, PTU1_FN, PTU0_FN,
+	PTV7_FN, PTV6_FN, PTV5_FN, PTV4_FN,
+	PTV3_FN, PTV2_FN, PTV1_FN, PTV0_FN,
+	PTW7_FN, PTW6_FN, PTW5_FN, PTW4_FN,
+	PTW3_FN, PTW2_FN, PTW1_FN, PTW0_FN,
+	PTX7_FN, PTX6_FN, PTX5_FN, PTX4_FN,
+	PTX3_FN, PTX2_FN, PTX1_FN, PTX0_FN,
+	PTY7_FN, PTY6_FN, PTY5_FN, PTY4_FN,
+	PTY3_FN, PTY2_FN, PTY1_FN, PTY0_FN,
+	PTZ7_FN, PTZ6_FN, PTZ5_FN, PTZ4_FN,
+	PTZ3_FN, PTZ2_FN, PTZ1_FN, PTZ0_FN,
+
+
+	PSA15_0, PSA15_1,
+	PSA14_0, PSA14_1,
+	PSA13_0, PSA13_1,
+	PSA12_0, PSA12_1,
+	PSA10_0, PSA10_1,
+	PSA9_0,  PSA9_1,
+	PSA8_0,  PSA8_1,
+	PSA7_0,  PSA7_1,
+	PSA6_0,  PSA6_1,
+	PSA5_0,  PSA5_1,
+	PSA3_0,  PSA3_1,
+	PSA2_0,  PSA2_1,
+	PSA1_0,  PSA1_1,
+	PSA0_0,  PSA0_1,
+
+	PSB14_0, PSB14_1,
+	PSB13_0, PSB13_1,
+	PSB12_0, PSB12_1,
+	PSB11_0, PSB11_1,
+	PSB10_0, PSB10_1,
+	PSB9_0,  PSB9_1,
+	PSB8_0,  PSB8_1,
+	PSB7_0,  PSB7_1,
+	PSB6_0,  PSB6_1,
+	PSB5_0,  PSB5_1,
+	PSB4_0,  PSB4_1,
+	PSB3_0,  PSB3_1,
+	PSB2_0,  PSB2_1,
+	PSB1_0,  PSB1_1,
+	PSB0_0,  PSB0_1,
+
+	PSC15_0, PSC15_1,
+	PSC14_0, PSC14_1,
+	PSC13_0, PSC13_1,
+	PSC12_0, PSC12_1,
+	PSC11_0, PSC11_1,
+	PSC10_0, PSC10_1,
+	PSC9_0,  PSC9_1,
+	PSC8_0,  PSC8_1,
+	PSC7_0,  PSC7_1,
+	PSC6_0,  PSC6_1,
+	PSC5_0,  PSC5_1,
+	PSC4_0,  PSC4_1,
+	PSC2_0,  PSC2_1,
+	PSC1_0,  PSC1_1,
+	PSC0_0,  PSC0_1,
+
+	PSD15_0, PSD15_1,
+	PSD14_0, PSD14_1,
+	PSD13_0, PSD13_1,
+	PSD12_0, PSD12_1,
+	PSD11_0, PSD11_1,
+	PSD10_0, PSD10_1,
+	PSD9_0,  PSD9_1,
+	PSD8_0,  PSD8_1,
+	PSD7_0,  PSD7_1,
+	PSD6_0,  PSD6_1,
+	PSD5_0,  PSD5_1,
+	PSD4_0,  PSD4_1,
+	PSD3_0,  PSD3_1,
+	PSD2_0,  PSD2_1,
+	PSD1_0,  PSD1_1,
+	PSD0_0,  PSD0_1,
+
+	PSE15_0, PSE15_1,
+	PSE14_0, PSE14_1,
+	PSE13_0, PSE13_1,
+	PSE12_0, PSE12_1,
+	PSE11_0, PSE11_1,
+	PSE10_0, PSE10_1,
+	PSE9_0,  PSE9_1,
+	PSE8_0,  PSE8_1,
+	PSE7_0,  PSE7_1,
+	PSE6_0,  PSE6_1,
+	PSE5_0,  PSE5_1,
+	PSE4_0,  PSE4_1,
+	PSE3_0,  PSE3_1,
+	PSE2_0,  PSE2_1,
+	PSE1_0,  PSE1_1,
+	PSE0_0,  PSE0_1,
+	PINMUX_FUNCTION_END,
+
+	PINMUX_MARK_BEGIN,
+	/*PTA*/
+	D23_MARK,	KEYOUT2_MARK,		IDED15_MARK,
+	D22_MARK,	KEYOUT1_MARK,		IDED14_MARK,
+	D21_MARK,	KEYOUT0_MARK,		IDED13_MARK,
+	D20_MARK,	KEYIN4_MARK,		IDED12_MARK,
+	D19_MARK,	KEYIN3_MARK,		IDED11_MARK,
+	D18_MARK,	KEYIN2_MARK,		IDED10_MARK,
+	D17_MARK,	KEYIN1_MARK,		IDED9_MARK,
+	D16_MARK,	KEYIN0_MARK,		IDED8_MARK,
+
+	/*PTB*/
+	D31_MARK,	TPUTO1_MARK,		IDEA1_MARK,
+	D30_MARK,	TPUTO0_MARK,		IDEA0_MARK,
+	D29_MARK,				IODREQ_MARK,
+	D28_MARK,				IDECS0_MARK,
+	D27_MARK,				IDECS1_MARK,
+	D26_MARK,	KEYOUT5_IN5_MARK,	IDEIORD_MARK,
+	D25_MARK,	KEYOUT4_IN6_MARK,	IDEIOWR_MARK,
+	D24_MARK,	KEYOUT3_MARK,		IDEINT_MARK,
+
+	/*PTC*/
+	LCDD7_MARK,
+	LCDD6_MARK,
+	LCDD5_MARK,
+	LCDD4_MARK,
+	LCDD3_MARK,
+	LCDD2_MARK,
+	LCDD1_MARK,
+	LCDD0_MARK,
+
+	/*PTD*/
+	LCDD15_MARK,
+	LCDD14_MARK,
+	LCDD13_MARK,
+	LCDD12_MARK,
+	LCDD11_MARK,
+	LCDD10_MARK,
+	LCDD9_MARK,
+	LCDD8_MARK,
+
+	/*PTE*/
+	FSIMCKB_MARK,
+	FSIMCKA_MARK,
+	LCDD21_MARK,	SCIF2_L_TXD_MARK,
+	LCDD20_MARK,	SCIF4_SCK_MARK,
+	LCDD19_MARK,	SCIF4_RXD_MARK,
+	LCDD18_MARK,	SCIF4_TXD_MARK,
+	LCDD17_MARK,
+	LCDD16_MARK,
+
+	/*PTF*/
+	LCDVSYN_MARK,
+	LCDDISP_MARK,	LCDRS_MARK,
+	LCDHSYN_MARK,	LCDCS_MARK,
+	LCDDON_MARK,
+	LCDDCK_MARK,	LCDWR_MARK,
+	LCDVEPWC_MARK,	SCIF0_TXD_MARK,
+	LCDD23_MARK,	SCIF2_L_SCK_MARK,
+	LCDD22_MARK,	SCIF2_L_RXD_MARK,
+
+	/*PTG*/
+	AUDCK_MARK,
+	AUDSYNC_MARK,
+	AUDATA3_MARK,
+	AUDATA2_MARK,
+	AUDATA1_MARK,
+	AUDATA0_MARK,
+
+	/*PTH*/
+	VIO0_VD_MARK,
+	VIO0_CLK_MARK,
+	VIO0_D7_MARK,
+	VIO0_D6_MARK,
+	VIO0_D5_MARK,
+	VIO0_D4_MARK,
+	VIO0_D3_MARK,
+	VIO0_D2_MARK,
+
+	/*PTJ*/
+	PDSTATUS_MARK,
+	STATUS2_MARK,
+	STATUS0_MARK,
+	A25_MARK,		BS_MARK,
+	A24_MARK,
+	A23_MARK,
+	A22_MARK,
+
+	/*PTK*/
+	VIO1_D5_MARK,	VIO0_D13_MARK,	IDED5_MARK,
+	VIO1_D4_MARK,	VIO0_D12_MARK,	IDED4_MARK,
+	VIO1_D3_MARK,	VIO0_D11_MARK,	IDED3_MARK,
+	VIO1_D2_MARK,	VIO0_D10_MARK,	IDED2_MARK,
+	VIO1_D1_MARK,	VIO0_D9_MARK,	IDED1_MARK,
+	VIO1_D0_MARK,	VIO0_D8_MARK,	IDED0_MARK,
+	VIO0_FLD_MARK,
+	VIO0_HD_MARK,
+
+	/*PTL*/
+	DV_D5_MARK,	SCIF3_V_SCK_MARK,	RMII_RXD0_MARK,
+	DV_D4_MARK,	SCIF3_V_RXD_MARK,	RMII_RXD1_MARK,
+	DV_D3_MARK,	SCIF3_V_TXD_MARK,	RMII_REF_CLK_MARK,
+	DV_D2_MARK,	SCIF1_SCK_MARK,		RMII_TX_EN_MARK,
+	DV_D1_MARK,	SCIF1_RXD_MARK,		RMII_TXD0_MARK,
+	DV_D0_MARK,	SCIF1_TXD_MARK,		RMII_TXD1_MARK,
+	DV_D15_MARK,
+	DV_D14_MARK,	MSIOF0_MCK_MARK,
+
+	/*PTM*/
+	DV_D13_MARK,	MSIOF0_TSCK_MARK,
+	DV_D12_MARK,	MSIOF0_RXD_MARK,
+	DV_D11_MARK,	MSIOF0_TXD_MARK,
+	DV_D10_MARK,	MSIOF0_TSYNC_MARK,
+	DV_D9_MARK,	MSIOF0_SS1_MARK,	MSIOF0_RSCK_MARK,
+	DV_D8_MARK,	MSIOF0_SS2_MARK,	MSIOF0_RSYNC_MARK,
+	LCDVCPWC_MARK,	SCIF0_RXD_MARK,
+	LCDRD_MARK,	SCIF0_SCK_MARK,
+
+	/*PTN*/
+	VIO0_D1_MARK,
+	VIO0_D0_MARK,
+	DV_CLKI_MARK,
+	DV_CLK_MARK,	SCIF2_V_SCK_MARK,
+	DV_VSYNC_MARK,	SCIF2_V_RXD_MARK,
+	DV_HSYNC_MARK,	SCIF2_V_TXD_MARK,
+	DV_D7_MARK,	SCIF3_V_CTS_MARK,	RMII_RX_ER_MARK,
+	DV_D6_MARK,	SCIF3_V_RTS_MARK,	RMII_CRS_DV_MARK,
+
+	/*PTQ*/
+	D7_MARK,
+	D6_MARK,
+	D5_MARK,
+	D4_MARK,
+	D3_MARK,
+	D2_MARK,
+	D1_MARK,
+	D0_MARK,
+
+	/*PTR*/
+	CS6B_CE1B_MARK,
+	CS6A_CE2B_MARK,
+	CS5B_CE1A_MARK,
+	CS5A_CE2A_MARK,
+	IOIS16_MARK,		LCDLCLK_MARK,
+	WAIT_MARK,
+	WE3_ICIOWR_MARK,	TPUTO3_MARK,	TPUTI3_MARK,
+	WE2_ICIORD_MARK,	TPUTO2_MARK,	IDEA2_MARK,
+
+	/*PTS*/
+	VIO_CKO_MARK,
+	VIO1_FLD_MARK,	TPUTI2_MARK,		IDEIORDY_MARK,
+	VIO1_HD_MARK,	SCIF5_SCK_MARK,
+	VIO1_VD_MARK,	SCIF5_RXD_MARK,
+	VIO1_CLK_MARK,	SCIF5_TXD_MARK,
+	VIO1_D7_MARK,	VIO0_D15_MARK,		IDED7_MARK,
+	VIO1_D6_MARK,	VIO0_D14_MARK,		IDED6_MARK,
+
+	/*PTT*/
+	D15_MARK,
+	D14_MARK,
+	D13_MARK,
+	D12_MARK,
+	D11_MARK,
+	D10_MARK,
+	D9_MARK,
+	D8_MARK,
+
+	/*PTU*/
+	DMAC_DACK0_MARK,
+	DMAC_DREQ0_MARK,
+	FSIOASD_MARK,
+	FSIIABCK_MARK,
+	FSIIALRCK_MARK,
+	FSIOABCK_MARK,
+	FSIOALRCK_MARK,
+	CLKAUDIOAO_MARK,
+
+	/*PTV*/
+	FSIIBSD_MARK,		MSIOF1_SS2_MARK,	MSIOF1_RSYNC_MARK,
+	FSIOBSD_MARK,		MSIOF1_SS1_MARK,	MSIOF1_RSCK_MARK,
+	FSIIBBCK_MARK,		MSIOF1_RXD_MARK,
+	FSIIBLRCK_MARK,		MSIOF1_TSYNC_MARK,
+	FSIOBBCK_MARK,		MSIOF1_TSCK_MARK,
+	FSIOBLRCK_MARK,		MSIOF1_TXD_MARK,
+	CLKAUDIOBO_MARK,	MSIOF1_MCK_MARK,
+	FSIIASD_MARK,
+
+	/*PTW*/
+	MMC_D7_MARK,		SDHI1CD_MARK,		IODACK_MARK,
+	MMC_D6_MARK,		SDHI1WP_MARK,		IDERST_MARK,
+	MMC_D5_MARK,		SDHI1D3_MARK,		EXBUF_ENB_MARK,
+	MMC_D4_MARK,		SDHI1D2_MARK,		DIRECTION_MARK,
+	MMC_D3_MARK,		SDHI1D1_MARK,
+	MMC_D2_MARK,		SDHI1D0_MARK,
+	MMC_D1_MARK,		SDHI1CMD_MARK,
+	MMC_D0_MARK,		SDHI1CLK_MARK,
+
+	/*PTX*/
+	DMAC_DACK1_MARK,	IRDA_OUT_MARK,
+	DMAC_DREQ1_MARK,	IRDA_IN_MARK,
+	TSIF_TS0_SDAT_MARK,				LNKSTA_MARK,
+	TSIF_TS0_SCK_MARK,				MDIO_MARK,
+	TSIF_TS0_SDEN_MARK,				MDC_MARK,
+	TSIF_TS0_SPSYNC_MARK,
+	MMC_CLK_MARK,
+	MMC_CMD_MARK,
+
+	/*PTY*/
+	SDHI0CD_MARK,
+	SDHI0WP_MARK,
+	SDHI0D3_MARK,
+	SDHI0D2_MARK,
+	SDHI0D1_MARK,
+	SDHI0D0_MARK,
+	SDHI0CMD_MARK,
+	SDHI0CLK_MARK,
+
+	/*PTZ*/
+	INTC_IRQ7_MARK,		SCIF3_I_CTS_MARK,
+	INTC_IRQ6_MARK,		SCIF3_I_RTS_MARK,
+	INTC_IRQ5_MARK,		SCIF3_I_SCK_MARK,
+	INTC_IRQ4_MARK,		SCIF3_I_RXD_MARK,
+	INTC_IRQ3_MARK,		SCIF3_I_TXD_MARK,
+	INTC_IRQ2_MARK,
+	INTC_IRQ1_MARK,
+	INTC_IRQ0_MARK,
+	PINMUX_MARK_END,
+};
+
+static pinmux_enum_t pinmux_data[] = {
+	/* PTA GPIO */
+	PINMUX_DATA(PTA7_DATA, PTA7_IN, PTA7_OUT, PTA7_IN_PU),
+	PINMUX_DATA(PTA6_DATA, PTA6_IN, PTA6_OUT, PTA6_IN_PU),
+	PINMUX_DATA(PTA5_DATA, PTA5_IN, PTA5_OUT, PTA5_IN_PU),
+	PINMUX_DATA(PTA4_DATA, PTA4_IN, PTA4_OUT, PTA4_IN_PU),
+	PINMUX_DATA(PTA3_DATA, PTA3_IN, PTA3_OUT, PTA3_IN_PU),
+	PINMUX_DATA(PTA2_DATA, PTA2_IN, PTA2_OUT, PTA2_IN_PU),
+	PINMUX_DATA(PTA1_DATA, PTA1_IN, PTA1_OUT, PTA1_IN_PU),
+	PINMUX_DATA(PTA0_DATA, PTA0_IN, PTA0_OUT, PTA0_IN_PU),
+
+	/* PTB GPIO */
+	PINMUX_DATA(PTB7_DATA, PTB7_IN, PTB7_OUT, PTB7_IN_PU),
+	PINMUX_DATA(PTB6_DATA, PTB6_IN, PTB6_OUT, PTB6_IN_PU),
+	PINMUX_DATA(PTB5_DATA, PTB5_IN, PTB5_OUT, PTB5_IN_PU),
+	PINMUX_DATA(PTB4_DATA, PTB4_IN, PTB4_OUT, PTB4_IN_PU),
+	PINMUX_DATA(PTB3_DATA, PTB3_IN, PTB3_OUT, PTB3_IN_PU),
+	PINMUX_DATA(PTB2_DATA, PTB2_IN, PTB2_OUT, PTB2_IN_PU),
+	PINMUX_DATA(PTB1_DATA, PTB1_IN, PTB1_OUT, PTB1_IN_PU),
+	PINMUX_DATA(PTB0_DATA, PTB0_IN, PTB0_OUT, PTB0_IN_PU),
+
+	/* PTC GPIO */
+	PINMUX_DATA(PTC7_DATA, PTC7_IN, PTC7_OUT, PTC7_IN_PU),
+	PINMUX_DATA(PTC6_DATA, PTC6_IN, PTC6_OUT, PTC6_IN_PU),
+	PINMUX_DATA(PTC5_DATA, PTC5_IN, PTC5_OUT, PTC5_IN_PU),
+	PINMUX_DATA(PTC4_DATA, PTC4_IN, PTC4_OUT, PTC4_IN_PU),
+	PINMUX_DATA(PTC3_DATA, PTC3_IN, PTC3_OUT, PTC3_IN_PU),
+	PINMUX_DATA(PTC2_DATA, PTC2_IN, PTC2_OUT, PTC2_IN_PU),
+	PINMUX_DATA(PTC1_DATA, PTC1_IN, PTC1_OUT, PTC1_IN_PU),
+	PINMUX_DATA(PTC0_DATA, PTC0_IN, PTC0_OUT, PTC0_IN_PU),
+
+	/* PTD GPIO */
+	PINMUX_DATA(PTD7_DATA, PTD7_IN, PTD7_OUT, PTD7_IN_PU),
+	PINMUX_DATA(PTD6_DATA, PTD6_IN, PTD6_OUT, PTD6_IN_PU),
+	PINMUX_DATA(PTD5_DATA, PTD5_IN, PTD5_OUT, PTD5_IN_PU),
+	PINMUX_DATA(PTD4_DATA, PTD4_IN, PTD4_OUT, PTD4_IN_PU),
+	PINMUX_DATA(PTD3_DATA, PTD3_IN, PTD3_OUT, PTD3_IN_PU),
+	PINMUX_DATA(PTD2_DATA, PTD2_IN, PTD2_OUT, PTD2_IN_PU),
+	PINMUX_DATA(PTD1_DATA, PTD1_IN, PTD1_OUT, PTD1_IN_PU),
+	PINMUX_DATA(PTD0_DATA, PTD0_IN, PTD0_OUT, PTD0_IN_PU),
+
+	/* PTE GPIO */
+	PINMUX_DATA(PTE7_DATA, PTE7_IN, PTE7_OUT, PTE7_IN_PU),
+	PINMUX_DATA(PTE6_DATA, PTE6_IN, PTE6_OUT, PTE6_IN_PU),
+	PINMUX_DATA(PTE5_DATA, PTE5_IN, PTE5_OUT, PTE5_IN_PU),
+	PINMUX_DATA(PTE4_DATA, PTE4_IN, PTE4_OUT, PTE4_IN_PU),
+	PINMUX_DATA(PTE3_DATA, PTE3_IN, PTE3_OUT, PTE3_IN_PU),
+	PINMUX_DATA(PTE2_DATA, PTE2_IN, PTE2_OUT, PTE2_IN_PU),
+	PINMUX_DATA(PTE1_DATA, PTE1_IN, PTE1_OUT, PTE1_IN_PU),
+	PINMUX_DATA(PTE0_DATA, PTE0_IN, PTE0_OUT, PTE0_IN_PU),
+
+	/* PTF GPIO */
+	PINMUX_DATA(PTF7_DATA, PTF7_IN, PTF7_OUT, PTF7_IN_PU),
+	PINMUX_DATA(PTF6_DATA, PTF6_IN, PTF6_OUT, PTF6_IN_PU),
+	PINMUX_DATA(PTF5_DATA, PTF5_IN, PTF5_OUT, PTF5_IN_PU),
+	PINMUX_DATA(PTF4_DATA, PTF4_IN, PTF4_OUT, PTF4_IN_PU),
+	PINMUX_DATA(PTF3_DATA, PTF3_IN, PTF3_OUT, PTF3_IN_PU),
+	PINMUX_DATA(PTF2_DATA, PTF2_IN, PTF2_OUT, PTF2_IN_PU),
+	PINMUX_DATA(PTF1_DATA, PTF1_IN, PTF1_OUT, PTF1_IN_PU),
+	PINMUX_DATA(PTF0_DATA, PTF0_IN, PTF0_OUT, PTF0_IN_PU),
+
+	/* PTG GPIO */
+	PINMUX_DATA(PTG5_DATA, PTG5_OUT),
+	PINMUX_DATA(PTG4_DATA, PTG4_OUT),
+	PINMUX_DATA(PTG3_DATA, PTG3_OUT),
+	PINMUX_DATA(PTG2_DATA, PTG2_OUT),
+	PINMUX_DATA(PTG1_DATA, PTG1_OUT),
+	PINMUX_DATA(PTG0_DATA, PTG0_OUT),
+
+	/* PTH GPIO */
+	PINMUX_DATA(PTH7_DATA, PTH7_IN, PTH7_OUT, PTH7_IN_PU),
+	PINMUX_DATA(PTH6_DATA, PTH6_IN, PTH6_OUT, PTH6_IN_PU),
+	PINMUX_DATA(PTH5_DATA, PTH5_IN, PTH5_OUT, PTH5_IN_PU),
+	PINMUX_DATA(PTH4_DATA, PTH4_IN, PTH4_OUT, PTH4_IN_PU),
+	PINMUX_DATA(PTH3_DATA, PTH3_IN, PTH3_OUT, PTH3_IN_PU),
+	PINMUX_DATA(PTH2_DATA, PTH2_IN, PTH2_OUT, PTH2_IN_PU),
+	PINMUX_DATA(PTH1_DATA, PTH1_IN, PTH1_OUT, PTH1_IN_PU),
+	PINMUX_DATA(PTH0_DATA, PTH0_IN, PTH0_OUT, PTH0_IN_PU),
+
+	/* PTJ GPIO */
+	PINMUX_DATA(PTJ7_DATA, PTJ7_OUT),
+	PINMUX_DATA(PTJ6_DATA, PTJ6_OUT),
+	PINMUX_DATA(PTJ5_DATA, PTJ5_OUT),
+	PINMUX_DATA(PTJ3_DATA, PTJ3_IN, PTJ3_OUT, PTJ3_IN_PU),
+	PINMUX_DATA(PTJ2_DATA, PTJ2_IN, PTJ2_OUT, PTJ2_IN_PU),
+	PINMUX_DATA(PTJ1_DATA, PTJ1_IN, PTJ1_OUT, PTJ1_IN_PU),
+	PINMUX_DATA(PTJ0_DATA, PTJ0_IN, PTJ0_OUT, PTJ0_IN_PU),
+
+	/* PTK GPIO */
+	PINMUX_DATA(PTK7_DATA, PTK7_IN, PTK7_OUT, PTK7_IN_PU),
+	PINMUX_DATA(PTK6_DATA, PTK6_IN, PTK6_OUT, PTK6_IN_PU),
+	PINMUX_DATA(PTK5_DATA, PTK5_IN, PTK5_OUT, PTK5_IN_PU),
+	PINMUX_DATA(PTK4_DATA, PTK4_IN, PTK4_OUT, PTK4_IN_PU),
+	PINMUX_DATA(PTK3_DATA, PTK3_IN, PTK3_OUT, PTK3_IN_PU),
+	PINMUX_DATA(PTK2_DATA, PTK2_IN, PTK2_OUT, PTK2_IN_PU),
+	PINMUX_DATA(PTK1_DATA, PTK1_IN, PTK1_OUT, PTK1_IN_PU),
+	PINMUX_DATA(PTK0_DATA, PTK0_IN, PTK0_OUT, PTK0_IN_PU),
+
+	/* PTL GPIO */
+	PINMUX_DATA(PTL7_DATA, PTL7_IN, PTL7_OUT, PTL7_IN_PU),
+	PINMUX_DATA(PTL6_DATA, PTL6_IN, PTL6_OUT, PTL6_IN_PU),
+	PINMUX_DATA(PTL5_DATA, PTL5_IN, PTL5_OUT, PTL5_IN_PU),
+	PINMUX_DATA(PTL4_DATA, PTL4_IN, PTL4_OUT, PTL4_IN_PU),
+	PINMUX_DATA(PTL3_DATA, PTL3_IN, PTL3_OUT, PTL3_IN_PU),
+	PINMUX_DATA(PTL2_DATA, PTL2_IN, PTL2_OUT, PTL2_IN_PU),
+	PINMUX_DATA(PTL1_DATA, PTL1_IN, PTL1_OUT, PTL1_IN_PU),
+	PINMUX_DATA(PTL0_DATA, PTL0_IN, PTL0_OUT, PTL0_IN_PU),
+
+	/* PTM GPIO */
+	PINMUX_DATA(PTM7_DATA, PTM7_IN, PTM7_OUT, PTM7_IN_PU),
+	PINMUX_DATA(PTM6_DATA, PTM6_IN, PTM6_OUT, PTM6_IN_PU),
+	PINMUX_DATA(PTM5_DATA, PTM5_IN, PTM5_OUT, PTM5_IN_PU),
+	PINMUX_DATA(PTM4_DATA, PTM4_IN, PTM4_OUT, PTM4_IN_PU),
+	PINMUX_DATA(PTM3_DATA, PTM3_IN, PTM3_OUT, PTM3_IN_PU),
+	PINMUX_DATA(PTM2_DATA, PTM2_IN, PTM2_OUT, PTM2_IN_PU),
+	PINMUX_DATA(PTM1_DATA, PTM1_IN, PTM1_OUT, PTM1_IN_PU),
+	PINMUX_DATA(PTM0_DATA, PTM0_IN, PTM0_OUT, PTM0_IN_PU),
+
+	/* PTN GPIO */
+	PINMUX_DATA(PTN7_DATA, PTN7_IN, PTN7_OUT, PTN7_IN_PU),
+	PINMUX_DATA(PTN6_DATA, PTN6_IN, PTN6_OUT, PTN6_IN_PU),
+	PINMUX_DATA(PTN5_DATA, PTN5_IN, PTN5_OUT, PTN5_IN_PU),
+	PINMUX_DATA(PTN4_DATA, PTN4_IN, PTN4_OUT, PTN4_IN_PU),
+	PINMUX_DATA(PTN3_DATA, PTN3_IN, PTN3_OUT, PTN3_IN_PU),
+	PINMUX_DATA(PTN2_DATA, PTN2_IN, PTN2_OUT, PTN2_IN_PU),
+	PINMUX_DATA(PTN1_DATA, PTN1_IN, PTN1_OUT, PTN1_IN_PU),
+	PINMUX_DATA(PTN0_DATA, PTN0_IN, PTN0_OUT, PTN0_IN_PU),
+
+	/* PTQ GPIO */
+	PINMUX_DATA(PTQ7_DATA, PTQ7_IN, PTQ7_OUT, PTQ7_IN_PU),
+	PINMUX_DATA(PTQ6_DATA, PTQ6_IN, PTQ6_OUT, PTQ6_IN_PU),
+	PINMUX_DATA(PTQ5_DATA, PTQ5_IN, PTQ5_OUT, PTQ5_IN_PU),
+	PINMUX_DATA(PTQ4_DATA, PTQ4_IN, PTQ4_OUT, PTQ4_IN_PU),
+	PINMUX_DATA(PTQ3_DATA, PTQ3_IN, PTQ3_OUT, PTQ3_IN_PU),
+	PINMUX_DATA(PTQ2_DATA, PTQ2_IN, PTQ2_OUT, PTQ2_IN_PU),
+	PINMUX_DATA(PTQ1_DATA, PTQ1_IN, PTQ1_OUT, PTQ1_IN_PU),
+	PINMUX_DATA(PTQ0_DATA, PTQ0_IN, PTQ0_OUT, PTQ0_IN_PU),
+
+	/* PTR GPIO */
+	PINMUX_DATA(PTR7_DATA, PTR7_IN, PTR7_OUT, PTR7_IN_PU),
+	PINMUX_DATA(PTR6_DATA, PTR6_IN, PTR6_OUT, PTR6_IN_PU),
+	PINMUX_DATA(PTR5_DATA, PTR5_IN, PTR5_OUT, PTR5_IN_PU),
+	PINMUX_DATA(PTR4_DATA, PTR4_IN, PTR4_OUT, PTR4_IN_PU),
+	PINMUX_DATA(PTR3_DATA, PTR3_IN,           PTR3_IN_PU),
+	PINMUX_DATA(PTR2_DATA, PTR2_IN,           PTR2_IN_PU),
+	PINMUX_DATA(PTR1_DATA, PTR1_IN, PTR1_OUT, PTR1_IN_PU),
+	PINMUX_DATA(PTR0_DATA, PTR0_IN, PTR0_OUT, PTR0_IN_PU),
+
+	/* PTS GPIO */
+	PINMUX_DATA(PTS6_DATA, PTS6_IN, PTS6_OUT, PTS6_IN_PU),
+	PINMUX_DATA(PTS5_DATA, PTS5_IN, PTS5_OUT, PTS5_IN_PU),
+	PINMUX_DATA(PTS4_DATA, PTS4_IN, PTS4_OUT, PTS4_IN_PU),
+	PINMUX_DATA(PTS3_DATA, PTS3_IN, PTS3_OUT, PTS3_IN_PU),
+	PINMUX_DATA(PTS2_DATA, PTS2_IN, PTS2_OUT, PTS2_IN_PU),
+	PINMUX_DATA(PTS1_DATA, PTS1_IN, PTS1_OUT, PTS1_IN_PU),
+	PINMUX_DATA(PTS0_DATA, PTS0_IN, PTS0_OUT, PTS0_IN_PU),
+
+	/* PTT GPIO */
+	PINMUX_DATA(PTT7_DATA, PTT7_IN, PTT7_OUT, PTT7_IN_PU),
+	PINMUX_DATA(PTT6_DATA, PTT6_IN, PTT6_OUT, PTT6_IN_PU),
+	PINMUX_DATA(PTT5_DATA, PTT5_IN, PTT5_OUT, PTT5_IN_PU),
+	PINMUX_DATA(PTT4_DATA, PTT4_IN, PTT4_OUT, PTT4_IN_PU),
+	PINMUX_DATA(PTT3_DATA, PTT3_IN, PTT3_OUT, PTT3_IN_PU),
+	PINMUX_DATA(PTT2_DATA, PTT2_IN, PTT2_OUT, PTT2_IN_PU),
+	PINMUX_DATA(PTT1_DATA, PTT1_IN, PTT1_OUT, PTT1_IN_PU),
+	PINMUX_DATA(PTT0_DATA, PTT0_IN, PTT0_OUT, PTT0_IN_PU),
+
+	/* PTU GPIO */
+	PINMUX_DATA(PTU7_DATA, PTU7_IN, PTU7_OUT, PTU7_IN_PU),
+	PINMUX_DATA(PTU6_DATA, PTU6_IN, PTU6_OUT, PTU6_IN_PU),
+	PINMUX_DATA(PTU5_DATA, PTU5_IN, PTU5_OUT, PTU5_IN_PU),
+	PINMUX_DATA(PTU4_DATA, PTU4_IN, PTU4_OUT, PTU4_IN_PU),
+	PINMUX_DATA(PTU3_DATA, PTU3_IN, PTU3_OUT, PTU3_IN_PU),
+	PINMUX_DATA(PTU2_DATA, PTU2_IN, PTU2_OUT, PTU2_IN_PU),
+	PINMUX_DATA(PTU1_DATA, PTU1_IN, PTU1_OUT, PTU1_IN_PU),
+	PINMUX_DATA(PTU0_DATA, PTU0_IN, PTU0_OUT, PTU0_IN_PU),
+
+	/* PTV GPIO */
+	PINMUX_DATA(PTV7_DATA, PTV7_IN, PTV7_OUT, PTV7_IN_PU),
+	PINMUX_DATA(PTV6_DATA, PTV6_IN, PTV6_OUT, PTV6_IN_PU),
+	PINMUX_DATA(PTV5_DATA, PTV5_IN, PTV5_OUT, PTV5_IN_PU),
+	PINMUX_DATA(PTV4_DATA, PTV4_IN, PTV4_OUT, PTV4_IN_PU),
+	PINMUX_DATA(PTV3_DATA, PTV3_IN, PTV3_OUT, PTV3_IN_PU),
+	PINMUX_DATA(PTV2_DATA, PTV2_IN, PTV2_OUT, PTV2_IN_PU),
+	PINMUX_DATA(PTV1_DATA, PTV1_IN, PTV1_OUT, PTV1_IN_PU),
+	PINMUX_DATA(PTV0_DATA, PTV0_IN, PTV0_OUT, PTV0_IN_PU),
+
+	/* PTW GPIO */
+	PINMUX_DATA(PTW7_DATA, PTW7_IN, PTW7_OUT, PTW7_IN_PU),
+	PINMUX_DATA(PTW6_DATA, PTW6_IN, PTW6_OUT, PTW6_IN_PU),
+	PINMUX_DATA(PTW5_DATA, PTW5_IN, PTW5_OUT, PTW5_IN_PU),
+	PINMUX_DATA(PTW4_DATA, PTW4_IN, PTW4_OUT, PTW4_IN_PU),
+	PINMUX_DATA(PTW3_DATA, PTW3_IN, PTW3_OUT, PTW3_IN_PU),
+	PINMUX_DATA(PTW2_DATA, PTW2_IN, PTW2_OUT, PTW2_IN_PU),
+	PINMUX_DATA(PTW1_DATA, PTW1_IN, PTW1_OUT, PTW1_IN_PU),
+	PINMUX_DATA(PTW0_DATA, PTW0_IN, PTW0_OUT, PTW0_IN_PU),
+
+	/* PTX GPIO */
+	PINMUX_DATA(PTX7_DATA, PTX7_IN, PTX7_OUT, PTX7_IN_PU),
+	PINMUX_DATA(PTX6_DATA, PTX6_IN, PTX6_OUT, PTX6_IN_PU),
+	PINMUX_DATA(PTX5_DATA, PTX5_IN, PTX5_OUT, PTX5_IN_PU),
+	PINMUX_DATA(PTX4_DATA, PTX4_IN, PTX4_OUT, PTX4_IN_PU),
+	PINMUX_DATA(PTX3_DATA, PTX3_IN, PTX3_OUT, PTX3_IN_PU),
+	PINMUX_DATA(PTX2_DATA, PTX2_IN, PTX2_OUT, PTX2_IN_PU),
+	PINMUX_DATA(PTX1_DATA, PTX1_IN, PTX1_OUT, PTX1_IN_PU),
+	PINMUX_DATA(PTX0_DATA, PTX0_IN, PTX0_OUT, PTX0_IN_PU),
+
+	/* PTY GPIO */
+	PINMUX_DATA(PTY7_DATA, PTY7_IN, PTY7_OUT, PTY7_IN_PU),
+	PINMUX_DATA(PTY6_DATA, PTY6_IN, PTY6_OUT, PTY6_IN_PU),
+	PINMUX_DATA(PTY5_DATA, PTY5_IN, PTY5_OUT, PTY5_IN_PU),
+	PINMUX_DATA(PTY4_DATA, PTY4_IN, PTY4_OUT, PTY4_IN_PU),
+	PINMUX_DATA(PTY3_DATA, PTY3_IN, PTY3_OUT, PTY3_IN_PU),
+	PINMUX_DATA(PTY2_DATA, PTY2_IN, PTY2_OUT, PTY2_IN_PU),
+	PINMUX_DATA(PTY1_DATA, PTY1_IN, PTY1_OUT, PTY1_IN_PU),
+	PINMUX_DATA(PTY0_DATA, PTY0_IN, PTY0_OUT, PTY0_IN_PU),
+
+	/* PTZ GPIO */
+	PINMUX_DATA(PTZ7_DATA, PTZ7_IN, PTZ7_OUT, PTZ7_IN_PU),
+	PINMUX_DATA(PTZ6_DATA, PTZ6_IN, PTZ6_OUT, PTZ6_IN_PU),
+	PINMUX_DATA(PTZ5_DATA, PTZ5_IN, PTZ5_OUT, PTZ5_IN_PU),
+	PINMUX_DATA(PTZ4_DATA, PTZ4_IN, PTZ4_OUT, PTZ4_IN_PU),
+	PINMUX_DATA(PTZ3_DATA, PTZ3_IN, PTZ3_OUT, PTZ3_IN_PU),
+	PINMUX_DATA(PTZ2_DATA, PTZ2_IN, PTZ2_OUT, PTZ2_IN_PU),
+	PINMUX_DATA(PTZ1_DATA, PTZ1_IN, PTZ1_OUT, PTZ1_IN_PU),
+	PINMUX_DATA(PTZ0_DATA, PTZ0_IN, PTZ0_OUT, PTZ0_IN_PU),
+
+	/* PTA FN */
+	PINMUX_DATA(D23_MARK,	PSA15_0, PSA14_0, PTA7_FN),
+	PINMUX_DATA(D22_MARK,	PSA15_0, PSA14_0, PTA6_FN),
+	PINMUX_DATA(D21_MARK,	PSA15_0, PSA14_0, PTA5_FN),
+	PINMUX_DATA(D20_MARK,	PSA15_0, PSA14_0, PTA4_FN),
+	PINMUX_DATA(D19_MARK,	PSA15_0, PSA14_0, PTA3_FN),
+	PINMUX_DATA(D18_MARK,	PSA15_0, PSA14_0, PTA2_FN),
+	PINMUX_DATA(D17_MARK,	PSA15_0, PSA14_0, PTA1_FN),
+	PINMUX_DATA(D16_MARK,	PSA15_0, PSA14_0, PTA0_FN),
+
+	PINMUX_DATA(KEYOUT2_MARK,	PSA15_0, PSA14_1, PTA7_FN),
+	PINMUX_DATA(KEYOUT1_MARK,	PSA15_0, PSA14_1, PTA6_FN),
+	PINMUX_DATA(KEYOUT0_MARK,	PSA15_0, PSA14_1, PTA5_FN),
+	PINMUX_DATA(KEYIN4_MARK,	PSA15_0, PSA14_1, PTA4_FN),
+	PINMUX_DATA(KEYIN3_MARK,	PSA15_0, PSA14_1, PTA3_FN),
+	PINMUX_DATA(KEYIN2_MARK,	PSA15_0, PSA14_1, PTA2_FN),
+	PINMUX_DATA(KEYIN1_MARK,	PSA15_0, PSA14_1, PTA1_FN),
+	PINMUX_DATA(KEYIN0_MARK,	PSA15_0, PSA14_1, PTA0_FN),
+
+	PINMUX_DATA(IDED15_MARK,	PSA15_1, PSA14_0, PTA7_FN),
+	PINMUX_DATA(IDED14_MARK,	PSA15_1, PSA14_0, PTA6_FN),
+	PINMUX_DATA(IDED13_MARK,	PSA15_1, PSA14_0, PTA5_FN),
+	PINMUX_DATA(IDED12_MARK,	PSA15_1, PSA14_0, PTA4_FN),
+	PINMUX_DATA(IDED11_MARK,	PSA15_1, PSA14_0, PTA3_FN),
+	PINMUX_DATA(IDED10_MARK,	PSA15_1, PSA14_0, PTA2_FN),
+	PINMUX_DATA(IDED9_MARK,		PSA15_1, PSA14_0, PTA1_FN),
+	PINMUX_DATA(IDED8_MARK,		PSA15_1, PSA14_0, PTA0_FN),
+
+	/* PTB FN */
+	PINMUX_DATA(D31_MARK,		PSE15_0, PSE14_0, PTB7_FN),
+	PINMUX_DATA(D30_MARK,		PSE15_0, PSE14_0, PTB6_FN),
+	PINMUX_DATA(D29_MARK,		PSE11_0,          PTB5_FN),
+	PINMUX_DATA(D28_MARK,		PSE11_0,          PTB4_FN),
+	PINMUX_DATA(D27_MARK,		PSE11_0,          PTB3_FN),
+	PINMUX_DATA(D26_MARK,		PSA15_0, PSA14_0, PTB2_FN),
+	PINMUX_DATA(D25_MARK,		PSA15_0, PSA14_0, PTB1_FN),
+	PINMUX_DATA(D24_MARK,		PSA15_0, PSA14_0, PTB0_FN),
+
+	PINMUX_DATA(IDEA1_MARK,		PSE15_1, PSE14_0, PTB7_FN),
+	PINMUX_DATA(IDEA0_MARK,		PSE15_1, PSE14_0, PTB6_FN),
+	PINMUX_DATA(IODREQ_MARK,	PSE11_1,          PTB5_FN),
+	PINMUX_DATA(IDECS0_MARK,	PSE11_1,          PTB4_FN),
+	PINMUX_DATA(IDECS1_MARK,	PSE11_1,          PTB3_FN),
+	PINMUX_DATA(IDEIORD_MARK,	PSA15_1, PSA14_0, PTB2_FN),
+	PINMUX_DATA(IDEIOWR_MARK,	PSA15_1, PSA14_0, PTB1_FN),
+	PINMUX_DATA(IDEINT_MARK,	PSA15_1, PSA14_0, PTB0_FN),
+
+	PINMUX_DATA(TPUTO1_MARK,	PSE15_0, PSE14_1, PTB7_FN),
+	PINMUX_DATA(TPUTO0_MARK,	PSE15_0, PSE14_1, PTB6_FN),
+
+	PINMUX_DATA(KEYOUT5_IN5_MARK,	PSA15_0, PSA14_1, PTB2_FN),
+	PINMUX_DATA(KEYOUT4_IN6_MARK,	PSA15_0, PSA14_1, PTB1_FN),
+	PINMUX_DATA(KEYOUT3_MARK,	PSA15_0, PSA14_1, PTB0_FN),
+
+	/* PTC FN */
+	PINMUX_DATA(LCDD7_MARK, PSD5_0, PTC7_FN),
+	PINMUX_DATA(LCDD6_MARK, PSD5_0, PTC6_FN),
+	PINMUX_DATA(LCDD5_MARK, PSD5_0, PTC5_FN),
+	PINMUX_DATA(LCDD4_MARK, PSD5_0, PTC4_FN),
+	PINMUX_DATA(LCDD3_MARK, PSD5_0, PTC3_FN),
+	PINMUX_DATA(LCDD2_MARK, PSD5_0, PTC2_FN),
+	PINMUX_DATA(LCDD1_MARK, PSD5_0, PTC1_FN),
+	PINMUX_DATA(LCDD0_MARK, PSD5_0, PTC0_FN),
+
+	/* PTD FN */
+	PINMUX_DATA(LCDD15_MARK, PSD5_0, PTD7_FN),
+	PINMUX_DATA(LCDD14_MARK, PSD5_0, PTD6_FN),
+	PINMUX_DATA(LCDD13_MARK, PSD5_0, PTD5_FN),
+	PINMUX_DATA(LCDD12_MARK, PSD5_0, PTD4_FN),
+	PINMUX_DATA(LCDD11_MARK, PSD5_0, PTD3_FN),
+	PINMUX_DATA(LCDD10_MARK, PSD5_0, PTD2_FN),
+	PINMUX_DATA(LCDD9_MARK,  PSD5_0, PTD1_FN),
+	PINMUX_DATA(LCDD8_MARK,  PSD5_0, PTD0_FN),
+
+	/* PTE FN */
+	PINMUX_DATA(FSIMCKB_MARK, PTE7_FN),
+	PINMUX_DATA(FSIMCKA_MARK, PTE6_FN),
+
+	PINMUX_DATA(LCDD21_MARK,	PSC5_0, PSC4_0, PTE5_FN),
+	PINMUX_DATA(LCDD20_MARK,	PSD3_0, PSD2_0, PTE4_FN),
+	PINMUX_DATA(LCDD19_MARK,	PSA3_0, PSA2_0, PTE3_FN),
+	PINMUX_DATA(LCDD18_MARK,	PSA3_0, PSA2_0, PTE2_FN),
+	PINMUX_DATA(LCDD17_MARK,	PSD5_0,         PTE1_FN),
+	PINMUX_DATA(LCDD16_MARK,	PSD5_0,         PTE0_FN),
+
+	PINMUX_DATA(SCIF2_L_TXD_MARK,	PSC5_0, PSC4_1, PTE5_FN),
+	PINMUX_DATA(SCIF4_SCK_MARK,	PSD3_0, PSD2_1, PTE4_FN),
+	PINMUX_DATA(SCIF4_RXD_MARK,	PSA3_0, PSA2_1, PTE3_FN),
+	PINMUX_DATA(SCIF4_TXD_MARK,	PSA3_0, PSA2_1, PTE2_FN),
+
+	/* PTF FN */
+	PINMUX_DATA(LCDVSYN_MARK,	PSD8_0,          PTF7_FN),
+	PINMUX_DATA(LCDDISP_MARK,	PSD10_0, PSD9_0, PTF6_FN),
+	PINMUX_DATA(LCDHSYN_MARK,	PSD10_0, PSD9_0, PTF5_FN),
+	PINMUX_DATA(LCDDON_MARK,	PSD8_0,          PTF4_FN),
+	PINMUX_DATA(LCDDCK_MARK,	PSD10_0, PSD9_0, PTF3_FN),
+	PINMUX_DATA(LCDVEPWC_MARK,	PSA6_0,          PTF2_FN),
+	PINMUX_DATA(LCDD23_MARK,	PSC7_0,  PSC6_0, PTF1_FN),
+	PINMUX_DATA(LCDD22_MARK,	PSC5_0,  PSC4_0, PTF0_FN),
+
+	PINMUX_DATA(LCDRS_MARK,		PSD10_0, PSD9_1, PTF6_FN),
+	PINMUX_DATA(LCDCS_MARK,		PSD10_0, PSD9_1, PTF5_FN),
+	PINMUX_DATA(LCDWR_MARK,		PSD10_0, PSD9_1, PTF3_FN),
+
+	PINMUX_DATA(SCIF0_TXD_MARK,	PSA6_1,          PTF2_FN),
+	PINMUX_DATA(SCIF2_L_SCK_MARK,	PSC7_0,  PSC6_1, PTF1_FN),
+	PINMUX_DATA(SCIF2_L_RXD_MARK,	PSC5_0,  PSC4_1, PTF0_FN),
+
+	/* PTG FN */
+	PINMUX_DATA(AUDCK_MARK,   PTG5_FN),
+	PINMUX_DATA(AUDSYNC_MARK, PTG4_FN),
+	PINMUX_DATA(AUDATA3_MARK, PTG3_FN),
+	PINMUX_DATA(AUDATA2_MARK, PTG2_FN),
+	PINMUX_DATA(AUDATA1_MARK, PTG1_FN),
+	PINMUX_DATA(AUDATA0_MARK, PTG0_FN),
+
+	/* PTH FN */
+	PINMUX_DATA(VIO0_VD_MARK,  PTH7_FN),
+	PINMUX_DATA(VIO0_CLK_MARK, PTH6_FN),
+	PINMUX_DATA(VIO0_D7_MARK,  PTH5_FN),
+	PINMUX_DATA(VIO0_D6_MARK,  PTH4_FN),
+	PINMUX_DATA(VIO0_D5_MARK,  PTH3_FN),
+	PINMUX_DATA(VIO0_D4_MARK,  PTH2_FN),
+	PINMUX_DATA(VIO0_D3_MARK,  PTH1_FN),
+	PINMUX_DATA(VIO0_D2_MARK,  PTH0_FN),
+
+	/* PTJ FN */
+	PINMUX_DATA(PDSTATUS_MARK,	PTJ7_FN),
+	PINMUX_DATA(STATUS2_MARK,	PTJ6_FN),
+	PINMUX_DATA(STATUS0_MARK,	PTJ5_FN),
+	PINMUX_DATA(A25_MARK,		PSA8_0, PTJ3_FN),
+	PINMUX_DATA(BS_MARK,		PSA8_1, PTJ3_FN),
+	PINMUX_DATA(A24_MARK,		PTJ2_FN),
+	PINMUX_DATA(A23_MARK,		PTJ1_FN),
+	PINMUX_DATA(A22_MARK,		PTJ0_FN),
+
+	/* PTK FN */
+	PINMUX_DATA(VIO1_D5_MARK,	PSB7_0, PSB6_0, PTK7_FN),
+	PINMUX_DATA(VIO1_D4_MARK,	PSB7_0, PSB6_0, PTK6_FN),
+	PINMUX_DATA(VIO1_D3_MARK,	PSB7_0, PSB6_0, PTK5_FN),
+	PINMUX_DATA(VIO1_D2_MARK,	PSB7_0, PSB6_0, PTK4_FN),
+	PINMUX_DATA(VIO1_D1_MARK,	PSB7_0, PSB6_0, PTK3_FN),
+	PINMUX_DATA(VIO1_D0_MARK,	PSB7_0, PSB6_0, PTK2_FN),
+
+	PINMUX_DATA(VIO0_D13_MARK,	PSB7_0, PSB6_1, PTK7_FN),
+	PINMUX_DATA(VIO0_D12_MARK,	PSB7_0, PSB6_1, PTK6_FN),
+	PINMUX_DATA(VIO0_D11_MARK,	PSB7_0, PSB6_1, PTK5_FN),
+	PINMUX_DATA(VIO0_D10_MARK,	PSB7_0, PSB6_1, PTK4_FN),
+	PINMUX_DATA(VIO0_D9_MARK,	PSB7_0, PSB6_1, PTK3_FN),
+	PINMUX_DATA(VIO0_D8_MARK,	PSB7_0, PSB6_1, PTK2_FN),
+
+	PINMUX_DATA(IDED5_MARK,		PSB7_1, PSB6_0, PTK7_FN),
+	PINMUX_DATA(IDED4_MARK,		PSB7_1, PSB6_0, PTK6_FN),
+	PINMUX_DATA(IDED3_MARK,		PSB7_1, PSB6_0, PTK5_FN),
+	PINMUX_DATA(IDED2_MARK,		PSB7_1, PSB6_0, PTK4_FN),
+	PINMUX_DATA(IDED1_MARK,		PSB7_1, PSB6_0, PTK3_FN),
+	PINMUX_DATA(IDED0_MARK,		PSB7_1, PSB6_0, PTK2_FN),
+
+	PINMUX_DATA(VIO0_FLD_MARK,	PTK1_FN),
+	PINMUX_DATA(VIO0_HD_MARK,	PTK0_FN),
+
+	/* PTL FN */
+	PINMUX_DATA(DV_D5_MARK,		PSB9_0, PSB8_0, PTL7_FN),
+	PINMUX_DATA(DV_D4_MARK,		PSB9_0, PSB8_0, PTL6_FN),
+	PINMUX_DATA(DV_D3_MARK,		PSE7_0, PSE6_0, PTL5_FN),
+	PINMUX_DATA(DV_D2_MARK,		PSC9_0, PSC8_0, PTL4_FN),
+	PINMUX_DATA(DV_D1_MARK,		PSC9_0, PSC8_0, PTL3_FN),
+	PINMUX_DATA(DV_D0_MARK,		PSC9_0, PSC8_0, PTL2_FN),
+	PINMUX_DATA(DV_D15_MARK,	PSD4_0,         PTL1_FN),
+	PINMUX_DATA(DV_D14_MARK,	PSE5_0, PSE4_0, PTL0_FN),
+
+	PINMUX_DATA(SCIF3_V_SCK_MARK,	PSB9_0, PSB8_1, PTL7_FN),
+	PINMUX_DATA(SCIF3_V_RXD_MARK,	PSB9_0, PSB8_1, PTL6_FN),
+	PINMUX_DATA(SCIF3_V_TXD_MARK,	PSE7_0, PSE6_1, PTL5_FN),
+	PINMUX_DATA(SCIF1_SCK_MARK,	PSC9_0, PSC8_1, PTL4_FN),
+	PINMUX_DATA(SCIF1_RXD_MARK,	PSC9_0, PSC8_1, PTL3_FN),
+	PINMUX_DATA(SCIF1_TXD_MARK,	PSC9_0, PSC8_1, PTL2_FN),
+
+	PINMUX_DATA(RMII_RXD0_MARK,	PSB9_1, PSB8_0, PTL7_FN),
+	PINMUX_DATA(RMII_RXD1_MARK,	PSB9_1, PSB8_0, PTL6_FN),
+	PINMUX_DATA(RMII_REF_CLK_MARK,	PSE7_1, PSE6_0, PTL5_FN),
+	PINMUX_DATA(RMII_TX_EN_MARK,	PSC9_1, PSC8_0, PTL4_FN),
+	PINMUX_DATA(RMII_TXD0_MARK,	PSC9_1, PSC8_0, PTL3_FN),
+	PINMUX_DATA(RMII_TXD1_MARK,	PSC9_1, PSC8_0, PTL2_FN),
+
+	PINMUX_DATA(MSIOF0_MCK_MARK,	PSE5_0, PSE4_1, PTL0_FN),
+
+	/* PTM FN */
+	PINMUX_DATA(DV_D13_MARK,	PSC13_0, PSC12_0, PTM7_FN),
+	PINMUX_DATA(DV_D12_MARK,	PSC13_0, PSC12_0, PTM6_FN),
+	PINMUX_DATA(DV_D11_MARK,	PSC13_0, PSC12_0, PTM5_FN),
+	PINMUX_DATA(DV_D10_MARK,	PSC13_0, PSC12_0, PTM4_FN),
+	PINMUX_DATA(DV_D9_MARK,		PSC11_0, PSC10_0, PTM3_FN),
+	PINMUX_DATA(DV_D8_MARK,		PSC11_0, PSC10_0, PTM2_FN),
+
+	PINMUX_DATA(MSIOF0_TSCK_MARK,	PSC13_0, PSC12_1, PTM7_FN),
+	PINMUX_DATA(MSIOF0_RXD_MARK,	PSC13_0, PSC12_1, PTM6_FN),
+	PINMUX_DATA(MSIOF0_TXD_MARK,	PSC13_0, PSC12_1, PTM5_FN),
+	PINMUX_DATA(MSIOF0_TSYNC_MARK,	PSC13_0, PSC12_1, PTM4_FN),
+	PINMUX_DATA(MSIOF0_SS1_MARK,	PSC11_0, PSC10_1, PTM3_FN),
+	PINMUX_DATA(MSIOF0_RSCK_MARK,	PSC11_1, PSC10_0, PTM3_FN),
+	PINMUX_DATA(MSIOF0_SS2_MARK,	PSC11_0, PSC10_1, PTM2_FN),
+	PINMUX_DATA(MSIOF0_RSYNC_MARK,	PSC11_1, PSC10_0, PTM2_FN),
+
+	PINMUX_DATA(LCDVCPWC_MARK,	PSA6_0, PTM1_FN),
+	PINMUX_DATA(LCDRD_MARK,		PSA7_0, PTM0_FN),
+
+	PINMUX_DATA(SCIF0_RXD_MARK,	PSA6_1, PTM1_FN),
+	PINMUX_DATA(SCIF0_SCK_MARK,	PSA7_1, PTM0_FN),
+
+	/* PTN FN */
+	PINMUX_DATA(VIO0_D1_MARK,	PTN7_FN),
+	PINMUX_DATA(VIO0_D0_MARK,	PTN6_FN),
+
+	PINMUX_DATA(DV_CLKI_MARK,	PSD11_0,          PTN5_FN),
+	PINMUX_DATA(DV_CLK_MARK,	PSD13_0, PSD12_0, PTN4_FN),
+	PINMUX_DATA(DV_VSYNC_MARK,	PSD15_0, PSD14_0, PTN3_FN),
+	PINMUX_DATA(DV_HSYNC_MARK,	PSB5_0,  PSB4_0,  PTN2_FN),
+	PINMUX_DATA(DV_D7_MARK,		PSB3_0,  PSB2_0,  PTN1_FN),
+	PINMUX_DATA(DV_D6_MARK,		PSB1_0,  PSB0_0,  PTN0_FN),
+
+	PINMUX_DATA(SCIF2_V_SCK_MARK,	PSD13_0, PSD12_1, PTN4_FN),
+	PINMUX_DATA(SCIF2_V_RXD_MARK,	PSD15_0, PSD14_1, PTN3_FN),
+	PINMUX_DATA(SCIF2_V_TXD_MARK,	PSB5_0,  PSB4_1,  PTN2_FN),
+	PINMUX_DATA(SCIF3_V_CTS_MARK,	PSB3_0,  PSB2_1,  PTN1_FN),
+	PINMUX_DATA(SCIF3_V_RTS_MARK,	PSB1_0,  PSB0_1,  PTN0_FN),
+
+	PINMUX_DATA(RMII_RX_ER_MARK,	PSB3_1, PSB2_0, PTN1_FN),
+	PINMUX_DATA(RMII_CRS_DV_MARK,	PSB1_1, PSB0_0, PTN0_FN),
+
+	/* PTQ FN */
+	PINMUX_DATA(D7_MARK, PTQ7_FN),
+	PINMUX_DATA(D6_MARK, PTQ6_FN),
+	PINMUX_DATA(D5_MARK, PTQ5_FN),
+	PINMUX_DATA(D4_MARK, PTQ4_FN),
+	PINMUX_DATA(D3_MARK, PTQ3_FN),
+	PINMUX_DATA(D2_MARK, PTQ2_FN),
+	PINMUX_DATA(D1_MARK, PTQ1_FN),
+	PINMUX_DATA(D0_MARK, PTQ0_FN),
+
+	/* PTR FN */
+	PINMUX_DATA(CS6B_CE1B_MARK,	                PTR7_FN),
+	PINMUX_DATA(CS6A_CE2B_MARK,	                PTR6_FN),
+	PINMUX_DATA(CS5B_CE1A_MARK,	                PTR5_FN),
+	PINMUX_DATA(CS5A_CE2A_MARK,	                PTR4_FN),
+	PINMUX_DATA(IOIS16_MARK,	PSA5_0,         PTR3_FN),
+	PINMUX_DATA(WAIT_MARK,		                PTR2_FN),
+	PINMUX_DATA(WE3_ICIOWR_MARK,	PSA1_0, PSA0_0, PTR1_FN),
+	PINMUX_DATA(WE2_ICIORD_MARK,	PSD1_0, PSD0_0, PTR0_FN),
+
+	PINMUX_DATA(LCDLCLK_MARK,	PSA5_1,         PTR3_FN),
+
+	PINMUX_DATA(IDEA2_MARK,		PSD1_1, PSD0_0, PTR0_FN),
+
+	PINMUX_DATA(TPUTO3_MARK,	PSA1_0, PSA0_1, PTR1_FN),
+	PINMUX_DATA(TPUTI3_MARK,	PSA1_1, PSA0_0, PTR1_FN),
+	PINMUX_DATA(TPUTO2_MARK,	PSD1_0, PSD0_1, PTR0_FN),
+
+	/* PTS FN */
+	PINMUX_DATA(VIO_CKO_MARK,	PTS6_FN),
+
+	PINMUX_DATA(TPUTI2_MARK,	PSE9_0, PSE8_1, PTS5_FN),
+
+	PINMUX_DATA(IDEIORDY_MARK,	PSE9_1, PSE8_0, PTS5_FN),
+
+	PINMUX_DATA(VIO1_FLD_MARK,	PSE9_0, PSE8_0, PTS5_FN),
+	PINMUX_DATA(VIO1_HD_MARK,	PSA10_0,        PTS4_FN),
+	PINMUX_DATA(VIO1_VD_MARK,	PSA9_0,         PTS3_FN),
+	PINMUX_DATA(VIO1_CLK_MARK,	PSA9_0,         PTS2_FN),
+	PINMUX_DATA(VIO1_D7_MARK,	PSB7_0, PSB6_0, PTS1_FN),
+	PINMUX_DATA(VIO1_D6_MARK,	PSB7_0, PSB6_0, PTS0_FN),
+
+	PINMUX_DATA(SCIF5_SCK_MARK,	PSA10_1, PTS4_FN),
+	PINMUX_DATA(SCIF5_RXD_MARK,	PSA9_1,  PTS3_FN),
+	PINMUX_DATA(SCIF5_TXD_MARK,	PSA9_1,  PTS2_FN),
+
+	PINMUX_DATA(VIO0_D15_MARK,	PSB7_0, PSB6_1, PTS1_FN),
+	PINMUX_DATA(VIO0_D14_MARK,	PSB7_0, PSB6_1, PTS0_FN),
+
+	PINMUX_DATA(IDED7_MARK,		PSB7_1, PSB6_0, PTS1_FN),
+	PINMUX_DATA(IDED6_MARK,		PSB7_1, PSB6_0, PTS0_FN),
+
+	/* PTT FN */
+	PINMUX_DATA(D15_MARK, PTT7_FN),
+	PINMUX_DATA(D14_MARK, PTT6_FN),
+	PINMUX_DATA(D13_MARK, PTT5_FN),
+	PINMUX_DATA(D12_MARK, PTT4_FN),
+	PINMUX_DATA(D11_MARK, PTT3_FN),
+	PINMUX_DATA(D10_MARK, PTT2_FN),
+	PINMUX_DATA(D9_MARK,  PTT1_FN),
+	PINMUX_DATA(D8_MARK,  PTT0_FN),
+
+	/* PTU FN */
+	PINMUX_DATA(DMAC_DACK0_MARK, PTU7_FN),
+	PINMUX_DATA(DMAC_DREQ0_MARK, PTU6_FN),
+
+	PINMUX_DATA(FSIOASD_MARK,	PSE1_0, PTU5_FN),
+	PINMUX_DATA(FSIIABCK_MARK,	PSE1_0, PTU4_FN),
+	PINMUX_DATA(FSIIALRCK_MARK,	PSE1_0, PTU3_FN),
+	PINMUX_DATA(FSIOABCK_MARK,	PSE1_0, PTU2_FN),
+	PINMUX_DATA(FSIOALRCK_MARK,	PSE1_0, PTU1_FN),
+	PINMUX_DATA(CLKAUDIOAO_MARK,	PSE0_0, PTU0_FN),
+
+	/* PTV FN */
+	PINMUX_DATA(FSIIBSD_MARK,	PSD7_0,  PSD6_0,  PTV7_FN),
+	PINMUX_DATA(FSIOBSD_MARK,	PSD7_0,  PSD6_0,  PTV6_FN),
+	PINMUX_DATA(FSIIBBCK_MARK,	PSC15_0, PSC14_0, PTV5_FN),
+	PINMUX_DATA(FSIIBLRCK_MARK,	PSC15_0, PSC14_0, PTV4_FN),
+	PINMUX_DATA(FSIOBBCK_MARK,	PSC15_0, PSC14_0, PTV3_FN),
+	PINMUX_DATA(FSIOBLRCK_MARK,	PSC15_0, PSC14_0, PTV2_FN),
+	PINMUX_DATA(CLKAUDIOBO_MARK,	PSE3_0,  PSE2_0,  PTV1_FN),
+	PINMUX_DATA(FSIIASD_MARK,	PSE10_0,          PTV0_FN),
+
+	PINMUX_DATA(MSIOF1_SS2_MARK,	PSD7_0,  PSD6_1,  PTV7_FN),
+	PINMUX_DATA(MSIOF1_RSYNC_MARK,	PSD7_1,  PSD6_0,  PTV7_FN),
+	PINMUX_DATA(MSIOF1_SS1_MARK,	PSD7_0,  PSD6_1,  PTV6_FN),
+	PINMUX_DATA(MSIOF1_RSCK_MARK,	PSD7_1,  PSD6_0,  PTV6_FN),
+	PINMUX_DATA(MSIOF1_RXD_MARK,	PSC15_0, PSC14_1, PTV5_FN),
+	PINMUX_DATA(MSIOF1_TSYNC_MARK,	PSC15_0, PSC14_1, PTV4_FN),
+	PINMUX_DATA(MSIOF1_TSCK_MARK,	PSC15_0, PSC14_1, PTV3_FN),
+	PINMUX_DATA(MSIOF1_TXD_MARK,	PSC15_0, PSC14_1, PTV2_FN),
+	PINMUX_DATA(MSIOF1_MCK_MARK,	PSE3_0,  PSE2_1,  PTV1_FN),
+
+	/* PTW FN */
+	PINMUX_DATA(MMC_D7_MARK,	PSE13_0, PSE12_0, PTW7_FN),
+	PINMUX_DATA(MMC_D6_MARK,	PSE13_0, PSE12_0, PTW6_FN),
+	PINMUX_DATA(MMC_D5_MARK,	PSE13_0, PSE12_0, PTW5_FN),
+	PINMUX_DATA(MMC_D4_MARK,	PSE13_0, PSE12_0, PTW4_FN),
+	PINMUX_DATA(MMC_D3_MARK,	PSA13_0,          PTW3_FN),
+	PINMUX_DATA(MMC_D2_MARK,	PSA13_0,          PTW2_FN),
+	PINMUX_DATA(MMC_D1_MARK,	PSA13_0,          PTW1_FN),
+	PINMUX_DATA(MMC_D0_MARK,	PSA13_0,          PTW0_FN),
+
+	PINMUX_DATA(SDHI1CD_MARK,	PSE13_0, PSE12_1, PTW7_FN),
+	PINMUX_DATA(SDHI1WP_MARK,	PSE13_0, PSE12_1, PTW6_FN),
+	PINMUX_DATA(SDHI1D3_MARK,	PSE13_0, PSE12_1, PTW5_FN),
+	PINMUX_DATA(SDHI1D2_MARK,	PSE13_0, PSE12_1, PTW4_FN),
+	PINMUX_DATA(SDHI1D1_MARK,	PSA13_1,          PTW3_FN),
+	PINMUX_DATA(SDHI1D0_MARK,	PSA13_1,          PTW2_FN),
+	PINMUX_DATA(SDHI1CMD_MARK,	PSA13_1,          PTW1_FN),
+	PINMUX_DATA(SDHI1CLK_MARK,	PSA13_1,          PTW0_FN),
+
+	PINMUX_DATA(IODACK_MARK,	PSE13_1, PSE12_0, PTW7_FN),
+	PINMUX_DATA(IDERST_MARK,	PSE13_1, PSE12_0, PTW6_FN),
+	PINMUX_DATA(EXBUF_ENB_MARK,	PSE13_1, PSE12_0, PTW5_FN),
+	PINMUX_DATA(DIRECTION_MARK,	PSE13_1, PSE12_0, PTW4_FN),
+
+	/* PTX FN */
+	PINMUX_DATA(DMAC_DACK1_MARK,	PSA12_0, PTX7_FN),
+	PINMUX_DATA(DMAC_DREQ1_MARK,	PSA12_0, PTX6_FN),
+
+	PINMUX_DATA(IRDA_OUT_MARK,	PSA12_1, PTX7_FN),
+	PINMUX_DATA(IRDA_IN_MARK,	PSA12_1, PTX6_FN),
+
+	PINMUX_DATA(TSIF_TS0_SDAT_MARK,	PSC0_0, PTX5_FN),
+	PINMUX_DATA(TSIF_TS0_SCK_MARK,	PSC1_0, PTX4_FN),
+	PINMUX_DATA(TSIF_TS0_SDEN_MARK,	PSC2_0, PTX3_FN),
+	PINMUX_DATA(TSIF_TS0_SPSYNC_MARK,       PTX2_FN),
+
+	PINMUX_DATA(LNKSTA_MARK,	PSC0_1, PTX5_FN),
+	PINMUX_DATA(MDIO_MARK,		PSC1_1, PTX4_FN),
+	PINMUX_DATA(MDC_MARK,		PSC2_1, PTX3_FN),
+
+	PINMUX_DATA(MMC_CLK_MARK, PTX1_FN),
+	PINMUX_DATA(MMC_CMD_MARK, PTX0_FN),
+
+	/* PTY FN */
+	PINMUX_DATA(SDHI0CD_MARK,  PTY7_FN),
+	PINMUX_DATA(SDHI0WP_MARK,  PTY6_FN),
+	PINMUX_DATA(SDHI0D3_MARK,  PTY5_FN),
+	PINMUX_DATA(SDHI0D2_MARK,  PTY4_FN),
+	PINMUX_DATA(SDHI0D1_MARK,  PTY3_FN),
+	PINMUX_DATA(SDHI0D0_MARK,  PTY2_FN),
+	PINMUX_DATA(SDHI0CMD_MARK, PTY1_FN),
+	PINMUX_DATA(SDHI0CLK_MARK, PTY0_FN),
+
+	/* PTZ FN */
+	PINMUX_DATA(INTC_IRQ7_MARK,	PSB10_0, PTZ7_FN),
+	PINMUX_DATA(INTC_IRQ6_MARK,	PSB11_0, PTZ6_FN),
+	PINMUX_DATA(INTC_IRQ5_MARK,	PSB12_0, PTZ5_FN),
+	PINMUX_DATA(INTC_IRQ4_MARK,	PSB13_0, PTZ4_FN),
+	PINMUX_DATA(INTC_IRQ3_MARK,	PSB14_0, PTZ3_FN),
+	PINMUX_DATA(INTC_IRQ2_MARK,	         PTZ2_FN),
+	PINMUX_DATA(INTC_IRQ1_MARK,	         PTZ1_FN),
+	PINMUX_DATA(INTC_IRQ0_MARK,	         PTZ0_FN),
+
+	PINMUX_DATA(SCIF3_I_CTS_MARK,	PSB10_1, PTZ7_FN),
+	PINMUX_DATA(SCIF3_I_RTS_MARK,	PSB11_1, PTZ6_FN),
+	PINMUX_DATA(SCIF3_I_SCK_MARK,	PSB12_1, PTZ5_FN),
+	PINMUX_DATA(SCIF3_I_RXD_MARK,	PSB13_1, PTZ4_FN),
+	PINMUX_DATA(SCIF3_I_TXD_MARK,	PSB14_1, PTZ3_FN),
+};
+
+static struct pinmux_gpio pinmux_gpios[] = {
+	/* PTA */
+	PINMUX_GPIO(GPIO_PTA7, PTA7_DATA),
+	PINMUX_GPIO(GPIO_PTA6, PTA6_DATA),
+	PINMUX_GPIO(GPIO_PTA5, PTA5_DATA),
+	PINMUX_GPIO(GPIO_PTA4, PTA4_DATA),
+	PINMUX_GPIO(GPIO_PTA3, PTA3_DATA),
+	PINMUX_GPIO(GPIO_PTA2, PTA2_DATA),
+	PINMUX_GPIO(GPIO_PTA1, PTA1_DATA),
+	PINMUX_GPIO(GPIO_PTA0, PTA0_DATA),
+
+	/* PTB */
+	PINMUX_GPIO(GPIO_PTB7, PTB7_DATA),
+	PINMUX_GPIO(GPIO_PTB6, PTB6_DATA),
+	PINMUX_GPIO(GPIO_PTB5, PTB5_DATA),
+	PINMUX_GPIO(GPIO_PTB4, PTB4_DATA),
+	PINMUX_GPIO(GPIO_PTB3, PTB3_DATA),
+	PINMUX_GPIO(GPIO_PTB2, PTB2_DATA),
+	PINMUX_GPIO(GPIO_PTB1, PTB1_DATA),
+	PINMUX_GPIO(GPIO_PTB0, PTB0_DATA),
+
+	/* PTC */
+	PINMUX_GPIO(GPIO_PTC7, PTC7_DATA),
+	PINMUX_GPIO(GPIO_PTC6, PTC6_DATA),
+	PINMUX_GPIO(GPIO_PTC5, PTC5_DATA),
+	PINMUX_GPIO(GPIO_PTC4, PTC4_DATA),
+	PINMUX_GPIO(GPIO_PTC3, PTC3_DATA),
+	PINMUX_GPIO(GPIO_PTC2, PTC2_DATA),
+	PINMUX_GPIO(GPIO_PTC1, PTC1_DATA),
+	PINMUX_GPIO(GPIO_PTC0, PTC0_DATA),
+
+	/* PTD */
+	PINMUX_GPIO(GPIO_PTD7, PTD7_DATA),
+	PINMUX_GPIO(GPIO_PTD6, PTD6_DATA),
+	PINMUX_GPIO(GPIO_PTD5, PTD5_DATA),
+	PINMUX_GPIO(GPIO_PTD4, PTD4_DATA),
+	PINMUX_GPIO(GPIO_PTD3, PTD3_DATA),
+	PINMUX_GPIO(GPIO_PTD2, PTD2_DATA),
+	PINMUX_GPIO(GPIO_PTD1, PTD1_DATA),
+	PINMUX_GPIO(GPIO_PTD0, PTD0_DATA),
+
+	/* PTE */
+	PINMUX_GPIO(GPIO_PTE7, PTE7_DATA),
+	PINMUX_GPIO(GPIO_PTE6, PTE6_DATA),
+	PINMUX_GPIO(GPIO_PTE5, PTE5_DATA),
+	PINMUX_GPIO(GPIO_PTE4, PTE4_DATA),
+	PINMUX_GPIO(GPIO_PTE3, PTE3_DATA),
+	PINMUX_GPIO(GPIO_PTE2, PTE2_DATA),
+	PINMUX_GPIO(GPIO_PTE1, PTE1_DATA),
+	PINMUX_GPIO(GPIO_PTE0, PTE0_DATA),
+
+	/* PTF */
+	PINMUX_GPIO(GPIO_PTF7, PTF7_DATA),
+	PINMUX_GPIO(GPIO_PTF6, PTF6_DATA),
+	PINMUX_GPIO(GPIO_PTF5, PTF5_DATA),
+	PINMUX_GPIO(GPIO_PTF4, PTF4_DATA),
+	PINMUX_GPIO(GPIO_PTF3, PTF3_DATA),
+	PINMUX_GPIO(GPIO_PTF2, PTF2_DATA),
+	PINMUX_GPIO(GPIO_PTF1, PTF1_DATA),
+	PINMUX_GPIO(GPIO_PTF0, PTF0_DATA),
+
+	/* PTG */
+	PINMUX_GPIO(GPIO_PTG5, PTG5_DATA),
+	PINMUX_GPIO(GPIO_PTG4, PTG4_DATA),
+	PINMUX_GPIO(GPIO_PTG3, PTG3_DATA),
+	PINMUX_GPIO(GPIO_PTG2, PTG2_DATA),
+	PINMUX_GPIO(GPIO_PTG1, PTG1_DATA),
+	PINMUX_GPIO(GPIO_PTG0, PTG0_DATA),
+
+	/* PTH */
+	PINMUX_GPIO(GPIO_PTH7, PTH7_DATA),
+	PINMUX_GPIO(GPIO_PTH6, PTH6_DATA),
+	PINMUX_GPIO(GPIO_PTH5, PTH5_DATA),
+	PINMUX_GPIO(GPIO_PTH4, PTH4_DATA),
+	PINMUX_GPIO(GPIO_PTH3, PTH3_DATA),
+	PINMUX_GPIO(GPIO_PTH2, PTH2_DATA),
+	PINMUX_GPIO(GPIO_PTH1, PTH1_DATA),
+	PINMUX_GPIO(GPIO_PTH0, PTH0_DATA),
+
+	/* PTJ */
+	PINMUX_GPIO(GPIO_PTJ7, PTJ7_DATA),
+	PINMUX_GPIO(GPIO_PTJ6, PTJ6_DATA),
+	PINMUX_GPIO(GPIO_PTJ5, PTJ5_DATA),
+	PINMUX_GPIO(GPIO_PTJ3, PTJ3_DATA),
+	PINMUX_GPIO(GPIO_PTJ2, PTJ2_DATA),
+	PINMUX_GPIO(GPIO_PTJ1, PTJ1_DATA),
+	PINMUX_GPIO(GPIO_PTJ0, PTJ0_DATA),
+
+	/* PTK */
+	PINMUX_GPIO(GPIO_PTK7, PTK7_DATA),
+	PINMUX_GPIO(GPIO_PTK6, PTK6_DATA),
+	PINMUX_GPIO(GPIO_PTK5, PTK5_DATA),
+	PINMUX_GPIO(GPIO_PTK4, PTK4_DATA),
+	PINMUX_GPIO(GPIO_PTK3, PTK3_DATA),
+	PINMUX_GPIO(GPIO_PTK2, PTK2_DATA),
+	PINMUX_GPIO(GPIO_PTK1, PTK1_DATA),
+	PINMUX_GPIO(GPIO_PTK0, PTK0_DATA),
+
+	/* PTL */
+	PINMUX_GPIO(GPIO_PTL7, PTL7_DATA),
+	PINMUX_GPIO(GPIO_PTL6, PTL6_DATA),
+	PINMUX_GPIO(GPIO_PTL5, PTL5_DATA),
+	PINMUX_GPIO(GPIO_PTL4, PTL4_DATA),
+	PINMUX_GPIO(GPIO_PTL3, PTL3_DATA),
+	PINMUX_GPIO(GPIO_PTL2, PTL2_DATA),
+	PINMUX_GPIO(GPIO_PTL1, PTL1_DATA),
+	PINMUX_GPIO(GPIO_PTL0, PTL0_DATA),
+
+	/* PTM */
+	PINMUX_GPIO(GPIO_PTM7, PTM7_DATA),
+	PINMUX_GPIO(GPIO_PTM6, PTM6_DATA),
+	PINMUX_GPIO(GPIO_PTM5, PTM5_DATA),
+	PINMUX_GPIO(GPIO_PTM4, PTM4_DATA),
+	PINMUX_GPIO(GPIO_PTM3, PTM3_DATA),
+	PINMUX_GPIO(GPIO_PTM2, PTM2_DATA),
+	PINMUX_GPIO(GPIO_PTM1, PTM1_DATA),
+	PINMUX_GPIO(GPIO_PTM0, PTM0_DATA),
+
+	/* PTN */
+	PINMUX_GPIO(GPIO_PTN7, PTN7_DATA),
+	PINMUX_GPIO(GPIO_PTN6, PTN6_DATA),
+	PINMUX_GPIO(GPIO_PTN5, PTN5_DATA),
+	PINMUX_GPIO(GPIO_PTN4, PTN4_DATA),
+	PINMUX_GPIO(GPIO_PTN3, PTN3_DATA),
+	PINMUX_GPIO(GPIO_PTN2, PTN2_DATA),
+	PINMUX_GPIO(GPIO_PTN1, PTN1_DATA),
+	PINMUX_GPIO(GPIO_PTN0, PTN0_DATA),
+
+	/* PTQ */
+	PINMUX_GPIO(GPIO_PTQ7, PTQ7_DATA),
+	PINMUX_GPIO(GPIO_PTQ6, PTQ6_DATA),
+	PINMUX_GPIO(GPIO_PTQ5, PTQ5_DATA),
+	PINMUX_GPIO(GPIO_PTQ4, PTQ4_DATA),
+	PINMUX_GPIO(GPIO_PTQ3, PTQ3_DATA),
+	PINMUX_GPIO(GPIO_PTQ2, PTQ2_DATA),
+	PINMUX_GPIO(GPIO_PTQ1, PTQ1_DATA),
+	PINMUX_GPIO(GPIO_PTQ0, PTQ0_DATA),
+
+	/* PTR */
+	PINMUX_GPIO(GPIO_PTR7, PTR7_DATA),
+	PINMUX_GPIO(GPIO_PTR6, PTR6_DATA),
+	PINMUX_GPIO(GPIO_PTR5, PTR5_DATA),
+	PINMUX_GPIO(GPIO_PTR4, PTR4_DATA),
+	PINMUX_GPIO(GPIO_PTR3, PTR3_DATA),
+	PINMUX_GPIO(GPIO_PTR2, PTR2_DATA),
+	PINMUX_GPIO(GPIO_PTR1, PTR1_DATA),
+	PINMUX_GPIO(GPIO_PTR0, PTR0_DATA),
+
+	/* PTS */
+	PINMUX_GPIO(GPIO_PTS6, PTS6_DATA),
+	PINMUX_GPIO(GPIO_PTS5, PTS5_DATA),
+	PINMUX_GPIO(GPIO_PTS4, PTS4_DATA),
+	PINMUX_GPIO(GPIO_PTS3, PTS3_DATA),
+	PINMUX_GPIO(GPIO_PTS2, PTS2_DATA),
+	PINMUX_GPIO(GPIO_PTS1, PTS1_DATA),
+	PINMUX_GPIO(GPIO_PTS0, PTS0_DATA),
+
+	/* PTT */
+	PINMUX_GPIO(GPIO_PTT7, PTT7_DATA),
+	PINMUX_GPIO(GPIO_PTT6, PTT6_DATA),
+	PINMUX_GPIO(GPIO_PTT5, PTT5_DATA),
+	PINMUX_GPIO(GPIO_PTT4, PTT4_DATA),
+	PINMUX_GPIO(GPIO_PTT3, PTT3_DATA),
+	PINMUX_GPIO(GPIO_PTT2, PTT2_DATA),
+	PINMUX_GPIO(GPIO_PTT1, PTT1_DATA),
+	PINMUX_GPIO(GPIO_PTT0, PTT0_DATA),
+
+	/* PTU */
+	PINMUX_GPIO(GPIO_PTU7, PTU7_DATA),
+	PINMUX_GPIO(GPIO_PTU6, PTU6_DATA),
+	PINMUX_GPIO(GPIO_PTU5, PTU5_DATA),
+	PINMUX_GPIO(GPIO_PTU4, PTU4_DATA),
+	PINMUX_GPIO(GPIO_PTU3, PTU3_DATA),
+	PINMUX_GPIO(GPIO_PTU2, PTU2_DATA),
+	PINMUX_GPIO(GPIO_PTU1, PTU1_DATA),
+	PINMUX_GPIO(GPIO_PTU0, PTU0_DATA),
+
+	/* PTV */
+	PINMUX_GPIO(GPIO_PTV7, PTV7_DATA),
+	PINMUX_GPIO(GPIO_PTV6, PTV6_DATA),
+	PINMUX_GPIO(GPIO_PTV5, PTV5_DATA),
+	PINMUX_GPIO(GPIO_PTV4, PTV4_DATA),
+	PINMUX_GPIO(GPIO_PTV3, PTV3_DATA),
+	PINMUX_GPIO(GPIO_PTV2, PTV2_DATA),
+	PINMUX_GPIO(GPIO_PTV1, PTV1_DATA),
+	PINMUX_GPIO(GPIO_PTV0, PTV0_DATA),
+
+	/* PTW */
+	PINMUX_GPIO(GPIO_PTW7, PTW7_DATA),
+	PINMUX_GPIO(GPIO_PTW6, PTW6_DATA),
+	PINMUX_GPIO(GPIO_PTW5, PTW5_DATA),
+	PINMUX_GPIO(GPIO_PTW4, PTW4_DATA),
+	PINMUX_GPIO(GPIO_PTW3, PTW3_DATA),
+	PINMUX_GPIO(GPIO_PTW2, PTW2_DATA),
+	PINMUX_GPIO(GPIO_PTW1, PTW1_DATA),
+	PINMUX_GPIO(GPIO_PTW0, PTW0_DATA),
+
+	/* PTX */
+	PINMUX_GPIO(GPIO_PTX7, PTX7_DATA),
+	PINMUX_GPIO(GPIO_PTX6, PTX6_DATA),
+	PINMUX_GPIO(GPIO_PTX5, PTX5_DATA),
+	PINMUX_GPIO(GPIO_PTX4, PTX4_DATA),
+	PINMUX_GPIO(GPIO_PTX3, PTX3_DATA),
+	PINMUX_GPIO(GPIO_PTX2, PTX2_DATA),
+	PINMUX_GPIO(GPIO_PTX1, PTX1_DATA),
+	PINMUX_GPIO(GPIO_PTX0, PTX0_DATA),
+
+	/* PTY */
+	PINMUX_GPIO(GPIO_PTY7, PTY7_DATA),
+	PINMUX_GPIO(GPIO_PTY6, PTY6_DATA),
+	PINMUX_GPIO(GPIO_PTY5, PTY5_DATA),
+	PINMUX_GPIO(GPIO_PTY4, PTY4_DATA),
+	PINMUX_GPIO(GPIO_PTY3, PTY3_DATA),
+	PINMUX_GPIO(GPIO_PTY2, PTY2_DATA),
+	PINMUX_GPIO(GPIO_PTY1, PTY1_DATA),
+	PINMUX_GPIO(GPIO_PTY0, PTY0_DATA),
+
+	/* PTZ */
+	PINMUX_GPIO(GPIO_PTZ7, PTZ7_DATA),
+	PINMUX_GPIO(GPIO_PTZ6, PTZ6_DATA),
+	PINMUX_GPIO(GPIO_PTZ5, PTZ5_DATA),
+	PINMUX_GPIO(GPIO_PTZ4, PTZ4_DATA),
+	PINMUX_GPIO(GPIO_PTZ3, PTZ3_DATA),
+	PINMUX_GPIO(GPIO_PTZ2, PTZ2_DATA),
+	PINMUX_GPIO(GPIO_PTZ1, PTZ1_DATA),
+	PINMUX_GPIO(GPIO_PTZ0, PTZ0_DATA),
+
+	/* BSC */
+	PINMUX_GPIO(GPIO_FN_D31,	D31_MARK),
+	PINMUX_GPIO(GPIO_FN_D30,	D30_MARK),
+	PINMUX_GPIO(GPIO_FN_D29,	D29_MARK),
+	PINMUX_GPIO(GPIO_FN_D28,	D28_MARK),
+	PINMUX_GPIO(GPIO_FN_D27,	D27_MARK),
+	PINMUX_GPIO(GPIO_FN_D26,	D26_MARK),
+	PINMUX_GPIO(GPIO_FN_D25,	D25_MARK),
+	PINMUX_GPIO(GPIO_FN_D24,	D24_MARK),
+	PINMUX_GPIO(GPIO_FN_D23,	D23_MARK),
+	PINMUX_GPIO(GPIO_FN_D22,	D22_MARK),
+	PINMUX_GPIO(GPIO_FN_D21,	D21_MARK),
+	PINMUX_GPIO(GPIO_FN_D20,	D20_MARK),
+	PINMUX_GPIO(GPIO_FN_D19,	D19_MARK),
+	PINMUX_GPIO(GPIO_FN_D18,	D18_MARK),
+	PINMUX_GPIO(GPIO_FN_D17,	D17_MARK),
+	PINMUX_GPIO(GPIO_FN_D16,	D16_MARK),
+	PINMUX_GPIO(GPIO_FN_D15,	D15_MARK),
+	PINMUX_GPIO(GPIO_FN_D14,	D14_MARK),
+	PINMUX_GPIO(GPIO_FN_D13,	D13_MARK),
+	PINMUX_GPIO(GPIO_FN_D12,	D12_MARK),
+	PINMUX_GPIO(GPIO_FN_D11,	D11_MARK),
+	PINMUX_GPIO(GPIO_FN_D10,	D10_MARK),
+	PINMUX_GPIO(GPIO_FN_D9,		D9_MARK),
+	PINMUX_GPIO(GPIO_FN_D8,		D8_MARK),
+	PINMUX_GPIO(GPIO_FN_D7,		D7_MARK),
+	PINMUX_GPIO(GPIO_FN_D6,		D6_MARK),
+	PINMUX_GPIO(GPIO_FN_D5,		D5_MARK),
+	PINMUX_GPIO(GPIO_FN_D4,		D4_MARK),
+	PINMUX_GPIO(GPIO_FN_D3,		D3_MARK),
+	PINMUX_GPIO(GPIO_FN_D2,		D2_MARK),
+	PINMUX_GPIO(GPIO_FN_D1,		D1_MARK),
+	PINMUX_GPIO(GPIO_FN_D0,		D0_MARK),
+	PINMUX_GPIO(GPIO_FN_A25,	A25_MARK),
+	PINMUX_GPIO(GPIO_FN_A24,	A24_MARK),
+	PINMUX_GPIO(GPIO_FN_A23,	A23_MARK),
+	PINMUX_GPIO(GPIO_FN_A22,	A22_MARK),
+	PINMUX_GPIO(GPIO_FN_CS6B_CE1B,	CS6B_CE1B_MARK),
+	PINMUX_GPIO(GPIO_FN_CS6A_CE2B,	CS6A_CE2B_MARK),
+	PINMUX_GPIO(GPIO_FN_CS5B_CE1A,	CS5B_CE1A_MARK),
+	PINMUX_GPIO(GPIO_FN_CS5A_CE2A,	CS5A_CE2A_MARK),
+	PINMUX_GPIO(GPIO_FN_WE3_ICIOWR,	WE3_ICIOWR_MARK),
+	PINMUX_GPIO(GPIO_FN_WE2_ICIORD,	WE2_ICIORD_MARK),
+	PINMUX_GPIO(GPIO_FN_IOIS16,	IOIS16_MARK),
+	PINMUX_GPIO(GPIO_FN_WAIT,	WAIT_MARK),
+	PINMUX_GPIO(GPIO_FN_BS,		BS_MARK),
+
+	/* KEYSC */
+	PINMUX_GPIO(GPIO_FN_KEYOUT5_IN5,	KEYOUT5_IN5_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT4_IN6,	KEYOUT4_IN6_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN4,		KEYIN4_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN3,		KEYIN3_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN2,		KEYIN2_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN1,		KEYIN1_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN0,		KEYIN0_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT3,		KEYOUT3_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT2,		KEYOUT2_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT1,		KEYOUT1_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT0,		KEYOUT0_MARK),
+
+	/* ATAPI */
+	PINMUX_GPIO(GPIO_FN_IDED15,	IDED15_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED14,	IDED14_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED13,	IDED13_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED12,	IDED12_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED11,	IDED11_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED10,	IDED10_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED9,	IDED9_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED8,	IDED8_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED7,	IDED7_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED6,	IDED6_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED5,	IDED5_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED4,	IDED4_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED3,	IDED3_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED2,	IDED2_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED1,	IDED1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED0,	IDED0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA2,	IDEA2_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA1,	IDEA1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA0,	IDEA0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIOWR,	IDEIOWR_MARK),
+	PINMUX_GPIO(GPIO_FN_IODREQ,	IODREQ_MARK),
+	PINMUX_GPIO(GPIO_FN_IDECS0,	IDECS0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDECS1,	IDECS1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIORD,	IDEIORD_MARK),
+	PINMUX_GPIO(GPIO_FN_DIRECTION,	DIRECTION_MARK),
+	PINMUX_GPIO(GPIO_FN_EXBUF_ENB,	EXBUF_ENB_MARK),
+	PINMUX_GPIO(GPIO_FN_IDERST,	IDERST_MARK),
+	PINMUX_GPIO(GPIO_FN_IODACK,	IODACK_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEINT,	IDEINT_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIORDY,	IDEIORDY_MARK),
+
+	/* TPU */
+	PINMUX_GPIO(GPIO_FN_TPUTO3,	TPUTO3_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO2,	TPUTO2_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO1,	TPUTO1_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO0,	TPUTO0_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTI3,	TPUTI3_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTI2,	TPUTI2_MARK),
+
+	/* LCDC */
+	PINMUX_GPIO(GPIO_FN_LCDD23,	LCDD23_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD22,	LCDD22_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD21,	LCDD21_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD20,	LCDD20_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD19,	LCDD19_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD18,	LCDD18_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD17,	LCDD17_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD16,	LCDD16_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD15,	LCDD15_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD14,	LCDD14_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD13,	LCDD13_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD12,	LCDD12_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD11,	LCDD11_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD10,	LCDD10_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD9,	LCDD9_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD8,	LCDD8_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD7,	LCDD7_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD6,	LCDD6_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD5,	LCDD5_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD4,	LCDD4_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD3,	LCDD3_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD2,	LCDD2_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD1,	LCDD1_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD0,	LCDD0_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVSYN,	LCDVSYN_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDISP,	LCDDISP_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDRS,	LCDRS_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDHSYN,	LCDHSYN_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDCS,	LCDCS_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDON,	LCDDON_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDCK,	LCDDCK_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDWR,	LCDWR_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVEPWC,	LCDVEPWC_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVCPWC,	LCDVCPWC_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDRD,	LCDRD_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDLCLK,	LCDLCLK_MARK),
+
+	/* SCIF0 */
+	PINMUX_GPIO(GPIO_FN_SCIF0_TXD,	SCIF0_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF0_RXD,	SCIF0_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF0_SCK,	SCIF0_SCK_MARK),
+
+	/* SCIF1 */
+	PINMUX_GPIO(GPIO_FN_SCIF1_SCK,	SCIF1_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF1_RXD,	SCIF1_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF1_TXD,	SCIF1_TXD_MARK),
+
+	/* SCIF2 */
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_TXD,	SCIF2_L_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_SCK,	SCIF2_L_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_RXD,	SCIF2_L_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_TXD,	SCIF2_V_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_SCK,	SCIF2_V_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_RXD,	SCIF2_V_RXD_MARK),
+
+	/* SCIF3 */
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_SCK,	SCIF3_V_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_RXD,	SCIF3_V_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_TXD,	SCIF3_V_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_CTS,	SCIF3_V_CTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_RTS,	SCIF3_V_RTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_SCK,	SCIF3_I_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_RXD,	SCIF3_I_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_TXD,	SCIF3_I_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_CTS,	SCIF3_I_CTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_RTS,	SCIF3_I_RTS_MARK),
+
+	/* SCIF4 */
+	PINMUX_GPIO(GPIO_FN_SCIF4_SCK,	SCIF4_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF4_RXD,	SCIF4_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF4_TXD,	SCIF4_TXD_MARK),
+
+	/* SCIF5 */
+	PINMUX_GPIO(GPIO_FN_SCIF5_SCK,	SCIF5_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF5_RXD,	SCIF5_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF5_TXD,	SCIF5_TXD_MARK),
+
+	/* FSI */
+	PINMUX_GPIO(GPIO_FN_FSIMCKB,	FSIMCKB_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIMCKA,	FSIMCKA_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOASD,	FSIOASD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIABCK,	FSIIABCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIALRCK,	FSIIALRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOABCK,	FSIOABCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOALRCK,	FSIOALRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_CLKAUDIOAO,	CLKAUDIOAO_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBSD,	FSIIBSD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBSD,	FSIOBSD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBBCK,	FSIIBBCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBLRCK,	FSIIBLRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBBCK,	FSIOBBCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBLRCK,	FSIOBLRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_CLKAUDIOBO,	CLKAUDIOBO_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIASD,	FSIIASD_MARK),
+
+	/* AUD */
+	PINMUX_GPIO(GPIO_FN_AUDCK,	AUDCK_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDSYNC,	AUDSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA3,	AUDATA3_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA2,	AUDATA2_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA1,	AUDATA1_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA0,	AUDATA0_MARK),
+
+	/* VIO */
+	PINMUX_GPIO(GPIO_FN_VIO_CKO,	VIO_CKO_MARK),
+
+	/* VIO0 */
+	PINMUX_GPIO(GPIO_FN_VIO0_D15,	VIO0_D15_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D14,	VIO0_D14_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D13,	VIO0_D13_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D12,	VIO0_D12_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D11,	VIO0_D11_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D10,	VIO0_D10_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D9,	VIO0_D9_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D8,	VIO0_D8_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D7,	VIO0_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D6,	VIO0_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D5,	VIO0_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D4,	VIO0_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D3,	VIO0_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D2,	VIO0_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D1,	VIO0_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D0,	VIO0_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_VD,	VIO0_VD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_CLK,	VIO0_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_FLD,	VIO0_FLD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_HD,	VIO0_HD_MARK),
+
+	/* VIO1 */
+	PINMUX_GPIO(GPIO_FN_VIO1_D7,	VIO1_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D6,	VIO1_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D5,	VIO1_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D4,	VIO1_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D3,	VIO1_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D2,	VIO1_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D1,	VIO1_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D0,	VIO1_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_FLD,	VIO1_FLD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_HD,	VIO1_HD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_VD,	VIO1_VD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_CLK,	VIO1_CLK_MARK),
+
+	/* Eth */
+	PINMUX_GPIO(GPIO_FN_RMII_RXD0,		RMII_RXD0_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_RXD1,		RMII_RXD1_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TXD0,		RMII_TXD0_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TXD1,		RMII_TXD1_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_REF_CLK,	RMII_REF_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TX_EN,		RMII_TX_EN_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_RX_ER,		RMII_RX_ER_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_CRS_DV,	RMII_CRS_DV_MARK),
+	PINMUX_GPIO(GPIO_FN_LNKSTA,		LNKSTA_MARK),
+	PINMUX_GPIO(GPIO_FN_MDIO,		MDIO_MARK),
+	PINMUX_GPIO(GPIO_FN_MDC,		MDC_MARK),
+
+	/* System */
+	PINMUX_GPIO(GPIO_FN_PDSTATUS,	PDSTATUS_MARK),
+	PINMUX_GPIO(GPIO_FN_STATUS2,	STATUS2_MARK),
+	PINMUX_GPIO(GPIO_FN_STATUS0,	STATUS0_MARK),
+
+	/* VOU */
+	PINMUX_GPIO(GPIO_FN_DV_D15,	DV_D15_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D14,	DV_D14_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D13,	DV_D13_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D12,	DV_D12_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D11,	DV_D11_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D10,	DV_D10_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D9,	DV_D9_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D8,	DV_D8_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D7,	DV_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D6,	DV_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D5,	DV_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D4,	DV_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D3,	DV_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D2,	DV_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D1,	DV_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D0,	DV_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_CLKI,	DV_CLKI_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_CLK,	DV_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_VSYNC,	DV_VSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_HSYNC,	DV_HSYNC_MARK),
+
+	/* MSIOF0 */
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RXD,		MSIOF0_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TXD,		MSIOF0_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_MCK,		MSIOF0_MCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TSCK,	MSIOF0_TSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_SS1,		MSIOF0_SS1_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_SS2,		MSIOF0_SS2_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TSYNC,	MSIOF0_TSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RSCK,	MSIOF0_RSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RSYNC,	MSIOF0_RSYNC_MARK),
+
+	/* MSIOF1 */
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RXD,		MSIOF1_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TXD,		MSIOF1_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_MCK,		MSIOF1_MCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TSCK,	MSIOF1_TSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_SS1,		MSIOF1_SS1_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_SS2,		MSIOF1_SS2_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TSYNC,	MSIOF1_TSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RSCK,	MSIOF1_RSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RSYNC,	MSIOF1_RSYNC_MARK),
+
+	/* DMAC */
+	PINMUX_GPIO(GPIO_FN_DMAC_DACK0,	DMAC_DACK0_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DREQ0,	DMAC_DREQ0_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DACK1,	DMAC_DACK1_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DREQ1,	DMAC_DREQ1_MARK),
+
+	/* SDHI0 */
+	PINMUX_GPIO(GPIO_FN_SDHI0CD,	SDHI0CD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0WP,	SDHI0WP_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0CMD,	SDHI0CMD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0CLK,	SDHI0CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D3,	SDHI0D3_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D2,	SDHI0D2_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D1,	SDHI0D1_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D0,	SDHI0D0_MARK),
+
+	/* SDHI1 */
+	PINMUX_GPIO(GPIO_FN_SDHI1CD,	SDHI1CD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1WP,	SDHI1WP_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1CMD,	SDHI1CMD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1CLK,	SDHI1CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D3,	SDHI1D3_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D2,	SDHI1D2_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D1,	SDHI1D1_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D0,	SDHI1D0_MARK),
+
+	/* MMC */
+	PINMUX_GPIO(GPIO_FN_MMC_D7,	MMC_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D6,	MMC_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D5,	MMC_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D4,	MMC_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D3,	MMC_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D2,	MMC_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D1,	MMC_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D0,	MMC_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_CLK,	MMC_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_CMD,	MMC_CMD_MARK),
+
+	/* IrDA */
+	PINMUX_GPIO(GPIO_FN_IRDA_OUT,	IRDA_OUT_MARK),
+	PINMUX_GPIO(GPIO_FN_IRDA_IN,	IRDA_IN_MARK),
+
+	/* TSIF */
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDAT,	TSIF_TS0_SDAT_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SCK,	TSIF_TS0_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDEN,	TSIF_TS0_SDEN_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SPSYNC,	TSIF_TS0_SPSYNC_MARK),
+
+	/* IRQ */
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ7,	INTC_IRQ7_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ6,	INTC_IRQ6_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ5,	INTC_IRQ5_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ4,	INTC_IRQ4_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ3,	INTC_IRQ3_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ2,	INTC_IRQ2_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ1,	INTC_IRQ1_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ0,	INTC_IRQ0_MARK),
+ };
+
+static struct pinmux_cfg_reg pinmux_config_regs[] = {
+	{ PINMUX_CFG_REG("PACR", 0xa4050100, 16, 2) {
+		PTA7_FN, PTA7_OUT, PTA7_IN_PU, PTA7_IN,
+		PTA6_FN, PTA6_OUT, PTA6_IN_PU, PTA6_IN,
+		PTA5_FN, PTA5_OUT, PTA5_IN_PU, PTA5_IN,
+		PTA4_FN, PTA4_OUT, PTA4_IN_PU, PTA4_IN,
+		PTA3_FN, PTA3_OUT, PTA3_IN_PU, PTA3_IN,
+		PTA2_FN, PTA2_OUT, PTA2_IN_PU, PTA2_IN,
+		PTA1_FN, PTA1_OUT, PTA1_IN_PU, PTA1_IN,
+		PTA0_FN, PTA0_OUT, PTA0_IN_PU, PTA0_IN }
+	},
+	{ PINMUX_CFG_REG("PBCR", 0xa4050102, 16, 2) {
+		PTB7_FN, PTB7_OUT, PTB7_IN_PU, PTB7_IN,
+		PTB6_FN, PTB6_OUT, PTB6_IN_PU, PTB6_IN,
+		PTB5_FN, PTB5_OUT, PTB5_IN_PU, PTB5_IN,
+		PTB4_FN, PTB4_OUT, PTB4_IN_PU, PTB4_IN,
+		PTB3_FN, PTB3_OUT, PTB3_IN_PU, PTB3_IN,
+		PTB2_FN, PTB2_OUT, PTB2_IN_PU, PTB2_IN,
+		PTB1_FN, PTB1_OUT, PTB1_IN_PU, PTB1_IN,
+		PTB0_FN, PTB0_OUT, PTB0_IN_PU, PTB0_IN }
+	},
+	{ PINMUX_CFG_REG("PCCR", 0xa4050104, 16, 2) {
+		PTC7_FN, PTC7_OUT, PTC7_IN_PU, PTC7_IN,
+		PTC6_FN, PTC6_OUT, PTC6_IN_PU, PTC6_IN,
+		PTC5_FN, PTC5_OUT, PTC5_IN_PU, PTC5_IN,
+		PTC4_FN, PTC4_OUT, PTC4_IN_PU, PTC4_IN,
+		PTC3_FN, PTC3_OUT, PTC3_IN_PU, PTC3_IN,
+		PTC2_FN, PTC2_OUT, PTC2_IN_PU, PTC2_IN,
+		PTC1_FN, PTC1_OUT, PTC1_IN_PU, PTC1_IN,
+		PTC0_FN, PTC0_OUT, PTC0_IN_PU, PTC0_IN }
+	},
+	{ PINMUX_CFG_REG("PDCR", 0xa4050106, 16, 2) {
+		PTD7_FN, PTD7_OUT, PTD7_IN_PU, PTD7_IN,
+		PTD6_FN, PTD6_OUT, PTD6_IN_PU, PTD6_IN,
+		PTD5_FN, PTD5_OUT, PTD5_IN_PU, PTD5_IN,
+		PTD4_FN, PTD4_OUT, PTD4_IN_PU, PTD4_IN,
+		PTD3_FN, PTD3_OUT, PTD3_IN_PU, PTD3_IN,
+		PTD2_FN, PTD2_OUT, PTD2_IN_PU, PTD2_IN,
+		PTD1_FN, PTD1_OUT, PTD1_IN_PU, PTD1_IN,
+		PTD0_FN, PTD0_OUT, PTD0_IN_PU, PTD0_IN }
+	},
+	{ PINMUX_CFG_REG("PECR", 0xa4050108, 16, 2) {
+		PTE7_FN, PTE7_OUT, PTE7_IN_PU, PTE7_IN,
+		PTE6_FN, PTE6_OUT, PTE6_IN_PU, PTE6_IN,
+		PTE5_FN, PTE5_OUT, PTE5_IN_PU, PTE5_IN,
+		PTE4_FN, PTE4_OUT, PTE4_IN_PU, PTE4_IN,
+		PTE3_FN, PTE3_OUT, PTE3_IN_PU, PTE3_IN,
+		PTE2_FN, PTE2_OUT, PTE2_IN_PU, PTE2_IN,
+		PTE1_FN, PTE1_OUT, PTE1_IN_PU, PTE1_IN,
+		PTE0_FN, PTE0_OUT, PTE0_IN_PU, PTE0_IN }
+	},
+	{ PINMUX_CFG_REG("PFCR", 0xa405010a, 16, 2) {
+		PTF7_FN, PTF7_OUT, PTF7_IN_PU, PTF7_IN,
+		PTF6_FN, PTF6_OUT, PTF6_IN_PU, PTF6_IN,
+		PTF5_FN, PTF5_OUT, PTF5_IN_PU, PTF5_IN,
+		PTF4_FN, PTF4_OUT, PTF4_IN_PU, PTF4_IN,
+		PTF3_FN, PTF3_OUT, PTF3_IN_PU, PTF3_IN,
+		PTF2_FN, PTF2_OUT, PTF2_IN_PU, PTF2_IN,
+		PTF1_FN, PTF1_OUT, PTF1_IN_PU, PTF1_IN,
+		PTF0_FN, PTF0_OUT, PTF0_IN_PU, PTF0_IN }
+	},
+	{ PINMUX_CFG_REG("PGCR", 0xa405010c, 16, 2) {
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		PTG5_FN, PTG5_OUT, 0, 0,
+		PTG4_FN, PTG4_OUT, 0, 0,
+		PTG3_FN, PTG3_OUT, 0, 0,
+		PTG2_FN, PTG2_OUT, 0, 0,
+		PTG1_FN, PTG1_OUT, 0, 0,
+		PTG0_FN, PTG0_OUT, 0, 0 }
+	},
+	{ PINMUX_CFG_REG("PHCR", 0xa405010e, 16, 2) {
+		PTH7_FN, PTH7_OUT, PTH7_IN_PU, PTH7_IN,
+		PTH6_FN, PTH6_OUT, PTH6_IN_PU, PTH6_IN,
+		PTH5_FN, PTH5_OUT, PTH5_IN_PU, PTH5_IN,
+		PTH4_FN, PTH4_OUT, PTH4_IN_PU, PTH4_IN,
+		PTH3_FN, PTH3_OUT, PTH3_IN_PU, PTH3_IN,
+		PTH2_FN, PTH2_OUT, PTH2_IN_PU, PTH2_IN,
+		PTH1_FN, PTH1_OUT, PTH1_IN_PU, PTH1_IN,
+		PTH0_FN, PTH0_OUT, PTH0_IN_PU, PTH0_IN }
+	},
+	{ PINMUX_CFG_REG("PJCR", 0xa4050110, 16, 2) {
+		PTJ7_FN, PTJ7_OUT, 0, 0,
+		PTJ6_FN, PTJ6_OUT, 0, 0,
+		PTJ5_FN, PTJ5_OUT, 0, 0,
+		0, 0, 0, 0,
+		PTJ3_FN, PTJ3_OUT, PTJ3_IN_PU, PTJ3_IN,
+		PTJ2_FN, PTJ2_OUT, PTJ2_IN_PU, PTJ2_IN,
+		PTJ1_FN, PTJ1_OUT, PTJ1_IN_PU, PTJ1_IN,
+		PTJ0_FN, PTJ0_OUT, PTJ0_IN_PU, PTJ0_IN }
+	},
+	{ PINMUX_CFG_REG("PKCR", 0xa4050112, 16, 2) {
+		PTK7_FN, PTK7_OUT, PTK7_IN_PU, PTK7_IN,
+		PTK6_FN, PTK6_OUT, PTK6_IN_PU, PTK6_IN,
+		PTK5_FN, PTK5_OUT, PTK5_IN_PU, PTK5_IN,
+		PTK4_FN, PTK4_OUT, PTK4_IN_PU, PTK4_IN,
+		PTK3_FN, PTK3_OUT, PTK3_IN_PU, PTK3_IN,
+		PTK2_FN, PTK2_OUT, PTK2_IN_PU, PTK2_IN,
+		PTK1_FN, PTK1_OUT, PTK1_IN_PU, PTK1_IN,
+		PTK0_FN, PTK0_OUT, PTK0_IN_PU, PTK0_IN }
+	},
+	{ PINMUX_CFG_REG("PLCR", 0xa4050114, 16, 2) {
+		PTL7_FN, PTL7_OUT, PTL7_IN_PU, PTL7_IN,
+		PTL6_FN, PTL6_OUT, PTL6_IN_PU, PTL6_IN,
+		PTL5_FN, PTL5_OUT, PTL5_IN_PU, PTL5_IN,
+		PTL4_FN, PTL4_OUT, PTL4_IN_PU, PTL4_IN,
+		PTL3_FN, PTL3_OUT, PTL3_IN_PU, PTL3_IN,
+		PTL2_FN, PTL2_OUT, PTL2_IN_PU, PTL2_IN,
+		PTL1_FN, PTL1_OUT, PTL1_IN_PU, PTL1_IN,
+		PTL0_FN, PTL0_OUT, PTL0_IN_PU, PTL0_IN }
+	},
+	{ PINMUX_CFG_REG("PMCR", 0xa4050116, 16, 2) {
+		PTM7_FN, PTM7_OUT, PTM7_IN_PU, PTM7_IN,
+		PTM6_FN, PTM6_OUT, PTM6_IN_PU, PTM6_IN,
+		PTM5_FN, PTM5_OUT, PTM5_IN_PU, PTM5_IN,
+		PTM4_FN, PTM4_OUT, PTM4_IN_PU, PTM4_IN,
+		PTM3_FN, PTM3_OUT, PTM3_IN_PU, PTM3_IN,
+		PTM2_FN, PTM2_OUT, PTM2_IN_PU, PTM2_IN,
+		PTM1_FN, PTM1_OUT, PTM1_IN_PU, PTM1_IN,
+		PTM0_FN, PTM0_OUT, PTM0_IN_PU, PTM0_IN }
+	},
+	{ PINMUX_CFG_REG("PNCR", 0xa4050118, 16, 2) {
+		PTN7_FN, PTN7_OUT, PTN7_IN_PU, PTN7_IN,
+		PTN6_FN, PTN6_OUT, PTN6_IN_PU, PTN6_IN,
+		PTN5_FN, PTN5_OUT, PTN5_IN_PU, PTN5_IN,
+		PTN4_FN, PTN4_OUT, PTN4_IN_PU, PTN4_IN,
+		PTN3_FN, PTN3_OUT, PTN3_IN_PU, PTN3_IN,
+		PTN2_FN, PTN2_OUT, PTN2_IN_PU, PTN2_IN,
+		PTN1_FN, PTN1_OUT, PTN1_IN_PU, PTN1_IN,
+		PTN0_FN, PTN0_OUT, PTN0_IN_PU, PTN0_IN }
+	},
+	{ PINMUX_CFG_REG("PQCR", 0xa405011a, 16, 2) {
+		PTQ7_FN, PTQ7_OUT, PTQ7_IN_PU, PTQ7_IN,
+		PTQ6_FN, PTQ6_OUT, PTQ6_IN_PU, PTQ6_IN,
+		PTQ5_FN, PTQ5_OUT, PTQ5_IN_PU, PTQ5_IN,
+		PTQ4_FN, PTQ4_OUT, PTQ4_IN_PU, PTQ4_IN,
+		PTQ3_FN, PTQ3_OUT, PTQ3_IN_PU, PTQ3_IN,
+		PTQ2_FN, PTQ2_OUT, PTQ2_IN_PU, PTQ2_IN,
+		PTQ1_FN, PTQ1_OUT, PTQ1_IN_PU, PTQ1_IN,
+		PTQ0_FN, PTQ0_OUT, PTQ0_IN_PU, PTQ0_IN }
+	},
+	{ PINMUX_CFG_REG("PRCR", 0xa405011c, 16, 2) {
+		PTR7_FN, PTR7_OUT, PTR7_IN_PU, PTR7_IN,
+		PTR6_FN, PTR6_OUT, PTR6_IN_PU, PTR6_IN,
+		PTR5_FN, PTR5_OUT, PTR5_IN_PU, PTR5_IN,
+		PTR4_FN, PTR4_OUT, PTR4_IN_PU, PTR4_IN,
+		PTR3_FN, 0,        PTR3_IN_PU, PTR3_IN,
+		PTR2_FN, 0,        PTR2_IN_PU, PTR2_IN,
+		PTR1_FN, PTR1_OUT, PTR1_IN_PU, PTR1_IN,
+		PTR0_FN, PTR0_OUT, PTR0_IN_PU, PTR0_IN }
+	},
+	{ PINMUX_CFG_REG("PSCR", 0xa405011e, 16, 2) {
+		0, 0, 0, 0,
+		PTS6_FN, PTS6_OUT, PTS6_IN_PU, PTS6_IN,
+		PTS5_FN, PTS5_OUT, PTS5_IN_PU, PTS5_IN,
+		PTS4_FN, PTS4_OUT, PTS4_IN_PU, PTS4_IN,
+		PTS3_FN, PTS3_OUT, PTS3_IN_PU, PTS3_IN,
+		PTS2_FN, PTS2_OUT, PTS2_IN_PU, PTS2_IN,
+		PTS1_FN, PTS1_OUT, PTS1_IN_PU, PTS1_IN,
+		PTS0_FN, PTS0_OUT, PTS0_IN_PU, PTS0_IN }
+	},
+	{ PINMUX_CFG_REG("PTCR", 0xa4050140, 16, 2) {
+		PTT7_FN, PTT7_OUT, PTT7_IN_PU, PTT7_IN,
+		PTT6_FN, PTT6_OUT, PTT6_IN_PU, PTT6_IN,
+		PTT5_FN, PTT5_OUT, PTT5_IN_PU, PTT5_IN,
+		PTT4_FN, PTT4_OUT, PTT4_IN_PU, PTT4_IN,
+		PTT3_FN, PTT3_OUT, PTT3_IN_PU, PTT3_IN,
+		PTT2_FN, PTT2_OUT, PTT2_IN_PU, PTT2_IN,
+		PTT1_FN, PTT1_OUT, PTT1_IN_PU, PTT1_IN,
+		PTT0_FN, PTT0_OUT, PTT0_IN_PU, PTT0_IN }
+	},
+	{ PINMUX_CFG_REG("PUCR", 0xa4050142, 16, 2) {
+		PTU7_FN, PTU7_OUT, PTU7_IN_PU, PTU7_IN,
+		PTU6_FN, PTU6_OUT, PTU6_IN_PU, PTU6_IN,
+		PTU5_FN, PTU5_OUT, PTU5_IN_PU, PTU5_IN,
+		PTU4_FN, PTU4_OUT, PTU4_IN_PU, PTU4_IN,
+		PTU3_FN, PTU3_OUT, PTU3_IN_PU, PTU3_IN,
+		PTU2_FN, PTU2_OUT, PTU2_IN_PU, PTU2_IN,
+		PTU1_FN, PTU1_OUT, PTU1_IN_PU, PTU1_IN,
+		PTU0_FN, PTU0_OUT, PTU0_IN_PU, PTU0_IN }
+	},
+	{ PINMUX_CFG_REG("PVCR", 0xa4050144, 16, 2) {
+		PTV7_FN, PTV7_OUT, PTV7_IN_PU, PTV7_IN,
+		PTV6_FN, PTV6_OUT, PTV6_IN_PU, PTV6_IN,
+		PTV5_FN, PTV5_OUT, PTV5_IN_PU, PTV5_IN,
+		PTV4_FN, PTV4_OUT, PTV4_IN_PU, PTV4_IN,
+		PTV3_FN, PTV3_OUT, PTV3_IN_PU, PTV3_IN,
+		PTV2_FN, PTV2_OUT, PTV2_IN_PU, PTV2_IN,
+		PTV1_FN, PTV1_OUT, PTV1_IN_PU, PTV1_IN,
+		PTV0_FN, PTV0_OUT, PTV0_IN_PU, PTV0_IN }
+	},
+	{ PINMUX_CFG_REG("PWCR", 0xa4050146, 16, 2) {
+		PTW7_FN, PTW7_OUT, PTW7_IN_PU, PTW7_IN,
+		PTW6_FN, PTW6_OUT, PTW6_IN_PU, PTW6_IN,
+		PTW5_FN, PTW5_OUT, PTW5_IN_PU, PTW5_IN,
+		PTW4_FN, PTW4_OUT, PTW4_IN_PU, PTW4_IN,
+		PTW3_FN, PTW3_OUT, PTW3_IN_PU, PTW3_IN,
+		PTW2_FN, PTW2_OUT, PTW2_IN_PU, PTW2_IN,
+		PTW1_FN, PTW1_OUT, PTW1_IN_PU, PTW1_IN,
+		PTW0_FN, PTW0_OUT, PTW0_IN_PU, PTW0_IN }
+	},
+	{ PINMUX_CFG_REG("PXCR", 0xa4050148, 16, 2) {
+		PTX7_FN, PTX7_OUT, PTX7_IN_PU, PTX7_IN,
+		PTX6_FN, PTX6_OUT, PTX6_IN_PU, PTX6_IN,
+		PTX5_FN, PTX5_OUT, PTX5_IN_PU, PTX5_IN,
+		PTX4_FN, PTX4_OUT, PTX4_IN_PU, PTX4_IN,
+		PTX3_FN, PTX3_OUT, PTX3_IN_PU, PTX3_IN,
+		PTX2_FN, PTX2_OUT, PTX2_IN_PU, PTX2_IN,
+		PTX1_FN, PTX1_OUT, PTX1_IN_PU, PTX1_IN,
+		PTX0_FN, PTX0_OUT, PTX0_IN_PU, PTX0_IN }
+	},
+	{ PINMUX_CFG_REG("PYCR", 0xa405014a, 16, 2) {
+		PTY7_FN, PTY7_OUT, PTY7_IN_PU, PTY7_IN,
+		PTY6_FN, PTY6_OUT, PTY6_IN_PU, PTY6_IN,
+		PTY5_FN, PTY5_OUT, PTY5_IN_PU, PTY5_IN,
+		PTY4_FN, PTY4_OUT, PTY4_IN_PU, PTY4_IN,
+		PTY3_FN, PTY3_OUT, PTY3_IN_PU, PTY3_IN,
+		PTY2_FN, PTY2_OUT, PTY2_IN_PU, PTY2_IN,
+		PTY1_FN, PTY1_OUT, PTY1_IN_PU, PTY1_IN,
+		PTY0_FN, PTY0_OUT, PTY0_IN_PU, PTY0_IN }
+	},
+	{ PINMUX_CFG_REG("PZCR", 0xa405014c, 16, 2) {
+		PTZ7_FN, PTZ7_OUT, PTZ7_IN_PU, PTZ7_IN,
+		PTZ6_FN, PTZ6_OUT, PTZ6_IN_PU, PTZ6_IN,
+		PTZ5_FN, PTZ5_OUT, PTZ5_IN_PU, PTZ5_IN,
+		PTZ4_FN, PTZ4_OUT, PTZ4_IN_PU, PTZ4_IN,
+		PTZ3_FN, PTZ3_OUT, PTZ3_IN_PU, PTZ3_IN,
+		PTZ2_FN, PTZ2_OUT, PTZ2_IN_PU, PTZ2_IN,
+		PTZ1_FN, PTZ1_OUT, PTZ1_IN_PU, PTZ1_IN,
+		PTZ0_FN, PTZ0_OUT, PTZ0_IN_PU, PTZ0_IN }
+	},
+	{ PINMUX_CFG_REG("PSELA", 0xa405014e, 16, 1) {
+		PSA15_0, PSA15_1,
+		PSA14_0, PSA14_1,
+		PSA13_0, PSA13_1,
+		PSA12_0, PSA12_1,
+		0, 0,
+		PSA10_0, PSA10_1,
+		PSA9_0,  PSA9_1,
+		PSA8_0,  PSA8_1,
+		PSA7_0,  PSA7_1,
+		PSA6_0,  PSA6_1,
+		PSA5_0,  PSA5_1,
+		0, 0,
+		PSA3_0,  PSA3_1,
+		PSA2_0,  PSA2_1,
+		PSA1_0,  PSA1_1,
+		PSA0_0,  PSA0_1}
+	},
+	{ PINMUX_CFG_REG("PSELB", 0xa4050150, 16, 1) {
+		0, 0,
+		PSB14_0, PSB14_1,
+		PSB13_0, PSB13_1,
+		PSB12_0, PSB12_1,
+		PSB11_0, PSB11_1,
+		PSB10_0, PSB10_1,
+		PSB9_0,  PSB9_1,
+		PSB8_0,  PSB8_1,
+		PSB7_0,  PSB7_1,
+		PSB6_0,  PSB6_1,
+		PSB5_0,  PSB5_1,
+		PSB4_0,  PSB4_1,
+		PSB3_0,  PSB3_1,
+		PSB2_0,  PSB2_1,
+		PSB1_0,  PSB1_1,
+		PSB0_0,  PSB0_1}
+	},
+	{ PINMUX_CFG_REG("PSELC", 0xa4050152, 16, 1) {
+		PSC15_0, PSC15_1,
+		PSC14_0, PSC14_1,
+		PSC13_0, PSC13_1,
+		PSC12_0, PSC12_1,
+		PSC11_0, PSC11_1,
+		PSC10_0, PSC10_1,
+		PSC9_0,  PSC9_1,
+		PSC8_0,  PSC8_1,
+		PSC7_0,  PSC7_1,
+		PSC6_0,  PSC6_1,
+		PSC5_0,  PSC5_1,
+		PSC4_0,  PSC4_1,
+		0, 0,
+		PSC2_0,  PSC2_1,
+		PSC1_0,  PSC1_1,
+		PSC0_0,  PSC0_1}
+	},
+	{ PINMUX_CFG_REG("PSELD", 0xa4050154, 16, 1) {
+		PSD15_0, PSD15_1,
+		PSD14_0, PSD14_1,
+		PSD13_0, PSD13_1,
+		PSD12_0, PSD12_1,
+		PSD11_0, PSD11_1,
+		PSD10_0, PSD10_1,
+		PSD9_0,  PSD9_1,
+		PSD8_0,  PSD8_1,
+		PSD7_0,  PSD7_1,
+		PSD6_0,  PSD6_1,
+		PSD5_0,  PSD5_1,
+		PSD4_0,  PSD4_1,
+		PSD3_0,  PSD3_1,
+		PSD2_0,  PSD2_1,
+		PSD1_0,  PSD1_1,
+		PSD0_0,  PSD0_1}
+	},
+	{ PINMUX_CFG_REG("PSELE", 0xa4050156, 16, 1) {
+		PSE15_0, PSE15_1,
+		PSE14_0, PSE14_1,
+		PSE13_0, PSE13_1,
+		PSE12_0, PSE12_1,
+		PSE11_0, PSE11_1,
+		PSE10_0, PSE10_1,
+		PSE9_0,  PSE9_1,
+		PSE8_0,  PSE8_1,
+		PSE7_0,  PSE7_1,
+		PSE6_0,  PSE6_1,
+		PSE5_0,  PSE5_1,
+		PSE4_0,  PSE4_1,
+		PSE3_0,  PSE3_1,
+		PSE2_0,  PSE2_1,
+		PSE1_0,  PSE1_1,
+		PSE0_0,  PSE0_1}
+	},
+	{}
+};
+
+static struct pinmux_data_reg pinmux_data_regs[] = {
+	{ PINMUX_DATA_REG("PADR", 0xa4050120, 8) {
+		PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA,
+		PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA }
+	},
+	{ PINMUX_DATA_REG("PBDR", 0xa4050122, 8) {
+		PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA,
+		PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA }
+	},
+	{ PINMUX_DATA_REG("PCDR", 0xa4050124, 8) {
+		PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA,
+		PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA }
+	},
+	{ PINMUX_DATA_REG("PDDR", 0xa4050126, 8) {
+		PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA,
+		PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA }
+	},
+	{ PINMUX_DATA_REG("PEDR", 0xa4050128, 8) {
+		PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA,
+		PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA }
+	},
+	{ PINMUX_DATA_REG("PFDR", 0xa405012a, 8) {
+		PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA,
+		PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA }
+	},
+	{ PINMUX_DATA_REG("PGDR", 0xa405012c, 8) {
+		0,         0,         PTG5_DATA, PTG4_DATA,
+		PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA }
+	},
+	{ PINMUX_DATA_REG("PHDR", 0xa405012e, 8) {
+		PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA,
+		PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA }
+	},
+	{ PINMUX_DATA_REG("PJDR", 0xa4050130, 8) {
+		PTJ7_DATA, PTJ6_DATA, PTJ5_DATA, 0,
+		PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA }
+	},
+	{ PINMUX_DATA_REG("PKDR", 0xa4050132, 8) {
+		PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA,
+		PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA }
+	},
+	{ PINMUX_DATA_REG("PLDR", 0xa4050134, 8) {
+		PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA,
+		PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA }
+	},
+	{ PINMUX_DATA_REG("PMDR", 0xa4050136, 8) {
+		PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA,
+		PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA }
+	},
+	{ PINMUX_DATA_REG("PNDR", 0xa4050138, 8) {
+		PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA,
+		PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA }
+	},
+	{ PINMUX_DATA_REG("PQDR", 0xa405013a, 8) {
+		PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA,
+		PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA }
+	},
+	{ PINMUX_DATA_REG("PRDR", 0xa405013c, 8) {
+		PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA,
+		PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA }
+	},
+	{ PINMUX_DATA_REG("PSDR", 0xa405013e, 8) {
+		0,         PTS6_DATA, PTS5_DATA, PTS4_DATA,
+		PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA }
+	},
+	{ PINMUX_DATA_REG("PTDR", 0xa4050160, 8) {
+		PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA,
+		PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA }
+	},
+	{ PINMUX_DATA_REG("PUDR", 0xa4050162, 8) {
+		PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA,
+		PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA }
+	},
+	{ PINMUX_DATA_REG("PVDR", 0xa4050164, 8) {
+		PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA,
+		PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA }
+	},
+	{ PINMUX_DATA_REG("PWDR", 0xa4050166, 8) {
+		PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA,
+		PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA }
+	},
+	{ PINMUX_DATA_REG("PXDR", 0xa4050168, 8) {
+		PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA,
+		PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA }
+	},
+	{ PINMUX_DATA_REG("PYDR", 0xa405016a, 8) {
+		PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA,
+		PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA }
+	},
+	{ PINMUX_DATA_REG("PZDR", 0xa405016c, 8) {
+		PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA,
+		PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA }
+	},
+	{ },
+};
+
+static struct pinmux_info sh7724_pinmux_info = {
+	.name = "sh7724_pfc",
+	.reserved_id = PINMUX_RESERVED,
+	.data = { PINMUX_DATA_BEGIN, PINMUX_DATA_END },
+	.input = { PINMUX_INPUT_BEGIN, PINMUX_INPUT_END },
+	.input_pu = { PINMUX_INPUT_PULLUP_BEGIN, PINMUX_INPUT_PULLUP_END },
+	.output = { PINMUX_OUTPUT_BEGIN, PINMUX_OUTPUT_END },
+	.mark = { PINMUX_MARK_BEGIN, PINMUX_MARK_END },
+	.function = { PINMUX_FUNCTION_BEGIN, PINMUX_FUNCTION_END },
+
+	.first_gpio = GPIO_PTA7,
+	.last_gpio = GPIO_FN_INTC_IRQ0,
+
+	.gpios = pinmux_gpios,
+	.cfg_regs = pinmux_config_regs,
+	.data_regs = pinmux_data_regs,
+
+	.gpio_data = pinmux_data,
+	.gpio_data_size = ARRAY_SIZE(pinmux_data),
+};
+
+static int __init plat_pinmux_setup(void)
+{
+	return register_pinmux(&sh7724_pinmux_info);
+}
+arch_initcall(plat_pinmux_setup);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
new file mode 100644
index 0000000..4327b1e
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -0,0 +1,371 @@
+/*
+ * SH7724 Setup
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on SH7723 Setup
+ * Copyright (C) 2008  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/mm.h>
+#include <linux/serial_sci.h>
+#include <linux/uio_driver.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+#include <asm/mmzone.h>
+
+/* Serial */
+static struct plat_sci_port sci_platform_data[] = {
+	{
+		.mapbase        = 0xffe00000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 80, 80, 80, 80 },
+	}, {
+		.mapbase        = 0xffe10000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 81, 81, 81, 81 },
+	}, {
+		.mapbase        = 0xffe20000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 82, 82, 82, 82 },
+	}, {
+		.mapbase	= 0xa4e30000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 56, 56, 56, 56 },
+	}, {
+		.mapbase	= 0xa4e40000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 88, 88, 88, 88 },
+	}, {
+		.mapbase	= 0xa4e50000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 109, 109, 109, 109 },
+	}, {
+		.flags = 0,
+	}
+};
+
+static struct platform_device sci_device = {
+	.name		= "sh-sci",
+	.id		= -1,
+	.dev		= {
+		.platform_data	= sci_platform_data,
+	},
+};
+
+/* RTC */
+static struct resource rtc_resources[] = {
+	[0] = {
+		.start	= 0xa465fec0,
+		.end	= 0xa465fec0 + 0x58 - 1,
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		/* Period IRQ */
+		.start	= 69,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* Carry IRQ */
+		.start	= 70,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[3] = {
+		/* Alarm IRQ */
+		.start	= 68,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device rtc_device = {
+	.name		= "sh-rtc",
+	.id		= -1,
+	.num_resources	= ARRAY_SIZE(rtc_resources),
+	.resource	= rtc_resources,
+};
+
+static struct platform_device *sh7724_devices[] __initdata = {
+	&sci_device,
+	&rtc_device,
+};
+
+static int __init sh7724_devices_setup(void)
+{
+	clk_always_enable("rtc0");   /* RTC */
+
+	return platform_add_devices(sh7724_devices,
+				    ARRAY_SIZE(sh7724_devices));
+}
+device_initcall(sh7724_devices_setup);
+
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7,
+	HUDI,
+	DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3,
+	_2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK,
+	DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3,
+	VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI,
+	SCIFA_SCIFA0,
+	VPU_VPUI,
+	TPU_TPUI,
+	CEU21I,
+	BEU21I,
+	USB_USI0,
+	ATAPI,
+	RTC_ATI, RTC_PRI, RTC_CUI,
+	DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR,
+	DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR,
+	KEYSC_KEYI,
+	SCIF_SCIF0, SCIF_SCIF1, SCIF_SCIF2,
+	VEU3F0I,
+	MSIOF_MSIOFI0, MSIOF_MSIOFI1,
+	SPU_SPUI0, SPU_SPUI1,
+	SCIFA_SCIFA1,
+/*	ICB_ICBI, */
+	ETHI,
+	I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI,
+	I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI,
+	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2,
+	CMT_CMTI,
+	TSIF_TSIFI,
+/*	ICB_LMBI, */
+	FSI_FSI,
+	SCIFA_SCIFA2,
+	TMU0_TUNI0, TMU0_TUNI1, TMU0_TUNI2,
+	IRDA_IRDAI,
+	SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2,
+	JPU_JPUI,
+	MMC_MMCI0, MMC_MMCI1, MMC_MMCI2,
+	LCDC_LCDCI,
+	TMU1_TUNI0, TMU1_TUNI1, TMU1_TUNI2,
+
+	/* interrupt groups */
+	DMAC1A, _2DG, DMAC0A, VIO, RTC,
+	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMC,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(IRQ0, 0x600), INTC_VECT(IRQ1, 0x620),
+	INTC_VECT(IRQ2, 0x640), INTC_VECT(IRQ3, 0x660),
+	INTC_VECT(IRQ4, 0x680), INTC_VECT(IRQ5, 0x6a0),
+	INTC_VECT(IRQ6, 0x6c0), INTC_VECT(IRQ7, 0x6e0),
+
+	INTC_VECT(DMAC1A_DEI0, 0x700),
+	INTC_VECT(DMAC1A_DEI1, 0x720),
+	INTC_VECT(DMAC1A_DEI2, 0x740),
+	INTC_VECT(DMAC1A_DEI3, 0x760),
+
+	INTC_VECT(_2DG_TRI, 0x780),
+	INTC_VECT(_2DG_INI, 0x7A0),
+	INTC_VECT(_2DG_CEI, 0x7C0),
+	INTC_VECT(_2DG_BRK, 0x7E0),
+
+	INTC_VECT(DMAC0A_DEI0, 0x800),
+	INTC_VECT(DMAC0A_DEI1, 0x820),
+	INTC_VECT(DMAC0A_DEI2, 0x840),
+	INTC_VECT(DMAC0A_DEI3, 0x860),
+
+	INTC_VECT(VIO_CEU20I, 0x880),
+	INTC_VECT(VIO_BEU20I, 0x8A0),
+	INTC_VECT(VIO_VEU3F1, 0x8C0),
+	INTC_VECT(VIO_VOUI, 0x8E0),
+
+	INTC_VECT(SCIFA_SCIFA0, 0x900),
+	INTC_VECT(VPU_VPUI, 0x980),
+	INTC_VECT(TPU_TPUI, 0x9A0),
+	INTC_VECT(CEU21I, 0x9E0),
+	INTC_VECT(BEU21I, 0xA00),
+	INTC_VECT(USB_USI0, 0xA20),
+	INTC_VECT(ATAPI, 0xA60),
+
+	INTC_VECT(RTC_ATI, 0xA80),
+	INTC_VECT(RTC_PRI, 0xAA0),
+	INTC_VECT(RTC_CUI, 0xAC0),
+
+	INTC_VECT(DMAC1B_DEI4, 0xB00),
+	INTC_VECT(DMAC1B_DEI5, 0xB20),
+	INTC_VECT(DMAC1B_DADERR, 0xB40),
+
+	INTC_VECT(DMAC0B_DEI4, 0xB80),
+	INTC_VECT(DMAC0B_DEI5, 0xBA0),
+	INTC_VECT(DMAC0B_DADERR, 0xBC0),
+
+	INTC_VECT(KEYSC_KEYI, 0xBE0),
+	INTC_VECT(SCIF_SCIF0, 0xC00),
+	INTC_VECT(SCIF_SCIF1, 0xC20),
+	INTC_VECT(SCIF_SCIF2, 0xC40),
+	INTC_VECT(VEU3F0I, 0xC60),
+	INTC_VECT(MSIOF_MSIOFI0, 0xC80),
+	INTC_VECT(MSIOF_MSIOFI1, 0xCA0),
+	INTC_VECT(SPU_SPUI0, 0xCC0),
+	INTC_VECT(SPU_SPUI1, 0xCE0),
+	INTC_VECT(SCIFA_SCIFA1, 0xD00),
+
+/*	INTC_VECT(ICB_ICBI, 0xD20), */
+	INTC_VECT(ETHI, 0xD60),
+
+	INTC_VECT(I2C1_ALI, 0xD80),
+	INTC_VECT(I2C1_TACKI, 0xDA0),
+	INTC_VECT(I2C1_WAITI, 0xDC0),
+	INTC_VECT(I2C1_DTEI, 0xDE0),
+
+	INTC_VECT(I2C0_ALI, 0xE00),
+	INTC_VECT(I2C0_TACKI, 0xE20),
+	INTC_VECT(I2C0_WAITI, 0xE40),
+	INTC_VECT(I2C0_DTEI, 0xE60),
+
+	INTC_VECT(SDHI0_SDHII0, 0xE80),
+	INTC_VECT(SDHI0_SDHII1, 0xEA0),
+	INTC_VECT(SDHI0_SDHII2, 0xEC0),
+
+	INTC_VECT(CMT_CMTI, 0xF00),
+	INTC_VECT(TSIF_TSIFI, 0xF20),
+/*	INTC_VECT(ICB_LMBI, 0xF60), */
+	INTC_VECT(FSI_FSI, 0xF80),
+	INTC_VECT(SCIFA_SCIFA2, 0xFA0),
+
+	INTC_VECT(TMU0_TUNI0, 0x400),
+	INTC_VECT(TMU0_TUNI1, 0x420),
+	INTC_VECT(TMU0_TUNI2, 0x440),
+
+	INTC_VECT(IRDA_IRDAI, 0x480),
+
+	INTC_VECT(SDHI1_SDHII0, 0x4E0),
+	INTC_VECT(SDHI1_SDHII1, 0x500),
+	INTC_VECT(SDHI1_SDHII2, 0x520),
+
+	INTC_VECT(JPU_JPUI, 0x560),
+
+	INTC_VECT(MMC_MMCI0, 0x580),
+	INTC_VECT(MMC_MMCI1, 0x5A0),
+	INTC_VECT(MMC_MMCI2, 0x5C0),
+
+	INTC_VECT(LCDC_LCDCI, 0xF40),
+
+	INTC_VECT(TMU1_TUNI0, 0x920),
+	INTC_VECT(TMU1_TUNI1, 0x940),
+	INTC_VECT(TMU1_TUNI2, 0x960),
+};
+
+static struct intc_group groups[] __initdata = {
+	INTC_GROUP(DMAC1A, DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3),
+	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK),
+	INTC_GROUP(DMAC0A, DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3),
+	INTC_GROUP(VIO, VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI),
+	INTC_GROUP(RTC, RTC_ATI, RTC_PRI, RTC_CUI),
+	INTC_GROUP(DMAC1B, DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR),
+	INTC_GROUP(DMAC0B, DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR),
+	INTC_GROUP(I2C0, I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI),
+	INTC_GROUP(I2C1, I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI),
+	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2),
+	INTC_GROUP(SDHI1, SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2),
+	INTC_GROUP(SPU, SPU_SPUI0, SPU_SPUI1),
+	INTC_GROUP(MMC, MMC_MMCI0, MMC_MMCI1, MMC_MMCI2),
+};
+
+/* FIXMEEEEEEEEEEEEEEEEEEE !!!!! */
+/* very bad manual !! */
+static struct intc_mask_reg mask_registers[] __initdata = {
+	{ 0xa4080080, 0xa40800c0, 8, /* IMR0 / IMCR0 */
+	  { 0, TMU1_TUNI2, TMU1_TUNI1, TMU1_TUNI0,
+	    /*SDHII3?*/0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
+	{ 0xa4080084, 0xa40800c4, 8, /* IMR1 / IMCR1 */
+	  { VIO_VOUI, VIO_VEU3F1, VIO_BEU20I, VIO_CEU20I,
+	    DMAC0A_DEI3, DMAC0A_DEI2, DMAC0A_DEI1, DMAC0A_DEI0 } },
+	{ 0xa4080088, 0xa40800c8, 8, /* IMR2 / IMCR2 */
+	  { 0, 0, 0, VPU_VPUI, ATAPI, ETHI, 0, /*SCIFA3*/SCIFA_SCIFA0 } },
+	{ 0xa408008c, 0xa40800cc, 8, /* IMR3 / IMCR3 */
+	  { DMAC1A_DEI3, DMAC1A_DEI2, DMAC1A_DEI1, DMAC1A_DEI0,
+	    SPU_SPUI1, SPU_SPUI0, BEU21I, IRDA_IRDAI } },
+	{ 0xa4080090, 0xa40800d0, 8, /* IMR4 / IMCR4 */
+	  { 0, TMU0_TUNI2, TMU0_TUNI1, TMU0_TUNI0,
+	    JPU_JPUI, 0, 0, LCDC_LCDCI } },
+	{ 0xa4080094, 0xa40800d4, 8, /* IMR5 / IMCR5 */
+	  { KEYSC_KEYI, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
+	    VEU3F0I, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
+	{ 0xa4080098, 0xa40800d8, 8, /* IMR6 / IMCR6 */
+	  { 0, 0, /*ICB_ICBI*/0, /*SCIFA4*/SCIFA_SCIFA1,
+	    CEU21I, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
+	{ 0xa408009c, 0xa40800dc, 8, /* IMR7 / IMCR7 */
+	  { I2C0_DTEI, I2C0_WAITI, I2C0_TACKI, I2C0_ALI,
+	    I2C1_DTEI, I2C1_WAITI, I2C1_TACKI, I2C1_ALI } },
+	{ 0xa40800a0, 0xa40800e0, 8, /* IMR8 / IMCR8 */
+	  { /*SDHII3*/0, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
+	    0, 0, /*SCIFA5*/SCIFA_SCIFA2, FSI_FSI } },
+	{ 0xa40800a4, 0xa40800e4, 8, /* IMR9 / IMCR9 */
+	  { 0, 0, 0, CMT_CMTI, 0, /*USB1*/0, USB_USI0, 0 } },
+	{ 0xa40800a8, 0xa40800e8, 8, /* IMR10 / IMCR10 */
+	  { 0, DMAC1B_DADERR, DMAC1B_DEI5, DMAC1B_DEI4,
+	    0, RTC_ATI, RTC_PRI, RTC_CUI } },
+	{ 0xa40800ac, 0xa40800ec, 8, /* IMR11 / IMCR11 */
+	  { _2DG_BRK, _2DG_CEI, _2DG_INI, _2DG_TRI,
+	    0, TPU_TPUI, /*ICB_LMBI*/0, TSIF_TSIFI } },
+	{ 0xa40800b0, 0xa40800f0, 8, /* IMR12 / IMCR12 */
+	  { 0, 0, 0, 0, 0, 0, 0, 0/*2DDMAC*/ } },
+	{ 0xa4140044, 0xa4140064, 8, /* INTMSK00 / INTMSKCLR00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xa4080000, 0, 16, 4, /* IPRA */ { TMU0_TUNI0, TMU0_TUNI1,
+					     TMU0_TUNI2, IRDA_IRDAI } },
+	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU_JPUI, LCDC_LCDCI,
+					     DMAC1A, BEU21I } },
+	{ 0xa4080008, 0, 16, 4, /* IPRC */ { TMU1_TUNI0, TMU1_TUNI1,
+					     TMU1_TUNI2, SPU } },
+	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMC, 0, ATAPI } },
+	{ 0xa4080010, 0, 16, 4, /* IPRE */
+	  { DMAC0A, /*BEU?VEU?*/VIO, /*SCIFA3*/SCIFA_SCIFA0, /*VPU5F*/
+	    VPU_VPUI } },
+	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC_KEYI, DMAC0B,
+					     USB_USI0, CMT_CMTI } },
+	{ 0xa4080018, 0, 16, 4, /* IPRG */ { SCIF_SCIF0, SCIF_SCIF1,
+					     SCIF_SCIF2, VEU3F0I } },
+	{ 0xa408001c, 0, 16, 4, /* IPRH */ { MSIOF_MSIOFI0, MSIOF_MSIOFI1,
+					     I2C1, I2C0 } },
+	{ 0xa4080020, 0, 16, 4, /* IPRI */ { /*SCIFA4*/SCIFA_SCIFA1, /*ICB*/0,
+					     TSIF_TSIFI, _2DG/*ICB?*/ } },
+	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU21I, ETHI, FSI_FSI, SDHI1 } },
+	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, /*ICB?*/0, SDHI0 } },
+	{ 0xa408002c, 0, 16, 4, /* IPRL */ { /*SCIFA5*/SCIFA_SCIFA2, 0,
+					     TPU_TPUI, /*2DDMAC*/0 } },
+	{ 0xa4140010, 0, 32, 4, /* INTPRI00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_sense_reg sense_registers[] __initdata = {
+	{ 0xa414001c, 16, 2, /* ICR1 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_mask_reg ack_registers[] __initdata = {
+	{ 0xa4140024, 0, 8, /* INTREQ00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static DECLARE_INTC_DESC_ACK(intc_desc, "sh7724", vectors, groups,
+			     mask_registers, prio_registers, sense_registers,
+			     ack_registers);
+
+void __init plat_irq_setup(void)
+{
+	register_intc_controller(&intc_desc);
+}
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 04a6004..0e6ed80 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -435,7 +435,8 @@ static const char *cpu_name[] = {
 	[CPU_SH7722]	= "SH7722",	[CPU_SHX3]	= "SH-X3",
 	[CPU_SH5_101]	= "SH5-101",	[CPU_SH5_103]	= "SH5-103",
 	[CPU_MXG]	= "MX-G",	[CPU_SH7723]	= "SH7723",
-	[CPU_SH7366]	= "SH7366",	[CPU_SH_NONE]	= "Unknown"
+	[CPU_SH7366]	= "SH7366",	[CPU_SH7724]	= "SH7724",
+	[CPU_SH_NONE]	= "Unknown"
 };
 
 const char *get_cpu_subtype(struct sh_cpuinfo *c)
diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c
index 1b9d430..44f4e31 100644
--- a/arch/sh/oprofile/common.c
+++ b/arch/sh/oprofile/common.c
@@ -109,6 +109,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 	case CPU_SH7785:
 	case CPU_SH7786:
 	case CPU_SH7723:
+	case CPU_SH7724:
 	case CPU_SHX3:
 		lmodel = &op_model_sh4a_ops;
 		break;
-- 
cgit v0.10.2


From 47948d2bd6d27648a107a27357b3bc5ad054ff64 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:42:47 +0900
Subject: serial: sh-sci: SH7724 support.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index d0aa82d..84cc651 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -91,6 +91,9 @@
 # define SCSPTR5                0xa4050128
 # define SCIF_ORER              0x0001  /* overrun error bit */
 # define SCSCR_INIT(port)       0x0038  /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+# define SCIF_ORER              0x0001  /* overrun error bit */
+# define SCSCR_INIT(port)       0x0038  /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
 #elif defined(CONFIG_CPU_SUBTYPE_SH4_202)
 # define SCSPTR2 0xffe80020 /* 16 bit SCIF */
 # define SCIF_ORER 0x0001   /* overrun error bit */
@@ -361,7 +364,8 @@
                  h8_sci_offset, h8_sci_size) \
   CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size)
 #define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
         #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \
                 CPU_SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size)
         #define SCIF_FNS(name, sh4_scif_offset, sh4_scif_size) \
@@ -390,7 +394,8 @@ SCIF_FNS(SCFDR,  0x1c, 16)
 SCIF_FNS(SCxTDR, 0x20,  8)
 SCIF_FNS(SCxRDR, 0x24,  8)
 SCIF_FNS(SCLSR,  0x24, 16)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
 SCIx_FNS(SCSMR,  0x00, 16, 0x00, 16)
 SCIx_FNS(SCBRR,  0x04,  8, 0x04,  8)
 SCIx_FNS(SCSCR,  0x08, 16, 0x08, 16)
@@ -604,6 +609,17 @@ static inline int sci_rxd_in(struct uart_port *port)
                 return ctrl_inb(SCSPTR5) & 0x0008 ? 1 : 0; /* SCIF5 */
         return 1;
 }
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#  define SCFSR    0x0010
+#  define SCASSR   0x0014
+static inline int sci_rxd_in(struct uart_port *port)
+{
+	if (port->type == PORT_SCIF)
+		return ctrl_inw((port->mapbase + SCFSR))  & SCIF_BRK ? 1 : 0;
+	if (port->type == PORT_SCIFA)
+		return ctrl_inw((port->mapbase + SCASSR)) & SCIF_BRK ? 1 : 0;
+	return 1;
+}
 #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103)
 static inline int sci_rxd_in(struct uart_port *port)
 {
@@ -757,7 +773,8 @@ static inline int sci_rxd_in(struct uart_port *port)
       defined(CONFIG_CPU_SUBTYPE_SH7720) || \
       defined(CONFIG_CPU_SUBTYPE_SH7721)
 #define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
 static inline int scbrr_calc(struct uart_port *port, int bps, int clk)
 {
 	if (port->type == PORT_SCIF)
-- 
cgit v0.10.2


From 40c7e8be556715079d0a9d7454ceb5371a2f0b39 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Thu, 16 Apr 2009 13:16:07 +0900
Subject: sh: sh7724: Add I2C support.

This adds support for the SH-Mobile I2C controller on the SH7724.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 4327b1e..f17eda6 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -99,9 +99,55 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+/* I2C0 */
+static struct resource iic0_resources[] = {
+	[0] = {
+		.name	= "IIC0",
+		.start  = 0x04470000,
+		.end    = 0x04470018 - 1,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 96,
+		.end    = 99,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device iic0_device = {
+	.name           = "i2c-sh_mobile",
+	.id             = 0, /* "i2c0" clock */
+	.num_resources  = ARRAY_SIZE(iic0_resources),
+	.resource       = iic0_resources,
+};
+
+/* I2C1 */
+static struct resource iic1_resources[] = {
+	[0] = {
+		.name	= "IIC1",
+		.start  = 0x04750000,
+		.end    = 0x04750018 - 1,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 92,
+		.end    = 95,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device iic1_device = {
+	.name           = "i2c-sh_mobile",
+	.id             = 1, /* "i2c1" clock */
+	.num_resources  = ARRAY_SIZE(iic1_resources),
+	.resource       = iic1_resources,
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
+	&iic0_device,
+	&iic1_device,
 };
 
 static int __init sh7724_devices_setup(void)
-- 
cgit v0.10.2


From cd5b9ef776feff440e7a889d1a565ceabfecbfa1 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:43:03 +0900
Subject: sh: sh7724: Add VPU support.

This adds uio_pdrv_genirq support for the VPU.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index f17eda6..499a6fc 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -143,16 +143,49 @@ static struct platform_device iic1_device = {
 	.resource       = iic1_resources,
 };
 
+/* VPU */
+static struct uio_info vpu_platform_data = {
+	.name = "VPU5F",
+	.version = "0",
+	.irq = 60,
+};
+
+static struct resource vpu_resources[] = {
+	[0] = {
+		.name	= "VPU",
+		.start	= 0xfe900000,
+		.end	= 0xfe902807,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device vpu_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &vpu_platform_data,
+	},
+	.resource	= vpu_resources,
+	.num_resources	= ARRAY_SIZE(vpu_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
 	&iic1_device,
+	&vpu_device,
 };
 
 static int __init sh7724_devices_setup(void)
 {
 	clk_always_enable("rtc0");   /* RTC */
+	clk_always_enable("vpu0");   /* VPU */
+
+	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
cgit v0.10.2


From ad95b78c9f735da11ff9ec760e9b038cd82aead6 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:43:07 +0900
Subject: sh: sh7724: Add VEU support.

This adds uio_pdrv_genirq support for the VEU.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 499a6fc..65570ed 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -172,20 +172,84 @@ static struct platform_device vpu_device = {
 	.num_resources	= ARRAY_SIZE(vpu_resources),
 };
 
+/* VEU0 */
+static struct uio_info veu0_platform_data = {
+	.name = "VEU3F0",
+	.version = "0",
+	.irq = 83,
+};
+
+static struct resource veu0_resources[] = {
+	[0] = {
+		.name	= "VEU3F0",
+		.start	= 0xfe920000,
+		.end	= 0xfe9200cb - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device veu0_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &veu0_platform_data,
+	},
+	.resource	= veu0_resources,
+	.num_resources	= ARRAY_SIZE(veu0_resources),
+};
+
+/* VEU1 */
+static struct uio_info veu1_platform_data = {
+	.name = "VEU3F1",
+	.version = "0",
+	.irq = 54,
+};
+
+static struct resource veu1_resources[] = {
+	[0] = {
+		.name	= "VEU3F1",
+		.start	= 0xfe924000,
+		.end	= 0xfe9240cb - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device veu1_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &veu1_platform_data,
+	},
+	.resource	= veu1_resources,
+	.num_resources	= ARRAY_SIZE(veu1_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
 	&iic1_device,
 	&vpu_device,
+	&veu0_device,
+	&veu1_device,
 };
 
 static int __init sh7724_devices_setup(void)
 {
 	clk_always_enable("rtc0");   /* RTC */
 	clk_always_enable("vpu0");   /* VPU */
+	clk_always_enable("veu1");   /* VEU3F1 */
+	clk_always_enable("veu0");   /* VEU3F0 */
 
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
+	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
+	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
cgit v0.10.2


From 6a3395beb99d7ae882ddf701c6fa6005ad7edebf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:36:13 +0900
Subject: sh: sh7724: Add CMT clockevents support.

This enables support for the CMT clockevents driver on SH7724.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 65570ed..8b87ba8 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -230,7 +230,40 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
+static struct sh_cmt_config cmt_platform_data = {
+	.name = "CMT",
+	.channel_offset = 0x60,
+	.timer_bit = 5,
+	.clk = "cmt0",
+	.clockevent_rating = 125,
+	.clocksource_rating = 200,
+};
+
+static struct resource cmt_resources[] = {
+	[0] = {
+		.name	= "CMT",
+		.start	= 0x044a0060,
+		.end	= 0x044a006b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt_platform_data,
+	},
+	.resource	= cmt_resources,
+	.num_resources	= ARRAY_SIZE(cmt_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
+	&cmt_device,
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
-- 
cgit v0.10.2


From 59fe700dcbf3d6257ae86ca8c0192fc64b2eea1c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:43:42 +0900
Subject: sh: Have SH7724 select ARCH_SHMOBILE.

This is an SH-Mobile CPU, so select ARCH_SHMOBILE. This enables all of
the PM functionality, amongst other things.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 505d1ac..4c68fde 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -351,6 +351,7 @@ config CPU_SUBTYPE_SH7724
 	bool "Support SH7724 processor"
 	select CPU_SH4A
 	select CPU_SHX2
+	select ARCH_SHMOBILE
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_CMT
 	help
-- 
cgit v0.10.2


From bc9f3d4291f8275924a9197a81208a5df0a15095 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:44:58 +0900
Subject: sh: Add a generic defconfig for SH7724 platforms.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
new file mode 100644
index 0000000..268d04e
--- /dev/null
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -0,0 +1,707 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc2
+# Thu Apr 16 15:42:20 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+# CONFIG_GENERIC_GPIO is not set
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_CMT=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+# CONFIG_CGROUP_NS is not set
+# CONFIG_CGROUP_FREEZER is not set
+# CONFIG_CGROUP_DEVICE is not set
+# CONFIG_CPUSETS is not set
+# CONFIG_CGROUP_CPUACCT is not set
+# CONFIG_RESOURCE_COUNTERS is not set
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+# CONFIG_UID16 is not set
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
+# CONFIG_MARKERS is not set
+CONFIG_OPROFILE=y
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+# CONFIG_MODULES is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+CONFIG_FREEZER=y
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+CONFIG_CPU_SHX2=y
+CONFIG_ARCH_SHMOBILE=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+CONFIG_CPU_SUBTYPE_SH7724=y
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+# CONFIG_CPU_SUBTYPE_SH7770 is not set
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x04000000
+CONFIG_29BIT=y
+# CONFIG_X2TLB is not set
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_ENTRY_OFFSET=0x00001000
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+CONFIG_SPARSEMEM_STATIC=y
+
+#
+# Memory hotplug is currently incompatible with Software Suspend
+#
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_PTEA=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TMU=y
+CONFIG_SH_TIMER_CMT=y
+CONFIG_SH_TIMER_IRQ=16
+CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_TICK_ONESHOT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_SH_CPU_FREQ=y
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+# CONFIG_HEARTBEAT is not set
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+CONFIG_SCHED_HRTICK=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_KEXEC_JUMP=y
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_SLEEP=y
+CONFIG_SUSPEND=y
+CONFIG_SUSPEND_FREEZER=y
+CONFIG_HIBERNATION=y
+CONFIG_PM_STD_PARTITION=""
+CONFIG_CPU_IDLE=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPU_IDLE_GOV_MENU=y
+# CONFIG_NET is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_MTD is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_BLK_DEV_HD is not set
+# CONFIG_MISC_DEVICES is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_DEVKMEM is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_UNIX98_PTYS is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_IPMI_HANDLER is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+# CONFIG_SPI is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_VIDEO_MEDIA is not set
+
+#
+# Multimedia drivers
+#
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+# CONFIG_FB is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+# CONFIG_SOUND is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_MMC is not set
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+# CONFIG_RTC_DRV_PCF8563 is not set
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+CONFIG_RTC_DRV_SH=y
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+# CONFIG_EXT2_FS is not set
+# CONFIG_EXT3_FS is not set
+# CONFIG_EXT4_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_BTRFS_FS is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_MSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+# CONFIG_PROC_FS is not set
+# CONFIG_SYSFS is not set
+# CONFIG_TMPFS is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_NLS is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+CONFIG_STACKTRACE=y
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_SYSCTL_SYSCALL_CHECK is not set
+CONFIG_NOP_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+# CONFIG_MORE_COMPILE_OPTIONS is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+# CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
+
+#
+# Library routines
+#
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC_ITU_T is not set
+# CONFIG_CRC32 is not set
+# CONFIG_CRC7 is not set
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
-- 
cgit v0.10.2


From b8b47bfbe4eb1ae0e6891e49c86a5f4fb00413be Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:41:51 +0900
Subject: sh: pass along struct pci_channel

These patches rework the pci code for the sh architecture.

Currently each board implements some kind of ioport to address mapping.
Some boards use generic_io_base others try passing addresses as io ports.
This is the first set of patches that try to unify the pci code as much
as possible to avoid duplicated code. This will in the end lead to fewer
lines board specific code and more generic code.

This patch makes sure a struct pci_channel pointer is passed along to
various pci functions such as pci_read_reg(), pci_write_reg(),
pci_fixup_pcic(), sh7751_pcic_init() and sh7780_pcic_init().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
index 1c1d412..a82011d 100644
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ b/arch/sh/drivers/pci/fixups-lboxre2.c
@@ -9,33 +9,34 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
+#include <linux/pci.h>
 #include "pci-sh4.h"
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
 
 	bcr1 = ctrl_inl(SH7751_BCR1);
 	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(bcr1, SH4_PCIBCR1);
+	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
 
 	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
-	pci_write_reg(0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(0xab000001, SH7751_PCICONF4);
+	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
+	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
 
 	mcr = ctrl_inl(SH7751_MCR);
 	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(mcr, SH4_PCIMCR);
+	pci_write_reg(chan, mcr, SH4_PCIMCR);
 
-	pci_write_reg(0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(0x0c000000, SH4_PCILAR0);
-	pci_write_reg(0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
+	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
+	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 3e321df..5b25021 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -14,32 +14,33 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
-	pci_write_reg(0x000043ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(0x00000000, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
 
-	pci_write_reg(0x00011912, SH7780_PCISVID);
-	pci_write_reg(0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(0x0000000f, SH7780_PCICSAR1);
+	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
+	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
+	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(0x00fc0000, SH7780_PCIMBMR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
 
 #ifdef CONFIG_32BIT
-	pci_write_reg(0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
 		      SH7780_PCIIOBR);
-	pci_write_reg(((SH7780_PCI_IO_SIZE-1) & (7<<18)), SH7780_PCIIOBMR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
+		      SH7780_PCIIOBMR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c
index 904bce8..3885233 100644
--- a/arch/sh/drivers/pci/fixups-rts7751r2d.c
+++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c
@@ -10,34 +10,35 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
+#include <linux/pci.h>
 #include "pci-sh4.h"
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
 
 	bcr1 = ctrl_inl(SH7751_BCR1);
 	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(bcr1, SH4_PCIBCR1);
+	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
 
 	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(0xab000001, SH7751_PCICONF4);
+	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
+	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
 
 	mcr = ctrl_inl(SH7751_MCR);
 	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(mcr, SH4_PCIMCR);
+	pci_write_reg(chan, mcr, SH4_PCIMCR);
 
-	pci_write_reg(0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(0x0c000000, SH4_PCILAR0);
-	pci_write_reg(0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
+	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
+	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 2f88630..3f6754a 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -14,46 +14,48 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
 
 	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(0x0000380F, SH7780_PCIAINTM);
+	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
+	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
-	pci_write_reg(0xFB00, SH7780_PCISTATUS);
-	pci_write_reg(0x0047, SH7780_PCICMD);
-	pci_write_reg(0x00, SH7780_PCIPIF);
-	pci_write_reg(0x00, SH7780_PCISUB);
-	pci_write_reg(0x06, SH7780_PCIBCC);
-	pci_write_reg(0x1912, SH7780_PCISVID);
-	pci_write_reg(0x0001, SH7780_PCISID);
+	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
+	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
+	pci_write_reg(chan, 0x00, SH7780_PCISUB);
+	pci_write_reg(chan, 0x06, SH7780_PCIBCC);
+	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x0001, SH7780_PCISID);
 
-	pci_write_reg(0x08000000, SH7780_PCIMBAR0);	/* PCI */
-	pci_write_reg(0x08000000, SH7780_PCILAR0);	/* SHwy */
-	pci_write_reg(0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);	/* PCI */
+	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);	/* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
 
-	pci_write_reg(0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(0x00000000, SH7780_PCILAR1);
-	pci_write_reg(0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
 
-	pci_write_reg(0xAB000801, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
 
 	/*
 	 * Set the MBR so PCI address is one-to-one with window,
 	 * meaning all calls go straight through... use ifdef to
 	 * catch erroneous assumption.
 	 */
-	pci_write_reg(0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
+	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR);
-	pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR);
+	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
+		      SH7780_PCIIOBMR);
 
-	pci_write_reg(0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
index 880cea1..b8e735e 100644
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ b/arch/sh/drivers/pci/fixups-se7780.c
@@ -15,13 +15,13 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
 
 	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(0x0000380F, SH7780_PCIAINTM);
+	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
+	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
 	ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS));
@@ -32,29 +32,31 @@ int pci_fixup_pcic(void)
 	ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID));
 	ctrl_outw(0x0001, PCI_REG(SH7780_PCISID));
 
-	pci_write_reg(0x08000000, SH7780_PCIMBAR0);     /* PCI */
-	pci_write_reg(0x08000000, SH7780_PCILAR0);     /* SHwy */
-	pci_write_reg(0x07F00001, SH7780_PCILSR);      /* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);     /* PCI */
+	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);     /* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */
 
-	pci_write_reg(0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(0x00000000, SH7780_PCILAR1);
-	pci_write_reg(0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
 
-	pci_write_reg(0xAB000801, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
 
 	/*
 	 * Set the MBR so PCI address is one-to-one with window,
 	 * meaning all calls go straight through... use ifdef to
 	 * catch erroneous assumption.
 	 */
-	pci_write_reg(0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
+	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR);
-	pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR);
+	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
+		      SH7780_PCIIOBMR);
 
-	pci_write_reg(0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index bff09ec..343c072 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -45,7 +45,7 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
 int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 86c0b6f..8bff32a 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -59,5 +59,5 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 8555238..bf32ee8 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -64,5 +64,5 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&sh7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index d6ca74b..e4208a6 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -69,6 +69,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	__set_io_port_base(SH7751_PCI_IO_BASE);
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 4dcc641..21d59d4 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -69,5 +69,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&sdk7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 3145c62..78a6f2b 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -92,5 +92,5 @@ int __init pcibios_init_platform(void)
 	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
 	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 
-	return sh7780_pcic_init(&se7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 710a3b0..92d27f7 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -34,8 +34,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 	 * so we must do byte alignment by hand
 	 */
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(SH4_PCIPDR);
+	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(NULL, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -68,8 +68,8 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 	u32 data;
 
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(SH4_PCIPDR);
+	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(NULL, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -90,7 +90,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
 
-	pci_write_reg(data, SH4_PCIPDR);
+	pci_write_reg(NULL, data, SH4_PCIPDR);
 
 	return PCIBIOS_SUCCESSFUL;
 }
@@ -106,25 +106,25 @@ struct pci_ops sh4_pci_ops = {
  */
 static unsigned int pci_probe = PCI_PROBE_CONF1;
 
-int __init sh4_pci_check_direct(void)
+int __init sh4_pci_check_direct(struct pci_channel *chan)
 {
 	/*
 	 * Check if configuration works.
 	 */
 	if (pci_probe & PCI_PROBE_CONF1) {
-		unsigned int tmp = pci_read_reg(SH4_PCIPAR);
+		unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
 
-		pci_write_reg(P1SEG, SH4_PCIPAR);
+		pci_write_reg(chan, P1SEG, SH4_PCIPAR);
 
-		if (pci_read_reg(SH4_PCIPAR) == P1SEG) {
-			pci_write_reg(tmp, SH4_PCIPAR);
+		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
+			pci_write_reg(chan, tmp, SH4_PCIPAR);
 			printk(KERN_INFO "PCI: Using configuration type 1\n");
 			request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1");
 
 			return 0;
 		}
 
-		pci_write_reg(tmp, SH4_PCIPAR);
+		pci_write_reg(chan, tmp, SH4_PCIPAR);
 	}
 
 	pr_debug("PCI: pci_check_direct failed\n");
@@ -163,7 +163,7 @@ char * __devinit pcibios_setup(char *str)
 	return str;
 }
 
-int __attribute__((weak)) pci_fixup_pcic(void)
+int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
 	return 0;
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index 53dd893..cba8015 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -66,7 +66,7 @@ static struct sh4_pci_address_map sh7751_pci_map = {
  */
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
 int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index a8f7801..69fcc5c 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -73,5 +73,5 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index a83dcf7..62ba350 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -154,8 +154,8 @@
 
 /* arch/sh/kernel/drivers/pci/ops-sh4.c */
 extern struct pci_ops sh4_pci_ops;
-int sh4_pci_check_direct(void);
-int pci_fixup_pcic(void);
+int sh4_pci_check_direct(struct pci_channel *chan);
+int pci_fixup_pcic(struct pci_channel *chan);
 
 struct sh4_pci_address_space {
 	unsigned long base;
@@ -168,13 +168,16 @@ struct sh4_pci_address_map {
 	unsigned long flags;
 };
 
-static inline void pci_write_reg(unsigned long val, unsigned long reg)
+static inline void pci_write_reg(struct pci_channel *chan,
+				 unsigned long val, unsigned long reg)
 {
 	ctrl_outl(val, PCI_REG(reg));
 }
 
-static inline unsigned long pci_read_reg(unsigned long reg)
+static inline unsigned long pci_read_reg(struct pci_channel *chan,
+					 unsigned long reg)
 {
 	return ctrl_inl(PCI_REG(reg));
 }
+
 #endif /* __PCI_SH4_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 3065eb184..9c2c014 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,21 +40,22 @@ static int __init sh7751_pci_init(void)
 	pr_debug("PCI: Starting intialization.\n");
 
 	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(SH7751_PCICONF0);
+	id = pci_read_reg(NULL, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
 	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
 		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
 		return -ENODEV;
 	}
 
-	if ((ret = sh4_pci_check_direct()) != 0)
+	if ((ret = sh4_pci_check_direct(NULL)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
 subsys_initcall(sh7751_pci_init);
 
-static int __init __area_sdram_check(unsigned int area)
+static int __init __area_sdram_check(struct pci_channel *chan,
+				     unsigned int area)
 {
 	u32 word;
 
@@ -65,7 +66,7 @@ static int __init __area_sdram_check(unsigned int area)
 		       area, word);
 		return 0;
 	}
-	pci_write_reg(word, SH4_PCIBCR1);
+	pci_write_reg(chan, word, SH4_PCIBCR1);
 
 	word = (u16)ctrl_inw(SH7751_BCR2);
 	/* check BCR2 for 32bit SDRAM interface*/
@@ -74,12 +75,13 @@ static int __init __area_sdram_check(unsigned int area)
 		       area, word);
 		return 0;
 	}
-	pci_write_reg(word, SH4_PCIBCR2);
+	pci_write_reg(chan, word, SH4_PCIBCR2);
 
 	return 1;
 }
 
-int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
+int __init sh7751_pcic_init(struct pci_channel *chan,
+			    struct sh4_pci_address_map *map)
 {
 	u32 reg;
 	u32 word;
@@ -90,10 +92,10 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	ctrl_outl(reg, SH7751_BCR1);
 
 	/* Turn the clocks back on (not done in reset)*/
-	pci_write_reg(0, SH4_PCICLKR);
+	pci_write_reg(chan, 0, SH4_PCICLKR);
 	/* Clear Powerdown IRQ's (not done in reset) */
 	word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0;
-	pci_write_reg(word, SH4_PCIPINT);
+	pci_write_reg(chan, word, SH4_PCIPINT);
 
 	/*
 	 * This code is unused for some boards as it is done in the
@@ -103,11 +105,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	if (!(map->flags & SH4_PCIC_NO_RESET)) {
 		/* toggle PCI reset pin */
 		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 		/* Wait for a long time... not 1 sec. but long enough */
 		mdelay(100);
 		word = SH4_PCICR_PREFIX;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 	}
 
 	/* set the command/status bits to:
@@ -116,11 +118,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 */
 	word = SH7751_PCICONF1_WCC | SH7751_PCICONF1_PER |
 	       SH7751_PCICONF1_BUM | SH7751_PCICONF1_MES;
-	pci_write_reg(word, SH7751_PCICONF1);
+	pci_write_reg(chan, word, SH7751_PCICONF1);
 
 	/* define this host as the host bridge */
 	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(word, SH7751_PCICONF2);
+	pci_write_reg(chan, word, SH7751_PCICONF2);
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
@@ -128,24 +130,24 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 * Window1 = map->window1.size @ cached area base = SDRAM
 	 */
 	word = map->window0.size - 1;
-	pci_write_reg(word, SH4_PCILSR0);
+	pci_write_reg(chan, word, SH4_PCILSR0);
 	word = map->window1.size - 1;
-	pci_write_reg(word, SH4_PCILSR1);
+	pci_write_reg(chan, word, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
 	word = P2SEGADDR(map->window0.base);
-	pci_write_reg(word, SH4_PCILAR0);
-	pci_write_reg(word, SH7751_PCICONF5);
+	pci_write_reg(chan, word, SH4_PCILAR0);
+	pci_write_reg(chan, word, SH7751_PCICONF5);
 	/* Set the values on window 1 PCI config registers */
 	word =  PHYSADDR(map->window1.base);
-	pci_write_reg(word, SH4_PCILAR1);
-	pci_write_reg(word, SH7751_PCICONF6);
+	pci_write_reg(chan, word, SH4_PCILAR1);
+	pci_write_reg(chan, word, SH7751_PCICONF6);
 
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
 	 */
 	word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK;
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
-	pci_write_reg(word , SH4_PCIMBR);
+	pci_write_reg(chan, word , SH4_PCIMBR);
 
 	/* Map IO space into PCI IO window
 	 * The IO window is 64K-PCIBIOS_MIN_IO in size
@@ -160,19 +162,19 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 * correctly */
 	word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK;
 	pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word);
-	pci_write_reg(word, SH4_PCIIOBR);
+	pci_write_reg(chan, word, SH4_PCIIOBR);
 
 	/* Set PCI WCRx, BCRx's, copy from BSC locations */
 
 	/* check BCR for SDRAM in specified area */
 	switch (map->window0.base) {
-	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(0); break;
-	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(1); break;
-	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(2); break;
-	case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(3); break;
-	case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(4); break;
-	case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(5); break;
-	case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(6); break;
+	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break;
+	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break;
+	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break;
+	case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(chan, 3); break;
+	case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(chan, 4); break;
+	case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(chan, 5); break;
+	case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(chan, 6); break;
 	}
 
 	if (!word)
@@ -180,25 +182,25 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 
 	/* configure the wait control registers */
 	word = ctrl_inl(SH7751_WCR1);
-	pci_write_reg(word, SH4_PCIWCR1);
+	pci_write_reg(chan, word, SH4_PCIWCR1);
 	word = ctrl_inl(SH7751_WCR2);
-	pci_write_reg(word, SH4_PCIWCR2);
+	pci_write_reg(chan, word, SH4_PCIWCR2);
 	word = ctrl_inl(SH7751_WCR3);
-	pci_write_reg(word, SH4_PCIWCR3);
+	pci_write_reg(chan, word, SH4_PCIWCR3);
 	word = ctrl_inl(SH7751_MCR);
-	pci_write_reg(word, SH4_PCIMCR);
+	pci_write_reg(chan, word, SH4_PCIMCR);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
 	 * DMA interrupts...
 	 */
 
-	pci_fixup_pcic();
+	pci_fixup_pcic(chan);
 
 	/* SH7751 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
-	pci_write_reg(word, SH4_PCICR);
+	pci_write_reg(chan, word, SH4_PCICR);
 
 	return 1;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 68e3cb5..6f101e5 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -130,6 +130,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7751.c */
-int sh7751_pcic_init(struct sh4_pci_address_map *map);
+int sh7751_pcic_init(struct pci_channel *chan,
+		     struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7751_H_ */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index bae6a2c..56f673f 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void)
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(SH7780_PCIVID);
+	id = pci_read_reg(NULL, SH7780_PCIVID);
 	if ((id & 0xffff) == SH7780_VENDOR_ID) {
 		switch ((id >> 16) & 0xffff) {
 		case SH7763_DEVICE_ID:
@@ -82,14 +82,15 @@ static int __init sh7780_pci_init(void)
 		ctrl_outl(0x33333333, INTC_INTPRI);
 	}
 
-	if ((ret = sh4_pci_check_direct()) != 0)
+	if ((ret = sh4_pci_check_direct(NULL)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
 core_initcall(sh7780_pci_init);
 
-int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
+int __init sh7780_pcic_init(struct pci_channel *chan,
+			    struct sh4_pci_address_map *map)
 {
 	u32 word;
 
@@ -101,34 +102,34 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	if (!(map->flags & SH4_PCIC_NO_RESET)) {
 		/* toggle PCI reset pin */
 		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 		/* Wait for a long time... not 1 sec. but long enough */
 		mdelay(100);
 		word = SH4_PCICR_PREFIX;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 	}
 
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
 	 */
-	pci_write_reg(0x00000046, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
 
 	/* define this host as the host bridge */
 	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(word, SH7780_PCIRID);
+	pci_write_reg(chan, word, SH7780_PCIRID);
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-	pci_write_reg(map->window0.size - 0xfffff, SH4_PCILSR0);
-	pci_write_reg(map->window1.size - 0xfffff, SH4_PCILSR1);
+	pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0);
+	pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
-	pci_write_reg(map->window0.base, SH4_PCILAR0);
-	pci_write_reg(map->window0.base, SH7780_PCIMBAR0);
+	pci_write_reg(chan, map->window0.base, SH4_PCILAR0);
+	pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0);
 	/* Set the values on window 1 PCI config registers */
-	pci_write_reg(map->window1.base, SH4_PCILAR1);
-	pci_write_reg(map->window1.base, SH7780_PCIMBAR1);
+	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
+	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
 	/* Map IO space into PCI IO window
 	 * The IO window is 64K-PCIBIOS_MIN_IO in size
@@ -145,12 +146,12 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	 */
 
 	/* Apply any last-minute PCIC fixups */
-	pci_fixup_pcic();
+	pci_fixup_pcic(chan);
 
 	/* SH7780 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
-	pci_write_reg(word, SH4_PCICR);
+	pci_write_reg(chan, word, SH4_PCICR);
 
 	return 1;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 93adc71..d349611 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -109,6 +109,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pcic_init(struct sh4_pci_address_map *map);
+int sh7780_pcic_init(struct pci_channel *chan,
+		     struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7780_H_ */
-- 
cgit v0.10.2


From d0e3db40e2a1352aa2a2f425a7d4631bddc03d51 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:46:14 +0900
Subject: sh: add init member to pci_channel data

This patch adds an init callback to struct pci_channel and makes sure
it is initialized properly. Code is added to call this init function
from pcibios_init(). Return values are adjusted and a warning is is
printed if init fails.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-dreamcast/setup.c b/arch/sh/boards/mach-dreamcast/setup.c
index d1bee48..ebe9922 100644
--- a/arch/sh/boards/mach-dreamcast/setup.c
+++ b/arch/sh/boards/mach-dreamcast/setup.c
@@ -30,7 +30,6 @@
 
 extern struct irq_chip systemasic_int;
 extern void aica_time_init(void);
-extern int gapspci_init(void);
 extern int systemasic_irq_demux(int);
 
 static void __init dreamcast_setup(char **cmdline_p)
@@ -51,11 +50,6 @@ static void __init dreamcast_setup(char **cmdline_p)
 					 handle_level_irq);
 
 	board_time_init = aica_time_init;
-
-#ifdef CONFIG_PCI
-	if (gapspci_init() < 0)
-		printk(KERN_WARNING "GAPSPCI was not detected.\n");
-#endif
 }
 
 static struct sh_machine_vector mv_dreamcast __initmv = {
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index 38ef762..f4a5e14 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -77,7 +77,7 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
 }
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh5_pci_ops, NULL, NULL, 0, 0xff },
+	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index f5d2a2a..f62063e 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -42,15 +42,6 @@ static struct resource gapspci_mem_resource = {
 	.flags	= IORESOURCE_MEM,
 };
 
-static struct pci_ops gapspci_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ &gapspci_pci_ops, &gapspci_io_resource,
-	  &gapspci_mem_resource, 0, 1 },
-	{ 0, }
-};
-EXPORT_SYMBOL(board_pci_channels);
-
 /*
  * The !gapspci_config_access case really shouldn't happen, ever, unless
  * someone implicitly messes around with the last devfn value.. otherwise we
@@ -116,7 +107,7 @@ static struct pci_ops gapspci_pci_ops = {
  * gapspci init
  */
 
-int __init gapspci_init(void)
+static int __init gapspci_init(struct pci_channel *chan)
 {
 	char idbuf[16];
 	int i;
@@ -168,3 +159,10 @@ char * __devinit pcibios_setup(char *str)
 {
 	return str;
 }
+
+struct pci_channel board_pci_channels[] = {
+	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
+	  &gapspci_mem_resource, 0, 1 },
+	{ 0, }
+};
+EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index 343c072..c46911d9 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -30,7 +30,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{&sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
 	{NULL, NULL, NULL, 0, 0},
 };
 
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 8bff32a..f606df2 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh7751_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index bf32ee8..b51b7e4 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -43,7 +43,7 @@ static struct resource sh7780_mem_resource = {
 extern struct pci_ops sh7780_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index e4208a6..fe5a231 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -47,7 +47,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh7751_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 21d59d4..7277cd1 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -49,7 +49,7 @@ static struct resource sdk7780_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 78a6f2b..76a74fb 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -58,7 +58,7 @@ static struct resource se7780_mem_resource = {
 extern struct pci_ops se7780_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c
index e1703ff..0218135 100644
--- a/arch/sh/drivers/pci/ops-sh03.c
+++ b/arch/sh/drivers/pci/ops-sh03.c
@@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh4_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index cba8015..2e254c6 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -40,7 +40,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ 0, }
 };
 
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index 69fcc5c..ffa79bd 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -52,7 +52,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 7a97438..008a02e 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -27,6 +27,12 @@
 unsigned long pcicr_virt;
 unsigned long PCI_IO_AREA;
 
+int __init sh5_pci_init(struct pci_channel *chan)
+{
+	pr_debug("PCI: Starting intialization.\n");
+	return pcibios_init_platform();
+}
+
 /* Rounds a number UP to the nearest power of two. Used for
  * sizing the PCI window.
  */
diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h
index 7cff3fc..af09f38 100644
--- a/arch/sh/drivers/pci/pci-sh5.h
+++ b/arch/sh/drivers/pci/pci-sh5.h
@@ -108,6 +108,7 @@ extern unsigned long pcicr_virt;
 extern struct pci_ops sh5_pci_ops;
 
 /* arch/sh/drivers/pci/pci-sh5.c */
+int sh5_pci_init(struct pci_channel *chan);
 int sh5pci_init(unsigned long memStart, unsigned long memSize);
 
 #endif /* __PCI_SH5_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 9c2c014..230db8b 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -32,7 +32,7 @@
  * space mapping) will be called via the platform defined function
  * pcibios_init_platform().
  */
-static int __init sh7751_pci_init(void)
+int __init sh7751_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	int ret;
@@ -40,19 +40,18 @@ static int __init sh7751_pci_init(void)
 	pr_debug("PCI: Starting intialization.\n");
 
 	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(NULL, SH7751_PCICONF0);
+	id = pci_read_reg(chan, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
 	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
 		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
 		return -ENODEV;
 	}
 
-	if ((ret = sh4_pci_check_direct(NULL)) != 0)
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
-subsys_initcall(sh7751_pci_init);
 
 static int __init __area_sdram_check(struct pci_channel *chan,
 				     unsigned int area)
@@ -178,7 +177,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	}
 
 	if (!word)
-		return 0;
+		return -1;
 
 	/* configure the wait control registers */
 	word = ctrl_inl(SH7751_WCR1);
@@ -202,5 +201,5 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	return 1;
+	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 6f101e5..0ea4387 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -130,6 +130,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7751.c */
+int sh7751_pci_init(struct pci_channel *chan);
 int sh7751_pcic_init(struct pci_channel *chan,
 		     struct sh4_pci_address_map *map);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 56f673f..4706e88 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -45,7 +45,7 @@
  * space mapping) will be called via the platform defined function
  * pcibios_init_platform().
  */
-static int __init sh7780_pci_init(void)
+int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	int ret, match = 0;
@@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void)
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(NULL, SH7780_PCIVID);
+	id = pci_read_reg(chan, SH7780_PCIVID);
 	if ((id & 0xffff) == SH7780_VENDOR_ID) {
 		switch ((id >> 16) & 0xffff) {
 		case SH7763_DEVICE_ID:
@@ -82,12 +82,11 @@ static int __init sh7780_pci_init(void)
 		ctrl_outl(0x33333333, INTC_INTPRI);
 	}
 
-	if ((ret = sh4_pci_check_direct(NULL)) != 0)
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
-core_initcall(sh7780_pci_init);
 
 int __init sh7780_pcic_init(struct pci_channel *chan,
 			    struct sh4_pci_address_map *map)
@@ -153,5 +152,5 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	return 1;
+	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index d349611..2f3c920 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -109,6 +109,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
+int sh7780_pci_init(struct pci_channel *chan);
 int sh7780_pcic_init(struct pci_channel *chan,
 		     struct sh4_pci_address_map *map);
 
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 0d6ac7a1..29ec16e 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -28,18 +28,31 @@ static int __init pcibios_init(void)
 	struct pci_bus *bus;
 	int busno;
 
+	/* init channels */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->init(p) == 0)
+			p->enabled = 1;
+		else
+			pr_err("Unable to init pci channel %d\n", busno);
+		busno++;
+	}
+
 #ifdef CONFIG_PCI_AUTO
 	/* assign resources */
 	busno = 0;
-	for (p = board_pci_channels; p->pci_ops != NULL; p++)
-		busno = pciauto_assign_resources(busno, p) + 1;
+	for (p = board_pci_channels; p->init; p++)
+		if (p->enabled)
+			busno = pciauto_assign_resources(busno, p) + 1;
 #endif
 
 	/* scan the buses */
 	busno = 0;
-	for (p = board_pci_channels; p->pci_ops != NULL; p++) {
-		bus = pci_scan_bus(busno, p->pci_ops, p);
-		busno = bus->subordinate + 1;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->enabled) {
+			bus = pci_scan_bus(busno, p->pci_ops, p);
+			busno = bus->subordinate + 1;
+		}
 	}
 
 	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index df1d383..5c7a8f1 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -17,11 +17,13 @@
  * external) PCI controllers.
  */
 struct pci_channel {
+	int (*init)(struct pci_channel *chan);
 	struct pci_ops *pci_ops;
 	struct resource *io_resource;
 	struct resource *mem_resource;
 	int first_devfn;
 	int last_devfn;
+	int enabled;
 };
 
 /*
-- 
cgit v0.10.2


From 710fa3c81151948ac4d836ef52b57cef91b0ab72 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:47:23 +0900
Subject: sh: avoid using PCIBIOS_MIN_xxx

Replaces PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM with direct struct
pci_channel access. This allows us to have more than one pci
channel.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 5b25021..31f8dfb 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -37,7 +37,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 3f6754a..c295731 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -50,7 +50,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
index b8e735e..a968af7 100644
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ b/arch/sh/drivers/pci/fixups-se7780.c
@@ -51,7 +51,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 008a02e..7750da2 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -206,6 +206,9 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
 	return 0;
 }
 
+#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
+#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+
 void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev = bus->self;
@@ -223,9 +226,9 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 		/* For now, propagate host limits to the bus;
 		 * we'll adjust them later. */
 		bus->resource[0]->end = 64*1024 - 1 ;
-		bus->resource[1]->end = PCIBIOS_MIN_MEM+(256*1024*1024)-1;
-		bus->resource[0]->start = PCIBIOS_MIN_IO;
-		bus->resource[1]->start = PCIBIOS_MIN_MEM;
+		bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1;
+		bus->resource[0]->start = xPCIBIOS_MIN_IO;
+		bus->resource[1]->start = xPCIBIOS_MIN_MEM;
 
 		/* Turn off downstream PF memory address range by default */
 		bus->resource[2]->start = 1024*1024;
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 230db8b..447234c 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -144,22 +144,20 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
 	 */
-	word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK;
+	word = chan->mem_resource->start & SH4_PCIMBR_MASK;
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
 	pci_write_reg(chan, word , SH4_PCIMBR);
 
-	/* Map IO space into PCI IO window
-	 * The IO window is 64K-PCIBIOS_MIN_IO in size
-	 * IO addresses will be translated to the
-	 * PCI IO window base address
+	/* Map IO space into PCI IO window:
+	 * IO addresses will be translated to the PCI IO window base address
 	 */
 	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
-		 PCIBIOS_MIN_IO, (64 << 10),
-		 SH7751_PCI_IO_BASE + PCIBIOS_MIN_IO);
+		 chan->io_resource->start, chan->io_resource->end,
+		 SH7751_PCI_IO_BASE + chan->io_resource->start);
 
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
-	word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK;
+	word = chan->io_resource->start & SH4_PCIIOBR_MASK;
 	pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word);
 	pci_write_reg(chan, word, SH4_PCIIOBR);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 4706e88..e8f3a30 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -130,14 +130,12 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
 	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
-	/* Map IO space into PCI IO window
-	 * The IO window is 64K-PCIBIOS_MIN_IO in size
-	 * IO addresses will be translated to the
-	 * PCI IO window base address
+	/* Map IO space into PCI IO window:
+	 * IO addresses will be translated to the PCI IO window base address
 	 */
 	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
-		 PCIBIOS_MIN_IO, (64 << 10),
-		 SH7780_PCI_IO_BASE + PCIBIOS_MIN_IO);
+		 chan->io_resource->start, chan->io_resource->end,
+		 SH7780_PCI_IO_BASE + chan->io_resource->start);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
-- 
cgit v0.10.2


From b6706ef10f75921733d7275fd45d268f2f6254c8 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:34:55 +0900
Subject: sh: hook in struct pci_channel in sysdata

Store a struct pci_channel pointer in bus->sysdata. This makes whatever
struct pci_channel assigned to a bus available for sh4_pci_read() and
sh4_pci_write(). We also modify PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM to
use bus->sysdata - this to gives us support for multiple pci channels.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 92d27f7..ee62e6d 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -26,6 +26,7 @@ static DEFINE_SPINLOCK(sh4_pci_lock);
 static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 			   int where, int size, u32 *val)
 {
+	struct pci_channel *chan = bus->sysdata;
 	unsigned long flags;
 	u32 data;
 
@@ -34,8 +35,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 	 * so we must do byte alignment by hand
 	 */
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(NULL, SH4_PCIPDR);
+	pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(chan, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -63,13 +64,14 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 			 int where, int size, u32 val)
 {
+	struct pci_channel *chan = bus->sysdata;
 	unsigned long flags;
 	int shift;
 	u32 data;
 
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(NULL, SH4_PCIPDR);
+	pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(chan, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -90,7 +92,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
 
-	pci_write_reg(NULL, data, SH4_PCIPDR);
+	pci_write_reg(chan, data, SH4_PCIPDR);
 
 	return PCIBIOS_SUCCESSFUL;
 }
diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c
index cf48b12..1d715ec 100644
--- a/arch/sh/drivers/pci/pci-auto.c
+++ b/arch/sh/drivers/pci/pci-auto.c
@@ -67,6 +67,7 @@ static struct pci_dev *fake_pci_dev(struct pci_channel *hose,
 	dev.devfn = devfn;
 	bus.number = busnr;
 	bus.ops = hose->pci_ops;
+	bus.sysdata = hose;
 
 	if(busnr != top_bus)
 		/* Fake a parent bus structure. */
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5c7a8f1..386587e 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -31,8 +31,10 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
-#define PCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define PCIBIOS_MIN_MEM		board_pci_channels->mem_resource->start
+/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */
+#define __PCI_CHAN(bus)		((struct pci_channel *)bus->sysdata)
+#define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
+#define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
 
 /*
  * I/O routine helpers
-- 
cgit v0.10.2


From e4c6a3604e07185046e2ce4be82a201f4447d788 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:04 +0900
Subject: sh: add reg_base member to pci_channel

Store the base address of the pci host controller registers in struct
pci_channel and use the address in pci_read_reg() and pci_write_reg().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index ee62e6d..540683d 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -121,8 +121,8 @@ int __init sh4_pci_check_direct(struct pci_channel *chan)
 		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
 			pci_write_reg(chan, tmp, SH4_PCIPAR);
 			printk(KERN_INFO "PCI: Using configuration type 1\n");
-			request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1");
-
+			request_region(chan->reg_base + SH4_PCIPAR, 8,
+				       "PCI conf1");
 			return 0;
 		}
 
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index 62ba350..90abfe3 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -171,13 +171,13 @@ struct sh4_pci_address_map {
 static inline void pci_write_reg(struct pci_channel *chan,
 				 unsigned long val, unsigned long reg)
 {
-	ctrl_outl(val, PCI_REG(reg));
+	ctrl_outl(val, chan->reg_base + reg);
 }
 
 static inline unsigned long pci_read_reg(struct pci_channel *chan,
 					 unsigned long reg)
 {
-	return ctrl_inl(PCI_REG(reg));
+	return ctrl_inl(chan->reg_base + reg);
 }
 
 #endif /* __PCI_SH4_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 447234c..201266b 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -39,6 +39,8 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 
 	pr_debug("PCI: Starting intialization.\n");
 
+	chan->reg_base = 0xfe200000;
+
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 0ea4387..c390dd2 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -26,7 +26,6 @@
 #define SH7751_PCI_IO_SIZE           0x40000     /* Size of IO window */
 
 #define SH7751_PCIREG_BASE           0xFE200000  /* PCI regs base address */
-#define PCI_REG(n)                  (SH7751_PCIREG_BASE+ n)
 
 #define SH7751_PCICONF0            0x0           /* PCI Config Reg 0 */
   #define SH7751_PCICONF0_DEVID      0xFFFF0000  /* Device ID */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index e8f3a30..9d6483a 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -52,6 +52,8 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 
 	pr_debug("PCI: Starting intialization.\n");
 
+	chan->reg_base = 0xfe040000;
+
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 2f3c920..fffcf1d 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -35,7 +35,6 @@
 #define SH7780_PCI_IO_SIZE	0x00400000	/* Size of IO window */
 
 #define SH7780_PCIREG_BASE	0xFE040000	/* PCI regs base address */
-#define PCI_REG(n)		(SH7780_PCIREG_BASE+n)
 
 /* SH7780 PCI Config Registers */
 #define SH7780_PCIVID		0x000		/* Vendor ID */
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 386587e..8e9e0ed 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -24,6 +24,7 @@ struct pci_channel {
 	int first_devfn;
 	int last_devfn;
 	int enabled;
+	unsigned long reg_base;
 };
 
 /*
-- 
cgit v0.10.2


From ef53fdeb7e0cb139aff33665635b886700137abb Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:14 +0900
Subject: sh: add io_base member to pci_channel

Store the io window base address in struct pci_channel and use that one
instead of SH77xx_PCI_IO_BASE.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 201266b..2a6c7aa 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,6 +40,7 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe200000;
+	chan->io_base = 0xfe240000;
 
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
@@ -153,9 +154,9 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Map IO space into PCI IO window:
 	 * IO addresses will be translated to the PCI IO window base address
 	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
+	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
 		 chan->io_resource->start, chan->io_resource->end,
-		 SH7751_PCI_IO_BASE + chan->io_resource->start);
+		 chan->io_base + chan->io_resource->start);
 
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 9d6483a..87a7f3b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -53,6 +53,7 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
+	chan->io_base = 0xfe200000;
 
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
@@ -135,9 +136,9 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	/* Map IO space into PCI IO window:
 	 * IO addresses will be translated to the PCI IO window base address
 	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
+	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
 		 chan->io_resource->start, chan->io_resource->end,
-		 SH7780_PCI_IO_BASE + chan->io_resource->start);
+		 chan->io_base + chan->io_resource->start);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 8e9e0ed..84d12eb 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -25,6 +25,7 @@ struct pci_channel {
 	int last_devfn;
 	int enabled;
 	unsigned long reg_base;
+	unsigned long io_base;
 };
 
 /*
-- 
cgit v0.10.2


From ef339f241b08a16af58897e6288ba200e0c7a8c7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:22 +0900
Subject: sh: pci memory range checking code

This patch changes the code to use __is_pci_memory() instead of
is_pci_memaddr(). __is_pci_memory() loops through all the pci
channels on the system to match memory windows.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c
index 4badad4..053b3ed 100644
--- a/arch/sh/boards/mach-titan/io.c
+++ b/arch/sh/boards/mach-titan/io.c
@@ -117,7 +117,7 @@ void titan_outsl(unsigned long port, const void *src, unsigned long count)
 
 void __iomem *titan_ioport_map(unsigned long port, unsigned int size)
 {
-	if (PXSEG(port) || is_pci_memaddr(port))
+	if (PXSEG(port))
 		return (void __iomem *)port;
 	else if (is_pci_ioaddr(port))
 		return (void __iomem *)pci_ioaddr(port);
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 29ec16e..b9aca54 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -167,9 +167,8 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	/*
 	 * Presently the IORESOURCE_MEM case is a bit special, most
 	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired
-	 * (typically at 0xfd000000, but is_pci_memaddr() will know
-	 * best). With the IORESOURCE_MEM case more care has to be taken
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
 	 * to inhibit page table mapping for legacy cores, but this is
 	 * punted off to __ioremap().
 	 *					-- PFM.
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 84d12eb..ccf5c5f 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -61,12 +61,8 @@ extern unsigned long PCI_IO_AREA;
 #define is_pci_ioaddr(port)		\
 	(((port) >= PCIBIOS_MIN_IO) &&	\
 	 ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE)))
-#define is_pci_memaddr(port)		\
-	(((port) >= PCIBIOS_MIN_MEM) &&	\
-	 ((port) < (PCIBIOS_MIN_MEM + PCI_MEM_SIZE)))
 #else
 #define is_pci_ioaddr(port)	(0)
-#define is_pci_memaddr(port)	(0)
 #endif
 
 struct pci_dev;
@@ -127,6 +123,25 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+
+static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
+{
+	struct pci_channel *p;
+	struct resource *res;
+
+	for (p = board_pci_channels; p->init; p++) {
+		res = p->mem_resource;
+		if (p->enabled && (phys_addr >= res->start) &&
+		    (phys_addr + size) <= (res->end + 1))
+			return 1;
+	}
+	return 0;
+}
+#else
+static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
+{
+	return 0;
+}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c
index 60cc486..7e04cc8 100644
--- a/arch/sh/mm/ioremap_32.c
+++ b/arch/sh/mm/ioremap_32.c
@@ -56,7 +56,7 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	 * P1/P2 space, ioremap() will already do the right thing,
 	 * and we'll never get this far.
 	 */
-	if (is_pci_memaddr(phys_addr) && is_pci_memaddr(last_addr))
+	if (__is_pci_memory(phys_addr, size))
 		return (void __iomem *)phys_addr;
 
 #if !defined(CONFIG_PMB_FIXED)
@@ -121,7 +121,7 @@ void __iounmap(void __iomem *addr)
 	unsigned long seg = PXSEG(vaddr);
 	struct vm_struct *p;
 
-	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || is_pci_memaddr(vaddr))
+	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0))
 		return;
 
 #ifdef CONFIG_PMB
-- 
cgit v0.10.2


From 8ce0143b11cdc519b8e1fd94a262b654ef0bc3ab Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:31 +0900
Subject: sh: pci io port base address code

Adds a __get_pci_io_base() function which is used to match a port range
against struct pci_channel. This allows us to detect if a port range is
assigned to pci or happens to be legacy port io. While at it, remove unused
cpu-specific cruft.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index fe5a231..58ed1d5 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -68,7 +68,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	__set_io_port_base(SH7751_PCI_IO_BASE);
 	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index ccf5c5f..bb2c2fc 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -38,33 +38,6 @@ extern struct pci_channel board_pci_channels[];
 #define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
 #define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
 
-/*
- * I/O routine helpers
- */
-#if defined(CONFIG_CPU_SUBTYPE_SH7780) || defined(CONFIG_CPU_SUBTYPE_SH7785)
-#define PCI_IO_AREA		0xFE400000
-#define PCI_IO_SIZE		0x00400000
-#elif defined(CONFIG_CPU_SH5)
-extern unsigned long PCI_IO_AREA;
-#define PCI_IO_SIZE		0x00010000
-#else
-#define PCI_IO_AREA		0xFE240000
-#define PCI_IO_SIZE		0x00040000
-#endif
-
-#define PCI_MEM_SIZE		0x01000000
-
-#define SH4_PCIIOBR_MASK	0xFFFC0000
-#define pci_ioaddr(addr)	(PCI_IO_AREA + (addr & ~SH4_PCIIOBR_MASK))
-
-#if defined(CONFIG_PCI)
-#define is_pci_ioaddr(port)		\
-	(((port) >= PCIBIOS_MIN_IO) &&	\
-	 ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE)))
-#else
-#define is_pci_ioaddr(port)	(0)
-#endif
-
 struct pci_dev;
 
 extern void pcibios_set_master(struct pci_dev *dev);
@@ -137,11 +110,31 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 	}
 	return 0;
 }
+
+static inline void __iomem *__get_pci_io_base(unsigned long port,
+					      unsigned long size)
+{
+	struct pci_channel *p;
+	struct resource *res;
+
+	for (p = board_pci_channels; p->init; p++) {
+		res = p->io_resource;
+		if (p->enabled && (port >= res->start) &&
+		    (port + size) <= (res->end + 1))
+			return (void __iomem *)(p->io_base + port);
+	}
+	return NULL;
+}
 #else
 static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 {
 	return 0;
 }
+static inline void __iomem *__get_pci_io_base(unsigned long port,
+					      unsigned long size)
+{
+	return NULL;
+}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
index 29cf458..59fb020 100644
--- a/arch/sh/kernel/io.c
+++ b/arch/sh/kernel/io.c
@@ -12,6 +12,7 @@
  * for more details.
  */
 #include <linux/module.h>
+#include <linux/pci.h>
 #include <asm/machvec.h>
 #include <asm/io.h>
 
@@ -69,6 +70,10 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr)
 	if (ret)
 		return ret;
 
+	ret = __get_pci_io_base(port, nr);
+	if (ret)
+		return ret;
+
 	return __ioport_map(port, nr);
 }
 EXPORT_SYMBOL(ioport_map);
-- 
cgit v0.10.2


From aa5d3ff99cc1f3dfe262ab9dd9179131fcfe39fd Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:40 +0900
Subject: sh: export board_pci_channels in one place

Instead of sometimes exporting board_pci_channels[] in the board specific
code just export it in one place.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index f4a5e14..cbaaf02 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -80,7 +80,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 int __init pcibios_init_platform(void)
 {
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index f606df2..781496b 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -43,8 +43,6 @@ struct pci_channel board_pci_channels[] = {
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
-EXPORT_SYMBOL(board_pci_channels);
-
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
 		.base	= SH7751_CS3_BASE_ADDR,
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index b51b7e4..c58f1cf 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -46,7 +46,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index 58ed1d5..d374cd3 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -50,7 +50,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 7277cd1..b34fbc5 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -52,7 +52,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sdk7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 76a74fb..4730207 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -61,7 +61,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map se7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index ffa79bd..31ed037 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -55,7 +55,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index b9aca54..8aaacc9 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -187,3 +187,5 @@ void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
 	iounmap(addr);
 }
 EXPORT_SYMBOL(pci_iounmap);
+
+EXPORT_SYMBOL(board_pci_channels);
-- 
cgit v0.10.2


From 10591578c84825a320734e59272f161385edcc41 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 15:58:04 +0900
Subject: sh: drop duplicate symbol export on dreamcast and sh7785lcr.

With board_pci_channels now being exported in a single place, update the
boards that duplicated the export.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index f62063e..5bc81d5 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -165,4 +165,3 @@ struct pci_channel board_pci_channels[] = {
 	  &gapspci_mem_resource, 0, 1 },
 	{ 0, }
 };
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index fb0869f..2a9f9a5 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -44,7 +44,6 @@ struct pci_channel board_pci_channels[] = {
 	{ &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7785_pci_map = {
 	.window0	= {
-- 
cgit v0.10.2


From 3aabce8d3d2f9af2c08c2f590ac9acb272ca8c95 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 16:12:39 +0900
Subject: sh: sh7785lcr: Update for recent PCI changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c
index 4949e60..9e7dc79 100644
--- a/arch/sh/drivers/pci/fixups-sh7785lcr.c
+++ b/arch/sh/drivers/pci/fixups-sh7785lcr.c
@@ -15,32 +15,33 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
-	pci_write_reg(0x000043ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(0x00000000, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
 
-	pci_write_reg(0x00011912, SH7780_PCISVID);
-	pci_write_reg(0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(0x0000000f, SH7780_PCICSAR1);
+	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
+	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
+	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(0x00fc0000, SH7780_PCIMBMR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
 
 #ifdef CONFIG_32BIT
-	pci_write_reg(0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1),
 		      SH7780_PCIIOBR);
-	pci_write_reg(((SH7780_PCI_IO_SIZE - 1) & (7 << 18)), SH7780_PCIIOBMR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)),
+		      SH7780_PCIIOBMR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index 2a9f9a5..afbb9bd 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -41,7 +41,7 @@ static struct resource sh7785_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
@@ -61,5 +61,5 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&sh7785_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map);
 }
-- 
cgit v0.10.2


From f1a9ba8f15f89f3f444985251efaf809cd16da53 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 16:17:53 +0900
Subject: sh: pci: drop duplicate PCIC fixups for SE7780 and SH7785LCR.

SE7780 has the same PCIC fixup as SDK7780, and SH7785LCR the same
as R7780RP. Switch to using those, and drop the duplicate code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 847e9089..67d710a 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -21,6 +21,6 @@ obj-$(CONFIG_SH_SDK7780)		+= ops-sdk7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
-obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-se7780.o
+obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
-obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-sh7785lcr.o
+obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
deleted file mode 100644
index a968af7..0000000
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-se7780.c
- *
- * HITACHI UL Solution Engine 7780  PCI fixups
- *
- * Copyright (C) 2003  Lineo uSolutions, Inc.
- * Copyright (C) 2004 - 2006  Paul Mundt
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-#include <asm/io.h>
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
-
-	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
-
-	/* Set up standard PCI config registers */
-	ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS));
-	ctrl_outw(0x0047, PCI_REG(SH7780_PCICMD));
-	ctrl_outb(  0x00, PCI_REG(SH7780_PCIPIF));
-	ctrl_outb(  0x00, PCI_REG(SH7780_PCISUB));
-	ctrl_outb(  0x06, PCI_REG(SH7780_PCIBCC));
-	ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID));
-	ctrl_outw(0x0001, PCI_REG(SH7780_PCISID));
-
-	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);     /* PCI */
-	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);     /* SHwy */
-	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */
-
-	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
-
-	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
-
-	/*
-	 * Set the MBR so PCI address is one-to-one with window,
-	 * meaning all calls go straight through... use ifdef to
-	 * catch erroneous assumption.
-	 */
-	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
-
-	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
-		      SH7780_PCIIOBMR);
-
-	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
-
-	return 0;
-}
diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c
deleted file mode 100644
index 9e7dc79..0000000
--- a/arch/sh/drivers/pci/fixups-sh7785lcr.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-sh7785lcr.c
- *
- * R0P7785LC0011RL PCI fixups
- * Copyright (C) 2008  Yoshihiro Shimoda
- *
- * Based on arch/sh/drivers/pci/fixups-r7780rp.c
- * Copyright (C) 2003  Lineo uSolutions, Inc.
- * Copyright (C) 2004 - 2006  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-
-	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
-
-	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
-	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
-
-	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
-
-#ifdef CONFIG_32BIT
-	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
-#endif
-
-	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)),
-		      SH7780_PCIIOBMR);
-
-	return 0;
-}
-- 
cgit v0.10.2


From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001
From: Shawn Du <duyuyang@gmail.com>
Date: Tue, 14 Apr 2009 13:58:56 +0800
Subject: blktrace: support per-partition tracing

Though one can specify '-d /dev/sda1' when using blktrace, it still
traces the whole sda.

To support per-partition tracing, when we start tracing, we initialize
bt->start_lba and bt->end_lba to the start and end sector of that
partition.

Note some actions are per device, thus we don't filter 0-sector events.

The original patch and discussion can be found here:
	http://marc.info/?l=linux-btrace&m=122949374214540&w=2

Signed-off-by: Shawn Du <duyuyang@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42620.4050701@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f87615d..f8c218c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
 	memcpy(&buts.name, &cbuts.name, 32);
 
 	mutex_lock(&bdev->bd_mutex);
-	ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts);
+	ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
 	mutex_unlock(&bdev->bd_mutex);
 	if (ret)
 		return ret;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 82312df..49c9873 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return blk_trace_setup(sdp->device->request_queue,
 				       sdp->disk->disk_name,
 				       MKDEV(SCSI_GENERIC_MAJOR, sdp->index),
+				       NULL,
 				       (char *)arg);
 	case BLKTRACESTART:
 		return blk_trace_startstop(sdp->device->request_queue, 1);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index d960889..267edc4 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -165,8 +165,9 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q,
-	char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern int do_blk_trace_setup(struct request_queue *q, char *name,
+			      dev_t dev, struct block_device *bdev,
+			      struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
 /**
@@ -193,6 +194,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
 				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			   struct block_device *bdev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
@@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q);
 extern struct attribute_group blk_trace_attr_group;
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
-#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
-#define blk_trace_shutdown(q)			do { } while (0)
-#define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
-#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
-#define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
-#define blk_trace_startstop(q, start)		(-ENOTTY)
-#define blk_trace_remove(q)			(-ENOTTY)
-#define blk_add_trace_msg(q, fmt, ...)		do { } while (0)
-
+# define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+# define blk_trace_shutdown(q)				do { } while (0)
+# define do_blk_trace_setup(q, name, dev, bdev, buts)	(-ENOTTY)
+# define blk_add_driver_data(q, rq, data, len)		do {} while (0)
+# define blk_trace_setup(q, name, dev, bdev, arg)	(-ENOTTY)
+# define blk_trace_startstop(q, start)			(-ENOTTY)
+# define blk_trace_remove(q)				(-ENOTTY)
+# define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2b98195..e932654 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 {
 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 		return 1;
-	if (sector < bt->start_lba || sector > bt->end_lba)
+	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
 		return 1;
 	if (bt->pid && pid != bt->pid)
 		return 1;
@@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(rw, DISCARD);
 
 	pid = tsk->pid;
-	if (unlikely(act_log_check(bt, what, sector, pid)))
+	if (act_log_check(bt, what, sector, pid))
 		return;
 	cpu = raw_smp_processor_id();
 
@@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = {
  * Setup everything required to start tracing
  */
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-			struct blk_user_trace_setup *buts)
+		       struct block_device *bdev,
+		       struct blk_user_trace_setup *buts)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
+	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	bt->start_lba = buts->start_lba;
-	bt->end_lba = buts->end_lba;
-	if (!bt->end_lba)
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else
 		bt->end_lba = -1ULL;
 
+	/* overwrite with user settings */
+	if (buts->start_lba)
+		bt->start_lba = buts->start_lba;
+	if (buts->end_lba)
+		bt->end_lba = buts->end_lba;
+
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
@@ -505,6 +517,7 @@ err:
 }
 
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
 		    char __user *arg)
 {
 	struct blk_user_trace_setup buts;
@@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (ret)
 		return -EFAULT;
 
-	ret = do_blk_trace_setup(q, name, dev, &buts);
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 	if (ret)
 		return ret;
 
@@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 	case BLKTRACESTART:
 		start = 1;
-- 
cgit v0.10.2


From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 13:59:34 +0800
Subject: blktrace: support per-partition tracing for ftrace plugin

The previous patch adds support to trace a single partition for
relay+ioctl blktrace, and this patch is for ftrace plugin blktrace:

  # echo 1 > /sys/block/sda/sda7/enable
  # cat start_lba
  102398373
  # cat end_lba
  102703545

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42646.4060608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e932654..d109898 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = {
 	.remove_buf_file	= blk_remove_buf_file_callback,
 };
 
+static void blk_trace_setup_lba(struct blk_trace *bt,
+				struct block_device *bdev)
+{
+	struct hd_struct *part = NULL;
+
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else {
+		bt->start_lba = 0;
+		bt->end_lba = -1ULL;
+	}
+}
+
 /*
  * Setup everything required to start tracing
  */
@@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
-	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	if (bdev)
-		part = bdev->bd_part;
-
-	if (part) {
-		bt->start_lba = part->start_sect;
-		bt->end_lba = part->start_sect + part->nr_sects;
-	} else
-		bt->end_lba = -1ULL;
+	blk_trace_setup_lba(bt, bdev);
 
 	/* overwrite with user settings */
 	if (buts->start_lba)
@@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 /*
  * Setup everything required to start tracing
  */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+				 struct block_device *bdev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	int ret = -ENOMEM;
@@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (!bt->msg_data)
 		goto free_bt;
 
-	bt->dev = dev;
+	bt->dev = bdev->bd_dev;
 	bt->act_mask = (u16)-1;
-	bt->end_lba = -1ULL;
+
+	blk_trace_setup_lba(bt, bdev);
 
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
@@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	if (attr == &dev_attr_enable) {
 		if (value)
-			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+			ret = blk_trace_setup_queue(q, bdev);
 		else
 			ret = blk_trace_remove_queue(q);
 		goto out_unlock_bdev;
@@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	ret = 0;
 	if (q->blk_trace == NULL)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		ret = blk_trace_setup_queue(q, bdev);
 
 	if (ret == 0) {
 		if (attr == &dev_attr_act_mask)
-- 
cgit v0.10.2


From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 14:00:05 +0800
Subject: blktrace: add trace/ to /sys/block/sda

Impact: allow ftrace-plugin blktrace to trace device-mapper devices

To trace a single partition:
  # echo 1 > /sys/block/sda/sda1/enable

To trace the whole sda instead:
  # echo 1 > /sys/block/sda/enable

Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace
can't be used to trace device-mapper devices.

Now:

  # echo 1 > /sys/block/dm-0/trace/enable
  echo: write error: No such device or address
  # mount -t ext4 /dev/dm-0 /mnt
  # echo 1 > /sys/block/dm-0/trace/enable
  # echo blk > /debug/tracing/current_tracer

Reported-by: Theodore Tso <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42665.6020506@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 73f36be..8653d71 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = {
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
+	struct device *dev = disk_to_dev(disk);
 
 	struct request_queue *q = disk->queue;
 
 	if (WARN_ON(!q))
 		return -ENXIO;
 
+	ret = blk_trace_init_sysfs(dev);
+	if (ret)
+		return ret;
+
 	if (!q->request_fn)
 		return 0;
 
-	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj),
 			  "%s", "queue");
 	if (ret < 0)
 		return ret;
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 267edc4..62763c9 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
+extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
 
@@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_startstop(q, start)			(-ENOTTY)
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
+static inline int blk_trace_init_sysfs(struct device *dev)
+{
+	return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif /* __KERNEL__ */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d109898..8e7c5da 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1644,3 +1644,8 @@ out:
 	return ret ? ret : count;
 }
 
+int blk_trace_init_sysfs(struct device *dev)
+{
+	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
-- 
cgit v0.10.2


From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 15 Apr 2009 11:02:56 +0800
Subject: blktrace: fix context-info when mixed-using blk tracer and trace
 events

When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is
unset, but actually context-info is printed:

    pdflush-431   [000]   821.181576:   8,0    P   N [pdflush]

And then if we enable TRACE_ITER_CONTEXT_INFO:

    # echo context-info > trace_options

We'll see context-info printed twice. What's worse, when we use blk
tracer and trace events at the same time, we'll see no context-info
for trace events at all:

    jbd2_commit_logging: dev dm-0:8 transaction 333227
    jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814
      rm-25433 [001]  9578.307485:   8,18   m   N cfq25433 slice expired t=0
      rm-25433 [001]  9578.307486:   8,18   m   N cfq25433 put_queue

This patch adds blk_tracer->set_flags(), and context-info flag is unset
only when we set the output to classic mode.

Note after this patch, one should unset context-info explicitly if he
wants to get binary output that can be parsed by blkparse:

    # echo nocontext-info > trace_options
    # echo bin > trace_options
    # echo blk > current_tracer
    # cat trace_pipe | blkparse -i -

Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E54E60.50408@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8e7c5da..c32062b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 static void blk_tracer_start(struct trace_array *tr)
 {
 	blk_tracer_enabled = true;
-	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
 static int blk_tracer_init(struct trace_array *tr)
@@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	blk_tracer_enabled = false;
-	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
@@ -1289,9 +1287,6 @@ out:
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 					       int flags)
 {
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	return print_one_line(iter, false);
 }
 
@@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	return print_one_line(iter, true);
 }
 
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+	/* don't output context-info for blk_classic output */
+	if (bit == TRACE_BLK_OPT_CLASSIC) {
+		if (set)
+			trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+		else
+			trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	}
+	return 0;
+}
+
 static struct tracer blk_tracer __read_mostly = {
 	.name		= "blk",
 	.init		= blk_tracer_init,
@@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = {
 	.print_header	= blk_tracer_print_header,
 	.print_line	= blk_tracer_print_line,
 	.flags		= &blk_tracer_flags,
+	.set_flag	= blk_tracer_set_flag,
 };
 
 static struct trace_event trace_blk_event = {
-- 
cgit v0.10.2


From 0232ba9ce031d0fd8f331fa8b3c00e16901f54e6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 18:01:31 +0900
Subject: sh: pci: Kill off unused SH4_PCIC_NO_RESET code.

Nothing ended up using this anymore, so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index c46911d9..178b778 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -39,8 +39,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= (64 << 20),	/* 64MB */
 	},
-
-	.flags = SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 781496b..91cabd8 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -48,11 +48,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.window1	= {
-		.base	= 0x00000000,	/* Unused */
-		.size	= 0x00000000,	/* Unused */
-	},
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index c58f1cf..8ec6d22 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -57,8 +57,6 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 		.base	= SH7780_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index d374cd3..96b916c 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -56,13 +56,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-
-	.window1	= {
-		.base	= 0x00000000,	/* Unused */
-		.size	= 0x00000000,	/* Unused */
-	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index b34fbc5..6a0b7c0 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -62,7 +62,6 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 		.base	= SH7780_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 4730207..583b8e8 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -67,7 +67,6 @@ static struct sh4_pci_address_map se7780_pci_map = {
 		.base	= SH7780_CS2_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.flags  = SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index afbb9bd..ab0d1de 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -55,8 +55,6 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 		.size	= 0x20000000,
 #endif
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index 2e254c6..dd2c5df 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -54,8 +54,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS2_BASE_ADDR,
 		.size	= SNAPGEAR_LSR1_SIZE,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 /*
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index 31ed037..e45bb62 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -66,8 +66,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS2_BASE_ADDR,
 		.size	= SH7751_MEM_REGION_SIZE*2,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index 90abfe3..3d5296c 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -149,9 +149,6 @@
   #define SH4_PCIPDTR_PB0	  0x000000001	/* Port 0 Enable */
 #define SH4_PCIPDR		0x220		/* Port IO Data Register */
 
-/* Flags */
-#define SH4_PCIC_NO_RESET	0x0001
-
 /* arch/sh/kernel/drivers/pci/ops-sh4.c */
 extern struct pci_ops sh4_pci_ops;
 int sh4_pci_check_direct(struct pci_channel *chan);
@@ -165,7 +162,6 @@ struct sh4_pci_address_space {
 struct sh4_pci_address_map {
 	struct sh4_pci_address_space window0;
 	struct sh4_pci_address_space window1;
-	unsigned long flags;
 };
 
 static inline void pci_write_reg(struct pci_channel *chan,
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 2a6c7aa..af887443 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -99,21 +99,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0;
 	pci_write_reg(chan, word, SH4_PCIPINT);
 
-	/*
-	 * This code is unused for some boards as it is done in the
-	 * bootloader and doing it here means the MAC addresses loaded
-	 * by the bootloader get lost.
-	 */
-	if (!(map->flags & SH4_PCIC_NO_RESET)) {
-		/* toggle PCI reset pin */
-		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(chan, word, SH4_PCICR);
-		/* Wait for a long time... not 1 sec. but long enough */
-		mdelay(100);
-		word = SH4_PCICR_PREFIX;
-		pci_write_reg(chan, word, SH4_PCICR);
-	}
-
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 87a7f3b..282cabe 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -96,21 +96,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
-	/*
-	 * This code is unused for some boards as it is done in the
-	 * bootloader and doing it here means the MAC addresses loaded
-	 * by the bootloader get lost.
-	 */
-	if (!(map->flags & SH4_PCIC_NO_RESET)) {
-		/* toggle PCI reset pin */
-		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(chan, word, SH4_PCICR);
-		/* Wait for a long time... not 1 sec. but long enough */
-		mdelay(100);
-		word = SH4_PCICR_PREFIX;
-		pci_write_reg(chan, word, SH4_PCICR);
-	}
-
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
-- 
cgit v0.10.2


From d1f0ae5e2e45e74cff4c3bdefb0fc77608cdfeec Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 15 Apr 2009 21:34:55 +0200
Subject: x86: standardize Kbuild rules

Introducing this Kbuild file allow us to:

    make arch/x86/

And thus building all the core part of x86.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 0000000..ad8ec35
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f05d8c9..e81f0b2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
 
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
 
 # drivers-y are linked after core-y
 drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
-- 
cgit v0.10.2


From 84971bb401866d79c6b353cb1d8861c2b4621867 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 12:44:25 +0900
Subject: sh: pci: Kill off useless debugging printk() in pci-sh7780 init.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 282cabe..b826c7b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -118,18 +118,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
 	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
-	/* Map IO space into PCI IO window:
-	 * IO addresses will be translated to the PCI IO window base address
-	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
-		 chan->io_resource->start, chan->io_resource->end,
-		 chan->io_base + chan->io_resource->start);
-
-	/* NOTE: I'm ignoring the PCI error IRQs for now..
-	 * TODO: add support for the internal error interrupts and
-	 * DMA interrupts...
-	 */
-
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
 
-- 
cgit v0.10.2


From b627b4ed3d056c5d00e8f3cb32d033b0ee6619a9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 13:00:18 +0900
Subject: sh: pci: Move se7780 INTC fixups out of pci-sh7780.c.

These fixups belong in the board INTC setup code, not in the middle of
pci-sh7780.c.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c
index 66ad292..44b61a5 100644
--- a/arch/sh/boards/mach-se/7780/irq.c
+++ b/arch/sh/boards/mach-se/7780/irq.c
@@ -12,10 +12,13 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
-#include <asm/irq.h>
-#include <asm/io.h>
+#include <linux/irq.h>
+#include <linux/io.h>
 #include <mach-se/mach/se7780.h>
 
+#define INTC_BASE	0xffd00000
+#define INTC_ICR1	(INTC_BASE+0x1c)
+
 /*
  * Initialize IRQ setting
  */
@@ -43,4 +46,7 @@ void __init init_se7780_IRQ(void)
 	ctrl_outw((IRQPIN_PCCPW << IRQPOS_PCCPW), FPGA_INTSEL3);
 
 	plat_irq_setup_pins(IRQ_MODE_IRQ); /* install handlers for IRQ0-7 */
+
+	/* ICR1: detect low level(for 2ndcut) */
+	ctrl_outl(0xAAAA0000, INTC_ICR1);
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index b826c7b..45fa423 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,20 +22,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-#define INTC_BASE	0xffd00000
-#define INTC_ICR0	(INTC_BASE+0x0)
-#define INTC_ICR1	(INTC_BASE+0x1c)
-#define INTC_INTPRI	(INTC_BASE+0x10)
-#define INTC_INTREQ	(INTC_BASE+0x24)
-#define INTC_INTMSK0	(INTC_BASE+0x44)
-#define INTC_INTMSK1	(INTC_BASE+0x48)
-#define INTC_INTMSK2	(INTC_BASE+0x40080)
-#define INTC_INTMSKCLR0	(INTC_BASE+0x64)
-#define INTC_INTMSKCLR1	(INTC_BASE+0x68)
-#define INTC_INTMSKCLR2	(INTC_BASE+0x40084)
-#define INTC_INT2MSKR	(INTC_BASE+0x40038)
-#define INTC_INT2MSKCR	(INTC_BASE+0x4003c)
-
 /*
  * Initialization. Try all known PCI access methods. Note that we support
  * using both PCI BIOS and direct access: in such cases, we use I/O ports
@@ -75,16 +61,6 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 		return -ENODEV;
 	}
 
-	/* Setup the INTC */
-	if (mach_is_7780se()) {
-		/* ICR0: IRL=use separately */
-		ctrl_outl(0x00C00020, INTC_ICR0);
-		/* ICR1: detect low level(for 2ndcut) */
-		ctrl_outl(0xAAAA0000, INTC_ICR1);
-		/* INTPRI: priority=3(all) */
-		ctrl_outl(0x33333333, INTC_INTPRI);
-	}
-
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
-- 
cgit v0.10.2


From 7e4ba0d77c96d328ba968ddff4a464d4d2fa7abc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 14:07:57 +0900
Subject: sh: pci: Prefer P1SEG over P1SEGADDR for CONFIG_CMD.

P1SEGADDR is obsolete and will be killed off completely in the future,
so transition off of it and reference P1SEG explicitly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 540683d..2a7f7b5 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -1,22 +1,22 @@
 /*
  * Generic SH-4 / SH-4A PCIC operations (SH7751, SH7780).
  *
- * Copyright (C) 2002 - 2006  Paul Mundt
+ * Copyright (C) 2002 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License v2. See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include <asm/addrspace.h>
-#include <asm/io.h>
 #include "pci-sh4.h"
 
 /*
  * Direct access to PCI hardware...
  */
 #define CONFIG_CMD(bus, devfn, where) \
-	P1SEGADDR((bus->number << 16) | (devfn << 8) | (where & ~3))
+	(P1SEG | (bus->number << 16) | (devfn << 8) | (where & ~3))
 
 static DEFINE_SPINLOCK(sh4_pci_lock);
 
-- 
cgit v0.10.2


From 0bbc9bc3189f24de946777af43c9033c8c4871e4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 14:09:09 +0900
Subject: sh: pci: Set class/sub-class code correctly for SH7780 PCIC.

The SH7780 PCI host controller implements a configuration header that
requires a fair bit of hand-holding to initialize properly. By default
it appears as a pre-2.0 host controller given the zeroed out class code,
so fix this up properly.

Some boards that happened to be using the R7780RP version of the PCIC
fixups had set this correctly, but this belongs in the standard
initialization, and is by no means board specific.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index c295731..004efd4 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -16,8 +16,6 @@
 
 int pci_fixup_pcic(struct pci_channel *chan)
 {
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
-
 	/* Enable all interrupts, so we know what to fix */
 	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
 	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
@@ -26,8 +24,6 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
 	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
 	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
-	pci_write_reg(chan, 0x00, SH7780_PCISUB);
-	pci_write_reg(chan, 0x06, SH7780_PCIBCC);
 	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
 	pci_write_reg(chan, 0x0001, SH7780_PCISID);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 45fa423..7f4f590 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -72,16 +72,15 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
+	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC);
+	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB);
+
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
 	 */
 	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
 
-	/* define this host as the host bridge */
-	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(chan, word, SH7780_PCIRID);
-
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-- 
cgit v0.10.2


From 4e7b7fdb129995640f144b7de114e109c6b46a2a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 15:05:19 +0900
Subject: sh: pci: Rework SH7780 host controller detection.

This reworks how the host controller is probed, and makes it a bit more
verbose in the event a new type of controller is detected. Additionally,
we also log the revision information.

This now uses the proper access sizes for the vendor/device registers,
rather than relying on a larger access that encapsulated both of them.
Not all devices support 32-bit read cycles for these registers.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 7f4f590..63b5151 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -34,33 +34,39 @@
 int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
-	int ret, match = 0;
+	const char *type = NULL;
+	int ret;
 
-	pr_debug("PCI: Starting intialization.\n");
+	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
 	chan->io_base = 0xfe200000;
 
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
-
-	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(chan, SH7780_PCIVID);
-	if ((id & 0xffff) == SH7780_VENDOR_ID) {
-		switch ((id >> 16) & 0xffff) {
-		case SH7763_DEVICE_ID:
-		case SH7780_DEVICE_ID:
-		case SH7781_DEVICE_ID:
-		case SH7785_DEVICE_ID:
-			match = 1;
-			break;
-		}
-	}
+	/* Enable CPU access to the PCIC registers. */
+	__raw_writel(PCIECR_ENBL, PCIECR);
 
-	if (unlikely(!match)) {
-		printk(KERN_ERR "PCI: This is not an SH7780 (%x)\n", id);
+	id = __raw_readw(chan->reg_base + SH7780_PCIVID);
+	if (id != SH7780_VENDOR_ID) {
+		printk(KERN_ERR "PCI: Unknown vendor ID 0x%04x.\n", id);
 		return -ENODEV;
 	}
 
+	id = __raw_readw(chan->reg_base + SH7780_PCIDID);
+	type = (id == SH7763_DEVICE_ID)	? "SH7763" :
+	       (id == SH7780_DEVICE_ID) ? "SH7780" :
+	       (id == SH7781_DEVICE_ID) ? "SH7781" :
+	       (id == SH7785_DEVICE_ID) ? "SH7785" :
+					  NULL;
+	if (unlikely(!type)) {
+		printk(KERN_ERR "PCI: Found an unsupported Renesas host "
+		       "controller, device id 0x%04x.\n", id);
+		return -EINVAL;
+	}
+
+	printk(KERN_NOTICE "PCI: Found a Renesas %s host "
+	       "controller, revision %d.\n", type,
+	       __raw_readb(chan->reg_base + SH7780_PCIRID));
+
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index fffcf1d..213f1d8 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -20,9 +20,8 @@
 #define SH7785_DEVICE_ID	0x0007
 
 /* SH7780 Control Registers */
-#define	SH7780_PCI_VCR0		0xFE000000
-#define	SH7780_PCI_VCR1		0xFE000004
-#define	SH7780_PCI_VCR2		0xFE000008
+#define	PCIECR			0xFE000008
+#define PCIECR_ENBL		0x01
 
 /* SH7780 Specific Values */
 #define SH7780_PCI_CONFIG_BASE	0xFD000000	/* Config space base addr */
-- 
cgit v0.10.2


From ab78cbcf6877334fc20868b7df7887349e2e01c8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 15:08:01 +0900
Subject: sh: pci: Use the proper write size for class/sub-class code.

Don't use pci_write_reg() for these, as it defaults to 32-bit. Rather
than using the helper, use __raw_writeb() directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 63b5151..19bac21 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -78,8 +78,10 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
-	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC);
-	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB);
+	__raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8,
+		     chan->reg_base + SH7780_PCIBCC);
+	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
+		     chan->reg_base + SH7780_PCISUB);
 
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
-- 
cgit v0.10.2


From c66c1d79a94a7a302e2dc6c93da40902423eac3e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 16:38:00 +0900
Subject: sh: pci: Set pci_cache_line_size on SH7780 via the PCICLS register.

The SH7780 PCIC contains a read-only cache line size register that we can
derive pci_cache_line_size from. So, make sure that the software idea of
the cache line size actually matches the host controller's idea.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 19bac21..fa73b0d 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,15 +22,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-/*
- * Initialization. Try all known PCI access methods. Note that we support
- * using both PCI BIOS and direct access: in such cases, we use I/O ports
- * to access config space.
- *
- * Note that the platform specific initialization (BSC registers, and memory
- * space mapping) will be called via the platform defined function
- * pcibios_init_platform().
- */
 int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
@@ -70,19 +61,31 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
+	/*
+	 * Platform specific initialization (BSC registers, and memory space
+	 * mapping) will be called via the platform defined function
+	 * pcibios_init_platform().
+	 */
 	return pcibios_init_platform();
 }
 
+extern u8 pci_cache_line_size;
+
 int __init sh7780_pcic_init(struct pci_channel *chan,
 			    struct sh4_pci_address_map *map)
 {
 	u32 word;
 
+	/*
+	 * Set the class and sub-class codes.
+	 */
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8,
 		     chan->reg_base + SH7780_PCIBCC);
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
 		     chan->reg_base + SH7780_PCISUB);
 
+	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
+
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
-- 
cgit v0.10.2


From f1dcab756687622b658154ded1657538984edcdb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:00:27 +0900
Subject: sh: pci: Set the I/O port base to the SH7780 I/O window default.

Presently the I/O port base isn't being set anywhere, which allows things
like generic_inl() to blow up. Fix this up to point at the PCI IO window.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index fa73b0d..207b720 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -112,5 +112,7 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
+	__set_io_port_base(SH7780_PCI_IO_BASE);
+
 	return 0;
 }
-- 
cgit v0.10.2


From ab1363a8929f32cc163cd3f50ca72f20d901b00c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:07:47 +0900
Subject: sh: pci: Consolidate PCI I/O and mem window definitions for SH7780.

This consolidates all of the PCI I/O and memory window definitions across
the pci-sh7780 users in pci-sh7780 itself. No functional changes, in that
every platform had exactly the same implementation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 8ec6d22..044525df 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -26,27 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 	return irq_tab[slot];
 }
 
-static struct resource sh7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7780_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sh7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -61,5 +40,5 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map);
+	return sh7780_pcic_init(&sh7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 6a0b7c0..570fd71 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -34,25 +34,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
        return sdk7780_irq_tab[pin-1][slot];
 }
 
-static struct resource sdk7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sdk7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sdk7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -67,5 +48,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map);
+	return sh7780_pcic_init(&sdk7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 583b8e8..8b09415 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -41,27 +41,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
        return se7780_irq_tab[pin-1][slot];
 }
 
-static struct resource se7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource se7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops se7780_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map se7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -90,5 +69,5 @@ int __init pcibios_init_platform(void)
 	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
 	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 
-	return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map);
+	return sh7780_pcic_init(&se7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index ab0d1de..bac46b5 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -26,25 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 	return irq_tab[slot];
 }
 
-static struct resource sh7785_io_resource = {
-	.name	= "SH7785_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7785_mem_resource = {
-	.name	= "SH7785_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sh7785_pci_map = {
 	.window0	= {
 #if defined(CONFIG_32BIT)
@@ -59,5 +40,5 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map);
+	return sh7780_pcic_init(&sh7785_pci_map);
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 207b720..eb217dd 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,7 +22,7 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-int __init sh7780_pci_init(struct pci_channel *chan)
+static int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	const char *type = NULL;
@@ -71,9 +71,28 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 
 extern u8 pci_cache_line_size;
 
-int __init sh7780_pcic_init(struct pci_channel *chan,
-			    struct sh4_pci_address_map *map)
+static struct resource sh7785_io_resource = {
+	.name	= "SH7785_IO",
+	.start	= SH7780_PCI_IO_BASE,
+	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7785_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7780_PCI_MEMORY_BASE,
+	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+struct pci_channel board_pci_channels[] = {
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
+	{ NULL, NULL, NULL, 0, 0 },
+};
+
+int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 {
+	struct pci_channel *chan = &board_pci_channels[0];
 	u32 word;
 
 	/*
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 213f1d8..7a4f8a8 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -107,8 +107,6 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pci_init(struct pci_channel *chan);
-int sh7780_pcic_init(struct pci_channel *chan,
-		     struct sh4_pci_address_map *map);
+int sh7780_pcic_init(struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7780_H_ */
-- 
cgit v0.10.2


From 4c7a47de897e89c25a40e228ac5319cbac7257fe Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:21:36 +0900
Subject: sh: pci: Kill off platform-specific multi-window mappings.

Commit 68b42d1b548be1840aff7122fdebeb804daf0fa3 ("sh: sh7785lcr: Map
whole PCI address space.") changed around the semantics of how various
chip-selects are made accessible to PCI. Now that there is a single
large mapping covering from CS0-CS6, there is no longer any need to
do multi-window mapping. Subsequently, all of the differing
implementations can be consolidated in to pci-sh7780.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c
index 44b61a5..b8d43b6 100644
--- a/arch/sh/boards/mach-se/7780/irq.c
+++ b/arch/sh/boards/mach-se/7780/irq.c
@@ -49,4 +49,21 @@ void __init init_se7780_IRQ(void)
 
 	/* ICR1: detect low level(for 2ndcut) */
 	ctrl_outl(0xAAAA0000, INTC_ICR1);
+
+	/*
+	 * FPGA PCISEL register initialize
+	 *
+	 *  CPU  || SLOT1 | SLOT2 | S-ATA | USB
+	 *  -------------------------------------
+	 *  INTA || INTA  | INTD  |  --   | INTB
+	 *  -------------------------------------
+	 *  INTB || INTB  | INTA  |  --   | INTC
+	 *  -------------------------------------
+	 *  INTC || INTC  | INTB  | INTA  |  --
+	 *  -------------------------------------
+	 *  INTD || INTD  | INTC  |  --   | INTA
+	 *  -------------------------------------
+	 */
+	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
+	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 }
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 044525df..4ea136e 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
-
-static struct sh4_pci_address_map sh7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-
-	.window1	= {
-		.base	= SH7780_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7780_pcic_init(&sh7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 570fd71..1d9a91b 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -33,20 +33,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
        return sdk7780_irq_tab[pin-1][slot];
 }
-
-static struct sh4_pci_address_map sdk7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-	.window1	= {
-		.base	= SH7780_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&sdk7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 8b09415..6c088cc 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -40,34 +40,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
        return se7780_irq_tab[pin-1][slot];
 }
-
-static struct sh4_pci_address_map se7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	printk("SH7780 PCI: Finished initialization of the PCI controller\n");
-
-	/*
-	 * FPGA PCISEL register initialize
-	 *
-	 *  CPU  || SLOT1 | SLOT2 | S-ATA | USB
-	 *  -------------------------------------
-	 *  INTA || INTA  | INTD  |  --   | INTB
-	 *  -------------------------------------
-	 *  INTB || INTB  | INTA  |  --   | INTC
-	 *  -------------------------------------
-	 *  INTC || INTC  | INTB  | INTA  |  --
-	 *  -------------------------------------
-	 *  INTD || INTD  | INTC  |  --   | INTA
-	 *  -------------------------------------
-	 */
-	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
-	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
-
-	return sh7780_pcic_init(&se7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index bac46b5..0fe423e 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
-
-static struct sh4_pci_address_map sh7785_pci_map = {
-	.window0	= {
-#if defined(CONFIG_32BIT)
-		.base	= SH7780_32BIT_DDR_BASE_ADDR,
-		.size	= 0x40000000,
-#else
-		.base	= SH7780_CS0_BASE_ADDR,
-		.size	= 0x20000000,
-#endif
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7780_pcic_init(&sh7785_pci_map);
-}
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index eb217dd..07c5529 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -90,7 +90,19 @@ struct pci_channel board_pci_channels[] = {
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
-int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
+static struct sh4_pci_address_map sh7780_pci_map = {
+	.window0	= {
+#if defined(CONFIG_32BIT)
+		.base	= SH7780_32BIT_DDR_BASE_ADDR,
+		.size	= 0x40000000,
+#else
+		.base	= SH7780_CS0_BASE_ADDR,
+		.size	= 0x20000000,
+#endif
+	},
+};
+
+int __init pcibios_init_platform(void)
 {
 	struct pci_channel *chan = &board_pci_channels[0];
 	u32 word;
@@ -114,14 +126,10 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-	pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0);
-	pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1);
+	pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0);
 	/* Set the values on window 0 PCI config registers */
-	pci_write_reg(chan, map->window0.base, SH4_PCILAR0);
-	pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0);
-	/* Set the values on window 1 PCI config registers */
-	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
-	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
+	pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0);
+	pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0);
 
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 7a4f8a8..4b65d4b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -104,9 +104,4 @@
 
 #define SH7780_32BIT_DDR_BASE_ADDR	0x40000000
 
-struct sh4_pci_address_map;
-
-/* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pcic_init(struct sh4_pci_address_map *map);
-
 #endif /* _PCI_SH7780_H_ */
-- 
cgit v0.10.2


From a6d377b6969235a3b5a6e87bdcef387d0976b41c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 20:11:44 +0900
Subject: sh: pci: Consolidate SH7780 PCIC IRQ routing.

Now that the platform code is a bit leaner, we can start consolidating
the various IRQ routing implementations. There are effectively only 2
variants, and the others can use those directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 67d710a..5003971 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -16,11 +16,11 @@ obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
-obj-$(CONFIG_SH_HIGHLANDER)		+= ops-r7780rp.o fixups-r7780rp.o
-obj-$(CONFIG_SH_SDK7780)		+= ops-sdk7780.o fixups-sdk7780.o
+obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
+obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
+obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
+obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
-obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
-obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 31f8dfb..864e92f 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -11,9 +11,17 @@
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
-#include <asm/io.h>
 
+static char irq_tab[] __initdata = {
+	65, 66, 67, 68,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	return irq_tab[slot];
+}
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 004efd4..da60e99 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -5,15 +5,32 @@
  *
  * Copyright (C) 2003  Lineo uSolutions, Inc.
  * Copyright (C) 2004 - 2006  Paul Mundt
+ * Copyright (C) 2006  Nobuhiro Iwamatsu
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
-#include <asm/io.h>
 
+/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
+static char sdk7780_irq_tab[4][16] __initdata = {
+	/* INTA */
+	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTB */
+	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTC */
+	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTD */
+	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+       return sdk7780_irq_tab[pin-1][slot];
+}
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Enable all interrupts, so we know what to fix */
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
deleted file mode 100644
index 4ea136e..0000000
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas SH7780 Highlander R7780RP-1 board
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach/highlander.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-static char irq_tab[] __initdata = {
-	65, 66, 67, 68,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return irq_tab[slot];
-}
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
deleted file mode 100644
index 1d9a91b..0000000
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-sdk7780.c
- *
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * PCI initialization for the SDK7780SE03
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach/sdk7780.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
-static char sdk7780_irq_tab[4][16] __initdata = {
-	/* INTA */
-	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTB */
-	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTC */
-	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTD */
-	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-       return sdk7780_irq_tab[pin-1][slot];
-}
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
deleted file mode 100644
index 6c088cc..0000000
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-se7780.c
- *
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * PCI initialization for the Hitachi UL Solution Engine 7780SE03
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach-se/mach/se7780.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-/*
- * IDSEL = AD16  PCI slot
- * IDSEL = AD17  PCI slot
- * IDSEL = AD18  Serial ATA Controller (Silicon Image SiL3512A)
- * IDSEL = AD19  USB Host Controller (NEC uPD7210100A)
- */
-
-/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
-static char se7780_irq_tab[4][16] __initdata = {
-	/* INTA */
-	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTB */
-	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTC */
-	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTD */
-	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-       return se7780_irq_tab[pin-1][slot];
-}
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
deleted file mode 100644
index 0fe423e..0000000
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas R0P7785LC0011RL board
- * Based on arch/sh/drivers/pci/ops-r7780rp.c
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-static char irq_tab[] __initdata = {
-	65, 66, 67, 68,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return irq_tab[slot];
-}
-- 
cgit v0.10.2


From 62c7ae87cb5962d3dfaa6d916a15e4faa9e07363 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 20:37:16 +0900
Subject: sh: pci: Start unifying the SH7780 PCIC initialization.

This starts moving out the common initialization bits from the various
fixup paths in to the shared init path.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 864e92f..15ca65c 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -22,33 +22,15 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
+
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-
-	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
 	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
-
-	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
 	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
 	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
 	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
 	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
-
-#ifdef CONFIG_32BIT
-	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
-#endif
-
-	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
-		      SH7780_PCIIOBMR);
-
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index da60e99..250b0ed 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -35,40 +35,18 @@ int pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Enable all interrupts, so we know what to fix */
 	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
-	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
-	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
-	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
-	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
-	pci_write_reg(chan, 0x0001, SH7780_PCISID);
-
 	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);	/* PCI */
-	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);	/* SHwy */
-	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH4_PCILAR0);	/* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH4_PCILSR0);	/* size 128M w/ MBAR */
 
 	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH4_PCILSR1);
 
 	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
-
-	/*
-	 * Set the MBR so PCI address is one-to-one with window,
-	 * meaning all calls go straight through... use ifdef to
-	 * catch erroneous assumption.
-	 */
-	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
-
-	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
-		      SH7780_PCIIOBMR);
-
-	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH4_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 07c5529..f02d9df 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -1,19 +1,12 @@
 /*
- *	Low-Level PCI Support for the SH7780
+ * Low-Level PCI Support for the SH7780
  *
- *  Dustin McIntire (dustin@sensoria.com)
- *	Derived from arch/i386/kernel/pci-*.c which bore the message:
- *	(c) 1999--2000 Martin Mares <mj@ucw.cz>
- *
- *  Ported to the new API by Paul Mundt <lethal@linux-sh.org>
- *  With cleanup by Paul van Gool <pvangool@mimotech.com>
- *
- *  May be copied or modified under the terms of the GNU General Public
- *  License.  See linux/COPYING for more information.
+ *  Copyright (C) 2005 - 2009  Paul Mundt
  *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
-#undef DEBUG
-
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -117,13 +110,8 @@ int __init pcibios_init_platform(void)
 
 	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
 
-	/* set the command/status bits to:
-	 * Wait Cycle Control + Parity Enable + Bus Master +
-	 * Mem space enable
-	 */
-	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
-
-	/* Set IO and Mem windows to local address
+	/*
+	 * Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
 	pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0);
@@ -131,9 +119,33 @@ int __init pcibios_init_platform(void)
 	pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0);
 	pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0);
 
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
+
+	/* Set up standard PCI config registers */
+	__raw_writew(0xFB00, chan->reg_base + SH7780_PCISTATUS);
+	__raw_writew(0x0047, chan->reg_base + SH7780_PCICMD);
+	__raw_writew(0x1912, chan->reg_base + SH7780_PCISVID);
+	__raw_writew(0x0001, chan->reg_base + SH7780_PCISID);
+
+	__raw_writeb(0x00, chan->reg_base + SH7780_PCIPIF);
+
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
 
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
+
+#ifdef CONFIG_32BIT
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+#endif
+
+	/* Set IOBR for windows containing area specified in pci.h */
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
+		      SH7780_PCIIOBMR);
+
 	/* SH7780 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 4b65d4b..4a52478 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -65,11 +65,6 @@
 #define SH7780_PCIPMCSR_BSE	0x046
 #define SH7780_PCICDD		0x047
 
-#define SH7780_PCICR		0x100		/* PCI Control Register */
-#define SH7780_PCILSR		0x104		/* PCI Local Space Register0 */
-#define SH7780_PCILSR1		0x108		/* PCI Local Space Register1 */
-#define SH7780_PCILAR0		0x10C		/* PCI Local Address Register1 */
-#define SH7780_PCILAR1		0x110		/* PCI Local Address Register1 */
 #define SH7780_PCIIR		0x114		/* PCI Interrupt Register */
 #define SH7780_PCIIMR		0x118		/* PCI Interrupt Mask Register */
 #define SH7780_PCIAIR		0x11C		/* Error Address Register */
-- 
cgit v0.10.2


From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:24:06 -0400
Subject: ftrace: use module notifier for function tracer

The hooks in the module code for the function tracer must be called
before any of that module code runs. The function tracer hooks
modify the module (replacing calls to mcount to nops). If the code
is executed while the change occurs, then the CPU can take a GPF.

To handle the above with a bit of paranoia, I originally implemented
the hooks as calls directly from the module code.

After examining the notifier calls, it looks as though the start up
notify is called before any of the module's code is executed. This makes
the use of the notify safe with ftrace.

Only the startup notify is required to be "safe". The shutdown simply
removes the entries from the ftrace function list, and does not modify
any code.

This change has another benefit. It removes a issue with a reverse dependency
in the mutexes of ftrace_lock and module_mutex.

[ Impact: fix lock dependency bug, cleanup ]

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 53869be..97c83e1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
-extern void ftrace_release(void *start, unsigned long size);
-
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
@@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled)
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(struct module *mod,
-			       unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
-static inline void
-ftrace_init_module(struct module *mod,
-		   unsigned long *start, unsigned long *end) { }
 #endif
 
 /*
diff --git a/include/linux/module.h b/include/linux/module.h
index 6155fa4..a8f2c0a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -341,6 +341,10 @@ struct module
 	struct ftrace_event_call *trace_events;
 	unsigned int num_trace_events;
 #endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	unsigned long *ftrace_callsites;
+	unsigned int num_ftrace_callsites;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
diff --git a/kernel/module.c b/kernel/module.c
index a039470..2383e60 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1490,9 +1490,6 @@ static void free_module(struct module *mod)
 	/* Free any allocated parameters. */
 	destroy_params(mod->kp, mod->num_kp);
 
-	/* release any pointers to mcount in this module */
-	ftrace_release(mod->module_core, mod->core_size);
-
 	/* This may be NULL, but that's OK */
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
@@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int num_mcount;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-	unsigned long *mseg;
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod,
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
 #endif
-
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	/* sechdrs[0].sh_size is always zero */
+	mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+					     "__mcount_loc",
+					     sizeof(*mod->ftrace_callsites),
+					     &mod->num_ftrace_callsites);
+#endif
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod,
 			dynamic_debug_setup(debug, num_debug);
 	}
 
-	/* sechdrs[0].sh_size is always zero */
-	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
-			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mod, mseg, mseg + num_mcount);
-
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 		goto cleanup;
@@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod,
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
-	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a234889..5b606f4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
 	rec->flags |= FTRACE_FL_FREE;
 }
 
-void ftrace_release(void *start, unsigned long size)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
-	unsigned long s = (unsigned long)start;
-	unsigned long e = s + size;
-
-	if (ftrace_disabled || !start)
-		return;
-
-	mutex_lock(&ftrace_lock);
-	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e)) {
-			/*
-			 * rec->ip is changed in ftrace_free_rec()
-			 * It should not between s and e if record was freed.
-			 */
-			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-			ftrace_free_rec(rec);
-		}
-	} while_for_each_ftrace_rec();
-	mutex_unlock(&ftrace_lock);
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
@@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod,
 	return 0;
 }
 
-void ftrace_init_module(struct module *mod,
-			unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long s = (unsigned long)start;
+	unsigned long e = (unsigned long)end;
+
+	if (ftrace_disabled || !start || start == end)
+		return;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+			ftrace_free_rec(rec);
+		}
+	} while_for_each_ftrace_rec();
+	mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
 	ftrace_convert_nops(mod, start, end);
 }
 
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ftrace_init_module(mod, mod->ftrace_callsites,
+				   mod->ftrace_callsites +
+				   mod->num_ftrace_callsites);
+		break;
+	case MODULE_STATE_GOING:
+		ftrace_release(mod->ftrace_callsites,
+			       mod->ftrace_callsites +
+			       mod->num_ftrace_callsites);
+		break;
+	}
+
+	return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+	.notifier_call = ftrace_module_notify,
+	.priority = 0,
+};
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
@@ -2791,6 +2825,10 @@ void __init ftrace_init(void)
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
+	ret = register_module_notifier(&ftrace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace ftrace module notifier\n");
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
cgit v0.10.2


From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:36:40 -0400
Subject: tracing/events: add startup tests for events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As events start to become popular, and the new way to add tracing
infrastructure into ftrace, it is important to catch any problems
that might happen with a mistake in the TRACE_EVENT macro.

This patch introduces a startup self test on the registered trace
events. Note, it can only do a generic test, any type of testing that
needs more involement is needed to be implemented by the tracepoint
creators.

The test goes down one by one enabling a trace point and running
some random tasks (random in the sense that I just made them up).
Those tasks are creating threads, grabbing mutexes and spinlocks
and using workqueues.

After testing each event individually, it does the same test after
enabling each system of trace points. Like sched, irq, lockdep.

Then finally it enables all tracepoints and performs the tasks again.
The output to the console on bootup will look like this when everything
works:

Running tests on trace events:
Testing event kfree_skb: OK
Testing event kmalloc: OK
Testing event kmem_cache_alloc: OK
Testing event kmalloc_node: OK
Testing event kmem_cache_alloc_node: OK
Testing event kfree: OK
Testing event kmem_cache_free: OK
Testing event irq_handler_exit: OK
Testing event irq_handler_entry: OK
Testing event softirq_entry: OK
Testing event softirq_exit: OK
Testing event lock_acquire: OK
Testing event lock_release: OK
Testing event sched_kthread_stop: OK
Testing event sched_kthread_stop_ret: OK
Testing event sched_wait_task: OK
Testing event sched_wakeup: OK
Testing event sched_wakeup_new: OK
Testing event sched_switch: OK
Testing event sched_migrate_task: OK
Testing event sched_process_free: OK
Testing event sched_process_exit: OK
Testing event sched_process_wait: OK
Testing event sched_process_fork: OK
Testing event sched_signal_send: OK
Running tests on trace event systems:
Testing event system skb: OK
Testing event system kmem: OK
Testing event system irq: OK
Testing event system lockdep: OK
Testing event system sched: OK
Running tests on all trace events:
Testing all events: OK

[ folded in:

  tracing: add #include <linux/delay.h> to fix build failure in test_work()

  This build failure occured on a few rare configs:

   kernel/trace/trace_events.c: In function ‘test_work’:
   kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’
   kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’

  delay.h is included in way too many other headers, hiding cases
  where new usage is added without header inclusion.

  [ Impact: build fix ]

  Signed-off-by: Ingo Molnar <mingo@elte.hu>
]

[ Impact: add event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6591d83..f81d6ee 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,10 +8,14 @@
  *
  */
 
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 #include "trace_output.h"
 
@@ -920,3 +924,177 @@ static __init int event_trace_init(void)
 	return 0;
 }
 fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+	spin_lock(&test_spinlock);
+	spin_lock_irq(&test_spinlock_irq);
+	udelay(1);
+	spin_unlock_irq(&test_spinlock_irq);
+	spin_unlock(&test_spinlock);
+
+	mutex_lock(&test_mutex);
+	msleep(1);
+	mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+	void *test_malloc;
+
+	test_malloc = kmalloc(1234, GFP_KERNEL);
+	if (!test_malloc)
+		pr_info("failed to kmalloc\n");
+
+	schedule_on_each_cpu(test_work);
+
+	kfree(test_malloc);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop())
+		schedule();
+
+	return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+	struct task_struct *test_thread;
+
+	test_thread = kthread_run(event_test_thread, NULL, "test-events");
+	msleep(1);
+	kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init int event_trace_self_tests(void)
+{
+	struct ftrace_event_call *call;
+	struct event_subsystem *system;
+	char *sysname;
+	int ret;
+
+	pr_info("Running tests on trace events:\n");
+
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		/* Only test those that have a regfunc */
+		if (!call->regfunc)
+			continue;
+
+		pr_info("Testing event %s: ", call->name);
+
+		/*
+		 * If an event is already enabled, someone is using
+		 * it and the self test should not be on.
+		 */
+		if (call->enabled) {
+			pr_warning("Enabled event during self test!\n");
+			WARN_ON_ONCE(1);
+			continue;
+		}
+
+		call->enabled = 1;
+		call->regfunc();
+
+		event_test_stuff();
+
+		call->unregfunc();
+		call->enabled = 0;
+
+		pr_cont("OK\n");
+	}
+
+	/* Now test at the sub system level */
+
+	pr_info("Running tests on trace event systems:\n");
+
+	list_for_each_entry(system, &event_subsystems, list) {
+
+		/* the ftrace system is special, skip it */
+		if (strcmp(system->name, "ftrace") == 0)
+			continue;
+
+		pr_info("Testing event system %s: ", system->name);
+
+		/* ftrace_set_clr_event can modify the name passed in. */
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 1);
+		kfree(sysname);
+		if (WARN_ON_ONCE(ret)) {
+			pr_warning("error enabling system %s\n",
+				   system->name);
+			continue;
+		}
+
+		event_test_stuff();
+
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 0);
+		kfree(sysname);
+
+		if (WARN_ON_ONCE(ret))
+			pr_warning("error disabling system %s\n",
+				   system->name);
+
+		pr_cont("OK\n");
+	}
+
+	/* Test with all events enabled */
+
+	pr_info("Running tests on all trace events:\n");
+	pr_info("Testing all events: ");
+
+	sysname = kmalloc(4, GFP_KERNEL);
+	if (WARN_ON(!sysname)) {
+		pr_warning("Can't allocate memory, giving up!\n");
+		return 0;
+	}
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 1);
+	if (WARN_ON_ONCE(ret)) {
+		kfree(sysname);
+		pr_warning("error enabling all events\n");
+		return 0;
+	}
+
+	event_test_stuff();
+
+	/* reset sysname */
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 0);
+	kfree(sysname);
+
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error disabling all events\n");
+		return 0;
+	}
+
+	pr_cont("OK\n");
+
+	return 0;
+}
+
+late_initcall(event_trace_self_tests);
+
+#endif
-- 
cgit v0.10.2


From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 16:53:47 -0400
Subject: tracing/events/ring-buffer: expose format of ring buffer headers to
 users

Currently, every thing needed to read the binary output from the
ring buffers is available, with the exception of the way the ring
buffers handles itself internally.

This patch creates two special files in the debugfs/tracing/events
directory:

 # cat /debug/tracing/events/header_page
        field: u64 timestamp;   offset:0;       size:8;
        field: local_t commit;  offset:8;       size:8;
        field: char data;       offset:16;      size:4080;

 # cat /debug/tracing/events/header_event
        type        :    2 bits
        len         :    3 bits
        time_delta  :   27 bits
        array       :   32 bits

        padding     : type == 0
        time_extend : type == 1
        data        : type == 3

This is to allow a userspace app to see if the ring buffer format changes
or not.

[ Impact: allow userspace apps to know of ringbuffer format changes ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f0aa486..fac8f1a 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
 int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
 			  size_t len, int cpu, int full);
 
+struct trace_seq;
+
+int ring_buffer_print_entry_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_seq *s);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f935bd5..84a6055 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,28 @@
 #include "trace.h"
 
 /*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
+	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
+	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
+	ret = trace_seq_printf(s, "\n");
+	ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
+			       RINGBUF_TYPE_PADDING);
+	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+			       RINGBUF_TYPE_TIME_EXTEND);
+	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
+			       RINGBUF_TYPE_DATA);
+
+	return ret;
+}
+
+/*
  * The ring buffer is made up of a list of pages. A separate list of pages is
  * allocated for each CPU. A writer may only write to a buffer that is
  * associated with the CPU it is currently executing on.  A reader may read
@@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+	struct buffer_data_page field;
+	int ret;
+
+	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+			       "offset:0;\tsize:%u;\n",
+			       (unsigned int)sizeof(field.time_stamp));
+
+	ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), commit),
+			       (unsigned int)sizeof(field.commit));
+
+	ret = trace_seq_printf(s, "\tfield: char data;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), data),
+			       (unsigned int)BUF_PAGE_SIZE);
+
+	return ret;
+}
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f81d6ee..7163a2b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	func(s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_show_header_fops = {
+	.open = tracing_open_generic,
+	.read = show_header,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -909,6 +938,15 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
+	/* ring buffer internal formats */
+	trace_create_file("header_page", 0444, d_events,
+			  ring_buffer_print_page_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("header_event", 0444, d_events,
+			  ring_buffer_print_entry_header,
+			  &ftrace_show_header_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
cgit v0.10.2


From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001
From: Avadh Patel <avadh4all@gmail.com>
Date: Fri, 10 Apr 2009 16:04:48 -0400
Subject: tracing: add saved_cmdlines file to show cached task comms

Export the cached task comms to userspace. This allows user apps to translate
the pids from a trace into their respective task command lines.

[ Impact: let userspace apps reading binary buffer know comm's of pids ]

Signed-off-by: Avadh Patel <avadh4all@gmail.com>
[ added error checking and use of buf pointer to index file_buf ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2d69b26..031c46f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2422,6 +2422,56 @@ static const struct file_operations tracing_readme_fops = {
 };
 
 static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	char *buf_comm;
+	char *file_buf;
+	char *buf;
+	int len = 0;
+	int pid;
+	int i;
+
+	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+	if (!file_buf)
+		return -ENOMEM;
+
+	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+	if (!buf_comm) {
+		kfree(file_buf);
+		return -ENOMEM;
+	}
+
+	buf = file_buf;
+
+	for (i = 0; i < SAVED_CMDLINES; i++) {
+		int r;
+
+		pid = map_cmdline_to_pid[i];
+		if (pid == -1 || pid == NO_CMDLINE_MAP)
+			continue;
+
+		trace_find_cmdline(pid, buf_comm);
+		r = sprintf(buf, "%d %s\n", pid, buf_comm);
+		buf += r;
+		len += r;
+	}
+
+	len = simple_read_from_buffer(ubuf, cnt, ppos,
+				      file_buf, len);
+
+	kfree(file_buf);
+	kfree(buf_comm);
+
+	return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+    .open       = tracing_open_generic,
+    .read       = tracing_saved_cmdlines_read,
+};
+
+static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
@@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			NULL, &tracing_mark_fops);
 
+	trace_create_file("saved_cmdlines", 0444, d_tracer,
+			NULL, &tracing_saved_cmdlines_fops);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
-- 
cgit v0.10.2


From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 12:15:44 -0400
Subject: tracing/events: perform function tracing in event selftests

We can find some bugs in the trace events if we stress the writes as well.
The function tracer is a good way to stress the events.

[ Impact: extend scope of event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090416161746.604786131@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7163a2b..1137f95 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void)
  * For every trace event defined, we will test each trace point separately,
  * and then by groups, and finally all trace points.
  */
-static __init int event_trace_self_tests(void)
+static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
@@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 1);
 		kfree(sysname);
@@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 0);
 		kfree(sysname);
@@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void)
 	sysname = kmalloc(4, GFP_KERNEL);
 	if (WARN_ON(!sysname)) {
 		pr_warning("Can't allocate memory, giving up!\n");
-		return 0;
+		return;
 	}
 	memcpy(sysname, "*:*", 4);
 	ret = ftrace_set_clr_event(sysname, 1);
 	if (WARN_ON_ONCE(ret)) {
 		kfree(sysname);
 		pr_warning("error enabling all events\n");
-		return 0;
+		return;
 	}
 
 	event_test_stuff();
@@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void)
 
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
-		return 0;
+		return;
 	}
 
 	pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_entry *entry;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+	int pc;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+	if (disabled != 1)
+		goto out;
+
+	local_save_flags(flags);
+
+	event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+						  flags, pc);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->parent_ip		= parent_ip;
+
+	trace_current_buffer_unlock_commit(event, flags, pc);
+
+ out:
+	atomic_dec(&per_cpu(test_event_disable, cpu));
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+	.func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+	register_ftrace_function(&trace_ops);
+	pr_info("Running tests again, along with the function tracer\n");
+	event_trace_self_tests();
+	unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+	event_trace_self_tests();
+
+	event_trace_self_test_with_function();
 
 	return 0;
 }
-- 
cgit v0.10.2


From 76aa81118ddfbb3dc31533030cf3ec329dd067a6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 16 Apr 2009 23:35:39 -0700
Subject: tracing: avoid warnings from zero-arg tracepoints

Tracepoints with no arguments can issue two warnings:

	"field" defined by not used
	"ret" is uninitialized in this function

Mark field as being OK to leave unused, and initialize ret.

[ Impact: fix false positive compiler warnings. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: mathieu.desnoyers@polymtl.ca
LKML-Reference: <1239950139-1119-5-git-send-email-jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 60c5323..39a3351 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -160,8 +160,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
+	struct ftrace_raw_##call field __attribute__((unused));		\
+	int ret = 0;							\
 									\
 	tstruct;							\
 									\
-- 
cgit v0.10.2


From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 10:34:30 +0800
Subject: tracing: fix file mode of trace and README

trace is read-write and README is read-only.

[ Impact: fix /debug/tracing/ file permissions. ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 031c46f..f681f64 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("available_tracers", 0444, d_tracer,
 			&global_trace, &show_traces_fops);
 
-	trace_create_file("current_tracer", 0444, d_tracer,
+	trace_create_file("current_tracer", 0644, d_tracer,
 			&global_trace, &set_tracer_fops);
 
 	trace_create_file("tracing_max_latency", 0644, d_tracer,
@@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("tracing_thresh", 0644, d_tracer,
 			&tracing_thresh, &tracing_max_lat_fops);
 
-	trace_create_file("README", 0644, d_tracer,
+	trace_create_file("README", 0444, d_tracer,
 			NULL, &tracing_readme_fops);
 
 	trace_create_file("trace_pipe", 0444, d_tracer,
-- 
cgit v0.10.2


From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 17 Apr 2009 12:07:46 +0200
Subject: x86: fixup numa_node information for AMD CPU northbridge functions

Currently the numa_node attribute for these PCI devices is 0 (it
corresponds to the numa_node for PCI bus 0). This is not a big issue
but incorrect.

This inconsistency can be fixed by reading the node number from CPU
NB function 0.

[ Impact: fill in dev->numa_node information, to optimize DMA allocations ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: jbarnes@virtuousgeek.org
LKML-Reference: <20090417100746.GG16198@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e..94ad0c0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -493,5 +493,42 @@ void force_hpet_resume(void)
 		break;
 	}
 }
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+	struct pci_dev *nb_ht;
+	unsigned int devfn;
+	u32 val;
+
+	devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+	nb_ht = pci_get_slot(dev->bus, devfn);
+	if (!nb_ht)
+		return;
+
+	pci_read_config_dword(nb_ht, 0x60, &val);
+	set_dev_node(&dev->dev, val & 7);
+	pci_dev_put(dev);
+}
 
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+			quirk_amd_nb_node);
 #endif
-- 
cgit v0.10.2


From 46de405f25f1d9fa73b657ffbb752aa0cc87a91d Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 10:53:43 +0800
Subject: tracing: Remove include/trace/kmem_event_types.h

kmem_event_types.h is no longer necessary since tracepoint definitions
are put into include/trace/events/kmem.h

[ Impact: remove now-unused file. ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49E7EF37.2080205@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h
deleted file mode 100644
index 4ff420f..0000000
--- a/include/trace/kmem_event_types.h
+++ /dev/null
@@ -1,193 +0,0 @@
-
-/* use <trace/kmem.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kmem
-
-TRACE_EVENT(kmalloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmem_cache_alloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmalloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kmem_cache_alloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kfree,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-
-TRACE_EVENT(kmem_cache_free,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-
-#undef TRACE_SYSTEM
-- 
cgit v0.10.2


From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Fri, 17 Apr 2009 00:27:08 -0500
Subject: tracing/filters: add filter_mutex to protect filter predicates

This patch adds a filter_mutex to prevent the filter predicates from
being accessed concurrently by various external functions.

It's based on a previous patch by Li Zefan:
        "[PATCH 7/7] tracing/filters: make filter preds RCU safe"

v2 changes:

- fixed wrong value returned in a add_subsystem_pred() failure case
  noticed by Li Zefan.

[ Impact: fix trace filter corruption/crashes on parallel access ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239946028.6639.13.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8817c18..247948e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -757,13 +757,15 @@ struct filter_pred {
 };
 
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds, int n_preds,
+extern void filter_print_preds(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
+extern void filter_print_subsystem_preds(struct event_subsystem *system,
+					 struct trace_seq *s);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1137f95..64f9d6d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, call->n_preds, s);
+	filter_print_preds(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, system->n_preds, s);
+	filter_print_subsystem_preds(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f8e5eab..e0fcfd2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,10 +22,13 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/mutex.h>
 
 #include "trace.h"
 #include "trace_output.h"
 
+static DEFINE_MUTEX(filter_mutex);
+
 static int filter_pred_64(struct filter_pred *pred, void *event)
 {
 	u64 *addr = (u64 *)(event + pred->offset);
@@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-void filter_print_preds(struct filter_pred **preds, int n_preds,
-			struct trace_seq *s)
+static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+				 struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
@@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds,
 	}
 }
 
+void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(call->preds, call->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
+void filter_print_subsystem_preds(struct event_subsystem *system,
+				  struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(system->preds, system->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
@@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest,
 	return 0;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
+static void __filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
@@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call)
 		call->preds[i]->fn = filter_pred_none;
 }
 
+void filter_disable_preds(struct ftrace_event_call *call)
+{
+	mutex_lock(&filter_mutex);
+	__filter_disable_preds(call);
+	mutex_unlock(&filter_mutex);
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct filter_pred *pred;
@@ -223,7 +248,7 @@ oom:
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
 	int i;
@@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_disable_preds(call);
+			__filter_disable_preds(call);
 	}
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred,
-			     filter_pred_fn_t fn)
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	mutex_lock(&filter_mutex);
+	__filter_free_subsystem_preds(system);
+	mutex_unlock(&filter_mutex);
+}
+
+static int filter_add_pred_fn(struct ftrace_event_call *call,
+			      struct filter_pred *pred,
+			      filter_pred_fn_t fn)
 {
 	int idx, err;
 
 	if (call->n_preds && !pred->compound)
-		filter_disable_preds(call);
+		__filter_disable_preds(call);
 
 	if (call->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
@@ -276,7 +308,8 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
@@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred, fn);
+		return filter_add_pred_fn(call, pred, fn);
 	} else {
 		if (pred->str_len)
 			return -EINVAL;
@@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred, fn);
+	return filter_add_pred_fn(call, pred, fn);
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	int err;
+
+	mutex_lock(&filter_mutex);
+	err = __filter_add_pred(call, pred);
+	mutex_unlock(&filter_mutex);
+
+	return err;
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&filter_mutex);
+
 	if (system->n_preds && !pred->compound)
-		filter_free_subsystem_preds(system);
+		__filter_free_subsystem_preds(system);
 
 	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds)
+		if (!system->preds) {
+			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
+		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED) {
+		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
+	}
 
 	system->preds[system->n_preds] = pred;
+	system->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		if (!find_event_field(call, pred->field_name))
-			continue;
-
-		err = filter_add_pred(call, pred);
+		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
 			system->preds[system->n_preds] = NULL;
+			system->n_preds--;
+			mutex_unlock(&filter_mutex);
 			return err;
 		}
 	}
 
-	system->n_preds++;
+	mutex_unlock(&filter_mutex);
 
 	return 0;
 }
-- 
cgit v0.10.2


From b0afdc126d0515e76890f0a5f26b28501cfa298e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 13:02:22 -0400
Subject: tracing/events: enable code with EVENT_TRACING not EVENT_TRACER

The CONFIG_EVENT_TRACER is the way to turn on event tracing when no
other tracing has been configured. All code to get enabled should
depend on CONFIG_EVENT_TRACING. That is what is enabled when TRACING
(or CONFIG_EVENT_TRACER) is selected.

This patch enables the include/trace/ftrace.h file when
CONFIG_EVENT_TRACING is enabled.

[ Impact: fix warning in event tracer selftest ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 1886941..7f1f23d 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -56,7 +56,7 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #include <trace/ftrace.h>
 #endif
 
-- 
cgit v0.10.2


From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:01:56 -0400
Subject: tracing: add EXPORT_SYMBOL_GPL for trace commits

Not all the necessary symbols were exported to allow for tracing
by modules. This patch adds them in.

[ Impact: allow modules to commit data to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f681f64..183d788 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
+EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
+EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
-EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 void
 trace_function(struct trace_array *tr,
-- 
cgit v0.10.2


From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 21:41:52 -0400
Subject: tracing: add same level recursion detection

The tracing infrastructure allows for recursion. That is, an interrupt
may interrupt the act of tracing an event, and that interrupt may very well
perform its own trace. This is a recursive trace, and is fine to do.

The problem arises when there is a bug, and the utility doing the trace
calls something that recurses back into the tracer. This recursion is not
caused by an external event like an interrupt, but by code that is not
expected to recurse. The result could be a lockup.

This patch adds a bitmask to the task structure that keeps track
of the trace recursion. To find the interrupt depth, the following
algorithm is used:

  level = hardirq_count() + softirq_count() + in_nmi;

Here, level will be the depth of interrutps and softirqs, and even handles
the nmi. Then the corresponding bit is set in the recursion bitmask.
If the bit was already set, we know we had a recursion at the same level
and we warn about it and fail the writing to the buffer.

After the data has been committed to the buffer, we clear the bit.
No atomics are needed. The only races are with interrupts and they reset
the bitmask before returning anywy.

[ Impact: detect same irq level trace recursion ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 97c83e1..39b95c5 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
 
 extern int ftrace_dump_on_oops;
 
+#ifdef CONFIG_PREEMPT
+#define INIT_TRACE_RECURSION		.trace_recursion = 0,
+#endif
+
 #endif /* CONFIG_TRACING */
 
+#ifndef INIT_TRACE_RECURSION
+#define INIT_TRACE_RECURSION
+#endif
 
 #ifdef CONFIG_HW_BRANCH_TRACER
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dcfb933..6fc2185 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -187,6 +187,7 @@ extern struct cred init_cred;
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
 }
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..7ede5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1428,7 +1428,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
-#endif
+	/* bitmask of trace recursion */
+	unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 84a6055..b421b0e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static int trace_irq_level(void)
+{
+	return hardirq_count() + softirq_count() + in_nmi();
+}
+
+static int trace_recursive_lock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	if (unlikely(current->trace_recursion & (1 << level))) {
+		/* Disable all tracing before we do anything else */
+		tracing_off_permanent();
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	current->trace_recursion |= 1 << level;
+
+	return 0;
+}
+
+static void trace_recursive_unlock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+
+	current->trace_recursion &= ~(1 << level);
+}
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	/* If we are tracing schedule, we don't want to recurse */
 	resched = ftrace_preempt_disable();
 
+	if (trace_recursive_lock())
+		goto out_nocheck;
+
 	cpu = raw_smp_processor_id();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	return event;
 
  out:
+	trace_recursive_unlock();
+
+ out_nocheck:
 	ftrace_preempt_enable(resched);
 	return NULL;
 }
@@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
cgit v0.10.2


From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:13:55 -0400
Subject: tracing: protect trace_printk from recursion

trace_printk can be called from any context, including NMIs.
If this happens, then we must test for for recursion before
grabbing any spinlocks.

This patch prevents trace_printk from being called recursively.

[ Impact: prevent hard lockup in lockdep event tracer ]

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 183d788..b9a3adc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct trace_array_cpu *data;
 	struct bprint_entry *entry;
 	unsigned long flags;
+	int disable;
 	int resched;
 	int cpu, len = 0, size, pc;
 
@@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	/* Lockdep uses trace_printk for lock tracing */
@@ -1301,6 +1303,7 @@ out_unlock:
 	local_irq_restore(flags);
 
 out:
+	atomic_dec_return(&data->disabled);
 	ftrace_preempt_enable(resched);
 	unpause_graph_tracing();
 
@@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	int cpu, len = 0, size, pc;
 	struct print_entry *entry;
 	unsigned long irq_flags;
+	int disable;
 
 	if (tracing_disabled || tracing_selftest_running)
 		return 0;
@@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	pause_graph_tracing();
@@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
  out:
+	atomic_dec_return(&data->disabled);
 	preempt_enable_notrace();
 
 	return len;
-- 
cgit v0.10.2


From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 17 Apr 2009 17:17:55 -0400
Subject: tracing: remove format attribute of inline function

Due to a cut and paste error, I added the gcc attribute for printf
format to the static inline stub of trace_seq_printf.

This will cause a compile failure.

[ Impact: fix compiler error when CONFIG_TRACING is off ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= <fweisbec@gmail.com>
LKML-Reference: <alpine.DEB.2.00.0904171717080.1016@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 15ca2c7..37db9bd 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path);
 
 #else /* CONFIG_TRACING */
 static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)))
 {
 	return 0;
 }
-- 
cgit v0.10.2


From 6f41469c627fe118121dfce2a8134ad24da3df28 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: block: clear req->errors on bio completion only for fs requests

Impact: subtle behavior change

For fs requests, rq is only carrier of bios and rq error status as a
whole doesn't mean much.  This is the reason why rq->errors is being
cleared on each partial completion of a request as on each partial
completion the error status is transferred to the respective bios.

For pc requests, rq->errors is used to carry error status to the
issuer and thus __end_that_request_first() doesn't clear it on such
cases.

The condition was fine till now as only fs and pc requests have used
bio and thus the bio completion path.  However, future changes will
unify data accesses to bio and all non fs users care about rq error
status.  Clear rq->errors on bio completion only for fs requests.

In general, the implicit clearing is a bit too subtle especially as
the meaning of rq->errors is completely dependent on low level
drivers.  Unifying / cleaning up rq->errors usage and letting llds
manage it would be better.  TODO comment added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>

diff --git a/block/blk-core.c b/block/blk-core.c
index 07ab754..cce7a88 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1739,10 +1739,14 @@ static int __end_that_request_first(struct request *req, int error,
 	trace_block_rq_complete(req->q, req);
 
 	/*
-	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
-	 * sense key with us all the way through
+	 * For fs requests, rq is just carrier of independent bio's
+	 * and each partial completion should be handled separately.
+	 * Reset per-request error on each partial completion.
+	 *
+	 * TODO: tj: This is too subtle.  It would be better to let
+	 * low level drivers do what they see fit.
 	 */
-	if (!blk_pc_request(req))
+	if (blk_fs_request(req))
 		req->errors = 0;
 
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
-- 
cgit v0.10.2


From 1e75540ec5202cae63cd238c86bd880e3d496546 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide-tape: remove back-to-back REQUEST_SENSE detection

Impact: fix an oops which always triggers

ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when
checking for back-to-back request sense issues but drive->pc can be
NULL and even when it's not NULL, it's not safe to dereference it once
the previous command is complete because pc could have been freed or
was on stack.  Kill back-to-back REQUEST_SENSE detection.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index cb942a9..3a53e08 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	if (drive->pc->c[0] == REQUEST_SENSE &&
-	    pc->c[0] == REQUEST_SENSE) {
-		printk(KERN_ERR "ide-tape: possible ide-tape.c bug - "
-			"Two request sense in serial were issued\n");
-	}
-
 	if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE)
 		drive->failed_pc = pc;
 
-- 
cgit v0.10.2


From 853280a4dc8e3cc97ff10c1c02234d96078f437b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide: use blk_run_queue() instead of blk_start_queueing()

blk_start_queueing() is being phased out in favor of
[__]blk_run_queue().  Switch.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 310d03f..a914023 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 			start_queue = 1;
 		spin_unlock_irq(&hwif->lock);
 
-		if (start_queue) {
-			spin_lock_irq(q->queue_lock);
-			blk_start_queueing(q);
-			spin_unlock_irq(q->queue_lock);
-		}
+		if (start_queue)
+			blk_run_queue(q);
 		return;
 	}
 	spin_unlock_irq(&hwif->lock);
-- 
cgit v0.10.2


From 55f3f399422a4a3f6cb84ea4096dfaddf8817399 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide: don't set REQ_SOFTBARRIER

ide doesn't have to worry about REQ_SOFTBARRIER.  Don't set it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a9fbe2c..c243880 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
 	cmd->protocol = ATA_PROT_NODATA;
 
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	rq->special = cmd;
 }
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index c1c25eb..5991b23 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
 		ret = rq->errors;
 	blk_put_request(rq);
-- 
cgit v0.10.2


From 46a802e852c2106d755f697819ea80cd3ffdc222 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide kill unused ide_cmd->special

Impact: removal of unused field

No one uses ide_cmd->special anymore.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/ide.h b/include/linux/ide.h
index ff65fff..846a1e1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -324,7 +324,6 @@ struct ide_cmd {
 	unsigned int		cursg_ofs;
 
 	struct request		*rq;		/* copy of request */
-	void			*special;	/* valid_t generally */
 };
 
 /* ATAPI packet command flags */
-- 
cgit v0.10.2


From 1873b90cdea038715ec7140fccc2116fb930ffb5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide-cd: clear sense buffer before issuing request sense

Impact: code simplification

ide_cd_request_sense_fixup() clears the tail of the sense buffer if
the device didn't completely fill it.  This patch makes
cdrom_queue_request_sense() clear the sense buffer before issuing the
command instead of clearing it afterwards.  This simplifies code and
eases future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3aec19d..509f129 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	if (sense == NULL)
 		sense = &info->sense_data;
 
+	memset(sense, 0, 18);
+
 	/* stuff the sense request in front of our current request */
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_ATA_PC;
@@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 	 * and some drives don't send them.  Sigh.
 	 */
 	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    cmd->nleft > 0 && cmd->nleft <= 5) {
-		unsigned int ofs = cmd->nbytes - cmd->nleft;
-
-		while (cmd->nleft > 0) {
-			*((u8 *)rq->data + ofs++) = 0;
-			cmd->nleft--;
-		}
-	}
+	    cmd->nleft > 0 && cmd->nleft <= 5)
+		cmd->nleft = 0;
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
-- 
cgit v0.10.2


From 7f006dc24fae158131116c9472874f12e16cf040 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-floppy: block pc always uses bio

Impact: remove unnecessary code path

Block pc requests always use bio and rq->data is always NULL.  No need
to worry about !rq->bio cases in idefloppy_block_pc_cmd().  Note that
ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg
fine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 2b4868d..3b22e06 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len && rq_data_dir(rq) == WRITE)
-		pc->flags |= PC_FLAG_WRITING;
-	pc->buf = rq->data;
-	if (rq->bio)
+	if (rq->data_len) {
 		pc->flags |= PC_FLAG_DMA_OK;
-	/*
-	 * possibly problematic, doesn't look like ide-floppy correctly
-	 * handled scattered requests if dma fails...
-	 */
+		if (rq_data_dir(rq) == WRITE)
+			pc->flags |= PC_FLAG_WRITING;
+	}
+	/* pio will be performed by ide_pio_bytes() which handles sg fine */
+	pc->buf = NULL;
 	pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
-- 
cgit v0.10.2


From eace4cb04c0edc9388e987bf9bbdef461f6daca4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-taskfile: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide_raw_taskfile() directly uses rq->buffer to carry pointer to the
data buffer.  This complicates both block interface and ide backend
request handling.  Use blk_rq_map_kern() instead and drop special
handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg().

Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it
to initialize bio rw flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 35dc38d..9b9e8b1 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-		sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE);
-		cmd->sg_nents = 1;
-	} else if (!rq->bio) {
+	if (!rq->bio) {
 		sg_init_one(sg, rq->data, rq->data_len);
 		cmd->sg_nents = 1;
 	} else
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4aa6223..f400eb4 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->buffer = buf;
+
+	if (cmd->tf_flags & IDE_TFLAG_WRITE)
+		rq->cmd_flags |= REQ_RW;
 
 	/*
 	 * (ks) We transfer currently only whole sectors.
@@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 * if we would find a solution to transfer any size.
 	 * To support special commands like READ LONG.
 	 */
-	rq->hard_nr_sectors = rq->nr_sectors = nsect;
-	rq->hard_cur_sectors = rq->current_nr_sectors = nsect;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		rq->cmd_flags |= REQ_RW;
+	if (nsect) {
+		error = blk_rq_map_kern(drive->queue, rq, buf,
+					nsect * SECTOR_SIZE, __GFP_WAIT);
+		if (error)
+			goto put_req;
+	}
 
 	rq->special = cmd;
 	cmd->rq = rq;
 
 	error = blk_execute_rq(drive->queue, NULL, rq, 0);
-	blk_put_request(rq);
 
+put_req:
+	blk_put_request(rq);
 	return error;
 }
 
-- 
cgit v0.10.2


From c267cc1c4db4ccb3406d045a8da8660f0bbfe08d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-atapi: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-atapi uses rq->buffer as private opaque value for internal special
requests.  rq->special isn't used for these cases (the only case where
rq->special is used is for ide-tape rw requests).  Use rq->special
instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7201b17..2894577 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 	rq->rq_disk = disk;
 
 	if (pc->req_xfer) {
@@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
 		rq->data = pc->buf;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3b22e06..9460033 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 	} else if (blk_special_request(rq)) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3a53e08..aadf53c 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		goto out;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 		rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
 		rq->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
-- 
cgit v0.10.2


From cbfd082abfcbed8c57a12636f36e9bead8d6cfc6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-cd uses rq->buffer to carry pointer to the original request when
issuing REQUEST_SENSE.  Use rq->special instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 509f129..eb3c299 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	rq->cmd_type = REQ_TYPE_SENSE;
 	rq->cmd_flags |= REQ_PREEMPT;
 
-	/* NOTE! Save the failed command in "rq->buffer" */
-	rq->buffer = (void *) failed_command;
+	/* NOTE! Save the failed command in "rq->special" */
+	rq->special = (void *)failed_command;
 
 	if (failed_command)
 		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
@@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
+	 * For REQ_TYPE_SENSE, "rq->special" points to the original
 	 * failed request
 	 */
-	struct request *failed = (struct request *)rq->buffer;
+	struct request *failed = (struct request *)rq->special;
 	struct cdrom_info *info = drive->driver_data;
 	void *sense = &info->sense_data;
 
-- 
cgit v0.10.2


From a1df5169f9bf08f6067029bfb840a05e282b1b97 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide: add helpers for preparing sense requests

This is in preparation of removing the queueing of a sense request out
of the IRQ handler path.

Use struct request_sense as a general sense buffer for all ATAPI
devices ide-{floppy,tape,cd}.

tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as
      it can cause deadlock.  Converted to use inline struct request
      and blk_rq_init().

    * Added xfer / cdb len selection depending on device type.

    * All sense prep logics folded into ide_prep_sense() which never
      fails.

    * hwif->rq clearing and sense_rq used handling moved into
      ide_queue_sense_rq().

    * blk_rq_map_kern() conversion is moved to later patch.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2894577..c6e0348 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq)
+{
+	struct request_sense *sense = &drive->sense_data;
+	struct request *sense_rq = &drive->sense_rq;
+	unsigned int cmd_len, sense_len;
+
+	debug_log("%s: enter\n", __func__);
+
+	switch (drive->media) {
+	case ide_floppy:
+		cmd_len = 255;
+		sense_len = 18;
+		break;
+	case ide_tape:
+		cmd_len = 20;
+		sense_len = 20;
+		break;
+	default:
+		cmd_len = 18;
+		sense_len = 18;
+	}
+
+	BUG_ON(sense_len > sizeof(*sense));
+
+	if (blk_sense_request(rq) || drive->sense_rq_armed)
+		return;
+
+	memset(sense, 0, sizeof(*sense));
+
+	blk_rq_init(rq->q, sense_rq);
+	sense_rq->rq_disk = rq->rq_disk;
+
+	sense_rq->data = sense;
+	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
+	sense_rq->cmd[4] = cmd_len;
+	sense_rq->data_len = sense_len;
+
+	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_flags |= REQ_PREEMPT;
+
+	if (drive->media == ide_tape)
+		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+
+	drive->sense_rq_armed = true;
+}
+EXPORT_SYMBOL_GPL(ide_prep_sense);
+
+void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+{
+	BUG_ON(!drive->sense_rq_armed);
+
+	drive->sense_rq.special = special;
+	drive->sense_rq_armed = false;
+
+	drive->hwif->rq = NULL;
+
+	elv_add_request(drive->queue, &drive->sense_rq,
+			ELEVATOR_INSERT_FRONT, 0);
+}
+EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
+
 /*
  * Called when an error was detected during the last packet command.
  * We queue a request sense packet command in the head of the request list.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 846a1e1..a69ccac 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -26,6 +26,9 @@
 #include <asm/io.h>
 #include <asm/mutex.h>
 
+/* for request_sense */
+#include <linux/cdrom.h>
+
 #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300)
 # define SUPPORT_VLB_SYNC 0
 #else
@@ -602,6 +605,11 @@ struct ide_drive_s {
 
 	struct ide_atapi_pc request_sense_pc;
 	struct request request_sense_rq;
+
+	/* current sense rq and buffer */
+	bool sense_rq_armed;
+	struct request sense_rq;
+	struct request_sense sense_data;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *, struct gendisk *);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq);
+void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+
 int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
-- 
cgit v0.10.2


From 746d5e43274e9ea6cbd58818afc9239d41fb4e1e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd: convert to using generic sense request

Preallocate a sense request in the ->do_request method and reinitialize
it only on demand, in case it's been consumed in the IRQ handler path.
The reason for this is that we don't want to be mapping rq to bio in
the IRQ path and introduce all kinds of unnecessary hacks to the block
layer.

tj: * Both user and kernel PC requests expect sense data to be stored
      in separate storage other than drive->sense_data.  Copy sense
      data to rq->sense on completion if rq->sense is not NULL.  This
      fixes bogus sense data on PC requests.

As a result, remove cdrom_queue_request_sense.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index eb3c299..7b21c7e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 	ide_cd_log_error(drive->name, failed_command, sense);
 }
 
-static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
-				      struct request *failed_command)
-{
-	struct cdrom_info *info		= drive->driver_data;
-	struct request *rq		= &drive->request_sense_rq;
-
-	ide_debug_log(IDE_DBG_SENSE, "enter");
-
-	if (sense == NULL)
-		sense = &info->sense_data;
-
-	memset(sense, 0, 18);
-
-	/* stuff the sense request in front of our current request */
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_ATA_PC;
-	rq->rq_disk = info->disk;
-
-	rq->data = sense;
-	rq->cmd[0] = GPCMD_REQUEST_SENSE;
-	rq->cmd[4] = 18;
-	rq->data_len = 18;
-
-	rq->cmd_type = REQ_TYPE_SENSE;
-	rq->cmd_flags |= REQ_PREEMPT;
-
-	/* NOTE! Save the failed command in "rq->special" */
-	rq->special = (void *)failed_command;
-
-	if (failed_command)
-		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
-					     failed_command->cmd[0]);
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
@@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 	 * failed request
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct cdrom_info *info = drive->driver_data;
-	void *sense = &info->sense_data;
+	struct request_sense *sense = &drive->sense_data;
 
 	if (failed) {
 		if (failed->sense) {
+			/*
+			 * Sense is always read into drive->sense_data.
+			 * Copy back if the failed request has its
+			 * sense pointer set.
+			 */
+			memcpy(failed->sense, sense, 18);
 			sense = failed->sense;
 			failed->sense_len = rq->sense_len;
 		}
@@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		cdrom_queue_request_sense(drive, NULL, NULL);
+		ide_queue_sense_rq(drive, NULL);
 	return 1;
 
 end_request:
@@ -445,7 +412,7 @@ end_request:
 
 		hwif->rq = NULL;
 
-		cdrom_queue_request_sense(drive, rq->sense, rq);
+		ide_queue_sense_rq(drive, rq);
 		return 1;
 	} else
 		return 2;
@@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		goto out_end;
 	}
 
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 1d97101..93a3cf1b 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -87,10 +87,6 @@ struct cdrom_info {
 
 	struct atapi_toc *toc;
 
-	/* The result of the last successful request sense command
-	   on this device. */
-	struct request_sense sense_data;
-
 	u8 max_speed;		/* Max speed of the drive. */
 	u8 current_speed;	/* Current speed of the drive. */
 
-- 
cgit v0.10.2


From 6b544fcc8cd0a04eb42de9d1ecdd345e979d6ada Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-atapi: convert ide-{floppy,tape} to using preallocated sense
 buffer

Since we're issuing REQ_TYPE_SENSE now we need to allow those types of
rqs in the ->do_request callbacks. As a future improvement, sense_len
assignment might be unified across all ATAPI devices. Borislav to
check with specs and test.

As a result, get rid of ide_queue_pc_head() and
drive->request_sense_rq.

tj: * Init request sense ide_atapi_pc from sense request.  In the
      longer timer, it would probably better to fold
      ide_create_request_sense_cmd() into its only current user -
      ide_floppy_get_format_progress().

    * ide_retry_pc() no longer takes @disk.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index c6e0348..972c522 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -80,34 +80,6 @@ void ide_init_pc(struct ide_atapi_pc *pc)
 EXPORT_SYMBOL_GPL(ide_init_pc);
 
 /*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
-			      struct ide_atapi_pc *pc, struct request *rq)
-{
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = (char *)pc;
-	rq->rq_disk = disk;
-
-	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
-	}
-
-	memcpy(rq->cmd, pc->c, 12);
-	if (drive->media == ide_tape)
-		rq->cmd[13] = REQ_IDETAPE_PC1;
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
-/*
  * Add a special packet command request to the tail of the request queue,
  * and wait for it to be serviced.
  */
@@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
 /*
  * Called when an error was detected during the last packet command.
- * We queue a request sense packet command in the head of the request list.
+ * We queue a request sense packet command at the head of the request
+ * queue.
  */
-void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
+void ide_retry_pc(ide_drive_t *drive)
 {
-	struct request *rq = &drive->request_sense_rq;
+	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	ide_create_request_sense_cmd(drive, pc);
+
+	/* init pc from sense_rq */
+	ide_init_pc(pc);
+	memcpy(pc->c, sense_rq->cmd, 12);
+	pc->buf = sense_rq->data;
+	pc->req_xfer = sense_rq->data_len;
+
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-	ide_queue_pc_head(drive, disk, pc, rq);
+
+	ide_queue_sense_rq(drive, pc);
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
 
 			/* Retry operation */
-			ide_retry_pc(drive, rq->rq_disk);
+			ide_retry_pc(drive);
 
 			/* queued, but not started */
 			return ide_stopped;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 9460033..d3302cc 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		}
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-	} else if (blk_special_request(rq)) {
+	} else if (blk_special_request(rq) || blk_sense_request(rq)) {
 		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
@@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		goto out_end;
 	}
 
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index aadf53c..8324dfa 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
 				printk(KERN_ERR "ide-tape: %s: I/O error, ",
 						tape->name);
 			/* Retry operation */
-			ide_retry_pc(drive, tape->disk);
+			ide_retry_pc(drive);
 			return ide_stopped;
 		}
 		pc->error = 0;
@@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			(unsigned long long)rq->sector, rq->nr_sectors,
 			rq->current_nr_sectors);
 
-	if (!blk_special_request(rq)) {
+	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
 		printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
 			"request queue (%d)\n", drive->name, rq->cmd_type);
@@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	BUG();
 
 out:
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a69ccac..9e67cca 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -604,7 +604,6 @@ struct ide_drive_s {
 	unsigned long atapi_flags;
 
 	struct ide_atapi_pc request_sense_pc;
-	struct request request_sense_rq;
 
 	/* current sense rq and buffer */
 	bool sense_rq_armed;
@@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
 int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
 int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *, struct gendisk *);
+void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
 void ide_queue_sense_rq(ide_drive_t *drive, void *special);
-- 
cgit v0.10.2


From 5c4be57249e2e09136446597d2fe2a967c6ffef0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd,atapi: use bio for internal commands

Impact: unify request data buffer handling

rq->data is used mostly to pass kernel buffer through request queue
without using bio.  There are only a couple of places which still do
this in kernel and converting to bio isn't difficult.

This patch converts ide-cd and atapi to use bio instead of rq->data
for request sense and internal pc commands.  With previous change to
unify sense request handling, this is relatively easily achieved by
adding blk_rq_map_kern() during sense_rq prep and PC issue.

If blk_rq_map_kern() fails for sense, the error is deferred till sense
issue and aborts the failed command which triggered the sense.  Note
that this is a slim possibility as sense prep is done on each command
issue, so for the above condition to actually trigger, all preps since
the last sense issue till the issue of the request which would require
a sense should fail.

* do_request functions might sleep now.  This should be okay as ide
  request_fn - do_ide_request() - is invoked only from make_request
  and plug work.  Make sure this is the case by adding might_sleep()
  to do_ide_request().

* Functions which access the read sense data before the sense request
  is complete now should access bio_data(sense_rq->bio) as the sense
  buffer might have been copied during blk_rq_map_kern().

* ide-tape updated to map sg.

* cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC
  special case.  Simplified.

* tp_ops->output/input_data path dropped from ide_pc_intr().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 972c522..5cefe12 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
+		error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
+					GFP_NOIO);
+		if (error)
+			goto put_req;
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		rq->cmd[13] = REQ_IDETAPE_PC1;
 	error = blk_execute_rq(drive->queue, disk, rq, 0);
+put_req:
 	blk_put_request(rq);
-
 	return error;
 }
 EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
@@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	struct request_sense *sense = &drive->sense_data;
 	struct request *sense_rq = &drive->sense_rq;
 	unsigned int cmd_len, sense_len;
+	int err;
 
 	debug_log("%s: enter\n", __func__);
 
@@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	memset(sense, 0, sizeof(*sense));
 
 	blk_rq_init(rq->q, sense_rq);
-	sense_rq->rq_disk = rq->rq_disk;
 
-	sense_rq->data = sense;
+	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
+			      GFP_NOIO);
+	if (unlikely(err)) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: failed to map sense buffer\n",
+			       drive->name);
+		return;
+	}
+
+	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->data_len = sense_len;
-
 	sense_rq->cmd_type = REQ_TYPE_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
@@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(ide_prep_sense);
 
-void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 {
-	BUG_ON(!drive->sense_rq_armed);
+	/* deferred failure from ide_prep_sense() */
+	if (!drive->sense_rq_armed) {
+		printk(KERN_WARNING "%s: failed queue sense request\n",
+		       drive->name);
+		return -ENOMEM;
+	}
 
 	drive->sense_rq.special = special;
 	drive->sense_rq_armed = false;
@@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special)
 
 	elv_add_request(drive->queue, &drive->sense_rq,
 			ELEVATOR_INSERT_FRONT, 0);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
@@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive)
 	/* init pc from sense_rq */
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
-	pc->buf = sense_rq->data;
+	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
 	pc->req_xfer = sense_rq->data_len;
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	ide_queue_sense_rq(drive, pc);
+	if (ide_queue_sense_rq(drive, pc))
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xferfunc;
 	unsigned int timeout, done;
 	u16 bcount;
 	u8 stat, ireason, dsc = 0;
@@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape)
+			if (drive->media == ide_tape && !rq->bio)
 				done = ide_rq_bytes(rq); /* FIXME */
 			else
 				done = blk_rq_bytes(rq);
@@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
-
-	if (drive->media == ide_floppy && pc->buf == NULL) {
+	if (drive->media == ide_tape && pc->bh)
+		done = drive->pc_io_buffers(drive, pc, bcount, write);
+	else {
 		done = min_t(unsigned int, bcount, cmd->nleft);
 		ide_pio_bytes(drive, cmd, write, done);
-	} else if (drive->media == ide_tape && pc->bh) {
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	} else {
-		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
-		xferfunc(drive, NULL, pc->cur_pos, done);
 	}
 
 	/* Update the current position */
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 7b21c7e..392a5bd 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
 	 * For REQ_TYPE_SENSE, "rq->special" points to the original
-	 * failed request
+	 * failed request.  Also, the sense data should be read
+	 * directly from rq which might be different from the original
+	 * sense buffer if it got copied during mapping.
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct request_sense *sense = &drive->sense_data;
+	void *sense = bio_data(rq->bio);
 
 	if (failed) {
 		if (failed->sense) {
@@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		ide_queue_sense_rq(drive, NULL);
+		return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
 	return 1;
 
 end_request:
@@ -412,8 +414,7 @@ end_request:
 
 		hwif->rq = NULL;
 
-		ide_queue_sense_rq(drive, rq);
-		return 1;
+		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
 }
@@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		rq->cmd_flags |= cmd_flags;
 		rq->timeout = timeout;
 		if (buffer) {
-			rq->data = buffer;
-			rq->data_len = *bufflen;
+			error = blk_rq_map_kern(drive->queue, rq, buffer,
+						*bufflen, GFP_NOIO);
+			if (error) {
+				blk_put_request(rq);
+				return error;
+			}
 		}
 
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
@@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	drive->dma = 0;
 
 	/* sg request */
-	if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
+	if (rq->bio) {
 		struct request_queue *q = drive->queue;
+		char *buf = bio_data(rq->bio);
 		unsigned int alignment;
-		char *buf;
-
-		if (rq->bio)
-			buf = bio_data(rq->bio);
-		else
-			buf = rq->data;
 
 		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9b9e8b1..3245c2d 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q)
 
 	spin_unlock_irq(q->queue_lock);
 
+	/* HLD do_request() callback might sleep, make sure it's okay */
+	might_sleep();
+
 	if (ide_lock_host(host, hwif))
 		goto plug_device_2;
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8324dfa..9b762a2 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -850,6 +850,9 @@ out:
 
 	cmd.rq = rq;
 
+	ide_init_sg_cmd(&cmd, pc->req_xfer);
+	ide_map_sg(drive, &cmd);
+
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9e67cca..1957461 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
-void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+int ide_queue_sense_rq(ide_drive_t *drive, void *special);
 
 int ide_cd_expiry(ide_drive_t *);
 
-- 
cgit v0.10.2


From fc38b521dcffcb07447cd98fedc56f495c10b90d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:43 +0900
Subject: ide-pm: don't abuse rq->data

Impact: cleanup rq->data usage

ide-pm uses rq->data to carry pointer to struct request_pm_state
through request queue and rq->special is used to carray pointer to
local struct ide_cmd, which isn't necessary.  Use rq->special for
request_pm_state instead and use local ide_cmd in
ide_start_power_step().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3245c2d..6e3094e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
 		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->data;
+			struct request_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 0d8a151..ba1488b 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int ret;
 
 	/* call ACPI _GTM only once */
@@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		ide_acpi_get_timing(hwif);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
@@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int err;
 
 	/* call ACPI _PS0 / _STM only once */
@@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev)
 	ide_acpi_exec_tfs(drive);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
@@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
-	struct ide_cmd *cmd = rq->special;
-
-	memset(cmd, 0, sizeof(*cmd));
+	struct request_pm_state *pm = rq->special;
+	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
 	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
@@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			return ide_stopped;
 		}
 		if (ata_id_flush_ext_enabled(drive->id))
-			cmd->tf.command = ATA_CMD_FLUSH_EXT;
+			cmd.tf.command = ATA_CMD_FLUSH_EXT;
 		else
-			cmd->tf.command = ATA_CMD_FLUSH;
+			cmd.tf.command = ATA_CMD_FLUSH;
 		goto out_do_tf;
 	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		cmd->tf.command = ATA_CMD_STANDBYNOW1;
+		cmd.tf.command = ATA_CMD_STANDBYNOW1;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
 		ide_set_max_pio(drive);
@@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			ide_complete_power_step(drive, rq);
 		return ide_stopped;
 	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		cmd->tf.command = ATA_CMD_IDLEIMMEDIATE;
+		cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
 		/*
@@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 	return ide_stopped;
 
 out_do_tf:
-	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd->valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd->protocol = ATA_PROT_NODATA;
+	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
+	cmd.protocol = ATA_PROT_NODATA;
 
-	return do_rw_taskfile(drive, cmd);
+	return do_rw_taskfile(drive, &cmd);
 }
 
 /**
@@ -181,7 +173,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 	if (blk_pm_suspend_request(rq) &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
-- 
cgit v0.10.2


From ea7066afcd590e4663e6dc010f93704164050f48 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape,floppy: fix failed command completion after request sense

Impact: fix infinite retry loop

After a command failed, ide-tape and floppy inserts REQUEST_SENSE in
front of the failed command and according to the result, sets
pc->retries, flags and errors.  After REQUEST_SENSE is complete, the
failed command is again at the front of the queue and if the verdict
was to terminate the request, the issue functions tries to complete it
directly by calling drive->pc_callback() and returning ide_stopped.

However, drive->pc_callback() doesn't complete a request.  It only
prepares for completion of the request.  As a result, this creates an
infinite loop where the failed request is retried perpetually.

Fix it by actually ending the request by calling ide_complete_rq().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d3302cc..d20704a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9b762a2..2b9a136 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 		}
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
cgit v0.10.2


From b3071d190d6757b14af002a9d79832f12de61bce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-atapi,tape,floppy: allow ->pc_callback() to change rq->data_len

Impact: allow residual count implementation in ->pc_callback()

rq->data_len has two duties - carrying the number of input bytes on
issue and carrying residual count back to the issuer on completion.
ide-atapi completion callback ->pc_callback() is the right place to do
this but currently ide-atapi depends on rq->data_len carrying the
original request size after calling ->pc_callback() to complete the pc
request.

This patch makes ide_pc_intr(), ide_tape_issue_pc() and
ide_floppy_issue_pc() cache length to complete before calling
->pc_callback() so that it can modify rq->data_len as necessary.

Note: As using rq->data_len for two purposes can make cases like this
      incorrect in subtle ways, future changes will introduce separate
      field for residual count.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 5cefe12..3df5442 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
+		/*
+		 * ->pc_callback() might change rq->data_len for
+		 * residual count, cache total length.
+		 */
+		if (!blk_special_request(rq) &&
+		    (drive->media == ide_tape && !rq->bio))
+			done = ide_rq_bytes(rq);        /* FIXME */
+		else
+			done = blk_rq_bytes(rq);
+
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		if (blk_special_request(rq)) {
 			rq->errors = 0;
-			done = blk_rq_bytes(rq);
 			error = 0;
 		} else {
 
@@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape && !rq->bio)
-				done = ide_rq_bytes(rq); /* FIXME */
-			else
-				done = blk_rq_bytes(rq);
-
 			error = uptodate ? 0 : -EIO;
 		}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d20704a..537b7c5 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 	drive->pc = pc;
 
 	if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
 			ide_floppy_report_error(floppy, pc);
+
 		/* Giving up */
 		pc->error = IDE_DRV_ERROR_GENERAL;
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2b9a136..8226d52 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 
 	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
 		(pc->flags & PC_FLAG_ABORT)) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		/*
 		 * We will "abort" retrying a packet command in case legitimate
 		 * error code was received (crossing a filemark, or end of the
@@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 			/* Giving up */
 			pc->error = IDE_DRV_ERROR_GENERAL;
 		}
+
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
cgit v0.10.2


From 35ab8d3251833e4052aa64b09b08195e949518c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: use single continuous buffer

Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers

ide-tape has its own multiple buffer mechanism using struct
idetape_bh.  It allocates buffer with decreasing order-of-two
allocations so that it results in minimum number of segments.
However, the implementation is quite complex and works in a way that
no other block or ide driver works necessitating a lot of special case
handling.

The benefit this complex allocation scheme brings is questionable as
PIO or DMA the number of segments (16 maximum) doesn't make any
noticeable difference and it also doesn't negate the need for multiple
order allocation which can fail under memory pressure or high
fragmentation although it does lower the highest order necessary by
one when the buffer size isn't power of two.

As the first step to remove the custom buffer management, this patch
makes ide-tape allocate single continous buffer.  The maximum order is
four.  I doubt the change would cause any trouble but if it ever
matters, it should be converted to regular sg mechanism like everyone
else and even in that case dropping custom buffer handling and moving
to standard mechanism first make sense as an intermediate step.

This patch makes the first bh to contain the whole buffer and drops
multi bh handling code.  Following patches will make further changes.

This patch has the side effect of killing OOM triggered by allocation
path and fixing DMA transfers.  Previously, bug in alloc path
triggered OOM on command issue and commands were passed to DMA engine
without DMA-mapping all the segments.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8226d52..b2afbc7 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -134,7 +134,6 @@ enum {
 struct idetape_bh {
 	u32 b_size;
 	atomic_t b_count;
-	struct idetape_bh *b_reqnext;
 	char *b_data;
 };
 
@@ -228,10 +227,6 @@ typedef struct ide_tape_obj {
 	char *b_data;
 	int b_count;
 
-	int pages_per_buffer;
-	/* Wasted space in each stage */
-	int excess_bh_size;
-
 	/* Measures average tape speed */
 	unsigned long avg_time;
 	int avg_size;
@@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min(
 			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
 			bcount);
@@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 					atomic_read(&bh->b_count), count);
 		bcount -= count;
 		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			pc->bh = NULL;
 	}
 
-	pc->bh = bh;
-
 	return bcount;
 }
 
@@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
 		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
 		bcount -= count;
 		pc->b_data += count;
 		pc->b_count -= count;
-		if (!pc->b_count) {
-			bh = bh->b_reqnext;
-			pc->bh = bh;
-			if (bh) {
-				pc->b_data = bh->b_data;
-				pc->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		if (!pc->b_count)
+			pc->bh = NULL;
 	}
 
 	return bcount;
@@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
 {
 	struct idetape_bh *bh = pc->bh;
-	int count;
 	unsigned int bcount = pc->xferred;
 
 	if (pc->flags & PC_FLAG_WRITING)
 		return;
-	while (bcount) {
-		if (bh == NULL) {
+	if (bcount) {
+		if (bh == NULL || bcount > bh->b_size) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return;
 		}
-		count = min((unsigned int)bh->b_size, (unsigned int)bcount);
-		atomic_set(&bh->b_count, count);
+		atomic_set(&bh->b_count, bcount);
 		if (atomic_read(&bh->b_count) == bh->b_size)
-			bh = bh->b_reqnext;
-		bcount -= count;
+			pc->bh = NULL;
 	}
-	pc->bh = bh;
 }
 
 /*
@@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 /* Free data buffers completely. */
 static void ide_tape_kfree_buffer(idetape_tape_t *tape)
 {
-	struct idetape_bh *prev_bh, *bh = tape->merge_bh;
-
-	while (bh) {
-		u32 size = bh->b_size;
-
-		while (size) {
-			unsigned int order = fls(size >> PAGE_SHIFT)-1;
-
-			if (bh->b_data)
-				free_pages((unsigned long)bh->b_data, order);
+	struct idetape_bh *bh = tape->merge_bh;
 
-			size &= (order-1);
-			bh->b_data += (1 << order) * PAGE_SIZE;
-		}
-		prev_bh = bh;
-		bh = bh->b_reqnext;
-		kfree(prev_bh);
-	}
+	kfree(bh->b_data);
+	kfree(bh);
 }
 
 static void ide_tape_handle_dsc(ide_drive_t *);
@@ -861,117 +823,50 @@ out:
 }
 
 /*
- * The function below uses __get_free_pages to allocate a data buffer of size
- * tape->buffer_size (or a bit more). We attempt to combine sequential pages as
- * much as possible.
- *
- * It returns a pointer to the newly allocated buffer, or NULL in case of
- * failure.
+ * It returns a pointer to the newly allocated buffer, or NULL in case
+ * of failure.
  */
 static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full, int clear)
-{
-	struct idetape_bh *prev_bh, *bh, *merge_bh;
-	int pages = tape->pages_per_buffer;
-	unsigned int order, b_allocd;
-	char *b_data = NULL;
-
-	merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	bh = merge_bh;
-	if (bh == NULL)
-		goto abort;
-
-	order = fls(pages) - 1;
-	bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-	if (!bh->b_data)
-		goto abort;
-	b_allocd = (1 << order) * PAGE_SIZE;
-	pages &= (order-1);
-
-	if (clear)
-		memset(bh->b_data, 0, b_allocd);
-	bh->b_reqnext = NULL;
-	bh->b_size = b_allocd;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+						  int full)
+{
+	struct idetape_bh *bh;
 
-	while (pages) {
-		order = fls(pages) - 1;
-		b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-		if (!b_data)
-			goto abort;
-		b_allocd = (1 << order) * PAGE_SIZE;
-
-		if (clear)
-			memset(b_data, 0, b_allocd);
-
-		/* newly allocated page frames below buffer header or ...*/
-		if (bh->b_data == b_data + b_allocd) {
-			bh->b_size += b_allocd;
-			bh->b_data -= b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		/* they are above the header */
-		if (b_data == bh->b_data + bh->b_size) {
-			bh->b_size += b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		prev_bh = bh;
-		bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-		if (!bh) {
-			free_pages((unsigned long) b_data, order);
-			goto abort;
-		}
-		bh->b_reqnext = NULL;
-		bh->b_data = b_data;
-		bh->b_size = b_allocd;
-		atomic_set(&bh->b_count, full ? bh->b_size : 0);
-		prev_bh->b_reqnext = bh;
+	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	if (!bh)
+		return NULL;
 
-		pages &= (order-1);
+	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!bh->b_data) {
+		kfree(bh);
+		return NULL;
 	}
 
-	bh->b_size -= tape->excess_bh_size;
-	if (full)
-		atomic_sub(tape->excess_bh_size, &bh->b_count);
-	return merge_bh;
-abort:
-	ide_tape_kfree_buffer(tape);
-	return NULL;
+	bh->b_size = tape->buffer_size;
+	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+
+	return bh;
 }
 
 static int idetape_copy_stage_from_user(idetape_tape_t *tape,
 					const char __user *buf, int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min((unsigned int)
-				(bh->b_size - atomic_read(&bh->b_count)),
-				(unsigned int)n);
 		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				count))
+				   n))
 			ret = 1;
-		n -= count;
-		atomic_add(count, &bh->b_count);
-		buf += count;
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		atomic_add(n, &bh->b_count);
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			tape->bh = NULL;
 	}
-	tape->bh = bh;
+
 	return ret;
 }
 
@@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
 				      int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > tape->b_count) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min(tape->b_count, n);
-		if  (copy_to_user(buf, tape->b_data, count))
+		if (copy_to_user(buf, tape->b_data, n))
 			ret = 1;
-		n -= count;
-		tape->b_data += count;
-		tape->b_count -= count;
-		buf += count;
-		if (!tape->b_count) {
-			bh = bh->b_reqnext;
-			tape->bh = bh;
-			if (bh) {
-				tape->b_data = bh->b_data;
-				tape->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		tape->b_data += n;
+		tape->b_count -= n;
+		if (!tape->b_count)
+			tape->bh = NULL;
 	}
 	return ret;
 }
@@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks, min;
+	int blocks;
 	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
@@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	if (tape->merge_bh_size) {
 		blocks = tape->merge_bh_size / tape->blk_size;
 		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i;
-
+			unsigned int i = tape->blk_size -
+				tape->merge_bh_size % tape->blk_size;
 			blocks++;
-			i = tape->blk_size - tape->merge_bh_size %
-				tape->blk_size;
-			bh = tape->bh->b_reqnext;
-			while (bh) {
-				atomic_set(&bh->b_count, 0);
-				bh = bh->b_reqnext;
-			}
 			bh = tape->bh;
-			while (i) {
-				if (bh == NULL) {
-					printk(KERN_INFO "ide-tape: bug,"
-							 " bh NULL\n");
-					break;
-				}
-				min = min(i, (unsigned int)(bh->b_size -
-						atomic_read(&bh->b_count)));
+			if (bh) {
 				memset(bh->b_data + atomic_read(&bh->b_count),
-						0, min);
-				atomic_add(min, &bh->b_count);
-				i -= min;
-				bh = bh->b_reqnext;
-			}
+				       0, i);
+				atomic_add(i, &bh->b_count);
+			} else
+				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
 		tape->merge_bh_size = 0;
@@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive)
 					 " 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
@@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh;
+	struct idetape_bh *bh = tape->merge_bh;
 	int blocks;
 
 	while (bcount) {
 		unsigned int count;
 
-		bh = tape->merge_bh;
 		count = min(tape->buffer_size, bcount);
 		bcount -= count;
 		blocks = count / tape->blk_size;
-		while (count) {
-			atomic_set(&bh->b_count,
-				   min(count, (unsigned int)bh->b_size));
-			memset(bh->b_data, 0, atomic_read(&bh->b_count));
-			count -= atomic_read(&bh->b_count);
-			bh = bh->b_reqnext;
-		}
+		atomic_set(&bh->b_count, count);
+		memset(bh->b_data, 0, atomic_read(&bh->b_count));
+
 		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
 				      tape->merge_bh);
 	}
@@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 				"should be 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
@@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0);
+	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
 	if (tape->merge_bh != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
@@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		tape->buffer_size = *ctl * tape->blk_size;
 	}
 	buffer_size = tape->buffer_size;
-	tape->pages_per_buffer = buffer_size / PAGE_SIZE;
-	if (buffer_size % PAGE_SIZE) {
-		tape->pages_per_buffer++;
-		tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE;
-	}
 
 	/* select the "best" DSC read/write polling freq */
 	speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
-- 
cgit v0.10.2


From 21d9c5d227593d15630ae83a336d1519653e9b8a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: use standard data transfer mechanism

Impact: use standard way to transfer data

ide-tape uses rq in an interesting way.  For r/w requests, rq->special
is used to carry a private buffer management structure idetape_bh and
rq->nr_sectors and current_nr_sectors are initialized to the number of
idetape blocks which isn't necessary 512 bytes.  Also,
rq->current_nr_sectors is used to report back the residual count in
units of idetape blocks.

This peculiarity taxes both block layer and ide.  ide-atapi has
different paths and hooks to accomodate it and what a rq means becomes
quite confusing and making changes at the block layer becomes quite
difficult and error-prone.

This patch makes ide-tape use bio instead.  With the previous patch,
ide-tape currently is using single contiguos buffer so replacing it
isn't difficult.  Data buffer is mapped into bio using
blk_rq_map_kern() in idetape_queue_rw_tail().  idetape_io_buffers()
and idetape_update_buffers() are dropped and pc->bh is set to null to
tell ide-atapi to use standard data transfer mechanism and idetape_bh
byte counts are updated by the issuer on completion using the residual
count.

This change also nicely removes the FIXME in ide_pc_intr() where
ide-tape rqs need to be completed using ide_rq_bytes() instead of
blk_rq_bytes() (although this didn't really matter as the request
didn't have bio).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 3df5442..b9dd450 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		 * ->pc_callback() might change rq->data_len for
 		 * residual count, cache total length.
 		 */
-		if (!blk_special_request(rq) &&
-		    (drive->media == ide_tape && !rq->bio))
-			done = ide_rq_bytes(rq);        /* FIXME */
-		else
-			done = blk_rq_bytes(rq);
+		done = blk_rq_bytes(rq);
 
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b2afbc7..b373ac8 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min(
-			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
-			bcount);
-		drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data +
-					atomic_read(&bh->b_count), count);
-		bcount -= count;
-		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				   unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
-		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
-		bcount -= count;
-		pc->b_data += count;
-		pc->b_count -= count;
-		if (!pc->b_count)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct idetape_bh *bh = pc->bh;
-	unsigned int bcount = pc->xferred;
-
-	if (pc->flags & PC_FLAG_WRITING)
-		return;
-	if (bcount) {
-		if (bh == NULL || bcount > bh->b_size) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return;
-		}
-		atomic_set(&bh->b_count, bcount);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-}
-
 /*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
@@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 		 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
 	/* Correct pc->xferred by asking the tape.	 */
-	if (pc->flags & PC_FLAG_DMA_ERROR) {
+	if (pc->flags & PC_FLAG_DMA_ERROR)
 		pc->xferred = pc->req_xfer -
 			tape->blk_size *
 			get_unaligned_be32(&sense[3]);
-		idetape_update_buffers(drive, pc);
-	}
 
 	/*
 	 * If error was the result of a zero-length read or write command,
@@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->current_nr_sectors -= blocks;
+		rq->data_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
 	idetape_postpone_request(drive);
 }
 
-static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				unsigned int bcount, int write)
-{
-	unsigned int bleft;
-
-	if (write)
-		bleft = idetape_output_buffers(drive, pc, bcount);
-	else
-		bleft = idetape_input_buffers(drive, pc, bcount);
-
-	return bcount - bleft;
-}
-
 /*
  * Packet Command Interface
  *
@@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = bh;
+	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
@@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 				 struct idetape_bh *bh)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	size_t size = blocks * tape->blk_size;
 	struct request *rq;
-	int ret, errors;
+	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->rq_disk = tape->disk;
 	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
-	rq->nr_sectors = blocks;
-	rq->current_nr_sectors = blocks;
+
+	if (size) {
+		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+				      __GFP_WAIT);
+		if (ret)
+			goto out_put;
+	}
+
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	errors = rq->errors;
-	ret = tape->blk_size * (blocks - rq->current_nr_sectors);
-	blk_put_request(rq);
+	/* calculate the number of transferred bytes and update bh */
+	size -= rq->data_len;
+	if (cmd == REQ_IDETAPE_READ)
+		atomic_add(size, &bh->b_count);
 
-	if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
-		return 0;
+	ret = size;
+	if (rq->errors == IDE_DRV_ERROR_GENERAL)
+		ret = -EIO;
 
 	if (tape->merge_bh)
 		idetape_init_merge_buffer(tape);
-	if (errors == IDE_DRV_ERROR_GENERAL)
-		return -EIO;
+out_put:
+	blk_put_request(rq);
 	return ret;
 }
 
@@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	u16 *ctl = (u16 *)&tape->caps[12];
 
 	drive->pc_callback	 = ide_tape_callback;
-	drive->pc_update_buffers = idetape_update_buffers;
-	drive->pc_io_buffers	 = ide_tape_io_buffers;
 
 	drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
 
-- 
cgit v0.10.2


From 963da55c4b9eeeb2085ca74ba927cf77bce966d4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: kill idetape_bh

Impact: kill now unnecessary idetape_bh

With everything using standard mechanisms, there is no need for
idetape_bh anymore.  Kill it and use tape->buf, cur and valid to
describe data buffer instead.

Changes worth mentioning are...

* idetape_queue_rq_tail() now always queue tape->buf and and adjusts
  buffer state properly before completion.

* idetape_pad_zeros() clears the buffer only once.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b373ac8..d2e9c09 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -131,12 +131,6 @@ enum {
 	IDETAPE_DIR_WRITE = (1 << 2),
 };
 
-struct idetape_bh {
-	u32 b_size;
-	atomic_t b_count;
-	char *b_data;
-};
-
 /* Tape door status */
 #define DOOR_UNLOCKED			0
 #define DOOR_LOCKED			1
@@ -218,14 +212,12 @@ typedef struct ide_tape_obj {
 
 	/* Data buffer size chosen based on the tape's recommendation */
 	int buffer_size;
-	/* merge buffer */
-	struct idetape_bh *merge_bh;
-	/* size of the merge buffer */
-	int merge_bh_size;
-	/* pointer to current buffer head within the merge buffer */
-	struct idetape_bh *bh;
-	char *b_data;
-	int b_count;
+	/* Staging buffer of buffer_size bytes */
+	void *buf;
+	/* The read/write cursor */
+	void *cur;
+	/* The number of valid bytes in buf */
+	size_t valid;
 
 	/* Measures average tape speed */
 	unsigned long avg_time;
@@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	}
 }
 
-/* Free data buffers completely. */
-static void ide_tape_kfree_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-
-	kfree(bh->b_data);
-	kfree(bh);
-}
-
 static void ide_tape_handle_dsc(ide_drive_t *);
 
 static int ide_tape_callback(ide_drive_t *drive, int dsc)
@@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	struct idetape_bh *bh = (struct idetape_bh *)rq->special;
-	unsigned int length = rq->current_nr_sectors;
+	unsigned int length = rq->nr_sectors;
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	if (pc->req_xfer == tape->buffer_size)
 		pc->flags |= PC_FLAG_DMA_OK;
 
-	if (opcode == READ_6) {
+	if (opcode == READ_6)
 		pc->c[0] = READ_6;
-		atomic_set(&bh->b_count, 0);
-	} else if (opcode == WRITE_6) {
+	else if (opcode == WRITE_6) {
 		pc->c[0] = WRITE_6;
 		pc->flags |= PC_FLAG_WRITING;
-		pc->b_data = bh->b_data;
-		pc->b_count = atomic_read(&bh->b_count);
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
@@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
-			" current_nr_sectors: %u\n",
-			(unsigned long long)rq->sector, rq->nr_sectors,
-			rq->current_nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
@@ -749,89 +726,6 @@ out:
 }
 
 /*
- * It returns a pointer to the newly allocated buffer, or NULL in case
- * of failure.
- */
-static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full)
-{
-	struct idetape_bh *bh;
-
-	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	if (!bh)
-		return NULL;
-
-	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (!bh->b_data) {
-		kfree(bh);
-		return NULL;
-	}
-
-	bh->b_size = tape->buffer_size;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
-
-	return bh;
-}
-
-static int idetape_copy_stage_from_user(idetape_tape_t *tape,
-					const char __user *buf, int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				   n))
-			ret = 1;
-		atomic_add(n, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			tape->bh = NULL;
-	}
-
-	return ret;
-}
-
-static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
-				      int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > tape->b_count) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_to_user(buf, tape->b_data, n))
-			ret = 1;
-		tape->b_data += n;
-		tape->b_count -= n;
-		if (!tape->b_count)
-			tape->bh = NULL;
-	}
-	return ret;
-}
-
-static void idetape_init_merge_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-	tape->bh = tape->merge_bh;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-		atomic_set(&bh->b_count, 0);
-	else {
-		tape->b_data = bh->b_data;
-		tape->b_count = atomic_read(&bh->b_count);
-	}
-}
-
-/*
  * Write a filemark if write_filemark=1. Flush the device buffers without
  * writing a filemark otherwise.
  */
@@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
 		return;
 
 	clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
-	tape->merge_bh_size = 0;
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	tape->valid = 0;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
@@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
-				 struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	size_t size = blocks * tape->blk_size;
@@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
 
 	if (size) {
-		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
 				      __GFP_WAIT);
 		if (ret)
 			goto out_put;
@@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	/* calculate the number of transferred bytes and update bh */
+	/* calculate the number of transferred bytes and update buffer state */
 	size -= rq->data_len;
+	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
-		atomic_add(size, &bh->b_count);
+		tape->valid = size;
+	else
+		tape->valid = 0;
 
 	ret = size;
 	if (rq->errors == IDE_DRV_ERROR_GENERAL)
 		ret = -EIO;
-
-	if (tape->merge_bh)
-		idetape_init_merge_buffer(tape);
 out_put:
 	blk_put_request(rq);
 	return ret;
@@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 /* Queue up a character device originated write request. */
 static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				     blocks, tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int blocks;
-	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
 				" but we are not writing.\n");
 		return;
 	}
-	if (tape->merge_bh_size > tape->buffer_size) {
-		printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n");
-		tape->merge_bh_size = tape->buffer_size;
-	}
-	if (tape->merge_bh_size) {
-		blocks = tape->merge_bh_size / tape->blk_size;
-		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i = tape->blk_size -
-				tape->merge_bh_size % tape->blk_size;
+	if (tape->buf) {
+		blocks = tape->valid / tape->blk_size;
+		if (tape->valid % tape->blk_size) {
 			blocks++;
-			bh = tape->bh;
-			if (bh) {
-				memset(bh->b_data + atomic_read(&bh->b_count),
-				       0, i);
-				atomic_add(i, &bh->b_count);
-			} else
-				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
+			memset(tape->buf + tape->valid, 0,
+			       tape->blk_size - tape->valid % tape->blk_size);
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
-		tape->merge_bh_size = 0;
 	}
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
@@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive)
 			ide_tape_flush_merge_buffer(drive);
 			idetape_flush_tape_buffers(drive);
 		}
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size should be"
-					 " 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a read 0 command to ensure that DSC handshake is
@@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive)
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_READ, 0);
 			if (bytes_read < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return bytes_read;
 			}
@@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive)
 /* called from idetape_chrdev_read() to service a chrdev read request. */
 static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
 	/* If we are at a filemark, return a read length of 0 */
@@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_read(drive);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
-				     tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh = tape->merge_bh;
-	int blocks;
+
+	memset(tape->buf, 0, tape->buffer_size);
 
 	while (bcount) {
-		unsigned int count;
+		unsigned int count = min(tape->buffer_size, bcount);
 
-		count = min(tape->buffer_size, bcount);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+				      count / tape->blk_size);
 		bcount -= count;
-		blocks = count / tape->blk_size;
-		atomic_set(&bh->b_count, count);
-		memset(bh->b_data, 0, atomic_read(&bh->b_count));
-
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
-				      tape->merge_bh);
 	}
 }
 
@@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		tape->merge_bh_size = 0;
+		tape->valid = 0;
 		if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 			++count;
 		ide_tape_discard_merge_buffer(drive, 0);
@@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		return rc;
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		actually_read = min((unsigned int)(tape->merge_bh_size),
-				    (unsigned int)count);
-		if (idetape_copy_stage_to_user(tape, buf, actually_read))
+	if (tape->valid) {
+		actually_read = min_t(unsigned int, tape->valid, count);
+		if (copy_to_user(buf, tape->cur, actually_read))
 			ret = -EFAULT;
 		buf += actually_read;
-		tape->merge_bh_size -= actually_read;
 		count -= actually_read;
+		tape->cur += actually_read;
+		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
 		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
 		if (bytes_read <= 0)
 			goto finish;
-		if (idetape_copy_stage_to_user(tape, buf, bytes_read))
+		if (copy_to_user(buf, tape->cur, bytes_read))
 			ret = -EFAULT;
 		buf += bytes_read;
 		count -= bytes_read;
@@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (idetape_copy_stage_to_user(tape, buf, temp))
+		if (copy_to_user(buf, tape->cur, temp))
 			ret = -EFAULT;
 		actually_read += temp;
-		tape->merge_bh_size = bytes_read-temp;
+		tape->cur += temp;
+		tape->valid -= temp;
 	}
 finish:
 	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
@@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		if (tape->chrdev_dir == IDETAPE_DIR_READ)
 			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size "
-				"should be 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		idetape_init_merge_buffer(tape);
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a write 0 command to ensure that DSC handshake is
@@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_WRITE, 0);
 			if (retval < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return retval;
 			}
@@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		if (tape->merge_bh_size >= tape->buffer_size) {
-			printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
-			tape->merge_bh_size = 0;
-		}
-		actually_written = min((unsigned int)
-				(tape->buffer_size - tape->merge_bh_size),
-				(unsigned int)count);
-		if (idetape_copy_stage_from_user(tape, buf, actually_written))
-				ret = -EFAULT;
+	if (tape->valid < tape->buffer_size) {
+		actually_written = min_t(unsigned int,
+					 tape->buffer_size - tape->valid,
+					 count);
+		if (copy_from_user(tape->cur, buf, actually_written))
+			ret = -EFAULT;
 		buf += actually_written;
-		tape->merge_bh_size += actually_written;
 		count -= actually_written;
+		tape->cur += actually_written;
+		tape->valid += actually_written;
 
-		if (tape->merge_bh_size == tape->buffer_size) {
+		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			tape->merge_bh_size = 0;
 			retval = idetape_add_chrdev_write_request(drive, ctl);
 			if (retval <= 0)
 				return (retval);
@@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	while (count >= tape->buffer_size) {
 		ssize_t retval;
-		if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size))
+		if (copy_from_user(tape->cur, buf, tape->buffer_size))
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
@@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count) {
 		actually_written += count;
-		if (idetape_copy_stage_from_user(tape, buf, count))
+		if (copy_from_user(tape->cur, buf, count))
 			ret = -EFAULT;
-		tape->merge_bh_size += count;
+		tape->cur += count;
+		tape->valid += count;
 	}
 	return ret ? ret : actually_written;
 }
@@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 		idetape_flush_tape_buffers(drive);
 	}
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = tape->merge_bh_size /
+		block_offset = tape->valid /
 			(tape->blk_size * tape->user_bs_factor);
 		position = idetape_read_position(drive);
 		if (position < 0)
@@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
-	if (tape->merge_bh != NULL) {
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (tape->buf != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	idetape_write_filemark(drive);
 	idetape_flush_tape_buffers(drive);
@@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev)
 	ide_drive_t *drive = tape->drive;
 	struct gendisk *g = tape->disk;
 
-	BUG_ON(tape->merge_bh_size);
+	BUG_ON(tape->valid);
 
 	drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
 	drive->driver_data = NULL;
-- 
cgit v0.10.2


From 88f1b941c5c94016a59144a3c94c9ca31eb16205 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: unify r/w init paths

Impact: cleanup

Read and write init paths are almost identical.  Unify them into
idetape_init_rw().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d2e9c09..4da7fa9 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
 
-static int idetape_init_read(ide_drive_t *drive)
+static int idetape_init_rw(ide_drive_t *drive, int dir)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int bytes_read;
+	int rc;
 
-	/* Initialize read operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-		if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-			ide_tape_flush_merge_buffer(drive);
-			idetape_flush_tape_buffers(drive);
-		}
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_READ;
-		tape->cur = tape->buf;
+	BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
 
-		/*
-		 * Issue a read 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode.
-		 * No point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0);
-			if (bytes_read < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return bytes_read;
-			}
+	if (tape->chrdev_dir == dir)
+		return 0;
+
+	if (tape->chrdev_dir == IDETAPE_DIR_READ)
+		ide_tape_discard_merge_buffer(drive, 1);
+	else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
+		ide_tape_flush_merge_buffer(drive);
+		idetape_flush_tape_buffers(drive);
+	}
+
+	if (tape->buf || tape->valid) {
+		printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+		tape->valid = 0;
+	}
+
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!tape->buf)
+		return -ENOMEM;
+	tape->chrdev_dir = dir;
+	tape->cur = tape->buf;
+
+	/*
+	 * Issue a 0 rw command to ensure that DSC handshake is
+	 * switched from completion mode to buffer available mode.  No
+	 * point in issuing this if DSC overlap isn't supported, some
+	 * drives (Seagate STT3401A) will return an error.
+	 */
+	if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
+		int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
+						  : REQ_IDETAPE_WRITE;
+
+		rc = idetape_queue_rw_tail(drive, cmd, 0);
+		if (rc < 0) {
+			kfree(tape->buf);
+			tape->buf = NULL;
+			tape->chrdev_dir = IDETAPE_DIR_NONE;
+			return rc;
 		}
 	}
 
@@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 		return 0;
 
-	idetape_init_read(drive);
+	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
 	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
@@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
-	rc = idetape_init_read(drive);
+	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
 	if (count == 0)
@@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
+	int rc;
 
 	/* The drive is write protected. */
 	if (tape->write_prot)
@@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		if (tape->chrdev_dir == IDETAPE_DIR_READ)
-			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		tape->cur = tape->buf;
-
-		/*
-		 * Issue a write 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode. No
-		 * point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0);
-			if (retval < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return retval;
-			}
-		}
-	}
+	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
+	if (rc < 0)
+		return rc;
 	if (count == 0)
 		return (0);
 	if (tape->valid < tape->buffer_size) {
-- 
cgit v0.10.2


From 6bb11dd14f70228f8dab25fd25dabeb9bc74926d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-tape: use byte size instead of sectors on rw issue functions

Impact: cleanup

Byte size is what most issue functions deal with, make
idetape_queue_rw_tail() and its wrappers take byte size instead of
sector counts.  idetape_chrdev_read() and write() functions are
converted to use tape->buffer_size instead of ctl from tape->cap.

This cleans up code a little bit and will ease the next r/w
reimplementation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4da7fa9..d5e9bb2 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	size_t size = blocks * tape->blk_size;
 	struct request *rq;
 	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
+	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 }
 
 /* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
 {
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
@@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		return;
 	}
 	if (tape->buf) {
-		blocks = tape->valid / tape->blk_size;
-		if (tape->valid % tape->blk_size) {
-			blocks++;
-			memset(tape->buf + tape->valid, 0,
-			       tape->blk_size - tape->valid % tape->blk_size);
-		}
-		(void) idetape_add_chrdev_write_request(drive, blocks);
-	}
-	if (tape->buf != NULL) {
+		size_t aligned = roundup(tape->valid, tape->blk_size);
+
+		memset(tape->cur, 0, aligned - tape->valid);
+		idetape_add_chrdev_write_request(drive, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 }
 
 /* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
 {
-	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
+	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
 
 	/* If we are at a filemark, return a read length of 0 */
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
@@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
@@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 	while (bcount) {
 		unsigned int count = min(tape->buffer_size, bcount);
 
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				      count / tape->blk_size);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
 		bcount -= count;
 	}
 }
@@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t bytes_read, temp, actually_read = 0, rc;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		if (copy_to_user(buf, tape->cur, bytes_read))
@@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		actually_read += bytes_read;
 	}
 	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
@@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 	int rc;
 
 	/* The drive is write protected. */
@@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 
 		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive, ctl);
+			retval = idetape_add_chrdev_write_request(drive,
+							tape->buffer_size);
 			if (retval <= 0)
 				return (retval);
 		}
@@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive, ctl);
+		retval = idetape_add_chrdev_write_request(drive,
+							  tape->buffer_size);
 		actually_written += tape->buffer_size;
 		if (retval <= 0)
 			return (retval);
-- 
cgit v0.10.2


From 07bd9686c50c2b1f10e48089d4fb836a971f5177 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-tape: simplify read/write functions

Impact: cleanup

idetape_chrdev_read/write() functions are unnecessarily complex when
everything can be handled in a single loop.  Collapse
idetape_add_chrdev_read/write_request() into the rw functions and
simplify the implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d5e9bb2..2599579 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-/* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
-}
-
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		size_t aligned = roundup(tape->valid, tape->blk_size);
 
 		memset(tape->cur, 0, aligned - tape->valid);
-		idetape_add_chrdev_write_request(drive, aligned);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 	return 0;
 }
 
-/* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
-
-	/* If we are at a filemark, return a read length of 0 */
-	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
-		return 0;
-
-	idetape_init_rw(drive, IDETAPE_DIR_READ);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
-}
-
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t bytes_read, temp, actually_read = 0, rc;
+	size_t done = 0;
 	ssize_t ret = 0;
+	int rc;
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
+
 	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid) {
-		actually_read = min_t(unsigned int, tape->valid, count);
-		if (copy_to_user(buf, tape->cur, actually_read))
-			ret = -EFAULT;
-		buf += actually_read;
-		count -= actually_read;
-		tape->cur += actually_read;
-		tape->valid -= actually_read;
-	}
-	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		if (copy_to_user(buf, tape->cur, bytes_read))
-			ret = -EFAULT;
-		buf += bytes_read;
-		count -= bytes_read;
-		actually_read += bytes_read;
-	}
-	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (copy_to_user(buf, tape->cur, temp))
+
+	while (done < count) {
+		size_t todo;
+
+		/* refill if staging buffer is empty */
+		if (!tape->valid) {
+			/* If we are at a filemark, nothing more to read */
+			if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+				break;
+			/* read */
+			if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
+						  tape->buffer_size) <= 0)
+				break;
+		}
+
+		/* copy out */
+		todo = min_t(size_t, count - done, tape->valid);
+		if (copy_to_user(buf + done, tape->cur, todo))
 			ret = -EFAULT;
-		actually_read += temp;
-		tape->cur += temp;
-		tape->valid -= temp;
+
+		tape->cur += todo;
+		tape->valid -= todo;
+		done += todo;
 	}
-finish:
-	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+
+	if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
 		debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
 
 		idetape_space_over_filemarks(drive, MTFSF, 1);
 		return 0;
 	}
 
-	return ret ? ret : actually_read;
+	return ret ? ret : done;
 }
 
 static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
@@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t actually_written = 0;
+	size_t done = 0;
 	ssize_t ret = 0;
 	int rc;
 
@@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid < tape->buffer_size) {
-		actually_written = min_t(unsigned int,
-					 tape->buffer_size - tape->valid,
-					 count);
-		if (copy_from_user(tape->cur, buf, actually_written))
-			ret = -EFAULT;
-		buf += actually_written;
-		count -= actually_written;
-		tape->cur += actually_written;
-		tape->valid += actually_written;
-
-		if (tape->valid == tape->buffer_size) {
-			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive,
-							tape->buffer_size);
-			if (retval <= 0)
-				return (retval);
-		}
-	}
-	while (count >= tape->buffer_size) {
-		ssize_t retval;
-		if (copy_from_user(tape->cur, buf, tape->buffer_size))
-			ret = -EFAULT;
-		buf += tape->buffer_size;
-		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive,
-							  tape->buffer_size);
-		actually_written += tape->buffer_size;
-		if (retval <= 0)
-			return (retval);
-	}
-	if (count) {
-		actually_written += count;
-		if (copy_from_user(tape->cur, buf, count))
+
+	while (done < count) {
+		size_t todo;
+
+		/* flush if staging buffer is full */
+		if (tape->valid == tape->buffer_size &&
+		    idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+					  tape->buffer_size) <= 0)
+			return rc;
+
+		/* copy in */
+		todo = min_t(size_t, count - done,
+			     tape->buffer_size - tape->valid);
+		if (copy_from_user(tape->cur, buf + done, todo))
 			ret = -EFAULT;
-		tape->cur += count;
-		tape->valid += count;
+
+		tape->cur += todo;
+		tape->valid += todo;
+		done += todo;
 	}
-	return ret ? ret : actually_written;
+
+	return ret ? ret : done;
 }
 
 static int idetape_write_filemark(ide_drive_t *drive)
-- 
cgit v0.10.2


From 6d7003877c2f0578f1c08f66d05c3f72ef4ae596 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-atapi: kill unused fields and callbacks

Impact: remove fields and code paths which are no longer necessary

Now that ide-tape uses standard mechanisms to transfer data, special
case handling for bh handling can be dropped from ide-atapi.  Drop the
followings.

* pc->cur_pos, b_count, bh and b_data
* drive->pc_update_buffers() and pc_io_buffers().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index b9dd450..afe5a43 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					drive->name, rq_data_dir(pc->rq)
 						     ? "write" : "read");
 			pc->flags |= PC_FLAG_DMA_ERROR;
-		} else {
+		} else
 			pc->xferred = pc->req_xfer;
-			if (drive->pc_update_buffers)
-				drive->pc_update_buffers(drive, pc);
-		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
@@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (drive->media == ide_tape && pc->bh)
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	else {
-		done = min_t(unsigned int, bcount, cmd->nleft);
-		ide_pio_bytes(drive, cmd, write, done);
-	}
+	done = min_t(unsigned int, bcount, cmd->nleft);
+	ide_pio_bytes(drive, cmd, write, done);
 
-	/* Update the current position */
+	/* Update transferred byte count */
 	pc->xferred += done;
-	pc->cur_pos += done;
 
 	bcount -= done;
 
@@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 
 		/* We haven't transferred any data yet */
 		pc->xferred = 0;
-		pc->cur_pos = pc->buf;
 
 		valid_tf = IDE_VALID_DEVICE;
 		bcount = ((drive->media == ide_tape) ?
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2599579..8dfc688 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1957461..34c128f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,11 +362,7 @@ struct ide_atapi_pc {
 
 	/* data buffer */
 	u8 *buf;
-	/* current buffer position */
-	u8 *cur_pos;
 	int buf_size;
-	/* missing/available data on the current buffer */
-	int b_count;
 
 	/* the corresponding request */
 	struct request *rq;
@@ -379,10 +375,6 @@ struct ide_atapi_pc {
 	 */
 	u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
-	/* idetape only */
-	struct idetape_bh *bh;
-	char *b_data;
-
 	unsigned long timeout;
 };
 
@@ -595,10 +587,6 @@ struct ide_drive_s {
 	/* callback for packet commands */
 	int  (*pc_callback)(struct ide_drive_s *, int);
 
-	void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *);
-	int  (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *,
-			      unsigned int, int);
-
 	ide_startstop_t (*irq_handler)(struct ide_drive_s *);
 
 	unsigned long atapi_flags;
-- 
cgit v0.10.2


From 2c316bb57ad4e9f0f3de2d7ef1ae85530c2a7e69 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide: drop rq->data handling from ide_map_sg()

Impact: remove code path which is no longer necessary

All IDE data transfers now use rq->bio.  Simplify ide_map_sg()
accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 6e3094e..a0309ea 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (!rq->bio) {
-		sg_init_one(sg, rq->data, rq->data_len);
-		cmd->sg_nents = 1;
-	} else
-		cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
+	cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 }
 EXPORT_SYMBOL_GPL(ide_map_sg);
 
-- 
cgit v0.10.2


From e475eedb09ee9a0fd855f3e923aa9af31c17d141 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:04 +0000
Subject: clocksource: sh_cmt earlytimer support

Add Early Platform Driver support to the sh_cmt driver
using the earlytimer class.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 1c92c39..02bae39 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -566,9 +566,19 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 static int __devinit sh_cmt_probe(struct platform_device *pdev)
 {
 	struct sh_cmt_priv *p = platform_get_drvdata(pdev);
+	struct sh_cmt_config *cfg = pdev->dev.platform_data;
 	int ret;
 
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p) {
+		pr_info("sh_cmt: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	if (is_early_platform_device(pdev))
+		p = alloc_bootmem(sizeof(*p));
+	else
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+
 	if (p == NULL) {
 		dev_err(&pdev->dev, "failed to allocate driver data\n");
 		return -ENOMEM;
@@ -576,7 +586,10 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 
 	ret = sh_cmt_setup(p, pdev);
 	if (ret) {
-		kfree(p);
+		if (is_early_platform_device(pdev))
+			free_bootmem(__pa(p), sizeof(*p));
+		else
+			kfree(p);
 
 		platform_set_drvdata(pdev, NULL);
 	}
@@ -606,6 +619,7 @@ static void __exit sh_cmt_exit(void)
 	platform_driver_unregister(&sh_cmt_device_driver);
 }
 
+early_platform_init("earlytimer", &sh_cmt_device_driver);
 module_init(sh_cmt_init);
 module_exit(sh_cmt_exit);
 
-- 
cgit v0.10.2


From eaab89197b733d0f81f07d6c44294b674479fda8 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:12 +0000
Subject: sh: arch earlytimer support

Extend the 32-bit SuperH timer code to register and probe
the earlytimer class of Early Platform Drivers.

This registers the sh_cmt driver if compiled-in.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c34e1e0..04b8c6a 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/clockchips.h>
 #include <linux/mc146818rtc.h>	/* for rtc_lock */
+#include <linux/platform_device.h>
 #include <linux/smp.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
@@ -229,6 +230,14 @@ void __init time_init(void)
 #endif
 
 	/*
+	 * Make sure all compiled-in early timers register themselves.
+	 * Run probe() for one "earlytimer" device.
+	 */
+	early_platform_driver_register_all("earlytimer");
+	if (early_platform_driver_probe("earlytimer", 1, 0))
+		return;
+
+	/*
 	 * Find the timer to use as the system timer, it will be
 	 * initialized for us.
 	 */
-- 
cgit v0.10.2


From 87a00dc059e3af46303f1f56b0e8df41af988c7b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:21 +0000
Subject: sh: Add plat_early_device_setup()

Add a plat_early_device_setup() function to allow
processor-specific code to register Early Platform Data.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/device.h b/arch/sh/include/asm/device.h
index efd511d..8688a88 100644
--- a/arch/sh/include/asm/device.h
+++ b/arch/sh/include/asm/device.h
@@ -10,3 +10,5 @@ struct platform_device;
 int platform_resource_setup_memory(struct platform_device *pdev,
 				   char *name, unsigned long memsize);
 
+void plat_early_device_setup(void);
+
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 04a6004..22b976d 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -29,6 +29,7 @@
 #include <linux/mmzone.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/platform_device.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/page.h>
@@ -328,6 +329,10 @@ static int __init parse_elfcorehdr(char *arg)
 early_param("elfcorehdr", parse_elfcorehdr);
 #endif
 
+void __init __attribute__ ((weak)) plat_early_device_setup(void)
+{
+}
+
 void __init setup_arch(char **cmdline_p)
 {
 	enable_mmu();
@@ -381,6 +386,8 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	plat_early_device_setup();
+
 	sh_mv_setup();
 
 	/*
-- 
cgit v0.10.2


From 28fde6863e6ed1f4bfb7cef080e67bf439c20628 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:30 +0000
Subject: sh: Early Platform Data for SuperH Mobile

Use plat_early_device_setup() to register Early Platform Data
for SuperH Mobile processors.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index c154938..cb5b4db 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -234,6 +234,16 @@ static int __init sh7343_devices_setup(void)
 }
 __initcall(sh7343_devices_setup);
 
+static struct platform_device *sh7343_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7343_early_devices,
+				   ARRAY_SIZE(sh7343_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 93ecf8e..2a771f4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -226,6 +226,16 @@ static int __init sh7366_devices_setup(void)
 }
 __initcall(sh7366_devices_setup);
 
+static struct platform_device *sh7366_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7366_early_devices,
+				   ARRAY_SIZE(sh7366_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 0e5d204..8f974d0 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -270,6 +270,16 @@ static int __init sh7722_devices_setup(void)
 }
 __initcall(sh7722_devices_setup);
 
+static struct platform_device *sh7722_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7722_early_devices,
+				   ARRAY_SIZE(sh7722_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 5338dac..21a58d6 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -281,6 +281,16 @@ static int __init sh7723_devices_setup(void)
 }
 __initcall(sh7723_devices_setup);
 
+static struct platform_device *sh7723_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7723_early_devices,
+				   ARRAY_SIZE(sh7723_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
-- 
cgit v0.10.2


From d9aed8b95f062b2f96e37b0e07373f36a6e7066d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 19 Apr 2009 13:12:04 +0900
Subject: sh: sh7724: Don't default enable the RTC clock.

rtc-sh takes care of this now, so no need to have this always enabled.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 8b87ba8..195274a 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -275,7 +275,6 @@ static struct platform_device *sh7724_devices[] __initdata = {
 
 static int __init sh7724_devices_setup(void)
 {
-	clk_always_enable("rtc0");   /* RTC */
 	clk_always_enable("vpu0");   /* VPU */
 	clk_always_enable("veu1");   /* VEU3F1 */
 	clk_always_enable("veu0");   /* VEU3F0 */
-- 
cgit v0.10.2


From 8fb2bae4b41eb64f6e233e9bd3f3a789fbb04a06 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 19 Apr 2009 13:14:29 +0900
Subject: sh: sh7724: Register CMT as an early platform device here too.

Follows the SH7722/SH7723 changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 195274a..8429396 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -288,6 +288,16 @@ static int __init sh7724_devices_setup(void)
 }
 device_initcall(sh7724_devices_setup);
 
+static struct platform_device *sh7724_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7724_early_devices,
+				   ARRAY_SIZE(sh7724_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 2b2fd87a6ef56ba7647a578e81bb8c8efda166b8 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:12 +0800
Subject: docs, x86: add nox2apic back to kernel-parameters.txt

"nox2apic" was removed from kernel-parameters.txt by mistake, when
entries were sorted in alpha order (commit 0cb55ad2). But this early
parameter is still there, add it back to kernel-parameters.txt.

[ Impact: add boot parameter description ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-2-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6172e43..33989d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1588,6 +1588,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nowb		[ARM]
 
+	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
+
 	nptcg=		[IA64] Override max number of concurrent global TLB
 			purges which is reported from either PAL_VM_SUMMARY or
 			SAL PALO.
-- 
cgit v0.10.2


From 5d0ae2db6deac4f15dac4f42f23bc56448fc8d4d Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:13 +0800
Subject: x86, intr-remap: fix ack for interrupt remapping

Shouldn't call ack_apic_edge() in ir_ack_apic_edge(), because
ack_apic_edge() does more than just ack: it also does irq migration
in the non-interrupt-remapping case. But there is no such need for
interrupt-remapping case, as irq migration is done in the process
context.

Similarly, ir_ack_apic_level() shouldn't call ack_apic_level, and
instead should do the local cpu's EOI + directed EOI to the io-apic.

ack_x2APIC_irq() is not neccessary, because ack_APIC_irq() will use MSR
write for x2apic, and uncached write for non-x2apic.

[ Impact: simplify/standardize intr-remap IRQ acking, fix on !x2apic ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-3-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2bd5a46..d4cb7e5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -204,14 +204,6 @@ static inline int x2apic_enabled(void)
 
 extern int get_physical_broadcast(void);
 
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
-	/* Docs say use 0 for future compatibility */
-	native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
-
 extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8499000..ea22a86 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2552,20 +2552,6 @@ eoi_ioapic_irq(struct irq_desc *desc)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	ack_x2APIC_irq();
-	eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
-	ack_x2APIC_irq();
-}
-#endif
-
 static void ack_apic_edge(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2629,9 +2615,6 @@ static void ack_apic_level(unsigned int irq)
 	 */
 	ack_APIC_irq();
 
-	if (irq_remapped(irq))
-		eoi_ioapic_irq(desc);
-
 	/* Now we can move and renable the irq */
 	if (unlikely(do_unmask_irq)) {
 		/* Only migrate the irq if the ack has been received.
@@ -2680,20 +2663,15 @@ static void ack_apic_level(unsigned int irq)
 #ifdef CONFIG_INTR_REMAP
 static void ir_ack_apic_edge(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_edge(irq);
-#endif
-       return ack_apic_edge(irq);
+	ack_APIC_irq();
 }
 
 static void ir_ack_apic_level(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_level(irq);
-#endif
-       return ack_apic_level(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_APIC_irq();
+	eoi_ioapic_irq(desc);
 }
 #endif /* CONFIG_INTR_REMAP */
 
-- 
cgit v0.10.2


From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:14 +0800
Subject: x86, intr-remap: enable interrupt remapping early

Currently, when x2apic is not enabled, interrupt remapping
will be enabled in init_dmars(), where it is too late to remap
ioapic interrupts, that is, ioapic interrupts are really in
compatibility mode, not remappable mode.

This patch always enables interrupt remapping before ioapic
setup, it guarantees all interrupts will be remapped when
interrupt remapping is enabled. Thus it doesn't need to set
the compatibility interrupt bit.

[ Impact: refactor intr-remap init sequence, enable fuller remap mode ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d4cb7e5..fbdd654 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -169,7 +169,6 @@ static inline u64 native_x2apic_icr_read(void)
 extern int x2apic, x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
 static inline int x2apic_enabled(void)
 {
@@ -190,18 +189,18 @@ static inline void check_x2apic(void)
 static inline void enable_x2apic(void)
 {
 }
-static inline void enable_IR_x2apic(void)
-{
-}
 static inline int x2apic_enabled(void)
 {
 	return 0;
 }
 
 #define	x2apic	0
+#define	x2apic_preenabled 0
 
 #endif
 
+extern void enable_IR_x2apic(void);
+
 extern int get_physical_broadcast(void);
 
 extern void apic_disable(void);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 83e47fe..0cf1eea 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -141,6 +141,8 @@ static int x2apic_preenabled;
 static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
+	if (x2apic_enabled())
+		panic("Bios already enabled x2apic, can't enforce nox2apic");
 	disable_x2apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 	return 0;
@@ -1345,6 +1347,7 @@ void enable_x2apic(void)
 		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
 	}
 }
+#endif /* CONFIG_X86_X2APIC */
 
 void __init enable_IR_x2apic(void)
 {
@@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void)
 	unsigned long flags;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
-	if (!cpu_has_x2apic)
-		return;
-
-	if (!x2apic_preenabled && disable_x2apic) {
-		pr_info("Skipped enabling x2apic and Interrupt-remapping "
-			"because of nox2apic\n");
-		return;
+	ret = dmar_table_init();
+	if (ret) {
+		pr_debug("dmar_table_init() failed with %d:\n", ret);
+		goto ir_failed;
 	}
 
-	if (x2apic_preenabled && disable_x2apic)
-		panic("Bios already enabled x2apic, can't enforce nox2apic");
-
-	if (!x2apic_preenabled && skip_ioapic_setup) {
-		pr_info("Skipped enabling x2apic and Interrupt-remapping "
-			"because of skipping io-apic setup\n");
-		return;
+	if (!intr_remapping_supported()) {
+		pr_debug("intr-remapping not supported\n");
+		goto ir_failed;
 	}
 
-	ret = dmar_table_init();
-	if (ret) {
-		pr_info("dmar_table_init() failed with %d:\n", ret);
 
-		if (x2apic_preenabled)
-			panic("x2apic enabled by bios. But IR enabling failed");
-		else
-			pr_info("Not enabling x2apic,Intr-remapping\n");
+	if (!x2apic_preenabled && skip_ioapic_setup) {
+		pr_info("Skipped enabling intr-remap because of skipping "
+			"io-apic setup\n");
 		return;
 	}
 
@@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void)
 	mask_IO_APIC_setup(ioapic_entries);
 	mask_8259A();
 
-	ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
-	if (ret && x2apic_preenabled) {
-		local_irq_restore(flags);
-		panic("x2apic enabled by bios. But IR enabling failed");
-	}
+#ifdef CONFIG_X86_X2APIC
+	if (cpu_has_x2apic)
+		ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
+	else
+#endif
+		ret = enable_intr_remapping(EIM_8BIT_APIC_ID);
 
 	if (ret)
 		goto end_restore;
 
-	if (!x2apic) {
+	pr_info("Enabled Interrupt-remapping\n");
+
+#ifdef CONFIG_X86_X2APIC
+	if (cpu_has_x2apic && !x2apic) {
 		x2apic = 1;
 		enable_x2apic();
+		pr_info("Enabled x2apic\n");
 	}
+#endif
 
 end_restore:
 	if (ret)
@@ -1426,30 +1423,29 @@ end_restore:
 	local_irq_restore(flags);
 
 end:
-	if (!ret) {
-		if (!x2apic_preenabled)
-			pr_info("Enabled x2apic and interrupt-remapping\n");
-		else
-			pr_info("Enabled Interrupt-remapping\n");
-	} else
-		pr_err("Failed to enable Interrupt-remapping and x2apic\n");
 	if (ioapic_entries)
 		free_ioapic_entries(ioapic_entries);
+
+	if (!ret)
+		return;
+
+ir_failed:
+	if (x2apic_preenabled)
+		panic("x2apic enabled by bios. But IR enabling failed");
+	else if (cpu_has_x2apic)
+		pr_info("Not enabling x2apic,Intr-remapping\n");
 #else
 	if (!cpu_has_x2apic)
 		return;
 
 	if (x2apic_preenabled)
 		panic("x2apic enabled prior OS handover,"
-		      " enable CONFIG_INTR_REMAP");
-
-	pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
-		" and x2apic\n");
+		      " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
 #endif
 
 	return;
 }
-#endif /* CONFIG_X86_X2APIC */
+
 
 #ifdef CONFIG_X86_64
 /*
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 001b328..9ce8f07 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1968,15 +1968,6 @@ static int __init init_dmars(void)
 		}
 	}
 
-#ifdef CONFIG_INTR_REMAP
-	if (!intr_remapping_enabled) {
-		ret = enable_intr_remapping(0);
-		if (ret)
-			printk(KERN_ERR
-			       "IOMMU: enable interrupt remapping failed\n");
-	}
-#endif
-
 	/*
 	 * For each rmrr
 	 *   for each dev attached to rmrr
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f5e0ea7..5c21426 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -423,20 +423,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 		      readl, (sts & DMA_GSTS_IRTPS), sts);
 	spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-	if (mode == 0) {
-		spin_lock_irqsave(&iommu->register_lock, flags);
-
-		/* enable comaptiblity format interrupt pass through */
-		cmd = iommu->gcmd | DMA_GCMD_CFI;
-		iommu->gcmd |= DMA_GCMD_CFI;
-		writel(cmd, iommu->reg + DMAR_GCMD_REG);
-
-		IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-			      readl, (sts & DMA_GSTS_CFIS), sts);
-
-		spin_unlock_irqrestore(&iommu->register_lock, flags);
-	}
-
 	/*
 	 * global invalidation of interrupt entry cache before enabling
 	 * interrupt-remapping.
@@ -516,6 +502,20 @@ end:
 	spin_unlock_irqrestore(&iommu->register_lock, flags);
 }
 
+int __init intr_remapping_supported(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			return 0;
+	}
+
+	return 1;
+}
+
 int __init enable_intr_remapping(int eim)
 {
 	struct dmar_drhd_unit *drhd;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e397dc3..06f592a 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -108,6 +108,7 @@ struct irte {
 };
 #ifdef CONFIG_INTR_REMAP
 extern int intr_remapping_enabled;
+extern int intr_remapping_supported(void);
 extern int enable_intr_remapping(int);
 extern void disable_intr_remapping(void);
 extern int reenable_intr_remapping(int);
-- 
cgit v0.10.2


From 03ea81550676296d94596e4337c771c6ba29f542 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:15 +0800
Subject: x86, intr-remap: add option to disable interrupt remapping

Add option "nointremap" to disable interrupt remapping.

[ Impact: add new boot option ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-5-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 33989d2..843cb66 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1533,6 +1533,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	noinitrd	[RAM] Tells the kernel not to load any configured
 			initial RAM disk.
 
+	nointremap	[X86-64, Intel-IOMMU] Do not enable interrupt
+			remapping.
+
 	nointroute	[IA-64]
 
 	nojitter	[IA64] Disables jitter checking for ITC timers.
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 5c21426..842039e 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static int ir_ioapic_num;
 int intr_remapping_enabled;
 
+static int disable_intremap;
+static __init int setup_nointremap(char *str)
+{
+	disable_intremap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
 struct irq_2_iommu {
 	struct intel_iommu *iommu;
 	u16 irte_index;
@@ -506,6 +514,9 @@ int __init intr_remapping_supported(void)
 {
 	struct dmar_drhd_unit *drhd;
 
+	if (disable_intremap)
+		return 0;
+
 	for_each_drhd_unit(drhd) {
 		struct intel_iommu *iommu = drhd->iommu;
 
-- 
cgit v0.10.2


From 9a2755c3569e4db92bd9b1daadeddb4045b0cccd Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:16 +0800
Subject: x86, intr-remap: fix x2apic/intr-remap resume

Interrupt remapping was decoupled from x2apic. Shouldn't check
x2apic before resume interrupt remapping. Otherwise, interrupt
remapping won't be resumed when x2apic is not enabled.

[ Impact: fix potential intr-remap resume hang on !x2apic ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-6-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0cf1eea..7b41a32 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2032,7 +2032,7 @@ static int lapic_resume(struct sys_device *dev)
 		return 0;
 
 	local_irq_save(flags);
-	if (x2apic) {
+	if (intr_remapping_enabled) {
 		ioapic_entries = alloc_ioapic_entries();
 		if (!ioapic_entries) {
 			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
@@ -2048,8 +2048,10 @@ static int lapic_resume(struct sys_device *dev)
 
 		mask_IO_APIC_setup(ioapic_entries);
 		mask_8259A();
-		enable_x2apic();
 	}
+
+	if (x2apic)
+		enable_x2apic();
 #else
 	if (!apic_pm_state.active)
 		return 0;
@@ -2097,10 +2099,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_read(APIC_ESR);
 
 #ifdef CONFIG_INTR_REMAP
-	if (intr_remapping_enabled)
-		reenable_intr_remapping(EIM_32BIT_APIC_ID);
+	if (intr_remapping_enabled) {
+		if (x2apic)
+			reenable_intr_remapping(EIM_32BIT_APIC_ID);
+		else
+			reenable_intr_remapping(EIM_8BIT_APIC_ID);
 
-	if (x2apic) {
 		unmask_8259A();
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
@@ -2109,7 +2113,6 @@ static int lapic_resume(struct sys_device *dev)
 
 	local_irq_restore(flags);
 
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From cece3155d869a50ba534ed161b5a05e8a29dcad0 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 18 Apr 2009 23:45:28 +0400
Subject: x86: smpboot - wakeup_secondary should be done via __cpuinit section

A caller (do_boot_cpu) already has __cpuinit attribute.

Since HOTPLUG_CPU depends on SMP && HOTPLUG it doesn't
lead to panic at moment.

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090418194528.GD25510@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bf8ad63..d2e8de9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
  */
-int __devinit
+int __cpuinit
 wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-static int __devinit
+static int __cpuinit
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
-- 
cgit v0.10.2


From 667c5296cc76fefe0abcb79228952b28d9af45e3 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 19 Apr 2009 11:43:11 +0400
Subject: x86: es7000, uv - use __cpuinit for kicking secondary cpus

The caller already has __cpuinit attribute.

[ Impact: save memory, address section mismatch warning ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
LKML-Reference: <20090419074311.GA8670@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b81..8e07c14 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
 	return gsi;
 }
 
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
 
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index de1a50a..873bf71 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -91,7 +91,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	cpumask_set_cpu(cpu, retmask);
 }
 
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
 #ifdef CONFIG_SMP
 	unsigned long val;
-- 
cgit v0.10.2


From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:38:12 +0200
Subject: tracing/core: Add current context on tracing recursion warning

In case of tracing recursion detection, we only get the stacktrace.
But the current context may be very useful to debug the issue.

This patch adds the softirq/hardirq/nmi context with the warning
using lockdep context display to have a familiar output.

v2: Use printk_once()
v3: drop {hardirq,softirq}_context which depend on lockdep,
    only keep what is part of current->trace_recursion,
    sufficient to debug the warning source.

[ Impact: print context necessary to debug recursion ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b421b0e..bffde63 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void)
 	if (unlikely(current->trace_recursion & (1 << level))) {
 		/* Disable all tracing before we do anything else */
 		tracing_off_permanent();
+
+		printk_once(KERN_WARNING "Tracing recursion: "
+			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+			    hardirq_count() >> HARDIRQ_SHIFT,
+			    softirq_count() >> SOFTIRQ_SHIFT,
+			    in_nmi());
+
 		WARN_ON_ONCE(1);
 		return -1;
 	}
-- 
cgit v0.10.2


From 9ade1217c9ba39ad2f004a898ddfbb815fd5fe74 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:38:25 +0900
Subject: sh: pci: Drop asm-generic/pci.h, so we can use our own fixups.

The new PCI code wants its own bus<->resource mappings instead of the
generic equivalents, so drop the asm-generic include in preparation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 8aaacc9..6d659cd 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,6 +72,20 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			     struct resource *res)
+{
+	region->start = res->start;
+	region->end = res->end;
+}
+
+void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			     struct pci_bus_region *region)
+{
+	res->start = region->start;
+	res->end = region->end;
+}
+
 void pcibios_align_resource(void *data, struct resource *res,
 			    resource_size_t size, resource_size_t align)
 			    __attribute__ ((weak));
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index bb2c2fc..69cb615 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -146,13 +146,34 @@ int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 int pciauto_assign_resources(int busno, struct pci_channel *hose);
 #endif
 
-#endif /* __KERNEL__ */
+extern void pcibios_resource_to_bus(struct pci_dev *dev,
+	struct pci_bus_region *region, struct resource *res);
+
+extern void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+				    struct pci_bus_region *region);
+
+static inline struct resource *
+pcibios_select_root(struct pci_dev *pdev, struct resource *res)
+{
+	struct resource *root = NULL;
+
+	if (res->flags & IORESOURCE_IO)
+		root = &ioport_resource;
+	if (res->flags & IORESOURCE_MEM)
+		root = &iomem_resource;
 
-/* generic pci stuff */
-#include <asm-generic/pci.h>
+	return root;
+}
+
+/* Chances are this interrupt is wired PC-style ...  */
+static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
+{
+	return channel ? 15 : 14;
+}
 
 /* generic DMA-mapping stuff */
 #include <asm-generic/pci-dma-compat.h>
 
+#endif /* __KERNEL__ */
 #endif /* __ASM_SH_PCI_H */
 
-- 
cgit v0.10.2


From 4c5107e44514a9bde74d0af77982705d602d9e39 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:43:36 +0900
Subject: sh: pci: Split out new-style PCI core.

This splits off a 'pci-new.c' which is aimed at gradually replacing the
pci-auto backend and the arch/sh/drivers/pci/pci.c core respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 7e816ed..efe8cf9 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -18,10 +18,17 @@ config SH_PCIDMA_NONCOHERENT
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
 
+# Temporary config option for transitioning off of PCI_AUTO
+config PCI_NEW
+	bool
+	depends on PCI
+	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
+		     CPU_SUBTYPE_SH7785
+
 # This is also board-specific
 config PCI_AUTO
 	bool
-	depends on PCI
+	depends on PCI && !PCI_NEW
 	default y
 
 config PCI_AUTO_UPDATE_RESOURCES
@@ -34,4 +41,3 @@ config PCI_AUTO_UPDATE_RESOURCES
 	  for some reason, you have a board that simply refuses to work
 	  with its resources updated beyond what they are when the device
 	  is powered up, set this to N. Everyone else will want this as Y.
-
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 5003971..362a1ee 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -2,8 +2,8 @@
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
 
-obj-y					+= pci.o
-obj-$(CONFIG_PCI_AUTO)			+= pci-auto.o
+obj-$(CONFIG_PCI_AUTO)			:= pci.o pci-auto.o
+obj-$(CONFIG_PCI_NEW)			:= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
new file mode 100644
index 0000000..097eb88
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -0,0 +1,248 @@
+/*
+ * New-style PCI core.
+ *
+ * Copyright (c) 2002  M. R. Brown
+ * Copyright (c) 2004 - 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/dma-debug.h>
+#include <linux/io.h>
+
+static int __init pcibios_init(void)
+{
+	struct pci_channel *p;
+	struct pci_bus *bus;
+	int busno;
+
+	/* init channels */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->init(p) == 0)
+			p->enabled = 1;
+		else
+			pr_err("Unable to init pci channel %d\n", busno);
+		busno++;
+	}
+
+	/* scan the buses */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->enabled) {
+			bus = pci_scan_bus(busno, p->pci_ops, p);
+			busno = bus->subordinate + 1;
+
+			pci_bus_size_bridges(bus);
+			pci_bus_assign_resources(bus);
+			pci_enable_bridges(bus);
+		}
+	}
+
+	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
+
+	dma_debug_add_bus(&pci_bus_type);
+
+	return 0;
+}
+subsys_initcall(pcibios_init);
+
+static void pcibios_fixup_device_resources(struct pci_dev *dev,
+	struct pci_bus *bus)
+{
+	/* Update device resources.  */
+	struct pci_channel *chan = bus->sysdata;
+	unsigned long offset = 0;
+	int i;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		if (!dev->resource[i].start)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_IO)
+			offset = chan->io_base;
+		else if (dev->resource[i].flags & IORESOURCE_MEM)
+			offset = 0;
+
+		dev->resource[i].start += offset;
+		dev->resource[i].end += offset;
+	}
+}
+
+
+/*
+ *  Called after each bus is probed, but before its children
+ *  are examined.
+ */
+void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct list_head *ln;
+	struct pci_channel *chan = bus->sysdata;
+
+	if (!dev) {
+		bus->resource[0] = chan->io_resource;
+		bus->resource[1] = chan->mem_resource;
+	}
+
+	for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
+		dev = pci_dev_b(ln);
+
+		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+			pcibios_fixup_device_resources(dev, bus);
+	}
+}
+
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *chan = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = chan->io_base;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = 0;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *chan = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = chan->io_base;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = 0;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+			    __attribute__ ((weak));
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	if (res->flags & IORESOURCE_IO) {
+		resource_size_t start = res->start;
+
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	}
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for(idx=0; idx<6; idx++) {
+		if (!(mask & (1 << idx)))
+			continue;
+		r = &dev->resource[idx];
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available because "
+			       "of resource collisions\n", pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (dev->resource[PCI_ROM_RESOURCE].start)
+		cmd |= PCI_COMMAND_MEMORY;
+	if (cmd != old_cmd) {
+		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_IO)
+		return ioport_map(start, len);
+	if (flags & IORESOURCE_MEM)
+		return ioremap(start, len);
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+EXPORT_SYMBOL(board_pci_channels);
-- 
cgit v0.10.2


From 9833385131fc4e8c52f95320ab899051d1c06831 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:51:45 +0900
Subject: sh: pci: HAVE_PCI_MMAP support.

Derived from the MIPS version, now uses pgprot_noncached().

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 362a1ee..c8eab14 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,9 +1,9 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-
-obj-$(CONFIG_PCI_AUTO)			:= pci.o pci-auto.o
-obj-$(CONFIG_PCI_NEW)			:= pci-new.o
+obj-y					+= pci-lib.o
+obj-$(CONFIG_PCI_AUTO)			+= pci.o pci-auto.o
+obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
new file mode 100644
index 0000000..1a43a35
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -0,0 +1,26 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state, int write_combine)
+{
+	/*
+	 * I/O space can be accessed via normal processor loads and stores on
+	 * this platform but for now we elect not to do this and portable
+	 * drivers should not do this anyway.
+	 */
+	if (mmap_state == pci_mmap_io)
+		return -EINVAL;
+
+	/*
+	 * Ignore write-combine; for now only return uncached mappings.
+	 */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 69cb615..46afd44 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -40,6 +40,9 @@ extern struct pci_channel board_pci_channels[];
 
 struct pci_dev;
 
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+	enum pci_mmap_state mmap_state, int write_combine);
 extern void pcibios_set_master(struct pci_dev *dev);
 
 static inline void pcibios_penalize_isa_irq(int irq, int active)
-- 
cgit v0.10.2


From a3c0e0d0032d5bbfd7dc04827a257c717d432a5b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:14:29 +0900
Subject: sh: pci: Consolidate pcibios_align_resource() definitions.

This introduces a saner pcibios_align_resource() that can be used
regardless of whether pci-auto or pci-new are being used, and
consolidates it in pci-lib.c.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c
index 203b292..9ec64a4 100644
--- a/arch/sh/boards/mach-se/7751/pci.c
+++ b/arch/sh/boards/mach-se/7751/pci.c
@@ -30,6 +30,9 @@
 #define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
 #define PCIC_READ(x) readl(PCI_REG(x))
 
+#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
+#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+
 /*
  * Description:  This function sets up and initializes the pcic, sets
  * up the BARS, maps the DRAM into the address space etc, etc.
@@ -97,12 +100,12 @@ int __init pcibios_init_platform(void)
     * meaning all calls go straight through... use BUG_ON to
     * catch erroneous assumption.
     */
-   BUG_ON(PCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
+   BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
 
-   PCIC_WRITE(SH7751_PCIMBR, PCIBIOS_MIN_MEM);
+   PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM);
 
    /* Set IOBR for window containing area specified in pci.h */
-   PCIC_WRITE(SH7751_PCIIOBR, (PCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
+   PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
 
    /* All done, may as well say so... */
    printk("SH7751 PCI: Finished initialization of the PCI controller\n");
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 1a43a35..8ab1a2d 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -4,6 +4,41 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 
+unsigned long PCIBIOS_MIN_IO = 0x0000;
+unsigned long PCIBIOS_MIN_MEM = 0;
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	struct pci_channel *chan = dev->sysdata;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
+			start = PCIBIOS_MIN_IO + chan->io_resource->start;
+
+		/*
+                 * Put everything into 0x00-0xff region modulo 0x400.
+		 */
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
+			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
+	}
+
+	res->start = start;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
@@ -24,3 +59,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       vma->vm_end - vma->vm_start,
 			       vma->vm_page_prot);
 }
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+EXPORT_SYMBOL(PCIBIOS_MIN_IO);
+EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+#endif
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 097eb88..4e9251f 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -129,29 +129,6 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end + offset;
 }
 
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-			    __attribute__ ((weak));
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	if (res->flags & IORESOURCE_IO) {
-		resource_size_t start = res->start;
-
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	}
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 6d659cd..f670988 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -86,29 +86,6 @@ void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end;
 }
 
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-			    __attribute__ ((weak));
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	if (res->flags & IORESOURCE_IO) {
-		resource_size_t start = res->start;
-
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	}
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 46afd44..5212bf6 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -33,10 +33,7 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
-/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */
-#define __PCI_CHAN(bus)		((struct pci_channel *)bus->sysdata)
-#define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
-#define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
+extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
 
 struct pci_dev;
 
-- 
cgit v0.10.2


From 394b6d2fe624246e258a218dac68d44fe9a8411f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:18:46 +0900
Subject: sh: pci: Kill off unused pcibios_fixup().

This is left over cruft that hasn't been used by anything in a long time,
kill off bits that weren't purged previously.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index dd2c5df..b64f2b9 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -85,8 +85,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 
 	return irq;
 }
-
-void __init pcibios_fixup(void)
-{
-	/* Nothing to fixup .. */
-}
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5212bf6..e8265fd 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -138,7 +138,6 @@ static inline void __iomem *__get_pci_io_base(unsigned long port,
 #endif
 
 /* Board-specific fixup routines. */
-void pcibios_fixup(void);
 int pcibios_init_platform(void);
 int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 
-- 
cgit v0.10.2


From 0bb34a6bf1f71d5ad2abfda582a2c2794957bc7b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:38:00 +0900
Subject: sh: pci: Consolidate pci_iomap() and use the generic I/O base.

This consolidates the pci_iomap() definitions and reworks how the I/O
port base is handled. PCI channels can register their own I/O map base,
or if none is provided, the system-wide generic I/O base is used instead.

Functionally nothing changes, while this allows us to kill off lots of
I/O address special casing and lookups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 8ab1a2d..654ffcc 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -60,6 +60,57 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       vma->vm_page_prot);
 }
 
+static void __iomem *ioport_map_pci(struct pci_dev *dev,
+				    unsigned long port, unsigned int nr)
+{
+	struct pci_channel *chan = dev->sysdata;
+
+	if (!chan->io_map_base)
+		chan->io_map_base = generic_io_base;
+
+	return (void __iomem *)(chan->io_map_base + port);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	if (flags & IORESOURCE_IO)
+		return ioport_map_pci(dev, start, len);
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_MEM) {
+		if (flags & IORESOURCE_CACHEABLE)
+			return ioremap(start, len);
+
+		return ioremap_nocache(start, len);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
 #ifdef CONFIG_HOTPLUG
 EXPORT_SYMBOL(pcibios_resource_to_bus);
 EXPORT_SYMBOL(pcibios_bus_to_resource);
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 4e9251f..c92e650 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -187,39 +187,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_IO)
-		return ioport_map(start, len);
-	if (flags & IORESOURCE_MEM)
-		return ioremap(start, len);
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index f670988..d39f240 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -144,39 +144,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_IO)
-		return ioport_map(start, len);
-	if (flags & IORESOURCE_MEM)
-		return ioremap(start, len);
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index e8265fd..5324282 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -26,6 +26,8 @@ struct pci_channel {
 	int enabled;
 	unsigned long reg_base;
 	unsigned long io_base;
+
+	unsigned long io_map_base;
 };
 
 /*
@@ -110,31 +112,11 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 	}
 	return 0;
 }
-
-static inline void __iomem *__get_pci_io_base(unsigned long port,
-					      unsigned long size)
-{
-	struct pci_channel *p;
-	struct resource *res;
-
-	for (p = board_pci_channels; p->init; p++) {
-		res = p->io_resource;
-		if (p->enabled && (port >= res->start) &&
-		    (port + size) <= (res->end + 1))
-			return (void __iomem *)(p->io_base + port);
-	}
-	return NULL;
-}
 #else
 static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 {
 	return 0;
 }
-static inline void __iomem *__get_pci_io_base(unsigned long port,
-					      unsigned long size)
-{
-	return NULL;
-}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
index 59fb020..4f85fff 100644
--- a/arch/sh/kernel/io.c
+++ b/arch/sh/kernel/io.c
@@ -70,10 +70,6 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr)
 	if (ret)
 		return ret;
 
-	ret = __get_pci_io_base(port, nr);
-	if (ret)
-		return ret;
-
 	return __ioport_map(port, nr);
 }
 EXPORT_SYMBOL(ioport_map);
-- 
cgit v0.10.2


From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:39:33 +0200
Subject: tracing/ring-buffer: Add unlock recursion protection on discard

The pair of helpers trace_recursive_lock() and trace_recursive_unlock()
have been introduced recently to provide generic tracing recursion
protection.

They are used in a symetric way:

 - trace_recursive_lock() on buffer reserve
 - trace_recursive_unlock() on buffer commit

However sometimes, we don't commit but discard on entry
to the buffer, ie: in case of filter checking.

Then we must also unlock the recursion protection on discard time,
otherwise the tracing gets definitely deactivated and a warning
is raised spuriously, such as:

111.119821] ------------[ cut here ]------------
[  111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0()
[  111.119835] Hardware name: AMILO Li 2727
[  111.119839] Modules linked in:
[  111.119846] Pid: 5731, comm: Xorg Tainted: G        W  2.6.30-rc1 #69
[  111.119851] Call Trace:
[  111.119863]  [<ffffffff8025ce68>] warn_slowpath+0xd8/0x130
[  111.119873]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119882]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119891]  [<ffffffff802199b0>] ? native_sched_clock+0x20/0x70
[  111.119899]  [<ffffffff80286dee>] ? put_lock_stats+0xe/0x30
[  111.119906]  [<ffffffff80286eb8>] ? lock_release_holdtime+0xa8/0x150
[  111.119913]  [<ffffffff802c8ae7>] ring_buffer_lock_reserve+0x1b7/0x1d0
[  111.119921]  [<ffffffff802cd110>] trace_buffer_lock_reserve+0x30/0x70
[  111.119930]  [<ffffffff802ce000>] trace_current_buffer_lock_reserve+0x20/0x30
[  111.119939]  [<ffffffff802474e8>] ftrace_raw_event_sched_switch+0x58/0x100
[  111.119948]  [<ffffffff808103b7>] __schedule+0x3a7/0x4cd
[  111.119957]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119964]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119971]  [<ffffffff80810c08>] schedule+0x18/0x40
[  111.119977]  [<ffffffff80810e09>] preempt_schedule+0x39/0x60
[  111.119985]  [<ffffffff80813bd3>] _read_unlock+0x53/0x60
[  111.119993]  [<ffffffff807259d2>] sock_def_readable+0x72/0x80
[  111.120002]  [<ffffffff807ad5ed>] unix_stream_sendmsg+0x24d/0x3d0
[  111.120011]  [<ffffffff807219a3>] sock_aio_write+0x143/0x160
[  111.120019]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120026]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120033]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120042]  [<ffffffff8031c283>] do_sync_readv_writev+0xf3/0x140
[  111.120049]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120057]  [<ffffffff80276ff0>] ? autoremove_wake_function+0x0/0x40
[  111.120067]  [<ffffffff8045d489>] ? cap_file_permission+0x9/0x10
[  111.120074]  [<ffffffff8045c1e6>] ? security_file_permission+0x16/0x20
[  111.120082]  [<ffffffff8031cab4>] do_readv_writev+0xd4/0x1f0
[  111.120089]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120097]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120105]  [<ffffffff8031cc18>] vfs_writev+0x48/0x70
[  111.120111]  [<ffffffff8031cd65>] sys_writev+0x55/0xc0
[  111.120119]  [<ffffffff80211e32>] system_call_fastpath+0x16/0x1b
[  111.120125] ---[ end trace 15605f4e98d5ccb5 ]---

[ Impact: fix spurious warning triggering tracing shutdown ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bffde63..e145969 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
 /**
  * ring_buffer_event_discard - discard any event in the ring buffer
  * @event: the event to discard
@@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  */
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
+	rb_event_discard(event);
+	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
@@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	int cpu;
 
 	/* The event is discarded regardless */
-	ring_buffer_event_discard(event);
+	rb_event_discard(event);
 
 	/*
 	 * This must only be called if the event has not been
@@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	if (rb_is_commit(cpu_buffer, event))
 		rb_set_commit_to_write(cpu_buffer);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
cgit v0.10.2


From 99f95f117848088f2708b45c70be73152e78bb8a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:24:57 +0900
Subject: sh: pci: Rework fixed region checks in ioremap().

Not all PCI channels have non-translatable memory windows, this is a
special property of the on-chip PCIC with its 0xfd00... mapping, handle
this explicitly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5324282..82a9369 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -90,7 +90,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 #endif
 
-#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -99,24 +98,18 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strategy_parameter = ~0UL;
 }
 
-static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
-{
-	struct pci_channel *p;
-	struct resource *res;
-
-	for (p = board_pci_channels; p->init; p++) {
-		res = p->mem_resource;
-		if (p->enabled && (phys_addr >= res->start) &&
-		    (phys_addr + size) <= (res->end + 1))
-			return 1;
-	}
-	return 0;
-}
+#ifdef CONFIG_SUPERH32
+/*
+ * If we're on an SH7751 or SH7780 PCI controller, PCI memory is mapped
+ * at the end of the address space in a special non-translatable area.
+ */
+#define PCI_MEM_FIXED_START	0xfd000000
+#define PCI_MEM_FIXED_END	(PCI_MEM_FIXED_START + 0x01000000)
+
+#define is_pci_memory_fixed_range(s, e)	\
+	((s) >= PCI_MEM_FIXED_START && (e) < PCI_MEM_FIXED_END)
 #else
-static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
-{
-	return 0;
-}
+#define is_pci_memory_fixed_range(s, e)	(0)
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c
index 7e04cc8..da2f418 100644
--- a/arch/sh/mm/ioremap_32.c
+++ b/arch/sh/mm/ioremap_32.c
@@ -46,17 +46,15 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 		return NULL;
 
 	/*
-	 * If we're on an SH7751 or SH7780 PCI controller, PCI memory is
-	 * mapped at the end of the address space (typically 0xfd000000)
-	 * in a non-translatable area, so mapping through page tables for
-	 * this area is not only pointless, but also fundamentally
-	 * broken. Just return the physical address instead.
+	 * If we're in the fixed PCI memory range, mapping through page
+	 * tables is not only pointless, but also fundamentally broken.
+	 * Just return the physical address instead.
 	 *
 	 * For boards that map a small PCI memory aperture somewhere in
 	 * P1/P2 space, ioremap() will already do the right thing,
 	 * and we'll never get this far.
 	 */
-	if (__is_pci_memory(phys_addr, size))
+	if (is_pci_memory_fixed_range(phys_addr, size))
 		return (void __iomem *)phys_addr;
 
 #if !defined(CONFIG_PMB_FIXED)
@@ -121,7 +119,9 @@ void __iounmap(void __iomem *addr)
 	unsigned long seg = PXSEG(vaddr);
 	struct vm_struct *p;
 
-	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0))
+	if (seg < P3SEG || vaddr >= P3_ADDR_MAX)
+		return;
+	if (is_pci_memory_fixed_range(vaddr, 0))
 		return;
 
 #ifdef CONFIG_PMB
-- 
cgit v0.10.2


From e79066a659b893debe19010179d3f3f015d76d1c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:29:22 +0900
Subject: sh: pci: New-style controller registration.

This moves off of the board_pci_channels[] approach for bus registration
and over to a cleaner register_pci_controller(), all derived from the
MIPS code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index c92e650..78b7292 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -13,40 +13,90 @@
 #include <linux/init.h>
 #include <linux/dma-debug.h>
 #include <linux/io.h>
+#include <linux/mutex.h>
 
-static int __init pcibios_init(void)
+/*
+ * The PCI controller list.
+ */
+static struct pci_channel *hose_head, **hose_tail = &hose_head;
+
+static int pci_initialized;
+
+static void __devinit pcibios_scanbus(struct pci_channel *hose)
 {
-	struct pci_channel *p;
+	static int next_busno;
 	struct pci_bus *bus;
-	int busno;
-
-	/* init channels */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->init(p) == 0)
-			p->enabled = 1;
-		else
-			pr_err("Unable to init pci channel %d\n", busno);
-		busno++;
+
+	/* Catch botched conversion attempts */
+	BUG_ON(hose->init);
+
+	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
+	if (bus) {
+		next_busno = bus->subordinate + 1;
+		/* Don't allow 8-bit bus number overflow inside the hose -
+		   reserve some space for bridges. */
+		if (next_busno > 224)
+			next_busno = 0;
+
+		pci_bus_size_bridges(bus);
+		pci_bus_assign_resources(bus);
+		pci_enable_bridges(bus);
 	}
+}
 
-	/* scan the buses */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->enabled) {
-			bus = pci_scan_bus(busno, p->pci_ops, p);
-			busno = bus->subordinate + 1;
+static DEFINE_MUTEX(pci_scan_mutex);
 
-			pci_bus_size_bridges(bus);
-			pci_bus_assign_resources(bus);
-			pci_enable_bridges(bus);
-		}
+void __devinit register_pci_controller(struct pci_channel *hose)
+{
+	if (request_resource(&iomem_resource, hose->mem_resource) < 0)
+		goto out;
+	if (request_resource(&ioport_resource, hose->io_resource) < 0) {
+		release_resource(hose->mem_resource);
+		goto out;
+	}
+
+	*hose_tail = hose;
+	hose_tail = &hose->next;
+
+	/*
+	 * Do not panic here but later - this might hapen before console init.
+	 */
+	if (!hose->io_map_base) {
+		printk(KERN_WARNING
+		       "registering PCI controller with io_map_base unset\n");
+	}
+
+	/*
+	 * Scan the bus if it is register after the PCI subsystem
+	 * initialization.
+	 */
+	if (pci_initialized) {
+		mutex_lock(&pci_scan_mutex);
+		pcibios_scanbus(hose);
+		mutex_unlock(&pci_scan_mutex);
 	}
 
+	return;
+
+out:
+	printk(KERN_WARNING
+	       "Skipping PCI bus scan due to resource conflict\n");
+}
+
+static int __init pcibios_init(void)
+{
+	struct pci_channel *hose;
+
+	/* Scan all of the recorded PCI controllers.  */
+	for (hose = hose_head; hose; hose = hose->next)
+		pcibios_scanbus(hose);
+
 	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
 
 	dma_debug_add_bus(&pci_bus_type);
 
+	pci_initialized = 1;
+
 	return 0;
 }
 subsys_initcall(pcibios_init);
@@ -74,7 +124,6 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 	}
 }
 
-
 /*
  *  Called after each bus is probed, but before its children
  *  are examined.
@@ -186,5 +235,3 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 {
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
-
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index f02d9df..4dd6e3b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -15,11 +15,47 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-static int __init sh7780_pci_init(struct pci_channel *chan)
+extern u8 pci_cache_line_size;
+
+static struct resource sh7785_io_resource = {
+	.name	= "SH7785_IO",
+	.start	= SH7780_PCI_IO_BASE,
+	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7785_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7780_PCI_MEMORY_BASE,
+	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+static struct pci_channel sh7780_pci_controller = {
+	.pci_ops	= &sh4_pci_ops,
+	.mem_resource	= &sh7785_mem_resource,
+	.io_resource	= &sh7785_io_resource,
+};
+
+static struct sh4_pci_address_map sh7780_pci_map = {
+	.window0	= {
+#if defined(CONFIG_32BIT)
+		.base	= SH7780_32BIT_DDR_BASE_ADDR,
+		.size	= 0x40000000,
+#else
+		.base	= SH7780_CS0_BASE_ADDR,
+		.size	= 0x20000000,
+#endif
+	},
+};
+
+static int __init sh7780_pci_init(void)
 {
+	struct pci_channel *chan = &sh7780_pci_controller;
 	unsigned int id;
 	const char *type = NULL;
 	int ret;
+	u32 word;
 
 	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
@@ -55,52 +91,6 @@ static int __init sh7780_pci_init(struct pci_channel *chan)
 		return ret;
 
 	/*
-	 * Platform specific initialization (BSC registers, and memory space
-	 * mapping) will be called via the platform defined function
-	 * pcibios_init_platform().
-	 */
-	return pcibios_init_platform();
-}
-
-extern u8 pci_cache_line_size;
-
-static struct resource sh7785_io_resource = {
-	.name	= "SH7785_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7785_mem_resource = {
-	.name	= "SH7785_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7780_pci_map = {
-	.window0	= {
-#if defined(CONFIG_32BIT)
-		.base	= SH7780_32BIT_DDR_BASE_ADDR,
-		.size	= 0x40000000,
-#else
-		.base	= SH7780_CS0_BASE_ADDR,
-		.size	= 0x20000000,
-#endif
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	struct pci_channel *chan = &board_pci_channels[0];
-	u32 word;
-
-	/*
 	 * Set the class and sub-class codes.
 	 */
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8,
@@ -153,5 +143,8 @@ int __init pcibios_init_platform(void)
 
 	__set_io_port_base(SH7780_PCI_IO_BASE);
 
+	register_pci_controller(chan);
+
 	return 0;
 }
+arch_initcall(sh7780_pci_init);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 82a9369..e057ebd 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -17,17 +17,22 @@
  * external) PCI controllers.
  */
 struct pci_channel {
-	int (*init)(struct pci_channel *chan);
-	struct pci_ops *pci_ops;
-	struct resource *io_resource;
-	struct resource *mem_resource;
-	int first_devfn;
-	int last_devfn;
-	int enabled;
-	unsigned long reg_base;
-	unsigned long io_base;
-
-	unsigned long io_map_base;
+	struct pci_channel	*next;
+
+	int			(*init)(struct pci_channel *chan);
+
+	struct pci_ops		*pci_ops;
+	struct resource		*io_resource;
+	struct resource		*mem_resource;
+
+	int			first_devfn;
+	int			last_devfn;
+	int			enabled;
+
+	unsigned long		reg_base;
+	unsigned long		io_base;
+
+	unsigned long		io_map_base;
 };
 
 /*
@@ -35,6 +40,8 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
+extern void register_pci_controller(struct pci_channel *hose);
+
 extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
 
 struct pci_dev;
-- 
cgit v0.10.2


From 09cfeb133e3cac39b8b9a2cb1d8ab4f77e396248 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:42:00 +0900
Subject: sh: pci: Track io and mem_offset per-channel.

This implements a per-hose offset for I/O and mem resources.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 78b7292..e8ac8da 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -105,7 +105,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 	struct pci_bus *bus)
 {
 	/* Update device resources.  */
-	struct pci_channel *chan = bus->sysdata;
+	struct pci_channel *hose = bus->sysdata;
 	unsigned long offset = 0;
 	int i;
 
@@ -115,9 +115,9 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
 			continue;
 		if (dev->resource[i].flags & IORESOURCE_IO)
-			offset = chan->io_base;
+			offset = hose->io_offset;
 		else if (dev->resource[i].flags & IORESOURCE_MEM)
-			offset = 0;
+			offset = hose->mem_offset;
 
 		dev->resource[i].start += offset;
 		dev->resource[i].end += offset;
@@ -150,13 +150,13 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
 			 struct resource *res)
 {
-	struct pci_channel *chan = dev->sysdata;
+	struct pci_channel *hose = dev->sysdata;
 	unsigned long offset = 0;
 
 	if (res->flags & IORESOURCE_IO)
-		offset = chan->io_base;
+		offset = hose->io_offset;
 	else if (res->flags & IORESOURCE_MEM)
-		offset = 0;
+		offset = hose->mem_offset;
 
 	region->start = res->start - offset;
 	region->end = res->end - offset;
@@ -166,13 +166,13 @@ void __devinit
 pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 			struct pci_bus_region *region)
 {
-	struct pci_channel *chan = dev->sysdata;
+	struct pci_channel *hose = dev->sysdata;
 	unsigned long offset = 0;
 
 	if (res->flags & IORESOURCE_IO)
-		offset = chan->io_base;
+		offset = hose->io_offset;
 	else if (res->flags & IORESOURCE_MEM)
-		offset = 0;
+		offset = hose->mem_offset;
 
 	res->start = region->start + offset;
 	res->end = region->end + offset;
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 4dd6e3b..57a3b87 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -34,7 +34,9 @@ static struct resource sh7785_mem_resource = {
 static struct pci_channel sh7780_pci_controller = {
 	.pci_ops	= &sh4_pci_ops,
 	.mem_resource	= &sh7785_mem_resource,
+	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7785_io_resource,
+	.io_offset	= 0x00000000,
 };
 
 static struct sh4_pci_address_map sh7780_pci_map = {
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index e057ebd..0be2052 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -25,6 +25,9 @@ struct pci_channel {
 	struct resource		*io_resource;
 	struct resource		*mem_resource;
 
+	unsigned long		io_offset;
+	unsigned long		mem_offset;
+
 	int			first_devfn;
 	int			last_devfn;
 	int			enabled;
-- 
cgit v0.10.2


From 5160d3f782a5e0cdb3bdaa8a891a1fb9d9ab83ec Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:47:21 +0900
Subject: sh: pci: Consolidate bus<->resource mapping in pci-lib.

Now that the io and mem offsets are tracked accordingly, the pci-new
version of the bus<->resource mappers can be used generically. This
moves them in to pci-lib.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 654ffcc..9fd3af9 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -39,6 +39,37 @@ void pcibios_align_resource(void *data, struct resource *res,
 	res->start = start;
 }
 
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index e8ac8da..9d42614 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -147,37 +147,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	}
 }
 
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			 struct resource *res)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	region->start = res->start - offset;
-	region->end = res->end - offset;
-}
-
-void __devinit
-pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			struct pci_bus_region *region)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	res->start = region->start + offset;
-	res->end = region->end + offset;
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index d39f240..c15a6f0 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,20 +72,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			     struct resource *res)
-{
-	region->start = res->start;
-	region->end = res->end;
-}
-
-void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			     struct pci_bus_region *region)
-{
-	res->start = region->start;
-	res->end = region->end;
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
-- 
cgit v0.10.2


From 3f8daeacd7ed7a502daf0998e2515cea4f467f21 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:53:41 +0900
Subject: sh: pci: Consolidate the remaining common bits.

This moves the remaining common bits in to pci-lib. Thereby reducing
pci.c/pci-new.c to simple bus fixups and controller registration.

As more platforms are moved over, the old code will disappear completely
and the pci-new bits will be rolled in to pci-lib, eventually replacing
pci.c completely.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 9fd3af9..ea83629 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -70,6 +70,70 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end + offset;
 }
 
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<idx)))
+			continue;
+
+		r = &dev->resource[idx];
+		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if ((idx == PCI_ROM_RESOURCE) &&
+				(!(r->flags & IORESOURCE_ROM_ENABLE)))
+			continue;
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available "
+			       "because of resource collisions\n",
+			       pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (cmd != old_cmd) {
+		printk("PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 9d42614..8c0b136 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -146,61 +146,3 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 			pcibios_fixup_device_resources(dev, bus);
 	}
 }
-
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for(idx=0; idx<6; idx++) {
-		if (!(mask & (1 << idx)))
-			continue;
-		r = &dev->resource[idx];
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because "
-			       "of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (dev->resource[PCI_ROM_RESOURCE].start)
-		cmd |= PCI_COMMAND_MEMORY;
-	if (cmd != old_cmd) {
-		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index c15a6f0..8c332b2 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,62 +72,4 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for(idx=0; idx<6; idx++) {
-		if (!(mask & (1 << idx)))
-			continue;
-		r = &dev->resource[idx];
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because "
-			       "of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (dev->resource[PCI_ROM_RESOURCE].start)
-		cmd |= PCI_COMMAND_MEMORY;
-	if (cmd != old_cmd) {
-		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
 EXPORT_SYMBOL(board_pci_channels);
-- 
cgit v0.10.2


From 5ba7205fc49ff72e88784c94fb661f93e7ae7d36 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:00:32 +0900
Subject: sh: pci: Kill off the now unused hose->io_base.

Nothing is using this any more, so kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index af887443..4c08fd7 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,7 +40,6 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe200000;
-	chan->io_base = 0xfe240000;
 
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
@@ -136,13 +135,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
 	pci_write_reg(chan, word , SH4_PCIMBR);
 
-	/* Map IO space into PCI IO window:
-	 * IO addresses will be translated to the PCI IO window base address
-	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
-		 chan->io_resource->start, chan->io_resource->end,
-		 chan->io_base + chan->io_resource->start);
-
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
 	word = chan->io_resource->start & SH4_PCIIOBR_MASK;
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 57a3b87..ae13ff9 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -62,7 +62,6 @@ static int __init sh7780_pci_init(void)
 	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
-	chan->io_base = 0xfe200000;
 
 	/* Enable CPU access to the PCIC registers. */
 	__raw_writel(PCIECR_ENBL, PCIECR);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 0be2052..f36c789 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -33,7 +33,6 @@ struct pci_channel {
 	int			enabled;
 
 	unsigned long		reg_base;
-	unsigned long		io_base;
 
 	unsigned long		io_map_base;
 };
-- 
cgit v0.10.2


From bb3396477bf46c7cae6bd04b969580277957966e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:26:45 +0900
Subject: sh: pci: Kill off superfluous lboxre2 pci fixups.

This is a verbatim copy of the r2d one, use that instead.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index c8eab14..4cac866 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -22,5 +22,5 @@ obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
deleted file mode 100644
index a82011d..0000000
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-lboxre2.c
- *
- * L-BOX RE2 PCI fixups
- *
- * Copyright (C) 2007 Nobuhiro Iwamatsu
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-#define PCIMCR_MRSET_OFF	0xBFFFFFFF
-#define PCIMCR_RFSH_OFF		0xFFFFFFFB
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	unsigned long bcr1, mcr;
-
-	bcr1 = ctrl_inl(SH7751_BCR1);
-	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
-
-	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
-
-	mcr = ctrl_inl(SH7751_MCR);
-	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(chan, mcr, SH4_PCIMCR);
-
-	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
-	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
-
-	return 0;
-}
-- 
cgit v0.10.2


From d556fcc101c4b0d57ac9742ab806a6bfed78eac1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:31:20 +0900
Subject: sh: pci: Flag the dreamcast BBA as IORESOURCE_PCI_FIXED.

This isn't a real BAR, so prevent any attempts to move it, as we don't
wish to encourage a bus luck by overzealous PCI initialization code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index 2bf85cf..48c6381 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -41,6 +41,13 @@ static void __init gapspci_fixup_resources(struct pci_dev *dev)
 		 */
 		dev->resource[1].start	= p->io_resource->start  + 0x100;
 		dev->resource[1].end	= dev->resource[1].start + 0x200 - 1;
+
+		/*
+		 * This is not a normal BAR, prevent any attempts to move
+		 * the BAR, as this will result in a bus lock.
+		 */
+		dev->resource[1].flags |= IORESOURCE_PCI_FIXED;
+
 		/*
 		 * Redirect dma memory allocations to special memory window.
 		 */
-- 
cgit v0.10.2


From c563bf0965c86cc6087b09c34ca063fe96d6deca Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:39:57 +0900
Subject: sh: pci: Kill off dead references to is_pci_ioaddr and friends.

Some old boards are still using this in their I/O routines, kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-se/7751/io.c b/arch/sh/boards/mach-se/7751/io.c
index 6287ae5..6e75bd4 100644
--- a/arch/sh/boards/mach-se/7751/io.c
+++ b/arch/sh/boards/mach-se/7751/io.c
@@ -34,8 +34,6 @@ unsigned char sh7751se_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		return (*port2adr(port)) & 0xff;
 }
@@ -46,8 +44,6 @@ unsigned char sh7751se_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-                v = *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		v = (*port2adr(port)) & 0xff;
 	ctrl_delay();
@@ -58,8 +54,6 @@ unsigned short sh7751se_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -71,8 +65,6 @@ unsigned int sh7751se_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -85,8 +77,6 @@ void sh7751se_outb(unsigned char value, unsigned long port)
 
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 }
@@ -95,8 +85,6 @@ void sh7751se_outb_p(unsigned char value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 	ctrl_delay();
@@ -106,8 +94,6 @@ void sh7751se_outw(unsigned short value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else
@@ -118,8 +104,6 @@ void sh7751se_outl(unsigned int value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-snapgear/io.c b/arch/sh/boards/mach-snapgear/io.c
index 0f48242..476650e 100644
--- a/arch/sh/boards/mach-snapgear/io.c
+++ b/arch/sh/boards/mach-snapgear/io.c
@@ -36,8 +36,6 @@ unsigned char snapgear_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		return (*port2adr(port)) & 0xff;
 }
@@ -48,8 +46,6 @@ unsigned char snapgear_inb_p(unsigned long port)
 
 	if (PXSEG(port))
 		v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		v = *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		v = (*port2adr(port))&0xff;
 	ctrl_delay();
@@ -60,8 +56,6 @@ unsigned short snapgear_inw(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -73,8 +67,6 @@ unsigned int snapgear_inl(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -87,8 +79,6 @@ void snapgear_outb(unsigned char value, unsigned long port)
 
 	if (PXSEG(port))
 		*(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 }
@@ -97,8 +87,6 @@ void snapgear_outb_p(unsigned char value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 	ctrl_delay();
@@ -108,8 +96,6 @@ void snapgear_outw(unsigned short value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else
@@ -120,8 +106,6 @@ void snapgear_outl(unsigned int value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-systemh/io.c b/arch/sh/boards/mach-systemh/io.c
index dec3db0..15577ff 100644
--- a/arch/sh/boards/mach-systemh/io.c
+++ b/arch/sh/boards/mach-systemh/io.c
@@ -35,8 +35,6 @@ unsigned char sh7751systemh_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else if (port <= 0x3F1)
 		return *(volatile unsigned char *)ETHER_IOMAP(port);
 	else
@@ -49,8 +47,6 @@ unsigned char sh7751systemh_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-                v = *(volatile unsigned char *)pci_ioaddr(port);
 	else if (port <= 0x3F1)
 		v = *(volatile unsigned char *)ETHER_IOMAP(port);
 	else
@@ -63,8 +59,6 @@ unsigned short sh7751systemh_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else if (port <= 0x3F1)
@@ -78,8 +72,6 @@ unsigned int sh7751systemh_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else if (port <= 0x3F1)
@@ -94,8 +86,6 @@ void sh7751systemh_outb(unsigned char value, unsigned long port)
 
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else if (port <= 0x3F1)
 		*(volatile unsigned char *)ETHER_IOMAP(port) = value;
 	else
@@ -106,8 +96,6 @@ void sh7751systemh_outb_p(unsigned char value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else if (port <= 0x3F1)
 		*(volatile unsigned char *)ETHER_IOMAP(port) = value;
 	else
@@ -119,8 +107,6 @@ void sh7751systemh_outw(unsigned short value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else if (port <= 0x3F1)
@@ -133,8 +119,6 @@ void sh7751systemh_outl(unsigned int value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c
index 053b3ed..0130e98 100644
--- a/arch/sh/boards/mach-titan/io.c
+++ b/arch/sh/boards/mach-titan/io.c
@@ -17,8 +17,6 @@ u8 titan_inb(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inb(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inb(pci_ioaddr(port));
         return ctrl_inw(port2adr(port)) & 0xff;
 }
 
@@ -28,8 +26,6 @@ u8 titan_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = ctrl_inb(port);
-        else if (is_pci_ioaddr(port))
-                v = ctrl_inb(pci_ioaddr(port));
         else
                 v = ctrl_inw(port2adr(port)) & 0xff;
         ctrl_delay();
@@ -40,8 +36,6 @@ u16 titan_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inw(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inw(pci_ioaddr(port));
         else if (port >= 0x2000)
                 return ctrl_inw(port2adr(port));
         else
@@ -53,8 +47,6 @@ u32 titan_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inl(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inl(pci_ioaddr(port));
         else if (port >= 0x2000)
                 return ctrl_inw(port2adr(port));
         else
@@ -66,8 +58,6 @@ void titan_outb(u8 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outb(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outb(value, pci_ioaddr(port));
         else
                 ctrl_outw(value, port2adr(port));
 }
@@ -76,8 +66,6 @@ void titan_outb_p(u8 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outb(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outb(value, pci_ioaddr(port));
         else
                 ctrl_outw(value, port2adr(port));
         ctrl_delay();
@@ -87,8 +75,6 @@ void titan_outw(u16 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outw(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outw(value, pci_ioaddr(port));
         else if (port >= 0x2000)
                 ctrl_outw(value, port2adr(port));
         else
@@ -99,8 +85,6 @@ void titan_outl(u32 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outl(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outl(value, pci_ioaddr(port));
         else
                 maybebadio(port);
 }
@@ -119,8 +103,6 @@ void __iomem *titan_ioport_map(unsigned long port, unsigned int size)
 {
 	if (PXSEG(port))
 		return (void __iomem *)port;
-	else if (is_pci_ioaddr(port))
-		return (void __iomem *)pci_ioaddr(port);
 
 	return (void __iomem *)port2adr(port);
 }
-- 
cgit v0.10.2


From 0e75148108914062cb46ad3dc8f054a628018df7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:48:48 +0900
Subject: sh: pci: Consolidate pcibios_setup() in pci-lib.

This wasn't really being used for anything useful, so just stub it in
pci-lib.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index 5bc81d5..529aa4f 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -154,12 +154,6 @@ static int __init gapspci_init(struct pci_channel *chan)
 	return 0;
 }
 
-/* Haven't done anything here as yet */
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
 struct pci_channel board_pci_channels[] = {
 	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
 	  &gapspci_mem_resource, 0, 1 },
diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 2a7f7b5..7cc1fcc 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -106,30 +106,27 @@ struct pci_ops sh4_pci_ops = {
  * Not really related to pci_ops, but it's common and not worth shoving
  * somewhere else for now..
  */
-static unsigned int pci_probe = PCI_PROBE_CONF1;
-
 int __init sh4_pci_check_direct(struct pci_channel *chan)
 {
 	/*
 	 * Check if configuration works.
 	 */
-	if (pci_probe & PCI_PROBE_CONF1) {
-		unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
-
-		pci_write_reg(chan, P1SEG, SH4_PCIPAR);
+	unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
 
-		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
-			pci_write_reg(chan, tmp, SH4_PCIPAR);
-			printk(KERN_INFO "PCI: Using configuration type 1\n");
-			request_region(chan->reg_base + SH4_PCIPAR, 8,
-				       "PCI conf1");
-			return 0;
-		}
+	pci_write_reg(chan, P1SEG, SH4_PCIPAR);
 
+	if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
 		pci_write_reg(chan, tmp, SH4_PCIPAR);
+		printk(KERN_INFO "PCI: Using configuration type 1\n");
+		request_region(chan->reg_base + SH4_PCIPAR, 8,
+			       "PCI conf1");
+		return 0;
 	}
 
-	pr_debug("PCI: pci_check_direct failed\n");
+	pci_write_reg(chan, tmp, SH4_PCIPAR);
+
+	printk(KERN_ERR "PCI: %s failed\n", __func__);
+
 	return -EINVAL;
 }
 
@@ -155,16 +152,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
 
-char * __devinit pcibios_setup(char *str)
-{
-	if (!strcmp(str, "off")) {
-		pci_probe = 0;
-		return NULL;
-	}
-
-	return str;
-}
-
 int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c
index 729e38a..b10e2d5 100644
--- a/arch/sh/drivers/pci/ops-sh5.c
+++ b/arch/sh/drivers/pci/ops-sh5.c
@@ -42,11 +42,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
 
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
 static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where,
 			int size, u32 *val)
 {
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index ea83629..f072aca 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -134,6 +134,11 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
+char * __devinit pcibios_setup(char *str)
+{
+	return str;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
-- 
cgit v0.10.2


From 0db38cea69fc478a5c25b3c915ec680cc5538783 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:54:47 +0900
Subject: sh: pci: Kill off legacy ide quirks.

These fixups seem to have bitrotted a bit since their introduction in the
2.4 days. As we never had much use for them in the first place, and
nothing is using them any more, kill them off the rest of the way.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 7cc1fcc..78bebeb 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -130,28 +130,6 @@ int __init sh4_pci_check_direct(struct pci_channel *chan)
 	return -EINVAL;
 }
 
-/* Handle generic fixups */
-static void __init pci_fixup_ide_bases(struct pci_dev *d)
-{
-	int i;
-
-	/*
-	 * PCI IDE controllers use non-standard I/O port decoding, respect it.
-	 */
-	if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
-		return;
-	pr_debug("PCI: IDE base address fixup for %s\n", pci_name(d));
-	for(i = 0; i < 4; i++) {
-		struct resource *r = &d->resource[i];
-
-		if ((r->start & ~0x80) == 0x374) {
-			r->start |= 2;
-			r->end = r->start;
-		}
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
-
 int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c
index b10e2d5..4ce95a0 100644
--- a/arch/sh/drivers/pci/ops-sh5.c
+++ b/arch/sh/drivers/pci/ops-sh5.c
@@ -22,26 +22,6 @@
 #include <asm/io.h>
 #include "pci-sh5.h"
 
-static void __init pci_fixup_ide_bases(struct pci_dev *d)
-{
-	int i;
-
-	/*
-	 * PCI IDE controllers use non-standard I/O port decoding, respect it.
-	 */
-	if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
-		return;
-	printk("PCI: IDE base address fixup for %s\n", pci_name(d));
-	for(i=0; i<4; i++) {
-		struct resource *r = &d->resource[i];
-		if ((r->start & ~0x80) == 0x374) {
-			r->start |= 2;
-			r->end = r->start;
-		}
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
-
 static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where,
 			int size, u32 *val)
 {
-- 
cgit v0.10.2


From 3444f5ec49bc6cb901ffea38e085db1d76e1189c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:22:05 +0900
Subject: sh: pci: Tidy up the dreamcast PCI support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 4cac866..2160a06 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -12,7 +12,8 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7780)	+= pci-sh7780.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7785)	+= pci-sh7780.o ops-sh4.o
 obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 
-obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o
+obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
+					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index 529aa4f..e83d0d3 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -1,15 +1,9 @@
 /*
- * arch/sh/drivers/pci/ops-dreamcast.c
- *
  * PCI operations for the Sega Dreamcast
  *
  * Copyright (C) 2001, 2002  M. R. Brown
  * Copyright (C) 2002, 2003  Paul Mundt
  *
- * This file originally bore the message (with enclosed-$):
- *	Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp
- *	Dreamcast PCI: Supports SEGA Broadband Adaptor only.
- *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
@@ -23,25 +17,10 @@
 #include <linux/irq.h>
 #include <linux/pci.h>
 #include <linux/module.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
+#include <linux/io.h>
+#include <linux/irq.h>
 #include <mach/pci.h>
 
-static struct resource gapspci_io_resource = {
-	.name	= "GAPSPCI IO",
-	.start	= GAPSPCI_BBA_CONFIG,
-	.end	= GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1,
-	.flags	= IORESOURCE_IO,
-};
-
-static struct resource gapspci_mem_resource = {
-	.name	= "GAPSPCI mem",
-	.start	= GAPSPCI_DMA_BASE,
-	.end	= GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
 /*
  * The !gapspci_config_access case really shouldn't happen, ever, unless
  * someone implicitly messes around with the last devfn value.. otherwise we
@@ -76,10 +55,10 @@ static int gapspci_read(struct pci_bus *bus, unsigned int devfn, int where, int
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	switch (size) {
-		case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break;
-		case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break;
-		case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break;
-	}	
+	case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break;
+	case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break;
+	case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break;
+	}
 
         return PCIBIOS_SUCCESSFUL;
 }
@@ -90,72 +69,15 @@ static int gapspci_write(struct pci_bus *bus, unsigned int devfn, int where, int
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	switch (size) {
-		case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break;
-		case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break;
-		case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break;
 	}
 
         return PCIBIOS_SUCCESSFUL;
 }
 
-static struct pci_ops gapspci_pci_ops = {
+struct pci_ops gapspci_pci_ops = {
 	.read	= gapspci_read,
 	.write	= gapspci_write,
 };
-
-/*
- * gapspci init
- */
-
-static int __init gapspci_init(struct pci_channel *chan)
-{
-	char idbuf[16];
-	int i;
-
-	/*
-	 * FIXME: All of this wants documenting to some degree,
-	 * even some basic register definitions would be nice.
-	 *
-	 * I haven't seen anything this ugly since.. maple.
-	 */
-
-	for (i=0; i<16; i++)
-		idbuf[i] = inb(GAPSPCI_REGS+i);
-
-	if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16))
-		return -ENODEV;
-
-	outl(0x5a14a501, GAPSPCI_REGS+0x18);
-
-	for (i=0; i<1000000; i++)
-		;
-
-	if (inl(GAPSPCI_REGS+0x18) != 1)
-		return -EINVAL;
-
-	outl(0x01000000, GAPSPCI_REGS+0x20);
-	outl(0x01000000, GAPSPCI_REGS+0x24);
-
-	outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28);
-	outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c);
-
-	outl(1, GAPSPCI_REGS+0x14);
-	outl(1, GAPSPCI_REGS+0x34);
-
-	/* Setting Broadband Adapter */
-	outw(0xf900, GAPSPCI_BBA_CONFIG+0x06);
-	outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30);
-	outb(0x00, GAPSPCI_BBA_CONFIG+0x3c);
-	outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d);
-	outw(0x0006, GAPSPCI_BBA_CONFIG+0x04);
-	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
-	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
-
-	return 0;
-}
-
-struct pci_channel board_pci_channels[] = {
-	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
-	  &gapspci_mem_resource, 0, 1 },
-	{ 0, }
-};
diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c
new file mode 100644
index 0000000..0897be5
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-dreamcast.c
@@ -0,0 +1,105 @@
+/*
+ * PCI support for the Sega Dreamcast
+ *
+ * Copyright (C) 2001, 2002  M. R. Brown
+ * Copyright (C) 2002, 2003  Paul Mundt
+ *
+ * This file originally bore the message (with enclosed-$):
+ *	Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp
+ *	Dreamcast PCI: Supports SEGA Broadband Adaptor only.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <mach/pci.h>
+
+static struct resource gapspci_io_resource = {
+	.name	= "GAPSPCI IO",
+	.start	= GAPSPCI_BBA_CONFIG,
+	.end	= GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1,
+	.flags	= IORESOURCE_IO,
+};
+
+static struct resource gapspci_mem_resource = {
+	.name	= "GAPSPCI mem",
+	.start	= GAPSPCI_DMA_BASE,
+	.end	= GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1,
+	.flags	= IORESOURCE_MEM,
+};
+
+/*
+ * gapspci init
+ */
+
+static int __init gapspci_init(struct pci_channel *chan)
+{
+	char idbuf[16];
+	int i;
+
+	/*
+	 * FIXME: All of this wants documenting to some degree,
+	 * even some basic register definitions would be nice.
+	 *
+	 * I haven't seen anything this ugly since.. maple.
+	 */
+
+	for (i=0; i<16; i++)
+		idbuf[i] = inb(GAPSPCI_REGS+i);
+
+	if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16))
+		return -ENODEV;
+
+	outl(0x5a14a501, GAPSPCI_REGS+0x18);
+
+	for (i=0; i<1000000; i++)
+		cpu_relax();
+
+	if (inl(GAPSPCI_REGS+0x18) != 1)
+		return -EINVAL;
+
+	outl(0x01000000, GAPSPCI_REGS+0x20);
+	outl(0x01000000, GAPSPCI_REGS+0x24);
+
+	outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28);
+	outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c);
+
+	outl(1, GAPSPCI_REGS+0x14);
+	outl(1, GAPSPCI_REGS+0x34);
+
+	/* Setting Broadband Adapter */
+	outw(0xf900, GAPSPCI_BBA_CONFIG+0x06);
+	outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30);
+	outb(0x00, GAPSPCI_BBA_CONFIG+0x3c);
+	outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d);
+	outw(0x0006, GAPSPCI_BBA_CONFIG+0x04);
+	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
+	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
+
+	return 0;
+}
+
+struct pci_channel board_pci_channels[] = {
+	{
+		.init		= gapspci_init,
+		.pci_ops	= &gapspci_pci_ops,
+		.io_resource	= &gapspci_io_resource,
+		.mem_resource	= &gapspci_mem_resource,
+		.first_devfn	= 0,
+		.last_devfn	= 1,
+	}, {
+		.init		= NULL,
+	}
+};
diff --git a/arch/sh/include/mach-dreamcast/mach/pci.h b/arch/sh/include/mach-dreamcast/mach/pci.h
index 75fc900..0314d97 100644
--- a/arch/sh/include/mach-dreamcast/mach/pci.h
+++ b/arch/sh/include/mach-dreamcast/mach/pci.h
@@ -21,5 +21,7 @@
 
 #define	GAPSPCI_IRQ		HW_EVENT_EXTERNAL
 
+extern struct pci_ops gapspci_pci_ops;
+
 #endif /* __ASM_SH_DREAMCAST_PCI_H */
 
-- 
cgit v0.10.2


From 48e4237d96fdcb1237b63bcddb37771f97452eec Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:40:48 +0900
Subject: sh: pci: Convert the SH-5 code over to the new interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index efe8cf9..f9fb1d1 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -23,7 +23,7 @@ config PCI_NEW
 	bool
 	depends on PCI
 	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785
+		     CPU_SUBTYPE_SH7785 || CPU_SH5
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index cbaaf02..b68b61d 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -75,14 +75,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
 
 	return result;
 }
-
-struct pci_channel board_pci_channels[] = {
-	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh5pci_init(__pa(memory_start),
-			   __pa(memory_end) - __pa(memory_start));
-}
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 7750da2..cf43185 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -27,12 +27,6 @@
 unsigned long pcicr_virt;
 unsigned long PCI_IO_AREA;
 
-int __init sh5_pci_init(struct pci_channel *chan)
-{
-	pr_debug("PCI: Starting intialization.\n");
-	return pcibios_init_platform();
-}
-
 /* Rounds a number UP to the nearest power of two. Used for
  * sizing the PCI window.
  */
@@ -95,8 +89,21 @@ static irqreturn_t pcish5_serr_irq(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
-int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
+static struct resource sh5_io_resource = { /* place holder */ };
+static struct resource sh5_mem_resource = { /* place holder */ };
+
+static struct pci_channel sh5pci_controller = {
+	.pci_ops		= &sh5_pci_ops,
+	.mem_resource		= &sh5_mem_resource,
+	.mem_offset		= 0x00000000,
+	.io_resource		= &sh5_io_resource,
+	.io_offset		= 0x00000000,
+};
+
+static int __init sh5pci_init(void)
 {
+	unsigned long memStart = __pa(memory_start);
+	unsigned long memSize = __pa(memory_end) - memStart;
 	u32 lsr0;
 	u32 uval;
 
@@ -203,35 +210,14 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
         SH5PCI_WRITE(AINTM, ~0);
         SH5PCI_WRITE(PINTM, ~0);
 
-	return 0;
-}
+	sh5_io_resource.start = PCI_IO_AREA;
+	sh5_io_resource.end = PCI_IO_AREA + 0x10000;
 
-#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+	sh5_mem_resource.start = memStart;
+	sh5_mem_resource.end = memStart + memSize;
 
-void __devinit pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_dev *dev = bus->self;
-	int i;
-
-	if (dev) {
-		for (i= 0; i < 3; i++) {
-			bus->resource[i] =
-				&dev->resource[PCI_BRIDGE_RESOURCES+i];
-			bus->resource[i]->name = bus->name;
-		}
-		bus->resource[0]->flags |= IORESOURCE_IO;
-		bus->resource[1]->flags |= IORESOURCE_MEM;
-
-		/* For now, propagate host limits to the bus;
-		 * we'll adjust them later. */
-		bus->resource[0]->end = 64*1024 - 1 ;
-		bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1;
-		bus->resource[0]->start = xPCIBIOS_MIN_IO;
-		bus->resource[1]->start = xPCIBIOS_MIN_MEM;
-
-		/* Turn off downstream PF memory address range by default */
-		bus->resource[2]->start = 1024*1024;
-		bus->resource[2]->end = bus->resource[2]->start - 1;
-	}
+	register_pci_controller(&sh5pci_controller);
+
+	return 0;
 }
+arch_initcall(sh5pci_init);
diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h
index af09f38..f277628 100644
--- a/arch/sh/drivers/pci/pci-sh5.h
+++ b/arch/sh/drivers/pci/pci-sh5.h
@@ -107,8 +107,4 @@ extern unsigned long pcicr_virt;
 
 extern struct pci_ops sh5_pci_ops;
 
-/* arch/sh/drivers/pci/pci-sh5.c */
-int sh5_pci_init(struct pci_channel *chan);
-int sh5pci_init(unsigned long memStart, unsigned long memSize);
-
 #endif /* __PCI_SH5_H */
-- 
cgit v0.10.2


From a5b08047129f214af1899bd9088605c7adc21ed5 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:41:45 +0900
Subject: sh: pci: Rename ops-cayman -> fixups-cayman.

Now that ops-cayman.c only contains IRQ routing fixups, rename it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 2160a06..cb2190c 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -24,4 +24,4 @@ obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
-obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
+obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-cayman.c b/arch/sh/drivers/pci/fixups-cayman.c
new file mode 100644
index 0000000..b68b61d
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-cayman.c
@@ -0,0 +1,77 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <cpu/irq.h>
+#include "pci-sh5.h"
+
+int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
+{
+	int result = -1;
+
+	/* The complication here is that the PCI IRQ lines from the Cayman's 2
+	   5V slots get into the CPU via a different path from the IRQ lines
+	   from the 3 3.3V slots.  Thus, we have to detect whether the card's
+	   interrupts go via the 5V or 3.3V path, i.e. the 'bridge swizzling'
+	   at the point where we cross from 5V to 3.3V is not the normal case.
+
+	   The added complication is that we don't know that the 5V slots are
+	   always bus 2, because a card containing a PCI-PCI bridge may be
+	   plugged into a 3.3V slot, and this changes the bus numbering.
+
+	   Also, the Cayman has an intermediate PCI bus that goes a custom
+	   expansion board header (and to the secondary bridge).  This bus has
+	   never been used in practice.
+
+	   The 1ary onboard PCI-PCI bridge is device 3 on bus 0
+	   The 2ary onboard PCI-PCI bridge is device 0 on the 2ary bus of
+	   the 1ary bridge.
+	   */
+
+	struct slot_pin {
+		int slot;
+		int pin;
+	} path[4];
+	int i=0;
+
+	while (dev->bus->number > 0) {
+
+		slot = path[i].slot = PCI_SLOT(dev->devfn);
+		pin = path[i].pin = pci_swizzle_interrupt_pin(dev, pin);
+		dev = dev->bus->self;
+		i++;
+		if (i > 3) panic("PCI path to root bus too long!\n");
+	}
+
+	slot = PCI_SLOT(dev->devfn);
+	/* This is the slot on bus 0 through which the device is eventually
+	   reachable. */
+
+	/* Now work back up. */
+	if ((slot < 3) || (i == 0)) {
+		/* Bus 0 (incl. PCI-PCI bridge itself) : perform the final
+		   swizzle now. */
+		result = IRQ_INTA + pci_swizzle_interrupt_pin(dev, pin) - 1;
+	} else {
+		i--;
+		slot = path[i].slot;
+		pin  = path[i].pin;
+		if (slot > 0) {
+			panic("PCI expansion bus device found - not handled!\n");
+		} else {
+			if (i > 0) {
+				/* 5V slots */
+				i--;
+				slot = path[i].slot;
+				pin  = path[i].pin;
+				/* 'pin' was swizzled earlier wrt slot, don't do it again. */
+				result = IRQ_P2INTA + (pin - 1);
+			} else {
+				/* IRQ for 2ary PCI-PCI bridge : unused */
+				result = -1;
+			}
+		}
+	}
+
+	return result;
+}
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
deleted file mode 100644
index b68b61d..0000000
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/types.h>
-#include <cpu/irq.h>
-#include "pci-sh5.h"
-
-int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
-{
-	int result = -1;
-
-	/* The complication here is that the PCI IRQ lines from the Cayman's 2
-	   5V slots get into the CPU via a different path from the IRQ lines
-	   from the 3 3.3V slots.  Thus, we have to detect whether the card's
-	   interrupts go via the 5V or 3.3V path, i.e. the 'bridge swizzling'
-	   at the point where we cross from 5V to 3.3V is not the normal case.
-
-	   The added complication is that we don't know that the 5V slots are
-	   always bus 2, because a card containing a PCI-PCI bridge may be
-	   plugged into a 3.3V slot, and this changes the bus numbering.
-
-	   Also, the Cayman has an intermediate PCI bus that goes a custom
-	   expansion board header (and to the secondary bridge).  This bus has
-	   never been used in practice.
-
-	   The 1ary onboard PCI-PCI bridge is device 3 on bus 0
-	   The 2ary onboard PCI-PCI bridge is device 0 on the 2ary bus of
-	   the 1ary bridge.
-	   */
-
-	struct slot_pin {
-		int slot;
-		int pin;
-	} path[4];
-	int i=0;
-
-	while (dev->bus->number > 0) {
-
-		slot = path[i].slot = PCI_SLOT(dev->devfn);
-		pin = path[i].pin = pci_swizzle_interrupt_pin(dev, pin);
-		dev = dev->bus->self;
-		i++;
-		if (i > 3) panic("PCI path to root bus too long!\n");
-	}
-
-	slot = PCI_SLOT(dev->devfn);
-	/* This is the slot on bus 0 through which the device is eventually
-	   reachable. */
-
-	/* Now work back up. */
-	if ((slot < 3) || (i == 0)) {
-		/* Bus 0 (incl. PCI-PCI bridge itself) : perform the final
-		   swizzle now. */
-		result = IRQ_INTA + pci_swizzle_interrupt_pin(dev, pin) - 1;
-	} else {
-		i--;
-		slot = path[i].slot;
-		pin  = path[i].pin;
-		if (slot > 0) {
-			panic("PCI expansion bus device found - not handled!\n");
-		} else {
-			if (i > 0) {
-				/* 5V slots */
-				i--;
-				slot = path[i].slot;
-				pin  = path[i].pin;
-				/* 'pin' was swizzled earlier wrt slot, don't do it again. */
-				result = IRQ_P2INTA + (pin - 1);
-			} else {
-				/* IRQ for 2ary PCI-PCI bridge : unused */
-				result = -1;
-			}
-		}
-	}
-
-	return result;
-}
-- 
cgit v0.10.2


From 757e3c16f8bafa2a470aebf9b04671c5d4d18f49 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:11:07 +0900
Subject: sh: pci: Rewrite SH7751 PCI support to follow SH7780.

This follows the similar sort of scheme that the refactored SH7780 code
uses, using a 64MB CS3 mapping to handle the window0 case, and simply
discarding window1. This vastly simplifies the code, and allows most of
the board-specific setup to go die.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index f9fb1d1..5aaee3c 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -23,7 +23,8 @@ config PCI_NEW
 	bool
 	depends on PCI
 	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785 || CPU_SH5
+		     CPU_SUBTYPE_SH7785 || CPU_SH5 || \
+		     CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index cb2190c..b8667de 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
-obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
+obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index 178b778..bb1a6bb 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -15,37 +15,6 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-static struct resource sh7751_io_resource = {
-	.name = "SH7751 IO",
-	.start = SH7751_PCI_IO_BASE,
-	.end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name = "SH7751 mem",
-	.start = SH7751_PCI_MEMORY_BASE,
-	.end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags = IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
-	{NULL, NULL, NULL, 0, 0},
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0 = {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= (64 << 20),	/* 64MB */
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
 int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	/*
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 91cabd8..6db2c20 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -21,36 +21,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return lboxre2_irq_tab[slot];
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= SH7751_PCI_IO_BASE ,
-	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index 96b916c..d950b8a 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -29,37 +29,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return rts7751r2d_irq_tab[slot];
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= 0x4000,
-	.end	= SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c
deleted file mode 100644
index 0218135..0000000
--- a/arch/sh/drivers/pci/ops-sh03.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-sh03.c
- *
- * PCI initialization for the Interface CTP/PCI-SH03 board
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <asm/io.h>
-#include "pci-sh7751.h"
-
-/*
- * Description:  This function sets up and initializes the pcic, sets
- * up the BARS, maps the DRAM into the address space etc, etc.
- */
-int __init pcibios_init_platform(void)
-{
-	__set_io_port_base(SH7751_PCI_IO_BASE);
-	return 1;
-}
-
-static struct resource sh7751_io_resource = {
-	.name   = "SH03 IO",
-	.start  = SH7751_PCI_IO_BASE,
-	.end    = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags  = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name   = "SH03 mem",
-	.start  = SH7751_PCI_MEMORY_BASE,
-	.end    = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags  = IORESOURCE_MEM
-};
-
-extern struct pci_ops sh4_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index b64f2b9..5a39ecc 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -18,55 +18,6 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-#define SNAPGEAR_PCI_IO		0x4000
-#define SNAPGEAR_PCI_MEM	0xfd000000
-
-/* PCI: default LOCAL memory window sizes (seen from PCI bus) */
-#define SNAPGEAR_LSR0_SIZE    (64*(1<<20)) //64MB
-#define SNAPGEAR_LSR1_SIZE    (64*(1<<20)) //64MB
-
-static struct resource sh7751_io_resource = {
-	.name		= "SH7751 IO",
-	.start		= SNAPGEAR_PCI_IO,
-	.end		= SNAPGEAR_PCI_IO + (64*1024) - 1, /* 64KiB I/O */
-	.flags		= IORESOURCE_IO,
-};
-
-static struct resource sh7751_mem_resource = {
-	.name		= "SH7751 mem",
-	.start		= SNAPGEAR_PCI_MEM,
-	.end		= SNAPGEAR_PCI_MEM + (64*1024*1024) - 1, /* 64MiB mem */
-	.flags		= IORESOURCE_MEM,
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ 0, }
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SNAPGEAR_LSR0_SIZE,
-	},
-
-	.window1	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SNAPGEAR_LSR1_SIZE,
-	},
-};
-
-/*
- * Initialize the SnapGear PCI interface
- * Setup hardware to be Central Funtion
- * Copy the BSR regs to the PCI interface
- * Setup PCI windows into local RAM
- */
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
 int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	int irq = -1;
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index e45bb62..3a79fa8 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -36,39 +36,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 
 	return irq;
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= SH7751_PCI_IO_BASE,
-	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SH7751_MEM_REGION_SIZE*2,	/* cs2 and cs3 */
-	},
-
-	.window1	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SH7751_MEM_REGION_SIZE*2,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 4c08fd7..c4fa0bb1 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -1,78 +1,41 @@
 /*
- *	Low-Level PCI Support for the SH7751
+ * Low-Level PCI Support for the SH7751
  *
- *  Dustin McIntire (dustin@sensoria.com)
- *	Derived from arch/i386/kernel/pci-*.c which bore the message:
- *	(c) 1999--2000 Martin Mares <mj@ucw.cz>
+ *  Copyright (C) 2003 - 2009  Paul Mundt
+ *  Copyright (C) 2001  Dustin McIntire
  *
- *  Ported to the new API by Paul Mundt <lethal@linux-sh.org>
- *  With cleanup by Paul van Gool <pvangool@mimotech.com>
- *
- *  May be copied or modified under the terms of the GNU General Public
- *  License.  See linux/COPYING for more information.
+ *  With cleanup by Paul van Gool <pvangool@mimotech.com>, 2003.
  *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
-#undef DEBUG
-
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/types.h>
 #include <linux/errno.h>
-#include <linux/delay.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
 #include <asm/addrspace.h>
-#include <asm/io.h>
-
-/*
- * Initialization. Try all known PCI access methods. Note that we support
- * using both PCI BIOS and direct access: in such cases, we use I/O ports
- * to access config space.
- *
- * Note that the platform specific initialization (BSC registers, and memory
- * space mapping) will be called via the platform defined function
- * pcibios_init_platform().
- */
-int __init sh7751_pci_init(struct pci_channel *chan)
-{
-	unsigned int id;
-	int ret;
-
-	pr_debug("PCI: Starting intialization.\n");
-
-	chan->reg_base = 0xfe200000;
-
-	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(chan, SH7751_PCICONF0);
-	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
-	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
-		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
-		return -ENODEV;
-	}
-
-	if ((ret = sh4_pci_check_direct(chan)) != 0)
-		return ret;
-
-	return pcibios_init_platform();
-}
 
 static int __init __area_sdram_check(struct pci_channel *chan,
 				     unsigned int area)
 {
-	u32 word;
+	unsigned long word;
 
-	word = ctrl_inl(SH7751_BCR1);
+	word = __raw_readl(SH7751_BCR1);
 	/* check BCR for SDRAM in area */
 	if (((word >> area) & 1) == 0) {
-		printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%x\n",
+		printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%lx\n",
 		       area, word);
 		return 0;
 	}
 	pci_write_reg(chan, word, SH4_PCIBCR1);
 
-	word = (u16)ctrl_inw(SH7751_BCR2);
+	word = __raw_readw(SH7751_BCR2);
 	/* check BCR2 for 32bit SDRAM interface*/
 	if (((word >> (area << 1)) & 0x3) != 0x3) {
-		printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%x\n",
+		printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%lx\n",
 		       area, word);
 		return 0;
 	}
@@ -81,11 +44,56 @@ static int __init __area_sdram_check(struct pci_channel *chan,
 	return 1;
 }
 
-int __init sh7751_pcic_init(struct pci_channel *chan,
-			    struct sh4_pci_address_map *map)
+static struct resource sh7751_io_resource = {
+	.name	= "SH7751_IO",
+	.start	= SH7751_PCI_IO_BASE,
+	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7751_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7751_PCI_MEMORY_BASE,
+	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+static struct pci_channel sh7751_pci_controller = {
+	.pci_ops	= &sh4_pci_ops,
+	.mem_resource	= &sh7751_mem_resource,
+	.mem_offset	= 0x00000000,
+	.io_resource	= &sh7751_io_resource,
+	.io_offset	= 0x00000000,
+};
+
+static struct sh4_pci_address_map sh7751_pci_map = {
+	.window0	= {
+		.base	= SH7751_CS3_BASE_ADDR,
+		.size	= 0x04000000,
+	},
+};
+
+static int __init sh7751_pci_init(void)
 {
-	u32 reg;
-	u32 word;
+	struct pci_channel *chan = &sh7751_pci_controller;
+	unsigned int id;
+	u32 word, reg;
+	int ret;
+
+	printk(KERN_NOTICE "PCI: Starting intialization.\n");
+
+	chan->reg_base = 0xfe200000;
+
+	/* check for SH7751/SH7751R hardware */
+	id = pci_read_reg(chan, SH7751_PCICONF0);
+	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
+	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
+		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
+		return -ENODEV;
+	}
+
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
+		return ret;
 
 	/* Set the BCR's to enable PCI access */
 	reg = ctrl_inl(SH7751_BCR1);
@@ -112,21 +120,13 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
-	 * Window0 = map->window0.size @ non-cached area base = SDRAM
-	 * Window1 = map->window1.size @ cached area base = SDRAM
 	 */
-	word = map->window0.size - 1;
+	word = sh7751_pci_map.window0.size - 1;
 	pci_write_reg(chan, word, SH4_PCILSR0);
-	word = map->window1.size - 1;
-	pci_write_reg(chan, word, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
-	word = P2SEGADDR(map->window0.base);
+	word = P2SEGADDR(sh7751_pci_map.window0.base);
 	pci_write_reg(chan, word, SH4_PCILAR0);
 	pci_write_reg(chan, word, SH7751_PCICONF5);
-	/* Set the values on window 1 PCI config registers */
-	word =  PHYSADDR(map->window1.base);
-	pci_write_reg(chan, word, SH4_PCILAR1);
-	pci_write_reg(chan, word, SH7751_PCICONF6);
 
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
@@ -144,7 +144,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Set PCI WCRx, BCRx's, copy from BSC locations */
 
 	/* check BCR for SDRAM in specified area */
-	switch (map->window0.base) {
+	switch (sh7751_pci_map.window0.base) {
 	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break;
 	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break;
 	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break;
@@ -179,5 +179,10 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
+	__set_io_port_base(SH7751_PCI_IO_BASE);
+
+	register_pci_controller(chan);
+
 	return 0;
 }
+arch_initcall(sh7751_pci_init);
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index c390dd2..4983a4d 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -57,7 +57,7 @@
   #define SH7751_PCICONF2_SCC        0x00FF0000  /* Sub-Class Code */
   #define SH7751_PCICONF2_RLPI       0x0000FF00  /* Programming Interface */
   #define SH7751_PCICONF2_REV        0x000000FF  /* Revision ID */
-#define SH7751_PCICONF3            0xC           /* PCI Config Reg 3 */ 
+#define SH7751_PCICONF3            0xC           /* PCI Config Reg 3 */
   #define SH7751_PCICONF3_BIST7      0x80000000  /* Bist Supported */
   #define SH7751_PCICONF3_BIST6      0x40000000  /* Bist Executing */
   #define SH7751_PCICONF3_BIST3_0    0x0F000000  /* Bist Passed */
@@ -72,12 +72,12 @@
   #define SH7751_PCICONF5_BASE       0xFFFFFFF0  /* Mem Space Base Addr */
   #define SH7751_PCICONF5_LAP        0x00000008  /* Prefetch Enabled */
   #define SH7751_PCICONF5_LAT        0x00000006  /* Local Memory type */
-  #define SH7751_PCICONF5_ASI        0x00000001  /* Address Space Type */  
+  #define SH7751_PCICONF5_ASI        0x00000001  /* Address Space Type */
 #define SH7751_PCICONF6            0x18          /* PCI Config Reg 6 */
   #define SH7751_PCICONF6_BASE       0xFFFFFFF0  /* Mem Space Base Addr */
   #define SH7751_PCICONF6_LAP        0x00000008  /* Prefetch Enabled */
   #define SH7751_PCICONF6_LAT        0x00000006  /* Local Memory type */
-  #define SH7751_PCICONF6_ASI        0x00000001  /* Address Space Type */  
+  #define SH7751_PCICONF6_ASI        0x00000001  /* Address Space Type */
 /* PCICONF7 - PCICONF10 are undefined */
 #define SH7751_PCICONF11           0x2C          /* PCI Config Reg 11 */
   #define SH7751_PCICONF11_SSID      0xFFFF0000  /* Subsystem ID */
@@ -126,11 +126,4 @@
 #define SH7751_CS5_BASE_ADDR       (SH7751_CS4_BASE_ADDR + SH7751_MEM_REGION_SIZE)
 #define SH7751_CS6_BASE_ADDR       (SH7751_CS5_BASE_ADDR + SH7751_MEM_REGION_SIZE)
 
-struct sh4_pci_address_map;
-
-/* arch/sh/drivers/pci/pci-sh7751.c */
-int sh7751_pci_init(struct pci_channel *chan);
-int sh7751_pcic_init(struct pci_channel *chan,
-		     struct sh4_pci_address_map *map);
-
 #endif /* _PCI_SH7751_H_ */
-- 
cgit v0.10.2


From 84972ec0c206b73ffb74e5114b574186ce6166b8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:17:10 +0900
Subject: sh: pci: Rename SH7751 platform ops files to fixups.

None of these contain pci_ops, only IRQ routing bits, rename them
accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index b8667de..abd931c 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -14,14 +14,14 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
-obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
+obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
-obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
-obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_TITAN)			+= fixups-titan.o
+obj-$(CONFIG_SH_LANDISK)		+= fixups-landisk.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-lboxre2.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-landisk.c b/arch/sh/drivers/pci/fixups-landisk.c
new file mode 100644
index 0000000..bb1a6bb
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-landisk.c
@@ -0,0 +1,34 @@
+/*
+ * arch/sh/drivers/pci/ops-landisk.c
+ *
+ * PCI initialization for the I-O DATA Device, Inc. LANDISK board
+ *
+ * Copyright (C) 2006 kogiidena
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include "pci-sh4.h"
+
+int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	/*
+	 * slot0: pin1-4 = irq5,6,7,8
+	 * slot1: pin1-4 = irq6,7,8,5
+	 * slot2: pin1-4 = irq7,8,5,6
+	 * slot3: pin1-4 = irq8,5,6,7
+	 */
+	int irq = ((slot + pin - 1) & 0x3) + 5;
+
+	if ((slot | (pin - 1)) > 0x3) {
+		printk("PCI: Bad IRQ mapping request for slot %d pin %c\n",
+		       slot, pin - 1 + 'A');
+		return -1;
+	}
+	return irq;
+}
diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
new file mode 100644
index 0000000..6db2c20
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-lboxre2.c
@@ -0,0 +1,23 @@
+/*
+ * linux/arch/sh/drivers/pci/ops-lboxre2.c
+ *
+ * Copyright (C) 2007 Nobuhiro Iwamatsu
+ *
+ * PCI initialization for the NTT COMWARE L-BOX RE2
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <mach/lboxre2.h>
+#include "pci-sh4.h"
+
+static char lboxre2_irq_tab[] __initdata = {
+	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	return lboxre2_irq_tab[slot];
+}
diff --git a/arch/sh/drivers/pci/fixups-snapgear.c b/arch/sh/drivers/pci/fixups-snapgear.c
new file mode 100644
index 0000000..5a39ecc
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-snapgear.c
@@ -0,0 +1,38 @@
+/*
+ * arch/sh/drivers/pci/ops-snapgear.c
+ *
+ * Author:  David McCullough <davidm@snapgear.com>
+ *
+ * Ported to new API by Paul Mundt <lethal@linux-sh.org>
+ *
+ * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ *
+ * PCI initialization for the SnapGear boards
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include "pci-sh4.h"
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	int irq = -1;
+
+	switch (slot) {
+	case 8:  /* the PCI bridge */ break;
+	case 11: irq = 8;  break; /* USB    */
+	case 12: irq = 11; break; /* PCMCIA */
+	case 13: irq = 5;  break; /* eth0   */
+	case 14: irq = 8;  break; /* eth1   */
+	case 15: irq = 11; break; /* safenet (unused) */
+	}
+
+	printk("PCI: Mapping SnapGear IRQ for slot %d, pin %c to irq %d\n",
+	       slot, pin - 1 + 'A', irq);
+
+	return irq;
+}
diff --git a/arch/sh/drivers/pci/fixups-titan.c b/arch/sh/drivers/pci/fixups-titan.c
new file mode 100644
index 0000000..3a79fa8
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-titan.c
@@ -0,0 +1,38 @@
+/*
+ * arch/sh/drivers/pci/ops-titan.c
+ *
+ * Ported to new API by Paul Mundt <lethal@linux-sh.org>
+ *
+ * Modified from ops-snapgear.c written by  David McCullough
+ * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ *
+ * PCI initialization for the Titan boards
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <mach/titan.h>
+#include "pci-sh4.h"
+
+static char titan_irq_tab[] __initdata = {
+	TITAN_IRQ_WAN,
+	TITAN_IRQ_LAN,
+	TITAN_IRQ_MPCIA,
+	TITAN_IRQ_MPCIB,
+	TITAN_IRQ_USB,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	int irq = titan_irq_tab[slot];
+
+	printk("PCI: Mapping TITAN IRQ for slot %d, pin %c to irq %d\n",
+		slot, pin - 1 + 'A', irq);
+
+	return irq;
+}
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
deleted file mode 100644
index bb1a6bb..0000000
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * arch/sh/drivers/pci/ops-landisk.c
- *
- * PCI initialization for the I-O DATA Device, Inc. LANDISK board
- *
- * Copyright (C) 2006 kogiidena
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	/*
-	 * slot0: pin1-4 = irq5,6,7,8
-	 * slot1: pin1-4 = irq6,7,8,5
-	 * slot2: pin1-4 = irq7,8,5,6
-	 * slot3: pin1-4 = irq8,5,6,7
-	 */
-	int irq = ((slot + pin - 1) & 0x3) + 5;
-
-	if ((slot | (pin - 1)) > 0x3) {
-		printk("PCI: Bad IRQ mapping request for slot %d pin %c\n",
-		       slot, pin - 1 + 'A');
-		return -1;
-	}
-	return irq;
-}
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
deleted file mode 100644
index 6db2c20..0000000
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-lboxre2.c
- *
- * Copyright (C) 2007 Nobuhiro Iwamatsu
- *
- * PCI initialization for the NTT COMWARE L-BOX RE2
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/lboxre2.h>
-#include "pci-sh4.h"
-
-static char lboxre2_irq_tab[] __initdata = {
-	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return lboxre2_irq_tab[slot];
-}
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
deleted file mode 100644
index 5a39ecc..0000000
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * arch/sh/drivers/pci/ops-snapgear.c
- *
- * Author:  David McCullough <davidm@snapgear.com>
- *
- * Ported to new API by Paul Mundt <lethal@linux-sh.org>
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the SnapGear boards
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	int irq = -1;
-
-	switch (slot) {
-	case 8:  /* the PCI bridge */ break;
-	case 11: irq = 8;  break; /* USB    */
-	case 12: irq = 11; break; /* PCMCIA */
-	case 13: irq = 5;  break; /* eth0   */
-	case 14: irq = 8;  break; /* eth1   */
-	case 15: irq = 11; break; /* safenet (unused) */
-	}
-
-	printk("PCI: Mapping SnapGear IRQ for slot %d, pin %c to irq %d\n",
-	       slot, pin - 1 + 'A', irq);
-
-	return irq;
-}
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
deleted file mode 100644
index 3a79fa8..0000000
--- a/arch/sh/drivers/pci/ops-titan.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * arch/sh/drivers/pci/ops-titan.c
- *
- * Ported to new API by Paul Mundt <lethal@linux-sh.org>
- *
- * Modified from ops-snapgear.c written by  David McCullough
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Titan boards
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/titan.h>
-#include "pci-sh4.h"
-
-static char titan_irq_tab[] __initdata = {
-	TITAN_IRQ_WAN,
-	TITAN_IRQ_LAN,
-	TITAN_IRQ_MPCIA,
-	TITAN_IRQ_MPCIB,
-	TITAN_IRQ_USB,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	int irq = titan_irq_tab[slot];
-
-	printk("PCI: Mapping TITAN IRQ for slot %d, pin %c to irq %d\n",
-		slot, pin - 1 + 'A', irq);
-
-	return irq;
-}
-- 
cgit v0.10.2


From 37c8ac361efdbae969eff1a603bc39412d3e873f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:27:15 +0900
Subject: sh: pci: Consolidate lboxre2 and r2d IRQ fixups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index abd931c..fbebd73 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
-obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_RTS7751R2D)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
@@ -23,5 +23,5 @@ obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= fixups-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= fixups-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-lboxre2.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
deleted file mode 100644
index 6db2c20..0000000
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-lboxre2.c
- *
- * Copyright (C) 2007 Nobuhiro Iwamatsu
- *
- * PCI initialization for the NTT COMWARE L-BOX RE2
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/lboxre2.h>
-#include "pci-sh4.h"
-
-static char lboxre2_irq_tab[] __initdata = {
-	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return lboxre2_irq_tab[slot];
-}
diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c
index 3885233..052b354 100644
--- a/arch/sh/drivers/pci/fixups-rts7751r2d.c
+++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c
@@ -1,21 +1,44 @@
 /*
  * arch/sh/drivers/pci/fixups-rts7751r2d.c
  *
- * RTS7751R2D PCI fixups
+ * RTS7751R2D / LBOXRE2 PCI fixups
  *
  * Copyright (C) 2003  Lineo uSolutions, Inc.
  * Copyright (C) 2004  Paul Mundt
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <mach/lboxre2.h>
+#include <mach/r2d.h>
 #include "pci-sh4.h"
+#include <asm/machtypes.h>
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
+static u8 rts7751r2d_irq_tab[] __initdata = {
+	IRQ_PCI_INTA,
+	IRQ_PCI_INTB,
+	IRQ_PCI_INTC,
+	IRQ_PCI_INTD,
+};
+
+static char lboxre2_irq_tab[] __initdata = {
+	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	if (mach_is_lboxre2())
+		return lboxre2_irq_tab[slot];
+	else
+		return rts7751r2d_irq_tab[slot];
+}
+
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
deleted file mode 100644
index d950b8a..0000000
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-rts7751r2d.c
- *
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas SH7751R RTS7751R2D board
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/r2d.h>
-#include "pci-sh4.h"
-
-static u8 rts7751r2d_irq_tab[] __initdata = {
-	IRQ_PCI_INTA,
-	IRQ_PCI_INTB,
-	IRQ_PCI_INTC,
-	IRQ_PCI_INTD,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return rts7751r2d_irq_tab[slot];
-}
-- 
cgit v0.10.2


From 951a681bda844491de8699b3bdc6c3899cbd4c9f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:34:36 +0900
Subject: sh: pci: Convert dreamcast to new-style interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 5aaee3c..1d53496 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -20,11 +20,8 @@ config SH_PCIDMA_NONCOHERENT
 
 # Temporary config option for transitioning off of PCI_AUTO
 config PCI_NEW
-	bool
+	def_bool y
 	depends on PCI
-	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785 || CPU_SH5 || \
-		     CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index 48c6381..ed7f489 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -30,7 +30,7 @@
 
 static void __init gapspci_fixup_resources(struct pci_dev *dev)
 {
-	struct pci_channel *p = board_pci_channels;
+	struct pci_channel *p = dev->sysdata;
 
 	printk(KERN_NOTICE "PCI: Fixing up device %s\n", pci_name(dev));
 
diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c
index 0897be5..210f9d4 100644
--- a/arch/sh/drivers/pci/pci-dreamcast.c
+++ b/arch/sh/drivers/pci/pci-dreamcast.c
@@ -21,7 +21,6 @@
 #include <linux/irq.h>
 #include <linux/pci.h>
 #include <linux/module.h>
-
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <mach/pci.h>
@@ -40,11 +39,19 @@ static struct resource gapspci_mem_resource = {
 	.flags	= IORESOURCE_MEM,
 };
 
+static struct pci_channel dreamcast_pci_controller = {
+	.pci_ops	= &gapspci_pci_ops,
+	.io_resource	= &gapspci_io_resource,
+	.io_offset	= 0x00000000,
+	.mem_resource	= &gapspci_mem_resource,
+	.mem_offset	= 0x00000000,
+};
+
 /*
  * gapspci init
  */
 
-static int __init gapspci_init(struct pci_channel *chan)
+static int __init gapspci_init(void)
 {
 	char idbuf[16];
 	int i;
@@ -88,18 +95,8 @@ static int __init gapspci_init(struct pci_channel *chan)
 	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
 	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
 
+	register_pci_controller(&dreamcast_pci_controller);
+
 	return 0;
 }
-
-struct pci_channel board_pci_channels[] = {
-	{
-		.init		= gapspci_init,
-		.pci_ops	= &gapspci_pci_ops,
-		.io_resource	= &gapspci_io_resource,
-		.mem_resource	= &gapspci_mem_resource,
-		.first_devfn	= 0,
-		.last_devfn	= 1,
-	}, {
-		.init		= NULL,
-	}
-};
+arch_initcall(gapspci_init);
-- 
cgit v0.10.2


From 2d5efc190eb415dbff79ffab4f8ea731ab0288a9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:42:59 +0900
Subject: sh: pci: Move the se7751 fixups in to arch/sh/drivers/pci/.

The se7751 was still doing the PCI fixups in its own board directory,
so we move it over to arch/sh/drivers/pci/ with the rest of the board
fixups. It has bitrotted significantly over the years, so will still
likely need a bit of work to bring back up to date.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-se/7751/Makefile b/arch/sh/boards/mach-se/7751/Makefile
index dbc29f3..e6f4341 100644
--- a/arch/sh/boards/mach-se/7751/Makefile
+++ b/arch/sh/boards/mach-se/7751/Makefile
@@ -3,5 +3,3 @@
 #
 
 obj-y	 := setup.o io.o irq.o
-
-obj-$(CONFIG_PCI) += pci.o
diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c
deleted file mode 100644
index 9ec64a4..0000000
--- a/arch/sh/boards/mach-se/7751/pci.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * linux/arch/sh/boards/se/7751/pci.c
- *
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Hitachi SH7751 Solution Engine board (MS7751SE01)
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-
-#include <asm/io.h>
-#include "../../../drivers/pci/pci-sh7751.h"
-
-#define PCIMCR_MRSET_OFF	0xBFFFFFFF
-#define PCIMCR_RFSH_OFF		0xFFFFFFFB
-
-/*
- * Only long word accesses of the PCIC's internal local registers and the
- * configuration registers from the CPU is supported.
- */
-#define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
-#define PCIC_READ(x) readl(PCI_REG(x))
-
-#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
-
-/*
- * Description:  This function sets up and initializes the pcic, sets
- * up the BARS, maps the DRAM into the address space etc, etc.
- */
-int __init pcibios_init_platform(void)
-{
-   unsigned long bcr1, wcr1, wcr2, wcr3, mcr;
-   unsigned short bcr2;
-
-   /*
-    * Initialize the slave bus controller on the pcic.  The values used
-    * here should not be hardcoded, but they should be taken from the bsc
-    * on the processor, to make this function as generic as possible.
-    * (i.e. Another sbc may usr different SDRAM timing settings -- in order
-    * for the pcic to work, its settings need to be exactly the same.)
-    */
-   bcr1 = (*(volatile unsigned long*)(SH7751_BCR1));
-   bcr2 = (*(volatile unsigned short*)(SH7751_BCR2));
-   wcr1 = (*(volatile unsigned long*)(SH7751_WCR1));
-   wcr2 = (*(volatile unsigned long*)(SH7751_WCR2));
-   wcr3 = (*(volatile unsigned long*)(SH7751_WCR3));
-   mcr = (*(volatile unsigned long*)(SH7751_MCR));
-
-   bcr1 = bcr1 | 0x00080000;  /* Enable Bit 19, BREQEN */
-   (*(volatile unsigned long*)(SH7751_BCR1)) = bcr1;   
-
-   bcr1 = bcr1 | 0x40080000;  /* Enable Bit 19 BREQEN, set PCIC to slave */
-   PCIC_WRITE(SH7751_PCIBCR1, bcr1);	 /* PCIC BCR1 */
-   PCIC_WRITE(SH7751_PCIBCR2, bcr2);     /* PCIC BCR2 */
-   PCIC_WRITE(SH7751_PCIWCR1, wcr1);     /* PCIC WCR1 */
-   PCIC_WRITE(SH7751_PCIWCR2, wcr2);     /* PCIC WCR2 */
-   PCIC_WRITE(SH7751_PCIWCR3, wcr3);     /* PCIC WCR3 */
-   mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-   PCIC_WRITE(SH7751_PCIMCR, mcr);      /* PCIC MCR */
-
-
-   /* Enable all interrupts, so we know what to fix */
-   PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff);
-   PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f);
-
-   /* Set up standard PCI config registers */
-   PCIC_WRITE(SH7751_PCICONF1, 	0xF39000C7); /* Bus Master, Mem & I/O access */
-   PCIC_WRITE(SH7751_PCICONF2, 	0x00000000); /* PCI Class code & Revision ID */
-   PCIC_WRITE(SH7751_PCICONF4, 	0xab000001); /* PCI I/O address (local regs) */
-   PCIC_WRITE(SH7751_PCICONF5, 	0x0c000000); /* PCI MEM address (local RAM)  */
-   PCIC_WRITE(SH7751_PCICONF6, 	0xd0000000); /* PCI MEM address (unused)     */
-   PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */
-   PCIC_WRITE(SH7751_PCILSR0, 0x03f00000);   /* MEM (full 64M exposed)       */
-   PCIC_WRITE(SH7751_PCILSR1, 0x00000000);   /* MEM (unused)                 */
-   PCIC_WRITE(SH7751_PCILAR0, 0x0c000000);   /* MEM (direct map from PCI)    */
-   PCIC_WRITE(SH7751_PCILAR1, 0x00000000);   /* MEM (unused)                 */
-
-   /* Now turn it on... */
-   PCIC_WRITE(SH7751_PCICR, 0xa5000001);
-
-   /*
-    * Set PCIMBR and PCIIOBR here, assuming a single window
-    * (16M MEM, 256K IO) is enough.  If a larger space is
-    * needed, the readx/writex and inx/outx functions will
-    * have to do more (e.g. setting registers for each call).
-    */
-
-   /*
-    * Set the MBR so PCI address is one-to-one with window,
-    * meaning all calls go straight through... use BUG_ON to
-    * catch erroneous assumption.
-    */
-   BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
-
-   PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM);
-
-   /* Set IOBR for window containing area specified in pci.h */
-   PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
-
-   /* All done, may as well say so... */
-   printk("SH7751 PCI: Finished initialization of the PCI controller\n");
-
-   return 1;
-}
-
-int __init pcibios_map_platform_irq(u8 slot, u8 pin)
-{
-        switch (slot) {
-        case 0: return 13;
-        case 1: return 13; 	/* AMD Ethernet controller */
-        case 2: return -1;
-        case 3: return -1;
-        case 4: return -1;
-        default:
-                printk("PCI: Bad IRQ mapping request for slot %d\n", slot);
-                return -1;
-        }
-}
-
-static struct resource sh7751_io_resource = {
-	.name   = "SH7751 IO",
-	.start  = SH7751_PCI_IO_BASE,
-	.end    = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags  = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name   = "SH7751 mem",
-	.start  = SH7751_PCI_MEMORY_BASE,
-	.end    = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags  = IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ &sh7751_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index fbebd73..e388a70 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
+obj-$(CONFIG_SH_7751_SOLUTION_ENGINE)	+= fixups-se7751.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-se7751.c b/arch/sh/drivers/pci/fixups-se7751.c
new file mode 100644
index 0000000..475fa9f
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-se7751.c
@@ -0,0 +1,111 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include "pci-sh4.h"
+
+int __init pcibios_map_platform_irq(u8 slot, u8 pin)
+{
+        switch (slot) {
+        case 0: return 13;
+        case 1: return 13;	/* AMD Ethernet controller */
+        case 2: return -1;
+        case 3: return -1;
+        case 4: return -1;
+        default:
+                printk("PCI: Bad IRQ mapping request for slot %d\n", slot);
+                return -1;
+        }
+}
+
+#define PCIMCR_MRSET_OFF	0xBFFFFFFF
+#define PCIMCR_RFSH_OFF		0xFFFFFFFB
+
+/*
+ * Only long word accesses of the PCIC's internal local registers and the
+ * configuration registers from the CPU is supported.
+ */
+#define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
+#define PCIC_READ(x) readl(PCI_REG(x))
+
+/*
+ * Description:  This function sets up and initializes the pcic, sets
+ * up the BARS, maps the DRAM into the address space etc, etc.
+ */
+int pci_fixup_pcic(struct pci_channel *chan)
+{
+	unsigned long bcr1, wcr1, wcr2, wcr3, mcr;
+	unsigned short bcr2;
+
+	/*
+	* Initialize the slave bus controller on the pcic.  The values used
+	* here should not be hardcoded, but they should be taken from the bsc
+	* on the processor, to make this function as generic as possible.
+	* (i.e. Another sbc may usr different SDRAM timing settings -- in order
+	* for the pcic to work, its settings need to be exactly the same.)
+	*/
+	bcr1 = (*(volatile unsigned long*)(SH7751_BCR1));
+	bcr2 = (*(volatile unsigned short*)(SH7751_BCR2));
+	wcr1 = (*(volatile unsigned long*)(SH7751_WCR1));
+	wcr2 = (*(volatile unsigned long*)(SH7751_WCR2));
+	wcr3 = (*(volatile unsigned long*)(SH7751_WCR3));
+	mcr = (*(volatile unsigned long*)(SH7751_MCR));
+
+	bcr1 = bcr1 | 0x00080000;  /* Enable Bit 19, BREQEN */
+	(*(volatile unsigned long*)(SH7751_BCR1)) = bcr1;
+
+	bcr1 = bcr1 | 0x40080000;  /* Enable Bit 19 BREQEN, set PCIC to slave */
+	PCIC_WRITE(SH7751_PCIBCR1, bcr1);	 /* PCIC BCR1 */
+	PCIC_WRITE(SH7751_PCIBCR2, bcr2);     /* PCIC BCR2 */
+	PCIC_WRITE(SH7751_PCIWCR1, wcr1);     /* PCIC WCR1 */
+	PCIC_WRITE(SH7751_PCIWCR2, wcr2);     /* PCIC WCR2 */
+	PCIC_WRITE(SH7751_PCIWCR3, wcr3);     /* PCIC WCR3 */
+	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
+	PCIC_WRITE(SH7751_PCIMCR, mcr);      /* PCIC MCR */
+
+
+	/* Enable all interrupts, so we know what to fix */
+	PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff);
+	PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f);
+
+	/* Set up standard PCI config registers */
+	PCIC_WRITE(SH7751_PCICONF1,	0xF39000C7); /* Bus Master, Mem & I/O access */
+	PCIC_WRITE(SH7751_PCICONF2,	0x00000000); /* PCI Class code & Revision ID */
+	PCIC_WRITE(SH7751_PCICONF4,	0xab000001); /* PCI I/O address (local regs) */
+	PCIC_WRITE(SH7751_PCICONF5,	0x0c000000); /* PCI MEM address (local RAM)  */
+	PCIC_WRITE(SH7751_PCICONF6,	0xd0000000); /* PCI MEM address (unused)     */
+	PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */
+	PCIC_WRITE(SH7751_PCILSR0, 0x03f00000);   /* MEM (full 64M exposed)       */
+	PCIC_WRITE(SH7751_PCILSR1, 0x00000000);   /* MEM (unused)                 */
+	PCIC_WRITE(SH7751_PCILAR0, 0x0c000000);   /* MEM (direct map from PCI)    */
+	PCIC_WRITE(SH7751_PCILAR1, 0x00000000);   /* MEM (unused)                 */
+
+	/* Now turn it on... */
+	PCIC_WRITE(SH7751_PCICR, 0xa5000001);
+
+	/*
+	* Set PCIMBR and PCIIOBR here, assuming a single window
+	* (16M MEM, 256K IO) is enough.  If a larger space is
+	* needed, the readx/writex and inx/outx functions will
+	* have to do more (e.g. setting registers for each call).
+	*/
+
+	/*
+	* Set the MBR so PCI address is one-to-one with window,
+	* meaning all calls go straight through... use BUG_ON to
+	* catch erroneous assumption.
+	*/
+	BUG_ON(chan->mem_resource->start != SH7751_PCI_MEMORY_BASE);
+
+	PCIC_WRITE(SH7751_PCIMBR, chan->mem_resource->start);
+
+	/* Set IOBR for window containing area specified in pci.h */
+	PCIC_WRITE(SH7751_PCIIOBR, (chan->io_resource->start & SH7751_PCIIOBR_MASK));
+
+	/* All done, may as well say so... */
+	printk("SH7751 PCI: Finished initialization of the PCI controller\n");
+
+	return 1;
+}
-- 
cgit v0.10.2


From 805fcc88999162b361ef0b0ce25782ef65f147d7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:46:42 +0900
Subject: sh: pci: Kill off the last remnants of the now unused pci-auto code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 1d53496..ea903a9 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -18,24 +18,6 @@ config SH_PCIDMA_NONCOHERENT
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
 
-# Temporary config option for transitioning off of PCI_AUTO
 config PCI_NEW
 	def_bool y
 	depends on PCI
-
-# This is also board-specific
-config PCI_AUTO
-	bool
-	depends on PCI && !PCI_NEW
-	default y
-
-config PCI_AUTO_UPDATE_RESOURCES
-	bool
-	depends on PCI_AUTO
-	default y if !SH_DREAMCAST
-	help
-	  Selecting this option will cause the PCI auto code to leave your
-	  BAR values alone. Otherwise they will be updated automatically. If
-	  for some reason, you have a board that simply refuses to work
-	  with its resources updated beyond what they are when the device
-	  is powered up, set this to N. Everyone else will want this as Y.
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index e388a70..a8350cc 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -2,7 +2,6 @@
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
 obj-y					+= pci-lib.o
-obj-$(CONFIG_PCI_AUTO)			+= pci.o pci-auto.o
 obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c
deleted file mode 100644
index 1d715ec..0000000
--- a/arch/sh/drivers/pci/pci-auto.c
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- * PCI autoconfiguration library
- *
- * Author: Matt Porter <mporter@mvista.com>
- *
- * Copyright 2000, 2001 MontaVista Software Inc.
- * Copyright 2001 Bradley D. LaRonde <brad@ltc.com>
- * Copyright 2003 Paul Mundt <lethal@linux-sh.org>
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-/*
- * Modified for MIPS by Jun Sun, jsun@mvista.com
- *
- * . Simplify the interface between pci_auto and the rest: a single function.
- * . Assign resources from low address to upper address.
- * . change most int to u32.
- *
- * Further modified to include it as mips generic code, ppopov@mvista.com.
- *
- * 2001-10-26  Bradley D. LaRonde <brad@ltc.com>
- * - Add a top_bus argument to the "early config" functions so that
- *   they can set a fake parent bus pointer to convince the underlying
- *   pci ops to use type 1 configuration for sub busses.
- * - Set bridge base and limit registers correctly.
- * - Align io and memory base properly before and after bridge setup.
- * - Don't fall through to pci_setup_bars for bridge.
- * - Reformat the debug output to look more like lspci's output.
- *
- * Cloned for SuperH by M. R. Brown, mrbrown@0xd6.org
- *
- * 2003-08-05  Paul Mundt <lethal@linux-sh.org>
- * - Don't update the BAR values on systems that already have valid addresses
- *   and don't want these updated for whatever reason, by way of a new config
- *   option check. However, we still read in the old BAR values so that they
- *   can still be reported through the debug output.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-
-#define	DEBUG
-#ifdef	DEBUG
-#define	DBG(x...)	printk(x)
-#else
-#define	DBG(x...)
-#endif
-
-/*
- * These functions are used early on before PCI scanning is done
- * and all of the pci_dev and pci_bus structures have been created.
- */
-static struct pci_dev *fake_pci_dev(struct pci_channel *hose,
-	int top_bus, int busnr, int devfn)
-{
-	static struct pci_dev dev;
-	static struct pci_bus bus;
-
-	dev.bus = &bus;
-	dev.sysdata = hose;
-	dev.devfn = devfn;
-	bus.number = busnr;
-	bus.ops = hose->pci_ops;
-	bus.sysdata = hose;
-
-	if(busnr != top_bus)
-		/* Fake a parent bus structure. */
-		bus.parent = &bus;
-	else
-		bus.parent = NULL;
-
-	return &dev;
-}
-
-#define EARLY_PCI_OP(rw, size, type)					\
-static int early_##rw##_config_##size(struct pci_channel *hose,		\
-	int top_bus, int bus, int devfn, int offset, type value)	\
-{									\
-	return pci_##rw##_config_##size(				\
-		fake_pci_dev(hose, top_bus, bus, devfn),		\
-		offset, value);						\
-}
-
-EARLY_PCI_OP(read, byte, u8 *)
-EARLY_PCI_OP(read, word, u16 *)
-EARLY_PCI_OP(read, dword, u32 *)
-EARLY_PCI_OP(write, byte, u8)
-EARLY_PCI_OP(write, word, u16)
-EARLY_PCI_OP(write, dword, u32)
-
-static struct resource *io_resource_inuse;
-static struct resource *mem_resource_inuse;
-
-static u32 pciauto_lower_iospc;
-static u32 pciauto_upper_iospc;
-
-static u32 pciauto_lower_memspc;
-static u32 pciauto_upper_memspc;
-
-static void __init
-pciauto_setup_bars(struct pci_channel *hose,
-		   int top_bus,
-		   int current_bus,
-		   int pci_devfn,
-		   int bar_limit)
-{
-	u32 bar_response, bar_size, bar_value;
-	u32 bar, addr_mask, bar_nr = 0;
-	u32 * upper_limit;
-	u32 * lower_limit;
-	int found_mem64 = 0;
-
-	for (bar = PCI_BASE_ADDRESS_0; bar <= bar_limit; bar+=4) {
-		u32 bar_addr;
-
-		/* Read the old BAR value */
-		early_read_config_dword(hose, top_bus,
-					current_bus,
-					pci_devfn,
-					bar,
-					&bar_addr);
-
-		/* Tickle the BAR and get the response */
-		early_write_config_dword(hose, top_bus,
-					 current_bus,
-					 pci_devfn,
-					 bar,
-					 0xffffffff);
-
-		early_read_config_dword(hose, top_bus,
-					current_bus,
-					pci_devfn,
-					bar,
-					&bar_response);
-
-		/*
-		 * Write the old BAR value back out, only update the BAR
-		 * if we implicitly want resources to be updated, which
-		 * is done by the generic code further down. -- PFM.
-		 */
-		early_write_config_dword(hose, top_bus,
-					 current_bus,
-					 pci_devfn,
-					 bar,
-					 bar_addr);
-
-		/* If BAR is not implemented go to the next BAR */
-		if (!bar_response)
-			continue;
-
-		/*
-		 * Workaround for a BAR that doesn't use its upper word,
-		 * like the ALi 1535D+ PCI DC-97 Controller Modem (M5457).
-		 * bdl <brad@ltc.com>
-		 */
-		if (!(bar_response & 0xffff0000))
-			bar_response |= 0xffff0000;
-
-retry:
-		/* Check the BAR type and set our address mask */
-		if (bar_response & PCI_BASE_ADDRESS_SPACE) {
-			addr_mask = PCI_BASE_ADDRESS_IO_MASK;
-			upper_limit = &pciauto_upper_iospc;
-			lower_limit = &pciauto_lower_iospc;
-			DBG("        I/O");
-		} else {
-			if ((bar_response & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
-			    PCI_BASE_ADDRESS_MEM_TYPE_64)
-				found_mem64 = 1;
-
-			addr_mask = PCI_BASE_ADDRESS_MEM_MASK;
-			upper_limit = &pciauto_upper_memspc;
-			lower_limit = &pciauto_lower_memspc;
-			DBG("        Mem");
-		}
-
-
-		/* Calculate requested size */
-		bar_size = ~(bar_response & addr_mask) + 1;
-
-		/* Allocate a base address */
-		bar_value = ((*lower_limit - 1) & ~(bar_size - 1)) + bar_size;
-
-		if ((bar_value + bar_size) > *upper_limit) {
-			if (bar_response & PCI_BASE_ADDRESS_SPACE) {
-				if (io_resource_inuse->child) {
-					io_resource_inuse =
-						io_resource_inuse->child;
-					pciauto_lower_iospc =
-						io_resource_inuse->start;
-					pciauto_upper_iospc =
-						io_resource_inuse->end + 1;
-					goto retry;
-				}
-
-			} else {
-				if (mem_resource_inuse->child) {
-					mem_resource_inuse =
-						mem_resource_inuse->child;
-					pciauto_lower_memspc =
-						mem_resource_inuse->start;
-					pciauto_upper_memspc =
-						mem_resource_inuse->end + 1;
-					goto retry;
-				}
-			}
-			DBG(" unavailable -- skipping, value %x size %x\n",
-					bar_value, bar_size);
-			continue;
-		}
-
-		if (bar_value < *lower_limit || (bar_value + bar_size) >= *upper_limit) {
-			DBG(" unavailable -- skipping, value %x size %x\n",
-					bar_value, bar_size);
-			continue;
-		}
-
-#ifdef CONFIG_PCI_AUTO_UPDATE_RESOURCES
-		/* Write it out and update our limit */
-		early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-					 bar, bar_value);
-#endif
-
-		*lower_limit = bar_value + bar_size;
-
-		/*
-		 * If we are a 64-bit decoder then increment to the
-		 * upper 32 bits of the bar and force it to locate
-		 * in the lower 4GB of memory.
-		 */
-		if (found_mem64) {
-			bar += 4;
-			early_write_config_dword(hose, top_bus,
-						 current_bus,
-						 pci_devfn,
-						 bar,
-						 0x00000000);
-		}
-
-		DBG(" at 0x%.8x [size=0x%x]\n", bar_value, bar_size);
-
-		bar_nr++;
-	}
-
-}
-
-static void __init
-pciauto_prescan_setup_bridge(struct pci_channel *hose,
-			     int top_bus,
-			     int current_bus,
-			     int pci_devfn,
-			     int sub_bus)
-{
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-	                        PCI_PRIMARY_BUS, current_bus);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SECONDARY_BUS, sub_bus + 1);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, 0xff);
-
-	/* Align memory and I/O to 1MB and 4KB boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1))
-		& ~(0x100000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-
-	/* Set base (lower limit) of address range behind bridge. */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_MEMORY_BASE, pciauto_lower_memspc >> 16);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_BASE, (pciauto_lower_iospc & 0x0000f000) >> 8);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_BASE_UPPER16, pciauto_lower_iospc >> 16);
-
-	/* We don't support prefetchable memory for now, so disable */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-				PCI_PREF_MEMORY_BASE, 0);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-				PCI_PREF_MEMORY_LIMIT, 0);
-}
-
-static void __init
-pciauto_postscan_setup_bridge(struct pci_channel *hose,
-			      int top_bus,
-			      int current_bus,
-			      int pci_devfn,
-			      int sub_bus)
-{
-	u32 temp;
-
-	/*
-	 * [jsun] we always bump up baselines a little, so that if there
-	 * nothing behind P2P bridge, we don't wind up overlapping IO/MEM
-	 * spaces.
-	 */
-	pciauto_lower_memspc += 1;
-	pciauto_lower_iospc += 1;
-
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, sub_bus);
-
-	/* Set upper limit of address range behind bridge. */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_MEMORY_LIMIT, pciauto_lower_memspc >> 16);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_LIMIT, (pciauto_lower_iospc & 0x0000f000) >> 8);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_LIMIT_UPPER16, pciauto_lower_iospc >> 16);
-
-	/* Align memory and I/O to 1MB and 4KB boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1))
-		& ~(0x100000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-
-	/* Enable memory and I/O accesses, enable bus master */
-	early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, &temp);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY
-		| PCI_COMMAND_MASTER);
-}
-
-static void __init
-pciauto_prescan_setup_cardbus_bridge(struct pci_channel *hose,
-			int top_bus,
-			int current_bus,
-			int pci_devfn,
-			int sub_bus)
-{
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_PRIMARY_BUS, current_bus);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SECONDARY_BUS, sub_bus + 1);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, 0xff);
-
-	/* Align memory and I/O to 4KB and 4 byte boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1))
-		& ~(0x4 - 1);
-
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_MEMORY_BASE_0, pciauto_lower_memspc);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_IO_BASE_0, pciauto_lower_iospc);
-}
-
-static void __init
-pciauto_postscan_setup_cardbus_bridge(struct pci_channel *hose,
-			int top_bus,
-			int current_bus,
-			int pci_devfn,
-			int sub_bus)
-{
-	u32 temp;
-
-	/*
-	 * [jsun] we always bump up baselines a little, so that if there
-	 * nothing behind P2P bridge, we don't wind up overlapping IO/MEM
-	 * spaces.
-	 */
-	pciauto_lower_memspc += 1;
-	pciauto_lower_iospc += 1;
-
-	/*
-	 * Configure subordinate bus number.  The PCI subsystem
-	 * bus scan will renumber buses (reserving three additional
-	 * for this PCI<->CardBus bridge for the case where a CardBus
-	 * adapter contains a P2P or CB2CB bridge.
-	 */
-
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, sub_bus);
-
-	/*
-	 * Reserve an additional 4MB for mem space and 16KB for
-	 * I/O space.  This should cover any additional space
-	 * requirement of unusual CardBus devices with
-	 * additional bridges that can consume more address space.
-	 *
-	 * Although pcmcia-cs currently will reprogram bridge
-	 * windows, the goal is to add an option to leave them
-	 * alone and use the bridge window ranges as the regions
-	 * that are searched for free resources upon hot-insertion
-	 * of a device.  This will allow a PCI<->CardBus bridge
-	 * configured by this routine to happily live behind a
-	 * P2P bridge in a system.
-	 */
-	/* Align memory and I/O to 4KB and 4 byte boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1))
-		& ~(0x4 - 1);
-	/* Set up memory and I/O filter limits, assume 32-bit I/O space */
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_MEMORY_LIMIT_0, pciauto_lower_memspc - 1);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_IO_LIMIT_0, pciauto_lower_iospc - 1);
-
-	/* Enable memory and I/O accesses, enable bus master */
-	early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, &temp);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
-		PCI_COMMAND_MASTER);
-}
-
-#define	PCIAUTO_IDE_MODE_MASK		0x05
-
-static int __init
-pciauto_bus_scan(struct pci_channel *hose, int top_bus, int current_bus)
-{
-	int sub_bus;
-	u32 pci_devfn, pci_class, cmdstat, found_multi=0;
-	unsigned short vid, did;
-	unsigned char header_type;
-	int devfn_start = 0;
-	int devfn_stop = 0xff;
-
-	sub_bus = current_bus;
-
-	if (hose->first_devfn)
-		devfn_start = hose->first_devfn;
-	if (hose->last_devfn)
-		devfn_stop = hose->last_devfn;
-
-	for (pci_devfn=devfn_start; pci_devfn<devfn_stop; pci_devfn++) {
-
-		if (PCI_FUNC(pci_devfn) && !found_multi)
-			continue;
-
-		early_read_config_word(hose, top_bus, current_bus, pci_devfn,
-				       PCI_VENDOR_ID, &vid);
-
-		if (vid == 0xffff) continue;
-
-		early_read_config_byte(hose, top_bus, current_bus, pci_devfn,
-				       PCI_HEADER_TYPE, &header_type);
-
-		if (!PCI_FUNC(pci_devfn))
-			found_multi = header_type & 0x80;
-
-		early_read_config_word(hose, top_bus, current_bus, pci_devfn,
-				       PCI_DEVICE_ID, &did);
-
-		early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-					PCI_CLASS_REVISION, &pci_class);
-
-		DBG("%.2x:%.2x.%x Class %.4x: %.4x:%.4x",
-			current_bus, PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn),
-			pci_class >> 16, vid, did);
-		if (pci_class & 0xff)
-			DBG(" (rev %.2x)", pci_class & 0xff);
-		DBG("\n");
-
-		if ((pci_class >> 16) == PCI_CLASS_BRIDGE_PCI) {
-			DBG("        Bridge: primary=%.2x, secondary=%.2x\n",
-				current_bus, sub_bus + 1);
-			pciauto_prescan_setup_bridge(hose, top_bus, current_bus,
-						     pci_devfn, sub_bus);
-			DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n",
-				sub_bus + 1,
-				pciauto_lower_iospc, pciauto_lower_memspc);
-			sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1);
-			DBG("Back to bus %.2x\n", current_bus);
-			pciauto_postscan_setup_bridge(hose, top_bus, current_bus,
-							pci_devfn, sub_bus);
-			continue;
-		} else if ((pci_class >> 16) == PCI_CLASS_BRIDGE_CARDBUS) {
-			DBG("  CARDBUS  Bridge: primary=%.2x, secondary=%.2x\n",
-				current_bus, sub_bus + 1);
-			DBG("PCI Autoconfig: Found CardBus bridge, device %d function %d\n", PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn));
-			/* Place CardBus Socket/ExCA registers */
-			pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_0);
-
-			pciauto_prescan_setup_cardbus_bridge(hose, top_bus,
-					current_bus, pci_devfn, sub_bus);
-
-			DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n",
-				sub_bus + 1,
-				pciauto_lower_iospc, pciauto_lower_memspc);
-			sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1);
-			DBG("Back to bus %.2x, sub_bus is %x\n", current_bus, sub_bus);
-			pciauto_postscan_setup_cardbus_bridge(hose, top_bus,
-					current_bus, pci_devfn, sub_bus);
-			continue;
-		} else if ((pci_class >> 16) == PCI_CLASS_STORAGE_IDE) {
-
-			unsigned char prg_iface;
-
-			early_read_config_byte(hose, top_bus, current_bus,
-				pci_devfn, PCI_CLASS_PROG, &prg_iface);
-			if (!(prg_iface & PCIAUTO_IDE_MODE_MASK)) {
-				DBG("Skipping legacy mode IDE controller\n");
-				continue;
-			}
-		}
-
-		/*
-		 * Found a peripheral, enable some standard
-		 * settings
-		 */
-		early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-					PCI_COMMAND, &cmdstat);
-		early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-					 PCI_COMMAND, cmdstat | PCI_COMMAND_IO |
-					 PCI_COMMAND_MEMORY |
-					 PCI_COMMAND_MASTER);
-		early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-					PCI_LATENCY_TIMER, 0x80);
-
-		/* Allocate PCI I/O and/or memory space */
-		pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_5);
-	}
-	return sub_bus;
-}
-
-int __init
-pciauto_assign_resources(int busno, struct pci_channel *hose)
-{
-	/* setup resource limits */
-	io_resource_inuse = hose->io_resource;
-	mem_resource_inuse = hose->mem_resource;
-
-	pciauto_lower_iospc = io_resource_inuse->start;
-	pciauto_upper_iospc = io_resource_inuse->end + 1;
-	pciauto_lower_memspc = mem_resource_inuse->start;
-	pciauto_upper_memspc = mem_resource_inuse->end + 1;
-	DBG("Autoconfig PCI channel 0x%p\n", hose);
-	DBG("Scanning bus %.2x, I/O 0x%.8x:0x%.8x, Mem 0x%.8x:0x%.8x\n",
-		busno, pciauto_lower_iospc, pciauto_upper_iospc, 
-		pciauto_lower_memspc, pciauto_upper_memspc);
-
-	return pciauto_bus_scan(hose, busno, busno);
-}
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
deleted file mode 100644
index 8c332b2..0000000
--- a/arch/sh/drivers/pci/pci.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * arch/sh/drivers/pci/pci.c
- *
- * Copyright (c) 2002 M. R. Brown  <mrbrown@linux-sh.org>
- * Copyright (c) 2004 - 2006 Paul Mundt  <lethal@linux-sh.org>
- *
- * These functions are collected here to reduce duplication of common
- * code amongst the many platform-specific PCI support code files.
- *
- * These routines require the following board-specific routines:
- * void pcibios_fixup_irqs();
- *
- * See include/asm-sh/pci.h for more information.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/dma-debug.h>
-#include <asm/io.h>
-
-static int __init pcibios_init(void)
-{
-	struct pci_channel *p;
-	struct pci_bus *bus;
-	int busno;
-
-	/* init channels */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->init(p) == 0)
-			p->enabled = 1;
-		else
-			pr_err("Unable to init pci channel %d\n", busno);
-		busno++;
-	}
-
-#ifdef CONFIG_PCI_AUTO
-	/* assign resources */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++)
-		if (p->enabled)
-			busno = pciauto_assign_resources(busno, p) + 1;
-#endif
-
-	/* scan the buses */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->enabled) {
-			bus = pci_scan_bus(busno, p->pci_ops, p);
-			busno = bus->subordinate + 1;
-		}
-	}
-
-	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
-
-	dma_debug_add_bus(&pci_bus_type);
-
-	return 0;
-}
-subsys_initcall(pcibios_init);
-
-/*
- *  Called after each bus is probed, but before its children
- *  are examined.
- */
-void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
-{
-	pci_read_bridge_bases(bus);
-}
-
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index f36c789..f9101215 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -19,8 +19,6 @@
 struct pci_channel {
 	struct pci_channel	*next;
 
-	int			(*init)(struct pci_channel *chan);
-
 	struct pci_ops		*pci_ops;
 	struct resource		*io_resource;
 	struct resource		*mem_resource;
@@ -28,20 +26,11 @@ struct pci_channel {
 	unsigned long		io_offset;
 	unsigned long		mem_offset;
 
-	int			first_devfn;
-	int			last_devfn;
-	int			enabled;
-
 	unsigned long		reg_base;
 
 	unsigned long		io_map_base;
 };
 
-/*
- * Each board initializes this array and terminates it with a NULL entry.
- */
-extern struct pci_channel board_pci_channels[];
-
 extern void register_pci_controller(struct pci_channel *hose);
 
 extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
@@ -122,13 +111,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 #endif
 
 /* Board-specific fixup routines. */
-int pcibios_init_platform(void);
 int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 
-#ifdef CONFIG_PCI_AUTO
-int pciauto_assign_resources(int busno, struct pci_channel *hose);
-#endif
-
 extern void pcibios_resource_to_bus(struct pci_dev *dev,
 	struct pci_bus_region *region, struct resource *res);
 
-- 
cgit v0.10.2


From 35bcfffd86aac933205fbf8401e3e7e4dde68264 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:51:19 +0900
Subject: sh: pci: Roll pci-lib in to pci-new.

Now that the pci-auto cruft is gone, pci-lib can go away.
Roll it back in to pci-new.c where it originally split off from.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index a8350cc..eacac7f 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,7 +1,6 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-obj-y					+= pci-lib.o
 obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
deleted file mode 100644
index f072aca..0000000
--- a/arch/sh/drivers/pci/pci-lib.c
+++ /dev/null
@@ -1,219 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-
-unsigned long PCIBIOS_MIN_IO = 0x0000;
-unsigned long PCIBIOS_MIN_MEM = 0;
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	struct pci_dev *dev = data;
-	struct pci_channel *chan = dev->sysdata;
-	resource_size_t start = res->start;
-
-	if (res->flags & IORESOURCE_IO) {
-		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
-			start = PCIBIOS_MIN_IO + chan->io_resource->start;
-
-		/*
-                 * Put everything into 0x00-0xff region modulo 0x400.
-		 */
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	} else if (res->flags & IORESOURCE_MEM) {
-		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
-			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
-	}
-
-	res->start = start;
-}
-
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			 struct resource *res)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	region->start = res->start - offset;
-	region->end = res->end - offset;
-}
-
-void __devinit
-pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			struct pci_bus_region *region)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	res->start = region->start + offset;
-	res->end = region->end + offset;
-}
-
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
-		/* Only set up the requested stuff */
-		if (!(mask & (1<<idx)))
-			continue;
-
-		r = &dev->resource[idx];
-		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-			continue;
-		if ((idx == PCI_ROM_RESOURCE) &&
-				(!(r->flags & IORESOURCE_ROM_ENABLE)))
-			continue;
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available "
-			       "because of resource collisions\n",
-			       pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (cmd != old_cmd) {
-		printk("PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
-int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
-{
-	/*
-	 * I/O space can be accessed via normal processor loads and stores on
-	 * this platform but for now we elect not to do this and portable
-	 * drivers should not do this anyway.
-	 */
-	if (mmap_state == pci_mmap_io)
-		return -EINVAL;
-
-	/*
-	 * Ignore write-combine; for now only return uncached mappings.
-	 */
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot);
-}
-
-static void __iomem *ioport_map_pci(struct pci_dev *dev,
-				    unsigned long port, unsigned int nr)
-{
-	struct pci_channel *chan = dev->sysdata;
-
-	if (!chan->io_map_base)
-		chan->io_map_base = generic_io_base;
-
-	return (void __iomem *)(chan->io_map_base + port);
-}
-
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	if (flags & IORESOURCE_IO)
-		return ioport_map_pci(dev, start, len);
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_MEM) {
-		if (flags & IORESOURCE_CACHEABLE)
-			return ioremap(start, len);
-
-		return ioremap_nocache(start, len);
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
-#ifdef CONFIG_HOTPLUG
-EXPORT_SYMBOL(pcibios_resource_to_bus);
-EXPORT_SYMBOL(pcibios_bus_to_resource);
-EXPORT_SYMBOL(PCIBIOS_MIN_IO);
-EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
-#endif
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 8c0b136..54d77cb 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -1,20 +1,28 @@
 /*
  * New-style PCI core.
  *
- * Copyright (c) 2002  M. R. Brown
  * Copyright (c) 2004 - 2009  Paul Mundt
+ * Copyright (c) 2002  M. R. Brown
+ *
+ * Modelled after arch/mips/pci/pci.c:
+ *  Copyright (C) 2003, 04 Ralf Baechle (ralf@linux-mips.org)
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/dma-debug.h>
 #include <linux/io.h>
 #include <linux/mutex.h>
 
+unsigned long PCIBIOS_MIN_IO = 0x0000;
+unsigned long PCIBIOS_MIN_MEM = 0;
+
 /*
  * The PCI controller list.
  */
@@ -27,9 +35,6 @@ static void __devinit pcibios_scanbus(struct pci_channel *hose)
 	static int next_busno;
 	struct pci_bus *bus;
 
-	/* Catch botched conversion attempts */
-	BUG_ON(hose->init);
-
 	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
 	if (bus) {
 		next_busno = bus->subordinate + 1;
@@ -128,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
  *  Called after each bus is probed, but before its children
  *  are examined.
  */
-void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev = bus->self;
 	struct list_head *ln;
@@ -146,3 +151,214 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 			pcibios_fixup_device_resources(dev, bus);
 	}
 }
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	struct pci_channel *chan = dev->sysdata;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
+			start = PCIBIOS_MIN_IO + chan->io_resource->start;
+
+		/*
+                 * Put everything into 0x00-0xff region modulo 0x400.
+		 */
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
+			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
+	}
+
+	res->start = start;
+}
+
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<idx)))
+			continue;
+
+		r = &dev->resource[idx];
+		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if ((idx == PCI_ROM_RESOURCE) &&
+				(!(r->flags & IORESOURCE_ROM_ENABLE)))
+			continue;
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available "
+			       "because of resource collisions\n",
+			       pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (cmd != old_cmd) {
+		printk("PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+char * __devinit pcibios_setup(char *str)
+{
+	return str;
+}
+
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state, int write_combine)
+{
+	/*
+	 * I/O space can be accessed via normal processor loads and stores on
+	 * this platform but for now we elect not to do this and portable
+	 * drivers should not do this anyway.
+	 */
+	if (mmap_state == pci_mmap_io)
+		return -EINVAL;
+
+	/*
+	 * Ignore write-combine; for now only return uncached mappings.
+	 */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static void __iomem *ioport_map_pci(struct pci_dev *dev,
+				    unsigned long port, unsigned int nr)
+{
+	struct pci_channel *chan = dev->sysdata;
+
+	if (!chan->io_map_base)
+		chan->io_map_base = generic_io_base;
+
+	return (void __iomem *)(chan->io_map_base + port);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	if (flags & IORESOURCE_IO)
+		return ioport_map_pci(dev, start, len);
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_MEM) {
+		if (flags & IORESOURCE_CACHEABLE)
+			return ioremap(start, len);
+
+		return ioremap_nocache(start, len);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+EXPORT_SYMBOL(PCIBIOS_MIN_IO);
+EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+#endif
-- 
cgit v0.10.2


From cf242007a1eb29dcf93d1cb34713ec9b3f4a0e1b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:53:33 +0900
Subject: sh: pci: Rename pci-new.c to pci.c.

pci-new.c is now in a state to replace the old pci.c, rename it
accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index ea903a9..e8db585 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -17,7 +17,3 @@ config SH_PCIDMA_NONCOHERENT
 	  code will not have to flush the CPU's caches. If you have a PCI host
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
-
-config PCI_NEW
-	def_bool y
-	depends on PCI
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index eacac7f..d2ffc47 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-obj-$(CONFIG_PCI_NEW)			+= pci-new.o
+obj-y					+= pci.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
deleted file mode 100644
index 54d77cb..0000000
--- a/arch/sh/drivers/pci/pci-new.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * New-style PCI core.
- *
- * Copyright (c) 2004 - 2009  Paul Mundt
- * Copyright (c) 2002  M. R. Brown
- *
- * Modelled after arch/mips/pci/pci.c:
- *  Copyright (C) 2003, 04 Ralf Baechle (ralf@linux-mips.org)
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/dma-debug.h>
-#include <linux/io.h>
-#include <linux/mutex.h>
-
-unsigned long PCIBIOS_MIN_IO = 0x0000;
-unsigned long PCIBIOS_MIN_MEM = 0;
-
-/*
- * The PCI controller list.
- */
-static struct pci_channel *hose_head, **hose_tail = &hose_head;
-
-static int pci_initialized;
-
-static void __devinit pcibios_scanbus(struct pci_channel *hose)
-{
-	static int next_busno;
-	struct pci_bus *bus;
-
-	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
-	if (bus) {
-		next_busno = bus->subordinate + 1;
-		/* Don't allow 8-bit bus number overflow inside the hose -
-		   reserve some space for bridges. */
-		if (next_busno > 224)
-			next_busno = 0;
-
-		pci_bus_size_bridges(bus);
-		pci_bus_assign_resources(bus);
-		pci_enable_bridges(bus);
-	}
-}
-
-static DEFINE_MUTEX(pci_scan_mutex);
-
-void __devinit register_pci_controller(struct pci_channel *hose)
-{
-	if (request_resource(&iomem_resource, hose->mem_resource) < 0)
-		goto out;
-	if (request_resource(&ioport_resource, hose->io_resource) < 0) {
-		release_resource(hose->mem_resource);
-		goto out;
-	}
-
-	*hose_tail = hose;
-	hose_tail = &hose->next;
-
-	/*
-	 * Do not panic here but later - this might hapen before console init.
-	 */
-	if (!hose->io_map_base) {
-		printk(KERN_WARNING
-		       "registering PCI controller with io_map_base unset\n");
-	}
-
-	/*
-	 * Scan the bus if it is register after the PCI subsystem
-	 * initialization.
-	 */
-	if (pci_initialized) {
-		mutex_lock(&pci_scan_mutex);
-		pcibios_scanbus(hose);
-		mutex_unlock(&pci_scan_mutex);
-	}
-
-	return;
-
-out:
-	printk(KERN_WARNING
-	       "Skipping PCI bus scan due to resource conflict\n");
-}
-
-static int __init pcibios_init(void)
-{
-	struct pci_channel *hose;
-
-	/* Scan all of the recorded PCI controllers.  */
-	for (hose = hose_head; hose; hose = hose->next)
-		pcibios_scanbus(hose);
-
-	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
-
-	dma_debug_add_bus(&pci_bus_type);
-
-	pci_initialized = 1;
-
-	return 0;
-}
-subsys_initcall(pcibios_init);
-
-static void pcibios_fixup_device_resources(struct pci_dev *dev,
-	struct pci_bus *bus)
-{
-	/* Update device resources.  */
-	struct pci_channel *hose = bus->sysdata;
-	unsigned long offset = 0;
-	int i;
-
-	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
-		if (!dev->resource[i].start)
-			continue;
-		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
-			continue;
-		if (dev->resource[i].flags & IORESOURCE_IO)
-			offset = hose->io_offset;
-		else if (dev->resource[i].flags & IORESOURCE_MEM)
-			offset = hose->mem_offset;
-
-		dev->resource[i].start += offset;
-		dev->resource[i].end += offset;
-	}
-}
-
-/*
- *  Called after each bus is probed, but before its children
- *  are examined.
- */
-void __devinit pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_dev *dev = bus->self;
-	struct list_head *ln;
-	struct pci_channel *chan = bus->sysdata;
-
-	if (!dev) {
-		bus->resource[0] = chan->io_resource;
-		bus->resource[1] = chan->mem_resource;
-	}
-
-	for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
-		dev = pci_dev_b(ln);
-
-		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
-			pcibios_fixup_device_resources(dev, bus);
-	}
-}
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	struct pci_dev *dev = data;
-	struct pci_channel *chan = dev->sysdata;
-	resource_size_t start = res->start;
-
-	if (res->flags & IORESOURCE_IO) {
-		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
-			start = PCIBIOS_MIN_IO + chan->io_resource->start;
-
-		/*
-                 * Put everything into 0x00-0xff region modulo 0x400.
-		 */
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	} else if (res->flags & IORESOURCE_MEM) {
-		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
-			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
-	}
-
-	res->start = start;
-}
-
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			 struct resource *res)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	region->start = res->start - offset;
-	region->end = res->end - offset;
-}
-
-void __devinit
-pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			struct pci_bus_region *region)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	res->start = region->start + offset;
-	res->end = region->end + offset;
-}
-
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
-		/* Only set up the requested stuff */
-		if (!(mask & (1<<idx)))
-			continue;
-
-		r = &dev->resource[idx];
-		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-			continue;
-		if ((idx == PCI_ROM_RESOURCE) &&
-				(!(r->flags & IORESOURCE_ROM_ENABLE)))
-			continue;
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available "
-			       "because of resource collisions\n",
-			       pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (cmd != old_cmd) {
-		printk("PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
-int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
-{
-	/*
-	 * I/O space can be accessed via normal processor loads and stores on
-	 * this platform but for now we elect not to do this and portable
-	 * drivers should not do this anyway.
-	 */
-	if (mmap_state == pci_mmap_io)
-		return -EINVAL;
-
-	/*
-	 * Ignore write-combine; for now only return uncached mappings.
-	 */
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot);
-}
-
-static void __iomem *ioport_map_pci(struct pci_dev *dev,
-				    unsigned long port, unsigned int nr)
-{
-	struct pci_channel *chan = dev->sysdata;
-
-	if (!chan->io_map_base)
-		chan->io_map_base = generic_io_base;
-
-	return (void __iomem *)(chan->io_map_base + port);
-}
-
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	if (flags & IORESOURCE_IO)
-		return ioport_map_pci(dev, start, len);
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_MEM) {
-		if (flags & IORESOURCE_CACHEABLE)
-			return ioremap(start, len);
-
-		return ioremap_nocache(start, len);
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
-#ifdef CONFIG_HOTPLUG
-EXPORT_SYMBOL(pcibios_resource_to_bus);
-EXPORT_SYMBOL(pcibios_bus_to_resource);
-EXPORT_SYMBOL(PCIBIOS_MIN_IO);
-EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
-#endif
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
new file mode 100644
index 0000000..54d77cb
--- /dev/null
+++ b/arch/sh/drivers/pci/pci.c
@@ -0,0 +1,364 @@
+/*
+ * New-style PCI core.
+ *
+ * Copyright (c) 2004 - 2009  Paul Mundt
+ * Copyright (c) 2002  M. R. Brown
+ *
+ * Modelled after arch/mips/pci/pci.c:
+ *  Copyright (C) 2003, 04 Ralf Baechle (ralf@linux-mips.org)
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/dma-debug.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+
+unsigned long PCIBIOS_MIN_IO = 0x0000;
+unsigned long PCIBIOS_MIN_MEM = 0;
+
+/*
+ * The PCI controller list.
+ */
+static struct pci_channel *hose_head, **hose_tail = &hose_head;
+
+static int pci_initialized;
+
+static void __devinit pcibios_scanbus(struct pci_channel *hose)
+{
+	static int next_busno;
+	struct pci_bus *bus;
+
+	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
+	if (bus) {
+		next_busno = bus->subordinate + 1;
+		/* Don't allow 8-bit bus number overflow inside the hose -
+		   reserve some space for bridges. */
+		if (next_busno > 224)
+			next_busno = 0;
+
+		pci_bus_size_bridges(bus);
+		pci_bus_assign_resources(bus);
+		pci_enable_bridges(bus);
+	}
+}
+
+static DEFINE_MUTEX(pci_scan_mutex);
+
+void __devinit register_pci_controller(struct pci_channel *hose)
+{
+	if (request_resource(&iomem_resource, hose->mem_resource) < 0)
+		goto out;
+	if (request_resource(&ioport_resource, hose->io_resource) < 0) {
+		release_resource(hose->mem_resource);
+		goto out;
+	}
+
+	*hose_tail = hose;
+	hose_tail = &hose->next;
+
+	/*
+	 * Do not panic here but later - this might hapen before console init.
+	 */
+	if (!hose->io_map_base) {
+		printk(KERN_WARNING
+		       "registering PCI controller with io_map_base unset\n");
+	}
+
+	/*
+	 * Scan the bus if it is register after the PCI subsystem
+	 * initialization.
+	 */
+	if (pci_initialized) {
+		mutex_lock(&pci_scan_mutex);
+		pcibios_scanbus(hose);
+		mutex_unlock(&pci_scan_mutex);
+	}
+
+	return;
+
+out:
+	printk(KERN_WARNING
+	       "Skipping PCI bus scan due to resource conflict\n");
+}
+
+static int __init pcibios_init(void)
+{
+	struct pci_channel *hose;
+
+	/* Scan all of the recorded PCI controllers.  */
+	for (hose = hose_head; hose; hose = hose->next)
+		pcibios_scanbus(hose);
+
+	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
+
+	dma_debug_add_bus(&pci_bus_type);
+
+	pci_initialized = 1;
+
+	return 0;
+}
+subsys_initcall(pcibios_init);
+
+static void pcibios_fixup_device_resources(struct pci_dev *dev,
+	struct pci_bus *bus)
+{
+	/* Update device resources.  */
+	struct pci_channel *hose = bus->sysdata;
+	unsigned long offset = 0;
+	int i;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		if (!dev->resource[i].start)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_IO)
+			offset = hose->io_offset;
+		else if (dev->resource[i].flags & IORESOURCE_MEM)
+			offset = hose->mem_offset;
+
+		dev->resource[i].start += offset;
+		dev->resource[i].end += offset;
+	}
+}
+
+/*
+ *  Called after each bus is probed, but before its children
+ *  are examined.
+ */
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct list_head *ln;
+	struct pci_channel *chan = bus->sysdata;
+
+	if (!dev) {
+		bus->resource[0] = chan->io_resource;
+		bus->resource[1] = chan->mem_resource;
+	}
+
+	for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
+		dev = pci_dev_b(ln);
+
+		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+			pcibios_fixup_device_resources(dev, bus);
+	}
+}
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	struct pci_channel *chan = dev->sysdata;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
+			start = PCIBIOS_MIN_IO + chan->io_resource->start;
+
+		/*
+                 * Put everything into 0x00-0xff region modulo 0x400.
+		 */
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
+			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
+	}
+
+	res->start = start;
+}
+
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<idx)))
+			continue;
+
+		r = &dev->resource[idx];
+		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if ((idx == PCI_ROM_RESOURCE) &&
+				(!(r->flags & IORESOURCE_ROM_ENABLE)))
+			continue;
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available "
+			       "because of resource collisions\n",
+			       pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (cmd != old_cmd) {
+		printk("PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+char * __devinit pcibios_setup(char *str)
+{
+	return str;
+}
+
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state, int write_combine)
+{
+	/*
+	 * I/O space can be accessed via normal processor loads and stores on
+	 * this platform but for now we elect not to do this and portable
+	 * drivers should not do this anyway.
+	 */
+	if (mmap_state == pci_mmap_io)
+		return -EINVAL;
+
+	/*
+	 * Ignore write-combine; for now only return uncached mappings.
+	 */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static void __iomem *ioport_map_pci(struct pci_dev *dev,
+				    unsigned long port, unsigned int nr)
+{
+	struct pci_channel *chan = dev->sysdata;
+
+	if (!chan->io_map_base)
+		chan->io_map_base = generic_io_base;
+
+	return (void __iomem *)(chan->io_map_base + port);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	if (flags & IORESOURCE_IO)
+		return ioport_map_pci(dev, start, len);
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_MEM) {
+		if (flags & IORESOURCE_CACHEABLE)
+			return ioremap(start, len);
+
+		return ioremap_nocache(start, len);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+EXPORT_SYMBOL(PCIBIOS_MIN_IO);
+EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+#endif
-- 
cgit v0.10.2


From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:27:58 -0400
Subject: tracing: change branch profiling to a choice selection

This patch makes the branch profiling into a choice selection:

  None               - no branch profiling
  likely/unlikely    - only profile likely/unlikely branches
  all                - profile all branches

The all profiler will also enable the likely/unlikely branches.

This does not change the way the profiler works or the dependencies
between the profilers.

What this patch does, is keep the branch profiling from being selected
by an allyesconfig make. The branch profiler is very intrusive and
it is known to break various architecture builds when selected as an
allyesconfig.

[ Impact: prevent branch profiler from being selected in allyesconfig ]

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 57981d3..3ee28db 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -212,8 +212,36 @@ config BOOT_TRACER
 	  to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
-	bool "Trace likely/unlikely profiler"
+	bool
 	select TRACING
+
+choice
+	prompt "Branch Profiling"
+	default BRANCH_PROFILE_NONE
+	help
+	 The branch profiling is a software profiler. It will add hooks
+	 into the C conditionals to test which path a branch takes.
+
+	 The likely/unlikely profiler only looks at the conditions that
+	 are annotated with a likely or unlikely macro.
+
+	 The "all branch" profiler will profile every if statement in the
+	 kernel. This profiler will also enable the likely/unlikely
+	 profiler as well.
+
+	 Either of the above profilers add a bit of overhead to the system.
+	 If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+	bool "No branch profiling"
+	help
+	 No branch profiling. Branch profiling adds a bit of overhead.
+	 Only enable it if you want to analyse the branching behavior.
+	 Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
+	bool "Trace likely/unlikely profiler"
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all the the likely and unlikely macros
 	  in the kernel. It will display the results in:
@@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
 
-	  Say N if unsure.
-
 config PROFILE_ALL_BRANCHES
 	bool "Profile all if conditionals"
-	depends on TRACE_BRANCH_PROFILING
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all branch conditions. Every if ()
 	  taken in the kernel is recorded whether it hit or miss.
@@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES
 
 	  /debugfs/tracing/profile_branch
 
+	  This option also enables the likely/unlikely profiler.
+
 	  This configuration, when enabled, will impose a great overhead
 	  on the system. This should only be enabled when the system
 	  is to be analyzed
-
-	  Say N if unsure.
+endchoice
 
 config TRACING_BRANCHES
 	bool
-- 
cgit v0.10.2


From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:47:36 -0400
Subject: tracing: create menuconfig for tracing infrastructure

During testing we often use randconfig to test various kernels.
The current configuration set up does not give an easy way to disable
all tracing with a single config. The case where randconfig would
test all tracing disabled is very unlikely.

This patch adds a config option to enable or disable all tracing.
It is hooked into the tracing menu just like other submenus are done.

[ Impact: allow randconfig to easily produce all traces disabled ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3ee28db..3fa36d2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,12 @@ config TRACING_SUPPORT
 
 if TRACING_SUPPORT
 
-menu "Tracers"
+menuconfig FTRACE
+	bool "Tracers"
+	help
+	 Enable the kernel tracing infrastructure.
+
+if FTRACE
 
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
@@ -462,7 +467,7 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
-endmenu
+endif # FTRACE
 
 endif # TRACING_SUPPORT
 
-- 
cgit v0.10.2


From d24e473e5b2ca86d1288b9416227ccc603313d0f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 13:32:07 +0200
Subject: perf_counter: copy in Git's top Makefile

We'd like to have a similar user-space structure as Git has, for the
perfcounter tools - so copy in Git's toplevel makefile as-is.

We'll strip it down in subsequent commits to make it fit the
perfcounters code.

The Git version used: 66996ec: Sync with 1.6.2.4

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 1dd37ee..6e0838b 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -1,18 +1,1731 @@
-BINS = kerneltop perfstat perf-record perf-report
+# The default target of this Makefile is...
+all::
 
-all: $(BINS)
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
+# or vsnprintf() return -1 instead of number of characters which would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
+# when attempting to read from an fopen'ed directory.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define NO_CURL if you do not have libcurl installed.  git-http-pull and
+# git-http-push are not built, and you cannot use http:// and https://
+# transports.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_EXPAT if you do not have expat installed.  git-http-push is
+# not built, and you cannot push using http:// and https:// transports.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRLCPY if you don't have strlcpy.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.  These tests take up a significant amount of the total test time
+# but are not needed unless you plan to talk to SVN repos.
+#
+# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
+# installed in /sw, but don't want GIT to link against any libraries
+# installed there.  If defined you may specify your own (or Fink's)
+# include directories and library directories by defining CFLAGS
+# and LDFLAGS appropriately.
+#
+# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
+# have DarwinPorts installed in /opt/local, but don't want GIT to
+# link against any libraries installed there.  If defined you may
+# specify your own (or DarwinPort's) include directories and
+# library directories by defining CFLAGS and LDFLAGS appropriately.
+#
+# Define PPC_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for PowerPC.
+#
+# Define ARM_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for ARM.
+#
+# Define MOZILLA_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
+# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
+# choice) has very fast version optimized for i586.
+#
+# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
+#
+# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
+#
+# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
+# Patrick Mauritz).
+#
+# Define NO_MMAP if you want to avoid mmap.
+#
+# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
+#
+# Define NO_PREAD if you have a problem with pread() system call (e.g.
+# cygwin.dll before v1.5.22).
+#
+# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
+# generally faster on your platform than accessing the working directory.
+#
+# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
+# the executable mode bit, but doesn't really do so.
+#
+# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
+#
+# Define NO_SOCKADDR_STORAGE if your platform does not have struct
+# sockaddr_storage.
+#
+# Define NO_ICONV if your libc does not properly support iconv.
+#
+# Define OLD_ICONV if your library has an old iconv(), where the second
+# (input buffer pointer) parameter is declared with type (const char **).
+#
+# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+#
+# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
+# that tells runtime paths to dynamic libraries;
+# "-Wl,-rpath=/path/lib" is used instead.
+#
+# Define USE_NSEC below if you want git to care about sub-second file mtimes
+# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
+# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
+# randomly break unless your underlying filesystem supports those sub-second
+# times (my ext3 doesn't).
+#
+# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
+# "st_ctim"
+#
+# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
+# available.  This automatically turns USE_NSEC off.
+#
+# Define USE_STDEV below if you want git to care about the underlying device
+# change being considered an inode change from the update-index perspective.
+#
+# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
+# field that counts the on-disk footprint in 512-byte blocks.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
+# MakeMaker (e.g. using ActiveState under Cygwin).
+#
+# Define NO_PERL if you do not want Perl scripts or libraries at all.
+#
+# Define NO_TCLTK if you do not want Tcl/Tk GUI.
+#
+# The TCL_PATH variable governs the location of the Tcl interpreter
+# used to optimize git-gui for your system.  Only used if NO_TCLTK
+# is not set.  Defaults to the bare 'tclsh'.
+#
+# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
+# If not set it defaults to the bare 'wish'. If it is set to the empty
+# string then NO_TCLTK will be forced (this is used by configure script).
+#
+# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit
+# parallel delta searching when packing objects.
+#
+# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
+# is a simplified version of the merge sort used in glibc. This is
+# recommended if Git triggers O(n^2) behavior in your platform's qsort().
+#
+# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call
+# your external grep (e.g., if your system lacks grep, if its grep is
+# broken, or spawning external process is slower than built-in grep git has).
 
-kerneltop: kerneltop.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt -o $@ $<
+GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE
+	@$(SHELL_PATH) ./GIT-VERSION-GEN
+-include GIT-VERSION-FILE
 
-perf-record: perf-record.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt -o $@ $<
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
+uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
+uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
+uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
+uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
-perf-report: perf-report.cc ../../include/linux/perf_counter.h
-	g++ -O6 -Wall -lrt -o $@ $<
+# CFLAGS and LDFLAGS are for the users to override from the command line.
 
-perfstat: kerneltop
-	ln -sf kerneltop perfstat
+CFLAGS = -g -O2 -Wall
+LDFLAGS =
+ALL_CFLAGS = $(CFLAGS)
+ALL_LDFLAGS = $(LDFLAGS)
+STRIP ?= strip
+
+# Among the variables below, these:
+#   gitexecdir
+#   template_dir
+#   mandir
+#   infodir
+#   htmldir
+#   ETC_GITCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "git" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+gitexecdir = libexec/git-core
+sharedir = $(prefix)/share
+template_dir = share/git-core/templates
+htmldir = share/doc/git-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_GITCONFIG = $(sysconfdir)/gitconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_GITCONFIG = etc/gitconfig
+endif
+lib = lib
+# DESTDIR=
+
+# default configuration for gitweb
+GITWEB_CONFIG = gitweb_config.perl
+GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf
+GITWEB_HOME_LINK_STR = projects
+GITWEB_SITENAME =
+GITWEB_PROJECTROOT = /pub/git
+GITWEB_PROJECT_MAXDEPTH = 2007
+GITWEB_EXPORT_OK =
+GITWEB_STRICT_EXPORT =
+GITWEB_BASE_URL =
+GITWEB_LIST =
+GITWEB_HOMETEXT = indextext.html
+GITWEB_CSS = gitweb.css
+GITWEB_LOGO = git-logo.png
+GITWEB_FAVICON = git-favicon.png
+GITWEB_SITE_HEADER =
+GITWEB_SITE_FOOTER =
+
+export prefix bindir sharedir sysconfdir
+
+CC = gcc
+AR = ar
+RM = rm -f
+TAR = tar
+FIND = find
+INSTALL = install
+RPMBUILD = rpmbuild
+TCL_PATH = tclsh
+TCLTK_PATH = wish
+PTHREAD_LIBS = -lpthread
+
+export TCL_PATH TCLTK_PATH
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours..
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+
+
+### --- END CONFIGURATION SECTION ---
+
+# Those must not be GNU-specific; they are shared with perl/ which may
+# be built by a different compiler. (Note that this is an artifact now
+# but it still might be nice to keep that distinction.)
+BASIC_CFLAGS =
+BASIC_LDFLAGS =
+
+# Guard against environment variables
+BUILTIN_OBJS =
+BUILT_INS =
+COMPAT_CFLAGS =
+COMPAT_OBJS =
+LIB_H =
+LIB_OBJS =
+PROGRAMS =
+SCRIPT_PERL =
+SCRIPT_SH =
+TEST_PROGRAMS =
+
+SCRIPT_SH += git-am.sh
+SCRIPT_SH += git-bisect.sh
+SCRIPT_SH += git-difftool--helper.sh
+SCRIPT_SH += git-filter-branch.sh
+SCRIPT_SH += git-lost-found.sh
+SCRIPT_SH += git-merge-octopus.sh
+SCRIPT_SH += git-merge-one-file.sh
+SCRIPT_SH += git-merge-resolve.sh
+SCRIPT_SH += git-mergetool.sh
+SCRIPT_SH += git-mergetool--lib.sh
+SCRIPT_SH += git-parse-remote.sh
+SCRIPT_SH += git-pull.sh
+SCRIPT_SH += git-quiltimport.sh
+SCRIPT_SH += git-rebase--interactive.sh
+SCRIPT_SH += git-rebase.sh
+SCRIPT_SH += git-repack.sh
+SCRIPT_SH += git-request-pull.sh
+SCRIPT_SH += git-sh-setup.sh
+SCRIPT_SH += git-stash.sh
+SCRIPT_SH += git-submodule.sh
+SCRIPT_SH += git-web--browse.sh
+
+SCRIPT_PERL += git-add--interactive.perl
+SCRIPT_PERL += git-difftool.perl
+SCRIPT_PERL += git-archimport.perl
+SCRIPT_PERL += git-cvsexportcommit.perl
+SCRIPT_PERL += git-cvsimport.perl
+SCRIPT_PERL += git-cvsserver.perl
+SCRIPT_PERL += git-relink.perl
+SCRIPT_PERL += git-send-email.perl
+SCRIPT_PERL += git-svn.perl
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
+	  $(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	  git-instaweb
+
+# Empty...
+EXTRA_PROGRAMS =
+
+# ... and all the rest that could be moved out of bindir to gitexecdir
+PROGRAMS += $(EXTRA_PROGRAMS)
+PROGRAMS += git-fast-import$X
+PROGRAMS += git-hash-object$X
+PROGRAMS += git-index-pack$X
+PROGRAMS += git-merge-index$X
+PROGRAMS += git-merge-tree$X
+PROGRAMS += git-mktag$X
+PROGRAMS += git-mktree$X
+PROGRAMS += git-pack-redundant$X
+PROGRAMS += git-patch-id$X
+PROGRAMS += git-shell$X
+PROGRAMS += git-show-index$X
+PROGRAMS += git-unpack-file$X
+PROGRAMS += git-update-server-info$X
+PROGRAMS += git-upload-pack$X
+PROGRAMS += git-var$X
+
+# List built-in command $C whose implementation cmd_$C() is not in
+# builtin-$C.o but is linked in as part of some other command.
+BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS))
+
+BUILT_INS += git-cherry$X
+BUILT_INS += git-cherry-pick$X
+BUILT_INS += git-format-patch$X
+BUILT_INS += git-fsck-objects$X
+BUILT_INS += git-get-tar-commit-id$X
+BUILT_INS += git-init$X
+BUILT_INS += git-merge-subtree$X
+BUILT_INS += git-peek-remote$X
+BUILT_INS += git-repo-config$X
+BUILT_INS += git-show$X
+BUILT_INS += git-stage$X
+BUILT_INS += git-status$X
+BUILT_INS += git-whatchanged$X
+
+# what 'all' will build and 'install' will install, in gitexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in gitexecdir
+OTHER_PROGRAMS = git$X
+ifndef NO_PERL
+OTHER_PROGRAMS += gitweb/gitweb.cgi
+endif
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+	SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+LIB_FILE=libgit.a
+XDIFF_LIB=xdiff/lib.a
+
+LIB_H += archive.h
+LIB_H += attr.h
+LIB_H += blob.h
+LIB_H += builtin.h
+LIB_H += cache.h
+LIB_H += cache-tree.h
+LIB_H += commit.h
+LIB_H += compat/cygwin.h
+LIB_H += compat/mingw.h
+LIB_H += csum-file.h
+LIB_H += decorate.h
+LIB_H += delta.h
+LIB_H += diffcore.h
+LIB_H += diff.h
+LIB_H += dir.h
+LIB_H += fsck.h
+LIB_H += git-compat-util.h
+LIB_H += graph.h
+LIB_H += grep.h
+LIB_H += hash.h
+LIB_H += help.h
+LIB_H += levenshtein.h
+LIB_H += list-objects.h
+LIB_H += ll-merge.h
+LIB_H += log-tree.h
+LIB_H += mailmap.h
+LIB_H += merge-recursive.h
+LIB_H += object.h
+LIB_H += pack.h
+LIB_H += pack-refs.h
+LIB_H += pack-revindex.h
+LIB_H += parse-options.h
+LIB_H += patch-ids.h
+LIB_H += pkt-line.h
+LIB_H += progress.h
+LIB_H += quote.h
+LIB_H += reflog-walk.h
+LIB_H += refs.h
+LIB_H += remote.h
+LIB_H += rerere.h
+LIB_H += revision.h
+LIB_H += run-command.h
+LIB_H += sha1-lookup.h
+LIB_H += sideband.h
+LIB_H += sigchain.h
+LIB_H += strbuf.h
+LIB_H += string-list.h
+LIB_H += tag.h
+LIB_H += transport.h
+LIB_H += tree.h
+LIB_H += tree-walk.h
+LIB_H += unpack-trees.h
+LIB_H += userdiff.h
+LIB_H += utf8.h
+LIB_H += wt-status.h
+
+LIB_OBJS += abspath.o
+LIB_OBJS += alias.o
+LIB_OBJS += alloc.o
+LIB_OBJS += archive.o
+LIB_OBJS += archive-tar.o
+LIB_OBJS += archive-zip.o
+LIB_OBJS += attr.o
+LIB_OBJS += base85.o
+LIB_OBJS += bisect.o
+LIB_OBJS += blob.o
+LIB_OBJS += branch.o
+LIB_OBJS += bundle.o
+LIB_OBJS += cache-tree.o
+LIB_OBJS += color.o
+LIB_OBJS += combine-diff.o
+LIB_OBJS += commit.o
+LIB_OBJS += config.o
+LIB_OBJS += connect.o
+LIB_OBJS += convert.o
+LIB_OBJS += copy.o
+LIB_OBJS += csum-file.o
+LIB_OBJS += ctype.o
+LIB_OBJS += date.o
+LIB_OBJS += decorate.o
+LIB_OBJS += diffcore-break.o
+LIB_OBJS += diffcore-delta.o
+LIB_OBJS += diffcore-order.o
+LIB_OBJS += diffcore-pickaxe.o
+LIB_OBJS += diffcore-rename.o
+LIB_OBJS += diff-delta.o
+LIB_OBJS += diff-lib.o
+LIB_OBJS += diff-no-index.o
+LIB_OBJS += diff.o
+LIB_OBJS += dir.o
+LIB_OBJS += editor.o
+LIB_OBJS += entry.o
+LIB_OBJS += environment.o
+LIB_OBJS += exec_cmd.o
+LIB_OBJS += fsck.o
+LIB_OBJS += graph.o
+LIB_OBJS += grep.o
+LIB_OBJS += hash.o
+LIB_OBJS += help.o
+LIB_OBJS += ident.o
+LIB_OBJS += levenshtein.o
+LIB_OBJS += list-objects.o
+LIB_OBJS += ll-merge.o
+LIB_OBJS += lockfile.o
+LIB_OBJS += log-tree.o
+LIB_OBJS += mailmap.o
+LIB_OBJS += match-trees.o
+LIB_OBJS += merge-file.o
+LIB_OBJS += merge-recursive.o
+LIB_OBJS += name-hash.o
+LIB_OBJS += object.o
+LIB_OBJS += pack-check.o
+LIB_OBJS += pack-refs.o
+LIB_OBJS += pack-revindex.o
+LIB_OBJS += pack-write.o
+LIB_OBJS += pager.o
+LIB_OBJS += parse-options.o
+LIB_OBJS += patch-delta.o
+LIB_OBJS += patch-ids.o
+LIB_OBJS += path.o
+LIB_OBJS += pkt-line.o
+LIB_OBJS += preload-index.o
+LIB_OBJS += pretty.o
+LIB_OBJS += progress.o
+LIB_OBJS += quote.o
+LIB_OBJS += reachable.o
+LIB_OBJS += read-cache.o
+LIB_OBJS += reflog-walk.o
+LIB_OBJS += refs.o
+LIB_OBJS += remote.o
+LIB_OBJS += rerere.o
+LIB_OBJS += revision.o
+LIB_OBJS += run-command.o
+LIB_OBJS += server-info.o
+LIB_OBJS += setup.o
+LIB_OBJS += sha1-lookup.o
+LIB_OBJS += sha1_file.o
+LIB_OBJS += sha1_name.o
+LIB_OBJS += shallow.o
+LIB_OBJS += sideband.o
+LIB_OBJS += sigchain.o
+LIB_OBJS += strbuf.o
+LIB_OBJS += string-list.o
+LIB_OBJS += symlinks.o
+LIB_OBJS += tag.o
+LIB_OBJS += trace.o
+LIB_OBJS += transport.o
+LIB_OBJS += tree-diff.o
+LIB_OBJS += tree.o
+LIB_OBJS += tree-walk.o
+LIB_OBJS += unpack-trees.o
+LIB_OBJS += usage.o
+LIB_OBJS += userdiff.o
+LIB_OBJS += utf8.o
+LIB_OBJS += walker.o
+LIB_OBJS += wrapper.o
+LIB_OBJS += write_or_die.o
+LIB_OBJS += ws.o
+LIB_OBJS += wt-status.o
+LIB_OBJS += xdiff-interface.o
+
+BUILTIN_OBJS += builtin-add.o
+BUILTIN_OBJS += builtin-annotate.o
+BUILTIN_OBJS += builtin-apply.o
+BUILTIN_OBJS += builtin-archive.o
+BUILTIN_OBJS += builtin-bisect--helper.o
+BUILTIN_OBJS += builtin-blame.o
+BUILTIN_OBJS += builtin-branch.o
+BUILTIN_OBJS += builtin-bundle.o
+BUILTIN_OBJS += builtin-cat-file.o
+BUILTIN_OBJS += builtin-check-attr.o
+BUILTIN_OBJS += builtin-check-ref-format.o
+BUILTIN_OBJS += builtin-checkout-index.o
+BUILTIN_OBJS += builtin-checkout.o
+BUILTIN_OBJS += builtin-clean.o
+BUILTIN_OBJS += builtin-clone.o
+BUILTIN_OBJS += builtin-commit-tree.o
+BUILTIN_OBJS += builtin-commit.o
+BUILTIN_OBJS += builtin-config.o
+BUILTIN_OBJS += builtin-count-objects.o
+BUILTIN_OBJS += builtin-describe.o
+BUILTIN_OBJS += builtin-diff-files.o
+BUILTIN_OBJS += builtin-diff-index.o
+BUILTIN_OBJS += builtin-diff-tree.o
+BUILTIN_OBJS += builtin-diff.o
+BUILTIN_OBJS += builtin-fast-export.o
+BUILTIN_OBJS += builtin-fetch--tool.o
+BUILTIN_OBJS += builtin-fetch-pack.o
+BUILTIN_OBJS += builtin-fetch.o
+BUILTIN_OBJS += builtin-fmt-merge-msg.o
+BUILTIN_OBJS += builtin-for-each-ref.o
+BUILTIN_OBJS += builtin-fsck.o
+BUILTIN_OBJS += builtin-gc.o
+BUILTIN_OBJS += builtin-grep.o
+BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-init-db.o
+BUILTIN_OBJS += builtin-log.o
+BUILTIN_OBJS += builtin-ls-files.o
+BUILTIN_OBJS += builtin-ls-remote.o
+BUILTIN_OBJS += builtin-ls-tree.o
+BUILTIN_OBJS += builtin-mailinfo.o
+BUILTIN_OBJS += builtin-mailsplit.o
+BUILTIN_OBJS += builtin-merge.o
+BUILTIN_OBJS += builtin-merge-base.o
+BUILTIN_OBJS += builtin-merge-file.o
+BUILTIN_OBJS += builtin-merge-ours.o
+BUILTIN_OBJS += builtin-merge-recursive.o
+BUILTIN_OBJS += builtin-mv.o
+BUILTIN_OBJS += builtin-name-rev.o
+BUILTIN_OBJS += builtin-pack-objects.o
+BUILTIN_OBJS += builtin-pack-refs.o
+BUILTIN_OBJS += builtin-prune-packed.o
+BUILTIN_OBJS += builtin-prune.o
+BUILTIN_OBJS += builtin-push.o
+BUILTIN_OBJS += builtin-read-tree.o
+BUILTIN_OBJS += builtin-receive-pack.o
+BUILTIN_OBJS += builtin-reflog.o
+BUILTIN_OBJS += builtin-remote.o
+BUILTIN_OBJS += builtin-rerere.o
+BUILTIN_OBJS += builtin-reset.o
+BUILTIN_OBJS += builtin-rev-list.o
+BUILTIN_OBJS += builtin-rev-parse.o
+BUILTIN_OBJS += builtin-revert.o
+BUILTIN_OBJS += builtin-rm.o
+BUILTIN_OBJS += builtin-send-pack.o
+BUILTIN_OBJS += builtin-shortlog.o
+BUILTIN_OBJS += builtin-show-branch.o
+BUILTIN_OBJS += builtin-show-ref.o
+BUILTIN_OBJS += builtin-stripspace.o
+BUILTIN_OBJS += builtin-symbolic-ref.o
+BUILTIN_OBJS += builtin-tag.o
+BUILTIN_OBJS += builtin-tar-tree.o
+BUILTIN_OBJS += builtin-unpack-objects.o
+BUILTIN_OBJS += builtin-update-index.o
+BUILTIN_OBJS += builtin-update-ref.o
+BUILTIN_OBJS += builtin-upload-archive.o
+BUILTIN_OBJS += builtin-verify-pack.o
+BUILTIN_OBJS += builtin-verify-tag.o
+BUILTIN_OBJS += builtin-write-tree.o
+
+GITLIBS = $(LIB_FILE) $(XDIFF_LIB)
+EXTLIBS =
+
+#
+# Platform specific tweaks
+#
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+ifeq ($(uname_S),Linux)
+	NO_STRLCPY = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),GNU/kFreeBSD)
+	NO_STRLCPY = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),UnixWare)
+	CC = cc
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	SHELL_PATH = /usr/local/bin/bash
+	NO_IPV6 = YesPlease
+	NO_HSTRERROR = YesPlease
+	BASIC_CFLAGS += -Kthread
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	INSTALL = ginstall
+	TAR = gtar
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+endif
+ifeq ($(uname_S),SCO_SV)
+	ifeq ($(uname_R),3.2)
+		CFLAGS = -O2
+	endif
+	ifeq ($(uname_R),5)
+		CC = cc
+		BASIC_CFLAGS += -Kthread
+	endif
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	SHELL_PATH = /usr/bin/bash
+	NO_IPV6 = YesPlease
+	NO_HSTRERROR = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	INSTALL = ginstall
+	TAR = gtar
+endif
+ifeq ($(uname_S),Darwin)
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2)
+		OLD_ICONV = UnfortunatelyYes
+	endif
+	ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2)
+		NO_STRLCPY = YesPlease
+	endif
+	NO_MEMMEM = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+	USE_ST_TIMESPEC = YesPlease
+endif
+ifeq ($(uname_S),SunOS)
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	SHELL_PATH = /bin/bash
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_HSTRERROR = YesPlease
+	NO_MKDTEMP = YesPlease
+	OLD_ICONV = UnfortunatelyYes
+	ifeq ($(uname_R),5.8)
+		NO_UNSETENV = YesPlease
+		NO_SETENV = YesPlease
+		NO_C99_FORMAT = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+	ifeq ($(uname_R),5.9)
+		NO_UNSETENV = YesPlease
+		NO_SETENV = YesPlease
+		NO_C99_FORMAT = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+	INSTALL = ginstall
+	TAR = gtar
+	BASIC_CFLAGS += -D__EXTENSIONS__
+endif
+ifeq ($(uname_O),Cygwin)
+	NO_D_TYPE_IN_DIRENT = YesPlease
+	NO_D_INO_IN_DIRENT = YesPlease
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_SYMLINK_HEAD = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes
+	NO_TRUSTABLE_FILEMODE = UnfortunatelyYes
+	OLD_ICONV = UnfortunatelyYes
+	# There are conflicting reports about this.
+	# On some boxes NO_MMAP is needed, and not so elsewhere.
+	# Try commenting this out if you suspect MMAP is more efficient
+	NO_MMAP = YesPlease
+	NO_IPV6 = YesPlease
+	X = .exe
+endif
+ifeq ($(uname_S),FreeBSD)
+	NEEDS_LIBICONV = YesPlease
+	NO_MEMMEM = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease
+	USE_ST_TIMESPEC = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+	ifeq ($(shell expr "$(uname_R)" : '4\.'),2)
+		PTHREAD_LIBS = -pthread
+		NO_UINTMAX_T = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+endif
+ifeq ($(uname_S),OpenBSD)
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),NetBSD)
+	ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2)
+		NEEDS_LIBICONV = YesPlease
+	endif
+	BASIC_CFLAGS += -I/usr/pkg/include
+	BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),AIX)
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_MKDTEMP = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_NSEC = YesPlease
+	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	INTERNAL_QSORT = UnfortunatelyYes
+	NEEDS_LIBICONV=YesPlease
+	BASIC_CFLAGS += -D_LARGE_FILES
+	ifneq ($(shell expr "$(uname_V)" : '[1234]'),1)
+		THREADED_DELTA_SEARCH = YesPlease
+	else
+		NO_PTHREADS = YesPlease
+	endif
+endif
+ifeq ($(uname_S),GNU)
+	# GNU/Hurd
+	NO_STRLCPY=YesPlease
+endif
+ifeq ($(uname_S),IRIX64)
+	NO_IPV6=YesPlease
+	NO_SETENV=YesPlease
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_SOCKADDR_STORAGE=YesPlease
+	SHELL_PATH=/usr/gnu/bin/bash
+	BASIC_CFLAGS += -DPATH_MAX=1024
+	# for now, build 32-bit version
+	BASIC_LDFLAGS += -L/usr/lib32
+endif
+ifeq ($(uname_S),HP-UX)
+	NO_IPV6=YesPlease
+	NO_SETENV=YesPlease
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_MKDTEMP = YesPlease
+	NO_UNSETENV = YesPlease
+	NO_HSTRERROR = YesPlease
+	NO_SYS_SELECT_H = YesPlease
+	SNPRINTF_RETURNS_BOGUS = YesPlease
+endif
+ifneq (,$(findstring CYGWIN,$(uname_S)))
+	COMPAT_OBJS += compat/cygwin.o
+endif
+ifneq (,$(findstring MINGW,$(uname_S)))
+	NO_PREAD = YesPlease
+	NO_OPENSSL = YesPlease
+	NO_CURL = YesPlease
+	NO_SYMLINK_HEAD = YesPlease
+	NO_IPV6 = YesPlease
+	NO_SETENV = YesPlease
+	NO_UNSETENV = YesPlease
+	NO_STRCASESTR = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_PTHREADS = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	OLD_ICONV = YesPlease
+	NO_C99_FORMAT = YesPlease
+	NO_STRTOUMAX = YesPlease
+	NO_MKDTEMP = YesPlease
+	SNPRINTF_RETURNS_BOGUS = YesPlease
+	NO_SVN_TESTS = YesPlease
+	NO_PERL_MAKEMAKER = YesPlease
+	RUNTIME_PREFIX = YesPlease
+	NO_POSIX_ONLY_PROGRAMS = YesPlease
+	NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
+	NO_NSEC = YesPlease
+	USE_WIN32_MMAP = YesPlease
+	COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch
+	COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1
+	COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\"
+	COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o
+	EXTLIBS += -lws2_32
+	X = .exe
+endif
+ifneq (,$(findstring arm,$(uname_M)))
+	ARM_SHA1 = YesPlease
+endif
+
+-include config.mak.autogen
+-include config.mak
+
+ifeq ($(uname_S),Darwin)
+	ifndef NO_FINK
+		ifeq ($(shell test -d /sw/lib && echo y),y)
+			BASIC_CFLAGS += -I/sw/include
+			BASIC_LDFLAGS += -L/sw/lib
+		endif
+	endif
+	ifndef NO_DARWIN_PORTS
+		ifeq ($(shell test -d /opt/local/lib && echo y),y)
+			BASIC_CFLAGS += -I/opt/local/include
+			BASIC_LDFLAGS += -L/opt/local/lib
+		endif
+	endif
+	PTHREAD_LIBS =
+endif
+
+ifndef CC_LD_DYNPATH
+	ifdef NO_R_TO_GCC_LINKER
+		# Some gcc does not accept and pass -R to the linker to specify
+		# the runtime dynamic library path.
+		CC_LD_DYNPATH = -Wl,-rpath,
+	else
+		CC_LD_DYNPATH = -R
+	endif
+endif
+
+ifdef NO_CURL
+	BASIC_CFLAGS += -DNO_CURL
+else
+	ifdef CURLDIR
+		# Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case.
+		BASIC_CFLAGS += -I$(CURLDIR)/include
+		CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl
+	else
+		CURL_LIBCURL = -lcurl
+	endif
+	BUILTIN_OBJS += builtin-http-fetch.o
+	EXTLIBS += $(CURL_LIBCURL)
+	LIB_OBJS += http.o http-walker.o
+	curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p)
+	ifeq "$(curl_check)" "070908"
+		ifndef NO_EXPAT
+			PROGRAMS += git-http-push$X
+		endif
+	endif
+	ifndef NO_EXPAT
+		ifdef EXPATDIR
+			BASIC_CFLAGS += -I$(EXPATDIR)/include
+			EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat
+		else
+			EXPAT_LIBEXPAT = -lexpat
+		endif
+	endif
+endif
+
+ifdef ZLIB_PATH
+	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
+	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
+endif
+EXTLIBS += -lz
+
+ifndef NO_POSIX_ONLY_PROGRAMS
+	PROGRAMS += git-daemon$X
+	PROGRAMS += git-imap-send$X
+endif
+ifndef NO_OPENSSL
+	OPENSSL_LIBSSL = -lssl
+	ifdef OPENSSLDIR
+		BASIC_CFLAGS += -I$(OPENSSLDIR)/include
+		OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib)
+	else
+		OPENSSL_LINK =
+	endif
+else
+	BASIC_CFLAGS += -DNO_OPENSSL
+	MOZILLA_SHA1 = 1
+	OPENSSL_LIBSSL =
+endif
+ifdef NEEDS_SSL_WITH_CRYPTO
+	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl
+else
+	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto
+endif
+ifdef NEEDS_LIBICONV
+	ifdef ICONVDIR
+		BASIC_CFLAGS += -I$(ICONVDIR)/include
+		ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib)
+	else
+		ICONV_LINK =
+	endif
+	EXTLIBS += $(ICONV_LINK) -liconv
+endif
+ifdef NEEDS_SOCKET
+	EXTLIBS += -lsocket
+endif
+ifdef NEEDS_NSL
+	EXTLIBS += -lnsl
+endif
+ifdef NO_D_TYPE_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
+endif
+ifdef NO_D_INO_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
+endif
+ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
+endif
+ifdef USE_NSEC
+	BASIC_CFLAGS += -DUSE_NSEC
+endif
+ifdef USE_ST_TIMESPEC
+	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
+endif
+ifdef NO_NSEC
+	BASIC_CFLAGS += -DNO_NSEC
+endif
+ifdef NO_C99_FORMAT
+	BASIC_CFLAGS += -DNO_C99_FORMAT
+endif
+ifdef SNPRINTF_RETURNS_BOGUS
+	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
+	COMPAT_OBJS += compat/snprintf.o
+endif
+ifdef FREAD_READS_DIRECTORIES
+	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
+	COMPAT_OBJS += compat/fopen.o
+endif
+ifdef NO_SYMLINK_HEAD
+	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
+endif
+ifdef NO_STRCASESTR
+	COMPAT_CFLAGS += -DNO_STRCASESTR
+	COMPAT_OBJS += compat/strcasestr.o
+endif
+ifdef NO_STRLCPY
+	COMPAT_CFLAGS += -DNO_STRLCPY
+	COMPAT_OBJS += compat/strlcpy.o
+endif
+ifdef NO_STRTOUMAX
+	COMPAT_CFLAGS += -DNO_STRTOUMAX
+	COMPAT_OBJS += compat/strtoumax.o
+endif
+ifdef NO_STRTOULL
+	COMPAT_CFLAGS += -DNO_STRTOULL
+endif
+ifdef NO_SETENV
+	COMPAT_CFLAGS += -DNO_SETENV
+	COMPAT_OBJS += compat/setenv.o
+endif
+ifdef NO_MKDTEMP
+	COMPAT_CFLAGS += -DNO_MKDTEMP
+	COMPAT_OBJS += compat/mkdtemp.o
+endif
+ifdef NO_UNSETENV
+	COMPAT_CFLAGS += -DNO_UNSETENV
+	COMPAT_OBJS += compat/unsetenv.o
+endif
+ifdef NO_SYS_SELECT_H
+	BASIC_CFLAGS += -DNO_SYS_SELECT_H
+endif
+ifdef NO_MMAP
+	COMPAT_CFLAGS += -DNO_MMAP
+	COMPAT_OBJS += compat/mmap.o
+else
+	ifdef USE_WIN32_MMAP
+		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
+		COMPAT_OBJS += compat/win32mmap.o
+	endif
+endif
+ifdef NO_PREAD
+	COMPAT_CFLAGS += -DNO_PREAD
+	COMPAT_OBJS += compat/pread.o
+endif
+ifdef NO_FAST_WORKING_DIRECTORY
+	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
+endif
+ifdef NO_TRUSTABLE_FILEMODE
+	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
+endif
+ifdef NO_IPV6
+	BASIC_CFLAGS += -DNO_IPV6
+endif
+ifdef NO_UINTMAX_T
+	BASIC_CFLAGS += -Duintmax_t=uint32_t
+endif
+ifdef NO_SOCKADDR_STORAGE
+ifdef NO_IPV6
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
+else
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
+endif
+endif
+ifdef NO_INET_NTOP
+	LIB_OBJS += compat/inet_ntop.o
+endif
+ifdef NO_INET_PTON
+	LIB_OBJS += compat/inet_pton.o
+endif
+
+ifdef NO_ICONV
+	BASIC_CFLAGS += -DNO_ICONV
+endif
+
+ifdef OLD_ICONV
+	BASIC_CFLAGS += -DOLD_ICONV
+endif
+
+ifdef NO_DEFLATE_BOUND
+	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
+endif
+
+ifdef PPC_SHA1
+	SHA1_HEADER = "ppc/sha1.h"
+	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
+ifdef ARM_SHA1
+	SHA1_HEADER = "arm/sha1.h"
+	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+else
+ifdef MOZILLA_SHA1
+	SHA1_HEADER = "mozilla-sha1/sha1.h"
+	LIB_OBJS += mozilla-sha1/sha1.o
+else
+	SHA1_HEADER = <openssl/sha.h>
+	EXTLIBS += $(LIB_4_CRYPTO)
+endif
+endif
+endif
+ifdef NO_PERL_MAKEMAKER
+	export NO_PERL_MAKEMAKER
+endif
+ifdef NO_HSTRERROR
+	COMPAT_CFLAGS += -DNO_HSTRERROR
+	COMPAT_OBJS += compat/hstrerror.o
+endif
+ifdef NO_MEMMEM
+	COMPAT_CFLAGS += -DNO_MEMMEM
+	COMPAT_OBJS += compat/memmem.o
+endif
+ifdef INTERNAL_QSORT
+	COMPAT_CFLAGS += -DINTERNAL_QSORT
+	COMPAT_OBJS += compat/qsort.o
+endif
+ifdef RUNTIME_PREFIX
+	COMPAT_CFLAGS += -DRUNTIME_PREFIX
+endif
+
+ifdef NO_PTHREADS
+	THREADED_DELTA_SEARCH =
+	BASIC_CFLAGS += -DNO_PTHREADS
+else
+	EXTLIBS += $(PTHREAD_LIBS)
+endif
+
+ifdef THREADED_DELTA_SEARCH
+	BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH
+	LIB_OBJS += thread-utils.o
+endif
+ifdef DIR_HAS_BSD_GROUP_SEMANTICS
+	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
+endif
+ifdef NO_EXTERNAL_GREP
+	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
+endif
+
+ifeq ($(TCLTK_PATH),)
+NO_TCLTK=NoThanks
+endif
+
+ifeq ($(PERL_PATH),)
+NO_PERL=NoThanks
+endif
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_CC       = @echo '   ' CC $@;
+	QUIET_AR       = @echo '   ' AR $@;
+	QUIET_LINK     = @echo '   ' LINK $@;
+	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
+	QUIET_GEN      = @echo '   ' GEN $@;
+	QUIET_SUBDIR0  = +@subdir=
+	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			 $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+	export QUIET_GEN
+	export QUIET_BUILT_IN
+endif
+endif
+
+ifdef ASCIIDOC8
+	export ASCIIDOC8
+endif
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+
+SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
+ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG))
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+gitexecdir_SQ = $(subst ','\'',$(gitexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
+TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
+
+LIBS = $(GITLIBS) $(EXTLIBS)
+
+BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
+	$(COMPAT_CFLAGS)
+LIB_OBJS += $(COMPAT_OBJS)
+
+ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_LDFLAGS += $(BASIC_LDFLAGS)
+
+export TAR INSTALL DESTDIR SHELL_PATH
+
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+endif
+
+all::
+ifndef NO_TCLTK
+	$(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all
+	$(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all
+endif
+ifndef NO_PERL
+	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
+endif
+	$(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1)
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+	@$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) git$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X
+
+git.o: git.c common-cmds.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \
+		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+		$(ALL_CFLAGS) -c $(filter %.c,$^)
+
+git$X: git.o $(BUILTIN_OBJS) $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \
+		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+
+builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+		'-DGIT_MAN_PATH="$(mandir_SQ)"' \
+		'-DGIT_INFO_PATH="$(infodir_SQ)"' $<
+
+$(BUILT_INS): git$X
+	$(QUIET_BUILT_IN)$(RM) $@ && \
+	ln git$X $@ 2>/dev/null || \
+	ln -s git$X $@ 2>/dev/null || \
+	cp git$X $@
+
+common-cmds.h: ./generate-cmdlist.sh command-list.txt
+
+common-cmds.h: $(wildcard Documentation/git-*.txt)
+	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    $@.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+ifndef NO_PERL
+$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak
+
+perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL
+	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F)
+
+$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \
+	sed -e '1{' \
+	    -e '	s|#!.*perl|#!$(PERL_PATH_SQ)|' \
+	    -e '	h' \
+	    -e '	s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \
+	    -e '	H' \
+	    -e '	x' \
+	    -e '}' \
+	    -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    $@.perl >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+gitweb/gitweb.cgi: gitweb/gitweb.perl
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \
+	    -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \
+	    -e 's|++GIT_BINDIR++|$(bindir)|g' \
+	    -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \
+	    -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \
+	    -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \
+	    -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \
+	    -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \
+	    -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \
+	    -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \
+	    -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \
+	    -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \
+	    -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \
+	    -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \
+	    -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \
+	    -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \
+	    -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \
+	    -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \
+	    -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \
+	    $< >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \
+	    -e '/@@GITWEB_CGI@@/d' \
+	    -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \
+	    -e '/@@GITWEB_CSS@@/d' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    $@.sh > $@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+else # NO_PERL
+$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \
+	    unimplemented.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+endif # NO_PERL
+
+configure: configure.ac
+	$(QUIET_GEN)$(RM) $@ $<+ && \
+	sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    $< > $<+ && \
+	autoconf -o $@ $<+ && \
+	$(RM) $<+
+
+# These can record GIT_VERSION
+git.o git.spec \
+	$(patsubst %.sh,%,$(SCRIPT_SH)) \
+	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	: GIT-VERSION-FILE
+
+%.o: %.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+%.s: %.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+%.o: %.S
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+
+exec_cmd.o: exec_cmd.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \
+		'-DBINDIR="$(bindir_relative_SQ)"' \
+		'-DPREFIX="$(prefix_SQ)"' \
+		$<
+
+builtin-init-db.o: builtin-init-db.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+
+config.o: config.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $<
+
+http.o: http.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $<
+
+ifdef NO_EXPAT
+http-walker.o: http-walker.c http.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $<
+endif
+
+git-%$X: %.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+git-imap-send$X: imap-send.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
+		$(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL)
+
+http.o http-walker.o http-push.o transport.o: http.h
+
+git-http-push$X: revision.o http.o http-push.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
+		$(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+builtin-revert.o wt-status.o: wt-status.h
+
+$(LIB_FILE): $(LIB_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
+	xdiff/xmerge.o xdiff/xpatience.o
+$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \
+	xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h
+
+$(XDIFF_LIB): $(XDIFF_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS)
+
+
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
+TAGS:
+	$(RM) TAGS
+	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+	$(RM) tags
+	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+	$(RM) cscope*
+	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
+             $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+GIT-CFLAGS: .FORCE-GIT-CFLAGS
+	@FLAGS='$(TRACK_CFLAGS)'; \
+	    if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \
+		echo 1>&2 "    * new build flags or prefix"; \
+		echo "$$FLAGS" >GIT-CFLAGS; \
+            fi
+
+# We need to apply sq twice, once to protect from the shell
+# that runs GIT-BUILD-OPTIONS, and then again to protect it
+# and the first level quoting from the shell that runs "echo".
+GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
+	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
+	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
+	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
+	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
+
+### Detect Tck/Tk interpreter path changes
+ifndef NO_TCLTK
+TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
+
+GIT-GUI-VARS: .FORCE-GIT-GUI-VARS
+	@VARS='$(TRACK_VARS)'; \
+	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
+		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
+		echo "$$VARS" >$@; \
+            fi
+
+.PHONY: .FORCE-GIT-GUI-VARS
+endif
+
+### Testing rules
+
+TEST_PROGRAMS += test-chmtime$X
+TEST_PROGRAMS += test-ctype$X
+TEST_PROGRAMS += test-date$X
+TEST_PROGRAMS += test-delta$X
+TEST_PROGRAMS += test-dump-cache-tree$X
+TEST_PROGRAMS += test-genrandom$X
+TEST_PROGRAMS += test-match-trees$X
+TEST_PROGRAMS += test-parse-options$X
+TEST_PROGRAMS += test-path-utils$X
+TEST_PROGRAMS += test-sha1$X
+TEST_PROGRAMS += test-sigchain$X
+
+all:: $(TEST_PROGRAMS)
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+export NO_SVN_TESTS
+
+test: all
+	$(MAKE) -C t/ all
+
+test-ctype$X: ctype.o
+
+test-date$X: date.o ctype.o
+
+test-delta$X: diff-delta.o patch-delta.o
+
+test-parse-options$X: parse-options.o
+
+.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
+
+test-%$X: test-%.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+check-sha1:: test-sha1$X
+	./test-sha1.sh
+
+check: common-cmds.h
+	if sparse; \
+	then \
+		for i in *.c; \
+		do \
+			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+		done; \
+	else \
+		echo 2>&1 "Did you mean 'make test'?"; \
+		exit 1; \
+	fi
+
+remove-dashes:
+	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
+
+### Installation rules
+
+ifneq ($(filter /%,$(firstword $(template_dir))),)
+template_instdir = $(template_dir)
+else
+template_instdir = $(prefix)/$(template_dir)
+endif
+export template_instdir
+
+ifneq ($(filter /%,$(firstword $(gitexecdir))),)
+gitexec_instdir = $(gitexecdir)
+else
+gitexec_instdir = $(prefix)/$(gitexecdir)
+endif
+gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir))
+export gitexec_instdir
+
+install: all
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
+	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
+	$(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
+	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
+ifndef NO_TCLTK
+	$(MAKE) -C gitk-git install
+	$(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install
+endif
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';)
+endif
+	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
+	execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \
+	{ $(RM) "$$execdir/git-add$X" && \
+		ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \
+		cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \
+	{ for p in $(filter-out git-add$X,$(BUILT_INS)); do \
+		$(RM) "$$execdir/$$p" && \
+		ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \
+		ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \
+		cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \
+	  done } && \
+	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X"
+
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
+
+### Maintainer's dist rules
+
+git.spec: git.spec.in
+	sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+
+	mv $@+ $@
+
+GIT_TARNAME=git-$(GIT_VERSION)
+dist: git.spec git-archive$(X) configure
+	./git-archive --format=tar \
+		--prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar
+	@mkdir -p $(GIT_TARNAME)
+	@cp git.spec configure $(GIT_TARNAME)
+	@echo $(GIT_VERSION) > $(GIT_TARNAME)/version
+	@$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version
+	$(TAR) rf $(GIT_TARNAME).tar \
+		$(GIT_TARNAME)/git.spec \
+		$(GIT_TARNAME)/configure \
+		$(GIT_TARNAME)/version \
+		$(GIT_TARNAME)/git-gui/version
+	@$(RM) -r $(GIT_TARNAME)
+	gzip -f -9 $(GIT_TARNAME).tar
+
+rpm: dist
+	$(RPMBUILD) -ta $(GIT_TARNAME).tar.gz
+
+htmldocs = git-htmldocs-$(GIT_VERSION)
+manpages = git-manpages-$(GIT_VERSION)
+dist-doc:
+	$(RM) -r .doc-tmp-dir
+	mkdir .doc-tmp-dir
+	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+	gzip -n -9 -f $(htmldocs).tar
+	:
+	$(RM) -r .doc-tmp-dir
+	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+	$(MAKE) -C Documentation DESTDIR=./ \
+		man1dir=../.doc-tmp-dir/man1 \
+		man5dir=../.doc-tmp-dir/man5 \
+		man7dir=../.doc-tmp-dir/man7 \
+		install
+	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+	gzip -n -9 -f $(manpages).tar
+	$(RM) -r .doc-tmp-dir
+
+### Cleaning rules
+
+distclean: clean
+	$(RM) configure
 
 clean:
-	rm $(BINS)
+	$(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \
+		$(LIB_FILE) $(XDIFF_LIB)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X
+	$(RM) $(TEST_PROGRAMS)
+	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+	$(RM) -r autom4te.cache
+	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
+	$(RM) -r $(GIT_TARNAME) .doc-tmp-dir
+	$(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz
+	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
+ifndef NO_PERL
+	$(RM) gitweb/gitweb.cgi
+	$(MAKE) -C perl clean
+endif
+	$(MAKE) -C templates/ clean
+	$(MAKE) -C t/ clean
+ifndef NO_TCLTK
+	$(MAKE) -C gitk-git clean
+	$(MAKE) -C git-gui clean
+endif
+	$(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
+
+.PHONY: all install clean strip
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS
+.PHONY: .FORCE-GIT-BUILD-OPTIONS
+
+### Check documentation
+#
+check-docs::
+	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \
+	do \
+		case "$$v" in \
+		git-merge-octopus | git-merge-ours | git-merge-recursive | \
+		git-merge-resolve | git-merge-subtree | \
+		git-fsck-objects | git-init-db | \
+		git-?*--?* ) continue ;; \
+		esac ; \
+		test -f "Documentation/$$v.txt" || \
+		echo "no doc: $$v"; \
+		sed -e '/^#/d' command-list.txt | \
+		grep -q "^$$v[ 	]" || \
+		case "$$v" in \
+		git) ;; \
+		*) echo "no link: $$v";; \
+		esac ; \
+	done; \
+	( \
+		sed -e '/^#/d' \
+		    -e 's/[ 	].*//' \
+		    -e 's/^/listed /' command-list.txt; \
+		ls -1 Documentation/git*txt | \
+		sed -e 's|Documentation/|documented |' \
+		    -e 's/\.txt//'; \
+	) | while read how cmd; \
+	do \
+		case "$$how,$$cmd" in \
+		*,git-citool | \
+		*,git-gui | \
+		*,git-help | \
+		documented,gitattributes | \
+		documented,gitignore | \
+		documented,gitmodules | \
+		documented,gitcli | \
+		documented,git-tools | \
+		documented,gitcore-tutorial | \
+		documented,gitcvs-migration | \
+		documented,gitdiffcore | \
+		documented,gitglossary | \
+		documented,githooks | \
+		documented,gitrepository-layout | \
+		documented,gittutorial | \
+		documented,gittutorial-2 | \
+		sentinel,not,matching,is,ok ) continue ;; \
+		esac; \
+		case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \
+		*" $$cmd "*)	;; \
+		*) echo "removed but $$how: $$cmd" ;; \
+		esac; \
+	done ) | sort
+
+### Make sure built-ins do not have dups and listed in git.c
+#
+check-builtins::
+	./check-builtins.sh
+
+### Test suite coverage testing
+#
+.PHONY: coverage coverage-clean coverage-build coverage-report
+
+coverage:
+	$(MAKE) coverage-build
+	$(MAKE) coverage-report
+
+coverage-clean:
+	rm -f *.gcda *.gcno
+
+COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+
+coverage-build: coverage-clean
+	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+		-j1 test
+
+coverage-report:
+	gcov -b *.c
+	grep '^function.*called 0 ' *.c.gcov \
+		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+		| tee coverage-untested-functions
-- 
cgit v0.10.2


From 0780060124011b94af55830939c86cc0916be0f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:00:56 +0200
Subject: perf_counter tools: add in basic glue from Git

First very raw version at having a central 'perf' command and
a list of subcommands:

  perf top
  perf stat
  perf record
  perf report
  ...

This is done by picking up Git's collection of utility functions,
and hacking them to build fine in this new environment.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
new file mode 100644
index 0000000..41c0b20
--- /dev/null
+++ b/Documentation/perf_counter/.gitignore
@@ -0,0 +1,179 @@
+GIT-BUILD-OPTIONS
+GIT-CFLAGS
+GIT-GUI-VARS
+GIT-VERSION-FILE
+git
+git-add
+git-add--interactive
+git-am
+git-annotate
+git-apply
+git-archimport
+git-archive
+git-bisect
+git-bisect--helper
+git-blame
+git-branch
+git-bundle
+git-cat-file
+git-check-attr
+git-check-ref-format
+git-checkout
+git-checkout-index
+git-cherry
+git-cherry-pick
+git-clean
+git-clone
+git-commit
+git-commit-tree
+git-config
+git-count-objects
+git-cvsexportcommit
+git-cvsimport
+git-cvsserver
+git-daemon
+git-diff
+git-diff-files
+git-diff-index
+git-diff-tree
+git-difftool
+git-difftool--helper
+git-describe
+git-fast-export
+git-fast-import
+git-fetch
+git-fetch--tool
+git-fetch-pack
+git-filter-branch
+git-fmt-merge-msg
+git-for-each-ref
+git-format-patch
+git-fsck
+git-fsck-objects
+git-gc
+git-get-tar-commit-id
+git-grep
+git-hash-object
+git-help
+git-http-fetch
+git-http-push
+git-imap-send
+git-index-pack
+git-init
+git-init-db
+git-instaweb
+git-log
+git-lost-found
+git-ls-files
+git-ls-remote
+git-ls-tree
+git-mailinfo
+git-mailsplit
+git-merge
+git-merge-base
+git-merge-index
+git-merge-file
+git-merge-tree
+git-merge-octopus
+git-merge-one-file
+git-merge-ours
+git-merge-recursive
+git-merge-resolve
+git-merge-subtree
+git-mergetool
+git-mergetool--lib
+git-mktag
+git-mktree
+git-name-rev
+git-mv
+git-pack-redundant
+git-pack-objects
+git-pack-refs
+git-parse-remote
+git-patch-id
+git-peek-remote
+git-prune
+git-prune-packed
+git-pull
+git-push
+git-quiltimport
+git-read-tree
+git-rebase
+git-rebase--interactive
+git-receive-pack
+git-reflog
+git-relink
+git-remote
+git-repack
+git-repo-config
+git-request-pull
+git-rerere
+git-reset
+git-rev-list
+git-rev-parse
+git-revert
+git-rm
+git-send-email
+git-send-pack
+git-sh-setup
+git-shell
+git-shortlog
+git-show
+git-show-branch
+git-show-index
+git-show-ref
+git-stage
+git-stash
+git-status
+git-stripspace
+git-submodule
+git-svn
+git-symbolic-ref
+git-tag
+git-tar-tree
+git-unpack-file
+git-unpack-objects
+git-update-index
+git-update-ref
+git-update-server-info
+git-upload-archive
+git-upload-pack
+git-var
+git-verify-pack
+git-verify-tag
+git-web--browse
+git-whatchanged
+git-write-tree
+git-core-*/?*
+gitk-wish
+gitweb/gitweb.cgi
+test-chmtime
+test-ctype
+test-date
+test-delta
+test-dump-cache-tree
+test-genrandom
+test-match-trees
+test-parse-options
+test-path-utils
+test-sha1
+test-sigchain
+common-cmds.h
+*.tar.gz
+*.dsc
+*.deb
+git.spec
+*.exe
+*.[aos]
+*.py[co]
+config.mak
+autom4te.cache
+config.cache
+config.log
+config.status
+config.mak.autogen
+config.mak.append
+configure
+tags
+TAGS
+cscope*
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 6e0838b..11809b9 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -13,16 +13,9 @@ all::
 # Define NO_OPENSSL environment variable if you do not have OpenSSL.
 # This also implies MOZILLA_SHA1.
 #
-# Define NO_CURL if you do not have libcurl installed.  git-http-pull and
-# git-http-push are not built, and you cannot use http:// and https://
-# transports.
-#
 # Define CURLDIR=/foo/bar if your curl header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
-# Define NO_EXPAT if you do not have expat installed.  git-http-push is
-# not built, and you cannot push using http:// and https:// transports.
-#
 # Define EXPATDIR=/foo/bar if your expat header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
@@ -40,8 +33,6 @@ all::
 #
 # Define NO_MEMMEM if you don't have memmem.
 #
-# Define NO_STRLCPY if you don't have strlcpy.
-#
 # Define NO_STRTOUMAX if you don't have strtoumax in the C library.
 # If your compiler also does not support long long or does not have
 # strtoull, define NO_STRTOULL.
@@ -54,7 +45,7 @@ all::
 #
 # Define NO_SYS_SELECT_H if you don't have sys/select.h.
 #
-# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link.
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
 # Enable it on Windows.  By default, symrefs are still used.
 #
 # Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
@@ -62,13 +53,13 @@ all::
 # but are not needed unless you plan to talk to SVN repos.
 #
 # Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
-# installed in /sw, but don't want GIT to link against any libraries
+# installed in /sw, but don't want PERF to link against any libraries
 # installed there.  If defined you may specify your own (or Fink's)
 # include directories and library directories by defining CFLAGS
 # and LDFLAGS appropriately.
 #
 # Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
-# have DarwinPorts installed in /opt/local, but don't want GIT to
+# have DarwinPorts installed in /opt/local, but don't want PERF to
 # link against any libraries installed there.  If defined you may
 # specify your own (or DarwinPort's) include directories and
 # library directories by defining CFLAGS and LDFLAGS appropriately.
@@ -120,7 +111,7 @@ all::
 # that tells runtime paths to dynamic libraries;
 # "-Wl,-rpath=/path/lib" is used instead.
 #
-# Define USE_NSEC below if you want git to care about sub-second file mtimes
+# Define USE_NSEC below if you want perf to care about sub-second file mtimes
 # and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
 # it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
 # randomly break unless your underlying filesystem supports those sub-second
@@ -132,7 +123,7 @@ all::
 # Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
 # available.  This automatically turns USE_NSEC off.
 #
-# Define USE_STDEV below if you want git to care about the underlying device
+# Define USE_STDEV below if you want perf to care about the underlying device
 # change being considered an inode change from the update-index perspective.
 #
 # Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
@@ -150,27 +141,24 @@ all::
 # Define NO_TCLTK if you do not want Tcl/Tk GUI.
 #
 # The TCL_PATH variable governs the location of the Tcl interpreter
-# used to optimize git-gui for your system.  Only used if NO_TCLTK
+# used to optimize perf-gui for your system.  Only used if NO_TCLTK
 # is not set.  Defaults to the bare 'tclsh'.
 #
 # The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
 # If not set it defaults to the bare 'wish'. If it is set to the empty
 # string then NO_TCLTK will be forced (this is used by configure script).
 #
-# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit
-# parallel delta searching when packing objects.
-#
 # Define INTERNAL_QSORT to use Git's implementation of qsort(), which
 # is a simplified version of the merge sort used in glibc. This is
 # recommended if Git triggers O(n^2) behavior in your platform's qsort().
 #
-# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call
+# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
 # your external grep (e.g., if your system lacks grep, if its grep is
-# broken, or spawning external process is slower than built-in grep git has).
+# broken, or spawning external process is slower than built-in grep perf has).
 
-GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE
-	@$(SHELL_PATH) ./GIT-VERSION-GEN
--include GIT-VERSION-FILE
+PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	@$(SHELL_PATH) ./PERF-VERSION-GEN
+-include PERF-VERSION-FILE
 
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
 uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
@@ -182,20 +170,20 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -g -O2 -Wall
-LDFLAGS =
+LDFLAGS = -lpthread -lrt
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
 
 # Among the variables below, these:
-#   gitexecdir
+#   perfexecdir
 #   template_dir
 #   mandir
 #   infodir
 #   htmldir
-#   ETC_GITCONFIG (but not sysconfdir)
+#   ETC_PERFCONFIG (but not sysconfdir)
 # can be specified as a relative path some/where/else;
-# this is interpreted as relative to $(prefix) and "git" at
+# this is interpreted as relative to $(prefix) and "perf" at
 # runtime figures out where they are based on the path to the executable.
 # This can help installing the suite in a relocatable way.
 
@@ -204,38 +192,20 @@ bindir_relative = bin
 bindir = $(prefix)/$(bindir_relative)
 mandir = share/man
 infodir = share/info
-gitexecdir = libexec/git-core
+perfexecdir = libexec/perf-core
 sharedir = $(prefix)/share
-template_dir = share/git-core/templates
-htmldir = share/doc/git-doc
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
 ifeq ($(prefix),/usr)
 sysconfdir = /etc
-ETC_GITCONFIG = $(sysconfdir)/gitconfig
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
 else
 sysconfdir = $(prefix)/etc
-ETC_GITCONFIG = etc/gitconfig
+ETC_PERFCONFIG = etc/perfconfig
 endif
 lib = lib
 # DESTDIR=
 
-# default configuration for gitweb
-GITWEB_CONFIG = gitweb_config.perl
-GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf
-GITWEB_HOME_LINK_STR = projects
-GITWEB_SITENAME =
-GITWEB_PROJECTROOT = /pub/git
-GITWEB_PROJECT_MAXDEPTH = 2007
-GITWEB_EXPORT_OK =
-GITWEB_STRICT_EXPORT =
-GITWEB_BASE_URL =
-GITWEB_LIST =
-GITWEB_HOMETEXT = indextext.html
-GITWEB_CSS = gitweb.css
-GITWEB_LOGO = git-logo.png
-GITWEB_FAVICON = git-favicon.png
-GITWEB_SITE_HEADER =
-GITWEB_SITE_FOOTER =
-
 export prefix bindir sharedir sysconfdir
 
 CC = gcc
@@ -277,89 +247,46 @@ SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
 
-SCRIPT_SH += git-am.sh
-SCRIPT_SH += git-bisect.sh
-SCRIPT_SH += git-difftool--helper.sh
-SCRIPT_SH += git-filter-branch.sh
-SCRIPT_SH += git-lost-found.sh
-SCRIPT_SH += git-merge-octopus.sh
-SCRIPT_SH += git-merge-one-file.sh
-SCRIPT_SH += git-merge-resolve.sh
-SCRIPT_SH += git-mergetool.sh
-SCRIPT_SH += git-mergetool--lib.sh
-SCRIPT_SH += git-parse-remote.sh
-SCRIPT_SH += git-pull.sh
-SCRIPT_SH += git-quiltimport.sh
-SCRIPT_SH += git-rebase--interactive.sh
-SCRIPT_SH += git-rebase.sh
-SCRIPT_SH += git-repack.sh
-SCRIPT_SH += git-request-pull.sh
-SCRIPT_SH += git-sh-setup.sh
-SCRIPT_SH += git-stash.sh
-SCRIPT_SH += git-submodule.sh
-SCRIPT_SH += git-web--browse.sh
-
-SCRIPT_PERL += git-add--interactive.perl
-SCRIPT_PERL += git-difftool.perl
-SCRIPT_PERL += git-archimport.perl
-SCRIPT_PERL += git-cvsexportcommit.perl
-SCRIPT_PERL += git-cvsimport.perl
-SCRIPT_PERL += git-cvsserver.perl
-SCRIPT_PERL += git-relink.perl
-SCRIPT_PERL += git-send-email.perl
-SCRIPT_PERL += git-svn.perl
+#
+# No scripts right now:
+#
+
+# SCRIPT_SH += perf-am.sh
+
+#
+# No Perl scripts right now:
+#
+
+# SCRIPT_PERL += perf-add--interactive.perl
 
 SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
-	  $(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	  git-instaweb
+	  $(patsubst %.perl,%,$(SCRIPT_PERL))
 
 # Empty...
 EXTRA_PROGRAMS =
 
-# ... and all the rest that could be moved out of bindir to gitexecdir
+# ... and all the rest that could be moved out of bindir to perfexecdir
 PROGRAMS += $(EXTRA_PROGRAMS)
-PROGRAMS += git-fast-import$X
-PROGRAMS += git-hash-object$X
-PROGRAMS += git-index-pack$X
-PROGRAMS += git-merge-index$X
-PROGRAMS += git-merge-tree$X
-PROGRAMS += git-mktag$X
-PROGRAMS += git-mktree$X
-PROGRAMS += git-pack-redundant$X
-PROGRAMS += git-patch-id$X
-PROGRAMS += git-shell$X
-PROGRAMS += git-show-index$X
-PROGRAMS += git-unpack-file$X
-PROGRAMS += git-update-server-info$X
-PROGRAMS += git-upload-pack$X
-PROGRAMS += git-var$X
+
+#
+# None right now:
+#
+# PROGRAMS += perf-fast-import$X
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
-BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS))
-
-BUILT_INS += git-cherry$X
-BUILT_INS += git-cherry-pick$X
-BUILT_INS += git-format-patch$X
-BUILT_INS += git-fsck-objects$X
-BUILT_INS += git-get-tar-commit-id$X
-BUILT_INS += git-init$X
-BUILT_INS += git-merge-subtree$X
-BUILT_INS += git-peek-remote$X
-BUILT_INS += git-repo-config$X
-BUILT_INS += git-show$X
-BUILT_INS += git-stage$X
-BUILT_INS += git-status$X
-BUILT_INS += git-whatchanged$X
-
-# what 'all' will build and 'install' will install, in gitexecdir
+BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
+
+#
+# None right now:
+#
+# BUILT_INS += perf-init $X
+
+# what 'all' will build and 'install' will install, in perfexecdir
 ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
-# what 'all' will build but not install in gitexecdir
-OTHER_PROGRAMS = git$X
-ifndef NO_PERL
-OTHER_PROGRAMS += gitweb/gitweb.cgi
-endif
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = perf$X
 
 # Set paths to tools early so that they can be used for version tests.
 ifndef SHELL_PATH
@@ -371,250 +298,34 @@ endif
 
 export PERL_PATH
 
-LIB_FILE=libgit.a
-XDIFF_LIB=xdiff/lib.a
-
-LIB_H += archive.h
-LIB_H += attr.h
-LIB_H += blob.h
-LIB_H += builtin.h
-LIB_H += cache.h
-LIB_H += cache-tree.h
-LIB_H += commit.h
-LIB_H += compat/cygwin.h
-LIB_H += compat/mingw.h
-LIB_H += csum-file.h
-LIB_H += decorate.h
-LIB_H += delta.h
-LIB_H += diffcore.h
-LIB_H += diff.h
-LIB_H += dir.h
-LIB_H += fsck.h
-LIB_H += git-compat-util.h
-LIB_H += graph.h
-LIB_H += grep.h
-LIB_H += hash.h
-LIB_H += help.h
+LIB_FILE=libperf.a
+
+LIB_H += ../../include/linux/perf_counter.h
 LIB_H += levenshtein.h
-LIB_H += list-objects.h
-LIB_H += ll-merge.h
-LIB_H += log-tree.h
-LIB_H += mailmap.h
-LIB_H += merge-recursive.h
-LIB_H += object.h
-LIB_H += pack.h
-LIB_H += pack-refs.h
-LIB_H += pack-revindex.h
 LIB_H += parse-options.h
-LIB_H += patch-ids.h
-LIB_H += pkt-line.h
-LIB_H += progress.h
 LIB_H += quote.h
-LIB_H += reflog-walk.h
-LIB_H += refs.h
-LIB_H += remote.h
-LIB_H += rerere.h
-LIB_H += revision.h
-LIB_H += run-command.h
-LIB_H += sha1-lookup.h
-LIB_H += sideband.h
-LIB_H += sigchain.h
 LIB_H += strbuf.h
-LIB_H += string-list.h
-LIB_H += tag.h
-LIB_H += transport.h
-LIB_H += tree.h
-LIB_H += tree-walk.h
-LIB_H += unpack-trees.h
-LIB_H += userdiff.h
-LIB_H += utf8.h
-LIB_H += wt-status.h
+LIB_H += run-command.h
 
 LIB_OBJS += abspath.o
 LIB_OBJS += alias.o
-LIB_OBJS += alloc.o
-LIB_OBJS += archive.o
-LIB_OBJS += archive-tar.o
-LIB_OBJS += archive-zip.o
-LIB_OBJS += attr.o
-LIB_OBJS += base85.o
-LIB_OBJS += bisect.o
-LIB_OBJS += blob.o
-LIB_OBJS += branch.o
-LIB_OBJS += bundle.o
-LIB_OBJS += cache-tree.o
-LIB_OBJS += color.o
-LIB_OBJS += combine-diff.o
-LIB_OBJS += commit.o
 LIB_OBJS += config.o
-LIB_OBJS += connect.o
-LIB_OBJS += convert.o
-LIB_OBJS += copy.o
-LIB_OBJS += csum-file.o
 LIB_OBJS += ctype.o
-LIB_OBJS += date.o
-LIB_OBJS += decorate.o
-LIB_OBJS += diffcore-break.o
-LIB_OBJS += diffcore-delta.o
-LIB_OBJS += diffcore-order.o
-LIB_OBJS += diffcore-pickaxe.o
-LIB_OBJS += diffcore-rename.o
-LIB_OBJS += diff-delta.o
-LIB_OBJS += diff-lib.o
-LIB_OBJS += diff-no-index.o
-LIB_OBJS += diff.o
-LIB_OBJS += dir.o
-LIB_OBJS += editor.o
-LIB_OBJS += entry.o
-LIB_OBJS += environment.o
 LIB_OBJS += exec_cmd.o
-LIB_OBJS += fsck.o
-LIB_OBJS += graph.o
-LIB_OBJS += grep.o
-LIB_OBJS += hash.o
 LIB_OBJS += help.o
-LIB_OBJS += ident.o
 LIB_OBJS += levenshtein.o
-LIB_OBJS += list-objects.o
-LIB_OBJS += ll-merge.o
-LIB_OBJS += lockfile.o
-LIB_OBJS += log-tree.o
-LIB_OBJS += mailmap.o
-LIB_OBJS += match-trees.o
-LIB_OBJS += merge-file.o
-LIB_OBJS += merge-recursive.o
-LIB_OBJS += name-hash.o
-LIB_OBJS += object.o
-LIB_OBJS += pack-check.o
-LIB_OBJS += pack-refs.o
-LIB_OBJS += pack-revindex.o
-LIB_OBJS += pack-write.o
-LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
-LIB_OBJS += patch-delta.o
-LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
-LIB_OBJS += pkt-line.o
-LIB_OBJS += preload-index.o
-LIB_OBJS += pretty.o
-LIB_OBJS += progress.o
-LIB_OBJS += quote.o
-LIB_OBJS += reachable.o
-LIB_OBJS += read-cache.o
-LIB_OBJS += reflog-walk.o
-LIB_OBJS += refs.o
-LIB_OBJS += remote.o
-LIB_OBJS += rerere.o
-LIB_OBJS += revision.o
 LIB_OBJS += run-command.o
-LIB_OBJS += server-info.o
-LIB_OBJS += setup.o
-LIB_OBJS += sha1-lookup.o
-LIB_OBJS += sha1_file.o
-LIB_OBJS += sha1_name.o
-LIB_OBJS += shallow.o
-LIB_OBJS += sideband.o
-LIB_OBJS += sigchain.o
+LIB_OBJS += quote.o
 LIB_OBJS += strbuf.o
-LIB_OBJS += string-list.o
-LIB_OBJS += symlinks.o
-LIB_OBJS += tag.o
-LIB_OBJS += trace.o
-LIB_OBJS += transport.o
-LIB_OBJS += tree-diff.o
-LIB_OBJS += tree.o
-LIB_OBJS += tree-walk.o
-LIB_OBJS += unpack-trees.o
 LIB_OBJS += usage.o
-LIB_OBJS += userdiff.o
-LIB_OBJS += utf8.o
-LIB_OBJS += walker.o
 LIB_OBJS += wrapper.o
-LIB_OBJS += write_or_die.o
-LIB_OBJS += ws.o
-LIB_OBJS += wt-status.o
-LIB_OBJS += xdiff-interface.o
-
-BUILTIN_OBJS += builtin-add.o
-BUILTIN_OBJS += builtin-annotate.o
-BUILTIN_OBJS += builtin-apply.o
-BUILTIN_OBJS += builtin-archive.o
-BUILTIN_OBJS += builtin-bisect--helper.o
-BUILTIN_OBJS += builtin-blame.o
-BUILTIN_OBJS += builtin-branch.o
-BUILTIN_OBJS += builtin-bundle.o
-BUILTIN_OBJS += builtin-cat-file.o
-BUILTIN_OBJS += builtin-check-attr.o
-BUILTIN_OBJS += builtin-check-ref-format.o
-BUILTIN_OBJS += builtin-checkout-index.o
-BUILTIN_OBJS += builtin-checkout.o
-BUILTIN_OBJS += builtin-clean.o
-BUILTIN_OBJS += builtin-clone.o
-BUILTIN_OBJS += builtin-commit-tree.o
-BUILTIN_OBJS += builtin-commit.o
-BUILTIN_OBJS += builtin-config.o
-BUILTIN_OBJS += builtin-count-objects.o
-BUILTIN_OBJS += builtin-describe.o
-BUILTIN_OBJS += builtin-diff-files.o
-BUILTIN_OBJS += builtin-diff-index.o
-BUILTIN_OBJS += builtin-diff-tree.o
-BUILTIN_OBJS += builtin-diff.o
-BUILTIN_OBJS += builtin-fast-export.o
-BUILTIN_OBJS += builtin-fetch--tool.o
-BUILTIN_OBJS += builtin-fetch-pack.o
-BUILTIN_OBJS += builtin-fetch.o
-BUILTIN_OBJS += builtin-fmt-merge-msg.o
-BUILTIN_OBJS += builtin-for-each-ref.o
-BUILTIN_OBJS += builtin-fsck.o
-BUILTIN_OBJS += builtin-gc.o
-BUILTIN_OBJS += builtin-grep.o
+
 BUILTIN_OBJS += builtin-help.o
-BUILTIN_OBJS += builtin-init-db.o
-BUILTIN_OBJS += builtin-log.o
-BUILTIN_OBJS += builtin-ls-files.o
-BUILTIN_OBJS += builtin-ls-remote.o
-BUILTIN_OBJS += builtin-ls-tree.o
-BUILTIN_OBJS += builtin-mailinfo.o
-BUILTIN_OBJS += builtin-mailsplit.o
-BUILTIN_OBJS += builtin-merge.o
-BUILTIN_OBJS += builtin-merge-base.o
-BUILTIN_OBJS += builtin-merge-file.o
-BUILTIN_OBJS += builtin-merge-ours.o
-BUILTIN_OBJS += builtin-merge-recursive.o
-BUILTIN_OBJS += builtin-mv.o
-BUILTIN_OBJS += builtin-name-rev.o
-BUILTIN_OBJS += builtin-pack-objects.o
-BUILTIN_OBJS += builtin-pack-refs.o
-BUILTIN_OBJS += builtin-prune-packed.o
-BUILTIN_OBJS += builtin-prune.o
-BUILTIN_OBJS += builtin-push.o
-BUILTIN_OBJS += builtin-read-tree.o
-BUILTIN_OBJS += builtin-receive-pack.o
-BUILTIN_OBJS += builtin-reflog.o
-BUILTIN_OBJS += builtin-remote.o
-BUILTIN_OBJS += builtin-rerere.o
-BUILTIN_OBJS += builtin-reset.o
-BUILTIN_OBJS += builtin-rev-list.o
-BUILTIN_OBJS += builtin-rev-parse.o
-BUILTIN_OBJS += builtin-revert.o
-BUILTIN_OBJS += builtin-rm.o
-BUILTIN_OBJS += builtin-send-pack.o
-BUILTIN_OBJS += builtin-shortlog.o
-BUILTIN_OBJS += builtin-show-branch.o
-BUILTIN_OBJS += builtin-show-ref.o
-BUILTIN_OBJS += builtin-stripspace.o
-BUILTIN_OBJS += builtin-symbolic-ref.o
-BUILTIN_OBJS += builtin-tag.o
-BUILTIN_OBJS += builtin-tar-tree.o
-BUILTIN_OBJS += builtin-unpack-objects.o
-BUILTIN_OBJS += builtin-update-index.o
-BUILTIN_OBJS += builtin-update-ref.o
-BUILTIN_OBJS += builtin-upload-archive.o
-BUILTIN_OBJS += builtin-verify-pack.o
-BUILTIN_OBJS += builtin-verify-tag.o
-BUILTIN_OBJS += builtin-write-tree.o
-
-GITLIBS = $(LIB_FILE) $(XDIFF_LIB)
+BUILTIN_OBJS += builtin-top.o
+
+PERFLIBS = $(LIB_FILE)
 EXTLIBS =
 
 #
@@ -625,221 +336,6 @@ EXTLIBS =
 # because maintaining the nesting to match is a pain.  If
 # we had "elif" things would have been much nicer...
 
-ifeq ($(uname_S),Linux)
-	NO_STRLCPY = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),GNU/kFreeBSD)
-	NO_STRLCPY = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),UnixWare)
-	CC = cc
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	SHELL_PATH = /usr/local/bin/bash
-	NO_IPV6 = YesPlease
-	NO_HSTRERROR = YesPlease
-	BASIC_CFLAGS += -Kthread
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	INSTALL = ginstall
-	TAR = gtar
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-endif
-ifeq ($(uname_S),SCO_SV)
-	ifeq ($(uname_R),3.2)
-		CFLAGS = -O2
-	endif
-	ifeq ($(uname_R),5)
-		CC = cc
-		BASIC_CFLAGS += -Kthread
-	endif
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	SHELL_PATH = /usr/bin/bash
-	NO_IPV6 = YesPlease
-	NO_HSTRERROR = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	INSTALL = ginstall
-	TAR = gtar
-endif
-ifeq ($(uname_S),Darwin)
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2)
-		OLD_ICONV = UnfortunatelyYes
-	endif
-	ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2)
-		NO_STRLCPY = YesPlease
-	endif
-	NO_MEMMEM = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-	USE_ST_TIMESPEC = YesPlease
-endif
-ifeq ($(uname_S),SunOS)
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	SHELL_PATH = /bin/bash
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_HSTRERROR = YesPlease
-	NO_MKDTEMP = YesPlease
-	OLD_ICONV = UnfortunatelyYes
-	ifeq ($(uname_R),5.8)
-		NO_UNSETENV = YesPlease
-		NO_SETENV = YesPlease
-		NO_C99_FORMAT = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-	ifeq ($(uname_R),5.9)
-		NO_UNSETENV = YesPlease
-		NO_SETENV = YesPlease
-		NO_C99_FORMAT = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-	INSTALL = ginstall
-	TAR = gtar
-	BASIC_CFLAGS += -D__EXTENSIONS__
-endif
-ifeq ($(uname_O),Cygwin)
-	NO_D_TYPE_IN_DIRENT = YesPlease
-	NO_D_INO_IN_DIRENT = YesPlease
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_SYMLINK_HEAD = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes
-	NO_TRUSTABLE_FILEMODE = UnfortunatelyYes
-	OLD_ICONV = UnfortunatelyYes
-	# There are conflicting reports about this.
-	# On some boxes NO_MMAP is needed, and not so elsewhere.
-	# Try commenting this out if you suspect MMAP is more efficient
-	NO_MMAP = YesPlease
-	NO_IPV6 = YesPlease
-	X = .exe
-endif
-ifeq ($(uname_S),FreeBSD)
-	NEEDS_LIBICONV = YesPlease
-	NO_MEMMEM = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease
-	USE_ST_TIMESPEC = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-	ifeq ($(shell expr "$(uname_R)" : '4\.'),2)
-		PTHREAD_LIBS = -pthread
-		NO_UINTMAX_T = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-endif
-ifeq ($(uname_S),OpenBSD)
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),NetBSD)
-	ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2)
-		NEEDS_LIBICONV = YesPlease
-	endif
-	BASIC_CFLAGS += -I/usr/pkg/include
-	BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),AIX)
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_MKDTEMP = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_NSEC = YesPlease
-	FREAD_READS_DIRECTORIES = UnfortunatelyYes
-	INTERNAL_QSORT = UnfortunatelyYes
-	NEEDS_LIBICONV=YesPlease
-	BASIC_CFLAGS += -D_LARGE_FILES
-	ifneq ($(shell expr "$(uname_V)" : '[1234]'),1)
-		THREADED_DELTA_SEARCH = YesPlease
-	else
-		NO_PTHREADS = YesPlease
-	endif
-endif
-ifeq ($(uname_S),GNU)
-	# GNU/Hurd
-	NO_STRLCPY=YesPlease
-endif
-ifeq ($(uname_S),IRIX64)
-	NO_IPV6=YesPlease
-	NO_SETENV=YesPlease
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_SOCKADDR_STORAGE=YesPlease
-	SHELL_PATH=/usr/gnu/bin/bash
-	BASIC_CFLAGS += -DPATH_MAX=1024
-	# for now, build 32-bit version
-	BASIC_LDFLAGS += -L/usr/lib32
-endif
-ifeq ($(uname_S),HP-UX)
-	NO_IPV6=YesPlease
-	NO_SETENV=YesPlease
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_MKDTEMP = YesPlease
-	NO_UNSETENV = YesPlease
-	NO_HSTRERROR = YesPlease
-	NO_SYS_SELECT_H = YesPlease
-	SNPRINTF_RETURNS_BOGUS = YesPlease
-endif
-ifneq (,$(findstring CYGWIN,$(uname_S)))
-	COMPAT_OBJS += compat/cygwin.o
-endif
-ifneq (,$(findstring MINGW,$(uname_S)))
-	NO_PREAD = YesPlease
-	NO_OPENSSL = YesPlease
-	NO_CURL = YesPlease
-	NO_SYMLINK_HEAD = YesPlease
-	NO_IPV6 = YesPlease
-	NO_SETENV = YesPlease
-	NO_UNSETENV = YesPlease
-	NO_STRCASESTR = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_PTHREADS = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	OLD_ICONV = YesPlease
-	NO_C99_FORMAT = YesPlease
-	NO_STRTOUMAX = YesPlease
-	NO_MKDTEMP = YesPlease
-	SNPRINTF_RETURNS_BOGUS = YesPlease
-	NO_SVN_TESTS = YesPlease
-	NO_PERL_MAKEMAKER = YesPlease
-	RUNTIME_PREFIX = YesPlease
-	NO_POSIX_ONLY_PROGRAMS = YesPlease
-	NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
-	NO_NSEC = YesPlease
-	USE_WIN32_MMAP = YesPlease
-	COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch
-	COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1
-	COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\"
-	COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o
-	EXTLIBS += -lws2_32
-	X = .exe
-endif
-ifneq (,$(findstring arm,$(uname_M)))
-	ARM_SHA1 = YesPlease
-endif
-
 -include config.mak.autogen
 -include config.mak
 
@@ -869,72 +365,12 @@ ifndef CC_LD_DYNPATH
 	endif
 endif
 
-ifdef NO_CURL
-	BASIC_CFLAGS += -DNO_CURL
-else
-	ifdef CURLDIR
-		# Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case.
-		BASIC_CFLAGS += -I$(CURLDIR)/include
-		CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl
-	else
-		CURL_LIBCURL = -lcurl
-	endif
-	BUILTIN_OBJS += builtin-http-fetch.o
-	EXTLIBS += $(CURL_LIBCURL)
-	LIB_OBJS += http.o http-walker.o
-	curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p)
-	ifeq "$(curl_check)" "070908"
-		ifndef NO_EXPAT
-			PROGRAMS += git-http-push$X
-		endif
-	endif
-	ifndef NO_EXPAT
-		ifdef EXPATDIR
-			BASIC_CFLAGS += -I$(EXPATDIR)/include
-			EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat
-		else
-			EXPAT_LIBEXPAT = -lexpat
-		endif
-	endif
-endif
-
 ifdef ZLIB_PATH
 	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
 	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
 endif
 EXTLIBS += -lz
 
-ifndef NO_POSIX_ONLY_PROGRAMS
-	PROGRAMS += git-daemon$X
-	PROGRAMS += git-imap-send$X
-endif
-ifndef NO_OPENSSL
-	OPENSSL_LIBSSL = -lssl
-	ifdef OPENSSLDIR
-		BASIC_CFLAGS += -I$(OPENSSLDIR)/include
-		OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib)
-	else
-		OPENSSL_LINK =
-	endif
-else
-	BASIC_CFLAGS += -DNO_OPENSSL
-	MOZILLA_SHA1 = 1
-	OPENSSL_LIBSSL =
-endif
-ifdef NEEDS_SSL_WITH_CRYPTO
-	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl
-else
-	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto
-endif
-ifdef NEEDS_LIBICONV
-	ifdef ICONVDIR
-		BASIC_CFLAGS += -I$(ICONVDIR)/include
-		ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib)
-	else
-		ICONV_LINK =
-	endif
-	EXTLIBS += $(ICONV_LINK) -liconv
-endif
 ifdef NEEDS_SOCKET
 	EXTLIBS += -lsocket
 endif
@@ -977,10 +413,6 @@ ifdef NO_STRCASESTR
 	COMPAT_CFLAGS += -DNO_STRCASESTR
 	COMPAT_OBJS += compat/strcasestr.o
 endif
-ifdef NO_STRLCPY
-	COMPAT_CFLAGS += -DNO_STRLCPY
-	COMPAT_OBJS += compat/strlcpy.o
-endif
 ifdef NO_STRTOUMAX
 	COMPAT_CFLAGS += -DNO_STRTOUMAX
 	COMPAT_OBJS += compat/strtoumax.o
@@ -1090,17 +522,6 @@ ifdef RUNTIME_PREFIX
 	COMPAT_CFLAGS += -DRUNTIME_PREFIX
 endif
 
-ifdef NO_PTHREADS
-	THREADED_DELTA_SEARCH =
-	BASIC_CFLAGS += -DNO_PTHREADS
-else
-	EXTLIBS += $(PTHREAD_LIBS)
-endif
-
-ifdef THREADED_DELTA_SEARCH
-	BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH
-	LIB_OBJS += thread-utils.o
-endif
 ifdef DIR_HAS_BSD_GROUP_SEMANTICS
 	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
 endif
@@ -1148,14 +569,14 @@ endif
 # Shell quote (do not use $(call) to accommodate ancient setups);
 
 SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
-ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG))
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
 
 DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
 bindir_SQ = $(subst ','\'',$(bindir))
 bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
 mandir_SQ = $(subst ','\'',$(mandir))
 infodir_SQ = $(subst ','\'',$(infodir))
-gitexecdir_SQ = $(subst ','\'',$(gitexecdir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
 template_dir_SQ = $(subst ','\'',$(template_dir))
 htmldir_SQ = $(subst ','\'',$(htmldir))
 prefix_SQ = $(subst ','\'',$(prefix))
@@ -1164,7 +585,7 @@ SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
 PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
 TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
 
-LIBS = $(GITLIBS) $(EXTLIBS)
+LIBS = $(PERFLIBS) $(EXTLIBS)
 
 BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
 	$(COMPAT_CFLAGS)
@@ -1180,15 +601,15 @@ export TAR INSTALL DESTDIR SHELL_PATH
 
 SHELL = $(SHELL_PATH)
 
-all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
 ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
 endif
 
 all::
 ifndef NO_TCLTK
-	$(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all
-	$(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all
+	$(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all
+	$(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all
 endif
 ifndef NO_PERL
 	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
@@ -1200,33 +621,33 @@ please_set_SHELL_PATH_to_a_more_modern_shell:
 
 shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
 
-strip: $(PROGRAMS) git$X
-	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X
+strip: $(PROGRAMS) perf$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
 
-git.o: git.c common-cmds.h GIT-CFLAGS
-	$(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \
-		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+perf.o: perf.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
 		$(ALL_CFLAGS) -c $(filter %.c,$^)
 
-git$X: git.o $(BUILTIN_OBJS) $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \
+perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
 		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
 
-builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS
+builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
-		'-DGIT_MAN_PATH="$(mandir_SQ)"' \
-		'-DGIT_INFO_PATH="$(infodir_SQ)"' $<
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
+		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
-$(BUILT_INS): git$X
+$(BUILT_INS): perf$X
 	$(QUIET_BUILT_IN)$(RM) $@ && \
-	ln git$X $@ 2>/dev/null || \
-	ln -s git$X $@ 2>/dev/null || \
-	cp git$X $@
+	ln perf$X $@ 2>/dev/null || \
+	ln -s perf$X $@ 2>/dev/null || \
+	cp perf$X $@
 
 common-cmds.h: ./generate-cmdlist.sh command-list.txt
 
-common-cmds.h: $(wildcard Documentation/git-*.txt)
+common-cmds.h: $(wildcard Documentation/perf-*.txt)
 	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
 
 $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
@@ -1234,152 +655,55 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
 	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
 	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
 	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
 	    $@.sh >$@+ && \
 	chmod +x $@+ && \
 	mv $@+ $@
 
-ifndef NO_PERL
-$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak
-
-perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL
-	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F)
-
-$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \
-	sed -e '1{' \
-	    -e '	s|#!.*perl|#!$(PERL_PATH_SQ)|' \
-	    -e '	h' \
-	    -e '	s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \
-	    -e '	H' \
-	    -e '	x' \
-	    -e '}' \
-	    -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
-	    $@.perl >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-
-gitweb/gitweb.cgi: gitweb/gitweb.perl
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \
-	    -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \
-	    -e 's|++GIT_BINDIR++|$(bindir)|g' \
-	    -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \
-	    -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \
-	    -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \
-	    -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \
-	    -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \
-	    -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \
-	    -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \
-	    -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \
-	    -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \
-	    -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \
-	    -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \
-	    -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \
-	    -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \
-	    -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \
-	    -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \
-	    -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \
-	    $< >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-
-git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
-	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
-	    -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \
-	    -e '/@@GITWEB_CGI@@/d' \
-	    -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \
-	    -e '/@@GITWEB_CSS@@/d' \
-	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
-	    $@.sh > $@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-else # NO_PERL
-$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \
-	    unimplemented.sh >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-endif # NO_PERL
-
 configure: configure.ac
 	$(QUIET_GEN)$(RM) $@ $<+ && \
-	sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
 	    $< > $<+ && \
 	autoconf -o $@ $<+ && \
 	$(RM) $<+
 
-# These can record GIT_VERSION
-git.o git.spec \
+# These can record PERF_VERSION
+perf.o perf.spec \
 	$(patsubst %.sh,%,$(SCRIPT_SH)) \
 	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	: GIT-VERSION-FILE
+	: PERF-VERSION-FILE
 
-%.o: %.c GIT-CFLAGS
+%.o: %.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-%.s: %.c GIT-CFLAGS
+%.s: %.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
 %.o: %.S
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
 
-exec_cmd.o: exec_cmd.c GIT-CFLAGS
+exec_cmd.o: exec_cmd.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \
+		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
 		'-DBINDIR="$(bindir_relative_SQ)"' \
 		'-DPREFIX="$(prefix_SQ)"' \
 		$<
 
-builtin-init-db.o: builtin-init-db.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
-
-config.o: config.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $<
+builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
 
-http.o: http.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $<
+config.o: config.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-ifdef NO_EXPAT
-http-walker.o: http-walker.c http.h GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $<
-endif
-
-git-%$X: %.o $(GITLIBS)
+perf-%$X: %.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
-git-imap-send$X: imap-send.o $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
-		$(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL)
-
-http.o http-walker.o http-push.o transport.o: http.h
-
-git-http-push$X: revision.o http.o http-push.o $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
-		$(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
-
 $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
 builtin-revert.o wt-status.o: wt-status.h
 
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
-XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
-	xdiff/xmerge.o xdiff/xpatience.o
-$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \
-	xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h
-
-$(XDIFF_LIB): $(XDIFF_OBJS)
-	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS)
-
-
 doc:
 	$(MAKE) -C Documentation all
 
@@ -1409,19 +733,19 @@ cscope:
 
 ### Detect prefix changes
 TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
-             $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
 
-GIT-CFLAGS: .FORCE-GIT-CFLAGS
+PERF-CFLAGS: .FORCE-PERF-CFLAGS
 	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \
+	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
 		echo 1>&2 "    * new build flags or prefix"; \
-		echo "$$FLAGS" >GIT-CFLAGS; \
+		echo "$$FLAGS" >PERF-CFLAGS; \
             fi
 
 # We need to apply sq twice, once to protect from the shell
-# that runs GIT-BUILD-OPTIONS, and then again to protect it
+# that runs PERF-BUILD-OPTIONS, and then again to protect it
 # and the first level quoting from the shell that runs "echo".
-GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
+PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
 	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
@@ -1431,14 +755,14 @@ GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
 ifndef NO_TCLTK
 TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
 
-GIT-GUI-VARS: .FORCE-GIT-GUI-VARS
+PERF-GUI-VARS: .FORCE-PERF-GUI-VARS
 	@VARS='$(TRACK_VARS)'; \
 	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
 		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
 		echo "$$VARS" >$@; \
             fi
 
-.PHONY: .FORCE-GIT-GUI-VARS
+.PHONY: .FORCE-PERF-GUI-VARS
 endif
 
 ### Testing rules
@@ -1476,7 +800,7 @@ test-parse-options$X: parse-options.o
 
 .PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
 
-test-%$X: test-%.o $(GITLIBS)
+test-%$X: test-%.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
 check-sha1:: test-sha1$X
@@ -1506,40 +830,40 @@ template_instdir = $(prefix)/$(template_dir)
 endif
 export template_instdir
 
-ifneq ($(filter /%,$(firstword $(gitexecdir))),)
-gitexec_instdir = $(gitexecdir)
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
 else
-gitexec_instdir = $(prefix)/$(gitexecdir)
+perfexec_instdir = $(prefix)/$(perfexecdir)
 endif
-gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir))
-export gitexec_instdir
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
-	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
-	$(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
 	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
 ifndef NO_TCLTK
-	$(MAKE) -C gitk-git install
-	$(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install
+	$(MAKE) -C perfk-perf install
+	$(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install
 endif
 ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
 	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
-	execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \
-	{ $(RM) "$$execdir/git-add$X" && \
-		ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \
-		cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \
-	{ for p in $(filter-out git-add$X,$(BUILT_INS)); do \
+	execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \
+	{ $(RM) "$$execdir/perf-add$X" && \
+		ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \
+		cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \
+	{ for p in $(filter-out perf-add$X,$(BUILT_INS)); do \
 		$(RM) "$$execdir/$$p" && \
-		ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \
-		ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \
-		cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \
+		ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \
+		ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \
+		cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \
 	  done } && \
-	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X"
+	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
 install-doc:
 	$(MAKE) -C Documentation install
@@ -1569,31 +893,31 @@ quick-install-html:
 
 ### Maintainer's dist rules
 
-git.spec: git.spec.in
-	sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+
+perf.spec: perf.spec.in
+	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
 	mv $@+ $@
 
-GIT_TARNAME=git-$(GIT_VERSION)
-dist: git.spec git-archive$(X) configure
-	./git-archive --format=tar \
-		--prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar
-	@mkdir -p $(GIT_TARNAME)
-	@cp git.spec configure $(GIT_TARNAME)
-	@echo $(GIT_VERSION) > $(GIT_TARNAME)/version
-	@$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version
-	$(TAR) rf $(GIT_TARNAME).tar \
-		$(GIT_TARNAME)/git.spec \
-		$(GIT_TARNAME)/configure \
-		$(GIT_TARNAME)/version \
-		$(GIT_TARNAME)/git-gui/version
-	@$(RM) -r $(GIT_TARNAME)
-	gzip -f -9 $(GIT_TARNAME).tar
+PERF_TARNAME=perf-$(PERF_VERSION)
+dist: perf.spec perf-archive$(X) configure
+	./perf-archive --format=tar \
+		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+	@mkdir -p $(PERF_TARNAME)
+	@cp perf.spec configure $(PERF_TARNAME)
+	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+	@$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version
+	$(TAR) rf $(PERF_TARNAME).tar \
+		$(PERF_TARNAME)/perf.spec \
+		$(PERF_TARNAME)/configure \
+		$(PERF_TARNAME)/version \
+		$(PERF_TARNAME)/perf-gui/version
+	@$(RM) -r $(PERF_TARNAME)
+	gzip -f -9 $(PERF_TARNAME).tar
 
 rpm: dist
-	$(RPMBUILD) -ta $(GIT_TARNAME).tar.gz
+	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
-htmldocs = git-htmldocs-$(GIT_VERSION)
-manpages = git-manpages-$(GIT_VERSION)
+htmldocs = perf-htmldocs-$(PERF_VERSION)
+manpages = perf-manpages-$(PERF_VERSION)
 dist-doc:
 	$(RM) -r .doc-tmp-dir
 	mkdir .doc-tmp-dir
@@ -1618,51 +942,46 @@ distclean: clean
 	$(RM) configure
 
 clean:
-	$(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \
-		$(LIB_FILE) $(XDIFF_LIB)
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X
+	$(RM) *.o $(LIB_FILE)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
 	$(RM) -r autom4te.cache
 	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
-	$(RM) -r $(GIT_TARNAME) .doc-tmp-dir
-	$(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz
+	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
+	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
-ifndef NO_PERL
-	$(RM) gitweb/gitweb.cgi
-	$(MAKE) -C perl clean
-endif
 	$(MAKE) -C templates/ clean
 	$(MAKE) -C t/ clean
 ifndef NO_TCLTK
-	$(MAKE) -C gitk-git clean
-	$(MAKE) -C git-gui clean
+	$(MAKE) -C perfk-perf clean
+	$(MAKE) -C perf-gui clean
 endif
-	$(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS
-.PHONY: .FORCE-GIT-BUILD-OPTIONS
+.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: .FORCE-PERF-BUILD-OPTIONS
 
 ### Check documentation
 #
 check-docs::
-	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \
+	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \
 	do \
 		case "$$v" in \
-		git-merge-octopus | git-merge-ours | git-merge-recursive | \
-		git-merge-resolve | git-merge-subtree | \
-		git-fsck-objects | git-init-db | \
-		git-?*--?* ) continue ;; \
+		perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \
+		perf-merge-resolve | perf-merge-subtree | \
+		perf-fsck-objects | perf-init-db | \
+		perf-?*--?* ) continue ;; \
 		esac ; \
 		test -f "Documentation/$$v.txt" || \
 		echo "no doc: $$v"; \
 		sed -e '/^#/d' command-list.txt | \
 		grep -q "^$$v[ 	]" || \
 		case "$$v" in \
-		git) ;; \
+		perf) ;; \
 		*) echo "no link: $$v";; \
 		esac ; \
 	done; \
@@ -1670,37 +989,37 @@ check-docs::
 		sed -e '/^#/d' \
 		    -e 's/[ 	].*//' \
 		    -e 's/^/listed /' command-list.txt; \
-		ls -1 Documentation/git*txt | \
+		ls -1 Documentation/perf*txt | \
 		sed -e 's|Documentation/|documented |' \
 		    -e 's/\.txt//'; \
 	) | while read how cmd; \
 	do \
 		case "$$how,$$cmd" in \
-		*,git-citool | \
-		*,git-gui | \
-		*,git-help | \
-		documented,gitattributes | \
-		documented,gitignore | \
-		documented,gitmodules | \
-		documented,gitcli | \
-		documented,git-tools | \
-		documented,gitcore-tutorial | \
-		documented,gitcvs-migration | \
-		documented,gitdiffcore | \
-		documented,gitglossary | \
-		documented,githooks | \
-		documented,gitrepository-layout | \
-		documented,gittutorial | \
-		documented,gittutorial-2 | \
+		*,perf-citool | \
+		*,perf-gui | \
+		*,perf-help | \
+		documented,perfattributes | \
+		documented,perfignore | \
+		documented,perfmodules | \
+		documented,perfcli | \
+		documented,perf-tools | \
+		documented,perfcore-tutorial | \
+		documented,perfcvs-migration | \
+		documented,perfdiffcore | \
+		documented,perfglossary | \
+		documented,perfhooks | \
+		documented,perfrepository-layout | \
+		documented,perftutorial | \
+		documented,perftutorial-2 | \
 		sentinel,not,matching,is,ok ) continue ;; \
 		esac; \
-		case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \
+		case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \
 		*" $$cmd "*)	;; \
 		*) echo "removed but $$how: $$cmd" ;; \
 		esac; \
 	done ) | sort
 
-### Make sure built-ins do not have dups and listed in git.c
+### Make sure built-ins do not have dups and listed in perf.c
 #
 check-builtins::
 	./check-builtins.sh
diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS
new file mode 100644
index 0000000..46d8d6c
--- /dev/null
+++ b/Documentation/perf_counter/PERF-BUILD-OPTIONS
@@ -0,0 +1,4 @@
+SHELL_PATH='/bin/sh'
+TAR='tar'
+NO_CURL=''
+NO_PERL=''
diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS
new file mode 100644
index 0000000..f24906c
--- /dev/null
+++ b/Documentation/perf_counter/PERF-CFLAGS
@@ -0,0 +1 @@
+-g -O2 -Wall  -DSHA1_HEADER='<openssl/sha.h>' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo
diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE
new file mode 100644
index 0000000..328e244
--- /dev/null
+++ b/Documentation/perf_counter/PERF-VERSION-FILE
@@ -0,0 +1 @@
+PERF_VERSION = 0.0.1.PERF
diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/PERF-VERSION-GEN
new file mode 100755
index 0000000..c561d15
--- /dev/null
+++ b/Documentation/perf_counter/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "PERF_VERSION = $VN"
+	echo "PERF_VERSION = $VN" >$GVF
+}
+
+
diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/abspath.c
new file mode 100644
index 0000000..649f34f
--- /dev/null
+++ b/Documentation/perf_counter/abspath.c
@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+int is_directory(const char *path)
+{
+	struct stat st;
+	return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+	char cwd[1024] = "";
+	int buf_index = 1, len;
+
+	int depth = MAXDEPTH;
+	char *last_elem = NULL;
+	struct stat st;
+
+	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+		die ("Too long path: %.*s", 60, path);
+
+	while (depth--) {
+		if (!is_directory(buf)) {
+			char *last_slash = strrchr(buf, '/');
+			if (last_slash) {
+				*last_slash = '\0';
+				last_elem = xstrdup(last_slash + 1);
+			} else {
+				last_elem = xstrdup(buf);
+				*buf = '\0';
+			}
+		}
+
+		if (*buf) {
+			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+				die ("Could not get current working directory");
+
+			if (chdir(buf))
+				die ("Could not switch to '%s'", buf);
+		}
+		if (!getcwd(buf, PATH_MAX))
+			die ("Could not get current working directory");
+
+		if (last_elem) {
+			int len = strlen(buf);
+			if (len + strlen(last_elem) + 2 > PATH_MAX)
+				die ("Too long path name: '%s/%s'",
+						buf, last_elem);
+			buf[len] = '/';
+			strcpy(buf + len + 1, last_elem);
+			free(last_elem);
+			last_elem = NULL;
+		}
+
+		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+			len = readlink(buf, next_buf, PATH_MAX);
+			if (len < 0)
+				die ("Invalid symlink: %s", buf);
+			if (PATH_MAX <= len)
+				die("symbolic link too long: %s", buf);
+			next_buf[len] = '\0';
+			buf = next_buf;
+			buf_index = 1 - buf_index;
+			next_buf = bufs[buf_index];
+		} else
+			break;
+	}
+
+	if (*cwd && chdir(cwd))
+		die ("Could not change back to '%s'", cwd);
+
+	return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}
diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/alias.c
new file mode 100644
index 0000000..9b3dd2b
--- /dev/null
+++ b/Documentation/perf_counter/alias.c
@@ -0,0 +1,77 @@
+#include "cache.h"
+
+static const char *alias_key;
+static char *alias_val;
+
+static int alias_lookup_cb(const char *k, const char *v, void *cb)
+{
+	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
+		if (!v)
+			return config_error_nonbool(k);
+		alias_val = strdup(v);
+		return 0;
+	}
+	return 0;
+}
+
+char *alias_lookup(const char *alias)
+{
+	alias_key = alias;
+	alias_val = NULL;
+	perf_config(alias_lookup_cb, NULL);
+	return alias_val;
+}
+
+int split_cmdline(char *cmdline, const char ***argv)
+{
+	int src, dst, count = 0, size = 16;
+	char quoted = 0;
+
+	*argv = malloc(sizeof(char*) * size);
+
+	/* split alias_string */
+	(*argv)[count++] = cmdline;
+	for (src = dst = 0; cmdline[src];) {
+		char c = cmdline[src];
+		if (!quoted && isspace(c)) {
+			cmdline[dst++] = 0;
+			while (cmdline[++src]
+					&& isspace(cmdline[src]))
+				; /* skip */
+			if (count >= size) {
+				size += 16;
+				*argv = realloc(*argv, sizeof(char*) * size);
+			}
+			(*argv)[count++] = cmdline + dst;
+		} else if (!quoted && (c == '\'' || c == '"')) {
+			quoted = c;
+			src++;
+		} else if (c == quoted) {
+			quoted = 0;
+			src++;
+		} else {
+			if (c == '\\' && quoted != '\'') {
+				src++;
+				c = cmdline[src];
+				if (!c) {
+					free(*argv);
+					*argv = NULL;
+					return error("cmdline ends with \\");
+				}
+			}
+			cmdline[dst++] = c;
+			src++;
+		}
+	}
+
+	cmdline[dst] = 0;
+
+	if (quoted) {
+		free(*argv);
+		*argv = NULL;
+		return error("unclosed quote");
+	}
+
+	return count;
+}
+
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
new file mode 100644
index 0000000..125fcc2
--- /dev/null
+++ b/Documentation/perf_counter/builtin-help.c
@@ -0,0 +1,463 @@
+/*
+ * builtin-help.c
+ *
+ * Builtin help command
+ */
+#include "cache.h"
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "common-cmds.h"
+#include "parse-options.h"
+#include "run-command.h"
+#include "help.h"
+
+static struct man_viewer_list {
+	struct man_viewer_list *next;
+	char name[FLEX_ARRAY];
+} *man_viewer_list;
+
+static struct man_viewer_info_list {
+	struct man_viewer_info_list *next;
+	const char *info;
+	char name[FLEX_ARRAY];
+} *man_viewer_info_list;
+
+enum help_format {
+	HELP_FORMAT_MAN,
+	HELP_FORMAT_INFO,
+	HELP_FORMAT_WEB,
+};
+
+static int show_all = 0;
+static enum help_format help_format = HELP_FORMAT_MAN;
+static struct option builtin_help_options[] = {
+	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+			HELP_FORMAT_WEB),
+	OPT_SET_INT('i', "info", &help_format, "show info page",
+			HELP_FORMAT_INFO),
+	OPT_END(),
+};
+
+static const char * const builtin_help_usage[] = {
+	"perf help [--all] [--man|--web|--info] [command]",
+	NULL
+};
+
+static enum help_format parse_help_format(const char *format)
+{
+	if (!strcmp(format, "man"))
+		return HELP_FORMAT_MAN;
+	if (!strcmp(format, "info"))
+		return HELP_FORMAT_INFO;
+	if (!strcmp(format, "web") || !strcmp(format, "html"))
+		return HELP_FORMAT_WEB;
+	die("unrecognized help format '%s'", format);
+}
+
+static const char *get_man_viewer_info(const char *name)
+{
+	struct man_viewer_info_list *viewer;
+
+	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
+	{
+		if (!strcasecmp(name, viewer->name))
+			return viewer->info;
+	}
+	return NULL;
+}
+
+static int check_emacsclient_version(void)
+{
+	struct strbuf buffer = STRBUF_INIT;
+	struct child_process ec_process;
+	const char *argv_ec[] = { "emacsclient", "--version", NULL };
+	int version;
+
+	/* emacsclient prints its version number on stderr */
+	memset(&ec_process, 0, sizeof(ec_process));
+	ec_process.argv = argv_ec;
+	ec_process.err = -1;
+	ec_process.stdout_to_stderr = 1;
+	if (start_command(&ec_process)) {
+		fprintf(stderr, "Failed to start emacsclient.\n");
+		return -1;
+	}
+	strbuf_read(&buffer, ec_process.err, 20);
+	close(ec_process.err);
+
+	/*
+	 * Don't bother checking return value, because "emacsclient --version"
+	 * seems to always exits with code 1.
+	 */
+	finish_command(&ec_process);
+
+	if (prefixcmp(buffer.buf, "emacsclient")) {
+		fprintf(stderr, "Failed to parse emacsclient version.\n");
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_remove(&buffer, 0, strlen("emacsclient"));
+	version = atoi(buffer.buf);
+
+	if (version < 22) {
+		fprintf(stderr,
+			"emacsclient version '%d' too old (< 22).\n",
+			version);
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_release(&buffer);
+	return 0;
+}
+
+static void exec_woman_emacs(const char* path, const char *page)
+{
+	if (!check_emacsclient_version()) {
+		/* This works only with emacsclient version >= 22. */
+		struct strbuf man_page = STRBUF_INIT;
+
+		if (!path)
+			path = "emacsclient";
+		strbuf_addf(&man_page, "(woman \"%s\")", page);
+		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_konqueror(const char* path, const char *page)
+{
+	const char *display = getenv("DISPLAY");
+	if (display && *display) {
+		struct strbuf man_page = STRBUF_INIT;
+		const char *filename = "kfmclient";
+
+		/* It's simpler to launch konqueror using kfmclient. */
+		if (path) {
+			const char *file = strrchr(path, '/');
+			if (file && !strcmp(file + 1, "konqueror")) {
+				char *new = strdup(path);
+				char *dest = strrchr(new, '/');
+
+				/* strlen("konqueror") == strlen("kfmclient") */
+				strcpy(dest + 1, "kfmclient");
+				path = new;
+			}
+			if (file)
+				filename = file;
+		} else
+			path = "kfmclient";
+		strbuf_addf(&man_page, "man:%s(1)", page);
+		execlp(path, filename, "newTab", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_man(const char* path, const char *page)
+{
+	if (!path)
+		path = "man";
+	execlp(path, "man", page, NULL);
+	warning("failed to exec '%s': %s", path, strerror(errno));
+}
+
+static void exec_man_cmd(const char *cmd, const char *page)
+{
+	struct strbuf shell_cmd = STRBUF_INIT;
+	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
+	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
+	warning("failed to exec '%s': %s", cmd, strerror(errno));
+}
+
+static void add_man_viewer(const char *name)
+{
+	struct man_viewer_list **p = &man_viewer_list;
+	size_t len = strlen(name);
+
+	while (*p)
+		p = &((*p)->next);
+	*p = calloc(1, (sizeof(**p) + len + 1));
+	strncpy((*p)->name, name, len);
+}
+
+static int supported_man_viewer(const char *name, size_t len)
+{
+	return (!strncasecmp("man", name, len) ||
+		!strncasecmp("woman", name, len) ||
+		!strncasecmp("konqueror", name, len));
+}
+
+static void do_add_man_viewer_info(const char *name,
+				   size_t len,
+				   const char *value)
+{
+	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+
+	strncpy(new->name, name, len);
+	new->info = strdup(value);
+	new->next = man_viewer_info_list;
+	man_viewer_info_list = new;
+}
+
+static int add_man_viewer_path(const char *name,
+			       size_t len,
+			       const char *value)
+{
+	if (supported_man_viewer(name, len))
+		do_add_man_viewer_info(name, len, value);
+	else
+		warning("'%s': path for unsupported man viewer.\n"
+			"Please consider using 'man.<tool>.cmd' instead.",
+			name);
+
+	return 0;
+}
+
+static int add_man_viewer_cmd(const char *name,
+			      size_t len,
+			      const char *value)
+{
+	if (supported_man_viewer(name, len))
+		warning("'%s': cmd for supported man viewer.\n"
+			"Please consider using 'man.<tool>.path' instead.",
+			name);
+	else
+		do_add_man_viewer_info(name, len, value);
+
+	return 0;
+}
+
+static int add_man_viewer_info(const char *var, const char *value)
+{
+	const char *name = var + 4;
+	const char *subkey = strrchr(name, '.');
+
+	if (!subkey)
+		return error("Config with no key for man viewer: %s", name);
+
+	if (!strcmp(subkey, ".path")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_path(name, subkey - name, value);
+	}
+	if (!strcmp(subkey, ".cmd")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_cmd(name, subkey - name, value);
+	}
+
+	warning("'%s': unsupported man viewer sub key.", subkey);
+	return 0;
+}
+
+static int perf_help_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.format")) {
+		if (!value)
+			return config_error_nonbool(var);
+		help_format = parse_help_format(value);
+		return 0;
+	}
+	if (!strcmp(var, "man.viewer")) {
+		if (!value)
+			return config_error_nonbool(var);
+		add_man_viewer(value);
+		return 0;
+	}
+	if (!prefixcmp(var, "man."))
+		return add_man_viewer_info(var, value);
+
+	return perf_default_config(var, value, cb);
+}
+
+static struct cmdnames main_cmds, other_cmds;
+
+void list_common_cmds_help(void)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts("The most commonly used perf commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %s   ", common_cmds[i].name);
+		mput_char(' ', longest - strlen(common_cmds[i].name));
+		puts(common_cmds[i].help);
+	}
+}
+
+static int is_perf_command(const char *s)
+{
+	return is_in_cmdlist(&main_cmds, s) ||
+		is_in_cmdlist(&other_cmds, s);
+}
+
+static const char *prepend(const char *prefix, const char *cmd)
+{
+	size_t pre_len = strlen(prefix);
+	size_t cmd_len = strlen(cmd);
+	char *p = malloc(pre_len + cmd_len + 1);
+	memcpy(p, prefix, pre_len);
+	strcpy(p + pre_len, cmd);
+	return p;
+}
+
+static const char *cmd_to_page(const char *perf_cmd)
+{
+	if (!perf_cmd)
+		return "perf";
+	else if (!prefixcmp(perf_cmd, "perf"))
+		return perf_cmd;
+	else if (is_perf_command(perf_cmd))
+		return prepend("perf-", perf_cmd);
+	else
+		return prepend("perf", perf_cmd);
+}
+
+static void setup_man_path(void)
+{
+	struct strbuf new_path = STRBUF_INIT;
+	const char *old_path = getenv("MANPATH");
+
+	/* We should always put ':' after our path. If there is no
+	 * old_path, the ':' at the end will let 'man' to try
+	 * system-wide paths after ours to find the manual page. If
+	 * there is old_path, we need ':' as delimiter. */
+	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
+	strbuf_addch(&new_path, ':');
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+
+	setenv("MANPATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+static void exec_viewer(const char *name, const char *page)
+{
+	const char *info = get_man_viewer_info(name);
+
+	if (!strcasecmp(name, "man"))
+		exec_man_man(info, page);
+	else if (!strcasecmp(name, "woman"))
+		exec_woman_emacs(info, page);
+	else if (!strcasecmp(name, "konqueror"))
+		exec_man_konqueror(info, page);
+	else if (info)
+		exec_man_cmd(info, page);
+	else
+		warning("'%s': unknown man viewer.", name);
+}
+
+static void show_man_page(const char *perf_cmd)
+{
+	struct man_viewer_list *viewer;
+	const char *page = cmd_to_page(perf_cmd);
+	const char *fallback = getenv("PERF_MAN_VIEWER");
+
+	setup_man_path();
+	for (viewer = man_viewer_list; viewer; viewer = viewer->next)
+	{
+		exec_viewer(viewer->name, page); /* will return when unable */
+	}
+	if (fallback)
+		exec_viewer(fallback, page);
+	exec_viewer("man", page);
+	die("no man viewer handled the request");
+}
+
+static void show_info_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
+	execlp("info", "info", "perfman", page, NULL);
+}
+
+static void get_html_page_path(struct strbuf *page_path, const char *page)
+{
+	struct stat st;
+	const char *html_path = system_path(PERF_HTML_PATH);
+
+	/* Check that we have a perf documentation directory. */
+	if (stat(mkpath("%s/perf.html", html_path), &st)
+	    || !S_ISREG(st.st_mode))
+		die("'%s': not a documentation directory.", html_path);
+
+	strbuf_init(page_path, 0);
+	strbuf_addf(page_path, "%s/%s.html", html_path, page);
+}
+
+/*
+ * If open_html is not defined in a platform-specific way (see for
+ * example compat/mingw.h), we use the script web--browse to display
+ * HTML.
+ */
+#ifndef open_html
+void open_html(const char *path)
+{
+	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+}
+#endif
+
+static void show_html_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	struct strbuf page_path; /* it leaks but we exec bellow */
+
+	get_html_page_path(&page_path, page);
+
+	open_html(page_path.buf);
+}
+
+int cmd_help(int argc, const char **argv, const char *prefix)
+{
+	int nonperf;
+	const char *alias;
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	/* setup_perf_directory_gently(&nonperf); */
+	perf_config(perf_help_config, NULL);
+
+	argc = parse_options(argc, argv, builtin_help_options,
+			builtin_help_usage, 0);
+
+	if (show_all) {
+		printf("usage: %s\n\n", perf_usage_string);
+		list_commands("perf commands", &main_cmds, &other_cmds);
+		printf("%s\n", perf_more_info_string);
+		return 0;
+	}
+
+	if (!argv[0]) {
+		printf("usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n%s\n", perf_more_info_string);
+		return 0;
+	}
+
+	alias = alias_lookup(argv[0]);
+	if (alias && !is_perf_command(argv[0])) {
+		printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
+		return 0;
+	}
+
+	switch (help_format) {
+	case HELP_FORMAT_MAN:
+		show_man_page(argv[0]);
+		break;
+	case HELP_FORMAT_INFO:
+		show_info_page(argv[0]);
+		break;
+	case HELP_FORMAT_WEB:
+		show_html_page(argv[0]);
+		break;
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
new file mode 100644
index 0000000..9d2c769
--- /dev/null
+++ b/Documentation/perf_counter/builtin-top.c
@@ -0,0 +1,1411 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
+
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
+
+   Sample output:
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+ */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
+#include "util.h"
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        return syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			run_perfstat			=  0;
+static int			system_wide			=  0;
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static __u64			count_filter		       = 100;
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static unsigned int		realtime_prio			=  0;
+static int			group				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			=  16;
+static int			use_mmap			= 0;
+static int			use_munmap			= 0;
+
+static char			*vmlinux;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+static int			scale;
+
+struct source_line {
+	uint64_t		EIP;
+	unsigned long		count;
+	char			*line;
+	struct source_line	*next;
+};
+
+static struct source_line	*lines;
+static struct source_line	**lines_tail;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_perfstat_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -l                           # scale counter values\n"
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
+
+static void display_help(void)
+{
+	if (run_perfstat)
+		return display_perfstat_help();
+
+	printf(
+	"Usage: kerneltop [<options>]\n"
+	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
+	"KernelTop Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -S        --stat             # perfstat COMMAND\n"
+	" -a                           # system-wide collection (for perfstat)\n\n"
+	" -c CNT    --count=CNT        # event period to sample\n\n"
+	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
+	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -l                           # show scale factor for RR events\n"
+	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
+	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
+	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
+	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
+	" -z        --zero             # zero counts after display\n"
+	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -M        --mmap_info        # print mmap info stream\n"
+	" -U        --munmap_info      # print munmap info stream\n"
+	);
+
+	exit(0);
+}
+
+static char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.config		= event_id[counter];
+	hw_event.record_type	= 0;
+	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	if (!argc)
+		display_help();
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
+
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
+		}
+
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count[0] / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
+		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+/*
+ * Symbols
+ */
+
+static uint64_t			min_ip;
+static uint64_t			max_ip = -1ll;
+
+struct sym_entry {
+	unsigned long long	addr;
+	char			*sym;
+	unsigned long		count[MAX_COUNTERS];
+	int			skip;
+	struct source_line	*source;
+};
+
+#define MAX_SYMS		100000
+
+static int sym_table_count;
+
+struct sym_entry		*sym_filter_entry;
+
+static struct sym_entry		sym_table[MAX_SYMS];
+
+static void show_details(struct sym_entry *sym);
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight;
+	int counter;
+
+	weight = sym->count[0];
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static int compare(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym_weight(sym1) < sym_weight(sym2);
+}
+
+static long			events;
+static long			userspace_events;
+static const char		CONSOLE_CLEAR[] = "[H[2J";
+
+static struct sym_entry		tmp[MAX_SYMS];
+
+static void print_sym_table(void)
+{
+	int i, printed;
+	int counter;
+	float events_per_sec = events/delay_secs;
+	float kevents_per_sec = (events-userspace_events)/delay_secs;
+	float sum_kevents = 0.0;
+
+	events = userspace_events = 0;
+	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
+	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
+
+	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
+		sum_kevents += tmp[i].count[0];
+
+	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
+		events_per_sec,
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
+		nmi ? "NMI" : "IRQ");
+
+	if (nr_counters == 1)
+		printf("%d ", event_count[0]);
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		printf("%s", event_name(counter));
+	}
+
+	printf( "], ");
+
+	if (tid != -1)
+		printf(" (tid: %d", tid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (tid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             events    pcnt");
+	else
+		printf("  weight     events    pcnt");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     ______   _____   ________________   _______________\n\n"
+	);
+
+	for (i = 0, printed = 0; i < sym_table_count; i++) {
+		float pcnt;
+		int count;
+
+		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
+			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+			if (nr_counters == 1)
+				printf("%19.2f - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					pcnt, tmp[i].addr, tmp[i].sym);
+			else
+				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					tmp[i].count[0],
+					pcnt, tmp[i].addr, tmp[i].sym);
+			printed++;
+		}
+		/*
+		 * Add decay to the counts:
+		 */
+		for (count = 0; count < nr_counters; count++)
+			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+	}
+
+	if (sym_filter_entry)
+		show_details(sym_filter_entry);
+
+	{
+		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+
+		if (poll(&stdin_poll, 1, 0) == 1) {
+			printf("key pressed - exiting.\n");
+			exit(0);
+		}
+	}
+}
+
+static void *display_thread(void *arg)
+{
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+
+	while (!sleep(delay_secs))
+		print_sym_table();
+
+	return NULL;
+}
+
+static int read_symbol(FILE *in, struct sym_entry *s)
+{
+	static int filter_match = 0;
+	char *sym, stype;
+	char str[500];
+	int rc, pos;
+
+	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
+	if (rc == EOF)
+		return -1;
+
+	assert(rc == 3);
+
+	/* skip until end of line: */
+	pos = strlen(str);
+	do {
+		rc = fgetc(in);
+		if (rc == '\n' || rc == EOF || pos >= 499)
+			break;
+		str[pos] = rc;
+		pos++;
+	} while (1);
+	str[pos] = 0;
+
+	sym = str;
+
+	/* Filter out known duplicates and non-text symbols. */
+	if (!strcmp(sym, "_text"))
+		return 1;
+	if (!min_ip && !strcmp(sym, "_stext"))
+		return 1;
+	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
+		return 1;
+	if (stype != 'T' && stype != 't')
+		return 1;
+	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
+		return 1;
+	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+		return 1;
+
+	s->sym = malloc(strlen(str));
+	assert(s->sym);
+
+	strcpy((char *)s->sym, str);
+	s->skip = 0;
+
+	/* Tag events to be skipped. */
+	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("mwait_idle", s->sym))
+		s->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = s->addr;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			printf("hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			printf("symbol filter start: %016lx\n", filter_start);
+			printf("                end: %016lx\n", filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+		filter_match = 1;
+		filter_start = s->addr;
+	}
+
+	return 0;
+}
+
+int compare_addr(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym1->addr > sym2->addr;
+}
+
+static void sort_symbol_table(void)
+{
+	int i, dups;
+
+	do {
+		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
+		for (i = 0, dups = 0; i < sym_table_count; i++) {
+			if (sym_table[i].addr == sym_table[i+1].addr) {
+				sym_table[i+1].addr = -1ll;
+				dups++;
+			}
+		}
+		sym_table_count -= dups;
+	} while(dups);
+}
+
+static void parse_symbols(void)
+{
+	struct sym_entry *last;
+
+	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+
+	if (!kallsyms) {
+		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
+		exit(-1);
+	}
+
+	while (!feof(kallsyms)) {
+		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
+			sym_table_count++;
+			assert(sym_table_count <= MAX_SYMS);
+		}
+	}
+
+	sort_symbol_table();
+	min_ip = sym_table[0].addr;
+	max_ip = sym_table[sym_table_count-1].addr;
+	last = sym_table + sym_table_count++;
+
+	last->addr = -1ll;
+	last->sym = "<end>";
+
+	if (filter_end) {
+		int count;
+		for (count=0; count < sym_table_count; count ++) {
+			if (!strcmp(sym_table[count].sym, sym_filter)) {
+				sym_filter_entry = &sym_table[count];
+				break;
+			}
+		}
+	}
+	if (dump_symtab) {
+		int i;
+
+		for (i = 0; i < sym_table_count; i++)
+			fprintf(stderr, "%llx %s\n",
+				sym_table[i].addr, sym_table[i].sym);
+	}
+}
+
+/*
+ * Source lines
+ */
+
+static void parse_vmlinux(char *filename)
+{
+	FILE *file;
+	char command[PATH_MAX*2];
+	if (!filename)
+		return;
+
+	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	lines_tail = &lines;
+	while (!feof(file)) {
+		struct source_line *src;
+		size_t dummy = 0;
+		char *c;
+
+		src = malloc(sizeof(struct source_line));
+		assert(src != NULL);
+		memset(src, 0, sizeof(struct source_line));
+
+		if (getline(&src->line, &dummy, file) < 0)
+			break;
+		if (!src->line)
+			break;
+
+		c = strchr(src->line, '\n');
+		if (c)
+			*c = 0;
+
+		src->next = NULL;
+		*lines_tail = src;
+		lines_tail = &src->next;
+
+		if (strlen(src->line)>8 && src->line[8] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+		if (strlen(src->line)>8 && src->line[16] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+	}
+	pclose(file);
+}
+
+static void record_precise_ip(uint64_t ip)
+{
+	struct source_line *line;
+
+	for (line = lines; line; line = line->next) {
+		if (line->EIP == ip)
+			line->count++;
+		if (line->EIP > ip)
+			break;
+	}
+}
+
+static void lookup_sym_in_vmlinux(struct sym_entry *sym)
+{
+	struct source_line *line;
+	char pattern[PATH_MAX];
+	sprintf(pattern, "<%s>:", sym->sym);
+
+	for (line = lines; line; line = line->next) {
+		if (strstr(line->line, pattern)) {
+			sym->source = line;
+			break;
+		}
+	}
+}
+
+static void show_lines(struct source_line *line_queue, int line_queue_count)
+{
+	int i;
+	struct source_line *line;
+
+	line = line_queue;
+	for (i = 0; i < line_queue_count; i++) {
+		printf("%8li\t%s\n", line->count, line->line);
+		line = line->next;
+	}
+}
+
+#define TRACE_COUNT     3
+
+static void show_details(struct sym_entry *sym)
+{
+	struct source_line *line;
+	struct source_line *line_queue = NULL;
+	int displayed = 0;
+	int line_queue_count = 0;
+
+	if (!sym->source)
+		lookup_sym_in_vmlinux(sym);
+	if (!sym->source)
+		return;
+
+	printf("Showing details for %s\n", sym->sym);
+
+	line = sym->source;
+	while (line) {
+		if (displayed && strstr(line->line, ">:"))
+			break;
+
+		if (!line_queue_count)
+			line_queue = line;
+		line_queue_count ++;
+
+		if (line->count >= count_filter) {
+			show_lines(line_queue, line_queue_count);
+			line_queue_count = 0;
+			line_queue = NULL;
+		} else if (line_queue_count > TRACE_COUNT) {
+			line_queue = line_queue->next;
+			line_queue_count --;
+		}
+
+		line->count = 0;
+		displayed++;
+		if (displayed > 300)
+			break;
+		line = line->next;
+	}
+}
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(uint64_t ip, int counter)
+{
+	int left_idx, middle_idx, right_idx, idx;
+	unsigned long left, middle, right;
+
+	record_precise_ip(ip);
+
+	left_idx = 0;
+	right_idx = sym_table_count-1;
+	assert(ip <= max_ip && ip >= min_ip);
+
+	while (left_idx + 1 < right_idx) {
+		middle_idx = (left_idx + right_idx) / 2;
+
+		left   = sym_table[  left_idx].addr;
+		middle = sym_table[middle_idx].addr;
+		right  = sym_table[ right_idx].addr;
+
+		if (!(left <= middle && middle <= right)) {
+			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
+			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		}
+		assert(left <= middle && middle <= right);
+		if (!(left <= ip && ip <= right)) {
+			printf(" left: %016lx\n", left);
+			printf("   ip: %016lx\n", (unsigned long)ip);
+			printf("right: %016lx\n", right);
+		}
+		assert(left <= ip && ip <= right);
+		/*
+		 * [ left .... target .... middle .... right ]
+		 *   => right := middle
+		 */
+		if (ip < middle) {
+			right_idx = middle_idx;
+			continue;
+		}
+		/*
+		 * [ left .... middle ... target ... right ]
+		 *   => left := middle
+		 */
+		left_idx = middle_idx;
+	}
+
+	idx = left_idx;
+
+	if (!sym_table[idx].skip)
+		sym_table[idx].count[counter]++;
+	else events--;
+}
+
+static void process_event(uint64_t ip, int counter)
+{
+	events++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_events++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	if (strstr(argv[0], "perfstat"))
+		run_perfstat = 1;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'S': run_perfstat			=	       1; break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'M': use_mmap			=              1; break;
+		case 'U': use_munmap			=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		if (run_perfstat)
+			nr_counters = 8;
+		else {
+			nr_counters = 1;
+			event_id[0] = 0;
+		}
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	for (; old != head;) {
+		struct ip_event {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, tid;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, tid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
+
+		unsigned int size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
+
+		old += size;
+
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_RECORD_IP)
+				process_event(event->ip.ip, md->counter);
+		} else {
+			switch (event->header.type) {
+				case PERF_EVENT_MMAP:
+				case PERF_EVENT_MUNMAP:
+					printf("%s: %Lu %Lu %Lu %s\n",
+							event->header.type == PERF_EVENT_MMAP
+							? "mmap" : "munmap",
+							event->mmap.start,
+							event->mmap.len,
+							event->mmap.pgoff,
+							event->mmap.filename);
+					break;
+			}
+		}
+	}
+
+	md->prev = old;
+}
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	pthread_t thread;
+	int i, counter, group_fd, nr_poll = 0;
+	unsigned int cpu;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (run_perfstat)
+		return do_perfstat(argc, argv);
+
+	if (tid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			cpu	= profile_cpu;
+			if (tid == -1 && profile_cpu == -1)
+				cpu = i;
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.config		= event_id[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+			hw_event.nmi		= nmi;
+			hw_event.mmap		= use_mmap;
+			hw_event.munmap		= use_munmap;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	while (1) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == events)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
new file mode 100644
index 0000000..4163744
--- /dev/null
+++ b/Documentation/perf_counter/builtin.h
@@ -0,0 +1,18 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util.h"
+#include "strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+#endif
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h
new file mode 100644
index 0000000..dc08564
--- /dev/null
+++ b/Documentation/perf_counter/cache.h
@@ -0,0 +1,97 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include "util.h"
+#include "strbuf.h"
+
+#define PERF_DIR_ENVIRONMENT "PERF_DIR"
+#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
+#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
+#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
+#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
+#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
+#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
+#define CONFIG_ENVIRONMENT "PERF_CONFIG"
+#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
+#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
+#define PERFATTRIBUTES_FILE ".perfattributes"
+#define INFOATTRIBUTES_FILE "info/attributes"
+#define ATTRIBUTE_MACRO_PREFIX "[attr]"
+
+typedef int (*config_fn_t)(const char *, const char *, void *);
+extern int perf_default_config(const char *, const char *, void *);
+extern int perf_config_from_file(config_fn_t fn, const char *, void *);
+extern int perf_config(config_fn_t fn, void *);
+extern int perf_parse_ulong(const char *, unsigned long *);
+extern int perf_config_int(const char *, const char *);
+extern unsigned long perf_config_ulong(const char *, const char *);
+extern int perf_config_bool_or_int(const char *, const char *, int *);
+extern int perf_config_bool(const char *, const char *);
+extern int perf_config_string(const char **, const char *, const char *);
+extern int perf_config_set(const char *, const char *);
+extern int perf_config_set_multivar(const char *, const char *, const char *, int);
+extern int perf_config_rename_section(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
+extern int check_repository_format_version(const char *var, const char *value, void *cb);
+extern int perf_config_system(void);
+extern int perf_config_global(void);
+extern int config_error_nonbool(const char *);
+extern const char *config_exclusive_filename;
+
+#define MAX_PERFNAME (1000)
+extern char perf_default_email[MAX_PERFNAME];
+extern char perf_default_name[MAX_PERFNAME];
+extern int user_ident_explicitly_given;
+
+extern const char *perf_log_output_encoding;
+extern const char *perf_mailmap_file;
+
+/* IO helper functions */
+extern void maybe_flush_or_die(FILE *, const char *);
+extern int copy_fd(int ifd, int ofd);
+extern int copy_file(const char *dst, const char *src, int mode);
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+extern ssize_t write_in_full(int fd, const void *buf, size_t count);
+extern void write_or_die(int fd, const void *buf, size_t count);
+extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
+extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
+extern void fsync_or_die(int fd, const char *);
+
+/* pager.c */
+extern void setup_pager(void);
+extern const char *pager_program;
+extern int pager_in_use(void);
+extern int pager_use_color;
+
+extern const char *editor_program;
+extern const char *excludes_file;
+
+char *alias_lookup(const char *alias);
+int split_cmdline(char *cmdline, const char ***argv);
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+
+static inline int is_absolute_path(const char *path)
+{
+	return path[0] == '/';
+}
+#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
new file mode 100644
index 0000000..1eab365
--- /dev/null
+++ b/Documentation/perf_counter/command-list.txt
@@ -0,0 +1,4 @@
+# List of known perf commands.
+# command name				category [deprecated] [common]
+perf-top                                mainporcelain common
+
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c
new file mode 100644
index 0000000..672d539
--- /dev/null
+++ b/Documentation/perf_counter/config.c
@@ -0,0 +1,966 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ * Copyright (C) Johannes Schindelin, 2005
+ *
+ */
+#include "util.h"
+#include "cache.h"
+#include "exec_cmd.h"
+
+#define MAXNAME (256)
+
+static FILE *config_file;
+static const char *config_file_name;
+static int config_linenr;
+static int config_file_eof;
+static int zlib_compression_seen;
+
+const char *config_exclusive_filename = NULL;
+
+static int get_next_char(void)
+{
+	int c;
+	FILE *f;
+
+	c = '\n';
+	if ((f = config_file) != NULL) {
+		c = fgetc(f);
+		if (c == '\r') {
+			/* DOS like systems */
+			c = fgetc(f);
+			if (c != '\n') {
+				ungetc(c, f);
+				c = '\r';
+			}
+		}
+		if (c == '\n')
+			config_linenr++;
+		if (c == EOF) {
+			config_file_eof = 1;
+			c = '\n';
+		}
+	}
+	return c;
+}
+
+static char *parse_value(void)
+{
+	static char value[1024];
+	int quote = 0, comment = 0, len = 0, space = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (len >= sizeof(value) - 1)
+			return NULL;
+		if (c == '\n') {
+			if (quote)
+				return NULL;
+			value[len] = 0;
+			return value;
+		}
+		if (comment)
+			continue;
+		if (isspace(c) && !quote) {
+			space = 1;
+			continue;
+		}
+		if (!quote) {
+			if (c == ';' || c == '#') {
+				comment = 1;
+				continue;
+			}
+		}
+		if (space) {
+			if (len)
+				value[len++] = ' ';
+			space = 0;
+		}
+		if (c == '\\') {
+			c = get_next_char();
+			switch (c) {
+			case '\n':
+				continue;
+			case 't':
+				c = '\t';
+				break;
+			case 'b':
+				c = '\b';
+				break;
+			case 'n':
+				c = '\n';
+				break;
+			/* Some characters escape as themselves */
+			case '\\': case '"':
+				break;
+			/* Reject unknown escape sequences */
+			default:
+				return NULL;
+			}
+			value[len++] = c;
+			continue;
+		}
+		if (c == '"') {
+			quote = 1-quote;
+			continue;
+		}
+		value[len++] = c;
+	}
+}
+
+static inline int iskeychar(int c)
+{
+	return isalnum(c) || c == '-';
+}
+
+static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
+{
+	int c;
+	char *value;
+
+	/* Get the full name */
+	for (;;) {
+		c = get_next_char();
+		if (config_file_eof)
+			break;
+		if (!iskeychar(c))
+			break;
+		name[len++] = tolower(c);
+		if (len >= MAXNAME)
+			return -1;
+	}
+	name[len] = 0;
+	while (c == ' ' || c == '\t')
+		c = get_next_char();
+
+	value = NULL;
+	if (c != '\n') {
+		if (c != '=')
+			return -1;
+		value = parse_value();
+		if (!value)
+			return -1;
+	}
+	return fn(name, value, data);
+}
+
+static int get_extended_base_var(char *name, int baselen, int c)
+{
+	do {
+		if (c == '\n')
+			return -1;
+		c = get_next_char();
+	} while (isspace(c));
+
+	/* We require the format to be '[base "extension"]' */
+	if (c != '"')
+		return -1;
+	name[baselen++] = '.';
+
+	for (;;) {
+		int c = get_next_char();
+		if (c == '\n')
+			return -1;
+		if (c == '"')
+			break;
+		if (c == '\\') {
+			c = get_next_char();
+			if (c == '\n')
+				return -1;
+		}
+		name[baselen++] = c;
+		if (baselen > MAXNAME / 2)
+			return -1;
+	}
+
+	/* Final ']' */
+	if (get_next_char() != ']')
+		return -1;
+	return baselen;
+}
+
+static int get_base_var(char *name)
+{
+	int baselen = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (config_file_eof)
+			return -1;
+		if (c == ']')
+			return baselen;
+		if (isspace(c))
+			return get_extended_base_var(name, baselen, c);
+		if (!iskeychar(c) && c != '.')
+			return -1;
+		if (baselen > MAXNAME / 2)
+			return -1;
+		name[baselen++] = tolower(c);
+	}
+}
+
+static int perf_parse_file(config_fn_t fn, void *data)
+{
+	int comment = 0;
+	int baselen = 0;
+	static char var[MAXNAME];
+
+	/* U+FEFF Byte Order Mark in UTF8 */
+	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
+	const unsigned char *bomptr = utf8_bom;
+
+	for (;;) {
+		int c = get_next_char();
+		if (bomptr && *bomptr) {
+			/* We are at the file beginning; skip UTF8-encoded BOM
+			 * if present. Sane editors won't put this in on their
+			 * own, but e.g. Windows Notepad will do it happily. */
+			if ((unsigned char) c == *bomptr) {
+				bomptr++;
+				continue;
+			} else {
+				/* Do not tolerate partial BOM. */
+				if (bomptr != utf8_bom)
+					break;
+				/* No BOM at file beginning. Cool. */
+				bomptr = NULL;
+			}
+		}
+		if (c == '\n') {
+			if (config_file_eof)
+				return 0;
+			comment = 0;
+			continue;
+		}
+		if (comment || isspace(c))
+			continue;
+		if (c == '#' || c == ';') {
+			comment = 1;
+			continue;
+		}
+		if (c == '[') {
+			baselen = get_base_var(var);
+			if (baselen <= 0)
+				break;
+			var[baselen++] = '.';
+			var[baselen] = 0;
+			continue;
+		}
+		if (!isalpha(c))
+			break;
+		var[baselen] = tolower(c);
+		if (get_value(fn, data, var, baselen+1) < 0)
+			break;
+	}
+	die("bad config file line %d in %s", config_linenr, config_file_name);
+}
+
+static int parse_unit_factor(const char *end, unsigned long *val)
+{
+	if (!*end)
+		return 1;
+	else if (!strcasecmp(end, "k")) {
+		*val *= 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "m")) {
+		*val *= 1024 * 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "g")) {
+		*val *= 1024 * 1024 * 1024;
+		return 1;
+	}
+	return 0;
+}
+
+static int perf_parse_long(const char *value, long *ret)
+{
+	if (value && *value) {
+		char *end;
+		long val = strtol(value, &end, 0);
+		unsigned long factor = 1;
+		if (!parse_unit_factor(end, &factor))
+			return 0;
+		*ret = val * factor;
+		return 1;
+	}
+	return 0;
+}
+
+int perf_parse_ulong(const char *value, unsigned long *ret)
+{
+	if (value && *value) {
+		char *end;
+		unsigned long val = strtoul(value, &end, 0);
+		if (!parse_unit_factor(end, &val))
+			return 0;
+		*ret = val;
+		return 1;
+	}
+	return 0;
+}
+
+static void die_bad_config(const char *name)
+{
+	if (config_file_name)
+		die("bad config value for '%s' in %s", name, config_file_name);
+	die("bad config value for '%s'", name);
+}
+
+int perf_config_int(const char *name, const char *value)
+{
+	long ret = 0;
+	if (!perf_parse_long(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+unsigned long perf_config_ulong(const char *name, const char *value)
+{
+	unsigned long ret;
+	if (!perf_parse_ulong(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
+{
+	*is_bool = 1;
+	if (!value)
+		return 1;
+	if (!*value)
+		return 0;
+	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
+		return 1;
+	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
+		return 0;
+	*is_bool = 0;
+	return perf_config_int(name, value);
+}
+
+int perf_config_bool(const char *name, const char *value)
+{
+	int discard;
+	return !!perf_config_bool_or_int(name, value, &discard);
+}
+
+int perf_config_string(const char **dest, const char *var, const char *value)
+{
+	if (!value)
+		return config_error_nonbool(var);
+	*dest = strdup(value);
+	return 0;
+}
+
+static int perf_default_core_config(const char *var, const char *value)
+{
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_default_config(const char *var, const char *value, void *dummy)
+{
+	if (!prefixcmp(var, "core."))
+		return perf_default_core_config(var, value);
+
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
+{
+	int ret;
+	FILE *f = fopen(filename, "r");
+
+	ret = -1;
+	if (f) {
+		config_file = f;
+		config_file_name = filename;
+		config_linenr = 1;
+		config_file_eof = 0;
+		ret = perf_parse_file(fn, data);
+		fclose(f);
+		config_file_name = NULL;
+	}
+	return ret;
+}
+
+const char *perf_etc_perfconfig(void)
+{
+	static const char *system_wide;
+	if (!system_wide)
+		system_wide = system_path(ETC_PERFCONFIG);
+	return system_wide;
+}
+
+static int perf_env_bool(const char *k, int def)
+{
+	const char *v = getenv(k);
+	return v ? perf_config_bool(k, v) : def;
+}
+
+int perf_config_system(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
+}
+
+int perf_config_global(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
+}
+
+int perf_config(config_fn_t fn, void *data)
+{
+	int ret = 0, found = 0;
+	char *repo_config = NULL;
+	const char *home = NULL;
+
+	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+	if (config_exclusive_filename)
+		return perf_config_from_file(fn, config_exclusive_filename, data);
+	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
+					    data);
+		found += 1;
+	}
+
+	home = getenv("HOME");
+	if (perf_config_global() && home) {
+		char *user_config = strdup(mkpath("%s/.perfconfig", home));
+		if (!access(user_config, R_OK)) {
+			ret += perf_config_from_file(fn, user_config, data);
+			found += 1;
+		}
+		free(user_config);
+	}
+
+	repo_config = perf_pathdup("config");
+	if (!access(repo_config, R_OK)) {
+		ret += perf_config_from_file(fn, repo_config, data);
+		found += 1;
+	}
+	free(repo_config);
+	if (found == 0)
+		return -1;
+	return ret;
+}
+
+/*
+ * Find all the stuff for perf_config_set() below.
+ */
+
+#define MAX_MATCHES 512
+
+static struct {
+	int baselen;
+	char* key;
+	int do_not_match;
+	regex_t* value_regex;
+	int multi_replace;
+	size_t offset[MAX_MATCHES];
+	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
+	int seen;
+} store;
+
+static int matches(const char* key, const char* value)
+{
+	return !strcmp(key, store.key) &&
+		(store.value_regex == NULL ||
+		 (store.do_not_match ^
+		  !regexec(store.value_regex, value, 0, NULL, 0)));
+}
+
+static int store_aux(const char* key, const char* value, void *cb)
+{
+	const char *ep;
+	size_t section_len;
+
+	switch (store.state) {
+	case KEY_SEEN:
+		if (matches(key, value)) {
+			if (store.seen == 1 && store.multi_replace == 0) {
+				warning("%s has multiple values", key);
+			} else if (store.seen >= MAX_MATCHES) {
+				error("too many matches for %s", key);
+				return 1;
+			}
+
+			store.offset[store.seen] = ftell(config_file);
+			store.seen++;
+		}
+		break;
+	case SECTION_SEEN:
+		/*
+		 * What we are looking for is in store.key (both
+		 * section and var), and its section part is baselen
+		 * long.  We found key (again, both section and var).
+		 * We would want to know if this key is in the same
+		 * section as what we are looking for.  We already
+		 * know we are in the same section as what should
+		 * hold store.key.
+		 */
+		ep = strrchr(key, '.');
+		section_len = ep - key;
+
+		if ((section_len != store.baselen) ||
+		    memcmp(key, store.key, section_len+1)) {
+			store.state = SECTION_END_SEEN;
+			break;
+		}
+
+		/*
+		 * Do not increment matches: this is no match, but we
+		 * just made sure we are in the desired section.
+		 */
+		store.offset[store.seen] = ftell(config_file);
+		/* fallthru */
+	case SECTION_END_SEEN:
+	case START:
+		if (matches(key, value)) {
+			store.offset[store.seen] = ftell(config_file);
+			store.state = KEY_SEEN;
+			store.seen++;
+		} else {
+			if (strrchr(key, '.') - key == store.baselen &&
+			      !strncmp(key, store.key, store.baselen)) {
+					store.state = SECTION_SEEN;
+					store.offset[store.seen] = ftell(config_file);
+			}
+		}
+	}
+	return 0;
+}
+
+static int write_error(const char *filename)
+{
+	error("failed to write new configuration file %s", filename);
+
+	/* Same error code as "failed to rename". */
+	return 4;
+}
+
+static int store_write_section(int fd, const char* key)
+{
+	const char *dot;
+	int i, success;
+	struct strbuf sb = STRBUF_INIT;
+
+	dot = memchr(key, '.', store.baselen);
+	if (dot) {
+		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
+		for (i = dot - key + 1; i < store.baselen; i++) {
+			if (key[i] == '"' || key[i] == '\\')
+				strbuf_addch(&sb, '\\');
+			strbuf_addch(&sb, key[i]);
+		}
+		strbuf_addstr(&sb, "\"]\n");
+	} else {
+		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
+	}
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static int store_write_pair(int fd, const char* key, const char* value)
+{
+	int i, success;
+	int length = strlen(key + store.baselen + 1);
+	const char *quote = "";
+	struct strbuf sb = STRBUF_INIT;
+
+	/*
+	 * Check to see if the value needs to be surrounded with a dq pair.
+	 * Note that problematic characters are always backslash-quoted; this
+	 * check is about not losing leading or trailing SP and strings that
+	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
+	 * configuration parser.
+	 */
+	if (value[0] == ' ')
+		quote = "\"";
+	for (i = 0; value[i]; i++)
+		if (value[i] == ';' || value[i] == '#')
+			quote = "\"";
+	if (i && value[i - 1] == ' ')
+		quote = "\"";
+
+	strbuf_addf(&sb, "\t%.*s = %s",
+		    length, key + store.baselen + 1, quote);
+
+	for (i = 0; value[i]; i++)
+		switch (value[i]) {
+		case '\n':
+			strbuf_addstr(&sb, "\\n");
+			break;
+		case '\t':
+			strbuf_addstr(&sb, "\\t");
+			break;
+		case '"':
+		case '\\':
+			strbuf_addch(&sb, '\\');
+		default:
+			strbuf_addch(&sb, value[i]);
+			break;
+		}
+	strbuf_addf(&sb, "%s\n", quote);
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static ssize_t find_beginning_of_line(const char* contents, size_t size,
+	size_t offset_, int* found_bracket)
+{
+	size_t equal_offset = size, bracket_offset = size;
+	ssize_t offset;
+
+contline:
+	for (offset = offset_-2; offset > 0
+			&& contents[offset] != '\n'; offset--)
+		switch (contents[offset]) {
+			case '=': equal_offset = offset; break;
+			case ']': bracket_offset = offset; break;
+		}
+	if (offset > 0 && contents[offset-1] == '\\') {
+		offset_ = offset;
+		goto contline;
+	}
+	if (bracket_offset < equal_offset) {
+		*found_bracket = 1;
+		offset = bracket_offset+1;
+	} else
+		offset++;
+
+	return offset;
+}
+
+int perf_config_set(const char* key, const char* value)
+{
+	return perf_config_set_multivar(key, value, NULL, 0);
+}
+
+/*
+ * If value==NULL, unset in (remove from) config,
+ * if value_regex!=NULL, disregard key/value pairs where value does not match.
+ * if multi_replace==0, nothing, or only one matching key/value is replaced,
+ *     else all matching key/values (regardless how many) are removed,
+ *     before the new pair is written.
+ *
+ * Returns 0 on success.
+ *
+ * This function does this:
+ *
+ * - it locks the config file by creating ".perf/config.lock"
+ *
+ * - it then parses the config using store_aux() as validator to find
+ *   the position on the key/value pair to replace. If it is to be unset,
+ *   it must be found exactly once.
+ *
+ * - the config file is mmap()ed and the part before the match (if any) is
+ *   written to the lock file, then the changed part and the rest.
+ *
+ * - the config file is removed and the lock file rename()d to it.
+ *
+ */
+int perf_config_set_multivar(const char* key, const char* value,
+	const char* value_regex, int multi_replace)
+{
+	int i, dot;
+	int fd = -1, in_fd;
+	int ret;
+	char* config_filename;
+	const char* last_dot = strrchr(key, '.');
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+
+	/*
+	 * Since "key" actually contains the section name and the real
+	 * key name separated by a dot, we have to know where the dot is.
+	 */
+
+	if (last_dot == NULL) {
+		error("key does not contain a section: %s", key);
+		ret = 2;
+		goto out_free;
+	}
+	store.baselen = last_dot - key;
+
+	store.multi_replace = multi_replace;
+
+	/*
+	 * Validate the key and while at it, lower case it for matching.
+	 */
+	store.key = malloc(strlen(key) + 1);
+	dot = 0;
+	for (i = 0; key[i]; i++) {
+		unsigned char c = key[i];
+		if (c == '.')
+			dot = 1;
+		/* Leave the extended basename untouched.. */
+		if (!dot || i > store.baselen) {
+			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
+				error("invalid key: %s", key);
+				free(store.key);
+				ret = 1;
+				goto out_free;
+			}
+			c = tolower(c);
+		} else if (c == '\n') {
+			error("invalid key (newline): %s", key);
+			free(store.key);
+			ret = 1;
+			goto out_free;
+		}
+		store.key[i] = c;
+	}
+	store.key[i] = 0;
+
+	/*
+	 * If .perf/config does not exist yet, write a minimal version.
+	 */
+	in_fd = open(config_filename, O_RDONLY);
+	if ( in_fd < 0 ) {
+		free(store.key);
+
+		if ( ENOENT != errno ) {
+			error("opening %s: %s", config_filename,
+			      strerror(errno));
+			ret = 3; /* same as "invalid config file" */
+			goto out_free;
+		}
+		/* if nothing to unset, error out */
+		if (value == NULL) {
+			ret = 5;
+			goto out_free;
+		}
+
+		store.key = (char*)key;
+		if (!store_write_section(fd, key) ||
+		    !store_write_pair(fd, key, value))
+			goto write_err_out;
+	} else {
+		struct stat st;
+		char* contents;
+		size_t contents_sz, copy_begin, copy_end;
+		int i, new_line = 0;
+
+		if (value_regex == NULL)
+			store.value_regex = NULL;
+		else {
+			if (value_regex[0] == '!') {
+				store.do_not_match = 1;
+				value_regex++;
+			} else
+				store.do_not_match = 0;
+
+			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
+			if (regcomp(store.value_regex, value_regex,
+					REG_EXTENDED)) {
+				error("invalid pattern: %s", value_regex);
+				free(store.value_regex);
+				ret = 6;
+				goto out_free;
+			}
+		}
+
+		store.offset[0] = 0;
+		store.state = START;
+		store.seen = 0;
+
+		/*
+		 * After this, store.offset will contain the *end* offset
+		 * of the last match, or remain at 0 if no match was found.
+		 * As a side effect, we make sure to transform only a valid
+		 * existing config file.
+		 */
+		if (perf_config_from_file(store_aux, config_filename, NULL)) {
+			error("invalid config file %s", config_filename);
+			free(store.key);
+			if (store.value_regex != NULL) {
+				regfree(store.value_regex);
+				free(store.value_regex);
+			}
+			ret = 3;
+			goto out_free;
+		}
+
+		free(store.key);
+		if (store.value_regex != NULL) {
+			regfree(store.value_regex);
+			free(store.value_regex);
+		}
+
+		/* if nothing to unset, or too many matches, error out */
+		if ((store.seen == 0 && value == NULL) ||
+				(store.seen > 1 && multi_replace == 0)) {
+			ret = 5;
+			goto out_free;
+		}
+
+		fstat(in_fd, &st);
+		contents_sz = xsize_t(st.st_size);
+		contents = mmap(NULL, contents_sz, PROT_READ,
+			MAP_PRIVATE, in_fd, 0);
+		close(in_fd);
+
+		if (store.seen == 0)
+			store.seen = 1;
+
+		for (i = 0, copy_begin = 0; i < store.seen; i++) {
+			if (store.offset[i] == 0) {
+				store.offset[i] = copy_end = contents_sz;
+			} else if (store.state != KEY_SEEN) {
+				copy_end = store.offset[i];
+			} else
+				copy_end = find_beginning_of_line(
+					contents, contents_sz,
+					store.offset[i]-2, &new_line);
+
+			if (copy_end > 0 && contents[copy_end-1] != '\n')
+				new_line = 1;
+
+			/* write the first part of the config */
+			if (copy_end > copy_begin) {
+				if (write_in_full(fd, contents + copy_begin,
+						  copy_end - copy_begin) <
+				    copy_end - copy_begin)
+					goto write_err_out;
+				if (new_line &&
+				    write_in_full(fd, "\n", 1) != 1)
+					goto write_err_out;
+			}
+			copy_begin = store.offset[i];
+		}
+
+		/* write the pair (value == NULL means unset) */
+		if (value != NULL) {
+			if (store.state == START) {
+				if (!store_write_section(fd, key))
+					goto write_err_out;
+			}
+			if (!store_write_pair(fd, key, value))
+				goto write_err_out;
+		}
+
+		/* write the rest of the config */
+		if (copy_begin < contents_sz)
+			if (write_in_full(fd, contents + copy_begin,
+					  contents_sz - copy_begin) <
+			    contents_sz - copy_begin)
+				goto write_err_out;
+
+		munmap(contents, contents_sz);
+	}
+
+	ret = 0;
+
+out_free:
+	free(config_filename);
+	return ret;
+
+write_err_out:
+	goto out_free;
+
+}
+
+static int section_name_match (const char *buf, const char *name)
+{
+	int i = 0, j = 0, dot = 0;
+	for (; buf[i] && buf[i] != ']'; i++) {
+		if (!dot && isspace(buf[i])) {
+			dot = 1;
+			if (name[j++] != '.')
+				break;
+			for (i++; isspace(buf[i]); i++)
+				; /* do nothing */
+			if (buf[i] != '"')
+				break;
+			continue;
+		}
+		if (buf[i] == '\\' && dot)
+			i++;
+		else if (buf[i] == '"' && dot) {
+			for (i++; isspace(buf[i]); i++)
+				; /* do_nothing */
+			break;
+		}
+		if (buf[i] != name[j++])
+			break;
+	}
+	return (buf[i] == ']' && name[j] == 0);
+}
+
+/* if new_name == NULL, the section is removed instead */
+int perf_config_rename_section(const char *old_name, const char *new_name)
+{
+	int ret = 0, remove = 0;
+	char *config_filename;
+	int out_fd;
+	char buf[1024];
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+	if (out_fd < 0) {
+		ret = error("could not lock config file %s", config_filename);
+		goto out;
+	}
+
+	if (!(config_file = fopen(config_filename, "rb"))) {
+		/* no config file means nothing to rename, no error */
+		goto unlock_and_out;
+	}
+
+	while (fgets(buf, sizeof(buf), config_file)) {
+		int i;
+		int length;
+		for (i = 0; buf[i] && isspace(buf[i]); i++)
+			; /* do nothing */
+		if (buf[i] == '[') {
+			/* it's a section */
+			if (section_name_match (&buf[i+1], old_name)) {
+				ret++;
+				if (new_name == NULL) {
+					remove = 1;
+					continue;
+				}
+				store.baselen = strlen(new_name);
+				if (!store_write_section(out_fd, new_name)) {
+					goto out;
+				}
+				continue;
+			}
+			remove = 0;
+		}
+		if (remove)
+			continue;
+		length = strlen(buf);
+		if (write_in_full(out_fd, buf, length) != length) {
+			goto out;
+		}
+	}
+	fclose(config_file);
+ unlock_and_out:
+ out:
+	free(config_filename);
+	return ret;
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+	return error("Missing value for '%s'", var);
+}
diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/ctype.c
new file mode 100644
index 0000000..b90ec00
--- /dev/null
+++ b/Documentation/perf_counter/ctype.c
@@ -0,0 +1,26 @@
+/*
+ * Sane locale-independent, ASCII ctype.
+ *
+ * No surprises, and works with signed and unsigned chars.
+ */
+#include "cache.h"
+
+enum {
+	S = GIT_SPACE,
+	A = GIT_ALPHA,
+	D = GIT_DIGIT,
+	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
+	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
+};
+
+unsigned char sane_ctype[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
+	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
+	/* Nothing in the 128.. range */
+};
diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/exec_cmd.c
new file mode 100644
index 0000000..d392922
--- /dev/null
+++ b/Documentation/perf_counter/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS	32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+	static const char *prefix;
+#else
+	static const char *prefix = PREFIX;
+#endif
+	struct strbuf d = STRBUF_INIT;
+
+	if (is_absolute_path(path))
+		return path;
+
+#ifdef RUNTIME_PREFIX
+	assert(argv0_path);
+	assert(is_absolute_path(argv0_path));
+
+	if (!prefix &&
+	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+		prefix = PREFIX;
+		fprintf(stderr, "RUNTIME_PREFIX requested, "
+				"but prefix computation failed.  "
+				"Using static fallback '%s'.\n", prefix);
+	}
+#endif
+
+	strbuf_addf(&d, "%s/%s", prefix, path);
+	path = strbuf_detach(&d, NULL);
+	return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return slash + 1;
+	}
+
+	return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority, location to look for perf programs. */
+const char *perf_exec_path(void)
+{
+	const char *env;
+
+	if (argv_exec_path)
+		return argv_exec_path;
+
+	env = getenv(EXEC_PATH_ENVIRONMENT);
+	if (env && *env) {
+		return env;
+	}
+
+	return system_path(PERF_EXEC_PATH);
+}
+
+static void add_path(struct strbuf *out, const char *path)
+{
+	if (path && *path) {
+		if (is_absolute_path(path))
+			strbuf_addstr(out, path);
+		else
+			strbuf_addstr(out, make_nonrelative_path(path));
+
+		strbuf_addch(out, PATH_SEP);
+	}
+}
+
+void setup_path(void)
+{
+	const char *old_path = getenv("PATH");
+	struct strbuf new_path = STRBUF_INIT;
+
+	add_path(&new_path, perf_exec_path());
+	add_path(&new_path, argv0_path);
+
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+	else
+		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+	setenv("PATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+const char **prepare_perf_cmd(const char **argv)
+{
+	int argc;
+	const char **nargv;
+
+	for (argc = 0; argv[argc]; argc++)
+		; /* just counting */
+	nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+	nargv[0] = "perf";
+	for (argc = 0; argv[argc]; argc++)
+		nargv[argc + 1] = argv[argc];
+	nargv[argc + 1] = NULL;
+	return nargv;
+}
+
+int execv_perf_cmd(const char **argv) {
+	const char **nargv = prepare_perf_cmd(argv);
+
+	/* execvp() can only ever return if it fails */
+	execvp("perf", (char **)nargv);
+
+	free(nargv);
+	return -1;
+}
+
+
+int execl_perf_cmd(const char *cmd,...)
+{
+	int argc;
+	const char *argv[MAX_ARGS + 1];
+	const char *arg;
+	va_list param;
+
+	va_start(param, cmd);
+	argv[0] = cmd;
+	argc = 1;
+	while (argc < MAX_ARGS) {
+		arg = argv[argc++] = va_arg(param, char *);
+		if (!arg)
+			break;
+	}
+	va_end(param);
+	if (MAX_ARGS <= argc)
+		return error("too many args to run %s", cmd);
+
+	argv[argc] = NULL;
+	return execv_perf_cmd(argv);
+}
diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/exec_cmd.h
new file mode 100644
index 0000000..effe25e
--- /dev/null
+++ b/Documentation/perf_counter/exec_cmd.h
@@ -0,0 +1,13 @@
+#ifndef PERF_EXEC_CMD_H
+#define PERF_EXEC_CMD_H
+
+extern void perf_set_argv_exec_path(const char *exec_path);
+extern const char *perf_extract_argv0_path(const char *path);
+extern const char *perf_exec_path(void);
+extern void setup_path(void);
+extern const char **prepare_perf_cmd(const char **argv);
+extern int execv_perf_cmd(const char **argv); /* NULL terminated */
+extern int execl_perf_cmd(const char *cmd, ...);
+extern const char *system_path(const char *path);
+
+#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh
new file mode 100755
index 0000000..75c68d9
--- /dev/null
+++ b/Documentation/perf_counter/generate-cmdlist.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n -e 's/^git-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/git-'"$cmd"'/H
+     ${
+            x
+            s/.*git-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+	    p
+     }' "Documentation/git-$cmd.txt"
+done
+echo "};"
diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/help.c
new file mode 100644
index 0000000..ec01167
--- /dev/null
+++ b/Documentation/perf_counter/help.c
@@ -0,0 +1,366 @@
+#include "cache.h"
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "levenshtein.h"
+#include "help.h"
+
+/* most GUI terminals set COLUMNS (although some don't export it) */
+static int term_columns(void)
+{
+	char *col_string = getenv("COLUMNS");
+	int n_cols;
+
+	if (col_string && (n_cols = atoi(col_string)) > 0)
+		return n_cols;
+
+#ifdef TIOCGWINSZ
+	{
+		struct winsize ws;
+		if (!ioctl(1, TIOCGWINSZ, &ws)) {
+			if (ws.ws_col)
+				return ws.ws_col;
+		}
+	}
+#endif
+
+	return 80;
+}
+
+void add_cmdname(struct cmdnames *cmds, const char *name, int len)
+{
+	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
+
+	ent->len = len;
+	memcpy(ent->name, name, len);
+	ent->name[len] = 0;
+
+	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
+	cmds->names[cmds->cnt++] = ent;
+}
+
+static void clean_cmdnames(struct cmdnames *cmds)
+{
+	int i;
+	for (i = 0; i < cmds->cnt; ++i)
+		free(cmds->names[i]);
+	free(cmds->names);
+	cmds->cnt = 0;
+	cmds->alloc = 0;
+}
+
+static int cmdname_compare(const void *a_, const void *b_)
+{
+	struct cmdname *a = *(struct cmdname **)a_;
+	struct cmdname *b = *(struct cmdname **)b_;
+	return strcmp(a->name, b->name);
+}
+
+static void uniq(struct cmdnames *cmds)
+{
+	int i, j;
+
+	if (!cmds->cnt)
+		return;
+
+	for (i = j = 1; i < cmds->cnt; i++)
+		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
+			cmds->names[j++] = cmds->names[i];
+
+	cmds->cnt = j;
+}
+
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
+{
+	int ci, cj, ei;
+	int cmp;
+
+	ci = cj = ei = 0;
+	while (ci < cmds->cnt && ei < excludes->cnt) {
+		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
+		if (cmp < 0)
+			cmds->names[cj++] = cmds->names[ci++];
+		else if (cmp == 0)
+			ci++, ei++;
+		else if (cmp > 0)
+			ei++;
+	}
+
+	while (ci < cmds->cnt)
+		cmds->names[cj++] = cmds->names[ci++];
+
+	cmds->cnt = cj;
+}
+
+static void pretty_print_string_list(struct cmdnames *cmds, int longest)
+{
+	int cols = 1, rows;
+	int space = longest + 1; /* min 1 SP between words */
+	int max_cols = term_columns() - 1; /* don't print *on* the edge */
+	int i, j;
+
+	if (space < max_cols)
+		cols = max_cols / space;
+	rows = (cmds->cnt + cols - 1) / cols;
+
+	for (i = 0; i < rows; i++) {
+		printf("  ");
+
+		for (j = 0; j < cols; j++) {
+			int n = j * rows + i;
+			int size = space;
+			if (n >= cmds->cnt)
+				break;
+			if (j == cols-1 || n + rows >= cmds->cnt)
+				size = 1;
+			printf("%-*s", size, cmds->names[n]->name);
+		}
+		putchar('\n');
+	}
+}
+
+static int is_executable(const char *name)
+{
+	struct stat st;
+
+	if (stat(name, &st) || /* stat, not lstat */
+	    !S_ISREG(st.st_mode))
+		return 0;
+
+#ifdef __MINGW32__
+	/* cannot trust the executable bit, peek into the file instead */
+	char buf[3] = { 0 };
+	int n;
+	int fd = open(name, O_RDONLY);
+	st.st_mode &= ~S_IXUSR;
+	if (fd >= 0) {
+		n = read(fd, buf, 2);
+		if (n == 2)
+			/* DOS executables start with "MZ" */
+			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
+				st.st_mode |= S_IXUSR;
+		close(fd);
+	}
+#endif
+	return st.st_mode & S_IXUSR;
+}
+
+static void list_commands_in_dir(struct cmdnames *cmds,
+					 const char *path,
+					 const char *prefix)
+{
+	int prefix_len;
+	DIR *dir = opendir(path);
+	struct dirent *de;
+	struct strbuf buf = STRBUF_INIT;
+	int len;
+
+	if (!dir)
+		return;
+	if (!prefix)
+		prefix = "perf-";
+	prefix_len = strlen(prefix);
+
+	strbuf_addf(&buf, "%s/", path);
+	len = buf.len;
+
+	while ((de = readdir(dir)) != NULL) {
+		int entlen;
+
+		if (prefixcmp(de->d_name, prefix))
+			continue;
+
+		strbuf_setlen(&buf, len);
+		strbuf_addstr(&buf, de->d_name);
+		if (!is_executable(buf.buf))
+			continue;
+
+		entlen = strlen(de->d_name) - prefix_len;
+		if (has_extension(de->d_name, ".exe"))
+			entlen -= 4;
+
+		add_cmdname(cmds, de->d_name + prefix_len, entlen);
+	}
+	closedir(dir);
+	strbuf_release(&buf);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds)
+{
+	const char *env_path = getenv("PATH");
+	const char *exec_path = perf_exec_path();
+
+	if (exec_path) {
+		list_commands_in_dir(main_cmds, exec_path, prefix);
+		qsort(main_cmds->names, main_cmds->cnt,
+		      sizeof(*main_cmds->names), cmdname_compare);
+		uniq(main_cmds);
+	}
+
+	if (env_path) {
+		char *paths, *path, *colon;
+		path = paths = strdup(env_path);
+		while (1) {
+			if ((colon = strchr(path, PATH_SEP)))
+				*colon = 0;
+			if (!exec_path || strcmp(path, exec_path))
+				list_commands_in_dir(other_cmds, path, prefix);
+
+			if (!colon)
+				break;
+			path = colon + 1;
+		}
+		free(paths);
+
+		qsort(other_cmds->names, other_cmds->cnt,
+		      sizeof(*other_cmds->names), cmdname_compare);
+		uniq(other_cmds);
+	}
+	exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < main_cmds->cnt; i++)
+		if (longest < main_cmds->names[i]->len)
+			longest = main_cmds->names[i]->len;
+	for (i = 0; i < other_cmds->cnt; i++)
+		if (longest < other_cmds->names[i]->len)
+			longest = other_cmds->names[i]->len;
+
+	if (main_cmds->cnt) {
+		const char *exec_path = perf_exec_path();
+		printf("available %s in '%s'\n", title, exec_path);
+		printf("----------------");
+		mput_char('-', strlen(title) + strlen(exec_path));
+		putchar('\n');
+		pretty_print_string_list(main_cmds, longest);
+		putchar('\n');
+	}
+
+	if (other_cmds->cnt) {
+		printf("%s available from elsewhere on your $PATH\n", title);
+		printf("---------------------------------------");
+		mput_char('-', strlen(title));
+		putchar('\n');
+		pretty_print_string_list(other_cmds, longest);
+		putchar('\n');
+	}
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+	int i;
+	for (i = 0; i < c->cnt; i++)
+		if (!strcmp(s, c->names[i]->name))
+			return 1;
+	return 0;
+}
+
+static int autocorrect;
+static struct cmdnames aliases;
+
+static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.autocorrect"))
+		autocorrect = perf_config_int(var,value);
+	/* Also use aliases for command lookup */
+	if (!prefixcmp(var, "alias."))
+		add_cmdname(&aliases, var + 6, strlen(var + 6));
+
+	return perf_default_config(var, value, cb);
+}
+
+static int levenshtein_compare(const void *p1, const void *p2)
+{
+	const struct cmdname *const *c1 = p1, *const *c2 = p2;
+	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
+	int l1 = (*c1)->len;
+	int l2 = (*c2)->len;
+	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+	int i;
+	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+	for (i = 0; i < old->cnt; i++)
+		cmds->names[cmds->cnt++] = old->names[i];
+	free(old->names);
+	old->cnt = 0;
+	old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+	int i, n, best_similarity = 0;
+	struct cmdnames main_cmds, other_cmds;
+
+	memset(&main_cmds, 0, sizeof(main_cmds));
+	memset(&other_cmds, 0, sizeof(main_cmds));
+	memset(&aliases, 0, sizeof(aliases));
+
+	perf_config(perf_unknown_cmd_config, NULL);
+
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	add_cmd_list(&main_cmds, &aliases);
+	add_cmd_list(&main_cmds, &other_cmds);
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(main_cmds.names), cmdname_compare);
+	uniq(&main_cmds);
+
+	/* This reuses cmdname->len for similarity index */
+	for (i = 0; i < main_cmds.cnt; ++i)
+		main_cmds.names[i]->len =
+			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(*main_cmds.names), levenshtein_compare);
+
+	if (!main_cmds.cnt)
+		die ("Uh oh. Your system reports no Git commands at all.");
+
+	best_similarity = main_cmds.names[0]->len;
+	n = 1;
+	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+		++n;
+	if (autocorrect && n == 1) {
+		const char *assumed = main_cmds.names[0]->name;
+		main_cmds.names[0] = NULL;
+		clean_cmdnames(&main_cmds);
+		fprintf(stderr, "WARNING: You called a Git program named '%s', "
+			"which does not exist.\n"
+			"Continuing under the assumption that you meant '%s'\n",
+			cmd, assumed);
+		if (autocorrect > 0) {
+			fprintf(stderr, "in %0.1f seconds automatically...\n",
+				(float)autocorrect/10.0);
+			poll(NULL, 0, autocorrect * 100);
+		}
+		return assumed;
+	}
+
+	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+	if (best_similarity < 6) {
+		fprintf(stderr, "\nDid you mean %s?\n",
+			n < 2 ? "this": "one of these");
+
+		for (i = 0; i < n; i++)
+			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+	}
+
+	exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("perf version %s\n", perf_version_string);
+	return 0;
+}
diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/help.h
new file mode 100644
index 0000000..56bc154
--- /dev/null
+++ b/Documentation/perf_counter/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+	int alloc;
+	int cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[FLEX_ARRAY];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/levenshtein.c
new file mode 100644
index 0000000..e521d15
--- /dev/null
+++ b/Documentation/perf_counter/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substition, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+		int w, int s, int a, int d)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row0 = malloc(sizeof(int) * (len2 + 1));
+	int *row1 = malloc(sizeof(int) * (len2 + 1));
+	int *row2 = malloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 0; j <= len2; j++)
+		row1[j] = j * a;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = (i + 1) * d;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+			/* swap */
+			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+					string1[i] == string2[j - 1] &&
+					row2[j + 1] > row0[j - 1] + w)
+				row2[j + 1] = row0[j - 1] + w;
+			/* deletion */
+			if (row2[j + 1] > row1[j + 1] + d)
+				row2[j + 1] = row1[j + 1] + d;
+			/* insertion */
+			if (row2[j + 1] > row2[j] + a)
+				row2[j + 1] = row2[j] + a;
+		}
+
+		dummy = row0;
+		row0 = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row0);
+	free(row1);
+	free(row2);
+
+	return i;
+}
diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/levenshtein.h
new file mode 100644
index 0000000..0173abe
--- /dev/null
+++ b/Documentation/perf_counter/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+	int swap_penalty, int substition_penalty,
+	int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/parse-options.c
new file mode 100644
index 0000000..7464f34
--- /dev/null
+++ b/Documentation/perf_counter/parse-options.c
@@ -0,0 +1,495 @@
+#include "util.h"
+#include "parse-options.h"
+#include "cache.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return error("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return error("option `no-%s' %s", opt->long_name, reason);
+	return error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		   int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		     const struct option *opt, int flags)
+{
+	const char *s, *arg;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+			/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_BIT:
+		case OPTION_SET_INT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_INT:
+		*(int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags, (const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(int *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+                          const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value", flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return error("Ambiguous option: %s "
+			"(could be --%s%s or --%s%s)",
+			arg,
+			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+			ambiguous_option->long_name,
+			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+			abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		error ("did you mean `--%s` (with two dashes ?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			error ("did you mean `--%s` (with two dashes ?)", arg);
+			exit(129);
+		}
+	}
+}
+
+void parse_options_start(struct parse_opt_ctx_t *ctx,
+			 int argc, const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc - 1;
+	ctx->argv = argv + 1;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int usage_with_options_internal(const char * const *,
+				       const struct option *, int);
+
+int parse_options_step(struct parse_opt_ctx_t *ctx,
+		       const struct option *options,
+		       const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt, unknown short option leave it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr, options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr, options);
+				case -2:
+					/* fake a short option thing to hide the fact that we may have
+					 * started to parse aggregated stuff
+					 *
+					 * This is leaky, too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options, 1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		  const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			error("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			error("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+int usage_with_options_internal(const char * const *usagestr,
+				const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "   or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_INTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			else
+				pos += fprintf(stderr, " <n>");
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+			/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				else
+					pos += fprintf(stderr, " <%s>", opts->argh);
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				else
+					pos += fprintf(stderr, " ...");
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+			break;
+		}
+
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+			const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+			const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+
+/*----- some often used options -----*/
+#include "cache.h"
+
+int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
+			   int unset)
+{
+	int *target = opt->value;
+
+	if (unset)
+		/* --no-quiet, --no-verbose */
+		*target = 0;
+	else if (opt->short_name == 'v') {
+		if (*target >= 0)
+			(*target)++;
+		else
+			*target = 1;
+	} else {
+		if (*target <= 0)
+			(*target)--;
+		else
+			*target = -1;
+	}
+	return 0;
+}
diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/parse-options.h
new file mode 100644
index 0000000..a81c7fa
--- /dev/null
+++ b/Documentation/perf_counter/parse-options.h
@@ -0,0 +1,172 @@
+#ifndef PARSE_OPTIONS_H
+#define PARSE_OPTIONS_H
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN, /* _INCR would have been a better name */
+	OPTION_SET_INT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_CALLBACK,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
+	  parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option argments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremantal advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+			       const struct option *opts);
+
+extern void parse_options_start(struct parse_opt_ctx_t *ctx,
+				int argc, const char **argv, int flags);
+
+extern int parse_options_step(struct parse_opt_ctx_t *ctx,
+			      const struct option *options,
+			      const char * const usagestr[]);
+
+extern int parse_options_end(struct parse_opt_ctx_t *ctx);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var)  \
+	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+	  "use <n> digits to display SHA-1s", \
+	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+#endif
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c
new file mode 100644
index 0000000..891b612
--- /dev/null
+++ b/Documentation/perf_counter/path.c
@@ -0,0 +1,392 @@
+/*
+ * I'm tired of doing "vsnprintf()" etc just to open a
+ * file, so here's a "return static buffer with printf"
+ * interface for paths.
+ *
+ * It's obviously not thread-safe. Sue me. But it's quite
+ * useful for doing things like
+ *
+ *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
+ *
+ * which is what it's designed for.
+ */
+#include "cache.h"
+
+static char bad_path[] = "/bad-path/";
+/*
+ * Two hacks:
+ */
+
+static char *get_perf_dir(void)
+{
+	return ".";
+}
+
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
+
+
+static char *get_pathname(void)
+{
+	static char pathname_array[4][PATH_MAX];
+	static int index;
+	return pathname_array[3 & ++index];
+}
+
+static char *cleanup_path(char *path)
+{
+	/* Clean it up */
+	if (!memcmp(path, "./", 2)) {
+		path += 2;
+		while (*path == '/')
+			path++;
+	}
+	return path;
+}
+
+char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+
+	va_start(args, fmt);
+	len = vsnprintf(buf, n, fmt, args);
+	va_end(args);
+	if (len >= n) {
+		strlcpy(buf, bad_path, n);
+		return buf;
+	}
+	return cleanup_path(buf);
+}
+
+static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
+{
+	const char *perf_dir = get_perf_dir();
+	size_t len;
+
+	len = strlen(perf_dir);
+	if (n < len + 1)
+		goto bad;
+	memcpy(buf, perf_dir, len);
+	if (len && !is_dir_sep(perf_dir[len-1]))
+		buf[len++] = '/';
+	len += vsnprintf(buf + len, n - len, fmt, args);
+	if (len >= n)
+		goto bad;
+	return cleanup_path(buf);
+bad:
+	strlcpy(buf, bad_path, n);
+	return buf;
+}
+
+char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(buf, n, fmt, args);
+	va_end(args);
+	return buf;
+}
+
+char *perf_pathdup(const char *fmt, ...)
+{
+	char path[PATH_MAX];
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(path, sizeof(path), fmt, args);
+	va_end(args);
+	return xstrdup(path);
+}
+
+char *mkpath(const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+	char *pathname = get_pathname();
+
+	va_start(args, fmt);
+	len = vsnprintf(pathname, PATH_MAX, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+char *perf_path(const char *fmt, ...)
+{
+	const char *perf_dir = get_perf_dir();
+	char *pathname = get_pathname();
+	va_list args;
+	unsigned len;
+
+	len = strlen(perf_dir);
+	if (len > PATH_MAX-100)
+		return bad_path;
+	memcpy(pathname, perf_dir, len);
+	if (len && perf_dir[len-1] != '/')
+		pathname[len++] = '/';
+	va_start(args, fmt);
+	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+int perf_mkstemp(char *path, size_t len, const char *template)
+{
+	const char *tmp;
+	size_t n;
+
+	tmp = getenv("TMPDIR");
+	if (!tmp)
+		tmp = "/tmp";
+	n = snprintf(path, len, "%s/%s", tmp, template);
+	if (len <= n) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+	return mkstemp(path);
+}
+
+
+static char *user_path(char *buf, char *path, int sz)
+{
+	struct passwd *pw;
+	char *slash;
+	int len, baselen;
+
+	if (!path || path[0] != '~')
+		return NULL;
+	path++;
+	slash = strchr(path, '/');
+	if (path[0] == '/' || !path[0]) {
+		pw = getpwuid(getuid());
+	}
+	else {
+		if (slash) {
+			*slash = 0;
+			pw = getpwnam(path);
+			*slash = '/';
+		}
+		else
+			pw = getpwnam(path);
+	}
+	if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir))
+		return NULL;
+	baselen = strlen(pw->pw_dir);
+	memcpy(buf, pw->pw_dir, baselen);
+	while ((1 < baselen) && (buf[baselen-1] == '/')) {
+		buf[baselen-1] = 0;
+		baselen--;
+	}
+	if (slash && slash[1]) {
+		len = strlen(slash);
+		if (sz <= baselen + len)
+			return NULL;
+		memcpy(buf + baselen, slash, len + 1);
+	}
+	return buf;
+}
+
+const char *make_relative_path(const char *abs, const char *base)
+{
+	static char buf[PATH_MAX + 1];
+	int baselen;
+	if (!base)
+		return abs;
+	baselen = strlen(base);
+	if (prefixcmp(abs, base))
+		return abs;
+	if (abs[baselen] == '/')
+		baselen++;
+	else if (base[baselen - 1] != '/')
+		return abs;
+	strcpy(buf, abs + baselen);
+	return buf;
+}
+
+/*
+ * It is okay if dst == src, but they should not overlap otherwise.
+ *
+ * Performs the following normalizations on src, storing the result in dst:
+ * - Ensures that components are separated by '/' (Windows only)
+ * - Squashes sequences of '/'.
+ * - Removes "." components.
+ * - Removes ".." components, and the components the precede them.
+ * Returns failure (non-zero) if a ".." component appears as first path
+ * component anytime during the normalization. Otherwise, returns success (0).
+ *
+ * Note that this function is purely textual.  It does not follow symlinks,
+ * verify the existence of the path, or make any system calls.
+ */
+int normalize_path_copy(char *dst, const char *src)
+{
+	char *dst0;
+
+	if (has_dos_drive_prefix(src)) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	dst0 = dst;
+
+	if (is_dir_sep(*src)) {
+		*dst++ = '/';
+		while (is_dir_sep(*src))
+			src++;
+	}
+
+	for (;;) {
+		char c = *src;
+
+		/*
+		 * A path component that begins with . could be
+		 * special:
+		 * (1) "." and ends   -- ignore and terminate.
+		 * (2) "./"           -- ignore them, eat slash and continue.
+		 * (3) ".." and ends  -- strip one and terminate.
+		 * (4) "../"          -- strip one, eat slash and continue.
+		 */
+		if (c == '.') {
+			if (!src[1]) {
+				/* (1) */
+				src++;
+			} else if (is_dir_sep(src[1])) {
+				/* (2) */
+				src += 2;
+				while (is_dir_sep(*src))
+					src++;
+				continue;
+			} else if (src[1] == '.') {
+				if (!src[2]) {
+					/* (3) */
+					src += 2;
+					goto up_one;
+				} else if (is_dir_sep(src[2])) {
+					/* (4) */
+					src += 3;
+					while (is_dir_sep(*src))
+						src++;
+					goto up_one;
+				}
+			}
+		}
+
+		/* copy up to the next '/', and eat all '/' */
+		while ((c = *src++) != '\0' && !is_dir_sep(c))
+			*dst++ = c;
+		if (is_dir_sep(c)) {
+			*dst++ = '/';
+			while (is_dir_sep(c))
+				c = *src++;
+			src--;
+		} else if (!c)
+			break;
+		continue;
+
+	up_one:
+		/*
+		 * dst0..dst is prefix portion, and dst[-1] is '/';
+		 * go up one level.
+		 */
+		dst--;	/* go to trailing '/' */
+		if (dst <= dst0)
+			return -1;
+		/* Windows: dst[-1] cannot be backslash anymore */
+		while (dst0 < dst && dst[-1] != '/')
+			dst--;
+	}
+	*dst = '\0';
+	return 0;
+}
+
+/*
+ * path = Canonical absolute path
+ * prefix_list = Colon-separated list of absolute paths
+ *
+ * Determines, for each path in prefix_list, whether the "prefix" really
+ * is an ancestor directory of path.  Returns the length of the longest
+ * ancestor directory, excluding any trailing slashes, or -1 if no prefix
+ * is an ancestor.  (Note that this means 0 is returned if prefix_list is
+ * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
+ * are not considered to be their own ancestors.  path must be in a
+ * canonical form: empty components, or "." or ".." components are not
+ * allowed.  prefix_list may be null, which is like "".
+ */
+int longest_ancestor_length(const char *path, const char *prefix_list)
+{
+	char buf[PATH_MAX+1];
+	const char *ceil, *colon;
+	int len, max_len = -1;
+
+	if (prefix_list == NULL || !strcmp(path, "/"))
+		return -1;
+
+	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
+		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
+		len = colon - ceil;
+		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
+			continue;
+		strlcpy(buf, ceil, len+1);
+		if (normalize_path_copy(buf, buf) < 0)
+			continue;
+		len = strlen(buf);
+		if (len > 0 && buf[len-1] == '/')
+			buf[--len] = '\0';
+
+		if (!strncmp(path, buf, len) &&
+		    path[len] == '/' &&
+		    len > max_len) {
+			max_len = len;
+		}
+	}
+
+	return max_len;
+}
+
+/* strip arbitrary amount of directory separators at end of path */
+static inline int chomp_trailing_dir_sep(const char *path, int len)
+{
+	while (len && is_dir_sep(path[len - 1]))
+		len--;
+	return len;
+}
+
+/*
+ * If path ends with suffix (complete path components), returns the
+ * part before suffix (sans trailing directory separators).
+ * Otherwise returns NULL.
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+	int path_len = strlen(path), suffix_len = strlen(suffix);
+
+	while (suffix_len) {
+		if (!path_len)
+			return NULL;
+
+		if (is_dir_sep(path[path_len - 1])) {
+			if (!is_dir_sep(suffix[suffix_len - 1]))
+				return NULL;
+			path_len = chomp_trailing_dir_sep(path, path_len);
+			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+		}
+		else if (path[--path_len] != suffix[--suffix_len])
+			return NULL;
+	}
+
+	if (path_len && !is_dir_sep(path[path_len - 1]))
+		return NULL;
+	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
new file mode 100644
index 0000000..9256f6a
--- /dev/null
+++ b/Documentation/perf_counter/perf.c
@@ -0,0 +1,411 @@
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "cache.h"
+//#include "quote.h"
+#include "run-command.h"
+
+const char perf_usage_string[] =
+	"perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+	"See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+	const char *cmd;
+	int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+	struct pager_config *c = data;
+	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+		c->val = perf_config_bool(var, value);
+	return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+	struct pager_config c;
+	c.cmd = cmd;
+	c.val = -1;
+	perf_config(pager_command_config, &c);
+	return c.val;
+}
+
+static void commit_pager_choice(void) {
+	switch (use_pager) {
+	case 0:
+		setenv("PERF_PAGER", "cat", 1);
+		break;
+	case 1:
+		/* setup_pager(); */
+		break;
+	default:
+		break;
+	}
+}
+
+static int handle_options(const char*** argv, int* argc, int* envchanged)
+{
+	int handled = 0;
+
+	while (*argc > 0) {
+		const char *cmd = (*argv)[0];
+		if (cmd[0] != '-')
+			break;
+
+		/*
+		 * For legacy reasons, the "version" and "help"
+		 * commands can be written with "--" prepended
+		 * to make them look like flags.
+		 */
+		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+			break;
+
+		/*
+		 * Check remaining flags.
+		 */
+		if (!prefixcmp(cmd, "--exec-path")) {
+			cmd += 11;
+			if (*cmd == '=')
+				perf_set_argv_exec_path(cmd + 1);
+			else {
+				puts(perf_exec_path());
+				exit(0);
+			}
+		} else if (!strcmp(cmd, "--html-path")) {
+			puts(system_path(PERF_HTML_PATH));
+			exit(0);
+		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+			use_pager = 1;
+		} else if (!strcmp(cmd, "--no-pager")) {
+			use_pager = 0;
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--perf-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --perf-dir.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+			handled++;
+		} else if (!prefixcmp(cmd, "--perf-dir=")) {
+			setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--work-tree")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --work-tree.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--work-tree=")) {
+			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", cmd);
+			usage(perf_usage_string);
+		}
+
+		(*argv)++;
+		(*argc)--;
+		handled++;
+	}
+	return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+	int envchanged = 0, ret = 0, saved_errno = errno;
+	int count, option_count;
+	const char** new_argv;
+	const char *alias_command;
+	char *alias_string;
+	int unused_nonperf;
+
+	alias_command = (*argv)[0];
+	alias_string = alias_lookup(alias_command);
+	if (alias_string) {
+		if (alias_string[0] == '!') {
+			if (*argcp > 1) {
+				struct strbuf buf;
+
+				strbuf_init(&buf, PATH_MAX);
+				strbuf_addstr(&buf, alias_string);
+				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+				free(alias_string);
+				alias_string = buf.buf;
+			}
+			ret = system(alias_string + 1);
+			if (ret >= 0 && WIFEXITED(ret) &&
+			    WEXITSTATUS(ret) != 127)
+				exit(WEXITSTATUS(ret));
+			die("Failed to run '%s' when expanding alias '%s'",
+			    alias_string + 1, alias_command);
+		}
+		count = split_cmdline(alias_string, &new_argv);
+		if (count < 0)
+			die("Bad alias.%s string", alias_command);
+		option_count = handle_options(&new_argv, &count, &envchanged);
+		if (envchanged)
+			die("alias '%s' changes environment variables\n"
+				 "You can use '!perf' in the alias to do this.",
+				 alias_command);
+		memmove(new_argv - option_count, new_argv,
+				count * sizeof(char *));
+		new_argv -= option_count;
+
+		if (count < 1)
+			die("empty alias for %s", alias_command);
+
+		if (!strcmp(alias_command, new_argv[0]))
+			die("recursive alias: %s", alias_command);
+
+		new_argv = realloc(new_argv, sizeof(char*) *
+				    (count + *argcp + 1));
+		/* insert after command name */
+		memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
+		new_argv[count+*argcp] = NULL;
+
+		*argv = new_argv;
+		*argcp += count - 1;
+
+		ret = 1;
+	}
+
+	errno = saved_errno;
+
+	return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP	(1<<0)
+#define USE_PAGER	(1<<1)
+/*
+ * require working tree to be present -- anything uses this needs
+ * RUN_SETUP for reading from the configuration file.
+ */
+#define NEED_WORK_TREE	(1<<2)
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	int option;
+};
+
+static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+{
+	int status;
+	struct stat st;
+	const char *prefix;
+
+	prefix = NULL;
+	if (p->option & RUN_SETUP)
+		prefix = NULL; /* setup_perf_directory(); */
+
+	if (use_pager == -1 && p->option & RUN_SETUP)
+		use_pager = check_pager_config(p->cmd);
+	if (use_pager == -1 && p->option & USE_PAGER)
+		use_pager = 1;
+	commit_pager_choice();
+
+	if (p->option & NEED_WORK_TREE)
+		/* setup_work_tree() */;
+
+	status = p->fn(argc, argv, prefix);
+	if (status)
+		return status & 0xff;
+
+	/* Somebody closed stdout? */
+	if (fstat(fileno(stdout), &st))
+		return 0;
+	/* Ignore write errors for pipes and sockets.. */
+	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
+		return 0;
+
+	/* Check for ENOSPC and EIO errors.. */
+	if (fflush(stdout))
+		die("write failure on standard output: %s", strerror(errno));
+	if (ferror(stdout))
+		die("unknown write failure on standard output");
+	if (fclose(stdout))
+		die("close failed on standard output: %s", strerror(errno));
+	return 0;
+}
+
+static void handle_internal_command(int argc, const char **argv)
+{
+	const char *cmd = argv[0];
+	static struct cmd_struct commands[] = {
+		{ "top", cmd_top, 0 },
+	};
+	int i;
+	static const char ext[] = STRIP_EXTENSION;
+
+	if (sizeof(ext) > 1) {
+		i = strlen(argv[0]) - strlen(ext);
+		if (i > 0 && !strcmp(argv[0] + i, ext)) {
+			char *argv0 = strdup(argv[0]);
+			argv[0] = cmd = argv0;
+			argv0[i] = '\0';
+		}
+	}
+
+	/* Turn "perf cmd --help" into "perf help cmd" */
+	if (argc > 1 && !strcmp(argv[1], "--help")) {
+		argv[1] = argv[0];
+		argv[0] = cmd = "help";
+	}
+
+	for (i = 0; i < ARRAY_SIZE(commands); i++) {
+		struct cmd_struct *p = commands+i;
+		if (strcmp(p->cmd, cmd))
+			continue;
+		exit(run_builtin(p, argc, argv));
+	}
+}
+
+static void execv_dashed_external(const char **argv)
+{
+	struct strbuf cmd = STRBUF_INIT;
+	const char *tmp;
+	int status;
+
+	strbuf_addf(&cmd, "perf-%s", argv[0]);
+
+	/*
+	 * argv[0] must be the perf command, but the argv array
+	 * belongs to the caller, and may be reused in
+	 * subsequent loop iterations. Save argv[0] and
+	 * restore it on error.
+	 */
+	tmp = argv[0];
+	argv[0] = cmd.buf;
+
+	/*
+	 * if we fail because the command is not found, it is
+	 * OK to return. Otherwise, we just pass along the status code.
+	 */
+	status = run_command_v_opt(argv, 0);
+	if (status != -ERR_RUN_COMMAND_EXEC) {
+		if (IS_RUN_COMMAND_ERR(status))
+			die("unable to run '%s'", argv[0]);
+		exit(-status);
+	}
+	errno = ENOENT; /* as if we called execvp */
+
+	argv[0] = tmp;
+
+	strbuf_release(&cmd);
+}
+
+static int run_argv(int *argcp, const char ***argv)
+{
+	int done_alias = 0;
+
+	while (1) {
+		/* See if it's an internal command */
+		handle_internal_command(*argcp, *argv);
+
+		/* .. then try the external ones */
+		execv_dashed_external(*argv);
+
+		/* It could be an alias -- this works around the insanity
+		 * of overriding "perf log" with "perf show" by having
+		 * alias.log = show
+		 */
+		if (done_alias || !handle_alias(argcp, argv))
+			break;
+		done_alias = 1;
+	}
+
+	return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+	const char *cmd;
+
+	cmd = perf_extract_argv0_path(argv[0]);
+	if (!cmd)
+		cmd = "perf-help";
+
+	/*
+	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+	 *
+	 *  - cannot take flags in between the "perf" and the "xxxx".
+	 *  - cannot execute it externally (since it would just do
+	 *    the same thing over again)
+	 *
+	 * So we just directly call the internal command handler, and
+	 * die if that one cannot handle it.
+	 */
+	if (!prefixcmp(cmd, "perf-")) {
+		cmd += 4;
+		argv[0] = cmd;
+		handle_internal_command(argc, argv);
+		die("cannot handle %s internally", cmd);
+	}
+
+	/* Look for flags.. */
+	argv++;
+	argc--;
+	handle_options(&argv, &argc, NULL);
+	commit_pager_choice();
+	if (argc > 0) {
+		if (!prefixcmp(argv[0], "--"))
+			argv[0] += 2;
+	} else {
+		/* The user didn't specify a command; give them help */
+		printf("usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n%s\n", perf_more_info_string);
+		exit(1);
+	}
+	cmd = argv[0];
+
+	/*
+	 * We use PATH to find perf commands, but we prepend some higher
+	 * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
+	 * environment, and the $(perfexecdir) from the Makefile at build
+	 * time.
+	 */
+	setup_path();
+
+	while (1) {
+		static int done_help = 0;
+		static int was_alias = 0;
+		was_alias = run_argv(&argc, &argv);
+		if (errno != ENOENT)
+			break;
+		if (was_alias) {
+			fprintf(stderr, "Expansion of alias '%s' failed; "
+				"'%s' is not a perf-command\n",
+				cmd, argv[0]);
+			exit(1);
+		}
+		if (!done_help) {
+			cmd = argv[0] = help_unknown_cmd(cmd);
+			done_help = 1;
+		} else
+			break;
+	}
+
+	fprintf(stderr, "Failed to run command '%s': %s\n",
+		cmd, strerror(errno));
+
+	return 1;
+}
diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/quote.c
new file mode 100644
index 0000000..7a49fcf
--- /dev/null
+++ b/Documentation/perf_counter/quote.c
@@ -0,0 +1,478 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+	return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+	char *to_free = NULL;
+
+	if (dst->buf == src)
+		to_free = strbuf_detach(dst, NULL);
+
+	strbuf_addch(dst, '\'');
+	while (*src) {
+		size_t len = strcspn(src, "'!");
+		strbuf_add(dst, src, len);
+		src += len;
+		while (need_bs_quote(*src)) {
+			strbuf_addstr(dst, "'\\");
+			strbuf_addch(dst, *src++);
+			strbuf_addch(dst, '\'');
+		}
+	}
+	strbuf_addch(dst, '\'');
+	free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('\'', stream);
+	while ((c = *src++)) {
+		if (need_bs_quote(c)) {
+			fputs("'\\", stream);
+			fputc(c, stream);
+			fputc('\'', stream);
+		} else {
+			fputc(c, stream);
+		}
+	}
+	fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+	int i;
+
+	/* Copy into destination buffer. */
+	strbuf_grow(dst, 255);
+	for (i = 0; argv[i]; ++i) {
+		strbuf_addch(dst, ' ');
+		sq_quote_buf(dst, argv[i]);
+		if (maxlen && dst->len > maxlen)
+			die("Too many or long arguments");
+	}
+}
+
+char *sq_dequote_step(char *arg, char **next)
+{
+	char *dst = arg;
+	char *src = arg;
+	char c;
+
+	if (*src != '\'')
+		return NULL;
+	for (;;) {
+		c = *++src;
+		if (!c)
+			return NULL;
+		if (c != '\'') {
+			*dst++ = c;
+			continue;
+		}
+		/* We stepped out of sq */
+		switch (*++src) {
+		case '\0':
+			*dst = 0;
+			if (next)
+				*next = NULL;
+			return arg;
+		case '\\':
+			c = *++src;
+			if (need_bs_quote(c) && *++src == '\'') {
+				*dst++ = c;
+				continue;
+			}
+		/* Fallthrough */
+		default:
+			if (!next || !isspace(*src))
+				return NULL;
+			do {
+				c = *++src;
+			} while (isspace(c));
+			*dst = 0;
+			*next = src;
+			return arg;
+		}
+	}
+}
+
+char *sq_dequote(char *arg)
+{
+	return sq_dequote_step(arg, NULL);
+}
+
+int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
+{
+	char *next = arg;
+
+	if (!*arg)
+		return 0;
+	do {
+		char *dequoted = sq_dequote_step(next, &next);
+		if (!dequoted)
+			return -1;
+		ALLOC_GROW(*argv, *nr + 1, *alloc);
+		(*argv)[(*nr)++] = dequoted;
+	} while (next);
+
+	return 0;
+}
+
+/* 1 means: quote as octal
+ * 0 means: quote as octal if (quote_path_fully)
+ * -1 means: never quote
+ * c: quote as "\\c"
+ */
+#define X8(x)   x, x, x, x, x, x, x, x
+#define X16(x)  X8(x), X8(x)
+static signed char const sq_lookup[256] = {
+	/*           0    1    2    3    4    5    6    7 */
+	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
+	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
+	/* 0x10 */ X16(1),
+	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
+	/* 0x28 */ X16(-1), X16(-1), X16(-1),
+	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
+	/* 0x60 */ X16(-1), X8(-1),
+	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
+	/* 0x80 */ /* set to 0 */
+};
+
+static inline int sq_must_quote(char c)
+{
+	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
+}
+
+/* returns the longest prefix not needing a quote up to maxlen if positive.
+   This stops at the first \0 because it's marked as a character needing an
+   escape */
+static size_t next_quote_pos(const char *s, ssize_t maxlen)
+{
+	size_t len;
+	if (maxlen < 0) {
+		for (len = 0; !sq_must_quote(s[len]); len++);
+	} else {
+		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
+	}
+	return len;
+}
+
+/*
+ * C-style name quoting.
+ *
+ * (1) if sb and fp are both NULL, inspect the input name and counts the
+ *     number of bytes that are needed to hold c_style quoted version of name,
+ *     counting the double quotes around it but not terminating NUL, and
+ *     returns it.
+ *     However, if name does not need c_style quoting, it returns 0.
+ *
+ * (2) if sb or fp are not NULL, it emits the c_style quoted version
+ *     of name, enclosed with double quotes if asked and needed only.
+ *     Return value is the same as in (1).
+ */
+static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
+                                    struct strbuf *sb, FILE *fp, int no_dq)
+{
+#undef EMIT
+#define EMIT(c)                                 \
+	do {                                        \
+		if (sb) strbuf_addch(sb, (c));          \
+		if (fp) fputc((c), fp);                 \
+		count++;                                \
+	} while (0)
+#define EMITBUF(s, l)                           \
+	do {                                        \
+		if (sb) strbuf_add(sb, (s), (l));       \
+		if (fp) fwrite((s), (l), 1, fp);        \
+		count += (l);                           \
+	} while (0)
+
+	size_t len, count = 0;
+	const char *p = name;
+
+	for (;;) {
+		int ch;
+
+		len = next_quote_pos(p, maxlen);
+		if (len == maxlen || !p[len])
+			break;
+
+		if (!no_dq && p == name)
+			EMIT('"');
+
+		EMITBUF(p, len);
+		EMIT('\\');
+		p += len;
+		ch = (unsigned char)*p++;
+		if (sq_lookup[ch] >= ' ') {
+			EMIT(sq_lookup[ch]);
+		} else {
+			EMIT(((ch >> 6) & 03) + '0');
+			EMIT(((ch >> 3) & 07) + '0');
+			EMIT(((ch >> 0) & 07) + '0');
+		}
+	}
+
+	EMITBUF(p, len);
+	if (p == name)   /* no ending quote needed */
+		return 0;
+
+	if (!no_dq)
+		EMIT('"');
+	return count;
+}
+
+size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
+{
+	return quote_c_style_counted(name, -1, sb, fp, nodq);
+}
+
+void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
+{
+	if (quote_c_style(prefix, NULL, NULL, 0) ||
+	    quote_c_style(path, NULL, NULL, 0)) {
+		if (!nodq)
+			strbuf_addch(sb, '"');
+		quote_c_style(prefix, sb, NULL, 1);
+		quote_c_style(path, sb, NULL, 1);
+		if (!nodq)
+			strbuf_addch(sb, '"');
+	} else {
+		strbuf_addstr(sb, prefix);
+		strbuf_addstr(sb, path);
+	}
+}
+
+void write_name_quoted(const char *name, FILE *fp, int terminator)
+{
+	if (terminator) {
+		quote_c_style(name, NULL, fp, 0);
+	} else {
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *fp, int terminator)
+{
+	int needquote = 0;
+
+	if (terminator) {
+		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
+			|| name[next_quote_pos(name, -1)];
+	}
+	if (needquote) {
+		fputc('"', fp);
+		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
+		quote_c_style(name, NULL, fp, 1);
+		fputc('"', fp);
+	} else {
+		fwrite(pfx, pfxlen, 1, fp);
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix)
+{
+	int needquote;
+
+	if (len < 0)
+		len = strlen(in);
+
+	/* "../" prefix itself does not need quoting, but "in" might. */
+	needquote = next_quote_pos(in, len) < len;
+	strbuf_setlen(out, 0);
+	strbuf_grow(out, len);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (prefix) {
+		int off = 0;
+		while (prefix[off] && off < len && prefix[off] == in[off])
+			if (prefix[off] == '/') {
+				prefix += off + 1;
+				in += off + 1;
+				len -= off + 1;
+				off = 0;
+			} else
+				off++;
+
+		for (; *prefix; prefix++)
+			if (*prefix == '/')
+				strbuf_addstr(out, "../");
+	}
+
+	quote_c_style_counted (in, len, out, NULL, 1);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (!out->len)
+		strbuf_addstr(out, "./");
+
+	return out->buf;
+}
+
+/*
+ * C-style name unquoting.
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+	size_t oldlen = sb->len, len;
+	int ch, ac;
+
+	if (*quoted++ != '"')
+		return -1;
+
+	for (;;) {
+		len = strcspn(quoted, "\"\\");
+		strbuf_add(sb, quoted, len);
+		quoted += len;
+
+		switch (*quoted++) {
+		  case '"':
+			if (endp)
+				*endp = quoted;
+			return 0;
+		  case '\\':
+			break;
+		  default:
+			goto error;
+		}
+
+		switch ((ch = *quoted++)) {
+		case 'a': ch = '\a'; break;
+		case 'b': ch = '\b'; break;
+		case 'f': ch = '\f'; break;
+		case 'n': ch = '\n'; break;
+		case 'r': ch = '\r'; break;
+		case 't': ch = '\t'; break;
+		case 'v': ch = '\v'; break;
+
+		case '\\': case '"':
+			break; /* verbatim */
+
+		/* octal values with first digit over 4 overflow */
+		case '0': case '1': case '2': case '3':
+					ac = ((ch - '0') << 6);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= ((ch - '0') << 3);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= (ch - '0');
+					ch = ac;
+					break;
+				default:
+			goto error;
+			}
+		strbuf_addch(sb, ch);
+		}
+
+  error:
+	strbuf_setlen(sb, oldlen);
+	return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	const char nl = '\n';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == nl) {
+			fputc(bq, stream);
+			fputc('n', stream);
+			continue;
+		}
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('"', stream);
+	while ((c = *src++)) {
+		switch (c) {
+		case '[': case ']':
+		case '{': case '}':
+		case '$': case '\\': case '"':
+			fputc('\\', stream);
+		default:
+			fputc(c, stream);
+			break;
+		case '\f':
+			fputs("\\f", stream);
+			break;
+		case '\r':
+			fputs("\\r", stream);
+			break;
+		case '\n':
+			fputs("\\n", stream);
+			break;
+		case '\t':
+			fputs("\\t", stream);
+			break;
+		case '\v':
+			fputs("\\v", stream);
+			break;
+		}
+	}
+	fputc('"', stream);
+}
diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/quote.h
new file mode 100644
index 0000000..66730f2
--- /dev/null
+++ b/Documentation/perf_counter/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free result from
+ * sq_quote() in a real application.
+ *
+ * sq_quote_buf() writes to an existing buffer of specified size; it
+ * will return the number of characters that would have been written
+ * excluding the final null regardless of the buffer size.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/run-command.c
new file mode 100644
index 0000000..b2f5e85
--- /dev/null
+++ b/Documentation/perf_counter/run-command.c
@@ -0,0 +1,395 @@
+#include "cache.h"
+#include "run-command.h"
+#include "exec_cmd.h"
+
+static inline void close_pair(int fd[2])
+{
+	close(fd[0]);
+	close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+	int fd = open("/dev/null", O_RDWR);
+	dup2(fd, to);
+	close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+	int need_in, need_out, need_err;
+	int fdin[2], fdout[2], fderr[2];
+
+	/*
+	 * In case of errors we must keep the promise to close FDs
+	 * that have been passed in via ->in and ->out.
+	 */
+
+	need_in = !cmd->no_stdin && cmd->in < 0;
+	if (need_in) {
+		if (pipe(fdin) < 0) {
+			if (cmd->out > 0)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->in = fdin[1];
+	}
+
+	need_out = !cmd->no_stdout
+		&& !cmd->stdout_to_stderr
+		&& cmd->out < 0;
+	if (need_out) {
+		if (pipe(fdout) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->out = fdout[0];
+	}
+
+	need_err = !cmd->no_stderr && cmd->err < 0;
+	if (need_err) {
+		if (pipe(fderr) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			if (need_out)
+				close_pair(fdout);
+			else if (cmd->out)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->err = fderr[0];
+	}
+
+#ifndef __MINGW32__
+	fflush(NULL);
+	cmd->pid = fork();
+	if (!cmd->pid) {
+		if (cmd->no_stdin)
+			dup_devnull(0);
+		else if (need_in) {
+			dup2(fdin[0], 0);
+			close_pair(fdin);
+		} else if (cmd->in) {
+			dup2(cmd->in, 0);
+			close(cmd->in);
+		}
+
+		if (cmd->no_stderr)
+			dup_devnull(2);
+		else if (need_err) {
+			dup2(fderr[1], 2);
+			close_pair(fderr);
+		}
+
+		if (cmd->no_stdout)
+			dup_devnull(1);
+		else if (cmd->stdout_to_stderr)
+			dup2(2, 1);
+		else if (need_out) {
+			dup2(fdout[1], 1);
+			close_pair(fdout);
+		} else if (cmd->out > 1) {
+			dup2(cmd->out, 1);
+			close(cmd->out);
+		}
+
+		if (cmd->dir && chdir(cmd->dir))
+			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
+			    cmd->dir, strerror(errno));
+		if (cmd->env) {
+			for (; *cmd->env; cmd->env++) {
+				if (strchr(*cmd->env, '='))
+					putenv((char*)*cmd->env);
+				else
+					unsetenv(*cmd->env);
+			}
+		}
+		if (cmd->preexec_cb)
+			cmd->preexec_cb();
+		if (cmd->perf_cmd) {
+			execv_perf_cmd(cmd->argv);
+		} else {
+			execvp(cmd->argv[0], (char *const*) cmd->argv);
+		}
+		exit(127);
+	}
+#else
+	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
+	const char **sargv = cmd->argv;
+	char **env = environ;
+
+	if (cmd->no_stdin) {
+		s0 = dup(0);
+		dup_devnull(0);
+	} else if (need_in) {
+		s0 = dup(0);
+		dup2(fdin[0], 0);
+	} else if (cmd->in) {
+		s0 = dup(0);
+		dup2(cmd->in, 0);
+	}
+
+	if (cmd->no_stderr) {
+		s2 = dup(2);
+		dup_devnull(2);
+	} else if (need_err) {
+		s2 = dup(2);
+		dup2(fderr[1], 2);
+	}
+
+	if (cmd->no_stdout) {
+		s1 = dup(1);
+		dup_devnull(1);
+	} else if (cmd->stdout_to_stderr) {
+		s1 = dup(1);
+		dup2(2, 1);
+	} else if (need_out) {
+		s1 = dup(1);
+		dup2(fdout[1], 1);
+	} else if (cmd->out > 1) {
+		s1 = dup(1);
+		dup2(cmd->out, 1);
+	}
+
+	if (cmd->dir)
+		die("chdir in start_command() not implemented");
+	if (cmd->env) {
+		env = copy_environ();
+		for (; *cmd->env; cmd->env++)
+			env = env_setenv(env, *cmd->env);
+	}
+
+	if (cmd->perf_cmd) {
+		cmd->argv = prepare_perf_cmd(cmd->argv);
+	}
+
+	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
+
+	if (cmd->env)
+		free_environ(env);
+	if (cmd->perf_cmd)
+		free(cmd->argv);
+
+	cmd->argv = sargv;
+	if (s0 >= 0)
+		dup2(s0, 0), close(s0);
+	if (s1 >= 0)
+		dup2(s1, 1), close(s1);
+	if (s2 >= 0)
+		dup2(s2, 2), close(s2);
+#endif
+
+	if (cmd->pid < 0) {
+		int err = errno;
+		if (need_in)
+			close_pair(fdin);
+		else if (cmd->in)
+			close(cmd->in);
+		if (need_out)
+			close_pair(fdout);
+		else if (cmd->out)
+			close(cmd->out);
+		if (need_err)
+			close_pair(fderr);
+		return err == ENOENT ?
+			-ERR_RUN_COMMAND_EXEC :
+			-ERR_RUN_COMMAND_FORK;
+	}
+
+	if (need_in)
+		close(fdin[0]);
+	else if (cmd->in)
+		close(cmd->in);
+
+	if (need_out)
+		close(fdout[1]);
+	else if (cmd->out)
+		close(cmd->out);
+
+	if (need_err)
+		close(fderr[1]);
+
+	return 0;
+}
+
+static int wait_or_whine(pid_t pid)
+{
+	for (;;) {
+		int status, code;
+		pid_t waiting = waitpid(pid, &status, 0);
+
+		if (waiting < 0) {
+			if (errno == EINTR)
+				continue;
+			error("waitpid failed (%s)", strerror(errno));
+			return -ERR_RUN_COMMAND_WAITPID;
+		}
+		if (waiting != pid)
+			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
+		if (WIFSIGNALED(status))
+			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
+
+		if (!WIFEXITED(status))
+			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
+		code = WEXITSTATUS(status);
+		switch (code) {
+		case 127:
+			return -ERR_RUN_COMMAND_EXEC;
+		case 0:
+			return 0;
+		default:
+			return -code;
+		}
+	}
+}
+
+int finish_command(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid);
+}
+
+int run_command(struct child_process *cmd)
+{
+	int code = start_command(cmd);
+	if (code)
+		return code;
+	return finish_command(cmd);
+}
+
+static void prepare_run_command_v_opt(struct child_process *cmd,
+				      const char **argv,
+				      int opt)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->argv = argv;
+	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
+	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	return run_command(&cmd);
+}
+
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	cmd.dir = dir;
+	cmd.env = env;
+	return run_command(&cmd);
+}
+
+#ifdef __MINGW32__
+static __stdcall unsigned run_thread(void *data)
+{
+	struct async *async = data;
+	return async->proc(async->fd_for_proc, async->data);
+}
+#endif
+
+int start_async(struct async *async)
+{
+	int pipe_out[2];
+
+	if (pipe(pipe_out) < 0)
+		return error("cannot create pipe: %s", strerror(errno));
+	async->out = pipe_out[0];
+
+#ifndef __MINGW32__
+	/* Flush stdio before fork() to avoid cloning buffers */
+	fflush(NULL);
+
+	async->pid = fork();
+	if (async->pid < 0) {
+		error("fork (async) failed: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+	if (!async->pid) {
+		close(pipe_out[0]);
+		exit(!!async->proc(pipe_out[1], async->data));
+	}
+	close(pipe_out[1]);
+#else
+	async->fd_for_proc = pipe_out[1];
+	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
+	if (!async->tid) {
+		error("cannot create thread: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+int finish_async(struct async *async)
+{
+#ifndef __MINGW32__
+	int ret = 0;
+
+	if (wait_or_whine(async->pid))
+		ret = error("waitpid (async) failed");
+#else
+	DWORD ret = 0;
+	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
+		ret = error("waiting for thread failed: %lu", GetLastError());
+	else if (!GetExitCodeThread(async->tid, &ret))
+		ret = error("cannot get thread exit code: %lu", GetLastError());
+	CloseHandle(async->tid);
+#endif
+	return ret;
+}
+
+int run_hook(const char *index_file, const char *name, ...)
+{
+	struct child_process hook;
+	const char **argv = NULL, *env[2];
+	char index[PATH_MAX];
+	va_list args;
+	int ret;
+	size_t i = 0, alloc = 0;
+
+	if (access(perf_path("hooks/%s", name), X_OK) < 0)
+		return 0;
+
+	va_start(args, name);
+	ALLOC_GROW(argv, i + 1, alloc);
+	argv[i++] = perf_path("hooks/%s", name);
+	while (argv[i-1]) {
+		ALLOC_GROW(argv, i + 1, alloc);
+		argv[i++] = va_arg(args, const char *);
+	}
+	va_end(args);
+
+	memset(&hook, 0, sizeof(hook));
+	hook.argv = argv;
+	hook.no_stdin = 1;
+	hook.stdout_to_stderr = 1;
+	if (index_file) {
+		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+		env[0] = index;
+		env[1] = NULL;
+		hook.env = env;
+	}
+
+	ret = start_command(&hook);
+	free(argv);
+	if (ret) {
+		warning("Could not spawn %s", argv[0]);
+		return ret;
+	}
+	ret = finish_command(&hook);
+	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+		warning("%s exited due to uncaught signal", argv[0]);
+
+	return ret;
+}
diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/run-command.h
new file mode 100644
index 0000000..328289f
--- /dev/null
+++ b/Documentation/perf_counter/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
+ *
+ * It is expected that no synchronization and mutual exclusion between
+ * the caller and the feed function is necessary so that the function
+ * can run in a thread without interfering with the caller.
+ */
+struct async {
+	/*
+	 * proc writes to fd and closes it;
+	 * returns 0 on success, non-zero on failure
+	 */
+	int (*proc)(int fd, void *data);
+	void *data;
+	int out;	/* caller reads from here and closes it */
+#ifndef __MINGW32__
+	pid_t pid;
+#else
+	HANDLE tid;
+	int fd_for_proc;
+#endif
+};
+
+int start_async(struct async *async);
+int finish_async(struct async *async);
+
+#endif
diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/strbuf.c
new file mode 100644
index 0000000..eaba093
--- /dev/null
+++ b/Documentation/perf_counter/strbuf.c
@@ -0,0 +1,359 @@
+#include "cache.h"
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++)
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+/*
+ * Used as the default ->buf value, so that people can always assume
+ * buf is non NULL and ->buf is NUL terminated even for a freshly
+ * initialized strbuf.
+ */
+char strbuf_slopbuf[1];
+
+void strbuf_init(struct strbuf *sb, size_t hint)
+{
+	sb->alloc = sb->len = 0;
+	sb->buf = strbuf_slopbuf;
+	if (hint)
+		strbuf_grow(sb, hint);
+}
+
+void strbuf_release(struct strbuf *sb)
+{
+	if (sb->alloc) {
+		free(sb->buf);
+		strbuf_init(sb, 0);
+	}
+}
+
+char *strbuf_detach(struct strbuf *sb, size_t *sz)
+{
+	char *res = sb->alloc ? sb->buf : NULL;
+	if (sz)
+		*sz = sb->len;
+	strbuf_init(sb, 0);
+	return res;
+}
+
+void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
+{
+	strbuf_release(sb);
+	sb->buf   = buf;
+	sb->len   = len;
+	sb->alloc = alloc;
+	strbuf_grow(sb, 0);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_grow(struct strbuf *sb, size_t extra)
+{
+	if (sb->len + extra + 1 <= sb->len)
+		die("you want to use way too much memory");
+	if (!sb->alloc)
+		sb->buf = NULL;
+	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+}
+
+void strbuf_trim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+void strbuf_rtrim(struct strbuf *sb)
+{
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_ltrim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_tolower(struct strbuf *sb)
+{
+	int i;
+	for (i = 0; i < sb->len; i++)
+		sb->buf[i] = tolower(sb->buf[i]);
+}
+
+struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
+{
+	int alloc = 2, pos = 0;
+	char *n, *p;
+	struct strbuf **ret;
+	struct strbuf *t;
+
+	ret = calloc(alloc, sizeof(struct strbuf *));
+	p = n = sb->buf;
+	while (n < sb->buf + sb->len) {
+		int len;
+		n = memchr(n, delim, sb->len - (n - sb->buf));
+		if (pos + 1 >= alloc) {
+			alloc = alloc * 2;
+			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
+		}
+		if (!n)
+			n = sb->buf + sb->len - 1;
+		len = n - p + 1;
+		t = malloc(sizeof(struct strbuf));
+		strbuf_init(t, len);
+		strbuf_add(t, p, len);
+		ret[pos] = t;
+		ret[++pos] = NULL;
+		p = ++n;
+	}
+	return ret;
+}
+
+void strbuf_list_free(struct strbuf **sbs)
+{
+	struct strbuf **s = sbs;
+
+	while (*s) {
+		strbuf_release(*s);
+		free(*s++);
+	}
+	free(sbs);
+}
+
+int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
+{
+	int len = a->len < b->len ? a->len: b->len;
+	int cmp = memcmp(a->buf, b->buf, len);
+	if (cmp)
+		return cmp;
+	return a->len < b->len ? -1: a->len != b->len;
+}
+
+void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
+				   const void *data, size_t dlen)
+{
+	if (pos + len < pos)
+		die("you want to use way too much memory");
+	if (pos > sb->len)
+		die("`pos' is too far after the end of the buffer");
+	if (pos + len > sb->len)
+		die("`pos + len' is too far after the end of the buffer");
+
+	if (dlen >= len)
+		strbuf_grow(sb, dlen - len);
+	memmove(sb->buf + pos + dlen,
+			sb->buf + pos + len,
+			sb->len - pos - len);
+	memcpy(sb->buf + pos, data, dlen);
+	strbuf_setlen(sb, sb->len + dlen - len);
+}
+
+void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
+{
+	strbuf_splice(sb, pos, 0, data, len);
+}
+
+void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_splice(sb, pos, len, NULL, 0);
+}
+
+void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, data, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, sb->buf + pos, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+{
+	int len;
+	va_list ap;
+
+	if (!strbuf_avail(sb))
+		strbuf_grow(sb, 64);
+	va_start(ap, fmt);
+	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+	va_end(ap);
+	if (len < 0)
+		die("your vsnprintf is broken");
+	if (len > strbuf_avail(sb)) {
+		strbuf_grow(sb, len);
+		va_start(ap, fmt);
+		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+		va_end(ap);
+		if (len > strbuf_avail(sb)) {
+			die("this should not happen, your snprintf is broken");
+		}
+	}
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
+		   void *context)
+{
+	for (;;) {
+		const char *percent;
+		size_t consumed;
+
+		percent = strchrnul(format, '%');
+		strbuf_add(sb, format, percent - format);
+		if (!*percent)
+			break;
+		format = percent + 1;
+
+		consumed = fn(sb, format, context);
+		if (consumed)
+			format += consumed;
+		else
+			strbuf_addch(sb, '%');
+	}
+}
+
+size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
+		void *context)
+{
+	struct strbuf_expand_dict_entry *e = context;
+	size_t len;
+
+	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
+		if (!strncmp(placeholder, e->placeholder, len)) {
+			if (e->value)
+				strbuf_addstr(sb, e->value);
+			return len;
+		}
+	}
+	return 0;
+}
+
+size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
+{
+	size_t res;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, size);
+	res = fread(sb->buf + sb->len, 1, size, f);
+	if (res > 0)
+		strbuf_setlen(sb, sb->len + res);
+	else if (res < 0 && oldalloc == 0)
+		strbuf_release(sb);
+	return res;
+}
+
+ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
+{
+	size_t oldlen = sb->len;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, hint ? hint : 8192);
+	for (;;) {
+		ssize_t cnt;
+
+		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
+		if (cnt < 0) {
+			if (oldalloc == 0)
+				strbuf_release(sb);
+			else
+				strbuf_setlen(sb, oldlen);
+			return -1;
+		}
+		if (!cnt)
+			break;
+		sb->len += cnt;
+		strbuf_grow(sb, 8192);
+	}
+
+	sb->buf[sb->len] = '\0';
+	return sb->len - oldlen;
+}
+
+#define STRBUF_MAXLINK (2*PATH_MAX)
+
+int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
+{
+	size_t oldalloc = sb->alloc;
+
+	if (hint < 32)
+		hint = 32;
+
+	while (hint < STRBUF_MAXLINK) {
+		int len;
+
+		strbuf_grow(sb, hint);
+		len = readlink(path, sb->buf, hint);
+		if (len < 0) {
+			if (errno != ERANGE)
+				break;
+		} else if (len < hint) {
+			strbuf_setlen(sb, len);
+			return 0;
+		}
+
+		/* .. the buffer was too small - try again */
+		hint *= 2;
+	}
+	if (oldalloc == 0)
+		strbuf_release(sb);
+	return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+	int ch;
+
+	strbuf_grow(sb, 0);
+	if (feof(fp))
+		return EOF;
+
+	strbuf_reset(sb);
+	while ((ch = fgetc(fp)) != EOF) {
+		if (ch == term)
+			break;
+		strbuf_grow(sb, 1);
+		sb->buf[sb->len++] = ch;
+	}
+	if (ch == EOF && sb->len == 0)
+		return EOF;
+
+	sb->buf[sb->len] = '\0';
+	return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+	int fd, len;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	len = strbuf_read(sb, fd, hint);
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	return len;
+}
diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/strbuf.h
new file mode 100644
index 0000000..9ee908a
--- /dev/null
+++ b/Documentation/perf_counter/strbuf.h
@@ -0,0 +1,137 @@
+#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
+ * long, overflow safe strings.
+ *
+ * Strbufs has some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensure this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somehow a
+ *         "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+	size_t alloc;
+	size_t len;
+	char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+	struct strbuf tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
+}
+
+extern void strbuf_grow(struct strbuf *, size_t);
+
+static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
+	if (!sb->alloc)
+		strbuf_grow(sb, 0);
+	assert(len < sb->alloc);
+	sb->len = len;
+	sb->buf[len] = '\0';
+}
+#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
+
+/*----- content related -----*/
+extern void strbuf_trim(struct strbuf *);
+extern void strbuf_rtrim(struct strbuf *);
+extern void strbuf_ltrim(struct strbuf *);
+extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
+extern void strbuf_tolower(struct strbuf *);
+
+extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
+extern void strbuf_list_free(struct strbuf **);
+
+/*----- add data in your buffer -----*/
+static inline void strbuf_addch(struct strbuf *sb, int c) {
+	strbuf_grow(sb, 1);
+	sb->buf[sb->len++] = c;
+	sb->buf[sb->len] = '\0';
+}
+
+extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
+extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
+
+/* splice pos..pos+len with given data */
+extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
+                          const void *, size_t);
+
+extern void strbuf_add(struct strbuf *, const void *, size_t);
+static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
+	strbuf_add(sb, s, strlen(s));
+}
+static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
+	strbuf_add(sb, sb2->buf, sb2->len);
+}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
+
+typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
+struct strbuf_expand_dict_entry {
+	const char *placeholder;
+	const char *value;
+};
+extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
+
+__attribute__((format(printf,2,3)))
+extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+
+extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
+/* XXX: if read fails, any partial read is undone */
+extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
+extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
+extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
+
+extern int strbuf_getline(struct strbuf *, FILE *, int);
+
+extern void stripspace(struct strbuf *buf, int skip_comments);
+extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
+
+extern int strbuf_branchname(struct strbuf *sb, const char *name);
+extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
+
+#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/usage.c
new file mode 100644
index 0000000..7a10421
--- /dev/null
+++ b/Documentation/perf_counter/usage.c
@@ -0,0 +1,80 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ */
+#include "util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, "%s%s\n", prefix, msg);
+}
+
+static NORETURN void usage_builtin(const char *err)
+{
+	fprintf(stderr, "usage: %s\n", err);
+	exit(129);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report("fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report("error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report("warning: ", warn, params);
+}
+
+/* If we are in a dlopen()ed .so write to a global variable would segfault
+ * (ugh), so keep things static. */
+static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
+static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
+static void (*error_routine)(const char *err, va_list params) = error_builtin;
+static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
+
+void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
+{
+	die_routine = routine;
+}
+
+void usage(const char *err)
+{
+	usage_routine(err);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_routine(err, params);
+	va_end(params);
+}
+
+int error(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_routine(err, params);
+	va_end(params);
+	return -1;
+}
+
+void warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_routine(warn, params);
+	va_end(params);
+}
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h
new file mode 100644
index 0000000..13f8bdc
--- /dev/null
+++ b/Documentation/perf_counter/util.h
@@ -0,0 +1,394 @@
+#ifndef GIT_COMPAT_UTIL_H
+#define GIT_COMPAT_UTIL_H
+
+#define _FILE_OFFSET_BITS 64
+
+#ifndef FLEX_ARRAY
+/*
+ * See if our compiler is known to support flexible array members.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+# define FLEX_ARRAY /* empty */
+#elif defined(__GNUC__)
+# if (__GNUC__ >= 3)
+#  define FLEX_ARRAY /* empty */
+# else
+#  define FLEX_ARRAY 0 /* older GNU extension */
+# endif
+#endif
+
+/*
+ * Otherwise, default to safer but a bit wasteful traditional style
+ */
+#ifndef FLEX_ARRAY
+# define FLEX_ARRAY 1
+#endif
+#endif
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
+
+/* Approximation of the length of the decimal representation of this type. */
+#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
+#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
+#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
+#endif
+#define _ALL_SOURCE 1
+#define _GNU_SOURCE 1
+#define _BSD_SOURCE 1
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <assert.h>
+#include <regex.h>
+#include <utime.h>
+#ifndef __MINGW32__
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#ifndef NO_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <inttypes.h>
+#if defined(__CYGWIN__)
+#undef _XOPEN_SOURCE
+#include <grp.h>
+#define _XOPEN_SOURCE 600
+#include "compat/cygwin.h"
+#else
+#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
+#include <grp.h>
+#define _ALL_SOURCE 1
+#endif
+#else 	/* __MINGW32__ */
+/* pull in Windows compatibility stuff */
+#include "compat/mingw.h"
+#endif	/* __MINGW32__ */
+
+#ifndef NO_ICONV
+#include <iconv.h>
+#endif
+
+#ifndef NO_OPENSSL
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#endif
+
+/* On most systems <limits.h> would have given us this, but
+ * not on some systems (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef PRIuMAX
+#define PRIuMAX "llu"
+#endif
+
+#ifndef PRIu32
+#define PRIu32 "u"
+#endif
+
+#ifndef PRIx32
+#define PRIx32 "x"
+#endif
+
+#ifndef PATH_SEP
+#define PATH_SEP ':'
+#endif
+
+#ifndef STRIP_EXTENSION
+#define STRIP_EXTENSION ""
+#endif
+
+#ifndef has_dos_drive_prefix
+#define has_dos_drive_prefix(path) 0
+#endif
+
+#ifndef is_dir_sep
+#define is_dir_sep(c) ((c) == '/')
+#endif
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+/* General helper functions */
+extern void usage(const char *err) NORETURN;
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+extern int prefixcmp(const char *str, const char *prefix);
+extern time_t tm_to_time_t(const struct tm *tm);
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
+
+#ifndef PROT_READ
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define MAP_PRIVATE 1
+#define MAP_FAILED ((void*)-1)
+#endif
+
+#define mmap git_mmap
+#define munmap git_munmap
+extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern int git_munmap(void *start, size_t length);
+
+#else /* NO_MMAP || USE_WIN32_MMAP */
+
+#include <sys/mman.h>
+
+#endif /* NO_MMAP || USE_WIN32_MMAP */
+
+#ifdef NO_MMAP
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
+
+#else /* NO_MMAP */
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
+	(sizeof(void*) >= 8 \
+		?  1 * 1024 * 1024 * 1024 \
+		: 32 * 1024 * 1024)
+
+#endif /* NO_MMAP */
+
+#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+#define on_disk_bytes(st) ((st).st_size)
+#else
+#define on_disk_bytes(st) ((st).st_blocks * 512)
+#endif
+
+#define DEFAULT_PACKED_GIT_LIMIT \
+	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
+
+#ifdef NO_PREAD
+#define pread git_pread
+extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
+#endif
+/*
+ * Forward decl that will remind us if its twin in cache.h changes.
+ * This function is used in compat/pread.c.  But we can't include
+ * cache.h there.
+ */
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+
+#ifdef NO_SETENV
+#define setenv gitsetenv
+extern int gitsetenv(const char *, const char *, int);
+#endif
+
+#ifdef NO_MKDTEMP
+#define mkdtemp gitmkdtemp
+extern char *gitmkdtemp(char *);
+#endif
+
+#ifdef NO_UNSETENV
+#define unsetenv gitunsetenv
+extern void gitunsetenv(const char *);
+#endif
+
+#ifdef NO_STRCASESTR
+#define strcasestr gitstrcasestr
+extern char *gitstrcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifdef NO_STRLCPY
+#define strlcpy gitstrlcpy
+extern size_t gitstrlcpy(char *, const char *, size_t);
+#endif
+
+#ifdef NO_STRTOUMAX
+#define strtoumax gitstrtoumax
+extern uintmax_t gitstrtoumax(const char *, char **, int);
+#endif
+
+#ifdef NO_HSTRERROR
+#define hstrerror githstrerror
+extern const char *githstrerror(int herror);
+#endif
+
+#ifdef NO_MEMMEM
+#define memmem gitmemmem
+void *gitmemmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifdef FREAD_READS_DIRECTORIES
+#ifdef fopen
+#undef fopen
+#endif
+#define fopen(a,b) git_fopen(a,b)
+extern FILE *git_fopen(const char*, const char*);
+#endif
+
+#ifdef SNPRINTF_RETURNS_BOGUS
+#define snprintf git_snprintf
+extern int git_snprintf(char *str, size_t maxsize,
+			const char *format, ...);
+#define vsnprintf git_vsnprintf
+extern int git_vsnprintf(char *str, size_t maxsize,
+			 const char *format, va_list ap);
+#endif
+
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 1)
+#define HAVE_STRCHRNUL
+#endif
+#endif
+
+#ifndef HAVE_STRCHRNUL
+#define strchrnul gitstrchrnul
+static inline char *gitstrchrnul(const char *s, int c)
+{
+	while (*s && *s != c)
+		s++;
+	return (char *)s;
+}
+#endif
+
+static inline size_t xsize_t(off_t len)
+{
+	return (size_t)len;
+}
+
+static inline int has_extension(const char *filename, const char *ext)
+{
+	size_t len = strlen(filename);
+	size_t extlen = strlen(ext);
+	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isalpha
+#undef isalnum
+#undef tolower
+#undef toupper
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE 0x01
+#define GIT_DIGIT 0x02
+#define GIT_ALPHA 0x04
+#define GIT_GLOB_SPECIAL 0x08
+#define GIT_REGEX_SPECIAL 0x10
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+static inline int sane_case(int x, int high)
+{
+	if (sane_istest(x, GIT_ALPHA))
+		x = (x & ~0x20) | high;
+	return x;
+}
+
+static inline int strtoul_ui(char const *s, int base, unsigned int *result)
+{
+	unsigned long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtoul(s, &p, base);
+	if (errno || *p || p == s || (unsigned int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+static inline int strtol_i(char const *s, int base, int *result)
+{
+	long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtol(s, &p, base);
+	if (errno || *p || p == s || (int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+#ifdef INTERNAL_QSORT
+void git_qsort(void *base, size_t nmemb, size_t size,
+	       int(*compar)(const void *, const void *));
+#define qsort git_qsort
+#endif
+
+#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
+# define FORCE_DIR_SET_GID S_ISGID
+#else
+# define FORCE_DIR_SET_GID 0
+#endif
+
+#ifdef NO_NSEC
+#undef USE_NSEC
+#define ST_CTIME_NSEC(st) 0
+#define ST_MTIME_NSEC(st) 0
+#else
+#ifdef USE_ST_TIMESPEC
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
+#else
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
+#endif
+#endif
+
+#endif
diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/wrapper.c
new file mode 100644
index 0000000..6350d65
--- /dev/null
+++ b/Documentation/perf_counter/wrapper.c
@@ -0,0 +1,206 @@
+/*
+ * Various trivial helper wrappers around standard functions
+ */
+#include "cache.h"
+
+/*
+ * There's no pack memory to release - but stay close to the Git
+ * version so wrap this away:
+ */
+static inline void release_pack_memory(size_t size, int flag)
+{
+}
+
+char *xstrdup(const char *str)
+{
+	char *ret = strdup(str);
+	if (!ret) {
+		release_pack_memory(strlen(str) + 1, -1);
+		ret = strdup(str);
+		if (!ret)
+			die("Out of memory, strdup failed");
+	}
+	return ret;
+}
+
+void *xmalloc(size_t size)
+{
+	void *ret = malloc(size);
+	if (!ret && !size)
+		ret = malloc(1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = malloc(size);
+		if (!ret && !size)
+			ret = malloc(1);
+		if (!ret)
+			die("Out of memory, malloc failed");
+	}
+#ifdef XMALLOC_POISON
+	memset(ret, 0xA5, size);
+#endif
+	return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+	char *p = xmalloc(len + 1);
+	memcpy(p, data, len);
+	p[len] = '\0';
+	return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+	char *p = memchr(str, '\0', len);
+	return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ret = calloc(nmemb, size);
+	if (!ret && (!nmemb || !size))
+		ret = calloc(1, 1);
+	if (!ret) {
+		release_pack_memory(nmemb * size, -1);
+		ret = calloc(nmemb, size);
+		if (!ret && (!nmemb || !size))
+			ret = calloc(1, 1);
+		if (!ret)
+			die("Out of memory, calloc failed");
+	}
+	return ret;
+}
+
+void *xmmap(void *start, size_t length,
+	int prot, int flags, int fd, off_t offset)
+{
+	void *ret = mmap(start, length, prot, flags, fd, offset);
+	if (ret == MAP_FAILED) {
+		if (!length)
+			return NULL;
+		release_pack_memory(length, fd);
+		ret = mmap(start, length, prot, flags, fd, offset);
+		if (ret == MAP_FAILED)
+			die("Out of memory? mmap failed: %s", strerror(errno));
+	}
+	return ret;
+}
+
+/*
+ * xread() is the same a read(), but it automatically restarts read()
+ * operations with a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = read(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+/*
+ * xwrite() is the same a write(), but it automatically restarts write()
+ * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes is written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = write(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t loaded = xread(fd, p, count);
+		if (loaded <= 0)
+			return total ? total : loaded;
+		count -= loaded;
+		p += loaded;
+		total += loaded;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t written = xwrite(fd, p, count);
+		if (written < 0)
+			return -1;
+		if (!written) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= written;
+		p += written;
+		total += written;
+	}
+
+	return total;
+}
+
+int xdup(int fd)
+{
+	int ret = dup(fd);
+	if (ret < 0)
+		die("dup failed: %s", strerror(errno));
+	return ret;
+}
+
+FILE *xfdopen(int fd, const char *mode)
+{
+	FILE *stream = fdopen(fd, mode);
+	if (stream == NULL)
+		die("Out of memory? fdopen failed: %s", strerror(errno));
+	return stream;
+}
+
+int xmkstemp(char *template)
+{
+	int fd;
+
+	fd = mkstemp(template);
+	if (fd < 0)
+		die("Unable to create temporary file: %s", strerror(errno));
+	return fd;
+}
-- 
cgit v0.10.2


From 6f06ccbc86f8a02aa32271263249657ce484eb25 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:22:22 +0200
Subject: perf_counter tools: clean up after introduction of the Git command
 framework

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 11809b9..1b60265 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -138,16 +138,6 @@ all::
 #
 # Define NO_PERL if you do not want Perl scripts or libraries at all.
 #
-# Define NO_TCLTK if you do not want Tcl/Tk GUI.
-#
-# The TCL_PATH variable governs the location of the Tcl interpreter
-# used to optimize perf-gui for your system.  Only used if NO_TCLTK
-# is not set.  Defaults to the bare 'tclsh'.
-#
-# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
-# If not set it defaults to the bare 'wish'. If it is set to the empty
-# string then NO_TCLTK will be forced (this is used by configure script).
-#
 # Define INTERNAL_QSORT to use Git's implementation of qsort(), which
 # is a simplified version of the merge sort used in glibc. This is
 # recommended if Git triggers O(n^2) behavior in your platform's qsort().
@@ -215,12 +205,8 @@ TAR = tar
 FIND = find
 INSTALL = install
 RPMBUILD = rpmbuild
-TCL_PATH = tclsh
-TCLTK_PATH = wish
 PTHREAD_LIBS = -lpthread
 
-export TCL_PATH TCLTK_PATH
-
 # sparse is architecture-neutral, which means that we need to tell it
 # explicitly what architecture to check for. Fix this up for yours..
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
@@ -529,10 +515,6 @@ ifdef NO_EXTERNAL_GREP
 	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
 endif
 
-ifeq ($(TCLTK_PATH),)
-NO_TCLTK=NoThanks
-endif
-
 ifeq ($(PERL_PATH),)
 NO_PERL=NoThanks
 endif
@@ -583,7 +565,6 @@ prefix_SQ = $(subst ','\'',$(prefix))
 
 SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
 PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
-TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
 
 LIBS = $(PERFLIBS) $(EXTLIBS)
 
@@ -607,14 +588,6 @@ ifneq (,$X)
 endif
 
 all::
-ifndef NO_TCLTK
-	$(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all
-	$(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all
-endif
-ifndef NO_PERL
-	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
-endif
-	$(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1)
 
 please_set_SHELL_PATH_to_a_more_modern_shell:
 	@$$(:)
@@ -704,21 +677,6 @@ builtin-revert.o wt-status.o: wt-status.h
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
-doc:
-	$(MAKE) -C Documentation all
-
-man:
-	$(MAKE) -C Documentation man
-
-html:
-	$(MAKE) -C Documentation html
-
-info:
-	$(MAKE) -C Documentation info
-
-pdf:
-	$(MAKE) -C Documentation pdf
-
 TAGS:
 	$(RM) TAGS
 	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
@@ -751,33 +709,12 @@ PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
 	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
 
-### Detect Tck/Tk interpreter path changes
-ifndef NO_TCLTK
-TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
-
-PERF-GUI-VARS: .FORCE-PERF-GUI-VARS
-	@VARS='$(TRACK_VARS)'; \
-	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
-		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
-		echo "$$VARS" >$@; \
-            fi
-
-.PHONY: .FORCE-PERF-GUI-VARS
-endif
-
 ### Testing rules
 
-TEST_PROGRAMS += test-chmtime$X
-TEST_PROGRAMS += test-ctype$X
-TEST_PROGRAMS += test-date$X
-TEST_PROGRAMS += test-delta$X
-TEST_PROGRAMS += test-dump-cache-tree$X
-TEST_PROGRAMS += test-genrandom$X
-TEST_PROGRAMS += test-match-trees$X
-TEST_PROGRAMS += test-parse-options$X
-TEST_PROGRAMS += test-path-utils$X
-TEST_PROGRAMS += test-sha1$X
-TEST_PROGRAMS += test-sigchain$X
+#
+# None right now:
+#
+# TEST_PROGRAMS += test-something$X
 
 all:: $(TEST_PROGRAMS)
 
@@ -787,25 +724,6 @@ all:: $(TEST_PROGRAMS)
 
 export NO_SVN_TESTS
 
-test: all
-	$(MAKE) -C t/ all
-
-test-ctype$X: ctype.o
-
-test-date$X: date.o ctype.o
-
-test-delta$X: diff-delta.o patch-delta.o
-
-test-parse-options$X: parse-options.o
-
-.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
-
-test-%$X: test-%.o $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
-
-check-sha1:: test-sha1$X
-	./test-sha1.sh
-
 check: common-cmds.h
 	if sparse; \
 	then \
@@ -845,10 +763,6 @@ install: all
 	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
 	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
-ifndef NO_TCLTK
-	$(MAKE) -C perfk-perf install
-	$(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install
-endif
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
@@ -865,32 +779,6 @@ endif
 	  done } && \
 	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
-install-doc:
-	$(MAKE) -C Documentation install
-
-install-man:
-	$(MAKE) -C Documentation install-man
-
-install-html:
-	$(MAKE) -C Documentation install-html
-
-install-info:
-	$(MAKE) -C Documentation install-info
-
-install-pdf:
-	$(MAKE) -C Documentation install-pdf
-
-quick-install-doc:
-	$(MAKE) -C Documentation quick-install
-
-quick-install-man:
-	$(MAKE) -C Documentation quick-install-man
-
-quick-install-html:
-	$(MAKE) -C Documentation quick-install-html
-
-
-
 ### Maintainer's dist rules
 
 perf.spec: perf.spec.in
@@ -904,38 +792,16 @@ dist: perf.spec perf-archive$(X) configure
 	@mkdir -p $(PERF_TARNAME)
 	@cp perf.spec configure $(PERF_TARNAME)
 	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-	@$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version
 	$(TAR) rf $(PERF_TARNAME).tar \
 		$(PERF_TARNAME)/perf.spec \
 		$(PERF_TARNAME)/configure \
-		$(PERF_TARNAME)/version \
-		$(PERF_TARNAME)/perf-gui/version
+		$(PERF_TARNAME)/version
 	@$(RM) -r $(PERF_TARNAME)
 	gzip -f -9 $(PERF_TARNAME).tar
 
 rpm: dist
 	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
-htmldocs = perf-htmldocs-$(PERF_VERSION)
-manpages = perf-manpages-$(PERF_VERSION)
-dist-doc:
-	$(RM) -r .doc-tmp-dir
-	mkdir .doc-tmp-dir
-	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-	gzip -n -9 -f $(htmldocs).tar
-	:
-	$(RM) -r .doc-tmp-dir
-	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-	$(MAKE) -C Documentation DESTDIR=./ \
-		man1dir=../.doc-tmp-dir/man1 \
-		man5dir=../.doc-tmp-dir/man5 \
-		man7dir=../.doc-tmp-dir/man7 \
-		install
-	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-	gzip -n -9 -f $(manpages).tar
-	$(RM) -r .doc-tmp-dir
-
 ### Cleaning rules
 
 distclean: clean
@@ -951,74 +817,13 @@ clean:
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
-	$(MAKE) -C Documentation/ clean
-	$(MAKE) -C templates/ clean
-	$(MAKE) -C t/ clean
-ifndef NO_TCLTK
-	$(MAKE) -C perfk-perf clean
-	$(MAKE) -C perf-gui clean
-endif
-	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
 .PHONY: .FORCE-PERF-BUILD-OPTIONS
 
-### Check documentation
-#
-check-docs::
-	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \
-	do \
-		case "$$v" in \
-		perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \
-		perf-merge-resolve | perf-merge-subtree | \
-		perf-fsck-objects | perf-init-db | \
-		perf-?*--?* ) continue ;; \
-		esac ; \
-		test -f "Documentation/$$v.txt" || \
-		echo "no doc: $$v"; \
-		sed -e '/^#/d' command-list.txt | \
-		grep -q "^$$v[ 	]" || \
-		case "$$v" in \
-		perf) ;; \
-		*) echo "no link: $$v";; \
-		esac ; \
-	done; \
-	( \
-		sed -e '/^#/d' \
-		    -e 's/[ 	].*//' \
-		    -e 's/^/listed /' command-list.txt; \
-		ls -1 Documentation/perf*txt | \
-		sed -e 's|Documentation/|documented |' \
-		    -e 's/\.txt//'; \
-	) | while read how cmd; \
-	do \
-		case "$$how,$$cmd" in \
-		*,perf-citool | \
-		*,perf-gui | \
-		*,perf-help | \
-		documented,perfattributes | \
-		documented,perfignore | \
-		documented,perfmodules | \
-		documented,perfcli | \
-		documented,perf-tools | \
-		documented,perfcore-tutorial | \
-		documented,perfcvs-migration | \
-		documented,perfdiffcore | \
-		documented,perfglossary | \
-		documented,perfhooks | \
-		documented,perfrepository-layout | \
-		documented,perftutorial | \
-		documented,perftutorial-2 | \
-		sentinel,not,matching,is,ok ) continue ;; \
-		esac; \
-		case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \
-		*" $$cmd "*)	;; \
-		*) echo "removed but $$how: $$cmd" ;; \
-		esac; \
-	done ) | sort
-
 ### Make sure built-ins do not have dups and listed in perf.c
 #
 check-builtins::
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index 125fcc2..a136d61 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -417,11 +417,9 @@ static void show_html_page(const char *perf_cmd)
 
 int cmd_help(int argc, const char **argv, const char *prefix)
 {
-	int nonperf;
 	const char *alias;
 	load_command_list("perf-", &main_cmds, &other_cmds);
 
-	/* setup_perf_directory_gently(&nonperf); */
 	perf_config(perf_help_config, NULL);
 
 	argc = parse_options(argc, argv, builtin_help_options,
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 9d2c769..601bddb 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -63,15 +63,6 @@
 
 #include "util.h"
 
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
@@ -103,8 +94,6 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE   31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
 #define rdclock()                                       \
 ({                                                      \
         struct timespec ts;                             \
@@ -1077,7 +1066,7 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
-static void process_options(int argc, char *argv[])
+static void process_options(int argc, char **argv)
 {
 	int error = 0, counter;
 
@@ -1255,7 +1244,7 @@ static void mmap_read(struct mmap_data *md)
 
 		event_t event_copy;
 
-		unsigned int size = event->header.size;
+		size_t size = event->header.size;
 
 		/*
 		 * Event straddles the mmap boundary -- header should always
@@ -1301,7 +1290,7 @@ static void mmap_read(struct mmap_data *md)
 	md->prev = old;
 }
 
-int cmd_top(int argc, const char **argv, const char *prefix)
+int cmd_top(int argc, char **argv, const char *prefix)
 {
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h
index dc08564..7108051 100644
--- a/Documentation/perf_counter/cache.h
+++ b/Documentation/perf_counter/cache.h
@@ -94,4 +94,24 @@ static inline int is_absolute_path(const char *path)
 {
 	return path[0] == '/';
 }
+
+const char *make_absolute_path(const char *path);
+const char *make_nonrelative_path(const char *path);
+const char *make_relative_path(const char *abs, const char *base);
+int normalize_path_copy(char *dst, const char *src);
+int longest_ancestor_length(const char *path, const char *prefix_list);
+char *strip_path_suffix(const char *path, const char *suffix);
+
+extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+
+extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_pathdup(const char *fmt, ...)
+	__attribute__((format (printf, 1, 2)));
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
 #endif /* CACHE_H */
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c
index 672d539..3dd13fa 100644
--- a/Documentation/perf_counter/config.c
+++ b/Documentation/perf_counter/config.c
@@ -15,7 +15,6 @@ static FILE *config_file;
 static const char *config_file_name;
 static int config_linenr;
 static int config_file_eof;
-static int zlib_compression_seen;
 
 const char *config_exclusive_filename = NULL;
 
@@ -533,14 +532,6 @@ static int store_aux(const char* key, const char* value, void *cb)
 	return 0;
 }
 
-static int write_error(const char *filename)
-{
-	error("failed to write new configuration file %s", filename);
-
-	/* Same error code as "failed to rename". */
-	return 4;
-}
-
 static int store_write_section(int fd, const char* key)
 {
 	const char *dot;
@@ -673,7 +664,7 @@ int perf_config_set_multivar(const char* key, const char* value,
 {
 	int i, dot;
 	int fd = -1, in_fd;
-	int ret;
+	int ret = 0;
 	char* config_filename;
 	const char* last_dot = strrchr(key, '.');
 
@@ -872,90 +863,6 @@ write_err_out:
 
 }
 
-static int section_name_match (const char *buf, const char *name)
-{
-	int i = 0, j = 0, dot = 0;
-	for (; buf[i] && buf[i] != ']'; i++) {
-		if (!dot && isspace(buf[i])) {
-			dot = 1;
-			if (name[j++] != '.')
-				break;
-			for (i++; isspace(buf[i]); i++)
-				; /* do nothing */
-			if (buf[i] != '"')
-				break;
-			continue;
-		}
-		if (buf[i] == '\\' && dot)
-			i++;
-		else if (buf[i] == '"' && dot) {
-			for (i++; isspace(buf[i]); i++)
-				; /* do_nothing */
-			break;
-		}
-		if (buf[i] != name[j++])
-			break;
-	}
-	return (buf[i] == ']' && name[j] == 0);
-}
-
-/* if new_name == NULL, the section is removed instead */
-int perf_config_rename_section(const char *old_name, const char *new_name)
-{
-	int ret = 0, remove = 0;
-	char *config_filename;
-	int out_fd;
-	char buf[1024];
-
-	if (config_exclusive_filename)
-		config_filename = strdup(config_exclusive_filename);
-	else
-		config_filename = perf_pathdup("config");
-	if (out_fd < 0) {
-		ret = error("could not lock config file %s", config_filename);
-		goto out;
-	}
-
-	if (!(config_file = fopen(config_filename, "rb"))) {
-		/* no config file means nothing to rename, no error */
-		goto unlock_and_out;
-	}
-
-	while (fgets(buf, sizeof(buf), config_file)) {
-		int i;
-		int length;
-		for (i = 0; buf[i] && isspace(buf[i]); i++)
-			; /* do nothing */
-		if (buf[i] == '[') {
-			/* it's a section */
-			if (section_name_match (&buf[i+1], old_name)) {
-				ret++;
-				if (new_name == NULL) {
-					remove = 1;
-					continue;
-				}
-				store.baselen = strlen(new_name);
-				if (!store_write_section(out_fd, new_name)) {
-					goto out;
-				}
-				continue;
-			}
-			remove = 0;
-		}
-		if (remove)
-			continue;
-		length = strlen(buf);
-		if (write_in_full(out_fd, buf, length) != length) {
-			goto out;
-		}
-	}
-	fclose(config_file);
- unlock_and_out:
- out:
-	free(config_filename);
-	return ret;
-}
-
 /*
  * Call this to report error for your variable that should not
  * get a boolean value (i.e. "[my] var" means "true").
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c
index 891b612..a501a40 100644
--- a/Documentation/perf_counter/path.c
+++ b/Documentation/perf_counter/path.c
@@ -161,45 +161,6 @@ int perf_mkstemp(char *path, size_t len, const char *template)
 }
 
 
-static char *user_path(char *buf, char *path, int sz)
-{
-	struct passwd *pw;
-	char *slash;
-	int len, baselen;
-
-	if (!path || path[0] != '~')
-		return NULL;
-	path++;
-	slash = strchr(path, '/');
-	if (path[0] == '/' || !path[0]) {
-		pw = getpwuid(getuid());
-	}
-	else {
-		if (slash) {
-			*slash = 0;
-			pw = getpwnam(path);
-			*slash = '/';
-		}
-		else
-			pw = getpwnam(path);
-	}
-	if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir))
-		return NULL;
-	baselen = strlen(pw->pw_dir);
-	memcpy(buf, pw->pw_dir, baselen);
-	while ((1 < baselen) && (buf[baselen-1] == '/')) {
-		buf[baselen-1] = 0;
-		baselen--;
-	}
-	if (slash && slash[1]) {
-		len = strlen(slash);
-		if (sz <= baselen + len)
-			return NULL;
-		memcpy(buf + baselen, slash, len + 1);
-	}
-	return buf;
-}
-
 const char *make_relative_path(const char *abs, const char *base)
 {
 	static char buf[PATH_MAX + 1];
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 9256f6a..63f8a89 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,7 +1,7 @@
 #include "builtin.h"
 #include "exec_cmd.h"
 #include "cache.h"
-//#include "quote.h"
+#include "quote.h"
 #include "run-command.h"
 
 const char perf_usage_string[] =
@@ -132,7 +132,6 @@ static int handle_alias(int *argcp, const char ***argv)
 	const char** new_argv;
 	const char *alias_command;
 	char *alias_string;
-	int unused_nonperf;
 
 	alias_command = (*argv)[0];
 	alias_string = alias_lookup(alias_command);
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h
index 13f8bdc..36e40c3 100644
--- a/Documentation/perf_counter/util.h
+++ b/Documentation/perf_counter/util.h
@@ -295,6 +295,20 @@ static inline char *gitstrchrnul(const char *s, int c)
 }
 #endif
 
+/*
+ * Wrappers:
+ */
+extern char *xstrdup(const char *str);
+extern void *xmalloc(size_t size);
+extern void *xmemdupz(const void *data, size_t len);
+extern char *xstrndup(const char *str, size_t len);
+extern void *xrealloc(void *ptr, size_t size);
+extern void *xcalloc(size_t nmemb, size_t size);
+extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern ssize_t xread(int fd, void *buf, size_t len);
+extern ssize_t xwrite(int fd, const void *buf, size_t len);
+extern int xdup(int fd);
+extern FILE *xfdopen(int fd, const char *mode);
 static inline size_t xsize_t(off_t len)
 {
 	return (size_t)len;
-- 
cgit v0.10.2


From ddcacfa0febff6454dba6cea1931f3020a9f6c24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:37:32 +0200
Subject: perf_counter tools: separate kerneltop into 'perf top' and 'perf
 stat'

Lets use the Git framework of built-in commands.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 1b60265..fb8b717 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -309,6 +309,7 @@ LIB_OBJS += usage.o
 LIB_OBJS += wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
 PERFLIBS = $(LIB_FILE)
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 0000000..169a2d1
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,592 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
+
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
+
+   Sample output:
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+ */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
+#include "util.h"
+
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+extern asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags);
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			system_wide			=  0;
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static int			group				=  0;
+static unsigned int		page_size;
+
+static int			zero;
+
+static int			scale;
+
+static const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -l                           # scale counter values\n"
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
+
+static char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.config		= event_id[counter];
+	hw_event.record_type	= 0;
+	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	if (!argc)
+		display_help();
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
+
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
+		}
+
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count[0] / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
+		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+static void process_options(int argc, char **argv)
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		nr_counters = 8;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+int cmd_stat(int argc, char **argv, const char *prefix)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	return do_perfstat(argc, argv);
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 601bddb..98e8690 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -28,25 +28,6 @@
                8.54 - ffffffff805001a3 : ip_queue_xmit
  */
 
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
-
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
-
-   Sample output:
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
- */
-
  /*
   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
   *
@@ -149,7 +130,6 @@ asmlinkage int sys_perf_counter_open(
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
@@ -203,7 +183,7 @@ struct source_line {
 static struct source_line	*lines;
 static struct source_line	**lines_tail;
 
-const unsigned int default_count[] = {
+static const unsigned int default_count[] = {
 	1000000,
 	1000000,
 	  10000,
@@ -291,26 +271,8 @@ static void display_events_help(void)
 	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
 }
 
-static void display_perfstat_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
 static void display_help(void)
 {
-	if (run_perfstat)
-		return display_perfstat_help();
-
 	printf(
 	"Usage: kerneltop [<options>]\n"
 	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
@@ -320,8 +282,6 @@ static void display_help(void)
 	display_events_help();
 
 	printf(
-	" -S        --stat             # perfstat COMMAND\n"
-	" -a                           # system-wide collection (for perfstat)\n\n"
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
@@ -420,151 +380,6 @@ again:
 	return 0;
 }
 
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
-static void create_perfstat_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
-	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-int do_perfstat(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	if (!system_wide)
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count[0] / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-
 /*
  * Symbols
  */
@@ -805,7 +620,7 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	return 0;
 }
 
-int compare_addr(const void *__sym1, const void *__sym2)
+static int compare_addr(const void *__sym1, const void *__sym2)
 {
 	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
 
@@ -1070,9 +885,6 @@ static void process_options(int argc, char **argv)
 {
 	int error = 0, counter;
 
-	if (strstr(argv[0], "perfstat"))
-		run_perfstat = 1;
-
 	for (;;) {
 		int option_index = 0;
 		/** Options for getopt */
@@ -1134,7 +946,6 @@ static void process_options(int argc, char **argv)
 			tid				=   atoi(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': sym_filter			= strdup(optarg); break;
-		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
@@ -1147,12 +958,8 @@ static void process_options(int argc, char **argv)
 		display_help();
 
 	if (!nr_counters) {
-		if (run_perfstat)
-			nr_counters = 8;
-		else {
-			nr_counters = 1;
-			event_id[0] = 0;
-		}
+		nr_counters = 1;
+		event_id[0] = 0;
 	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
@@ -1308,9 +1115,6 @@ int cmd_top(int argc, char **argv, const char *prefix)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	if (run_perfstat)
-		return do_perfstat(argc, argv);
-
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 4163744..a3bb6cd 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,5 +14,6 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, char **argv, const char *prefix);
+extern int cmd_stat(int argc, char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 1eab365..52455d4 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,4 +1,5 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
 perf-top                                mainporcelain common
+perf-stat                               mainporcelain common
 
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
deleted file mode 100644
index 042c1b8..0000000
--- a/Documentation/perf_counter/kerneltop.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
- */
-
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
-
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
-
-   Sample output:
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
- */
-
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
-
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <getopt.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
-
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 295
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 333
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
-
-static int			run_perfstat			=  0;
-static int			system_wide			=  0;
-
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
-
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
-};
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static __u64			count_filter		       = 100;
-
-static int			tid				= -1;
-static int			profile_cpu			= -1;
-static int			nr_cpus				=  0;
-static int			nmi				=  1;
-static unsigned int		realtime_prio			=  0;
-static int			group				=  0;
-static unsigned int		page_size;
-static unsigned int		mmap_pages			=  16;
-static int			use_mmap			= 0;
-static int			use_munmap			= 0;
-
-static char			*vmlinux;
-
-static char			*sym_filter;
-static unsigned long		filter_start;
-static unsigned long		filter_end;
-
-static int			delay_secs			=  2;
-static int			zero;
-static int			dump_symtab;
-
-static int			scale;
-
-struct source_line {
-	uint64_t		EIP;
-	unsigned long		count;
-	char			*line;
-	struct source_line	*next;
-};
-
-static struct source_line	*lines;
-static struct source_line	**lines_tail;
-
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_perfstat_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
-static void display_help(void)
-{
-	if (run_perfstat)
-		return display_perfstat_help();
-
-	printf(
-	"Usage: kerneltop [<options>]\n"
-	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
-	"KernelTop Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -S        --stat             # perfstat COMMAND\n"
-	" -a                           # system-wide collection (for perfstat)\n\n"
-	" -c CNT    --count=CNT        # event period to sample\n\n"
-	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
-	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
-	" -l                           # show scale factor for RR events\n"
-	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
-	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
-	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
-	" -z        --zero             # zero counts after display\n"
-	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -M        --mmap_info        # print mmap info stream\n"
-	" -U        --munmap_info      # print munmap info stream\n"
-	);
-
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
-static void create_perfstat_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
-	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-int do_perfstat(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	if (!system_wide)
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count[0] / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-
-/*
- * Symbols
- */
-
-static uint64_t			min_ip;
-static uint64_t			max_ip = -1ll;
-
-struct sym_entry {
-	unsigned long long	addr;
-	char			*sym;
-	unsigned long		count[MAX_COUNTERS];
-	int			skip;
-	struct source_line	*source;
-};
-
-#define MAX_SYMS		100000
-
-static int sym_table_count;
-
-struct sym_entry		*sym_filter_entry;
-
-static struct sym_entry		sym_table[MAX_SYMS];
-
-static void show_details(struct sym_entry *sym);
-
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym)
-{
-	double weight;
-	int counter;
-
-	weight = sym->count[0];
-
-	for (counter = 1; counter < nr_counters-1; counter++)
-		weight *= sym->count[counter];
-
-	weight /= (sym->count[counter] + 1);
-
-	return weight;
-}
-
-static int compare(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym_weight(sym1) < sym_weight(sym2);
-}
-
-static long			events;
-static long			userspace_events;
-static const char		CONSOLE_CLEAR[] = "[H[2J";
-
-static struct sym_entry		tmp[MAX_SYMS];
-
-static void print_sym_table(void)
-{
-	int i, printed;
-	int counter;
-	float events_per_sec = events/delay_secs;
-	float kevents_per_sec = (events-userspace_events)/delay_secs;
-	float sum_kevents = 0.0;
-
-	events = userspace_events = 0;
-	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
-	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
-
-	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
-		sum_kevents += tmp[i].count[0];
-
-	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
-
-	printf(
-"------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
-		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
-		nmi ? "NMI" : "IRQ");
-
-	if (nr_counters == 1)
-		printf("%d ", event_count[0]);
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (counter)
-			printf("/");
-
-		printf("%s", event_name(counter));
-	}
-
-	printf( "], ");
-
-	if (tid != -1)
-		printf(" (tid: %d", tid);
-	else
-		printf(" (all");
-
-	if (profile_cpu != -1)
-		printf(", cpu: %d)\n", profile_cpu);
-	else {
-		if (tid != -1)
-			printf(")\n");
-		else
-			printf(", %d CPUs)\n", nr_cpus);
-	}
-
-	printf("------------------------------------------------------------------------------\n\n");
-
-	if (nr_counters == 1)
-		printf("             events    pcnt");
-	else
-		printf("  weight     events    pcnt");
-
-	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   _____   ________________   _______________\n\n"
-	);
-
-	for (i = 0, printed = 0; i < sym_table_count; i++) {
-		float pcnt;
-		int count;
-
-		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
-			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
-
-			if (nr_counters == 1)
-				printf("%19.2f - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					pcnt, tmp[i].addr, tmp[i].sym);
-			else
-				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					tmp[i].count[0],
-					pcnt, tmp[i].addr, tmp[i].sym);
-			printed++;
-		}
-		/*
-		 * Add decay to the counts:
-		 */
-		for (count = 0; count < nr_counters; count++)
-			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
-	}
-
-	if (sym_filter_entry)
-		show_details(sym_filter_entry);
-
-	{
-		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-
-		if (poll(&stdin_poll, 1, 0) == 1) {
-			printf("key pressed - exiting.\n");
-			exit(0);
-		}
-	}
-}
-
-static void *display_thread(void *arg)
-{
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
-
-	while (!sleep(delay_secs))
-		print_sym_table();
-
-	return NULL;
-}
-
-static int read_symbol(FILE *in, struct sym_entry *s)
-{
-	static int filter_match = 0;
-	char *sym, stype;
-	char str[500];
-	int rc, pos;
-
-	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
-	if (rc == EOF)
-		return -1;
-
-	assert(rc == 3);
-
-	/* skip until end of line: */
-	pos = strlen(str);
-	do {
-		rc = fgetc(in);
-		if (rc == '\n' || rc == EOF || pos >= 499)
-			break;
-		str[pos] = rc;
-		pos++;
-	} while (1);
-	str[pos] = 0;
-
-	sym = str;
-
-	/* Filter out known duplicates and non-text symbols. */
-	if (!strcmp(sym, "_text"))
-		return 1;
-	if (!min_ip && !strcmp(sym, "_stext"))
-		return 1;
-	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
-		return 1;
-	if (stype != 'T' && stype != 't')
-		return 1;
-	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
-		return 1;
-	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
-		return 1;
-
-	s->sym = malloc(strlen(str));
-	assert(s->sym);
-
-	strcpy((char *)s->sym, str);
-	s->skip = 0;
-
-	/* Tag events to be skipped. */
-	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("mwait_idle", s->sym))
-		s->skip = 1;
-
-	if (filter_match == 1) {
-		filter_end = s->addr;
-		filter_match = -1;
-		if (filter_end - filter_start > 10000) {
-			printf("hm, too large filter symbol <%s> - skipping.\n",
-				sym_filter);
-			printf("symbol filter start: %016lx\n", filter_start);
-			printf("                end: %016lx\n", filter_end);
-			filter_end = filter_start = 0;
-			sym_filter = NULL;
-			sleep(1);
-		}
-	}
-	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
-		filter_match = 1;
-		filter_start = s->addr;
-	}
-
-	return 0;
-}
-
-int compare_addr(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym1->addr > sym2->addr;
-}
-
-static void sort_symbol_table(void)
-{
-	int i, dups;
-
-	do {
-		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
-		for (i = 0, dups = 0; i < sym_table_count; i++) {
-			if (sym_table[i].addr == sym_table[i+1].addr) {
-				sym_table[i+1].addr = -1ll;
-				dups++;
-			}
-		}
-		sym_table_count -= dups;
-	} while(dups);
-}
-
-static void parse_symbols(void)
-{
-	struct sym_entry *last;
-
-	FILE *kallsyms = fopen("/proc/kallsyms", "r");
-
-	if (!kallsyms) {
-		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
-		exit(-1);
-	}
-
-	while (!feof(kallsyms)) {
-		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
-			sym_table_count++;
-			assert(sym_table_count <= MAX_SYMS);
-		}
-	}
-
-	sort_symbol_table();
-	min_ip = sym_table[0].addr;
-	max_ip = sym_table[sym_table_count-1].addr;
-	last = sym_table + sym_table_count++;
-
-	last->addr = -1ll;
-	last->sym = "<end>";
-
-	if (filter_end) {
-		int count;
-		for (count=0; count < sym_table_count; count ++) {
-			if (!strcmp(sym_table[count].sym, sym_filter)) {
-				sym_filter_entry = &sym_table[count];
-				break;
-			}
-		}
-	}
-	if (dump_symtab) {
-		int i;
-
-		for (i = 0; i < sym_table_count; i++)
-			fprintf(stderr, "%llx %s\n",
-				sym_table[i].addr, sym_table[i].sym);
-	}
-}
-
-/*
- * Source lines
- */
-
-static void parse_vmlinux(char *filename)
-{
-	FILE *file;
-	char command[PATH_MAX*2];
-	if (!filename)
-		return;
-
-	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	lines_tail = &lines;
-	while (!feof(file)) {
-		struct source_line *src;
-		size_t dummy = 0;
-		char *c;
-
-		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);
-		memset(src, 0, sizeof(struct source_line));
-
-		if (getline(&src->line, &dummy, file) < 0)
-			break;
-		if (!src->line)
-			break;
-
-		c = strchr(src->line, '\n');
-		if (c)
-			*c = 0;
-
-		src->next = NULL;
-		*lines_tail = src;
-		lines_tail = &src->next;
-
-		if (strlen(src->line)>8 && src->line[8] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-		if (strlen(src->line)>8 && src->line[16] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-	}
-	pclose(file);
-}
-
-static void record_precise_ip(uint64_t ip)
-{
-	struct source_line *line;
-
-	for (line = lines; line; line = line->next) {
-		if (line->EIP == ip)
-			line->count++;
-		if (line->EIP > ip)
-			break;
-	}
-}
-
-static void lookup_sym_in_vmlinux(struct sym_entry *sym)
-{
-	struct source_line *line;
-	char pattern[PATH_MAX];
-	sprintf(pattern, "<%s>:", sym->sym);
-
-	for (line = lines; line; line = line->next) {
-		if (strstr(line->line, pattern)) {
-			sym->source = line;
-			break;
-		}
-	}
-}
-
-static void show_lines(struct source_line *line_queue, int line_queue_count)
-{
-	int i;
-	struct source_line *line;
-
-	line = line_queue;
-	for (i = 0; i < line_queue_count; i++) {
-		printf("%8li\t%s\n", line->count, line->line);
-		line = line->next;
-	}
-}
-
-#define TRACE_COUNT     3
-
-static void show_details(struct sym_entry *sym)
-{
-	struct source_line *line;
-	struct source_line *line_queue = NULL;
-	int displayed = 0;
-	int line_queue_count = 0;
-
-	if (!sym->source)
-		lookup_sym_in_vmlinux(sym);
-	if (!sym->source)
-		return;
-
-	printf("Showing details for %s\n", sym->sym);
-
-	line = sym->source;
-	while (line) {
-		if (displayed && strstr(line->line, ">:"))
-			break;
-
-		if (!line_queue_count)
-			line_queue = line;
-		line_queue_count ++;
-
-		if (line->count >= count_filter) {
-			show_lines(line_queue, line_queue_count);
-			line_queue_count = 0;
-			line_queue = NULL;
-		} else if (line_queue_count > TRACE_COUNT) {
-			line_queue = line_queue->next;
-			line_queue_count --;
-		}
-
-		line->count = 0;
-		displayed++;
-		if (displayed > 300)
-			break;
-		line = line->next;
-	}
-}
-
-/*
- * Binary search in the histogram table and record the hit:
- */
-static void record_ip(uint64_t ip, int counter)
-{
-	int left_idx, middle_idx, right_idx, idx;
-	unsigned long left, middle, right;
-
-	record_precise_ip(ip);
-
-	left_idx = 0;
-	right_idx = sym_table_count-1;
-	assert(ip <= max_ip && ip >= min_ip);
-
-	while (left_idx + 1 < right_idx) {
-		middle_idx = (left_idx + right_idx) / 2;
-
-		left   = sym_table[  left_idx].addr;
-		middle = sym_table[middle_idx].addr;
-		right  = sym_table[ right_idx].addr;
-
-		if (!(left <= middle && middle <= right)) {
-			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
-			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
-		}
-		assert(left <= middle && middle <= right);
-		if (!(left <= ip && ip <= right)) {
-			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", (unsigned long)ip);
-			printf("right: %016lx\n", right);
-		}
-		assert(left <= ip && ip <= right);
-		/*
-		 * [ left .... target .... middle .... right ]
-		 *   => right := middle
-		 */
-		if (ip < middle) {
-			right_idx = middle_idx;
-			continue;
-		}
-		/*
-		 * [ left .... middle ... target ... right ]
-		 *   => left := middle
-		 */
-		left_idx = middle_idx;
-	}
-
-	idx = left_idx;
-
-	if (!sym_table[idx].skip)
-		sym_table[idx].count[counter]++;
-	else events--;
-}
-
-static void process_event(uint64_t ip, int counter)
-{
-	events++;
-
-	if (ip < min_ip || ip > max_ip) {
-		userspace_events++;
-		return;
-	}
-
-	record_ip(ip, counter);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0, counter;
-
-	if (strstr(argv[0], "perfstat"))
-		run_perfstat = 1;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'S': run_perfstat			=	       1; break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'M': use_mmap			=              1; break;
-		case 'U': use_munmap			=              1; break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		if (run_perfstat)
-			nr_counters = 8;
-		else {
-			nr_counters = 1;
-			event_id[0] = 0;
-		}
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
-struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
-};
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_counter_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-struct timeval last_read, this_read;
-
-static void mmap_read(struct mmap_data *md)
-{
-	unsigned int head = mmap_read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
-	int diff;
-
-	gettimeofday(&this_read, NULL);
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		struct timeval iv;
-		unsigned long msecs;
-
-		timersub(&this_read, &last_read, &iv);
-		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-		fprintf(stderr, "WARNING: failed to keep up with mmap data."
-				"  Last read %lu msecs ago.\n", msecs);
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	last_read = this_read;
-
-	for (; old != head;) {
-		struct ip_event {
-			struct perf_event_header header;
-			__u64 ip;
-			__u32 pid, tid;
-		};
-		struct mmap_event {
-			struct perf_event_header header;
-			__u32 pid, tid;
-			__u64 start;
-			__u64 len;
-			__u64 pgoff;
-			char filename[PATH_MAX];
-		};
-
-		typedef union event_union {
-			struct perf_event_header header;
-			struct ip_event ip;
-			struct mmap_event mmap;
-		} event_t;
-
-		event_t *event = (event_t *)&data[old & md->mask];
-
-		event_t event_copy;
-
-		unsigned int size = event->header.size;
-
-		/*
-		 * Event straddles the mmap boundary -- header should always
-		 * be inside due to u64 alignment of output.
-		 */
-		if ((old & md->mask) + size != ((old + size) & md->mask)) {
-			unsigned int offset = old;
-			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = &event_copy;
-
-			do {
-				cpy = min(md->mask + 1 - (offset & md->mask), len);
-				memcpy(dst, &data[offset & md->mask], cpy);
-				offset += cpy;
-				dst += cpy;
-				len -= cpy;
-			} while (len);
-
-			event = &event_copy;
-		}
-
-		old += size;
-
-		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_RECORD_IP)
-				process_event(event->ip.ip, md->counter);
-		} else {
-			switch (event->header.type) {
-				case PERF_EVENT_MMAP:
-				case PERF_EVENT_MUNMAP:
-					printf("%s: %Lu %Lu %Lu %s\n",
-							event->header.type == PERF_EVENT_MMAP
-							? "mmap" : "munmap",
-							event->mmap.start,
-							event->mmap.len,
-							event->mmap.pgoff,
-							event->mmap.filename);
-					break;
-			}
-		}
-	}
-
-	md->prev = old;
-}
-
-int main(int argc, char *argv[])
-{
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-	struct perf_counter_hw_event hw_event;
-	pthread_t thread;
-	int i, counter, group_fd, nr_poll = 0;
-	unsigned int cpu;
-	int ret;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (run_perfstat)
-		return do_perfstat(argc, argv);
-
-	if (tid != -1 || profile_cpu != -1)
-		nr_cpus = 1;
-
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			cpu	= profile_cpu;
-			if (tid == -1 && profile_cpu == -1)
-				cpu = i;
-
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= nmi;
-			hw_event.mmap		= use_mmap;
-			hw_event.munmap		= use_munmap;
-
-			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
-		}
-	}
-
-	if (pthread_create(&thread, NULL, display_thread, NULL)) {
-		printf("Could not create display thread.\n");
-		exit(-1);
-	}
-
-	if (realtime_prio) {
-		struct sched_param param;
-
-		param.sched_priority = realtime_prio;
-		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-			printf("Could not set realtime priority.\n");
-			exit(-1);
-		}
-	}
-
-	while (1) {
-		int hits = events;
-
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
-
-		if (hits == events)
-			ret = poll(event_array, nr_poll, 100);
-	}
-
-	return 0;
-}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 63f8a89..ff8658f 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -249,6 +249,7 @@ static void handle_internal_command(int argc, const char **argv)
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
 		{ "top", cmd_top, 0 },
+		{ "stat", cmd_stat, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
cgit v0.10.2


From 1d8c8b209e9351a7de1307d7b9b6df4222b8d742 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:52:29 +0200
Subject: perf_counter tools: add help texts

Add Documentation/perf-stat.txt and Documentation/perf-top.txt.

The template that was used for it: Documentation/git-add.txt from Git.

Fix up small bugs to make these help texts show up both in the 'perf'
common-command summary output screen, and on the individual help screens.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
new file mode 100644
index 0000000..7fcab27
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -0,0 +1,76 @@
+perf-stat(1)
+==========
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+$ perf stat sleep 1
+
+ Performance counter stats for 'sleep':
+
+       0.678356  task clock ticks     (msecs)
+              7  context switches     (events)
+              4  CPU migrations       (events)
+            232  pagefaults           (events)
+        1810403  CPU cycles           (events)
+         946759  instructions         (events)
+          18952  cache references     (events)
+           4885  cache misses         (events)
+
+ Wall-clock time elapsed:  1001.252894 msecs
+
+SEE ALSO
+--------
+linkperf:git-tops[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
new file mode 100644
index 0000000..057333b
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -0,0 +1,61 @@
+perf-top(1)
+==========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:git-stat[1]
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index a3bb6cd..605323c 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,6 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, char **argv, const char *prefix);
-extern int cmd_stat(int argc, char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh
index 75c68d9..f06f6fd 100755
--- a/Documentation/perf_counter/generate-cmdlist.sh
+++ b/Documentation/perf_counter/generate-cmdlist.sh
@@ -9,16 +9,16 @@ struct cmdname_help
 
 static struct cmdname_help common_cmds[] = {"
 
-sed -n -e 's/^git-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
 sort |
 while read cmd
 do
      sed -n '
-     /^NAME/,/git-'"$cmd"'/H
+     /^NAME/,/perf-'"$cmd"'/H
      ${
             x
-            s/.*git-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
 	    p
-     }' "Documentation/git-$cmd.txt"
+     }' "Documentation/perf-$cmd.txt"
 done
 echo "};"
-- 
cgit v0.10.2


From e33e0a43736307512422e41aee6e24d5a8c39181 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:58:01 +0200
Subject: perf_counter tools: add 'perf record' command

Move perf-record.c into the perf suite of commands.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
new file mode 100644
index 0000000..d07700e
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -0,0 +1,63 @@
+perf-record(1)
+==========
+
+NAME
+----
+perf-record - Run a command and record its profile into output.perf
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into output.perf - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:git-stat[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index fb8b717..b6c665eb 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -309,6 +309,7 @@ LIB_OBJS += usage.o
 LIB_OBJS += wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
new file mode 100644
index 0000000..4a50abf
--- /dev/null
+++ b/Documentation/perf_counter/builtin-record.c
@@ -0,0 +1,506 @@
+
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+extern asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags);
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= { };
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			nr_cpus				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 16;
+static int			output;
+static char 			*output_name			= "output.perf";
+static int			group				= 0;
+static unsigned int		realtime_prio			=  0;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-record [<options>]\n"
+	"perf-record Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -c CNT    --count=CNT          # event period to sample\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -o file   --output=<file>      # output file\n"
+	" -r prio   --realtime=<prio>    # use RT prio\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"output",	required_argument,	NULL, 'o'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'e': error				= parse_events(optarg); break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'o': output_name			= strdup(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+static long events;
+static struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	if (old != head)
+		events++;
+
+	size = head - old;
+
+	if ((old & md->mask) + size != (head & md->mask)) {
+		buf = &data[old & md->mask];
+		size = md->mask + 1 - (old & md->mask);
+		old += size;
+		while (size) {
+			int ret = write(output, buf, size);
+			if (ret < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+			size -= ret;
+			buf += ret;
+		}
+	}
+
+	buf = &data[old & md->mask];
+	size = head - old;
+	old += size;
+	while (size) {
+		int ret = write(output, buf, size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		size -= ret;
+		buf += ret;
+	}
+
+	md->prev = old;
+}
+
+static volatile int done = 0;
+
+static void sigchld_handler(int sig)
+{
+	if (sig == SIGCHLD)
+		done = 1;
+}
+
+int cmd_record(int argc, const char **argv)
+{
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	int i, counter, group_fd, nr_poll = 0;
+	pid_t pid;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
+	if (output < 0) {
+		perror("failed to create output file");
+		exit(-1);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.config		= event_id[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+			hw_event.nmi		= 1;
+			hw_event.mmap		= 1;
+			hw_event.comm		= 1;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	signal(SIGCHLD, sigchld_handler);
+
+	pid = fork();
+	if (pid < 0)
+		perror("failed to fork");
+
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	/*
+	 * TODO: store the current /proc/$/maps information somewhere
+	 */
+
+	while (!done) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == events)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 605323c..5854b17 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 52455d4..d15210a 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,5 +1,6 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
-perf-top                                mainporcelain common
+perf-record                             mainporcelain common
 perf-stat                               mainporcelain common
+perf-top                                mainporcelain common
 
diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/perf-record.c
deleted file mode 100644
index 614de7c..0000000
--- a/Documentation/perf_counter/perf-record.c
+++ /dev/null
@@ -1,530 +0,0 @@
-
-
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <getopt.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
-
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 295
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 333
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
-
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= { };
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int			nr_cpus				=  0;
-static unsigned int		page_size;
-static unsigned int		mmap_pages			= 16;
-static int			output;
-static char 			*output_name			= "output.perf";
-static int			group				= 0;
-static unsigned int		realtime_prio			=  0;
-
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-record [<options>]\n"
-	"perf-record Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -c CNT    --count=CNT          # event period to sample\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -o file   --output=<file>      # output file\n"
-	" -r prio   --realtime=<prio>    # use RT prio\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"output",	required_argument,	NULL, 'o'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'e': error				= parse_events(optarg); break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'o': output_name			= strdup(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
-struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
-};
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_counter_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-static long events;
-static struct timeval last_read, this_read;
-
-static void mmap_read(struct mmap_data *md)
-{
-	unsigned int head = mmap_read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
-	unsigned long size;
-	void *buf;
-	int diff;
-
-	gettimeofday(&this_read, NULL);
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		struct timeval iv;
-		unsigned long msecs;
-
-		timersub(&this_read, &last_read, &iv);
-		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-		fprintf(stderr, "WARNING: failed to keep up with mmap data."
-				"  Last read %lu msecs ago.\n", msecs);
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	last_read = this_read;
-
-	if (old != head)
-		events++;
-
-	size = head - old;
-
-	if ((old & md->mask) + size != (head & md->mask)) {
-		buf = &data[old & md->mask];
-		size = md->mask + 1 - (old & md->mask);
-		old += size;
-		while (size) {
-			int ret = write(output, buf, size);
-			if (ret < 0) {
-				perror("failed to write");
-				exit(-1);
-			}
-			size -= ret;
-			buf += ret;
-		}
-	}
-
-	buf = &data[old & md->mask];
-	size = head - old;
-	old += size;
-	while (size) {
-		int ret = write(output, buf, size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
-		size -= ret;
-		buf += ret;
-	}
-
-	md->prev = old;
-}
-
-static volatile int done = 0;
-
-static void sigchld_handler(int sig)
-{
-	if (sig == SIGCHLD)
-		done = 1;
-}
-
-int main(int argc, char *argv[])
-{
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-	struct perf_counter_hw_event hw_event;
-	int i, counter, group_fd, nr_poll = 0;
-	pid_t pid;
-	int ret;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
-	if (output < 0) {
-		perror("failed to create output file");
-		exit(-1);
-	}
-
-	argc -= optind;
-	argv += optind;
-
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= 1;
-			hw_event.mmap		= 1;
-			hw_event.comm		= 1;
-
-			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
-		}
-	}
-
-	signal(SIGCHLD, sigchld_handler);
-
-	pid = fork();
-	if (pid < 0)
-		perror("failed to fork");
-
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-
-	if (realtime_prio) {
-		struct sched_param param;
-
-		param.sched_priority = realtime_prio;
-		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-			printf("Could not set realtime priority.\n");
-			exit(-1);
-		}
-	}
-
-	/*
-	 * TODO: store the current /proc/$/maps information somewhere
-	 */
-
-	while (!done) {
-		int hits = events;
-
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
-
-		if (hits == events)
-			ret = poll(event_array, nr_poll, 100);
-	}
-
-	return 0;
-}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index ff8658f..e849dd6 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -248,8 +248,9 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
-		{ "top", cmd_top, 0 },
+		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
+		{ "top", cmd_top, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
cgit v0.10.2


From cc13a5913797365b683212f5fca5fb04bb3582eb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:01:30 +0200
Subject: perf_counter tools: fix --version

Hook up the 'perf version' built-in command.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 5854b17..aec5ae3 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -17,4 +17,5 @@ extern int check_pager_config(const char *cmd);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index e849dd6..20d508c 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -5,7 +5,7 @@
 #include "run-command.h"
 
 const char perf_usage_string[] =
-	"perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]";
+	"perf [--version] [--help] COMMAND [ARGS]";
 
 const char perf_more_info_string[] =
 	"See 'perf help COMMAND' for more information on a specific command.";
@@ -251,6 +251,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
+		{ "version", cmd_version, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
cgit v0.10.2


From 6142fdd968c76a0f2ee753c39bd5be8d1bb4ef04 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:05:55 +0200
Subject: perf_counter tools: add 'perf help'

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
new file mode 100644
index 0000000..f85fed5
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
+perf-help(1)
+===========
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+	Prints all the available commands on the standard output. This
+	option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index aec5ae3..800f86c 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
+extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 20d508c..8d6faec 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -248,6 +248,7 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
+		{ "help", cmd_help, 0 },
 		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
-- 
cgit v0.10.2


From 125e702b09a28a502e145fb434678ee27720fc48 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:13:46 +0200
Subject: perf_counter tools: fix 'make install'

Remove Git leftovers from this area.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index b6c665eb..690045e 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -255,9 +255,9 @@ EXTRA_PROGRAMS =
 PROGRAMS += $(EXTRA_PROGRAMS)
 
 #
-# None right now:
+# Single 'perf' binary right now:
 #
-# PROGRAMS += perf-fast-import$X
+PROGRAMS += perf
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
@@ -762,24 +762,9 @@ install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
-	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
-	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
-	execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \
-	{ $(RM) "$$execdir/perf-add$X" && \
-		ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \
-		cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \
-	{ for p in $(filter-out perf-add$X,$(BUILT_INS)); do \
-		$(RM) "$$execdir/$$p" && \
-		ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \
-		ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \
-		cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \
-	  done } && \
-	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
 ### Maintainer's dist rules
 
-- 
cgit v0.10.2


From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:59:34 -0400
Subject: tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING

Currently we have two configs: EVENT_TRACING and EVENT_TRACER.
All tracers enable EVENT_TRACING. The EVENT_TRACER is only a
convenience to enable the EVENT_TRACING when no other tracers
are enabled.

The names EVENT_TRACER and EVENT_TRACING are too similar and confusing.
This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more
appropriate to what it actually does, as well as add a comment in
the help menu to explain the option's purpose.

[ Impact: rename config option to reduce confusion ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3fa36d2..450d3c2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
-config EVENT_TRACER
+config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
 	select TRACING
 	help
@@ -191,6 +191,10 @@ config EVENT_TRACER
 	  allowing the user to pick and choose which trace point they
 	  want to trace.
 
+	  Note, all tracers enable event tracing. This option is
+	  only a convenience to enable event tracing when no other
+	  tracers are selected.
+
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-- 
cgit v0.10.2


From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 12:12:44 -0400
Subject: tracing/events: call the correct event trace selftest init function

The late_initcall calls a helper function instead of the proper
init event selftest function.

This update may have been lost due to conflicting merges.

[ Impact: fix compiler warning and call extended event trace self tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64f9d6d..98daf5d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void)
 	return 0;
 }
 
-late_initcall(event_trace_self_tests);
+late_initcall(event_trace_self_tests_init);
 
 #endif
-- 
cgit v0.10.2


From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 12:59:29 -0400
Subject: tracing: remove dangling semicolon

Due to a cut and paste error, the trace_seq_putc had a semicolon
after the prototype but before the stub function when tracing is
disabled.

[Impact: fix compile error ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 37db9bd..ba9627f 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str)
 {
 	return 0;
 }
-static inline int trace_seq_putc(struct trace_seq *s, unsigned char c);
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
 	return 0;
 }
-- 
cgit v0.10.2


From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:24:21 -0400
Subject: tracing: fix recursive test level calculation

The recursive tests to detect same level recursion in the ring buffers
did not account for the hard/softirq_counts to be shifted. Thus the
numbers could be larger than then mask to be tested.

This patch includes the shift for the calculation of the irq depth.

[ Impact: stop false positives in trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e145969..aa40ae9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 static int trace_irq_level(void)
 {
-	return hardirq_count() + softirq_count() + in_nmi();
+	return (hardirq_count() >> HARDIRQ_SHIFT) +
+		(softirq_count() >> + SOFTIRQ_SHIFT) +
+		!!in_nmi();
 }
 
 static int trace_recursive_lock(void)
-- 
cgit v0.10.2


From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:32:44 -0400
Subject: tracing: remove recursive test from ring_buffer_event_discard

The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve.
It can be called inside or outside the reserve/commit. Even if it
is called inside the reserve/commit the commit part must also be called.

Only ring_buffer_discard_commit can be used as a replacement for
ring_buffer_unlock_commit.

This patch removes the trace_recursive_unlock from ring_buffer_event_discard
since it would be the wrong place to do so.

[Impact: prevent breakage in trace recursive testing ]

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa40ae9..a699767 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
 	rb_event_discard(event);
-	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
-- 
cgit v0.10.2


From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 20:38:21 +0200
Subject: perfcounters, sched: remove __task_delta_exec()

This function was left orphan by the latest round of sw-counter
cleanups.

[ Impact: remove unused kernel function ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 080d1fd..a77c600 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 /*
  * Lock/unlock the current runqueue - to extract task statistics:
  */
-extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
 
 extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
diff --git a/kernel/sched.c b/kernel/sched.c
index b66a08c..a69278e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4550,29 +4550,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
-unsigned long long __task_delta_exec(struct task_struct *p, int update)
-{
-	s64 delta_exec;
-	struct rq *rq;
-
-	rq = task_rq(p);
-	WARN_ON_ONCE(!runqueue_is_locked());
-	WARN_ON_ONCE(!task_current(rq, p));
-
-	if (update)
-		update_rq_clock(rq);
-
-	delta_exec = rq->clock - p->se.exec_start;
-
-	WARN_ON_ONCE(delta_exec < 0);
-
-	return delta_exec;
-}
-
-/*
- * Return any ns on the sched_clock that have not yet been banked in
- * @p in case that task is currently running.
- */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
-- 
cgit v0.10.2


From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:26 +0100
Subject: sched: remove extra call overhead for schedule()

Lai Jiangshan's patch reminded me that I promised Nick to remove
that extra call overhead in schedule().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090313112300.927414207@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781..e1fb735 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -248,7 +248,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		__schedule();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 7601cee..797f6fd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5194,15 +5196,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
-- 
cgit v0.10.2


From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 16:16:11 -0400
Subject: tracing: use recursive counter over irq level

Althought using the irq level (hardirq_count, softirq_count and in_nmi)
was nice to detect bad recursion right away, but since the counters are
not atomically updated with respect to the interrupts, the function tracer
might trigger the test from an interrupt handler before the hardirq_count
is updated. This will trigger a false warning.

This patch converts the recursive detection to a simple counter.
If the depth is greater than 16 then the recursive detection will trigger.
16 is more than enough for any nested interrupts.

[ Impact: fix false positive trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a699767..7bcfd3e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
-static int trace_irq_level(void)
-{
-	return (hardirq_count() >> HARDIRQ_SHIFT) +
-		(softirq_count() >> + SOFTIRQ_SHIFT) +
-		!!in_nmi();
-}
+#define TRACE_RECURSIVE_DEPTH 16
 
 static int trace_recursive_lock(void)
 {
-	int level;
-
-	level = trace_irq_level();
+	current->trace_recursion++;
 
-	if (unlikely(current->trace_recursion & (1 << level))) {
-		/* Disable all tracing before we do anything else */
-		tracing_off_permanent();
+	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+		return 0;
 
-		printk_once(KERN_WARNING "Tracing recursion: "
-			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
-			    hardirq_count() >> HARDIRQ_SHIFT,
-			    softirq_count() >> SOFTIRQ_SHIFT,
-			    in_nmi());
+	/* Disable all tracing before we do anything else */
+	tracing_off_permanent();
 
-		WARN_ON_ONCE(1);
-		return -1;
-	}
+	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+		    current->trace_recursion,
+		    hardirq_count() >> HARDIRQ_SHIFT,
+		    softirq_count() >> SOFTIRQ_SHIFT,
+		    in_nmi());
 
-	current->trace_recursion |= 1 << level;
-
-	return 0;
+	WARN_ON_ONCE(1);
+	return -1;
 }
 
 static void trace_recursive_unlock(void)
 {
-	int level;
-
-	level = trace_irq_level();
-
-	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+	WARN_ON_ONCE(!current->trace_recursion);
 
-	current->trace_recursion &= ~(1 << level);
+	current->trace_recursion--;
 }
 
 static DEFINE_PER_CPU(int, rb_need_resched);
-- 
cgit v0.10.2


From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 18:16:44 -0400
Subject: tracing: use nowakeup version of commit for function event trace
 tests

The startup tests for the event tracer also runs with the function
tracer enabled. The "wakeup" version of the trace commit was used
which can grab spinlocks. If a task was preempted by an NMI
that called a function being traced, it could deadlock due to the
function tracer trying to grab the same lock.

Thanks to Frederic Weisbecker for pointing out where the bug was.

Reported-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 98daf5d..672b195 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	trace_current_buffer_unlock_commit(event, flags, pc);
+	trace_nowake_buffer_unlock_commit(event, flags, pc);
 
  out:
 	atomic_dec(&per_cpu(test_event_disable, cpu));
-- 
cgit v0.10.2


From 2ecf0a57c60dcb588f310d94412118e15c510532 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Apr 2009 12:16:56 +0900
Subject: ide-dma: don't reset request fields on dma_timeout_retry()

Impact: drop unnecessary code

Now that everything uses bio and block operations, there is no need to
reset request fields manually when retrying a request.  Every field is
guaranteed to be always valid.  Drop unnecessary request field
resetting from ide_dma_timeout_retry().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index a0b8cab1..d9123ec 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	/*
 	 * un-busy drive etc and make sure request is sane
 	 */
-
 	rq = hwif->rq;
-	if (!rq)
-		goto out;
-
-	hwif->rq = NULL;
-
-	rq->errors = 0;
-
-	if (!rq->bio)
-		goto out;
-
-	rq->sector = rq->bio->bi_sector;
-	rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9;
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->buffer = bio_data(rq->bio);
-out:
+	if (rq) {
+		hwif->rq = NULL;
+		rq->errors = 0;
+	}
 	return ret;
 }
 
-- 
cgit v0.10.2


From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 21 Apr 2009 08:40:49 +0530
Subject: sched: Replace first_cpu() with cpumask_first() in ILB nomination
 code

Stephen Rothwell reported this build warning:

>  kernel/sched.c: In function 'find_new_ilb':
>  kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type
>
> Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b
> ("sched: Nominate idle load balancer from a semi-idle package") from
> the sched tree.  Should this call to first_cpu be cpumask_first?

For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the
Idle load balancer as the first cpu from the nohz.cpu_mask.

This code uses the older API first_cpu(). Replace it with cpumask_first(),
which is the correct API here.

[ Impact: cleanup, address build warning ]

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090421031049.GA4140@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 797f6fd..54d67b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4356,7 +4356,7 @@ out_done:
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return first_cpu(nohz.cpu_mask);
+	return cpumask_first(nohz.cpu_mask);
 }
 #endif
 
-- 
cgit v0.10.2


From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:27 -0700
Subject: x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP config checks

Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks.

Fix CONFIG_INTR_REMAP checks.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index fbdd654..3738438 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
 extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
-#define EIM_8BIT_APIC_ID	0
-#define EIM_32BIT_APIC_ID	1
+extern int x2apic_mode;
 
 #ifdef CONFIG_X86_X2APIC
 /*
@@ -166,7 +165,7 @@ static inline u64 native_x2apic_icr_read(void)
 	return val;
 }
 
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
@@ -182,6 +181,8 @@ static inline int x2apic_enabled(void)
 		return 1;
 	return 0;
 }
+
+#define x2apic_supported()	(cpu_has_x2apic)
 #else
 static inline void check_x2apic(void)
 {
@@ -194,9 +195,8 @@ static inline int x2apic_enabled(void)
 	return 0;
 }
 
-#define	x2apic	0
 #define	x2apic_preenabled 0
-
+#define	x2apic_supported()	0
 #endif
 
 extern void enable_IR_x2apic(void);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e4..34eaa37 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,7 +161,6 @@ extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
-#ifdef CONFIG_X86_64
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
 extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -169,7 +168,6 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
 	struct IO_APIC_route_entry **ioapic_entries);
-#endif
 
 extern void probe_nr_irqs_gsi(void);
 
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760..f275e22 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
 #ifndef _ASM_X86_IRQ_REMAPPING_H
 #define _ASM_X86_IRQ_REMAPPING_H
 
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
 
 #endif	/* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7b41a32..2b30e52 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s)
 __setup("apicpmtimer", setup_apicpmtimer);
 #endif
 
+int x2apic_mode;
 #ifdef CONFIG_X86_X2APIC
-int x2apic;
 /* x2apic enabled before OS handover */
 static int x2apic_preenabled;
 static int disable_x2apic;
@@ -858,7 +858,7 @@ void clear_local_APIC(void)
 	u32 v;
 
 	/* APIC hasn't been mapped yet */
-	if (!x2apic && !apic_phys)
+	if (!x2apic_mode && !apic_phys)
 		return;
 
 	maxlvt = lapic_get_maxlvt();
@@ -1330,7 +1330,7 @@ void check_x2apic(void)
 {
 	if (x2apic_enabled()) {
 		pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
-		x2apic_preenabled = x2apic = 1;
+		x2apic_preenabled = x2apic_mode = 1;
 	}
 }
 
@@ -1338,7 +1338,7 @@ void enable_x2apic(void)
 {
 	int msr, msr2;
 
-	if (!x2apic)
+	if (!x2apic_mode)
 		return;
 
 	rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1390,25 +1390,17 @@ void __init enable_IR_x2apic(void)
 	mask_IO_APIC_setup(ioapic_entries);
 	mask_8259A();
 
-#ifdef CONFIG_X86_X2APIC
-	if (cpu_has_x2apic)
-		ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-	else
-#endif
-		ret = enable_intr_remapping(EIM_8BIT_APIC_ID);
-
+	ret = enable_intr_remapping(x2apic_supported());
 	if (ret)
 		goto end_restore;
 
 	pr_info("Enabled Interrupt-remapping\n");
 
-#ifdef CONFIG_X86_X2APIC
-	if (cpu_has_x2apic && !x2apic) {
-		x2apic = 1;
+	if (x2apic_supported() && !x2apic_mode) {
+		x2apic_mode = 1;
 		enable_x2apic();
 		pr_info("Enabled x2apic\n");
 	}
-#endif
 
 end_restore:
 	if (ret)
@@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-	if (x2apic) {
+	if (x2apic_mode) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
 	}
@@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 
 	local_irq_save(flags);
 	disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
 	if (intr_remapping_enabled)
 		disable_intr_remapping();
-#endif
+
 	local_irq_restore(flags);
 	return 0;
 }
@@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev)
 	unsigned int l, h;
 	unsigned long flags;
 	int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
 	int ret;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
@@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev)
 		mask_8259A();
 	}
 
-	if (x2apic)
+	if (x2apic_mode)
 		enable_x2apic();
-#else
-	if (!apic_pm_state.active)
-		return 0;
-
-	local_irq_save(flags);
-	if (x2apic)
-		enable_x2apic();
-#endif
-
 	else {
 		/*
 		 * Make sure the APICBASE points to the right address
@@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled) {
-		if (x2apic)
-			reenable_intr_remapping(EIM_32BIT_APIC_ID);
-		else
-			reenable_intr_remapping(EIM_8BIT_APIC_ID);
-
+		reenable_intr_remapping(x2apic_mode);
 		unmask_8259A();
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
 	}
-#endif
 
 	local_irq_restore(flags);
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ea22a86..3a45d2e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str)
 __setup("pirq=", ioapic_pirq_setup);
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_INTR_REMAP
 struct IO_APIC_route_entry **alloc_ioapic_entries(void)
 {
 	int apic;
@@ -857,7 +856,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 
 	kfree(ioapic_entries);
 }
-#endif
 
 /*
  * Find the IRQ entry number of a certain pin.
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652..bc3e880 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
 void __init default_setup_apic_routing(void)
 {
 #ifdef CONFIG_X86_X2APIC
-	if (x2apic && (apic != &apic_x2apic_phys &&
+	if (x2apic_mode && (apic != &apic_x2apic_phys &&
 #ifdef CONFIG_X86_UV
 		       apic != &apic_x2apic_uv_x &&
 #endif
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 06f592a..10ff5c4 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -158,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
 }
 #define irq_remapped(irq)		(0)
 #define enable_intr_remapping(mode)	(-1)
+#define disable_intr_remapping()	(0)
+#define reenable_intr_remapping(mode)	(0)
 #define intr_remapping_enabled		(0)
 #endif
 
-- 
cgit v0.10.2


From 25629d810a52176758401184d9b437fbb7f79195 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:28 -0700
Subject: x86: x2apic, IR: Move eoi_ioapic_irq() into a CONFIG_INTR_REMAP
 section

Address the following complier warning:

   arch/x86/kernel/apic/io_apic.c:2543: warning: `eoi_ioapic_irq' defined but not used

By moving that function (and eoi_ioapic_irq()) into an existing
#ifdef CONFIG_INTR_REMAP section of the code.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.271099000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Weidong Han <weidong.han@intel.com>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3a45d2e..4baa9cb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2517,39 +2517,6 @@ static void irq_complete_move(struct irq_desc **descp)
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-
-	entry = cfg->irq_2_pin;
-	for (;;) {
-
-		if (!entry)
-			break;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		io_apic_eoi(apic, pin);
-		entry = entry->next;
-	}
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__eoi_ioapic_irq(irq, cfg);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 static void ack_apic_edge(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2659,6 +2626,39 @@ static void ack_apic_level(unsigned int irq)
 }
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		io_apic_eoi(apic, pin);
+		entry = entry->next;
+	}
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ir_ack_apic_edge(unsigned int irq)
 {
 	ack_APIC_irq();
-- 
cgit v0.10.2


From 39d83a5d684a457046aa2a6dac60f105966e78e9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:29 -0700
Subject: x86: x2apic, IR: Clean up panic() with nox2apic boot option

Instead of panic() ignore the "nox2apic" boot option when BIOS
has already enabled x2apic prior to OS handover.

[ Impact: printk warning instead of panic() when BIOS has enabled x2apic already ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.425091000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2b30e52..d32f558 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -141,8 +141,12 @@ static int x2apic_preenabled;
 static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
-	if (x2apic_enabled())
-		panic("Bios already enabled x2apic, can't enforce nox2apic");
+	if (x2apic_enabled()) {
+		pr_warning("Bios already enabled x2apic, "
+			   "can't enforce nox2apic");
+		return 0;
+	}
+
 	disable_x2apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 	return 0;
-- 
cgit v0.10.2


From 9d6c26e73bd248c286bb3597aaf788716e8fcceb Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:31 -0700
Subject: x86: x2apic, IR: Make config X86_UV dependent on X86_X2APIC

Instead of selecting X86_X2APIC, make config X86_UV dependent
on X86_X2APIC.

This will eliminate enabling CONFIG_X86_X2APIC with out
enabling CONFIG_INTR_REMAP.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Jack Steiner <steiner@sgi.com>
Cc: dwmw2@infradead.org
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.694598000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9086e6..58fb7b3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -354,7 +354,7 @@ config X86_UV
 	depends on X86_64
 	depends on X86_EXTENDED_PLATFORM
 	depends on NUMA
-	select X86_X2APIC
+	depends on X86_X2APIC
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
 	  If you don't have one of these, you should say N here.
-- 
cgit v0.10.2


From 89388913f2c88a2cd15d24abab571b17a2596127 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 21 Apr 2009 11:39:27 +0300
Subject: x86: unify noexec handling

This patch unifies noexec handling on 32-bit and 64-bit.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
[ mingo@elte.hu: build fix ]
LKML-Reference: <1240303167.771.69.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc..4d258ad 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
-extern void set_nx(void);
 
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fd3da1d..fedde53 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -22,6 +22,69 @@ int direct_gbpages
 #endif
 ;
 
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		disable_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		disable_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+	unsigned int v[4], l, h;
+
+	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+		if ((v[3] & (1 << 20)) && !disable_nx) {
+			rdmsr(MSR_EFER, l, h);
+			l |= EFER_NX;
+			wrmsr(MSR_EFER, l, h);
+			nx_enabled = 1;
+			__supported_pte_mask |= _PAGE_NX;
+		}
+	}
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || disable_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
@@ -158,12 +221,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
 		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
 
 	/* Enable PSE if available */
 	if (cpu_has_pse)
@@ -174,7 +234,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		set_in_cr4(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
-#endif
 
 	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559e..2b27120 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -587,61 +587,9 @@ void zap_low_mappings(void)
 	flush_tlb_all();
 }
 
-int nx_enabled;
-
 pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-	if (!str || !strcmp(str, "on")) {
-		if (cpu_has_nx) {
-			__supported_pte_mask |= _PAGE_NX;
-			disable_nx = 0;
-		}
-	} else {
-		if (!strcmp(str, "off")) {
-			disable_nx = 1;
-			__supported_pte_mask &= ~_PAGE_NX;
-		} else {
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
-	unsigned int v[4], l, h;
-
-	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
-		if ((v[3] & (1 << 20)) && !disable_nx) {
-			rdmsr(MSR_EFER, l, h);
-			l |= EFER_NX;
-			wrmsr(MSR_EFER, l, h);
-			nx_enabled = 1;
-			__supported_pte_mask |= _PAGE_NX;
-		}
-	}
-}
-#endif
-
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e80..a4e7846 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -85,39 +85,6 @@ early_param("gbpages", parse_direct_gbpages_on);
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on	Enable (default)
- * off	Disable
- */
-static int __init nonx_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	if (!strncmp(str, "on", 2)) {
-		__supported_pte_mask |= _PAGE_NX;
-		disable_nx = 0;
-	} else if (!strncmp(str, "off", 3)) {
-		disable_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	}
-	return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || disable_nx)
-		__supported_pte_mask &= ~_PAGE_NX;
-}
-
 int force_personality32;
 
 /*
-- 
cgit v0.10.2


From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:11:46 +0800
Subject: tracing/filters: don't remove old filters when failed to write
 subsys->filter

If writing subsys->filter returns EINVAL or ENOSPC, the original
filters in subsys/ and subsys/events/ will be removed. This is
definitely wrong.

[ Impact: fix filter setting semantics on error condition ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 672b195..9ea55a7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	err = filter_add_subsystem_pred(system, pred);
 	if (err < 0) {
-		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
 		return err;
 	}
-- 
cgit v0.10.2


From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:12:11 +0800
Subject: tracing/filters: allow user-input to be integer-like string

Suppose we would like to trace all tasks named '123', but this
will fail:

 # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter
 bash: echo: write error: Invalid argument

Don't guess the type of the filter pred in filter_parse(), but instead
we check it in __filter_add_pred().

[ Impact: extend allowed filter field string values ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e0fcfd2..6541828 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
+	unsigned long long val;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
@@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_len)
-			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
 		return filter_add_pred_fn(call, pred, fn);
 	} else {
-		if (pred->str_len)
+		if (strict_strtoull(pred->str_val, 0, &val))
 			return -EINVAL;
+		pred->val = val;
 	}
 
 	switch (field->size) {
@@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	return 0;
 }
 
+/*
+ * The filter format can be
+ *   - 0, which means remove all filter preds
+ *   - [||/&&] <field> ==/!= <val>
+ */
 int filter_parse(char **pbuf, struct filter_pred *pred)
 {
-	char *tmp, *tok, *val_str = NULL;
+	char *tok, *val_str = NULL;
 	int tok_n = 0;
 
-	/* field ==/!= number, or/and field ==/!= number, number */
 	while ((tok = strsep(pbuf, " \n"))) {
 		if (tok_n == 0) {
 			if (!strcmp(tok, "0")) {
@@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
+	strcpy(pred->str_val, val_str);
+	pred->str_len = strlen(val_str);
+
 	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 	if (!pred->field_name)
 		return -ENOMEM;
 
-	pred->str_len = 0;
-	pred->val = simple_strtoull(val_str, &tmp, 0);
-	if (tmp == val_str) {
-		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
-		pred->str_len = strlen(val_str);
-		pred->str_val[pred->str_len] = '\0';
-	} else if (*tmp != '\0')
-		return -EINVAL;
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 21 Apr 2009 09:41:26 -0400
Subject: ring-buffer: only warn on wrap if buffer is bigger than two pages

On boot up, to save memory, ftrace allocates the minimum buffer
which is two pages. Ftrace also goes through a series of tests
(when configured) on boot up. These tests can fill up a page within
a single interrupt.

The ring buffer also has a WARN_ON when it detects that the buffer was
completely filled within a single commit (other commits are allowed to
be nested).

Combine the small buffer on start up, with the tests that can fill more
than a single page within an interrupt, this can trigger the WARN_ON.

This patch makes the WARN_ON only happen when the ring buffer consists
of more than two pages.

[ Impact: prevent false WARN_ON in ftrace startup tests ]

Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090421094616.GA14561@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7bcfd3e..61dbdf2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			WARN_ON_ONCE(1);
+			/* This can easily happen on small ring buffers */
+			WARN_ON_ONCE(buffer->pages > 2);
 			goto out_reset;
 		}
 
-- 
cgit v0.10.2


From 19bdc9d061bcb71efd2b53083d96b59bbe1a1751 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 17 Apr 2009 05:26:31 +0000
Subject: clocksource: sh_cmt clocksource support

Add clocksource support to the sh_cmt driver. With this in
place we can do tickless with a single CMT channel.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 02bae39..c247564 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -47,6 +47,7 @@ struct sh_cmt_priv {
 	unsigned long rate;
 	spinlock_t lock;
 	struct clock_event_device ced;
+	struct clocksource cs;
 	unsigned long total_cycles;
 };
 
@@ -376,6 +377,68 @@ static void sh_cmt_stop(struct sh_cmt_priv *p, unsigned long flag)
 	spin_unlock_irqrestore(&p->lock, flags);
 }
 
+static struct sh_cmt_priv *cs_to_sh_cmt(struct clocksource *cs)
+{
+	return container_of(cs, struct sh_cmt_priv, cs);
+}
+
+static cycle_t sh_cmt_clocksource_read(struct clocksource *cs)
+{
+	struct sh_cmt_priv *p = cs_to_sh_cmt(cs);
+	unsigned long flags, raw;
+	unsigned long value;
+	int has_wrapped;
+
+	spin_lock_irqsave(&p->lock, flags);
+	value = p->total_cycles;
+	raw = sh_cmt_get_counter(p, &has_wrapped);
+
+	if (unlikely(has_wrapped))
+		raw = p->match_value;
+	spin_unlock_irqrestore(&p->lock, flags);
+
+	return value + raw;
+}
+
+static int sh_cmt_clocksource_enable(struct clocksource *cs)
+{
+	struct sh_cmt_priv *p = cs_to_sh_cmt(cs);
+	int ret;
+
+	p->total_cycles = 0;
+
+	ret = sh_cmt_start(p, FLAG_CLOCKSOURCE);
+	if (ret)
+		return ret;
+
+	/* TODO: calculate good shift from rate and counter bit width */
+	cs->shift = 0;
+	cs->mult = clocksource_hz2mult(p->rate, cs->shift);
+	return 0;
+}
+
+static void sh_cmt_clocksource_disable(struct clocksource *cs)
+{
+	sh_cmt_stop(cs_to_sh_cmt(cs), FLAG_CLOCKSOURCE);
+}
+
+static int sh_cmt_register_clocksource(struct sh_cmt_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clocksource *cs = &p->cs;
+
+	cs->name = name;
+	cs->rating = rating;
+	cs->read = sh_cmt_clocksource_read;
+	cs->enable = sh_cmt_clocksource_enable;
+	cs->disable = sh_cmt_clocksource_disable;
+	cs->mask = CLOCKSOURCE_MASK(sizeof(unsigned long) * 8);
+	cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	pr_info("sh_cmt: %s used as clock source\n", cs->name);
+	clocksource_register(cs);
+	return 0;
+}
+
 static struct sh_cmt_priv *ced_to_sh_cmt(struct clock_event_device *ced)
 {
 	return container_of(ced, struct sh_cmt_priv, ced);
@@ -483,6 +546,9 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name,
 	if (clockevent_rating)
 		sh_cmt_register_clockevent(p, name, clockevent_rating);
 
+	if (clocksource_rating)
+		sh_cmt_register_clocksource(p, name, clocksource_rating);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 22 Apr 2009 16:53:34 +0800
Subject: tracing/events: make struct trace_entry->type to be int type

struct trace_entry->type is unsigned char, while trace event's id is
int type, thus for a event with id >= 256, it's entry->type is cast
to (id % 256), and then we can't see the trace output of this event.

 # insmod trace-events-sample.ko
 # echo foo_bar > /mnt/tracing/set_event
 # cat /debug/tracing/events/trace-events-sample/foo_bar/id
 256
 # cat /mnt/tracing/trace_pipe
           <...>-3548  [001]   215.091142: Unknown type 0
           <...>-3548  [001]   216.089207: Unknown type 0
           <...>-3548  [001]   217.087271: Unknown type 0
           <...>-3548  [001]   218.085332: Unknown type 0

[ Impact: fix output for trace events with id >= 256 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 75f3ac0..2a4a407 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -16,7 +16,7 @@ struct dentry;
  *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
  */
 struct trace_entry {
-	unsigned char		type;
+	int			type;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
@@ -73,7 +73,7 @@ enum print_line_t {
 
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc);
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 39a3351..15ef08d 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -198,7 +198,7 @@ ftrace_define_fields_##call(void)					\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	int ret;							\
 									\
-	__common_field(unsigned char, type);				\
+	__common_field(int, type);					\
 	__common_field(unsigned char, flags);				\
 	__common_field(unsigned char, preempt_count);			\
 	__common_field(int, pid);					\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b9a3adc..b6183bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags, int pc)
 {
@@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 }
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc)
 {
 	return trace_buffer_lock_reserve(&global_trace,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 247948e..7d55bcf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 struct ring_buffer_event;
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags,
 						    int pc);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9ea55a7..5d6e879 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(unsigned char, type),
+				FIELD(int, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
-- 
cgit v0.10.2


From ff166cb57a17124af75714a9c11f448f56f1a4a3 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:30 -0700
Subject: x86: x2apic, IR: remove reinit_intr_remapped_IO_APIC()

When interrupt-remapping is enabled, we are relying on
setup_IO_APIC_irqs() to configure remapped entries in the
IO-APIC, which comes little bit later after enabling
interrupt-remapping.

Meanwhile, restoration of old io-apic entries after enabling
interrupt-remapping will not make the interrupts through
io-apic functional anyway.

So remove the unnecessary reinit_intr_remapped_IO_APIC() step.

The longer story:

When interrupt-remapping is enabled, IO-APIC entries need to be
setup in the re-mappable format (pointing to
interrupt-remapping table entries setup by the OS). This
remapping configuration is happening in the same place where we
traditionally configure IO-APIC (i.e., in
setup_IO_APIC_irqs()).

So when we enable interrupt-remapping successfully, there is no
need to restore old io-apic RTE entries before we actually do a
complete configuration shortly in setup_IO_APIC_irqs(). Old
IO-APIC RTE's may be in traditional format (non re-mappable) or
in re-mappable format pointing to interrupt-remapping table
entries setup by BIOS. Restoring both of these will not make
IO-APIC functional. We have to rely on setup_IO_APIC_irqs() for
proper configuration by OS.

So I am removing this unnecessary and broken step.

[ Impact: remove unnecessary/broken IO-APIC setup step ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Weidong Han <weidong.han@intel.com>
Cc: dwmw2@infradead.org
LKML-Reference: <20090420200450.552359000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 34eaa37..1cf1450 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -166,8 +166,6 @@ extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
-	struct IO_APIC_route_entry **ioapic_entries);
 
 extern void probe_nr_irqs_gsi(void);
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d32f558..1386dbe 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1412,8 +1412,6 @@ end_restore:
 		 * IR enabling failed
 		 */
 		restore_IO_APIC_setup(ioapic_entries);
-	else
-		reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
 
 	unmask_8259A();
 	local_irq_restore(flags);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4baa9cb..8aef5f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -833,20 +833,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
 	return 0;
 }
 
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
-	struct IO_APIC_route_entry **ioapic_entries)
-
-{
-	/*
-	 * for now plain restore of previous settings.
-	 * TBD: In the case of OS enabling interrupt-remapping,
-	 * IO-APIC RTE's need to be setup to point to interrupt-remapping
-	 * table entries. for now, do a plain restore, and wait for
-	 * the setup_IO_APIC_irqs() to do proper initialization.
-	 */
-	restore_IO_APIC_setup(ioapic_entries);
-}
-
 void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 {
 	int apic;
-- 
cgit v0.10.2


From 9cbf117662e24c6d33245666804487f92c21b59d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 04:51:29 +0200
Subject: tracing/events: provide string with undefined size support

This patch provides the support for dynamic size strings on
event tracing.

The key concept is to use a structure with an ending char array field of
undefined size and use such ability to allocate the minimal size on the
ring buffer to make one or more string entries fit inside, as opposite
to a fixed length strings with upper bound.

The strings themselves are represented using fields which have an offset
value from the beginning of the entry.

This patch provides three new macros:

__string(item, src)

This one declares a string to the structure inside TP_STRUCT__entry.
You need to provide the name of the string field and the source that will
be copied inside.
This will also add the dynamic size of the string needed for the ring
buffer entry allocation.
A stack allocated structure is used to temporarily store the offset
of each strings, avoiding double calls to strlen() on each event
insertion.

__get_str(field)

This one will give you a pointer to the string you have created. This
is an abstract helper to resolve the absolute address given the field
name which is a relative address from the beginning of the trace_structure.

__assign_str(dst, src)

Use this macro to automatically perform the string copy from src to
dst. src must be a variable to assign and dst is the name of a __string
field.

Example on how to use it:

TRACE_EVENT(my_event,
	TP_PROTO(char *src1, char *src2),

	TP_ARGS(src1, src2),
	TP_STRUCT__entry(
		__string(str1, src1)
		__string(str2, src2)
	),
	TP_fast_assign(
		__assign_str(str1, src1);
		__assign_str(str2, src2);
	),
	TP_printk("%s %s", __get_str(src1), __get_str(src2))
)

Of course you can mix-up any __field or __array inside this
TRACE_EVENT. The position of the __string or __assign_str
doesn't matter.

Changes in v2:

Address the suggestion of Steven Rostedt: drop the opening_string() macro
and redefine __ending_string() to get the size of the string to be copied
instead of overwritting the whole ring buffer allocation.

Changes in v3:

Address other suggestions of Steven Rostedt and Peter Zijlstra with
some changes: drop the __ending_string and the need to have only one
string field.
Use offsets instead of absolute addresses.

[ Impact: allow more compact memory usage for string tracing ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 15ef08d..5a7d18c 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -27,6 +27,9 @@
 #undef __field
 #define __field(type, item)		type	item;
 
+#undef __string
+#define __string(item, src)		int	__str_loc_##item;
+
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
 
@@ -35,14 +38,53 @@
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
+		char			__str_data[0];		\
 	};							\
 	static struct ftrace_event_call event_##name
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+
 /*
  * Stage 2 of the trace events.
  *
+ * Include the following:
+ *
+ * struct ftrace_str_offsets_<call> {
+ *	int				<str1>;
+ *	int				<str2>;
+ *	[...]
+ * };
+ *
+ * The __string() macro will create each int <str>, this is to
+ * keep the offset of each string from the beggining of the event
+ * once we perform the strlen() of the src strings.
+ *
+ */
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __field
+#define __field(type, item);
+
+#undef __string
+#define __string(item, src)	int item;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+	struct ftrace_str_offsets_##call {				\
+		tstruct;						\
+	};
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
  * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
@@ -80,6 +122,9 @@
 #undef TP_printk
 #define TP_printk(fmt, args...) fmt "\n", args
 
+#undef __get_str
+#define __get_str(field)	(char *)__entry + __entry->__str_loc_##field
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
@@ -146,6 +191,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	if (!ret)							\
 		return 0;
 
+#undef __string
+#define __string(item, src)						       \
+	ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t"	       \
+			       "offset:%u;tsize:%u;\n",			       \
+			       (unsigned int)offsetof(typeof(field),	       \
+					__str_loc_##item),		       \
+			       (unsigned int)sizeof(field.__str_loc_##item));  \
+	if (!ret)							       \
+		return 0;
+
 #undef __entry
 #define __entry REC
 
@@ -189,6 +244,12 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
+#undef __string
+#define __string(item, src)						       \
+	ret = trace_define_field(event_call, "__str_loc", #item,	       \
+				offsetof(typeof(field), __str_loc_##item),     \
+				sizeof(field.__str_loc_##item));
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 int									\
@@ -212,7 +273,7 @@ ftrace_define_fields_##call(void)					\
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
- * Stage 3 of the trace events.
+ * Stage 4 of the trace events.
  *
  * Override the macros in <trace/trace_events.h> to include the following:
  *
@@ -409,6 +470,23 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #undef __entry
 #define __entry entry
 
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __string
+#define __string(item, src)						       \
+	__str_offsets.item = __str_size +				       \
+			     offsetof(typeof(*entry), __str_data);	       \
+	__str_size += strlen(src) + 1;
+
+#undef __assign_str
+#define __assign_str(dst, src)						\
+	__entry->__str_loc_##dst = __str_offsets.dst;			\
+	strcpy(__get_str(dst), src);
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 _TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
@@ -417,18 +495,22 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
+	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;  \
 	struct ftrace_event_call *call = &event_##call;			\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
+	int __str_size = 0;						\
 	int pc;								\
 									\
 	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
+	tstruct;							\
+									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call),	\
-				  irq_flags, pc);			\
+				 sizeof(struct ftrace_raw_##call) + __str_size,\
+				 irq_flags, pc);			\
 	if (!event)							\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
-- 
cgit v0.10.2


From 7e7ca9a22dbbc5c91763cd16923c7509918709b6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 04:54:49 +0200
Subject: tracing/lock: provide lock_acquired event support for dynamic size
 string

Now that we can support the dynamic sized string, make the lock tracing
able to use it, making it safe against modules removal and consuming
the right amount of memory needed for each lock name

Changes in v2:
adapt to the __ending_string() updates and the opening_string() removal.

[ Impact: protect lock tracer against module removal ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
index 45e326b..3ca315c 100644
--- a/include/trace/events/lockdep.h
+++ b/include/trace/events/lockdep.h
@@ -38,16 +38,16 @@ TRACE_EVENT(lock_acquired,
 	TP_ARGS(lock, ip, waittime),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, lock->name)
 		__field(unsigned long, wait_usec)
 		__field(unsigned long, wait_nsec_rem)
 	),
 	TP_fast_assign(
-		__entry->name = lock->name;
+		__assign_str(name, lock->name);
 		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
 		__entry->wait_usec = (unsigned long) waittime;
 	),
-	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+	TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
 				       __entry->wait_nsec_rem)
 );
 
-- 
cgit v0.10.2


From 6a74aa40907757ec98d8710ff66cd4cfe064e7d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 22 Apr 2009 00:41:09 +0200
Subject: tracing/events: protect __get_str()

The __get_str() macro is used in a code part then its content should be
protected with parenthesis.

[ Impact: make macro definition more robust ]

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5a7d18c..a77f71a 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -123,7 +123,7 @@
 #define TP_printk(fmt, args...) fmt "\n", args
 
 #undef __get_str
-#define __get_str(field)	(char *)__entry + __entry->__str_loc_##field
+#define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-- 
cgit v0.10.2


From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 10:25:24 -0400
Subject: tracing/wakeup: move access to wakeup_cpu into spinlock

The code had the following outside the lock:

        if (next != wakeup_task)
                return;

        pc = preempt_count();

        /* The task we are waiting for is waking up */
        data = wakeup_trace->data[wakeup_cpu];

On initialization, wakeup_task is NULL and wakeup_cpu -1. This code
is not under a lock. If wakeup_task is set on another CPU as that
task is waking up, we can see the wakeup_task before wakeup_cpu is
set. If we read wakeup_cpu while it is still -1 then we will have
a bad data pointer.

This patch moves the reading of wakeup_cpu within the protection of
the spinlock used to protect the writing of wakeup_cpu and wakeup_task.

[ Impact: remove possible race causing invalid pointer dereference ]

Reported-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index b8b13c5..eacb272 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 
 	pc = preempt_count();
 
-	/* The task we are waiting for is waking up */
-	data = wakeup_trace->data[wakeup_cpu];
-
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
+	/* The task we are waiting for is waking up */
+	data = wakeup_trace->data[wakeup_cpu];
+
 	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
-- 
cgit v0.10.2


From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:03:29 -0400
Subject: tracing: increase size of number of possible events

With the new event tracing registration, we must increase the number
of events that can be registered. Currently the type field is only
one byte, which leaves us only 256 possible events.

Since we do not save the CPU number in the tracer anymore (it is determined
by the per cpu ring buffer that is used) we have an extra byte to use.

This patch increases the size of type from 1 byte (256 events) to
2 bytes (65,536 events).

It also adds a WARN_ON_ONCE if we exceed that limit.

[ Impact: allow more than 255 events ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 2a4a407..07e0a6d 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -16,13 +16,16 @@ struct dentry;
  *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
  */
 struct trace_entry {
-	int			type;
+	unsigned short		type;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
 	int			tgid;
 };
 
+#define FTRACE_MAX_EVENT						\
+	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
 /*
  * Trace iterator - used by printout routines who present trace
  * results to users and which routines might sleep, etc:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d6e879..9887131 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(int, type),
+				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 83a8abb..06997e7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
+	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
-- 
cgit v0.10.2


From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:43:36 -0400
Subject: tracing: add size checks for exported ftrace internal structures

The events exported by TRACE_EVENT are automated and are guaranteed
to be correct when used.

The internal ftrace structures on the other hand are more manually
exported. These require the ftrace maintainer to make sure they
are up to date.

This patch adds a size check to help flag when a type changes in
an internal ftrace data structure, and the update needs to be reflected
in the export.

If a export is incorrect, then the only harm is that the user space
tools will not know how to correctly read the internal structures of
ftrace.

[ Impact: help prevent inconsistent ftrace format print outs ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9887131..b920815 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+extern char *__bad_type_size(void);
+
 #undef FIELD
 #define FIELD(type, name)						\
+	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
 		sizeof(field.name)
 
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 48fc02f..0cb1a14 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
 #undef TRACE_STRUCT
 #define TRACE_STRUCT(args...) args
 
+extern void __bad_type_size(void);
+
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
+	if (sizeof(type) != sizeof(field.item))				\
+		__bad_type_size();					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
-- 
cgit v0.10.2


From d7285c6b5c54397fdf112c2fb98ee43193173aa9 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Thu, 23 Apr 2009 10:21:38 -0700
Subject: x86: use native register access for native tlb flushing

currently these are paravirtulaized, doesn't appear any callers rely on
this (no pv_ops backends are using native_tlb and overriding cr3/4
access).

[ Impact: fix lockdep warning with paravirt and function tracer ]

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
LKML-Reference: <20090423172138.GR3036@sequoia.sous-sol.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d3539f9..e2927c5 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
 
 static inline void __native_flush_tlb(void)
 {
-	write_cr3(read_cr3());
+	native_write_cr3(native_read_cr3());
 }
 
 static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
 	 */
 	raw_local_irq_save(flags);
 
-	cr4 = read_cr4();
+	cr4 = native_read_cr4();
 	/* clear PGE */
-	write_cr4(cr4 & ~X86_CR4_PGE);
+	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
-	write_cr4(cr4);
+	native_write_cr4(cr4);
 
 	raw_local_irq_restore(flags);
 }
-- 
cgit v0.10.2


From c2518c4366f087ebc10b3919cb2461bbe4f42d0c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 23 Apr 2009 23:26:18 -0400
Subject: tracing: fix cut and paste macro error

In case a module uses the TRACE_EVENT macro for creating automated
events in ftrace, it may choose to use a different file name
than the defined system name, or choose to use a different path than
the default "include/trace/events" include path.

If this is done, then before including trace/define_trace.h the
header would define either "TRACE_INCLUDE_FILE" for the file
name or "TRACE_INCLUDE_PATH" for the include path.

If it does not define these, then the define_trace.h defines them
instead. If define trace defines them, then define_trace.h should
also undefine them before exiting. To do this a macro is used
to note this:

 #ifndef TRACE_INCLUDE_FILE
 # define TRACE_INCLUDE_FILE TRACE_SYSTEM
 # define UNDEF_TRACE_INCLUDE_FILE
 #endif

[...]

 #ifdef UNDEF_TRACE_INCLUDE_FILE
 # undef TRACE_INCLUDE_FILE
 # undef UNDEF_TRACE_INCLUDE_FILE
 #endif

The UNDEF_TRACE_INCLUDE_FILE acts as a CPP variable to know to undef
the TRACE_INCLUDE_FILE before leaving define_trace.h.

Unfortunately, due to cut and paste errors, the macros between
FILE and PATH got mixed up.

[ Impact: undef TRACE_INCLUDE_FILE and/or TRACE_INCLUDE_PATH when needed ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 7f1f23d..abc611f 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -44,7 +44,7 @@
 
 #ifndef TRACE_INCLUDE_PATH
 # define __TRACE_INCLUDE(system) <trace/events/system.h>
-# define UNDEF_TRACE_INCLUDE_FILE
+# define UNDEF_TRACE_INCLUDE_PATH
 #else
 # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
 #endif
@@ -64,13 +64,13 @@
 
 /* Only undef what we defined in this file */
 #ifdef UNDEF_TRACE_INCLUDE_FILE
-# undef TRACE_INCLUDE_PATH
+# undef TRACE_INCLUDE_FILE
 # undef UNDEF_TRACE_INCLUDE_FILE
 #endif
 
-#ifdef UNDEF_TRACE_INCLUDE_FILE
+#ifdef UNDEF_TRACE_INCLUDE_PATH
 # undef TRACE_INCLUDE_PATH
-# undef UNDEF_TRACE_INCLUDE_FILE
+# undef UNDEF_TRACE_INCLUDE_PATH
 #endif
 
 /* We may be processing more files */
-- 
cgit v0.10.2


From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 24 Apr 2009 11:27:05 +0800
Subject: ring_buffer: compressed event header

RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes
an 'u32' to save the actually length for events which data size > 28.

This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA.

[ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <49F13189.3090000@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fac8f1a..1c2f809 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -11,7 +11,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	u32		type:2, len:3, time_delta:27;
+	u32		type_len:5, time_delta:27;
 	u32		array[];
 };
 
@@ -24,7 +24,8 @@ struct ring_buffer_event {
  *				  size is variable depending on how much
  *				  padding is needed
  *				 If time_delta is non zero:
- *				  everything else same as RINGBUF_TYPE_DATA
+ *				  array[0] holds the actual length
+ *				  size = 4 + length (bytes)
  *
  * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
  *				 array[0] = time delta (28 .. 59)
@@ -35,22 +36,23 @@ struct ring_buffer_event {
  *				 array[1..2] = tv_sec
  *				 size = 16 bytes
  *
- * @RINGBUF_TYPE_DATA:		Data record
- *				 If len is zero:
+ * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
+ *				Data record
+ *				 If type_len is zero:
  *				  array[0] holds the actual length
  *				  array[1..(length+3)/4] holds data
- *				  size = 4 + 4 + length (bytes)
+ *				  size = 4 + length (bytes)
  *				 else
- *				  length = len << 2
+ *				  length = type_len << 2
  *				  array[0..(length+3)/4-1] holds data
  *				  size = 4 + length (bytes)
  */
 enum ring_buffer_type {
+	RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
 	RINGBUF_TYPE_PADDING,
 	RINGBUF_TYPE_TIME_EXTEND,
 	/* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
 	RINGBUF_TYPE_TIME_STAMP,
-	RINGBUF_TYPE_DATA,
 };
 
 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61dbdf2..9692f10 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 {
 	int ret;
 
-	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
-	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "# compressed entry header\n");
+	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
 	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
 	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
 	ret = trace_seq_printf(s, "\n");
@@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 			       RINGBUF_TYPE_PADDING);
 	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
 			       RINGBUF_TYPE_TIME_EXTEND);
-	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
-			       RINGBUF_TYPE_DATA);
+	ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
+			       RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 
 	return ret;
 }
@@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
-#define RB_MAX_SMALL_DATA	28
+#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 
 enum {
 	RB_LEN_TIME_EXTEND = 8,
@@ -213,17 +216,18 @@ enum {
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+	return event->type_len == RINGBUF_TYPE_PADDING
+			&& event->time_delta == 0;
 }
 
 static inline int rb_discarded_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+	return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
 }
 
 static void rb_event_set_padding(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	event->time_delta = 0;
 }
 
@@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
-	if (event->len)
-		length = event->len * RB_ALIGNMENT;
+	if (event->type_len)
+		length = event->type_len * RB_ALIGNMENT;
 	else
 		length = event->array[0];
 	return length + RB_EVNT_HDR_SIZE;
@@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			/* undefined */
 			return -1;
-		return rb_event_data_length(event);
+		return  event->array[0] + RB_EVNT_HDR_SIZE;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		return RB_LEN_TIME_EXTEND;
@@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	unsigned length = rb_event_length(event);
-	if (event->type != RINGBUF_TYPE_DATA)
+	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
 	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
-	BUG_ON(event->type != RINGBUF_TYPE_DATA);
+	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
-	if (event->len)
+	if (event->type_len)
 		return (void *)&event->array[0];
 	/* Otherwise length is in array[0] and array[1] has the data */
 	return (void *)&event->array[1];
@@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
 		cpu_buffer->entries--;
@@ -1133,28 +1137,21 @@ static void
 rb_update_event(struct ring_buffer_event *event,
 			 unsigned type, unsigned length)
 {
-	event->type = type;
+	event->type_len = type;
 
 	switch (type) {
 
 	case RINGBUF_TYPE_PADDING:
-		break;
-
 	case RINGBUF_TYPE_TIME_EXTEND:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-		break;
-
 	case RINGBUF_TYPE_TIME_STAMP:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
 		break;
 
-	case RINGBUF_TYPE_DATA:
+	case 0:
 		length -= RB_EVNT_HDR_SIZE;
-		if (length > RB_MAX_SMALL_DATA) {
-			event->len = 0;
+		if (length > RB_MAX_SMALL_DATA)
 			event->array[0] = length;
-		} else
-			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+		else
+			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 		break;
 	default:
 		BUG();
@@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	event = rb_reserve_next_event(cpu_buffer, 0, length);
 	if (!event)
 		goto out;
 
@@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	/* array[0] holds the actual length for the discarded event */
+	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	if (!event->time_delta)
 		event->time_delta = 1;
@@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer,
-				      RINGBUF_TYPE_DATA, event_length);
+	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
 	if (!event)
 		goto out;
 
@@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	event = rb_reader_event(cpu_buffer);
 
-	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
+	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+			|| rb_discarded_event(event))
 		cpu_buffer->entries--;
 
 	rb_update_read_stamp(cpu_buffer, event);
@@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	event = rb_reader_event(cpu_buffer);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			RB_WARN_ON(cpu_buffer, 1);
@@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	event = rb_iter_head_event(iter);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event)) {
 			rb_inc_iter(iter);
@@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	event = rb_iter_peek(iter, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
 	preempt_enable();
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->entries--;
 	}
-- 
cgit v0.10.2


From 3e98f9f15e916c48dfc5231d7e6a59be7f122764 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 24 Apr 2009 15:39:39 +0900
Subject: sh: pci: Fix up the build for CONFIG_PCI=n.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index f9101215..5b2e0fc 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -88,6 +88,7 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 #endif
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -95,6 +96,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 #ifdef CONFIG_SUPERH32
 /*
-- 
cgit v0.10.2


From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:43:09 +0200
Subject: x86, ds: fix buffer alignment in debug store selftest

The debug store selftest code uses a stack-allocated buffer, which is
not necessarily correctly aligned.

For tests using a buffer to hold a single entry, the buffer that is
passed to ds_request must already be suitably aligned.

Pass a suitably aligned portion of the bigger buffer.

[ Impact: fix hw-branch-tracer self-test failure ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: markus.t.metzger@gmail.com
LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 5f104a0..6bc7c19 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer)
 int ds_selftest_bts(void)
 {
 	struct ds_selftest_bts_conf conf;
-	unsigned char buffer[BUFFER_SIZE];
+	unsigned char buffer[BUFFER_SIZE], *small_buffer;
 	unsigned long irq;
 	int cpu;
 
 	printk(KERN_INFO "[ds] bts selftest...");
 	conf.error = 0;
 
+	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		conf.suspend = ds_suspend_bts_wrap;
@@ -381,7 +383,7 @@ int ds_selftest_bts(void)
 	conf.suspend = ds_suspend_bts_noirq;
 	conf.resume = ds_resume_bts_noirq;
 	conf.tracer =
-		ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE,
+		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
 				   NULL, (size_t)-1, BTS_KERNEL);
 	local_irq_save(irq);
 	ds_selftest_bts_cpu(&conf);
-- 
cgit v0.10.2


From 7e0bfad24d85de7cf2202a7b0ce51de11a077b21 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:44:48 +0200
Subject: x86, bts: reenable ptrace branch trace support

The races found by Oleg Nesterov have been fixed.

Reenable branch trace support.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <20090424094448.A30216@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8130334..924e156 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,7 +506,6 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
-- 
cgit v0.10.2


From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:51:43 +0200
Subject: x86, bts, mm: clean up buffer allocation

The current mm interface is asymetric. One function allocates a locked
buffer, another function only refunds the memory.

Change this to have two functions for accounting and refunding locked
memory, respectively; and do the actual buffer allocation in ptrace.

[ Impact: refactor BTS buffer allocation code ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d5252ae..09ecbde 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -617,17 +617,28 @@ struct bts_context {
 	struct work_struct	work;
 };
 
-static inline void alloc_bts_buffer(struct bts_context *context,
-				    unsigned int size)
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
 {
-	void *buffer;
+	void *buffer = NULL;
+	int err = -ENOMEM;
 
-	buffer = alloc_locked_buffer(size);
-	if (buffer) {
-		context->buffer = buffer;
-		context->size = size;
-		context->mm = get_task_mm(current);
-	}
+	err = account_locked_memory(current->mm, current->signal->rlim, size);
+	if (err < 0)
+		return err;
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		goto out_refund;
+
+	context->buffer = buffer;
+	context->size = size;
+	context->mm = get_task_mm(current);
+
+	return 0;
+
+ out_refund:
+	refund_locked_memory(current->mm, size);
+	return err;
 }
 
 static inline void free_bts_buffer(struct bts_context *context)
@@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context)
 	kfree(context->buffer);
 	context->buffer = NULL;
 
-	refund_locked_buffer_memory(context->mm, context->size);
+	refund_locked_memory(context->mm, context->size);
 	context->size = 0;
 
 	mmput(context->mm);
@@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child,
 	context->tracer = NULL;
 
 	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+		int err;
+
 		free_bts_buffer(context);
 		if (!cfg.size)
 			return 0;
 
-		alloc_bts_buffer(context, cfg.size);
-		if (!context->buffer)
-			return -ENOMEM;
+		err = alloc_bts_buffer(context, cfg.size);
+		if (err < 0)
+			return err;
 	}
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3963ba..009eabd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,6 +19,7 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern void *alloc_locked_buffer(size_t size);
-extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
+extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+				 size_t size);
+extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 28be15e..ac13043 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	free_uid(user);
 }
 
-void *alloc_locked_buffer(size_t size)
+int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+			  size_t size)
 {
-	unsigned long rlim, vm, pgsz;
-	void *buffer = NULL;
+	unsigned long lim, vm, pgsz;
+	int error = -ENOMEM;
 
 	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
-
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->total_vm + pgsz;
-	if (rlim < vm)
-		goto out;
+	down_write(&mm->mmap_sem);
 
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->locked_vm + pgsz;
-	if (rlim < vm)
+	lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+	vm   = mm->total_vm + pgsz;
+	if (lim < vm)
 		goto out;
 
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
+	lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+	vm   = mm->locked_vm + pgsz;
+	if (lim < vm)
 		goto out;
 
-	current->mm->total_vm  += pgsz;
-	current->mm->locked_vm += pgsz;
+	mm->total_vm  += pgsz;
+	mm->locked_vm += pgsz;
 
+	error = 0;
  out:
-	up_write(&current->mm->mmap_sem);
-	return buffer;
+	up_write(&mm->mmap_sem);
+	return error;
 }
 
-void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
+void refund_locked_memory(struct mm_struct *mm, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-- 
cgit v0.10.2


From 39517091f88fae32b52254b561ced78da1eaf0a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:05:52 -0400
Subject: tracing/lockdep: convert lockdep to use TRACE_EVENT macro

The TRACE_FORMAT will soon be deprecated. This patch converts it to
the TRACE_EVENT macro.

Note, this change should also speed up the tracing.

[ Impact: remove a user of deprecated TRACE_FORMAT ]

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
index 3ca315c..0e956c9 100644
--- a/include/trace/events/lockdep.h
+++ b/include/trace/events/lockdep.h
@@ -9,28 +9,64 @@
 
 #ifdef CONFIG_LOCKDEP
 
-TRACE_FORMAT(lock_acquire,
+TRACE_EVENT(lock_acquire,
+
 	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
 		int trylock, int read, int check,
 		struct lockdep_map *next_lock, unsigned long ip),
+
 	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
 
-TRACE_FORMAT(lock_release,
+	TP_STRUCT__entry(
+		__field(unsigned int, flags)
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+		  (__entry->flags & 2) ? "read " : "",
+		  __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
 	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
 	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
 
 #ifdef CONFIG_LOCK_STAT
 
-TRACE_FORMAT(lock_contended,
+TRACE_EVENT(lock_contended,
+
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
 	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
 
 TRACE_EVENT(lock_acquired,
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-- 
cgit v0.10.2


From 160031b556e93590fa8635210d73d93c3d3853a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:26:55 -0400
Subject: tracing/irq: convert irq traces to use TRACE_EVENT macro

The TRACE_FORMAT will soon be deprecated. This patch converts it to
the TRACE_EVENT macro.

Note, this change should also speed up the tracing.

[ Impact: remove a user of deprecated TRACE_FORMAT ]

Cc: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 75e3468..7686864 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -10,11 +10,24 @@
 /*
  * Tracepoint for entry of interrupt handler:
  */
-TRACE_FORMAT(irq_handler_entry,
+TRACE_EVENT(irq_handler_entry,
+
 	TP_PROTO(int irq, struct irqaction *action),
+
 	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
+
+	TP_STRUCT__entry(
+		__field(	int,	irq		)
+		__string(	name,	action->name	)
+	),
+
+	TP_fast_assign(
+		__entry->irq = irq;
+		__assign_str(name, action->name);
+	),
+
+	TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+);
 
 /*
  * Tracepoint for return of an interrupt handler:
@@ -39,17 +52,43 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
-TRACE_FORMAT(softirq_entry,
+TRACE_EVENT(softirq_entry,
+
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
 	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
 
-TRACE_FORMAT(softirq_exit,
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+		__string(	name,	softirq_to_name[h-vec]	)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+		__assign_str(name, softirq_to_name[h-vec]);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+);
+
+TRACE_EVENT(softirq_exit,
+
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
 	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
+
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+		__string(	name,	softirq_to_name[h-vec]	)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+		__assign_str(name, softirq_to_name[h-vec]);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+);
 
 #endif /*  _TRACE_IRQ_H */
 
-- 
cgit v0.10.2


From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:50:39 -0400
Subject: tracing: remove deprecated TRACE_FORMAT

The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro.
There are no more users. All new users must use the TRACE_EVENT macro.

[ Impact: remove old functionality ]

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 4353f3f..14df7e6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define PARAMS(args...) args
 
-#ifndef TRACE_FORMAT
-#define TRACE_FORMAT(name, proto, args, fmt)		\
-	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-#endif
-
 #ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index abc611f..f7a7ae1 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,10 +26,6 @@
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
 	DEFINE_TRACE(name)
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(name, proto, args, print)	\
-	DEFINE_TRACE(name)
-
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name)
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a77f71a..1e68114 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,9 +18,6 @@
 
 #include <linux/ftrace_event.h>
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
@@ -62,9 +59,6 @@
  *
  */
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
 #undef __array
 #define __array(type, item, len)
 
@@ -298,16 +292,6 @@ ftrace_define_fields_##call(void)					\
  *	unregister_trace_<call>(ftrace_event_<call>);
  * }
  *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- * }
- *
  *
  * For those macros defined with TRACE_EVENT:
  *
@@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
 #define _TRACE_PROFILE_INIT(call)
 #endif
 
-#define _TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, #call ": " fmt);			\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static int ftrace_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(NULL);				\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	return 0;							\
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_init_event_##call,		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
 #undef __entry
 #define __entry entry
 
-- 
cgit v0.10.2


From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 12:20:52 -0400
Subject: tracing/events: reuse trace event ids after overflow

With modules being able to add trace events, and the max trace event
counter is 16 bits (65536) we can overflow the counter easily
with a simple while loop adding and removing modules that contain
trace events.

This patch links together the registered trace events and on overflow
searches for available trace event ids. It will still fail if
over 65536 events are registered, but considering that a typical
kernel only has 22000 functions, 65000 events should be sufficient.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 07e0a6d..78a9ba2 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
 					      int flags);
 struct trace_event {
 	struct hlist_node	node;
+	struct list_head	list;
 	int			type;
 	trace_print_func	trace;
 	trace_print_func	raw;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 06997e7..5fc51f0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type)
 	return NULL;
 }
 
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+	struct trace_event *e;
+	int last = __TRACE_LAST_TYPE;
+
+	if (list_empty(&ftrace_event_list)) {
+		*list = &ftrace_event_list;
+		return last + 1;
+	}
+
+	/*
+	 * We used up all possible max events,
+	 * lets see if somebody freed one.
+	 */
+	list_for_each_entry(e, &ftrace_event_list, list) {
+		if (e->type != last + 1)
+			break;
+		last++;
+	}
+
+	/* Did we used up all 65 thousand events??? */
+	if ((last + 1) > FTRACE_MAX_EVENT)
+		return 0;
+
+	*list = &e->list;
+	return last + 1;
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event)
 
 	mutex_lock(&trace_event_mutex);
 
-	if (!event) {
-		ret = next_event_type++;
+	if (WARN_ON(!event))
 		goto out;
-	}
 
-	if (!event->type)
-		event->type = next_event_type++;
-	else if (event->type > __TRACE_LAST_TYPE) {
+	INIT_LIST_HEAD(&event->list);
+
+	if (!event->type) {
+		struct list_head *list;
+
+		if (next_event_type > FTRACE_MAX_EVENT) {
+
+			event->type = trace_search_list(&list);
+			if (!event->type)
+				goto out;
+
+		} else {
+			
+			event->type = next_event_type++;
+			list = &ftrace_event_list;
+		}
+
+		if (WARN_ON(ftrace_find_event(event->type)))
+			goto out;
+
+		list_add_tail(&event->list, list);
+
+	} else if (event->type > __TRACE_LAST_TYPE) {
 		printk(KERN_WARNING "Need to add type to trace.h\n");
 		WARN_ON(1);
-	}
-
-	if (ftrace_find_event(event->type))
 		goto out;
+	} else {
+		/* Is this event already used */
+		if (ftrace_find_event(event->type))
+			goto out;
+	}
 
 	if (event->trace == NULL)
 		event->trace = trace_nop_print;
@@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
-	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
@@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event)
 {
 	mutex_lock(&trace_event_mutex);
 	hlist_del(&event->node);
+	list_del(&event->list);
 	mutex_unlock(&trace_event_mutex);
 
 	return 0;
-- 
cgit v0.10.2


From 79ffab34391933ee3b95dac7f25c0478fa2f8f1e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 May 2009 15:13:42 -0400
Subject: ext4: Properly initialize the buffer_head state

These struct buffer_heads are allocated on the stack (and hence are
initialized with stack garbage).  They are only used to call a
get_blocks() function, so that's mostly OK, but b_state must be
initialized to be 0 so we don't have any unexpected BH_* flags set by
accident, such as BH_Unwritten or BH_Delay.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb..a953214 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3150,6 +3150,7 @@ retry:
 			ret = PTR_ERR(handle);
 			break;
 		}
+		map_bh.b_state = 0;
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
 					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd5..d7ad0bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2055,7 +2055,20 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if ((mpd->b_state  & (1 << BH_Mapped)) &&
 	    !(mpd->b_state & (1 << BH_Delay)))
 		return 0;
-	new.b_state = mpd->b_state;
+	/*
+	 * We need to make sure the BH_Delay flag is passed down to
+	 * ext4_da_get_block_write(), since it calls
+	 * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag.
+	 * This flag causes ext4_get_blocks_wrap() to call
+	 * ext4_da_update_reserve_space() if the passed buffer head
+	 * has the BH_Delay flag set.  In the future, once we clean up
+	 * the interfaces to ext4_get_blocks_wrap(), we should pass in
+	 * a separate flag which requests that the delayed allocation
+	 * statistics should be updated, instead of depending on the
+	 * state information getting passed down via the map_bh's
+	 * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag.
+	 */
+	new.b_state = mpd->b_state & (1 << BH_Delay);
 	new.b_blocknr = 0;
 	new.b_size = mpd->b_size;
 	next = mpd->b_blocknr;
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60..42381bd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
 			&map_bh, &first_logical_block, get_block);
 	if (bio)
-- 
cgit v0.10.2


From 8fb0e342481c4d80040670fec915f0b9c7c6499a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 12 May 2009 16:22:37 -0400
Subject: vfs: Add BUG_ON for delayed and unwritten flags in submit_bh()

The BH_Delay and BH_Unwritten flags should never leak out to
submit_bh().  So add some BUG_ON() checks to submit_bh so we can get a
stack trace and determine how and why this might have happened.

(Note that only XFS and ext4 use these buffer head flags, and XFS does
not use submit_bh().  So this patch should only modify behavior for
ext4.)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org

diff --git a/fs/buffer.c b/fs/buffer.c
index aed2977..ad01129 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
+	BUG_ON(buffer_delay(bh));
+	BUG_ON(buffer_unwritten(bh));
 
 	/*
 	 * Mask in barrier bit for a write (could be either a WRITE or a
-- 
cgit v0.10.2


From 29fa89d088941d79765d60f22d5ccdd6b8696e11 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 12 May 2009 16:30:27 -0400
Subject: ext4: Mark the unwritten buffer_head as mapped during write_begin

Setting BH_Unwritten buffer_heads as BH_Mapped avoids multiple
(unnecessary) calls to get_block() during the call to the write(2)
system call.  Setting BH_Unwritten buffer heads as BH_Mapped requires
that the writepages() functions can handle BH_Unwritten buffer_heads.

After this commit, things work as follows:

ext4_ext_get_block() returns unmapped, unwritten, buffer head when
called with create = 0 for prealloc space. This makes sure we handle
the read path and non-delayed allocation case correctly.  Even though
the buffer head is marked unmapped we have valid b_blocknr and b_bdev
values in the buffer_head.

ext4_da_get_block_prep() called for block resrevation will now return
mapped, unwritten, new buffer_head for prealloc space. This avoids
multiple calls to get_block() for write to same offset. By making such
buffers as BH_New, we also assure that sub-block zeroing of buffered
writes happens correctly.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a953214..ea5c476 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2872,6 +2872,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
 				goto out;
 			if (!create) {
+				if (allocated > max_blocks)
+					allocated = max_blocks;
 				/*
 				 * We have blocks reserved already.  We
 				 * return allocated blocks so that delalloc
@@ -2879,8 +2881,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				 * the buffer head will be unmapped so that
 				 * a read from the block returns 0s.
 				 */
-				if (allocated > max_blocks)
-					allocated = max_blocks;
 				set_buffer_unwritten(bh_result);
 				bh_result->b_bdev = inode->i_sb->s_bdev;
 				bh_result->b_blocknr = newblock;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d7ad0bb..96f3366 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1852,7 +1852,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
  * @logical - first logical block to start assignment with
  *
  * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay
+ * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				 struct buffer_head *exbh)
@@ -1902,16 +1902,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 			do {
 				if (cur_logical >= logical + blocks)
 					break;
-				if (buffer_delay(bh)) {
-					bh->b_blocknr = pblock;
-					clear_buffer_delay(bh);
-					bh->b_bdev = inode->i_sb->s_bdev;
-				} else if (buffer_unwritten(bh)) {
-					bh->b_blocknr = pblock;
-					clear_buffer_unwritten(bh);
-					set_buffer_mapped(bh);
-					set_buffer_new(bh);
-					bh->b_bdev = inode->i_sb->s_bdev;
+
+				if (buffer_delay(bh) ||
+						buffer_unwritten(bh)) {
+
+					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
+
+					if (buffer_delay(bh)) {
+						clear_buffer_delay(bh);
+						bh->b_blocknr = pblock;
+					} else {
+						/*
+						 * unwritten already should have
+						 * blocknr assigned. Verify that
+						 */
+						clear_buffer_unwritten(bh);
+						BUG_ON(bh->b_blocknr != pblock);
+					}
+
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
@@ -2053,7 +2061,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if ((mpd->b_state  & (1 << BH_Mapped)) &&
-	    !(mpd->b_state & (1 << BH_Delay)))
+		!(mpd->b_state & (1 << BH_Delay)) &&
+		!(mpd->b_state & (1 << BH_Unwritten)))
 		return 0;
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
@@ -2205,6 +2214,17 @@ flush_it:
 	return;
 }
 
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * unmapped buffer is possible for holes.
+	 * delay buffer is possible with delayed allocation.
+	 * We also need to consider unwritten buffer as unmapped.
+	 */
+	return (!buffer_mapped(bh) || buffer_delay(bh) ||
+				buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
 /*
  * __mpage_da_writepage - finds extent of pages and blocks
  *
@@ -2289,8 +2309,7 @@ static int __mpage_da_writepage(struct page *page,
 			 * Otherwise we won't make progress
 			 * with the page in ext4_da_writepage
 			 */
-			if (buffer_dirty(bh) &&
-			    (!buffer_mapped(bh) || buffer_delay(bh))) {
+			if (ext4_bh_unmapped_or_delay(NULL, bh)) {
 				mpage_add_bh_to_extent(mpd, logical,
 						       bh->b_size,
 						       bh->b_state);
@@ -2318,6 +2337,14 @@ static int __mpage_da_writepage(struct page *page,
 /*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ *
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 				  struct buffer_head *bh_result, int create)
@@ -2353,28 +2380,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		set_buffer_delay(bh_result);
 	} else if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
-		/*
-		 * With sub-block writes into unwritten extents
-		 * we also need to mark the buffer as new so that
-		 * the unwritten parts of the buffer gets correctly zeroed.
-		 */
-		if (buffer_unwritten(bh_result))
+		if (buffer_unwritten(bh_result)) {
+			/* A delayed write to unwritten bh should
+			 * be marked new and mapped.  Mapped ensures
+			 * that we don't do get_block multiple times
+			 * when we write to the same offset and new
+			 * ensures that we do proper zero out for
+			 * partial write.
+			 */
 			set_buffer_new(bh_result);
+			set_buffer_mapped(bh_result);
+		}
 		ret = 0;
 	}
 
 	return ret;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	/*
-	 * unmapped buffer is possible for holes.
-	 * delay buffer is possible with delayed allocation
-	 */
-	return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
-}
-
 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -2828,7 +2850,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 	for (i = 0; i < idx; i++)
 		bh = bh->b_this_page;
 
-	if (!buffer_mapped(bh) || (buffer_delay(bh)))
+	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
 		return 0;
 	return 1;
 }
-- 
cgit v0.10.2


From c5ca7c7636fa689a9746b6032f83aa7fffec31c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 27 Apr 2009 22:48:48 -0400
Subject: ext4: Fallback to vmalloc if kmalloc can't allocate s_flex_groups
 array

For very large filesystems, the s_flex_groups array can get quite big.
For example, a filesystem that can be resized up to 16TB will have
8192 flex groups (assuming the default flex_bg size of 16), so the
array is 96k, which is *very* marginal for kmalloc().  On the other
hand, a 160GB filesystem without the resize_inode feature will only
require 960 bytes.  So we try to allocate the array first using
kmalloc(), and if that fails, we'll try to use vmalloc() instead.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e..3f4475d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -586,7 +587,10 @@ static void ext4_put_super(struct super_block *sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_flex_groups);
+	if (is_vmalloc_addr(sbi->s_flex_groups))
+		vfree(sbi->s_flex_groups);
+	else
+		kfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1620,6 +1624,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
+	size_t size;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,8 +1639,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
 			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-	sbi->s_flex_groups = kzalloc(flex_group_count *
-				     sizeof(struct flex_groups), GFP_KERNEL);
+	size = flex_group_count * sizeof(struct flex_groups);
+	sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		sbi->s_flex_groups = vmalloc(size);
+		if (sbi->s_flex_groups)
+			memset(sbi->s_flex_groups, 0, size);
+	}
 	if (sbi->s_flex_groups == NULL) {
 		printk(KERN_ERR "EXT4-fs: not enough memory for "
 				"%u flex groups\n", flex_group_count);
@@ -2842,6 +2852,12 @@ failed_mount4:
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
+	if (sbi->s_flex_groups) {
+		if (is_vmalloc_addr(sbi->s_flex_groups))
+			vfree(sbi->s_flex_groups);
+		else
+			kfree(sbi->s_flex_groups);
+	}
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
-- 
cgit v0.10.2


From f7c439504ccba0cca43271e651013ab97a221c62 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Apr 2009 23:31:59 -0400
Subject: ext4: Use is_power_of_2() for clarity

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3f4475d..3e509bc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1483,7 +1483,7 @@ set_qf_format:
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
-			if (option & (option - 1)) {
+			if (!is_power_of_2(option)) {
 				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
 				       " must be a power of 2\n");
 				return 0;
@@ -2101,8 +2101,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	if (parse_strtoul(buf, 0x40000000, &t))
 		return -EINVAL;
 
-	/* inode_readahead_blks must be a power of 2 */
-	if (t & (t-1))
+	if (!is_power_of_2(t))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
-- 
cgit v0.10.2


From e2d670523c6c4ccb0fca9f3ab1b8f066d9aa57d6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 00:33:44 -0400
Subject: ext4: Simplify ext4_commit_super()'s function signature

The ext4_commit_super() function took both a struct super_block * and
a struct ext4_super_block *, but the struct ext4_super_block can be
derived from the struct super_block.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3e509bc..ad4c9be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,8 +54,7 @@ static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-			      struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -306,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb)
 		printk(KERN_CRIT "Remounting filesystem read-only\n");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
@@ -448,7 +447,7 @@ __acquires(bitlock)
 	if (test_opt(sb, ERRORS_CONT)) {
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 0);
+		ext4_commit_super(sb, 0);
 		return;
 	}
 	ext4_unlock_group(sb, grp);
@@ -577,7 +576,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -1596,7 +1595,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	if (sbi->s_journal)
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
 				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -2655,7 +2654,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			if (test_opt(sb, ERRORS_PANIC)) {
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-				ext4_commit_super(sb, es, 1);
+				ext4_commit_super(sb, 1);
 				goto failed_mount4;
 			}
 		}
@@ -3132,15 +3131,15 @@ static int ext4_load_journal(struct super_block *sb,
 		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 
 	return 0;
 }
 
-static int ext4_commit_super(struct super_block *sb,
-			      struct ext4_super_block *es, int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
 {
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
 	int error = 0;
 
@@ -3212,7 +3211,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		sb->s_dirt = 0;
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	unlock_super(sb);
 
@@ -3253,7 +3252,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
 
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 
 		jbd2_journal_clear_err(journal);
 	}
@@ -3293,7 +3292,7 @@ static void ext4_write_super(struct super_block *sb)
 			BUG();
 		sb->s_dirt = 0;
 	} else {
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		ext4_commit_super(sb, 1);
 	}
 }
 
@@ -3312,7 +3311,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 						     target);
 		}
 	} else {
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+		ext4_commit_super(sb, wait);
 	}
 	return ret;
 }
@@ -3345,7 +3344,7 @@ static int ext4_freeze(struct super_block *sb)
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		error = ext4_commit_super(sb, 1);
 		if (error)
 			goto out;
 	}
@@ -3365,7 +3364,7 @@ static int ext4_unfreeze(struct super_block *sb)
 		lock_super(sb);
 		/* Reser the needs_recovery flag before the fs is unlocked. */
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		ext4_commit_super(sb, 1);
 		unlock_super(sb);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	}
@@ -3520,7 +3519,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		}
 	}
 	if (sbi->s_journal == NULL)
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
-- 
cgit v0.10.2


From 7234ab2a55e77784b44cf2d862136d9e41b8d98a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 30 Apr 2009 21:24:04 -0400
Subject: ext4: Fix and simplify s_dirt handling

The s_dirt flag wasn't completely handled correctly, but it didn't
really matter when journalling was enabled.  It turns out that when
ext4 runs without a journal, we don't clear s_dirt in places where we
should have, with the result that the high-level write_super()
function was writing the superblock when it wasn't necessary.

So we fix this by making ext4_commit_super() clear the s_dirt flag,
and removing many of the other places where s_dirt is manipulated.
When journalling is enabled, the s_dirt flag might be left set more
often, but s_dirt really doesn't matter when journalling is enabled.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad4c9be..7c7a08a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3128,7 +3128,6 @@ static int ext4_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
 		ext4_commit_super(sb, 1);
@@ -3168,7 +3167,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 					&EXT4_SB(sb)->s_freeblocks_counter));
 	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeinodes_counter));
-
+	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
@@ -3210,7 +3209,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext4_commit_super(sb, 1);
 	}
 	unlock_super(sb);
@@ -3271,10 +3269,8 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal) {
-		sb->s_dirt = 0;
+	if (journal)
 		ret = ext4_journal_force_commit(journal);
-	}
 
 	return ret;
 }
@@ -3282,15 +3278,13 @@ int ext4_force_commit(struct super_block *sb)
 /*
  * Ext4 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
+ * point if the journalling is enabled.
  */
 static void ext4_write_super(struct super_block *sb)
 {
 	if (EXT4_SB(sb)->s_journal) {
 		if (mutex_trylock(&sb->s_lock) != 0)
 			BUG();
-		sb->s_dirt = 0;
 	} else {
 		ext4_commit_super(sb, 1);
 	}
@@ -3302,7 +3296,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
 		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
 					      &target)) {
@@ -3324,7 +3317,6 @@ static int ext4_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT4_SB(sb)->s_journal;
-- 
cgit v0.10.2


From 9ca92389c5312a51e819c15c762f0abdc7f3129b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 12:52:25 -0400
Subject: ext4: Use separate super_operations structure for no_journal
 filesystems

By using a separate super_operations structure for filesystems that
have and don't have journals, we can simply ext4_write_super() ---
which is only needed when no journal is present --- and ext4_freeze(),
ext4_unfreeze(), and ext4_sync_fs(), which are only needed when the
journal is present.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7c7a08a..68c3a44 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -995,7 +995,6 @@ static const struct super_operations ext4_sops = {
 	.dirty_inode	= ext4_dirty_inode,
 	.delete_inode	= ext4_delete_inode,
 	.put_super	= ext4_put_super,
-	.write_super	= ext4_write_super,
 	.sync_fs	= ext4_sync_fs,
 	.freeze_fs	= ext4_freeze,
 	.unfreeze_fs	= ext4_unfreeze,
@@ -1010,6 +1009,25 @@ static const struct super_operations ext4_sops = {
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
+static const struct super_operations ext4_nojournal_sops = {
+	.alloc_inode	= ext4_alloc_inode,
+	.destroy_inode	= ext4_destroy_inode,
+	.write_inode	= ext4_write_inode,
+	.dirty_inode	= ext4_dirty_inode,
+	.delete_inode	= ext4_delete_inode,
+	.write_super	= ext4_write_super,
+	.put_super	= ext4_put_super,
+	.statfs		= ext4_statfs,
+	.remount_fs	= ext4_remount,
+	.clear_inode	= ext4_clear_inode,
+	.show_options	= ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext4_quota_read,
+	.quota_write	= ext4_quota_write,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
 static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
@@ -2615,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/*
 	 * set up enough so that it can read an inode
 	 */
-	sb->s_op = &ext4_sops;
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		sb->s_op = &ext4_sops;
+	else
+		sb->s_op = &ext4_nojournal_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
@@ -3275,19 +3297,9 @@ int ext4_force_commit(struct super_block *sb)
 	return ret;
 }
 
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point if the journalling is enabled.
- */
 static void ext4_write_super(struct super_block *sb)
 {
-	if (EXT4_SB(sb)->s_journal) {
-		if (mutex_trylock(&sb->s_lock) != 0)
-			BUG();
-	} else {
-		ext4_commit_super(sb, 1);
-	}
+	ext4_commit_super(sb, 1);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3296,15 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-	if (EXT4_SB(sb)->s_journal) {
-		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-					      &target)) {
-			if (wait)
-				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
-						     target);
-		}
-	} else {
-		ext4_commit_super(sb, wait);
+	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+		if (wait)
+			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
 	}
 	return ret;
 }
@@ -3318,32 +3324,31 @@ static int ext4_freeze(struct super_block *sb)
 	int error = 0;
 	journal_t *journal;
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
 
-		if (journal) {
-			/* Now we set up the journal barrier. */
-			jbd2_journal_lock_updates(journal);
+	journal = EXT4_SB(sb)->s_journal;
 
-			/*
-			 * We don't want to clear needs_recovery flag when we
-			 * failed to flush the journal.
-			 */
-			error = jbd2_journal_flush(journal);
-			if (error < 0)
-				goto out;
-		}
+	/* Now we set up the journal barrier. */
+	jbd2_journal_lock_updates(journal);
 
-		/* Journal blocked and flushed, clear needs_recovery flag. */
-		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		error = ext4_commit_super(sb, 1);
-		if (error)
-			goto out;
+	/*
+	 * Don't clear the needs_recovery flag if we failed to flush
+	 * the journal.
+	 */
+	error = jbd2_journal_flush(journal);
+	if (error < 0) {
+	out:
+		jbd2_journal_unlock_updates(journal);
+		return error;
 	}
+
+	/* Journal blocked and flushed, clear needs_recovery flag. */
+	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	error = ext4_commit_super(sb, 1);
+	if (error)
+		goto out;
 	return 0;
-out:
-	jbd2_journal_unlock_updates(journal);
-	return error;
 }
 
 /*
@@ -3352,14 +3357,15 @@ out:
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
-	if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
-		lock_super(sb);
-		/* Reser the needs_recovery flag before the fs is unlocked. */
-		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, 1);
-		unlock_super(sb);
-		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-	}
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	lock_super(sb);
+	/* Reset the needs_recovery flag before the fs is unlocked. */
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	ext4_commit_super(sb, 1);
+	unlock_super(sb);
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 08:50:38 -0400
Subject: ext4: Avoid races caused by on-line resizing and SMP memory
 reordering

Ext4's on-line resizing adds a new block group and then, only at the
last step adjusts s_groups_count.  However, it's possible on SMP
systems that another CPU could see the updated the s_group_count and
not see the newly initialized data structures for the just-added block
group.  For this reason, it's important to insert a SMP read barrier
after reading s_groups_count and before reading any (for example) the
new block group descriptors allowed by the increased value of
s_groups_count.

Unfortunately, we rather blatently violate this locking protocol
documented in fs/ext4/resize.c.  Fortunately, (1) on-line resizes
happen relatively rarely, and (2) it seems rare that the filesystem
code will immediately try to use just-added block group before any
memory ordering issues resolve themselves.  So apparently problems
here are relatively hard to hit, since ext3 has been vulnerable to the
same issue for years with no one apparently complaining.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad..a5ba039 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -88,6 +88,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
 	int bit, bit_max;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned free_blocks, group_blocks;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -123,7 +124,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
-	if (block_group == sbi->s_groups_count - 1) {
+	if (block_group == ngroups - 1) {
 		/*
 		 * Even though mke2fs always initialize first and last group
 		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -205,18 +206,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
 	unsigned int group_desc;
 	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (block_group >= sbi->s_groups_count) {
+	if (block_group >= ngroups) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
 			   "block_group = %u, groups_count = %u",
-			   block_group, sbi->s_groups_count);
+			   block_group, ngroups);
 
 		return NULL;
 	}
-	smp_rmb();
 
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -665,7 +666,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -677,7 +678,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	bitmap_count = 0;
 	gdp = NULL;
 
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
@@ -700,7 +700,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	return bitmap_count;
 #else
 	desc_count = 0;
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef..02ec44b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1228,6 +1228,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 	 return grp_info[indexv][indexh];
 }
 
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards.  See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+	ext4_group_t	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	smp_rmb();
+	return ngroups;
+}
 
 static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
 					     ext4_group_t block_group)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a0..55ba419 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -316,7 +316,7 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent,
 				ext4_group_t *best_group)
 {
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned int freei, avefreei;
 	struct ext4_group_desc *desc, *best_desc = NULL;
 	ext4_group_t group;
@@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 	struct flex_groups *flex_group = sbi->s_flex_groups;
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	int flex_size = ext4_flex_bg_size(sbi);
 	ext4_group_t best_flex = parent_fbg_group;
 	int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 	ext4_group_t n_fbg_groups;
 	ext4_group_t i;
 
-	n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+	n_fbg_groups = (ngroups + flex_size - 1) >>
 		sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
@@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
 	unsigned int ndirs;
 	int max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t i, grp, g;
+	ext4_group_t i, grp, g, ngroups;
 	struct ext4_group_desc *desc;
 	struct orlov_stats stats;
 	int flex_size = ext4_flex_bg_size(sbi);
 
+	ngroups = real_ngroups;
 	if (flex_size > 1) {
-		ngroups = (ngroups + flex_size - 1) >>
+		ngroups = (real_ngroups + flex_size - 1) >>
 			sbi->s_log_groups_per_flex;
 		parent_group >>= sbi->s_log_groups_per_flex;
 	}
@@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		 */
 		grp *= flex_size;
 		for (i = 0; i < flex_size; i++) {
-			if (grp+i >= sbi->s_groups_count)
+			if (grp+i >= real_ngroups)
 				break;
 			desc = ext4_get_group_desc(sb, grp+i, NULL);
 			if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	}
 
 fallback:
-	ngroups = sbi->s_groups_count;
+	ngroups = real_ngroups;
 	avefreei = freei / ngroups;
 fallback_retry:
 	parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
-	ext4_group_t i, last;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 
 	/*
@@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
 	struct buffer_head *group_desc_bh;
-	ext4_group_t group = 0;
+	ext4_group_t ngroups, group = 0;
 	unsigned long ino = 0;
 	struct inode *inode;
 	struct ext4_group_desc *gdp = NULL;
-	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
 	int ret2, err = 0;
@@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		return ERR_PTR(-EPERM);
 
 	sb = dir->i_sb;
+	ngroups = ext4_get_groups_count(sb);
 	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
 		   dir->i_ino, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	ei = EXT4_I(inode);
-
 	sbi = EXT4_SB(sb);
-	es = sbi->s_es;
 
 	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
@@ -856,7 +854,7 @@ got_group:
 	if (ret2 == -1)
 		goto out;
 
-	for (i = 0; i < sbi->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		err = -EIO;
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +915,7 @@ repeat_in_this_group:
 		 * group descriptor metadata has not yet been updated.
 		 * So we just go onto the next blockgroup.
 		 */
-		if (++group == sbi->s_groups_count)
+		if (++group == ngroups)
 			group = 0;
 	}
 	err = -ENOSPC;
@@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	unsigned long bitmap_count, x;
@@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 	return desc_count;
 #else
 	desc_count = 0;
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 unsigned long ext4_count_dirs(struct super_block * sb)
 {
 	unsigned long count = 0;
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 96f3366..4e7f363 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4965,7 +4965,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  */
 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-	int groups, gdpblocks;
+	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+	int gdpblocks;
 	int idxblocks;
 	int ret = 0;
 
@@ -4992,8 +4993,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 		groups += nrblocks;
 
 	gdpblocks = groups;
-	if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
-		groups = EXT4_SB(inode->i_sb)->s_groups_count;
+	if (groups > ngroups)
+		groups = ngroups;
 	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
 		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677..c3af9e6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
 {
+	ext4_group_t ngroups;
 	int blocksize;
 	int blocks_per_page;
 	int groups_per_page;
@@ -757,6 +758,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	inode = page->mapping->host;
 	sb = inode->i_sb;
+	ngroups = ext4_get_groups_count(sb);
 	blocksize = 1 << inode->i_blkbits;
 	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 
@@ -780,7 +782,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	for (i = 0; i < groups_per_page; i++) {
 		struct ext4_group_desc *desc;
 
-		if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+		if (first_group + i >= ngroups)
 			break;
 
 		err = -EIO;
@@ -852,7 +854,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		struct ext4_group_info *grinfo;
 
 		group = (first_block + i) >> 1;
-		if (group >= EXT4_SB(sb)->s_groups_count)
+		if (group >= ngroups)
 			break;
 
 		/*
@@ -1788,6 +1790,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
 	int block, pnum;
 	int blocks_per_page;
 	int groups_per_page;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t first_group;
 	struct ext4_group_info *grp;
 
@@ -1807,7 +1810,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
 	/* read all groups the page covers into the cache */
 	for (i = 0; i < groups_per_page; i++) {
 
-		if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+		if ((first_group + i) >= ngroups)
 			break;
 		grp = ext4_get_group_info(sb, first_group + i);
 		/* take all groups write allocation
@@ -1945,8 +1948,7 @@ err:
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-	ext4_group_t group;
-	ext4_group_t i;
+	ext4_group_t ngroups, group, i;
 	int cr;
 	int err = 0;
 	int bsbits;
@@ -1957,6 +1959,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
+	ngroups = ext4_get_groups_count(sb);
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
 
 	/* first, try the goal */
@@ -2017,11 +2020,11 @@ repeat:
 		 */
 		group = ac->ac_g_ex.fe_group;
 
-		for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+		for (i = 0; i < ngroups; group++, i++) {
 			struct ext4_group_info *grp;
 			struct ext4_group_desc *desc;
 
-			if (group == EXT4_SB(sb)->s_groups_count)
+			if (group == ngroups)
 				group = 0;
 
 			/* quick check to skip empty groups */
@@ -2315,12 +2318,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
 	struct super_block *sb = seq->private;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t group;
 
-	if (*pos < 0 || *pos >= sbi->s_groups_count)
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
 		return NULL;
-
 	group = *pos + 1;
 	return (void *) ((unsigned long) group);
 }
@@ -2328,11 +2329,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct super_block *sb = seq->private;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t group;
 
 	++*pos;
-	if (*pos < 0 || *pos >= sbi->s_groups_count)
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
 		return NULL;
 	group = *pos + 1;
 	return (void *) ((unsigned long) group);
@@ -2587,6 +2587,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
 
 static int ext4_mb_init_backend(struct super_block *sb)
 {
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2599,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	struct ext4_group_desc *desc;
 
 	/* This is the number of blocks used by GDT */
-	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
 				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
 
 	/*
@@ -2644,7 +2645,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	for (i = 0; i < num_meta_group_infos; i++) {
 		if ((i + 1) == num_meta_group_infos)
 			metalen = sizeof(*meta_group_info) *
-				(sbi->s_groups_count -
+				(ngroups -
 					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
 		meta_group_info = kmalloc(metalen, GFP_KERNEL);
 		if (meta_group_info == NULL) {
@@ -2655,7 +2656,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		sbi->s_group_info[i] = meta_group_info;
 	}
 
-	for (i = 0; i < sbi->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			printk(KERN_ERR
@@ -2781,13 +2782,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 
 int ext4_mb_release(struct super_block *sb)
 {
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (sbi->s_group_info) {
-		for (i = 0; i < sbi->s_groups_count; i++) {
+		for (i = 0; i < ngroups; i++) {
 			grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
 			kfree(grinfo->bb_bitmap);
@@ -2797,7 +2799,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_unlock_group(sb, i);
 			kfree(grinfo);
 		}
-		num_meta_group_infos = (sbi->s_groups_count +
+		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
 			EXT4_DESC_PER_BLOCK_BITS(sb);
 		for (i = 0; i < num_meta_group_infos; i++)
@@ -4121,7 +4123,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
 	struct super_block *sb = ac->ac_sb;
-	ext4_group_t i;
+	ext4_group_t ngroups, i;
 
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
 			" Allocation context details:\n");
@@ -4145,7 +4147,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 	printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
 		ac->ac_found);
 	printk(KERN_ERR "EXT4-fs: groups: \n");
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	ngroups = ext4_get_groups_count(sb);
+	for (i = 0; i < ngroups; i++) {
 		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
 		struct ext4_prealloc_space *pa;
 		ext4_grpblk_t start;
@@ -4469,13 +4472,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 
 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 {
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 	int ret;
 	int freed = 0;
 
 	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
 		   sb->s_id, needed);
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+	for (i = 0; i < ngroups && needed > 0; i++) {
 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
 		freed += ret;
 		needed -= ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 68c3a44..fcd7b24 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3557,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (test_opt(sb, MINIX_DF)) {
 		sbi->s_overhead_last = 0;
 	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-		ext4_group_t ngroups = sbi->s_groups_count, i;
+		ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 		ext4_fsblk_t overhead = 0;
-		smp_rmb();
 
 		/*
 		 * Compute the overhead (FS structures).  This is constant
-- 
cgit v0.10.2


From 114e9fc90703bd6aac0229fb559e97caa6c49770 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 15:48:07 -0400
Subject: ext4: Remove outdated comment about lock_super()

ext4_fill_super() is no longer called by read_super(), and it is no
longer called with the superblock locked.  The
unlock_super()/lock_super() is no longer present, so this comment is
entirely superfluous.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fcd7b24..e3b35f2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2828,14 +2828,6 @@ no_journal:
 		goto failed_mount4;
 	};
 
-	/*
-	 * akpm: core read_super() calls in here with the superblock locked.
-	 * That deadlocks, because orphan cleanup needs to lock the superblock
-	 * in numerous places.  Here we just pop the lock - it's relatively
-	 * harmless, because we are now ready to accept write_super() requests,
-	 * and aviro says that's the only reason for hanging onto the
-	 * superblock lock.
-	 */
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-- 
cgit v0.10.2


From a63c9eb2ce6f5028da90f282798232c4f398ceb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 01:59:42 -0400
Subject: ext4: ext4_mark_recovery_complete() doesn't need to use lock_super

The function ext4_mark_recovery_complete() is called from two call
paths: either (a) while mounting the filesystem, in which case there's
no danger of any other CPU calling write_super() until the mount is
completed, and (b) while remounting the filesystem read-write, in
which case the fs core has already locked the superblock.  This also
allows us to take out a very vile unlock_super()/lock_super() pair in
ext4_remount().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e3b35f2..45d0ada 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3219,13 +3219,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	if (jbd2_journal_flush(journal) < 0)
 		goto out;
 
-	lock_super(sb);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		ext4_commit_super(sb, 1);
 	}
-	unlock_super(sb);
 
 out:
 	jbd2_journal_unlock_updates(journal);
@@ -3436,15 +3434,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			    (sbi->s_mount_state & EXT4_VALID_FS))
 				es->s_state = cpu_to_le16(sbi->s_mount_state);
 
-			/*
-			 * We have to unlock super so that we can wait for
-			 * transactions.
-			 */
-			if (sbi->s_journal) {
-				unlock_super(sb);
+			if (sbi->s_journal)
 				ext4_mark_recovery_complete(sb, es);
-				lock_super(sb);
-			}
 		} else {
 			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-- 
cgit v0.10.2


From 3b9d4ed26680771295d904a6b83e88e620780893 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 22:54:04 -0400
Subject: ext4: Replace lock/unlock_super() with an explicit lock for the
 orphan list

Use a separate lock to protect the orphan list, so we can stop
overloading the use of lock_super().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 57b71fe..4bda2f7 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -71,6 +71,7 @@ struct ext4_sb_info {
 	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1..8018e49 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
 
@@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 
 	/* @@@ FIXME: Observation from aviro:
 	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-	 * here (on lock_super()), so race with ext4_link() which might bump
+	 * here (on s_orphan_lock), so race with ext4_link() which might bump
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
+	 *
+	 * tytso, 4/25/2009: I'm not sure how that could happen;
+	 * shouldn't the fs core protect us from these sort of
+	 * unlink()/link() races?
 	 */
 	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(inode->i_sb);
-	if (list_empty(&ei->i_orphan)) {
-		unlock_super(inode->i_sb);
-		return 0;
-	}
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
 
 	ino_next = NEXT_ORPHAN(inode);
 	prev = ei->i_orphan.prev;
@@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
 	ext4_std_error(inode->i_sb, err);
 out:
-	unlock_super(inode->i_sb);
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
 	return err;
 
 out_brelse:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 45d0ada..7f43fde 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2645,6 +2645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->dq_op = &ext4_quota_operations;
 #endif
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+	mutex_init(&sbi->s_orphan_lock);
 
 	sb->s_root = NULL;
 
-- 
cgit v0.10.2


From 32ed5058ce90024efcd811254b4b1de0468099df Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 22:53:39 -0400
Subject: ext4: Replace lock/unlock_super() with an explicit lock for resizing

Use a separate lock to protect s_groups_count and the other block
group descriptors which get changed via an on-line resize operation,
so we can stop overloading the use of lock_super().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 4bda2f7..2d36223 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -72,6 +72,7 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
+	struct mutex s_resize_lock;
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd..e8ded13 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -193,7 +193,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		err = -EBUSY;
 		goto exit_journal;
@@ -302,7 +302,7 @@ exit_bh:
 	brelse(bh);
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 
@@ -643,11 +643,12 @@ exit_free:
  * important part is that the new block and inode counts are in the backup
  * superblocks, and the location of the new group metadata in the GDT backups.
  *
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted.  We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time.  The resize which changed s_groups_count will backup again.
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
 			   int blk_off, char *data, int size)
@@ -809,7 +810,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
@@ -840,7 +841,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         /*
          * OK, now we've set up the new group.  Time to make it active.
          *
-         * Current kernels don't lock all allocations via lock_super(),
+         * We do not lock all allocations via s_resize_lock
          * so we have to be safe wrt. concurrent accesses the group
          * data.  So we need to be careful to set all of the relevant
          * group descriptor data etc. *before* we enable the group.
@@ -900,12 +901,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 *
 	 * The precise rules we use are:
 	 *
-	 * * Writers of s_groups_count *must* hold lock_super
+	 * * Writers of s_groups_count *must* hold s_resize_lock
 	 * AND
 	 * * Writers must perform a smp_wmb() after updating all dependent
 	 *   data and before modifying the groups count
 	 *
-	 * * Readers must hold lock_super() over the access
+	 * * Readers must hold s_resize_lock over the access
 	 * OR
 	 * * Readers must perform an smp_rmb() after reading the groups count
 	 *   and before reading any dependent data.
@@ -948,7 +949,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	sb->s_dirt = 1;
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 	if (!err) {
@@ -986,7 +987,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking lock_super() below. */
+	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
 	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -1056,11 +1057,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
 		goto exit_put;
@@ -1070,14 +1071,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 						 EXT4_SB(sb)->s_sbh))) {
 		ext4_warning(sb, __func__,
 			     "error %d on journal write access", err);
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
 	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f43fde..1fbf090 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2646,6 +2646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #endif
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
+	mutex_init(&sbi->s_resize_lock);
 
 	sb->s_root = NULL;
 
-- 
cgit v0.10.2


From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 23:11:22 -0400
Subject: tracing/events: make modules have their own file_operations structure

For proper module reference counting, the file_operations that modules use
must have the "owner" field set to the module. Unfortunately, the trace events
use share file_operations. The same file_operations are used by all both
kernel core and all modules.

This patch makes the modules allocate their own file_operations and
copies the functions from the core kernel. This allows those file
operations to be owned by the module.

Care is taken to free this code on module unload.

Thanks to Greg KH for reminding me that file_operations must be owned
by the module to have reference counting take place.

[ Impact: fix modular tracepoints / potential crash ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b920815..be4d3a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 }
 
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+		 const struct file_operations *id,
+		 const struct file_operations *enable,
+		 const struct file_operations *filter,
+		 const struct file_operations *format)
 {
 	struct dentry *entry;
 	int ret;
@@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 
 	if (call->regfunc)
 		entry = trace_create_file("enable", 0644, call->dir, call,
-					  &ftrace_enable_fops);
+					  enable);
 
 	if (call->id)
 		entry = trace_create_file("id", 0444, call->dir, call,
-					  &ftrace_event_id_fops);
+					  id);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 			return ret;
 		}
 		entry = trace_create_file("filter", 0644, call->dir, call,
-					  &ftrace_event_filter_fops);
+					  filter);
 	}
 
 	/* A trace may not want to export its format */
@@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return 0;
 
 	entry = trace_create_file("format", 0444, call->dir, call,
-				  &ftrace_event_format_fops);
+				  format);
 
 	return 0;
 }
@@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     event++)
 
 #ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+	struct list_head		list;
+	struct module			*mod;
+	struct file_operations		id;
+	struct file_operations		enable;
+	struct file_operations		format;
+	struct file_operations		filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+
+	/*
+	 * This is a bit of a PITA. To allow for correct reference
+	 * counting, modules must "own" their file_operations.
+	 * To do this, we allocate the file operations that will be
+	 * used in the event directory.
+	 */
+
+	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+	if (!file_ops)
+		return NULL;
+
+	file_ops->mod = mod;
+
+	file_ops->id = ftrace_event_id_fops;
+	file_ops->id.owner = mod;
+
+	file_ops->enable = ftrace_enable_fops;
+	file_ops->enable.owner = mod;
+
+	file_ops->filter = ftrace_event_filter_fops;
+	file_ops->filter.owner = mod;
+
+	file_ops->format = ftrace_event_format_fops;
+	file_ops->format.owner = mod;
+
+	list_add(&file_ops->list, &ftrace_module_file_list);
+
+	return file_ops;
+}
+
 static void trace_module_add_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops = NULL;
 	struct ftrace_event_call *call, *start, *end;
 	struct dentry *d_events;
 
@@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+
+		/*
+		 * This module has events, create file ops for this module
+		 * if not already done.
+		 */
+		if (!file_ops) {
+			file_ops = trace_create_file_ops(mod);
+			if (!file_ops)
+				return;
+		}
 		call->mod = mod;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events,
+				 &file_ops->id, &file_ops->enable,
+				 &file_ops->filter, &file_ops->format);
 	}
 }
 
 static void trace_module_remove_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod)
 			list_del(&call->list);
 		}
 	}
+
+	/* Now free the file_operations */
+	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+		if (file_ops->mod == mod)
+			break;
+	}
+	if (&file_ops->list != &ftrace_module_file_list) {
+		list_del(&file_ops->list);
+		kfree(file_ops);
+	}
 }
 
 static int trace_module_notify(struct notifier_block *self,
@@ -954,7 +1033,9 @@ static __init int event_trace_init(void)
 		if (!call->name)
 			continue;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events, &ftrace_event_id_fops,
+				 &ftrace_enable_fops, &ftrace_event_filter_fops,
+				 &ftrace_event_format_fops);
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-- 
cgit v0.10.2


From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 26 Apr 2009 23:07:42 +0200
Subject: x86: beautify vmlinux_64.lds.S

Beautify vmlinux_64.lds.S:

 - Use tabs for indent
 - Located curly braces like in C code
 - Rearranged a few comments

There is no functional changes in this patch

The beautification is done to prepare a unification
of the _32 and the _64 variants of the linker scripts.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index c874250..6d5a5b0 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 PHDRS {
-	text PT_LOAD FLAGS(5);	/* R_E */
-	data PT_LOAD FLAGS(7);	/* RWE */
-	user PT_LOAD FLAGS(7);	/* RWE */
+	text PT_LOAD FLAGS(5);		/* R_E */
+	data PT_LOAD FLAGS(7);		/* RWE */
+	user PT_LOAD FLAGS(7);		/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(7);	/* RWE */
 #endif
 	data.init2 PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);	/* ___ */
+	note PT_NOTE FLAGS(0);		/* ___ */
 }
 SECTIONS
 {
-  . = __START_KERNEL;
-  phys_startup_64 = startup_64 - LOAD_OFFSET;
-  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
-	_text = .;			/* Text and read-only data */
-	/* First the code that has to be first for bootstrapping */
-	*(.text.head)
-	_stext = .;
-	/* Then the rest */
-	TEXT_TEXT
-	SCHED_TEXT
-	LOCK_TEXT
-	KPROBES_TEXT
-	IRQENTRY_TEXT
-	*(.fixup)
-	*(.gnu.warning)
-	_etext = .;		/* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);		/* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-  	__start___ex_table = .;
-	 *(__ex_table)
-  	__stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  . = ALIGN(PAGE_SIZE);		/* Align data segment to page size boundary */
-				/* Data */
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {
-	DATA_DATA
-	CONSTRUCTORS
-	_edata = .;			/* End of data section */
-	} :data
+	. = __START_KERNEL;
+	phys_startup_64 = startup_64 - LOAD_OFFSET;
+
+	/* Text and read-only data */
+	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;
+		/* First the code that has to be first for bootstrapping */
+		*(.text.head)
+		_stext = .;
+		/* Then the rest */
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		 *(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
 
+	RODATA
 
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+	/* Align data segment to page size boundary */
 	. = ALIGN(PAGE_SIZE);
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	*(.data.cacheline_aligned)
-  }
-  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-  	*(.data.read_mostly)
-  }
+	/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
+		/* End of data section */
+		_edata = .;
+	} :data
+
+
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+		*(.data.cacheline_aligned)
+	}
+
+	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+	}
 
 #define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
 
 #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
 #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -85,37 +95,53 @@ SECTIONS
 #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
 
-  . = VSYSCALL_ADDR;
-  .vsyscall_0 :	 AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
-  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+	. = VSYSCALL_ADDR;
+	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+		*(.vsyscall_0)
+	} :user
+
+	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+		*(.vsyscall_fn)
+	}
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+		*(.vsyscall_gtod_data)
+	}
 
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
-		{ *(.vsyscall_gtod_data) }
-  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
-		{ *(.vsyscall_clock) }
-  vsyscall_clock = VVIRT(.vsyscall_clock);
+	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+		*(.vsyscall_clock)
+	}
+	vsyscall_clock = VVIRT(.vsyscall_clock);
 
 
-  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
-		{ *(.vsyscall_1) }
-  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
-		{ *(.vsyscall_2) }
+	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		*(.vsyscall_1)
+	}
+	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+		*(.vsyscall_2)
+	}
 
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
+	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+		*(.vgetcpu_mode)
+	}
+	vgetcpu_mode = VVIRT(.vgetcpu_mode);
 
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-  jiffies = VVIRT(.jiffies);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.jiffies : AT(VLOAD(.jiffies)) {
+		*(.jiffies)
+	}
+	jiffies = VVIRT(.jiffies);
 
-  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
-		{ *(.vsyscall_3) }
+	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+		*(.vsyscall_3)
+	}
 
-  . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
 
 #undef VSYSCALL_ADDR
 #undef VSYSCALL_PHYS_ADDR
@@ -125,156 +151,168 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-	. = ALIGN(THREAD_SIZE);	/* init_task */
-	*(.data.init_task)
-  }:data.init
+	/* init_task */
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		. = ALIGN(THREAD_SIZE);
+		*(.data.init_task)
+	} :data.init
 
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	*(.data.page_aligned)
-  }
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		*(.data.page_aligned)
+	}
 
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-	/* might get freed after init */
-	. = ALIGN(PAGE_SIZE);
-	__smp_alt_begin = .;
-	__smp_locks = .;
-	*(.smp_locks)
-	__smp_locks_end = .;
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		/* might get freed after init */
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_begin = .;
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_end = .;
+	}
+
+	/* Init code and data */
 	. = ALIGN(PAGE_SIZE);
-	__smp_alt_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  __init_begin = .;	/* paired with __init_end */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-	_sinittext = .;
-	INIT_TEXT
-	_einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-	__initdata_begin = .;
-	INIT_DATA
-	__initdata_end = .;
-   }
-
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-	. = ALIGN(16);
-	__setup_start = .;
-	*(.init.setup)
-	__setup_end = .;
-  }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-	__initcall_start = .;
-	INITCALLS
-	__initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-	__con_initcall_start = .;
-	*(.con_initcall.init)
-	__con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-	__x86_cpu_dev_start = .;
-	*(.x86_cpu_dev.init)
-	__x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-
-  . = ALIGN(8);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-	__parainstructions = .;
-       *(.parainstructions)
-	__parainstructions_end = .;
-  }
-
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+	__init_begin = .;	/* paired with __init_end */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		__initdata_begin = .;
+		INIT_DATA
+		__initdata_end = .;
+	}
+
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		. = ALIGN(16);
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
 	. = ALIGN(8);
-	__alt_instructions = .;
-	*(.altinstructions)
-	__alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-	*(.altinstr_replacement)
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-	EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-	EXIT_DATA
-  }
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		. = ALIGN(8);
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
 
 #ifdef CONFIG_BLK_DEV_INITRD
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-	__initramfs_start = .;
-	*(.init.ramfs)
-	__initramfs_end = .;
-  }
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
 
 #ifdef CONFIG_SMP
-  /*
-   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-   * output PHDR, so the next output section - __data_nosave - should
-   * start another section data.init2.  Also, pda should be at the head of
-   * percpu area.  Preallocate it and define the percpu offset symbol
-   * so that it can be accessed as a percpu variable.
-   */
-  . = ALIGN(PAGE_SIZE);
-  PERCPU_VADDR(0, :percpu)
+	/*
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - __data_nosave - should
+	 * start another section data.init2.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it and define the percpu offset symbol
+	 * so that it can be accessed as a percpu variable.
+	 */
+	. = ALIGN(PAGE_SIZE);
+	PERCPU_VADDR(0, :percpu)
 #else
-  PERCPU(PAGE_SIZE)
+	PERCPU(PAGE_SIZE)
 #endif
 
-  . = ALIGN(PAGE_SIZE);
-  __init_end = .;
-
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	__nosave_begin = .;
-	*(.data.nosave)
-	. = ALIGN(PAGE_SIZE);
-	__nosave_end = .;
-  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 	. = ALIGN(PAGE_SIZE);
-	__bss_start = .;		/* BSS */
-	*(.bss.page_aligned)
-	*(.bss)
-	__bss_stop = .;
-  }
+	__init_end = .;
+
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	} :data.init2
+	/* use another section data.init2, see PERCPU_VADDR() above */
+
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__bss_start = .;		/* BSS */
+		*(.bss.page_aligned)
+		*(.bss)
+		__bss_stop = .;
+	}
 
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	__brk_base = . ;
- 	. += 64 * 1024 ;	/* 64k alignment slop space */
-	*(.brk_reservation)	/* areas brk users have reserved */
-	__brk_limit = . ;
-  }
-
-  _end = . ;
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	*(.exitcall.exit)
-	*(.eh_frame)
-	*(.discard)
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
 	}
 
-  STABS_DEBUG
+	_end = . ;
 
-  DWARF_DEBUG
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.eh_frame)
+		*(.discard)
+	}
+
+	STABS_DEBUG
+	DWARF_DEBUG
 }
 
- /*
-  * Per-cpu symbols which need to be offset from __per_cpu_load
-  * for the boot processor.
-  */
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
 #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
 INIT_PER_CPU(gdt_page);
 INIT_PER_CPU(irq_stack_union);
-- 
cgit v0.10.2


From 51b26ada79b605ed709ddcedbb6012e8f8e0ebed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 26 Apr 2009 10:12:47 -0700
Subject: x86: unify arch/x86/boot/compressed/vmlinux_*.lds

Look at the:

	diff -u arch/x86/boot/compressed/vmlinux_*.lds

output and realize that they're basially exactly the same except for
trivial naming differences, and the fact that the 64-bit version has a
"pgtable" thing.

So unify them.

There's some trivial cleanup there (make the output format a Kconfig thing
rather than doing #ifdef's for it, and unify both 32-bit and 64-bit BSS
end to "_ebss", where 32-bit used to use the traditional "_end"), but
other than that it's really very mindless and straigt conversion.

For example, I think we should aim to remove "startup_32" vs "startup_64",
and just call it "startup", and get rid of one more difference. I didn't
do that.

Also, notice the comment in the unified vmlinux.lds.S talks about
"head_64" and "startup_32" which is an odd and incorrect mix, but that was
actually what the old 64-bit only lds file had, so the confusion isn't
new, and now that mixing is arguably more accurate thanks to the
vmlinux.lds.S file being shared between the two cases ;)

[ Impact: cleanup, unification ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f..039c3f0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 
+config OUTPUT_FORMAT
+	string
+	default "elf32-i386" if X86_32
+	default "elf64-x86-64" if X86_64
+
 config ARCH_DEFCONFIG
 	string
 	default "arch/x86/configs/i386_defconfig" if X86_32
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9..0f4b5e2 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,7 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866..85bd328 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -88,9 +88,9 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl %esi
-	leal _end(%ebp), %esi
-	leal _end(%ebx), %edi
-	movl $(_end - startup_32), %ecx
+	leal _ebss(%ebp), %esi
+	leal _ebss(%ebx), %edi
+	movl $(_ebss - startup_32), %ecx
 	std
 	rep
 	movsb
@@ -121,7 +121,7 @@ relocated:
  */
 	xorl %eax,%eax
 	leal _edata(%ebx),%edi
-	leal _end(%ebx), %ecx
+	leal _ebss(%ebx), %ecx
 	subl %edi,%ecx
 	cld
 	rep
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
new file mode 100644
index 0000000..ffcb1913
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -0,0 +1,57 @@
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_64
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
+SECTIONS
+{
+	/* Be careful parts of head_64.S assume startup_32 is at
+	 * address 0.
+	 */
+	. = 0;
+	.text.head : {
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+	}
+	.rodata.compressed : {
+		*(.rodata.compressed)
+	}
+	.text :	{
+		_text = .; 	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	 /* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data :	{
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+#ifdef CONFIG_X86_64
+		. = ALIGN(8);
+		_end_before_pgt = . ;
+		. = ALIGN(4096);
+		pgtable = . ;
+		. = . + 4096 * 6;
+#endif
+		_ebss = .;
+	}
+}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c483..0000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
-	/* Be careful parts of head_32.S assume startup_32 is at
-	 * address 0.
-	 */
-	. = 0;
-	.text.head : {
-		_head = . ;
-		*(.text.head)
-		_ehead = . ;
-	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
-	}
-	.text :	{
-		_text = .; 	/* Text */
-		*(.text)
-		*(.text.*)
-		_etext = . ;
-	}
-	.rodata : {
-		_rodata = . ;
-		*(.rodata)	 /* read-only data */
-		*(.rodata.*)
-		_erodata = . ;
-	}
-	.data :	{
-		_data = . ;
-		*(.data)
-		*(.data.*)
-		_edata = . ;
-	}
-	.bss : {
-		_bss = . ;
-		*(.bss)
-		*(.bss.*)
-		*(COMMON)
-		_end = . ;
-	}
-}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
deleted file mode 100644
index bef1ac8..0000000
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ /dev/null
@@ -1,48 +0,0 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(startup_64)
-SECTIONS
-{
-	/* Be careful parts of head_64.S assume startup_32 is at
-	 * address 0.
-	 */
-	. = 0;
-	.text.head : {
-		_head = . ;
-		*(.text.head)
-		_ehead = . ;
-	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
-	}
-	.text :	{
-		_text = .; 	/* Text */
-		*(.text)
-		*(.text.*)
-		_etext = . ;
-	}
-	.rodata : {
-		_rodata = . ;
-		*(.rodata)	 /* read-only data */
-		*(.rodata.*)
-		_erodata = . ;
-	}
-	.data :	{
-		_data = . ;
-		*(.data)
-		*(.data.*)
-		_edata = . ;
-	}
-	.bss : {
-		_bss = . ;
-		*(.bss)
-		*(.bss.*)
-		*(COMMON)
-		. = ALIGN(8);
-		_end_before_pgt = . ;
-		. = ALIGN(4096);
-		pgtable = . ;
-		. = . + 4096 * 6;
-		_ebss = .;
-	}
-}
-- 
cgit v0.10.2


From fc4967b8c6a1540ebce9ac48e44b8d44e7fac971 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 14:06:26 +0900
Subject: sh: update defconfigs for PCI changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig
index c8d982a..022f70e 100644
--- a/arch/sh/configs/ap325rxa_defconfig
+++ b/arch/sh/configs/ap325rxa_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:46:53 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:42:06 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -74,6 +75,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +94,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 CONFIG_CPU_SUBTYPE_SH7723=y
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7723=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -573,6 +576,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -695,6 +699,7 @@ CONFIG_DEVKMEM=y
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1030,6 +1035,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -1052,6 +1058,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1100,6 +1111,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1191,10 +1203,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1303,6 +1329,7 @@ CONFIG_CRYPTO_CBC=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/cayman_defconfig b/arch/sh/configs/cayman_defconfig
index fa5fc1e..40301f8 100644
--- a/arch/sh/configs/cayman_defconfig
+++ b/arch/sh/configs/cayman_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:49:14 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:42:53 2009
 #
 CONFIG_SUPERH=y
 # CONFIG_SUPERH32 is not set
@@ -40,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SWAP=y
 # CONFIG_SYSVIPC is not set
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -70,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -89,10 +91,13 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +110,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -127,39 +131,6 @@ CONFIG_DEFAULT_IOSCHED="cfq"
 # System type
 #
 CONFIG_CPU_SH5=y
-# CONFIG_CPU_SUBTYPE_SH7619 is not set
-# CONFIG_CPU_SUBTYPE_SH7201 is not set
-# CONFIG_CPU_SUBTYPE_SH7203 is not set
-# CONFIG_CPU_SUBTYPE_SH7206 is not set
-# CONFIG_CPU_SUBTYPE_SH7263 is not set
-# CONFIG_CPU_SUBTYPE_MXG is not set
-# CONFIG_CPU_SUBTYPE_SH7705 is not set
-# CONFIG_CPU_SUBTYPE_SH7706 is not set
-# CONFIG_CPU_SUBTYPE_SH7707 is not set
-# CONFIG_CPU_SUBTYPE_SH7708 is not set
-# CONFIG_CPU_SUBTYPE_SH7709 is not set
-# CONFIG_CPU_SUBTYPE_SH7710 is not set
-# CONFIG_CPU_SUBTYPE_SH7712 is not set
-# CONFIG_CPU_SUBTYPE_SH7720 is not set
-# CONFIG_CPU_SUBTYPE_SH7721 is not set
-# CONFIG_CPU_SUBTYPE_SH7750 is not set
-# CONFIG_CPU_SUBTYPE_SH7091 is not set
-# CONFIG_CPU_SUBTYPE_SH7750R is not set
-# CONFIG_CPU_SUBTYPE_SH7750S is not set
-# CONFIG_CPU_SUBTYPE_SH7751 is not set
-# CONFIG_CPU_SUBTYPE_SH7751R is not set
-# CONFIG_CPU_SUBTYPE_SH7760 is not set
-# CONFIG_CPU_SUBTYPE_SH4_202 is not set
-# CONFIG_CPU_SUBTYPE_SH7723 is not set
-# CONFIG_CPU_SUBTYPE_SH7763 is not set
-# CONFIG_CPU_SUBTYPE_SH7770 is not set
-# CONFIG_CPU_SUBTYPE_SH7780 is not set
-# CONFIG_CPU_SUBTYPE_SH7785 is not set
-# CONFIG_CPU_SUBTYPE_SH7786 is not set
-# CONFIG_CPU_SUBTYPE_SHX3 is not set
-# CONFIG_CPU_SUBTYPE_SH7343 is not set
-# CONFIG_CPU_SUBTYPE_SH7722 is not set
-# CONFIG_CPU_SUBTYPE_SH7366 is not set
 CONFIG_CPU_SUBTYPE_SH5_101=y
 # CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
@@ -279,8 +250,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -492,6 +461,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -568,6 +538,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -591,6 +562,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -779,6 +751,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -1042,7 +1015,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1082,6 +1054,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1105,6 +1078,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1146,8 +1124,13 @@ CONFIG_MINIX_FS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1208,6 +1191,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 CONFIG_SCHEDSTATS=y
 # CONFIG_TIMER_STATS is not set
@@ -1241,15 +1227,21 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_LATENCYTOP is not set
 # CONFIG_SYSCTL_SYSCALL_CHECK is not set
 # CONFIG_PAGE_POISONING is not set
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 # CONFIG_EARLY_SCIF_CONSOLE is not set
 # CONFIG_DEBUG_BOOTMEM is not set
@@ -1354,6 +1346,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig
index 5c11236..1f3cc98 100644
--- a/arch/sh/configs/dreamcast_defconfig
+++ b/arch/sh/configs/dreamcast_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:51:48 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:44:27 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -71,6 +72,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -90,6 +92,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -98,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7091=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7091=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -271,7 +274,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_SH_DMA_API=y
 CONFIG_SH_DMA=y
 CONFIG_SH_DMA_IRQ_MULTI=y
-CONFIG_NR_ONCHIP_DMA_CHANNELS=6
+CONFIG_NR_ONCHIP_DMA_CHANNELS=4
 CONFIG_NR_DMA_CHANNELS_BOOL=y
 CONFIG_NR_DMA_CHANNELS=9
 # CONFIG_PVR2_DMA is not set
@@ -320,7 +323,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 panic=3"
 CONFIG_MAPLE=y
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -602,6 +604,7 @@ CONFIG_INPUT_MOUSE=y
 # CONFIG_MOUSE_APPLETOUCH is not set
 # CONFIG_MOUSE_BCM5974 is not set
 # CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_MOUSE_MAPLE is not set
 # CONFIG_INPUT_JOYSTICK is not set
 # CONFIG_INPUT_TABLET is not set
 # CONFIG_INPUT_TOUCHSCREEN is not set
@@ -812,7 +815,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -867,6 +869,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -910,6 +917,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -947,10 +955,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1051,6 +1073,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig
index f4c34b0..d709245 100644
--- a/arch/sh/configs/edosk7705_defconfig
+++ b/arch/sh/configs/edosk7705_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:54:02 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:45:04 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -56,6 +57,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 # CONFIG_PRINTK is not set
 # CONFIG_BUG is not set
@@ -74,12 +76,15 @@ CONFIG_SHMEM=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_BASE_SMALL=1
 # CONFIG_MODULES is not set
@@ -114,6 +119,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -123,8 +129,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -382,6 +386,10 @@ CONFIG_SSB_POSSIBLE=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+
+#
 # Pseudo filesystems
 #
 # CONFIG_PROC_FS is not set
@@ -409,10 +417,22 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -426,6 +446,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index 7825c26..a822b1d 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:54:57 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:45:25 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -77,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -96,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -103,6 +106,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -115,7 +120,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -161,6 +165,7 @@ CONFIG_CPU_SH4=y
 CONFIG_CPU_SUBTYPE_SH7760=y
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -170,8 +175,6 @@ CONFIG_CPU_SUBTYPE_SH7760=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -815,6 +818,7 @@ CONFIG_EXT2_FS_XATTR=y
 # CONFIG_EXT2_FS_SECURITY is not set
 CONFIG_EXT2_FS_XIP=y
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -839,6 +843,11 @@ CONFIG_INOTIFY_USER=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -883,6 +892,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -964,6 +974,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 CONFIG_TIMER_STATS=y
@@ -1001,6 +1014,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1010,9 +1024,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1126,6 +1145,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index ebb4c37..c5b5007 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:58:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:46:26 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -548,6 +550,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -919,6 +922,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -943,6 +947,11 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -985,8 +994,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1075,11 +1089,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1179,6 +1207,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig
index 82b113a..8e13027 100644
--- a/arch/sh/configs/hp6xx_defconfig
+++ b/arch/sh/configs/hp6xx_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:01:05 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:47:15 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,6 +68,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -85,12 +87,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -98,7 +103,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -144,6 +148,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -153,8 +158,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -385,6 +388,7 @@ CONFIG_BLK_DEV_SD=y
 # CONFIG_SCSI_SRP_ATTRS is not set
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
@@ -431,6 +435,7 @@ CONFIG_KEYBOARD_HP6XX=y
 # CONFIG_INPUT_JOYSTICK is not set
 # CONFIG_INPUT_TABLET is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -674,6 +679,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -719,6 +729,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -786,10 +797,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -898,6 +922,7 @@ CONFIG_CRYPTO_MD5=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index b6fa4a7..7f549ae 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:02:54 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:47:48 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -292,8 +295,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -602,6 +603,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -706,6 +708,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -729,6 +732,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -922,6 +926,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
 # CONFIG_SOC_CAMERA is not set
 CONFIG_V4L_USB_DRIVERS=y
 # CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
 # CONFIG_USB_GSPCA is not set
 # CONFIG_VIDEO_HDPVR is not set
 CONFIG_VIDEO_USBVIDEO=m
@@ -994,15 +999,17 @@ CONFIG_USB_HID=m
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=m
 CONFIG_HID_APPLE=m
 CONFIG_HID_BELKIN=m
 CONFIG_HID_CHERRY=m
 CONFIG_HID_CHICONY=m
 CONFIG_HID_CYPRESS=m
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=m
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=m
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=m
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1197,6 +1204,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1222,6 +1230,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=m
@@ -1270,10 +1283,15 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 CONFIG_UFS_FS=m
 # CONFIG_UFS_FS_WRITE is not set
 # CONFIG_UFS_DEBUG is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 CONFIG_NFS_V3=y
@@ -1364,10 +1382,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1469,6 +1500,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig
index 92c515c..a7db539 100644
--- a/arch/sh/configs/lboxre2_defconfig
+++ b/arch/sh/configs/lboxre2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:06:51 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:48:54 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -293,8 +296,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -542,6 +543,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -702,6 +704,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -725,6 +728,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -931,7 +935,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1007,6 +1010,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1029,6 +1033,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1073,8 +1082,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1151,10 +1165,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1256,6 +1283,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig
index 26586c2..58bec61 100644
--- a/arch/sh/configs/magicpanelr2_defconfig
+++ b/arch/sh/configs/magicpanelr2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:07:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:49:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 # CONFIG_TASKSTATS is not set
@@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +96,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -101,6 +104,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -113,7 +118,6 @@ CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +163,7 @@ CONFIG_CPU_SUBTYPE_SH7720=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7720=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -813,6 +816,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
 # CONFIG_EXT4_FS is not set
 CONFIG_JBD=y
@@ -831,6 +835,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -884,6 +893,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -965,6 +975,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1000,6 +1011,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1008,9 +1020,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1035,6 +1052,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig
index 7517835..2886fc8 100644
--- a/arch/sh/configs/microdev_defconfig
+++ b/arch/sh/configs/microdev_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:11:13 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:50:51 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +93,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +154,7 @@ CONFIG_CPU_SH4=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 CONFIG_CPU_SUBTYPE_SH4_202=y
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH4_202=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -647,6 +649,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -669,6 +672,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -715,6 +723,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -802,10 +811,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -914,6 +937,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index a8720f9..8ecceb4 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:14:03 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:51:34 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +76,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -104,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -115,7 +118,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -165,6 +167,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -174,8 +177,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 CONFIG_CPU_SUBTYPE_SH7722=y
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -577,6 +578,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -873,7 +875,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -1012,6 +1013,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1056,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1105,11 +1112,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1218,6 +1239,7 @@ CONFIG_CRYPTO_WORKQUEUE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig
index df2d177d..2b95072 100644
--- a/arch/sh/configs/polaris_defconfig
+++ b/arch/sh/configs/polaris_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:16:48 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:52:19 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 # CONFIG_TASKSTATS is not set
@@ -76,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +97,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -101,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -113,7 +119,6 @@ CONFIG_MODVERSIONS=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -778,6 +782,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -831,6 +840,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -874,6 +884,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -914,6 +927,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -923,9 +937,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -950,6 +969,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index 7def4df..68e5eff 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:20:17 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:53:28 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -107,6 +109,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -118,7 +122,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -165,6 +168,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -174,8 +178,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -313,8 +315,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -536,6 +536,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -696,6 +697,7 @@ CONFIG_E1000=m
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -719,6 +721,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -920,6 +923,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -1027,7 +1031,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1120,6 +1123,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1143,6 +1147,11 @@ CONFIG_INOTIFY_USER=y
 CONFIG_FUSE_FS=m
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1191,6 +1200,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1279,6 +1289,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1316,6 +1329,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1325,11 +1339,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1449,6 +1468,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index cb134ff..890fe3c 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:24:35 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:55:10 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -43,6 +44,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -78,6 +80,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -108,6 +111,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -120,7 +125,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -168,6 +172,7 @@ CONFIG_CPU_SHX2=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -177,8 +182,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -337,8 +340,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 # CONFIG_PCI_LEGACY is not set
@@ -561,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -698,6 +700,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -721,6 +724,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -948,6 +952,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -970,6 +975,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
 # CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_SHT15 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1109,7 +1115,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1202,6 +1207,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1225,6 +1231,11 @@ CONFIG_INOTIFY_USER=y
 CONFIG_FUSE_FS=m
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1273,6 +1284,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1359,6 +1371,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1401,6 +1414,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1410,11 +1424,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1534,6 +1553,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig
index a037c74..fa43957 100644
--- a/arch/sh/configs/rsk7201_defconfig
+++ b/arch/sh/configs/rsk7201_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:29:08 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:56:29 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -100,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -110,7 +113,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -157,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7201=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -166,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7201=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -603,6 +604,11 @@ CONFIG_EXT2_FS=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -651,8 +657,13 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -686,11 +697,24 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -705,6 +729,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig
index 9ae28e8..e3a65f8 100644
--- a/arch/sh/configs/rsk7203_defconfig
+++ b/arch/sh/configs/rsk7203_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:30:34 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:57:06 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -70,7 +72,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -80,6 +81,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -116,7 +120,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7203=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7203=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -750,15 +752,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -874,6 +878,7 @@ CONFIG_LEDS_CLASS=y
 # LED drivers
 #
 CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_GPIO_PLATFORM=y
 
 #
 # LED Triggers
@@ -882,6 +887,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_LEDS_TRIGGER_BACKLIGHT=y
+# CONFIG_LEDS_TRIGGER_GPIO is not set
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 
 #
@@ -951,6 +957,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -989,8 +1000,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1033,6 +1049,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1076,6 +1095,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1083,11 +1103,16 @@ CONFIG_TRACING=y
 # CONFIG_FUNCTION_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1111,6 +1136,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig
index c0f741a..a4a59f6 100644
--- a/arch/sh/configs/rts7751r2d1_defconfig
+++ b/arch/sh/configs/rts7751r2d1_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:33:25 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:58:13 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=1
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1291,6 +1297,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index 8feef62..2bc1c97 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:34:12 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:59:01 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=1
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1291,6 +1297,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index 739e229..a629b79 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:34:43 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:59:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -41,6 +42,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -93,6 +96,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -100,6 +104,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -112,7 +118,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 CONFIG_LBD=y
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +164,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -310,8 +314,6 @@ CONFIG_CMDLINE="mem=128M console=tty0 console=ttySC0,115200 ip=bootp root=/dev/n
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 # CONFIG_PCI_LEGACY is not set
@@ -646,6 +648,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -1091,15 +1094,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1257,6 +1262,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 # CONFIG_EXT2_FS_SECURITY is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1281,6 +1287,11 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=y
@@ -1331,6 +1342,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1418,6 +1430,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 CONFIG_TIMER_STATS=y
@@ -1455,6 +1470,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1464,9 +1480,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1580,6 +1601,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index d30e0a7..5caf85a 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:39:37 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:01:02 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -63,6 +65,7 @@ CONFIG_CGROUP_DEBUG=y
 CONFIG_CGROUP_NS=y
 # CONFIG_CGROUP_FREEZER is not set
 CONFIG_CGROUP_DEVICE=y
+# CONFIG_CPUSETS is not set
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
 CONFIG_CGROUP_MEM_RES_CTLR=y
@@ -80,7 +83,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -90,6 +92,7 @@ CONFIG_EMBEDDED=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -116,6 +119,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -127,7 +132,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -174,6 +178,7 @@ CONFIG_CPU_SUBTYPE_SH7206=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -183,8 +188,6 @@ CONFIG_CPU_SUBTYPE_SH7206=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -746,6 +749,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -785,8 +793,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -831,6 +844,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -869,6 +885,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -876,11 +893,16 @@ CONFIG_TRACING=y
 # CONFIG_FUNCTION_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -991,6 +1013,7 @@ CONFIG_CRYPTO_LZO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig
index fbb72d0..004d531 100644
--- a/arch/sh/configs/se7343_defconfig
+++ b/arch/sh/configs/se7343_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:42:00 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:01:44 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -73,6 +75,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -91,6 +94,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -109,7 +115,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -158,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -167,8 +173,6 @@ CONFIG_ARCH_SHMOBILE=y
 CONFIG_CPU_SUBTYPE_SH7343=y
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -769,6 +773,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
 # CONFIG_SOC_CAMERA is not set
 CONFIG_V4L_USB_DRIVERS=y
 # CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
 CONFIG_USB_GSPCA=m
 # CONFIG_USB_M5602 is not set
 # CONFIG_USB_STV06XX is not set
@@ -800,6 +805,7 @@ CONFIG_USB_GSPCA=m
 # CONFIG_VIDEO_PVRUSB2 is not set
 # CONFIG_VIDEO_HDPVR is not set
 # CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_CX231XX is not set
 # CONFIG_VIDEO_USBVISION is not set
 # CONFIG_USB_VICAM is not set
 # CONFIG_USB_IBMCAM is not set
@@ -813,6 +819,7 @@ CONFIG_USB_GSPCA=m
 # CONFIG_USB_STV680 is not set
 # CONFIG_USB_ZC0301 is not set
 # CONFIG_USB_PWC is not set
+CONFIG_USB_PWC_INPUT_EVDEV=y
 # CONFIG_USB_ZR364XX is not set
 # CONFIG_USB_STKWEBCAM is not set
 # CONFIG_USB_S2255 is not set
@@ -914,15 +921,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1050,6 +1059,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1071,6 +1081,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1125,6 +1140,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1174,10 +1190,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1279,6 +1308,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig
index 125304e..edbece5 100644
--- a/arch/sh/configs/se7619_defconfig
+++ b/arch/sh/configs/se7619_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:44:53 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:02:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -61,6 +62,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -78,11 +80,14 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=1
@@ -134,6 +139,7 @@ CONFIG_CPU_SUBTYPE_SH7619=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -143,8 +149,6 @@ CONFIG_CPU_SUBTYPE_SH7619=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -513,7 +517,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -564,6 +567,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -601,8 +609,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -630,10 +643,21 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -647,6 +671,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig
index 0308abf..bae161c 100644
--- a/arch/sh/configs/se7705_defconfig
+++ b/arch/sh/configs/se7705_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:45:56 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:02:52 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -62,7 +63,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -70,6 +70,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,12 +89,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -150,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -159,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -698,7 +701,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -752,6 +754,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -804,6 +811,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -847,10 +855,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -949,6 +970,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig
index a8c24b7..330043f 100644
--- a/arch/sh/configs/se7712_defconfig
+++ b/arch/sh/configs/se7712_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:48:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:03:27 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -38,6 +39,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -69,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -87,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -94,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +111,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7712=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7712=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -585,6 +589,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -812,6 +817,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -833,6 +839,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -887,6 +898,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -927,6 +939,7 @@ CONFIG_FRAME_WARN=1024
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -961,6 +974,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -969,9 +983,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1090,6 +1109,7 @@ CONFIG_CRYPTO_DEFLATE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig
index 4b79c25..5647891 100644
--- a/arch/sh/configs/se7721_defconfig
+++ b/arch/sh/configs/se7721_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:51:44 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:04:19 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -38,6 +39,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -91,6 +94,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -109,7 +115,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -155,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7721=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -164,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7721=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -776,15 +780,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -943,6 +949,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -964,6 +971,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1021,6 +1033,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 # CONFIG_NETWORK_FILESYSTEMS is not set
 
 #
@@ -1085,6 +1098,7 @@ CONFIG_FRAME_WARN=1024
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1119,6 +1133,7 @@ CONFIG_FRAME_POINTER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1127,9 +1142,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1248,6 +1268,7 @@ CONFIG_CRYPTO_DEFLATE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig
index 82bdaac..726fdbd 100644
--- a/arch/sh/configs/se7722_defconfig
+++ b/arch/sh/configs/se7722_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:55:10 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:05:29 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -78,6 +78,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 CONFIG_CPU_SUBTYPE_SH7722=y
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -486,6 +488,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -692,7 +695,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -766,6 +768,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -789,6 +792,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -832,6 +840,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -872,11 +881,25 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -977,6 +1000,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig
index ceef6d9..ed1a123 100644
--- a/arch/sh/configs/se7750_defconfig
+++ b/arch/sh/configs/se7750_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:57:31 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:06:02 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -106,7 +111,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -152,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7750=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -161,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7750=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -775,6 +779,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -829,6 +838,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -886,10 +896,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -989,6 +1012,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig
index 67fc26b3..55f3b78 100644
--- a/arch/sh/configs/se7751_defconfig
+++ b/arch/sh/configs/se7751_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:59:59 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:06:44 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,6 +93,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -742,6 +744,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -796,6 +803,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -833,10 +841,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -936,6 +957,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig
index ebce23c..c4f0af3 100644
--- a/arch/sh/configs/se7780_defconfig
+++ b/arch/sh/configs/se7780_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:02:05 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:07:14 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,6 +68,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -86,12 +88,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -103,7 +108,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
 #
@@ -149,6 +153,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -158,8 +163,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -285,8 +288,6 @@ CONFIG_CMDLINE="console=ttySC0.115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -558,6 +559,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -957,15 +959,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1122,6 +1126,10 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1245,11 +1253,24 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1345,6 +1366,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig
index 6fcdb09..f9c6e51 100644
--- a/arch/sh/configs/sh03_defconfig
+++ b/arch/sh/configs/sh03_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:04:59 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:07:56 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -41,6 +42,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODVERSIONS=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -300,8 +302,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 mem=64M root=/dev/nfs"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -562,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -656,6 +657,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -679,6 +681,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -887,7 +890,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -929,6 +931,7 @@ CONFIG_EXT2_FS_XATTR=y
 # CONFIG_EXT2_FS_SECURITY is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -952,6 +955,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=m
@@ -1001,6 +1009,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1112,11 +1121,26 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1228,6 +1252,7 @@ CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig
index 1ab37c0..48b5809 100644
--- a/arch/sh/configs/sh7710voipgw_defconfig
+++ b/arch/sh/configs/sh7710voipgw_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:09:01 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:09:16 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -72,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -90,6 +93,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -97,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -108,7 +114,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -154,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7710=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -163,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7710=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -728,7 +732,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -780,6 +783,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -834,6 +842,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -871,11 +880,24 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -975,6 +997,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
index 268d04e..ec8f18c 100644
--- a/arch/sh/configs/sh7724_generic_defconfig
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Thu Apr 16 15:42:20 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:09:47 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index c79bb84..f77bc79 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:10:57 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:10:12 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -555,6 +557,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -941,6 +944,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -965,6 +969,11 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1012,6 +1021,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1100,11 +1110,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1204,6 +1228,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index a6cf4505..1c55b80 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:12:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:10:53 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -79,6 +80,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -98,6 +100,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -106,6 +109,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -118,7 +123,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -166,6 +170,7 @@ CONFIG_CPU_SHX2=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -175,8 +180,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -310,8 +313,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1009,15 +1011,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1218,6 +1222,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1240,6 +1245,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1289,6 +1299,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1377,6 +1388,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1413,6 +1427,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1422,9 +1437,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1542,6 +1562,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig
index e4fac2e..4385fe9 100644
--- a/arch/sh/configs/sh7785lcr_defconfig
+++ b/arch/sh/configs/sh7785lcr_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Wed Apr 22 19:17:56 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:11:48 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
@@ -307,8 +307,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig
index d695e90..4e12025 100644
--- a/arch/sh/configs/shmin_defconfig
+++ b/arch/sh/configs/shmin_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:19:03 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:12:41 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -63,6 +64,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -81,12 +83,15 @@ CONFIG_COMPAT_BRK=y
 # CONFIG_SLUB is not set
 CONFIG_SLOB=y
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_BASE_SMALL=1
 # CONFIG_MODULES is not set
@@ -137,6 +142,7 @@ CONFIG_CPU_SUBTYPE_SH7706=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -146,8 +152,6 @@ CONFIG_CPU_SUBTYPE_SH7706=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -675,6 +679,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -718,6 +727,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -762,10 +772,22 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -864,6 +886,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index e3651f5..c088144 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:20:54 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:13:12 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -42,6 +43,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -96,6 +98,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -126,6 +129,8 @@ CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -138,7 +143,6 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -186,6 +190,7 @@ CONFIG_CPU_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -195,8 +200,6 @@ CONFIG_CPU_SUBTYPE_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -646,6 +650,7 @@ CONFIG_DEVKMEM=y
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=2
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -985,6 +990,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1008,6 +1014,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1051,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1085,6 +1097,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1124,6 +1139,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1133,11 +1149,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1245,6 +1266,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/snapgear_defconfig b/arch/sh/configs/snapgear_defconfig
index 6960f60..54a7a3c 100644
--- a/arch/sh/configs/snapgear_defconfig
+++ b/arch/sh/configs/snapgear_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:21:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:00 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -64,7 +65,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -73,6 +73,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +93,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -295,8 +297,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -763,6 +763,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -805,8 +810,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -844,10 +854,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -862,6 +885,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/systemh_defconfig b/arch/sh/configs/systemh_defconfig
index 7ea639b..dbe7e54 100644
--- a/arch/sh/configs/systemh_defconfig
+++ b/arch/sh/configs/systemh_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:23:31 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:33 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -61,7 +62,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -70,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +89,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +97,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +111,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -506,6 +508,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -547,8 +554,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -577,10 +589,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -595,6 +621,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index bbeb4c6..8ca94ef 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:24:55 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:55 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -95,6 +97,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -114,7 +119,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -160,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -169,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -305,8 +308,6 @@ CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -789,6 +790,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -906,6 +908,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -929,6 +932,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1182,7 +1186,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1393,6 +1396,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
 # CONFIG_EXT4_FS is not set
 CONFIG_JBD=y
@@ -1419,6 +1423,11 @@ CONFIG_INOTIFY_USER=y
 CONFIG_FUSE_FS=m
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=m
@@ -1467,8 +1476,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1576,6 +1590,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1610,6 +1625,7 @@ CONFIG_SCHED_DEBUG=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1618,9 +1634,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1741,6 +1762,7 @@ CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig
index 34f5192..bfb4d98 100644
--- a/arch/sh/configs/ul2_defconfig
+++ b/arch/sh/configs/ul2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:30:27 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:17:05 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -78,6 +78,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 CONFIG_CPU_SUBTYPE_SH7366=y
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -584,6 +586,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -936,6 +939,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -958,6 +962,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1005,6 +1014,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1095,10 +1105,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1208,6 +1232,7 @@ CONFIG_CRYPTO_ARC4=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index d174b1a..512664f 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:33:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 14:02:55 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +96,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -114,7 +119,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -162,6 +166,7 @@ CONFIG_CPU_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -171,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7786=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -900,15 +903,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1046,6 +1051,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1068,6 +1074,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 # CONFIG_ISO9660_FS is not set
@@ -1117,6 +1128,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1209,10 +1221,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1322,6 +1348,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
-- 
cgit v0.10.2


From 5be7c0a4d3dfe25091f2e4e524103e81d9e7e180 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 14:40:47 +0900
Subject: sh: select GENERIC_TIME for new CMT driver.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4c68fde..1647489 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -476,10 +476,11 @@ config SH_TIMER_CMT
 	bool "CMT clockevents driver"
 	depends on SYS_SUPPORTS_CMT && !SH_CMT
 	select GENERIC_CLOCKEVENTS
+	select GENERIC_TIME
 
 config SH_MTU2
 	bool "MTU2 timer support"
-	depends on CPU_SH2A
+	depends on CPU_SH2A && !GENERIC_TIME
 	default y
 	help
 	  This enables the use of the MTU2 as the system timer.
-- 
cgit v0.10.2


From 148be2c15d4a866fbc7a8f55342e4fd4de73be61 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Apr 2009 08:02:14 +0200
Subject: perf_counter tools: move helper library to util/*

Clean up the top level directory a bit by moving all the helper libraries
to util/*.[ch].

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 690045e..543ccf2 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -147,7 +147,7 @@ all::
 # broken, or spawning external process is slower than built-in grep perf has).
 
 PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-	@$(SHELL_PATH) ./PERF-VERSION-GEN
+	@$(SHELL_PATH) util/PERF-VERSION-GEN
 -include PERF-VERSION-FILE
 
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
@@ -287,26 +287,28 @@ export PERL_PATH
 LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
-LIB_H += levenshtein.h
-LIB_H += parse-options.h
-LIB_H += quote.h
-LIB_H += strbuf.h
-LIB_H += run-command.h
-
-LIB_OBJS += abspath.o
-LIB_OBJS += alias.o
-LIB_OBJS += config.o
-LIB_OBJS += ctype.o
-LIB_OBJS += exec_cmd.o
-LIB_OBJS += help.o
-LIB_OBJS += levenshtein.o
-LIB_OBJS += parse-options.o
-LIB_OBJS += path.o
-LIB_OBJS += run-command.o
-LIB_OBJS += quote.o
-LIB_OBJS += strbuf.o
-LIB_OBJS += usage.o
-LIB_OBJS += wrapper.o
+LIB_H += util/levenshtein.h
+LIB_H += util/parse-options.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/help.h
+LIB_H += util/strbuf.h
+LIB_H += util/run-command.h
+
+LIB_OBJS += util/abspath.o
+LIB_OBJS += util/alias.o
+LIB_OBJS += util/config.o
+LIB_OBJS += util/ctype.o
+LIB_OBJS += util/exec_cmd.o
+LIB_OBJS += util/help.o
+LIB_OBJS += util/levenshtein.o
+LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/path.o
+LIB_OBJS += util/run-command.o
+LIB_OBJS += util/quote.o
+LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/usage.o
+LIB_OBJS += util/wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
@@ -620,10 +622,10 @@ $(BUILT_INS): perf$X
 	ln -s perf$X $@ 2>/dev/null || \
 	cp perf$X $@
 
-common-cmds.h: ./generate-cmdlist.sh command-list.txt
+common-cmds.h: util/generate-cmdlist.sh command-list.txt
 
 common-cmds.h: $(wildcard Documentation/perf-*.txt)
-	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
+	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
 
 $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	$(QUIET_GEN)$(RM) $@ $@+ && \
@@ -656,7 +658,7 @@ perf.o perf.spec \
 %.o: %.S
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
 
-exec_cmd.o: exec_cmd.c PERF-CFLAGS
+util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
 		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
 		'-DBINDIR="$(bindir_relative_SQ)"' \
@@ -666,7 +668,7 @@ exec_cmd.o: exec_cmd.c PERF-CFLAGS
 builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
 
-config.o: config.c PERF-CFLAGS
+util/config.o: util/config.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
 perf-%$X: %.o $(PERFLIBS)
@@ -729,7 +731,7 @@ export NO_SVN_TESTS
 check: common-cmds.h
 	if sparse; \
 	then \
-		for i in *.c; \
+		for i in *.c */*.c; \
 		do \
 			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
 		done; \
@@ -795,7 +797,7 @@ distclean: clean
 	$(RM) configure
 
 clean:
-	$(RM) *.o $(LIB_FILE)
+	$(RM) *.o */*.o $(LIB_FILE)
 	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
@@ -836,7 +838,7 @@ coverage-build: coverage-clean
 		-j1 test
 
 coverage-report:
-	gcov -b *.c
-	grep '^function.*called 0 ' *.c.gcov \
+	gcov -b *.c */*.c
+	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
 		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
 		| tee coverage-untested-functions
diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/PERF-VERSION-GEN
deleted file mode 100755
index c561d15..0000000
--- a/Documentation/perf_counter/PERF-VERSION-GEN
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/sh
-
-GVF=PERF-VERSION-FILE
-DEF_VER=v0.0.1.PERF
-
-LF='
-'
-
-# First see if there is a version file (included in release tarballs),
-# then try git-describe, then default.
-if test -f version
-then
-	VN=$(cat version) || VN="$DEF_VER"
-elif test -d .git -o -f .git &&
-	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
-	case "$VN" in
-	*$LF*) (exit 1) ;;
-	v[0-9]*)
-		git update-index -q --refresh
-		test -z "$(git diff-index --name-only HEAD --)" ||
-		VN="$VN-dirty" ;;
-	esac
-then
-	VN=$(echo "$VN" | sed -e 's/-/./g');
-else
-	VN="$DEF_VER"
-fi
-
-VN=$(expr "$VN" : v*'\(.*\)')
-
-if test -r $GVF
-then
-	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
-else
-	VC=unset
-fi
-test "$VN" = "$VC" || {
-	echo >&2 "PERF_VERSION = $VN"
-	echo "PERF_VERSION = $VN" >$GVF
-}
-
-
diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/abspath.c
deleted file mode 100644
index 649f34f..0000000
--- a/Documentation/perf_counter/abspath.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "cache.h"
-
-/*
- * Do not use this for inspecting *tracked* content.  When path is a
- * symlink to a directory, we do not want to say it is a directory when
- * dealing with tracked content in the working tree.
- */
-int is_directory(const char *path)
-{
-	struct stat st;
-	return (!stat(path, &st) && S_ISDIR(st.st_mode));
-}
-
-/* We allow "recursive" symbolic links. Only within reason, though. */
-#define MAXDEPTH 5
-
-const char *make_absolute_path(const char *path)
-{
-	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
-	char cwd[1024] = "";
-	int buf_index = 1, len;
-
-	int depth = MAXDEPTH;
-	char *last_elem = NULL;
-	struct stat st;
-
-	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
-		die ("Too long path: %.*s", 60, path);
-
-	while (depth--) {
-		if (!is_directory(buf)) {
-			char *last_slash = strrchr(buf, '/');
-			if (last_slash) {
-				*last_slash = '\0';
-				last_elem = xstrdup(last_slash + 1);
-			} else {
-				last_elem = xstrdup(buf);
-				*buf = '\0';
-			}
-		}
-
-		if (*buf) {
-			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
-				die ("Could not get current working directory");
-
-			if (chdir(buf))
-				die ("Could not switch to '%s'", buf);
-		}
-		if (!getcwd(buf, PATH_MAX))
-			die ("Could not get current working directory");
-
-		if (last_elem) {
-			int len = strlen(buf);
-			if (len + strlen(last_elem) + 2 > PATH_MAX)
-				die ("Too long path name: '%s/%s'",
-						buf, last_elem);
-			buf[len] = '/';
-			strcpy(buf + len + 1, last_elem);
-			free(last_elem);
-			last_elem = NULL;
-		}
-
-		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
-			len = readlink(buf, next_buf, PATH_MAX);
-			if (len < 0)
-				die ("Invalid symlink: %s", buf);
-			if (PATH_MAX <= len)
-				die("symbolic link too long: %s", buf);
-			next_buf[len] = '\0';
-			buf = next_buf;
-			buf_index = 1 - buf_index;
-			next_buf = bufs[buf_index];
-		} else
-			break;
-	}
-
-	if (*cwd && chdir(cwd))
-		die ("Could not change back to '%s'", cwd);
-
-	return buf;
-}
-
-static const char *get_pwd_cwd(void)
-{
-	static char cwd[PATH_MAX + 1];
-	char *pwd;
-	struct stat cwd_stat, pwd_stat;
-	if (getcwd(cwd, PATH_MAX) == NULL)
-		return NULL;
-	pwd = getenv("PWD");
-	if (pwd && strcmp(pwd, cwd)) {
-		stat(cwd, &cwd_stat);
-		if (!stat(pwd, &pwd_stat) &&
-		    pwd_stat.st_dev == cwd_stat.st_dev &&
-		    pwd_stat.st_ino == cwd_stat.st_ino) {
-			strlcpy(cwd, pwd, PATH_MAX);
-		}
-	}
-	return cwd;
-}
-
-const char *make_nonrelative_path(const char *path)
-{
-	static char buf[PATH_MAX + 1];
-
-	if (is_absolute_path(path)) {
-		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
-			die("Too long path: %.*s", 60, path);
-	} else {
-		const char *cwd = get_pwd_cwd();
-		if (!cwd)
-			die("Cannot determine the current working directory");
-		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
-			die("Too long path: %.*s", 60, path);
-	}
-	return buf;
-}
diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/alias.c
deleted file mode 100644
index 9b3dd2b..0000000
--- a/Documentation/perf_counter/alias.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "cache.h"
-
-static const char *alias_key;
-static char *alias_val;
-
-static int alias_lookup_cb(const char *k, const char *v, void *cb)
-{
-	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
-		if (!v)
-			return config_error_nonbool(k);
-		alias_val = strdup(v);
-		return 0;
-	}
-	return 0;
-}
-
-char *alias_lookup(const char *alias)
-{
-	alias_key = alias;
-	alias_val = NULL;
-	perf_config(alias_lookup_cb, NULL);
-	return alias_val;
-}
-
-int split_cmdline(char *cmdline, const char ***argv)
-{
-	int src, dst, count = 0, size = 16;
-	char quoted = 0;
-
-	*argv = malloc(sizeof(char*) * size);
-
-	/* split alias_string */
-	(*argv)[count++] = cmdline;
-	for (src = dst = 0; cmdline[src];) {
-		char c = cmdline[src];
-		if (!quoted && isspace(c)) {
-			cmdline[dst++] = 0;
-			while (cmdline[++src]
-					&& isspace(cmdline[src]))
-				; /* skip */
-			if (count >= size) {
-				size += 16;
-				*argv = realloc(*argv, sizeof(char*) * size);
-			}
-			(*argv)[count++] = cmdline + dst;
-		} else if (!quoted && (c == '\'' || c == '"')) {
-			quoted = c;
-			src++;
-		} else if (c == quoted) {
-			quoted = 0;
-			src++;
-		} else {
-			if (c == '\\' && quoted != '\'') {
-				src++;
-				c = cmdline[src];
-				if (!c) {
-					free(*argv);
-					*argv = NULL;
-					return error("cmdline ends with \\");
-				}
-			}
-			cmdline[dst++] = c;
-			src++;
-		}
-	}
-
-	cmdline[dst] = 0;
-
-	if (quoted) {
-		free(*argv);
-		*argv = NULL;
-		return error("unclosed quote");
-	}
-
-	return count;
-}
-
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index a136d61..6616de0 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -3,13 +3,13 @@
  *
  * Builtin help command
  */
-#include "cache.h"
+#include "util/cache.h"
 #include "builtin.h"
-#include "exec_cmd.h"
+#include "util/exec_cmd.h"
 #include "common-cmds.h"
-#include "parse-options.h"
-#include "run-command.h"
-#include "help.h"
+#include "util/parse-options.h"
+#include "util/run-command.h"
+#include "util/help.h"
 
 static struct man_viewer_list {
 	struct man_viewer_list *next;
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 169a2d1..d7ace63 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -61,7 +61,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-#include "util.h"
+#include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 98e8690..dea016f 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,7 +42,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-#include "util.h"
+#include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 800f86c..d32318a 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -1,8 +1,8 @@
 #ifndef BUILTIN_H
 #define BUILTIN_H
 
-#include "util.h"
-#include "strbuf.h"
+#include "util/util.h"
+#include "util/strbuf.h"
 
 extern const char perf_version_string[];
 extern const char perf_usage_string[];
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h
deleted file mode 100644
index 7108051..0000000
--- a/Documentation/perf_counter/cache.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef CACHE_H
-#define CACHE_H
-
-#include "util.h"
-#include "strbuf.h"
-
-#define PERF_DIR_ENVIRONMENT "PERF_DIR"
-#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
-#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
-#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
-#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
-#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
-#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
-#define CONFIG_ENVIRONMENT "PERF_CONFIG"
-#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
-#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
-#define PERFATTRIBUTES_FILE ".perfattributes"
-#define INFOATTRIBUTES_FILE "info/attributes"
-#define ATTRIBUTE_MACRO_PREFIX "[attr]"
-
-typedef int (*config_fn_t)(const char *, const char *, void *);
-extern int perf_default_config(const char *, const char *, void *);
-extern int perf_config_from_file(config_fn_t fn, const char *, void *);
-extern int perf_config(config_fn_t fn, void *);
-extern int perf_parse_ulong(const char *, unsigned long *);
-extern int perf_config_int(const char *, const char *);
-extern unsigned long perf_config_ulong(const char *, const char *);
-extern int perf_config_bool_or_int(const char *, const char *, int *);
-extern int perf_config_bool(const char *, const char *);
-extern int perf_config_string(const char **, const char *, const char *);
-extern int perf_config_set(const char *, const char *);
-extern int perf_config_set_multivar(const char *, const char *, const char *, int);
-extern int perf_config_rename_section(const char *, const char *);
-extern const char *perf_etc_perfconfig(void);
-extern int check_repository_format_version(const char *var, const char *value, void *cb);
-extern int perf_config_system(void);
-extern int perf_config_global(void);
-extern int config_error_nonbool(const char *);
-extern const char *config_exclusive_filename;
-
-#define MAX_PERFNAME (1000)
-extern char perf_default_email[MAX_PERFNAME];
-extern char perf_default_name[MAX_PERFNAME];
-extern int user_ident_explicitly_given;
-
-extern const char *perf_log_output_encoding;
-extern const char *perf_mailmap_file;
-
-/* IO helper functions */
-extern void maybe_flush_or_die(FILE *, const char *);
-extern int copy_fd(int ifd, int ofd);
-extern int copy_file(const char *dst, const char *src, int mode);
-extern ssize_t read_in_full(int fd, void *buf, size_t count);
-extern ssize_t write_in_full(int fd, const void *buf, size_t count);
-extern void write_or_die(int fd, const void *buf, size_t count);
-extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
-extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
-extern void fsync_or_die(int fd, const char *);
-
-/* pager.c */
-extern void setup_pager(void);
-extern const char *pager_program;
-extern int pager_in_use(void);
-extern int pager_use_color;
-
-extern const char *editor_program;
-extern const char *excludes_file;
-
-char *alias_lookup(const char *alias);
-int split_cmdline(char *cmdline, const char ***argv);
-
-#define alloc_nr(x) (((x)+16)*3/2)
-
-/*
- * Realloc the buffer pointed at by variable 'x' so that it can hold
- * at least 'nr' entries; the number of entries currently allocated
- * is 'alloc', using the standard growing factor alloc_nr() macro.
- *
- * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
- */
-#define ALLOC_GROW(x, nr, alloc) \
-	do { \
-		if ((nr) > alloc) { \
-			if (alloc_nr(alloc) < (nr)) \
-				alloc = (nr); \
-			else \
-				alloc = alloc_nr(alloc); \
-			x = xrealloc((x), alloc * sizeof(*(x))); \
-		} \
-	} while(0)
-
-
-static inline int is_absolute_path(const char *path)
-{
-	return path[0] == '/';
-}
-
-const char *make_absolute_path(const char *path);
-const char *make_nonrelative_path(const char *path);
-const char *make_relative_path(const char *abs, const char *base);
-int normalize_path_copy(char *dst, const char *src);
-int longest_ancestor_length(const char *path, const char *prefix_list);
-char *strip_path_suffix(const char *path, const char *suffix);
-
-extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
-extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
-
-extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
-extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
-extern char *perf_pathdup(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
-
-extern size_t strlcpy(char *dest, const char *src, size_t size);
-
-#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c
deleted file mode 100644
index 3dd13fa..0000000
--- a/Documentation/perf_counter/config.c
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * GIT - The information manager from hell
- *
- * Copyright (C) Linus Torvalds, 2005
- * Copyright (C) Johannes Schindelin, 2005
- *
- */
-#include "util.h"
-#include "cache.h"
-#include "exec_cmd.h"
-
-#define MAXNAME (256)
-
-static FILE *config_file;
-static const char *config_file_name;
-static int config_linenr;
-static int config_file_eof;
-
-const char *config_exclusive_filename = NULL;
-
-static int get_next_char(void)
-{
-	int c;
-	FILE *f;
-
-	c = '\n';
-	if ((f = config_file) != NULL) {
-		c = fgetc(f);
-		if (c == '\r') {
-			/* DOS like systems */
-			c = fgetc(f);
-			if (c != '\n') {
-				ungetc(c, f);
-				c = '\r';
-			}
-		}
-		if (c == '\n')
-			config_linenr++;
-		if (c == EOF) {
-			config_file_eof = 1;
-			c = '\n';
-		}
-	}
-	return c;
-}
-
-static char *parse_value(void)
-{
-	static char value[1024];
-	int quote = 0, comment = 0, len = 0, space = 0;
-
-	for (;;) {
-		int c = get_next_char();
-		if (len >= sizeof(value) - 1)
-			return NULL;
-		if (c == '\n') {
-			if (quote)
-				return NULL;
-			value[len] = 0;
-			return value;
-		}
-		if (comment)
-			continue;
-		if (isspace(c) && !quote) {
-			space = 1;
-			continue;
-		}
-		if (!quote) {
-			if (c == ';' || c == '#') {
-				comment = 1;
-				continue;
-			}
-		}
-		if (space) {
-			if (len)
-				value[len++] = ' ';
-			space = 0;
-		}
-		if (c == '\\') {
-			c = get_next_char();
-			switch (c) {
-			case '\n':
-				continue;
-			case 't':
-				c = '\t';
-				break;
-			case 'b':
-				c = '\b';
-				break;
-			case 'n':
-				c = '\n';
-				break;
-			/* Some characters escape as themselves */
-			case '\\': case '"':
-				break;
-			/* Reject unknown escape sequences */
-			default:
-				return NULL;
-			}
-			value[len++] = c;
-			continue;
-		}
-		if (c == '"') {
-			quote = 1-quote;
-			continue;
-		}
-		value[len++] = c;
-	}
-}
-
-static inline int iskeychar(int c)
-{
-	return isalnum(c) || c == '-';
-}
-
-static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
-{
-	int c;
-	char *value;
-
-	/* Get the full name */
-	for (;;) {
-		c = get_next_char();
-		if (config_file_eof)
-			break;
-		if (!iskeychar(c))
-			break;
-		name[len++] = tolower(c);
-		if (len >= MAXNAME)
-			return -1;
-	}
-	name[len] = 0;
-	while (c == ' ' || c == '\t')
-		c = get_next_char();
-
-	value = NULL;
-	if (c != '\n') {
-		if (c != '=')
-			return -1;
-		value = parse_value();
-		if (!value)
-			return -1;
-	}
-	return fn(name, value, data);
-}
-
-static int get_extended_base_var(char *name, int baselen, int c)
-{
-	do {
-		if (c == '\n')
-			return -1;
-		c = get_next_char();
-	} while (isspace(c));
-
-	/* We require the format to be '[base "extension"]' */
-	if (c != '"')
-		return -1;
-	name[baselen++] = '.';
-
-	for (;;) {
-		int c = get_next_char();
-		if (c == '\n')
-			return -1;
-		if (c == '"')
-			break;
-		if (c == '\\') {
-			c = get_next_char();
-			if (c == '\n')
-				return -1;
-		}
-		name[baselen++] = c;
-		if (baselen > MAXNAME / 2)
-			return -1;
-	}
-
-	/* Final ']' */
-	if (get_next_char() != ']')
-		return -1;
-	return baselen;
-}
-
-static int get_base_var(char *name)
-{
-	int baselen = 0;
-
-	for (;;) {
-		int c = get_next_char();
-		if (config_file_eof)
-			return -1;
-		if (c == ']')
-			return baselen;
-		if (isspace(c))
-			return get_extended_base_var(name, baselen, c);
-		if (!iskeychar(c) && c != '.')
-			return -1;
-		if (baselen > MAXNAME / 2)
-			return -1;
-		name[baselen++] = tolower(c);
-	}
-}
-
-static int perf_parse_file(config_fn_t fn, void *data)
-{
-	int comment = 0;
-	int baselen = 0;
-	static char var[MAXNAME];
-
-	/* U+FEFF Byte Order Mark in UTF8 */
-	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
-	const unsigned char *bomptr = utf8_bom;
-
-	for (;;) {
-		int c = get_next_char();
-		if (bomptr && *bomptr) {
-			/* We are at the file beginning; skip UTF8-encoded BOM
-			 * if present. Sane editors won't put this in on their
-			 * own, but e.g. Windows Notepad will do it happily. */
-			if ((unsigned char) c == *bomptr) {
-				bomptr++;
-				continue;
-			} else {
-				/* Do not tolerate partial BOM. */
-				if (bomptr != utf8_bom)
-					break;
-				/* No BOM at file beginning. Cool. */
-				bomptr = NULL;
-			}
-		}
-		if (c == '\n') {
-			if (config_file_eof)
-				return 0;
-			comment = 0;
-			continue;
-		}
-		if (comment || isspace(c))
-			continue;
-		if (c == '#' || c == ';') {
-			comment = 1;
-			continue;
-		}
-		if (c == '[') {
-			baselen = get_base_var(var);
-			if (baselen <= 0)
-				break;
-			var[baselen++] = '.';
-			var[baselen] = 0;
-			continue;
-		}
-		if (!isalpha(c))
-			break;
-		var[baselen] = tolower(c);
-		if (get_value(fn, data, var, baselen+1) < 0)
-			break;
-	}
-	die("bad config file line %d in %s", config_linenr, config_file_name);
-}
-
-static int parse_unit_factor(const char *end, unsigned long *val)
-{
-	if (!*end)
-		return 1;
-	else if (!strcasecmp(end, "k")) {
-		*val *= 1024;
-		return 1;
-	}
-	else if (!strcasecmp(end, "m")) {
-		*val *= 1024 * 1024;
-		return 1;
-	}
-	else if (!strcasecmp(end, "g")) {
-		*val *= 1024 * 1024 * 1024;
-		return 1;
-	}
-	return 0;
-}
-
-static int perf_parse_long(const char *value, long *ret)
-{
-	if (value && *value) {
-		char *end;
-		long val = strtol(value, &end, 0);
-		unsigned long factor = 1;
-		if (!parse_unit_factor(end, &factor))
-			return 0;
-		*ret = val * factor;
-		return 1;
-	}
-	return 0;
-}
-
-int perf_parse_ulong(const char *value, unsigned long *ret)
-{
-	if (value && *value) {
-		char *end;
-		unsigned long val = strtoul(value, &end, 0);
-		if (!parse_unit_factor(end, &val))
-			return 0;
-		*ret = val;
-		return 1;
-	}
-	return 0;
-}
-
-static void die_bad_config(const char *name)
-{
-	if (config_file_name)
-		die("bad config value for '%s' in %s", name, config_file_name);
-	die("bad config value for '%s'", name);
-}
-
-int perf_config_int(const char *name, const char *value)
-{
-	long ret = 0;
-	if (!perf_parse_long(value, &ret))
-		die_bad_config(name);
-	return ret;
-}
-
-unsigned long perf_config_ulong(const char *name, const char *value)
-{
-	unsigned long ret;
-	if (!perf_parse_ulong(value, &ret))
-		die_bad_config(name);
-	return ret;
-}
-
-int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
-{
-	*is_bool = 1;
-	if (!value)
-		return 1;
-	if (!*value)
-		return 0;
-	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
-		return 1;
-	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
-		return 0;
-	*is_bool = 0;
-	return perf_config_int(name, value);
-}
-
-int perf_config_bool(const char *name, const char *value)
-{
-	int discard;
-	return !!perf_config_bool_or_int(name, value, &discard);
-}
-
-int perf_config_string(const char **dest, const char *var, const char *value)
-{
-	if (!value)
-		return config_error_nonbool(var);
-	*dest = strdup(value);
-	return 0;
-}
-
-static int perf_default_core_config(const char *var, const char *value)
-{
-	/* Add other config variables here and to Documentation/config.txt. */
-	return 0;
-}
-
-int perf_default_config(const char *var, const char *value, void *dummy)
-{
-	if (!prefixcmp(var, "core."))
-		return perf_default_core_config(var, value);
-
-	/* Add other config variables here and to Documentation/config.txt. */
-	return 0;
-}
-
-int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
-{
-	int ret;
-	FILE *f = fopen(filename, "r");
-
-	ret = -1;
-	if (f) {
-		config_file = f;
-		config_file_name = filename;
-		config_linenr = 1;
-		config_file_eof = 0;
-		ret = perf_parse_file(fn, data);
-		fclose(f);
-		config_file_name = NULL;
-	}
-	return ret;
-}
-
-const char *perf_etc_perfconfig(void)
-{
-	static const char *system_wide;
-	if (!system_wide)
-		system_wide = system_path(ETC_PERFCONFIG);
-	return system_wide;
-}
-
-static int perf_env_bool(const char *k, int def)
-{
-	const char *v = getenv(k);
-	return v ? perf_config_bool(k, v) : def;
-}
-
-int perf_config_system(void)
-{
-	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
-}
-
-int perf_config_global(void)
-{
-	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
-}
-
-int perf_config(config_fn_t fn, void *data)
-{
-	int ret = 0, found = 0;
-	char *repo_config = NULL;
-	const char *home = NULL;
-
-	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
-	if (config_exclusive_filename)
-		return perf_config_from_file(fn, config_exclusive_filename, data);
-	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
-		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
-					    data);
-		found += 1;
-	}
-
-	home = getenv("HOME");
-	if (perf_config_global() && home) {
-		char *user_config = strdup(mkpath("%s/.perfconfig", home));
-		if (!access(user_config, R_OK)) {
-			ret += perf_config_from_file(fn, user_config, data);
-			found += 1;
-		}
-		free(user_config);
-	}
-
-	repo_config = perf_pathdup("config");
-	if (!access(repo_config, R_OK)) {
-		ret += perf_config_from_file(fn, repo_config, data);
-		found += 1;
-	}
-	free(repo_config);
-	if (found == 0)
-		return -1;
-	return ret;
-}
-
-/*
- * Find all the stuff for perf_config_set() below.
- */
-
-#define MAX_MATCHES 512
-
-static struct {
-	int baselen;
-	char* key;
-	int do_not_match;
-	regex_t* value_regex;
-	int multi_replace;
-	size_t offset[MAX_MATCHES];
-	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
-	int seen;
-} store;
-
-static int matches(const char* key, const char* value)
-{
-	return !strcmp(key, store.key) &&
-		(store.value_regex == NULL ||
-		 (store.do_not_match ^
-		  !regexec(store.value_regex, value, 0, NULL, 0)));
-}
-
-static int store_aux(const char* key, const char* value, void *cb)
-{
-	const char *ep;
-	size_t section_len;
-
-	switch (store.state) {
-	case KEY_SEEN:
-		if (matches(key, value)) {
-			if (store.seen == 1 && store.multi_replace == 0) {
-				warning("%s has multiple values", key);
-			} else if (store.seen >= MAX_MATCHES) {
-				error("too many matches for %s", key);
-				return 1;
-			}
-
-			store.offset[store.seen] = ftell(config_file);
-			store.seen++;
-		}
-		break;
-	case SECTION_SEEN:
-		/*
-		 * What we are looking for is in store.key (both
-		 * section and var), and its section part is baselen
-		 * long.  We found key (again, both section and var).
-		 * We would want to know if this key is in the same
-		 * section as what we are looking for.  We already
-		 * know we are in the same section as what should
-		 * hold store.key.
-		 */
-		ep = strrchr(key, '.');
-		section_len = ep - key;
-
-		if ((section_len != store.baselen) ||
-		    memcmp(key, store.key, section_len+1)) {
-			store.state = SECTION_END_SEEN;
-			break;
-		}
-
-		/*
-		 * Do not increment matches: this is no match, but we
-		 * just made sure we are in the desired section.
-		 */
-		store.offset[store.seen] = ftell(config_file);
-		/* fallthru */
-	case SECTION_END_SEEN:
-	case START:
-		if (matches(key, value)) {
-			store.offset[store.seen] = ftell(config_file);
-			store.state = KEY_SEEN;
-			store.seen++;
-		} else {
-			if (strrchr(key, '.') - key == store.baselen &&
-			      !strncmp(key, store.key, store.baselen)) {
-					store.state = SECTION_SEEN;
-					store.offset[store.seen] = ftell(config_file);
-			}
-		}
-	}
-	return 0;
-}
-
-static int store_write_section(int fd, const char* key)
-{
-	const char *dot;
-	int i, success;
-	struct strbuf sb = STRBUF_INIT;
-
-	dot = memchr(key, '.', store.baselen);
-	if (dot) {
-		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
-		for (i = dot - key + 1; i < store.baselen; i++) {
-			if (key[i] == '"' || key[i] == '\\')
-				strbuf_addch(&sb, '\\');
-			strbuf_addch(&sb, key[i]);
-		}
-		strbuf_addstr(&sb, "\"]\n");
-	} else {
-		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
-	}
-
-	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
-	strbuf_release(&sb);
-
-	return success;
-}
-
-static int store_write_pair(int fd, const char* key, const char* value)
-{
-	int i, success;
-	int length = strlen(key + store.baselen + 1);
-	const char *quote = "";
-	struct strbuf sb = STRBUF_INIT;
-
-	/*
-	 * Check to see if the value needs to be surrounded with a dq pair.
-	 * Note that problematic characters are always backslash-quoted; this
-	 * check is about not losing leading or trailing SP and strings that
-	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
-	 * configuration parser.
-	 */
-	if (value[0] == ' ')
-		quote = "\"";
-	for (i = 0; value[i]; i++)
-		if (value[i] == ';' || value[i] == '#')
-			quote = "\"";
-	if (i && value[i - 1] == ' ')
-		quote = "\"";
-
-	strbuf_addf(&sb, "\t%.*s = %s",
-		    length, key + store.baselen + 1, quote);
-
-	for (i = 0; value[i]; i++)
-		switch (value[i]) {
-		case '\n':
-			strbuf_addstr(&sb, "\\n");
-			break;
-		case '\t':
-			strbuf_addstr(&sb, "\\t");
-			break;
-		case '"':
-		case '\\':
-			strbuf_addch(&sb, '\\');
-		default:
-			strbuf_addch(&sb, value[i]);
-			break;
-		}
-	strbuf_addf(&sb, "%s\n", quote);
-
-	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
-	strbuf_release(&sb);
-
-	return success;
-}
-
-static ssize_t find_beginning_of_line(const char* contents, size_t size,
-	size_t offset_, int* found_bracket)
-{
-	size_t equal_offset = size, bracket_offset = size;
-	ssize_t offset;
-
-contline:
-	for (offset = offset_-2; offset > 0
-			&& contents[offset] != '\n'; offset--)
-		switch (contents[offset]) {
-			case '=': equal_offset = offset; break;
-			case ']': bracket_offset = offset; break;
-		}
-	if (offset > 0 && contents[offset-1] == '\\') {
-		offset_ = offset;
-		goto contline;
-	}
-	if (bracket_offset < equal_offset) {
-		*found_bracket = 1;
-		offset = bracket_offset+1;
-	} else
-		offset++;
-
-	return offset;
-}
-
-int perf_config_set(const char* key, const char* value)
-{
-	return perf_config_set_multivar(key, value, NULL, 0);
-}
-
-/*
- * If value==NULL, unset in (remove from) config,
- * if value_regex!=NULL, disregard key/value pairs where value does not match.
- * if multi_replace==0, nothing, or only one matching key/value is replaced,
- *     else all matching key/values (regardless how many) are removed,
- *     before the new pair is written.
- *
- * Returns 0 on success.
- *
- * This function does this:
- *
- * - it locks the config file by creating ".perf/config.lock"
- *
- * - it then parses the config using store_aux() as validator to find
- *   the position on the key/value pair to replace. If it is to be unset,
- *   it must be found exactly once.
- *
- * - the config file is mmap()ed and the part before the match (if any) is
- *   written to the lock file, then the changed part and the rest.
- *
- * - the config file is removed and the lock file rename()d to it.
- *
- */
-int perf_config_set_multivar(const char* key, const char* value,
-	const char* value_regex, int multi_replace)
-{
-	int i, dot;
-	int fd = -1, in_fd;
-	int ret = 0;
-	char* config_filename;
-	const char* last_dot = strrchr(key, '.');
-
-	if (config_exclusive_filename)
-		config_filename = strdup(config_exclusive_filename);
-	else
-		config_filename = perf_pathdup("config");
-
-	/*
-	 * Since "key" actually contains the section name and the real
-	 * key name separated by a dot, we have to know where the dot is.
-	 */
-
-	if (last_dot == NULL) {
-		error("key does not contain a section: %s", key);
-		ret = 2;
-		goto out_free;
-	}
-	store.baselen = last_dot - key;
-
-	store.multi_replace = multi_replace;
-
-	/*
-	 * Validate the key and while at it, lower case it for matching.
-	 */
-	store.key = malloc(strlen(key) + 1);
-	dot = 0;
-	for (i = 0; key[i]; i++) {
-		unsigned char c = key[i];
-		if (c == '.')
-			dot = 1;
-		/* Leave the extended basename untouched.. */
-		if (!dot || i > store.baselen) {
-			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
-				error("invalid key: %s", key);
-				free(store.key);
-				ret = 1;
-				goto out_free;
-			}
-			c = tolower(c);
-		} else if (c == '\n') {
-			error("invalid key (newline): %s", key);
-			free(store.key);
-			ret = 1;
-			goto out_free;
-		}
-		store.key[i] = c;
-	}
-	store.key[i] = 0;
-
-	/*
-	 * If .perf/config does not exist yet, write a minimal version.
-	 */
-	in_fd = open(config_filename, O_RDONLY);
-	if ( in_fd < 0 ) {
-		free(store.key);
-
-		if ( ENOENT != errno ) {
-			error("opening %s: %s", config_filename,
-			      strerror(errno));
-			ret = 3; /* same as "invalid config file" */
-			goto out_free;
-		}
-		/* if nothing to unset, error out */
-		if (value == NULL) {
-			ret = 5;
-			goto out_free;
-		}
-
-		store.key = (char*)key;
-		if (!store_write_section(fd, key) ||
-		    !store_write_pair(fd, key, value))
-			goto write_err_out;
-	} else {
-		struct stat st;
-		char* contents;
-		size_t contents_sz, copy_begin, copy_end;
-		int i, new_line = 0;
-
-		if (value_regex == NULL)
-			store.value_regex = NULL;
-		else {
-			if (value_regex[0] == '!') {
-				store.do_not_match = 1;
-				value_regex++;
-			} else
-				store.do_not_match = 0;
-
-			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
-			if (regcomp(store.value_regex, value_regex,
-					REG_EXTENDED)) {
-				error("invalid pattern: %s", value_regex);
-				free(store.value_regex);
-				ret = 6;
-				goto out_free;
-			}
-		}
-
-		store.offset[0] = 0;
-		store.state = START;
-		store.seen = 0;
-
-		/*
-		 * After this, store.offset will contain the *end* offset
-		 * of the last match, or remain at 0 if no match was found.
-		 * As a side effect, we make sure to transform only a valid
-		 * existing config file.
-		 */
-		if (perf_config_from_file(store_aux, config_filename, NULL)) {
-			error("invalid config file %s", config_filename);
-			free(store.key);
-			if (store.value_regex != NULL) {
-				regfree(store.value_regex);
-				free(store.value_regex);
-			}
-			ret = 3;
-			goto out_free;
-		}
-
-		free(store.key);
-		if (store.value_regex != NULL) {
-			regfree(store.value_regex);
-			free(store.value_regex);
-		}
-
-		/* if nothing to unset, or too many matches, error out */
-		if ((store.seen == 0 && value == NULL) ||
-				(store.seen > 1 && multi_replace == 0)) {
-			ret = 5;
-			goto out_free;
-		}
-
-		fstat(in_fd, &st);
-		contents_sz = xsize_t(st.st_size);
-		contents = mmap(NULL, contents_sz, PROT_READ,
-			MAP_PRIVATE, in_fd, 0);
-		close(in_fd);
-
-		if (store.seen == 0)
-			store.seen = 1;
-
-		for (i = 0, copy_begin = 0; i < store.seen; i++) {
-			if (store.offset[i] == 0) {
-				store.offset[i] = copy_end = contents_sz;
-			} else if (store.state != KEY_SEEN) {
-				copy_end = store.offset[i];
-			} else
-				copy_end = find_beginning_of_line(
-					contents, contents_sz,
-					store.offset[i]-2, &new_line);
-
-			if (copy_end > 0 && contents[copy_end-1] != '\n')
-				new_line = 1;
-
-			/* write the first part of the config */
-			if (copy_end > copy_begin) {
-				if (write_in_full(fd, contents + copy_begin,
-						  copy_end - copy_begin) <
-				    copy_end - copy_begin)
-					goto write_err_out;
-				if (new_line &&
-				    write_in_full(fd, "\n", 1) != 1)
-					goto write_err_out;
-			}
-			copy_begin = store.offset[i];
-		}
-
-		/* write the pair (value == NULL means unset) */
-		if (value != NULL) {
-			if (store.state == START) {
-				if (!store_write_section(fd, key))
-					goto write_err_out;
-			}
-			if (!store_write_pair(fd, key, value))
-				goto write_err_out;
-		}
-
-		/* write the rest of the config */
-		if (copy_begin < contents_sz)
-			if (write_in_full(fd, contents + copy_begin,
-					  contents_sz - copy_begin) <
-			    contents_sz - copy_begin)
-				goto write_err_out;
-
-		munmap(contents, contents_sz);
-	}
-
-	ret = 0;
-
-out_free:
-	free(config_filename);
-	return ret;
-
-write_err_out:
-	goto out_free;
-
-}
-
-/*
- * Call this to report error for your variable that should not
- * get a boolean value (i.e. "[my] var" means "true").
- */
-int config_error_nonbool(const char *var)
-{
-	return error("Missing value for '%s'", var);
-}
diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/ctype.c
deleted file mode 100644
index b90ec00..0000000
--- a/Documentation/perf_counter/ctype.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Sane locale-independent, ASCII ctype.
- *
- * No surprises, and works with signed and unsigned chars.
- */
-#include "cache.h"
-
-enum {
-	S = GIT_SPACE,
-	A = GIT_ALPHA,
-	D = GIT_DIGIT,
-	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
-	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
-};
-
-unsigned char sane_ctype[256] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
-	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
-	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
-	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
-	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
-	/* Nothing in the 128.. range */
-};
diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/exec_cmd.c
deleted file mode 100644
index d392922..0000000
--- a/Documentation/perf_counter/exec_cmd.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include "cache.h"
-#include "exec_cmd.h"
-#include "quote.h"
-#define MAX_ARGS	32
-
-extern char **environ;
-static const char *argv_exec_path;
-static const char *argv0_path;
-
-const char *system_path(const char *path)
-{
-#ifdef RUNTIME_PREFIX
-	static const char *prefix;
-#else
-	static const char *prefix = PREFIX;
-#endif
-	struct strbuf d = STRBUF_INIT;
-
-	if (is_absolute_path(path))
-		return path;
-
-#ifdef RUNTIME_PREFIX
-	assert(argv0_path);
-	assert(is_absolute_path(argv0_path));
-
-	if (!prefix &&
-	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
-	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
-	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
-		prefix = PREFIX;
-		fprintf(stderr, "RUNTIME_PREFIX requested, "
-				"but prefix computation failed.  "
-				"Using static fallback '%s'.\n", prefix);
-	}
-#endif
-
-	strbuf_addf(&d, "%s/%s", prefix, path);
-	path = strbuf_detach(&d, NULL);
-	return path;
-}
-
-const char *perf_extract_argv0_path(const char *argv0)
-{
-	const char *slash;
-
-	if (!argv0 || !*argv0)
-		return NULL;
-	slash = argv0 + strlen(argv0);
-
-	while (argv0 <= slash && !is_dir_sep(*slash))
-		slash--;
-
-	if (slash >= argv0) {
-		argv0_path = strndup(argv0, slash - argv0);
-		return slash + 1;
-	}
-
-	return argv0;
-}
-
-void perf_set_argv_exec_path(const char *exec_path)
-{
-	argv_exec_path = exec_path;
-	/*
-	 * Propagate this setting to external programs.
-	 */
-	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
-}
-
-
-/* Returns the highest-priority, location to look for perf programs. */
-const char *perf_exec_path(void)
-{
-	const char *env;
-
-	if (argv_exec_path)
-		return argv_exec_path;
-
-	env = getenv(EXEC_PATH_ENVIRONMENT);
-	if (env && *env) {
-		return env;
-	}
-
-	return system_path(PERF_EXEC_PATH);
-}
-
-static void add_path(struct strbuf *out, const char *path)
-{
-	if (path && *path) {
-		if (is_absolute_path(path))
-			strbuf_addstr(out, path);
-		else
-			strbuf_addstr(out, make_nonrelative_path(path));
-
-		strbuf_addch(out, PATH_SEP);
-	}
-}
-
-void setup_path(void)
-{
-	const char *old_path = getenv("PATH");
-	struct strbuf new_path = STRBUF_INIT;
-
-	add_path(&new_path, perf_exec_path());
-	add_path(&new_path, argv0_path);
-
-	if (old_path)
-		strbuf_addstr(&new_path, old_path);
-	else
-		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
-
-	setenv("PATH", new_path.buf, 1);
-
-	strbuf_release(&new_path);
-}
-
-const char **prepare_perf_cmd(const char **argv)
-{
-	int argc;
-	const char **nargv;
-
-	for (argc = 0; argv[argc]; argc++)
-		; /* just counting */
-	nargv = malloc(sizeof(*nargv) * (argc + 2));
-
-	nargv[0] = "perf";
-	for (argc = 0; argv[argc]; argc++)
-		nargv[argc + 1] = argv[argc];
-	nargv[argc + 1] = NULL;
-	return nargv;
-}
-
-int execv_perf_cmd(const char **argv) {
-	const char **nargv = prepare_perf_cmd(argv);
-
-	/* execvp() can only ever return if it fails */
-	execvp("perf", (char **)nargv);
-
-	free(nargv);
-	return -1;
-}
-
-
-int execl_perf_cmd(const char *cmd,...)
-{
-	int argc;
-	const char *argv[MAX_ARGS + 1];
-	const char *arg;
-	va_list param;
-
-	va_start(param, cmd);
-	argv[0] = cmd;
-	argc = 1;
-	while (argc < MAX_ARGS) {
-		arg = argv[argc++] = va_arg(param, char *);
-		if (!arg)
-			break;
-	}
-	va_end(param);
-	if (MAX_ARGS <= argc)
-		return error("too many args to run %s", cmd);
-
-	argv[argc] = NULL;
-	return execv_perf_cmd(argv);
-}
diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/exec_cmd.h
deleted file mode 100644
index effe25e..0000000
--- a/Documentation/perf_counter/exec_cmd.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef PERF_EXEC_CMD_H
-#define PERF_EXEC_CMD_H
-
-extern void perf_set_argv_exec_path(const char *exec_path);
-extern const char *perf_extract_argv0_path(const char *path);
-extern const char *perf_exec_path(void);
-extern void setup_path(void);
-extern const char **prepare_perf_cmd(const char **argv);
-extern int execv_perf_cmd(const char **argv); /* NULL terminated */
-extern int execl_perf_cmd(const char *cmd, ...);
-extern const char *system_path(const char *path);
-
-#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh
deleted file mode 100755
index f06f6fd..0000000
--- a/Documentation/perf_counter/generate-cmdlist.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-echo "/* Automatically generated by $0 */
-struct cmdname_help
-{
-    char name[16];
-    char help[80];
-};
-
-static struct cmdname_help common_cmds[] = {"
-
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "};"
diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/help.c
deleted file mode 100644
index ec01167..0000000
--- a/Documentation/perf_counter/help.c
+++ /dev/null
@@ -1,366 +0,0 @@
-#include "cache.h"
-#include "builtin.h"
-#include "exec_cmd.h"
-#include "levenshtein.h"
-#include "help.h"
-
-/* most GUI terminals set COLUMNS (although some don't export it) */
-static int term_columns(void)
-{
-	char *col_string = getenv("COLUMNS");
-	int n_cols;
-
-	if (col_string && (n_cols = atoi(col_string)) > 0)
-		return n_cols;
-
-#ifdef TIOCGWINSZ
-	{
-		struct winsize ws;
-		if (!ioctl(1, TIOCGWINSZ, &ws)) {
-			if (ws.ws_col)
-				return ws.ws_col;
-		}
-	}
-#endif
-
-	return 80;
-}
-
-void add_cmdname(struct cmdnames *cmds, const char *name, int len)
-{
-	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
-
-	ent->len = len;
-	memcpy(ent->name, name, len);
-	ent->name[len] = 0;
-
-	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
-	cmds->names[cmds->cnt++] = ent;
-}
-
-static void clean_cmdnames(struct cmdnames *cmds)
-{
-	int i;
-	for (i = 0; i < cmds->cnt; ++i)
-		free(cmds->names[i]);
-	free(cmds->names);
-	cmds->cnt = 0;
-	cmds->alloc = 0;
-}
-
-static int cmdname_compare(const void *a_, const void *b_)
-{
-	struct cmdname *a = *(struct cmdname **)a_;
-	struct cmdname *b = *(struct cmdname **)b_;
-	return strcmp(a->name, b->name);
-}
-
-static void uniq(struct cmdnames *cmds)
-{
-	int i, j;
-
-	if (!cmds->cnt)
-		return;
-
-	for (i = j = 1; i < cmds->cnt; i++)
-		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
-			cmds->names[j++] = cmds->names[i];
-
-	cmds->cnt = j;
-}
-
-void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
-{
-	int ci, cj, ei;
-	int cmp;
-
-	ci = cj = ei = 0;
-	while (ci < cmds->cnt && ei < excludes->cnt) {
-		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
-		if (cmp < 0)
-			cmds->names[cj++] = cmds->names[ci++];
-		else if (cmp == 0)
-			ci++, ei++;
-		else if (cmp > 0)
-			ei++;
-	}
-
-	while (ci < cmds->cnt)
-		cmds->names[cj++] = cmds->names[ci++];
-
-	cmds->cnt = cj;
-}
-
-static void pretty_print_string_list(struct cmdnames *cmds, int longest)
-{
-	int cols = 1, rows;
-	int space = longest + 1; /* min 1 SP between words */
-	int max_cols = term_columns() - 1; /* don't print *on* the edge */
-	int i, j;
-
-	if (space < max_cols)
-		cols = max_cols / space;
-	rows = (cmds->cnt + cols - 1) / cols;
-
-	for (i = 0; i < rows; i++) {
-		printf("  ");
-
-		for (j = 0; j < cols; j++) {
-			int n = j * rows + i;
-			int size = space;
-			if (n >= cmds->cnt)
-				break;
-			if (j == cols-1 || n + rows >= cmds->cnt)
-				size = 1;
-			printf("%-*s", size, cmds->names[n]->name);
-		}
-		putchar('\n');
-	}
-}
-
-static int is_executable(const char *name)
-{
-	struct stat st;
-
-	if (stat(name, &st) || /* stat, not lstat */
-	    !S_ISREG(st.st_mode))
-		return 0;
-
-#ifdef __MINGW32__
-	/* cannot trust the executable bit, peek into the file instead */
-	char buf[3] = { 0 };
-	int n;
-	int fd = open(name, O_RDONLY);
-	st.st_mode &= ~S_IXUSR;
-	if (fd >= 0) {
-		n = read(fd, buf, 2);
-		if (n == 2)
-			/* DOS executables start with "MZ" */
-			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
-				st.st_mode |= S_IXUSR;
-		close(fd);
-	}
-#endif
-	return st.st_mode & S_IXUSR;
-}
-
-static void list_commands_in_dir(struct cmdnames *cmds,
-					 const char *path,
-					 const char *prefix)
-{
-	int prefix_len;
-	DIR *dir = opendir(path);
-	struct dirent *de;
-	struct strbuf buf = STRBUF_INIT;
-	int len;
-
-	if (!dir)
-		return;
-	if (!prefix)
-		prefix = "perf-";
-	prefix_len = strlen(prefix);
-
-	strbuf_addf(&buf, "%s/", path);
-	len = buf.len;
-
-	while ((de = readdir(dir)) != NULL) {
-		int entlen;
-
-		if (prefixcmp(de->d_name, prefix))
-			continue;
-
-		strbuf_setlen(&buf, len);
-		strbuf_addstr(&buf, de->d_name);
-		if (!is_executable(buf.buf))
-			continue;
-
-		entlen = strlen(de->d_name) - prefix_len;
-		if (has_extension(de->d_name, ".exe"))
-			entlen -= 4;
-
-		add_cmdname(cmds, de->d_name + prefix_len, entlen);
-	}
-	closedir(dir);
-	strbuf_release(&buf);
-}
-
-void load_command_list(const char *prefix,
-		struct cmdnames *main_cmds,
-		struct cmdnames *other_cmds)
-{
-	const char *env_path = getenv("PATH");
-	const char *exec_path = perf_exec_path();
-
-	if (exec_path) {
-		list_commands_in_dir(main_cmds, exec_path, prefix);
-		qsort(main_cmds->names, main_cmds->cnt,
-		      sizeof(*main_cmds->names), cmdname_compare);
-		uniq(main_cmds);
-	}
-
-	if (env_path) {
-		char *paths, *path, *colon;
-		path = paths = strdup(env_path);
-		while (1) {
-			if ((colon = strchr(path, PATH_SEP)))
-				*colon = 0;
-			if (!exec_path || strcmp(path, exec_path))
-				list_commands_in_dir(other_cmds, path, prefix);
-
-			if (!colon)
-				break;
-			path = colon + 1;
-		}
-		free(paths);
-
-		qsort(other_cmds->names, other_cmds->cnt,
-		      sizeof(*other_cmds->names), cmdname_compare);
-		uniq(other_cmds);
-	}
-	exclude_cmds(other_cmds, main_cmds);
-}
-
-void list_commands(const char *title, struct cmdnames *main_cmds,
-		   struct cmdnames *other_cmds)
-{
-	int i, longest = 0;
-
-	for (i = 0; i < main_cmds->cnt; i++)
-		if (longest < main_cmds->names[i]->len)
-			longest = main_cmds->names[i]->len;
-	for (i = 0; i < other_cmds->cnt; i++)
-		if (longest < other_cmds->names[i]->len)
-			longest = other_cmds->names[i]->len;
-
-	if (main_cmds->cnt) {
-		const char *exec_path = perf_exec_path();
-		printf("available %s in '%s'\n", title, exec_path);
-		printf("----------------");
-		mput_char('-', strlen(title) + strlen(exec_path));
-		putchar('\n');
-		pretty_print_string_list(main_cmds, longest);
-		putchar('\n');
-	}
-
-	if (other_cmds->cnt) {
-		printf("%s available from elsewhere on your $PATH\n", title);
-		printf("---------------------------------------");
-		mput_char('-', strlen(title));
-		putchar('\n');
-		pretty_print_string_list(other_cmds, longest);
-		putchar('\n');
-	}
-}
-
-int is_in_cmdlist(struct cmdnames *c, const char *s)
-{
-	int i;
-	for (i = 0; i < c->cnt; i++)
-		if (!strcmp(s, c->names[i]->name))
-			return 1;
-	return 0;
-}
-
-static int autocorrect;
-static struct cmdnames aliases;
-
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
-{
-	if (!strcmp(var, "help.autocorrect"))
-		autocorrect = perf_config_int(var,value);
-	/* Also use aliases for command lookup */
-	if (!prefixcmp(var, "alias."))
-		add_cmdname(&aliases, var + 6, strlen(var + 6));
-
-	return perf_default_config(var, value, cb);
-}
-
-static int levenshtein_compare(const void *p1, const void *p2)
-{
-	const struct cmdname *const *c1 = p1, *const *c2 = p2;
-	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
-	int l1 = (*c1)->len;
-	int l2 = (*c2)->len;
-	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
-}
-
-static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
-{
-	int i;
-	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
-
-	for (i = 0; i < old->cnt; i++)
-		cmds->names[cmds->cnt++] = old->names[i];
-	free(old->names);
-	old->cnt = 0;
-	old->names = NULL;
-}
-
-const char *help_unknown_cmd(const char *cmd)
-{
-	int i, n, best_similarity = 0;
-	struct cmdnames main_cmds, other_cmds;
-
-	memset(&main_cmds, 0, sizeof(main_cmds));
-	memset(&other_cmds, 0, sizeof(main_cmds));
-	memset(&aliases, 0, sizeof(aliases));
-
-	perf_config(perf_unknown_cmd_config, NULL);
-
-	load_command_list("perf-", &main_cmds, &other_cmds);
-
-	add_cmd_list(&main_cmds, &aliases);
-	add_cmd_list(&main_cmds, &other_cmds);
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(main_cmds.names), cmdname_compare);
-	uniq(&main_cmds);
-
-	/* This reuses cmdname->len for similarity index */
-	for (i = 0; i < main_cmds.cnt; ++i)
-		main_cmds.names[i]->len =
-			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
-
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(*main_cmds.names), levenshtein_compare);
-
-	if (!main_cmds.cnt)
-		die ("Uh oh. Your system reports no Git commands at all.");
-
-	best_similarity = main_cmds.names[0]->len;
-	n = 1;
-	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-		++n;
-	if (autocorrect && n == 1) {
-		const char *assumed = main_cmds.names[0]->name;
-		main_cmds.names[0] = NULL;
-		clean_cmdnames(&main_cmds);
-		fprintf(stderr, "WARNING: You called a Git program named '%s', "
-			"which does not exist.\n"
-			"Continuing under the assumption that you meant '%s'\n",
-			cmd, assumed);
-		if (autocorrect > 0) {
-			fprintf(stderr, "in %0.1f seconds automatically...\n",
-				(float)autocorrect/10.0);
-			poll(NULL, 0, autocorrect * 100);
-		}
-		return assumed;
-	}
-
-	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
-
-	if (best_similarity < 6) {
-		fprintf(stderr, "\nDid you mean %s?\n",
-			n < 2 ? "this": "one of these");
-
-		for (i = 0; i < n; i++)
-			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
-	}
-
-	exit(1);
-}
-
-int cmd_version(int argc, const char **argv, const char *prefix)
-{
-	printf("perf version %s\n", perf_version_string);
-	return 0;
-}
diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/help.h
deleted file mode 100644
index 56bc154..0000000
--- a/Documentation/perf_counter/help.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef HELP_H
-#define HELP_H
-
-struct cmdnames {
-	int alloc;
-	int cnt;
-	struct cmdname {
-		size_t len; /* also used for similarity index in help.c */
-		char name[FLEX_ARRAY];
-	} **names;
-};
-
-static inline void mput_char(char c, unsigned int num)
-{
-	while(num--)
-		putchar(c);
-}
-
-void load_command_list(const char *prefix,
-		struct cmdnames *main_cmds,
-		struct cmdnames *other_cmds);
-void add_cmdname(struct cmdnames *cmds, const char *name, int len);
-/* Here we require that excludes is a sorted list. */
-void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
-int is_in_cmdlist(struct cmdnames *c, const char *s);
-void list_commands(const char *title, struct cmdnames *main_cmds,
-		   struct cmdnames *other_cmds);
-
-#endif /* HELP_H */
diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/levenshtein.c
deleted file mode 100644
index e521d15..0000000
--- a/Documentation/perf_counter/levenshtein.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "cache.h"
-#include "levenshtein.h"
-
-/*
- * This function implements the Damerau-Levenshtein algorithm to
- * calculate a distance between strings.
- *
- * Basically, it says how many letters need to be swapped, substituted,
- * deleted from, or added to string1, at least, to get string2.
- *
- * The idea is to build a distance matrix for the substrings of both
- * strings.  To avoid a large space complexity, only the last three rows
- * are kept in memory (if swaps had the same or higher cost as one deletion
- * plus one insertion, only two rows would be needed).
- *
- * At any stage, "i + 1" denotes the length of the current substring of
- * string1 that the distance is calculated for.
- *
- * row2 holds the current row, row1 the previous row (i.e. for the substring
- * of string1 of length "i"), and row0 the row before that.
- *
- * In other words, at the start of the big loop, row2[j + 1] contains the
- * Damerau-Levenshtein distance between the substring of string1 of length
- * "i" and the substring of string2 of length "j + 1".
- *
- * All the big loop does is determine the partial minimum-cost paths.
- *
- * It does so by calculating the costs of the path ending in characters
- * i (in string1) and j (in string2), respectively, given that the last
- * operation is a substition, a swap, a deletion, or an insertion.
- *
- * This implementation allows the costs to be weighted:
- *
- * - w (as in "sWap")
- * - s (as in "Substitution")
- * - a (for insertion, AKA "Add")
- * - d (as in "Deletion")
- *
- * Note that this algorithm calculates a distance _iff_ d == a.
- */
-int levenshtein(const char *string1, const char *string2,
-		int w, int s, int a, int d)
-{
-	int len1 = strlen(string1), len2 = strlen(string2);
-	int *row0 = malloc(sizeof(int) * (len2 + 1));
-	int *row1 = malloc(sizeof(int) * (len2 + 1));
-	int *row2 = malloc(sizeof(int) * (len2 + 1));
-	int i, j;
-
-	for (j = 0; j <= len2; j++)
-		row1[j] = j * a;
-	for (i = 0; i < len1; i++) {
-		int *dummy;
-
-		row2[0] = (i + 1) * d;
-		for (j = 0; j < len2; j++) {
-			/* substitution */
-			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
-			/* swap */
-			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
-					string1[i] == string2[j - 1] &&
-					row2[j + 1] > row0[j - 1] + w)
-				row2[j + 1] = row0[j - 1] + w;
-			/* deletion */
-			if (row2[j + 1] > row1[j + 1] + d)
-				row2[j + 1] = row1[j + 1] + d;
-			/* insertion */
-			if (row2[j + 1] > row2[j] + a)
-				row2[j + 1] = row2[j] + a;
-		}
-
-		dummy = row0;
-		row0 = row1;
-		row1 = row2;
-		row2 = dummy;
-	}
-
-	i = row1[len2];
-	free(row0);
-	free(row1);
-	free(row2);
-
-	return i;
-}
diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/levenshtein.h
deleted file mode 100644
index 0173abe..0000000
--- a/Documentation/perf_counter/levenshtein.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef LEVENSHTEIN_H
-#define LEVENSHTEIN_H
-
-int levenshtein(const char *string1, const char *string2,
-	int swap_penalty, int substition_penalty,
-	int insertion_penalty, int deletion_penalty);
-
-#endif
diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/parse-options.c
deleted file mode 100644
index 7464f34..0000000
--- a/Documentation/perf_counter/parse-options.c
+++ /dev/null
@@ -1,495 +0,0 @@
-#include "util.h"
-#include "parse-options.h"
-#include "cache.h"
-
-#define OPT_SHORT 1
-#define OPT_UNSET 2
-
-static int opterror(const struct option *opt, const char *reason, int flags)
-{
-	if (flags & OPT_SHORT)
-		return error("switch `%c' %s", opt->short_name, reason);
-	if (flags & OPT_UNSET)
-		return error("option `no-%s' %s", opt->long_name, reason);
-	return error("option `%s' %s", opt->long_name, reason);
-}
-
-static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
-		   int flags, const char **arg)
-{
-	if (p->opt) {
-		*arg = p->opt;
-		p->opt = NULL;
-	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
-		*arg = (const char *)opt->defval;
-	} else if (p->argc > 1) {
-		p->argc--;
-		*arg = *++p->argv;
-	} else
-		return opterror(opt, "requires a value", flags);
-	return 0;
-}
-
-static int get_value(struct parse_opt_ctx_t *p,
-		     const struct option *opt, int flags)
-{
-	const char *s, *arg;
-	const int unset = flags & OPT_UNSET;
-
-	if (unset && p->opt)
-		return opterror(opt, "takes no value", flags);
-	if (unset && (opt->flags & PARSE_OPT_NONEG))
-		return opterror(opt, "isn't available", flags);
-
-	if (!(flags & OPT_SHORT) && p->opt) {
-		switch (opt->type) {
-		case OPTION_CALLBACK:
-			if (!(opt->flags & PARSE_OPT_NOARG))
-				break;
-			/* FALLTHROUGH */
-		case OPTION_BOOLEAN:
-		case OPTION_BIT:
-		case OPTION_SET_INT:
-		case OPTION_SET_PTR:
-			return opterror(opt, "takes no value", flags);
-		default:
-			break;
-		}
-	}
-
-	switch (opt->type) {
-	case OPTION_BIT:
-		if (unset)
-			*(int *)opt->value &= ~opt->defval;
-		else
-			*(int *)opt->value |= opt->defval;
-		return 0;
-
-	case OPTION_BOOLEAN:
-		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
-		return 0;
-
-	case OPTION_SET_INT:
-		*(int *)opt->value = unset ? 0 : opt->defval;
-		return 0;
-
-	case OPTION_SET_PTR:
-		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
-		return 0;
-
-	case OPTION_STRING:
-		if (unset)
-			*(const char **)opt->value = NULL;
-		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
-			*(const char **)opt->value = (const char *)opt->defval;
-		else
-			return get_arg(p, opt, flags, (const char **)opt->value);
-		return 0;
-
-	case OPTION_CALLBACK:
-		if (unset)
-			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
-		if (opt->flags & PARSE_OPT_NOARG)
-			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
-			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
-		if (get_arg(p, opt, flags, &arg))
-			return -1;
-		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
-
-	case OPTION_INTEGER:
-		if (unset) {
-			*(int *)opt->value = 0;
-			return 0;
-		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
-			*(int *)opt->value = opt->defval;
-			return 0;
-		}
-		if (get_arg(p, opt, flags, &arg))
-			return -1;
-		*(int *)opt->value = strtol(arg, (char **)&s, 10);
-		if (*s)
-			return opterror(opt, "expects a numerical value", flags);
-		return 0;
-
-	default:
-		die("should not happen, someone must be hit on the forehead");
-	}
-}
-
-static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
-{
-	for (; options->type != OPTION_END; options++) {
-		if (options->short_name == *p->opt) {
-			p->opt = p->opt[1] ? p->opt + 1 : NULL;
-			return get_value(p, options, OPT_SHORT);
-		}
-	}
-	return -2;
-}
-
-static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
-                          const struct option *options)
-{
-	const char *arg_end = strchr(arg, '=');
-	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
-	int abbrev_flags = 0, ambiguous_flags = 0;
-
-	if (!arg_end)
-		arg_end = arg + strlen(arg);
-
-	for (; options->type != OPTION_END; options++) {
-		const char *rest;
-		int flags = 0;
-
-		if (!options->long_name)
-			continue;
-
-		rest = skip_prefix(arg, options->long_name);
-		if (options->type == OPTION_ARGUMENT) {
-			if (!rest)
-				continue;
-			if (*rest == '=')
-				return opterror(options, "takes no value", flags);
-			if (*rest)
-				continue;
-			p->out[p->cpidx++] = arg - 2;
-			return 0;
-		}
-		if (!rest) {
-			/* abbreviated? */
-			if (!strncmp(options->long_name, arg, arg_end - arg)) {
-is_abbreviated:
-				if (abbrev_option) {
-					/*
-					 * If this is abbreviated, it is
-					 * ambiguous. So when there is no
-					 * exact match later, we need to
-					 * error out.
-					 */
-					ambiguous_option = abbrev_option;
-					ambiguous_flags = abbrev_flags;
-				}
-				if (!(flags & OPT_UNSET) && *arg_end)
-					p->opt = arg_end + 1;
-				abbrev_option = options;
-				abbrev_flags = flags;
-				continue;
-			}
-			/* negated and abbreviated very much? */
-			if (!prefixcmp("no-", arg)) {
-				flags |= OPT_UNSET;
-				goto is_abbreviated;
-			}
-			/* negated? */
-			if (strncmp(arg, "no-", 3))
-				continue;
-			flags |= OPT_UNSET;
-			rest = skip_prefix(arg + 3, options->long_name);
-			/* abbreviated and negated? */
-			if (!rest && !prefixcmp(options->long_name, arg + 3))
-				goto is_abbreviated;
-			if (!rest)
-				continue;
-		}
-		if (*rest) {
-			if (*rest != '=')
-				continue;
-			p->opt = rest + 1;
-		}
-		return get_value(p, options, flags);
-	}
-
-	if (ambiguous_option)
-		return error("Ambiguous option: %s "
-			"(could be --%s%s or --%s%s)",
-			arg,
-			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
-			ambiguous_option->long_name,
-			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
-			abbrev_option->long_name);
-	if (abbrev_option)
-		return get_value(p, abbrev_option, abbrev_flags);
-	return -2;
-}
-
-static void check_typos(const char *arg, const struct option *options)
-{
-	if (strlen(arg) < 3)
-		return;
-
-	if (!prefixcmp(arg, "no-")) {
-		error ("did you mean `--%s` (with two dashes ?)", arg);
-		exit(129);
-	}
-
-	for (; options->type != OPTION_END; options++) {
-		if (!options->long_name)
-			continue;
-		if (!prefixcmp(options->long_name, arg)) {
-			error ("did you mean `--%s` (with two dashes ?)", arg);
-			exit(129);
-		}
-	}
-}
-
-void parse_options_start(struct parse_opt_ctx_t *ctx,
-			 int argc, const char **argv, int flags)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	ctx->argc = argc - 1;
-	ctx->argv = argv + 1;
-	ctx->out  = argv;
-	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
-	ctx->flags = flags;
-	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
-	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
-		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
-}
-
-static int usage_with_options_internal(const char * const *,
-				       const struct option *, int);
-
-int parse_options_step(struct parse_opt_ctx_t *ctx,
-		       const struct option *options,
-		       const char * const usagestr[])
-{
-	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
-
-	/* we must reset ->opt, unknown short option leave it dangling */
-	ctx->opt = NULL;
-
-	for (; ctx->argc; ctx->argc--, ctx->argv++) {
-		const char *arg = ctx->argv[0];
-
-		if (*arg != '-' || !arg[1]) {
-			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
-				break;
-			ctx->out[ctx->cpidx++] = ctx->argv[0];
-			continue;
-		}
-
-		if (arg[1] != '-') {
-			ctx->opt = arg + 1;
-			if (internal_help && *ctx->opt == 'h')
-				return parse_options_usage(usagestr, options);
-			switch (parse_short_opt(ctx, options)) {
-			case -1:
-				return parse_options_usage(usagestr, options);
-			case -2:
-				goto unknown;
-			}
-			if (ctx->opt)
-				check_typos(arg + 1, options);
-			while (ctx->opt) {
-				if (internal_help && *ctx->opt == 'h')
-					return parse_options_usage(usagestr, options);
-				switch (parse_short_opt(ctx, options)) {
-				case -1:
-					return parse_options_usage(usagestr, options);
-				case -2:
-					/* fake a short option thing to hide the fact that we may have
-					 * started to parse aggregated stuff
-					 *
-					 * This is leaky, too bad.
-					 */
-					ctx->argv[0] = strdup(ctx->opt - 1);
-					*(char *)ctx->argv[0] = '-';
-					goto unknown;
-				}
-			}
-			continue;
-		}
-
-		if (!arg[2]) { /* "--" */
-			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
-				ctx->argc--;
-				ctx->argv++;
-			}
-			break;
-		}
-
-		if (internal_help && !strcmp(arg + 2, "help-all"))
-			return usage_with_options_internal(usagestr, options, 1);
-		if (internal_help && !strcmp(arg + 2, "help"))
-			return parse_options_usage(usagestr, options);
-		switch (parse_long_opt(ctx, arg + 2, options)) {
-		case -1:
-			return parse_options_usage(usagestr, options);
-		case -2:
-			goto unknown;
-		}
-		continue;
-unknown:
-		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
-			return PARSE_OPT_UNKNOWN;
-		ctx->out[ctx->cpidx++] = ctx->argv[0];
-		ctx->opt = NULL;
-	}
-	return PARSE_OPT_DONE;
-}
-
-int parse_options_end(struct parse_opt_ctx_t *ctx)
-{
-	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
-	ctx->out[ctx->cpidx + ctx->argc] = NULL;
-	return ctx->cpidx + ctx->argc;
-}
-
-int parse_options(int argc, const char **argv, const struct option *options,
-		  const char * const usagestr[], int flags)
-{
-	struct parse_opt_ctx_t ctx;
-
-	parse_options_start(&ctx, argc, argv, flags);
-	switch (parse_options_step(&ctx, options, usagestr)) {
-	case PARSE_OPT_HELP:
-		exit(129);
-	case PARSE_OPT_DONE:
-		break;
-	default: /* PARSE_OPT_UNKNOWN */
-		if (ctx.argv[0][1] == '-') {
-			error("unknown option `%s'", ctx.argv[0] + 2);
-		} else {
-			error("unknown switch `%c'", *ctx.opt);
-		}
-		usage_with_options(usagestr, options);
-	}
-
-	return parse_options_end(&ctx);
-}
-
-#define USAGE_OPTS_WIDTH 24
-#define USAGE_GAP         2
-
-int usage_with_options_internal(const char * const *usagestr,
-				const struct option *opts, int full)
-{
-	if (!usagestr)
-		return PARSE_OPT_HELP;
-
-	fprintf(stderr, "usage: %s\n", *usagestr++);
-	while (*usagestr && **usagestr)
-		fprintf(stderr, "   or: %s\n", *usagestr++);
-	while (*usagestr) {
-		fprintf(stderr, "%s%s\n",
-				**usagestr ? "    " : "",
-				*usagestr);
-		usagestr++;
-	}
-
-	if (opts->type != OPTION_GROUP)
-		fputc('\n', stderr);
-
-	for (; opts->type != OPTION_END; opts++) {
-		size_t pos;
-		int pad;
-
-		if (opts->type == OPTION_GROUP) {
-			fputc('\n', stderr);
-			if (*opts->help)
-				fprintf(stderr, "%s\n", opts->help);
-			continue;
-		}
-		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
-			continue;
-
-		pos = fprintf(stderr, "    ");
-		if (opts->short_name)
-			pos += fprintf(stderr, "-%c", opts->short_name);
-		if (opts->long_name && opts->short_name)
-			pos += fprintf(stderr, ", ");
-		if (opts->long_name)
-			pos += fprintf(stderr, "--%s", opts->long_name);
-
-		switch (opts->type) {
-		case OPTION_ARGUMENT:
-			break;
-		case OPTION_INTEGER:
-			if (opts->flags & PARSE_OPT_OPTARG)
-				if (opts->long_name)
-					pos += fprintf(stderr, "[=<n>]");
-				else
-					pos += fprintf(stderr, "[<n>]");
-			else
-				pos += fprintf(stderr, " <n>");
-			break;
-		case OPTION_CALLBACK:
-			if (opts->flags & PARSE_OPT_NOARG)
-				break;
-			/* FALLTHROUGH */
-		case OPTION_STRING:
-			if (opts->argh) {
-				if (opts->flags & PARSE_OPT_OPTARG)
-					if (opts->long_name)
-						pos += fprintf(stderr, "[=<%s>]", opts->argh);
-					else
-						pos += fprintf(stderr, "[<%s>]", opts->argh);
-				else
-					pos += fprintf(stderr, " <%s>", opts->argh);
-			} else {
-				if (opts->flags & PARSE_OPT_OPTARG)
-					if (opts->long_name)
-						pos += fprintf(stderr, "[=...]");
-					else
-						pos += fprintf(stderr, "[...]");
-				else
-					pos += fprintf(stderr, " ...");
-			}
-			break;
-		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
-			break;
-		}
-
-		if (pos <= USAGE_OPTS_WIDTH)
-			pad = USAGE_OPTS_WIDTH - pos;
-		else {
-			fputc('\n', stderr);
-			pad = USAGE_OPTS_WIDTH;
-		}
-		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
-	}
-	fputc('\n', stderr);
-
-	return PARSE_OPT_HELP;
-}
-
-void usage_with_options(const char * const *usagestr,
-			const struct option *opts)
-{
-	usage_with_options_internal(usagestr, opts, 0);
-	exit(129);
-}
-
-int parse_options_usage(const char * const *usagestr,
-			const struct option *opts)
-{
-	return usage_with_options_internal(usagestr, opts, 0);
-}
-
-
-/*----- some often used options -----*/
-#include "cache.h"
-
-int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
-			   int unset)
-{
-	int *target = opt->value;
-
-	if (unset)
-		/* --no-quiet, --no-verbose */
-		*target = 0;
-	else if (opt->short_name == 'v') {
-		if (*target >= 0)
-			(*target)++;
-		else
-			*target = 1;
-	} else {
-		if (*target <= 0)
-			(*target)--;
-		else
-			*target = -1;
-	}
-	return 0;
-}
diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/parse-options.h
deleted file mode 100644
index a81c7fa..0000000
--- a/Documentation/perf_counter/parse-options.h
+++ /dev/null
@@ -1,172 +0,0 @@
-#ifndef PARSE_OPTIONS_H
-#define PARSE_OPTIONS_H
-
-enum parse_opt_type {
-	/* special types */
-	OPTION_END,
-	OPTION_ARGUMENT,
-	OPTION_GROUP,
-	/* options with no arguments */
-	OPTION_BIT,
-	OPTION_BOOLEAN, /* _INCR would have been a better name */
-	OPTION_SET_INT,
-	OPTION_SET_PTR,
-	/* options with arguments (usually) */
-	OPTION_STRING,
-	OPTION_INTEGER,
-	OPTION_CALLBACK,
-};
-
-enum parse_opt_flags {
-	PARSE_OPT_KEEP_DASHDASH = 1,
-	PARSE_OPT_STOP_AT_NON_OPTION = 2,
-	PARSE_OPT_KEEP_ARGV0 = 4,
-	PARSE_OPT_KEEP_UNKNOWN = 8,
-	PARSE_OPT_NO_INTERNAL_HELP = 16,
-};
-
-enum parse_opt_option_flags {
-	PARSE_OPT_OPTARG  = 1,
-	PARSE_OPT_NOARG   = 2,
-	PARSE_OPT_NONEG   = 4,
-	PARSE_OPT_HIDDEN  = 8,
-	PARSE_OPT_LASTARG_DEFAULT = 16,
-};
-
-struct option;
-typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
-
-/*
- * `type`::
- *   holds the type of the option, you must have an OPTION_END last in your
- *   array.
- *
- * `short_name`::
- *   the character to use as a short option name, '\0' if none.
- *
- * `long_name`::
- *   the long option name, without the leading dashes, NULL if none.
- *
- * `value`::
- *   stores pointers to the values to be filled.
- *
- * `argh`::
- *   token to explain the kind of argument this option wants. Keep it
- *   homogenous across the repository.
- *
- * `help`::
- *   the short help associated to what the option does.
- *   Must never be NULL (except for OPTION_END).
- *   OPTION_GROUP uses this pointer to store the group header.
- *
- * `flags`::
- *   mask of parse_opt_option_flags.
- *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
- *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
- *   PARSE_OPT_NONEG: says that this option cannot be negated
- *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
- *                    the long one.
- *
- * `callback`::
- *   pointer to the callback to use for OPTION_CALLBACK.
- *
- * `defval`::
- *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
- *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
- *   the value when met.
- *   CALLBACKS can use it like they want.
- */
-struct option {
-	enum parse_opt_type type;
-	int short_name;
-	const char *long_name;
-	void *value;
-	const char *argh;
-	const char *help;
-
-	int flags;
-	parse_opt_cb *callback;
-	intptr_t defval;
-};
-
-#define OPT_END()                   { OPTION_END }
-#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
-#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
-#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
-#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
-#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
-#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
-#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
-#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
-#define OPT_DATE(s, l, v, h) \
-	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
-	  parse_opt_approxidate_cb }
-#define OPT_CALLBACK(s, l, v, a, h, f) \
-	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
-
-/* parse_options() will filter out the processed options and leave the
- * non-option argments in argv[].
- * Returns the number of arguments left in argv[].
- */
-extern int parse_options(int argc, const char **argv,
-                         const struct option *options,
-                         const char * const usagestr[], int flags);
-
-extern NORETURN void usage_with_options(const char * const *usagestr,
-                                        const struct option *options);
-
-/*----- incremantal advanced APIs -----*/
-
-enum {
-	PARSE_OPT_HELP = -1,
-	PARSE_OPT_DONE,
-	PARSE_OPT_UNKNOWN,
-};
-
-/*
- * It's okay for the caller to consume argv/argc in the usual way.
- * Other fields of that structure are private to parse-options and should not
- * be modified in any way.
- */
-struct parse_opt_ctx_t {
-	const char **argv;
-	const char **out;
-	int argc, cpidx;
-	const char *opt;
-	int flags;
-};
-
-extern int parse_options_usage(const char * const *usagestr,
-			       const struct option *opts);
-
-extern void parse_options_start(struct parse_opt_ctx_t *ctx,
-				int argc, const char **argv, int flags);
-
-extern int parse_options_step(struct parse_opt_ctx_t *ctx,
-			      const struct option *options,
-			      const char * const usagestr[]);
-
-extern int parse_options_end(struct parse_opt_ctx_t *ctx);
-
-
-/*----- some often used options -----*/
-extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
-extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
-extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
-
-#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
-#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
-#define OPT__VERBOSITY(var) \
-	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
-	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
-	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
-	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
-#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
-#define OPT__ABBREV(var)  \
-	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
-	  "use <n> digits to display SHA-1s", \
-	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
-
-extern const char *parse_options_fix_filename(const char *prefix, const char *file);
-
-#endif
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c
deleted file mode 100644
index a501a40..0000000
--- a/Documentation/perf_counter/path.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * I'm tired of doing "vsnprintf()" etc just to open a
- * file, so here's a "return static buffer with printf"
- * interface for paths.
- *
- * It's obviously not thread-safe. Sue me. But it's quite
- * useful for doing things like
- *
- *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
- *
- * which is what it's designed for.
- */
-#include "cache.h"
-
-static char bad_path[] = "/bad-path/";
-/*
- * Two hacks:
- */
-
-static char *get_perf_dir(void)
-{
-	return ".";
-}
-
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
-	size_t ret = strlen(src);
-
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		memcpy(dest, src, len);
-		dest[len] = '\0';
-	}
-	return ret;
-}
-
-
-static char *get_pathname(void)
-{
-	static char pathname_array[4][PATH_MAX];
-	static int index;
-	return pathname_array[3 & ++index];
-}
-
-static char *cleanup_path(char *path)
-{
-	/* Clean it up */
-	if (!memcmp(path, "./", 2)) {
-		path += 2;
-		while (*path == '/')
-			path++;
-	}
-	return path;
-}
-
-char *mksnpath(char *buf, size_t n, const char *fmt, ...)
-{
-	va_list args;
-	unsigned len;
-
-	va_start(args, fmt);
-	len = vsnprintf(buf, n, fmt, args);
-	va_end(args);
-	if (len >= n) {
-		strlcpy(buf, bad_path, n);
-		return buf;
-	}
-	return cleanup_path(buf);
-}
-
-static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
-{
-	const char *perf_dir = get_perf_dir();
-	size_t len;
-
-	len = strlen(perf_dir);
-	if (n < len + 1)
-		goto bad;
-	memcpy(buf, perf_dir, len);
-	if (len && !is_dir_sep(perf_dir[len-1]))
-		buf[len++] = '/';
-	len += vsnprintf(buf + len, n - len, fmt, args);
-	if (len >= n)
-		goto bad;
-	return cleanup_path(buf);
-bad:
-	strlcpy(buf, bad_path, n);
-	return buf;
-}
-
-char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
-{
-	va_list args;
-	va_start(args, fmt);
-	(void)perf_vsnpath(buf, n, fmt, args);
-	va_end(args);
-	return buf;
-}
-
-char *perf_pathdup(const char *fmt, ...)
-{
-	char path[PATH_MAX];
-	va_list args;
-	va_start(args, fmt);
-	(void)perf_vsnpath(path, sizeof(path), fmt, args);
-	va_end(args);
-	return xstrdup(path);
-}
-
-char *mkpath(const char *fmt, ...)
-{
-	va_list args;
-	unsigned len;
-	char *pathname = get_pathname();
-
-	va_start(args, fmt);
-	len = vsnprintf(pathname, PATH_MAX, fmt, args);
-	va_end(args);
-	if (len >= PATH_MAX)
-		return bad_path;
-	return cleanup_path(pathname);
-}
-
-char *perf_path(const char *fmt, ...)
-{
-	const char *perf_dir = get_perf_dir();
-	char *pathname = get_pathname();
-	va_list args;
-	unsigned len;
-
-	len = strlen(perf_dir);
-	if (len > PATH_MAX-100)
-		return bad_path;
-	memcpy(pathname, perf_dir, len);
-	if (len && perf_dir[len-1] != '/')
-		pathname[len++] = '/';
-	va_start(args, fmt);
-	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
-	va_end(args);
-	if (len >= PATH_MAX)
-		return bad_path;
-	return cleanup_path(pathname);
-}
-
-
-/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
-int perf_mkstemp(char *path, size_t len, const char *template)
-{
-	const char *tmp;
-	size_t n;
-
-	tmp = getenv("TMPDIR");
-	if (!tmp)
-		tmp = "/tmp";
-	n = snprintf(path, len, "%s/%s", tmp, template);
-	if (len <= n) {
-		errno = ENAMETOOLONG;
-		return -1;
-	}
-	return mkstemp(path);
-}
-
-
-const char *make_relative_path(const char *abs, const char *base)
-{
-	static char buf[PATH_MAX + 1];
-	int baselen;
-	if (!base)
-		return abs;
-	baselen = strlen(base);
-	if (prefixcmp(abs, base))
-		return abs;
-	if (abs[baselen] == '/')
-		baselen++;
-	else if (base[baselen - 1] != '/')
-		return abs;
-	strcpy(buf, abs + baselen);
-	return buf;
-}
-
-/*
- * It is okay if dst == src, but they should not overlap otherwise.
- *
- * Performs the following normalizations on src, storing the result in dst:
- * - Ensures that components are separated by '/' (Windows only)
- * - Squashes sequences of '/'.
- * - Removes "." components.
- * - Removes ".." components, and the components the precede them.
- * Returns failure (non-zero) if a ".." component appears as first path
- * component anytime during the normalization. Otherwise, returns success (0).
- *
- * Note that this function is purely textual.  It does not follow symlinks,
- * verify the existence of the path, or make any system calls.
- */
-int normalize_path_copy(char *dst, const char *src)
-{
-	char *dst0;
-
-	if (has_dos_drive_prefix(src)) {
-		*dst++ = *src++;
-		*dst++ = *src++;
-	}
-	dst0 = dst;
-
-	if (is_dir_sep(*src)) {
-		*dst++ = '/';
-		while (is_dir_sep(*src))
-			src++;
-	}
-
-	for (;;) {
-		char c = *src;
-
-		/*
-		 * A path component that begins with . could be
-		 * special:
-		 * (1) "." and ends   -- ignore and terminate.
-		 * (2) "./"           -- ignore them, eat slash and continue.
-		 * (3) ".." and ends  -- strip one and terminate.
-		 * (4) "../"          -- strip one, eat slash and continue.
-		 */
-		if (c == '.') {
-			if (!src[1]) {
-				/* (1) */
-				src++;
-			} else if (is_dir_sep(src[1])) {
-				/* (2) */
-				src += 2;
-				while (is_dir_sep(*src))
-					src++;
-				continue;
-			} else if (src[1] == '.') {
-				if (!src[2]) {
-					/* (3) */
-					src += 2;
-					goto up_one;
-				} else if (is_dir_sep(src[2])) {
-					/* (4) */
-					src += 3;
-					while (is_dir_sep(*src))
-						src++;
-					goto up_one;
-				}
-			}
-		}
-
-		/* copy up to the next '/', and eat all '/' */
-		while ((c = *src++) != '\0' && !is_dir_sep(c))
-			*dst++ = c;
-		if (is_dir_sep(c)) {
-			*dst++ = '/';
-			while (is_dir_sep(c))
-				c = *src++;
-			src--;
-		} else if (!c)
-			break;
-		continue;
-
-	up_one:
-		/*
-		 * dst0..dst is prefix portion, and dst[-1] is '/';
-		 * go up one level.
-		 */
-		dst--;	/* go to trailing '/' */
-		if (dst <= dst0)
-			return -1;
-		/* Windows: dst[-1] cannot be backslash anymore */
-		while (dst0 < dst && dst[-1] != '/')
-			dst--;
-	}
-	*dst = '\0';
-	return 0;
-}
-
-/*
- * path = Canonical absolute path
- * prefix_list = Colon-separated list of absolute paths
- *
- * Determines, for each path in prefix_list, whether the "prefix" really
- * is an ancestor directory of path.  Returns the length of the longest
- * ancestor directory, excluding any trailing slashes, or -1 if no prefix
- * is an ancestor.  (Note that this means 0 is returned if prefix_list is
- * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
- * are not considered to be their own ancestors.  path must be in a
- * canonical form: empty components, or "." or ".." components are not
- * allowed.  prefix_list may be null, which is like "".
- */
-int longest_ancestor_length(const char *path, const char *prefix_list)
-{
-	char buf[PATH_MAX+1];
-	const char *ceil, *colon;
-	int len, max_len = -1;
-
-	if (prefix_list == NULL || !strcmp(path, "/"))
-		return -1;
-
-	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
-		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
-		len = colon - ceil;
-		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
-			continue;
-		strlcpy(buf, ceil, len+1);
-		if (normalize_path_copy(buf, buf) < 0)
-			continue;
-		len = strlen(buf);
-		if (len > 0 && buf[len-1] == '/')
-			buf[--len] = '\0';
-
-		if (!strncmp(path, buf, len) &&
-		    path[len] == '/' &&
-		    len > max_len) {
-			max_len = len;
-		}
-	}
-
-	return max_len;
-}
-
-/* strip arbitrary amount of directory separators at end of path */
-static inline int chomp_trailing_dir_sep(const char *path, int len)
-{
-	while (len && is_dir_sep(path[len - 1]))
-		len--;
-	return len;
-}
-
-/*
- * If path ends with suffix (complete path components), returns the
- * part before suffix (sans trailing directory separators).
- * Otherwise returns NULL.
- */
-char *strip_path_suffix(const char *path, const char *suffix)
-{
-	int path_len = strlen(path), suffix_len = strlen(suffix);
-
-	while (suffix_len) {
-		if (!path_len)
-			return NULL;
-
-		if (is_dir_sep(path[path_len - 1])) {
-			if (!is_dir_sep(suffix[suffix_len - 1]))
-				return NULL;
-			path_len = chomp_trailing_dir_sep(path, path_len);
-			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
-		}
-		else if (path[--path_len] != suffix[--suffix_len])
-			return NULL;
-	}
-
-	if (path_len && !is_dir_sep(path[path_len - 1]))
-		return NULL;
-	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
-}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 8d6faec..594d270 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,8 +1,8 @@
 #include "builtin.h"
-#include "exec_cmd.h"
-#include "cache.h"
-#include "quote.h"
-#include "run-command.h"
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
 
 const char perf_usage_string[] =
 	"perf [--version] [--help] COMMAND [ARGS]";
diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/quote.c
deleted file mode 100644
index 7a49fcf..0000000
--- a/Documentation/perf_counter/quote.c
+++ /dev/null
@@ -1,478 +0,0 @@
-#include "cache.h"
-#include "quote.h"
-
-int quote_path_fully = 1;
-
-/* Help to copy the thing properly quoted for the shell safety.
- * any single quote is replaced with '\'', any exclamation point
- * is replaced with '\!', and the whole thing is enclosed in a
- *
- * E.g.
- *  original     sq_quote     result
- *  name     ==> name      ==> 'name'
- *  a b      ==> a b       ==> 'a b'
- *  a'b      ==> a'\''b    ==> 'a'\''b'
- *  a!b      ==> a'\!'b    ==> 'a'\!'b'
- */
-static inline int need_bs_quote(char c)
-{
-	return (c == '\'' || c == '!');
-}
-
-void sq_quote_buf(struct strbuf *dst, const char *src)
-{
-	char *to_free = NULL;
-
-	if (dst->buf == src)
-		to_free = strbuf_detach(dst, NULL);
-
-	strbuf_addch(dst, '\'');
-	while (*src) {
-		size_t len = strcspn(src, "'!");
-		strbuf_add(dst, src, len);
-		src += len;
-		while (need_bs_quote(*src)) {
-			strbuf_addstr(dst, "'\\");
-			strbuf_addch(dst, *src++);
-			strbuf_addch(dst, '\'');
-		}
-	}
-	strbuf_addch(dst, '\'');
-	free(to_free);
-}
-
-void sq_quote_print(FILE *stream, const char *src)
-{
-	char c;
-
-	fputc('\'', stream);
-	while ((c = *src++)) {
-		if (need_bs_quote(c)) {
-			fputs("'\\", stream);
-			fputc(c, stream);
-			fputc('\'', stream);
-		} else {
-			fputc(c, stream);
-		}
-	}
-	fputc('\'', stream);
-}
-
-void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
-{
-	int i;
-
-	/* Copy into destination buffer. */
-	strbuf_grow(dst, 255);
-	for (i = 0; argv[i]; ++i) {
-		strbuf_addch(dst, ' ');
-		sq_quote_buf(dst, argv[i]);
-		if (maxlen && dst->len > maxlen)
-			die("Too many or long arguments");
-	}
-}
-
-char *sq_dequote_step(char *arg, char **next)
-{
-	char *dst = arg;
-	char *src = arg;
-	char c;
-
-	if (*src != '\'')
-		return NULL;
-	for (;;) {
-		c = *++src;
-		if (!c)
-			return NULL;
-		if (c != '\'') {
-			*dst++ = c;
-			continue;
-		}
-		/* We stepped out of sq */
-		switch (*++src) {
-		case '\0':
-			*dst = 0;
-			if (next)
-				*next = NULL;
-			return arg;
-		case '\\':
-			c = *++src;
-			if (need_bs_quote(c) && *++src == '\'') {
-				*dst++ = c;
-				continue;
-			}
-		/* Fallthrough */
-		default:
-			if (!next || !isspace(*src))
-				return NULL;
-			do {
-				c = *++src;
-			} while (isspace(c));
-			*dst = 0;
-			*next = src;
-			return arg;
-		}
-	}
-}
-
-char *sq_dequote(char *arg)
-{
-	return sq_dequote_step(arg, NULL);
-}
-
-int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
-{
-	char *next = arg;
-
-	if (!*arg)
-		return 0;
-	do {
-		char *dequoted = sq_dequote_step(next, &next);
-		if (!dequoted)
-			return -1;
-		ALLOC_GROW(*argv, *nr + 1, *alloc);
-		(*argv)[(*nr)++] = dequoted;
-	} while (next);
-
-	return 0;
-}
-
-/* 1 means: quote as octal
- * 0 means: quote as octal if (quote_path_fully)
- * -1 means: never quote
- * c: quote as "\\c"
- */
-#define X8(x)   x, x, x, x, x, x, x, x
-#define X16(x)  X8(x), X8(x)
-static signed char const sq_lookup[256] = {
-	/*           0    1    2    3    4    5    6    7 */
-	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
-	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
-	/* 0x10 */ X16(1),
-	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
-	/* 0x28 */ X16(-1), X16(-1), X16(-1),
-	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
-	/* 0x60 */ X16(-1), X8(-1),
-	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
-	/* 0x80 */ /* set to 0 */
-};
-
-static inline int sq_must_quote(char c)
-{
-	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
-}
-
-/* returns the longest prefix not needing a quote up to maxlen if positive.
-   This stops at the first \0 because it's marked as a character needing an
-   escape */
-static size_t next_quote_pos(const char *s, ssize_t maxlen)
-{
-	size_t len;
-	if (maxlen < 0) {
-		for (len = 0; !sq_must_quote(s[len]); len++);
-	} else {
-		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
-	}
-	return len;
-}
-
-/*
- * C-style name quoting.
- *
- * (1) if sb and fp are both NULL, inspect the input name and counts the
- *     number of bytes that are needed to hold c_style quoted version of name,
- *     counting the double quotes around it but not terminating NUL, and
- *     returns it.
- *     However, if name does not need c_style quoting, it returns 0.
- *
- * (2) if sb or fp are not NULL, it emits the c_style quoted version
- *     of name, enclosed with double quotes if asked and needed only.
- *     Return value is the same as in (1).
- */
-static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
-                                    struct strbuf *sb, FILE *fp, int no_dq)
-{
-#undef EMIT
-#define EMIT(c)                                 \
-	do {                                        \
-		if (sb) strbuf_addch(sb, (c));          \
-		if (fp) fputc((c), fp);                 \
-		count++;                                \
-	} while (0)
-#define EMITBUF(s, l)                           \
-	do {                                        \
-		if (sb) strbuf_add(sb, (s), (l));       \
-		if (fp) fwrite((s), (l), 1, fp);        \
-		count += (l);                           \
-	} while (0)
-
-	size_t len, count = 0;
-	const char *p = name;
-
-	for (;;) {
-		int ch;
-
-		len = next_quote_pos(p, maxlen);
-		if (len == maxlen || !p[len])
-			break;
-
-		if (!no_dq && p == name)
-			EMIT('"');
-
-		EMITBUF(p, len);
-		EMIT('\\');
-		p += len;
-		ch = (unsigned char)*p++;
-		if (sq_lookup[ch] >= ' ') {
-			EMIT(sq_lookup[ch]);
-		} else {
-			EMIT(((ch >> 6) & 03) + '0');
-			EMIT(((ch >> 3) & 07) + '0');
-			EMIT(((ch >> 0) & 07) + '0');
-		}
-	}
-
-	EMITBUF(p, len);
-	if (p == name)   /* no ending quote needed */
-		return 0;
-
-	if (!no_dq)
-		EMIT('"');
-	return count;
-}
-
-size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
-{
-	return quote_c_style_counted(name, -1, sb, fp, nodq);
-}
-
-void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
-{
-	if (quote_c_style(prefix, NULL, NULL, 0) ||
-	    quote_c_style(path, NULL, NULL, 0)) {
-		if (!nodq)
-			strbuf_addch(sb, '"');
-		quote_c_style(prefix, sb, NULL, 1);
-		quote_c_style(path, sb, NULL, 1);
-		if (!nodq)
-			strbuf_addch(sb, '"');
-	} else {
-		strbuf_addstr(sb, prefix);
-		strbuf_addstr(sb, path);
-	}
-}
-
-void write_name_quoted(const char *name, FILE *fp, int terminator)
-{
-	if (terminator) {
-		quote_c_style(name, NULL, fp, 0);
-	} else {
-		fputs(name, fp);
-	}
-	fputc(terminator, fp);
-}
-
-extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
-                                 const char *name, FILE *fp, int terminator)
-{
-	int needquote = 0;
-
-	if (terminator) {
-		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
-			|| name[next_quote_pos(name, -1)];
-	}
-	if (needquote) {
-		fputc('"', fp);
-		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
-		quote_c_style(name, NULL, fp, 1);
-		fputc('"', fp);
-	} else {
-		fwrite(pfx, pfxlen, 1, fp);
-		fputs(name, fp);
-	}
-	fputc(terminator, fp);
-}
-
-/* quote path as relative to the given prefix */
-char *quote_path_relative(const char *in, int len,
-			  struct strbuf *out, const char *prefix)
-{
-	int needquote;
-
-	if (len < 0)
-		len = strlen(in);
-
-	/* "../" prefix itself does not need quoting, but "in" might. */
-	needquote = next_quote_pos(in, len) < len;
-	strbuf_setlen(out, 0);
-	strbuf_grow(out, len);
-
-	if (needquote)
-		strbuf_addch(out, '"');
-	if (prefix) {
-		int off = 0;
-		while (prefix[off] && off < len && prefix[off] == in[off])
-			if (prefix[off] == '/') {
-				prefix += off + 1;
-				in += off + 1;
-				len -= off + 1;
-				off = 0;
-			} else
-				off++;
-
-		for (; *prefix; prefix++)
-			if (*prefix == '/')
-				strbuf_addstr(out, "../");
-	}
-
-	quote_c_style_counted (in, len, out, NULL, 1);
-
-	if (needquote)
-		strbuf_addch(out, '"');
-	if (!out->len)
-		strbuf_addstr(out, "./");
-
-	return out->buf;
-}
-
-/*
- * C-style name unquoting.
- *
- * Quoted should point at the opening double quote.
- * + Returns 0 if it was able to unquote the string properly, and appends the
- *   result in the strbuf `sb'.
- * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
- *   that this function will allocate memory in the strbuf, so calling
- *   strbuf_release is mandatory whichever result unquote_c_style returns.
- *
- * Updates endp pointer to point at one past the ending double quote if given.
- */
-int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
-{
-	size_t oldlen = sb->len, len;
-	int ch, ac;
-
-	if (*quoted++ != '"')
-		return -1;
-
-	for (;;) {
-		len = strcspn(quoted, "\"\\");
-		strbuf_add(sb, quoted, len);
-		quoted += len;
-
-		switch (*quoted++) {
-		  case '"':
-			if (endp)
-				*endp = quoted;
-			return 0;
-		  case '\\':
-			break;
-		  default:
-			goto error;
-		}
-
-		switch ((ch = *quoted++)) {
-		case 'a': ch = '\a'; break;
-		case 'b': ch = '\b'; break;
-		case 'f': ch = '\f'; break;
-		case 'n': ch = '\n'; break;
-		case 'r': ch = '\r'; break;
-		case 't': ch = '\t'; break;
-		case 'v': ch = '\v'; break;
-
-		case '\\': case '"':
-			break; /* verbatim */
-
-		/* octal values with first digit over 4 overflow */
-		case '0': case '1': case '2': case '3':
-					ac = ((ch - '0') << 6);
-			if ((ch = *quoted++) < '0' || '7' < ch)
-				goto error;
-					ac |= ((ch - '0') << 3);
-			if ((ch = *quoted++) < '0' || '7' < ch)
-				goto error;
-					ac |= (ch - '0');
-					ch = ac;
-					break;
-				default:
-			goto error;
-			}
-		strbuf_addch(sb, ch);
-		}
-
-  error:
-	strbuf_setlen(sb, oldlen);
-	return -1;
-}
-
-/* quoting as a string literal for other languages */
-
-void perl_quote_print(FILE *stream, const char *src)
-{
-	const char sq = '\'';
-	const char bq = '\\';
-	char c;
-
-	fputc(sq, stream);
-	while ((c = *src++)) {
-		if (c == sq || c == bq)
-			fputc(bq, stream);
-		fputc(c, stream);
-	}
-	fputc(sq, stream);
-}
-
-void python_quote_print(FILE *stream, const char *src)
-{
-	const char sq = '\'';
-	const char bq = '\\';
-	const char nl = '\n';
-	char c;
-
-	fputc(sq, stream);
-	while ((c = *src++)) {
-		if (c == nl) {
-			fputc(bq, stream);
-			fputc('n', stream);
-			continue;
-		}
-		if (c == sq || c == bq)
-			fputc(bq, stream);
-		fputc(c, stream);
-	}
-	fputc(sq, stream);
-}
-
-void tcl_quote_print(FILE *stream, const char *src)
-{
-	char c;
-
-	fputc('"', stream);
-	while ((c = *src++)) {
-		switch (c) {
-		case '[': case ']':
-		case '{': case '}':
-		case '$': case '\\': case '"':
-			fputc('\\', stream);
-		default:
-			fputc(c, stream);
-			break;
-		case '\f':
-			fputs("\\f", stream);
-			break;
-		case '\r':
-			fputs("\\r", stream);
-			break;
-		case '\n':
-			fputs("\\n", stream);
-			break;
-		case '\t':
-			fputs("\\t", stream);
-			break;
-		case '\v':
-			fputs("\\v", stream);
-			break;
-		}
-	}
-	fputc('"', stream);
-}
diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/quote.h
deleted file mode 100644
index 66730f2..0000000
--- a/Documentation/perf_counter/quote.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef QUOTE_H
-#define QUOTE_H
-
-#include <stddef.h>
-#include <stdio.h>
-
-/* Help to copy the thing properly quoted for the shell safety.
- * any single quote is replaced with '\'', any exclamation point
- * is replaced with '\!', and the whole thing is enclosed in a
- * single quote pair.
- *
- * For example, if you are passing the result to system() as an
- * argument:
- *
- * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
- *
- * would be appropriate.  If the system() is going to call ssh to
- * run the command on the other side:
- *
- * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
- * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
- *
- * Note that the above examples leak memory!  Remember to free result from
- * sq_quote() in a real application.
- *
- * sq_quote_buf() writes to an existing buffer of specified size; it
- * will return the number of characters that would have been written
- * excluding the final null regardless of the buffer size.
- */
-
-extern void sq_quote_print(FILE *stream, const char *src);
-
-extern void sq_quote_buf(struct strbuf *, const char *src);
-extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
-
-/* This unwraps what sq_quote() produces in place, but returns
- * NULL if the input does not look like what sq_quote would have
- * produced.
- */
-extern char *sq_dequote(char *);
-
-/*
- * Same as the above, but can be used to unwrap many arguments in the
- * same string separated by space. "next" is changed to point to the
- * next argument that should be passed as first parameter. When there
- * is no more argument to be dequoted, "next" is updated to point to NULL.
- */
-extern char *sq_dequote_step(char *arg, char **next);
-extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
-
-extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
-extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
-extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
-
-extern void write_name_quoted(const char *name, FILE *, int terminator);
-extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
-                                 const char *name, FILE *, int terminator);
-
-/* quote path as relative to the given prefix */
-char *quote_path_relative(const char *in, int len,
-			  struct strbuf *out, const char *prefix);
-
-/* quoting as a string literal for other languages */
-extern void perl_quote_print(FILE *stream, const char *src);
-extern void python_quote_print(FILE *stream, const char *src);
-extern void tcl_quote_print(FILE *stream, const char *src);
-
-#endif
diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/run-command.c
deleted file mode 100644
index b2f5e85..0000000
--- a/Documentation/perf_counter/run-command.c
+++ /dev/null
@@ -1,395 +0,0 @@
-#include "cache.h"
-#include "run-command.h"
-#include "exec_cmd.h"
-
-static inline void close_pair(int fd[2])
-{
-	close(fd[0]);
-	close(fd[1]);
-}
-
-static inline void dup_devnull(int to)
-{
-	int fd = open("/dev/null", O_RDWR);
-	dup2(fd, to);
-	close(fd);
-}
-
-int start_command(struct child_process *cmd)
-{
-	int need_in, need_out, need_err;
-	int fdin[2], fdout[2], fderr[2];
-
-	/*
-	 * In case of errors we must keep the promise to close FDs
-	 * that have been passed in via ->in and ->out.
-	 */
-
-	need_in = !cmd->no_stdin && cmd->in < 0;
-	if (need_in) {
-		if (pipe(fdin) < 0) {
-			if (cmd->out > 0)
-				close(cmd->out);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->in = fdin[1];
-	}
-
-	need_out = !cmd->no_stdout
-		&& !cmd->stdout_to_stderr
-		&& cmd->out < 0;
-	if (need_out) {
-		if (pipe(fdout) < 0) {
-			if (need_in)
-				close_pair(fdin);
-			else if (cmd->in)
-				close(cmd->in);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->out = fdout[0];
-	}
-
-	need_err = !cmd->no_stderr && cmd->err < 0;
-	if (need_err) {
-		if (pipe(fderr) < 0) {
-			if (need_in)
-				close_pair(fdin);
-			else if (cmd->in)
-				close(cmd->in);
-			if (need_out)
-				close_pair(fdout);
-			else if (cmd->out)
-				close(cmd->out);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->err = fderr[0];
-	}
-
-#ifndef __MINGW32__
-	fflush(NULL);
-	cmd->pid = fork();
-	if (!cmd->pid) {
-		if (cmd->no_stdin)
-			dup_devnull(0);
-		else if (need_in) {
-			dup2(fdin[0], 0);
-			close_pair(fdin);
-		} else if (cmd->in) {
-			dup2(cmd->in, 0);
-			close(cmd->in);
-		}
-
-		if (cmd->no_stderr)
-			dup_devnull(2);
-		else if (need_err) {
-			dup2(fderr[1], 2);
-			close_pair(fderr);
-		}
-
-		if (cmd->no_stdout)
-			dup_devnull(1);
-		else if (cmd->stdout_to_stderr)
-			dup2(2, 1);
-		else if (need_out) {
-			dup2(fdout[1], 1);
-			close_pair(fdout);
-		} else if (cmd->out > 1) {
-			dup2(cmd->out, 1);
-			close(cmd->out);
-		}
-
-		if (cmd->dir && chdir(cmd->dir))
-			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
-			    cmd->dir, strerror(errno));
-		if (cmd->env) {
-			for (; *cmd->env; cmd->env++) {
-				if (strchr(*cmd->env, '='))
-					putenv((char*)*cmd->env);
-				else
-					unsetenv(*cmd->env);
-			}
-		}
-		if (cmd->preexec_cb)
-			cmd->preexec_cb();
-		if (cmd->perf_cmd) {
-			execv_perf_cmd(cmd->argv);
-		} else {
-			execvp(cmd->argv[0], (char *const*) cmd->argv);
-		}
-		exit(127);
-	}
-#else
-	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
-	const char **sargv = cmd->argv;
-	char **env = environ;
-
-	if (cmd->no_stdin) {
-		s0 = dup(0);
-		dup_devnull(0);
-	} else if (need_in) {
-		s0 = dup(0);
-		dup2(fdin[0], 0);
-	} else if (cmd->in) {
-		s0 = dup(0);
-		dup2(cmd->in, 0);
-	}
-
-	if (cmd->no_stderr) {
-		s2 = dup(2);
-		dup_devnull(2);
-	} else if (need_err) {
-		s2 = dup(2);
-		dup2(fderr[1], 2);
-	}
-
-	if (cmd->no_stdout) {
-		s1 = dup(1);
-		dup_devnull(1);
-	} else if (cmd->stdout_to_stderr) {
-		s1 = dup(1);
-		dup2(2, 1);
-	} else if (need_out) {
-		s1 = dup(1);
-		dup2(fdout[1], 1);
-	} else if (cmd->out > 1) {
-		s1 = dup(1);
-		dup2(cmd->out, 1);
-	}
-
-	if (cmd->dir)
-		die("chdir in start_command() not implemented");
-	if (cmd->env) {
-		env = copy_environ();
-		for (; *cmd->env; cmd->env++)
-			env = env_setenv(env, *cmd->env);
-	}
-
-	if (cmd->perf_cmd) {
-		cmd->argv = prepare_perf_cmd(cmd->argv);
-	}
-
-	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
-
-	if (cmd->env)
-		free_environ(env);
-	if (cmd->perf_cmd)
-		free(cmd->argv);
-
-	cmd->argv = sargv;
-	if (s0 >= 0)
-		dup2(s0, 0), close(s0);
-	if (s1 >= 0)
-		dup2(s1, 1), close(s1);
-	if (s2 >= 0)
-		dup2(s2, 2), close(s2);
-#endif
-
-	if (cmd->pid < 0) {
-		int err = errno;
-		if (need_in)
-			close_pair(fdin);
-		else if (cmd->in)
-			close(cmd->in);
-		if (need_out)
-			close_pair(fdout);
-		else if (cmd->out)
-			close(cmd->out);
-		if (need_err)
-			close_pair(fderr);
-		return err == ENOENT ?
-			-ERR_RUN_COMMAND_EXEC :
-			-ERR_RUN_COMMAND_FORK;
-	}
-
-	if (need_in)
-		close(fdin[0]);
-	else if (cmd->in)
-		close(cmd->in);
-
-	if (need_out)
-		close(fdout[1]);
-	else if (cmd->out)
-		close(cmd->out);
-
-	if (need_err)
-		close(fderr[1]);
-
-	return 0;
-}
-
-static int wait_or_whine(pid_t pid)
-{
-	for (;;) {
-		int status, code;
-		pid_t waiting = waitpid(pid, &status, 0);
-
-		if (waiting < 0) {
-			if (errno == EINTR)
-				continue;
-			error("waitpid failed (%s)", strerror(errno));
-			return -ERR_RUN_COMMAND_WAITPID;
-		}
-		if (waiting != pid)
-			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
-		if (WIFSIGNALED(status))
-			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
-
-		if (!WIFEXITED(status))
-			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
-		code = WEXITSTATUS(status);
-		switch (code) {
-		case 127:
-			return -ERR_RUN_COMMAND_EXEC;
-		case 0:
-			return 0;
-		default:
-			return -code;
-		}
-	}
-}
-
-int finish_command(struct child_process *cmd)
-{
-	return wait_or_whine(cmd->pid);
-}
-
-int run_command(struct child_process *cmd)
-{
-	int code = start_command(cmd);
-	if (code)
-		return code;
-	return finish_command(cmd);
-}
-
-static void prepare_run_command_v_opt(struct child_process *cmd,
-				      const char **argv,
-				      int opt)
-{
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->argv = argv;
-	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
-	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
-	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
-}
-
-int run_command_v_opt(const char **argv, int opt)
-{
-	struct child_process cmd;
-	prepare_run_command_v_opt(&cmd, argv, opt);
-	return run_command(&cmd);
-}
-
-int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
-{
-	struct child_process cmd;
-	prepare_run_command_v_opt(&cmd, argv, opt);
-	cmd.dir = dir;
-	cmd.env = env;
-	return run_command(&cmd);
-}
-
-#ifdef __MINGW32__
-static __stdcall unsigned run_thread(void *data)
-{
-	struct async *async = data;
-	return async->proc(async->fd_for_proc, async->data);
-}
-#endif
-
-int start_async(struct async *async)
-{
-	int pipe_out[2];
-
-	if (pipe(pipe_out) < 0)
-		return error("cannot create pipe: %s", strerror(errno));
-	async->out = pipe_out[0];
-
-#ifndef __MINGW32__
-	/* Flush stdio before fork() to avoid cloning buffers */
-	fflush(NULL);
-
-	async->pid = fork();
-	if (async->pid < 0) {
-		error("fork (async) failed: %s", strerror(errno));
-		close_pair(pipe_out);
-		return -1;
-	}
-	if (!async->pid) {
-		close(pipe_out[0]);
-		exit(!!async->proc(pipe_out[1], async->data));
-	}
-	close(pipe_out[1]);
-#else
-	async->fd_for_proc = pipe_out[1];
-	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
-	if (!async->tid) {
-		error("cannot create thread: %s", strerror(errno));
-		close_pair(pipe_out);
-		return -1;
-	}
-#endif
-	return 0;
-}
-
-int finish_async(struct async *async)
-{
-#ifndef __MINGW32__
-	int ret = 0;
-
-	if (wait_or_whine(async->pid))
-		ret = error("waitpid (async) failed");
-#else
-	DWORD ret = 0;
-	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
-		ret = error("waiting for thread failed: %lu", GetLastError());
-	else if (!GetExitCodeThread(async->tid, &ret))
-		ret = error("cannot get thread exit code: %lu", GetLastError());
-	CloseHandle(async->tid);
-#endif
-	return ret;
-}
-
-int run_hook(const char *index_file, const char *name, ...)
-{
-	struct child_process hook;
-	const char **argv = NULL, *env[2];
-	char index[PATH_MAX];
-	va_list args;
-	int ret;
-	size_t i = 0, alloc = 0;
-
-	if (access(perf_path("hooks/%s", name), X_OK) < 0)
-		return 0;
-
-	va_start(args, name);
-	ALLOC_GROW(argv, i + 1, alloc);
-	argv[i++] = perf_path("hooks/%s", name);
-	while (argv[i-1]) {
-		ALLOC_GROW(argv, i + 1, alloc);
-		argv[i++] = va_arg(args, const char *);
-	}
-	va_end(args);
-
-	memset(&hook, 0, sizeof(hook));
-	hook.argv = argv;
-	hook.no_stdin = 1;
-	hook.stdout_to_stderr = 1;
-	if (index_file) {
-		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
-		env[0] = index;
-		env[1] = NULL;
-		hook.env = env;
-	}
-
-	ret = start_command(&hook);
-	free(argv);
-	if (ret) {
-		warning("Could not spawn %s", argv[0]);
-		return ret;
-	}
-	ret = finish_command(&hook);
-	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
-		warning("%s exited due to uncaught signal", argv[0]);
-
-	return ret;
-}
diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/run-command.h
deleted file mode 100644
index 328289f..0000000
--- a/Documentation/perf_counter/run-command.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef RUN_COMMAND_H
-#define RUN_COMMAND_H
-
-enum {
-	ERR_RUN_COMMAND_FORK = 10000,
-	ERR_RUN_COMMAND_EXEC,
-	ERR_RUN_COMMAND_PIPE,
-	ERR_RUN_COMMAND_WAITPID,
-	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
-	ERR_RUN_COMMAND_WAITPID_SIGNAL,
-	ERR_RUN_COMMAND_WAITPID_NOEXIT,
-};
-#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
-
-struct child_process {
-	const char **argv;
-	pid_t pid;
-	/*
-	 * Using .in, .out, .err:
-	 * - Specify 0 for no redirections (child inherits stdin, stdout,
-	 *   stderr from parent).
-	 * - Specify -1 to have a pipe allocated as follows:
-	 *     .in: returns the writable pipe end; parent writes to it,
-	 *          the readable pipe end becomes child's stdin
-	 *     .out, .err: returns the readable pipe end; parent reads from
-	 *          it, the writable pipe end becomes child's stdout/stderr
-	 *   The caller of start_command() must close the returned FDs
-	 *   after it has completed reading from/writing to it!
-	 * - Specify > 0 to set a channel to a particular FD as follows:
-	 *     .in: a readable FD, becomes child's stdin
-	 *     .out: a writable FD, becomes child's stdout/stderr
-	 *     .err > 0 not supported
-	 *   The specified FD is closed by start_command(), even in case
-	 *   of errors!
-	 */
-	int in;
-	int out;
-	int err;
-	const char *dir;
-	const char *const *env;
-	unsigned no_stdin:1;
-	unsigned no_stdout:1;
-	unsigned no_stderr:1;
-	unsigned perf_cmd:1; /* if this is to be perf sub-command */
-	unsigned stdout_to_stderr:1;
-	void (*preexec_cb)(void);
-};
-
-int start_command(struct child_process *);
-int finish_command(struct child_process *);
-int run_command(struct child_process *);
-
-extern int run_hook(const char *index_file, const char *name, ...);
-
-#define RUN_COMMAND_NO_STDIN 1
-#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
-#define RUN_COMMAND_STDOUT_TO_STDERR 4
-int run_command_v_opt(const char **argv, int opt);
-
-/*
- * env (the environment) is to be formatted like environ: "VAR=VALUE".
- * To unset an environment variable use just "VAR".
- */
-int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
-
-/*
- * The purpose of the following functions is to feed a pipe by running
- * a function asynchronously and providing output that the caller reads.
- *
- * It is expected that no synchronization and mutual exclusion between
- * the caller and the feed function is necessary so that the function
- * can run in a thread without interfering with the caller.
- */
-struct async {
-	/*
-	 * proc writes to fd and closes it;
-	 * returns 0 on success, non-zero on failure
-	 */
-	int (*proc)(int fd, void *data);
-	void *data;
-	int out;	/* caller reads from here and closes it */
-#ifndef __MINGW32__
-	pid_t pid;
-#else
-	HANDLE tid;
-	int fd_for_proc;
-#endif
-};
-
-int start_async(struct async *async);
-int finish_async(struct async *async);
-
-#endif
diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/strbuf.c
deleted file mode 100644
index eaba093..0000000
--- a/Documentation/perf_counter/strbuf.c
+++ /dev/null
@@ -1,359 +0,0 @@
-#include "cache.h"
-
-int prefixcmp(const char *str, const char *prefix)
-{
-	for (; ; str++, prefix++)
-		if (!*prefix)
-			return 0;
-		else if (*str != *prefix)
-			return (unsigned char)*prefix - (unsigned char)*str;
-}
-
-/*
- * Used as the default ->buf value, so that people can always assume
- * buf is non NULL and ->buf is NUL terminated even for a freshly
- * initialized strbuf.
- */
-char strbuf_slopbuf[1];
-
-void strbuf_init(struct strbuf *sb, size_t hint)
-{
-	sb->alloc = sb->len = 0;
-	sb->buf = strbuf_slopbuf;
-	if (hint)
-		strbuf_grow(sb, hint);
-}
-
-void strbuf_release(struct strbuf *sb)
-{
-	if (sb->alloc) {
-		free(sb->buf);
-		strbuf_init(sb, 0);
-	}
-}
-
-char *strbuf_detach(struct strbuf *sb, size_t *sz)
-{
-	char *res = sb->alloc ? sb->buf : NULL;
-	if (sz)
-		*sz = sb->len;
-	strbuf_init(sb, 0);
-	return res;
-}
-
-void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
-{
-	strbuf_release(sb);
-	sb->buf   = buf;
-	sb->len   = len;
-	sb->alloc = alloc;
-	strbuf_grow(sb, 0);
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_grow(struct strbuf *sb, size_t extra)
-{
-	if (sb->len + extra + 1 <= sb->len)
-		die("you want to use way too much memory");
-	if (!sb->alloc)
-		sb->buf = NULL;
-	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
-}
-
-void strbuf_trim(struct strbuf *sb)
-{
-	char *b = sb->buf;
-	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
-		sb->len--;
-	while (sb->len > 0 && isspace(*b)) {
-		b++;
-		sb->len--;
-	}
-	memmove(sb->buf, b, sb->len);
-	sb->buf[sb->len] = '\0';
-}
-void strbuf_rtrim(struct strbuf *sb)
-{
-	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
-		sb->len--;
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_ltrim(struct strbuf *sb)
-{
-	char *b = sb->buf;
-	while (sb->len > 0 && isspace(*b)) {
-		b++;
-		sb->len--;
-	}
-	memmove(sb->buf, b, sb->len);
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_tolower(struct strbuf *sb)
-{
-	int i;
-	for (i = 0; i < sb->len; i++)
-		sb->buf[i] = tolower(sb->buf[i]);
-}
-
-struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
-{
-	int alloc = 2, pos = 0;
-	char *n, *p;
-	struct strbuf **ret;
-	struct strbuf *t;
-
-	ret = calloc(alloc, sizeof(struct strbuf *));
-	p = n = sb->buf;
-	while (n < sb->buf + sb->len) {
-		int len;
-		n = memchr(n, delim, sb->len - (n - sb->buf));
-		if (pos + 1 >= alloc) {
-			alloc = alloc * 2;
-			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
-		}
-		if (!n)
-			n = sb->buf + sb->len - 1;
-		len = n - p + 1;
-		t = malloc(sizeof(struct strbuf));
-		strbuf_init(t, len);
-		strbuf_add(t, p, len);
-		ret[pos] = t;
-		ret[++pos] = NULL;
-		p = ++n;
-	}
-	return ret;
-}
-
-void strbuf_list_free(struct strbuf **sbs)
-{
-	struct strbuf **s = sbs;
-
-	while (*s) {
-		strbuf_release(*s);
-		free(*s++);
-	}
-	free(sbs);
-}
-
-int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
-{
-	int len = a->len < b->len ? a->len: b->len;
-	int cmp = memcmp(a->buf, b->buf, len);
-	if (cmp)
-		return cmp;
-	return a->len < b->len ? -1: a->len != b->len;
-}
-
-void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
-				   const void *data, size_t dlen)
-{
-	if (pos + len < pos)
-		die("you want to use way too much memory");
-	if (pos > sb->len)
-		die("`pos' is too far after the end of the buffer");
-	if (pos + len > sb->len)
-		die("`pos + len' is too far after the end of the buffer");
-
-	if (dlen >= len)
-		strbuf_grow(sb, dlen - len);
-	memmove(sb->buf + pos + dlen,
-			sb->buf + pos + len,
-			sb->len - pos - len);
-	memcpy(sb->buf + pos, data, dlen);
-	strbuf_setlen(sb, sb->len + dlen - len);
-}
-
-void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
-{
-	strbuf_splice(sb, pos, 0, data, len);
-}
-
-void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
-{
-	strbuf_splice(sb, pos, len, NULL, 0);
-}
-
-void strbuf_add(struct strbuf *sb, const void *data, size_t len)
-{
-	strbuf_grow(sb, len);
-	memcpy(sb->buf + sb->len, data, len);
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
-{
-	strbuf_grow(sb, len);
-	memcpy(sb->buf + sb->len, sb->buf + pos, len);
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
-{
-	int len;
-	va_list ap;
-
-	if (!strbuf_avail(sb))
-		strbuf_grow(sb, 64);
-	va_start(ap, fmt);
-	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
-	va_end(ap);
-	if (len < 0)
-		die("your vsnprintf is broken");
-	if (len > strbuf_avail(sb)) {
-		strbuf_grow(sb, len);
-		va_start(ap, fmt);
-		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
-		va_end(ap);
-		if (len > strbuf_avail(sb)) {
-			die("this should not happen, your snprintf is broken");
-		}
-	}
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
-		   void *context)
-{
-	for (;;) {
-		const char *percent;
-		size_t consumed;
-
-		percent = strchrnul(format, '%');
-		strbuf_add(sb, format, percent - format);
-		if (!*percent)
-			break;
-		format = percent + 1;
-
-		consumed = fn(sb, format, context);
-		if (consumed)
-			format += consumed;
-		else
-			strbuf_addch(sb, '%');
-	}
-}
-
-size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
-		void *context)
-{
-	struct strbuf_expand_dict_entry *e = context;
-	size_t len;
-
-	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
-		if (!strncmp(placeholder, e->placeholder, len)) {
-			if (e->value)
-				strbuf_addstr(sb, e->value);
-			return len;
-		}
-	}
-	return 0;
-}
-
-size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
-{
-	size_t res;
-	size_t oldalloc = sb->alloc;
-
-	strbuf_grow(sb, size);
-	res = fread(sb->buf + sb->len, 1, size, f);
-	if (res > 0)
-		strbuf_setlen(sb, sb->len + res);
-	else if (res < 0 && oldalloc == 0)
-		strbuf_release(sb);
-	return res;
-}
-
-ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
-{
-	size_t oldlen = sb->len;
-	size_t oldalloc = sb->alloc;
-
-	strbuf_grow(sb, hint ? hint : 8192);
-	for (;;) {
-		ssize_t cnt;
-
-		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
-		if (cnt < 0) {
-			if (oldalloc == 0)
-				strbuf_release(sb);
-			else
-				strbuf_setlen(sb, oldlen);
-			return -1;
-		}
-		if (!cnt)
-			break;
-		sb->len += cnt;
-		strbuf_grow(sb, 8192);
-	}
-
-	sb->buf[sb->len] = '\0';
-	return sb->len - oldlen;
-}
-
-#define STRBUF_MAXLINK (2*PATH_MAX)
-
-int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
-{
-	size_t oldalloc = sb->alloc;
-
-	if (hint < 32)
-		hint = 32;
-
-	while (hint < STRBUF_MAXLINK) {
-		int len;
-
-		strbuf_grow(sb, hint);
-		len = readlink(path, sb->buf, hint);
-		if (len < 0) {
-			if (errno != ERANGE)
-				break;
-		} else if (len < hint) {
-			strbuf_setlen(sb, len);
-			return 0;
-		}
-
-		/* .. the buffer was too small - try again */
-		hint *= 2;
-	}
-	if (oldalloc == 0)
-		strbuf_release(sb);
-	return -1;
-}
-
-int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
-{
-	int ch;
-
-	strbuf_grow(sb, 0);
-	if (feof(fp))
-		return EOF;
-
-	strbuf_reset(sb);
-	while ((ch = fgetc(fp)) != EOF) {
-		if (ch == term)
-			break;
-		strbuf_grow(sb, 1);
-		sb->buf[sb->len++] = ch;
-	}
-	if (ch == EOF && sb->len == 0)
-		return EOF;
-
-	sb->buf[sb->len] = '\0';
-	return 0;
-}
-
-int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
-{
-	int fd, len;
-
-	fd = open(path, O_RDONLY);
-	if (fd < 0)
-		return -1;
-	len = strbuf_read(sb, fd, hint);
-	close(fd);
-	if (len < 0)
-		return -1;
-
-	return len;
-}
diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/strbuf.h
deleted file mode 100644
index 9ee908a..0000000
--- a/Documentation/perf_counter/strbuf.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef STRBUF_H
-#define STRBUF_H
-
-/*
- * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
- * long, overflow safe strings.
- *
- * Strbufs has some invariants that are very important to keep in mind:
- *
- * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
- *    build complex strings/buffers whose final size isn't easily known.
- *
- *    It is NOT legal to copy the ->buf pointer away.
- *    `strbuf_detach' is the operation that detachs a buffer from its shell
- *    while keeping the shell valid wrt its invariants.
- *
- * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
- *    allocated. The extra byte is used to store a '\0', allowing the ->buf
- *    member to be a valid C-string. Every strbuf function ensure this
- *    invariant is preserved.
- *
- *    Note that it is OK to "play" with the buffer directly if you work it
- *    that way:
- *
- *    strbuf_grow(sb, SOME_SIZE);
- *       ... Here, the memory array starting at sb->buf, and of length
- *       ... strbuf_avail(sb) is all yours, and you are sure that
- *       ... strbuf_avail(sb) is at least SOME_SIZE.
- *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
- *
- *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
- *
- *    Doing so is safe, though if it has to be done in many places, adding the
- *    missing API to the strbuf module is the way to go.
- *
- *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
- *         even if it's true in the current implementation. Alloc is somehow a
- *         "private" member that should not be messed with.
- */
-
-#include <assert.h>
-
-extern char strbuf_slopbuf[];
-struct strbuf {
-	size_t alloc;
-	size_t len;
-	char *buf;
-};
-
-#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
-
-/*----- strbuf life cycle -----*/
-extern void strbuf_init(struct strbuf *, size_t);
-extern void strbuf_release(struct strbuf *);
-extern char *strbuf_detach(struct strbuf *, size_t *);
-extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
-static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
-	struct strbuf tmp = *a;
-	*a = *b;
-	*b = tmp;
-}
-
-/*----- strbuf size related -----*/
-static inline size_t strbuf_avail(const struct strbuf *sb) {
-	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
-}
-
-extern void strbuf_grow(struct strbuf *, size_t);
-
-static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
-	if (!sb->alloc)
-		strbuf_grow(sb, 0);
-	assert(len < sb->alloc);
-	sb->len = len;
-	sb->buf[len] = '\0';
-}
-#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
-
-/*----- content related -----*/
-extern void strbuf_trim(struct strbuf *);
-extern void strbuf_rtrim(struct strbuf *);
-extern void strbuf_ltrim(struct strbuf *);
-extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
-extern void strbuf_tolower(struct strbuf *);
-
-extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
-extern void strbuf_list_free(struct strbuf **);
-
-/*----- add data in your buffer -----*/
-static inline void strbuf_addch(struct strbuf *sb, int c) {
-	strbuf_grow(sb, 1);
-	sb->buf[sb->len++] = c;
-	sb->buf[sb->len] = '\0';
-}
-
-extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
-extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
-
-/* splice pos..pos+len with given data */
-extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
-                          const void *, size_t);
-
-extern void strbuf_add(struct strbuf *, const void *, size_t);
-static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
-	strbuf_add(sb, s, strlen(s));
-}
-static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
-	strbuf_add(sb, sb2->buf, sb2->len);
-}
-extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
-
-typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
-extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
-struct strbuf_expand_dict_entry {
-	const char *placeholder;
-	const char *value;
-};
-extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
-
-__attribute__((format(printf,2,3)))
-extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
-
-extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
-/* XXX: if read fails, any partial read is undone */
-extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
-extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
-extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
-
-extern int strbuf_getline(struct strbuf *, FILE *, int);
-
-extern void stripspace(struct strbuf *buf, int skip_comments);
-extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
-
-extern int strbuf_branchname(struct strbuf *sb, const char *name);
-extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
-
-#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/usage.c
deleted file mode 100644
index 7a10421..0000000
--- a/Documentation/perf_counter/usage.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * GIT - The information manager from hell
- *
- * Copyright (C) Linus Torvalds, 2005
- */
-#include "util.h"
-
-static void report(const char *prefix, const char *err, va_list params)
-{
-	char msg[1024];
-	vsnprintf(msg, sizeof(msg), err, params);
-	fprintf(stderr, "%s%s\n", prefix, msg);
-}
-
-static NORETURN void usage_builtin(const char *err)
-{
-	fprintf(stderr, "usage: %s\n", err);
-	exit(129);
-}
-
-static NORETURN void die_builtin(const char *err, va_list params)
-{
-	report("fatal: ", err, params);
-	exit(128);
-}
-
-static void error_builtin(const char *err, va_list params)
-{
-	report("error: ", err, params);
-}
-
-static void warn_builtin(const char *warn, va_list params)
-{
-	report("warning: ", warn, params);
-}
-
-/* If we are in a dlopen()ed .so write to a global variable would segfault
- * (ugh), so keep things static. */
-static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
-static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
-static void (*error_routine)(const char *err, va_list params) = error_builtin;
-static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
-
-void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
-{
-	die_routine = routine;
-}
-
-void usage(const char *err)
-{
-	usage_routine(err);
-}
-
-void die(const char *err, ...)
-{
-	va_list params;
-
-	va_start(params, err);
-	die_routine(err, params);
-	va_end(params);
-}
-
-int error(const char *err, ...)
-{
-	va_list params;
-
-	va_start(params, err);
-	error_routine(err, params);
-	va_end(params);
-	return -1;
-}
-
-void warning(const char *warn, ...)
-{
-	va_list params;
-
-	va_start(params, warn);
-	warn_routine(warn, params);
-	va_end(params);
-}
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h
deleted file mode 100644
index 36e40c3..0000000
--- a/Documentation/perf_counter/util.h
+++ /dev/null
@@ -1,408 +0,0 @@
-#ifndef GIT_COMPAT_UTIL_H
-#define GIT_COMPAT_UTIL_H
-
-#define _FILE_OFFSET_BITS 64
-
-#ifndef FLEX_ARRAY
-/*
- * See if our compiler is known to support flexible array members.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
-# define FLEX_ARRAY /* empty */
-#elif defined(__GNUC__)
-# if (__GNUC__ >= 3)
-#  define FLEX_ARRAY /* empty */
-# else
-#  define FLEX_ARRAY 0 /* older GNU extension */
-# endif
-#endif
-
-/*
- * Otherwise, default to safer but a bit wasteful traditional style
- */
-#ifndef FLEX_ARRAY
-# define FLEX_ARRAY 1
-#endif
-#endif
-
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-
-#ifdef __GNUC__
-#define TYPEOF(x) (__typeof__(x))
-#else
-#define TYPEOF(x)
-#endif
-
-#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
-#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
-
-/* Approximation of the length of the decimal representation of this type. */
-#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
-
-#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
-#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
-#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
-#endif
-#define _ALL_SOURCE 1
-#define _GNU_SOURCE 1
-#define _BSD_SOURCE 1
-
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <sys/time.h>
-#include <time.h>
-#include <signal.h>
-#include <fnmatch.h>
-#include <assert.h>
-#include <regex.h>
-#include <utime.h>
-#ifndef __MINGW32__
-#include <sys/wait.h>
-#include <sys/poll.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#ifndef NO_SYS_SELECT_H
-#include <sys/select.h>
-#endif
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <arpa/inet.h>
-#include <netdb.h>
-#include <pwd.h>
-#include <inttypes.h>
-#if defined(__CYGWIN__)
-#undef _XOPEN_SOURCE
-#include <grp.h>
-#define _XOPEN_SOURCE 600
-#include "compat/cygwin.h"
-#else
-#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
-#include <grp.h>
-#define _ALL_SOURCE 1
-#endif
-#else 	/* __MINGW32__ */
-/* pull in Windows compatibility stuff */
-#include "compat/mingw.h"
-#endif	/* __MINGW32__ */
-
-#ifndef NO_ICONV
-#include <iconv.h>
-#endif
-
-#ifndef NO_OPENSSL
-#include <openssl/ssl.h>
-#include <openssl/err.h>
-#endif
-
-/* On most systems <limits.h> would have given us this, but
- * not on some systems (e.g. GNU/Hurd).
- */
-#ifndef PATH_MAX
-#define PATH_MAX 4096
-#endif
-
-#ifndef PRIuMAX
-#define PRIuMAX "llu"
-#endif
-
-#ifndef PRIu32
-#define PRIu32 "u"
-#endif
-
-#ifndef PRIx32
-#define PRIx32 "x"
-#endif
-
-#ifndef PATH_SEP
-#define PATH_SEP ':'
-#endif
-
-#ifndef STRIP_EXTENSION
-#define STRIP_EXTENSION ""
-#endif
-
-#ifndef has_dos_drive_prefix
-#define has_dos_drive_prefix(path) 0
-#endif
-
-#ifndef is_dir_sep
-#define is_dir_sep(c) ((c) == '/')
-#endif
-
-#ifdef __GNUC__
-#define NORETURN __attribute__((__noreturn__))
-#else
-#define NORETURN
-#ifndef __attribute__
-#define __attribute__(x)
-#endif
-#endif
-
-/* General helper functions */
-extern void usage(const char *err) NORETURN;
-extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
-extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
-extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
-
-extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
-
-extern int prefixcmp(const char *str, const char *prefix);
-extern time_t tm_to_time_t(const struct tm *tm);
-
-static inline const char *skip_prefix(const char *str, const char *prefix)
-{
-	size_t len = strlen(prefix);
-	return strncmp(str, prefix, len) ? NULL : str + len;
-}
-
-#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
-
-#ifndef PROT_READ
-#define PROT_READ 1
-#define PROT_WRITE 2
-#define MAP_PRIVATE 1
-#define MAP_FAILED ((void*)-1)
-#endif
-
-#define mmap git_mmap
-#define munmap git_munmap
-extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
-extern int git_munmap(void *start, size_t length);
-
-#else /* NO_MMAP || USE_WIN32_MMAP */
-
-#include <sys/mman.h>
-
-#endif /* NO_MMAP || USE_WIN32_MMAP */
-
-#ifdef NO_MMAP
-
-/* This value must be multiple of (pagesize * 2) */
-#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
-
-#else /* NO_MMAP */
-
-/* This value must be multiple of (pagesize * 2) */
-#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
-	(sizeof(void*) >= 8 \
-		?  1 * 1024 * 1024 * 1024 \
-		: 32 * 1024 * 1024)
-
-#endif /* NO_MMAP */
-
-#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
-#define on_disk_bytes(st) ((st).st_size)
-#else
-#define on_disk_bytes(st) ((st).st_blocks * 512)
-#endif
-
-#define DEFAULT_PACKED_GIT_LIMIT \
-	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
-
-#ifdef NO_PREAD
-#define pread git_pread
-extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
-#endif
-/*
- * Forward decl that will remind us if its twin in cache.h changes.
- * This function is used in compat/pread.c.  But we can't include
- * cache.h there.
- */
-extern ssize_t read_in_full(int fd, void *buf, size_t count);
-
-#ifdef NO_SETENV
-#define setenv gitsetenv
-extern int gitsetenv(const char *, const char *, int);
-#endif
-
-#ifdef NO_MKDTEMP
-#define mkdtemp gitmkdtemp
-extern char *gitmkdtemp(char *);
-#endif
-
-#ifdef NO_UNSETENV
-#define unsetenv gitunsetenv
-extern void gitunsetenv(const char *);
-#endif
-
-#ifdef NO_STRCASESTR
-#define strcasestr gitstrcasestr
-extern char *gitstrcasestr(const char *haystack, const char *needle);
-#endif
-
-#ifdef NO_STRLCPY
-#define strlcpy gitstrlcpy
-extern size_t gitstrlcpy(char *, const char *, size_t);
-#endif
-
-#ifdef NO_STRTOUMAX
-#define strtoumax gitstrtoumax
-extern uintmax_t gitstrtoumax(const char *, char **, int);
-#endif
-
-#ifdef NO_HSTRERROR
-#define hstrerror githstrerror
-extern const char *githstrerror(int herror);
-#endif
-
-#ifdef NO_MEMMEM
-#define memmem gitmemmem
-void *gitmemmem(const void *haystack, size_t haystacklen,
-                const void *needle, size_t needlelen);
-#endif
-
-#ifdef FREAD_READS_DIRECTORIES
-#ifdef fopen
-#undef fopen
-#endif
-#define fopen(a,b) git_fopen(a,b)
-extern FILE *git_fopen(const char*, const char*);
-#endif
-
-#ifdef SNPRINTF_RETURNS_BOGUS
-#define snprintf git_snprintf
-extern int git_snprintf(char *str, size_t maxsize,
-			const char *format, ...);
-#define vsnprintf git_vsnprintf
-extern int git_vsnprintf(char *str, size_t maxsize,
-			 const char *format, va_list ap);
-#endif
-
-#ifdef __GLIBC_PREREQ
-#if __GLIBC_PREREQ(2, 1)
-#define HAVE_STRCHRNUL
-#endif
-#endif
-
-#ifndef HAVE_STRCHRNUL
-#define strchrnul gitstrchrnul
-static inline char *gitstrchrnul(const char *s, int c)
-{
-	while (*s && *s != c)
-		s++;
-	return (char *)s;
-}
-#endif
-
-/*
- * Wrappers:
- */
-extern char *xstrdup(const char *str);
-extern void *xmalloc(size_t size);
-extern void *xmemdupz(const void *data, size_t len);
-extern char *xstrndup(const char *str, size_t len);
-extern void *xrealloc(void *ptr, size_t size);
-extern void *xcalloc(size_t nmemb, size_t size);
-extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
-extern ssize_t xread(int fd, void *buf, size_t len);
-extern ssize_t xwrite(int fd, const void *buf, size_t len);
-extern int xdup(int fd);
-extern FILE *xfdopen(int fd, const char *mode);
-static inline size_t xsize_t(off_t len)
-{
-	return (size_t)len;
-}
-
-static inline int has_extension(const char *filename, const char *ext)
-{
-	size_t len = strlen(filename);
-	size_t extlen = strlen(ext);
-	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
-}
-
-/* Sane ctype - no locale, and works with signed chars */
-#undef isascii
-#undef isspace
-#undef isdigit
-#undef isalpha
-#undef isalnum
-#undef tolower
-#undef toupper
-extern unsigned char sane_ctype[256];
-#define GIT_SPACE 0x01
-#define GIT_DIGIT 0x02
-#define GIT_ALPHA 0x04
-#define GIT_GLOB_SPECIAL 0x08
-#define GIT_REGEX_SPECIAL 0x10
-#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
-#define isascii(x) (((x) & ~0x7f) == 0)
-#define isspace(x) sane_istest(x,GIT_SPACE)
-#define isdigit(x) sane_istest(x,GIT_DIGIT)
-#define isalpha(x) sane_istest(x,GIT_ALPHA)
-#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
-#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
-#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
-#define tolower(x) sane_case((unsigned char)(x), 0x20)
-#define toupper(x) sane_case((unsigned char)(x), 0)
-
-static inline int sane_case(int x, int high)
-{
-	if (sane_istest(x, GIT_ALPHA))
-		x = (x & ~0x20) | high;
-	return x;
-}
-
-static inline int strtoul_ui(char const *s, int base, unsigned int *result)
-{
-	unsigned long ul;
-	char *p;
-
-	errno = 0;
-	ul = strtoul(s, &p, base);
-	if (errno || *p || p == s || (unsigned int) ul != ul)
-		return -1;
-	*result = ul;
-	return 0;
-}
-
-static inline int strtol_i(char const *s, int base, int *result)
-{
-	long ul;
-	char *p;
-
-	errno = 0;
-	ul = strtol(s, &p, base);
-	if (errno || *p || p == s || (int) ul != ul)
-		return -1;
-	*result = ul;
-	return 0;
-}
-
-#ifdef INTERNAL_QSORT
-void git_qsort(void *base, size_t nmemb, size_t size,
-	       int(*compar)(const void *, const void *));
-#define qsort git_qsort
-#endif
-
-#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
-# define FORCE_DIR_SET_GID S_ISGID
-#else
-# define FORCE_DIR_SET_GID 0
-#endif
-
-#ifdef NO_NSEC
-#undef USE_NSEC
-#define ST_CTIME_NSEC(st) 0
-#define ST_MTIME_NSEC(st) 0
-#else
-#ifdef USE_ST_TIMESPEC
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
-#else
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
-#endif
-#endif
-
-#endif
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
new file mode 100755
index 0000000..c561d15
--- /dev/null
+++ b/Documentation/perf_counter/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "PERF_VERSION = $VN"
+	echo "PERF_VERSION = $VN" >$GVF
+}
+
+
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
new file mode 100644
index 0000000..649f34f
--- /dev/null
+++ b/Documentation/perf_counter/util/abspath.c
@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+int is_directory(const char *path)
+{
+	struct stat st;
+	return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+	char cwd[1024] = "";
+	int buf_index = 1, len;
+
+	int depth = MAXDEPTH;
+	char *last_elem = NULL;
+	struct stat st;
+
+	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+		die ("Too long path: %.*s", 60, path);
+
+	while (depth--) {
+		if (!is_directory(buf)) {
+			char *last_slash = strrchr(buf, '/');
+			if (last_slash) {
+				*last_slash = '\0';
+				last_elem = xstrdup(last_slash + 1);
+			} else {
+				last_elem = xstrdup(buf);
+				*buf = '\0';
+			}
+		}
+
+		if (*buf) {
+			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+				die ("Could not get current working directory");
+
+			if (chdir(buf))
+				die ("Could not switch to '%s'", buf);
+		}
+		if (!getcwd(buf, PATH_MAX))
+			die ("Could not get current working directory");
+
+		if (last_elem) {
+			int len = strlen(buf);
+			if (len + strlen(last_elem) + 2 > PATH_MAX)
+				die ("Too long path name: '%s/%s'",
+						buf, last_elem);
+			buf[len] = '/';
+			strcpy(buf + len + 1, last_elem);
+			free(last_elem);
+			last_elem = NULL;
+		}
+
+		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+			len = readlink(buf, next_buf, PATH_MAX);
+			if (len < 0)
+				die ("Invalid symlink: %s", buf);
+			if (PATH_MAX <= len)
+				die("symbolic link too long: %s", buf);
+			next_buf[len] = '\0';
+			buf = next_buf;
+			buf_index = 1 - buf_index;
+			next_buf = bufs[buf_index];
+		} else
+			break;
+	}
+
+	if (*cwd && chdir(cwd))
+		die ("Could not change back to '%s'", cwd);
+
+	return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}
diff --git a/Documentation/perf_counter/util/alias.c b/Documentation/perf_counter/util/alias.c
new file mode 100644
index 0000000..9b3dd2b
--- /dev/null
+++ b/Documentation/perf_counter/util/alias.c
@@ -0,0 +1,77 @@
+#include "cache.h"
+
+static const char *alias_key;
+static char *alias_val;
+
+static int alias_lookup_cb(const char *k, const char *v, void *cb)
+{
+	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
+		if (!v)
+			return config_error_nonbool(k);
+		alias_val = strdup(v);
+		return 0;
+	}
+	return 0;
+}
+
+char *alias_lookup(const char *alias)
+{
+	alias_key = alias;
+	alias_val = NULL;
+	perf_config(alias_lookup_cb, NULL);
+	return alias_val;
+}
+
+int split_cmdline(char *cmdline, const char ***argv)
+{
+	int src, dst, count = 0, size = 16;
+	char quoted = 0;
+
+	*argv = malloc(sizeof(char*) * size);
+
+	/* split alias_string */
+	(*argv)[count++] = cmdline;
+	for (src = dst = 0; cmdline[src];) {
+		char c = cmdline[src];
+		if (!quoted && isspace(c)) {
+			cmdline[dst++] = 0;
+			while (cmdline[++src]
+					&& isspace(cmdline[src]))
+				; /* skip */
+			if (count >= size) {
+				size += 16;
+				*argv = realloc(*argv, sizeof(char*) * size);
+			}
+			(*argv)[count++] = cmdline + dst;
+		} else if (!quoted && (c == '\'' || c == '"')) {
+			quoted = c;
+			src++;
+		} else if (c == quoted) {
+			quoted = 0;
+			src++;
+		} else {
+			if (c == '\\' && quoted != '\'') {
+				src++;
+				c = cmdline[src];
+				if (!c) {
+					free(*argv);
+					*argv = NULL;
+					return error("cmdline ends with \\");
+				}
+			}
+			cmdline[dst++] = c;
+			src++;
+		}
+	}
+
+	cmdline[dst] = 0;
+
+	if (quoted) {
+		free(*argv);
+		*argv = NULL;
+		return error("unclosed quote");
+	}
+
+	return count;
+}
+
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
new file mode 100644
index 0000000..7108051
--- /dev/null
+++ b/Documentation/perf_counter/util/cache.h
@@ -0,0 +1,117 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include "util.h"
+#include "strbuf.h"
+
+#define PERF_DIR_ENVIRONMENT "PERF_DIR"
+#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
+#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
+#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
+#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
+#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
+#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
+#define CONFIG_ENVIRONMENT "PERF_CONFIG"
+#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
+#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
+#define PERFATTRIBUTES_FILE ".perfattributes"
+#define INFOATTRIBUTES_FILE "info/attributes"
+#define ATTRIBUTE_MACRO_PREFIX "[attr]"
+
+typedef int (*config_fn_t)(const char *, const char *, void *);
+extern int perf_default_config(const char *, const char *, void *);
+extern int perf_config_from_file(config_fn_t fn, const char *, void *);
+extern int perf_config(config_fn_t fn, void *);
+extern int perf_parse_ulong(const char *, unsigned long *);
+extern int perf_config_int(const char *, const char *);
+extern unsigned long perf_config_ulong(const char *, const char *);
+extern int perf_config_bool_or_int(const char *, const char *, int *);
+extern int perf_config_bool(const char *, const char *);
+extern int perf_config_string(const char **, const char *, const char *);
+extern int perf_config_set(const char *, const char *);
+extern int perf_config_set_multivar(const char *, const char *, const char *, int);
+extern int perf_config_rename_section(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
+extern int check_repository_format_version(const char *var, const char *value, void *cb);
+extern int perf_config_system(void);
+extern int perf_config_global(void);
+extern int config_error_nonbool(const char *);
+extern const char *config_exclusive_filename;
+
+#define MAX_PERFNAME (1000)
+extern char perf_default_email[MAX_PERFNAME];
+extern char perf_default_name[MAX_PERFNAME];
+extern int user_ident_explicitly_given;
+
+extern const char *perf_log_output_encoding;
+extern const char *perf_mailmap_file;
+
+/* IO helper functions */
+extern void maybe_flush_or_die(FILE *, const char *);
+extern int copy_fd(int ifd, int ofd);
+extern int copy_file(const char *dst, const char *src, int mode);
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+extern ssize_t write_in_full(int fd, const void *buf, size_t count);
+extern void write_or_die(int fd, const void *buf, size_t count);
+extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
+extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
+extern void fsync_or_die(int fd, const char *);
+
+/* pager.c */
+extern void setup_pager(void);
+extern const char *pager_program;
+extern int pager_in_use(void);
+extern int pager_use_color;
+
+extern const char *editor_program;
+extern const char *excludes_file;
+
+char *alias_lookup(const char *alias);
+int split_cmdline(char *cmdline, const char ***argv);
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+
+static inline int is_absolute_path(const char *path)
+{
+	return path[0] == '/';
+}
+
+const char *make_absolute_path(const char *path);
+const char *make_nonrelative_path(const char *path);
+const char *make_relative_path(const char *abs, const char *base);
+int normalize_path_copy(char *dst, const char *src);
+int longest_ancestor_length(const char *path, const char *prefix_list);
+char *strip_path_suffix(const char *path, const char *suffix);
+
+extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+
+extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_pathdup(const char *fmt, ...)
+	__attribute__((format (printf, 1, 2)));
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/util/config.c b/Documentation/perf_counter/util/config.c
new file mode 100644
index 0000000..3dd13fa
--- /dev/null
+++ b/Documentation/perf_counter/util/config.c
@@ -0,0 +1,873 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ * Copyright (C) Johannes Schindelin, 2005
+ *
+ */
+#include "util.h"
+#include "cache.h"
+#include "exec_cmd.h"
+
+#define MAXNAME (256)
+
+static FILE *config_file;
+static const char *config_file_name;
+static int config_linenr;
+static int config_file_eof;
+
+const char *config_exclusive_filename = NULL;
+
+static int get_next_char(void)
+{
+	int c;
+	FILE *f;
+
+	c = '\n';
+	if ((f = config_file) != NULL) {
+		c = fgetc(f);
+		if (c == '\r') {
+			/* DOS like systems */
+			c = fgetc(f);
+			if (c != '\n') {
+				ungetc(c, f);
+				c = '\r';
+			}
+		}
+		if (c == '\n')
+			config_linenr++;
+		if (c == EOF) {
+			config_file_eof = 1;
+			c = '\n';
+		}
+	}
+	return c;
+}
+
+static char *parse_value(void)
+{
+	static char value[1024];
+	int quote = 0, comment = 0, len = 0, space = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (len >= sizeof(value) - 1)
+			return NULL;
+		if (c == '\n') {
+			if (quote)
+				return NULL;
+			value[len] = 0;
+			return value;
+		}
+		if (comment)
+			continue;
+		if (isspace(c) && !quote) {
+			space = 1;
+			continue;
+		}
+		if (!quote) {
+			if (c == ';' || c == '#') {
+				comment = 1;
+				continue;
+			}
+		}
+		if (space) {
+			if (len)
+				value[len++] = ' ';
+			space = 0;
+		}
+		if (c == '\\') {
+			c = get_next_char();
+			switch (c) {
+			case '\n':
+				continue;
+			case 't':
+				c = '\t';
+				break;
+			case 'b':
+				c = '\b';
+				break;
+			case 'n':
+				c = '\n';
+				break;
+			/* Some characters escape as themselves */
+			case '\\': case '"':
+				break;
+			/* Reject unknown escape sequences */
+			default:
+				return NULL;
+			}
+			value[len++] = c;
+			continue;
+		}
+		if (c == '"') {
+			quote = 1-quote;
+			continue;
+		}
+		value[len++] = c;
+	}
+}
+
+static inline int iskeychar(int c)
+{
+	return isalnum(c) || c == '-';
+}
+
+static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
+{
+	int c;
+	char *value;
+
+	/* Get the full name */
+	for (;;) {
+		c = get_next_char();
+		if (config_file_eof)
+			break;
+		if (!iskeychar(c))
+			break;
+		name[len++] = tolower(c);
+		if (len >= MAXNAME)
+			return -1;
+	}
+	name[len] = 0;
+	while (c == ' ' || c == '\t')
+		c = get_next_char();
+
+	value = NULL;
+	if (c != '\n') {
+		if (c != '=')
+			return -1;
+		value = parse_value();
+		if (!value)
+			return -1;
+	}
+	return fn(name, value, data);
+}
+
+static int get_extended_base_var(char *name, int baselen, int c)
+{
+	do {
+		if (c == '\n')
+			return -1;
+		c = get_next_char();
+	} while (isspace(c));
+
+	/* We require the format to be '[base "extension"]' */
+	if (c != '"')
+		return -1;
+	name[baselen++] = '.';
+
+	for (;;) {
+		int c = get_next_char();
+		if (c == '\n')
+			return -1;
+		if (c == '"')
+			break;
+		if (c == '\\') {
+			c = get_next_char();
+			if (c == '\n')
+				return -1;
+		}
+		name[baselen++] = c;
+		if (baselen > MAXNAME / 2)
+			return -1;
+	}
+
+	/* Final ']' */
+	if (get_next_char() != ']')
+		return -1;
+	return baselen;
+}
+
+static int get_base_var(char *name)
+{
+	int baselen = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (config_file_eof)
+			return -1;
+		if (c == ']')
+			return baselen;
+		if (isspace(c))
+			return get_extended_base_var(name, baselen, c);
+		if (!iskeychar(c) && c != '.')
+			return -1;
+		if (baselen > MAXNAME / 2)
+			return -1;
+		name[baselen++] = tolower(c);
+	}
+}
+
+static int perf_parse_file(config_fn_t fn, void *data)
+{
+	int comment = 0;
+	int baselen = 0;
+	static char var[MAXNAME];
+
+	/* U+FEFF Byte Order Mark in UTF8 */
+	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
+	const unsigned char *bomptr = utf8_bom;
+
+	for (;;) {
+		int c = get_next_char();
+		if (bomptr && *bomptr) {
+			/* We are at the file beginning; skip UTF8-encoded BOM
+			 * if present. Sane editors won't put this in on their
+			 * own, but e.g. Windows Notepad will do it happily. */
+			if ((unsigned char) c == *bomptr) {
+				bomptr++;
+				continue;
+			} else {
+				/* Do not tolerate partial BOM. */
+				if (bomptr != utf8_bom)
+					break;
+				/* No BOM at file beginning. Cool. */
+				bomptr = NULL;
+			}
+		}
+		if (c == '\n') {
+			if (config_file_eof)
+				return 0;
+			comment = 0;
+			continue;
+		}
+		if (comment || isspace(c))
+			continue;
+		if (c == '#' || c == ';') {
+			comment = 1;
+			continue;
+		}
+		if (c == '[') {
+			baselen = get_base_var(var);
+			if (baselen <= 0)
+				break;
+			var[baselen++] = '.';
+			var[baselen] = 0;
+			continue;
+		}
+		if (!isalpha(c))
+			break;
+		var[baselen] = tolower(c);
+		if (get_value(fn, data, var, baselen+1) < 0)
+			break;
+	}
+	die("bad config file line %d in %s", config_linenr, config_file_name);
+}
+
+static int parse_unit_factor(const char *end, unsigned long *val)
+{
+	if (!*end)
+		return 1;
+	else if (!strcasecmp(end, "k")) {
+		*val *= 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "m")) {
+		*val *= 1024 * 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "g")) {
+		*val *= 1024 * 1024 * 1024;
+		return 1;
+	}
+	return 0;
+}
+
+static int perf_parse_long(const char *value, long *ret)
+{
+	if (value && *value) {
+		char *end;
+		long val = strtol(value, &end, 0);
+		unsigned long factor = 1;
+		if (!parse_unit_factor(end, &factor))
+			return 0;
+		*ret = val * factor;
+		return 1;
+	}
+	return 0;
+}
+
+int perf_parse_ulong(const char *value, unsigned long *ret)
+{
+	if (value && *value) {
+		char *end;
+		unsigned long val = strtoul(value, &end, 0);
+		if (!parse_unit_factor(end, &val))
+			return 0;
+		*ret = val;
+		return 1;
+	}
+	return 0;
+}
+
+static void die_bad_config(const char *name)
+{
+	if (config_file_name)
+		die("bad config value for '%s' in %s", name, config_file_name);
+	die("bad config value for '%s'", name);
+}
+
+int perf_config_int(const char *name, const char *value)
+{
+	long ret = 0;
+	if (!perf_parse_long(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+unsigned long perf_config_ulong(const char *name, const char *value)
+{
+	unsigned long ret;
+	if (!perf_parse_ulong(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
+{
+	*is_bool = 1;
+	if (!value)
+		return 1;
+	if (!*value)
+		return 0;
+	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
+		return 1;
+	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
+		return 0;
+	*is_bool = 0;
+	return perf_config_int(name, value);
+}
+
+int perf_config_bool(const char *name, const char *value)
+{
+	int discard;
+	return !!perf_config_bool_or_int(name, value, &discard);
+}
+
+int perf_config_string(const char **dest, const char *var, const char *value)
+{
+	if (!value)
+		return config_error_nonbool(var);
+	*dest = strdup(value);
+	return 0;
+}
+
+static int perf_default_core_config(const char *var, const char *value)
+{
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_default_config(const char *var, const char *value, void *dummy)
+{
+	if (!prefixcmp(var, "core."))
+		return perf_default_core_config(var, value);
+
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
+{
+	int ret;
+	FILE *f = fopen(filename, "r");
+
+	ret = -1;
+	if (f) {
+		config_file = f;
+		config_file_name = filename;
+		config_linenr = 1;
+		config_file_eof = 0;
+		ret = perf_parse_file(fn, data);
+		fclose(f);
+		config_file_name = NULL;
+	}
+	return ret;
+}
+
+const char *perf_etc_perfconfig(void)
+{
+	static const char *system_wide;
+	if (!system_wide)
+		system_wide = system_path(ETC_PERFCONFIG);
+	return system_wide;
+}
+
+static int perf_env_bool(const char *k, int def)
+{
+	const char *v = getenv(k);
+	return v ? perf_config_bool(k, v) : def;
+}
+
+int perf_config_system(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
+}
+
+int perf_config_global(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
+}
+
+int perf_config(config_fn_t fn, void *data)
+{
+	int ret = 0, found = 0;
+	char *repo_config = NULL;
+	const char *home = NULL;
+
+	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+	if (config_exclusive_filename)
+		return perf_config_from_file(fn, config_exclusive_filename, data);
+	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
+					    data);
+		found += 1;
+	}
+
+	home = getenv("HOME");
+	if (perf_config_global() && home) {
+		char *user_config = strdup(mkpath("%s/.perfconfig", home));
+		if (!access(user_config, R_OK)) {
+			ret += perf_config_from_file(fn, user_config, data);
+			found += 1;
+		}
+		free(user_config);
+	}
+
+	repo_config = perf_pathdup("config");
+	if (!access(repo_config, R_OK)) {
+		ret += perf_config_from_file(fn, repo_config, data);
+		found += 1;
+	}
+	free(repo_config);
+	if (found == 0)
+		return -1;
+	return ret;
+}
+
+/*
+ * Find all the stuff for perf_config_set() below.
+ */
+
+#define MAX_MATCHES 512
+
+static struct {
+	int baselen;
+	char* key;
+	int do_not_match;
+	regex_t* value_regex;
+	int multi_replace;
+	size_t offset[MAX_MATCHES];
+	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
+	int seen;
+} store;
+
+static int matches(const char* key, const char* value)
+{
+	return !strcmp(key, store.key) &&
+		(store.value_regex == NULL ||
+		 (store.do_not_match ^
+		  !regexec(store.value_regex, value, 0, NULL, 0)));
+}
+
+static int store_aux(const char* key, const char* value, void *cb)
+{
+	const char *ep;
+	size_t section_len;
+
+	switch (store.state) {
+	case KEY_SEEN:
+		if (matches(key, value)) {
+			if (store.seen == 1 && store.multi_replace == 0) {
+				warning("%s has multiple values", key);
+			} else if (store.seen >= MAX_MATCHES) {
+				error("too many matches for %s", key);
+				return 1;
+			}
+
+			store.offset[store.seen] = ftell(config_file);
+			store.seen++;
+		}
+		break;
+	case SECTION_SEEN:
+		/*
+		 * What we are looking for is in store.key (both
+		 * section and var), and its section part is baselen
+		 * long.  We found key (again, both section and var).
+		 * We would want to know if this key is in the same
+		 * section as what we are looking for.  We already
+		 * know we are in the same section as what should
+		 * hold store.key.
+		 */
+		ep = strrchr(key, '.');
+		section_len = ep - key;
+
+		if ((section_len != store.baselen) ||
+		    memcmp(key, store.key, section_len+1)) {
+			store.state = SECTION_END_SEEN;
+			break;
+		}
+
+		/*
+		 * Do not increment matches: this is no match, but we
+		 * just made sure we are in the desired section.
+		 */
+		store.offset[store.seen] = ftell(config_file);
+		/* fallthru */
+	case SECTION_END_SEEN:
+	case START:
+		if (matches(key, value)) {
+			store.offset[store.seen] = ftell(config_file);
+			store.state = KEY_SEEN;
+			store.seen++;
+		} else {
+			if (strrchr(key, '.') - key == store.baselen &&
+			      !strncmp(key, store.key, store.baselen)) {
+					store.state = SECTION_SEEN;
+					store.offset[store.seen] = ftell(config_file);
+			}
+		}
+	}
+	return 0;
+}
+
+static int store_write_section(int fd, const char* key)
+{
+	const char *dot;
+	int i, success;
+	struct strbuf sb = STRBUF_INIT;
+
+	dot = memchr(key, '.', store.baselen);
+	if (dot) {
+		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
+		for (i = dot - key + 1; i < store.baselen; i++) {
+			if (key[i] == '"' || key[i] == '\\')
+				strbuf_addch(&sb, '\\');
+			strbuf_addch(&sb, key[i]);
+		}
+		strbuf_addstr(&sb, "\"]\n");
+	} else {
+		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
+	}
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static int store_write_pair(int fd, const char* key, const char* value)
+{
+	int i, success;
+	int length = strlen(key + store.baselen + 1);
+	const char *quote = "";
+	struct strbuf sb = STRBUF_INIT;
+
+	/*
+	 * Check to see if the value needs to be surrounded with a dq pair.
+	 * Note that problematic characters are always backslash-quoted; this
+	 * check is about not losing leading or trailing SP and strings that
+	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
+	 * configuration parser.
+	 */
+	if (value[0] == ' ')
+		quote = "\"";
+	for (i = 0; value[i]; i++)
+		if (value[i] == ';' || value[i] == '#')
+			quote = "\"";
+	if (i && value[i - 1] == ' ')
+		quote = "\"";
+
+	strbuf_addf(&sb, "\t%.*s = %s",
+		    length, key + store.baselen + 1, quote);
+
+	for (i = 0; value[i]; i++)
+		switch (value[i]) {
+		case '\n':
+			strbuf_addstr(&sb, "\\n");
+			break;
+		case '\t':
+			strbuf_addstr(&sb, "\\t");
+			break;
+		case '"':
+		case '\\':
+			strbuf_addch(&sb, '\\');
+		default:
+			strbuf_addch(&sb, value[i]);
+			break;
+		}
+	strbuf_addf(&sb, "%s\n", quote);
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static ssize_t find_beginning_of_line(const char* contents, size_t size,
+	size_t offset_, int* found_bracket)
+{
+	size_t equal_offset = size, bracket_offset = size;
+	ssize_t offset;
+
+contline:
+	for (offset = offset_-2; offset > 0
+			&& contents[offset] != '\n'; offset--)
+		switch (contents[offset]) {
+			case '=': equal_offset = offset; break;
+			case ']': bracket_offset = offset; break;
+		}
+	if (offset > 0 && contents[offset-1] == '\\') {
+		offset_ = offset;
+		goto contline;
+	}
+	if (bracket_offset < equal_offset) {
+		*found_bracket = 1;
+		offset = bracket_offset+1;
+	} else
+		offset++;
+
+	return offset;
+}
+
+int perf_config_set(const char* key, const char* value)
+{
+	return perf_config_set_multivar(key, value, NULL, 0);
+}
+
+/*
+ * If value==NULL, unset in (remove from) config,
+ * if value_regex!=NULL, disregard key/value pairs where value does not match.
+ * if multi_replace==0, nothing, or only one matching key/value is replaced,
+ *     else all matching key/values (regardless how many) are removed,
+ *     before the new pair is written.
+ *
+ * Returns 0 on success.
+ *
+ * This function does this:
+ *
+ * - it locks the config file by creating ".perf/config.lock"
+ *
+ * - it then parses the config using store_aux() as validator to find
+ *   the position on the key/value pair to replace. If it is to be unset,
+ *   it must be found exactly once.
+ *
+ * - the config file is mmap()ed and the part before the match (if any) is
+ *   written to the lock file, then the changed part and the rest.
+ *
+ * - the config file is removed and the lock file rename()d to it.
+ *
+ */
+int perf_config_set_multivar(const char* key, const char* value,
+	const char* value_regex, int multi_replace)
+{
+	int i, dot;
+	int fd = -1, in_fd;
+	int ret = 0;
+	char* config_filename;
+	const char* last_dot = strrchr(key, '.');
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+
+	/*
+	 * Since "key" actually contains the section name and the real
+	 * key name separated by a dot, we have to know where the dot is.
+	 */
+
+	if (last_dot == NULL) {
+		error("key does not contain a section: %s", key);
+		ret = 2;
+		goto out_free;
+	}
+	store.baselen = last_dot - key;
+
+	store.multi_replace = multi_replace;
+
+	/*
+	 * Validate the key and while at it, lower case it for matching.
+	 */
+	store.key = malloc(strlen(key) + 1);
+	dot = 0;
+	for (i = 0; key[i]; i++) {
+		unsigned char c = key[i];
+		if (c == '.')
+			dot = 1;
+		/* Leave the extended basename untouched.. */
+		if (!dot || i > store.baselen) {
+			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
+				error("invalid key: %s", key);
+				free(store.key);
+				ret = 1;
+				goto out_free;
+			}
+			c = tolower(c);
+		} else if (c == '\n') {
+			error("invalid key (newline): %s", key);
+			free(store.key);
+			ret = 1;
+			goto out_free;
+		}
+		store.key[i] = c;
+	}
+	store.key[i] = 0;
+
+	/*
+	 * If .perf/config does not exist yet, write a minimal version.
+	 */
+	in_fd = open(config_filename, O_RDONLY);
+	if ( in_fd < 0 ) {
+		free(store.key);
+
+		if ( ENOENT != errno ) {
+			error("opening %s: %s", config_filename,
+			      strerror(errno));
+			ret = 3; /* same as "invalid config file" */
+			goto out_free;
+		}
+		/* if nothing to unset, error out */
+		if (value == NULL) {
+			ret = 5;
+			goto out_free;
+		}
+
+		store.key = (char*)key;
+		if (!store_write_section(fd, key) ||
+		    !store_write_pair(fd, key, value))
+			goto write_err_out;
+	} else {
+		struct stat st;
+		char* contents;
+		size_t contents_sz, copy_begin, copy_end;
+		int i, new_line = 0;
+
+		if (value_regex == NULL)
+			store.value_regex = NULL;
+		else {
+			if (value_regex[0] == '!') {
+				store.do_not_match = 1;
+				value_regex++;
+			} else
+				store.do_not_match = 0;
+
+			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
+			if (regcomp(store.value_regex, value_regex,
+					REG_EXTENDED)) {
+				error("invalid pattern: %s", value_regex);
+				free(store.value_regex);
+				ret = 6;
+				goto out_free;
+			}
+		}
+
+		store.offset[0] = 0;
+		store.state = START;
+		store.seen = 0;
+
+		/*
+		 * After this, store.offset will contain the *end* offset
+		 * of the last match, or remain at 0 if no match was found.
+		 * As a side effect, we make sure to transform only a valid
+		 * existing config file.
+		 */
+		if (perf_config_from_file(store_aux, config_filename, NULL)) {
+			error("invalid config file %s", config_filename);
+			free(store.key);
+			if (store.value_regex != NULL) {
+				regfree(store.value_regex);
+				free(store.value_regex);
+			}
+			ret = 3;
+			goto out_free;
+		}
+
+		free(store.key);
+		if (store.value_regex != NULL) {
+			regfree(store.value_regex);
+			free(store.value_regex);
+		}
+
+		/* if nothing to unset, or too many matches, error out */
+		if ((store.seen == 0 && value == NULL) ||
+				(store.seen > 1 && multi_replace == 0)) {
+			ret = 5;
+			goto out_free;
+		}
+
+		fstat(in_fd, &st);
+		contents_sz = xsize_t(st.st_size);
+		contents = mmap(NULL, contents_sz, PROT_READ,
+			MAP_PRIVATE, in_fd, 0);
+		close(in_fd);
+
+		if (store.seen == 0)
+			store.seen = 1;
+
+		for (i = 0, copy_begin = 0; i < store.seen; i++) {
+			if (store.offset[i] == 0) {
+				store.offset[i] = copy_end = contents_sz;
+			} else if (store.state != KEY_SEEN) {
+				copy_end = store.offset[i];
+			} else
+				copy_end = find_beginning_of_line(
+					contents, contents_sz,
+					store.offset[i]-2, &new_line);
+
+			if (copy_end > 0 && contents[copy_end-1] != '\n')
+				new_line = 1;
+
+			/* write the first part of the config */
+			if (copy_end > copy_begin) {
+				if (write_in_full(fd, contents + copy_begin,
+						  copy_end - copy_begin) <
+				    copy_end - copy_begin)
+					goto write_err_out;
+				if (new_line &&
+				    write_in_full(fd, "\n", 1) != 1)
+					goto write_err_out;
+			}
+			copy_begin = store.offset[i];
+		}
+
+		/* write the pair (value == NULL means unset) */
+		if (value != NULL) {
+			if (store.state == START) {
+				if (!store_write_section(fd, key))
+					goto write_err_out;
+			}
+			if (!store_write_pair(fd, key, value))
+				goto write_err_out;
+		}
+
+		/* write the rest of the config */
+		if (copy_begin < contents_sz)
+			if (write_in_full(fd, contents + copy_begin,
+					  contents_sz - copy_begin) <
+			    contents_sz - copy_begin)
+				goto write_err_out;
+
+		munmap(contents, contents_sz);
+	}
+
+	ret = 0;
+
+out_free:
+	free(config_filename);
+	return ret;
+
+write_err_out:
+	goto out_free;
+
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+	return error("Missing value for '%s'", var);
+}
diff --git a/Documentation/perf_counter/util/ctype.c b/Documentation/perf_counter/util/ctype.c
new file mode 100644
index 0000000..b90ec00
--- /dev/null
+++ b/Documentation/perf_counter/util/ctype.c
@@ -0,0 +1,26 @@
+/*
+ * Sane locale-independent, ASCII ctype.
+ *
+ * No surprises, and works with signed and unsigned chars.
+ */
+#include "cache.h"
+
+enum {
+	S = GIT_SPACE,
+	A = GIT_ALPHA,
+	D = GIT_DIGIT,
+	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
+	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
+};
+
+unsigned char sane_ctype[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
+	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
+	/* Nothing in the 128.. range */
+};
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
new file mode 100644
index 0000000..d392922
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS	32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+	static const char *prefix;
+#else
+	static const char *prefix = PREFIX;
+#endif
+	struct strbuf d = STRBUF_INIT;
+
+	if (is_absolute_path(path))
+		return path;
+
+#ifdef RUNTIME_PREFIX
+	assert(argv0_path);
+	assert(is_absolute_path(argv0_path));
+
+	if (!prefix &&
+	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+		prefix = PREFIX;
+		fprintf(stderr, "RUNTIME_PREFIX requested, "
+				"but prefix computation failed.  "
+				"Using static fallback '%s'.\n", prefix);
+	}
+#endif
+
+	strbuf_addf(&d, "%s/%s", prefix, path);
+	path = strbuf_detach(&d, NULL);
+	return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return slash + 1;
+	}
+
+	return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority, location to look for perf programs. */
+const char *perf_exec_path(void)
+{
+	const char *env;
+
+	if (argv_exec_path)
+		return argv_exec_path;
+
+	env = getenv(EXEC_PATH_ENVIRONMENT);
+	if (env && *env) {
+		return env;
+	}
+
+	return system_path(PERF_EXEC_PATH);
+}
+
+static void add_path(struct strbuf *out, const char *path)
+{
+	if (path && *path) {
+		if (is_absolute_path(path))
+			strbuf_addstr(out, path);
+		else
+			strbuf_addstr(out, make_nonrelative_path(path));
+
+		strbuf_addch(out, PATH_SEP);
+	}
+}
+
+void setup_path(void)
+{
+	const char *old_path = getenv("PATH");
+	struct strbuf new_path = STRBUF_INIT;
+
+	add_path(&new_path, perf_exec_path());
+	add_path(&new_path, argv0_path);
+
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+	else
+		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+	setenv("PATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+const char **prepare_perf_cmd(const char **argv)
+{
+	int argc;
+	const char **nargv;
+
+	for (argc = 0; argv[argc]; argc++)
+		; /* just counting */
+	nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+	nargv[0] = "perf";
+	for (argc = 0; argv[argc]; argc++)
+		nargv[argc + 1] = argv[argc];
+	nargv[argc + 1] = NULL;
+	return nargv;
+}
+
+int execv_perf_cmd(const char **argv) {
+	const char **nargv = prepare_perf_cmd(argv);
+
+	/* execvp() can only ever return if it fails */
+	execvp("perf", (char **)nargv);
+
+	free(nargv);
+	return -1;
+}
+
+
+int execl_perf_cmd(const char *cmd,...)
+{
+	int argc;
+	const char *argv[MAX_ARGS + 1];
+	const char *arg;
+	va_list param;
+
+	va_start(param, cmd);
+	argv[0] = cmd;
+	argc = 1;
+	while (argc < MAX_ARGS) {
+		arg = argv[argc++] = va_arg(param, char *);
+		if (!arg)
+			break;
+	}
+	va_end(param);
+	if (MAX_ARGS <= argc)
+		return error("too many args to run %s", cmd);
+
+	argv[argc] = NULL;
+	return execv_perf_cmd(argv);
+}
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
new file mode 100644
index 0000000..effe25e
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.h
@@ -0,0 +1,13 @@
+#ifndef PERF_EXEC_CMD_H
+#define PERF_EXEC_CMD_H
+
+extern void perf_set_argv_exec_path(const char *exec_path);
+extern const char *perf_extract_argv0_path(const char *path);
+extern const char *perf_exec_path(void);
+extern void setup_path(void);
+extern const char **prepare_perf_cmd(const char **argv);
+extern int execv_perf_cmd(const char **argv); /* NULL terminated */
+extern int execl_perf_cmd(const char *cmd, ...);
+extern const char *system_path(const char *path);
+
+#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
new file mode 100755
index 0000000..f06f6fd
--- /dev/null
+++ b/Documentation/perf_counter/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/perf-'"$cmd"'/H
+     ${
+            x
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+	    p
+     }' "Documentation/perf-$cmd.txt"
+done
+echo "};"
diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
new file mode 100644
index 0000000..edde541
--- /dev/null
+++ b/Documentation/perf_counter/util/help.c
@@ -0,0 +1,366 @@
+#include "cache.h"
+#include "../builtin.h"
+#include "exec_cmd.h"
+#include "levenshtein.h"
+#include "help.h"
+
+/* most GUI terminals set COLUMNS (although some don't export it) */
+static int term_columns(void)
+{
+	char *col_string = getenv("COLUMNS");
+	int n_cols;
+
+	if (col_string && (n_cols = atoi(col_string)) > 0)
+		return n_cols;
+
+#ifdef TIOCGWINSZ
+	{
+		struct winsize ws;
+		if (!ioctl(1, TIOCGWINSZ, &ws)) {
+			if (ws.ws_col)
+				return ws.ws_col;
+		}
+	}
+#endif
+
+	return 80;
+}
+
+void add_cmdname(struct cmdnames *cmds, const char *name, int len)
+{
+	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
+
+	ent->len = len;
+	memcpy(ent->name, name, len);
+	ent->name[len] = 0;
+
+	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
+	cmds->names[cmds->cnt++] = ent;
+}
+
+static void clean_cmdnames(struct cmdnames *cmds)
+{
+	int i;
+	for (i = 0; i < cmds->cnt; ++i)
+		free(cmds->names[i]);
+	free(cmds->names);
+	cmds->cnt = 0;
+	cmds->alloc = 0;
+}
+
+static int cmdname_compare(const void *a_, const void *b_)
+{
+	struct cmdname *a = *(struct cmdname **)a_;
+	struct cmdname *b = *(struct cmdname **)b_;
+	return strcmp(a->name, b->name);
+}
+
+static void uniq(struct cmdnames *cmds)
+{
+	int i, j;
+
+	if (!cmds->cnt)
+		return;
+
+	for (i = j = 1; i < cmds->cnt; i++)
+		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
+			cmds->names[j++] = cmds->names[i];
+
+	cmds->cnt = j;
+}
+
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
+{
+	int ci, cj, ei;
+	int cmp;
+
+	ci = cj = ei = 0;
+	while (ci < cmds->cnt && ei < excludes->cnt) {
+		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
+		if (cmp < 0)
+			cmds->names[cj++] = cmds->names[ci++];
+		else if (cmp == 0)
+			ci++, ei++;
+		else if (cmp > 0)
+			ei++;
+	}
+
+	while (ci < cmds->cnt)
+		cmds->names[cj++] = cmds->names[ci++];
+
+	cmds->cnt = cj;
+}
+
+static void pretty_print_string_list(struct cmdnames *cmds, int longest)
+{
+	int cols = 1, rows;
+	int space = longest + 1; /* min 1 SP between words */
+	int max_cols = term_columns() - 1; /* don't print *on* the edge */
+	int i, j;
+
+	if (space < max_cols)
+		cols = max_cols / space;
+	rows = (cmds->cnt + cols - 1) / cols;
+
+	for (i = 0; i < rows; i++) {
+		printf("  ");
+
+		for (j = 0; j < cols; j++) {
+			int n = j * rows + i;
+			int size = space;
+			if (n >= cmds->cnt)
+				break;
+			if (j == cols-1 || n + rows >= cmds->cnt)
+				size = 1;
+			printf("%-*s", size, cmds->names[n]->name);
+		}
+		putchar('\n');
+	}
+}
+
+static int is_executable(const char *name)
+{
+	struct stat st;
+
+	if (stat(name, &st) || /* stat, not lstat */
+	    !S_ISREG(st.st_mode))
+		return 0;
+
+#ifdef __MINGW32__
+	/* cannot trust the executable bit, peek into the file instead */
+	char buf[3] = { 0 };
+	int n;
+	int fd = open(name, O_RDONLY);
+	st.st_mode &= ~S_IXUSR;
+	if (fd >= 0) {
+		n = read(fd, buf, 2);
+		if (n == 2)
+			/* DOS executables start with "MZ" */
+			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
+				st.st_mode |= S_IXUSR;
+		close(fd);
+	}
+#endif
+	return st.st_mode & S_IXUSR;
+}
+
+static void list_commands_in_dir(struct cmdnames *cmds,
+					 const char *path,
+					 const char *prefix)
+{
+	int prefix_len;
+	DIR *dir = opendir(path);
+	struct dirent *de;
+	struct strbuf buf = STRBUF_INIT;
+	int len;
+
+	if (!dir)
+		return;
+	if (!prefix)
+		prefix = "perf-";
+	prefix_len = strlen(prefix);
+
+	strbuf_addf(&buf, "%s/", path);
+	len = buf.len;
+
+	while ((de = readdir(dir)) != NULL) {
+		int entlen;
+
+		if (prefixcmp(de->d_name, prefix))
+			continue;
+
+		strbuf_setlen(&buf, len);
+		strbuf_addstr(&buf, de->d_name);
+		if (!is_executable(buf.buf))
+			continue;
+
+		entlen = strlen(de->d_name) - prefix_len;
+		if (has_extension(de->d_name, ".exe"))
+			entlen -= 4;
+
+		add_cmdname(cmds, de->d_name + prefix_len, entlen);
+	}
+	closedir(dir);
+	strbuf_release(&buf);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds)
+{
+	const char *env_path = getenv("PATH");
+	const char *exec_path = perf_exec_path();
+
+	if (exec_path) {
+		list_commands_in_dir(main_cmds, exec_path, prefix);
+		qsort(main_cmds->names, main_cmds->cnt,
+		      sizeof(*main_cmds->names), cmdname_compare);
+		uniq(main_cmds);
+	}
+
+	if (env_path) {
+		char *paths, *path, *colon;
+		path = paths = strdup(env_path);
+		while (1) {
+			if ((colon = strchr(path, PATH_SEP)))
+				*colon = 0;
+			if (!exec_path || strcmp(path, exec_path))
+				list_commands_in_dir(other_cmds, path, prefix);
+
+			if (!colon)
+				break;
+			path = colon + 1;
+		}
+		free(paths);
+
+		qsort(other_cmds->names, other_cmds->cnt,
+		      sizeof(*other_cmds->names), cmdname_compare);
+		uniq(other_cmds);
+	}
+	exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < main_cmds->cnt; i++)
+		if (longest < main_cmds->names[i]->len)
+			longest = main_cmds->names[i]->len;
+	for (i = 0; i < other_cmds->cnt; i++)
+		if (longest < other_cmds->names[i]->len)
+			longest = other_cmds->names[i]->len;
+
+	if (main_cmds->cnt) {
+		const char *exec_path = perf_exec_path();
+		printf("available %s in '%s'\n", title, exec_path);
+		printf("----------------");
+		mput_char('-', strlen(title) + strlen(exec_path));
+		putchar('\n');
+		pretty_print_string_list(main_cmds, longest);
+		putchar('\n');
+	}
+
+	if (other_cmds->cnt) {
+		printf("%s available from elsewhere on your $PATH\n", title);
+		printf("---------------------------------------");
+		mput_char('-', strlen(title));
+		putchar('\n');
+		pretty_print_string_list(other_cmds, longest);
+		putchar('\n');
+	}
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+	int i;
+	for (i = 0; i < c->cnt; i++)
+		if (!strcmp(s, c->names[i]->name))
+			return 1;
+	return 0;
+}
+
+static int autocorrect;
+static struct cmdnames aliases;
+
+static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.autocorrect"))
+		autocorrect = perf_config_int(var,value);
+	/* Also use aliases for command lookup */
+	if (!prefixcmp(var, "alias."))
+		add_cmdname(&aliases, var + 6, strlen(var + 6));
+
+	return perf_default_config(var, value, cb);
+}
+
+static int levenshtein_compare(const void *p1, const void *p2)
+{
+	const struct cmdname *const *c1 = p1, *const *c2 = p2;
+	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
+	int l1 = (*c1)->len;
+	int l2 = (*c2)->len;
+	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+	int i;
+	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+	for (i = 0; i < old->cnt; i++)
+		cmds->names[cmds->cnt++] = old->names[i];
+	free(old->names);
+	old->cnt = 0;
+	old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+	int i, n, best_similarity = 0;
+	struct cmdnames main_cmds, other_cmds;
+
+	memset(&main_cmds, 0, sizeof(main_cmds));
+	memset(&other_cmds, 0, sizeof(main_cmds));
+	memset(&aliases, 0, sizeof(aliases));
+
+	perf_config(perf_unknown_cmd_config, NULL);
+
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	add_cmd_list(&main_cmds, &aliases);
+	add_cmd_list(&main_cmds, &other_cmds);
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(main_cmds.names), cmdname_compare);
+	uniq(&main_cmds);
+
+	/* This reuses cmdname->len for similarity index */
+	for (i = 0; i < main_cmds.cnt; ++i)
+		main_cmds.names[i]->len =
+			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(*main_cmds.names), levenshtein_compare);
+
+	if (!main_cmds.cnt)
+		die ("Uh oh. Your system reports no Git commands at all.");
+
+	best_similarity = main_cmds.names[0]->len;
+	n = 1;
+	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+		++n;
+	if (autocorrect && n == 1) {
+		const char *assumed = main_cmds.names[0]->name;
+		main_cmds.names[0] = NULL;
+		clean_cmdnames(&main_cmds);
+		fprintf(stderr, "WARNING: You called a Git program named '%s', "
+			"which does not exist.\n"
+			"Continuing under the assumption that you meant '%s'\n",
+			cmd, assumed);
+		if (autocorrect > 0) {
+			fprintf(stderr, "in %0.1f seconds automatically...\n",
+				(float)autocorrect/10.0);
+			poll(NULL, 0, autocorrect * 100);
+		}
+		return assumed;
+	}
+
+	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+	if (best_similarity < 6) {
+		fprintf(stderr, "\nDid you mean %s?\n",
+			n < 2 ? "this": "one of these");
+
+		for (i = 0; i < n; i++)
+			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+	}
+
+	exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("perf version %s\n", perf_version_string);
+	return 0;
+}
diff --git a/Documentation/perf_counter/util/help.h b/Documentation/perf_counter/util/help.h
new file mode 100644
index 0000000..56bc154
--- /dev/null
+++ b/Documentation/perf_counter/util/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+	int alloc;
+	int cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[FLEX_ARRAY];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/Documentation/perf_counter/util/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
new file mode 100644
index 0000000..e521d15
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substition, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+		int w, int s, int a, int d)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row0 = malloc(sizeof(int) * (len2 + 1));
+	int *row1 = malloc(sizeof(int) * (len2 + 1));
+	int *row2 = malloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 0; j <= len2; j++)
+		row1[j] = j * a;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = (i + 1) * d;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+			/* swap */
+			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+					string1[i] == string2[j - 1] &&
+					row2[j + 1] > row0[j - 1] + w)
+				row2[j + 1] = row0[j - 1] + w;
+			/* deletion */
+			if (row2[j + 1] > row1[j + 1] + d)
+				row2[j + 1] = row1[j + 1] + d;
+			/* insertion */
+			if (row2[j + 1] > row2[j] + a)
+				row2[j + 1] = row2[j] + a;
+		}
+
+		dummy = row0;
+		row0 = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row0);
+	free(row1);
+	free(row2);
+
+	return i;
+}
diff --git a/Documentation/perf_counter/util/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
new file mode 100644
index 0000000..0173abe
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+	int swap_penalty, int substition_penalty,
+	int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
new file mode 100644
index 0000000..28b34c1
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -0,0 +1,492 @@
+#include "util.h"
+#include "parse-options.h"
+#include "cache.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return error("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return error("option `no-%s' %s", opt->long_name, reason);
+	return error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		   int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		     const struct option *opt, int flags)
+{
+	const char *s, *arg;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+			/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_BIT:
+		case OPTION_SET_INT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_INT:
+		*(int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags, (const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(int *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+                          const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value", flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return error("Ambiguous option: %s "
+			"(could be --%s%s or --%s%s)",
+			arg,
+			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+			ambiguous_option->long_name,
+			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+			abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		error ("did you mean `--%s` (with two dashes ?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			error ("did you mean `--%s` (with two dashes ?)", arg);
+			exit(129);
+		}
+	}
+}
+
+void parse_options_start(struct parse_opt_ctx_t *ctx,
+			 int argc, const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc - 1;
+	ctx->argv = argv + 1;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int usage_with_options_internal(const char * const *,
+				       const struct option *, int);
+
+int parse_options_step(struct parse_opt_ctx_t *ctx,
+		       const struct option *options,
+		       const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt, unknown short option leave it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr, options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr, options);
+				case -2:
+					/* fake a short option thing to hide the fact that we may have
+					 * started to parse aggregated stuff
+					 *
+					 * This is leaky, too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options, 1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		  const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			error("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			error("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+int usage_with_options_internal(const char * const *usagestr,
+				const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "   or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_INTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			else
+				pos += fprintf(stderr, " <n>");
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+			/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				else
+					pos += fprintf(stderr, " <%s>", opts->argh);
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				else
+					pos += fprintf(stderr, " ...");
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+			break;
+		}
+
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+			const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+			const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+
+int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
+			   int unset)
+{
+	int *target = opt->value;
+
+	if (unset)
+		/* --no-quiet, --no-verbose */
+		*target = 0;
+	else if (opt->short_name == 'v') {
+		if (*target >= 0)
+			(*target)++;
+		else
+			*target = 1;
+	} else {
+		if (*target <= 0)
+			(*target)--;
+		else
+			*target = -1;
+	}
+	return 0;
+}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
new file mode 100644
index 0000000..a81c7fa
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -0,0 +1,172 @@
+#ifndef PARSE_OPTIONS_H
+#define PARSE_OPTIONS_H
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN, /* _INCR would have been a better name */
+	OPTION_SET_INT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_CALLBACK,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
+	  parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option argments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremantal advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+			       const struct option *opts);
+
+extern void parse_options_start(struct parse_opt_ctx_t *ctx,
+				int argc, const char **argv, int flags);
+
+extern int parse_options_step(struct parse_opt_ctx_t *ctx,
+			      const struct option *options,
+			      const char * const usagestr[]);
+
+extern int parse_options_end(struct parse_opt_ctx_t *ctx);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var)  \
+	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+	  "use <n> digits to display SHA-1s", \
+	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+#endif
diff --git a/Documentation/perf_counter/util/path.c b/Documentation/perf_counter/util/path.c
new file mode 100644
index 0000000..a501a40
--- /dev/null
+++ b/Documentation/perf_counter/util/path.c
@@ -0,0 +1,353 @@
+/*
+ * I'm tired of doing "vsnprintf()" etc just to open a
+ * file, so here's a "return static buffer with printf"
+ * interface for paths.
+ *
+ * It's obviously not thread-safe. Sue me. But it's quite
+ * useful for doing things like
+ *
+ *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
+ *
+ * which is what it's designed for.
+ */
+#include "cache.h"
+
+static char bad_path[] = "/bad-path/";
+/*
+ * Two hacks:
+ */
+
+static char *get_perf_dir(void)
+{
+	return ".";
+}
+
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
+
+
+static char *get_pathname(void)
+{
+	static char pathname_array[4][PATH_MAX];
+	static int index;
+	return pathname_array[3 & ++index];
+}
+
+static char *cleanup_path(char *path)
+{
+	/* Clean it up */
+	if (!memcmp(path, "./", 2)) {
+		path += 2;
+		while (*path == '/')
+			path++;
+	}
+	return path;
+}
+
+char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+
+	va_start(args, fmt);
+	len = vsnprintf(buf, n, fmt, args);
+	va_end(args);
+	if (len >= n) {
+		strlcpy(buf, bad_path, n);
+		return buf;
+	}
+	return cleanup_path(buf);
+}
+
+static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
+{
+	const char *perf_dir = get_perf_dir();
+	size_t len;
+
+	len = strlen(perf_dir);
+	if (n < len + 1)
+		goto bad;
+	memcpy(buf, perf_dir, len);
+	if (len && !is_dir_sep(perf_dir[len-1]))
+		buf[len++] = '/';
+	len += vsnprintf(buf + len, n - len, fmt, args);
+	if (len >= n)
+		goto bad;
+	return cleanup_path(buf);
+bad:
+	strlcpy(buf, bad_path, n);
+	return buf;
+}
+
+char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(buf, n, fmt, args);
+	va_end(args);
+	return buf;
+}
+
+char *perf_pathdup(const char *fmt, ...)
+{
+	char path[PATH_MAX];
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(path, sizeof(path), fmt, args);
+	va_end(args);
+	return xstrdup(path);
+}
+
+char *mkpath(const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+	char *pathname = get_pathname();
+
+	va_start(args, fmt);
+	len = vsnprintf(pathname, PATH_MAX, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+char *perf_path(const char *fmt, ...)
+{
+	const char *perf_dir = get_perf_dir();
+	char *pathname = get_pathname();
+	va_list args;
+	unsigned len;
+
+	len = strlen(perf_dir);
+	if (len > PATH_MAX-100)
+		return bad_path;
+	memcpy(pathname, perf_dir, len);
+	if (len && perf_dir[len-1] != '/')
+		pathname[len++] = '/';
+	va_start(args, fmt);
+	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+int perf_mkstemp(char *path, size_t len, const char *template)
+{
+	const char *tmp;
+	size_t n;
+
+	tmp = getenv("TMPDIR");
+	if (!tmp)
+		tmp = "/tmp";
+	n = snprintf(path, len, "%s/%s", tmp, template);
+	if (len <= n) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+	return mkstemp(path);
+}
+
+
+const char *make_relative_path(const char *abs, const char *base)
+{
+	static char buf[PATH_MAX + 1];
+	int baselen;
+	if (!base)
+		return abs;
+	baselen = strlen(base);
+	if (prefixcmp(abs, base))
+		return abs;
+	if (abs[baselen] == '/')
+		baselen++;
+	else if (base[baselen - 1] != '/')
+		return abs;
+	strcpy(buf, abs + baselen);
+	return buf;
+}
+
+/*
+ * It is okay if dst == src, but they should not overlap otherwise.
+ *
+ * Performs the following normalizations on src, storing the result in dst:
+ * - Ensures that components are separated by '/' (Windows only)
+ * - Squashes sequences of '/'.
+ * - Removes "." components.
+ * - Removes ".." components, and the components the precede them.
+ * Returns failure (non-zero) if a ".." component appears as first path
+ * component anytime during the normalization. Otherwise, returns success (0).
+ *
+ * Note that this function is purely textual.  It does not follow symlinks,
+ * verify the existence of the path, or make any system calls.
+ */
+int normalize_path_copy(char *dst, const char *src)
+{
+	char *dst0;
+
+	if (has_dos_drive_prefix(src)) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	dst0 = dst;
+
+	if (is_dir_sep(*src)) {
+		*dst++ = '/';
+		while (is_dir_sep(*src))
+			src++;
+	}
+
+	for (;;) {
+		char c = *src;
+
+		/*
+		 * A path component that begins with . could be
+		 * special:
+		 * (1) "." and ends   -- ignore and terminate.
+		 * (2) "./"           -- ignore them, eat slash and continue.
+		 * (3) ".." and ends  -- strip one and terminate.
+		 * (4) "../"          -- strip one, eat slash and continue.
+		 */
+		if (c == '.') {
+			if (!src[1]) {
+				/* (1) */
+				src++;
+			} else if (is_dir_sep(src[1])) {
+				/* (2) */
+				src += 2;
+				while (is_dir_sep(*src))
+					src++;
+				continue;
+			} else if (src[1] == '.') {
+				if (!src[2]) {
+					/* (3) */
+					src += 2;
+					goto up_one;
+				} else if (is_dir_sep(src[2])) {
+					/* (4) */
+					src += 3;
+					while (is_dir_sep(*src))
+						src++;
+					goto up_one;
+				}
+			}
+		}
+
+		/* copy up to the next '/', and eat all '/' */
+		while ((c = *src++) != '\0' && !is_dir_sep(c))
+			*dst++ = c;
+		if (is_dir_sep(c)) {
+			*dst++ = '/';
+			while (is_dir_sep(c))
+				c = *src++;
+			src--;
+		} else if (!c)
+			break;
+		continue;
+
+	up_one:
+		/*
+		 * dst0..dst is prefix portion, and dst[-1] is '/';
+		 * go up one level.
+		 */
+		dst--;	/* go to trailing '/' */
+		if (dst <= dst0)
+			return -1;
+		/* Windows: dst[-1] cannot be backslash anymore */
+		while (dst0 < dst && dst[-1] != '/')
+			dst--;
+	}
+	*dst = '\0';
+	return 0;
+}
+
+/*
+ * path = Canonical absolute path
+ * prefix_list = Colon-separated list of absolute paths
+ *
+ * Determines, for each path in prefix_list, whether the "prefix" really
+ * is an ancestor directory of path.  Returns the length of the longest
+ * ancestor directory, excluding any trailing slashes, or -1 if no prefix
+ * is an ancestor.  (Note that this means 0 is returned if prefix_list is
+ * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
+ * are not considered to be their own ancestors.  path must be in a
+ * canonical form: empty components, or "." or ".." components are not
+ * allowed.  prefix_list may be null, which is like "".
+ */
+int longest_ancestor_length(const char *path, const char *prefix_list)
+{
+	char buf[PATH_MAX+1];
+	const char *ceil, *colon;
+	int len, max_len = -1;
+
+	if (prefix_list == NULL || !strcmp(path, "/"))
+		return -1;
+
+	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
+		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
+		len = colon - ceil;
+		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
+			continue;
+		strlcpy(buf, ceil, len+1);
+		if (normalize_path_copy(buf, buf) < 0)
+			continue;
+		len = strlen(buf);
+		if (len > 0 && buf[len-1] == '/')
+			buf[--len] = '\0';
+
+		if (!strncmp(path, buf, len) &&
+		    path[len] == '/' &&
+		    len > max_len) {
+			max_len = len;
+		}
+	}
+
+	return max_len;
+}
+
+/* strip arbitrary amount of directory separators at end of path */
+static inline int chomp_trailing_dir_sep(const char *path, int len)
+{
+	while (len && is_dir_sep(path[len - 1]))
+		len--;
+	return len;
+}
+
+/*
+ * If path ends with suffix (complete path components), returns the
+ * part before suffix (sans trailing directory separators).
+ * Otherwise returns NULL.
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+	int path_len = strlen(path), suffix_len = strlen(suffix);
+
+	while (suffix_len) {
+		if (!path_len)
+			return NULL;
+
+		if (is_dir_sep(path[path_len - 1])) {
+			if (!is_dir_sep(suffix[suffix_len - 1]))
+				return NULL;
+			path_len = chomp_trailing_dir_sep(path, path_len);
+			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+		}
+		else if (path[--path_len] != suffix[--suffix_len])
+			return NULL;
+	}
+
+	if (path_len && !is_dir_sep(path[path_len - 1]))
+		return NULL;
+	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
new file mode 100644
index 0000000..7a49fcf
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.c
@@ -0,0 +1,478 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+	return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+	char *to_free = NULL;
+
+	if (dst->buf == src)
+		to_free = strbuf_detach(dst, NULL);
+
+	strbuf_addch(dst, '\'');
+	while (*src) {
+		size_t len = strcspn(src, "'!");
+		strbuf_add(dst, src, len);
+		src += len;
+		while (need_bs_quote(*src)) {
+			strbuf_addstr(dst, "'\\");
+			strbuf_addch(dst, *src++);
+			strbuf_addch(dst, '\'');
+		}
+	}
+	strbuf_addch(dst, '\'');
+	free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('\'', stream);
+	while ((c = *src++)) {
+		if (need_bs_quote(c)) {
+			fputs("'\\", stream);
+			fputc(c, stream);
+			fputc('\'', stream);
+		} else {
+			fputc(c, stream);
+		}
+	}
+	fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+	int i;
+
+	/* Copy into destination buffer. */
+	strbuf_grow(dst, 255);
+	for (i = 0; argv[i]; ++i) {
+		strbuf_addch(dst, ' ');
+		sq_quote_buf(dst, argv[i]);
+		if (maxlen && dst->len > maxlen)
+			die("Too many or long arguments");
+	}
+}
+
+char *sq_dequote_step(char *arg, char **next)
+{
+	char *dst = arg;
+	char *src = arg;
+	char c;
+
+	if (*src != '\'')
+		return NULL;
+	for (;;) {
+		c = *++src;
+		if (!c)
+			return NULL;
+		if (c != '\'') {
+			*dst++ = c;
+			continue;
+		}
+		/* We stepped out of sq */
+		switch (*++src) {
+		case '\0':
+			*dst = 0;
+			if (next)
+				*next = NULL;
+			return arg;
+		case '\\':
+			c = *++src;
+			if (need_bs_quote(c) && *++src == '\'') {
+				*dst++ = c;
+				continue;
+			}
+		/* Fallthrough */
+		default:
+			if (!next || !isspace(*src))
+				return NULL;
+			do {
+				c = *++src;
+			} while (isspace(c));
+			*dst = 0;
+			*next = src;
+			return arg;
+		}
+	}
+}
+
+char *sq_dequote(char *arg)
+{
+	return sq_dequote_step(arg, NULL);
+}
+
+int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
+{
+	char *next = arg;
+
+	if (!*arg)
+		return 0;
+	do {
+		char *dequoted = sq_dequote_step(next, &next);
+		if (!dequoted)
+			return -1;
+		ALLOC_GROW(*argv, *nr + 1, *alloc);
+		(*argv)[(*nr)++] = dequoted;
+	} while (next);
+
+	return 0;
+}
+
+/* 1 means: quote as octal
+ * 0 means: quote as octal if (quote_path_fully)
+ * -1 means: never quote
+ * c: quote as "\\c"
+ */
+#define X8(x)   x, x, x, x, x, x, x, x
+#define X16(x)  X8(x), X8(x)
+static signed char const sq_lookup[256] = {
+	/*           0    1    2    3    4    5    6    7 */
+	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
+	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
+	/* 0x10 */ X16(1),
+	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
+	/* 0x28 */ X16(-1), X16(-1), X16(-1),
+	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
+	/* 0x60 */ X16(-1), X8(-1),
+	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
+	/* 0x80 */ /* set to 0 */
+};
+
+static inline int sq_must_quote(char c)
+{
+	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
+}
+
+/* returns the longest prefix not needing a quote up to maxlen if positive.
+   This stops at the first \0 because it's marked as a character needing an
+   escape */
+static size_t next_quote_pos(const char *s, ssize_t maxlen)
+{
+	size_t len;
+	if (maxlen < 0) {
+		for (len = 0; !sq_must_quote(s[len]); len++);
+	} else {
+		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
+	}
+	return len;
+}
+
+/*
+ * C-style name quoting.
+ *
+ * (1) if sb and fp are both NULL, inspect the input name and counts the
+ *     number of bytes that are needed to hold c_style quoted version of name,
+ *     counting the double quotes around it but not terminating NUL, and
+ *     returns it.
+ *     However, if name does not need c_style quoting, it returns 0.
+ *
+ * (2) if sb or fp are not NULL, it emits the c_style quoted version
+ *     of name, enclosed with double quotes if asked and needed only.
+ *     Return value is the same as in (1).
+ */
+static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
+                                    struct strbuf *sb, FILE *fp, int no_dq)
+{
+#undef EMIT
+#define EMIT(c)                                 \
+	do {                                        \
+		if (sb) strbuf_addch(sb, (c));          \
+		if (fp) fputc((c), fp);                 \
+		count++;                                \
+	} while (0)
+#define EMITBUF(s, l)                           \
+	do {                                        \
+		if (sb) strbuf_add(sb, (s), (l));       \
+		if (fp) fwrite((s), (l), 1, fp);        \
+		count += (l);                           \
+	} while (0)
+
+	size_t len, count = 0;
+	const char *p = name;
+
+	for (;;) {
+		int ch;
+
+		len = next_quote_pos(p, maxlen);
+		if (len == maxlen || !p[len])
+			break;
+
+		if (!no_dq && p == name)
+			EMIT('"');
+
+		EMITBUF(p, len);
+		EMIT('\\');
+		p += len;
+		ch = (unsigned char)*p++;
+		if (sq_lookup[ch] >= ' ') {
+			EMIT(sq_lookup[ch]);
+		} else {
+			EMIT(((ch >> 6) & 03) + '0');
+			EMIT(((ch >> 3) & 07) + '0');
+			EMIT(((ch >> 0) & 07) + '0');
+		}
+	}
+
+	EMITBUF(p, len);
+	if (p == name)   /* no ending quote needed */
+		return 0;
+
+	if (!no_dq)
+		EMIT('"');
+	return count;
+}
+
+size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
+{
+	return quote_c_style_counted(name, -1, sb, fp, nodq);
+}
+
+void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
+{
+	if (quote_c_style(prefix, NULL, NULL, 0) ||
+	    quote_c_style(path, NULL, NULL, 0)) {
+		if (!nodq)
+			strbuf_addch(sb, '"');
+		quote_c_style(prefix, sb, NULL, 1);
+		quote_c_style(path, sb, NULL, 1);
+		if (!nodq)
+			strbuf_addch(sb, '"');
+	} else {
+		strbuf_addstr(sb, prefix);
+		strbuf_addstr(sb, path);
+	}
+}
+
+void write_name_quoted(const char *name, FILE *fp, int terminator)
+{
+	if (terminator) {
+		quote_c_style(name, NULL, fp, 0);
+	} else {
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *fp, int terminator)
+{
+	int needquote = 0;
+
+	if (terminator) {
+		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
+			|| name[next_quote_pos(name, -1)];
+	}
+	if (needquote) {
+		fputc('"', fp);
+		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
+		quote_c_style(name, NULL, fp, 1);
+		fputc('"', fp);
+	} else {
+		fwrite(pfx, pfxlen, 1, fp);
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix)
+{
+	int needquote;
+
+	if (len < 0)
+		len = strlen(in);
+
+	/* "../" prefix itself does not need quoting, but "in" might. */
+	needquote = next_quote_pos(in, len) < len;
+	strbuf_setlen(out, 0);
+	strbuf_grow(out, len);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (prefix) {
+		int off = 0;
+		while (prefix[off] && off < len && prefix[off] == in[off])
+			if (prefix[off] == '/') {
+				prefix += off + 1;
+				in += off + 1;
+				len -= off + 1;
+				off = 0;
+			} else
+				off++;
+
+		for (; *prefix; prefix++)
+			if (*prefix == '/')
+				strbuf_addstr(out, "../");
+	}
+
+	quote_c_style_counted (in, len, out, NULL, 1);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (!out->len)
+		strbuf_addstr(out, "./");
+
+	return out->buf;
+}
+
+/*
+ * C-style name unquoting.
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+	size_t oldlen = sb->len, len;
+	int ch, ac;
+
+	if (*quoted++ != '"')
+		return -1;
+
+	for (;;) {
+		len = strcspn(quoted, "\"\\");
+		strbuf_add(sb, quoted, len);
+		quoted += len;
+
+		switch (*quoted++) {
+		  case '"':
+			if (endp)
+				*endp = quoted;
+			return 0;
+		  case '\\':
+			break;
+		  default:
+			goto error;
+		}
+
+		switch ((ch = *quoted++)) {
+		case 'a': ch = '\a'; break;
+		case 'b': ch = '\b'; break;
+		case 'f': ch = '\f'; break;
+		case 'n': ch = '\n'; break;
+		case 'r': ch = '\r'; break;
+		case 't': ch = '\t'; break;
+		case 'v': ch = '\v'; break;
+
+		case '\\': case '"':
+			break; /* verbatim */
+
+		/* octal values with first digit over 4 overflow */
+		case '0': case '1': case '2': case '3':
+					ac = ((ch - '0') << 6);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= ((ch - '0') << 3);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= (ch - '0');
+					ch = ac;
+					break;
+				default:
+			goto error;
+			}
+		strbuf_addch(sb, ch);
+		}
+
+  error:
+	strbuf_setlen(sb, oldlen);
+	return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	const char nl = '\n';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == nl) {
+			fputc(bq, stream);
+			fputc('n', stream);
+			continue;
+		}
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('"', stream);
+	while ((c = *src++)) {
+		switch (c) {
+		case '[': case ']':
+		case '{': case '}':
+		case '$': case '\\': case '"':
+			fputc('\\', stream);
+		default:
+			fputc(c, stream);
+			break;
+		case '\f':
+			fputs("\\f", stream);
+			break;
+		case '\r':
+			fputs("\\r", stream);
+			break;
+		case '\n':
+			fputs("\\n", stream);
+			break;
+		case '\t':
+			fputs("\\t", stream);
+			break;
+		case '\v':
+			fputs("\\v", stream);
+			break;
+		}
+	}
+	fputc('"', stream);
+}
diff --git a/Documentation/perf_counter/util/quote.h b/Documentation/perf_counter/util/quote.h
new file mode 100644
index 0000000..5dfad89
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free result from
+ * sq_quote() in a real application.
+ *
+ * sq_quote_buf() writes to an existing buffer of specified size; it
+ * will return the number of characters that would have been written
+ * excluding the final null regardless of the buffer size.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/Documentation/perf_counter/util/run-command.c b/Documentation/perf_counter/util/run-command.c
new file mode 100644
index 0000000..b2f5e85
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.c
@@ -0,0 +1,395 @@
+#include "cache.h"
+#include "run-command.h"
+#include "exec_cmd.h"
+
+static inline void close_pair(int fd[2])
+{
+	close(fd[0]);
+	close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+	int fd = open("/dev/null", O_RDWR);
+	dup2(fd, to);
+	close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+	int need_in, need_out, need_err;
+	int fdin[2], fdout[2], fderr[2];
+
+	/*
+	 * In case of errors we must keep the promise to close FDs
+	 * that have been passed in via ->in and ->out.
+	 */
+
+	need_in = !cmd->no_stdin && cmd->in < 0;
+	if (need_in) {
+		if (pipe(fdin) < 0) {
+			if (cmd->out > 0)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->in = fdin[1];
+	}
+
+	need_out = !cmd->no_stdout
+		&& !cmd->stdout_to_stderr
+		&& cmd->out < 0;
+	if (need_out) {
+		if (pipe(fdout) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->out = fdout[0];
+	}
+
+	need_err = !cmd->no_stderr && cmd->err < 0;
+	if (need_err) {
+		if (pipe(fderr) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			if (need_out)
+				close_pair(fdout);
+			else if (cmd->out)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->err = fderr[0];
+	}
+
+#ifndef __MINGW32__
+	fflush(NULL);
+	cmd->pid = fork();
+	if (!cmd->pid) {
+		if (cmd->no_stdin)
+			dup_devnull(0);
+		else if (need_in) {
+			dup2(fdin[0], 0);
+			close_pair(fdin);
+		} else if (cmd->in) {
+			dup2(cmd->in, 0);
+			close(cmd->in);
+		}
+
+		if (cmd->no_stderr)
+			dup_devnull(2);
+		else if (need_err) {
+			dup2(fderr[1], 2);
+			close_pair(fderr);
+		}
+
+		if (cmd->no_stdout)
+			dup_devnull(1);
+		else if (cmd->stdout_to_stderr)
+			dup2(2, 1);
+		else if (need_out) {
+			dup2(fdout[1], 1);
+			close_pair(fdout);
+		} else if (cmd->out > 1) {
+			dup2(cmd->out, 1);
+			close(cmd->out);
+		}
+
+		if (cmd->dir && chdir(cmd->dir))
+			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
+			    cmd->dir, strerror(errno));
+		if (cmd->env) {
+			for (; *cmd->env; cmd->env++) {
+				if (strchr(*cmd->env, '='))
+					putenv((char*)*cmd->env);
+				else
+					unsetenv(*cmd->env);
+			}
+		}
+		if (cmd->preexec_cb)
+			cmd->preexec_cb();
+		if (cmd->perf_cmd) {
+			execv_perf_cmd(cmd->argv);
+		} else {
+			execvp(cmd->argv[0], (char *const*) cmd->argv);
+		}
+		exit(127);
+	}
+#else
+	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
+	const char **sargv = cmd->argv;
+	char **env = environ;
+
+	if (cmd->no_stdin) {
+		s0 = dup(0);
+		dup_devnull(0);
+	} else if (need_in) {
+		s0 = dup(0);
+		dup2(fdin[0], 0);
+	} else if (cmd->in) {
+		s0 = dup(0);
+		dup2(cmd->in, 0);
+	}
+
+	if (cmd->no_stderr) {
+		s2 = dup(2);
+		dup_devnull(2);
+	} else if (need_err) {
+		s2 = dup(2);
+		dup2(fderr[1], 2);
+	}
+
+	if (cmd->no_stdout) {
+		s1 = dup(1);
+		dup_devnull(1);
+	} else if (cmd->stdout_to_stderr) {
+		s1 = dup(1);
+		dup2(2, 1);
+	} else if (need_out) {
+		s1 = dup(1);
+		dup2(fdout[1], 1);
+	} else if (cmd->out > 1) {
+		s1 = dup(1);
+		dup2(cmd->out, 1);
+	}
+
+	if (cmd->dir)
+		die("chdir in start_command() not implemented");
+	if (cmd->env) {
+		env = copy_environ();
+		for (; *cmd->env; cmd->env++)
+			env = env_setenv(env, *cmd->env);
+	}
+
+	if (cmd->perf_cmd) {
+		cmd->argv = prepare_perf_cmd(cmd->argv);
+	}
+
+	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
+
+	if (cmd->env)
+		free_environ(env);
+	if (cmd->perf_cmd)
+		free(cmd->argv);
+
+	cmd->argv = sargv;
+	if (s0 >= 0)
+		dup2(s0, 0), close(s0);
+	if (s1 >= 0)
+		dup2(s1, 1), close(s1);
+	if (s2 >= 0)
+		dup2(s2, 2), close(s2);
+#endif
+
+	if (cmd->pid < 0) {
+		int err = errno;
+		if (need_in)
+			close_pair(fdin);
+		else if (cmd->in)
+			close(cmd->in);
+		if (need_out)
+			close_pair(fdout);
+		else if (cmd->out)
+			close(cmd->out);
+		if (need_err)
+			close_pair(fderr);
+		return err == ENOENT ?
+			-ERR_RUN_COMMAND_EXEC :
+			-ERR_RUN_COMMAND_FORK;
+	}
+
+	if (need_in)
+		close(fdin[0]);
+	else if (cmd->in)
+		close(cmd->in);
+
+	if (need_out)
+		close(fdout[1]);
+	else if (cmd->out)
+		close(cmd->out);
+
+	if (need_err)
+		close(fderr[1]);
+
+	return 0;
+}
+
+static int wait_or_whine(pid_t pid)
+{
+	for (;;) {
+		int status, code;
+		pid_t waiting = waitpid(pid, &status, 0);
+
+		if (waiting < 0) {
+			if (errno == EINTR)
+				continue;
+			error("waitpid failed (%s)", strerror(errno));
+			return -ERR_RUN_COMMAND_WAITPID;
+		}
+		if (waiting != pid)
+			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
+		if (WIFSIGNALED(status))
+			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
+
+		if (!WIFEXITED(status))
+			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
+		code = WEXITSTATUS(status);
+		switch (code) {
+		case 127:
+			return -ERR_RUN_COMMAND_EXEC;
+		case 0:
+			return 0;
+		default:
+			return -code;
+		}
+	}
+}
+
+int finish_command(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid);
+}
+
+int run_command(struct child_process *cmd)
+{
+	int code = start_command(cmd);
+	if (code)
+		return code;
+	return finish_command(cmd);
+}
+
+static void prepare_run_command_v_opt(struct child_process *cmd,
+				      const char **argv,
+				      int opt)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->argv = argv;
+	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
+	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	return run_command(&cmd);
+}
+
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	cmd.dir = dir;
+	cmd.env = env;
+	return run_command(&cmd);
+}
+
+#ifdef __MINGW32__
+static __stdcall unsigned run_thread(void *data)
+{
+	struct async *async = data;
+	return async->proc(async->fd_for_proc, async->data);
+}
+#endif
+
+int start_async(struct async *async)
+{
+	int pipe_out[2];
+
+	if (pipe(pipe_out) < 0)
+		return error("cannot create pipe: %s", strerror(errno));
+	async->out = pipe_out[0];
+
+#ifndef __MINGW32__
+	/* Flush stdio before fork() to avoid cloning buffers */
+	fflush(NULL);
+
+	async->pid = fork();
+	if (async->pid < 0) {
+		error("fork (async) failed: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+	if (!async->pid) {
+		close(pipe_out[0]);
+		exit(!!async->proc(pipe_out[1], async->data));
+	}
+	close(pipe_out[1]);
+#else
+	async->fd_for_proc = pipe_out[1];
+	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
+	if (!async->tid) {
+		error("cannot create thread: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+int finish_async(struct async *async)
+{
+#ifndef __MINGW32__
+	int ret = 0;
+
+	if (wait_or_whine(async->pid))
+		ret = error("waitpid (async) failed");
+#else
+	DWORD ret = 0;
+	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
+		ret = error("waiting for thread failed: %lu", GetLastError());
+	else if (!GetExitCodeThread(async->tid, &ret))
+		ret = error("cannot get thread exit code: %lu", GetLastError());
+	CloseHandle(async->tid);
+#endif
+	return ret;
+}
+
+int run_hook(const char *index_file, const char *name, ...)
+{
+	struct child_process hook;
+	const char **argv = NULL, *env[2];
+	char index[PATH_MAX];
+	va_list args;
+	int ret;
+	size_t i = 0, alloc = 0;
+
+	if (access(perf_path("hooks/%s", name), X_OK) < 0)
+		return 0;
+
+	va_start(args, name);
+	ALLOC_GROW(argv, i + 1, alloc);
+	argv[i++] = perf_path("hooks/%s", name);
+	while (argv[i-1]) {
+		ALLOC_GROW(argv, i + 1, alloc);
+		argv[i++] = va_arg(args, const char *);
+	}
+	va_end(args);
+
+	memset(&hook, 0, sizeof(hook));
+	hook.argv = argv;
+	hook.no_stdin = 1;
+	hook.stdout_to_stderr = 1;
+	if (index_file) {
+		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+		env[0] = index;
+		env[1] = NULL;
+		hook.env = env;
+	}
+
+	ret = start_command(&hook);
+	free(argv);
+	if (ret) {
+		warning("Could not spawn %s", argv[0]);
+		return ret;
+	}
+	ret = finish_command(&hook);
+	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+		warning("%s exited due to uncaught signal", argv[0]);
+
+	return ret;
+}
diff --git a/Documentation/perf_counter/util/run-command.h b/Documentation/perf_counter/util/run-command.h
new file mode 100644
index 0000000..328289f
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
+ *
+ * It is expected that no synchronization and mutual exclusion between
+ * the caller and the feed function is necessary so that the function
+ * can run in a thread without interfering with the caller.
+ */
+struct async {
+	/*
+	 * proc writes to fd and closes it;
+	 * returns 0 on success, non-zero on failure
+	 */
+	int (*proc)(int fd, void *data);
+	void *data;
+	int out;	/* caller reads from here and closes it */
+#ifndef __MINGW32__
+	pid_t pid;
+#else
+	HANDLE tid;
+	int fd_for_proc;
+#endif
+};
+
+int start_async(struct async *async);
+int finish_async(struct async *async);
+
+#endif
diff --git a/Documentation/perf_counter/util/strbuf.c b/Documentation/perf_counter/util/strbuf.c
new file mode 100644
index 0000000..eaba093
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.c
@@ -0,0 +1,359 @@
+#include "cache.h"
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++)
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+/*
+ * Used as the default ->buf value, so that people can always assume
+ * buf is non NULL and ->buf is NUL terminated even for a freshly
+ * initialized strbuf.
+ */
+char strbuf_slopbuf[1];
+
+void strbuf_init(struct strbuf *sb, size_t hint)
+{
+	sb->alloc = sb->len = 0;
+	sb->buf = strbuf_slopbuf;
+	if (hint)
+		strbuf_grow(sb, hint);
+}
+
+void strbuf_release(struct strbuf *sb)
+{
+	if (sb->alloc) {
+		free(sb->buf);
+		strbuf_init(sb, 0);
+	}
+}
+
+char *strbuf_detach(struct strbuf *sb, size_t *sz)
+{
+	char *res = sb->alloc ? sb->buf : NULL;
+	if (sz)
+		*sz = sb->len;
+	strbuf_init(sb, 0);
+	return res;
+}
+
+void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
+{
+	strbuf_release(sb);
+	sb->buf   = buf;
+	sb->len   = len;
+	sb->alloc = alloc;
+	strbuf_grow(sb, 0);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_grow(struct strbuf *sb, size_t extra)
+{
+	if (sb->len + extra + 1 <= sb->len)
+		die("you want to use way too much memory");
+	if (!sb->alloc)
+		sb->buf = NULL;
+	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+}
+
+void strbuf_trim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+void strbuf_rtrim(struct strbuf *sb)
+{
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_ltrim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_tolower(struct strbuf *sb)
+{
+	int i;
+	for (i = 0; i < sb->len; i++)
+		sb->buf[i] = tolower(sb->buf[i]);
+}
+
+struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
+{
+	int alloc = 2, pos = 0;
+	char *n, *p;
+	struct strbuf **ret;
+	struct strbuf *t;
+
+	ret = calloc(alloc, sizeof(struct strbuf *));
+	p = n = sb->buf;
+	while (n < sb->buf + sb->len) {
+		int len;
+		n = memchr(n, delim, sb->len - (n - sb->buf));
+		if (pos + 1 >= alloc) {
+			alloc = alloc * 2;
+			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
+		}
+		if (!n)
+			n = sb->buf + sb->len - 1;
+		len = n - p + 1;
+		t = malloc(sizeof(struct strbuf));
+		strbuf_init(t, len);
+		strbuf_add(t, p, len);
+		ret[pos] = t;
+		ret[++pos] = NULL;
+		p = ++n;
+	}
+	return ret;
+}
+
+void strbuf_list_free(struct strbuf **sbs)
+{
+	struct strbuf **s = sbs;
+
+	while (*s) {
+		strbuf_release(*s);
+		free(*s++);
+	}
+	free(sbs);
+}
+
+int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
+{
+	int len = a->len < b->len ? a->len: b->len;
+	int cmp = memcmp(a->buf, b->buf, len);
+	if (cmp)
+		return cmp;
+	return a->len < b->len ? -1: a->len != b->len;
+}
+
+void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
+				   const void *data, size_t dlen)
+{
+	if (pos + len < pos)
+		die("you want to use way too much memory");
+	if (pos > sb->len)
+		die("`pos' is too far after the end of the buffer");
+	if (pos + len > sb->len)
+		die("`pos + len' is too far after the end of the buffer");
+
+	if (dlen >= len)
+		strbuf_grow(sb, dlen - len);
+	memmove(sb->buf + pos + dlen,
+			sb->buf + pos + len,
+			sb->len - pos - len);
+	memcpy(sb->buf + pos, data, dlen);
+	strbuf_setlen(sb, sb->len + dlen - len);
+}
+
+void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
+{
+	strbuf_splice(sb, pos, 0, data, len);
+}
+
+void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_splice(sb, pos, len, NULL, 0);
+}
+
+void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, data, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, sb->buf + pos, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+{
+	int len;
+	va_list ap;
+
+	if (!strbuf_avail(sb))
+		strbuf_grow(sb, 64);
+	va_start(ap, fmt);
+	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+	va_end(ap);
+	if (len < 0)
+		die("your vsnprintf is broken");
+	if (len > strbuf_avail(sb)) {
+		strbuf_grow(sb, len);
+		va_start(ap, fmt);
+		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+		va_end(ap);
+		if (len > strbuf_avail(sb)) {
+			die("this should not happen, your snprintf is broken");
+		}
+	}
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
+		   void *context)
+{
+	for (;;) {
+		const char *percent;
+		size_t consumed;
+
+		percent = strchrnul(format, '%');
+		strbuf_add(sb, format, percent - format);
+		if (!*percent)
+			break;
+		format = percent + 1;
+
+		consumed = fn(sb, format, context);
+		if (consumed)
+			format += consumed;
+		else
+			strbuf_addch(sb, '%');
+	}
+}
+
+size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
+		void *context)
+{
+	struct strbuf_expand_dict_entry *e = context;
+	size_t len;
+
+	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
+		if (!strncmp(placeholder, e->placeholder, len)) {
+			if (e->value)
+				strbuf_addstr(sb, e->value);
+			return len;
+		}
+	}
+	return 0;
+}
+
+size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
+{
+	size_t res;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, size);
+	res = fread(sb->buf + sb->len, 1, size, f);
+	if (res > 0)
+		strbuf_setlen(sb, sb->len + res);
+	else if (res < 0 && oldalloc == 0)
+		strbuf_release(sb);
+	return res;
+}
+
+ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
+{
+	size_t oldlen = sb->len;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, hint ? hint : 8192);
+	for (;;) {
+		ssize_t cnt;
+
+		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
+		if (cnt < 0) {
+			if (oldalloc == 0)
+				strbuf_release(sb);
+			else
+				strbuf_setlen(sb, oldlen);
+			return -1;
+		}
+		if (!cnt)
+			break;
+		sb->len += cnt;
+		strbuf_grow(sb, 8192);
+	}
+
+	sb->buf[sb->len] = '\0';
+	return sb->len - oldlen;
+}
+
+#define STRBUF_MAXLINK (2*PATH_MAX)
+
+int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
+{
+	size_t oldalloc = sb->alloc;
+
+	if (hint < 32)
+		hint = 32;
+
+	while (hint < STRBUF_MAXLINK) {
+		int len;
+
+		strbuf_grow(sb, hint);
+		len = readlink(path, sb->buf, hint);
+		if (len < 0) {
+			if (errno != ERANGE)
+				break;
+		} else if (len < hint) {
+			strbuf_setlen(sb, len);
+			return 0;
+		}
+
+		/* .. the buffer was too small - try again */
+		hint *= 2;
+	}
+	if (oldalloc == 0)
+		strbuf_release(sb);
+	return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+	int ch;
+
+	strbuf_grow(sb, 0);
+	if (feof(fp))
+		return EOF;
+
+	strbuf_reset(sb);
+	while ((ch = fgetc(fp)) != EOF) {
+		if (ch == term)
+			break;
+		strbuf_grow(sb, 1);
+		sb->buf[sb->len++] = ch;
+	}
+	if (ch == EOF && sb->len == 0)
+		return EOF;
+
+	sb->buf[sb->len] = '\0';
+	return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+	int fd, len;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	len = strbuf_read(sb, fd, hint);
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	return len;
+}
diff --git a/Documentation/perf_counter/util/strbuf.h b/Documentation/perf_counter/util/strbuf.h
new file mode 100644
index 0000000..9ee908a
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.h
@@ -0,0 +1,137 @@
+#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
+ * long, overflow safe strings.
+ *
+ * Strbufs has some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensure this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somehow a
+ *         "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+	size_t alloc;
+	size_t len;
+	char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+	struct strbuf tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
+}
+
+extern void strbuf_grow(struct strbuf *, size_t);
+
+static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
+	if (!sb->alloc)
+		strbuf_grow(sb, 0);
+	assert(len < sb->alloc);
+	sb->len = len;
+	sb->buf[len] = '\0';
+}
+#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
+
+/*----- content related -----*/
+extern void strbuf_trim(struct strbuf *);
+extern void strbuf_rtrim(struct strbuf *);
+extern void strbuf_ltrim(struct strbuf *);
+extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
+extern void strbuf_tolower(struct strbuf *);
+
+extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
+extern void strbuf_list_free(struct strbuf **);
+
+/*----- add data in your buffer -----*/
+static inline void strbuf_addch(struct strbuf *sb, int c) {
+	strbuf_grow(sb, 1);
+	sb->buf[sb->len++] = c;
+	sb->buf[sb->len] = '\0';
+}
+
+extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
+extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
+
+/* splice pos..pos+len with given data */
+extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
+                          const void *, size_t);
+
+extern void strbuf_add(struct strbuf *, const void *, size_t);
+static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
+	strbuf_add(sb, s, strlen(s));
+}
+static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
+	strbuf_add(sb, sb2->buf, sb2->len);
+}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
+
+typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
+struct strbuf_expand_dict_entry {
+	const char *placeholder;
+	const char *value;
+};
+extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
+
+__attribute__((format(printf,2,3)))
+extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+
+extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
+/* XXX: if read fails, any partial read is undone */
+extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
+extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
+extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
+
+extern int strbuf_getline(struct strbuf *, FILE *, int);
+
+extern void stripspace(struct strbuf *buf, int skip_comments);
+extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
+
+extern int strbuf_branchname(struct strbuf *sb, const char *name);
+extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
+
+#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
new file mode 100644
index 0000000..7a10421
--- /dev/null
+++ b/Documentation/perf_counter/util/usage.c
@@ -0,0 +1,80 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ */
+#include "util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, "%s%s\n", prefix, msg);
+}
+
+static NORETURN void usage_builtin(const char *err)
+{
+	fprintf(stderr, "usage: %s\n", err);
+	exit(129);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report("fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report("error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report("warning: ", warn, params);
+}
+
+/* If we are in a dlopen()ed .so write to a global variable would segfault
+ * (ugh), so keep things static. */
+static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
+static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
+static void (*error_routine)(const char *err, va_list params) = error_builtin;
+static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
+
+void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
+{
+	die_routine = routine;
+}
+
+void usage(const char *err)
+{
+	usage_routine(err);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_routine(err, params);
+	va_end(params);
+}
+
+int error(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_routine(err, params);
+	va_end(params);
+	return -1;
+}
+
+void warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_routine(warn, params);
+	va_end(params);
+}
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
new file mode 100644
index 0000000..36e40c3
--- /dev/null
+++ b/Documentation/perf_counter/util/util.h
@@ -0,0 +1,408 @@
+#ifndef GIT_COMPAT_UTIL_H
+#define GIT_COMPAT_UTIL_H
+
+#define _FILE_OFFSET_BITS 64
+
+#ifndef FLEX_ARRAY
+/*
+ * See if our compiler is known to support flexible array members.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+# define FLEX_ARRAY /* empty */
+#elif defined(__GNUC__)
+# if (__GNUC__ >= 3)
+#  define FLEX_ARRAY /* empty */
+# else
+#  define FLEX_ARRAY 0 /* older GNU extension */
+# endif
+#endif
+
+/*
+ * Otherwise, default to safer but a bit wasteful traditional style
+ */
+#ifndef FLEX_ARRAY
+# define FLEX_ARRAY 1
+#endif
+#endif
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
+
+/* Approximation of the length of the decimal representation of this type. */
+#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
+#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
+#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
+#endif
+#define _ALL_SOURCE 1
+#define _GNU_SOURCE 1
+#define _BSD_SOURCE 1
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <assert.h>
+#include <regex.h>
+#include <utime.h>
+#ifndef __MINGW32__
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#ifndef NO_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <inttypes.h>
+#if defined(__CYGWIN__)
+#undef _XOPEN_SOURCE
+#include <grp.h>
+#define _XOPEN_SOURCE 600
+#include "compat/cygwin.h"
+#else
+#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
+#include <grp.h>
+#define _ALL_SOURCE 1
+#endif
+#else 	/* __MINGW32__ */
+/* pull in Windows compatibility stuff */
+#include "compat/mingw.h"
+#endif	/* __MINGW32__ */
+
+#ifndef NO_ICONV
+#include <iconv.h>
+#endif
+
+#ifndef NO_OPENSSL
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#endif
+
+/* On most systems <limits.h> would have given us this, but
+ * not on some systems (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef PRIuMAX
+#define PRIuMAX "llu"
+#endif
+
+#ifndef PRIu32
+#define PRIu32 "u"
+#endif
+
+#ifndef PRIx32
+#define PRIx32 "x"
+#endif
+
+#ifndef PATH_SEP
+#define PATH_SEP ':'
+#endif
+
+#ifndef STRIP_EXTENSION
+#define STRIP_EXTENSION ""
+#endif
+
+#ifndef has_dos_drive_prefix
+#define has_dos_drive_prefix(path) 0
+#endif
+
+#ifndef is_dir_sep
+#define is_dir_sep(c) ((c) == '/')
+#endif
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+/* General helper functions */
+extern void usage(const char *err) NORETURN;
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+extern int prefixcmp(const char *str, const char *prefix);
+extern time_t tm_to_time_t(const struct tm *tm);
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
+
+#ifndef PROT_READ
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define MAP_PRIVATE 1
+#define MAP_FAILED ((void*)-1)
+#endif
+
+#define mmap git_mmap
+#define munmap git_munmap
+extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern int git_munmap(void *start, size_t length);
+
+#else /* NO_MMAP || USE_WIN32_MMAP */
+
+#include <sys/mman.h>
+
+#endif /* NO_MMAP || USE_WIN32_MMAP */
+
+#ifdef NO_MMAP
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
+
+#else /* NO_MMAP */
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
+	(sizeof(void*) >= 8 \
+		?  1 * 1024 * 1024 * 1024 \
+		: 32 * 1024 * 1024)
+
+#endif /* NO_MMAP */
+
+#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+#define on_disk_bytes(st) ((st).st_size)
+#else
+#define on_disk_bytes(st) ((st).st_blocks * 512)
+#endif
+
+#define DEFAULT_PACKED_GIT_LIMIT \
+	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
+
+#ifdef NO_PREAD
+#define pread git_pread
+extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
+#endif
+/*
+ * Forward decl that will remind us if its twin in cache.h changes.
+ * This function is used in compat/pread.c.  But we can't include
+ * cache.h there.
+ */
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+
+#ifdef NO_SETENV
+#define setenv gitsetenv
+extern int gitsetenv(const char *, const char *, int);
+#endif
+
+#ifdef NO_MKDTEMP
+#define mkdtemp gitmkdtemp
+extern char *gitmkdtemp(char *);
+#endif
+
+#ifdef NO_UNSETENV
+#define unsetenv gitunsetenv
+extern void gitunsetenv(const char *);
+#endif
+
+#ifdef NO_STRCASESTR
+#define strcasestr gitstrcasestr
+extern char *gitstrcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifdef NO_STRLCPY
+#define strlcpy gitstrlcpy
+extern size_t gitstrlcpy(char *, const char *, size_t);
+#endif
+
+#ifdef NO_STRTOUMAX
+#define strtoumax gitstrtoumax
+extern uintmax_t gitstrtoumax(const char *, char **, int);
+#endif
+
+#ifdef NO_HSTRERROR
+#define hstrerror githstrerror
+extern const char *githstrerror(int herror);
+#endif
+
+#ifdef NO_MEMMEM
+#define memmem gitmemmem
+void *gitmemmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifdef FREAD_READS_DIRECTORIES
+#ifdef fopen
+#undef fopen
+#endif
+#define fopen(a,b) git_fopen(a,b)
+extern FILE *git_fopen(const char*, const char*);
+#endif
+
+#ifdef SNPRINTF_RETURNS_BOGUS
+#define snprintf git_snprintf
+extern int git_snprintf(char *str, size_t maxsize,
+			const char *format, ...);
+#define vsnprintf git_vsnprintf
+extern int git_vsnprintf(char *str, size_t maxsize,
+			 const char *format, va_list ap);
+#endif
+
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 1)
+#define HAVE_STRCHRNUL
+#endif
+#endif
+
+#ifndef HAVE_STRCHRNUL
+#define strchrnul gitstrchrnul
+static inline char *gitstrchrnul(const char *s, int c)
+{
+	while (*s && *s != c)
+		s++;
+	return (char *)s;
+}
+#endif
+
+/*
+ * Wrappers:
+ */
+extern char *xstrdup(const char *str);
+extern void *xmalloc(size_t size);
+extern void *xmemdupz(const void *data, size_t len);
+extern char *xstrndup(const char *str, size_t len);
+extern void *xrealloc(void *ptr, size_t size);
+extern void *xcalloc(size_t nmemb, size_t size);
+extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern ssize_t xread(int fd, void *buf, size_t len);
+extern ssize_t xwrite(int fd, const void *buf, size_t len);
+extern int xdup(int fd);
+extern FILE *xfdopen(int fd, const char *mode);
+static inline size_t xsize_t(off_t len)
+{
+	return (size_t)len;
+}
+
+static inline int has_extension(const char *filename, const char *ext)
+{
+	size_t len = strlen(filename);
+	size_t extlen = strlen(ext);
+	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isalpha
+#undef isalnum
+#undef tolower
+#undef toupper
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE 0x01
+#define GIT_DIGIT 0x02
+#define GIT_ALPHA 0x04
+#define GIT_GLOB_SPECIAL 0x08
+#define GIT_REGEX_SPECIAL 0x10
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+static inline int sane_case(int x, int high)
+{
+	if (sane_istest(x, GIT_ALPHA))
+		x = (x & ~0x20) | high;
+	return x;
+}
+
+static inline int strtoul_ui(char const *s, int base, unsigned int *result)
+{
+	unsigned long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtoul(s, &p, base);
+	if (errno || *p || p == s || (unsigned int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+static inline int strtol_i(char const *s, int base, int *result)
+{
+	long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtol(s, &p, base);
+	if (errno || *p || p == s || (int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+#ifdef INTERNAL_QSORT
+void git_qsort(void *base, size_t nmemb, size_t size,
+	       int(*compar)(const void *, const void *));
+#define qsort git_qsort
+#endif
+
+#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
+# define FORCE_DIR_SET_GID S_ISGID
+#else
+# define FORCE_DIR_SET_GID 0
+#endif
+
+#ifdef NO_NSEC
+#undef USE_NSEC
+#define ST_CTIME_NSEC(st) 0
+#define ST_MTIME_NSEC(st) 0
+#else
+#ifdef USE_ST_TIMESPEC
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
+#else
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
+#endif
+#endif
+
+#endif
diff --git a/Documentation/perf_counter/util/wrapper.c b/Documentation/perf_counter/util/wrapper.c
new file mode 100644
index 0000000..6350d65
--- /dev/null
+++ b/Documentation/perf_counter/util/wrapper.c
@@ -0,0 +1,206 @@
+/*
+ * Various trivial helper wrappers around standard functions
+ */
+#include "cache.h"
+
+/*
+ * There's no pack memory to release - but stay close to the Git
+ * version so wrap this away:
+ */
+static inline void release_pack_memory(size_t size, int flag)
+{
+}
+
+char *xstrdup(const char *str)
+{
+	char *ret = strdup(str);
+	if (!ret) {
+		release_pack_memory(strlen(str) + 1, -1);
+		ret = strdup(str);
+		if (!ret)
+			die("Out of memory, strdup failed");
+	}
+	return ret;
+}
+
+void *xmalloc(size_t size)
+{
+	void *ret = malloc(size);
+	if (!ret && !size)
+		ret = malloc(1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = malloc(size);
+		if (!ret && !size)
+			ret = malloc(1);
+		if (!ret)
+			die("Out of memory, malloc failed");
+	}
+#ifdef XMALLOC_POISON
+	memset(ret, 0xA5, size);
+#endif
+	return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+	char *p = xmalloc(len + 1);
+	memcpy(p, data, len);
+	p[len] = '\0';
+	return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+	char *p = memchr(str, '\0', len);
+	return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ret = calloc(nmemb, size);
+	if (!ret && (!nmemb || !size))
+		ret = calloc(1, 1);
+	if (!ret) {
+		release_pack_memory(nmemb * size, -1);
+		ret = calloc(nmemb, size);
+		if (!ret && (!nmemb || !size))
+			ret = calloc(1, 1);
+		if (!ret)
+			die("Out of memory, calloc failed");
+	}
+	return ret;
+}
+
+void *xmmap(void *start, size_t length,
+	int prot, int flags, int fd, off_t offset)
+{
+	void *ret = mmap(start, length, prot, flags, fd, offset);
+	if (ret == MAP_FAILED) {
+		if (!length)
+			return NULL;
+		release_pack_memory(length, fd);
+		ret = mmap(start, length, prot, flags, fd, offset);
+		if (ret == MAP_FAILED)
+			die("Out of memory? mmap failed: %s", strerror(errno));
+	}
+	return ret;
+}
+
+/*
+ * xread() is the same a read(), but it automatically restarts read()
+ * operations with a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = read(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+/*
+ * xwrite() is the same a write(), but it automatically restarts write()
+ * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes is written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = write(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t loaded = xread(fd, p, count);
+		if (loaded <= 0)
+			return total ? total : loaded;
+		count -= loaded;
+		p += loaded;
+		total += loaded;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t written = xwrite(fd, p, count);
+		if (written < 0)
+			return -1;
+		if (!written) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= written;
+		p += written;
+		total += written;
+	}
+
+	return total;
+}
+
+int xdup(int fd)
+{
+	int ret = dup(fd);
+	if (ret < 0)
+		die("dup failed: %s", strerror(errno));
+	return ret;
+}
+
+FILE *xfdopen(int fd, const char *mode)
+{
+	FILE *stream = fdopen(fd, mode);
+	if (stream == NULL)
+		die("Out of memory? fdopen failed: %s", strerror(errno));
+	return stream;
+}
+
+int xmkstemp(char *template)
+{
+	int fd;
+
+	fd = mkstemp(template);
+	if (fd < 0)
+		die("Unable to create temporary file: %s", strerror(errno));
+	return fd;
+}
diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/wrapper.c
deleted file mode 100644
index 6350d65..0000000
--- a/Documentation/perf_counter/wrapper.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Various trivial helper wrappers around standard functions
- */
-#include "cache.h"
-
-/*
- * There's no pack memory to release - but stay close to the Git
- * version so wrap this away:
- */
-static inline void release_pack_memory(size_t size, int flag)
-{
-}
-
-char *xstrdup(const char *str)
-{
-	char *ret = strdup(str);
-	if (!ret) {
-		release_pack_memory(strlen(str) + 1, -1);
-		ret = strdup(str);
-		if (!ret)
-			die("Out of memory, strdup failed");
-	}
-	return ret;
-}
-
-void *xmalloc(size_t size)
-{
-	void *ret = malloc(size);
-	if (!ret && !size)
-		ret = malloc(1);
-	if (!ret) {
-		release_pack_memory(size, -1);
-		ret = malloc(size);
-		if (!ret && !size)
-			ret = malloc(1);
-		if (!ret)
-			die("Out of memory, malloc failed");
-	}
-#ifdef XMALLOC_POISON
-	memset(ret, 0xA5, size);
-#endif
-	return ret;
-}
-
-/*
- * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
- * "data" to the allocated memory, zero terminates the allocated memory,
- * and returns a pointer to the allocated memory. If the allocation fails,
- * the program dies.
- */
-void *xmemdupz(const void *data, size_t len)
-{
-	char *p = xmalloc(len + 1);
-	memcpy(p, data, len);
-	p[len] = '\0';
-	return p;
-}
-
-char *xstrndup(const char *str, size_t len)
-{
-	char *p = memchr(str, '\0', len);
-	return xmemdupz(str, p ? p - str : len);
-}
-
-void *xrealloc(void *ptr, size_t size)
-{
-	void *ret = realloc(ptr, size);
-	if (!ret && !size)
-		ret = realloc(ptr, 1);
-	if (!ret) {
-		release_pack_memory(size, -1);
-		ret = realloc(ptr, size);
-		if (!ret && !size)
-			ret = realloc(ptr, 1);
-		if (!ret)
-			die("Out of memory, realloc failed");
-	}
-	return ret;
-}
-
-void *xcalloc(size_t nmemb, size_t size)
-{
-	void *ret = calloc(nmemb, size);
-	if (!ret && (!nmemb || !size))
-		ret = calloc(1, 1);
-	if (!ret) {
-		release_pack_memory(nmemb * size, -1);
-		ret = calloc(nmemb, size);
-		if (!ret && (!nmemb || !size))
-			ret = calloc(1, 1);
-		if (!ret)
-			die("Out of memory, calloc failed");
-	}
-	return ret;
-}
-
-void *xmmap(void *start, size_t length,
-	int prot, int flags, int fd, off_t offset)
-{
-	void *ret = mmap(start, length, prot, flags, fd, offset);
-	if (ret == MAP_FAILED) {
-		if (!length)
-			return NULL;
-		release_pack_memory(length, fd);
-		ret = mmap(start, length, prot, flags, fd, offset);
-		if (ret == MAP_FAILED)
-			die("Out of memory? mmap failed: %s", strerror(errno));
-	}
-	return ret;
-}
-
-/*
- * xread() is the same a read(), but it automatically restarts read()
- * operations with a recoverable error (EAGAIN and EINTR). xread()
- * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
- */
-ssize_t xread(int fd, void *buf, size_t len)
-{
-	ssize_t nr;
-	while (1) {
-		nr = read(fd, buf, len);
-		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
-			continue;
-		return nr;
-	}
-}
-
-/*
- * xwrite() is the same a write(), but it automatically restarts write()
- * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
- * GUARANTEE that "len" bytes is written even if the operation is successful.
- */
-ssize_t xwrite(int fd, const void *buf, size_t len)
-{
-	ssize_t nr;
-	while (1) {
-		nr = write(fd, buf, len);
-		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
-			continue;
-		return nr;
-	}
-}
-
-ssize_t read_in_full(int fd, void *buf, size_t count)
-{
-	char *p = buf;
-	ssize_t total = 0;
-
-	while (count > 0) {
-		ssize_t loaded = xread(fd, p, count);
-		if (loaded <= 0)
-			return total ? total : loaded;
-		count -= loaded;
-		p += loaded;
-		total += loaded;
-	}
-
-	return total;
-}
-
-ssize_t write_in_full(int fd, const void *buf, size_t count)
-{
-	const char *p = buf;
-	ssize_t total = 0;
-
-	while (count > 0) {
-		ssize_t written = xwrite(fd, p, count);
-		if (written < 0)
-			return -1;
-		if (!written) {
-			errno = ENOSPC;
-			return -1;
-		}
-		count -= written;
-		p += written;
-		total += written;
-	}
-
-	return total;
-}
-
-int xdup(int fd)
-{
-	int ret = dup(fd);
-	if (ret < 0)
-		die("dup failed: %s", strerror(errno));
-	return ret;
-}
-
-FILE *xfdopen(int fd, const char *mode)
-{
-	FILE *stream = fdopen(fd, mode);
-	if (stream == NULL)
-		die("Out of memory? fdopen failed: %s", strerror(errno));
-	return stream;
-}
-
-int xmkstemp(char *template)
-{
-	int fd;
-
-	fd = mkstemp(template);
-	if (fd < 0)
-		die("Unable to create temporary file: %s", strerror(errno));
-	return fd;
-}
-- 
cgit v0.10.2


From b2ba83ff4f4405cebc10884121ee71338a1a6c94 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 26 Apr 2009 23:38:08 -0700
Subject: x86: apic: Remove duplicated macros

XAPIC_DEST_* is dupliicated to the one in apicdef.h

[ Impact: cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49F552D0.5050505@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f4..344eee4 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
 		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
 }
 
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT	4
-#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
 #define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
 
 static const struct cpumask *summit_target_cpus(void)
-- 
cgit v0.10.2


From e0e42142bab96404de535cceb85d6533d5ad7942 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 26 Apr 2009 23:39:38 -0700
Subject: x86: Use dmi check in apic_is_clustered() on 64-bit to mark the TSC
 unstable

We will have systems with 2 and more sockets 8cores/2thread,
but we treat them as multi chassis - while they could have
a stable TSC domain.

Use DMI check instead.

[ Impact: do not turn possibly stable TSCs off incorrectly ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
LKML-Reference: <49F5532A.5000802@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1386dbe..28f747d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2138,31 +2138,14 @@ static void apic_pm_activate(void) { }
 #endif	/* CONFIG_PM */
 
 #ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
 {
 	int i, clusters, zeros;
 	unsigned id;
 	u16 *bios_cpu_apicid;
 	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
 
-	/*
-	 * there is not this kind of box with AMD CPU yet.
-	 * Some AMD box with quadcore cpu and 8 sockets apicid
-	 * will be [4, 0x23] or [8, 0x27] could be thought to
-	 * vsmp box still need checking...
-	 */
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
-		return 0;
-
 	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
@@ -2198,18 +2181,67 @@ __cpuinit int apic_is_clustered_box(void)
 			++zeros;
 	}
 
-	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-	 * not guaranteed to be synced between boards
-	 */
-	if (is_vsmp_box() && clusters > 1)
+	return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+	if (multi)
+		return 0;
+	printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident);
+	multi = 1;
+	return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+	{
+		.callback = set_multi,
+		.ident = "IBM System Summit2",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+		},
+	},
+	{}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+	if (multi_checked)
+		return;
+
+	dmi_check_system(multi_dmi_table);
+	multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+	dmi_check_multi();
+	if (multi)
 		return 1;
 
+	if (!is_vsmp_box())
+		return 0;
+
 	/*
-	 * If clusters > 2, then should be multi-chassis.
-	 * May have to revisit this when multi-core + hyperthreaded CPUs come
-	 * out, but AFAIK this will work even for them.
+	 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+	 * not guaranteed to be synced between boards
 	 */
-	return (clusters > 2);
+	if (apic_cluster_num() > 1)
+		return 1;
+
+	return 0;
 }
 #endif
 
-- 
cgit v0.10.2


From 47c8a08bbe77ad3c06f63919a14b0f0b0cd54390 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 17:34:39 +0900
Subject: sh: rtc-generic support.

This adds rtc-generic support for SUPERH32.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 1647489..9db02ce 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -30,6 +30,7 @@ config SUPERH32
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_ARCH_KGDB
 	select ARCH_HIBERNATION_POSSIBLE if MMU
+	select RTC_LIB
 
 config SUPERH64
 	def_bool ARCH = "sh64"
diff --git a/arch/sh/include/asm/rtc.h b/arch/sh/include/asm/rtc.h
index f7b010d..52b0c2d 100644
--- a/arch/sh/include/asm/rtc.h
+++ b/arch/sh/include/asm/rtc.h
@@ -6,6 +6,17 @@ extern void (*board_time_init)(void);
 extern void (*rtc_sh_get_time)(struct timespec *);
 extern int (*rtc_sh_set_time)(const time_t);
 
+/* some dummy definitions */
+#define RTC_BATT_BAD 0x100	/* battery bad */
+#define RTC_SQWE 0x08		/* enable square-wave output */
+#define RTC_DM_BINARY 0x04	/* all time/date values are BCD if clear */
+#define RTC_24H 0x02		/* 24 hour mode - else hours bit 7 means pm */
+#define RTC_DST_EN 0x01	        /* auto switch DST - works f. USA only */
+
+struct rtc_time;
+unsigned int get_rtc_time(struct rtc_time *);
+int set_rtc_time(struct rtc_time *);
+
 #define RTC_CAP_4_DIGIT_YEAR	(1 << 0)
 
 struct sh_rtc_platform_info {
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c770413..109409f 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -19,6 +19,7 @@
 #include <linux/mc146818rtc.h>	/* for rtc_lock */
 #include <linux/platform_device.h>
 #include <linux/smp.h>
+#include <linux/rtc.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
 #include <asm/timer.h>
@@ -45,6 +46,28 @@ static int null_rtc_set_time(const time_t secs)
 void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
 int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
 
+unsigned int get_rtc_time(struct rtc_time *tm)
+{
+	if (rtc_sh_get_time != null_rtc_get_time) {
+		struct timespec tv;
+
+		rtc_sh_get_time(&tv);
+		rtc_time_to_tm(tv.tv_sec, tm);
+	}
+
+	return RTC_24H;
+}
+EXPORT_SYMBOL(get_rtc_time);
+
+int set_rtc_time(struct rtc_time *tm)
+{
+	unsigned long secs;
+
+	rtc_tm_to_time(tm, &secs);
+	return rtc_sh_set_time(secs);
+}
+EXPORT_SYMBOL(set_rtc_time);
+
 #ifndef CONFIG_GENERIC_TIME
 void do_gettimeofday(struct timeval *tv)
 {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 4e9851f..277d35d 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -692,7 +692,7 @@ config RTC_DRV_GENERIC
 	tristate "Generic RTC support"
 	# Please consider writing a new RTC driver instead of using the generic
 	# RTC abstraction
-	depends on PARISC || M68K || PPC
+	depends on PARISC || M68K || PPC || SUPERH32
 	help
 	  Say Y or M here to enable RTC support on systems using the generic
 	  RTC abstraction. If you do not know what you are doing, you should
-- 
cgit v0.10.2


From c2e0090c668fc99f5be65fd9907da781cb6a2ef5 Mon Sep 17 00:00:00 2001
From: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Date: Mon, 27 Apr 2009 17:54:41 +0900
Subject: sh: mach-r2d: add physmap-flash support for R2D+ boards.

The RTS7751R2D_1 boards only support 1MB of socket-mounted MBM29F040
flash, which we just leave alone as it's not terribly interesting.

This adds support for the s29gl256p on the r2d+ boards that makes a bit
more sense to expose to the user.

Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-r2d/setup.c b/arch/sh/boards/mach-r2d/setup.c
index c585be0..a625ecb 100644
--- a/arch/sh/boards/mach-r2d/setup.c
+++ b/arch/sh/boards/mach-r2d/setup.c
@@ -10,6 +10,9 @@
  */
 #include <linux/init.h>
 #include <linux/platform_device.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/physmap.h>
 #include <linux/ata_platform.h>
 #include <linux/sm501.h>
 #include <linux/sm501-regs.h>
@@ -181,6 +184,50 @@ static struct platform_device sm501_device = {
 	.resource	= sm501_resources,
 };
 
+static struct mtd_partition r2d_partitions[] = {
+	{
+		.name		= "U-Boot",
+		.offset		= 0x00000000,
+		.size		= 0x00040000,
+		.mask_flags	= MTD_WRITEABLE,
+	}, {
+		.name		= "Environment",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= 0x00040000,
+		.mask_flags	= MTD_WRITEABLE,
+	}, {
+		.name		= "Kernel",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= 0x001c0000,
+	}, {
+		.name		= "Flash_FS",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= MTDPART_SIZ_FULL,
+	}
+};
+
+static struct physmap_flash_data flash_data = {
+	.width		= 2,
+	.nr_parts	= ARRAY_SIZE(r2d_partitions),
+	.parts		= r2d_partitions,
+};
+
+static struct resource flash_resource = {
+	.start		= 0x00000000,
+	.end		= 0x02000000,
+	.flags		= IORESOURCE_MEM,
+};
+
+static struct platform_device flash_device = {
+	.name		= "physmap-flash",
+	.id		= -1,
+	.resource	= &flash_resource,
+	.num_resources	= 1,
+	.dev		= {
+		.platform_data = &flash_data,
+	},
+};
+
 static struct platform_device *rts7751r2d_devices[] __initdata = {
 	&sm501_device,
 	&heartbeat_device,
@@ -203,6 +250,9 @@ static int __init rts7751r2d_devices_setup(void)
 	if (register_trapped_io(&cf_trapped_io) == 0)
 		platform_device_register(&cf_ide_device);
 
+	if (mach_is_r2d_plus())
+		platform_device_register(&flash_device);
+
 	spi_register_board_info(spi_bus, ARRAY_SIZE(spi_bus));
 
 	return platform_add_devices(rts7751r2d_devices,
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index 2bc1c97..a860435 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -424,7 +424,92 @@ CONFIG_FIRMWARE_IN_KERNEL=y
 CONFIG_EXTRA_FIRMWARE=""
 # CONFIG_SYS_HYPERVISOR is not set
 # CONFIG_CONNECTOR is not set
-# CONFIG_MTD is not set
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=y
+CONFIG_MTD_PARTITIONS=y
+# CONFIG_MTD_TESTS is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_AR7_PARTS is not set
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=y
+# CONFIG_MTD_BLKDEVS is not set
+# CONFIG_MTD_BLOCK is not set
+# CONFIG_MTD_BLOCK_RO is not set
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+# CONFIG_RFD_FTL is not set
+# CONFIG_SSFDC is not set
+# CONFIG_MTD_OOPS is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=y
+# CONFIG_MTD_JEDECPROBE is not set
+CONFIG_MTD_GEN_PROBE=y
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+# CONFIG_MTD_CFI_INTELEXT is not set
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_CFI_STAA is not set
+CONFIG_MTD_CFI_UTIL=y
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+# CONFIG_MTD_ABSENT is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_COMPLEX_MAPPINGS is not set
+CONFIG_MTD_PHYSMAP=y
+# CONFIG_MTD_PHYSMAP_COMPAT is not set
+# CONFIG_MTD_INTEL_VR_NOR is not set
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+# CONFIG_MTD_PMC551 is not set
+# CONFIG_MTD_DATAFLASH is not set
+# CONFIG_MTD_M25P80 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLOCK2MTD is not set
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+# CONFIG_MTD_NAND is not set
+# CONFIG_MTD_ONENAND is not set
+
+#
+# LPDDR flash memory drivers
+#
+# CONFIG_MTD_LPDDR is not set
+
+#
+# UBI - Unsorted block images
+#
+# CONFIG_MTD_UBI is not set
 # CONFIG_PARPORT is not set
 CONFIG_BLK_DEV=y
 # CONFIG_BLK_CPQ_CISS_DA is not set
-- 
cgit v0.10.2


From ac2ff946a53e7bd0ae98f4e5d1d6c1b1dced82e5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 12:38:32 +0900
Subject: mg_disk: fix locking

IRQ and timeout handlers call functions which expect locked queue lock
without locking it.  Fix it.

While at it, convert 0s used as null pointer constant to NULLs.

[ Impact: fix locking, cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index fb39d9a..d3e72ad 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -160,11 +160,16 @@ static irqreturn_t mg_irq(int irq, void *dev_id)
 	struct mg_host *host = dev_id;
 	void (*handler)(struct mg_host *) = host->mg_do_intr;
 
-	host->mg_do_intr = 0;
+	spin_lock(&host->lock);
+
+	host->mg_do_intr = NULL;
 	del_timer(&host->timer);
 	if (!handler)
 		handler = mg_unexpected_intr;
 	handler(host);
+
+	spin_unlock(&host->lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -319,7 +324,7 @@ static void mg_read(struct request *req)
 
 	remains = req->nr_sectors;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, 0) !=
+	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
 			MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
@@ -363,7 +368,7 @@ static void mg_write(struct request *req)
 
 	remains = req->nr_sectors;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, 0) !=
+	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
 			MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
@@ -521,9 +526,11 @@ void mg_times_out(unsigned long data)
 	char *name;
 	struct request *req;
 
+	spin_lock_irq(&host->lock);
+
 	req = elv_next_request(host->breq);
 	if (!req)
-		return;
+		goto out_unlock;
 
 	host->mg_do_intr = NULL;
 
@@ -534,6 +541,8 @@ void mg_times_out(unsigned long data)
 	mg_bad_rw_intr(host);
 
 	mg_request(host->breq);
+out_unlock:
+	spin_unlock_irq(&host->lock);
 }
 
 static void mg_request_poll(struct request_queue *q)
-- 
cgit v0.10.2


From 7090a0a97f55cbf47547a140fcc5a349f32c598c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 12:38:33 +0900
Subject: mg_disk: fix CONFIG_LBD=y warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drivers/block/mg_disk.c: In function ‘mg_dump_status’:
drivers/block/mg_disk.c:265: warning: format ‘%ld’ expects type ‘long int’, but
argument 2 has type ‘sector_t’

[ Impact: kill build warning ]

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index d3e72ad..f389835 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -79,7 +79,7 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
-					printk(", sector=%ld", req->sector);
+					printk(", sector=%u", (u32)req->sector);
 			}
 
 		}
-- 
cgit v0.10.2


From e93b9fb7d85da4fd9d5171649e5ddcac1dd572bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 12:38:33 +0900
Subject: hd: fix locking

hd dance around local irq and HD_IRQ enable without achieving much.
It ends up transferring data from irq handler with both local irq and
HD_IRQ disabled.  The only place it actually does something is while
transferring the first block of a request which it does with HD_IRQ
disabled but local irq enabled.

Unfortunately, the dancing is horribly broken from locking POV.  IRQ
and timeout handlers access block queue without grabbing the queue
lock and running the driver in SMP configuration crashes the whole
machine pretty quickly.

Remove meaningless irq enable/disable dancing and add proper locking
in issue, irq and timeout paths.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 3c11f06..baaa9e4 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -509,7 +509,6 @@ ok_to_write:
 	if (i > 0) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
-		local_irq_enable();
 	} else {
 #if (HD_DELAY > 0)
 		last_req = read_timer();
@@ -541,8 +540,7 @@ static void hd_times_out(unsigned long dummy)
 	if (!CURRENT)
 		return;
 
-	disable_irq(HD_IRQ);
-	local_irq_enable();
+	spin_lock_irq(hd_queue->queue_lock);
 	reset = 1;
 	name = CURRENT->rq_disk->disk_name;
 	printk("%s: timeout\n", name);
@@ -552,9 +550,8 @@ static void hd_times_out(unsigned long dummy)
 #endif
 		end_request(CURRENT, 0);
 	}
-	local_irq_disable();
 	hd_request();
-	enable_irq(HD_IRQ);
+	spin_unlock_irq(hd_queue->queue_lock);
 }
 
 static int do_special_op(struct hd_i_struct *disk, struct request *req)
@@ -592,7 +589,6 @@ static void hd_request(void)
 		return;
 repeat:
 	del_timer(&device_timer);
-	local_irq_enable();
 
 	req = CURRENT;
 	if (!req) {
@@ -601,7 +597,6 @@ repeat:
 	}
 
 	if (reset) {
-		local_irq_disable();
 		reset_hd();
 		return;
 	}
@@ -660,9 +655,7 @@ repeat:
 
 static void do_hd_request(struct request_queue *q)
 {
-	disable_irq(HD_IRQ);
 	hd_request();
-	enable_irq(HD_IRQ);
 }
 
 static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -684,12 +677,16 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id)
 {
 	void (*handler)(void) = do_hd;
 
+	spin_lock(hd_queue->queue_lock);
+
 	do_hd = NULL;
 	del_timer(&device_timer);
 	if (!handler)
 		handler = unexpected_hd_interrupt;
 	handler();
-	local_irq_enable();
+
+	spin_unlock(hd_queue->queue_lock);
+
 	return IRQ_HANDLED;
 }
 
-- 
cgit v0.10.2


From e686307fdc84f249490e6c9da92fcb2424491f14 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 17 Apr 2009 08:41:21 +0200
Subject: loop: use BIO list management functions

Now that the bio list management stuff is generic, convert loop to use
bio lists instead of its own private bio list implementation.

Cc:  Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ddae808..9ca4bb0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -511,11 +511,7 @@ out:
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
-	if (lo->lo_biotail) {
-		lo->lo_biotail->bi_next = bio;
-		lo->lo_biotail = bio;
-	} else
-		lo->lo_bio = lo->lo_biotail = bio;
+	bio_list_add(&lo->lo_bio_list, bio);
 }
 
 /*
@@ -523,16 +519,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio)
  */
 static struct bio *loop_get_bio(struct loop_device *lo)
 {
-	struct bio *bio;
-
-	if ((bio = lo->lo_bio)) {
-		if (bio == lo->lo_biotail)
-			lo->lo_biotail = NULL;
-		lo->lo_bio = bio->bi_next;
-		bio->bi_next = NULL;
-	}
-
-	return bio;
+	return bio_list_pop(&lo->lo_bio_list);
 }
 
 static int loop_make_request(struct request_queue *q, struct bio *old_bio)
@@ -609,12 +596,13 @@ static int loop_thread(void *data)
 
 	set_user_nice(current, -20);
 
-	while (!kthread_should_stop() || lo->lo_bio) {
+	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
 
 		wait_event_interruptible(lo->lo_event,
-				lo->lo_bio || kthread_should_stop());
+				!bio_list_empty(&lo->lo_bio_list) ||
+				kthread_should_stop());
 
-		if (!lo->lo_bio)
+		if (bio_list_empty(&lo->lo_bio_list))
 			continue;
 		spin_lock_irq(&lo->lo_lock);
 		bio = loop_get_bio(lo);
@@ -841,7 +829,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 
-	lo->lo_bio = lo->lo_biotail = NULL;
+	bio_list_init(&lo->lo_bio_list);
 
 	/*
 	 * set queue make_request_fn, and add limits based on lower level
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7b214fd..f37ca8c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -506,7 +506,7 @@ static inline int bio_has_data(struct bio *bio)
 }
 
 /*
- * BIO list managment for use by remapping drivers (e.g. DM or MD).
+ * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
  *
  * A bio_list anchors a singly-linked list of bios chained through the bi_next
  * member of the bio.  The bio_list also caches the last list member to allow
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 4072544..66c194e 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -56,8 +56,7 @@ struct loop_device {
 	gfp_t		old_gfp_mask;
 
 	spinlock_t		lo_lock;
-	struct bio 		*lo_bio;
-	struct bio		*lo_biotail;
+	struct bio_list		lo_bio_list;
 	int			lo_state;
 	struct mutex		lo_ctl_mutex;
 	struct task_struct	*lo_thread;
-- 
cgit v0.10.2


From 924cec7789f65ab7f022256f6533ecba0747b5f3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: block: clear req->errors on bio completion only for fs requests

Impact: subtle behavior change

For fs requests, rq is only carrier of bios and rq error status as a
whole doesn't mean much.  This is the reason why rq->errors is being
cleared on each partial completion of a request as on each partial
completion the error status is transferred to the respective bios.

For pc requests, rq->errors is used to carry error status to the
issuer and thus __end_that_request_first() doesn't clear it on such
cases.

The condition was fine till now as only fs and pc requests have used
bio and thus the bio completion path.  However, future changes will
unify data accesses to bio and all non fs users care about rq error
status.  Clear rq->errors on bio completion only for fs requests.

In general, the implicit clearing is a bit too subtle especially as
the meaning of rq->errors is completely dependent on low level
drivers.  Unifying / cleaning up rq->errors usage and letting llds
manage it would be better.  TODO comment added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>

diff --git a/block/blk-core.c b/block/blk-core.c
index 2998fe3..41bc0ff 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1741,10 +1741,14 @@ static int __end_that_request_first(struct request *req, int error,
 	trace_block_rq_complete(req->q, req);
 
 	/*
-	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
-	 * sense key with us all the way through
+	 * For fs requests, rq is just carrier of independent bio's
+	 * and each partial completion should be handled separately.
+	 * Reset per-request error on each partial completion.
+	 *
+	 * TODO: tj: This is too subtle.  It would be better to let
+	 * low level drivers do what they see fit.
 	 */
-	if (!blk_pc_request(req))
+	if (blk_fs_request(req))
 		req->errors = 0;
 
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
-- 
cgit v0.10.2


From 0de57fb93b1daaeaecb658a98b3299ae460c02e9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide-tape: remove back-to-back REQUEST_SENSE detection

Impact: fix an oops which always triggers

ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when
checking for back-to-back request sense issues but drive->pc can be
NULL and even when it's not NULL, it's not safe to dereference it once
the previous command is complete because pc could have been freed or
was on stack.  Kill back-to-back REQUEST_SENSE detection.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index cb942a9..3a53e08 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	if (drive->pc->c[0] == REQUEST_SENSE &&
-	    pc->c[0] == REQUEST_SENSE) {
-		printk(KERN_ERR "ide-tape: possible ide-tape.c bug - "
-			"Two request sense in serial were issued\n");
-	}
-
 	if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE)
 		drive->failed_pc = pc;
 
-- 
cgit v0.10.2


From 220d06b5531e7b8a6226b2fdfb21198c3ccc4f76 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide: use blk_run_queue() instead of blk_start_queueing()

blk_start_queueing() is being phased out in favor of
[__]blk_run_queue().  Switch.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 310d03f..a914023 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 			start_queue = 1;
 		spin_unlock_irq(&hwif->lock);
 
-		if (start_queue) {
-			spin_lock_irq(q->queue_lock);
-			blk_start_queueing(q);
-			spin_unlock_irq(q->queue_lock);
-		}
+		if (start_queue)
+			blk_run_queue(q);
 		return;
 	}
 	spin_unlock_irq(&hwif->lock);
-- 
cgit v0.10.2


From b2963ac1738542d30305d7e12c8c078a383a425c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide: don't set REQ_SOFTBARRIER

ide doesn't have to worry about REQ_SOFTBARRIER.  Don't set it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a9fbe2c..c243880 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
 	cmd->protocol = ATA_PROT_NODATA;
 
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	rq->special = cmd;
 }
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index c1c25eb..5991b23 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
 		ret = rq->errors;
 	blk_put_request(rq);
-- 
cgit v0.10.2


From 214ae19104141404f18ad6651752eb38e3dd584e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide kill unused ide_cmd->special

Impact: removal of unused field

No one uses ide_cmd->special anymore.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/ide.h b/include/linux/ide.h
index ff65fff..846a1e1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -324,7 +324,6 @@ struct ide_cmd {
 	unsigned int		cursg_ofs;
 
 	struct request		*rq;		/* copy of request */
-	void			*special;	/* valid_t generally */
 };
 
 /* ATAPI packet command flags */
-- 
cgit v0.10.2


From 59a4f6f355fc718581ddcf1bb45a469d4756c035 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: ide-cd: clear sense buffer before issuing request sense

Impact: code simplification

ide_cd_request_sense_fixup() clears the tail of the sense buffer if
the device didn't completely fill it.  This patch makes
cdrom_queue_request_sense() clear the sense buffer before issuing the
command instead of clearing it afterwards.  This simplifies code and
eases future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3d4e099..b6a0d12 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	if (sense == NULL)
 		sense = &info->sense_data;
 
+	memset(sense, 0, 18);
+
 	/* stuff the sense request in front of our current request */
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_ATA_PC;
@@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 	 * and some drives don't send them.  Sigh.
 	 */
 	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    cmd->nleft > 0 && cmd->nleft <= 5) {
-		unsigned int ofs = cmd->nbytes - cmd->nleft;
-
-		while (cmd->nleft > 0) {
-			*((u8 *)rq->data + ofs++) = 0;
-			cmd->nleft--;
-		}
-	}
+	    cmd->nleft > 0 && cmd->nleft <= 5)
+		cmd->nleft = 0;
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
-- 
cgit v0.10.2


From 8968932e54db35cf9d69cfbbd50c26dfaaa586c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-floppy: block pc always uses bio

Impact: remove unnecessary code path

Block pc requests always use bio and rq->data is always NULL.  No need
to worry about !rq->bio cases in idefloppy_block_pc_cmd().  Note that
ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg
fine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 2b4868d..3b22e06 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len && rq_data_dir(rq) == WRITE)
-		pc->flags |= PC_FLAG_WRITING;
-	pc->buf = rq->data;
-	if (rq->bio)
+	if (rq->data_len) {
 		pc->flags |= PC_FLAG_DMA_OK;
-	/*
-	 * possibly problematic, doesn't look like ide-floppy correctly
-	 * handled scattered requests if dma fails...
-	 */
+		if (rq_data_dir(rq) == WRITE)
+			pc->flags |= PC_FLAG_WRITING;
+	}
+	/* pio will be performed by ide_pio_bytes() which handles sg fine */
+	pc->buf = NULL;
 	pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
-- 
cgit v0.10.2


From d868ca24302e99a0e8a86071ca2c66273edf97d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-taskfile: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide_raw_taskfile() directly uses rq->buffer to carry pointer to the
data buffer.  This complicates both block interface and ide backend
request handling.  Use blk_rq_map_kern() instead and drop special
handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg().

Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it
to initialize bio rw flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 35dc38d..9b9e8b1 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-		sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE);
-		cmd->sg_nents = 1;
-	} else if (!rq->bio) {
+	if (!rq->bio) {
 		sg_init_one(sg, rq->data, rq->data_len);
 		cmd->sg_nents = 1;
 	} else
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4aa6223..f400eb4 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->buffer = buf;
+
+	if (cmd->tf_flags & IDE_TFLAG_WRITE)
+		rq->cmd_flags |= REQ_RW;
 
 	/*
 	 * (ks) We transfer currently only whole sectors.
@@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 * if we would find a solution to transfer any size.
 	 * To support special commands like READ LONG.
 	 */
-	rq->hard_nr_sectors = rq->nr_sectors = nsect;
-	rq->hard_cur_sectors = rq->current_nr_sectors = nsect;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		rq->cmd_flags |= REQ_RW;
+	if (nsect) {
+		error = blk_rq_map_kern(drive->queue, rq, buf,
+					nsect * SECTOR_SIZE, __GFP_WAIT);
+		if (error)
+			goto put_req;
+	}
 
 	rq->special = cmd;
 	cmd->rq = rq;
 
 	error = blk_execute_rq(drive->queue, NULL, rq, 0);
-	blk_put_request(rq);
 
+put_req:
+	blk_put_request(rq);
 	return error;
 }
 
-- 
cgit v0.10.2


From ac0b0113ddbab3ed2388132d368c97292f9f3c84 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-atapi: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-atapi uses rq->buffer as private opaque value for internal special
requests.  rq->special isn't used for these cases (the only case where
rq->special is used is for ide-tape rw requests).  Use rq->special
instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7201b17..2894577 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 	rq->rq_disk = disk;
 
 	if (pc->req_xfer) {
@@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
 		rq->data = pc->buf;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3b22e06..9460033 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 	} else if (blk_special_request(rq)) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3a53e08..aadf53c 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		goto out;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 		rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
 		rq->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
-- 
cgit v0.10.2


From 1f181d2b1569dfb88a584a6e1847e9e1c7645951 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-cd uses rq->buffer to carry pointer to the original request when
issuing REQUEST_SENSE.  Use rq->special instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index b6a0d12..bb77c79 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	rq->cmd_type = REQ_TYPE_SENSE;
 	rq->cmd_flags |= REQ_PREEMPT;
 
-	/* NOTE! Save the failed command in "rq->buffer" */
-	rq->buffer = (void *) failed_command;
+	/* NOTE! Save the failed command in "rq->special" */
+	rq->special = (void *)failed_command;
 
 	if (failed_command)
 		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
@@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
+	 * For REQ_TYPE_SENSE, "rq->special" points to the original
 	 * failed request
 	 */
-	struct request *failed = (struct request *)rq->buffer;
+	struct request *failed = (struct request *)rq->special;
 	struct cdrom_info *info = drive->driver_data;
 	void *sense = &info->sense_data;
 
-- 
cgit v0.10.2


From e69d800f7e8797a8e3423380ee9d8ca1cb90c388 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide: add helpers for preparing sense requests

This is in preparation of removing the queueing of a sense request out
of the IRQ handler path.

Use struct request_sense as a general sense buffer for all ATAPI
devices ide-{floppy,tape,cd}.

tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as
      it can cause deadlock.  Converted to use inline struct request
      and blk_rq_init().

    * Added xfer / cdb len selection depending on device type.

    * All sense prep logics folded into ide_prep_sense() which never
      fails.

    * hwif->rq clearing and sense_rq used handling moved into
      ide_queue_sense_rq().

    * blk_rq_map_kern() conversion is moved to later patch.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2894577..c6e0348 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq)
+{
+	struct request_sense *sense = &drive->sense_data;
+	struct request *sense_rq = &drive->sense_rq;
+	unsigned int cmd_len, sense_len;
+
+	debug_log("%s: enter\n", __func__);
+
+	switch (drive->media) {
+	case ide_floppy:
+		cmd_len = 255;
+		sense_len = 18;
+		break;
+	case ide_tape:
+		cmd_len = 20;
+		sense_len = 20;
+		break;
+	default:
+		cmd_len = 18;
+		sense_len = 18;
+	}
+
+	BUG_ON(sense_len > sizeof(*sense));
+
+	if (blk_sense_request(rq) || drive->sense_rq_armed)
+		return;
+
+	memset(sense, 0, sizeof(*sense));
+
+	blk_rq_init(rq->q, sense_rq);
+	sense_rq->rq_disk = rq->rq_disk;
+
+	sense_rq->data = sense;
+	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
+	sense_rq->cmd[4] = cmd_len;
+	sense_rq->data_len = sense_len;
+
+	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_flags |= REQ_PREEMPT;
+
+	if (drive->media == ide_tape)
+		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+
+	drive->sense_rq_armed = true;
+}
+EXPORT_SYMBOL_GPL(ide_prep_sense);
+
+void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+{
+	BUG_ON(!drive->sense_rq_armed);
+
+	drive->sense_rq.special = special;
+	drive->sense_rq_armed = false;
+
+	drive->hwif->rq = NULL;
+
+	elv_add_request(drive->queue, &drive->sense_rq,
+			ELEVATOR_INSERT_FRONT, 0);
+}
+EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
+
 /*
  * Called when an error was detected during the last packet command.
  * We queue a request sense packet command in the head of the request list.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 846a1e1..a69ccac 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -26,6 +26,9 @@
 #include <asm/io.h>
 #include <asm/mutex.h>
 
+/* for request_sense */
+#include <linux/cdrom.h>
+
 #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300)
 # define SUPPORT_VLB_SYNC 0
 #else
@@ -602,6 +605,11 @@ struct ide_drive_s {
 
 	struct ide_atapi_pc request_sense_pc;
 	struct request request_sense_rq;
+
+	/* current sense rq and buffer */
+	bool sense_rq_armed;
+	struct request sense_rq;
+	struct request_sense sense_data;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *, struct gendisk *);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq);
+void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+
 int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
-- 
cgit v0.10.2


From c457ce874a0f3dfa3d5e9f2309789f6f34e24325 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd: convert to using generic sense request

Preallocate a sense request in the ->do_request method and reinitialize
it only on demand, in case it's been consumed in the IRQ handler path.
The reason for this is that we don't want to be mapping rq to bio in
the IRQ path and introduce all kinds of unnecessary hacks to the block
layer.

tj: * Both user and kernel PC requests expect sense data to be stored
      in separate storage other than drive->sense_data.  Copy sense
      data to rq->sense on completion if rq->sense is not NULL.  This
      fixes bogus sense data on PC requests.

As a result, remove cdrom_queue_request_sense.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index bb77c79..061d7bb 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 	ide_cd_log_error(drive->name, failed_command, sense);
 }
 
-static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
-				      struct request *failed_command)
-{
-	struct cdrom_info *info		= drive->driver_data;
-	struct request *rq		= &drive->request_sense_rq;
-
-	ide_debug_log(IDE_DBG_SENSE, "enter");
-
-	if (sense == NULL)
-		sense = &info->sense_data;
-
-	memset(sense, 0, 18);
-
-	/* stuff the sense request in front of our current request */
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_ATA_PC;
-	rq->rq_disk = info->disk;
-
-	rq->data = sense;
-	rq->cmd[0] = GPCMD_REQUEST_SENSE;
-	rq->cmd[4] = 18;
-	rq->data_len = 18;
-
-	rq->cmd_type = REQ_TYPE_SENSE;
-	rq->cmd_flags |= REQ_PREEMPT;
-
-	/* NOTE! Save the failed command in "rq->special" */
-	rq->special = (void *)failed_command;
-
-	if (failed_command)
-		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
-					     failed_command->cmd[0]);
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
@@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 	 * failed request
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct cdrom_info *info = drive->driver_data;
-	void *sense = &info->sense_data;
+	struct request_sense *sense = &drive->sense_data;
 
 	if (failed) {
 		if (failed->sense) {
+			/*
+			 * Sense is always read into drive->sense_data.
+			 * Copy back if the failed request has its
+			 * sense pointer set.
+			 */
+			memcpy(failed->sense, sense, 18);
 			sense = failed->sense;
 			failed->sense_len = rq->sense_len;
 		}
@@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		cdrom_queue_request_sense(drive, NULL, NULL);
+		ide_queue_sense_rq(drive, NULL);
 	return 1;
 
 end_request:
@@ -445,7 +412,7 @@ end_request:
 
 		hwif->rq = NULL;
 
-		cdrom_queue_request_sense(drive, rq->sense, rq);
+		ide_queue_sense_rq(drive, rq);
 		return 1;
 	} else
 		return 2;
@@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		goto out_end;
 	}
 
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 1d97101..93a3cf1b 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -87,10 +87,6 @@ struct cdrom_info {
 
 	struct atapi_toc *toc;
 
-	/* The result of the last successful request sense command
-	   on this device. */
-	struct request_sense sense_data;
-
 	u8 max_speed;		/* Max speed of the drive. */
 	u8 current_speed;	/* Current speed of the drive. */
 
-- 
cgit v0.10.2


From 068753203e6cd085664a62e0fc0636e19b148a12 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-atapi: convert ide-{floppy,tape} to using preallocated sense
 buffer

Since we're issuing REQ_TYPE_SENSE now we need to allow those types of
rqs in the ->do_request callbacks. As a future improvement, sense_len
assignment might be unified across all ATAPI devices. Borislav to
check with specs and test.

As a result, get rid of ide_queue_pc_head() and
drive->request_sense_rq.

tj: * Init request sense ide_atapi_pc from sense request.  In the
      longer timer, it would probably better to fold
      ide_create_request_sense_cmd() into its only current user -
      ide_floppy_get_format_progress().

    * ide_retry_pc() no longer takes @disk.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index c6e0348..972c522 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -80,34 +80,6 @@ void ide_init_pc(struct ide_atapi_pc *pc)
 EXPORT_SYMBOL_GPL(ide_init_pc);
 
 /*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
-			      struct ide_atapi_pc *pc, struct request *rq)
-{
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = (char *)pc;
-	rq->rq_disk = disk;
-
-	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
-	}
-
-	memcpy(rq->cmd, pc->c, 12);
-	if (drive->media == ide_tape)
-		rq->cmd[13] = REQ_IDETAPE_PC1;
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
-/*
  * Add a special packet command request to the tail of the request queue,
  * and wait for it to be serviced.
  */
@@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
 /*
  * Called when an error was detected during the last packet command.
- * We queue a request sense packet command in the head of the request list.
+ * We queue a request sense packet command at the head of the request
+ * queue.
  */
-void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
+void ide_retry_pc(ide_drive_t *drive)
 {
-	struct request *rq = &drive->request_sense_rq;
+	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	ide_create_request_sense_cmd(drive, pc);
+
+	/* init pc from sense_rq */
+	ide_init_pc(pc);
+	memcpy(pc->c, sense_rq->cmd, 12);
+	pc->buf = sense_rq->data;
+	pc->req_xfer = sense_rq->data_len;
+
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-	ide_queue_pc_head(drive, disk, pc, rq);
+
+	ide_queue_sense_rq(drive, pc);
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
 
 			/* Retry operation */
-			ide_retry_pc(drive, rq->rq_disk);
+			ide_retry_pc(drive);
 
 			/* queued, but not started */
 			return ide_stopped;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 9460033..d3302cc 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		}
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-	} else if (blk_special_request(rq)) {
+	} else if (blk_special_request(rq) || blk_sense_request(rq)) {
 		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
@@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		goto out_end;
 	}
 
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index aadf53c..8324dfa 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
 				printk(KERN_ERR "ide-tape: %s: I/O error, ",
 						tape->name);
 			/* Retry operation */
-			ide_retry_pc(drive, tape->disk);
+			ide_retry_pc(drive);
 			return ide_stopped;
 		}
 		pc->error = 0;
@@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			(unsigned long long)rq->sector, rq->nr_sectors,
 			rq->current_nr_sectors);
 
-	if (!blk_special_request(rq)) {
+	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
 		printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
 			"request queue (%d)\n", drive->name, rq->cmd_type);
@@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	BUG();
 
 out:
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a69ccac..9e67cca 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -604,7 +604,6 @@ struct ide_drive_s {
 	unsigned long atapi_flags;
 
 	struct ide_atapi_pc request_sense_pc;
-	struct request request_sense_rq;
 
 	/* current sense rq and buffer */
 	bool sense_rq_armed;
@@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
 int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
 int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *, struct gendisk *);
+void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
 void ide_queue_sense_rq(ide_drive_t *drive, void *special);
-- 
cgit v0.10.2


From 02e7cf8f848841ca21864ccd019e480b73c323b7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: ide-cd,atapi: use bio for internal commands

Impact: unify request data buffer handling

rq->data is used mostly to pass kernel buffer through request queue
without using bio.  There are only a couple of places which still do
this in kernel and converting to bio isn't difficult.

This patch converts ide-cd and atapi to use bio instead of rq->data
for request sense and internal pc commands.  With previous change to
unify sense request handling, this is relatively easily achieved by
adding blk_rq_map_kern() during sense_rq prep and PC issue.

If blk_rq_map_kern() fails for sense, the error is deferred till sense
issue and aborts the failed command which triggered the sense.  Note
that this is a slim possibility as sense prep is done on each command
issue, so for the above condition to actually trigger, all preps since
the last sense issue till the issue of the request which would require
a sense should fail.

* do_request functions might sleep now.  This should be okay as ide
  request_fn - do_ide_request() - is invoked only from make_request
  and plug work.  Make sure this is the case by adding might_sleep()
  to do_ide_request().

* Functions which access the read sense data before the sense request
  is complete now should access bio_data(sense_rq->bio) as the sense
  buffer might have been copied during blk_rq_map_kern().

* ide-tape updated to map sg.

* cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC
  special case.  Simplified.

* tp_ops->output/input_data path dropped from ide_pc_intr().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 972c522..5cefe12 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
+		error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
+					GFP_NOIO);
+		if (error)
+			goto put_req;
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		rq->cmd[13] = REQ_IDETAPE_PC1;
 	error = blk_execute_rq(drive->queue, disk, rq, 0);
+put_req:
 	blk_put_request(rq);
-
 	return error;
 }
 EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
@@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	struct request_sense *sense = &drive->sense_data;
 	struct request *sense_rq = &drive->sense_rq;
 	unsigned int cmd_len, sense_len;
+	int err;
 
 	debug_log("%s: enter\n", __func__);
 
@@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	memset(sense, 0, sizeof(*sense));
 
 	blk_rq_init(rq->q, sense_rq);
-	sense_rq->rq_disk = rq->rq_disk;
 
-	sense_rq->data = sense;
+	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
+			      GFP_NOIO);
+	if (unlikely(err)) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: failed to map sense buffer\n",
+			       drive->name);
+		return;
+	}
+
+	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->data_len = sense_len;
-
 	sense_rq->cmd_type = REQ_TYPE_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
@@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(ide_prep_sense);
 
-void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 {
-	BUG_ON(!drive->sense_rq_armed);
+	/* deferred failure from ide_prep_sense() */
+	if (!drive->sense_rq_armed) {
+		printk(KERN_WARNING "%s: failed queue sense request\n",
+		       drive->name);
+		return -ENOMEM;
+	}
 
 	drive->sense_rq.special = special;
 	drive->sense_rq_armed = false;
@@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special)
 
 	elv_add_request(drive->queue, &drive->sense_rq,
 			ELEVATOR_INSERT_FRONT, 0);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
@@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive)
 	/* init pc from sense_rq */
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
-	pc->buf = sense_rq->data;
+	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
 	pc->req_xfer = sense_rq->data_len;
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	ide_queue_sense_rq(drive, pc);
+	if (ide_queue_sense_rq(drive, pc))
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xferfunc;
 	unsigned int timeout, done;
 	u16 bcount;
 	u8 stat, ireason, dsc = 0;
@@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape)
+			if (drive->media == ide_tape && !rq->bio)
 				done = ide_rq_bytes(rq); /* FIXME */
 			else
 				done = blk_rq_bytes(rq);
@@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
-
-	if (drive->media == ide_floppy && pc->buf == NULL) {
+	if (drive->media == ide_tape && pc->bh)
+		done = drive->pc_io_buffers(drive, pc, bcount, write);
+	else {
 		done = min_t(unsigned int, bcount, cmd->nleft);
 		ide_pio_bytes(drive, cmd, write, done);
-	} else if (drive->media == ide_tape && pc->bh) {
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	} else {
-		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
-		xferfunc(drive, NULL, pc->cur_pos, done);
 	}
 
 	/* Update the current position */
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 061d7bb..6736287 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
 	 * For REQ_TYPE_SENSE, "rq->special" points to the original
-	 * failed request
+	 * failed request.  Also, the sense data should be read
+	 * directly from rq which might be different from the original
+	 * sense buffer if it got copied during mapping.
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct request_sense *sense = &drive->sense_data;
+	void *sense = bio_data(rq->bio);
 
 	if (failed) {
 		if (failed->sense) {
@@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		ide_queue_sense_rq(drive, NULL);
+		return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
 	return 1;
 
 end_request:
@@ -412,8 +414,7 @@ end_request:
 
 		hwif->rq = NULL;
 
-		ide_queue_sense_rq(drive, rq);
-		return 1;
+		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
 }
@@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		rq->cmd_flags |= cmd_flags;
 		rq->timeout = timeout;
 		if (buffer) {
-			rq->data = buffer;
-			rq->data_len = *bufflen;
+			error = blk_rq_map_kern(drive->queue, rq, buffer,
+						*bufflen, GFP_NOIO);
+			if (error) {
+				blk_put_request(rq);
+				return error;
+			}
 		}
 
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
@@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	drive->dma = 0;
 
 	/* sg request */
-	if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
+	if (rq->bio) {
 		struct request_queue *q = drive->queue;
+		char *buf = bio_data(rq->bio);
 		unsigned int alignment;
-		char *buf;
-
-		if (rq->bio)
-			buf = bio_data(rq->bio);
-		else
-			buf = rq->data;
 
 		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9b9e8b1..3245c2d 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q)
 
 	spin_unlock_irq(q->queue_lock);
 
+	/* HLD do_request() callback might sleep, make sure it's okay */
+	might_sleep();
+
 	if (ide_lock_host(host, hwif))
 		goto plug_device_2;
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8324dfa..9b762a2 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -850,6 +850,9 @@ out:
 
 	cmd.rq = rq;
 
+	ide_init_sg_cmd(&cmd, pc->req_xfer);
+	ide_map_sg(drive, &cmd);
+
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9e67cca..1957461 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
-void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+int ide_queue_sense_rq(ide_drive_t *drive, void *special);
 
 int ide_cd_expiry(ide_drive_t *);
 
-- 
cgit v0.10.2


From 765139ef5f1a4b1d5cb1f1a7a12de7ee61f6500f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:43 +0900
Subject: ide-pm: don't abuse rq->data

Impact: cleanup rq->data usage

ide-pm uses rq->data to carry pointer to struct request_pm_state
through request queue and rq->special is used to carray pointer to
local struct ide_cmd, which isn't necessary.  Use rq->special for
request_pm_state instead and use local ide_cmd in
ide_start_power_step().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3245c2d..6e3094e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
 		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->data;
+			struct request_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 0d8a151..ba1488b 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int ret;
 
 	/* call ACPI _GTM only once */
@@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		ide_acpi_get_timing(hwif);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
@@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int err;
 
 	/* call ACPI _PS0 / _STM only once */
@@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev)
 	ide_acpi_exec_tfs(drive);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
@@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
-	struct ide_cmd *cmd = rq->special;
-
-	memset(cmd, 0, sizeof(*cmd));
+	struct request_pm_state *pm = rq->special;
+	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
 	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
@@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			return ide_stopped;
 		}
 		if (ata_id_flush_ext_enabled(drive->id))
-			cmd->tf.command = ATA_CMD_FLUSH_EXT;
+			cmd.tf.command = ATA_CMD_FLUSH_EXT;
 		else
-			cmd->tf.command = ATA_CMD_FLUSH;
+			cmd.tf.command = ATA_CMD_FLUSH;
 		goto out_do_tf;
 	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		cmd->tf.command = ATA_CMD_STANDBYNOW1;
+		cmd.tf.command = ATA_CMD_STANDBYNOW1;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
 		ide_set_max_pio(drive);
@@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			ide_complete_power_step(drive, rq);
 		return ide_stopped;
 	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		cmd->tf.command = ATA_CMD_IDLEIMMEDIATE;
+		cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
 		/*
@@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 	return ide_stopped;
 
 out_do_tf:
-	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd->valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd->protocol = ATA_PROT_NODATA;
+	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
+	cmd.protocol = ATA_PROT_NODATA;
 
-	return do_rw_taskfile(drive, cmd);
+	return do_rw_taskfile(drive, &cmd);
 }
 
 /**
@@ -181,7 +173,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 	if (blk_pm_suspend_request(rq) &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
-- 
cgit v0.10.2


From 08f370f0a2fb223bf48d0bfa2a173be0393c19dc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape,floppy: fix failed command completion after request sense

Impact: fix infinite retry loop

After a command failed, ide-tape and floppy inserts REQUEST_SENSE in
front of the failed command and according to the result, sets
pc->retries, flags and errors.  After REQUEST_SENSE is complete, the
failed command is again at the front of the queue and if the verdict
was to terminate the request, the issue functions tries to complete it
directly by calling drive->pc_callback() and returning ide_stopped.

However, drive->pc_callback() doesn't complete a request.  It only
prepares for completion of the request.  As a result, this creates an
infinite loop where the failed request is retried perpetually.

Fix it by actually ending the request by calling ide_complete_rq().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d3302cc..d20704a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9b762a2..2b9a136 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 		}
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
cgit v0.10.2


From eb6a61bb9543aa54d62595e27206b3d3c0293bcc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-atapi,tape,floppy: allow ->pc_callback() to change rq->data_len

Impact: allow residual count implementation in ->pc_callback()

rq->data_len has two duties - carrying the number of input bytes on
issue and carrying residual count back to the issuer on completion.
ide-atapi completion callback ->pc_callback() is the right place to do
this but currently ide-atapi depends on rq->data_len carrying the
original request size after calling ->pc_callback() to complete the pc
request.

This patch makes ide_pc_intr(), ide_tape_issue_pc() and
ide_floppy_issue_pc() cache length to complete before calling
->pc_callback() so that it can modify rq->data_len as necessary.

Note: As using rq->data_len for two purposes can make cases like this
      incorrect in subtle ways, future changes will introduce separate
      field for residual count.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 5cefe12..3df5442 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
+		/*
+		 * ->pc_callback() might change rq->data_len for
+		 * residual count, cache total length.
+		 */
+		if (!blk_special_request(rq) &&
+		    (drive->media == ide_tape && !rq->bio))
+			done = ide_rq_bytes(rq);        /* FIXME */
+		else
+			done = blk_rq_bytes(rq);
+
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		if (blk_special_request(rq)) {
 			rq->errors = 0;
-			done = blk_rq_bytes(rq);
 			error = 0;
 		} else {
 
@@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape && !rq->bio)
-				done = ide_rq_bytes(rq); /* FIXME */
-			else
-				done = blk_rq_bytes(rq);
-
 			error = uptodate ? 0 : -EIO;
 		}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d20704a..537b7c5 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 	drive->pc = pc;
 
 	if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
 			ide_floppy_report_error(floppy, pc);
+
 		/* Giving up */
 		pc->error = IDE_DRV_ERROR_GENERAL;
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2b9a136..8226d52 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 
 	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
 		(pc->flags & PC_FLAG_ABORT)) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		/*
 		 * We will "abort" retrying a packet command in case legitimate
 		 * error code was received (crossing a filemark, or end of the
@@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 			/* Giving up */
 			pc->error = IDE_DRV_ERROR_GENERAL;
 		}
+
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
cgit v0.10.2


From 7b13354eeaabaf6283b8c669a7d67d104ce7c638 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: use single continuous buffer

Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers

ide-tape has its own multiple buffer mechanism using struct
idetape_bh.  It allocates buffer with decreasing order-of-two
allocations so that it results in minimum number of segments.
However, the implementation is quite complex and works in a way that
no other block or ide driver works necessitating a lot of special case
handling.

The benefit this complex allocation scheme brings is questionable as
PIO or DMA the number of segments (16 maximum) doesn't make any
noticeable difference and it also doesn't negate the need for multiple
order allocation which can fail under memory pressure or high
fragmentation although it does lower the highest order necessary by
one when the buffer size isn't power of two.

As the first step to remove the custom buffer management, this patch
makes ide-tape allocate single continous buffer.  The maximum order is
four.  I doubt the change would cause any trouble but if it ever
matters, it should be converted to regular sg mechanism like everyone
else and even in that case dropping custom buffer handling and moving
to standard mechanism first make sense as an intermediate step.

This patch makes the first bh to contain the whole buffer and drops
multi bh handling code.  Following patches will make further changes.

This patch has the side effect of killing OOM triggered by allocation
path and fixing DMA transfers.  Previously, bug in alloc path
triggered OOM on command issue and commands were passed to DMA engine
without DMA-mapping all the segments.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8226d52..b2afbc7 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -134,7 +134,6 @@ enum {
 struct idetape_bh {
 	u32 b_size;
 	atomic_t b_count;
-	struct idetape_bh *b_reqnext;
 	char *b_data;
 };
 
@@ -228,10 +227,6 @@ typedef struct ide_tape_obj {
 	char *b_data;
 	int b_count;
 
-	int pages_per_buffer;
-	/* Wasted space in each stage */
-	int excess_bh_size;
-
 	/* Measures average tape speed */
 	unsigned long avg_time;
 	int avg_size;
@@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min(
 			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
 			bcount);
@@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 					atomic_read(&bh->b_count), count);
 		bcount -= count;
 		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			pc->bh = NULL;
 	}
 
-	pc->bh = bh;
-
 	return bcount;
 }
 
@@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
 		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
 		bcount -= count;
 		pc->b_data += count;
 		pc->b_count -= count;
-		if (!pc->b_count) {
-			bh = bh->b_reqnext;
-			pc->bh = bh;
-			if (bh) {
-				pc->b_data = bh->b_data;
-				pc->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		if (!pc->b_count)
+			pc->bh = NULL;
 	}
 
 	return bcount;
@@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
 {
 	struct idetape_bh *bh = pc->bh;
-	int count;
 	unsigned int bcount = pc->xferred;
 
 	if (pc->flags & PC_FLAG_WRITING)
 		return;
-	while (bcount) {
-		if (bh == NULL) {
+	if (bcount) {
+		if (bh == NULL || bcount > bh->b_size) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return;
 		}
-		count = min((unsigned int)bh->b_size, (unsigned int)bcount);
-		atomic_set(&bh->b_count, count);
+		atomic_set(&bh->b_count, bcount);
 		if (atomic_read(&bh->b_count) == bh->b_size)
-			bh = bh->b_reqnext;
-		bcount -= count;
+			pc->bh = NULL;
 	}
-	pc->bh = bh;
 }
 
 /*
@@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 /* Free data buffers completely. */
 static void ide_tape_kfree_buffer(idetape_tape_t *tape)
 {
-	struct idetape_bh *prev_bh, *bh = tape->merge_bh;
-
-	while (bh) {
-		u32 size = bh->b_size;
-
-		while (size) {
-			unsigned int order = fls(size >> PAGE_SHIFT)-1;
-
-			if (bh->b_data)
-				free_pages((unsigned long)bh->b_data, order);
+	struct idetape_bh *bh = tape->merge_bh;
 
-			size &= (order-1);
-			bh->b_data += (1 << order) * PAGE_SIZE;
-		}
-		prev_bh = bh;
-		bh = bh->b_reqnext;
-		kfree(prev_bh);
-	}
+	kfree(bh->b_data);
+	kfree(bh);
 }
 
 static void ide_tape_handle_dsc(ide_drive_t *);
@@ -861,117 +823,50 @@ out:
 }
 
 /*
- * The function below uses __get_free_pages to allocate a data buffer of size
- * tape->buffer_size (or a bit more). We attempt to combine sequential pages as
- * much as possible.
- *
- * It returns a pointer to the newly allocated buffer, or NULL in case of
- * failure.
+ * It returns a pointer to the newly allocated buffer, or NULL in case
+ * of failure.
  */
 static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full, int clear)
-{
-	struct idetape_bh *prev_bh, *bh, *merge_bh;
-	int pages = tape->pages_per_buffer;
-	unsigned int order, b_allocd;
-	char *b_data = NULL;
-
-	merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	bh = merge_bh;
-	if (bh == NULL)
-		goto abort;
-
-	order = fls(pages) - 1;
-	bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-	if (!bh->b_data)
-		goto abort;
-	b_allocd = (1 << order) * PAGE_SIZE;
-	pages &= (order-1);
-
-	if (clear)
-		memset(bh->b_data, 0, b_allocd);
-	bh->b_reqnext = NULL;
-	bh->b_size = b_allocd;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+						  int full)
+{
+	struct idetape_bh *bh;
 
-	while (pages) {
-		order = fls(pages) - 1;
-		b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-		if (!b_data)
-			goto abort;
-		b_allocd = (1 << order) * PAGE_SIZE;
-
-		if (clear)
-			memset(b_data, 0, b_allocd);
-
-		/* newly allocated page frames below buffer header or ...*/
-		if (bh->b_data == b_data + b_allocd) {
-			bh->b_size += b_allocd;
-			bh->b_data -= b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		/* they are above the header */
-		if (b_data == bh->b_data + bh->b_size) {
-			bh->b_size += b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		prev_bh = bh;
-		bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-		if (!bh) {
-			free_pages((unsigned long) b_data, order);
-			goto abort;
-		}
-		bh->b_reqnext = NULL;
-		bh->b_data = b_data;
-		bh->b_size = b_allocd;
-		atomic_set(&bh->b_count, full ? bh->b_size : 0);
-		prev_bh->b_reqnext = bh;
+	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	if (!bh)
+		return NULL;
 
-		pages &= (order-1);
+	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!bh->b_data) {
+		kfree(bh);
+		return NULL;
 	}
 
-	bh->b_size -= tape->excess_bh_size;
-	if (full)
-		atomic_sub(tape->excess_bh_size, &bh->b_count);
-	return merge_bh;
-abort:
-	ide_tape_kfree_buffer(tape);
-	return NULL;
+	bh->b_size = tape->buffer_size;
+	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+
+	return bh;
 }
 
 static int idetape_copy_stage_from_user(idetape_tape_t *tape,
 					const char __user *buf, int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min((unsigned int)
-				(bh->b_size - atomic_read(&bh->b_count)),
-				(unsigned int)n);
 		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				count))
+				   n))
 			ret = 1;
-		n -= count;
-		atomic_add(count, &bh->b_count);
-		buf += count;
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		atomic_add(n, &bh->b_count);
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			tape->bh = NULL;
 	}
-	tape->bh = bh;
+
 	return ret;
 }
 
@@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
 				      int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > tape->b_count) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min(tape->b_count, n);
-		if  (copy_to_user(buf, tape->b_data, count))
+		if (copy_to_user(buf, tape->b_data, n))
 			ret = 1;
-		n -= count;
-		tape->b_data += count;
-		tape->b_count -= count;
-		buf += count;
-		if (!tape->b_count) {
-			bh = bh->b_reqnext;
-			tape->bh = bh;
-			if (bh) {
-				tape->b_data = bh->b_data;
-				tape->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		tape->b_data += n;
+		tape->b_count -= n;
+		if (!tape->b_count)
+			tape->bh = NULL;
 	}
 	return ret;
 }
@@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks, min;
+	int blocks;
 	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
@@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	if (tape->merge_bh_size) {
 		blocks = tape->merge_bh_size / tape->blk_size;
 		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i;
-
+			unsigned int i = tape->blk_size -
+				tape->merge_bh_size % tape->blk_size;
 			blocks++;
-			i = tape->blk_size - tape->merge_bh_size %
-				tape->blk_size;
-			bh = tape->bh->b_reqnext;
-			while (bh) {
-				atomic_set(&bh->b_count, 0);
-				bh = bh->b_reqnext;
-			}
 			bh = tape->bh;
-			while (i) {
-				if (bh == NULL) {
-					printk(KERN_INFO "ide-tape: bug,"
-							 " bh NULL\n");
-					break;
-				}
-				min = min(i, (unsigned int)(bh->b_size -
-						atomic_read(&bh->b_count)));
+			if (bh) {
 				memset(bh->b_data + atomic_read(&bh->b_count),
-						0, min);
-				atomic_add(min, &bh->b_count);
-				i -= min;
-				bh = bh->b_reqnext;
-			}
+				       0, i);
+				atomic_add(i, &bh->b_count);
+			} else
+				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
 		tape->merge_bh_size = 0;
@@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive)
 					 " 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
@@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh;
+	struct idetape_bh *bh = tape->merge_bh;
 	int blocks;
 
 	while (bcount) {
 		unsigned int count;
 
-		bh = tape->merge_bh;
 		count = min(tape->buffer_size, bcount);
 		bcount -= count;
 		blocks = count / tape->blk_size;
-		while (count) {
-			atomic_set(&bh->b_count,
-				   min(count, (unsigned int)bh->b_size));
-			memset(bh->b_data, 0, atomic_read(&bh->b_count));
-			count -= atomic_read(&bh->b_count);
-			bh = bh->b_reqnext;
-		}
+		atomic_set(&bh->b_count, count);
+		memset(bh->b_data, 0, atomic_read(&bh->b_count));
+
 		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
 				      tape->merge_bh);
 	}
@@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 				"should be 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
@@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0);
+	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
 	if (tape->merge_bh != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
@@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		tape->buffer_size = *ctl * tape->blk_size;
 	}
 	buffer_size = tape->buffer_size;
-	tape->pages_per_buffer = buffer_size / PAGE_SIZE;
-	if (buffer_size % PAGE_SIZE) {
-		tape->pages_per_buffer++;
-		tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE;
-	}
 
 	/* select the "best" DSC read/write polling freq */
 	speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
-- 
cgit v0.10.2


From e998f30b45efb99a3c3ce7b5483f76317a17abed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: use standard data transfer mechanism

Impact: use standard way to transfer data

ide-tape uses rq in an interesting way.  For r/w requests, rq->special
is used to carry a private buffer management structure idetape_bh and
rq->nr_sectors and current_nr_sectors are initialized to the number of
idetape blocks which isn't necessary 512 bytes.  Also,
rq->current_nr_sectors is used to report back the residual count in
units of idetape blocks.

This peculiarity taxes both block layer and ide.  ide-atapi has
different paths and hooks to accomodate it and what a rq means becomes
quite confusing and making changes at the block layer becomes quite
difficult and error-prone.

This patch makes ide-tape use bio instead.  With the previous patch,
ide-tape currently is using single contiguos buffer so replacing it
isn't difficult.  Data buffer is mapped into bio using
blk_rq_map_kern() in idetape_queue_rw_tail().  idetape_io_buffers()
and idetape_update_buffers() are dropped and pc->bh is set to null to
tell ide-atapi to use standard data transfer mechanism and idetape_bh
byte counts are updated by the issuer on completion using the residual
count.

This change also nicely removes the FIXME in ide_pc_intr() where
ide-tape rqs need to be completed using ide_rq_bytes() instead of
blk_rq_bytes() (although this didn't really matter as the request
didn't have bio).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 3df5442..b9dd450 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		 * ->pc_callback() might change rq->data_len for
 		 * residual count, cache total length.
 		 */
-		if (!blk_special_request(rq) &&
-		    (drive->media == ide_tape && !rq->bio))
-			done = ide_rq_bytes(rq);        /* FIXME */
-		else
-			done = blk_rq_bytes(rq);
+		done = blk_rq_bytes(rq);
 
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b2afbc7..b373ac8 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min(
-			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
-			bcount);
-		drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data +
-					atomic_read(&bh->b_count), count);
-		bcount -= count;
-		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				   unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
-		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
-		bcount -= count;
-		pc->b_data += count;
-		pc->b_count -= count;
-		if (!pc->b_count)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct idetape_bh *bh = pc->bh;
-	unsigned int bcount = pc->xferred;
-
-	if (pc->flags & PC_FLAG_WRITING)
-		return;
-	if (bcount) {
-		if (bh == NULL || bcount > bh->b_size) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return;
-		}
-		atomic_set(&bh->b_count, bcount);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-}
-
 /*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
@@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 		 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
 	/* Correct pc->xferred by asking the tape.	 */
-	if (pc->flags & PC_FLAG_DMA_ERROR) {
+	if (pc->flags & PC_FLAG_DMA_ERROR)
 		pc->xferred = pc->req_xfer -
 			tape->blk_size *
 			get_unaligned_be32(&sense[3]);
-		idetape_update_buffers(drive, pc);
-	}
 
 	/*
 	 * If error was the result of a zero-length read or write command,
@@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->current_nr_sectors -= blocks;
+		rq->data_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
 	idetape_postpone_request(drive);
 }
 
-static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				unsigned int bcount, int write)
-{
-	unsigned int bleft;
-
-	if (write)
-		bleft = idetape_output_buffers(drive, pc, bcount);
-	else
-		bleft = idetape_input_buffers(drive, pc, bcount);
-
-	return bcount - bleft;
-}
-
 /*
  * Packet Command Interface
  *
@@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = bh;
+	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
@@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 				 struct idetape_bh *bh)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	size_t size = blocks * tape->blk_size;
 	struct request *rq;
-	int ret, errors;
+	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->rq_disk = tape->disk;
 	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
-	rq->nr_sectors = blocks;
-	rq->current_nr_sectors = blocks;
+
+	if (size) {
+		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+				      __GFP_WAIT);
+		if (ret)
+			goto out_put;
+	}
+
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	errors = rq->errors;
-	ret = tape->blk_size * (blocks - rq->current_nr_sectors);
-	blk_put_request(rq);
+	/* calculate the number of transferred bytes and update bh */
+	size -= rq->data_len;
+	if (cmd == REQ_IDETAPE_READ)
+		atomic_add(size, &bh->b_count);
 
-	if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
-		return 0;
+	ret = size;
+	if (rq->errors == IDE_DRV_ERROR_GENERAL)
+		ret = -EIO;
 
 	if (tape->merge_bh)
 		idetape_init_merge_buffer(tape);
-	if (errors == IDE_DRV_ERROR_GENERAL)
-		return -EIO;
+out_put:
+	blk_put_request(rq);
 	return ret;
 }
 
@@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	u16 *ctl = (u16 *)&tape->caps[12];
 
 	drive->pc_callback	 = ide_tape_callback;
-	drive->pc_update_buffers = idetape_update_buffers;
-	drive->pc_io_buffers	 = ide_tape_io_buffers;
 
 	drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
 
-- 
cgit v0.10.2


From 6cf3d545f7d71b183e5b89960d4cc850a42c410d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: kill idetape_bh

Impact: kill now unnecessary idetape_bh

With everything using standard mechanisms, there is no need for
idetape_bh anymore.  Kill it and use tape->buf, cur and valid to
describe data buffer instead.

Changes worth mentioning are...

* idetape_queue_rq_tail() now always queue tape->buf and and adjusts
  buffer state properly before completion.

* idetape_pad_zeros() clears the buffer only once.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b373ac8..d2e9c09 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -131,12 +131,6 @@ enum {
 	IDETAPE_DIR_WRITE = (1 << 2),
 };
 
-struct idetape_bh {
-	u32 b_size;
-	atomic_t b_count;
-	char *b_data;
-};
-
 /* Tape door status */
 #define DOOR_UNLOCKED			0
 #define DOOR_LOCKED			1
@@ -218,14 +212,12 @@ typedef struct ide_tape_obj {
 
 	/* Data buffer size chosen based on the tape's recommendation */
 	int buffer_size;
-	/* merge buffer */
-	struct idetape_bh *merge_bh;
-	/* size of the merge buffer */
-	int merge_bh_size;
-	/* pointer to current buffer head within the merge buffer */
-	struct idetape_bh *bh;
-	char *b_data;
-	int b_count;
+	/* Staging buffer of buffer_size bytes */
+	void *buf;
+	/* The read/write cursor */
+	void *cur;
+	/* The number of valid bytes in buf */
+	size_t valid;
 
 	/* Measures average tape speed */
 	unsigned long avg_time;
@@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	}
 }
 
-/* Free data buffers completely. */
-static void ide_tape_kfree_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-
-	kfree(bh->b_data);
-	kfree(bh);
-}
-
 static void ide_tape_handle_dsc(ide_drive_t *);
 
 static int ide_tape_callback(ide_drive_t *drive, int dsc)
@@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	struct idetape_bh *bh = (struct idetape_bh *)rq->special;
-	unsigned int length = rq->current_nr_sectors;
+	unsigned int length = rq->nr_sectors;
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	if (pc->req_xfer == tape->buffer_size)
 		pc->flags |= PC_FLAG_DMA_OK;
 
-	if (opcode == READ_6) {
+	if (opcode == READ_6)
 		pc->c[0] = READ_6;
-		atomic_set(&bh->b_count, 0);
-	} else if (opcode == WRITE_6) {
+	else if (opcode == WRITE_6) {
 		pc->c[0] = WRITE_6;
 		pc->flags |= PC_FLAG_WRITING;
-		pc->b_data = bh->b_data;
-		pc->b_count = atomic_read(&bh->b_count);
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
@@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
-			" current_nr_sectors: %u\n",
-			(unsigned long long)rq->sector, rq->nr_sectors,
-			rq->current_nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
@@ -749,89 +726,6 @@ out:
 }
 
 /*
- * It returns a pointer to the newly allocated buffer, or NULL in case
- * of failure.
- */
-static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full)
-{
-	struct idetape_bh *bh;
-
-	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	if (!bh)
-		return NULL;
-
-	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (!bh->b_data) {
-		kfree(bh);
-		return NULL;
-	}
-
-	bh->b_size = tape->buffer_size;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
-
-	return bh;
-}
-
-static int idetape_copy_stage_from_user(idetape_tape_t *tape,
-					const char __user *buf, int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				   n))
-			ret = 1;
-		atomic_add(n, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			tape->bh = NULL;
-	}
-
-	return ret;
-}
-
-static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
-				      int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > tape->b_count) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_to_user(buf, tape->b_data, n))
-			ret = 1;
-		tape->b_data += n;
-		tape->b_count -= n;
-		if (!tape->b_count)
-			tape->bh = NULL;
-	}
-	return ret;
-}
-
-static void idetape_init_merge_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-	tape->bh = tape->merge_bh;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-		atomic_set(&bh->b_count, 0);
-	else {
-		tape->b_data = bh->b_data;
-		tape->b_count = atomic_read(&bh->b_count);
-	}
-}
-
-/*
  * Write a filemark if write_filemark=1. Flush the device buffers without
  * writing a filemark otherwise.
  */
@@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
 		return;
 
 	clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
-	tape->merge_bh_size = 0;
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	tape->valid = 0;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
@@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
-				 struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	size_t size = blocks * tape->blk_size;
@@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
 
 	if (size) {
-		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
 				      __GFP_WAIT);
 		if (ret)
 			goto out_put;
@@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	/* calculate the number of transferred bytes and update bh */
+	/* calculate the number of transferred bytes and update buffer state */
 	size -= rq->data_len;
+	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
-		atomic_add(size, &bh->b_count);
+		tape->valid = size;
+	else
+		tape->valid = 0;
 
 	ret = size;
 	if (rq->errors == IDE_DRV_ERROR_GENERAL)
 		ret = -EIO;
-
-	if (tape->merge_bh)
-		idetape_init_merge_buffer(tape);
 out_put:
 	blk_put_request(rq);
 	return ret;
@@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 /* Queue up a character device originated write request. */
 static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				     blocks, tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int blocks;
-	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
 				" but we are not writing.\n");
 		return;
 	}
-	if (tape->merge_bh_size > tape->buffer_size) {
-		printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n");
-		tape->merge_bh_size = tape->buffer_size;
-	}
-	if (tape->merge_bh_size) {
-		blocks = tape->merge_bh_size / tape->blk_size;
-		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i = tape->blk_size -
-				tape->merge_bh_size % tape->blk_size;
+	if (tape->buf) {
+		blocks = tape->valid / tape->blk_size;
+		if (tape->valid % tape->blk_size) {
 			blocks++;
-			bh = tape->bh;
-			if (bh) {
-				memset(bh->b_data + atomic_read(&bh->b_count),
-				       0, i);
-				atomic_add(i, &bh->b_count);
-			} else
-				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
+			memset(tape->buf + tape->valid, 0,
+			       tape->blk_size - tape->valid % tape->blk_size);
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
-		tape->merge_bh_size = 0;
 	}
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
@@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive)
 			ide_tape_flush_merge_buffer(drive);
 			idetape_flush_tape_buffers(drive);
 		}
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size should be"
-					 " 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a read 0 command to ensure that DSC handshake is
@@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive)
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_READ, 0);
 			if (bytes_read < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return bytes_read;
 			}
@@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive)
 /* called from idetape_chrdev_read() to service a chrdev read request. */
 static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
 	/* If we are at a filemark, return a read length of 0 */
@@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_read(drive);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
-				     tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh = tape->merge_bh;
-	int blocks;
+
+	memset(tape->buf, 0, tape->buffer_size);
 
 	while (bcount) {
-		unsigned int count;
+		unsigned int count = min(tape->buffer_size, bcount);
 
-		count = min(tape->buffer_size, bcount);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+				      count / tape->blk_size);
 		bcount -= count;
-		blocks = count / tape->blk_size;
-		atomic_set(&bh->b_count, count);
-		memset(bh->b_data, 0, atomic_read(&bh->b_count));
-
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
-				      tape->merge_bh);
 	}
 }
 
@@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		tape->merge_bh_size = 0;
+		tape->valid = 0;
 		if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 			++count;
 		ide_tape_discard_merge_buffer(drive, 0);
@@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		return rc;
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		actually_read = min((unsigned int)(tape->merge_bh_size),
-				    (unsigned int)count);
-		if (idetape_copy_stage_to_user(tape, buf, actually_read))
+	if (tape->valid) {
+		actually_read = min_t(unsigned int, tape->valid, count);
+		if (copy_to_user(buf, tape->cur, actually_read))
 			ret = -EFAULT;
 		buf += actually_read;
-		tape->merge_bh_size -= actually_read;
 		count -= actually_read;
+		tape->cur += actually_read;
+		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
 		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
 		if (bytes_read <= 0)
 			goto finish;
-		if (idetape_copy_stage_to_user(tape, buf, bytes_read))
+		if (copy_to_user(buf, tape->cur, bytes_read))
 			ret = -EFAULT;
 		buf += bytes_read;
 		count -= bytes_read;
@@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (idetape_copy_stage_to_user(tape, buf, temp))
+		if (copy_to_user(buf, tape->cur, temp))
 			ret = -EFAULT;
 		actually_read += temp;
-		tape->merge_bh_size = bytes_read-temp;
+		tape->cur += temp;
+		tape->valid -= temp;
 	}
 finish:
 	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
@@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		if (tape->chrdev_dir == IDETAPE_DIR_READ)
 			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size "
-				"should be 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		idetape_init_merge_buffer(tape);
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a write 0 command to ensure that DSC handshake is
@@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_WRITE, 0);
 			if (retval < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return retval;
 			}
@@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		if (tape->merge_bh_size >= tape->buffer_size) {
-			printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
-			tape->merge_bh_size = 0;
-		}
-		actually_written = min((unsigned int)
-				(tape->buffer_size - tape->merge_bh_size),
-				(unsigned int)count);
-		if (idetape_copy_stage_from_user(tape, buf, actually_written))
-				ret = -EFAULT;
+	if (tape->valid < tape->buffer_size) {
+		actually_written = min_t(unsigned int,
+					 tape->buffer_size - tape->valid,
+					 count);
+		if (copy_from_user(tape->cur, buf, actually_written))
+			ret = -EFAULT;
 		buf += actually_written;
-		tape->merge_bh_size += actually_written;
 		count -= actually_written;
+		tape->cur += actually_written;
+		tape->valid += actually_written;
 
-		if (tape->merge_bh_size == tape->buffer_size) {
+		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			tape->merge_bh_size = 0;
 			retval = idetape_add_chrdev_write_request(drive, ctl);
 			if (retval <= 0)
 				return (retval);
@@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	while (count >= tape->buffer_size) {
 		ssize_t retval;
-		if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size))
+		if (copy_from_user(tape->cur, buf, tape->buffer_size))
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
@@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count) {
 		actually_written += count;
-		if (idetape_copy_stage_from_user(tape, buf, count))
+		if (copy_from_user(tape->cur, buf, count))
 			ret = -EFAULT;
-		tape->merge_bh_size += count;
+		tape->cur += count;
+		tape->valid += count;
 	}
 	return ret ? ret : actually_written;
 }
@@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 		idetape_flush_tape_buffers(drive);
 	}
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = tape->merge_bh_size /
+		block_offset = tape->valid /
 			(tape->blk_size * tape->user_bs_factor);
 		position = idetape_read_position(drive);
 		if (position < 0)
@@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
-	if (tape->merge_bh != NULL) {
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (tape->buf != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	idetape_write_filemark(drive);
 	idetape_flush_tape_buffers(drive);
@@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev)
 	ide_drive_t *drive = tape->drive;
 	struct gendisk *g = tape->disk;
 
-	BUG_ON(tape->merge_bh_size);
+	BUG_ON(tape->valid);
 
 	drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
 	drive->driver_data = NULL;
-- 
cgit v0.10.2


From 3596b66452491a3cff26256a5e6e6061a66c4142 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: ide-tape: unify r/w init paths

Impact: cleanup

Read and write init paths are almost identical.  Unify them into
idetape_init_rw().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d2e9c09..4da7fa9 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
 
-static int idetape_init_read(ide_drive_t *drive)
+static int idetape_init_rw(ide_drive_t *drive, int dir)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int bytes_read;
+	int rc;
 
-	/* Initialize read operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-		if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-			ide_tape_flush_merge_buffer(drive);
-			idetape_flush_tape_buffers(drive);
-		}
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_READ;
-		tape->cur = tape->buf;
+	BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
 
-		/*
-		 * Issue a read 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode.
-		 * No point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0);
-			if (bytes_read < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return bytes_read;
-			}
+	if (tape->chrdev_dir == dir)
+		return 0;
+
+	if (tape->chrdev_dir == IDETAPE_DIR_READ)
+		ide_tape_discard_merge_buffer(drive, 1);
+	else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
+		ide_tape_flush_merge_buffer(drive);
+		idetape_flush_tape_buffers(drive);
+	}
+
+	if (tape->buf || tape->valid) {
+		printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+		tape->valid = 0;
+	}
+
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!tape->buf)
+		return -ENOMEM;
+	tape->chrdev_dir = dir;
+	tape->cur = tape->buf;
+
+	/*
+	 * Issue a 0 rw command to ensure that DSC handshake is
+	 * switched from completion mode to buffer available mode.  No
+	 * point in issuing this if DSC overlap isn't supported, some
+	 * drives (Seagate STT3401A) will return an error.
+	 */
+	if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
+		int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
+						  : REQ_IDETAPE_WRITE;
+
+		rc = idetape_queue_rw_tail(drive, cmd, 0);
+		if (rc < 0) {
+			kfree(tape->buf);
+			tape->buf = NULL;
+			tape->chrdev_dir = IDETAPE_DIR_NONE;
+			return rc;
 		}
 	}
 
@@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 		return 0;
 
-	idetape_init_read(drive);
+	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
 	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
@@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
-	rc = idetape_init_read(drive);
+	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
 	if (count == 0)
@@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
+	int rc;
 
 	/* The drive is write protected. */
 	if (tape->write_prot)
@@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		if (tape->chrdev_dir == IDETAPE_DIR_READ)
-			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		tape->cur = tape->buf;
-
-		/*
-		 * Issue a write 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode. No
-		 * point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0);
-			if (retval < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return retval;
-			}
-		}
-	}
+	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
+	if (rc < 0)
+		return rc;
 	if (count == 0)
 		return (0);
 	if (tape->valid < tape->buffer_size) {
-- 
cgit v0.10.2


From 71294cf93d22ceaa75448cc6ebee2c65897be8c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-tape: use byte size instead of sectors on rw issue functions

Impact: cleanup

Byte size is what most issue functions deal with, make
idetape_queue_rw_tail() and its wrappers take byte size instead of
sector counts.  idetape_chrdev_read() and write() functions are
converted to use tape->buffer_size instead of ctl from tape->cap.

This cleans up code a little bit and will ease the next r/w
reimplementation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4da7fa9..d5e9bb2 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	size_t size = blocks * tape->blk_size;
 	struct request *rq;
 	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
+	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 }
 
 /* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
 {
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
@@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		return;
 	}
 	if (tape->buf) {
-		blocks = tape->valid / tape->blk_size;
-		if (tape->valid % tape->blk_size) {
-			blocks++;
-			memset(tape->buf + tape->valid, 0,
-			       tape->blk_size - tape->valid % tape->blk_size);
-		}
-		(void) idetape_add_chrdev_write_request(drive, blocks);
-	}
-	if (tape->buf != NULL) {
+		size_t aligned = roundup(tape->valid, tape->blk_size);
+
+		memset(tape->cur, 0, aligned - tape->valid);
+		idetape_add_chrdev_write_request(drive, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 }
 
 /* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
 {
-	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
+	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
 
 	/* If we are at a filemark, return a read length of 0 */
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
@@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
@@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 	while (bcount) {
 		unsigned int count = min(tape->buffer_size, bcount);
 
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				      count / tape->blk_size);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
 		bcount -= count;
 	}
 }
@@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t bytes_read, temp, actually_read = 0, rc;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		if (copy_to_user(buf, tape->cur, bytes_read))
@@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		actually_read += bytes_read;
 	}
 	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
@@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 	int rc;
 
 	/* The drive is write protected. */
@@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 
 		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive, ctl);
+			retval = idetape_add_chrdev_write_request(drive,
+							tape->buffer_size);
 			if (retval <= 0)
 				return (retval);
 		}
@@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive, ctl);
+		retval = idetape_add_chrdev_write_request(drive,
+							  tape->buffer_size);
 		actually_written += tape->buffer_size;
 		if (retval <= 0)
 			return (retval);
-- 
cgit v0.10.2


From 4344d07fb8dbf0cbfec1f7d7c1afeccaceaaa120 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-tape: simplify read/write functions

Impact: cleanup

idetape_chrdev_read/write() functions are unnecessarily complex when
everything can be handled in a single loop.  Collapse
idetape_add_chrdev_read/write_request() into the rw functions and
simplify the implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d5e9bb2..2599579 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-/* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
-}
-
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		size_t aligned = roundup(tape->valid, tape->blk_size);
 
 		memset(tape->cur, 0, aligned - tape->valid);
-		idetape_add_chrdev_write_request(drive, aligned);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 	return 0;
 }
 
-/* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
-
-	/* If we are at a filemark, return a read length of 0 */
-	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
-		return 0;
-
-	idetape_init_rw(drive, IDETAPE_DIR_READ);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
-}
-
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t bytes_read, temp, actually_read = 0, rc;
+	size_t done = 0;
 	ssize_t ret = 0;
+	int rc;
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
+
 	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid) {
-		actually_read = min_t(unsigned int, tape->valid, count);
-		if (copy_to_user(buf, tape->cur, actually_read))
-			ret = -EFAULT;
-		buf += actually_read;
-		count -= actually_read;
-		tape->cur += actually_read;
-		tape->valid -= actually_read;
-	}
-	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		if (copy_to_user(buf, tape->cur, bytes_read))
-			ret = -EFAULT;
-		buf += bytes_read;
-		count -= bytes_read;
-		actually_read += bytes_read;
-	}
-	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (copy_to_user(buf, tape->cur, temp))
+
+	while (done < count) {
+		size_t todo;
+
+		/* refill if staging buffer is empty */
+		if (!tape->valid) {
+			/* If we are at a filemark, nothing more to read */
+			if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+				break;
+			/* read */
+			if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
+						  tape->buffer_size) <= 0)
+				break;
+		}
+
+		/* copy out */
+		todo = min_t(size_t, count - done, tape->valid);
+		if (copy_to_user(buf + done, tape->cur, todo))
 			ret = -EFAULT;
-		actually_read += temp;
-		tape->cur += temp;
-		tape->valid -= temp;
+
+		tape->cur += todo;
+		tape->valid -= todo;
+		done += todo;
 	}
-finish:
-	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+
+	if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
 		debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
 
 		idetape_space_over_filemarks(drive, MTFSF, 1);
 		return 0;
 	}
 
-	return ret ? ret : actually_read;
+	return ret ? ret : done;
 }
 
 static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
@@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t actually_written = 0;
+	size_t done = 0;
 	ssize_t ret = 0;
 	int rc;
 
@@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid < tape->buffer_size) {
-		actually_written = min_t(unsigned int,
-					 tape->buffer_size - tape->valid,
-					 count);
-		if (copy_from_user(tape->cur, buf, actually_written))
-			ret = -EFAULT;
-		buf += actually_written;
-		count -= actually_written;
-		tape->cur += actually_written;
-		tape->valid += actually_written;
-
-		if (tape->valid == tape->buffer_size) {
-			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive,
-							tape->buffer_size);
-			if (retval <= 0)
-				return (retval);
-		}
-	}
-	while (count >= tape->buffer_size) {
-		ssize_t retval;
-		if (copy_from_user(tape->cur, buf, tape->buffer_size))
-			ret = -EFAULT;
-		buf += tape->buffer_size;
-		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive,
-							  tape->buffer_size);
-		actually_written += tape->buffer_size;
-		if (retval <= 0)
-			return (retval);
-	}
-	if (count) {
-		actually_written += count;
-		if (copy_from_user(tape->cur, buf, count))
+
+	while (done < count) {
+		size_t todo;
+
+		/* flush if staging buffer is full */
+		if (tape->valid == tape->buffer_size &&
+		    idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+					  tape->buffer_size) <= 0)
+			return rc;
+
+		/* copy in */
+		todo = min_t(size_t, count - done,
+			     tape->buffer_size - tape->valid);
+		if (copy_from_user(tape->cur, buf + done, todo))
 			ret = -EFAULT;
-		tape->cur += count;
-		tape->valid += count;
+
+		tape->cur += todo;
+		tape->valid += todo;
+		done += todo;
 	}
-	return ret ? ret : actually_written;
+
+	return ret ? ret : done;
 }
 
 static int idetape_write_filemark(ide_drive_t *drive)
-- 
cgit v0.10.2


From 29d1a4371035e01b0d079bc5aa88b50f5af7a566 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide-atapi: kill unused fields and callbacks

Impact: remove fields and code paths which are no longer necessary

Now that ide-tape uses standard mechanisms to transfer data, special
case handling for bh handling can be dropped from ide-atapi.  Drop the
followings.

* pc->cur_pos, b_count, bh and b_data
* drive->pc_update_buffers() and pc_io_buffers().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index b9dd450..afe5a43 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					drive->name, rq_data_dir(pc->rq)
 						     ? "write" : "read");
 			pc->flags |= PC_FLAG_DMA_ERROR;
-		} else {
+		} else
 			pc->xferred = pc->req_xfer;
-			if (drive->pc_update_buffers)
-				drive->pc_update_buffers(drive, pc);
-		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
@@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (drive->media == ide_tape && pc->bh)
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	else {
-		done = min_t(unsigned int, bcount, cmd->nleft);
-		ide_pio_bytes(drive, cmd, write, done);
-	}
+	done = min_t(unsigned int, bcount, cmd->nleft);
+	ide_pio_bytes(drive, cmd, write, done);
 
-	/* Update the current position */
+	/* Update transferred byte count */
 	pc->xferred += done;
-	pc->cur_pos += done;
 
 	bcount -= done;
 
@@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 
 		/* We haven't transferred any data yet */
 		pc->xferred = 0;
-		pc->cur_pos = pc->buf;
 
 		valid_tf = IDE_VALID_DEVICE;
 		bcount = ((drive->media == ide_tape) ?
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2599579..8dfc688 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1957461..34c128f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,11 +362,7 @@ struct ide_atapi_pc {
 
 	/* data buffer */
 	u8 *buf;
-	/* current buffer position */
-	u8 *cur_pos;
 	int buf_size;
-	/* missing/available data on the current buffer */
-	int b_count;
 
 	/* the corresponding request */
 	struct request *rq;
@@ -379,10 +375,6 @@ struct ide_atapi_pc {
 	 */
 	u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
-	/* idetape only */
-	struct idetape_bh *bh;
-	char *b_data;
-
 	unsigned long timeout;
 };
 
@@ -595,10 +587,6 @@ struct ide_drive_s {
 	/* callback for packet commands */
 	int  (*pc_callback)(struct ide_drive_s *, int);
 
-	void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *);
-	int  (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *,
-			      unsigned int, int);
-
 	ide_startstop_t (*irq_handler)(struct ide_drive_s *);
 
 	unsigned long atapi_flags;
-- 
cgit v0.10.2


From 5ad960fe8d0e4f99fe2b8dded45e8251137293c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: ide: drop rq->data handling from ide_map_sg()

Impact: remove code path which is no longer necessary

All IDE data transfers now use rq->bio.  Simplify ide_map_sg()
accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 6e3094e..a0309ea 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (!rq->bio) {
-		sg_init_one(sg, rq->data, rq->data_len);
-		cmd->sg_nents = 1;
-	} else
-		cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
+	cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 }
 EXPORT_SYMBOL_GPL(ide_map_sg);
 
-- 
cgit v0.10.2


From 586cf2681f527ce8b85b9bd57c8b9f7945fbe051 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Apr 2009 12:16:56 +0900
Subject: ide-dma: don't reset request fields on dma_timeout_retry()

Impact: drop unnecessary code

Now that everything uses bio and block operations, there is no need to
reset request fields manually when retrying a request.  Every field is
guaranteed to be always valid.  Drop unnecessary request field
resetting from ide_dma_timeout_retry().

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index a0b8cab1..d9123ec 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	/*
 	 * un-busy drive etc and make sure request is sane
 	 */
-
 	rq = hwif->rq;
-	if (!rq)
-		goto out;
-
-	hwif->rq = NULL;
-
-	rq->errors = 0;
-
-	if (!rq->bio)
-		goto out;
-
-	rq->sector = rq->bio->bi_sector;
-	rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9;
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->buffer = bio_data(rq->bio);
-out:
+	if (rq) {
+		hwif->rq = NULL;
+		rq->errors = 0;
+	}
 	return ret;
 }
 
-- 
cgit v0.10.2


From db29a6b49674085f136331014ba0eee249c16a2c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 21 Apr 2009 09:27:03 +0200
Subject: block: enable by default support for large devices and files on
 32-bit archs

Enable by default support for large devices and files (CONFIG_LBD):

- With 1TB disks being a commodity hardware it is quite easy to hit 2TB
  limitation while building RAIDs etc. and many distros have been using
  CONFIG_LBD=y by default already (at least Fedora 10 and openSUSE 11.1).

- This should also prevent a subtle ext4 filesystem compatibility issue:
  mke2fs.ext4 defaults to creating filesystems with huge_files feature
  enabled and such filesystems cannot be later mounted read-write on
  machines with CONFIG_LBD=n (it should be quite easy to hit this issue
  when trying to use filesystem created using distro kernel on system
  running the self-build kernel, think about USB disk enclosures & co.).

While at it:

- Clarify config option help text w.r.t. mounting ext4 filesystems
  (they can be mounted with CONFIG_LBD=n but in the read-only mode).

Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/Kconfig b/block/Kconfig
index e7d1278..2c39527 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,7 @@ if BLOCK
 config LBD
 	bool "Support for large block devices and files"
 	depends on !64BIT
+	default y
 	help
 	  Enable block devices or files of size 2TB and larger.
 
@@ -38,11 +39,13 @@ config LBD
 
 	  The ext4 filesystem requires that this feature be enabled in
 	  order to support filesystems that have the huge_file feature
-	  enabled.    Otherwise, it will refuse to mount any filesystems
-	  that use the huge_file feature, which is enabled by default
-	  by mke2fs.ext4.   The GFS2 filesystem also requires this feature.
+	  enabled.  Otherwise, it will refuse to mount in the read-write
+	  mode any filesystems that use the huge_file feature, which is
+	  enabled by default by mke2fs.ext4.
 
-	  If unsure, say N.
+	  The GFS2 filesystem also requires this feature.
+
+	  If unsure, say Y.
 
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
-- 
cgit v0.10.2


From db2dbb12dc47a50c7a4c5678f526014063e486f6 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Apr 2009 14:08:13 +0200
Subject: block: implement blkdev_readpages

Doing a proper block dev ->readpages() speeds up the crazy dump(8)
approach of using interleaved process IO.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc1..a85fe31 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -331,6 +331,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1399,6 +1405,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
-- 
cgit v0.10.2


From a538cd03be6f363d039daa94199c28cfbd508455 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: block: merge blk_invoke_request_fn() into __blk_run_queue()

__blk_run_queue wraps blk_invoke_request_fn() such that it
additionally removes plug and bails out early if the queue is empty.
Both extra operations have their own pending mechanisms and don't
cause any harm correctness-wise when they are done superflously.

The only user of blk_invoke_request_fn() being blk_start_queue(),
there isn't much reason to keep both functions around.  Merge
blk_invoke_request_fn() into __blk_run_queue() and make
blk_start_queue() use __blk_run_queue() instead.

[ Impact: merge two subtly different internal functions ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 41bc0ff..02f53bc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -333,24 +333,6 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
-static void blk_invoke_request_fn(struct request_queue *q)
-{
-	if (unlikely(blk_queue_stopped(q)))
-		return;
-
-	/*
-	 * one level of recursion is ok and is much faster than kicking
-	 * the unplug handling
-	 */
-	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-		q->request_fn(q);
-		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
-		kblockd_schedule_work(q, &q->unplug_work);
-	}
-}
-
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -365,7 +347,7 @@ void blk_start_queue(struct request_queue *q)
 	WARN_ON(!irqs_disabled());
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-	blk_invoke_request_fn(q);
+	__blk_run_queue(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -425,12 +407,23 @@ void __blk_run_queue(struct request_queue *q)
 {
 	blk_remove_plug(q);
 
+	if (unlikely(blk_queue_stopped(q)))
+		return;
+
+	if (elv_queue_empty(q))
+		return;
+
 	/*
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
 	 */
-	if (!elv_queue_empty(q))
-		blk_invoke_request_fn(q);
+	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+		q->request_fn(q);
+		queue_flag_clear(QUEUE_FLAG_REENTER, q);
+	} else {
+		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+		kblockd_schedule_work(q, &q->unplug_work);
+	}
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
-- 
cgit v0.10.2


From a7f557923441186a3cdbabc54f1bcacf42b63bf5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: block: kill blk_start_queueing()

blk_start_queueing() is identical to __blk_run_queue() except that it
doesn't check for recursion.  None of the current users depends on
blk_start_queueing() running request_fn directly.  Replace usages of
blk_start_queueing() with [__]blk_run_queue() and kill it.

[ Impact: removal of mostly duplicate interface function ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/as-iosched.c b/block/as-iosched.c
index c48fa67..45bd070 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1312,12 +1312,8 @@ static void as_merged_requests(struct request_queue *q, struct request *req,
 static void as_work_handler(struct work_struct *work)
 {
 	struct as_data *ad = container_of(work, struct as_data, antic_work);
-	struct request_queue *q = ad->q;
-	unsigned long flags;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_start_queueing(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	blk_run_queue(ad->q);
 }
 
 static int as_may_queue(struct request_queue *q, int rw)
diff --git a/block/blk-core.c b/block/blk-core.c
index 02f53bc..8b4a0af 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -433,9 +433,7 @@ EXPORT_SYMBOL(__blk_run_queue);
  *
  * Description:
  *    Invoke request handling on this queue, if it has pending work to do.
- *    May be used to restart queueing when a request has completed. Also
- *    See @blk_start_queueing.
- *
+ *    May be used to restart queueing when a request has completed.
  */
 void blk_run_queue(struct request_queue *q)
 {
@@ -895,28 +893,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 EXPORT_SYMBOL(blk_get_request);
 
 /**
- * blk_start_queueing - initiate dispatch of requests to device
- * @q:		request queue to kick into gear
- *
- * This is basically a helper to remove the need to know whether a queue
- * is plugged or not if someone just wants to initiate dispatch of requests
- * for this queue. Should be used to start queueing on a device outside
- * of ->request_fn() context. Also see @blk_run_queue.
- *
- * The queue lock must be held with interrupts disabled.
- */
-void blk_start_queueing(struct request_queue *q)
-{
-	if (!blk_queue_plugged(q)) {
-		if (unlikely(blk_queue_stopped(q)))
-			return;
-		q->request_fn(q);
-	} else
-		__generic_unplug_device(q);
-}
-EXPORT_SYMBOL(blk_start_queueing);
-
-/**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
  * @rq:		request to be inserted
@@ -984,7 +960,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 
 	drive_stat_acct(rq, 1);
 	__elv_add_request(q, rq, where, 0);
-	blk_start_queueing(q);
+	__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(blk_insert_request);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a55a9bd..def0c69 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2088,7 +2088,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 			    cfqd->busy_queues > 1) {
 				del_timer(&cfqd->idle_slice_timer);
-				blk_start_queueing(cfqd->queue);
+			__blk_run_queue(cfqd->queue);
 			}
 			cfq_mark_cfqq_must_dispatch(cfqq);
 		}
@@ -2100,7 +2100,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * this new queue is RT and the current one is BE
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
-		blk_start_queueing(cfqd->queue);
+		__blk_run_queue(cfqd->queue);
 	}
 }
 
@@ -2345,7 +2345,7 @@ static void cfq_kick_queue(struct work_struct *work)
 	struct request_queue *q = cfqd->queue;
 
 	spin_lock_irq(q->queue_lock);
-	blk_start_queueing(q);
+	__blk_run_queue(cfqd->queue);
 	spin_unlock_irq(q->queue_lock);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index 7073a90..2e0fb21 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -599,7 +599,7 @@ void elv_quiesce_start(struct request_queue *q)
 	 */
 	elv_drain_elevator(q);
 	while (q->rq.elvpriv) {
-		blk_start_queueing(q);
+		__blk_run_queue(q);
 		spin_unlock_irq(q->queue_lock);
 		msleep(10);
 		spin_lock_irq(q->queue_lock);
@@ -643,8 +643,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		 *   with anything.  There's no point in delaying queue
 		 *   processing.
 		 */
-		blk_remove_plug(q);
-		blk_start_queueing(q);
+		__blk_run_queue(q);
 		break;
 
 	case ELEVATOR_INSERT_SORT:
@@ -971,7 +970,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
 		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-			blk_start_queueing(q);
+			__blk_run_queue(q);
 		}
 	}
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2755d5c..12e20de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -797,7 +797,6 @@ extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *);
 extern void blk_run_queue(struct request_queue *);
-extern void blk_start_queueing(struct request_queue *);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);
-- 
cgit v0.10.2


From e4025f6c21f1389696c069be2dc647f364925c45 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: block: don't set REQ_NOMERGE unnecessarily

RQ_NOMERGE_FLAGS already clears defines which REQ flags aren't
mergeable.  There is no reason to specify it superflously.  It only
adds to confusion.  Don't set REQ_NOMERGE for barriers and requests
with specific queueing directive.  REQ_NOMERGE is now exclusively used
by the merging code.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 8b4a0af..7e0fab5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1082,16 +1082,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	if (bio_failfast_driver(bio))
 		req->cmd_flags |= REQ_FAILFAST_DRIVER;
 
-	/*
-	 * REQ_BARRIER implies no merging, but lets make it explicit
-	 */
 	if (unlikely(bio_discard(bio))) {
 		req->cmd_flags |= REQ_DISCARD;
 		if (bio_barrier(bio))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
 	} else if (unlikely(bio_barrier(bio)))
-		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+		req->cmd_flags |= REQ_HARDBARRIER;
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 6af716d..49557e9 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -51,7 +51,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
 	rq->rq_disk = bd_disk;
-	rq->cmd_flags |= REQ_NOMERGE;
 	rq->end_io = done;
 	WARN_ON(irqs_disabled());
 	spin_lock_irq(q->queue_lock);
-- 
cgit v0.10.2


From 10732f5661fb7adf62e20733b0030fc0fc93b0c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: cleanup REQ_SOFTBARRIER usages

blk_insert_request() doesn't need to worry about REQ_SOFTBARRIER.
Don't set it.  Combined with recent ide updates, REQ_SOFTBARRIER is
now only used in elevator proper and for discard requests.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 7e0fab5..cf10dfc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -946,7 +946,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 	 * barrier
 	 */
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 
 	rq->special = data;
 
-- 
cgit v0.10.2


From 2eef33e439ba9ae387cdc3f1abcef2f3f6c4e7a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: clean up misc stuff after block layer timeout conversion

* In blk_rq_timed_out_timer(), else { if } to else if

* In blk_add_timer(), simplify if/else block

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 1ec0d50..1ba7e0a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -122,10 +122,8 @@ void blk_rq_timed_out_timer(unsigned long data)
 			if (blk_mark_rq_complete(rq))
 				continue;
 			blk_rq_timed_out(rq);
-		} else {
-			if (!next || time_after(next, rq->deadline))
-				next = rq->deadline;
-		}
+		} else if (!next || time_after(next, rq->deadline))
+			next = rq->deadline;
 	}
 
 	/*
@@ -176,16 +174,14 @@ void blk_add_timer(struct request *req)
 	BUG_ON(!list_empty(&req->timeout_list));
 	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 
-	if (req->timeout)
-		req->deadline = jiffies + req->timeout;
-	else {
-		req->deadline = jiffies + q->rq_timeout;
-		/*
-		 * Some LLDs, like scsi, peek at the timeout to prevent
-		 * a command from being retried forever.
-		 */
+	/*
+	 * Some LLDs, like scsi, peek at the timeout to prevent a
+	 * command from being retried forever.
+	 */
+	if (!req->timeout)
 		req->timeout = q->rq_timeout;
-	}
+
+	req->deadline = jiffies + req->timeout;
 	list_add_tail(&req->timeout_list, &q->timeout_list);
 
 	/*
-- 
cgit v0.10.2


From 5efccd17ceb0fc43837a331297c2c407969d7201 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: reorder request completion functions

Reorder request completion functions such that

* All request completion functions are located together.

* Functions which are used by only one caller is put right above the
  caller.

* end_request() is put after other completion functions but before
  blk_update_request().

This change is for completion function cleanup which will follow.

[ Impact: cleanup, code reorganization ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index cf10dfc..406a93e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1684,6 +1684,35 @@ static void blk_account_io_done(struct request *req)
 }
 
 /**
+ * blk_rq_bytes - Returns bytes left to complete in the entire request
+ * @rq: the request being processed
+ **/
+unsigned int blk_rq_bytes(struct request *rq)
+{
+	if (blk_fs_request(rq))
+		return rq->hard_nr_sectors << 9;
+
+	return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_bytes);
+
+/**
+ * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
+ * @rq: the request being processed
+ **/
+unsigned int blk_rq_cur_bytes(struct request *rq)
+{
+	if (blk_fs_request(rq))
+		return rq->current_nr_sectors << 9;
+
+	if (rq->bio)
+		return rq->bio->bi_size;
+
+	return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
+
+/**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
  * @error:    %0 for success, < %0 for error
@@ -1797,6 +1826,22 @@ static int __end_that_request_first(struct request *req, int error,
 	return 1;
 }
 
+static int end_that_request_data(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
+{
+	if (rq->bio) {
+		if (__end_that_request_first(rq, error, nr_bytes))
+			return 1;
+
+		/* Bidi request must be completed as a whole */
+		if (blk_bidi_rq(rq) &&
+		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * queue lock must be held
  */
@@ -1826,78 +1871,6 @@ static void end_that_request_last(struct request *req, int error)
 }
 
 /**
- * blk_rq_bytes - Returns bytes left to complete in the entire request
- * @rq: the request being processed
- **/
-unsigned int blk_rq_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->hard_nr_sectors << 9;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_bytes);
-
-/**
- * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
- * @rq: the request being processed
- **/
-unsigned int blk_rq_cur_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->current_nr_sectors << 9;
-
-	if (rq->bio)
-		return rq->bio->bi_size;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
-
-/**
- * end_request - end I/O on the current segment of the request
- * @req:	the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
- **/
-void end_request(struct request *req, int uptodate)
-{
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_request(req, error, req->hard_cur_sectors << 9);
-}
-EXPORT_SYMBOL(end_request);
-
-static int end_that_request_data(struct request *rq, int error,
-				 unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
-
-	return 0;
-}
-
-/**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
  * @error:        %0 for success, < %0 for error
@@ -2007,6 +1980,33 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
+ * end_request - end I/O on the current segment of the request
+ * @req:	the request being processed
+ * @uptodate:	error value or %0/%1 uptodate flag
+ *
+ * Description:
+ *     Ends I/O on the current segment of a request. If that is the only
+ *     remaining segment, the request is also completed and freed.
+ *
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
+ *     they have a residual value to account for. For that case this function
+ *     isn't really useful, unless the residual just happens to be the
+ *     full current segment. In other words, don't use this function in new
+ *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ **/
+void end_request(struct request *req, int uptodate)
+{
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_request(req, error, req->hard_cur_sectors << 9);
+}
+EXPORT_SYMBOL(end_request);
+
+/**
  * blk_update_request - Special helper function for request stacking drivers
  * @rq:           the request being processed
  * @error:        %0 for success, < %0 for error
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12e20de..156ffd9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -832,6 +832,14 @@ static inline void blk_run_address_space(struct address_space *mapping)
 extern void blkdev_dequeue_request(struct request *req);
 
 /*
+ * blk_end_request() takes bytes instead of sectors as a complete size.
+ * blk_rq_bytes() returns bytes left to complete in the entire request.
+ * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
+ */
+extern unsigned int blk_rq_bytes(struct request *rq);
+extern unsigned int blk_rq_cur_bytes(struct request *rq);
+
+/*
  * blk_end_request() and friends.
  * __blk_end_request() and end_request() must be called with
  * the request queue spinlock acquired.
@@ -858,14 +866,6 @@ extern void blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
 
 /*
- * blk_end_request() takes bytes instead of sectors as a complete size.
- * blk_rq_bytes() returns bytes left to complete in the entire request.
- * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
- */
-extern unsigned int blk_rq_bytes(struct request *rq);
-extern unsigned int blk_rq_cur_bytes(struct request *rq);
-
-/*
  * Access functions for manipulating queue properties
  */
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
-- 
cgit v0.10.2


From 158dbda0068e63c7cce7bd47c123bd1dfa5a902c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: reorganize request fetching functions

Impact: code reorganization

elv_next_request() and elv_dequeue_request() are public block layer
interface than actual elevator implementation.  They mostly deal with
how requests interact with block layer and low level drivers at the
beginning of rqeuest processing whereas __elv_next_request() is the
actual eleveator request fetching interface.

Move the two functions to blk-core.c.  This prepares for further
interface cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 406a93e..678ede2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1712,6 +1712,101 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 
+struct request *elv_next_request(struct request_queue *q)
+{
+	struct request *rq;
+	int ret;
+
+	while ((rq = __elv_next_request(q)) != NULL) {
+		if (!(rq->cmd_flags & REQ_STARTED)) {
+			/*
+			 * This is the first time the device driver
+			 * sees this request (possibly after
+			 * requeueing).  Notify IO scheduler.
+			 */
+			if (blk_sorted_rq(rq))
+				elv_activate_rq(q, rq);
+
+			/*
+			 * just mark as started even if we don't start
+			 * it, a request that has been delayed should
+			 * not be passed by new incoming requests
+			 */
+			rq->cmd_flags |= REQ_STARTED;
+			trace_block_rq_issue(q, rq);
+		}
+
+		if (!q->boundary_rq || q->boundary_rq == rq) {
+			q->end_sector = rq_end_sector(rq);
+			q->boundary_rq = NULL;
+		}
+
+		if (rq->cmd_flags & REQ_DONTPREP)
+			break;
+
+		if (q->dma_drain_size && rq->data_len) {
+			/*
+			 * make sure space for the drain appears we
+			 * know we can do this because max_hw_segments
+			 * has been adjusted to be one fewer than the
+			 * device can handle
+			 */
+			rq->nr_phys_segments++;
+		}
+
+		if (!q->prep_rq_fn)
+			break;
+
+		ret = q->prep_rq_fn(q, rq);
+		if (ret == BLKPREP_OK) {
+			break;
+		} else if (ret == BLKPREP_DEFER) {
+			/*
+			 * the request may have been (partially) prepped.
+			 * we need to keep this request in the front to
+			 * avoid resource deadlock.  REQ_STARTED will
+			 * prevent other fs requests from passing this one.
+			 */
+			if (q->dma_drain_size && rq->data_len &&
+			    !(rq->cmd_flags & REQ_DONTPREP)) {
+				/*
+				 * remove the space for the drain we added
+				 * so that we don't add it again
+				 */
+				--rq->nr_phys_segments;
+			}
+
+			rq = NULL;
+			break;
+		} else if (ret == BLKPREP_KILL) {
+			rq->cmd_flags |= REQ_QUIET;
+			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+		} else {
+			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
+			break;
+		}
+	}
+
+	return rq;
+}
+EXPORT_SYMBOL(elv_next_request);
+
+void elv_dequeue_request(struct request_queue *q, struct request *rq)
+{
+	BUG_ON(list_empty(&rq->queuelist));
+	BUG_ON(ELV_ON_HASH(rq));
+
+	list_del_init(&rq->queuelist);
+
+	/*
+	 * the time frame between a request being removed from the lists
+	 * and to it is freed is accounted as io that is in progress at
+	 * the driver side.
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight++;
+}
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
diff --git a/block/blk.h b/block/blk.h
index 79c85f7..9b2c324 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -43,6 +43,43 @@ static inline void blk_clear_rq_complete(struct request *rq)
 	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 }
 
+/*
+ * Internal elevator interface
+ */
+#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
+
+static inline struct request *__elv_next_request(struct request_queue *q)
+{
+	struct request *rq;
+
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
+
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
+	}
+}
+
+static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_activate_req_fn)
+		e->ops->elevator_activate_req_fn(q, rq);
+}
+
+static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_deactivate_req_fn)
+		e->ops->elevator_deactivate_req_fn(q, rq);
+}
+
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 int blk_should_fake_timeout(struct request_queue *);
 ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
diff --git a/block/elevator.c b/block/elevator.c
index 2e0fb21..b03b875 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -53,7 +53,6 @@ static const int elv_hash_shift = 6;
 		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
 #define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
 DEFINE_TRACE(block_rq_insert);
 DEFINE_TRACE(block_rq_issue);
@@ -310,22 +309,6 @@ void elevator_exit(struct elevator_queue *e)
 }
 EXPORT_SYMBOL(elevator_exit);
 
-static void elv_activate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->ops->elevator_activate_req_fn)
-		e->ops->elevator_activate_req_fn(q, rq);
-}
-
-static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->ops->elevator_deactivate_req_fn)
-		e->ops->elevator_deactivate_req_fn(q, rq);
-}
-
 static inline void __elv_rqhash_del(struct request *rq)
 {
 	hlist_del_init(&rq->hash);
@@ -758,117 +741,6 @@ void elv_add_request(struct request_queue *q, struct request *rq, int where,
 }
 EXPORT_SYMBOL(elv_add_request);
 
-static inline struct request *__elv_next_request(struct request_queue *q)
-{
-	struct request *rq;
-
-	while (1) {
-		while (!list_empty(&q->queue_head)) {
-			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
-				return rq;
-		}
-
-		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
-			return NULL;
-	}
-}
-
-struct request *elv_next_request(struct request_queue *q)
-{
-	struct request *rq;
-	int ret;
-
-	while ((rq = __elv_next_request(q)) != NULL) {
-		if (!(rq->cmd_flags & REQ_STARTED)) {
-			/*
-			 * This is the first time the device driver
-			 * sees this request (possibly after
-			 * requeueing).  Notify IO scheduler.
-			 */
-			if (blk_sorted_rq(rq))
-				elv_activate_rq(q, rq);
-
-			/*
-			 * just mark as started even if we don't start
-			 * it, a request that has been delayed should
-			 * not be passed by new incoming requests
-			 */
-			rq->cmd_flags |= REQ_STARTED;
-			trace_block_rq_issue(q, rq);
-		}
-
-		if (!q->boundary_rq || q->boundary_rq == rq) {
-			q->end_sector = rq_end_sector(rq);
-			q->boundary_rq = NULL;
-		}
-
-		if (rq->cmd_flags & REQ_DONTPREP)
-			break;
-
-		if (q->dma_drain_size && rq->data_len) {
-			/*
-			 * make sure space for the drain appears we
-			 * know we can do this because max_hw_segments
-			 * has been adjusted to be one fewer than the
-			 * device can handle
-			 */
-			rq->nr_phys_segments++;
-		}
-
-		if (!q->prep_rq_fn)
-			break;
-
-		ret = q->prep_rq_fn(q, rq);
-		if (ret == BLKPREP_OK) {
-			break;
-		} else if (ret == BLKPREP_DEFER) {
-			/*
-			 * the request may have been (partially) prepped.
-			 * we need to keep this request in the front to
-			 * avoid resource deadlock.  REQ_STARTED will
-			 * prevent other fs requests from passing this one.
-			 */
-			if (q->dma_drain_size && rq->data_len &&
-			    !(rq->cmd_flags & REQ_DONTPREP)) {
-				/*
-				 * remove the space for the drain we added
-				 * so that we don't add it again
-				 */
-				--rq->nr_phys_segments;
-			}
-
-			rq = NULL;
-			break;
-		} else if (ret == BLKPREP_KILL) {
-			rq->cmd_flags |= REQ_QUIET;
-			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
-		} else {
-			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
-			break;
-		}
-	}
-
-	return rq;
-}
-EXPORT_SYMBOL(elv_next_request);
-
-void elv_dequeue_request(struct request_queue *q, struct request *rq)
-{
-	BUG_ON(list_empty(&rq->queuelist));
-	BUG_ON(ELV_ON_HASH(rq));
-
-	list_del_init(&rq->queuelist);
-
-	/*
-	 * the time frame between a request being removed from the lists
-	 * and to it is freed is accounted as io that is in progress at
-	 * the driver side.
-	 */
-	if (blk_account_rq(rq))
-		q->in_flight++;
-}
-
 int elv_queue_empty(struct request_queue *q)
 {
 	struct elevator_queue *e = q->elevator;
-- 
cgit v0.10.2


From 0b302d5aa7975006fa2ec3d66386610b9b36c669 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: kill blk_end_request_callback()

With recent IDE updates, blk_end_request_callback() doesn't have any
user now.  Kill it.

[ Impact: removal of unused convoluted interface ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 678ede2..2f277ea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1971,10 +1971,6 @@ static void end_that_request_last(struct request *req, int error)
  * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete @rq
  * @bidi_bytes:   number of bytes to complete @rq->next_rq
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non %0, this helper returns without
- *                completion of the request.
  *
  * Description:
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
@@ -1985,8 +1981,7 @@ static void end_that_request_last(struct request *req, int error)
  *     %1 - this request is not freed yet, it still has pending buffers.
  **/
 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
-		      unsigned int bidi_bytes,
-		      int (drv_callback)(struct request *))
+		      unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
@@ -1994,10 +1989,6 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
 		return 1;
 
-	/* Special feature for tricky drivers */
-	if (drv_callback && drv_callback(rq))
-		return 1;
-
 	add_disk_randomness(rq->rq_disk);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -2023,7 +2014,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
  **/
 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	return blk_end_io(rq, error, nr_bytes, 0, NULL);
+	return blk_end_io(rq, error, nr_bytes, 0);
 }
 EXPORT_SYMBOL_GPL(blk_end_request);
 
@@ -2070,7 +2061,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 			 unsigned int bidi_bytes)
 {
-	return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
+	return blk_end_io(rq, error, nr_bytes, bidi_bytes);
 }
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
@@ -2131,39 +2122,6 @@ void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-/**
- * blk_end_request_callback - Special helper function for tricky drivers
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non %0, this helper returns without
- *                completion of the request.
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- *     This special helper function is used only for existing tricky drivers.
- *     (e.g. cdrom_newpc_intr() of ide-cd)
- *     This interface will be removed when such drivers are rewritten.
- *     Don't use this interface in other places anymore.
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - this request is not freed yet.
- *          this request still has pending buffers or
- *          the driver doesn't want to finish this request yet.
- **/
-int blk_end_request_callback(struct request *rq, int error,
-			     unsigned int nr_bytes,
-			     int (drv_callback)(struct request *))
-{
-	return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
-}
-EXPORT_SYMBOL_GPL(blk_end_request_callback);
-
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 156ffd9..1fa9dcf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -855,9 +855,6 @@ extern int __blk_end_request(struct request *rq, int error,
 extern int blk_end_bidi_request(struct request *rq, int error,
 				unsigned int nr_bytes, unsigned int bidi_bytes);
 extern void end_request(struct request *, int);
-extern int blk_end_request_callback(struct request *rq, int error,
-				unsigned int nr_bytes,
-				int (drv_callback)(struct request *));
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
-- 
cgit v0.10.2


From 2e60e02297cf54e367567f2d85b2ca56b1c4a906 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: clean up request completion API

Request completion has gone through several changes and became a bit
messy over the time.  Clean it up.

1. end_that_request_data() is a thin wrapper around
   end_that_request_data_first() which checks whether bio is NULL
   before doing anything and handles bidi completion.
   blk_update_request() is a thin wrapper around
   end_that_request_data() which clears nr_sectors on the last
   iteration but doesn't use the bidi completion.

   Clean it up by moving the initial bio NULL check and nr_sectors
   clearing on the last iteration into end_that_request_data() and
   renaming it to blk_update_request(), which makes blk_end_io() the
   only user of end_that_request_data().  Collapse
   end_that_request_data() into blk_end_io().

2. There are four visible completion variants - blk_end_request(),
   __blk_end_request(), blk_end_bidi_request() and end_request().
   blk_end_request() and blk_end_bidi_request() uses blk_end_request()
   as the backend but __blk_end_request() and end_request() use
   separate implementation in __blk_end_request() due to different
   locking rules.

   blk_end_bidi_request() is identical to blk_end_io().  Collapse
   blk_end_io() into blk_end_bidi_request(), separate out request
   update into internal helper blk_update_bidi_request() and add
   __blk_end_bidi_request().  Redefine [__]blk_end_request() as thin
   inline wrappers around [__]blk_end_bidi_request().

3. As the whole request issue/completion usages are about to be
   modified and audited, it's a good chance to convert completion
   functions return bool which better indicates the intended meaning
   of return values.

4. The function name end_that_request_last() is from the days when it
   was a public interface and slighly confusing.  Give it a proper
   internal name - blk_finish_request().

5. Add description explaning that blk_end_bidi_request() can be safely
   used for uni requests as suggested by Boaz Harrosh.

The only visible behavior change is from #1.  nr_sectors counts are
cleared after the final iteration no matter which function is used to
complete the request.  I couldn't find any place where the code
assumes those nr_sectors counters contain the values for the last
segment and this change is good as it makes the API much more
consistent as the end result is now same whether a request is
completed using [__]blk_end_request() alone or in combination with
blk_update_request().

API further cleaned up per Christoph's suggestion.

[ Impact: cleanup, rq->*nr_sectors always updated after req completion ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Christoph Hellwig <hch@infradead.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 2f277ea..89cc05d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1808,25 +1808,35 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 }
 
 /**
- * __end_that_request_first - end I/O on a request
- * @req:      the request being processed
+ * blk_update_request - Special helper function for request stacking drivers
+ * @rq:	      the request being processed
  * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
+ * @nr_bytes: number of bytes to complete @rq
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @req, and sets it up
- *     for the next range of segments (if any) in the cluster.
+ *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
+ *     the request structure even if @rq doesn't have leftover.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ *     This special helper function is only for request stacking drivers
+ *     (e.g. request-based dm) so that they can handle partial completion.
+ *     Actual device drivers should use blk_end_request instead.
+ *
+ *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ *     %false return from this function.
  *
  * Return:
- *     %0 - we are done with this request, call end_that_request_last()
- *     %1 - still buffers pending for this request
+ *     %false - this request doesn't have any more data
+ *     %true  - this request has more data
  **/
-static int __end_that_request_first(struct request *req, int error,
-				    int nr_bytes)
+bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	int total_bytes, bio_nbytes, next_idx = 0;
 	struct bio *bio;
 
+	if (!req->bio)
+		return false;
+
 	trace_block_rq_complete(req->q, req);
 
 	/*
@@ -1903,8 +1913,16 @@ static int __end_that_request_first(struct request *req, int error,
 	/*
 	 * completely done
 	 */
-	if (!req->bio)
-		return 0;
+	if (!req->bio) {
+		/*
+		 * Reset counters so that the request stacking driver
+		 * can find how many bytes remain in the request
+		 * later.
+		 */
+		req->nr_sectors = req->hard_nr_sectors = 0;
+		req->current_nr_sectors = req->hard_cur_sectors = 0;
+		return false;
+	}
 
 	/*
 	 * if the request wasn't completed, update state
@@ -1918,29 +1936,31 @@ static int __end_that_request_first(struct request *req, int error,
 
 	blk_recalc_rq_sectors(req, total_bytes >> 9);
 	blk_recalc_rq_segments(req);
-	return 1;
+	return true;
 }
+EXPORT_SYMBOL_GPL(blk_update_request);
 
-static int end_that_request_data(struct request *rq, int error,
-				 unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool blk_update_bidi_request(struct request *rq, int error,
+				    unsigned int nr_bytes,
+				    unsigned int bidi_bytes)
 {
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
+	if (blk_update_request(rq, error, nr_bytes))
+		return true;
 
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
+	/* Bidi request must be completed as a whole */
+	if (unlikely(blk_bidi_rq(rq)) &&
+	    blk_update_request(rq->next_rq, error, bidi_bytes))
+		return true;
 
-	return 0;
+	add_disk_randomness(rq->rq_disk);
+
+	return false;
 }
 
 /*
  * queue lock must be held
  */
-static void end_that_request_last(struct request *req, int error)
+static void blk_finish_request(struct request *req, int error)
 {
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
@@ -1966,161 +1986,65 @@ static void end_that_request_last(struct request *req, int error)
 }
 
 /**
- * blk_end_io - Generic end_io function to complete a request.
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete @rq
- * @bidi_bytes:   number of bytes to complete @rq->next_rq
+ * blk_end_bidi_request - Complete a bidi request
+ * @rq:         the request to complete
+ * @error:      %0 for success, < %0 for error
+ * @nr_bytes:   number of bytes to complete @rq
+ * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
  * Description:
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *     If @rq has leftover, sets it up for the next range of segments.
+ *     Drivers that supports bidi can safely call this member for any
+ *     type of request, bidi or uni.  In the later case @bidi_bytes is
+ *     just ignored.
  *
  * Return:
- *     %0 - we are done with this request
- *     %1 - this request is not freed yet, it still has pending buffers.
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  **/
-static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
-		      unsigned int bidi_bytes)
+bool blk_end_bidi_request(struct request *rq, int error,
+			  unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
-	unsigned long flags = 0UL;
-
-	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
-		return 1;
+	unsigned long flags;
 
-	add_disk_randomness(rq->rq_disk);
+	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+		return true;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	end_that_request_last(rq, error);
+	blk_finish_request(rq, error);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	return 0;
-}
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	return blk_end_io(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL_GPL(blk_end_request);
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
-		return 1;
-
-	add_disk_randomness(rq->rq_disk);
-
-	end_that_request_last(rq, error);
-
-	return 0;
+	return false;
 }
-EXPORT_SYMBOL_GPL(__blk_end_request);
+EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
- * blk_end_bidi_request - Helper function for drivers to complete bidi request.
- * @rq:         the bidi request being processed
+ * __blk_end_bidi_request - Complete a bidi request with queue lock held
+ * @rq:         the request to complete
  * @error:      %0 for success, < %0 for error
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
+ *     Identical to blk_end_bidi_request() except that queue lock is
+ *     assumed to be locked on entry and remains so on return.
  *
  * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
-			 unsigned int bidi_bytes)
-{
-	return blk_end_io(rq, error, nr_bytes, bidi_bytes);
-}
-EXPORT_SYMBOL_GPL(blk_end_bidi_request);
-
-/**
- * end_request - end I/O on the current segment of the request
- * @req:	the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  **/
-void end_request(struct request *req, int uptodate)
+bool __blk_end_bidi_request(struct request *rq, int error,
+			    unsigned int nr_bytes, unsigned int bidi_bytes)
 {
-	int error = 0;
+	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+		return true;
 
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
+	blk_finish_request(rq, error);
 
-	__blk_end_request(req, error, req->hard_cur_sectors << 9);
-}
-EXPORT_SYMBOL(end_request);
-
-/**
- * blk_update_request - Special helper function for request stacking drivers
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete @rq
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
- *     the request structure even if @rq doesn't have leftover.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- *     This special helper function is only for request stacking drivers
- *     (e.g. request-based dm) so that they can handle partial completion.
- *     Actual device drivers should use blk_end_request instead.
- */
-void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	if (!end_that_request_data(rq, error, nr_bytes, 0)) {
-		/*
-		 * These members are not updated in end_that_request_data()
-		 * when all bios are completed.
-		 * Update them so that the request stacking driver can find
-		 * how many bytes remain in the request later.
-		 */
-		rq->nr_sectors = rq->hard_nr_sectors = 0;
-		rq->current_nr_sectors = rq->hard_cur_sectors = 0;
-	}
+	return false;
 }
-EXPORT_SYMBOL_GPL(blk_update_request);
+EXPORT_SYMBOL_GPL(__blk_end_bidi_request);
 
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1fa9dcf..501f684 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -840,27 +840,97 @@ extern unsigned int blk_rq_bytes(struct request *rq);
 extern unsigned int blk_rq_cur_bytes(struct request *rq);
 
 /*
- * blk_end_request() and friends.
- * __blk_end_request() and end_request() must be called with
- * the request queue spinlock acquired.
+ * Request completion related functions.
+ *
+ * blk_update_request() completes given number of bytes and updates
+ * the request without completing it.
+ *
+ * blk_end_request() and friends.  __blk_end_request() and
+ * end_request() must be called with the request queue spinlock
+ * acquired.
  *
  * Several drivers define their own end_request and call
  * blk_end_request() for parts of the original function.
  * This prevents code duplication in drivers.
  */
-extern int blk_end_request(struct request *rq, int error,
-				unsigned int nr_bytes);
-extern int __blk_end_request(struct request *rq, int error,
-				unsigned int nr_bytes);
-extern int blk_end_bidi_request(struct request *rq, int error,
-				unsigned int nr_bytes, unsigned int bidi_bytes);
-extern void end_request(struct request *, int);
+extern bool blk_update_request(struct request *rq, int error,
+			       unsigned int nr_bytes);
+extern bool blk_end_bidi_request(struct request *rq, int error,
+				 unsigned int nr_bytes,
+				 unsigned int bidi_bytes);
+extern bool __blk_end_bidi_request(struct request *rq, int error,
+				   unsigned int nr_bytes,
+				   unsigned int bidi_bytes);
+
+/**
+ * blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+static inline bool blk_end_request(struct request *rq, int error,
+				   unsigned int nr_bytes)
+{
+	return blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+
+/**
+ * __blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Must be called with queue lock held unlike blk_end_request().
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+static inline bool __blk_end_request(struct request *rq, int error,
+				     unsigned int nr_bytes)
+{
+	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+
+/**
+ * end_request - end I/O on the current segment of the request
+ * @rq:		the request being processed
+ * @uptodate:	error value or %0/%1 uptodate flag
+ *
+ * Description:
+ *     Ends I/O on the current segment of a request. If that is the only
+ *     remaining segment, the request is also completed and freed.
+ *
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
+ *     they have a residual value to account for. For that case this function
+ *     isn't really useful, unless the residual just happens to be the
+ *     full current segment. In other words, don't use this function in new
+ *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ **/
+static inline void end_request(struct request *rq, int uptodate)
+{
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0);
+}
+
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
-extern void blk_update_request(struct request *rq, int error,
-			       unsigned int nr_bytes);
 
 /*
  * Access functions for manipulating queue properties
-- 
cgit v0.10.2


From b243ddcbe9be146172baa544dadecebf156eda0e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: block: move rq->start_time initialization to blk_rq_init()

rq->start_time was initialized in init_request_from_bio() so special
requests didn't have start_time set.  This has been okay as start_time
has been used only for fs requests; however, there is no indication of
this actually is the case or not.  Set rq->start_time in blk_rq_init()
and guarantee that all initialized rq's have its start_time set.  This
improves consistency at virtually no cost and future changes will make
use of the timestamp for !bio requests.

[ Impact: rq->start_time is valid for all requests ]

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 89cc05d..b84250d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -134,6 +134,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->cmd_len = BLK_MAX_CDB;
 	rq->tag = -1;
 	rq->ref_count = 1;
+	rq->start_time = jiffies;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -1099,7 +1100,6 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
-	req->start_time = jiffies;
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-- 
cgit v0.10.2


From 40cbbb781d3eba5d6ac0860db078af490e5c7c6b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:19 +0900
Subject: block: implement and use [__]blk_end_request_all()

There are many [__]blk_end_request() call sites which call it with
full request length and expect full completion.  Many of them ensure
that the request actually completes by doing BUG_ON() the return
value, which is awkward and error-prone.

This patch adds [__]blk_end_request_all() which takes @rq and @error
and fully completes the request.  BUG_ON() is added to to ensure that
this actually happens.

Most conversions are simple but there are a few noteworthy ones.

* cdrom/viocd: viocd_end_request() replaced with direct calls to
  __blk_end_request_all().

* s390/block/dasd: dasd_end_request() replaced with direct calls to
  __blk_end_request_all().

* s390/char/tape_block: tapeblock_end_request() replaced with direct
  calls to blk_end_request_all().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 0abfbaa..cf81bad 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -192,8 +192,7 @@ static void mbox_tx_work(struct work_struct *work)
 		}
 
 		spin_lock(q->queue_lock);
-		if (__blk_end_request(rq, 0, 0))
-			BUG();
+		__blk_end_request_all(rq, 0);
 		spin_unlock(q->queue_lock);
 	}
 }
@@ -224,10 +223,7 @@ static void mbox_rx_work(struct work_struct *work)
 			break;
 
 		msg = (mbox_msg_t) rq->data;
-
-		if (blk_end_request(rq, 0, 0))
-			BUG();
-
+		blk_end_request_all(rq, 0);
 		mbox->rxq->callback((void *)msg);
 	}
 }
@@ -337,8 +333,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 
 		*p = (mbox_msg_t) rq->data;
 
-		if (blk_end_request(rq, 0, 0))
-			BUG();
+		blk_end_request_all(rq, 0);
 
 		if (unlikely(mbox_seq_test(mbox, *p))) {
 			pr_info("mbox: Illegal seq bit!(%08x) ignored\n", *p);
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 20b4111..c8d0876 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -106,10 +106,7 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	 */
 	q->ordseq = 0;
 	rq = q->orig_bar_rq;
-
-	if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
-		BUG();
-
+	__blk_end_request_all(rq, q->orderr);
 	return true;
 }
 
@@ -252,9 +249,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 * with prejudice.
 			 */
 			elv_dequeue_request(q, rq);
-			if (__blk_end_request(rq, -EOPNOTSUPP,
-					      blk_rq_bytes(rq)))
-				BUG();
+			__blk_end_request_all(rq, -EOPNOTSUPP);
 			*rqp = NULL;
 			return false;
 		}
diff --git a/block/blk-core.c b/block/blk-core.c
index b84250d..0520cc7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1780,7 +1780,7 @@ struct request *elv_next_request(struct request_queue *q)
 			break;
 		} else if (ret == BLKPREP_KILL) {
 			rq->cmd_flags |= REQ_QUIET;
-			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+			__blk_end_request_all(rq, -EIO);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 			break;
diff --git a/block/elevator.c b/block/elevator.c
index b03b875..1af5d9f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -810,7 +810,7 @@ void elv_abort_queue(struct request_queue *q)
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
 		trace_block_rq_abort(q, rq);
-		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+		__blk_end_request_all(rq, -EIO);
 	}
 }
 EXPORT_SYMBOL(elv_abort_queue);
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index ca268ca..488a8f4 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1024,8 +1024,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
 				cmd->req.sg[i].size, ddir);
 
 	DBGPX(printk("Done with %p\n", rq););
-	if (__blk_end_request(rq, error, blk_rq_bytes(rq)))
-		BUG();
+	__blk_end_request_all(rq, error);
 }
 
 /*
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index ff0448e..60e85bb 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -749,8 +749,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
 	struct request *req = crq->rq;
 	int rc;
 
-	rc = __blk_end_request(req, error, blk_rq_bytes(req));
-	assert(rc == 0);
+	__blk_end_request_all(req, error);
 
 	rc = carm_put_request(host, crq);
 	assert(rc == 0);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5d34764..50745e6 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -62,7 +62,7 @@ static void blk_done(struct virtqueue *vq)
 			break;
 		}
 
-		__blk_end_request(vbr->req, error, blk_rq_bytes(vbr->req));
+		__blk_end_request_all(vbr->req, error);
 		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
 	}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8f90508..cd6cfe3 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -551,7 +551,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 	for (i = info->ring.rsp_cons; i != rp; i++) {
 		unsigned long id;
-		int ret;
 
 		bret = RING_GET_RESPONSE(&info->ring, i);
 		id   = bret->id;
@@ -578,8 +577,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 					"request: %x\n", bret->status);
 
-			ret = __blk_end_request(req, error, blk_rq_bytes(req));
-			BUG_ON(ret);
+			__blk_end_request_all(req, error);
 			break;
 		default:
 			BUG();
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 2eecb77..fee9a9e 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -632,7 +632,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 		* before handling ending the request */
 		spin_lock(&gdrom_lock);
 		list_del_init(&req->queuelist);
-		__blk_end_request(req, err, blk_rq_bytes(req));
+		__blk_end_request_all(req, err);
 	}
 	spin_unlock(&gdrom_lock);
 	kfree(read_command);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 1392935..cc3efa0 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -291,23 +291,6 @@ static int send_request(struct request *req)
 	return 0;
 }
 
-static void viocd_end_request(struct request *req, int error)
-{
-	int nsectors = req->hard_nr_sectors;
-
-	/*
-	 * Make sure it's fully ended, and ensure that we process
-	 * at least one sector.
-	 */
-	if (blk_pc_request(req))
-		nsectors = (req->data_len + 511) >> 9;
-	if (!nsectors)
-		nsectors = 1;
-
-	if (__blk_end_request(req, error, nsectors << 9))
-		BUG();
-}
-
 static int rwreq;
 
 static void do_viocd_request(struct request_queue *q)
@@ -316,11 +299,11 @@ static void do_viocd_request(struct request_queue *q)
 
 	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
 		if (!blk_fs_request(req))
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
 			printk(VIOCD_KERN_WARNING
 					"unable to send message to OS/400!");
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		} else
 			rwreq++;
 	}
@@ -531,9 +514,9 @@ return_complete:
 					"with rc %d:0x%04X: %s\n",
 					req, event->xRc,
 					bevent->sub_result, err->msg);
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		} else
-			viocd_end_request(req, 0);
+			__blk_end_request_all(req, 0);
 
 		/* restart handling of incoming requests */
 		spin_unlock_irqrestore(&viocd_reqlock, flags);
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index de143de..a416346 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -826,7 +826,7 @@ static void mspro_block_submit_req(struct request_queue *q)
 
 	if (msb->eject) {
 		while ((req = elv_next_request(q)) != NULL)
-			__blk_end_request(req, -ENODEV, blk_rq_bytes(req));
+			__blk_end_request_all(req, -ENODEV);
 
 		return;
 	}
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index d181527..fabec95 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1614,15 +1614,6 @@ void dasd_block_clear_timer(struct dasd_block *block)
 }
 
 /*
- * posts the buffer_cache about a finalized request
- */
-static inline void dasd_end_request(struct request *req, int error)
-{
-	if (__blk_end_request(req, error, blk_rq_bytes(req)))
-		BUG();
-}
-
-/*
  * Process finished error recovery ccw.
  */
 static inline void __dasd_block_process_erp(struct dasd_block *block,
@@ -1676,7 +1667,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "Rejecting write request %p",
 				      req);
 			blkdev_dequeue_request(req);
-			dasd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 		cqr = basedev->discipline->build_cp(basedev, block, req);
@@ -1705,7 +1696,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "on request %p",
 				      PTR_ERR(cqr), req);
 			blkdev_dequeue_request(req);
-			dasd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 		/*
@@ -1731,7 +1722,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
 	status = cqr->block->base->discipline->free_cp(cqr, req);
 	if (status <= 0)
 		error = status ? status : -EIO;
-	dasd_end_request(req, error);
+	__blk_end_request_all(req, error);
 }
 
 /*
@@ -2040,7 +2031,7 @@ static void dasd_flush_request_queue(struct dasd_block *block)
 	spin_lock_irq(&block->request_queue_lock);
 	while ((req = elv_next_request(block->request_queue))) {
 		blkdev_dequeue_request(req);
-		dasd_end_request(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 	}
 	spin_unlock_irq(&block->request_queue_lock);
 }
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index f32e89e..86596d3 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -74,13 +74,6 @@ tapeblock_trigger_requeue(struct tape_device *device)
  * Post finished request.
  */
 static void
-tapeblock_end_request(struct request *req, int error)
-{
-	if (blk_end_request(req, error, blk_rq_bytes(req)))
-		BUG();
-}
-
-static void
 __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 {
 	struct tape_device *device;
@@ -90,7 +83,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 
 	device = ccw_req->device;
 	req = (struct request *) data;
-	tapeblock_end_request(req, (ccw_req->rc == 0) ? 0 : -EIO);
+	blk_end_request_all(req, (ccw_req->rc == 0) ? 0 : -EIO);
 	if (ccw_req->rc == 0)
 		/* Update position. */
 		device->blk_data.block_position =
@@ -118,7 +111,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req)
 	ccw_req = device->discipline->bread(device, req);
 	if (IS_ERR(ccw_req)) {
 		DBF_EVENT(1, "TBLOCK: bread failed\n");
-		tapeblock_end_request(req, -EIO);
+		blk_end_request_all(req, -EIO);
 		return PTR_ERR(ccw_req);
 	}
 	ccw_req->callback = __tapeblock_end_request;
@@ -131,7 +124,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req)
 		 * Start/enqueueing failed. No retries in
 		 * this case.
 		 */
-		tapeblock_end_request(req, -EIO);
+		blk_end_request_all(req, -EIO);
 		device->discipline->free_bread(ccw_req);
 	}
 
@@ -177,7 +170,7 @@ tapeblock_requeue(struct work_struct *work) {
 			DBF_EVENT(1, "TBLOCK: Rejecting write request\n");
 			blkdev_dequeue_request(req);
 			spin_unlock_irq(&device->blk_data.request_queue_lock);
-			tapeblock_end_request(req, -EIO);
+			blk_end_request_all(req, -EIO);
 			spin_lock_irq(&device->blk_data.request_queue_lock);
 			continue;
 		}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d1cb64a..756ac7c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -922,7 +922,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			if (driver_byte(result) & DRIVER_SENSE)
 				scsi_print_sense("", cmd);
 		}
-		blk_end_request(req, -EIO, blk_rq_bytes(req));
+		blk_end_request_all(req, -EIO);
 		scsi_next_command(cmd);
 		break;
 	case ACTION_REPREP:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 501f684..e33c835 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -883,6 +883,22 @@ static inline bool blk_end_request(struct request *rq, int error,
 }
 
 /**
+ * blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.
+ */
+static inline void blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+
+	pending = blk_end_request(rq, error, blk_rq_bytes(rq));
+	BUG_ON(pending);
+}
+
+/**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
  * @error:    %0 for success, < %0 for error
@@ -902,6 +918,22 @@ static inline bool __blk_end_request(struct request *rq, int error,
 }
 
 /**
+ * __blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.  Must be called with queue lock held.
+ */
+static inline void __blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+
+	pending = __blk_end_request(rq, error, blk_rq_bytes(rq));
+	BUG_ON(pending);
+}
+
+/**
  * end_request - end I/O on the current segment of the request
  * @rq:		the request being processed
  * @uptodate:	error value or %0/%1 uptodate flag
-- 
cgit v0.10.2


From f06d9a2b52e246a66b606130cea3f0d7b7be17a7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:19 +0900
Subject: block: replace end_request() with [__]blk_end_request_cur()

end_request() has been kept around for backward compatibility;
however, it's about time for it to go away.

* There aren't too many users left.

* Its use of @updtodate is pretty confusing.

* In some cases, newer code ends up using mixture of end_request() and
  [__]blk_end_request[_all](), which is way too confusing.

So, add [__]blk_end_request_cur() and replace end_request() with it.
Most conversions are straightforward.  Noteworthy ones are...

* paride/pcd: next_request() updated to take 0/-errno instead of 1/0.

* paride/pf: pf_end_request() and next_request() updated to take
  0/-errno instead of 1/0.

* xd: xd_readwrite() updated to return 0/-errno instead of 1/0.

* mtd/mtd_blkdevs: blktrans_discard_request() updated to return
  0/-errno instead of 1/0.  Unnecessary local variable res
  initialization removed from mtd_blktrans_thread().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Joerg Dorchain <joerg@dorchain.net>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Laurent Vivier <Laurent@lvivier.info>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: unsik Kim <donari75@gmail.com>

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8df436f..b99a2a6 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1359,7 +1359,7 @@ static void redo_fd_request(void)
 #endif
 		block = CURRENT->sector + cnt;
 		if ((int)block > floppy->blocks) {
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 
@@ -1373,11 +1373,11 @@ static void redo_fd_request(void)
 
 		if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) {
 			printk(KERN_WARNING "do_fd_request: unknown command\n");
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		if (get_track(drive, track) == -1) {
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 
@@ -1391,7 +1391,7 @@ static void redo_fd_request(void)
 
 			/* keep the drive spinning while writes are scheduled */
 			if (!fd_motor_on(drive)) {
-				end_request(CURRENT, 0);
+				__blk_end_request_cur(CURRENT, -EIO);
 				goto repeat;
 			}
 			/*
@@ -1410,7 +1410,7 @@ static void redo_fd_request(void)
 	CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 	CURRENT->sector += CURRENT->current_nr_sectors;
 
-	end_request(CURRENT, 1);
+	__blk_end_request_cur(CURRENT, 0);
 	goto repeat;
 }
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 4234c11..44a8702 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -612,7 +612,7 @@ static void fd_error( void )
 	CURRENT->errors++;
 	if (CURRENT->errors >= MAX_ERRORS) {
 		printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 	}
 	else if (CURRENT->errors == RECALIBRATE_ERRORS) {
 		printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
@@ -734,7 +734,7 @@ static void do_fd_action( int drive )
 			/* all sectors finished */
 			CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 			CURRENT->sector += CURRENT->current_nr_sectors;
-			end_request(CURRENT, 1);
+			__blk_end_request_cur(CURRENT, 0);
 			redo_fd_request();
 			return;
 		    }
@@ -1141,7 +1141,7 @@ static void fd_rwsec_done1(int status)
 		/* all sectors finished */
 		CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 		CURRENT->sector += CURRENT->current_nr_sectors;
-		end_request(CURRENT, 1);
+		__blk_end_request_cur(CURRENT, 0);
 		redo_fd_request();
 	}
 	return;
@@ -1414,7 +1414,7 @@ repeat:
 	if (!UD.connected) {
 		/* drive not connected */
 		printk(KERN_ERR "Unknown Device: fd%d\n", drive );
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
 		
@@ -1430,12 +1430,12 @@ repeat:
 		/* user supplied disk type */
 		if (--type >= NUM_DISK_MINORS) {
 			printk(KERN_WARNING "fd%d: invalid disk format", drive );
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		if (minor2disktype[type].drive_types > DriveType)  {
 			printk(KERN_WARNING "fd%d: unsupported disk format", drive );
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		type = minor2disktype[type].index;
@@ -1445,7 +1445,7 @@ repeat:
 	}
 	
 	if (CURRENT->sector + 1 > UDT->blocks) {
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index baaa9e4..5cb300b 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -410,7 +410,7 @@ static void bad_rw_intr(void)
 	if (req != NULL) {
 		struct hd_i_struct *disk = req->rq_disk->private_data;
 		if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			disk->special_op = disk->recalibrate = 1;
 		} else if (req->errors % RESET_FREQ == 0)
 			reset = 1;
@@ -466,7 +466,7 @@ ok_to_read:
 		req->buffer+512);
 #endif
 	if (req->current_nr_sectors <= 0)
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	if (i > 0) {
 		SET_HANDLER(&read_intr);
 		return;
@@ -505,7 +505,7 @@ ok_to_write:
 	--req->current_nr_sectors;
 	req->buffer += 512;
 	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	if (i > 0) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
@@ -548,7 +548,7 @@ static void hd_times_out(unsigned long dummy)
 #ifdef DEBUG
 		printk("%s: too many errors\n", name);
 #endif
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 	}
 	hd_request();
 	spin_unlock_irq(hd_queue->queue_lock);
@@ -563,7 +563,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req)
 	}
 	if (disk->head > 16) {
 		printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 	}
 	disk->special_op = 0;
 	return 1;
@@ -607,7 +607,7 @@ repeat:
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
 			req->rq_disk->disk_name, block, nsect);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		goto repeat;
 	}
 
@@ -647,7 +647,7 @@ repeat:
 			break;
 		default:
 			printk("unknown hd-command\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			break;
 		}
 	}
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index f389835..408c2bd 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -285,7 +285,7 @@ static void mg_bad_rw_intr(struct mg_host *host)
 	if (req != NULL)
 		if (++req->errors >= MG_MAX_ERRORS ||
 				host->error == MG_ERR_TIMEOUT)
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 }
 
 static unsigned int mg_out(struct mg_host *host,
@@ -351,7 +351,7 @@ static void mg_read(struct request *req)
 
 		if (req->current_nr_sectors <= 0) {
 			MG_DBG("remain : %d sects\n", remains);
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			if (remains > 0)
 				req = elv_next_request(host->breq);
 		}
@@ -395,7 +395,7 @@ static void mg_write(struct request *req)
 
 		if (req->current_nr_sectors <= 0) {
 			MG_DBG("remain : %d sects\n", remains);
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			if (remains > 0)
 				req = elv_next_request(host->breq);
 		}
@@ -448,7 +448,7 @@ ok_to_read:
 
 	/* let know if current segment done */
 	if (req->current_nr_sectors <= 0)
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 
 	/* set handler if read remains */
 	if (i > 0) {
@@ -497,7 +497,7 @@ ok_to_write:
 
 	/* let know if current segment or all done */
 	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 
 	/* write 1 sector and set handler if remains */
 	if (i > 0) {
@@ -563,7 +563,7 @@ static void mg_request_poll(struct request_queue *q)
 			default:
 				printk(KERN_WARNING "%s:%d unknown command\n",
 						__func__, __LINE__);
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				break;
 			}
 		}
@@ -617,7 +617,7 @@ static unsigned int mg_issue_req(struct request *req,
 	default:
 		printk(KERN_WARNING "%s:%d unknown command\n",
 				__func__, __LINE__);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		break;
 	}
 	return MG_ERR_NONE;
@@ -655,7 +655,7 @@ static void mg_request(struct request_queue *q)
 					"%s: bad access: sector=%d, count=%d\n",
 					req->rq_disk->disk_name,
 					sect_num, sect_cnt);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index e91d4b4..9fd57c2 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -735,16 +735,16 @@ static void do_pcd_request(struct request_queue * q)
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
 		} else
-			end_request(pcd_req, 0);
+			__blk_end_request_cur(pcd_req, -EIO);
 	}
 }
 
-static inline void next_request(int success)
+static inline void next_request(int err)
 {
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pcd_lock, saved_flags);
-	end_request(pcd_req, success);
+	__blk_end_request_cur(pcd_req, err);
 	pcd_busy = 0;
 	do_pcd_request(pcd_queue);
 	spin_unlock_irqrestore(&pcd_lock, saved_flags);
@@ -781,7 +781,7 @@ static void pcd_start(void)
 
 	if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) {
 		pcd_bufblk = -1;
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
@@ -796,7 +796,7 @@ static void do_pcd_read(void)
 	pcd_retries = 0;
 	pcd_transfer();
 	if (!pcd_count) {
-		next_request(1);
+		next_request(0);
 		return;
 	}
 
@@ -815,7 +815,7 @@ static void do_pcd_read_drq(void)
 			return;
 		}
 		pcd_bufblk = -1;
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9299455..0732df4 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -410,7 +410,8 @@ static void run_fsm(void)
 				pd_claimed = 0;
 				phase = NULL;
 				spin_lock_irqsave(&pd_lock, saved_flags);
-				end_request(pd_req, res);
+				__blk_end_request_cur(pd_req,
+						      res == Ok ? 0 : -EIO);
 				pd_req = elv_next_request(pd_queue);
 				if (!pd_req)
 					stop = 1;
@@ -477,7 +478,7 @@ static int pd_next_buf(void)
 	if (pd_count)
 		return 0;
 	spin_lock_irqsave(&pd_lock, saved_flags);
-	end_request(pd_req, 1);
+	__blk_end_request_cur(pd_req, 0);
 	pd_count = pd_req->current_nr_sectors;
 	pd_buf = pd_req->buffer;
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index bef3b99..3871e35 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -750,10 +750,10 @@ static int pf_ready(void)
 
 static struct request_queue *pf_queue;
 
-static void pf_end_request(int uptodate)
+static void pf_end_request(int err)
 {
 	if (pf_req) {
-		end_request(pf_req, uptodate);
+		__blk_end_request_cur(pf_req, err);
 		pf_req = NULL;
 	}
 }
@@ -773,7 +773,7 @@ repeat:
 	pf_count = pf_req->current_nr_sectors;
 
 	if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) {
-		pf_end_request(0);
+		pf_end_request(-EIO);
 		goto repeat;
 	}
 
@@ -788,7 +788,7 @@ repeat:
 		pi_do_claimed(pf_current->pi, do_pf_write);
 	else {
 		pf_busy = 0;
-		pf_end_request(0);
+		pf_end_request(-EIO);
 		goto repeat;
 	}
 }
@@ -805,7 +805,7 @@ static int pf_next_buf(void)
 		return 1;
 	if (!pf_count) {
 		spin_lock_irqsave(&pf_spin_lock, saved_flags);
-		pf_end_request(1);
+		pf_end_request(0);
 		pf_req = elv_next_request(pf_queue);
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
@@ -816,12 +816,12 @@ static int pf_next_buf(void)
 	return 0;
 }
 
-static inline void next_request(int success)
+static inline void next_request(int err)
 {
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pf_spin_lock, saved_flags);
-	pf_end_request(success);
+	pf_end_request(err);
 	pf_busy = 0;
 	do_pf_request(pf_queue);
 	spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
@@ -844,7 +844,7 @@ static void do_pf_read_start(void)
 			pi_do_claimed(pf_current->pi, do_pf_read_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 	pf_mask = STAT_DRQ;
@@ -863,7 +863,7 @@ static void do_pf_read_drq(void)
 				pi_do_claimed(pf_current->pi, do_pf_read_start);
 				return;
 			}
-			next_request(0);
+			next_request(-EIO);
 			return;
 		}
 		pi_read_block(pf_current->pi, pf_buf, 512);
@@ -871,7 +871,7 @@ static void do_pf_read_drq(void)
 			break;
 	}
 	pi_disconnect(pf_current->pi);
-	next_request(1);
+	next_request(0);
 }
 
 static void do_pf_write(void)
@@ -890,7 +890,7 @@ static void do_pf_write_start(void)
 			pi_do_claimed(pf_current->pi, do_pf_write_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
@@ -903,7 +903,7 @@ static void do_pf_write_start(void)
 				pi_do_claimed(pf_current->pi, do_pf_write_start);
 				return;
 			}
-			next_request(0);
+			next_request(-EIO);
 			return;
 		}
 		pi_write_block(pf_current->pi, pf_buf, 512);
@@ -923,11 +923,11 @@ static void do_pf_write_done(void)
 			pi_do_claimed(pf_current->pi, do_pf_write_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 	pi_disconnect(pf_current->pi);
-	next_request(1);
+	next_request(0);
 }
 
 static int __init pf_init(void)
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index bccc42b..d23b54b 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
 			__LINE__, op, res);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		return 0;
 	}
 
@@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
 			__func__, __LINE__, res);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		return 0;
 	}
 
@@ -205,7 +205,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 				break;
 		} else {
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 	}
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index d22cc38..6544a7b 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -532,39 +532,39 @@ static void redo_fd_request(struct request_queue *q)
 
 		fs = req->rq_disk->private_data;
 		if (req->sector < 0 || req->sector >= fs->total_secs) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (req->current_nr_sectors == 0) {
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			continue;
 		}
 		if (!fs->disk_in) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (rq_data_dir(req) == WRITE) {
 			if (fs->write_protected) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 		}
 		switch (rq_data_dir(req)) {
 		case WRITE:
 			/* NOT IMPLEMENTED */
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
 			if (floppy_read_sectors(fs, req->sector,
 						req->current_nr_sectors,
 						req->buffer)) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 			req->nr_sectors -= req->current_nr_sectors;
 			req->sector += req->current_nr_sectors;
 			req->buffer += req->current_nr_sectors * 512;
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			break;
 		}
 	}
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 61296530..5904f7b 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -320,15 +320,15 @@ static void start_request(struct floppy_state *fs)
 #endif
 
 		if (req->sector < 0 || req->sector >= fs->total_secs) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (req->current_nr_sectors == 0) {
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			continue;
 		}
 		if (fs->ejected) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
@@ -336,7 +336,7 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 		}
@@ -508,7 +508,7 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					end_request(fd_req, 0);
+					__blk_end_request_cur(fd_req, -EIO);
 					fs->state = idle;
 					return;
 				}
@@ -540,7 +540,7 @@ static void scan_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		end_request(fd_req, 0);
+		__blk_end_request_cur(fd_req, -EIO);
 		fs->state = idle;
 		start_request(fs);
 	} else {
@@ -559,7 +559,7 @@ static void seek_timeout(unsigned long data)
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	printk(KERN_ERR "swim3: seek timeout\n");
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -583,7 +583,7 @@ static void settle_timeout(unsigned long data)
 		return;
 	}
 	printk(KERN_ERR "swim3: seek settle timeout\n");
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -615,7 +615,7 @@ static void xfer_timeout(unsigned long data)
 	fd_req->current_nr_sectors -= s;
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -646,7 +646,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					end_request(fd_req, 0);
+					__blk_end_request_cur(fd_req, -EIO);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -731,7 +731,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
 				       (long)fd_req->sector, err);
-				end_request(fd_req, 0);
+				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 			}
 		} else {
@@ -740,7 +740,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
 				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
 				       fs->state, rq_data_dir(fd_req), intr, err);
-				end_request(fd_req, 0);
+				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 				start_request(fs);
 				break;
@@ -749,7 +749,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			fd_req->current_nr_sectors -= fs->scount;
 			fd_req->buffer += fs->scount * 512;
 			if (fd_req->current_nr_sectors <= 0) {
-				end_request(fd_req, 1);
+				__blk_end_request_cur(fd_req, 0);
 				fs->state = idle;
 			} else {
 				fs->req_sector += fs->scount;
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 64b496f..6f6ad82 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -314,21 +314,22 @@ static void do_xd_request (struct request_queue * q)
 		int retry;
 
 		if (!blk_fs_request(req)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (block + count > get_capacity(req->rq_disk)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (rw != READ && rw != WRITE) {
 			printk("do_xd_request: unknown request\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
 			res = xd_readwrite(rw, disk, req->buffer, block, count);
-		end_request(req, res);	/* wrap up, 0 = fail, 1 = success */
+		/* wrap up, 0 = success, -errno = fail */
+		__blk_end_request_cur(req, res);
 	}
 }
 
@@ -418,7 +419,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 				printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write"));
 				xd_recalibrate(drive);
 				spin_lock_irq(&xd_lock);
-				return (0);
+				return -EIO;
 			case 2:
 				if (sense[0] & 0x30) {
 					printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing"));
@@ -439,7 +440,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 				else
 					printk(" - no valid disk address\n");
 				spin_lock_irq(&xd_lock);
-				return (0);
+				return -EIO;
 		}
 		if (xd_dma_buffer)
 			for (i=0; i < (temp * 0x200); i++)
@@ -448,7 +449,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 		count -= temp, buffer += temp * 0x200, block += temp;
 	}
 	spin_lock_irq(&xd_lock);
-	return (1);
+	return 0;
 }
 
 /* xd_recalibrate: recalibrate a given drive and reset controller if necessary */
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index cd6cfe3..b456447 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -302,7 +302,7 @@ static void do_blkif_request(struct request_queue *rq)
 	while ((req = elv_next_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
 		if (!blk_fs_request(req)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 4aecf5d..b1e1d7e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -466,7 +466,7 @@ struct request *ace_get_next_request(struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 	}
 	return req;
 }
@@ -494,7 +494,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 
 		/* Drop all pending requests */
 		while ((req = elv_next_request(ace->queue)) != NULL)
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 80754cd..b66ad58 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -77,7 +77,7 @@ static void do_z2_request(struct request_queue *q)
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
 				req->sector, req->current_nr_sectors);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		while (len) {
@@ -93,7 +93,7 @@ static void do_z2_request(struct request_queue *q)
 			start += size;
 			len -= size;
 		}
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	}
 }
 
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index fee9a9e..cab2b1f 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -654,17 +654,17 @@ static void gdrom_request(struct request_queue *rq)
 	while ((req = elv_next_request(rq)) != NULL) {
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 		}
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_NOTICE "GDROM: Read only device -");
 			printk(" write request ignored\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 		}
 		if (req->nr_sectors)
 			gdrom_request_handler_dma(req);
 		else
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 	}
 }
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index a443e13..221317e 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -923,7 +923,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 				break;
 			}
 		} else
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 	}
 };
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index a49a9c8..76c4c8d 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -54,33 +54,33 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
 	    req->cmd[0] == REQ_LB_OP_DISCARD)
-		return !tr->discard(dev, block, nsect);
+		return tr->discard(dev, block, nsect);
 
 	if (!blk_fs_request(req))
-		return 0;
+		return -EIO;
 
 	if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk))
-		return 0;
+		return -EIO;
 
 	switch(rq_data_dir(req)) {
 	case READ:
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
-				return 0;
-		return 1;
+				return -EIO;
+		return 0;
 
 	case WRITE:
 		if (!tr->writesect)
-			return 0;
+			return -EIO;
 
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->writesect(dev, block, buf))
-				return 0;
-		return 1;
+				return -EIO;
+		return 0;
 
 	default:
 		printk(KERN_NOTICE "Unknown request %u\n", rq_data_dir(req));
-		return 0;
+		return -EIO;
 	}
 }
 
@@ -96,7 +96,7 @@ static int mtd_blktrans_thread(void *arg)
 	while (!kthread_should_stop()) {
 		struct request *req;
 		struct mtd_blktrans_dev *dev;
-		int res = 0;
+		int res;
 
 		req = elv_next_request(rq);
 
@@ -119,7 +119,7 @@ static int mtd_blktrans_thread(void *arg)
 
 		spin_lock_irq(rq->queue_lock);
 
-		end_request(req, res);
+		__blk_end_request_cur(req, res);
 	}
 	spin_unlock_irq(rq->queue_lock);
 
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index a85ad05..0961788 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -192,25 +192,25 @@ static void jsfd_do_request(struct request_queue *q)
 		size_t len = req->current_nr_sectors << 9;
 
 		if ((offset + len) > jdp->dsize) {
-               		end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_ERR "jsfd: write\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		if ((jdp->dbase & 0xff000000) != 0x20000000) {
 			printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
 
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e33c835..cfeb3c2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -845,9 +845,8 @@ extern unsigned int blk_rq_cur_bytes(struct request *rq);
  * blk_update_request() completes given number of bytes and updates
  * the request without completing it.
  *
- * blk_end_request() and friends.  __blk_end_request() and
- * end_request() must be called with the request queue spinlock
- * acquired.
+ * blk_end_request() and friends.  __blk_end_request() must be called
+ * with the request queue spinlock acquired.
  *
  * Several drivers define their own end_request and call
  * blk_end_request() for parts of the original function.
@@ -899,6 +898,19 @@ static inline void blk_end_request_all(struct request *rq, int error)
 }
 
 /**
+ * blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.
+ */
+static inline void blk_end_request_cur(struct request *rq, int error)
+{
+	blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+}
+
+/**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
  * @error:    %0 for success, < %0 for error
@@ -934,29 +946,17 @@ static inline void __blk_end_request_all(struct request *rq, int error)
 }
 
 /**
- * end_request - end I/O on the current segment of the request
- * @rq:		the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
+ * __blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
  *
  * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
- **/
-static inline void end_request(struct request *rq, int uptodate)
+ *     Complete the current consecutively mapped chunk from @rq.  Must
+ *     be called with queue lock held.
+ */
+static inline void __blk_end_request_cur(struct request *rq, int error)
 {
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0);
+	__blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 extern void blk_complete_request(struct request *);
-- 
cgit v0.10.2


From ec24751a6b57e1373a12361e581b2458bc9bb791 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:20 +0900
Subject: arm-omap: don't abuse rq->data

omap mailbox uses rq->data as the second opaque pointer to carry
mbox_msg_t and rq->special message argument which is needed only for
tx.  Add and use omap_msg_tx_data struct for tx and use rq->special
for mbox_msg_t for rx such that only rq->special is used as opaque
pointer.

[ Impact: cleanup rq->data usage, extra kmalloc in msg_send ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index cf81bad..538ba75 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -147,24 +147,40 @@ static int __mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void *arg)
 	return ret;
 }
 
+struct omap_msg_tx_data {
+	mbox_msg_t	msg;
+	void		*arg;
+};
+
+static void omap_msg_tx_end_io(struct request *rq, int error)
+{
+	kfree(rq->special);
+	__blk_put_request(rq->q, rq);
+}
+
 int omap_mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void* arg)
 {
+	struct omap_msg_tx_data *tx_data;
 	struct request *rq;
 	struct request_queue *q = mbox->txq->queue;
-	int ret = 0;
+
+	tx_data = kmalloc(sizeof(*tx_data), GFP_ATOMIC);
+	if (unlikely(!tx_data))
+		return -ENOMEM;
 
 	rq = blk_get_request(q, WRITE, GFP_ATOMIC);
 	if (unlikely(!rq)) {
-		ret = -ENOMEM;
-		goto fail;
+		kfree(tx_data);
+		return -ENOMEM;
 	}
 
-	rq->data = (void *)msg;
-	blk_insert_request(q, rq, 0, arg);
+	tx_data->msg = msg;
+	tx_data->arg = arg;
+	rq->end_io = omap_msg_tx_end_io;
+	blk_insert_request(q, rq, 0, tx_data);
 
 	schedule_work(&mbox->txq->work);
- fail:
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(omap_mbox_msg_send);
 
@@ -178,6 +194,8 @@ static void mbox_tx_work(struct work_struct *work)
 	struct request_queue *q = mbox->txq->queue;
 
 	while (1) {
+		struct omap_msg_tx_data *tx_data;
+
 		spin_lock(q->queue_lock);
 		rq = elv_next_request(q);
 		spin_unlock(q->queue_lock);
@@ -185,7 +203,9 @@ static void mbox_tx_work(struct work_struct *work)
 		if (!rq)
 			break;
 
-		ret = __mbox_msg_send(mbox, (mbox_msg_t) rq->data, rq->special);
+		tx_data = rq->special;
+
+		ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg);
 		if (ret) {
 			enable_mbox_irq(mbox, IRQ_TX);
 			return;
@@ -222,7 +242,7 @@ static void mbox_rx_work(struct work_struct *work)
 		if (!rq)
 			break;
 
-		msg = (mbox_msg_t) rq->data;
+		msg = (mbox_msg_t)rq->special;
 		blk_end_request_all(rq, 0);
 		mbox->rxq->callback((void *)msg);
 	}
@@ -260,7 +280,6 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 			goto nomem;
 
 		msg = mbox_fifo_read(mbox);
-		rq->data = (void *)msg;
 
 		if (unlikely(mbox_seq_test(mbox, msg))) {
 			pr_info("mbox: Illegal seq bit!(%08x)\n", msg);
@@ -268,7 +287,7 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 				mbox->err_notify();
 		}
 
-		blk_insert_request(q, rq, 0, NULL);
+		blk_insert_request(q, rq, 0, (void *)msg);
 		if (mbox->ops->type == OMAP_MBOX_TYPE1)
 			break;
 	}
@@ -331,7 +350,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 		if (!rq)
 			break;
 
-		*p = (mbox_msg_t) rq->data;
+		*p = (mbox_msg_t)rq->special;
 
 		blk_end_request_all(rq, 0);
 
-- 
cgit v0.10.2


From 731ec497e5888c6792ad62613ae9be97eebcd7ca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:20 +0900
Subject: block: kill rq->data

Now that all block request data transfer is done via bio, rq->data
isn't used.  Kill it.

While at it, make the roles of rq->special and buffer clear.

[ Impact: drop now unncessary field from struct request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 0520cc7..6dd180c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -189,10 +189,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 						(unsigned long long)rq->sector,
 						rq->nr_sectors,
 						rq->current_nr_sectors);
-	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, data %p, len %u\n",
+	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 						rq->bio, rq->biotail,
-						rq->buffer, rq->data,
-						rq->data_len);
+						rq->buffer, rq->data_len);
 
 	if (blk_pc_request(rq)) {
 		printk(KERN_INFO "  cdb: ");
diff --git a/block/blk-map.c b/block/blk-map.c
index f103729..694fefa 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -156,7 +156,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;
 
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 unmap_rq:
 	blk_rq_unmap_user(bio);
@@ -235,7 +235,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	blk_queue_bounce(q, &bio);
 	bio_get(bio);
 	blk_rq_bio_prep(q, rq, bio);
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -313,7 +313,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 
 	blk_rq_bio_prep(q, rq, bio);
 	blk_queue_bounce(q, &rq->bio);
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 82a0ca2..bb9aa43 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -500,7 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->data = NULL;
 	rq->data_len = 0;
 	rq->extra_len = 0;
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 756ac7c..aa9fc57 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1088,7 +1088,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 			return ret;
 	} else {
 		BUG_ON(req->data_len);
-		BUG_ON(req->data);
 
 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
 		req->buffer = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cfeb3c2..12c545e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -211,8 +211,8 @@ struct request {
 
 	unsigned short ioprio;
 
-	void *special;
-	char *buffer;
+	void *special;		/* opaque pointer available for LLD use */
+	char *buffer;		/* kaddr of the current segment if available */
 
 	int tag;
 	int errors;
@@ -229,7 +229,6 @@ struct request {
 	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
-	void *data;
 	void *sense;
 
 	unsigned long deadline;
-- 
cgit v0.10.2


From c2553b5844b06910435e40cfab9e6f384840cb97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Apr 2009 08:10:11 +0200
Subject: block: make blk_do_io_stat() do the full "is this rq accountable"
 checks

We currently check for file system requests outside of blk_do_io_stat(rq),
but we may as well just include it.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 6dd180c..1e3b97f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -68,7 +68,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	int rw = rq_data_dir(rq);
 	int cpu;
 
-	if (!blk_fs_request(rq) || !blk_do_io_stat(rq))
+	if (!blk_do_io_stat(rq))
 		return;
 
 	cpu = part_stat_lock();
@@ -1639,10 +1639,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request);
 
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
-	if (!blk_do_io_stat(req))
-		return;
-
-	if (blk_fs_request(req)) {
+	if (blk_do_io_stat(req)) {
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
 		int cpu;
@@ -1656,15 +1653,12 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 
 static void blk_account_io_done(struct request *req)
 {
-	if (!blk_do_io_stat(req))
-		return;
-
 	/*
 	 * Account IO completion.  bar_rq isn't accounted as a normal
 	 * IO on queueing nor completion.  Accounting the containing
 	 * request is enough.
 	 */
-	if (blk_fs_request(req) && req != &req->q->bar_rq) {
+	if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
diff --git a/block/blk.h b/block/blk.h
index 9b2c324..404c10b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -149,9 +149,16 @@ static inline int blk_cpu_to_group(int cpu)
 #endif
 }
 
+/*
+ * Contribute to IO statistics IFF:
+ *
+ *	a) it's attached to a gendisk, and
+ *	b) the queue had IO stats enabled when this request was started, and
+ *	c) it's a file system request
+ */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
 }
 
 #endif
-- 
cgit v0.10.2


From c69d48540c201394d08cb4d48b905e001313d9b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Apr 2009 08:12:19 +0200
Subject: block: include discard requests in IO accounting

We currently don't do merging on discard requests, but we potentially
could. If we do, then we need to include discard requests in the IO
accounting, or merging would end up decrementing in_flight IO counters
for an IO which never incremented them.

So enable accounting for discard requests.

Problem found by Nikanth Karthikesan <knikanth@suse.de>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk.h b/block/blk.h
index 404c10b..5111559 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -158,7 +158,8 @@ static inline int blk_cpu_to_group(int cpu)
  */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
+		blk_discard_rq(rq);
 }
 
 #endif
-- 
cgit v0.10.2


From 9eb55b030c4b3227334ee4482402096cd1d1a6fe Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Mon, 27 Apr 2009 14:53:54 +0200
Subject: block: catch trying to use more bits than request->cmd_flags has

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 1e3b97f..394c5bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2097,6 +2097,9 @@ EXPORT_SYMBOL(kblockd_schedule_work);
 
 int __init blk_dev_init(void)
 {
+	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+			sizeof(((struct request *)0)->cmd_flags));
+
 	kblockd_workqueue = create_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
-- 
cgit v0.10.2


From 9fd8d0e1bcb848257968d9a7d73ca4d890ea8bd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:04 +0900
Subject: block: make blk_end_request_cur() return bool

In the process of mindlessly copying [__]blk_end_request_all(),
[__]blk_end_request_cur() ended up returning void even though they're
partial completion functions.  Fix it.

[ Impact: fix braindead API ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12c545e..3a5b1bd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -903,10 +903,14 @@ static inline void blk_end_request_all(struct request *rq, int error)
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  */
-static inline void blk_end_request_cur(struct request *rq, int error)
+static inline bool blk_end_request_cur(struct request *rq, int error)
 {
-	blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 /**
@@ -952,10 +956,14 @@ static inline void __blk_end_request_all(struct request *rq, int error)
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.  Must
  *     be called with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  */
-static inline void __blk_end_request_cur(struct request *rq, int error)
+static inline bool __blk_end_request_cur(struct request *rq, int error)
 {
-	__blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return __blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 extern void blk_complete_request(struct request *);
-- 
cgit v0.10.2


From 4c94dece1baf320d925cedb231489c4e0358ac5a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:05 +0900
Subject: block: don't init rq fields unnecessarily

blk_get_request() always returns properly zeroed requests.  Don't set
fields to zero/NULL unnecessarily.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index bb9aa43..58cf456 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -500,8 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->data_len = 0;
-	rq->extra_len = 0;
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 	rq->cmd[0] = cmd;
 	rq->cmd[4] = data;
-- 
cgit v0.10.2


From 5b5c5d12b91cb6b2a2967f06aef35d59008dc2e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:06 +0900
Subject: amiflop,ataflop,xd,mg_disk: clean up unnecessary stuff from block
 drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rq_data_dir() can only be READ or WRITE and rq->sector and nr_sectors
are always automatically updated after partial request completion.
Don't worry about rq_data_dir() not being either READ or WRITE or
manually update sector and nr_sectors.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jörg Dorchain <joerg@dorchain.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index b99a2a6..8ff95f2 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1371,11 +1371,6 @@ static void redo_fd_request(void)
 		       "0x%08lx\n", track, sector, data);
 #endif
 
-		if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) {
-			printk(KERN_WARNING "do_fd_request: unknown command\n");
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
-		}
 		if (get_track(drive, track) == -1) {
 			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
@@ -1407,8 +1402,6 @@ static void redo_fd_request(void)
 			break;
 		}
 	}
-	CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-	CURRENT->sector += CURRENT->current_nr_sectors;
 
 	__blk_end_request_cur(CURRENT, 0);
 	goto repeat;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 44a8702..2506728 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -732,8 +732,6 @@ static void do_fd_action( int drive )
 		    }
 		    else {
 			/* all sectors finished */
-			CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-			CURRENT->sector += CURRENT->current_nr_sectors;
 			__blk_end_request_cur(CURRENT, 0);
 			redo_fd_request();
 			return;
@@ -1139,8 +1137,6 @@ static void fd_rwsec_done1(int status)
 	}
 	else {
 		/* all sectors finished */
-		CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-		CURRENT->sector += CURRENT->current_nr_sectors;
 		__blk_end_request_cur(CURRENT, 0);
 		redo_fd_request();
 	}
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 408c2bd..2c00ad9 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -560,11 +560,6 @@ static void mg_request_poll(struct request_queue *q)
 			case WRITE:
 				mg_write(req);
 				break;
-			default:
-				printk(KERN_WARNING "%s:%d unknown command\n",
-						__func__, __LINE__);
-				__blk_end_request_cur(req, -EIO);
-				break;
 			}
 		}
 	}
@@ -614,11 +609,6 @@ static unsigned int mg_issue_req(struct request *req,
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
 		break;
-	default:
-		printk(KERN_WARNING "%s:%d unknown command\n",
-				__func__, __LINE__);
-		__blk_end_request_cur(req, -EIO);
-		break;
 	}
 	return MG_ERR_NONE;
 }
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 6f6ad82..14be4c1 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -308,7 +308,6 @@ static void do_xd_request (struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		unsigned block = req->sector;
 		unsigned count = req->nr_sectors;
-		int rw = rq_data_dir(req);
 		XD_INFO *disk = req->rq_disk->private_data;
 		int res = 0;
 		int retry;
@@ -321,13 +320,9 @@ static void do_xd_request (struct request_queue * q)
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (rw != READ && rw != WRITE) {
-			printk("do_xd_request: unknown request\n");
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
-			res = xd_readwrite(rw, disk, req->buffer, block, count);
+			res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
+					   block, count);
 		/* wrap up, 0 = success, -errno = fail */
 		__blk_end_request_cur(req, res);
 	}
-- 
cgit v0.10.2


From cd4c34ebec155e5c10f897d9bebf618248121e70 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:07 +0900
Subject: ps3disk: simplify request completion

ps3disk_interrupt() always completes requests fully but it uses
rq->hard_cur_sectors for FLUSH requests for some reason.  Drop them
and simply use __blk_end_request_all().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index d23b54b..f6586e4 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -231,7 +231,6 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	struct request *req;
 	int res, read, error;
 	u64 tag, status;
-	unsigned long num_sectors;
 	const char *op;
 
 	res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status);
@@ -261,11 +260,9 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
 	    req->cmd[0] == REQ_LB_OP_FLUSH) {
 		read = 0;
-		num_sectors = req->hard_cur_sectors;
 		op = "flush";
 	} else {
 		read = !rq_data_dir(req);
-		num_sectors = req->nr_sectors;
 		op = read ? "read" : "write";
 	}
 	if (status) {
@@ -281,7 +278,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	}
 
 	spin_lock(&priv->lock);
-	__blk_end_request(req, error, num_sectors << 9);
+	__blk_end_request_all(req, error);
 	priv->req = NULL;
 	ps3disk_do_request(dev, priv->queue);
 	spin_unlock(&priv->lock);
-- 
cgit v0.10.2


From 044208506d35bd62396c4673176e2c12393905b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:08 +0900
Subject: sunvdc: kill vdc_end_request()

vdc_end_request() is a thin silly wrapper on top of
__blk_end_request().  Kill it.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 5861e33..f59887c 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -212,11 +212,6 @@ static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc)
 	vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD);
 }
 
-static void vdc_end_request(struct request *req, int error, int num_sectors)
-{
-	__blk_end_request(req, error, num_sectors << 9);
-}
-
 static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 			unsigned int index)
 {
@@ -239,7 +234,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 
 	rqe->req = NULL;
 
-	vdc_end_request(req, (desc->status ? -EIO : 0), desc->size >> 9);
+	__blk_end_request(req, (desc->status ? -EIO : 0), desc->size);
 
 	if (blk_queue_stopped(port->disk->queue))
 		blk_start_queue(port->disk->queue);
@@ -453,7 +448,7 @@ static void do_vdc_request(struct request_queue *q)
 
 		blkdev_dequeue_request(req);
 		if (__send_request(req) < 0)
-			vdc_end_request(req, -EIO, req->hard_nr_sectors);
+			__blk_end_request_all(req, -EIO);
 	}
 }
 
-- 
cgit v0.10.2


From 4d6c84d91d1a539ebc47d1a36a35e9390ba11fdc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:09 +0900
Subject: ubd: cleanup completion path

ubd had its own block request partial completion mechanism, which is
unnecessary as block layer already does it.  Kill ubd_end_request()
and ubd_finish() and replace them with direct call to
blk_end_request().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index f934225..36ca9fa 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -451,23 +451,6 @@ static void do_ubd_request(struct request_queue * q);
 
 /* Only changed by ubd_init, which is an initcall. */
 static int thread_fd = -1;
-
-static void ubd_end_request(struct request *req, int bytes, int error)
-{
-	blk_end_request(req, error, bytes);
-}
-
-/* Callable only from interrupt context - otherwise you need to do
- * spin_lock_irq()/spin_lock_irqsave() */
-static inline void ubd_finish(struct request *req, int bytes)
-{
-	if(bytes < 0){
-		ubd_end_request(req, 0, -EIO);
-		return;
-	}
-	ubd_end_request(req, bytes, 0);
-}
-
 static LIST_HEAD(restart);
 
 /* XXX - move this inside ubd_intr. */
@@ -475,7 +458,6 @@ static LIST_HEAD(restart);
 static void ubd_handler(void)
 {
 	struct io_thread_req *req;
-	struct request *rq;
 	struct ubd *ubd;
 	struct list_head *list, *next_ele;
 	unsigned long flags;
@@ -492,10 +474,7 @@ static void ubd_handler(void)
 			return;
 		}
 
-		rq = req->req;
-		rq->nr_sectors -= req->length >> 9;
-		if(rq->nr_sectors == 0)
-			ubd_finish(rq, rq->hard_nr_sectors << 9);
+		blk_end_request(req->req, 0, req->length);
 		kfree(req);
 	}
 	reactivate_fd(thread_fd, UBD_IRQ);
-- 
cgit v0.10.2


From f81f2f7c9fee307e371f37424577d46f9eaf8692 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:10 +0900
Subject: ubd: drop unnecessary rq->sector manipulation

ubd curiously updates rq->sector while issuing the request in multiple
pieces.  Don't do it and simply use local copy of sector.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 36ca9fa..4330127 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1222,7 +1222,8 @@ static void do_ubd_request(struct request_queue *q)
 {
 	struct io_thread_req *io_req;
 	struct request *req;
-	int n, last_sectors;
+	sector_t sector;
+	int n;
 
 	while(1){
 		struct ubd *dev = q->queuedata;
@@ -1238,11 +1239,10 @@ static void do_ubd_request(struct request_queue *q)
 		}
 
 		req = dev->request;
-		last_sectors = 0;
+		sector = req->sector;
 		while(dev->start_sg < dev->end_sg){
 			struct scatterlist *sg = &dev->sg[dev->start_sg];
 
-			req->sector += last_sectors;
 			io_req = kmalloc(sizeof(struct io_thread_req),
 					 GFP_ATOMIC);
 			if(io_req == NULL){
@@ -1251,10 +1251,10 @@ static void do_ubd_request(struct request_queue *q)
 				return;
 			}
 			prepare_request(req, io_req,
-					(unsigned long long) req->sector << 9,
+					(unsigned long long)sector << 9,
 					sg->offset, sg->length, sg_page(sg));
 
-			last_sectors = sg->length >> 9;
+			sector += sg->length >> 9;
 			n = os_write_file(thread_fd, &io_req,
 					  sizeof(struct io_thread_req *));
 			if(n != sizeof(struct io_thread_req *)){
-- 
cgit v0.10.2


From e091eb67af957bac4e4f7410c5d1aa263ee483a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:11 +0900
Subject: hd: clean up request completion paths

hd read/write_intr() functions manually manipulate request to
incrementally complete it, which block layer already supports.  Simply
use block layer completion routines instead of manual partial
completion.

While at it, clear unnecessary elv_next_request() check at the tail of
read_intr().  This also makes read and write_intr() more consistent.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 5cb300b..75b9ca9 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -452,32 +452,25 @@ static void read_intr(void)
 	bad_rw_intr();
 	hd_request();
 	return;
+
 ok_to_read:
 	req = CURRENT;
 	insw(HD_DATA, req->buffer, 256);
-	req->sector++;
-	req->buffer += 512;
-	req->errors = 0;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n",
-		req->rq_disk->disk_name, req->sector, req->nr_sectors,
+		req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1,
 		req->buffer+512);
 #endif
-	if (req->current_nr_sectors <= 0)
-		__blk_end_request_cur(req, 0);
-	if (i > 0) {
+	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&read_intr);
 		return;
 	}
+
 	(void) inb_p(HD_STATUS);
 #if (HD_DELAY > 0)
 	last_req = read_timer();
 #endif
-	if (elv_next_request(QUEUE))
-		hd_request();
-	return;
+	hd_request();
 }
 
 static void write_intr(void)
@@ -499,23 +492,18 @@ static void write_intr(void)
 	bad_rw_intr();
 	hd_request();
 	return;
+
 ok_to_write:
-	req->sector++;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-	req->buffer += 512;
-	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		__blk_end_request_cur(req, 0);
-	if (i > 0) {
+	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
-	} else {
+		return;
+	}
+
 #if (HD_DELAY > 0)
-		last_req = read_timer();
+	last_req = read_timer();
 #endif
-		hd_request();
-	}
-	return;
+	hd_request();
 }
 
 static void recal_intr(void)
-- 
cgit v0.10.2


From 467ca759fc83fc35cb7d15aec0d74c62cffc4481 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:12 +0900
Subject: swim3: clean up request completion paths

swim3 curiously tries to update request parameters before calling
__blk_end_request() when __blk_end_request() will do it anyway, and it
updates request for partial completion manually instead of using
blk_update_request().  Also, it does some spurious checks on rq such
as testing whether rq->sector is negative or current_nr_sectors is
zero right after fetching.

Drop unnecessary stuff and use standard block layer mechanisms.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 5904f7b..4248559 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -319,14 +319,10 @@ static void start_request(struct floppy_state *fs)
 		       req->errors, req->current_nr_sectors);
 #endif
 
-		if (req->sector < 0 || req->sector >= fs->total_secs) {
+		if (req->sector >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (req->current_nr_sectors == 0) {
-			__blk_end_request_cur(req, 0);
-			continue;
-		}
 		if (fs->ejected) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
@@ -593,8 +589,6 @@ static void xfer_timeout(unsigned long data)
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_regs __iomem *dr = fs->dma;
-	struct dbdma_cmd *cp = fs->dma_cmd;
-	unsigned long s;
 	int n;
 
 	fs->timeout_pending = 0;
@@ -605,14 +599,6 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
-	if (rq_data_dir(fd_req) == WRITE)
-		++cp;
-	if (ld_le16(&cp->xfer_status) != 0)
-		s = fs->scount - ((ld_le16(&cp->res_count) + 511) >> 9);
-	else
-		s = 0;
-	fd_req->sector += s;
-	fd_req->current_nr_sectors -= s;
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
 	__blk_end_request_cur(fd_req, -EIO);
@@ -719,9 +705,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		if (intr & ERROR_INTR) {
 			n = fs->scount - 1 - resid / 512;
 			if (n > 0) {
-				fd_req->sector += n;
-				fd_req->current_nr_sectors -= n;
-				fd_req->buffer += n * 512;
+				blk_update_request(fd_req, 0, n << 9);
 				fs->req_sector += n;
 			}
 			if (fs->retries < 5) {
@@ -745,13 +729,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				start_request(fs);
 				break;
 			}
-			fd_req->sector += fs->scount;
-			fd_req->current_nr_sectors -= fs->scount;
-			fd_req->buffer += fs->scount * 512;
-			if (fd_req->current_nr_sectors <= 0) {
-				__blk_end_request_cur(fd_req, 0);
-				fs->state = idle;
-			} else {
+			if (__blk_end_request(fd_req, 0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
@@ -761,7 +739,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 					}
 				}
 				act(fs);
-			}
+			} else
+				fs->state = idle;
 		}
 		if (fs->state == idle)
 			start_request(fs);
-- 
cgit v0.10.2


From e138b4e08ef65771000fbe6d93d67e3960ff862b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:13 +0900
Subject: swim: clean up request completion paths

swim curiously tries to update request parameters before calling
__blk_end_request() when __blk_end_request() will do it anyway and
unnecessarily checks whether current_nr_sectors is zero right after
fetching.

Drop unnecessary stuff and use standard block layer mechanisms.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 6544a7b..97ef426 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -535,10 +535,6 @@ static void redo_fd_request(struct request_queue *q)
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (req->current_nr_sectors == 0) {
-			__blk_end_request_cur(req, 0);
-			continue;
-		}
 		if (!fs->disk_in) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
@@ -561,9 +557,6 @@ static void redo_fd_request(struct request_queue *q)
 				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
-			req->nr_sectors -= req->current_nr_sectors;
-			req->sector += req->current_nr_sectors;
-			req->buffer += req->current_nr_sectors * 512;
 			__blk_end_request_cur(req, 0);
 			break;
 		}
-- 
cgit v0.10.2


From eec9462088a26c046d4db3100796a340a50890b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:14 +0900
Subject: mg_disk: fold mg_disk.h into mg_disk.c

include/linux/mg_disk.h is used only by drivers/block/mg_disk.c.  No
reason to put it in a separate header.  Fold it into mg_disk.c.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 2c00ad9..2c6127ee 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -22,10 +22,194 @@
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
-#include <linux/mg_disk.h>
 
 #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
 
+/* name for block device */
+#define MG_DISK_NAME "mgd"
+/* name for platform device */
+#define MG_DEV_NAME "mg_disk"
+
+#define MG_DISK_MAJ 0
+#define MG_DISK_MAX_PART 16
+#define MG_SECTOR_SIZE 512
+#define MG_MAX_SECTS 256
+
+/* Register offsets */
+#define MG_BUFF_OFFSET			0x8000
+#define MG_STORAGE_BUFFER_SIZE		0x200
+#define MG_REG_OFFSET			0xC000
+#define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
+#define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
+#define MG_REG_SECT_CNT			(MG_REG_OFFSET + 4)
+#define MG_REG_SECT_NUM			(MG_REG_OFFSET + 6)
+#define MG_REG_CYL_LOW			(MG_REG_OFFSET + 8)
+#define MG_REG_CYL_HIGH			(MG_REG_OFFSET + 0xA)
+#define MG_REG_DRV_HEAD			(MG_REG_OFFSET + 0xC)
+#define MG_REG_COMMAND			(MG_REG_OFFSET + 0xE)	/* write case */
+#define MG_REG_STATUS			(MG_REG_OFFSET + 0xE)	/* read  case */
+#define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
+#define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
+
+/* "Drive Select/Head Register" bit values */
+#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
+#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
+#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
+#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
+
+
+/* "Device Control Register" bit values */
+#define MG_REG_CTRL_INTR_ENABLE			0x0
+#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
+#define MG_REG_CTRL_RESET			(0x1<<2)
+#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
+#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
+#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
+#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
+#define MG_REG_CTRL_DPD_DISABLE			0x0
+#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
+
+/* Status register bit */
+/* error bit in status register */
+#define MG_REG_STATUS_BIT_ERROR			0x01
+/* corrected error in status register */
+#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
+/* data request bit in status register */
+#define MG_REG_STATUS_BIT_DATA_REQ		0x08
+/* DSC - Drive Seek Complete */
+#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
+/* DWF - Drive Write Fault */
+#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
+#define MG_REG_STATUS_BIT_READY			0x40
+#define MG_REG_STATUS_BIT_BUSY			0x80
+
+/* handy status */
+#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
+#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
+				(MG_REG_STATUS_BIT_BUSY | \
+				 MG_REG_STATUS_BIT_WRITE_FAULT | \
+				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
+
+/* Error register */
+#define MG_REG_ERR_AMNF		0x01
+#define MG_REG_ERR_ABRT		0x04
+#define MG_REG_ERR_IDNF		0x10
+#define MG_REG_ERR_UNC		0x40
+#define MG_REG_ERR_BBK		0x80
+
+/* error code for others */
+#define MG_ERR_NONE		0
+#define MG_ERR_TIMEOUT		0x100
+#define MG_ERR_INIT_STAT	0x101
+#define MG_ERR_TRANSLATION	0x102
+#define MG_ERR_CTRL_RST		0x103
+#define MG_ERR_INV_STAT		0x104
+#define MG_ERR_RSTOUT		0x105
+
+#define MG_MAX_ERRORS	6	/* Max read/write errors */
+
+/* command */
+#define MG_CMD_RD 0x20
+#define MG_CMD_WR 0x30
+#define MG_CMD_SLEEP 0x99
+#define MG_CMD_WAKEUP 0xC3
+#define MG_CMD_ID 0xEC
+#define MG_CMD_WR_CONF 0x3C
+#define MG_CMD_RD_CONF 0x40
+
+/* operation mode */
+#define MG_OP_CASCADE (1 << 0)
+#define MG_OP_CASCADE_SYNC_RD (1 << 1)
+#define MG_OP_CASCADE_SYNC_WR (1 << 2)
+#define MG_OP_INTERLEAVE (1 << 3)
+
+/* synchronous */
+#define MG_BURST_LAT_4 (3 << 4)
+#define MG_BURST_LAT_5 (4 << 4)
+#define MG_BURST_LAT_6 (5 << 4)
+#define MG_BURST_LAT_7 (6 << 4)
+#define MG_BURST_LAT_8 (7 << 4)
+#define MG_BURST_LEN_4 (1 << 1)
+#define MG_BURST_LEN_8 (2 << 1)
+#define MG_BURST_LEN_16 (3 << 1)
+#define MG_BURST_LEN_32 (4 << 1)
+#define MG_BURST_LEN_CONT (0 << 1)
+
+/* timeout value (unit: ms) */
+#define MG_TMAX_CONF_TO_CMD	1
+#define MG_TMAX_WAIT_RD_DRQ	10
+#define MG_TMAX_WAIT_WR_DRQ	500
+#define MG_TMAX_RST_TO_BUSY	10
+#define MG_TMAX_HDRST_TO_RDY	500
+#define MG_TMAX_SWRST_TO_RDY	500
+#define MG_TMAX_RSTOUT		3000
+
+/* device attribution */
+/* use mflash as boot device */
+#define MG_BOOT_DEV		(1 << 0)
+/* use mflash as storage device */
+#define MG_STORAGE_DEV		(1 << 1)
+/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
+#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
+
+#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
+
+/* names of GPIO resource */
+#define MG_RST_PIN	"mg_rst"
+/* except MG_BOOT_DEV, reset-out pin should be assigned */
+#define MG_RSTOUT_PIN	"mg_rstout"
+
+/* private driver data */
+struct mg_drv_data {
+	/* disk resource */
+	u32 use_polling;
+
+	/* device attribution */
+	u32 dev_attr;
+
+	/* internally used */
+	struct mg_host *host;
+};
+
+/* main structure for mflash driver */
+struct mg_host {
+	struct device *dev;
+
+	struct request_queue *breq;
+	spinlock_t lock;
+	struct gendisk *gd;
+
+	struct timer_list timer;
+	void (*mg_do_intr) (struct mg_host *);
+
+	u16 id[ATA_ID_WORDS];
+
+	u16 cyls;
+	u16 heads;
+	u16 sectors;
+	u32 n_sectors;
+	u32 nres_sectors;
+
+	void __iomem *dev_base;
+	unsigned int irq;
+	unsigned int rst;
+	unsigned int rstout;
+
+	u32 major;
+	u32 error;
+};
+
+/*
+ * Debugging macro and defines
+ */
+#undef DO_MG_DEBUG
+#ifdef DO_MG_DEBUG
+#  define MG_DBG(fmt, args...) \
+	printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
+#else /* CONFIG_MG_DEBUG */
+#  define MG_DBG(fmt, args...) do { } while (0)
+#endif /* CONFIG_MG_DEBUG */
+
 static void mg_request(struct request_queue *);
 
 static void mg_dump_status(const char *msg, unsigned int stat,
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644
index 1f76b1e..0000000
--- a/include/linux/mg_disk.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- *  include/linux/mg_disk.c
- *
- *  Support for the mGine m[g]flash IO mode.
- *  Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-#include <linux/blkdev.h>
-#include <linux/ata.h>
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET			0x8000
-#define MG_STORAGE_BUFFER_SIZE		0x200
-#define MG_REG_OFFSET			0xC000
-#define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
-#define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
-#define MG_REG_SECT_CNT			(MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM			(MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW			(MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH			(MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD			(MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND			(MG_REG_OFFSET + 0xE)	/* write case */
-#define MG_REG_STATUS			(MG_REG_OFFSET + 0xE)	/* read  case */
-#define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
-
-/* "Drive Select/Head Register" bit values */
-#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
-#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
-
-
-/* "Device Control Register" bit values */
-#define MG_REG_CTRL_INTR_ENABLE			0x0
-#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
-#define MG_REG_CTRL_RESET			(0x1<<2)
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
-#define MG_REG_CTRL_DPD_DISABLE			0x0
-#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
-
-/* Status register bit */
-/* error bit in status register */
-#define MG_REG_STATUS_BIT_ERROR			0x01
-/* corrected error in status register */
-#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
-/* data request bit in status register */
-#define MG_REG_STATUS_BIT_DATA_REQ		0x08
-/* DSC - Drive Seek Complete */
-#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
-/* DWF - Drive Write Fault */
-#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
-#define MG_REG_STATUS_BIT_READY			0x40
-#define MG_REG_STATUS_BIT_BUSY			0x80
-
-/* handy status */
-#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
-#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
-				(MG_REG_STATUS_BIT_BUSY | \
-				 MG_REG_STATUS_BIT_WRITE_FAULT | \
-				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
-
-/* Error register */
-#define MG_REG_ERR_AMNF		0x01
-#define MG_REG_ERR_ABRT		0x04
-#define MG_REG_ERR_IDNF		0x10
-#define MG_REG_ERR_UNC		0x40
-#define MG_REG_ERR_BBK		0x80
-
-/* error code for others */
-#define MG_ERR_NONE		0
-#define MG_ERR_TIMEOUT		0x100
-#define MG_ERR_INIT_STAT	0x101
-#define MG_ERR_TRANSLATION	0x102
-#define MG_ERR_CTRL_RST		0x103
-#define MG_ERR_INV_STAT		0x104
-#define MG_ERR_RSTOUT		0x105
-
-#define MG_MAX_ERRORS	6	/* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD	1
-#define MG_TMAX_WAIT_RD_DRQ	10
-#define MG_TMAX_WAIT_WR_DRQ	500
-#define MG_TMAX_RST_TO_BUSY	10
-#define MG_TMAX_HDRST_TO_RDY	500
-#define MG_TMAX_SWRST_TO_RDY	500
-#define MG_TMAX_RSTOUT		3000
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV		(1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV		(1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* names of GPIO resource */
-#define MG_RST_PIN	"mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN	"mg_rstout"
-
-/* private driver data */
-struct mg_drv_data {
-	/* disk resource */
-	u32 use_polling;
-
-	/* device attribution */
-	u32 dev_attr;
-
-	/* internally used */
-	struct mg_host *host;
-};
-
-/* main structure for mflash driver */
-struct mg_host {
-	struct device *dev;
-
-	struct request_queue *breq;
-	spinlock_t lock;
-	struct gendisk *gd;
-
-	struct timer_list timer;
-	void (*mg_do_intr) (struct mg_host *);
-
-	u16 id[ATA_ID_WORDS];
-
-	u16 cyls;
-	u16 heads;
-	u16 sectors;
-	u32 n_sectors;
-	u32 nres_sectors;
-
-	void __iomem *dev_base;
-	unsigned int irq;
-	unsigned int rst;
-	unsigned int rstout;
-
-	u32 major;
-	u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-#  define MG_DBG(fmt, args...) \
-	printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-#  define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-#endif
-- 
cgit v0.10.2


From a03bb5a32fff4aa23f081a3cff7e98d4084104cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:15 +0900
Subject: mg_disk: clean up request completion paths

mg_disk implements its own partial completion.  Convert to standard
block layer partial completion.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 2c6127ee..1a4cc96 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -503,95 +503,68 @@ static unsigned int mg_out(struct mg_host *host,
 
 static void mg_read(struct request *req)
 {
-	u32 remains, j;
+	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	remains = req->nr_sectors;
-
 	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
 			MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-			remains, req->sector, req->buffer);
+	       req->nr_sectors, req->sector, req->buffer);
+
+	do {
+		u16 *buff = (u16 *)req->buffer;
 
-	while (remains) {
 		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
 					MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) {
-			*(u16 *)req->buffer =
-				inw((unsigned long)host->dev_base +
-						MG_BUFF_OFFSET + (j << 1));
-			req->buffer += 2;
-		}
-
-		req->sector++;
-		req->errors = 0;
-		remains = --req->nr_sectors;
-		--req->current_nr_sectors;
-
-		if (req->current_nr_sectors <= 0) {
-			MG_DBG("remain : %d sects\n", remains);
-			__blk_end_request_cur(req, 0);
-			if (remains > 0)
-				req = elv_next_request(host->breq);
-		}
+		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
+			*buff++ = inw((unsigned long)host->dev_base +
+				      MG_BUFF_OFFSET + (j << 1));
 
 		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	}
+	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_write(struct request *req)
 {
-	u32 remains, j;
+	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	remains = req->nr_sectors;
-
 	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
 			MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
 	}
 
-
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-			remains, req->sector, req->buffer);
-	while (remains) {
+	       req->nr_sectors, req->sector, req->buffer);
+
+	do {
+		u16 *buff = (u16 *)req->buffer;
+
 		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
 					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) {
-			outw(*(u16 *)req->buffer,
-					(unsigned long)host->dev_base +
-					MG_BUFF_OFFSET + (j << 1));
-			req->buffer += 2;
-		}
-		req->sector++;
-		remains = --req->nr_sectors;
-		--req->current_nr_sectors;
-
-		if (req->current_nr_sectors <= 0) {
-			MG_DBG("remain : %d sects\n", remains);
-			__blk_end_request_cur(req, 0);
-			if (remains > 0)
-				req = elv_next_request(host->breq);
-		}
+		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
+			outw(*buff++, (unsigned long)host->dev_base +
+				      MG_BUFF_OFFSET + (j << 1));
 
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	}
+	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_read_intr(struct mg_host *host)
 {
 	u32 i;
+	u16 *buff;
 	struct request *req;
 
 	/* check status */
@@ -612,39 +585,24 @@ static void mg_read_intr(struct mg_host *host)
 ok_to_read:
 	/* get current segment of request */
 	req = elv_next_request(host->breq);
+	buff = (u16 *)req->buffer;
 
 	/* read 1 sector */
-	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) {
-		*(u16 *)req->buffer =
-			inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
-					(i << 1));
-		req->buffer += 2;
-	}
+	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+		*buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
+			      (i << 1));
 
-	/* manipulate request */
 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
 			req->sector, req->nr_sectors - 1, req->buffer);
 
-	req->sector++;
-	req->errors = 0;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-
-	/* let know if current segment done */
-	if (req->current_nr_sectors <= 0)
-		__blk_end_request_cur(req, 0);
-
-	/* set handler if read remains */
-	if (i > 0) {
-		host->mg_do_intr = mg_read_intr;
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-	}
-
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	/* goto next request */
-	if (!i)
+	if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) {
+		/* set handler if read remains */
+		host->mg_do_intr = mg_read_intr;
+		mod_timer(&host->timer, jiffies + 3 * HZ);
+	} else /* goto next request */
 		mg_request(host->breq);
 }
 
@@ -653,6 +611,7 @@ static void mg_write_intr(struct mg_host *host)
 	u32 i, j;
 	u16 *buff;
 	struct request *req;
+	bool rem;
 
 	/* get current segment of request */
 	req = elv_next_request(host->breq);
@@ -673,18 +632,8 @@ static void mg_write_intr(struct mg_host *host)
 	return;
 
 ok_to_write:
-	/* manipulate request */
-	req->sector++;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-	req->buffer += MG_SECTOR_SIZE;
-
-	/* let know if current segment or all done */
-	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		__blk_end_request_cur(req, 0);
-
-	/* write 1 sector and set handler if remains */
-	if (i > 0) {
+	if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) {
+		/* write 1 sector and set handler if remains */
 		buff = (u16 *)req->buffer;
 		for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
 			outw(*buff, (unsigned long)host->dev_base +
@@ -700,7 +649,7 @@ ok_to_write:
 	/* send write confirm */
 	outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	if (!i)
+	if (!rem)
 		mg_request(host->breq);
 }
 
-- 
cgit v0.10.2


From 8a11a789c39cd6f61de4243234cf44a46c23ffd0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 13:06:16 +0900
Subject: mg_disk: fix dependency on libata

Add local copies of ata_id_string() and ata_id_c_string() to mg_disk
so there is no need for the driver to depend on ATA and SCSI.

[ Impact: break dependency on libata by copying ata id string functions ]

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ddea8e4..f42fa50 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -412,7 +412,7 @@ config ATA_OVER_ETH
 
 config MG_DISK
 	tristate "mGine mflash, gflash support"
-	depends on ARM && ATA && GPIOLIB
+	depends on ARM && GPIOLIB
 	help
 	  mGine mFlash(gFlash) block device driver
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 1a4cc96..8e53cae 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -17,7 +17,7 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
-#include <linux/libata.h>
+#include <linux/ata.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
@@ -357,6 +357,42 @@ static irqreturn_t mg_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+/* local copy of ata_id_string() */
+static void mg_id_string(const u16 *id, unsigned char *s,
+			 unsigned int ofs, unsigned int len)
+{
+	unsigned int c;
+
+	BUG_ON(len & 1);
+
+	while (len > 0) {
+		c = id[ofs] >> 8;
+		*s = c;
+		s++;
+
+		c = id[ofs] & 0xff;
+		*s = c;
+		s++;
+
+		ofs++;
+		len -= 2;
+	}
+}
+
+/* local copy of ata_id_c_string() */
+static void mg_id_c_string(const u16 *id, unsigned char *s,
+			   unsigned int ofs, unsigned int len)
+{
+	unsigned char *p;
+
+	mg_id_string(id, s, ofs, len - 1);
+
+	p = s + strnlen(s, len - 1);
+	while (p > s && p[-1] == ' ')
+		p--;
+	*p = '\0';
+}
+
 static int mg_get_disk_id(struct mg_host *host)
 {
 	u32 i;
@@ -403,9 +439,9 @@ static int mg_get_disk_id(struct mg_host *host)
 		host->n_sectors -= host->nres_sectors;
 	}
 
-	ata_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
-	ata_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
-	ata_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
+	mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
+	mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
+	mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
 	printk(KERN_INFO "mg_disk: model: %s\n", model);
 	printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
 	printk(KERN_INFO "mg_disk: serial: %s\n", serial);
-- 
cgit v0.10.2


From f68adec3c7155a8bbf32a90cb4c4d0737df045d9 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 13:06:17 +0900
Subject: mg_disk: use defines from <linux/ata.h>

While at it:
- remove MG_REG_HEAD_MUST_BE_ON define
- remove MG_REG_CTRL_INTR_ENABLE define
- remove MG_REG_HEAD_LBA_MODE define
- remove unused defines

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 8e53cae..71e56cc 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -51,51 +51,10 @@
 #define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
 #define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
 
-/* "Drive Select/Head Register" bit values */
-#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
-#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
-
-
-/* "Device Control Register" bit values */
-#define MG_REG_CTRL_INTR_ENABLE			0x0
-#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
-#define MG_REG_CTRL_RESET			(0x1<<2)
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
-#define MG_REG_CTRL_DPD_DISABLE			0x0
-#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
-
-/* Status register bit */
-/* error bit in status register */
-#define MG_REG_STATUS_BIT_ERROR			0x01
-/* corrected error in status register */
-#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
-/* data request bit in status register */
-#define MG_REG_STATUS_BIT_DATA_REQ		0x08
-/* DSC - Drive Seek Complete */
-#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
-/* DWF - Drive Write Fault */
-#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
-#define MG_REG_STATUS_BIT_READY			0x40
-#define MG_REG_STATUS_BIT_BUSY			0x80
-
 /* handy status */
-#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
-#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
-				(MG_REG_STATUS_BIT_BUSY | \
-				 MG_REG_STATUS_BIT_WRITE_FAULT | \
-				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
-
-/* Error register */
-#define MG_REG_ERR_AMNF		0x01
-#define MG_REG_ERR_ABRT		0x04
-#define MG_REG_ERR_IDNF		0x10
-#define MG_REG_ERR_UNC		0x40
-#define MG_REG_ERR_BBK		0x80
+#define MG_STAT_READY	(ATA_DRDY | ATA_DSC)
+#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
+				 ATA_ERR))) == MG_STAT_READY)
 
 /* error code for others */
 #define MG_ERR_NONE		0
@@ -225,41 +184,39 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 	}
 
 	printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
-	if (stat & MG_REG_STATUS_BIT_BUSY)
+	if (stat & ATA_BUSY)
 		printk("Busy ");
-	if (stat & MG_REG_STATUS_BIT_READY)
+	if (stat & ATA_DRDY)
 		printk("DriveReady ");
-	if (stat & MG_REG_STATUS_BIT_WRITE_FAULT)
+	if (stat & ATA_DF)
 		printk("WriteFault ");
-	if (stat & MG_REG_STATUS_BIT_SEEK_DONE)
+	if (stat & ATA_DSC)
 		printk("SeekComplete ");
-	if (stat & MG_REG_STATUS_BIT_DATA_REQ)
+	if (stat & ATA_DRQ)
 		printk("DataRequest ");
-	if (stat & MG_REG_STATUS_BIT_CORRECTED_ERROR)
+	if (stat & ATA_CORR)
 		printk("CorrectedError ");
-	if (stat & MG_REG_STATUS_BIT_ERROR)
+	if (stat & ATA_ERR)
 		printk("Error ");
 	printk("}\n");
-	if ((stat & MG_REG_STATUS_BIT_ERROR) == 0) {
+	if ((stat & ATA_ERR) == 0) {
 		host->error = 0;
 	} else {
 		host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
 		printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
 				host->error & 0xff);
-		if (host->error & MG_REG_ERR_BBK)
+		if (host->error & ATA_BBK)
 			printk("BadSector ");
-		if (host->error & MG_REG_ERR_UNC)
+		if (host->error & ATA_UNC)
 			printk("UncorrectableError ");
-		if (host->error & MG_REG_ERR_IDNF)
+		if (host->error & ATA_IDNF)
 			printk("SectorIdNotFound ");
-		if (host->error & MG_REG_ERR_ABRT)
+		if (host->error & ATA_ABORTED)
 			printk("DriveStatusError ");
-		if (host->error & MG_REG_ERR_AMNF)
+		if (host->error & ATA_AMNF)
 			printk("AddrMarkNotFound ");
 		printk("}");
-		if (host->error &
-				(MG_REG_ERR_BBK | MG_REG_ERR_UNC |
-				 MG_REG_ERR_IDNF | MG_REG_ERR_AMNF)) {
+		if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
@@ -284,12 +241,12 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 
 	do {
 		cur_jiffies = jiffies;
-		if (status & MG_REG_STATUS_BIT_BUSY) {
-			if (expect == MG_REG_STATUS_BIT_BUSY)
+		if (status & ATA_BUSY) {
+			if (expect == ATA_BUSY)
 				break;
 		} else {
 			/* Check the error condition! */
-			if (status & MG_REG_STATUS_BIT_ERROR) {
+			if (status & ATA_ERR) {
 				mg_dump_status("mg_wait", status, host);
 				break;
 			}
@@ -298,8 +255,8 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 				if (MG_READY_OK(status))
 					break;
 
-			if (expect == MG_REG_STATUS_BIT_DATA_REQ)
-				if (status & MG_REG_STATUS_BIT_DATA_REQ)
+			if (expect == ATA_DRQ)
+				if (status & ATA_DRQ)
 					break;
 		}
 		if (!msec) {
@@ -404,12 +361,10 @@ static int mg_get_disk_id(struct mg_host *host)
 	char serial[ATA_ID_SERNO_LEN + 1];
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	err = mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_RD_DRQ);
+	err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
 	if (err)
 		return err;
 
@@ -449,8 +404,7 @@ static int mg_get_disk_id(struct mg_host *host)
 			host->n_sectors, host->nres_sectors);
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	return err;
 }
@@ -464,7 +418,7 @@ static int mg_disk_init(struct mg_host *host)
 
 	/* hdd rst low */
 	gpio_set_value(host->rst, 0);
-	err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY);
+	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
 	if (err)
 		return err;
 
@@ -475,17 +429,14 @@ static int mg_disk_init(struct mg_host *host)
 		return err;
 
 	/* soft reset on */
-	outb(MG_REG_CTRL_RESET |
-			(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE :
-			 MG_REG_CTRL_INTR_ENABLE),
+	outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
 			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-	err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY);
+	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
 	if (err)
 		return err;
 
 	/* soft reset off */
-	outb(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE :
-			MG_REG_CTRL_INTR_ENABLE,
+	outb(prv_data->use_polling ? ATA_NIEN : 0,
 			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 	err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
 	if (err)
@@ -531,7 +482,7 @@ static unsigned int mg_out(struct mg_host *host,
 			MG_REG_CYL_LOW);
 	outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
 			MG_REG_CYL_HIGH);
-	outb((u8)((sect_num >> 24) | MG_REG_HEAD_LBA_MODE),
+	outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
 			(unsigned long)host->dev_base + MG_REG_DRV_HEAD);
 	outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
 	return MG_ERR_NONE;
@@ -552,8 +503,8 @@ static void mg_read(struct request *req)
 	do {
 		u16 *buff = (u16 *)req->buffer;
 
-		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
-					MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
+		if (mg_wait(host, ATA_DRQ,
+			    MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
@@ -583,8 +534,7 @@ static void mg_write(struct request *req)
 	do {
 		u16 *buff = (u16 *)req->buffer;
 
-		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
-					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+	if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
@@ -606,11 +556,11 @@ static void mg_read_intr(struct mg_host *host)
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & MG_REG_STATUS_BIT_BUSY)
+		if (i & ATA_BUSY)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if (i & MG_REG_STATUS_BIT_DATA_REQ)
+		if (i & ATA_DRQ)
 			goto ok_to_read;
 	} while (0);
 	mg_dump_status("mg_read_intr", i, host);
@@ -655,11 +605,11 @@ static void mg_write_intr(struct mg_host *host)
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & MG_REG_STATUS_BIT_BUSY)
+		if (i & ATA_BUSY)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & MG_REG_STATUS_BIT_DATA_REQ))
+		if ((req->nr_sectors <= 1) || (i & ATA_DRQ))
 			goto ok_to_write;
 	} while (0);
 	mg_dump_status("mg_write_intr", i, host);
@@ -752,18 +702,15 @@ static unsigned int mg_issue_req(struct request *req,
 		break;
 	case WRITE:
 		/* TODO : handler */
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
 				!= MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return host->error;
 		}
 		del_timer(&host->timer);
-		mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_WR_DRQ);
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		if (host->error) {
 			mg_bad_rw_intr(host);
 			return host->error;
@@ -849,9 +796,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 		return -EIO;
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
 	/* wait until mflash deep sleep */
@@ -859,9 +804,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
 		if (!prv_data->use_polling)
-			outb(MG_REG_CTRL_INTR_ENABLE,
-					(unsigned long)host->dev_base +
-					MG_REG_DRV_CTRL);
+			outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		return -EIO;
 	}
 
@@ -884,8 +827,7 @@ static int mg_resume(struct platform_device *plat_dev)
 		return -EIO;
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 5b644c7a218702668d7b610994e7dcbc3d4705d3 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 28 Apr 2009 08:17:54 +0000
Subject: clocksource: improve sh_cmt clocksource overflow handling

This patch improves the sh_cmt clocksource handling.

Currently the counter value is ignored in the case of
overflow. With this patch the overflow flag is read
before and after reading the counter, removing any
counter value and overflow flag mismatch issues.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index c247564..d607ac2 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -111,16 +111,21 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p,
 					int *has_wrapped)
 {
 	unsigned long v1, v2, v3;
+	int o1, o2;
+
+	o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit;
 
 	/* Make sure the timer value is stable. Stolen from acpi_pm.c */
 	do {
+		o2 = o1;
 		v1 = sh_cmt_read(p, CMCNT);
 		v2 = sh_cmt_read(p, CMCNT);
 		v3 = sh_cmt_read(p, CMCNT);
-	} while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
-			  || (v3 > v1 && v3 < v2)));
+		o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit;
+	} while (unlikely((o1 != o2) || (v1 > v2 && v1 < v3)
+			  || (v2 > v3 && v2 < v1) || (v3 > v1 && v3 < v2)));
 
-	*has_wrapped = sh_cmt_read(p, CMCSR) & p->overflow_bit;
+	*has_wrapped = o1;
 	return v2;
 }
 
@@ -394,7 +399,7 @@ static cycle_t sh_cmt_clocksource_read(struct clocksource *cs)
 	raw = sh_cmt_get_counter(p, &has_wrapped);
 
 	if (unlikely(has_wrapped))
-		raw = p->match_value;
+		raw += p->match_value;
 	spin_unlock_irqrestore(&p->lock, flags);
 
 	return value + raw;
-- 
cgit v0.10.2


From 8e0b842948156e3463879caed12b4ce51bed772e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 28 Apr 2009 08:19:50 +0000
Subject: sh: setup timers in late_time_init()

This patch moves the SuperH timer setup code from time_init()
to late_time_init(). Good things about this change:
 - interrupts: they are enabled at late_time_init()
 - mm: regular kmalloc() can be used at late_time_init()

Together with moving to late_time_init() this patch changes
the sh_cmt driver to always allocate with kmalloc(). This
simplifies the code a bit and also fixes section mismatches.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 109409f..a2458bf 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -237,21 +237,8 @@ unsigned long long sched_clock(void)
 }
 #endif
 
-void __init time_init(void)
+static void __init sh_late_time_init(void)
 {
-	if (board_time_init)
-		board_time_init();
-
-	clk_init();
-
-	rtc_sh_get_time(&xtime);
-	set_normalized_timespec(&wall_to_monotonic,
-				-xtime.tv_sec, -xtime.tv_nsec);
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-	local_timer_setup(smp_processor_id());
-#endif
-
 	/*
 	 * Make sure all compiled-in early timers register themselves.
 	 * Run probe() for one "earlytimer" device.
@@ -270,3 +257,22 @@ void __init time_init(void)
 
 	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
 }
+
+void __init time_init(void)
+{
+	if (board_time_init)
+		board_time_init();
+
+	clk_init();
+
+	rtc_sh_get_time(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+	local_timer_setup(smp_processor_id());
+#endif
+
+	late_time_init = sh_late_time_init;
+}
+
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index d607ac2..bf3e4c1 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/init.h>
-#include <linux/bootmem.h>
 #include <linux/platform_device.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
@@ -645,11 +644,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 		return 0;
 	}
 
-	if (is_early_platform_device(pdev))
-		p = alloc_bootmem(sizeof(*p));
-	else
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
-
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
 	if (p == NULL) {
 		dev_err(&pdev->dev, "failed to allocate driver data\n");
 		return -ENOMEM;
@@ -657,11 +652,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 
 	ret = sh_cmt_setup(p, pdev);
 	if (ret) {
-		if (is_early_platform_device(pdev))
-			free_bootmem(__pa(p), sizeof(*p));
-		else
-			kfree(p);
-
+		kfree(p);
 		platform_set_drvdata(pdev, NULL);
 	}
 	return ret;
-- 
cgit v0.10.2


From 9ec4fa271faf2db3b8e1419c998da1ca6b094eb6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:57:18 -0700
Subject: irq, cpumask: correct CPUMASKS_OFFSTACK typo and fix fallout

CPUMASKS_OFFSTACK is not defined anywhere (it is CPUMASK_OFFSTACK).
It is a typo and init_allocate_desc_masks() is called before it set
affinity to all cpus...

Split init_alloc_desc_masks() into all_desc_masks() and init_desc_masks().

Also use CPUMASK_OFFSTACK in alloc_desc_masks().

[ Impact: fix smp_affinity copying/setup when moving irq_desc between CPUs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
LKML-Reference: <49F6546E.3040406@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b7cbeed..c4953cf 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -424,27 +424,25 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #ifdef CONFIG_SMP
 /**
- * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:	pointer to irq_desc struct
  * @cpu:	cpu which will be handling the cpumasks
  * @boot:	true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
- * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
+#ifdef CONFIG_CPUMASK_OFFSTACK
 	int node;
 
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
-		cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 		alloc_bootmem_cpumask_var(&desc->pending_mask);
-		cpumask_clear(desc->pending_mask);
 #endif
 		return true;
 	}
@@ -453,18 +451,25 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
-	cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
 		free_cpumask_var(desc->affinity);
 		return false;
 	}
-	cpumask_clear(desc->pending_mask);
+#endif
 #endif
 	return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+	cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_clear(desc->pending_mask);
+#endif
+}
+
 /**
  * init_copy_desc_masks - copy cpumasks for irq_desc
  * @old_desc:	pointer to old irq_desc struct
@@ -478,7 +483,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 					struct irq_desc *new_desc)
 {
-#ifdef CONFIG_CPUMASKS_OFFSTACK
+#ifdef CONFIG_CPUMASK_OFFSTACK
 	cpumask_copy(new_desc->affinity, old_desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -499,12 +504,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
 	return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+}
+
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 					struct irq_desc *new_desc)
 {
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d82142b..882c798 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -115,10 +115,11 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
+	init_desc_masks(desc);
 	arch_init_chip_data(desc, cpu);
 }
 
@@ -169,7 +170,8 @@ int __init early_irq_init(void)
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-		init_alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], 0, true);
+		init_desc_masks(&desc[i]);
 		irq_desc_ptrs[i] = desc + i;
 	}
 
@@ -256,7 +258,8 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
-		init_alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], 0, true);
+		init_desc_masks(&desc[i]);
 		desc[i].kstat_irqs = kstat_irqs_all[i];
 	}
 	return arch_early_irq_init();
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcb..5760d72 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -37,7 +37,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
-	if (!init_alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
 				"for migration.\n", irq);
 		return false;
-- 
cgit v0.10.2


From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:58:23 -0700
Subject: x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC

The original feature of migrating irq_desc dynamic was too fragile
and was causing problems: it caused crashes on systems with lots of
cards with MSI-X when user-space irq-balancer was enabled.

We now have new patches that create irq_desc according to device
numa node. This patch removes the leftover bits of the dynamic balancer.

[ Impact: remove dead code ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F654AF.8000808@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885..e1b2543 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -274,16 +274,6 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say N.
 
-config NUMA_MIGRATE_IRQ_DESC
-	bool "Move irq desc when changing irq smp_affinity"
-	depends on SPARSE_IRQ && NUMA
-	depends on BROKEN
-	default n
-	---help---
-	  This enables moving irq_desc to cpu/node that irq will use handled.
-
-	  If you don't know what to do here, say N.
-
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d21..27b8ce0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -195,7 +195,6 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
 CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
 # CONFIG_X86_ELAN is not set
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617..9fbf0f7 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -148,9 +148,6 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-	u8 move_desc_pending : 1;
-#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
 	return 0;
 }
 
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
 static void
 init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 {
@@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
 		old_desc->chip_data = NULL;
 	}
 }
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg = desc->chip_data;
-
-	if (!cfg->move_in_progress) {
-		/* it means that domain is not changed */
-		if (!cpumask_intersects(desc->affinity, mask))
-			cfg->move_desc_pending = 1;
-	}
-}
-#endif
+/* end for move_irq_desc */
 
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -592,9 +569,6 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	/* check that before desc->addinity get updated */
-	set_extra_move_desc(desc, mask);
-
 	cpumask_copy(desc->affinity, mask);
 
 	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
@@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	set_extra_move_desc(desc, mask);
-
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
 	irte.vector = cfg->vector;
@@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp)
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-		if (likely(!cfg->move_desc_pending))
-			return;
-
-		/* domain has not changed, but affinity did */
-		me = smp_processor_id();
-		if (cpumask_test_cpu(me, desc->affinity)) {
-			*descp = desc = move_irq_desc(desc, me);
-			/* get the new one */
-			cfg = desc->chip_data;
-			cfg->move_desc_pending = 0;
-		}
-#endif
+	if (likely(!cfg->move_in_progress))
 		return;
-	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-		*descp = desc = move_irq_desc(desc, me);
-		/* get the new one */
-		cfg = desc->chip_data;
-#endif
+	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 		send_cleanup_vector(cfg);
-	}
 }
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c4953cf..2a34cd6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -212,16 +212,6 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
 
 extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
 
-static inline struct irq_desc *
-irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
-{
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-	return irq_to_desc(irq);
-#else
-	return desc;
-#endif
-}
-
 /*
  * Migration helpers for obsolete names, they will go away:
  */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f..2f06527 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4..13c68e7 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
-		desc = irq_remap_to_desc(irq, desc);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	/* Start handling the irq */
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi) {
+	if (desc->chip->eoi)
 		desc->chip->eoi(irq);
-		desc = irq_remap_to_desc(irq, desc);
-	}
 }
 
 void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip) {
+		if (desc->chip != &no_irq_chip)
 			mask_ack_irq(desc, irq);
-			desc = irq_remap_to_desc(irq, desc);
-		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 882c798..3e0cbc4 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -458,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack) {
+		if (desc->chip->ack)
 			desc->chip->ack(irq);
-			/* get new one */
-			desc = irq_remap_to_desc(irq, desc);
-		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -473,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack) {
+	if (desc->chip->ack)
 		desc->chip->ack(irq);
-		desc = irq_remap_to_desc(irq, desc);
-	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 5760d72..ce72bc3 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -97,9 +97,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 
 	/* free the old one */
 	free_one_irq_desc(old_desc, desc);
-	spin_unlock(&old_desc->lock);
 	kfree(old_desc);
-	spin_lock(&desc->lock);
 
 	return desc;
 
-- 
cgit v0.10.2


From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:59:21 -0700
Subject: irq: change ->set_affinity() to return status

according to Ingo, change set_affinity() in irq_chip should return int,
because that way we can handle failure cases in a much cleaner way, in
the genirq layer.

v2: fix two typos

[ Impact: extend API ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-arch@vger.kernel.org
LKML-Reference: <49F654E9.4070809@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index 9c9d1fd..5bd5259 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 	}
 }
 
-static void
+static int
 dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
 	cpu_set_irq_affinity(irq, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
+
+	return 0;
 }
 
-static void
+static int
 clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
 	cpu_set_irq_affinity(irq - 16, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
+
+	return 0;
 }
 
 static struct hw_interrupt_type dp264_irq_type = {
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 27f840a..8dd239e 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 
 }
 
-static void
+static int
 titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&titan_irq_lock);
 	titan_cpu_set_irq_affinity(irq - 16, *affinity);
 	titan_update_irq_hw(titan_cached_irq_mask);
 	spin_unlock(&titan_irq_lock);
+
+	return 0;
 }
 
 static void
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index c6884ba..90f6b7f 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
+static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
 	void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
 	unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 	val |= 1 << (cpu + shift);
 	writel(val, reg);
 	spin_unlock(&irq_controller_lock);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index df3925c..d70b445 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
+int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&irq_lock, flags);
 	irq_allocations[irq - FIRST_IRQ].mask = *dest;
 	spin_unlock_irqrestore(&irq_lock, flags);
+
+	return 0;
 }
 
 static struct irq_chip crisv32_irq_type = {
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index cc0a318..acb5047 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
 {
 }
 
-static void
+static int
 hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
+	return 0;
 }
 
 static struct hw_interrupt_type irq_type_hp_sim = {
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 166e0d8..f92cef4 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
 }
 
 
-static void
+static int
 iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	cpu = cpumask_first_and(cpu_online_mask, mask);
 	if (cpu >= nr_cpu_ids)
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	dest = cpu_physical_id(cpu);
 
 	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt */
+		return -1;			/* not an IOSAPIC interrupt */
 
 	set_irq_affinity_info(irq, dest, redir);
 
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 		iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
 		iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
 	}
+
 #endif
+	return 0;
 }
 
 /*
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 2b15e23..0f8ade9 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -12,7 +12,7 @@
 static struct irq_chip	ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq,
+static int ia64_set_msi_irq_affinity(unsigned int irq,
 				      const cpumask_t *cpu_mask)
 {
 	struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
 	int cpu = first_cpu(*cpu_mask);
 
 	if (!cpu_online(cpu))
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	read_msi_msg(irq, &msg);
 
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
 
 	write_msi_msg(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	struct msi_msg msg;
 	int cpu = cpumask_first(mask);
 
 	if (!cpu_online(cpu))
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	dmar_msi_read(irq, &msg);
 
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dmar_msi_write(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, mask);
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 66fd705..764f26a 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,7 +227,7 @@ finish_up:
 	return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
+static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
 	nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 	list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
 				 sn_irq_lh[irq], list)
 		(void)sn_retarget_vector(sn_irq_info, nasid, slice);
+
+	return 0;
 }
 
 #ifdef CONFIG_SMP
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 81e4289..fbbfb97 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq,
+static int sn_set_msi_irq_affinity(unsigned int irq,
 				    const struct cpumask *cpu_mask)
 {
 	struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 	cpu = cpumask_first(cpu_mask);
 	sn_irq_info = sn_msi_info[irq].sn_irq_info;
 	if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
-		return;
+		return -1;
 
 	/*
 	 * Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 	new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
 	sn_msi_info[irq].sn_irq_info = new_irq_info;
 	if (new_irq_info == NULL)
-		return;
+		return -1;
 
 	/*
 	 * Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 
 	write_msi_msg(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, cpu_mask);
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c
index 1c19af8..d3a0c81 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu;
 	int bit = irq - OCTEON_IRQ_WORKQ0;	/* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
 	 */
 	cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
 	write_unlock(&octeon_irq_ciu0_rwlock);
+
+	return 0;
 }
 #endif
 
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu;
 	int bit = irq - OCTEON_IRQ_WDOG0;	/* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
 	 */
 	cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
 	write_unlock(&octeon_irq_ciu1_rwlock);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 3214ade..4f1eed1 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq,
+extern int plat_set_irq_affinity(unsigned int irq,
 				  const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 87deb8f..3f43c2e 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	cpumask_t	tmp = CPU_MASK_NONE;
 	unsigned long	flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 	cpumask_and(&tmp, cpumask, cpu_online_mask);
 	if (cpus_empty(tmp))
-		return;
+		return -1;
 
 	/* Assumption : cpumask refers to a single CPU */
 	spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	cpumask_copy(irq_desc[irq].affinity, cpumask);
 	spin_unlock_irqrestore(&gic_lock, flags);
 
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index 5ba3188..499ffe5 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
+int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
 	cpumask_t tmask;
 	int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 
 	/* Do any generic SMTC IRQ affinity setup */
 	smtc_set_irq_affinity(irq, tmask);
+
+	return 0;
 }
 #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index 352352b..4f256a1 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on, k;
 	u64 cur_ints;
@@ -119,7 +119,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	if (cpumask_weight(mask) != 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-		return;
+		return -1;
 	}
 	i = cpumask_first(mask);
 
@@ -155,6 +155,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 	}
 	spin_unlock(&bcm1480_imr_lock);
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index c08ff58..e389507 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on;
 	u64 cur_ints;
@@ -114,7 +114,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	if (cpumask_weight(mask) > 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-		return;
+		return -1;
 	}
 
 	/* Convert logical CPU to physical CPU */
@@ -146,6 +146,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 	}
 	spin_unlock(&sb1250_imr_lock);
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 4ea4229..8007f1e 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
 	return cpu_dest;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
+static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu_dest;
 
 	cpu_dest = cpu_check_affinity(irq, dest);
 	if (cpu_dest < 0)
-		return;
+		return -1;
 
 	cpumask_copy(&irq_desc[irq].affinity, dest);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 80b5134..be3581a 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
 	lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
 	unsigned int irq;
 	int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 
 	irq = (unsigned int)irq_map[virq].hwirq;
 	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-		return;
+		return -1;
 
 	status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
 
 	if (status) {
 		printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
 			__func__, irq, status);
-		return;
+		return -1;
 	}
 
 	/*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 		printk(KERN_WARNING
 			"%s: No online cpus in the mask %s for irq %d\n",
 			__func__, cpulist, virq);
-		return;
+		return -1;
 	}
 
 	status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 	if (status) {
 		printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
 			__func__, irq, status);
-		return;
+		return -1;
 	}
+
+	return 0;
 }
 
 static struct irq_chip xics_pic_direct = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 21b9567..f4cbd15 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	struct mpic *mpic = mpic_from_irq(irq);
 	unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 		mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
 			       mpic_physmask(cpus_addr(tmp)[0]));
 	}
+
+	return 0;
 }
 
 static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 5deabe9..e5e78f9 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
 	}
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq,
+static int sun4u_set_affinity(unsigned int virt_irq,
 			       const struct cpumask *mask)
 {
 	sun4u_irq_enable(virt_irq);
+
+	return 0;
 }
 
 /* Don't do anything.  The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
 		       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq,
+static int sun4v_set_affinity(unsigned int virt_irq,
 			       const struct cpumask *mask)
 {
 	unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
 	if (err != HV_EOK)
 		printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
 		       "err(%d)\n", ino, cpuid, err);
+
+	return 0;
 }
 
 static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
 		       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq,
+static int sun4v_virt_set_affinity(unsigned int virt_irq,
 				    const struct cpumask *mask)
 {
 	unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
 		printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
 		       "err(%d)\n",
 		       dev_handle, dev_ino, cpuid, err);
+
+	return 0;
 }
 
 static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9fbf0f7..5c7630b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
 }
 
-static void
+static int
 set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	unsigned int irq;
+	int ret = -1;
 
 	irq = desc->irq;
 	cfg = desc->chip_data;
@@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 		/* Only the high 8 bits are valid. */
 		dest = SET_APIC_LOGICAL_ID(dest);
 		__target_IO_APIC_irq(irq, dest, cfg);
+		ret = 0;
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return ret;
 }
 
-static void
+static int
 set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc;
 
 	desc = irq_to_desc(irq);
 
-	set_ioapic_affinity_irq_desc(desc, mask);
+	return set_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
  * Real vector that is used for interrupting cpu will be coming from
  * the interrupt-remapping table entry.
  */
-static void
+static int
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct irte irte;
 	unsigned int dest;
 	unsigned int irq;
+	int ret = -1;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
-		return;
+		return ret;
 
 	irq = desc->irq;
 	if (get_irte(irq, &irte))
-		return;
+		return ret;
 
 	cfg = desc->chip_data;
 	if (assign_irq_vector(irq, cfg, mask))
-		return;
+		return ret;
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
@@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 		send_cleanup_vector(cfg);
 
 	cpumask_copy(desc->affinity, mask);
+
+	return 0;
 }
 
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 					    const struct cpumask *mask)
 {
-	migrate_ioapic_irq_desc(desc, mask);
+	return migrate_ioapic_irq_desc(desc, mask);
 }
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
 				       const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	set_ir_ioapic_affinity_irq_desc(desc, mask);
+	return set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 						   const struct cpumask *mask)
 {
+	return 0;
 }
 #endif
 
@@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg_desc(desc, &msg);
+
+	return 0;
 }
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void
+static int
 ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	struct irte irte;
 
 	if (get_irte(irq, &irte))
-		return;
+		return -1;
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	irte.vector = cfg->vector;
 	irte.dest_id = IRTE_DEST(dest);
@@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	 */
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
+
+	return 0;
 }
 
 #endif
@@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
+
+	return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3603,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
+
+	return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 	write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
 	target_ht_irq(irq, dest, cfg->vector);
+
+	return 0;
 }
 
 #endif
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 73348c4..4a9cc92 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq,
+static int iosapic_set_affinity_irq(unsigned int irq,
 				     const struct cpumask *dest)
 {
 	struct vector_info *vi = iosapic_get_vector(irq);
@@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 
 	dest_cpu = cpu_check_affinity(irq, dest);
 	if (dest_cpu < 0)
-		return;
+		return -1;
 
 	cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
 	vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
@@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 	iosapic_set_irt_data(vi, &dummy_d0, &d1);
 	iosapic_wr_irt_entry(vi, d0, d1);
 	spin_unlock_irqrestore(&iosapic_lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 30963af..3338988 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -688,13 +688,13 @@ void rebind_evtchn_irq(int evtchn, int irq)
 }
 
 /* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 {
 	struct evtchn_bind_vcpu bind_vcpu;
 	int evtchn = evtchn_from_irq(irq);
 
 	if (!VALID_EVTCHN(evtchn))
-		return;
+		return -1;
 
 	/* Send future instances of this interrupt to other vcpu. */
 	bind_vcpu.port = evtchn;
@@ -707,13 +707,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	 */
 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
 		bind_evtchn_to_cpu(evtchn, tcpu);
-}
 
+	return 0;
+}
 
-static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
 	unsigned tcpu = cpumask_first(dest);
-	rebind_irq_to_cpu(irq, tcpu);
+
+	return rebind_irq_to_cpu(irq, tcpu);
 }
 
 int resend_irq_on_evtchn(unsigned int irq)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2a34cd6..8e4c18b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -117,7 +117,7 @@ struct irq_chip {
 	void		(*eoi)(unsigned int irq);
 
 	void		(*end)(unsigned int irq);
-	void		(*set_affinity)(unsigned int irq,
+	int		(*set_affinity)(unsigned int irq,
 					const struct cpumask *dest);
 	int		(*retrigger)(unsigned int irq);
 	int		(*set_type)(unsigned int irq, unsigned int flow_type);
-- 
cgit v0.10.2


From 57b150cce8e004ddd36330490a68bfb59b7271e9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:59:53 -0700
Subject: irq: only update affinity if ->set_affinity() is sucessfull

irq_set_affinity() and move_masked_irq() try to assign affinity
before calling chip set_affinity(). Some archs are assigning it
in ->set_affinity() again.

We do something like:

 cpumask_cpy(desc->affinity, mask);
 desc->chip->set_affinity(mask);

But in the failure path, affinity should not be touched - otherwise
we'll end up with a different affinity mask despite the failure to
migrate the IRQ.

So try to update the afffinity only if set_affinity returns with 0.
Also call irq_set_thread_affinity accordingly.

v2: update after "irq, x86: Remove IRQ_DISABLED check in process context IRQ move"
v3: according to Ingo, change set_affinity() in irq_chip should return int.
v4: update comments by removing moving irq_desc code.

[ Impact: fix /proc/irq/*/smp_affinity setting corner case bug ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F65509.60307@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20e..de5f412 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern int irq_select_affinity_usr(unsigned int irq);
 
+extern void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
+
 /*
  * Debugging printout:
  */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca..aaf5c9d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
-static void
+void
 irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
 {
 	struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT)
-		desc->chip->set_affinity(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		if (!desc->chip->set_affinity(irq, cpumask)) {
+			cpumask_copy(desc->affinity, cpumask);
+			irq_set_thread_affinity(desc, cpumask);
+		}
+	}
 	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
 #else
-	cpumask_copy(desc->affinity, cpumask);
-	desc->chip->set_affinity(irq, cpumask);
+	if (!desc->chip->set_affinity(irq, cpumask)) {
+		cpumask_copy(desc->affinity, cpumask);
+		irq_set_thread_affinity(desc, cpumask);
+	}
 #endif
-	irq_set_thread_affinity(desc, cpumask);
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9b..cfe767c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
 
 #include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
 
 void move_masked_irq(int irq)
 {
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
 	 * masking the irqs.
 	 */
 	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-		   < nr_cpu_ids)) {
-		cpumask_and(desc->affinity,
-			    desc->pending_mask, cpu_online_mask);
-		desc->chip->set_affinity(irq, desc->affinity);
-	}
+		   < nr_cpu_ids))
+		if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+			cpumask_copy(desc->affinity, desc->pending_mask);
+			irq_set_thread_affinity(desc, desc->pending_mask);
+		}
+
 	cpumask_clear(desc->pending_mask);
 }
 
-- 
cgit v0.10.2


From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:00:38 -0700
Subject: x86/irq: change irq_desc_alloc() to take node instead of cpu

This simplifies the node awareness of the code. All our allocators
only deal with a NUMA node ID locality not with CPU ids anyway - so
there's no need to maintain (and transform) a CPU id all across the
IRq layer.

v2: keep move_irq_desc related

[ Impact: cleanup, prepare IRQ code to be NUMA-aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
LKML-Reference: <49F65536.2020300@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c7630b..560b887 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -129,12 +129,9 @@ struct irq_pin_list {
 	struct irq_pin_list *next;
 };
 
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
 {
 	struct irq_pin_list *pin;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
 
@@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 	return cfg;
 }
 
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
 {
 	struct irq_cfg *cfg;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
@@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 	return cfg;
 }
 
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
 {
 	struct irq_cfg *cfg;
 
 	cfg = desc->chip_data;
 	if (!cfg) {
-		desc->chip_data = get_one_free_irq_cfg(cpu);
+		desc->chip_data = get_one_free_irq_cfg(node);
 		if (!desc->chip_data) {
 			printk(KERN_ERR "can not alloc irq_cfg\n");
 			BUG_ON(1);
@@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
 
 /* for move_irq_desc */
 static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
 {
 	struct irq_pin_list *old_entry, *head, *tail, *entry;
 
@@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 	if (!old_entry)
 		return;
 
-	entry = get_one_free_irq_2_pin(cpu);
+	entry = get_one_free_irq_2_pin(node);
 	if (!entry)
 		return;
 
@@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 	tail		= entry;
 	old_entry	= old_entry->next;
 	while (old_entry) {
-		entry = get_one_free_irq_2_pin(cpu);
+		entry = get_one_free_irq_2_pin(node);
 		if (!entry) {
 			entry = head;
 			while (entry) {
@@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 }
 
 void arch_init_copy_chip_data(struct irq_desc *old_desc,
-				 struct irq_desc *desc, int cpu)
+				 struct irq_desc *desc, int node)
 {
 	struct irq_cfg *cfg;
 	struct irq_cfg *old_cfg;
 
-	cfg = get_one_free_irq_cfg(cpu);
+	cfg = get_one_free_irq_cfg(node);
 
 	if (!cfg)
 		return;
@@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
 	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
 
-	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+	init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
 static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin(cpu);
+		entry = get_one_free_irq_2_pin(node);
 		if (!entry) {
 			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
 					apic, pin);
@@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin(cpu);
+	entry->next = get_one_free_irq_2_pin(node);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
@@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+		add_pin_to_irq_node(cfg, node, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void)
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void)
 					apic->multi_timer_check(apic_id, irq))
 				continue;
 
-			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			desc = irq_to_desc_alloc_node(irq, node);
 			if (!desc) {
 				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 				continue;
 			}
 			cfg = desc->chip_data;
-			add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+			add_pin_to_irq_node(cfg, node, apic_id, pin);
 
 			setup_IO_APIC_irq(apic_id, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
@@ -2863,7 +2857,7 @@ static inline void __init check_timer(void)
 {
 	struct irq_desc *desc = irq_to_desc(0);
 	struct irq_cfg *cfg = desc->chip_data;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	int no_pin1 = 0;
@@ -2929,7 +2923,7 @@ static inline void __init check_timer(void)
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+			add_pin_to_irq_node(cfg, node, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		} else {
 			/* for edge trigger, setup_IO_APIC_irq already
@@ -2966,7 +2960,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
@@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new = NULL;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
@@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new < nr_irqs; new++) {
-		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		desc_new = irq_to_desc_alloc_node(new, node);
 		if (!desc_new) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", new);
 			continue;
@@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 		return -EINVAL;
 	}
 
-	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc %d\n", irq);
 		return 0;
@@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 	 */
 	if (irq >= NR_IRQS_LEGACY) {
 		cfg = desc->chip_data;
-		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+		add_pin_to_irq_node(cfg, node, ioapic, pin);
 	}
 
 	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44..45acbcf 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -636,7 +636,7 @@ static void __init lguest_init_IRQ(void)
 
 void lguest_setup_irq(unsigned int irq)
 {
-	irq_to_desc_alloc_cpu(irq, 0);
+	irq_to_desc_alloc_node(irq, 0);
 	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
 }
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f5e0ea7..9eff36a 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -23,15 +23,12 @@ struct irq_2_iommu {
 };
 
 #ifdef CONFIG_GENERIC_HARDIRQS
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
 {
 	struct irq_2_iommu *iommu;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
-	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
 
 	return iommu;
 }
@@ -48,7 +45,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 	return desc->irq_2_iommu;
 }
 
-static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	struct irq_2_iommu *irq_iommu;
@@ -56,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
 	/*
 	 * alloc irq desc if not allocated already.
 	 */
-	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 		return NULL;
@@ -65,14 +62,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
 	irq_iommu = desc->irq_2_iommu;
 
 	if (!irq_iommu)
-		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
 
 	return desc->irq_2_iommu;
 }
 
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
-	return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+	return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
 }
 
 #else /* !CONFIG_SPARSE_IRQ */
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 3338988..be437c2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -335,7 +335,7 @@ static int find_unbound_irq(void)
 	if (irq == nr_irqs)
 		panic("No available IRQ to bind to: increase nr_irqs!\n");
 
-	desc = irq_to_desc_alloc_cpu(irq, 0);
+	desc = irq_to_desc_alloc_node(irq, 0);
 	if (WARN_ON(desc == NULL))
 		return -1;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91bb76f..ff374ce 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,6 +566,6 @@ struct irq_desc;
 extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern int arch_init_chip_data(struct irq_desc *desc, int node);
 
 #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8e4c18b..a09baf8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -187,7 +187,7 @@ struct irq_desc {
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
-	unsigned int		cpu;
+	unsigned int		node;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
@@ -201,16 +201,16 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
-					struct irq_desc *desc, int cpu);
+					struct irq_desc *desc, int node);
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
 #else /* CONFIG_SPARSE_IRQ */
-extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
 #endif /* CONFIG_SPARSE_IRQ */
 
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -422,12 +422,10 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
  */
-static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
 								bool boot)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
-	int node;
-
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
 
@@ -437,8 +435,6 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 		return true;
 	}
 
-	node = cpu_to_node(cpu);
-
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
 
@@ -494,7 +490,7 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
 								bool boot)
 {
 	return true;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3e0cbc4..a6368db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -81,12 +81,10 @@ static struct irq_desc irq_desc_init = {
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
-	int node;
 	void *ptr;
 
-	node = cpu_to_node(cpu);
 	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
 
 	/*
@@ -94,33 +92,32 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 	 * init_copy_kstat_irqs() could still use old one
 	 */
 	if (ptr) {
-		printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
-			 cpu, node);
+		printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
 		desc->kstat_irqs = ptr;
 	}
 }
 
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 {
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
 	desc->irq = irq;
 #ifdef CONFIG_SMP
-	desc->cpu = cpu;
+	desc->node = node;
 #endif
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	init_kstat_irqs(desc, node, nr_cpu_ids);
 	if (!desc->kstat_irqs) {
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, node, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
 	init_desc_masks(desc);
-	arch_init_chip_data(desc, cpu);
+	arch_init_chip_data(desc, node);
 }
 
 /*
@@ -189,11 +186,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
-	int node;
 
 	if (irq >= nr_irqs) {
 		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -212,15 +208,13 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	if (desc)
 		goto out_unlock;
 
-	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
-		 irq, cpu, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
-	init_one_irq_desc(irq, desc, cpu);
+	init_one_irq_desc(irq, desc, node);
 
 	irq_desc_ptrs[irq] = desc;
 
@@ -270,7 +264,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	return irq_to_desc(irq);
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index de5f412..7346825 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 extern void clear_kstat_irqs(struct irq_desc *desc);
 extern spinlock_t sparse_irq_lock;
 
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ce72bc3..2f69bee 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
 
 static void init_copy_kstat_irqs(struct irq_desc *old_desc,
 				 struct irq_desc *desc,
-				 int cpu, int nr)
+				 int node, int nr)
 {
-	init_kstat_irqs(desc, cpu, nr);
+	init_kstat_irqs(desc, node, nr);
 
 	if (desc->kstat_irqs != old_desc->kstat_irqs)
 		memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-		 struct irq_desc *desc, int cpu)
+		 struct irq_desc *desc, int node)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
-	if (!alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, node, false)) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
 				"for migration.\n", irq);
 		return false;
 	}
 	spin_lock_init(&desc->lock);
-	desc->cpu = cpu;
+	desc->node = node;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
 	init_copy_desc_masks(old_desc, desc);
-	arch_init_copy_chip_data(old_desc, desc, cpu);
+	arch_init_copy_chip_data(old_desc, desc, node);
 	return true;
 }
 
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-						int cpu)
+						int node)
 {
 	struct irq_desc *desc;
 	unsigned int irq;
 	unsigned long flags;
-	int node;
 
 	irq = old_desc->irq;
 
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	if (desc && old_desc != desc)
 		goto out_unlock;
 
-	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 	if (!desc) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 		desc = old_desc;
 		goto out_unlock;
 	}
-	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+	if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
 		/* still use old one */
 		kfree(desc);
 		desc = old_desc;
@@ -107,24 +105,14 @@ out_unlock:
 	return desc;
 }
 
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-	int old_cpu;
-	int node, old_node;
-
 	/* those all static, do move them */
 	if (desc->irq < NR_IRQS_LEGACY)
 		return desc;
 
-	old_cpu = desc->cpu;
-	if (old_cpu != cpu) {
-		node = cpu_to_node(cpu);
-		old_node = cpu_to_node(old_cpu);
-		if (old_node != node)
-			desc = __real_move_irq_desc(desc, cpu);
-		else
-			desc->cpu = cpu;
-	}
+	if (desc->node != node)
+		desc = __real_move_irq_desc(desc, node);
 
 	return desc;
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd3..f674f33 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void)
 	return 0;
 }
 
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
 {
 	return 0;
 }
-- 
cgit v0.10.2


From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:01:20 -0700
Subject: irq: change ACPI GSI APIs to also take a device argument

We want to use dev_to_node() later on, to be aware of the 'home node'
of the GSI in question.

[ Impact: cleanup, prepare the IRQ code to be more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Len Brown <lenb@kernel.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-acpi@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
LKML-Reference: <49F65560.20904@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5510317..baec6f0 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
 		return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
 
 	fadt = (struct acpi_table_fadt *)fadt_header;
 
-	acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+	acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+				 ACPI_ACTIVE_LOW);
 	return 0;
 }
 
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e4..07f2913 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,8 +154,8 @@ extern int timer_through_8259;
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
-				   int edge_level, int active_high_low);
+extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
+				  int irq, int edge_level, int active_high_low);
 #endif /* CONFIG_ACPI */
 
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7f..3ea1f53 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -72,7 +72,9 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+				 int active_high_low);
 extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
 extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d..6ee96b5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	unsigned int irq;
 	unsigned int plat_gsi = gsi;
@@ -539,7 +539,7 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-		plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+		plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity);
 	}
 #endif
 	acpi_gsi_to_irq(plat_gsi, &irq);
@@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
@@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		}
 	}
 #endif
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
 	return gsi;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 560b887..d934662 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 51b9f82..2faa9e2 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		/* Interrupt Line values above 0xF are forbidden */
 		if (dev->irq > 0 && (dev->irq <= 0xF)) {
 			printk(" - using IRQ %d\n", dev->irq);
-			acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE,
+			acpi_register_gsi(&dev->dev, dev->irq,
+					  ACPI_LEVEL_SENSITIVE,
 					  ACPI_ACTIVE_LOW);
 			return 0;
 		} else {
@@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		}
 	}
 
-	rc = acpi_register_gsi(gsi, triggering, polarity);
+	rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
 	if (rc < 0) {
 		dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n",
 			 pin_name(pin));
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 340ba4f..4a9f349 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
 			break;
 		}
 
-		gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE,
+		gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE,
 					ACPI_ACTIVE_LOW);
 		if (gsi > 0)
 			break;
@@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data)
 		irqp = &res->data.extended_irq;
 
 		for (i = 0; i < irqp->interrupt_count; i++) {
-			irq = acpi_register_gsi(irqp->interrupts[i],
+			irq = acpi_register_gsi(NULL, irqp->interrupts[i],
 				      irqp->triggering, irqp->polarity);
 			if (irq < 0)
 				return AE_ERROR;
diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c
index adf1785..7f207f3 100644
--- a/drivers/pnp/pnpacpi/rsparser.c
+++ b/drivers/pnp/pnpacpi/rsparser.c
@@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
 	}
 
 	flags = irq_flags(triggering, polarity, shareable);
-	irq = acpi_register_gsi(gsi, triggering, polarity);
+	irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
 	if (irq >= 0)
 		pcibios_penalize_isa_irq(irq, 1);
 	else
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 88be890..51b4b0a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num;
 extern int sbf_port;
 extern unsigned long acpi_realmode_flags;
 
-int acpi_register_gsi (u32 gsi, int triggering, int polarity);
+int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 
 #ifdef CONFIG_X86_IO_APIC
-- 
cgit v0.10.2


From 024154cfdd802654cb236a18c78b6e37351e2c49 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:01:50 -0700
Subject: irq: change io_apic_set_pci_routing() to use device parameter

Make actual use of the device parameter passed down to
io_apic_set_pci_routing() - to have the IRQ descriptor
on the home node of the device.

If no device has been passed down, we assume it's a platform
device and use the boot node ID for the IRQ descriptor.

[ Impact: optimization, make IO-APIC code more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F6557E.3080101@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d934662..82376e0 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3963,7 +3963,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int node = cpu_to_node(boot_cpu_id);
+	int node;
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -3971,6 +3971,11 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 		return -EINVAL;
 	}
 
+	if (dev)
+		node = dev_to_node(dev);
+	else
+		node = cpu_to_node(boot_cpu_id);
+
 	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc %d\n", irq);
-- 
cgit v0.10.2


From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:02:23 -0700
Subject: x86/irq: change MSI irq_desc to be more numa aware

Try to get irq_desc on the home node in create_irq_nr().

v2: don't check if we can move it when sparse_irq is not used
v3: use move_irq_des, if that node is not what we want

[ Impact: optimization, make MSI IRQ descriptors more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F6559F.7070005@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 82376e0..9cd4806 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
 /*
  * Dynamic irq allocate and deallocation
  */
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
 {
 	/* Allocate an unused irq */
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new = NULL;
-	int node = cpu_to_node(boot_cpu_id);
 	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
@@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 		if (cfg_new->vector != 0)
 			continue;
+
+#ifdef CONFIG_NUMA_IRQ_DESC
+		/* different node ?*/
+		if (desc_new->node != node)
+			desc = move_irq_desc(desc, node);
+#endif
+
 		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
 			irq = new;
 		break;
@@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 int create_irq(void)
 {
+	int node = cpu_to_node(boot_cpu_id);
 	unsigned int irq_want;
 	int irq;
 
 	irq_want = nr_irqs_gsi;
-	irq = create_irq_nr(irq_want);
+	irq = create_irq_nr(irq_want, node);
 
 	if (irq == 0)
 		irq = -1;
@@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	unsigned int irq_want;
 	struct intel_iommu *iommu = NULL;
 	int index = 0;
+	int node;
 
 	/* x86 doesn't support multiple MSI yet */
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
+	node = dev_to_node(&dev->dev);
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = create_irq_nr(irq_want);
+		irq = create_irq_nr(irq_want, node);
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index a09baf8..4b95ddb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -376,7 +376,7 @@ extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
 /* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want);
+extern unsigned int create_irq_nr(unsigned int irq_want, int node);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
-- 
cgit v0.10.2


From 56b581ea9591b5767b1e0204c6a06c7d0c49396e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:02:46 -0700
Subject: irq: make ht irq_desc more numa aware

Try to get irq_desc on the same node as create_irq_nr().

[ Impact: optimization, make HT IRQs more NUMA-aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F655B6.8020109@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index bf7d6ce..4e9dd0f 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	int max_irq;
 	int pos;
 	int irq;
+	int node;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
-	irq = create_irq();
+	node = dev_to_node(&dev->dev);
+	irq = create_irq_nr(0, node);
 
 	if (irq <= 0) {
 		kfree(cfg);
-- 
cgit v0.10.2


From 4278600644dee621bd50d7498e244b135612e0f6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 28 Apr 2009 23:12:10 +0900
Subject: sh: register the rtc-generic platform device properly.

The device registration was accidentally omitted, add it back in. Do some
basic device probing as well, so this doesn't show up for platforms that
tie in to the RTC interface properly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index a2458bf..4573321 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
  *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
- *  Copyright (C) 2002 - 2008  Paul Mundt
+ *  Copyright (C) 2002 - 2009  Paul Mundt
  *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
  *
  *  Some code taken from i386 version.
@@ -68,6 +68,21 @@ int set_rtc_time(struct rtc_time *tm)
 }
 EXPORT_SYMBOL(set_rtc_time);
 
+static int __init rtc_generic_init(void)
+{
+	struct platform_device *pdev;
+
+	if (rtc_sh_get_time == null_rtc_get_time)
+		return -ENODEV;
+
+	pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	return 0;
+}
+module_init(rtc_generic_init);
+
 #ifndef CONFIG_GENERIC_TIME
 void do_gettimeofday(struct timeval *tv)
 {
-- 
cgit v0.10.2


From 1130b0296184bc21806225fd06d533515a99d2db Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 28 Apr 2009 14:56:18 +0200
Subject: perf_counter tools: fix Documentation/perf_counter build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mike Galbraith reported:

> marge:..Documentation/perf_counter # make
>     CC builtin-stat.o
> In file included from builtin-stat.c:71:
> /usr/include/ctype.h:102: error: expected expression before ‘]’ token

Remove the ctype.h include.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index d7ace63..112b94e 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -68,7 +68,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index dea016f..6a276d2 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -49,7 +49,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
-- 
cgit v0.10.2


From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 28 Apr 2009 11:39:34 -0400
Subject: tracing: convert ftrace_dump spinlocks to raw

ftrace_dump is used for printing out the contents of the ftrace ring buffer
to the console on failure. Currently it uses a spinlock to synchronize
the output from multiple failures on different CPUs. This spin lock
currently is a normal spinlock and can cause issues with lockdep and
lock tracing.

This patch converts it to raw since it is for error handling only.
The lock is local to the ftrace_dump and is not used by any other
infrastructure.

[ Impact: prevent ftrace_dump from locking up by internal tracing ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b6183bc..5d704a4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s)
 
 static void __ftrace_dump(bool disable_tracing)
 {
-	static DEFINE_SPINLOCK(ftrace_dump_lock);
+	static raw_spinlock_t ftrace_dump_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
 	unsigned int old_userobj;
@@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing)
 	int cnt = 0, cpu;
 
 	/* only one dump */
-	spin_lock_irqsave(&ftrace_dump_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&ftrace_dump_lock);
 	if (dump_ran)
 		goto out;
 
@@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing)
 	}
 
  out:
-	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+	__raw_spin_unlock(&ftrace_dump_lock);
+	local_irq_restore(flags);
 }
 
 /* By default: disable tracing after the dump */
-- 
cgit v0.10.2


From edc953fa4ebc0265ef3b1754fe116a9fd4264e15 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 28 Apr 2009 11:13:46 -0400
Subject: x86: clean up alternative.h

Alternative header duplicates assembly that could be merged in
one single macro.  Merging this into this macro also allows to
directly declare ALTERNATIVE() statements within assembly code.

Uses a __stringify() of the feature bits rather than passing a
"i" operand.  Leave the old %0 operand as-is (set to 0), unused
to stay compatible with API.

(v2: tab alignment fixes)

[ Impact: cleanup ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
LKML-Reference: <20090428151346.GA31212@Krystal>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18e..1a37bcd 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/stddef.h>
+#include <linux/stringify.h>
 #include <asm/asm.h>
 
 /*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
 
 const unsigned char *const *find_nop_table(void);
 
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature)			\
+									\
+      "661:\n\t" oldinstr "\n662:\n"					\
+      ".section .altinstructions,\"a\"\n"				\
+      _ASM_ALIGN "\n"							\
+      _ASM_PTR "661b\n"				/* label           */	\
+      _ASM_PTR "663f\n"				/* new instruction */	\
+      "	 .byte " __stringify(feature) "\n"	/* feature bit     */	\
+      "	 .byte 662b-661b\n"			/* sourcelen       */	\
+      "	 .byte 664f-663f\n"			/* replacementlen  */	\
+      ".previous\n"							\
+      ".section .altinstr_replacement, \"ax\"\n"			\
+      "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
+      ".previous"
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)			\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c0\n"		/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */	\
-		      ".previous" :: "i" (feature) : "memory")
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
 /*
  * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
  * Best is to use constraints that are fixed size (like (%1) ... "r")
  * If you use variable sized constraints like "m" or "g" in the
  * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
  */
 #define alternative_input(oldinstr, newinstr, feature, input...)	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c0\n"		/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */	\
-		      ".previous" :: "i" (feature), ##input)
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
+		: : "i" (0), ## input)
 
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c[feat]\n"	/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-		      ".previous" : output : [feat] "i" (feature), ##input)
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
+		: output : "i" (0), ## input)
 
 /*
  * use this macro(s) if you need more than one output parameter
-- 
cgit v0.10.2


From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:16:21 -0400
Subject: tracing: fix ref count in splice pages

The pages allocated for the splice binary buffer did not initialize
the ref count correctly. This caused pages not to be freed and causes
a drastic memory leak.

Thanks to logdev I was able to trace the tracer to find where the leak
was.

[ Impact: stop memory leak when using splice ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5d704a4..9058240 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if (!ref)
 			break;
 
+		ref->ref = 1;
 		ref->buffer = info->tr->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer);
 		if (!ref->page) {
-- 
cgit v0.10.2


From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:23:13 -0400
Subject: tracing: only add splice page if entries exist

The splice code allocates a page even when the ring buffer is empty.
It detects the ring buffer being empty when it it fails to copy
anything from the ring buffer into the page.

This patch adds a check to see if there is anything in the ring buffer
before allocating a page.

Thanks to logdev for letting me trace the tracer to find this.

[ Impact: speed up due to removing unnecessary allocation ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9058240..0aeb3b9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
-	int size, i;
+	int entries, size, i;
 	size_t ret;
 
 	if (*ppos & (PAGE_SIZE - 1)) {
@@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
+
+		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 
 	spd.nr_pages = i;
-- 
cgit v0.10.2


From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:26:30 -0400
Subject: tracing: have splice only copy full pages

Splice works with pages, it is much more effecient to use an entire
page than to copy bits over several pages.

Using logdev to trace the internals of the splice mechanism, I was
able to see that splice can be very aggressive. When tracing is
occurring, and the reader caught up to the writer, and the writer
is on the reader page, the reader will copy what is there into the
splice page. Splice may iterate over several pages and if the
writer is still writing to the page, the reader will keep copying
bits to new pages to pass to userspace.

This patch changes it to only pass data to userspace if the page
is full (the writer has left the page). This has a small side effect
that splice can not read a partial page, and must wait for the
page to fill. This should not be an issue. If tracing has stopped,
then a use of "read" will still read all of the page.

[ Impact: better performance for ring buffer splice code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0aeb3b9..f5427e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 0);
+					  len, info->cpu, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer,
 						   ref->page);
-- 
cgit v0.10.2


From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 27 Apr 2009 12:37:49 -0400
Subject: ring-buffer: fix printk output

The warning output in trace_recursive_lock uses %d for a long when
it should be %ld.

[ Impact: fix compile warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9692f10..f4cc590 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void)
 	/* Disable all tracing before we do anything else */
 	tracing_off_permanent();
 
-	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
 		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
 		    current->trace_recursion,
 		    hardirq_count() >> HARDIRQ_SHIFT,
-- 
cgit v0.10.2


From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:17 +0200
Subject: x86: beautify vmlinux_32.lds.S

Beautify vmlinux_32.lds.S:

 - Use tabs for indent
 - Located curly braces like in C code
 - Rearranged a few comments

To see actual differences use "git diff -b" which
ignore 'whitespace' changes.

The beautification is done to prepare a unification
of the _32 and _64 variants of the linker scripts.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 62ad500..fffa45a 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -22,196 +22,220 @@ ENTRY(phys_startup_32)
 jiffies = jiffies_64;
 
 PHDRS {
-	text PT_LOAD FLAGS(5);	/* R_E */
-	data PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);	/* ___ */
+	text PT_LOAD FLAGS(5);		/* R_E */
+	data PT_LOAD FLAGS(7);		/* RWE */
+	note PT_NOTE FLAGS(0);		/* ___ */
 }
 SECTIONS
 {
-  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-  phys_startup_32 = startup_32 - LOAD_OFFSET;
-
-  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-  	_text = .;			/* Text and read-only data */
-	*(.text.head)
-  } :text = 0x9090
-
-  /* read-only */
-  .text : AT(ADDR(.text) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
-	*(.text.page_aligned)
-	TEXT_TEXT
-	SCHED_TEXT
-	LOCK_TEXT
-	KPROBES_TEXT
-	IRQENTRY_TEXT
-	*(.fixup)
-	*(.gnu.warning)
-  	_etext = .;			/* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);		/* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-  	__start___ex_table = .;
-	 *(__ex_table)
-  	__stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  /* writeable */
-  . = ALIGN(PAGE_SIZE);
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
-	DATA_DATA
-	CONSTRUCTORS
+	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+	phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+	/* Text and read-only data */
+	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+		_text = .;
+		*(.text.head)
+	} :text = 0x9090
+
+	/* read-only */
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		/* not really needed, already page aligned */
+		. = ALIGN(PAGE_SIZE);
+		*(.text.page_aligned)
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		 *(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
+
+	RODATA
+
+	/* writeable */
+	. = ALIGN(PAGE_SIZE);
+	/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
 	} :data
 
-  . = ALIGN(PAGE_SIZE);
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-  	__nosave_begin = .;
-	*(.data.nosave)
-  	. = ALIGN(PAGE_SIZE);
-  	__nosave_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-	*(.data.page_aligned)
-	*(.data.idt)
-  }
-
-  . = ALIGN(32);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-	*(.data.cacheline_aligned)
-  }
-
-  /* rarely changed data like cpu maps */
-  . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-	*(.data.read_mostly)
-	_edata = .;		/* End of data section */
-  }
-
-  . = ALIGN(THREAD_SIZE);	/* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-	*(.data.init_task)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-  	__smp_locks = .;
-	*(.smp_locks)
-	__smp_locks_end = .;
-  }
-  /* will be freed after init
-   * Following ALIGN() is required to make sure no other data falls on the
-   * same page where __smp_alt_end is pointing as that page might be freed
-   * after boot. Always make sure that ALIGN() directive is present after
-   * the section which contains __smp_alt_end.
-   */
-  . = ALIGN(PAGE_SIZE);
-
-  /* will be freed after init */
-  . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-  	__init_begin = .;
-	_sinittext = .;
-	INIT_TEXT
-	_einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-	INIT_DATA
-  }
-  . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-  	__setup_start = .;
-	*(.init.setup)
-  	__setup_end = .;
-   }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-  	__initcall_start = .;
-	INITCALLS
-  	__initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-  	__con_initcall_start = .;
-	*(.con_initcall.init)
-  	__con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-	__x86_cpu_dev_start = .;
-	*(.x86_cpu_dev.init)
-	__x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-  . = ALIGN(4);
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-  	__alt_instructions = .;
-	*(.altinstructions)
-	__alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-	*(.altinstr_replacement)
-  }
-  . = ALIGN(4);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  	__parainstructions = .;
-	*(.parainstructions)
-  	__parainstructions_end = .;
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-	EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-	EXIT_DATA
-  }
+	. = ALIGN(PAGE_SIZE);
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
+		*(.data.idt)
+	}
+
+	. = ALIGN(32);
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		*(.data.cacheline_aligned)
+	}
+
+	/* rarely changed data like cpu maps */
+	. = ALIGN(32);
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+
+		/* End of data section */
+		_edata = .;
+	}
+
+	/* init_task */
+	. = ALIGN(THREAD_SIZE);
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		*(.data.init_task)
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		/* might get freed after init */
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+	}
+	/* will be freed after init
+	 * Following ALIGN() is required to make sure no other data falls on the
+	 * same page where __smp_alt_end is pointing as that page might be freed
+	 * after boot. Always make sure that ALIGN() directive is present after
+	 * the section which contains __smp_alt_end.
+	 */
+	. = ALIGN(PAGE_SIZE);
+
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		INIT_DATA
+	}
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
+	. = ALIGN(4);
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
+	. = ALIGN(4);
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
+
 #if defined(CONFIG_BLK_DEV_INITRD)
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-	__initramfs_start = .;
-	*(.init.ramfs)
-	__initramfs_end = .;
-  }
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
-  PERCPU(PAGE_SIZE)
-  . = ALIGN(PAGE_SIZE);
-  /* freed after init ends here */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-	__init_end = .;
-	__bss_start = .;		/* BSS */
-	*(.bss.page_aligned)
-	*(.bss)
-	. = ALIGN(4);
-	__bss_stop = .;
-  }
 
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+	PERCPU(PAGE_SIZE)
+
 	. = ALIGN(PAGE_SIZE);
-	__brk_base = . ;
- 	. += 64 * 1024 ;	/* 64k alignment slop space */
-	*(.brk_reservation)	/* areas brk users have reserved */
-	__brk_limit = . ;
-  }
+	/* freed after init ends here */
+
+	/* BSS */
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__init_end = .;
+		__bss_start = .;
+		*(.bss.page_aligned)
+		*(.bss)
+		. = ALIGN(4);
+		__bss_stop = .;
+	}
 
-  .end : AT(ADDR(.end) - LOAD_OFFSET) {
-	_end = . ;
-  }
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
+	}
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	*(.exitcall.exit)
-	*(.discard)
+	.end : AT(ADDR(.end) - LOAD_OFFSET) {
+		_end = . ;
 	}
 
-  STABS_DEBUG
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.discard)
+	}
 
-  DWARF_DEBUG
+	STABS_DEBUG
+	DWARF_DEBUG
 }
 
 /*
-- 
cgit v0.10.2


From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:18 +0200
Subject: x86, vmlinux.lds: unify header/footer

Merge everything except PHDRS and SECTIONS into
vmlinux.lds.S.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee61..d113642 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,82 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation and unification done by Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
+#ifdef CONFIG_X86_32
+#define LOAD_OFFSET __PAGE_OFFSET
+#else
+#define LOAD_OFFSET __START_KERNEL_map
+#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386     /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
 # include "vmlinux_64.lds.S"
 #endif
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+        "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index fffa45a..4c985fc 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,26 +1,3 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
 PHDRS {
 	text PT_LOAD FLAGS(5);		/* R_E */
 	data PT_LOAD FLAGS(7);		/* RWE */
@@ -237,17 +214,3 @@ SECTIONS
 	STABS_DEBUG
 	DWARF_DEBUG
 }
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 6d5a5b0..7f1cc3d 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,19 +1,3 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386	/* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
 PHDRS {
 	text PT_LOAD FLAGS(5);		/* R_E */
 	data PT_LOAD FLAGS(7);		/* RWE */
@@ -308,29 +292,3 @@ SECTIONS
 	STABS_DEBUG
 	DWARF_DEBUG
 }
-
-/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
-- 
cgit v0.10.2


From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:19 +0200
Subject: x86, vmlinux.lds: unify PHDRS

PHDRS are not equal for the two - so
use ifdefs to cover up for that.

On the assumption that they may become equal the ifdef
is inside the PHDRS definiton.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d113642..1a1b303 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -40,6 +40,19 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
+PHDRS {
+	text PT_LOAD FLAGS(5);          /* R_E */
+	data PT_LOAD FLAGS(7);          /* RWE */
+#ifdef CONFIG_X86_64
+	user PT_LOAD FLAGS(7);          /* RWE */
+	data.init PT_LOAD FLAGS(7);     /* RWE */
+#ifdef CONFIG_SMP
+	percpu PT_LOAD FLAGS(7);        /* RWE */
+#endif
+	data.init2 PT_LOAD FLAGS(7);    /* RWE */
+#endif
+	note PT_NOTE FLAGS(0);          /* ___ */
+}
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 4c985fc..4fd40dc 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,8 +1,3 @@
-PHDRS {
-	text PT_LOAD FLAGS(5);		/* R_E */
-	data PT_LOAD FLAGS(7);		/* RWE */
-	note PT_NOTE FLAGS(0);		/* ___ */
-}
 SECTIONS
 {
 	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 7f1cc3d..6e7cbee 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,14 +1,3 @@
-PHDRS {
-	text PT_LOAD FLAGS(5);		/* R_E */
-	data PT_LOAD FLAGS(7);		/* RWE */
-	user PT_LOAD FLAGS(7);		/* RWE */
-	data.init PT_LOAD FLAGS(7);	/* RWE */
-#ifdef CONFIG_SMP
-	percpu PT_LOAD FLAGS(7);	/* RWE */
-#endif
-	data.init2 PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);		/* ___ */
-}
 SECTIONS
 {
 	. = __START_KERNEL;
-- 
cgit v0.10.2


From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:20 +0200
Subject: x86, vmlinux.lds: unify start/end of SECTIONS

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1a1b303..845776f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -54,12 +54,26 @@ PHDRS {
 	note PT_NOTE FLAGS(0);          /* ___ */
 }
 
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+        phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+        . = __START_KERNEL;
+        phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
 # include "vmlinux_64.lds.S"
 #endif
 
+        STABS_DEBUG
+        DWARF_DEBUG
+}
+
 
 #ifdef CONFIG_X86_32
 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 4fd40dc..3d3d49c 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,8 +1,3 @@
-SECTIONS
-{
-	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-	phys_startup_32 = startup_32 - LOAD_OFFSET;
-
 	/* Text and read-only data */
 	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
 		_text = .;
@@ -205,7 +200,3 @@ SECTIONS
 		*(.exitcall.exit)
 		*(.discard)
 	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 6e7cbee..2d7fa20 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,8 +1,3 @@
-SECTIONS
-{
-	. = __START_KERNEL;
-	phys_startup_64 = startup_64 - LOAD_OFFSET;
-
 	/* Text and read-only data */
 	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
 		_text = .;
@@ -277,7 +272,3 @@ SECTIONS
 		*(.eh_frame)
 		*(.discard)
 	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
-- 
cgit v0.10.2


From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:21 +0200
Subject: x86, vmlinux.lds: unify .text output sections

32 bit x86 had a dedicated .text.head output section,
whereas 64 bit had it all in a single output section.

In the unified version the dedicated .text.head output section
was kept to have full control over the head code.

32 bit:

- Moved definition of _stext to the linker script.
  The definition is located _after_ .text.page_aligned as this
  is what 32 bit did before.

The ALIGN(8) was introduced so we hit the exact same address
(on the tested config) before and after the move.

I assume that it is a bug that _stext did not cover the
.text.page_aligned section - if this is true it can be fixed
in a follow-up patch (and the ugly ALIGN() can be dropped).

[ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 3068388..dc5ed4b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
 ENTRY(initial_code)
 	.long i386_start_kernel
 
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
 /*
  * BSS section
  */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 845776f..a7c88bb 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -64,6 +64,37 @@ SECTIONS
         phys_startup_64 = startup_64 - LOAD_OFFSET;
 #endif
 
+	/* Text and read-only data */
+
+	/* bootstrapping code */
+	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+		_text = .;
+		*(.text.head)
+	} :text = 0x9090
+
+	/* The rest of the text */
+	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+		/* not really needed, already page aligned */
+		. = ALIGN(PAGE_SIZE);
+		*(.text.page_aligned)
+#endif
+		. = ALIGN(8);
+		_stext = .;
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 3d3d49c..8540092 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,27 +1,3 @@
-	/* Text and read-only data */
-	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-		_text = .;
-		*(.text.head)
-	} :text = 0x9090
-
-	/* read-only */
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		/* not really needed, already page aligned */
-		. = ALIGN(PAGE_SIZE);
-		*(.text.page_aligned)
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		/* End of text section */
-		_etext = .;
-	} :text = 0x9090
-
-	NOTES :text :note
-
 	/* Exception table */
 	. = ALIGN(16);
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 2d7fa20..b5d4367 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,23 +1,3 @@
-	/* Text and read-only data */
-	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;
-		/* First the code that has to be first for bootstrapping */
-		*(.text.head)
-		_stext = .;
-		/* Then the rest */
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		/* End of text section */
-		_etext = .;
-	} :text = 0x9090
-
-	NOTES :text :note
-
 	/* Exception table */
 	. = ALIGN(16);
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-- 
cgit v0.10.2


From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:22 +0200
Subject: x86, vmlinux.lds: unify exception table

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a7c88bb..67164f6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -94,6 +94,16 @@ SECTIONS
 
 	NOTES :text :note
 
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
+
+	RODATA
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 8540092..920cc69 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,13 +1,3 @@
-	/* Exception table */
-	. = ALIGN(16);
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		 *(__ex_table)
-		__stop___ex_table = .;
-	} :text = 0x9090
-
-	RODATA
-
 	/* writeable */
 	. = ALIGN(PAGE_SIZE);
 	/* Data */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index b5d4367..641f3f9 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,13 +1,3 @@
-	/* Exception table */
-	. = ALIGN(16);
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		 *(__ex_table)
-		__stop___ex_table = .;
-	} :text = 0x9090
-
-	RODATA
-
 	/* Align data segment to page size boundary */
 	. = ALIGN(PAGE_SIZE);
 	/* Data */
-- 
cgit v0.10.2


From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:23 +0200
Subject: x86, vmlinux.lds: unify data output sections

For 64 bit the following functional changes are introduced:

 - .data.page_aligned has moved
 - .data.cacheline_aligned has moved
 - .data.read_mostly has moved
 - ALIGN() moved out of output section for .data.cacheline_aligned
 - ALIGN() moved out of output section for .data.page_aligned

Notice that 32 bit and 64 bit has different location of _edata.
.data_nosave is 32 bit only as 64 bit is special due to PERCPU.

[ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 67164f6..067bdb0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -104,6 +104,61 @@ SECTIONS
 
 	RODATA
 
+	/* Data */
+	. = ALIGN(PAGE_SIZE);
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+		/* End of data section */
+		_edata = .;
+#endif
+	} :data
+
+#ifdef CONFIG_X86_32
+	/* 32 bit has nosave before _edata */
+	. = ALIGN(PAGE_SIZE);
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	}
+#endif
+
+	. = ALIGN(PAGE_SIZE);
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
+		*(.data.idt)
+	}
+
+#ifdef CONFIG_X86_32
+	. = ALIGN(32);
+#else
+	. = ALIGN(PAGE_SIZE);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		*(.data.cacheline_aligned)
+	}
+
+	/* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+	. = ALIGN(32);
+#else
+	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+		/* End of data section */
+		_edata = .;
+#endif
+	}
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 920cc69..8ade846 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,40 +1,3 @@
-	/* writeable */
-	. = ALIGN(PAGE_SIZE);
-	/* Data */
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		DATA_DATA
-		CONSTRUCTORS
-	} :data
-
-	. = ALIGN(PAGE_SIZE);
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	}
-
-	. = ALIGN(PAGE_SIZE);
-	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-		*(.data.page_aligned)
-		*(.data.idt)
-	}
-
-	. = ALIGN(32);
-	.data.cacheline_aligned :
-		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-		*(.data.cacheline_aligned)
-	}
-
-	/* rarely changed data like cpu maps */
-	. = ALIGN(32);
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-		*(.data.read_mostly)
-
-		/* End of data section */
-		_edata = .;
-	}
-
 	/* init_task */
 	. = ALIGN(THREAD_SIZE);
 	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 641f3f9..8262701 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,26 +1,3 @@
-	/* Align data segment to page size boundary */
-	. = ALIGN(PAGE_SIZE);
-	/* Data */
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		DATA_DATA
-		CONSTRUCTORS
-		/* End of data section */
-		_edata = .;
-	} :data
-
-
-	.data.cacheline_aligned :
-		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-	}
-
-	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-		*(.data.read_mostly)
-	}
-
 #define VSYSCALL_ADDR (-10*1024*1024)
 #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
                             SIZEOF(.data.read_mostly) + 4095) & ~(4095))
@@ -95,11 +72,6 @@
 		*(.data.init_task)
 	} :data.init
 
-	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-	}
-
 	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
 		/* might get freed after init */
 		. = ALIGN(PAGE_SIZE);
-- 
cgit v0.10.2


From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:24 +0200
Subject: x86, vmlinux.lds: move vsyscall output sections

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 067bdb0..b3106c2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -159,6 +159,77 @@ SECTIONS
 #endif
 	}
 
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+	. = VSYSCALL_ADDR;
+	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+		*(.vsyscall_0)
+	} :user
+
+	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+		*(.vsyscall_fn)
+	}
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+		*(.vsyscall_gtod_data)
+	}
+
+	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+		*(.vsyscall_clock)
+	}
+	vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		*(.vsyscall_1)
+	}
+	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+		*(.vsyscall_2)
+	}
+
+	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+		*(.vgetcpu_mode)
+	}
+	vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.jiffies : AT(VLOAD(.jiffies)) {
+		*(.jiffies)
+	}
+	jiffies = VVIRT(.jiffies);
+
+	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+		*(.vsyscall_3)
+	}
+
+	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 8262701..013aa0e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,71 +1,3 @@
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-	. = VSYSCALL_ADDR;
-	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
-		*(.vsyscall_0)
-	} :user
-
-	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
-		*(.vsyscall_fn)
-	}
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
-		*(.vsyscall_gtod_data)
-	}
-
-	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
-		*(.vsyscall_clock)
-	}
-	vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
-		*(.vsyscall_1)
-	}
-	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
-		*(.vsyscall_2)
-	}
-
-	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
-		*(.vgetcpu_mode)
-	}
-	vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.jiffies : AT(VLOAD(.jiffies)) {
-		*(.jiffies)
-	}
-	jiffies = VVIRT(.jiffies);
-
-	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
-		*(.vsyscall_3)
-	}
-
-	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
 	/* init_task */
 	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 		. = ALIGN(THREAD_SIZE);
-- 
cgit v0.10.2


From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:25 +0200
Subject: x86, vmlinux.lds: unify first part of initdata

32-bit:

 - Move definition of __init_begin outside output_section
   because it covers more than one section
 - Move ALIGN() for end-of-section inside .smp_locks output section.
   Same effect but the intent is better documented that
   we need both start and end aligned.

64-bit:

 - Move ALIGN() outside output section in .init.setup
 - Deleted unused __smp_alt_* symbols

None of the above should result in any functional change.

[ Impact: refactor and unify linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b3106c2..8b203c4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -231,6 +231,67 @@ SECTIONS
 
 #endif /* CONFIG_X86_64 */
 
+	/* init_task */
+	. = ALIGN(THREAD_SIZE);
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		*(.data.init_task)
+	}
+#ifdef CONFIG_X86_64
+	 :data.init
+#endif
+
+	/*
+	 * smp_locks might be freed after init
+	 * start/end must be page aligned
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+	}
+
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	__init_begin = .; /* paired with __init_end */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		INIT_DATA
+	}
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 8ade846..d8ba539 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,63 +1,3 @@
-	/* init_task */
-	. = ALIGN(THREAD_SIZE);
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		*(.data.init_task)
-	}
-
-	. = ALIGN(PAGE_SIZE);
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		/* might get freed after init */
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-	}
-	/* will be freed after init
-	 * Following ALIGN() is required to make sure no other data falls on the
-	 * same page where __smp_alt_end is pointing as that page might be freed
-	 * after boot. Always make sure that ALIGN() directive is present after
-	 * the section which contains __smp_alt_end.
-	 */
-	. = ALIGN(PAGE_SIZE);
-
-	/* Init code and data - will be freed after init */
-	. = ALIGN(PAGE_SIZE);
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .;
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-		INIT_DATA
-	}
-
-	. = ALIGN(16);
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-		__x86_cpu_dev_start = .;
-		*(.x86_cpu_dev.init)
-		__x86_cpu_dev_end = .;
-	}
-
-	SECURITY_INIT
-
 	. = ALIGN(4);
 	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
 		__alt_instructions = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 013aa0e..0e8054e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,62 +1,3 @@
-	/* init_task */
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		. = ALIGN(THREAD_SIZE);
-		*(.data.init_task)
-	} :data.init
-
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		/* might get freed after init */
-		. = ALIGN(PAGE_SIZE);
-		__smp_alt_begin = .;
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-		. = ALIGN(PAGE_SIZE);
-		__smp_alt_end = .;
-	}
-
-	/* Init code and data */
-	. = ALIGN(PAGE_SIZE);
-	__init_begin = .;	/* paired with __init_end */
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-		__initdata_begin = .;
-		INIT_DATA
-		__initdata_end = .;
-	}
-
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		. = ALIGN(16);
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-		__x86_cpu_dev_start = .;
-		*(.x86_cpu_dev.init)
-		__x86_cpu_dev_end = .;
-	}
-
-	SECURITY_INIT
-
 	. = ALIGN(8);
 	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
 		__parainstructions = .;
-- 
cgit v0.10.2


From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:26 +0200
Subject: x86, vmlinux.lds: unify parainstructions

32 bit:

 - increase alignment from 4 to 8 for .parainstructions
 - increase alignment from 4 to 8 for .altinstructions

64 bit:

 - move ALIGN() outside output section for .altinstructions

None of the above should result in any functional change.

[ Impact: refactor and unify linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8b203c4..c8dd71e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,6 +291,24 @@ SECTIONS
 
 	SECURITY_INIT
 
+	. = ALIGN(8);
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	. = ALIGN(8);
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index d8ba539..5df9647 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,21 +1,3 @@
-	. = ALIGN(4);
-	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-		__alt_instructions = .;
-		*(.altinstructions)
-		__alt_instructions_end = .;
-	}
-
-	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-		*(.altinstr_replacement)
-	}
-
-	. = ALIGN(4);
-	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-		__parainstructions = .;
-		*(.parainstructions)
-		__parainstructions_end = .;
-	}
-
 	/*
 	 * .exit.text is discard at runtime, not link time, to deal with
 	 *  references from .altinstructions and .eh_frame
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 0e8054e..9ef7096 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,21 +1,3 @@
-	. = ALIGN(8);
-	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-		__parainstructions = .;
-		*(.parainstructions)
-		__parainstructions_end = .;
-	}
-
-	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-		. = ALIGN(8);
-		__alt_instructions = .;
-		*(.altinstructions)
-		__alt_instructions_end = .;
-	}
-
-	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-		*(.altinstr_replacement)
-	}
-
 	/*
 	 * .exit.text is discard at runtime, not link time, to deal with
 	 *  references from .altinstructions and .eh_frame
-- 
cgit v0.10.2


From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:27 +0200
Subject: x86, vmlinux.lds: unify .exit.* and .init.ramfs

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8dd71e..1ab62a5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -309,6 +309,26 @@ SECTIONS
 		*(.altinstr_replacement)
 	}
 
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
+#endif
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 5df9647..36c8fbd 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,24 +1,3 @@
-	/*
-	 * .exit.text is discard at runtime, not link time, to deal with
-	 *  references from .altinstructions and .eh_frame
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-		EXIT_TEXT
-	}
-
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-		EXIT_DATA
-	}
-
-#if defined(CONFIG_BLK_DEV_INITRD)
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
 	PERCPU(PAGE_SIZE)
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 9ef7096..1aa5362 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,24 +1,3 @@
-	/*
-	 * .exit.text is discard at runtime, not link time, to deal with
-	 *  references from .altinstructions and .eh_frame
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-		EXIT_TEXT
-	}
-
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-		EXIT_DATA
-	}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
 #ifdef CONFIG_SMP
 	/*
 	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-- 
cgit v0.10.2


From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:28 +0200
Subject: x86, vmlinux.lds: unify percpu

32 bit:
- move __init_end outside the .bss output section
  It really did not belong in there

[ Impact: 64-bit: cleanup, 32-bit: refactor linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1ab62a5..1ea2b85 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -330,6 +330,36 @@ SECTIONS
 	}
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+	/*
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - __data_nosave - should
+	 * start another section data.init2.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it and define the percpu offset symbol
+	 * so that it can be accessed as a percpu variable.
+	 */
+	. = ALIGN(PAGE_SIZE);
+	PERCPU_VADDR(0, :percpu)
+#else
+	PERCPU(PAGE_SIZE)
+#endif
+
+	. = ALIGN(PAGE_SIZE);
+	/* freed after init ends here */
+	__init_end = .;
+
+#ifdef CONFIG_X86_64
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	} :data.init2
+	/* use another section data.init2, see PERCPU_VADDR() above */
+#endif
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 36c8fbd..d23ee2c 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,11 +1,5 @@
-	PERCPU(PAGE_SIZE)
-
-	. = ALIGN(PAGE_SIZE);
-	/* freed after init ends here */
-
 	/* BSS */
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__init_end = .;
 		__bss_start = .;
 		*(.bss.page_aligned)
 		*(.bss)
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1aa5362..a539366 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,29 +1,3 @@
-#ifdef CONFIG_SMP
-	/*
-	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-	 * output PHDR, so the next output section - __data_nosave - should
-	 * start another section data.init2.  Also, pda should be at the head of
-	 * percpu area.  Preallocate it and define the percpu offset symbol
-	 * so that it can be accessed as a percpu variable.
-	 */
-	. = ALIGN(PAGE_SIZE);
-	PERCPU_VADDR(0, :percpu)
-#else
-	PERCPU(PAGE_SIZE)
-#endif
-
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
-
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	} :data.init2
-	/* use another section data.init2, see PERCPU_VADDR() above */
-
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		. = ALIGN(PAGE_SIZE);
 		__bss_start = .;		/* BSS */
-- 
cgit v0.10.2


From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:29 +0200
Subject: x86, vmlinux.lds: unify remaining parts

32 bit:
- explicit page align .bss
- move ALING() out of .brk output section
- discard *(.eh_frame)

64 bit:
- move ALIGN() out of .bss output section
- move ALIGN() out of .brk output section
- use a dedicated section to define _end

[ Impact: unify and fix section alignments in linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1ea2b85..ef3e4f1 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -359,12 +359,34 @@ SECTIONS
 	/* use another section data.init2, see PERCPU_VADDR() above */
 #endif
 
+	/* BSS */
+	. = ALIGN(PAGE_SIZE);
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__bss_start = .;
+		*(.bss.page_aligned)
+		*(.bss)
+		. = ALIGN(4);
+		__bss_stop = .;
+	}
 
-#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
-#else
-# include "vmlinux_64.lds.S"
-#endif
+	. = ALIGN(PAGE_SIZE);
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
+	}
+
+	.end : AT(ADDR(.end) - LOAD_OFFSET) {
+		_end = .;
+	}
+
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.eh_frame)
+		*(.discard)
+	}
 
         STABS_DEBUG
         DWARF_DEBUG
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index d23ee2c..0000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,26 +0,0 @@
-	/* BSS */
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__bss_start = .;
-		*(.bss.page_aligned)
-		*(.bss)
-		. = ALIGN(4);
-		__bss_stop = .;
-	}
-
-	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__brk_base = .;
-		. += 64 * 1024;		/* 64k alignment slop space */
-		*(.brk_reservation)	/* areas brk users have reserved */
-		__brk_limit = .;
-	}
-
-	.end : AT(ADDR(.end) - LOAD_OFFSET) {
-		_end = . ;
-	}
-
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
-	}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index a539366..0000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,24 +0,0 @@
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		__bss_stop = .;
-	}
-
-	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__brk_base = .;
-		. += 64 * 1024;		/* 64k alignment slop space */
-		*(.brk_reservation)	/* areas brk users have reserved */
-		__brk_limit = .;
-	}
-
-	_end = . ;
-
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.eh_frame)
-		*(.discard)
-	}
-- 
cgit v0.10.2


From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 10:58:38 +0200
Subject: x86, vmlinux.lds: add copyright

Acked-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ef3e4f1..0bdbaa5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -3,7 +3,8 @@
  *
  * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  *
- * Modernisation and unification done by Sam Ravnborg <sam@ravnborg.org>
+ * Modernisation, unification and other changes and fixes:
+ *   Copyright (C) 2007-2009  Sam Ravnborg <sam@ravnborg.org>
  *
  *
  * Don't define absolute symbols until and unless you know that symbol
-- 
cgit v0.10.2


From 0492e1bb8fe7d122901c9f3af75e537d4129712e Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:49 +0100
Subject: tracing: x86, mmiotrace: code consistency/legibility improvement

kmmio_probe being *p and kmmio_fault_page being sometimes *f and
sometimes *p is not helpful.

[ Impact: cleanup ]

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-3-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 4f115e0..869181a 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -97,13 +97,13 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
 	struct list_head *head;
-	struct kmmio_fault_page *p;
+	struct kmmio_fault_page *f;
 
 	page &= PAGE_MASK;
 	head = kmmio_page_list(page);
-	list_for_each_entry_rcu(p, head, list) {
-		if (p->page == page)
-			return p;
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->page == page)
+			return f;
 	}
 	return NULL;
 }
@@ -439,12 +439,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 						head,
 						struct kmmio_delayed_release,
 						rcu);
-	struct kmmio_fault_page *p = dr->release_list;
-	while (p) {
-		struct kmmio_fault_page *next = p->release_next;
-		BUG_ON(p->count);
-		kfree(p);
-		p = next;
+	struct kmmio_fault_page *f = dr->release_list;
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
 	}
 	kfree(dr);
 }
@@ -453,19 +453,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
 	struct kmmio_delayed_release *dr =
 		container_of(head, struct kmmio_delayed_release, rcu);
-	struct kmmio_fault_page *p = dr->release_list;
+	struct kmmio_fault_page *f = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
-	while (p) {
-		if (!p->count) {
-			list_del_rcu(&p->list);
-			prevp = &p->release_next;
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
 		} else {
-			*prevp = p->release_next;
+			*prevp = f->release_next;
 		}
-		p = p->release_next;
+		f = f->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
 
-- 
cgit v0.10.2


From 46e91d00b1165b14b484aa33800e1bba0794ae1a Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:50 +0100
Subject: tracing: x86, mmiotrace: refactor clearing/restore of page presence

* change function names to clear_* from set_*: in reality we only clear
  and restore page presence, and never unconditionally set present.
  Using clear_*({true, false}, ...) is therefore more honest than
  set_*({false, true}, ...)

* upgrade presence storage to pteval_t: doing user-space tracing will
  require saving and manipulation of the _PAGE_PROTNONE bit, in addition
  to the existing _PAGE_PRESENT changes, and having multiple bools stored
  and passed around does not seem optimal

[ Impact: refactor, clean up mmiotrace code ]

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-4-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 869181a..a769d1a 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
 	unsigned long page; /* location of the fault page */
-	bool old_presence; /* page presence prior to arming */
+	pteval_t old_presence; /* page presence prior to arming */
 	bool armed;
 
 	/*
@@ -108,49 +108,51 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
 {
 	pmdval_t v = pmd_val(*pmd);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pmd(pmd, __pmd(v));
 }
 
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 {
 	pteval_t v = pte_val(*pte);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pte_atomic(pte, __pte(v));
 }
 
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
 	unsigned int level;
-	pte_t *pte = lookup_address(addr, &level);
+	pte_t *pte = lookup_address(f->page, &level);
 
 	if (!pte) {
-		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+		pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
 		return -1;
 	}
 
 	switch (level) {
 	case PG_LEVEL_2M:
-		set_pmd_presence((pmd_t *)pte, present, old);
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
 		break;
 	case PG_LEVEL_4K:
-		set_pte_presence(pte, present, old);
+		clear_pte_presence(pte, clear, &f->old_presence);
 		break;
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
 		return -1;
 	}
 
-	__flush_tlb_one(addr);
+	__flush_tlb_one(f->page);
 	return 0;
 }
 
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
 	if (f->armed) {
 		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
-					f->page, f->count, f->old_presence);
+					f->page, f->count, !!f->old_presence);
 	}
-	ret = set_page_presence(f->page, false, &f->old_presence);
+	ret = clear_page_presence(f, true);
 	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
 	f->armed = true;
 	return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 /** Restore the given page to saved presence state. */
 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	bool tmp;
-	int ret = set_page_presence(f->page, f->old_presence, &tmp);
+	int ret = clear_page_presence(f, false);
 	WARN_ONCE(ret < 0,
 			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
 	f->armed = false;
-- 
cgit v0.10.2


From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:51 +0100
Subject: tracing: x86, mmiotrace: only register for die notifier when tracer
 active

Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree
("x86: mmiotrace: quieten spurious warning message")

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index a769d1a..256ce64 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active) {
-		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
@@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 }
 EXPORT_SYMBOL(unregister_kmmio_probe);
 
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-								void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 {
 	struct die_args *arg = args;
 
@@ -545,11 +550,23 @@ static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };
 
-static int __init init_kmmio(void)
+int kmmio_init(void)
 {
 	int i;
+
 	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
 	return register_die_notifier(&nb_die);
 }
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+	}
+}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed..132772a 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
 
 	if (nommiotrace)
 		pr_info(NAME "MMIO tracing disabled.\n");
+	kmmio_init();
 	enter_uniprocessor();
 	spin_lock_irq(&trace_lock);
 	atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
 
 	clear_trace_list(); /* guarantees: no more kmmio callbacks */
 	leave_uniprocessor();
+	kmmio_cleanup();
 	pr_info(NAME "disabled.\n");
 out:
 	mutex_unlock(&mmiotrace_mutex);
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 3d1b7bd..97491f7 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -30,6 +30,8 @@ extern unsigned int kmmio_count;
 
 extern int register_kmmio_probe(struct kmmio_probe *p);
 extern void unregister_kmmio_probe(struct kmmio_probe *p);
+extern int kmmio_init(void);
+extern void kmmio_cleanup(void);
 
 #ifdef CONFIG_MMIOTRACE
 /* kmmio is active by some kmmio_probes? */
-- 
cgit v0.10.2


From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 12:56:58 +0200
Subject: x86, vmlinux.lds: fix relocatable symbols

__init_begin/_end symbols should be inside sections as well,
otherwise the relocatable kernel gets confused when freeing
init sections in the wrong place.

[ Impact: fix bootup crash ]

Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0bdbaa5..4c85b2e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -255,8 +255,8 @@ SECTIONS
 
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
-	__init_begin = .; /* paired with __init_end */
 	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .; /* paired with __init_end */
 		_sinittext = .;
 		INIT_TEXT
 		_einittext = .;
@@ -346,8 +346,11 @@ SECTIONS
 #endif
 
 	. = ALIGN(PAGE_SIZE);
+
 	/* freed after init ends here */
-	__init_end = .;
+	.init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+		__init_end = .;
+	}
 
 #ifdef CONFIG_X86_64
 	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-- 
cgit v0.10.2


From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:47 -0500
Subject: tracing/filters: move preds into event_filter object

Create a new event_filter object, and move the pred-related members
out of the call and subsystem objects and into the filter object - the
details of the filter implementation don't need to be exposed in the
call and subsystem in any case, and it will also help make the new
parser implementation a little cleaner.

[ Impact: refactor trace-filter code to prepare for new features ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905887.6416.119.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 78a9ba2..46a27f2 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -101,8 +101,8 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
-	int			n_preds;
-	struct filter_pred	**preds;
+	int			filter_active;
+	void			*filter;
 	void			*mod;
 
 #ifdef CONFIG_EVENT_PROFILE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7d55bcf..1fb7d6c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -731,12 +731,16 @@ struct ftrace_event_field {
 	int			size;
 };
 
+struct event_filter {
+	int			n_preds;
+	struct filter_pred	**preds;
+};
+
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
-	int			n_preds;
-	struct filter_pred	**preds;
+	void			*filter;
 };
 
 struct filter_pred;
@@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index be4d3a4..1cd1f37 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	list_add(&system->list, &event_subsystems);
 
-	system->preds = NULL;
-	system->n_preds = 0;
+	system->filter = NULL;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 6541828..1e861ec 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
+	struct event_filter *filter = call->filter;
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < call->n_preds; i++) {
-		pred = call->preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		if (and_failed && !pred->or)
 			continue;
 		matched = pred->fn(pred, rec);
@@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+static void __filter_print_preds(struct event_filter *filter,
 				 struct trace_seq *s)
 {
-	char *field_name;
 	struct filter_pred *pred;
+	char *field_name;
 	int i;
 
-	if (!n_preds) {
+	if (!filter || !filter->n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < n_preds; i++) {
-		pred = preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		field_name = pred->field_name;
 		if (i)
 			trace_seq_printf(s, pred->or ? "|| " : "&& ");
@@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds,
 void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->preds, call->n_preds, s);
+	__filter_print_preds(call->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->preds, system->n_preds, s);
+	__filter_print_preds(system->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest,
 
 static void __filter_disable_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter = call->filter;
 	int i;
 
-	call->n_preds = 0;
+	call->filter_active = 0;
+	filter->n_preds = 0;
 
 	for (i = 0; i < MAX_FILTER_PRED; i++)
-		call->preds[i]->fn = filter_pred_none;
+		filter->preds[i]->fn = filter_pred_none;
 }
 
 void filter_disable_preds(struct ftrace_event_call *call)
@@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call)
 
 int init_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter;
 	struct filter_pred *pred;
 	int i;
 
-	call->n_preds = 0;
-
-	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
-	if (!call->preds)
+	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!call->filter)
 		return -ENOMEM;
 
+	call->filter_active = 0;
+	filter->n_preds = 0;
+
+	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!filter->preds)
+		goto oom;
+
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
 		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 		if (!pred)
 			goto oom;
 		pred->fn = filter_pred_none;
-		call->preds[i] = pred;
+		filter->preds[i] = pred;
 	}
 
 	return 0;
 
 oom:
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i])
-			filter_free_pred(call->preds[i]);
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
 	}
-	kfree(call->preds);
-	call->preds = NULL;
+	kfree(filter->preds);
+	kfree(call->filter);
+	call->filter = NULL;
 
 	return -ENOMEM;
 }
@@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds);
 
 static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (system->n_preds) {
-		for (i = 0; i < system->n_preds; i++)
-			filter_free_pred(system->preds[i]);
-		kfree(system->preds);
-		system->preds = NULL;
-		system->n_preds = 0;
+	if (filter && filter->n_preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			filter_free_pred(filter->preds[i]);
+		kfree(filter->preds);
+		kfree(filter);
+		system->filter = NULL;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
+	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (call->n_preds && !pred->compound)
+	if (filter->n_preds && !pred->compound)
 		__filter_disable_preds(call);
 
-	if (call->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
-	idx = call->n_preds;
-	filter_clear_pred(call->preds[idx]);
-	err = filter_set_pred(call->preds[idx], pred, fn);
+	idx = filter->n_preds;
+	filter_clear_pred(filter->preds[idx]);
+	err = filter_set_pred(filter->preds[idx], pred, fn);
 	if (err)
 		return err;
 
-	call->n_preds++;
+	filter->n_preds++;
+	call->filter_active = 1;
 
 	return 0;
 }
@@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
 	mutex_lock(&filter_mutex);
 
-	if (system->n_preds && !pred->compound)
+	if (filter && filter->n_preds && !pred->compound) {
 		__filter_free_subsystem_preds(system);
+		filter = NULL;
+	}
 
-	if (!system->n_preds) {
-		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+	if (!filter) {
+		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+		if (!system->filter) {
+			mutex_unlock(&filter_mutex);
+			return -ENOMEM;
+		}
+		filter = system->filter;
+		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds) {
+
+		if (!filter->preds) {
+			kfree(system->filter);
+			system->filter = NULL;
 			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
 		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED) {
+	if (filter->n_preds == MAX_FILTER_PRED) {
 		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
 	}
 
-	system->preds[system->n_preds] = pred;
-	system->n_preds++;
+	filter->preds[filter->n_preds] = pred;
+	filter->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
-			system->preds[system->n_preds] = NULL;
-			system->n_preds--;
+			filter->preds[filter->n_preds] = NULL;
+			filter->n_preds--;
 			mutex_unlock(&filter_mutex);
 			return err;
 		}
-- 
cgit v0.10.2


From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:53 -0500
Subject: tracing/filters: distinguish between signed and unsigned fields

The new filter comparison ops need to be able to distinguish between
signed and unsigned field types, so add an is_signed flag/param to the
event field struct/trace_define_fields().  Also define a simple macro,
is_signed_type() to determine the signedness at compile time, used in the
trace macros.  If the is_signed_type() macro won't work with a specific
type, a new slightly modified version of TRACE_FIELD() called
TRACE_FIELD_SIGN(), allows the signedness to be set explicitly.

[ Impact: extend trace-filter code for new feature ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905893.6416.120.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 46a27f2..e61a740 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call,
 					struct ring_buffer_event *event);
 
 extern int trace_define_field(struct ftrace_event_call *call, char *type,
-			      char *name, int offset, int size);
+			      char *name, int offset, int size, int is_signed);
 
+#define is_signed_type(type)	(((type)(-1)) < 0)
 
 /*
  * The double __builtin_constant_p is because gcc will give us an error
@@ -144,10 +145,10 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
-#define __common_field(type, item)					\
+#define __common_field(type, item, is_signed)				\
 	ret = trace_define_field(event_call, #type, "common_" #item,	\
 				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
+				 sizeof(field.ent.item), is_signed);	\
 	if (ret)							\
 		return ret;
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1e68114..edb02bc 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define __field(type, item)						\
 	ret = trace_define_field(event_call, #type, #item,		\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), is_signed_type(type));	\
 	if (ret)							\
 		return ret;
 
@@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), 0);		\
 	if (ret)							\
 		return ret;
 
@@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define __string(item, src)						       \
 	ret = trace_define_field(event_call, "__str_loc", #item,	       \
 				offsetof(typeof(field), __str_loc_##item),     \
-				sizeof(field.__str_loc_##item));
+				 sizeof(field.__str_loc_##item), 0);
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
@@ -253,11 +253,11 @@ ftrace_define_fields_##call(void)					\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	int ret;							\
 									\
-	__common_field(int, type);					\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
+	__common_field(int, type, 1);					\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
 									\
 	tstruct;							\
 									\
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1fb7d6c..866d010 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -729,6 +729,7 @@ struct ftrace_event_field {
 	char			*type;
 	int			offset;
 	int			size;
+	int			is_signed;
 };
 
 struct event_filter {
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index cfcecc4..5e32e37 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 	TRACE_STRUCT(
-		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 	),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1cd1f37..bbbea74 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex);
 LIST_HEAD(ftrace_events);
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size)
+		       char *name, int offset, int size, int is_signed)
 {
 	struct ftrace_event_field *field;
 
@@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 	field->offset = offset;
 	field->size = size;
+	field->is_signed = is_signed;
 	list_add(&field->link, &call->fields);
 
 	return 0;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 0cb1a14..d06cf89 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -50,6 +50,9 @@ extern void __bad_type_size(void);
 	if (!ret)							\
 		return 0;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
 
 #undef TP_RAW_FMT
 #define TP_RAW_FMT(args...) args
@@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
+
 #undef TP_CMD
 #define TP_CMD(cmd...)	cmd
 
@@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)					\
 	ret = trace_define_field(event_call, #type, #item,		\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), is_signed_type(type));	\
 	if (ret)							\
 		return ret;
 
@@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), 0);		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed);	\
 	if (ret)							\
 		return ret;
 
@@ -173,11 +188,11 @@ ftrace_define_fields_##call(void)					\
 	struct args field;						\
 	int ret;							\
 									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
+	__common_field(unsigned char, type, 0);				\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
 									\
 	tstruct;							\
 									\
-- 
cgit v0.10.2


From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:59 -0500
Subject: tracing/filters: a better event parser

Replace the current event parser hack with a better one.  Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:

numeric fields:

==, !=, <, <=, >, >=

string fields:

==, !=

predicates can be combined with the logical operators:

&&, ||

examples:

"common_preempt_count > 4" > filter

"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter

If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:

((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found

Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.

To clear a filter, '0' can be written to the filter file.

Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem.  Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared.  This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.

Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.

[ Impact: add new, extended trace-filter implementation ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index e61a740..5fff40c 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -112,7 +112,7 @@ struct ftrace_event_call {
 #endif
 };
 
-#define MAX_FILTER_PRED		8
+#define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	128
 
 extern int init_preds(struct ftrace_event_call *call);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 866d010..7736fe8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -735,6 +735,7 @@ struct ftrace_event_field {
 struct event_filter {
 	int			n_preds;
 	struct filter_pred	**preds;
+	char			*filter_string;
 };
 
 struct event_subsystem {
@@ -746,7 +747,8 @@ struct event_subsystem {
 
 struct filter_pred;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+				 int val1, int val2);
 
 struct filter_pred {
 	filter_pred_fn_t fn;
@@ -756,23 +758,18 @@ struct filter_pred {
 	char *field_name;
 	int offset;
 	int not;
-	int or;
-	int compound;
-	int clear;
+	int op;
+	int pop_n;
 };
 
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct ftrace_event_call *call,
+extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-			   struct filter_pred *pred);
-extern void filter_disable_preds(struct ftrace_event_call *call);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern void filter_print_subsystem_preds(struct event_subsystem *system,
+extern int apply_event_filter(struct ftrace_event_call *call,
+			      char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+					char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-				     struct filter_pred *pred);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
+#define DEFINE_COMPARISON_PRED(type)					\
+static int filter_pred_##type(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	type *addr = (type *)(event + pred->offset);			\
+	type val = (type)pred->val;					\
+	int match = 0;							\
+									\
+	switch (pred->op) {						\
+	case OP_LT:							\
+		match = (*addr < val);					\
+		break;							\
+	case OP_LE:							\
+		match = (*addr <= val);					\
+		break;							\
+	case OP_GT:							\
+		match = (*addr > val);					\
+		break;							\
+	case OP_GE:							\
+		match = (*addr >= val);					\
+		break;							\
+	default:							\
+		break;							\
+	}								\
+									\
+	return match;							\
+}
+
+#define DEFINE_EQUALITY_PRED(size)					\
+static int filter_pred_##size(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	u##size *addr = (u##size *)(event + pred->offset);		\
+	u##size val = (u##size)pred->val;				\
+	int match;							\
+									\
+	match = (val == *addr) ^ pred->not;				\
+									\
+	return match;							\
+}
+
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bbbea74..f789ca5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call, s);
+	print_event_filter(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	struct ftrace_event_call *call = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_disable_preds(call);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_pred(call, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_event_filter(call, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
-
-	filter_free_pred(pred);
 
 	*ppos += cnt;
 
@@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_subsystem_preds(system, s);
+	print_subsystem_event_filter(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
 	struct event_subsystem *system = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_free_subsystem_preds(system);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_subsystem_pred(system, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_subsystem_event_filter(system, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
 
 	*ppos += cnt;
 
@@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->filter = NULL;
 
+	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+	if (!system->filter) {
+		pr_warning("Could not allocate filter for subsystem "
+			   "'%s'\n", name);
+		return system->entry;
+	}
+
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
-	if (!entry)
+	if (!entry) {
+		kfree(system->filter);
+		system->filter = NULL;
 		pr_warning("Could not create debugfs "
 			   "'%s/filter' entry\n", name);
+	}
 
 	return system->entry;
 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1e861ec..f494866 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -29,51 +29,130 @@
 
 static DEFINE_MUTEX(filter_mutex);
 
-static int filter_pred_64(struct filter_pred *pred, void *event)
+enum filter_op_ids
 {
-	u64 *addr = (u64 *)(event + pred->offset);
-	u64 val = (u64)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_32(struct filter_pred *pred, void *event)
-{
-	u32 *addr = (u32 *)(event + pred->offset);
-	u32 val = (u32)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_16(struct filter_pred *pred, void *event)
+	OP_OR,
+	OP_AND,
+	OP_NE,
+	OP_EQ,
+	OP_LT,
+	OP_LE,
+	OP_GT,
+	OP_GE,
+	OP_NONE,
+	OP_OPEN_PAREN,
+};
+
+struct filter_op {
+	int id;
+	char *string;
+	int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+	{ OP_OR, "||", 1 },
+	{ OP_AND, "&&", 2 },
+	{ OP_NE, "!=", 4 },
+	{ OP_EQ, "==", 4 },
+	{ OP_LT, "<", 5 },
+	{ OP_LE, "<=", 5 },
+	{ OP_GT, ">", 5 },
+	{ OP_GE, ">=", 5 },
+	{ OP_NONE, "OP_NONE", 0 },
+	{ OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+	FILT_ERR_NONE,
+	FILT_ERR_INVALID_OP,
+	FILT_ERR_UNBALANCED_PAREN,
+	FILT_ERR_TOO_MANY_OPERANDS,
+	FILT_ERR_OPERAND_TOO_LONG,
+	FILT_ERR_FIELD_NOT_FOUND,
+	FILT_ERR_ILLEGAL_FIELD_OP,
+	FILT_ERR_ILLEGAL_INTVAL,
+	FILT_ERR_BAD_SUBSYS_FILTER,
+	FILT_ERR_TOO_MANY_PREDS,
+	FILT_ERR_MISSING_FIELD,
+	FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+	"No error",
+	"Invalid operator",
+	"Unbalanced parens",
+	"Too many operands",
+	"Operand too long",
+	"Field not found",
+	"Illegal operation for field type",
+	"Illegal integer value",
+	"Couldn't find or set field in one of a subsystem's events",
+	"Too many terms in predicate expression",
+	"Missing field name and/or value",
+	"Meaningless filter expression",
+};
+
+struct opstack_op {
+	int op;
+	struct list_head list;
+};
+
+struct postfix_elt {
+	int op;
+	char *operand;
+	struct list_head list;
+};
+
+struct filter_parse_state {
+	struct filter_op *ops;
+	struct list_head opstack;
+	struct list_head postfix;
+	int lasterr;
+	int lasterr_pos;
+
+	struct {
+		char *string;
+		unsigned int cnt;
+		unsigned int tail;
+	} infix;
+
+	struct {
+		char string[MAX_FILTER_STR_VAL];
+		int pos;
+		unsigned int tail;
+	} operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+			   void *event __attribute((unused)),
+			   int val1, int val2)
 {
-	u16 *addr = (u16 *)(event + pred->offset);
-	u16 val = (u16)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 && val2;
 }
 
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+			  void *event __attribute((unused)),
+			  int val1, int val2)
 {
-	u8 *addr = (u8 *)(event + pred->offset);
-	u8 val = (u8)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 || val2;
 }
 
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int filter_pred_string(struct filter_pred *pred, void *event,
+			      int val1, int val2)
 {
 	char *addr = (char *)(event + pred->offset);
 	int cmp, match;
@@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+			    int val1, int val2)
 {
 	return 0;
 }
@@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	struct event_filter *filter = call->filter;
-	int i, matched, and_failed = 0;
+	int match, top = 0, val1 = 0, val2 = 0;
+	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
+	int i;
 
 	for (i = 0; i < filter->n_preds; i++) {
 		pred = filter->preds[i];
-		if (and_failed && !pred->or)
+		if (!pred->pop_n) {
+			match = pred->fn(pred, rec, val1, val2);
+			stack[top++] = match;
 			continue;
-		matched = pred->fn(pred, rec);
-		if (!matched && !pred->or) {
-			and_failed = 1;
-			continue;
-		} else if (matched && pred->or)
-			return 1;
+		}
+		if (pred->pop_n > top) {
+			WARN_ON_ONCE(1);
+			return 0;
+		}
+		val1 = stack[--top];
+		val2 = stack[--top];
+		match = pred->fn(pred, rec, val1, val2);
+		stack[top++] = match;
 	}
 
-	if (and_failed)
-		return 0;
-
-	return 1;
+	return stack[--top];
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct event_filter *filter,
-				 struct trace_seq *s)
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
 {
-	struct filter_pred *pred;
-	char *field_name;
-	int i;
+	ps->lasterr = err;
+	ps->lasterr_pos = pos;
+}
 
-	if (!filter || !filter->n_preds) {
-		trace_seq_printf(s, "none\n");
+static void remove_filter_string(struct event_filter *filter)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = NULL;
+}
+
+static int replace_filter_string(struct event_filter *filter,
+				 char *filter_string)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+	if (!filter->filter_string)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int append_filter_string(struct event_filter *filter,
+				char *string)
+{
+	int newlen;
+	char *new_filter_string;
+
+	BUG_ON(!filter->filter_string);
+	newlen = strlen(filter->filter_string) + strlen(string) + 1;
+	new_filter_string = kmalloc(newlen, GFP_KERNEL);
+	if (!new_filter_string)
+		return -ENOMEM;
+
+	strcpy(new_filter_string, filter->filter_string);
+	strcat(new_filter_string, string);
+	kfree(filter->filter_string);
+	filter->filter_string = new_filter_string;
+
+	return 0;
+}
+
+static void append_filter_err(struct filter_parse_state *ps,
+			      struct event_filter *filter)
+{
+	int pos = ps->lasterr_pos;
+	char *buf, *pbuf;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return;
-	}
 
-	for (i = 0; i < filter->n_preds; i++) {
-		pred = filter->preds[i];
-		field_name = pred->field_name;
-		if (i)
-			trace_seq_printf(s, pred->or ? "|| " : "&& ");
-		trace_seq_printf(s, "%s ", field_name);
-		trace_seq_printf(s, pred->not ? "!= " : "== ");
-		if (pred->str_len)
-			trace_seq_printf(s, "%s\n", pred->str_val);
-		else
-			trace_seq_printf(s, "%llu\n", pred->val);
-	}
+	append_filter_string(filter, "\n");
+	memset(buf, ' ', PAGE_SIZE);
+	if (pos > PAGE_SIZE - 128)
+		pos = 0;
+	buf[pos] = '^';
+	pbuf = &buf[pos] + 1;
+
+	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+	append_filter_string(filter, buf);
+	free_page((unsigned long) buf);
 }
 
-void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
+	struct event_filter *filter = call->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
-void filter_print_subsystem_preds(struct event_subsystem *system,
+void print_subsystem_event_filter(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
+	struct event_filter *filter = system->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
@@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return NULL;
 }
 
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
 {
 	if (!pred)
 		return;
@@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest,
 			   filter_pred_fn_t fn)
 {
 	*dest = *src;
-	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
-	if (!dest->field_name)
-		return -ENOMEM;
+	if (src->field_name) {
+		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+		if (!dest->field_name)
+			return -ENOMEM;
+	}
 	dest->fn = fn;
 
 	return 0;
 }
 
-static void __filter_disable_preds(struct ftrace_event_call *call)
+static void filter_disable_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter = call->filter;
 	int i;
@@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
-{
-	mutex_lock(&filter_mutex);
-	__filter_disable_preds(call);
-	mutex_unlock(&filter_mutex);
-}
-
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -258,48 +386,43 @@ oom:
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-static void __filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (filter && filter->n_preds) {
+	if (filter->n_preds) {
 		for (i = 0; i < filter->n_preds; i++)
 			filter_free_pred(filter->preds[i]);
 		kfree(filter->preds);
-		kfree(filter);
-		system->filter = NULL;
+		filter->preds = NULL;
+		filter->n_preds = 0;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
-		if (!strcmp(call->system, system->name))
-			__filter_disable_preds(call);
+		if (!strcmp(call->system, system->name)) {
+			filter_disable_preds(call);
+			remove_filter_string(call->filter);
+		}
 	}
 }
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
-{
-	mutex_lock(&filter_mutex);
-	__filter_free_subsystem_preds(system);
-	mutex_unlock(&filter_mutex);
-}
-
-static int filter_add_pred_fn(struct ftrace_event_call *call,
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+			      struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
 	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (filter->n_preds && !pred->compound)
-		__filter_disable_preds(call);
-
-	if (filter->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
+	}
 
 	idx = filter->n_preds;
 	filter_clear_pred(filter->preds[idx]);
@@ -321,94 +444,132 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+	if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+		return 0;
+
+	return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+					     int field_is_signed)
+{
+	filter_pred_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_64;
+		else if (field_is_signed)
+			fn = filter_pred_s64;
+		else
+			fn = filter_pred_u64;
+		break;
+	case 4:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_32;
+		else if (field_is_signed)
+			fn = filter_pred_s32;
+		else
+			fn = filter_pred_u32;
+		break;
+	case 2:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_16;
+		else if (field_is_signed)
+			fn = filter_pred_s16;
+		else
+			fn = filter_pred_u16;
+		break;
+	case 1:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_8;
+		else if (field_is_signed)
+			fn = filter_pred_s8;
+		else
+			fn = filter_pred_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct ftrace_event_call *call,
+			   struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
 
+	pred->fn = filter_pred_none;
+
+	if (pred->op == OP_AND) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+	} else if (pred->op == OP_OR) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+	}
+
 	field = find_event_field(call, pred->field_name);
-	if (!field)
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return -EINVAL;
+	}
 
-	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
+	if (!is_legal_op(field, pred->op)) {
+		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+		return -EINVAL;
+	}
+
 	if (is_string_field(field->type)) {
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return filter_add_pred_fn(call, pred, fn);
+		if (pred->op == OP_NE)
+			pred->not = 1;
+		return filter_add_pred_fn(ps, call, pred, fn);
 	} else {
-		if (strict_strtoull(pred->str_val, 0, &val))
+		if (strict_strtoull(pred->str_val, 0, &val)) {
+			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
 			return -EINVAL;
+		}
 		pred->val = val;
 	}
 
-	switch (field->size) {
-	case 8:
-		fn = filter_pred_64;
-		break;
-	case 4:
-		fn = filter_pred_32;
-		break;
-	case 2:
-		fn = filter_pred_16;
-		break;
-	case 1:
-		fn = filter_pred_8;
-		break;
-	default:
+	fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+	if (!fn) {
+		parse_error(ps, FILT_ERR_INVALID_OP, 0);
 		return -EINVAL;
 	}
 
-	return filter_add_pred_fn(call, pred, fn);
-}
-
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
-{
-	int err;
-
-	mutex_lock(&filter_mutex);
-	err = __filter_add_pred(call, pred);
-	mutex_unlock(&filter_mutex);
+	if (pred->op == OP_NE)
+		pred->not = 1;
 
-	return err;
+	return filter_add_pred_fn(ps, call, pred, fn);
 }
 
-int filter_add_subsystem_pred(struct event_subsystem *system,
-			      struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+				     struct event_subsystem *system,
+				     struct filter_pred *pred,
+				     char *filter_string)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
-	mutex_lock(&filter_mutex);
-
-	if (filter && filter->n_preds && !pred->compound) {
-		__filter_free_subsystem_preds(system);
-		filter = NULL;
-	}
-
-	if (!filter) {
-		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-		if (!system->filter) {
-			mutex_unlock(&filter_mutex);
-			return -ENOMEM;
-		}
-		filter = system->filter;
+	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 
-		if (!filter->preds) {
-			kfree(system->filter);
-			system->filter = NULL;
-			mutex_unlock(&filter_mutex);
+		if (!filter->preds)
 			return -ENOMEM;
-		}
 	}
 
 	if (filter->n_preds == MAX_FILTER_PRED) {
-		mutex_unlock(&filter_mutex);
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
 
@@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		err = __filter_add_pred(call, pred);
-		if (err == -ENOMEM) {
-			filter->preds[filter->n_preds] = NULL;
-			filter->n_preds--;
-			mutex_unlock(&filter_mutex);
+		err = filter_add_pred(ps, call, pred);
+		if (err) {
+			filter_free_subsystem_preds(system);
+			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 			return err;
 		}
+		replace_filter_string(call->filter, filter_string);
 	}
 
-	mutex_unlock(&filter_mutex);
+	return 0;
+}
+
+static void parse_init(struct filter_parse_state *ps,
+		       struct filter_op *ops,
+		       char *infix_string)
+{
+	memset(ps, '\0', sizeof(*ps));
+
+	ps->infix.string = infix_string;
+	ps->infix.cnt = strlen(infix_string);
+	ps->ops = ops;
+
+	INIT_LIST_HEAD(&ps->opstack);
+	INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+
+	return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+	if (ps->infix.tail == strlen(ps->infix.string))
+		return 0;
+
+	return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+	ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+				      int a, int b)
+{
+	return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+	int i;
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (ps->ops[i].string[0] == c)
+			return 1;
+	}
 
 	return 0;
 }
 
-/*
- * The filter format can be
- *   - 0, which means remove all filter preds
- *   - [||/&&] <field> ==/!= <val>
- */
-int filter_parse(char **pbuf, struct filter_pred *pred)
-{
-	char *tok, *val_str = NULL;
-	int tok_n = 0;
-
-	while ((tok = strsep(pbuf, " \n"))) {
-		if (tok_n == 0) {
-			if (!strcmp(tok, "0")) {
-				pred->clear = 1;
-				return 0;
-			} else if (!strcmp(tok, "&&")) {
-				pred->or = 0;
-				pred->compound = 1;
-			} else if (!strcmp(tok, "||")) {
-				pred->or = 1;
-				pred->compound = 1;
-			} else
-				pred->field_name = tok;
-			tok_n = 1;
-			continue;
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+	char nextc = infix_peek(ps);
+	char opstr[3];
+	int i;
+
+	opstr[0] = firstc;
+	opstr[1] = nextc;
+	opstr[2] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string)) {
+			infix_advance(ps);
+			return ps->ops[i].id;
 		}
-		if (tok_n == 1) {
-			if (!pred->field_name)
-				pred->field_name = tok;
-			else if (!strcmp(tok, "!="))
-				pred->not = 1;
-			else if (!strcmp(tok, "=="))
-				pred->not = 0;
-			else {
-				pred->field_name = NULL;
+	}
+
+	opstr[1] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string))
+			return ps->ops[i].id;
+	}
+
+	return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+	ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+		return -EINVAL;
+
+	ps->operand.string[ps->operand.tail++] = c;
+
+	return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+	struct opstack_op *opstack_op;
+
+	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+	if (!opstack_op)
+		return -ENOMEM;
+
+	opstack_op->op = op;
+	list_add(&opstack_op->list, &ps->opstack);
+
+	return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+	return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+	return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+	int op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+	op = opstack_op->op;
+	list_del(&opstack_op->list);
+
+	kfree(opstack_op);
+
+	return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+	while (!filter_opstack_empty(ps))
+		filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+	return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = OP_NONE;
+	elt->operand = kstrdup(operand, GFP_KERNEL);
+	if (!elt->operand) {
+		kfree(elt);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = op;
+	elt->operand = NULL;
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+	struct postfix_elt *elt;
+
+	while (!list_empty(&ps->postfix)) {
+		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+		kfree(elt->operand);
+		list_del(&elt->list);
+	}
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+	int op, top_op;
+	char ch;
+
+	while ((ch = infix_next(ps))) {
+		if (isspace(ch))
+			continue;
+
+		if (is_op_char(ps, ch)) {
+			op = infix_get_op(ps, ch);
+			if (op == OP_NONE) {
+				parse_error(ps, FILT_ERR_INVALID_OP, 0);
 				return -EINVAL;
 			}
-			tok_n = 2;
+
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			while (!filter_opstack_empty(ps)) {
+				top_op = filter_opstack_top(ps);
+				if (!is_precedence_lower(ps, top_op, op)) {
+					top_op = filter_opstack_pop(ps);
+					postfix_append_op(ps, top_op);
+					continue;
+				}
+				break;
+			}
+
+			filter_opstack_push(ps, op);
 			continue;
 		}
-		if (tok_n == 2) {
-			if (pred->compound) {
-				if (!strcmp(tok, "!="))
-					pred->not = 1;
-				else if (!strcmp(tok, "=="))
-					pred->not = 0;
-				else {
-					pred->field_name = NULL;
-					return -EINVAL;
-				}
-			} else {
-				val_str = tok;
-				break; /* done */
+
+		if (ch == '(') {
+			filter_opstack_push(ps, OP_OPEN_PAREN);
+			continue;
+		}
+
+		if (ch == ')') {
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			top_op = filter_opstack_pop(ps);
+			while (top_op != OP_NONE) {
+				if (top_op == OP_OPEN_PAREN)
+					break;
+				postfix_append_op(ps, top_op);
+				top_op = filter_opstack_pop(ps);
+			}
+			if (top_op == OP_NONE) {
+				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+				return -EINVAL;
 			}
-			tok_n = 3;
 			continue;
 		}
-		if (tok_n == 3) {
-			val_str = tok;
-			break; /* done */
+		if (append_operand_char(ps, ch)) {
+			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+			return -EINVAL;
+		}
+	}
+
+	if (strlen(curr_operand(ps)))
+		postfix_append_operand(ps, curr_operand(ps));
+
+	while (!filter_opstack_empty(ps)) {
+		top_op = filter_opstack_pop(ps);
+		if (top_op == OP_NONE)
+			break;
+		if (top_op == OP_OPEN_PAREN) {
+			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+			return -EINVAL;
+		}
+		postfix_append_op(ps, top_op);
+	}
+
+	return 0;
+}
+
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->field_name = kstrdup(operand1, GFP_KERNEL);
+	if (!pred->field_name) {
+		kfree(pred);
+		return NULL;
+	}
+
+	strcpy(pred->str_val, operand2);
+	pred->str_len = strlen(operand2);
+
+	pred->op = op;
+
+	return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->op = op;
+
+	return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+	int n_normal_preds = 0, n_logical_preds = 0;
+	struct postfix_elt *elt;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE)
+			continue;
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			n_logical_preds++;
+			continue;
 		}
+		n_normal_preds++;
 	}
 
-	if (!val_str || !strlen(val_str)
-	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
-		pred->field_name = NULL;
+	if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
 		return -EINVAL;
 	}
 
-	strcpy(pred->str_val, val_str);
-	pred->str_len = strlen(val_str);
+	return 0;
+}
 
-	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-	if (!pred->field_name)
-		return -ENOMEM;
+static int replace_preds(struct event_subsystem *system,
+			 struct ftrace_event_call *call,
+			 struct filter_parse_state *ps,
+			 char *filter_string)
+{
+	char *operand1 = NULL, *operand2 = NULL;
+	struct filter_pred *pred;
+	struct postfix_elt *elt;
+	int err;
+
+	err = check_preds(ps);
+	if (err)
+		return err;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE) {
+			if (!operand1)
+				operand1 = elt->operand;
+			else if (!operand2)
+				operand2 = elt->operand;
+			else {
+				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			pred = create_logical_pred(elt->op);
+			if (call) {
+				err = filter_add_pred(ps, call, pred);
+				filter_free_pred(pred);
+			} else
+				err = filter_add_subsystem_pred(ps, system,
+							pred, filter_string);
+			if (err)
+				return err;
+
+			operand1 = operand2 = NULL;
+			continue;
+		}
+
+		if (!operand1 || !operand2) {
+			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+			return -EINVAL;
+		}
+
+		pred = create_pred(elt->op, operand1, operand2);
+		if (call) {
+			err = filter_add_pred(ps, call, pred);
+			filter_free_pred(pred);
+		} else
+			err = filter_add_subsystem_pred(ps, system, pred,
+							filter_string);
+		if (err)
+			return err;
+
+		operand1 = operand2 = NULL;
+	}
 
 	return 0;
 }
 
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_disable_preds(call);
+		remove_filter_string(call->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_disable_preds(call);
+	replace_filter_string(call->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, call->filter);
+		goto out;
+	}
+
+	err = replace_preds(NULL, call, ps, filter_string);
+	if (err)
+		append_filter_err(ps, call->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+				 char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_free_subsystem_preds(system);
+		remove_filter_string(system->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_free_subsystem_preds(system);
+	replace_filter_string(system->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, system->filter);
+		goto out;
+	}
+
+	err = replace_preds(system, NULL, ps, filter_string);
+	if (err)
+		append_filter_err(ps, system->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
 
-- 
cgit v0.10.2


From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Apr 2009 13:51:39 +0200
Subject: tracing: fix build failure on s390

"tracing: create automated trace defines" causes this compile error on s390,
as reported by Sachin Sant against linux-next:

 kernel/built-in.o: In function `__do_softirq':
 (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry'

This happens because the definitions of the softirq tracepoints were moved
from kernel/softirq.c to kernel/irq/handle.c. Since s390 doesn't support
generic hardirqs handle.c doesn't get compiled and the definitions are
missing.

So move the tracepoints to softirq.c again.

[ Impact: fix build failure on s390 ]

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 37c6363..e68bb5a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,8 +18,6 @@
 #include <linux/rculist.h>
 #include <linux/hash.h>
 #include <linux/bootmem.h>
-
-#define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
 #include "internals.h"
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7ab9dfd..d4ba347 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,8 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
+
+#define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
 #include <asm/irq.h>
-- 
cgit v0.10.2


From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 28 Apr 2009 15:01:38 +0100
Subject: sched: Document memory barriers implied by sleep/wake-up primitives

Add a section to the memory barriers document to note the implied
memory barriers of sleep primitives (set_current_state() and wrappers)
and wake-up primitives (wake_up() and co.).

Also extend the in-code comments on the wake_up() functions to note
these implied barriers.

[ Impact: add documentation ]

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f5b7127..7f5809e 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
 
      - Locking functions.
      - Interrupt disabling functions.
+     - Sleep and wake-up functions.
      - Miscellaneous functions.
 
  (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
 other means.
 
 
+SLEEP AND WAKE-UP FUNCTIONS
+---------------------------
+
+Sleeping and waking on an event flagged in global data can be viewed as an
+interaction between two pieces of data: the task state of the task waiting for
+the event and the global data used to indicate the event.  To make sure that
+these appear to happen in the right order, the primitives to begin the process
+of going to sleep, and the primitives to initiate a wake up imply certain
+barriers.
+
+Firstly, the sleeper normally follows something like this sequence of events:
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (event_indicated)
+			break;
+		schedule();
+	}
+
+A general memory barrier is interpolated automatically by set_current_state()
+after it has altered the task state:
+
+	CPU 1
+	===============================
+	set_current_state();
+	  set_mb();
+	    STORE current->state
+	    <general barrier>
+	LOAD event_indicated
+
+set_current_state() may be wrapped by:
+
+	prepare_to_wait();
+	prepare_to_wait_exclusive();
+
+which therefore also imply a general memory barrier after setting the state.
+The whole sequence above is available in various canned forms, all of which
+interpolate the memory barrier in the right place:
+
+	wait_event();
+	wait_event_interruptible();
+	wait_event_interruptible_exclusive();
+	wait_event_interruptible_timeout();
+	wait_event_killable();
+	wait_event_timeout();
+	wait_on_bit();
+	wait_on_bit_lock();
+
+
+Secondly, code that performs a wake up normally follows something like this:
+
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+or:
+
+	event_indicated = 1;
+	wake_up_process(event_daemon);
+
+A write memory barrier is implied by wake_up() and co. if and only if they wake
+something up.  The barrier occurs before the task state is cleared, and so sits
+between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+
+	CPU 1				CPU 2
+	===============================	===============================
+	set_current_state();		STORE event_indicated
+	  set_mb();			wake_up();
+	    STORE current->state	  <write barrier>
+	    <general barrier>		  STORE current->state
+	LOAD event_indicated
+
+The available waker functions include:
+
+	complete();
+	wake_up();
+	wake_up_all();
+	wake_up_bit();
+	wake_up_interruptible();
+	wake_up_interruptible_all();
+	wake_up_interruptible_nr();
+	wake_up_interruptible_poll();
+	wake_up_interruptible_sync();
+	wake_up_interruptible_sync_poll();
+	wake_up_locked();
+	wake_up_locked_poll();
+	wake_up_nr();
+	wake_up_poll();
+	wake_up_process();
+
+
+[!] Note that the memory barriers implied by the sleeper and the waker do _not_
+order multiple stores before the wake-up with respect to loads of those stored
+values after the sleeper has called set_current_state().  For instance, if the
+sleeper does:
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (event_indicated)
+		break;
+	__set_current_state(TASK_RUNNING);
+	do_something(my_data);
+
+and the waker does:
+
+	my_data = value;
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+there's no guarantee that the change to event_indicated will be perceived by
+the sleeper as coming after the change to my_data.  In such a circumstance, the
+code on both sides must interpolate its own memory barriers between the
+separate data accesses.  Thus the above sleeper ought to do:
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (event_indicated) {
+		smp_rmb();
+		do_something(my_data);
+	}
+
+and the waker should do:
+
+	my_data = value;
+	smp_wmb();
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+
 MISCELLANEOUS FUNCTIONS
 -----------------------
 
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
 
 Under normal operation, memory operation reordering is generally not going to
 be a problem as a single-threaded linear piece of code will still appear to
-work correctly, even if it's in an SMP kernel.  There are, however, three
+work correctly, even if it's in an SMP kernel.  There are, however, four
 circumstances in which reordering definitely _could_ be a problem:
 
  (*) Interprocessor interaction.
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e58..fd0c2ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2458,6 +2458,17 @@ out:
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
@@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
-- 
cgit v0.10.2


From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:46:58 +0200
Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus

X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not
work on AMD CPUs. The flag is now only used in Intel specific code
(especially initialization).

[ Impact: refactor code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd69c51..7e4a459 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
-	/* Enable Performance counter for K7 and later */
-	if (c->x86 > 6 && c->x86 <= 0x11)
-		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0fcbaab..7d0f81d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	unsigned int unused;
 	unsigned int ebx;
 
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return NULL;
+
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
@@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 void __init init_hw_perf_counters(void)
 {
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
-- 
cgit v0.10.2


From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:46:59 +0200
Subject: perf_counter, x86: declare perf_max_counters only for
 CONFIG_PERF_COUNTERS

This is only needed for CONFIG_PERF_COUNTERS enabled.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9814328..be10b3f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -512,12 +512,13 @@ struct perf_cpu_context {
 	int			recursion[4];
 };
 
+#ifdef CONFIG_PERF_COUNTERS
+
 /*
  * Set by architecture code:
  */
 extern int perf_max_counters;
 
-#ifdef CONFIG_PERF_COUNTERS
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
-- 
cgit v0.10.2


From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:00 +0200
Subject: perf_counter, x86: add default path to cpu detection

This quits hw counter initialization immediately if no cpu is
detected.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d0f81d..d6d6529 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_AMD:
 		pmc_ops = pmc_amd_init();
 		break;
+	default:
+		return;
 	}
 	if (!pmc_ops)
 		return;
-- 
cgit v0.10.2


From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:01 +0200
Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and
 pmc_amd_restore_all()

MSR reads and writes are expensive. This patch adds checks to avoid
its usage where possible.

[ Impact: micro-optimization on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d6d6529..75a0903 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void)
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		u64 val;
 
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+			continue;
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
 	return enabled;
@@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, cpuc->active_mask)) {
-			u64 val;
+		u64 val;
 
-			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			continue;
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 }
 
-- 
cgit v0.10.2


From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:02 +0200
Subject: perf_counter, x86: protect per-cpu variables with compile barriers
 only

Per-cpu variables needn't to be protected with cpu barriers
(smp_wmb()). Protection is only needed for preemption on the same cpu
(rescheduling or the nmi handler). This can be done using a compiler
barrier only.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75a0903..ad663d5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -673,7 +673,7 @@ try_generic:
 	/*
 	 * Make it visible before enabling the hw:
 	 */
-	smp_wmb();
+	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
 	 */
-	smp_wmb();
+	barrier();
 
 	/*
 	 * Drain the remaining delta count out of a counter
-- 
cgit v0.10.2


From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:03 +0200
Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu

This patch renames struct hw_perf_counter_ops into struct pmu. It
introduces a structure to describe a cpu specific pmu (performance
monitoring unit). It may contain ops and data. The new name of the
structure fits better, is shorter, and thus better to handle. Where it
was appropriate, names of function and variable have been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd76d0f..d9bbe5e 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
 	return 0;
 }
 
-static void power_perf_read(struct perf_counter *counter)
+static void power_pmu_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
 
@@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable)
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-			power_perf_read(counter);
+			power_pmu_read(counter);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
 		}
@@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 	counter->oncpu = cpu;
 	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
-		counter->hw_ops->enable(counter);
+		counter->pmu->enable(counter);
 }
 
 /*
@@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
  * re-enable the PMU in order to get hw_perf_restore to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_perf_enable(struct perf_counter *counter)
+static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
@@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter)
 /*
  * Remove a counter from the PMU.
  */
-static void power_perf_disable(struct perf_counter *counter)
+static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
@@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_save(flags);
 	pmudis = hw_perf_save_disable();
 
-	power_perf_read(counter);
+	power_pmu_read(counter);
 
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
@@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
-struct hw_perf_counter_ops power_perf_ops = {
-	.enable = power_perf_enable,
-	.disable = power_perf_disable,
-	.read = power_perf_read
+struct pmu power_pmu = {
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
 };
 
 /* Number of perf_counters counting hardware events */
@@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	unsigned long ev;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
@@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	if (err)
 		return ERR_PTR(err);
-	return &power_perf_ops;
+	return &power_pmu;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ad663d5..95de980 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_generic_disable(struct perf_counter *counter,
-			   struct hw_perf_counter *hwc, unsigned int idx)
+__x86_pmu_disable(struct perf_counter *counter,
+		  struct hw_perf_counter *hwc, unsigned int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter,
 }
 
 static void
-__pmc_generic_enable(struct perf_counter *counter,
-			  struct hw_perf_counter *hwc, int idx)
+__x86_pmu_enable(struct perf_counter *counter,
+		 struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
@@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static int pmc_generic_enable(struct perf_counter *counter)
+static int x86_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -667,7 +667,7 @@ try_generic:
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	/*
@@ -676,7 +676,7 @@ try_generic:
 	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__pmc_generic_enable(counter, hwc, idx);
+	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
 }
@@ -731,13 +731,13 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void pmc_generic_disable(struct perf_counter *counter)
+static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__pmc_generic_enable(counter, hwc, idx);
+		__x86_pmu_enable(counter, hwc, idx);
 }
 
 /*
@@ -805,7 +805,7 @@ again:
 
 		perf_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__pmc_generic_disable(counter, &counter->hw, bit);
+			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
@@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void pmc_generic_read(struct perf_counter *counter)
+static void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
-static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.enable		= pmc_generic_enable,
-	.disable	= pmc_generic_disable,
-	.read		= pmc_generic_read,
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
 };
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
@@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	if (err)
 		return ERR_PTR(err);
 
-	return &x86_perf_counter_ops;
+	return &pmu;
 }
 
 /*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index be10b3f..c3db52d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -334,9 +334,9 @@ struct hw_perf_counter {
 struct perf_counter;
 
 /**
- * struct hw_perf_counter_ops - performance counter hw ops
+ * struct pmu - generic performance monitoring unit
  */
-struct hw_perf_counter_ops {
+struct pmu {
 	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
@@ -381,7 +381,7 @@ struct perf_counter {
 	struct list_head		sibling_list;
 	int 				nr_siblings;
 	struct perf_counter		*group_leader;
-	const struct hw_perf_counter_ops *hw_ops;
+	const struct pmu		*pmu;
 
 	enum perf_counter_active_state	state;
 	enum perf_counter_active_state	prev_state;
@@ -519,8 +519,7 @@ struct perf_cpu_context {
  */
 extern int perf_max_counters;
 
-extern const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter);
+extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0939609..582108a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-extern __weak const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	return NULL;
 }
@@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter,
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_stopped = ctx->time;
-	counter->hw_ops->disable(counter);
+	counter->pmu->disable(counter);
 	counter->oncpu = -1;
 
 	if (!is_software_counter(counter))
@@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter,
 	 */
 	smp_wmb();
 
-	if (counter->hw_ops->enable(counter)) {
+	if (counter->pmu->enable(counter)) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->oncpu = -1;
 		return -EAGAIN;
@@ -1096,7 +1095,7 @@ static void __read(void *info)
 	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
-	counter->hw_ops->read(counter);
+	counter->pmu->read(counter);
 	update_counter_times(counter);
 	local_irq_restore(flags);
 }
@@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		leader = counter->group_leader;
 		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 			if (sub != counter)
-				sub->hw_ops->read(sub);
+				sub->pmu->read(sub);
 
 			group_entry.event = sub->hw_event.config;
 			group_entry.counter = atomic64_read(&sub->count);
@@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	struct pt_regs *regs;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->hw_ops->read(counter);
+	counter->pmu->read(counter);
 
 	regs = get_irq_regs();
 	/*
@@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter)
 	perf_swcounter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_generic = {
+static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
@@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 	cpu_clock_perf_counter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
+static const struct pmu perf_ops_cpu_clock = {
 	.enable		= cpu_clock_perf_counter_enable,
 	.disable	= cpu_clock_perf_counter_disable,
 	.read		= cpu_clock_perf_counter_read,
@@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 	task_clock_perf_counter_update(counter, time);
 }
 
-static const struct hw_perf_counter_ops perf_ops_task_clock = {
+static const struct pmu perf_ops_task_clock = {
 	.enable		= task_clock_perf_counter_enable,
 	.disable	= task_clock_perf_counter_disable,
 	.read		= task_clock_perf_counter_read,
@@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
 	cpu_migrations_perf_counter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
+static const struct pmu perf_ops_cpu_migrations = {
 	.enable		= cpu_migrations_perf_counter_enable,
 	.disable	= cpu_migrations_perf_counter_disable,
 	.read		= cpu_migrations_perf_counter_read,
@@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 	ftrace_profile_disable(perf_event_id(&counter->hw_event));
 }
 
-static const struct hw_perf_counter_ops *
-tp_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
 	int event_id = perf_event_id(&counter->hw_event);
 	int ret;
@@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter)
 	return &perf_ops_generic;
 }
 #else
-static const struct hw_perf_counter_ops *
-tp_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
 	return NULL;
 }
 #endif
 
-static const struct hw_perf_counter_ops *
-sw_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
-	const struct hw_perf_counter_ops *hw_ops = NULL;
+	const struct pmu *pmu = NULL;
 	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
@@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 */
 	switch (perf_event_id(&counter->hw_event)) {
 	case PERF_COUNT_CPU_CLOCK:
-		hw_ops = &perf_ops_cpu_clock;
+		pmu = &perf_ops_cpu_clock;
 
 		if (hw_event->irq_period && hw_event->irq_period < 10000)
 			hw_event->irq_period = 10000;
@@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 		 * use the cpu_clock counter instead.
 		 */
 		if (counter->ctx->task)
-			hw_ops = &perf_ops_task_clock;
+			pmu = &perf_ops_task_clock;
 		else
-			hw_ops = &perf_ops_cpu_clock;
+			pmu = &perf_ops_cpu_clock;
 
 		if (hw_event->irq_period && hw_event->irq_period < 10000)
 			hw_event->irq_period = 10000;
@@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		hw_ops = &perf_ops_generic;
+		pmu = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
 		if (!counter->hw_event.exclude_kernel)
-			hw_ops = &perf_ops_cpu_migrations;
+			pmu = &perf_ops_cpu_migrations;
 		break;
 	}
 
-	if (hw_ops)
+	if (pmu)
 		hwc->irq_period = hw_event->irq_period;
 
-	return hw_ops;
+	return pmu;
 }
 
 /*
@@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
-	const struct hw_perf_counter_ops *hw_ops;
+	const struct pmu *pmu;
 	struct perf_counter *counter;
 	long err;
 
@@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->group_leader		= group_leader;
-	counter->hw_ops			= NULL;
+	counter->pmu			= NULL;
 	counter->ctx			= ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
-	hw_ops = NULL;
+	pmu = NULL;
 
 	if (perf_event_raw(hw_event)) {
-		hw_ops = hw_perf_counter_init(counter);
+		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
 	switch (perf_event_type(hw_event)) {
 	case PERF_TYPE_HARDWARE:
-		hw_ops = hw_perf_counter_init(counter);
+		pmu = hw_perf_counter_init(counter);
 		break;
 
 	case PERF_TYPE_SOFTWARE:
-		hw_ops = sw_perf_counter_init(counter);
+		pmu = sw_perf_counter_init(counter);
 		break;
 
 	case PERF_TYPE_TRACEPOINT:
-		hw_ops = tp_perf_counter_init(counter);
+		pmu = tp_perf_counter_init(counter);
 		break;
 	}
 done:
 	err = 0;
-	if (!hw_ops)
+	if (!pmu)
 		err = -EINVAL;
-	else if (IS_ERR(hw_ops))
-		err = PTR_ERR(hw_ops);
+	else if (IS_ERR(pmu))
+		err = PTR_ERR(pmu);
 
 	if (err) {
 		kfree(counter);
 		return ERR_PTR(err);
 	}
 
-	counter->hw_ops = hw_ops;
+	counter->pmu = pmu;
 
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
-- 
cgit v0.10.2


From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:04 +0200
Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu

This patch renames struct pmc_x86_ops into struct x86_pmu. It
introduces a structure to describe an x86 model specific pmu
(performance monitoring unit). It may contain ops and data. The new
name of the structure fits better, is shorter, and thus better to
handle. Where it was appropriate, names of function and variable have
been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 95de980..808a1a1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,9 +44,9 @@ struct cpu_hw_counters {
 };
 
 /*
- * struct pmc_x86_ops - performance counter x86 ops
+ * struct x86_pmu - generic x86 pmu
  */
-struct pmc_x86_ops {
+struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -60,7 +60,7 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops __read_mostly;
+static struct x86_pmu *x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static u64 pmc_intel_event_map(int event)
+static u64 intel_pmu_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
-static u64 pmc_intel_raw_event(u64 event)
+static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
@@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static u64 pmc_amd_event_map(int event)
+static u64 amd_pmu_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
-static u64 pmc_amd_raw_event(u64 event)
+static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(pmc_ops->perfctr + i);
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 pmc_intel_save_disable_all(void)
+static u64 intel_pmu_save_disable_all(void)
 {
 	u64 ctrl;
 
@@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
-static u64 pmc_amd_save_disable_all(void)
+static u64 amd_pmu_save_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int enabled, idx;
@@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void)
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
-	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 * counters proper, so that amd_pmu_enable_counter() does the
+	 * right thing.
 	 */
 	barrier();
 
@@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->save_disable_all();
+	return x86_pmu->save_disable_all();
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void pmc_intel_restore_all(u64 ctrl)
+static void intel_pmu_restore_all(u64 ctrl)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
-static void pmc_amd_restore_all(u64 ctrl)
+static void amd_pmu_restore_all(u64 ctrl)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
@@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->restore_all(ctrl);
+	x86_pmu->restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 pmc_intel_get_status(u64 mask)
+static u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
@@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask)
 	return status;
 }
 
-static u64 pmc_amd_get_status(u64 mask)
+static u64 amd_pmu_get_status(u64 mask)
 {
 	u64 status = 0;
 	int idx;
@@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->get_status(mask);
+	return x86_pmu->get_status(mask);
 }
 
-static void pmc_intel_ack_status(u64 ack)
+static void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void pmc_amd_ack_status(u64 ack)
+static void amd_pmu_ack_status(u64 ack)
 {
 }
 
@@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->ack_status(ack);
+	x86_pmu->ack_status(ack);
 }
 
-static void pmc_intel_enable(int idx, u64 config)
+static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
 			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void pmc_amd_enable(int idx, u64 config)
+static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->enable(idx, config);
+	x86_pmu->enable(idx, config);
 }
 
-static void pmc_intel_disable(int idx, u64 config)
+static void intel_pmu_disable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
 }
 
-static void pmc_amd_disable(int idx, u64 config)
+static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->disable(idx, config);
+	x86_pmu->disable(idx, config);
 }
 
 static inline void
@@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -661,8 +662,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = pmc_ops->eventsel;
-		hwc->counter_base = pmc_ops->perfctr;
+		hwc->config_base  = x86_pmu->eventsel;
+		hwc->counter_base = x86_pmu->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -710,8 +711,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
-		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-static struct pmc_x86_ops pmc_intel_ops = {
-	.save_disable_all	= pmc_intel_save_disable_all,
-	.restore_all		= pmc_intel_restore_all,
-	.get_status		= pmc_intel_get_status,
-	.ack_status		= pmc_intel_ack_status,
-	.enable			= pmc_intel_enable,
-	.disable		= pmc_intel_disable,
+static struct x86_pmu intel_pmu = {
+	.save_disable_all	= intel_pmu_save_disable_all,
+	.restore_all		= intel_pmu_restore_all,
+	.get_status		= intel_pmu_get_status,
+	.ack_status		= intel_pmu_ack_status,
+	.enable			= intel_pmu_enable_counter,
+	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
-	.event_map		= pmc_intel_event_map,
-	.raw_event		= pmc_intel_raw_event,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
-static struct pmc_x86_ops pmc_amd_ops = {
-	.save_disable_all	= pmc_amd_save_disable_all,
-	.restore_all		= pmc_amd_restore_all,
-	.get_status		= pmc_amd_get_status,
-	.ack_status		= pmc_amd_ack_status,
-	.enable			= pmc_amd_enable,
-	.disable		= pmc_amd_disable,
+static struct x86_pmu amd_pmu = {
+	.save_disable_all	= amd_pmu_save_disable_all,
+	.restore_all		= amd_pmu_restore_all,
+	.get_status		= amd_pmu_get_status,
+	.ack_status		= amd_pmu_ack_status,
+	.enable			= amd_pmu_enable_counter,
+	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= pmc_amd_event_map,
-	.raw_event		= pmc_amd_raw_event,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct pmc_x86_ops *pmc_intel_init(void)
+static struct x86_pmu *intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &pmc_intel_ops;
+	return &intel_pmu;
 }
 
-static struct pmc_x86_ops *pmc_amd_init(void)
+static struct x86_pmu *amd_pmu_init(void)
 {
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
@@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-	return &pmc_amd_ops;
+	return &amd_pmu;
 }
 
 void __init init_hw_perf_counters(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		pmc_ops = pmc_intel_init();
+		x86_pmu = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		pmc_ops = pmc_amd_init();
+		x86_pmu = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!pmc_ops)
+	if (!x86_pmu)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
cgit v0.10.2


From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:05 +0200
Subject: perf_counter, x86: make interrupt handler model specific

This separates the perfcounter interrupt handler for AMD and Intel
cpus. The AMD interrupt handler implementation is a follow-on patch.

[ Impact: refactor and clean up code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 808a1a1..9d90de0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -4,6 +4,7 @@
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2009 Jaswinder Singh Rajput
+ *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -47,6 +48,7 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
+	/* disable temporarily */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -ENOSYS;
+
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
@@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
@@ -827,6 +833,8 @@ out:
 	return ret;
 }
 
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
@@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	__smp_perf_counter_interrupt(regs, 0);
+	x86_pmu->handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = __smp_perf_counter_interrupt(regs, 1);
+	ret = x86_pmu->handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
 	.get_status		= intel_pmu_get_status,
@@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
 	.get_status		= amd_pmu_get_status,
-- 
cgit v0.10.2


From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:06 +0200
Subject: perf_counter, x86: remove get_status() from struct x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9d90de0..d0bb029 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	u64		(*get_status)(u64);
 	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
@@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-	return status;
-}
-
-static u64 amd_pmu_get_status(u64 mask)
-{
-	u64 status = 0;
-	int idx;
-
-	for (idx = 0; idx < nr_counters_generic; idx++) {
-		s64 val;
-
-		if (!(mask & (1 << idx)))
-			continue;
-
-		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
-		val <<= (64 - counter_value_bits);
-		if (val >= 0)
-			status |= (1 << idx);
-	}
-
-	return status;
-}
-
-static u64 hw_perf_get_status(u64 mask)
-{
 	if (unlikely(!perf_counters_initialized))
 		return 0;
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
-	return x86_pmu->get_status(mask);
+	return status;
 }
 
 static void intel_pmu_ack_status(u64 ack)
@@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
@@ -820,7 +793,7 @@ again:
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.get_status		= intel_pmu_get_status,
 	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
@@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.get_status		= amd_pmu_get_status,
 	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
-- 
cgit v0.10.2


From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:07 +0200
Subject: perf_counter, x86: remove ack_status() from struct x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d0bb029..6bbdc16 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
@@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask)
 	return status;
 }
 
-static void intel_pmu_ack_status(u64 ack)
+static inline void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void amd_pmu_ack_status(u64 ack)
-{
-}
-
-static void hw_perf_ack_status(u64 ack)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	x86_pmu->ack_status(ack);
-}
-
 static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
@@ -788,7 +775,7 @@ again:
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
-	hw_perf_ack_status(ack);
+	intel_pmu_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
-- 
cgit v0.10.2


From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:08 +0200
Subject: perf_counter, x86: rename __hw_perf_counter_set_period into
 x86_perf_counter_set_period

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6bbdc16..fa6541d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * To be called with the counter disabled in hw:
  */
 static void
-__hw_perf_counter_set_period(struct perf_counter *counter,
+x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
@@ -642,7 +642,7 @@ try_generic:
 	 */
 	barrier();
 
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
@@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	int idx = hwc->idx;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__x86_pmu_enable(counter, hwc, idx);
-- 
cgit v0.10.2


From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:09 +0200
Subject: perf_counter, x86: rename intel only functions

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fa6541d..5a52d73 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void perf_save_and_restart(struct perf_counter *counter)
+static void intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
@@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 	int ret = 0;
 
-	cpuc->throttle_ctrl = hw_perf_save_disable();
+	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
 	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
@@ -770,7 +770,7 @@ again:
 		if (!counter)
 			continue;
 
-		perf_save_and_restart(counter);
+		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
@@ -788,7 +788,7 @@ out:
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		hw_perf_restore(cpuc->throttle_ctrl);
+		intel_pmu_restore_all(cpuc->throttle_ctrl);
 
 	return ret;
 }
-- 
cgit v0.10.2


From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:10 +0200
Subject: perf_counter, x86: modify initialization of struct x86_pmu

This patch adds an error handler and changes initialization of struct
x86_pmu. No functional changes. Needed for follow-on patches.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a52d73..7c72a94 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = {
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct x86_pmu *intel_pmu_init(void)
+static int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void)
 	unsigned int ebx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return NULL;
+		return -ENODEV;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void)
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return NULL;
+		return -ENODEV;
 
 	intel_perfmon_version = eax.split.version_id;
 	if (intel_perfmon_version < 2)
-		return NULL;
+		return -ENODEV;
 
 	pr_info("Intel Performance Monitoring support detected.\n");
 	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
+	x86_pmu = &intel_pmu;
+
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &intel_pmu;
+	return 0;
 }
 
-static struct x86_pmu *amd_pmu_init(void)
+static int amd_pmu_init(void)
 {
+	x86_pmu = &amd_pmu;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
 	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
-
-	return &amd_pmu;
+	return 0;
 }
 
 void __init init_hw_perf_counters(void)
 {
+	int err;
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		x86_pmu = intel_pmu_init();
+		err = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		x86_pmu = amd_pmu_init();
+		err = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!x86_pmu)
+	if (err != 0)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
cgit v0.10.2


From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:11 +0200
Subject: perf_counter, x86: make x86_pmu data a static struct

Instead of using a pointer to reference to the x86 pmu we now have one
single data structure that is initialized at the beginning. This saves
the pointer access when using this memory.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7c72a94..68597d7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -60,7 +60,7 @@ struct x86_pmu {
 	int		max_events;
 };
 
-static struct x86_pmu *x86_pmu __read_mostly;
+static struct x86_pmu x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(x86_pmu->perfctr + i);
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return x86_pmu->save_disable_all();
+	return x86_pmu.save_disable_all();
 }
 /*
  * Exported because of ACPI idle
@@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->restore_all(ctrl);
+	x86_pmu.restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
@@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->enable(idx, config);
+	x86_pmu.enable(idx, config);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->disable(idx, config);
+	x86_pmu.disable(idx, config);
 }
 
 static inline void
@@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -628,8 +628,8 @@ try_generic:
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = x86_pmu->eventsel;
-		hwc->counter_base = x86_pmu->perfctr;
+		hwc->config_base  = x86_pmu.eventsel;
+		hwc->counter_base = x86_pmu.perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -677,8 +677,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
-		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	x86_pmu->handle_irq(regs, 0);
+	x86_pmu.handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu->handle_irq(regs, 1);
+	ret = x86_pmu.handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -940,7 +940,7 @@ static int intel_pmu_init(void)
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-	x86_pmu = &intel_pmu;
+	x86_pmu = intel_pmu;
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -951,7 +951,7 @@ static int intel_pmu_init(void)
 
 static int amd_pmu_init(void)
 {
-	x86_pmu = &amd_pmu;
+	x86_pmu = amd_pmu;
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-- 
cgit v0.10.2


From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:12 +0200
Subject: perf_counter, x86: move counter parameters to struct x86_pmu

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 68597d7..75dbb1f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,16 +24,7 @@
 #include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
-
-/*
- * Number of (generic) HW counters:
- */
-static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
-static u64 counter_value_mask __read_mostly;
-static int counter_value_bits __read_mostly;
-
-static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
@@ -58,6 +49,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		counter_bits;
+	u64		counter_mask;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -199,7 +194,7 @@ eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = nr_counters_generic;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -215,7 +210,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void)
 	 */
 	barrier();
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	if (!ctrl)
 		return;
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
-			     (u64)(-left) & counter_value_mask);
+			     (u64)(-left) & x86_pmu.counter_mask);
 }
 
 static inline void
@@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		/* Try to get the previous generic counter again */
 		if (test_and_set_bit(idx, cpuc->used)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-			if (idx == nr_counters_generic)
+			idx = find_first_zero_bit(cpuc->used,
+						  x86_pmu.num_counters);
+			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
 			set_bit(idx, cpuc->used);
@@ -654,7 +650,7 @@ void perf_counter_print_debug(void)
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
-	if (!nr_counters_generic)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_disable();
@@ -676,7 +672,7 @@ void perf_counter_print_debug(void)
 	}
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -689,7 +685,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < nr_counters_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 4,
+	.counter_bits		= 48,
+	.counter_mask		= (1ULL << 48) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -941,10 +940,10 @@ static int intel_pmu_init(void)
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	x86_pmu = intel_pmu;
-
-	nr_counters_generic = eax.split.num_counters;
-	nr_counters_fixed = edx.split.num_counters_fixed;
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_pmu.counter_bits = eax.split.bit_width;
+	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
 	return 0;
 }
@@ -952,12 +951,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-
-	nr_counters_generic = 4;
-	nr_counters_fixed = 0;
-	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-	counter_value_bits = 48;
-
 	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
@@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
-	pr_info("... num counters:    %d\n", nr_counters_generic);
-	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
-		nr_counters_generic = X86_PMC_MAX_GENERIC;
+	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_counters_generic, X86_PMC_MAX_GENERIC);
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_counters_generic) - 1;
-	perf_max_counters = nr_counters_generic;
+	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
 
-	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_counters_fixed = X86_PMC_MAX_FIXED;
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_counters_fixed, X86_PMC_MAX_FIXED);
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
-	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+	perf_counter_mask |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
-- 
cgit v0.10.2


From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:13 +0200
Subject: perf_counter, x86: make pmu version generic

This makes the use of the version variable generic. Also, some debug
messages have been generalized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75dbb1f..15d2c03 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -39,6 +39,8 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	const char	*name;
+	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
@@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
-static __read_mostly int intel_perfmon_version;
-
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -658,7 +658,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (intel_perfmon_version >= 2) {
+	if (x86_pmu.version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
@@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
@@ -918,6 +920,7 @@ static int intel_pmu_init(void)
 	union cpuid10_eax eax;
 	unsigned int unused;
 	unsigned int ebx;
+	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return -ENODEV;
@@ -930,16 +933,12 @@ static int intel_pmu_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return -ENODEV;
 
-	intel_perfmon_version = eax.split.version_id;
-	if (intel_perfmon_version < 2)
+	version = eax.split.version_id;
+	if (version < 2)
 		return -ENODEV;
 
-	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", intel_perfmon_version);
-	pr_info("... bit width:       %d\n", eax.split.bit_width);
-	pr_info("... mask length:     %d\n", eax.split.mask_length);
-
 	x86_pmu = intel_pmu;
+	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
 	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
 	x86_pmu.counter_bits = eax.split.bit_width;
@@ -951,7 +950,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
 
@@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
+	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
+	pr_info("... version:         %d\n", x86_pmu.version);
+	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+
 	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-- 
cgit v0.10.2


From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:14 +0200
Subject: perf_counter, x86: make x86_pmu_read() static inline

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 15d2c03..3f3ae47 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_pmu_read(struct perf_counter *counter)
+static inline void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
-- 
cgit v0.10.2


From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:15 +0200
Subject: perf_counter, x86: rename cpuc->active_mask

This is to have a consistent naming scheme with cpuc->used.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f3ae47..9ec51a6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active_mask);
+	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, cpuc->active_mask);
+	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
cgit v0.10.2


From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:16 +0200
Subject: perf_counter, x86: generic use of cpuc->active

cpuc->active will now be used to indicate an enabled counter which
implies also valid pointers of cpuc->counters[]. In contrast,
cpuc->used only locks the counter, but it can be still uninitialized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9ec51a6..f7fd4a3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config)
 
 static void amd_pmu_disable_counter(int idx, u64 config)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
@@ -633,10 +629,7 @@ try_generic:
 	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	/*
-	 * Make it visible before enabling the hw:
-	 */
-	barrier();
+	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
@@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
+	/*
+	 * Must be done before we disable, otherwise the nmi handler
+	 * could reenable again:
+	 */
+	clear_bit(idx, cpuc->active);
 	__x86_pmu_disable(counter, hwc, idx);
 
-	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
 	/*
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
@@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * that we are disabling:
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used);
 }
 
 /*
@@ -763,7 +761,7 @@ again:
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!counter)
+		if (!test_bit(bit, cpuc->active))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
-- 
cgit v0.10.2


From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:17 +0200
Subject: perf_counter, x86: consistent use of type int for counter index

The type of counter index is sometimes implemented as unsigned
int. This patch changes this to have a consistent usage of int.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7fd4a3..d8beebe 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config)
 
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, unsigned int __idx)
+		    struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter,
 
 static inline void
 __x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, unsigned int idx)
+		  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 static inline void
 __pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, unsigned int __idx)
+		   struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned int idx = hwc->idx;
+	int idx = hwc->idx;
 
 	/*
 	 * Must be done before we disable, otherwise the nmi handler
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c3db52d..41aed42 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -318,7 +318,7 @@ struct hw_perf_counter {
 			unsigned long			config_base;
 			unsigned long			counter_base;
 			int				nmi;
-			unsigned int			idx;
+			int				idx;
 		};
 		union { /* software */
 			atomic64_t			count;
-- 
cgit v0.10.2


From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:18 +0200
Subject: perf_counter, x86: rework counter enable functions

There is vendor specific code in generic x86 code, and there is vendor
specific code that could be generic. This patch introduces
x86_pmu_enable_counter() for x86 generic code. Fixed counter code for
Intel is moved to Intel only functions. In the end, checks and calls
via function pointers were reduced to the necessary. Also, the
internal function i/f changed.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d8beebe..ae55933 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,7 +44,7 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*enable)(int, u64);
+	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
@@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void intel_pmu_enable_counter(int idx, u64 config)
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
-			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static void amd_pmu_enable_counter(int idx, u64 config)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (cpuc->enabled)
-		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-}
+	int err;
 
-static void hw_perf_enable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.enable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, int __idx)
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void
-__x86_pmu_enable(struct perf_counter *counter,
-		 struct hw_perf_counter *hwc, int idx)
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_enable(counter, hwc, idx);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_enable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_counter(hwc, idx);
 	else
-		hw_perf_enable(idx, hwc->config);
+		amd_pmu_disable_counter(idx, hwc->config);
 }
 
 static int
@@ -632,7 +628,7 @@ try_generic:
 	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
-	__x86_pmu_enable(counter, hwc, idx);
+	x86_pmu.enable(hwc, idx);
 
 	return 0;
 }
@@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__x86_pmu_enable(counter, hwc, idx);
+		intel_pmu_enable_counter(hwc, idx);
 }
 
 /*
-- 
cgit v0.10.2


From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:19 +0200
Subject: perf_counter, x86: rework counter disable functions

As for the enable function, this patch reworks the disable functions
and introduces x86_pmu_disable_counter(). The internal function i/f in
struct x86_pmu changed too.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ae55933..df9012b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -45,7 +45,7 @@ struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	void		(*enable)(struct hw_perf_counter *, int);
-	void		(*disable)(int, u64);
+	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void intel_pmu_disable_counter(int idx, u64 config)
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
-}
-
-static void amd_pmu_disable_counter(int idx, u64 config)
-{
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-
-}
+	int err;
 
-static void hw_perf_disable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.disable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config);
 }
 
 static inline void
-__pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, int __idx)
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, int idx)
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_disable(counter, hwc, idx);
-	else
-		hw_perf_disable(idx, hwc->config);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	x86_pmu_disable_counter(hwc, idx);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 	if (cpuc->enabled)
 		x86_pmu_enable_counter(hwc, idx);
 	else
-		amd_pmu_disable_counter(idx, hwc->config);
+		x86_pmu_disable_counter(hwc, idx);
 }
 
 static int
@@ -622,7 +620,7 @@ try_generic:
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	set_bit(idx, cpuc->active);
@@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * could reenable again:
 	 */
 	clear_bit(idx, cpuc->active);
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	/*
 	 * Make sure the cleared pointer becomes visible before we
@@ -762,7 +760,7 @@ again:
 
 		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__x86_pmu_disable(counter, &counter->hw, bit);
+			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
 	intel_pmu_ack_status(ack);
-- 
cgit v0.10.2


From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:20 +0200
Subject: perf_counter, x86: change and remove pmu initialization checks

Some functions are only called if the pmu was proper initialized. That
initalization checks can be removed. The way to check initialization
changed too. Now, the pointer to the interrupt handler is checked. If
it exists the pmu is initialized. This also removes a static variable
and uses struct x86_pmu as only data source for the check.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index df9012b..2d3681b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -23,7 +23,6 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
-static bool perf_counters_initialized __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
@@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+static inline int x86_pmu_initialized(void)
+{
+	return x86_pmu.handle_irq != NULL;
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return -ENOSYS;
 
-	if (unlikely(!perf_counters_initialized))
-		return -EINVAL;
+	if (!x86_pmu_initialized())
+		return -ENODEV;
 
 	err = 0;
 	if (atomic_inc_not_zero(&num_counters)) {
@@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void)
 
 u64 hw_perf_save_disable(void)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return 0;
-
 	return x86_pmu.save_disable_all();
 }
 /*
@@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl)
 
 void hw_perf_restore(u64 ctrl)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return;
-
 	x86_pmu.restore_all(ctrl);
 }
 /*
@@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
 	return status;
@@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack)
 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
@@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config);
 }
@@ -787,10 +779,10 @@ void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+	if (!x86_pmu_initialized())
 		return;
 
-	if (unlikely(!perf_counters_initialized))
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-	if (!perf_counters_initialized)
+	if (!x86_pmu_initialized())
 		return;
+
 	/*
 	 * Enable the performance counter vector in the APIC LVT:
 	 */
@@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void)
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-- 
cgit v0.10.2


From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:21 +0200
Subject: perf_counter, x86: implement the interrupt handler for AMD cpus

This patch implements the interrupt handler for AMD performance
counters. In difference to the Intel pmu, there is no single status
register and also there are no fixed counters. This makes the handler
very different and it is useful to make the handler vendor
specific. To check if a counter is overflowed the upper bit of the
counter is checked. Only counters where the active bit is set are
checked.

With this patch throttling is enabled for AMD performance counters.

This patch also reenables Linux performance counters on AMD cpus.

[ Impact: re-enable perfcounters on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2d3681b..f4d59d4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
-	/* disable temporarily */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return -ENOSYS;
-
 	if (!x86_pmu_initialized())
 		return -ENODEV;
 
@@ -773,7 +769,43 @@ out:
 	return ret;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+{
+	int cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	u64 val;
+	int handled = 0;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int idx;
+
+	++cpuc->interrupts;
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active))
+			continue;
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+		x86_perf_counter_update(counter, hwc, idx);
+		val = atomic64_read(&hwc->prev_count);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+		/* counter overflow */
+		x86_perf_counter_set_period(counter, hwc, idx);
+		handled = 1;
+		inc_irq_stat(apic_perf_irqs);
+		if (perf_counter_overflow(counter, nmi, regs, 0))
+			amd_pmu_disable_counter(hwc, idx);
+		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			/*
+			 * do not reenable when throttled, but reload
+			 * the register
+			 */
+			amd_pmu_disable_counter(hwc, idx);
+		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+			amd_pmu_enable_counter(hwc, idx);
+	}
+	return handled;
+}
 
 void perf_counter_unthrottle(void)
 {
@@ -782,9 +814,6 @@ void perf_counter_unthrottle(void)
 	if (!x86_pmu_initialized())
 		return;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-- 
cgit v0.10.2


From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:22 +0200
Subject: perf_counter, x86: return raw count with x86_perf_counter_update()

To check on AMD cpus if a counter overflows, the upper bit of the raw
counter value must be checked. This value is already internally
available in x86_perf_counter_update(). Now, the value is returned so
that it can be used directly to check for overflows.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f4d59d4..a8a53ab 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event)
  * Can only be executed on the CPU where the counter is active.
  * Returns the delta events processed.
  */
-static void
+static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
@@ -165,6 +165,8 @@ again:
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
 }
 
 static atomic_t num_counters;
@@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-		x86_perf_counter_update(counter, hwc, idx);
-		val = atomic64_read(&hwc->prev_count);
+		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 		/* counter overflow */
-- 
cgit v0.10.2


From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:23 +0200
Subject: perf_counter, x86: introduce max_period variable

In x86 pmus the allowed counter period to programm differs. This
introduces a max_period value and allows the generic implementation
for all models to check the max period.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a8a53ab..4b8715b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -54,6 +54,7 @@ struct x86_pmu {
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	u64		max_period;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
-	/*
-	 * Intel PMCs cannot be accessed sanely above 32 bit width,
-	 * so we install an artificial 1<<31 period regardless of
-	 * the generic counter period:
-	 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-			hwc->irq_period = 0x7FFFFFFF;
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
+		hwc->irq_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
 };
 
 static struct x86_pmu amd_pmu = {
@@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = {
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = x86_pmu.num_counters;
 
 	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
 
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-- 
cgit v0.10.2


From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:24 +0200
Subject: perf_counter, x86: remove vendor check in fixed_mode_idx()

The function fixed_mode_idx() is used generically. Now it checks the
num_counters_fixed value instead of the vendor to decide if fixed
counters are present.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 4b8715b..d1c8036 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
 	if (unlikely(hwc->nmi))
-- 
cgit v0.10.2


From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:25 +0200
Subject: perf_counter, x86: remove unused function argument in
 intel_pmu_get_status()

The mask argument is unused and thus can be removed.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d1c8036..856b0b8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static inline u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
 
@@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (!status)
 		goto out;
 
@@ -753,7 +753,7 @@ again:
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (status)
 		goto again;
 out:
-- 
cgit v0.10.2


From 38105f0234d4795c77c7c6845916caf3a395c451 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:26 +0200
Subject: perf_counter: update 'perf top' documentation

The documentation about the perf-top build was outdated after
perfstat has been implemented. This updates it.

[ Impact: update documentation ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-30-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 6a276d2..8d28864 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+     make -C Documentation/perf_counter/
 
    Sample output:
 
-- 
cgit v0.10.2


From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 14:52:50 +0200
Subject: perf_counter: add/update copyrights

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 856b0b8..47e563b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1,10 +1,11 @@
 /*
  * Performance counter x86 architecture code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *  Copyright(C) 2009 Jaswinder Singh Rajput
- *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 582108a..a95a171 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1,9 +1,9 @@
 /*
  * Performance counter core code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licensing details see kernel-base/COPYING
  */
-- 
cgit v0.10.2


From ab7ef2e50a557af92f4f90689f51fadadafc16b2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Apr 2009 22:38:51 +1000
Subject: perf_counter: powerpc: allow use of limited-function counters

POWER5+ and POWER6 have two hardware counters with limited functionality:
PMC5 counts instructions completed in run state and PMC6 counts cycles
in run state.  (Run state is the state when a hardware RUN bit is 1;
the idle task clears RUN while waiting for work to do and sets it when
there is work to do.)

These counters can't be written to by the kernel, can't generate
interrupts, and don't obey the freeze conditions.  That means we can
only use them for per-task counters (where we know we'll always be in
run state; we can't put a per-task counter on an idle task), and only
if we don't want interrupts and we do want to count in all processor
modes.

Obviously some counters can't go on a limited hardware counter, but there
are also situations where we can only put a counter on a limited hardware
counter - if there are already counters on that exclude some processor
modes and we want to put on a per-task cycle or instruction counter that
doesn't exclude any processor mode, it could go on if it can use a
limited hardware counter.

To keep track of these constraints, this adds a flags argument to the
processor-specific get_alternatives() functions, with three bits defined:
one to say that we can accept alternative event codes that go on limited
counters, one to say we only want alternatives on limited counters, and
one to say that this is a per-task counter and therefore events that are
gated by run state are equivalent to those that aren't (e.g. a "cycles"
event is equivalent to a "cycles in run state" event).  These flags
are computed for each counter and stored in the counter->hw.counter_base
field (slightly wonky name for what it does, but it was an existing
unused field).

Since the limited counters don't freeze when we freeze the other counters,
we need some special handling to avoid getting skew between things counted
on the limited counters and those counted on normal counters.  To minimize
this skew, if we are using any limited counters, we read PMC5 and PMC6
immediately after setting and clearing the freeze bit.  This is done in
a single asm in the new write_mmcr0() function.

The code here is specific to PMC5 and PMC6 being the limited hardware
counters.  Being more general (e.g. having a bitmap of limited hardware
counter numbers) would have meant more complex code to read the limited
counters when freezing and unfreezing the normal counters, with
conditional branches, which would have increased the skew.  Since it
isn't necessary for the code to be more general at this stage, it isn't.

This also extends the back-ends for POWER5+ and POWER6 to be able to
handle up to 6 counters rather than the 4 they previously handled.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <18936.19035.163066.892208@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 9d7ff6d..56d66c3 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -12,6 +12,7 @@
 
 #define MAX_HWCOUNTERS		8
 #define MAX_EVENT_ALTERNATIVES	8
+#define MAX_LIMITED_HWCOUNTERS	2
 
 /*
  * This struct provides the constants and functions needed to
@@ -25,8 +26,11 @@ struct power_pmu {
 	int	(*compute_mmcr)(unsigned int events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
 	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	int	(*get_alternatives)(unsigned int event, unsigned int flags,
+				    unsigned int alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	(*limited_pmc_event)(unsigned int event);
+	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
 };
@@ -34,6 +38,13 @@ struct power_pmu {
 extern struct power_pmu *ppmu;
 
 /*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
+
+/*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
  * other events.
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d9bbe5e..15cdc8e 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -23,10 +23,14 @@ struct cpu_hw_counters {
 	int n_percpu;
 	int disabled;
 	int n_added;
+	int n_limited;
+	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
-	u8 pmcs_enabled;
+	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -127,7 +131,8 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], int n_ev)
+static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+				   int n_ev)
 {
 	u64 mask, value, nv;
 	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
@@ -144,11 +149,15 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 
 	/* First see if the events will go on as-is */
 	for (i = 0; i < n_ev; ++i) {
-		alternatives[i][0] = event[i];
+		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+		    && !ppmu->limited_pmc_event(event[i])) {
+			ppmu->get_alternatives(event[i], cflags[i],
+					       alternatives[i]);
+			event[i] = alternatives[i][0];
+		}
 		if (ppmu->get_constraint(event[i], &amasks[i][0],
 					 &avalues[i][0]))
 			return -1;
-		choice[i] = 0;
 	}
 	value = mask = 0;
 	for (i = 0; i < n_ev; ++i) {
@@ -166,7 +175,9 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	if (!ppmu->get_alternatives)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
-		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		choice[i] = 0;
+		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+						  alternatives[i]);
 		for (j = 1; j < n_alt[i]; ++j)
 			ppmu->get_constraint(alternatives[i][j],
 					     &amasks[i][j], &avalues[i][j]);
@@ -231,28 +242,41 @@ static int power_check_constraints(unsigned int event[], int n_ev)
  * exclude_{user,kernel,hv} with each other and any previously
  * added counters.
  */
-static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+			  int n_prev, int n_new)
 {
-	int eu, ek, eh;
-	int i, n;
+	int eu = 0, ek = 0, eh = 0;
+	int i, n, first;
 	struct perf_counter *counter;
 
 	n = n_prev + n_new;
 	if (n <= 1)
 		return 0;
 
-	eu = ctrs[0]->hw_event.exclude_user;
-	ek = ctrs[0]->hw_event.exclude_kernel;
-	eh = ctrs[0]->hw_event.exclude_hv;
-	if (n_prev == 0)
-		n_prev = 1;
-	for (i = n_prev; i < n; ++i) {
+	first = 1;
+	for (i = 0; i < n; ++i) {
+		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+			continue;
+		}
 		counter = ctrs[i];
-		if (counter->hw_event.exclude_user != eu ||
-		    counter->hw_event.exclude_kernel != ek ||
-		    counter->hw_event.exclude_hv != eh)
+		if (first) {
+			eu = counter->hw_event.exclude_user;
+			ek = counter->hw_event.exclude_kernel;
+			eh = counter->hw_event.exclude_hv;
+			first = 0;
+		} else if (counter->hw_event.exclude_user != eu ||
+			   counter->hw_event.exclude_kernel != ek ||
+			   counter->hw_event.exclude_hv != eh) {
 			return -EAGAIN;
+		}
 	}
+
+	if (eu || ek || eh)
+		for (i = 0; i < n; ++i)
+			if (cflags[i] & PPMU_LIMITED_PMC_OK)
+				cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
 	return 0;
 }
 
@@ -280,6 +304,85 @@ static void power_pmu_read(struct perf_counter *counter)
 }
 
 /*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+				    unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val, prev, delta;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		if (!counter->hw.idx)
+			continue;
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		prev = atomic64_read(&counter->hw.prev_count);
+		counter->hw.idx = 0;
+		delta = (val - prev) & 0xfffffffful;
+		atomic64_add(delta, &counter->count);
+	}
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+				  unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		counter->hw.idx = cpuhw->limited_hwidx[i];
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		atomic64_set(&counter->hw.prev_count, val);
+		perf_counter_update_userpage(counter);
+	}
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+	unsigned long pmc5, pmc6;
+
+	if (!cpuhw->n_limited) {
+		mtspr(SPRN_MMCR0, mmcr0);
+		return;
+	}
+
+	/*
+	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 */
+	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+		     : "=&r" (pmc5), "=&r" (pmc6)
+		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+	if (mmcr0 & MMCR0_FC)
+		freeze_limited_counters(cpuhw, pmc5, pmc6);
+	else
+		thaw_limited_counters(cpuhw, pmc5, pmc6);
+}
+
+/*
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
  */
@@ -321,7 +424,7 @@ u64 hw_perf_save_disable(void)
 		 * executed and the PMU has frozen the counters
 		 * before we return.
 		 */
-		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
 		mb();
 	}
 	local_irq_restore(flags);
@@ -342,6 +445,8 @@ void hw_perf_restore(u64 disable)
 	unsigned long val;
 	s64 left;
 	unsigned int hwc_index[MAX_HWCOUNTERS];
+	int n_lim;
+	int idx;
 
 	if (disable)
 		return;
@@ -414,10 +519,18 @@ void hw_perf_restore(u64 disable)
 	/*
 	 * Initialize the PMCs for all the new and moved counters.
 	 */
+	cpuhw->n_limited = n_lim = 0;
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx)
 			continue;
+		idx = hwc_index[i] + 1;
+		if (is_limited_pmc(idx)) {
+			cpuhw->limited_counter[n_lim] = counter;
+			cpuhw->limited_hwidx[n_lim] = idx;
+			++n_lim;
+			continue;
+		}
 		val = 0;
 		if (counter->hw_event.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
@@ -425,15 +538,16 @@ void hw_perf_restore(u64 disable)
 				val = 0x80000000L - left;
 		}
 		atomic64_set(&counter->hw.prev_count, val);
-		counter->hw.idx = hwc_index[i] + 1;
-		write_pmc(counter->hw.idx, val);
+		counter->hw.idx = idx;
+		write_pmc(idx, val);
 		perf_counter_update_userpage(counter);
 	}
+	cpuhw->n_limited = n_lim;
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
 	mb();
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	/*
 	 * Enable instruction sampling if necessary
@@ -448,7 +562,8 @@ void hw_perf_restore(u64 disable)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events)
+			  struct perf_counter *ctrs[], unsigned int *events,
+			  unsigned int *flags)
 {
 	int n = 0;
 	struct perf_counter *counter;
@@ -457,6 +572,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 		if (n >= max_count)
 			return -1;
 		ctrs[n] = group;
+		flags[n] = group->hw.counter_base;
 		events[n++] = group->hw.config;
 	}
 	list_for_each_entry(counter, &group->sibling_list, list_entry) {
@@ -465,6 +581,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 			if (n >= max_count)
 				return -1;
 			ctrs[n] = counter;
+			flags[n] = counter->hw.counter_base;
 			events[n++] = counter->hw.config;
 		}
 	}
@@ -497,12 +614,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	n0 = cpuhw->n_counters;
 	n = collect_events(group_leader, ppmu->n_counter - n0,
-			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+			   &cpuhw->counter[n0], &cpuhw->events[n0],
+			   &cpuhw->flags[n0]);
 	if (n < 0)
 		return -EAGAIN;
-	if (check_excludes(cpuhw->counter, n0, n))
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
 		return -EAGAIN;
-	if (power_check_constraints(cpuhw->events, n + n0))
+	i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+	if (i < 0)
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
 	cpuhw->n_added += n;
@@ -554,9 +673,10 @@ static int power_pmu_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
-	if (check_excludes(cpuhw->counter, n0, 1))
+	cpuhw->flags[n0] = counter->hw.counter_base;
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
 		goto out;
-	if (power_check_constraints(cpuhw->events, n0 + 1))
+	if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
 		goto out;
 
 	counter->hw.config = cpuhw->events[n0];
@@ -592,12 +712,24 @@ static void power_pmu_disable(struct perf_counter *counter)
 				cpuhw->counter[i-1] = cpuhw->counter[i];
 			--cpuhw->n_counters;
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-			write_pmc(counter->hw.idx, 0);
-			counter->hw.idx = 0;
+			if (counter->hw.idx) {
+				write_pmc(counter->hw.idx, 0);
+				counter->hw.idx = 0;
+			}
 			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
+	for (i = 0; i < cpuhw->n_limited; ++i)
+		if (counter == cpuhw->limited_counter[i])
+			break;
+	if (i < cpuhw->n_limited) {
+		while (++i < cpuhw->n_limited) {
+			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+		}
+		--cpuhw->n_limited;
+	}
 	if (cpuhw->n_counters == 0) {
 		/* disable exceptions if no counters are running */
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
@@ -613,6 +745,61 @@ struct pmu power_pmu = {
 	.read		= power_pmu_read,
 };
 
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+				 unsigned int flags)
+{
+	int n;
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+
+	if (counter->hw_event.exclude_user
+	    || counter->hw_event.exclude_kernel
+	    || counter->hw_event.exclude_hv
+	    || counter->hw_event.irq_period)
+		return 0;
+
+	if (ppmu->limited_pmc_event(ev))
+		return 1;
+
+	/*
+	 * The requested event isn't on a limited PMC already;
+	 * see if any alternative code goes on a limited PMC.
+	 */
+	if (!ppmu->get_alternatives)
+		return 0;
+
+	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (n)
+		return alt[0];
+
+	return 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static unsigned long normal_pmc_alternative(unsigned long ev,
+					    unsigned long flags)
+{
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	int n;
+
+	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (!n)
+		return 0;
+	return alt[0];
+}
+
 /* Number of perf_counters counting hardware events */
 static atomic_t num_counters;
 /* Used to avoid races in calling reserve/release_pmc_hardware */
@@ -633,9 +820,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev;
+	unsigned long ev, flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
 
@@ -661,7 +849,36 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
-	
+
+	/*
+	 * If this is a per-task counter, then we can use
+	 * PM_RUN_* events interchangeably with their non RUN_*
+	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+	 * XXX we should check if the task is an idle task.
+	 */
+	flags = 0;
+	if (counter->ctx->task)
+		flags |= PPMU_ONLY_COUNT_RUN;
+
+	/*
+	 * If this machine has limited counters, check whether this
+	 * event could go on a limited counter.
+	 */
+	if (ppmu->limited_pmc5_6) {
+		if (can_go_on_limited_pmc(counter, ev, flags)) {
+			flags |= PPMU_LIMITED_PMC_OK;
+		} else if (ppmu->limited_pmc_event(ev)) {
+			/*
+			 * The requested event is on a limited PMC,
+			 * but we can't use a limited PMC; see if any
+			 * alternative goes on a normal PMC.
+			 */
+			ev = normal_pmc_alternative(ev, flags);
+			if (!ev)
+				return ERR_PTR(-EINVAL);
+		}
+	}
+
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
@@ -670,18 +887,20 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	n = 0;
 	if (counter->group_leader != counter) {
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-				   ctrs, events);
+				   ctrs, events, cflags);
 		if (n < 0)
 			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
-	if (check_excludes(ctrs, n, 1))
+	cflags[n] = flags;
+	if (check_excludes(ctrs, cflags, n, 1))
 		return ERR_PTR(-EINVAL);
-	if (power_check_constraints(events, n + 1))
+	if (power_check_constraints(events, cflags, n + 1))
 		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
+	counter->hw.counter_base = cflags[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 
 	/*
@@ -763,6 +982,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	int found = 0;
 	int nmi;
 
+	if (cpuhw->n_limited)
+		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+					mfspr(SPRN_PMC6));
+
 	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
@@ -775,6 +998,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
+		if (is_limited_pmc(counter->hw.idx))
+			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
 			/* counter has overflowed */
@@ -791,6 +1016,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	if (!found) {
 		for (i = 0; i < ppmu->n_counter; ++i) {
+			if (is_limited_pmc(i + 1))
+				continue;
 			val = read_pmc(i + 1);
 			if ((int)val < 0)
 				write_pmc(i + 1, 0);
@@ -804,7 +1031,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * XXX might want to use MSR.PM to keep the counters frozen until
 	 * we get back out of this interrupt.
 	 */
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	if (nmi)
 		nmi_exit();
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 1407b19..744a275 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -320,7 +320,8 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+static int p4_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
 	int i, j, na;
 
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 1222c8e..8154eaa 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -78,8 +78,8 @@
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
- *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1
  *
  * NC - number of counters
  *     51: NC error 0x0008_0000_0000_0000
@@ -105,18 +105,18 @@
  *     30: IDU|GRS events needed 0x00_4000_0000
  *
  * B0
- *     20-23: Byte 0 event source 0x00f0_0000
+ *     24-27: Byte 0 event source 0x0f00_0000
  *	      Encoding as for the event code
  *
  * B1, B2, B3
- *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
  *
- * P4
- *     7: P1 error 0x80
- *     6-7: Count of events needing PMC4
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
  *
- * P1..P3
- *     0-6: Count of events needing PMC1..PMC3
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
  */
 
 static const int grsel_shift[8] = {
@@ -143,11 +143,13 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 6)
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
 		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+			return -1;
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
@@ -173,16 +175,26 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
 		}
 		/* Set byte lane select field */
-		mask  |= 0xfULL << (20 - 4 * byte);
-		value |= (u64)unit << (20 - 4 * byte);
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
 	}
-	mask  |= 0x8000000000000ull;
-	value |= 0x1000000000000ull;
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int power5p_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	3	/* at most 3 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
@@ -193,6 +205,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
 	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
 	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
 	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
 	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
 	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
@@ -260,24 +273,85 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5p_get_alternatives(unsigned int event, unsigned int flags,
+				    unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
+	int nlim;
 
 	alt[0] = event;
 	nalt = 1;
+	nlim = power5p_limited_pmc_event(event);
 	i = find_alternative(event);
 	if (i >= 0) {
 		for (j = 0; j < MAX_ALT; ++j) {
 			ae = event_alternatives[i][j];
 			if (ae && ae != event)
 				alt[nalt++] = ae;
+			nlim += power5p_limited_pmc_event(ae);
 		}
 	} else {
 		ae = find_alternative_bdecode(event);
 		if (ae > 0)
 			alt[nalt++] = ae;
 	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 3 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0xf:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x600005:	/* PM_RUN_CYC */
+				alt[j++] = 0xf;
+				break;
+			case 0x100009:	/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x100009;	/* PM_INST_CMPL */
+				alt[j++] = 0x200009;
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
@@ -390,7 +464,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char unituse[16];
 	int ttmuse;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 
 	/* First pass to count resource use */
@@ -399,7 +473,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
 		if (pmc) {
-			if (pmc > 4)
+			if (pmc > 6)
 				return -1;
 			if (pmc_inuse & (1 << (pmc - 1)))
 				return -1;
@@ -488,13 +562,16 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			if (pmc >= 4)
 				return -1;
 			pmc_inuse |= 1 << pmc;
-		} else {
+		} else if (pmc <= 4) {
 			/* Direct event */
 			--pmc;
 			if (isbus && (byte & 2) &&
 			    (psel == 8 || psel == 0x10 || psel == 0x28))
 				/* add events on higher-numbered bus */
 				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
 		}
 		if (isbus && unit == PM_GRS) {
 			bit = psel & 7;
@@ -538,7 +615,7 @@ static int power5p_generic_events[] = {
 };
 
 struct power_pmu power5p_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
 	.add_fields = 0x7000000000055ull,
 	.test_adder = 0x3000040000000ull,
@@ -548,4 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 116c4bb..6e667dc 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -269,7 +269,8 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5_get_alternatives(unsigned int event, unsigned int flags,
+				   unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index fce1fc2..d44049f 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -182,7 +182,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned int ttmset = 0;
 	unsigned int pmc_inuse = 0;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
@@ -202,6 +202,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			for (pmc = 0; pmc < 4; ++pmc)
 				if (!(pmc_inuse & (1 << pmc)))
 					break;
+			if (pmc >= 4)
+				return -1;
 			pmc_inuse |= 1 << pmc;
 		}
 		hwc[i] = pmc;
@@ -240,7 +242,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		if (power6_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
-		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+		if (pmc < 4)
+			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
 	if (pmc_inuse & 1)
@@ -256,19 +259,20 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  * Layout of constraint bits:
  *
  *	0-1	add field: number of uses of PMC1 (max 1)
- *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
- *	8-10	select field: nest (subunit) event selector
+ *	2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ *	12-15	add field: number of uses of PMC1-4 (max 4)
  *	16-19	select field: unit on byte 0 of event bus
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ *	32-34	select field: nest (subunit) event selector
  */
 static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, sh;
-	unsigned int mask = 0, value = 0;
+	int pmc, byte, sh, subunit;
+	u64 mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
@@ -276,26 +280,38 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
-		sh = byte * 4;
+		sh = byte * 4 + (16 - PM_UNIT_SH);
 		mask |= PM_UNIT_MSKS << sh;
-		value |= (event & PM_UNIT_MSKS) << sh;
+		value |= (u64)(event & PM_UNIT_MSKS) << sh;
 		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
-			mask |= PM_SUBUNIT_MSKS;
-			value |= event & PM_SUBUNIT_MSKS;
+			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+			mask  |= (u64)PM_SUBUNIT_MSK << 32;
+			value |= (u64)subunit << 32;
 		}
 	}
+	if (pmc <= 4) {
+		mask  |= 0x8000;	/* add field for count of PMC1-4 uses */
+		value |= 0x1000;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int p6_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	4	/* at most 4 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
 	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
 	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
-	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000a, 0x2000f4, 0x600005 },	/* PM_RUN_CYC */
 	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
 	{ 0x10000e, 0x400010 },			/* PM_PURR */
 	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
@@ -340,13 +356,15 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+static int p6_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
-	int i, j;
+	int i, j, nlim;
 	unsigned int aevent, psel, pmc;
 	unsigned int nalt = 1;
 
 	alt[0] = event;
+	nlim = p6_limited_pmc_event(event);
 
 	/* check the alternatives table */
 	i = find_alternatives_list(event);
@@ -358,6 +376,7 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				break;
 			if (aevent != event)
 				alt[nalt++] = aevent;
+			nlim += p6_limited_pmc_event(aevent);
 		}
 
 	} else {
@@ -375,13 +394,75 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
 	}
 
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC,
+		 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 4 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x10000a:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;	/* PM_CYC */
+				break;
+			case 2:		/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 2;		/* PM_INST_CMPL */
+				break;
+			case 0x10000e:	/* PM_PURR */
+				alt[j++] = 0x4000f4;	/* PM_RUN_PURR */
+				break;
+			case 0x4000f4:	/* PM_RUN_PURR */
+				alt[j++] = 0x10000e;	/* PM_PURR */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
 static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 {
 	/* Set PMCxSEL to 0 to disable PMCx */
-	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power6_generic_events[] = {
@@ -394,14 +475,16 @@ static int power6_generic_events[] = {
 };
 
 struct power_pmu power6_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
-	.add_fields = 0x55,
-	.test_adder = 0,
+	.add_fields = 0x1555,
+	.test_adder = 0x3000,
 	.compute_mmcr = p6_compute_mmcr,
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = p6_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index aed8ccd..af2d188 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -243,7 +243,8 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+static int p970_get_alternatives(unsigned int event, unsigned int flags,
+				 unsigned int alt[])
 {
 	alt[0] = event;
 
-- 
cgit v0.10.2


From b1fca26631f76a5e8b18435a43f5d82b8734da4b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 23 Mar 2009 18:22:09 +0100
Subject: mutex: add atomic_dec_and_mutex_lock()

Much like the atomic_dec_and_lock() function in which we take an hold a
spin_lock if we drop the atomic to 0 this function takes and holds the
mutex if we dec the atomic to 0.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.410913479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7..93054fc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+
 #endif
-- 
cgit v0.10.2


From 0c8b946e3ebb3846103486420ea7430a4b5e5b1b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 15 Apr 2009 17:48:18 +0200
Subject: vsprintf: introduce %pf format specifier

A printf format specifier which would allow us to print a pure
function name has been suggested by Andrew Morton a couple of
months ago.

The current %pF is very convenient to print a function symbol,
but often we only want to print the name of the function, without
its asm offset.

That's what  %pf does in this patch.  The lowecase f has been chosen
for its intuitive meaning of a 'weak kind of %pF'.

The support for this new format would be welcome by the tracing code
where the need to print pure function names is often needed. This is
also true for other parts of the kernel:

$ git-grep -E "kallsyms_lookup\(.+?\)"
arch/blackfin/kernel/traps.c:   symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
arch/powerpc/xmon/xmon.c:               name = kallsyms_lookup(pc, &size, &offset, NULL, tmpstr);
arch/sh/kernel/cpu/sh5/unwind.c:        sym = kallsyms_lookup(pc, NULL, &offset, NULL, namebuf);
arch/x86/kernel/ftrace.c:       kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
kernel/kprobes.c:               sym = kallsyms_lookup((unsigned long)p->addr, NULL,
kernel/lockdep.c:       return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
kernel/trace/ftrace.c:  kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
kernel/trace/trace_functions.c: kallsyms_lookup(ip, NULL, NULL, NULL, str);
kernel/trace/trace_output.c:    kallsyms_lookup(address, NULL, NULL, NULL, str);

Changes in v2:

- Add the explanation of the %pf role for vsnprintf() and bstr_printf()

- Change the comments by dropping the "asm offset" notion and only
  define the %pf against the actual function offset notion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090415154817.GC5989@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b56f6d0..756ccaf 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -575,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec)
 }
 
 static char *symbol_string(char *buf, char *end, void *ptr,
-				struct printf_spec spec)
+				struct printf_spec spec, char ext)
 {
 	unsigned long value = (unsigned long) ptr;
 #ifdef CONFIG_KALLSYMS
 	char sym[KSYM_SYMBOL_LEN];
-	sprint_symbol(sym, value);
+	if (ext != 'f')
+		sprint_symbol(sym, value);
+	else
+		kallsyms_lookup(value, NULL, NULL, NULL, sym);
 	return string(buf, end, sym, spec);
 #else
 	spec.field_width = 2*sizeof(void *);
@@ -692,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr,
  *
  * Right now we handle:
  *
- * - 'F' For symbolic function descriptor pointers
+ * - 'F' For symbolic function descriptor pointers with offset
+ * - 'f' For simple symbolic function names without offset
  * - 'S' For symbolic direct pointers
  * - 'R' For a struct resource pointer, it prints the range of
  *       addresses (not the name nor the flags)
@@ -715,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 
 	switch (*fmt) {
 	case 'F':
+	case 'f':
 		ptr = dereference_function_descriptor(ptr);
 		/* Fallthrough */
 	case 'S':
-		return symbol_string(buf, end, ptr, spec);
+		return symbol_string(buf, end, ptr, spec, *fmt);
 	case 'R':
 		return resource_string(buf, end, ptr, spec);
 	case 'm':
@@ -954,7 +959,8 @@ qualifier:
  *
  * This function follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  *
  * The return value is the number of characters which would
@@ -1412,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf);
  *
  * The format follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  * %n is ignored
  *
-- 
cgit v0.10.2


From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 16:55:56 +0200
Subject: perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask

Standardize on explicitly mentioning '_mask' in fields that
are not plain flags but masks. This avoids typos like:

       if (cpuc->used)

(which could easily slip through review unnoticed), while if a
typo looks like this:

       if (cpuc->used_mask)

it might get noticed during review.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 47e563b..fc06f4d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
 	int			enabled;
@@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		 * Try to get the fixed counter, if that is already taken
 		 * then try to get a generic counter:
 		 */
-		if (test_and_set_bit(idx, cpuc->used))
+		if (test_and_set_bit(idx, cpuc->used_mask))
 			goto try_generic;
 
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
@@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	} else {
 		idx = hwc->idx;
 		/* Try to get the previous generic counter again */
-		if (test_and_set_bit(idx, cpuc->used)) {
+		if (test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used,
+			idx = find_first_zero_bit(cpuc->used_mask,
 						  x86_pmu.num_counters);
 			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
-			set_bit(idx, cpuc->used);
+			set_bit(idx, cpuc->used_mask);
 			hwc->idx = idx;
 		}
 		hwc->config_base  = x86_pmu.eventsel;
@@ -609,7 +609,7 @@ try_generic:
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	set_bit(idx, cpuc->active);
+	set_bit(idx, cpuc->active_mask);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	x86_pmu.enable(hwc, idx);
@@ -643,7 +643,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * Must be done before we disable, otherwise the nmi handler
 	 * could reenable again:
 	 */
-	clear_bit(idx, cpuc->active);
+	clear_bit(idx, cpuc->active_mask);
 	x86_pmu.disable(hwc, idx);
 
 	/*
@@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
 	cpuc->counters[idx] = NULL;
-	clear_bit(idx, cpuc->used);
+	clear_bit(idx, cpuc->used_mask);
 }
 
 /*
@@ -741,7 +741,7 @@ again:
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!test_bit(bit, cpuc->active))
+		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
@@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-- 
cgit v0.10.2


From 23b94b967f118bef941369238f33c8140be46539 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Wed, 29 Apr 2009 21:54:51 +0100
Subject: locking, rtmutex.c: Documentation cleanup

Two minor updates on functions documentation:
 - Updated documentation for function rt_mutex_unlock(), which contained an
   incorrect name
 - Removed extra '*' from comment in function rt_mutex_destroy()

[ Impact: cleanup ]

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090429205451.GA23154@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb9..013882e 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
 /**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- *				       the timeout structure is provided
- *				       by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *			the timeout structure is provided
+ *			by the caller
  *
  * @lock: 		the rt_mutex to be locked
  * @timeout:		timeout structure or NULL (no timeout)
@@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
-/***
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
-- 
cgit v0.10.2


From 88c48db9788862d0290831d081bc3c64e13b592f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 29 Apr 2009 14:00:25 -0400
Subject: SELinux: drop secondary_ops->sysctl

We are still calling secondary_ops->sysctl even though the capabilities
module does not define a sysctl operation.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ba808ef..dd19ba8 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1980,10 +1980,6 @@ static int selinux_sysctl(ctl_table *table, int op)
 	u32 tsid, sid;
 	int rc;
 
-	rc = secondary_ops->sysctl(table, op);
-	if (rc)
-		return rc;
-
 	sid = current_sid();
 
 	rc = selinux_sysctl_get_sid(table, (op == 0001) ?
-- 
cgit v0.10.2


From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 Apr 2009 13:45:05 +0100
Subject: SELinux: Don't flush inherited SIGKILL during execve()

Don't flush inherited SIGKILL during execve() in SELinux's post cred commit
hook.  This isn't really a security problem: if the SIGKILL came before the
credentials were changed, then we were right to receive it at the time, and
should honour it; if it came after the creds were changed, then we definitely
should honour it; and in any case, all that will happen is that the process
will be scrapped before it ever returns to userspace.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1d19c02..d3b787c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1875,6 +1875,7 @@ extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
+extern void __flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c88144..f93efec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -238,14 +238,19 @@ void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
+void __flush_signals(struct task_struct *t)
+{
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
+	flush_sigqueue(&t->pending);
+	flush_sigqueue(&t->signal->shared_pending);
+}
+
 void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	clear_tsk_thread_flag(t, TIF_SIGPENDING);
-	flush_sigqueue(&t->pending);
-	flush_sigqueue(&t->signal->shared_pending);
+	__flush_signals(t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dd19ba8..5a34511 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2394,11 +2394,12 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 		memset(&itimer, 0, sizeof itimer);
 		for (i = 0; i < 3; i++)
 			do_setitimer(i, &itimer, NULL);
-		flush_signals(current);
 		spin_lock_irq(&current->sighand->siglock);
-		flush_signal_handlers(current, 1);
-		sigemptyset(&current->blocked);
-		recalc_sigpending();
+		if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
+			__flush_signals(current);
+			flush_signal_handlers(current, 1);
+			sigemptyset(&current->blocked);
+		}
 		spin_unlock_irq(&current->sighand->siglock);
 	}
 
-- 
cgit v0.10.2


From ecd6de3c88e8cbcad175b2eab48ba05c2014f7b6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 29 Apr 2009 16:02:24 +0200
Subject: selinux: selinux_bprm_committed_creds() should wake up ->real_parent,
 not ->parent.

We shouldn't worry about the tracer if current is ptraced, exec() must not
succeed if the tracer has no rights to trace this task after cred changing.
But we should notify ->real_parent which is, well, real parent.

Also, we don't need _irq to take tasklist, and we don't need parent's
->siglock to wake_up_interruptible(real_parent->signal->wait_chldexit).
Since we hold tasklist, real_parent->signal must be stable. Otherwise
spin_lock(siglock) is not safe too and can't help anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5a34511..39046dd 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2371,10 +2371,8 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 {
 	const struct task_security_struct *tsec = current_security();
 	struct itimerval itimer;
-	struct sighand_struct *psig;
 	u32 osid, sid;
 	int rc, i;
-	unsigned long flags;
 
 	osid = tsec->osid;
 	sid = tsec->sid;
@@ -2405,12 +2403,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 
 	/* Wake up the parent if it is waiting so that it can recheck
 	 * wait permission to the new task SID. */
-	read_lock_irq(&tasklist_lock);
-	psig = current->parent->sighand;
-	spin_lock_irqsave(&psig->siglock, flags);
-	wake_up_interruptible(&current->parent->signal->wait_chldexit);
-	spin_unlock_irqrestore(&psig->siglock, flags);
-	read_unlock_irq(&tasklist_lock);
+	read_lock(&tasklist_lock);
+	wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+	read_unlock(&tasklist_lock);
 }
 
 /* superblock security operations */
-- 
cgit v0.10.2


From e6be3a25861429166f945499c7ee616875bc3db9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 12:56:37 +0900
Subject: sh: pass through ioremap() for non-mmu processors.

All 32-bit SuperH processors currently go through __ioremap_mode()
and check for IO_TRAPPED and directly mapped segments. With this
patch we simplify the MMU less case with a pass through version of
 __ioremap_mode() which just returns the physical address.

The effects of this is change are:
 - fix non-MMU ioremap() of high address hardware blocks (sh7203 CMT)
 - make sure IO_TRAPPED is not selected

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig
index dcc1af8..99e2d47 100644
--- a/arch/sh/boards/Kconfig
+++ b/arch/sh/boards/Kconfig
@@ -121,7 +121,7 @@ config SH_RTS7751R2D
 	bool "RTS7751R2D"
 	depends on CPU_SUBTYPE_SH7751R
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 	help
 	  Select RTS7751R2D if configuring for a Renesas Technology
 	  Sales SH-Graphics board.
@@ -145,13 +145,13 @@ config SH_HIGHLANDER
 	bool "Highlander"
 	depends on CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 
 config SH_SH7785LCR
 	bool "SH7785LCR"
 	depends on CPU_SUBTYPE_SH7785
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 
 config SH_SH7785LCR_29BIT_PHYSMAPS
 	bool "SH7785LCR 29bit physmaps"
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index 0454f8d..ef21734 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -228,12 +228,6 @@ void __iounmap(void __iomem *addr);
 unsigned long onchip_remap(unsigned long addr, unsigned long size,
 			   const char *name);
 extern void onchip_unmap(unsigned long vaddr);
-#else
-#define __ioremap(offset, size, flags)	((void __iomem *)(offset))
-#define __iounmap(addr)			do { } while (0)
-#define onchip_remap(addr, size, name)	(addr)
-#define onchip_unmap(addr)		do { } while (0)
-#endif /* CONFIG_MMU */
 
 static inline void __iomem *
 __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
@@ -268,6 +262,12 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 
 	return __ioremap(offset, size, flags);
 }
+#else
+#define onchip_remap(addr, size, name)		(addr)
+#define onchip_unmap(addr)			do { } while (0)
+#define __ioremap_mode(offset, size, flags)	((void __iomem *)(offset))
+#define __iounmap(addr)				do { } while (0)
+#endif /* CONFIG_MMU */
 
 #define ioremap(offset, size)				\
 	__ioremap_mode((offset), (size), 0)
-- 
cgit v0.10.2


From 3014f47460ecfb13d4169daae51f26a20bacfa17 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 29 Apr 2009 14:50:37 +0000
Subject: clocksource: sh_cmt 16-bit fixes

This patch contains various fixes for 16-bit cmt hardware.
With this applied periodic clockevents work fine on sh7203.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index bf3e4c1..4ff1508 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -158,16 +158,18 @@ static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
 		pr_err("sh_cmt: cannot enable clock \"%s\"\n", cfg->clk);
 		return ret;
 	}
-	*rate = clk_get_rate(p->clk) / 8;
 
 	/* make sure channel is disabled */
 	sh_cmt_start_stop_ch(p, 0);
 
 	/* configure channel, periodic mode and maximum timeout */
-	if (p->width == 16)
-		sh_cmt_write(p, CMCSR, 0);
-	else
+	if (p->width == 16) {
+		*rate = clk_get_rate(p->clk) / 512;
+		sh_cmt_write(p, CMCSR, 0x43);
+	} else {
+		*rate = clk_get_rate(p->clk) / 8;
 		sh_cmt_write(p, CMCSR, 0x01a4);
+	}
 
 	sh_cmt_write(p, CMCOR, 0xffffffff);
 	sh_cmt_write(p, CMCNT, 0);
@@ -615,7 +617,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 	if (resource_size(res) == 6) {
 		p->width = 16;
 		p->overflow_bit = 0x80;
-		p->clear_bits = ~0xc0;
+		p->clear_bits = ~0x80;
 	} else {
 		p->width = 32;
 		p->overflow_bit = 0x8000;
-- 
cgit v0.10.2


From 698aa99da5f5e2b4c666fd21ab77306f0225b8f5 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 04:08:18 +0000
Subject: sh: sh2/sh2a 16-bit CMT platform data

This patch adds 16-bit cmt platform data for the following cpus:
 - sh7619 (2 channels)
 - sh7203/sh7263 (2 channels)
 - sh7206 (2 channels)

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 0e32d8e..d70c263 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -109,9 +111,75 @@ static struct platform_device eth_device = {
 	.resource = eth_resources,
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xf84a0072,
+		.end	= 0xf84a0077,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 86,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xf84a0078,
+		.end	= 0xf84a007d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 87,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct platform_device *sh7619_devices[] __initdata = {
 	&sci_device,
 	&eth_device,
+	&cmt0_device,
+	&cmt1_device,
 };
 
 static int __init sh7619_devices_setup(void)
@@ -125,3 +193,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7619_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR3 0xf80a0000
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x10, STBCR3);
+
+	early_platform_add_devices(sh7619_early_devices,
+				   ARRAY_SIZE(sh7619_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 820dfb2..0836ace 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -11,6 +11,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -205,6 +207,70 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xfffec002,
+		.end	= 0xfffec007,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 142,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xfffec008,
+		.end	= 0xfffec00d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 143,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= 0xffff2000,
@@ -227,6 +293,8 @@ static struct platform_device rtc_device = {
 
 static struct platform_device *sh7203_devices[] __initdata = {
 	&sci_device,
+	&cmt0_device,
+	&cmt1_device,
 	&rtc_device,
 };
 
@@ -241,3 +309,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7203_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR4 0xfffe040c
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
+
+	early_platform_add_devices(sh7203_early_devices,
+				   ARRAY_SIZE(sh7203_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index c46a835..f9606e3 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -165,8 +167,74 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xfffec002,
+		.end	= 0xfffec007,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 140,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xfffec008,
+		.end	= 0xfffec00d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 144,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct platform_device *sh7206_devices[] __initdata = {
 	&sci_device,
+	&cmt0_device,
+	&cmt1_device,
 };
 
 static int __init sh7206_devices_setup(void)
@@ -180,3 +248,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7206_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR4 0xfffe040c
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
+
+	early_platform_add_devices(sh7206_early_devices,
+				   ARRAY_SIZE(sh7206_early_devices));
+}
-- 
cgit v0.10.2


From f425752fc66acf1d4e47970ea704ed7d31c14173 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 04:09:26 +0000
Subject: sh: remove old CMT driver

This patch removes the old CMT driver (CONFIG_SH_CMT/timer-cmt.c)

As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_CMT
and configure timer channel using platform data.

If multiple CMT channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_cmt.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 9db02ce..7e96ade 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -462,22 +462,14 @@ config SH_TMU
 	help
 	  This enables the use of the TMU as the system timer.
 
-config SH_CMT
-	bool "CMT timer support"
-	depends on SYS_SUPPORTS_CMT && CPU_SH2
-	default y
-	help
-	  This enables the use of the CMT as the system timer.
-
-#
-# Support for the new-style CMT driver. This will replace SH_CMT
-# once its other dependencies are merged.
-#
 config SH_TIMER_CMT
-	bool "CMT clockevents driver"
-	depends on SYS_SUPPORTS_CMT && !SH_CMT
+	bool "CMT timer driver"
+	depends on SYS_SUPPORTS_CMT
+	default y
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_TIME
+	help
+	  This enables build of the CMT timer driver.
 
 config SH_MTU2
 	bool "MTU2 timer support"
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 4c3b66e3..8f80a55 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -23,7 +23,7 @@ struct sys_timer {
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-extern struct sys_timer tmu_timer, cmt_timer, mtu2_timer;
+extern struct sys_timer tmu_timer, mtu2_timer;
 extern struct sys_timer *sys_timer;
 
 #ifndef CONFIG_GENERIC_TIME
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 0b7f857..1e9a104 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -6,6 +6,5 @@ obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
 obj-$(CONFIG_SH_MTU2)		+= timer-mtu2.o
-obj-$(CONFIG_SH_CMT)		+= timer-cmt.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
diff --git a/arch/sh/kernel/timers/timer-cmt.c b/arch/sh/kernel/timers/timer-cmt.c
deleted file mode 100644
index 9aa3486..0000000
--- a/arch/sh/kernel/timers/timer-cmt.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-cmt.c - CMT Timer Support
- *
- *  Copyright (C) 2005  Yoshinori Sato
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <asm/timer.h>
-#include <asm/rtc.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7619)
-#define CMT_CMSTR	0xf84a0070
-#define CMT_CMCSR_0	0xf84a0072
-#define CMT_CMCNT_0	0xf84a0074
-#define CMT_CMCOR_0	0xf84a0076
-#define CMT_CMCSR_1	0xf84a0078
-#define CMT_CMCNT_1	0xf84a007a
-#define CMT_CMCOR_1	0xf84a007c
-
-#define STBCR3		0xf80a0000
-#define cmt_clock_enable() do {	ctrl_outb(ctrl_inb(STBCR3) & ~0x10, STBCR3); } while(0)
-#define CMT_CMCSR_INIT	0x0040
-#define CMT_CMCSR_CALIB	0x0000
-#elif defined(CONFIG_CPU_SUBTYPE_SH7203) || \
-      defined(CONFIG_CPU_SUBTYPE_SH7206) || \
-      defined(CONFIG_CPU_SUBTYPE_SH7263)
-#define CMT_CMSTR	0xfffec000
-#define CMT_CMCSR_0	0xfffec002
-#define CMT_CMCNT_0	0xfffec004
-#define CMT_CMCOR_0	0xfffec006
-
-#define STBCR4		0xfffe040c
-#define cmt_clock_enable() do {	ctrl_outb(ctrl_inb(STBCR4) & ~0x04, STBCR4); } while(0)
-#define CMT_CMCSR_INIT	0x0040
-#define CMT_CMCSR_CALIB	0x0000
-#else
-#error "Unknown CPU SUBTYPE"
-#endif
-
-static unsigned long cmt_timer_get_offset(void)
-{
-	int count;
-	static unsigned short count_p = 0xffff;    /* for the first call after boot */
-	static unsigned long jiffies_p = 0;
-
-	/*
-	 * cache volatile jiffies temporarily; we have IRQs turned off.
-	 */
-	unsigned long jiffies_t;
-
-	/* timer count may underflow right here */
-	count =  ctrl_inw(CMT_CMCOR_0);
-	count -= ctrl_inw(CMT_CMCNT_0);
-
-	jiffies_t = jiffies;
-
-	/*
-	 * avoiding timer inconsistencies (they are rare, but they happen)...
-	 * there is one kind of problem that must be avoided here:
-	 *  1. the timer counter underflows
-	 */
-
-	if (jiffies_t == jiffies_p) {
-		if (count > count_p) {
-			/* the nutcase */
-			if (ctrl_inw(CMT_CMCSR_0) & 0x80) { /* Check CMF bit */
-				count -= LATCH;
-			} else {
-				printk("%s (): hardware timer problem?\n",
-				       __func__);
-			}
-		}
-	} else
-		jiffies_p = jiffies_t;
-
-	count_p = count;
-
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	count = (count + LATCH/2) / LATCH;
-
-	return count;
-}
-
-static irqreturn_t cmt_timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear CMF bit */
-	timer_status = ctrl_inw(CMT_CMCSR_0);
-	timer_status &= ~0x80;
-	ctrl_outw(timer_status, CMT_CMCSR_0);
-
-	handle_timer_tick();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction cmt_irq = {
-	.name		= "timer",
-	.handler	= cmt_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static void cmt_clk_init(struct clk *clk)
-{
-	u8 divisor = CMT_CMCSR_INIT & 0x3;
-	ctrl_inw(CMT_CMCSR_0);
-	ctrl_outw(CMT_CMCSR_INIT, CMT_CMCSR_0);
-	clk->parent = clk_get(NULL, "module_clk");
-	clk->rate = clk->parent->rate / (8 << (divisor << 1));
-}
-
-static void cmt_clk_recalc(struct clk *clk)
-{
-	u8 divisor = ctrl_inw(CMT_CMCSR_0) & 0x3;
-	clk->rate = clk->parent->rate / (8 << (divisor << 1));
-}
-
-static struct clk_ops cmt_clk_ops = {
-	.init		= cmt_clk_init,
-	.recalc		= cmt_clk_recalc,
-};
-
-static struct clk cmt0_clk = {
-	.name		= "cmt0_clk",
-	.ops		= &cmt_clk_ops,
-};
-
-static int cmt_timer_start(void)
-{
-	ctrl_outw(ctrl_inw(CMT_CMSTR) | 0x01, CMT_CMSTR);
-	return 0;
-}
-
-static int cmt_timer_stop(void)
-{
-	ctrl_outw(ctrl_inw(CMT_CMSTR) & ~0x01, CMT_CMSTR);
-	return 0;
-}
-
-static int cmt_timer_init(void)
-{
-	unsigned long interval;
-
-	cmt_clock_enable();
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &cmt_irq);
-
-	cmt0_clk.parent = clk_get(NULL, "module_clk");
-
-	cmt_timer_stop();
-
-	interval = cmt0_clk.parent->rate / 8 / HZ;
-	printk(KERN_INFO "Interval = %ld\n", interval);
-
-	ctrl_outw(interval, CMT_CMCOR_0);
-
-	clk_register(&cmt0_clk);
-	clk_enable(&cmt0_clk);
-
-	cmt_timer_start();
-
-	return 0;
-}
-
-static struct sys_timer_ops cmt_timer_ops = {
-	.init		= cmt_timer_init,
-	.start		= cmt_timer_start,
-	.stop		= cmt_timer_stop,
-#ifndef CONFIG_GENERIC_TIME
-	.get_offset	= cmt_timer_get_offset,
-#endif
-};
-
-struct sys_timer cmt_timer = {
-	.name	= "cmt",
-	.ops	= &cmt_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index 4e7e747..f3bd141 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -20,9 +20,6 @@ static struct sys_timer *sys_timers[] = {
 #ifdef CONFIG_SH_MTU2
 	&mtu2_timer,
 #endif
-#ifdef CONFIG_SH_CMT
-	&cmt_timer,
-#endif
 	NULL,
 };
 
-- 
cgit v0.10.2


From c5dd016cdf0a040e1de0b691e274fbfe642b2cdc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 30 Apr 2009 09:48:16 +1000
Subject: perf_counter: update copyright notice

This adds my name to the list of copyright holders on the core
perf_counter.c, since I have contributed a significant amount of the
code in there.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <18936.59200.888049.746658@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a95a171..75f2b6c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  *  For licensing details see kernel-base/COPYING
  */
-- 
cgit v0.10.2


From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 29 Apr 2009 15:59:58 -0700
Subject: mutex: add atomic_dec_and_mutex_lock(), fix

include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called
 include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here

uninline it.

[ Impact: clean up and uninline, address compiler warning ]

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 93054fc..878cab4 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -150,28 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
-
-/**
- * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
- * @cnt: the atomic which we are to dec
- * @lock: the mutex to return holding if we dec to 0
- *
- * return true and hold lock if we dec to 0, return false otherwise
- */
-static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
-{
-	/* dec if we can't possibly hit 0 */
-	if (atomic_add_unless(cnt, -1, 1))
-		return 0;
-	/* we might hit 0, so take the lock */
-	mutex_lock(lock);
-	if (!atomic_dec_and_test(cnt)) {
-		/* when we actually did the dec, we didn't hit 0 */
-		mutex_unlock(lock);
-		return 0;
-	}
-	/* we hit 0, and we hold the lock */
-	return 1;
-}
+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b..e2d25e9 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock)
 
 	return ret;
 }
-
 EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
-- 
cgit v0.10.2


From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 28 Apr 2009 16:00:49 +0300
Subject: x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c

This patch moves the max_pfn_mapped and max_low_pfn_mapped global
variables to kernel/setup.c where they're initialized.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1240923649.1982.21.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b415843..0d77e56 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
 #define ARCH_SETUP
 #endif
 
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
 RESERVE_BRK(dmi_alloc, 65536);
 
 unsigned int boot_cpu_id __read_mostly;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2b27120..a640a7f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,11 +49,9 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/page_types.h>
 #include <asm/init.h>
 
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a4e7846..1016ea0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,14 +50,6 @@
 #include <asm/cacheflush.h>
 #include <asm/init.h>
 
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-- 
cgit v0.10.2


From 9518e0e4350a5ea8ca200ce320b28d6284a7b0ce Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 28 Apr 2009 16:00:50 +0300
Subject: x86: move per-cpu mmu_gathers to mm/init.c

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1240923650.1982.22.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fedde53..4d67c33 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -9,6 +9,9 @@
 #include <asm/sections.h>
 #include <asm/system.h>
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 unsigned long __initdata e820_table_start;
 unsigned long __meminitdata e820_table_end;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index a640a7f..fef1d90 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,7 +52,6 @@
 #include <asm/page_types.h>
 #include <asm/init.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1016ea0..6a1a573 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,8 +52,6 @@
 
 static unsigned long dma_reserve __initdata;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
 	direct_gbpages = 0;
-- 
cgit v0.10.2


From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Mon, 20 Apr 2009 22:22:22 -0700
Subject: futex: remove FUTEX_REQUEUE_PI (non CMP)

The new requeue PI futex op codes were modeled after the existing
FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls.  I was unaware at the time
that FUTEX_REQUEUE was only around for compatibility reasons and
shouldn't be used in new code.  Ulrich Drepper elaborates on this in his
Futexes are Tricky paper: http://people.redhat.com/drepper/futex.pdf.
The deprecated call doesn't catch changes to the futex corresponding to
the destination futex which can lead to deadlock.

Therefor, I feel it best to remove FUTEX_REQUEUE_PI and leave only
FUTEX_CMP_REQUEUE_PI as there are not yet any existing users of the API.
This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12
from 13.  Since my test case is the only known user of this API, I felt
this was the right thing to do, rather than leave a hole in the
enumeration.

I chose to continue using the _CMP_ modifier in the OP code to make it
explicit to the user that the test is being done.

Builds, boots, and ran several hundred iterations requeue_pi.c.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <49ED580E.1050502@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/futex.h b/include/linux/futex.h
index b05519c..34956c8 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -24,8 +24,7 @@ union ktime;
 #define FUTEX_WAIT_BITSET	9
 #define FUTEX_WAKE_BITSET	10
 #define FUTEX_WAIT_REQUEUE_PI	11
-#define FUTEX_REQUEUE_PI	12
-#define FUTEX_CMP_REQUEUE_PI	13
+#define FUTEX_CMP_REQUEUE_PI	12
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
@@ -43,7 +42,6 @@ union ktime;
 #define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAIT_REQUEUE_PI_PRIVATE	(FUTEX_WAIT_REQUEUE_PI | \
 					 FUTEX_PRIVATE_FLAG)
-#define FUTEX_REQUEUE_PI_PRIVATE	(FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_CMP_REQUEUE_PI_PRIVATE	(FUTEX_CMP_REQUEUE_PI | \
 					 FUTEX_PRIVATE_FLAG)
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d2daa4..aec8bf8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
 					    clockrt, uaddr2);
 		break;
-	case FUTEX_REQUEUE_PI:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
-		break;
 	case FUTEX_CMP_REQUEUE_PI:
 		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
 				    1);
@@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
 	 */
 	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-	    cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
-	    cmd == FUTEX_WAKE_OP)
+	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-- 
cgit v0.10.2


From 83c4832683bc8ebcd1687b3c0bf3ba1ab253dd4f Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Thu, 30 Apr 2009 12:03:16 +0200
Subject: x86: boot/compressed/vmlinux.lds.S: fix build of bzImage with 64 bit
 compiler

Jesper reported that he saw following build issue:

 > ld:arch/x86/boot/compressed/vmlinux.lds:9: syntax error
 > make[2]: *** [arch/x86/boot/compressed/vmlinux] Error 1
 > make[1]: *** [arch/x86/boot/compressed/vmlinux] Error 2
 > make: *** [bzImage] Error 2

CPP defines the symbol "i386" to "1".
Undefine this to fix it.

[ Impact: build fix with certain tool chains ]

Reported-by: Jesper Dangaard Brouer <jdb@comx.dk>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.00.0904260958190.3101@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index ffcb1913..0d26c92 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,5 +1,7 @@
 OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
 
+#undef i386
+
 #ifdef CONFIG_X86_64
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
-- 
cgit v0.10.2


From aac3f3c2c41ce49a6dbb98d9145265c00a964dc2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 13:52:19 +0200
Subject: perf_counter tools: add perf-report to the Makefile

Build it explicitly until it's a proper builtin command.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 543ccf2..877cf5d 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -228,7 +228,7 @@ COMPAT_CFLAGS =
 COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
-PROGRAMS =
+PROGRAMS = perf-report
 SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
@@ -808,6 +808,10 @@ clean:
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
+# temporary hack:
+perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
+	g++ -g -O2 -Wall -lrt -o $@ $<
+
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-- 
cgit v0.10.2


From 66cf782996f3d57d3cc199f0a2d47a54e2aa5991 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 13:53:33 +0200
Subject: perf_counter tools: perf stat: make -l default-on

Turn on scaling display by default - this is less confusing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 112b94e..1fde127 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -171,7 +171,7 @@ static unsigned int		page_size;
 
 static int			zero;
 
-static int			scale;
+static int			scale				=  1;
 
 static const unsigned int default_count[] = {
 	1000000,
-- 
cgit v0.10.2


From bad760089c1ef7fe525c0f268a4078b9cb483903 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 14:14:37 +0200
Subject: perf_counter tools: fix infinite loop in perf-report on zeroed event
 records

Bail out early if a record has zero size - we have no chance to make
reliable progress in that case. Print out the offset where this happens,
and print the number of bytes we missed out on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 1727317..933a075 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -13,6 +13,7 @@
 #include <ctype.h>
 #include <time.h>
 #include <getopt.h>
+#include <assert.h>
 
 #include <sys/ioctl.h>
 #include <sys/poll.h>
@@ -226,7 +227,7 @@ void load_kallsyms(void)
 	while (!feof(file)) {
 		uint64_t start;
 		char c;
-		char sym[1024];
+		char sym[1024000];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -416,12 +417,23 @@ more:
 
 	if (head + event->header.size >= page_size * mmap_window) {
 		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
 
-		munmap(buf, page_size * mmap_window);
 		offset += shift;
 		head -= shift;
 		goto remap;
 	}
+
+
+	if (!event->header.size) {
+		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
+		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
+		goto done;
+	}
+
 	head += event->header.size;
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
@@ -458,6 +470,8 @@ more:
 	if (offset + head < stat.st_size)
 		goto more;
 
+done:
+
 	close(input);
 
 	std::map<std::string, int>::iterator hi = hist.begin();
-- 
cgit v0.10.2


From 03682411b1ccd38cbde2e9a6ab43884ff34fbefc Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Thu, 30 Apr 2009 18:38:01 +0200
Subject: alim15x3: Remove historical hacks, re-enable init_hwif for PowerPC

Some time ago we had to disable init_hwif callback for PowerPC builds.
That was because of a historical IRQ overwrite in the driver, which
was causing IDE malfunction on the MPC8610HPCD PowerPC boards.

It's unclear whether this overwrite is still useful, but it is proven
to cause a bit of harm, and today some PowerPC targets (Xilinx ML510,
as reported by Roderick Colenbrander) need the init_hwif, so we have
to re-enable it and remove the overwrite.

Reported-by: Roderick Colenbrander <thunderbird2k@gmail.com>
Suggested-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
index 537da1c..e59b6de 100644
--- a/drivers/ide/alim15x3.c
+++ b/drivers/ide/alim15x3.c
@@ -402,27 +402,23 @@ static u8 ali_cable_detect(ide_hwif_t *hwif)
 	return cbl;
 }
 
-#if !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC)
+#ifndef CONFIG_SPARC64
 /**
  *	init_hwif_ali15x3	-	Initialize the ALI IDE x86 stuff
  *	@hwif: interface to configure
  *
  *	Obtain the IRQ tables for an ALi based IDE solution on the PC
  *	class platforms. This part of the code isn't applicable to the
- *	Sparc and PowerPC systems.
+ *	Sparc systems.
  */
 
 static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 {
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
 	u8 ideic, inmir;
 	s8 irq_routing_table[] = { -1,  9, 3, 10, 4,  5, 7,  6,
 				      1, 11, 0, 12, 0, 14, 0, 15 };
 	int irq = -1;
 
-	if (dev->device == PCI_DEVICE_ID_AL_M5229)
-		hwif->irq = hwif->channel ? 15 : 14;
-
 	if (isa_dev) {
 		/*
 		 * read IDE interface control
@@ -455,7 +451,7 @@ static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 }
 #else
 #define init_hwif_ali15x3 NULL
-#endif /* !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC) */
+#endif /* CONFIG_SPARC64 */
 
 /**
  *	init_dma_ali15x3	-	set up DMA on ALi15x3
-- 
cgit v0.10.2


From 30b4ae8a4498543863501f707879b7220b649602 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:01 +0000
Subject: signals: split do_tkill

Split out the code from do_tkill to make it reusable by the follow up
patch which implements sys_rt_tgsigqueueinfo

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>

diff --git a/kernel/signal.c b/kernel/signal.c
index d803473..56d27ac 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
-	int error;
-	struct siginfo info;
 	struct task_struct *p;
 	unsigned long flags;
-
-	error = -ESRCH;
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	int error = -ESRCH;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
 	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
-		error = check_kill_permission(sig, &info, p);
+		error = check_kill_permission(sig, info, p);
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe.  No signal is actually delivered.
@@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 		 * signal is private anyway.
 		 */
 		if (!error && sig && lock_task_sighand(p, &flags)) {
-			error = specific_send_sig_info(sig, &info, p);
+			error = specific_send_sig_info(sig, info, p);
 			unlock_task_sighand(p, &flags);
 		}
 	}
@@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	return error;
 }
 
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+	struct siginfo info;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_TKILL;
+	info.si_pid = task_tgid_vnr(current);
+	info.si_uid = current_uid();
+
+	return do_send_specific(tgid, pid, sig, &info);
+}
+
 /**
  *  sys_tgkill - send signal to one specific thread
  *  @tgid: the thread group ID of the thread
-- 
cgit v0.10.2


From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:06 +0000
Subject: signals: implement sys_rt_tgsigqueueinfo

sys_kill has the per thread counterpart sys_tgkill. sigqueueinfo is
missing a thread directed counterpart. Such an interface is important
for migrating applications from other OSes which have the per thread
delivery implemented.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>

diff --git a/include/linux/compat.h b/include/linux/compat.h
index f2ded21..af931ee 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
 int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event);
+long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+				  struct compat_siginfo __user *uinfo);
 
 static inline int compat_timeval_compare(struct compat_timeval *lhs,
 					struct compat_timeval *rhs)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 84f997f..c755283 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
+extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
+				 siginfo_t *info);
 extern long do_sigpending(void __user *, unsigned long);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d5654..f6c204f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 
 }
 
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+			     struct compat_siginfo __user *uinfo)
+{
+	siginfo_t info;
+
+	if (copy_siginfo_from_user32(&info, uinfo))
+		return -EFAULT;
+	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/signal.c b/kernel/signal.c
index 56d27ac..f79b3b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
 	return kill_proc_info(sig, &info, pid);
 }
 
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	/* Not even root can pretend to send signals from the kernel.
+	   Nor can they impersonate a kill(), which adds source info.  */
+	if (info->si_code >= 0)
+		return -EPERM;
+	info->si_signo = sig;
+
+	return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+		siginfo_t __user *, uinfo)
+{
+	siginfo_t info;
+
+	if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+		return -EFAULT;
+
+	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct task_struct *t = current;
-- 
cgit v0.10.2


From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:10 +0000
Subject: x86: hookup sys_rt_tgsigqueueinfo

Make the new sys_rt_tgsigqueueinfo available for x86.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202..dcef387 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -830,4 +830,5 @@ ia32_sys_call_table:
 	.quad sys_inotify_init1
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
+	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74..708dae6 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_rt_tgsigqueueinfo	335
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f818294..4e2b054 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,6 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 __SYSCALL(__NR_preadv, sys_preadv)
 #define __NR_pwritev				296
 __SYSCALL(__NR_pwritev, sys_pwritev)
+#define __NR_rt_tgsigqueueinfo			297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c873..734f92c 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo	/* 335 */
-- 
cgit v0.10.2


From bf293c17b26b8854241df08b9b63f7270cbde012 Mon Sep 17 00:00:00 2001
From: Remis Lima Baima <remis.developer@googlemail.com>
Date: Thu, 30 Apr 2009 18:36:23 +0200
Subject: x86: added 'ifndef _ASM_X86_IOMAP_H' to iomap.h

iomap.h misses the include guards.

[ Impact: cleanup ]

Signed-off-by: Remis Lima Baima <remis.developer@googlemail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
LKML-Reference: <200904301836.23885.arnd@arndb.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af260..0e9fe1d 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
 /*
  * Copyright © 2008 Ingo Molnar
  *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
 void
 iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
-- 
cgit v0.10.2


From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 29 Apr 2009 18:01:23 +0200
Subject: do_wait: do take security_task_wait() into account

I was never able to understand what should we actually do when
security_task_wait() fails, but the current code doesn't look right.

If ->task_wait() returns the error, we update *notask_error correctly.
But then we either reap the child (despite the fact this was forbidden)
or clear *notask_error (and hide the securiy policy problems).

This patch assumes that "stolen by ptrace" doesn't matter. If selinux
denies the child we should ignore it but make sure we report -EACCESS
instead of -ECHLD if there are no other eligible children.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3..d2e8239 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 */
 		if (*notask_error)
 			*notask_error = ret;
+		return 0;
 	}
 
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
-- 
cgit v0.10.2


From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:16 +0200
Subject: perf_counter: fix race in perf_output_*

When two (or more) contexts output to the same buffer, it is possible
to observe half written output.

Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then CPU2 can observe the data CPU0 is still writing.

[ Impact: fix occasionally corrupted profiling records ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.007821627@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 41aed42..f776851 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -358,10 +358,13 @@ struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages  */
 
-	atomic_t			wakeup;		/* POLL_ for wakeups */
+	atomic_t			poll;		/* POLL_ for wakeups */
 	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
+	atomic_t			wakeup_head;	/* completed head    */
+	atomic_t			lock;		/* concurrent writes */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75f2b6c..8660ae5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
 	struct perf_mmap_data *data;
-	unsigned int events;
+	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (data)
-		events = atomic_xchg(&data->wakeup, 0);
-	else
-		events = POLL_HUP;
+		events = atomic_xchg(&data->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = {
 
 void perf_counter_wakeup(struct perf_counter *counter)
 {
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data) {
-		atomic_set(&data->wakeup, POLL_IN);
-		/*
-		 * Ensure all data writes are issued before updating the
-		 * user-space data head information. The matching rmb()
-		 * will be in userspace after reading this value.
-		 */
-		smp_wmb();
-		data->user_page->data_head = atomic_read(&data->head);
-	}
-	rcu_read_unlock();
-
 	wake_up_all(&counter->waitq);
 
 	if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
 	int			wakeup;
 	int			nmi;
 	int			overflow;
+	int			locked;
+	unsigned long		flags;
 };
 
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
 {
+	atomic_set(&handle->data->poll, POLL_IN);
+
 	if (handle->nmi) {
 		handle->counter->pending_wakeup = 1;
 		perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 		perf_counter_wakeup(handle->counter);
 }
 
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int cpu;
+
+	handle->locked = 0;
+
+	local_irq_save(handle->flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi() && atomic_read(&data->lock) == cpu)
+		return;
+
+	while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+		cpu_relax();
+
+	handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int head, cpu;
+
+	if (handle->wakeup)
+		data->wakeup_head = data->head;
+
+	if (!handle->locked)
+		goto out;
+
+again:
+	/*
+	 * The xchg implies a full barrier that ensures all writes are done
+	 * before we publish the new head, matched by a rmb() in userspace when
+	 * reading this position.
+	 */
+	while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+		data->user_page->data_head = head;
+		handle->wakeup = 1;
+	}
+
+	/*
+	 * NMI can happen here, which means we can miss a wakeup_head update.
+	 */
+
+	cpu = atomic_xchg(&data->lock, 0);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
+	/*
+	 * Therefore we have to validate we did not indeed do so.
+	 */
+	if (unlikely(atomic_read(&data->wakeup_head))) {
+		/*
+		 * Since we had it locked, we can lock it again.
+		 */
+		while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+			cpu_relax();
+
+		goto again;
+	}
+
+	if (handle->wakeup)
+		perf_output_wakeup(handle);
+out:
+	local_irq_restore(handle->flags);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
+	handle->data	 = data;
 	handle->counter	 = counter;
 	handle->nmi	 = nmi;
 	handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data->nr_pages)
 		goto fail;
 
+	perf_output_lock(handle);
+
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->data	= data;
 	handle->offset	= offset;
 	handle->head	= head;
 	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	__perf_output_wakeup(handle);
+	perf_output_wakeup(handle);
 out:
 	rcu_read_unlock();
 
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-	int wakeup_events = handle->counter->hw_event.wakeup_events;
+	struct perf_counter *counter = handle->counter;
+	struct perf_mmap_data *data = handle->data;
+
+	int wakeup_events = counter->hw_event.wakeup_events;
 
 	if (handle->overflow && wakeup_events) {
-		int events = atomic_inc_return(&handle->data->events);
+		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &handle->data->events);
-			__perf_output_wakeup(handle);
+			atomic_sub(wakeup_events, &data->events);
+			handle->wakeup = 1;
 		}
-	} else if (handle->wakeup)
-		__perf_output_wakeup(handle);
+	}
+
+	perf_output_unlock(handle);
 	rcu_read_unlock();
 }
 
-- 
cgit v0.10.2


From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:17 +0200
Subject: perf_counter: fix nmi-watchdog interaction

When we don't have any perf-counters active, don't act like we know
what the NMI is for.

[ Impact: fix hard hang with nmi_watchdog=2 ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.109867793@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc06f4d..d4c0cc9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
+	if (!atomic_read(&num_counters))
+		return NOTIFY_DONE;
+
 	switch (cmd) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-- 
cgit v0.10.2


From 585e3374d9d29376c2c37d821c8b7637dd48ca95 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:18 +0200
Subject: perf_counter: tool: handle 0-length data files

Avoid perf-report barfing on 0-length data files.

[ Impact: fix perf-report SIGBUS ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.196245693@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 933a075..911d7f3 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -402,6 +402,11 @@ int main(int argc, char *argv[])
 		exit(-1);
 	}
 
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
 	load_kallsyms();
 
 remap:
-- 
cgit v0.10.2


From e5791a808ae91a9e7e1b65ea9b8de0f96a043d88 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:19 +0200
Subject: perf_counter: documentation update

Update the documentation to reflect the current state of affairs

[ Impact: documentation update ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.296727903@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index aaf105c..9930c4b 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -34,41 +34,47 @@ can be poll()ed.
 
 When creating a new counter fd, 'perf_counter_hw_event' is:
 
-/*
- * Event to monitor via a performance monitoring counter:
- */
 struct perf_counter_hw_event {
-	__u64			event_config;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 55;
-
-	__u32			extra_config_len;
-
-	__u32			__reserved_4;
-	__u64			__reserved_2;
-	__u64			__reserved_3;
+        /*
+         * The MSB of the config word signifies if the rest contains cpu
+         * specific (raw) counter configuration data, if unset, the next
+         * 7 bits are an event type and the rest of the bits are the event
+         * identifier.
+         */
+        __u64                   config;
+
+        __u64                   irq_period;
+        __u32                   record_type;
+        __u32                   read_format;
+
+        __u64                   disabled       :  1, /* off by default        */
+                                nmi            :  1, /* NMI sampling          */
+                                inherit        :  1, /* children inherit it   */
+                                pinned         :  1, /* must always be on PMU */
+                                exclusive      :  1, /* only group on PMU     */
+                                exclude_user   :  1, /* don't count user      */
+                                exclude_kernel :  1, /* ditto kernel          */
+                                exclude_hv     :  1, /* ditto hypervisor      */
+                                exclude_idle   :  1, /* don't count when idle */
+                                mmap           :  1, /* include mmap data     */
+                                munmap         :  1, /* include munmap data   */
+                                comm           :  1, /* include comm data     */
+
+                                __reserved_1   : 52;
+
+        __u32                   extra_config_len;
+        __u32                   wakeup_events;  /* wakeup every n events */
+
+        __u64                   __reserved_2;
+        __u64                   __reserved_3;
 };
 
-The 'event_config' field specifies what the counter should count.  It
+The 'config' field specifies what the counter should count.  It
 is divided into 3 bit-fields:
 
-raw_type: 1 bit (most significant bit)		0x8000_0000_0000_0000
-type:	  7 bits (next most significant)	0x7f00_0000_0000_0000
-event_id: 56 bits (least significant)		0x00ff_0000_0000_0000
+raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
+type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
 
 If 'raw_type' is 1, then the counter will count a hardware event
 specified by the remaining 63 bits of event_config.  The encoding is
@@ -134,41 +140,56 @@ enum sw_event_ids {
 	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 };
 
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
 Counters come in two flavours: counting counters and sampling
 counters.  A "counting" counter is one that is used for counting the
 number of events that occur, and is characterised by having
-irq_period = 0 and record_type = PERF_RECORD_SIMPLE.  A read() on a
-counting counter simply returns the current value of the counter as
-an 8-byte number.
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possible
+additional values as specified by 'read_format', each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
+        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
 
 A "sampling" counter is one that is set up to generate an interrupt
 every N events, where N is given by 'irq_period'.  A sampling counter
-has irq_period > 0 and record_type != PERF_RECORD_SIMPLE.  The
-record_type controls what data is recorded on each interrupt, and the
-available values are currently:
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
 
 /*
- * IRQ-notification data record type:
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
  */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		= 0,
-	PERF_RECORD_IRQ			= 1,
-	PERF_RECORD_GROUP		= 2,
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
 };
 
-A record_type value of PERF_RECORD_IRQ will record the instruction
-pointer (IP) at which the interrupt occurred.  A record_type value of
-PERF_RECORD_GROUP will record the event_config and counter value of
-all of the other counters in the group, and should only be used on a
-group leader (see below).  Currently these two values are mutually
-exclusive, but record_type will become a bit-mask in future and
-support other values.
-
-A sampling counter has an event queue, into which an event is placed
-on each interrupt.  A read() on a sampling counter will read the next
-event from the event queue.  If the queue is empty, the read() will
-either block or return an EAGAIN error, depending on whether the fd
-has been set to non-blocking mode or not.
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
 
 The 'disabled' bit specifies whether the counter starts out disabled
 or enabled.  If it is initially disabled, it can be enabled by ioctl
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
 way to request that counting of events be restricted to times when the
 CPU is in user, kernel and/or hypervisor mode.
 
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations, these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone,
+these events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
 
 The 'pid' parameter to the perf_counter_open() system call allows the
 counter to be specific to a task:
@@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc.,
 with each other, since they have counted events for the same set of
 executed instructions.
 
+
+Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap
+tracking are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier()
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reason this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * User-space reading this value should issue an rmb(), on SMP capable
+         * platforms, after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+      on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
 Counters can be enabled and disabled in two ways: via ioctl and via
 prctl.  When a counter is disabled, it doesn't count or generate
 events but does continue to exist and maintain its count value.
@@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an
 non-leader stops that counter from counting but doesn't affect any
 other counter.
 
+Additionally, non-inherited overflow counters can use
+
+	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
 A process can enable or disable all the counter groups that are
 attached to it, using prctl:
 
-- 
cgit v0.10.2


From 56afb0f8823650f53a5f0e96d69a282e8892c61b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:36 -0400
Subject: kerneldoc, tracing: make kernel-doc understand TRACE_EVENT() macro
 (take #2)

Add support to kernel-doc for tracepoint comments above TRACE_EVENT()
macro definitions. Paves the way for tracepoint docbook.

[ Impact: extend DocBook infrastructure ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <d80706b6797e277924d2f3ec9af176c6b2951f88.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 0f11870..2b53a55 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1827,6 +1827,25 @@ sub reset_state {
     $state = 0;
 }
 
+sub tracepoint_munge($) {
+	my $file = shift;
+	my $tracepointname = 0;
+	my $tracepointargs = 0;
+
+	if($prototype =~ m/TRACE_EVENT\((.*?),/) {
+		$tracepointname = $1;
+	}
+	if($prototype =~ m/TP_PROTO\((.*?)\)/) {
+		$tracepointargs = $1;
+	}
+	if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
+		print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+			     "$prototype\n";
+	} else {
+		$prototype = "static inline void trace_$tracepointname($tracepointargs)";
+	}
+}
+
 sub syscall_munge() {
 	my $void = 0;
 
@@ -1881,6 +1900,9 @@ sub process_state3_function($$) {
 	if ($prototype =~ /SYSCALL_DEFINE/) {
 		syscall_munge();
 	}
+	if ($prototype =~ /TRACE_EVENT/) {
+		tracepoint_munge($file);
+	}
 	dump_function($prototype, $file);
 	reset_state();
     }
-- 
cgit v0.10.2


From a76f8c6da1e48fd4ef025f42c736389532ff30ba Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:42 -0400
Subject: tracing: add new tracepoints docbook

Add tracepoint docbook. This will help us document and understand
what tracepoints are in the kernel. Since there are multiple
macros, and files that contain tracepoints.

[ Impact: add documentation ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <84160b6bd94aff02455da7e12bad054d34c579a0.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 8918a32..4c8f4d6 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
 	    genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
 	    mac80211.xml debugobjects.xml sh.xml regulator.xml \
-	    alsa-driver-api.xml writing-an-alsa-driver.xml
+	    alsa-driver-api.xml writing-an-alsa-driver.xml \
+	    tracepoint.xml
 
 ###
 # The build process is as follows (targets):
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
new file mode 100644
index 0000000..70891bc
--- /dev/null
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Tracepoints">
+ <bookinfo>
+  <title>The Linux Kernel Tracepoint API</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Jason</firstname>
+    <surname>Baron</surname>
+    <affiliation>
+     <address>
+      <email>jbaron@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+  <chapter id="intro">
+   <title>Introduction</title>
+   <para>
+     Tracepoints are static probe points that are located in strategic points
+     throughout the kernel. 'Probes' register/unregister with tracepoints
+     via a callback mechanism. The 'probes' are strictly typed functions that
+     are passed a unique set of parameters defined by each tracepoint.
+   </para>
+
+   <para>
+     From this simple callback mechanism, 'probes' can be used to profile, debug,
+     and understand kernel behavior. There are a number of tools that provide a
+     framework for using 'probes'. These tools include Systemtap, ftrace, and
+     LTTng.
+   </para>
+
+   <para>
+     Tracepoints are defined in a number of header files via various macros. Thus,
+     the purpose of this document is to provide a clear accounting of the available
+     tracepoints. The intention is to understand not only what tracepoints are
+     available but also to understand where future tracepoints might be added.
+   </para>
+
+   <para>
+     The API presented has functions of the form:
+     <function>trace_tracepointname(function parameters)</function>. These are the
+     tracepoints callbacks that are found throughout the code. Registering and
+     unregistering probes with these callback sites is covered in the
+     <filename>Documentation/trace/*</filename> directory.
+   </para>
+  </chapter>
+
+</book>
-- 
cgit v0.10.2


From 9ee1983c9aa18f12388ef660d0c76a23dc112959 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:47 -0400
Subject: tracing: add irq tracepoint documentation

Document irqs for the newly created docbook.

[ Impact: add documentation ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <73ff42be3420157667ec548e9b0e409c3cfad05f.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index 70891bc..b0756d0 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -81,4 +81,9 @@
    </para>
   </chapter>
 
+  <chapter id="irq">
+   <title>IRQ</title>
+!Iinclude/trace/events/irq.h
+  </chapter>
+
 </book>
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 7686864..32a9f7e 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,8 +7,16 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
-/*
- * Tracepoint for entry of interrupt handler:
+/**
+ * irq_handler_entry - called immediately before the irq action handler
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ *
+ * The struct irqaction pointed to by @action contains various
+ * information about the handler, including the device name,
+ * @action->name, and the device id, @action->dev_id. When used in
+ * conjunction with the irq_handler_exit tracepoint, we can figure
+ * out irq handler latencies.
  */
 TRACE_EVENT(irq_handler_entry,
 
@@ -29,8 +37,16 @@ TRACE_EVENT(irq_handler_entry,
 	TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
 );
 
-/*
- * Tracepoint for return of an interrupt handler:
+/**
+ * irq_handler_exit - called immediately after the irq action handler returns
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ * @ret: return value
+ *
+ * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
+ * @action->handler scuccessully handled this irq. Otherwise, the irq might be
+ * a shared irq line, or the irq was not handled successfully. Can be used in
+ * conjunction with the irq_handler_entry to understand irq handler latencies.
  */
 TRACE_EVENT(irq_handler_exit,
 
@@ -52,6 +68,17 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter, contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. Also, when used in combination with the softirq_exit tracepoint
+ * we can determine the softirq latency.
+ */
 TRACE_EVENT(softirq_entry,
 
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
@@ -71,6 +98,17 @@ TRACE_EVENT(softirq_entry,
 	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
 );
 
+/**
+ * softirq_exit - called immediately after the softirq handler returns
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * that has handled the softirq. By subtracting the @vec pointer from
+ * the @h pointer, we can determine the softirq number. Also, when used in
+ * combination with the softirq_entry tracepoint we can determine the softirq
+ * latency.
+ */
 TRACE_EVENT(softirq_exit,
 
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-- 
cgit v0.10.2


From e0202f56a82cd1170c6f1c520db669431cf26ddc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 1 May 2009 16:51:44 +0200
Subject: perf_counter tools: fix x86 syscall numbers

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 4a50abf..9cff266 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -57,13 +57,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 1fde127..9fbc661 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -108,13 +108,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 8d28864..b6d989e 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -89,13 +89,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
-- 
cgit v0.10.2


From 3666932bf2212a8fa77e344c5d946e86787bdbbe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 17:37:51 +0200
Subject: perf_counter tools: remove build generated files

These files are generated during the build process. No need to have
them in the git repository.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS
deleted file mode 100644
index 46d8d6c..0000000
--- a/Documentation/perf_counter/PERF-BUILD-OPTIONS
+++ /dev/null
@@ -1,4 +0,0 @@
-SHELL_PATH='/bin/sh'
-TAR='tar'
-NO_CURL=''
-NO_PERL=''
diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS
deleted file mode 100644
index f24906c..0000000
--- a/Documentation/perf_counter/PERF-CFLAGS
+++ /dev/null
@@ -1 +0,0 @@
--g -O2 -Wall  -DSHA1_HEADER='<openssl/sha.h>' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo
diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE
deleted file mode 100644
index 328e244..0000000
--- a/Documentation/perf_counter/PERF-VERSION-FILE
+++ /dev/null
@@ -1 +0,0 @@
-PERF_VERSION = 0.0.1.PERF
-- 
cgit v0.10.2


From 6eda5838bc5771578986429cde4a0870e1e5f5e1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:29:57 +0200
Subject: perfcounter tools: move common defines ... to local header file

No change, move of duplicated stuff only.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 877cf5d..481e4c2 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -287,6 +287,7 @@ export PERL_PATH
 LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
+LIB_H += perf.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
 LIB_H += util/quote.h
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 9cff266..59f1d87 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,6 +1,7 @@
 
 
-#define _GNU_SOURCE
+#include "util/util.h"
+
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/time.h>
@@ -32,66 +33,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-extern asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags);
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 9fbc661..6de38d2 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -85,64 +85,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-extern asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags);
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			system_wide			=  0;
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b6d989e..cd6f61d 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -66,68 +66,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			system_wide			=  0;
 
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
new file mode 100644
index 0000000..391fcc7
--- /dev/null
+++ b/Documentation/perf_counter/perf.h
@@ -0,0 +1,64 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 298
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 336
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+static inline int
+sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+		      pid_t pid, int cpu, int group_fd,
+		      unsigned long flags)
+{
+	return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
+		       group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+#endif
-- 
cgit v0.10.2


From a92e70237c8abbd1c3241133bf72f2cd07c90eae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:39:47 +0200
Subject: perfcounter tools: make rdclock an inline function

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 391fcc7..fb14230 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -8,13 +8,17 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE   31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
+#ifndef NSEC_PER_SEC
+# define NSEC_PER_SEC			1000000000ULL
+#endif
+
+static inline unsigned long long rdclock(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
 
 /*
  * Pick up some kernel type conventions:
-- 
cgit v0.10.2


From 7bd5469cd938eec6a76b3135e6becd9b5e096e98 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:42:47 +0200
Subject: perfcounter tools: fix pointer mismatch

Neither process_options nor execvp take an const **char as argument.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 59f1d87..3a3deb3 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -332,7 +332,7 @@ static void sigchld_handler(int sig)
 		done = 1;
 }
 
-int cmd_record(int argc, const char **argv)
+int cmd_record(int argc, char **argv)
 {
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-- 
cgit v0.10.2


From 4ba67c1d48aeedcc31630bb40b6179fc7d360f90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:48:06 +0200
Subject: perfcounter tools: get the syscall number from
 arch/*/include/asm/unistd.h

Avoid further confusion during development

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index fb14230..6fa3656 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -26,20 +26,14 @@ static inline unsigned long long rdclock(void)
 #define __user
 #define asmlinkage
 
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __powerpc__
-#define __NR_perf_counter_open 319
+#include "../../arch/powerpc/include/asm/unistd.h"
 #define rmb()		asm volatile ("sync" ::: "memory")
 #define cpu_relax()	asm volatile ("" ::: "memory");
 #endif
-- 
cgit v0.10.2


From 75507efb1372b6acf1aa6bf00ebd49ce196fd994 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 12:58:36 -0400
Subject: ext4: Don't avoid using BLOCK_UNINIT block groups in mballoc

By avoiding the use of not-yet-used block groups (i.e., block groups
with the BLOCK_UNINIT flag), mballoc had a tendency to create large
files with large non-contiguous gaps.  In addition avoiding the use of
new block groups had a tendency to push regular file data into the
first block group in a flex_bg group, which slows down the speed of
e2fsck pass 2, since it has a tendency to seek much more.  For
example:

               Before Patch                       After Patch
              Time in seconds                   Time in seconds
            Real /  User/  Sys   MB/s      Real /  User/  Sys    MB/s
Pass 1      8.52 / 2.21 / 0.46  20.43      8.84 / 4.97 / 1.11   19.68
Pass 2     21.16 / 1.02 / 1.86  11.30      6.54 / 1.77 / 1.78   36.39
Pass 3      0.01 / 0.00 / 0.00 139.00      0.01 / 0.01 / 0.00  128.90
Pass 4      0.16 / 0.15 / 0.00   0.00      0.17 / 0.17 / 0.00    0.00
Pass 5      2.52 / 1.99 / 0.09   0.79      2.31 / 1.78 / 0.06    0.86
Total      32.40 / 5.11 / 2.49  12.81     17.99 / 8.75 / 2.98   23.01

This was on a sample 80 gig root filesystem which was approximately
50% full.  Note the improved e2fsck pass 2 performance, by over a
factor of 3, due to a decreased number of seeks.  (The total amount of
I/O in pass 2 was unchanged; the layout of the directory blocks was
simply much better from e2fsck's's perspective.)

Other changes as a result of this patch on this sample filesystem:

                             Before Patch    After Patch
# of non-contig files           762             779
# of non-contig directories     571             570
# of BLOCK_UNINIT bg's          307             293
# of INODE_UNINIT bg's          503             503

Out of 640 block groups, of which 333 were in use, this patch caused
an extra 14 block groups to be utilized.  The number of non-contiguous
files did go up slightly, but when measured against the 99.9% of the
files (603,154) which were contiguously allocated, this is pretty
insignificant.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andreas Dilger <adilger@sun.com>

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c3af9e6..dbd47ea 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1728,7 +1728,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	unsigned free, fragments;
 	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
-	struct ext4_group_desc *desc;
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
@@ -1744,10 +1743,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	switch (cr) {
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
-		/* If this group is uninitialized, skip it initially */
-		desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
-		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-			return 0;
 
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -2067,9 +2062,7 @@ repeat:
 
 			ac->ac_groups_scanned++;
 			desc = ext4_get_group_desc(sb, group, NULL);
-			if (cr == 0 || (desc->bg_flags &
-					cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
-					ac->ac_2order != 0))
+			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
 					ac->ac_g_ex.fe_len == sbi->s_stripe)
-- 
cgit v0.10.2


From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 30 Apr 2009 01:17:50 -0700
Subject: x86/irq: use move_irq_desc() in create_irq_nr()

move_irq_desc() will try to move irq_desc to the home node if
the allocated one is not correct, in create_irq_nr().

( This can happen on devices that are on different nodes that
  are using MSI, when drivers are loaded and unloaded randomly. )

v2: fix non-smp build
v3: add NUMA_IRQ_DESC to eliminate #ifdefs

[ Impact: improve irq descriptor locality on NUMA systems ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F95EAE.2050903@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e1b2543..674e21e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -274,6 +274,10 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say N.
 
+config NUMA_IRQ_DESC
+	def_bool y
+	depends on SPARSE_IRQ && NUMA
+
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9cd4806..e583291 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
 		if (cfg_new->vector != 0)
 			continue;
 
-#ifdef CONFIG_NUMA_IRQ_DESC
-		/* different node ?*/
-		if (desc_new->node != node)
-			desc = move_irq_desc(desc, node);
-#endif
+		desc_new = move_irq_desc(desc_new, node);
 
 		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
 			irq = new;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4b95ddb..eedbb8e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -206,9 +206,16 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
-#else /* CONFIG_SPARSE_IRQ */
+#endif
+
+#ifdef CONFIG_NUMA_IRQ_DESC
 extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
-#endif /* CONFIG_SPARSE_IRQ */
+#else
+static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
+{
+	return desc;
+}
+#endif
 
 extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
 
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2f06527..7d04780 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
-- 
cgit v0.10.2


From d444c3c38189b3f18337a213855ac1c07af4e2d9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 13:44:33 -0400
Subject: ext4: Move the ext4_i.h header file into ext4.h

There is no longer a reason for a separate ext4_i.h header file, so
move it into ext4.h just to make life easier for developers to find
the relevant data structures and typedefs.  Should also speed up
compiles slightly, too.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 02ec44b..ba57d66 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,10 @@
 #include <linux/magic.h>
 #include <linux/jbd2.h>
 #include <linux/quota.h>
-#include "ext4_i.h"
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 
 /*
  * The fourth extended filesystem constants/structures
@@ -46,6 +49,19 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -516,6 +532,110 @@ do {									       \
 #endif /* defined(__KERNEL__) || defined(__linux__) */
 
 /*
+ * storage for cached extent
+ */
+struct ext4_ext_cache {
+	ext4_fsblk_t	ec_start;
+	ext4_lblk_t	ec_block;
+	__u32		ec_len; /* must be 32bit to return holes */
+	__u32		ec_type;
+};
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+	__le32	i_data[15];	/* unconverted */
+	__u32	i_flags;
+	ext4_fsblk_t	i_file_acl;
+	__u32	i_dtime;
+
+	/*
+	 * i_block_group is the number of the block group which contains
+	 * this file's inode.  Constant across the lifetime of the inode,
+	 * it is ued for making block allocation decisions - we try to
+	 * place a file's data blocks near its inode block, and new inodes
+	 * near to their parent directory's inode.
+	 */
+	ext4_group_t	i_block_group;
+	__u32	i_state;		/* Dynamic state flags for ext4 */
+
+	ext4_lblk_t		i_dir_start_lookup;
+#ifdef CONFIG_EXT4_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_mutex even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	struct posix_acl	*i_acl;
+	struct posix_acl	*i_default_acl;
+#endif
+
+	struct list_head i_orphan;	/* unlinked but open inodes */
+
+	/*
+	 * i_disksize keeps track of what the inode size is ON DISK, not
+	 * in memory.  During truncate, i_size is set to the new size by
+	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
+	 * set i_disksize to 0 until the truncate is actually under way.
+	 *
+	 * The intent is that i_disksize always represents the blocks which
+	 * are used by this file.  This allows recovery to restart truncate
+	 * on orphans if we crash during truncate.  We actually write i_disksize
+	 * into the on-disk inode when writing inodes out, instead of i_size.
+	 *
+	 * The only time when i_disksize and i_size may be different is when
+	 * a truncate is in progress.  The only things which change i_disksize
+	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+	 */
+	loff_t	i_disksize;
+
+	/*
+	 * i_data_sem is for serialising ext4_truncate() against
+	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
+	 * data tree are chopped off during truncate. We can't do that in
+	 * ext4 because whenever we perform intermediate commits during
+	 * truncate, the inode and all the metadata blocks *must* be in a
+	 * consistent state which allows truncation of the orphans to restart
+	 * during recovery.  Hence we must fix the get_block-vs-truncate race
+	 * by other means, so we have i_data_sem.
+	 */
+	struct rw_semaphore i_data_sem;
+	struct inode vfs_inode;
+	struct jbd2_inode jinode;
+
+	struct ext4_ext_cache i_cached_extent;
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
+	struct timespec i_crtime;
+
+	/* mballoc */
+	struct list_head i_prealloc_list;
+	spinlock_t i_prealloc_lock;
+
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
+	/* allocation reservation info for delalloc */
+	unsigned int i_reserved_data_blocks;
+	unsigned int i_reserved_meta_blocks;
+	unsigned int i_allocated_meta_blocks;
+	unsigned short i_delalloc_reserved_flag;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+
+	spinlock_t i_block_reservation_lock;
+};
+
+/*
  * File system states
  */
 #define	EXT4_VALID_FS			0x0001	/* Unmounted cleanly */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187..0000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- *  ext4_i.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_i.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_I
-#define _EXT4_I
-
-#include <linux/rwsem.h>
-#include <linux/rbtree.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-
-/* data type for block offset of block group */
-typedef int ext4_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long long ext4_fsblk_t;
-
-/* data type for file logical block number */
-typedef __u32 ext4_lblk_t;
-
-/* data type for block group number */
-typedef unsigned int ext4_group_t;
-
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
-	ext4_fsblk_t	ec_start;
-	ext4_lblk_t	ec_block;
-	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
-};
-
-/*
- * fourth extended file system inode data in memory
- */
-struct ext4_inode_info {
-	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-	ext4_fsblk_t	i_file_acl;
-	__u32	i_dtime;
-
-	/*
-	 * i_block_group is the number of the block group which contains
-	 * this file's inode.  Constant across the lifetime of the inode,
-	 * it is ued for making block allocation decisions - we try to
-	 * place a file's data blocks near its inode block, and new inodes
-	 * near to their parent directory's inode.
-	 */
-	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
-
-	ext4_lblk_t		i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
-	/*
-	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_mutex even when reading would cause contention
-	 * between readers of EAs and writers of regular file data, so
-	 * instead we synchronize on xattr_sem when reading or changing
-	 * EAs.
-	 */
-	struct rw_semaphore xattr_sem;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	struct posix_acl	*i_acl;
-	struct posix_acl	*i_default_acl;
-#endif
-
-	struct list_head i_orphan;	/* unlinked but open inodes */
-
-	/*
-	 * i_disksize keeps track of what the inode size is ON DISK, not
-	 * in memory.  During truncate, i_size is set to the new size by
-	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
-	 * set i_disksize to 0 until the truncate is actually under way.
-	 *
-	 * The intent is that i_disksize always represents the blocks which
-	 * are used by this file.  This allows recovery to restart truncate
-	 * on orphans if we crash during truncate.  We actually write i_disksize
-	 * into the on-disk inode when writing inodes out, instead of i_size.
-	 *
-	 * The only time when i_disksize and i_size may be different is when
-	 * a truncate is in progress.  The only things which change i_disksize
-	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
-	 */
-	loff_t	i_disksize;
-
-	/*
-	 * i_data_sem is for serialising ext4_truncate() against
-	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
-	 * data tree are chopped off during truncate. We can't do that in
-	 * ext4 because whenever we perform intermediate commits during
-	 * truncate, the inode and all the metadata blocks *must* be in a
-	 * consistent state which allows truncation of the orphans to restart
-	 * during recovery.  Hence we must fix the get_block-vs-truncate race
-	 * by other means, so we have i_data_sem.
-	 */
-	struct rw_semaphore i_data_sem;
-	struct inode vfs_inode;
-	struct jbd2_inode jinode;
-
-	struct ext4_ext_cache i_cached_extent;
-	/*
-	 * File creation time. Its function is same as that of
-	 * struct timespec i_{a,c,m}time in the generic inode.
-	 */
-	struct timespec i_crtime;
-
-	/* mballoc */
-	struct list_head i_prealloc_list;
-	spinlock_t i_prealloc_lock;
-
-	/* ialloc */
-	ext4_group_t	i_last_alloc_group;
-
-	/* allocation reservation info for delalloc */
-	unsigned int i_reserved_data_blocks;
-	unsigned int i_reserved_meta_blocks;
-	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-
-	/* on-disk additional length */
-	__u16 i_extra_isize;
-
-	spinlock_t i_block_reservation_lock;
-};
-
-#endif	/* _EXT4_I */
-- 
cgit v0.10.2


From ca0faba0e8ac844dc0279825eb8db876b5962ea5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 3 May 2009 16:33:44 -0400
Subject: ext4: Move the ext4_sb.h header file into ext4.h

There is no longer a reason for a separate ext4_sb.h header file, so
move it into ext4.h just to make life easier for developers to find
the relevant data structures and typedefs.  Should also speed up
compiles slightly, too.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ba57d66..af3c906 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -25,6 +25,10 @@
 #include <linux/rbtree.h>
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
 
 /*
  * The fourth extended filesystem constants/structures
@@ -195,9 +199,6 @@ struct flex_groups {
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
 
-#ifdef __KERNEL__
-#include "ext4_sb.h"
-#endif
 /*
  * Macro-instructions used to manage group descriptors
  */
@@ -809,6 +810,136 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
+	unsigned long s_inodes_per_block;/* Number of inodes per block */
+	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
+	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
+	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
+	unsigned long  s_mount_opt;
+	ext4_fsblk_t s_sb_block;
+	uid_t s_resuid;
+	gid_t s_resgid;
+	unsigned short s_mount_state;
+	unsigned short s_pad;
+	int s_addr_per_block_bits;
+	int s_desc_per_block_bits;
+	int s_inode_size;
+	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+	u32 s_hash_seed[4];
+	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
+	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
+	struct blockgroup_lock *s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
+
+	/* Journaling */
+	struct inode *s_journal_inode;
+	struct journal_s *s_journal;
+	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
+	struct mutex s_resize_lock;
+	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
+	struct block_device *journal_bdev;
+#ifdef CONFIG_JBD2_DEBUG
+	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
+	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
+#endif
+#ifdef CONFIG_QUOTA
+	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
+	int s_jquota_fmt;			/* Format of quota to use */
+#endif
+	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+
+#ifdef EXTENTS_STATS
+	/* ext4 extents stats */
+	unsigned long s_ext_min;
+	unsigned long s_ext_max;
+	unsigned long s_depth_max;
+	spinlock_t s_ext_stats_lock;
+	unsigned long s_ext_blocks;
+	unsigned long s_ext_extents;
+#endif
+
+	/* for buddy allocator */
+	struct ext4_group_info ***s_group_info;
+	struct inode *s_buddy_cache;
+	long s_blocks_reserved;
+	spinlock_t s_reserve_lock;
+	spinlock_t s_md_lock;
+	tid_t s_last_transaction;
+	unsigned short *s_mb_offsets;
+	unsigned int *s_mb_maxs;
+
+	/* tunables */
+	unsigned long s_stripe;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
+	/* where last allocation was done - for stream allocation */
+	unsigned long s_mb_last_group;
+	unsigned long s_mb_last_start;
+
+	/* history to debug policy */
+	struct ext4_mb_history *s_mb_history;
+	int s_mb_history_cur;
+	int s_mb_history_max;
+	int s_mb_history_num;
+	spinlock_t s_mb_history_lock;
+	int s_mb_history_filter;
+
+	/* stats for buddy allocator */
+	spinlock_t s_mb_pa_lock;
+	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
+	atomic_t s_bal_success;	/* we found long enough chunks */
+	atomic_t s_bal_allocated;	/* in blocks */
+	atomic_t s_bal_ex_scanned;	/* total extents scanned */
+	atomic_t s_bal_goals;	/* goal hits */
+	atomic_t s_bal_breaks;	/* too long searches */
+	atomic_t s_bal_2orders;	/* 2^order hits */
+	spinlock_t s_bal_lock;
+	unsigned long s_mb_buddies_generated;
+	unsigned long long s_mb_generation_time;
+	atomic_t s_mb_lost_chunks;
+	atomic_t s_mb_preallocated;
+	atomic_t s_mb_discarded;
+
+	/* locality groups */
+	struct ext4_locality_group *s_locality_groups;
+
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
+};
+
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -824,7 +955,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
 		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
 
-
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
 	return ino == EXT4_ROOT_INO ||
@@ -833,6 +963,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 2d36223..0000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- *  ext4_sb.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_sb.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_SB
-#define _EXT4_SB
-
-#ifdef __KERNEL__
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
-#endif
-#include <linux/rbtree.h>
-
-/*
- * fourth extended-fs super-block data in memory
- */
-struct ext4_sb_info {
-	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
-	unsigned long s_inodes_per_block;/* Number of inodes per block */
-	unsigned long s_blocks_per_group;/* Number of blocks in a group */
-	unsigned long s_inodes_per_group;/* Number of inodes in a group */
-	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
-	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
-	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
-	ext4_group_t s_groups_count;	/* Number of groups in the fs */
-	unsigned long s_overhead_last;  /* Last calculated overhead */
-	unsigned long s_blocks_last;    /* Last seen block count */
-	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
-	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head **s_group_desc;
-	unsigned long  s_mount_opt;
-	ext4_fsblk_t s_sb_block;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned short s_mount_state;
-	unsigned short s_pad;
-	int s_addr_per_block_bits;
-	int s_desc_per_block_bits;
-	int s_inode_size;
-	int s_first_ino;
-	unsigned int s_inode_readahead_blks;
-	spinlock_t s_next_gen_lock;
-	u32 s_next_generation;
-	u32 s_hash_seed[4];
-	int s_def_hash_version;
-	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
-	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock *s_blockgroup_lock;
-	struct proc_dir_entry *s_proc;
-	struct kobject s_kobj;
-	struct completion s_kobj_unregister;
-
-	/* Journaling */
-	struct inode *s_journal_inode;
-	struct journal_s *s_journal;
-	struct list_head s_orphan;
-	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
-	unsigned long s_commit_interval;
-	u32 s_max_batch_time;
-	u32 s_min_batch_time;
-	struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
-	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
-#endif
-#ifdef CONFIG_QUOTA
-	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
-	int s_jquota_fmt;			/* Format of quota to use */
-#endif
-	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-
-#ifdef EXTENTS_STATS
-	/* ext4 extents stats */
-	unsigned long s_ext_min;
-	unsigned long s_ext_max;
-	unsigned long s_depth_max;
-	spinlock_t s_ext_stats_lock;
-	unsigned long s_ext_blocks;
-	unsigned long s_ext_extents;
-#endif
-
-	/* for buddy allocator */
-	struct ext4_group_info ***s_group_info;
-	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
-	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
-	unsigned short *s_mb_offsets;
-	unsigned int *s_mb_maxs;
-
-	/* tunables */
-	unsigned long s_stripe;
-	unsigned int s_mb_stream_request;
-	unsigned int s_mb_max_to_scan;
-	unsigned int s_mb_min_to_scan;
-	unsigned int s_mb_stats;
-	unsigned int s_mb_order2_reqs;
-	unsigned int s_mb_group_prealloc;
-	/* where last allocation was done - for stream allocation */
-	unsigned long s_mb_last_group;
-	unsigned long s_mb_last_start;
-
-	/* history to debug policy */
-	struct ext4_mb_history *s_mb_history;
-	int s_mb_history_cur;
-	int s_mb_history_max;
-	int s_mb_history_num;
-	spinlock_t s_mb_history_lock;
-	int s_mb_history_filter;
-
-	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
-	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
-	atomic_t s_bal_success;	/* we found long enough chunks */
-	atomic_t s_bal_allocated;	/* in blocks */
-	atomic_t s_bal_ex_scanned;	/* total extents scanned */
-	atomic_t s_bal_goals;	/* goal hits */
-	atomic_t s_bal_breaks;	/* too long searches */
-	atomic_t s_bal_2orders;	/* 2^order hits */
-	spinlock_t s_bal_lock;
-	unsigned long s_mb_buddies_generated;
-	unsigned long long s_mb_generation_time;
-	atomic_t s_mb_lost_chunks;
-	atomic_t s_mb_preallocated;
-	atomic_t s_mb_discarded;
-
-	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
-
-	/* for write statistics */
-	unsigned long s_sectors_written_start;
-	u64 s_kbytes_written;
-
-	unsigned int s_log_groups_per_flex;
-	struct flex_groups *s_flex_groups;
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-#endif	/* _EXT4_SB */
-- 
cgit v0.10.2


From 596397b77c895d0fa3674f579c94ad5ea88ef01d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 13:49:15 -0400
Subject: ext4: Move fs/ext4/namei.h into ext4.h

The fs/ext4/namei.h header file had only a single function
declaration, and should have never been a standalone file.  Move it
into ext4.h, where should have been from the beginning.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af3c906..d9c5251 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1594,6 +1594,7 @@ extern const struct file_operations ext4_file_operations;
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8018e49..c9690b2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff..0000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*  linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- *	Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1fbf090..d79e1c4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,7 +46,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
 #include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
-- 
cgit v0.10.2


From 6f0aced639d346e5f54eea9fcb2784b633493d09 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Fri, 1 May 2009 23:54:25 +0400
Subject: x86, apic: use pr_ macro

Replace recenly appeared printk with pr_ macro
(the file already use a lot of them).

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090501195425.GB4633@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 28f747d..e258bed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2191,7 +2191,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d)
 {
 	if (multi)
 		return 0;
-	printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident);
+	pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
 	multi = 1;
 	return 0;
 }
-- 
cgit v0.10.2


From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 19:44:44 -0400
Subject: ext4: Move fs/ext4/group.h into ext4.h

Move the function prototypes in group.h into ext4.h so they are all
defined in one place.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a5ba039..92f557d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"
 
 /*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d9c5251..5973f32 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+				      ext4_group_t block_group);
+extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
+				       struct ext4_group_desc *desc);
+#define ext4_free_blocks_after_init(sb, group, desc)			\
+		ext4_init_block_bitmap(sb, NULL, group, desc)
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
+				       struct ext4_group_desc *desc);
+extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
 				struct ext4_group_desc *bg, __u32 count);
 extern void ext4_itable_unused_set(struct super_block *sb,
 				   struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+				   struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+				       struct ext4_group_desc *gdp);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d..0000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-				   struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
-				       struct ext4_group_desc *gdp);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)			\
-		ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 55ba419..916d05c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -27,7 +27,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd..75e34f6 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,7 +23,6 @@
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
-#include "group.h"
 
 /*
  * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e8ded13..27eb289 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 
 #include "ext4_jbd2.h"
-#include "group.h"
 
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d79e1c4..7903f20 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,7 +46,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
-- 
cgit v0.10.2


From f40339031b04279c3fdde7ac5fe97db33b2a7694 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 1 May 2009 20:27:20 -0400
Subject: ext4: Make the length of the mb_history file tunable

In memory-constrained systems with many partitions, the ~68K for each
partition for the mb_history buffer can be excessive.

This patch adds a new mount option, mb_history_length, as well as a
way of setting the default via a module parameter (or via a sysfs
parameter in /sys/module/ext4/parameter/default_mb_history_length).
If the mb_history_length is set to zero, the mb_history facility is
disabled entirely.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbd47ea..df75855 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb)
 
 	if (sbi->s_proc != NULL) {
 		remove_proc_entry("mb_groups", sbi->s_proc);
-		remove_proc_entry("mb_history", sbi->s_proc);
+		if (sbi->s_mb_history_max)
+			remove_proc_entry("mb_history", sbi->s_proc);
 	}
 	kfree(sbi->s_mb_history);
 }
@@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb)
 	int i;
 
 	if (sbi->s_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
-				 &ext4_mb_seq_history_fops, sb);
+		if (sbi->s_mb_history_max)
+			proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+					 &ext4_mb_seq_history_fops, sb);
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}
 
-	sbi->s_mb_history_max = 1000;
 	sbi->s_mb_history_cur = 0;
 	spin_lock_init(&sbi->s_mb_history_lock);
 	i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-	sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+	sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
 	/* if we can't allocate history, then we simple won't use it */
 }
 
@@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_mb_history h;
 
-	if (unlikely(sbi->s_mb_history == NULL))
+	if (sbi->s_mb_history == NULL)
 		return;
 
 	if (!(ac->ac_op & sbi->s_mb_history_filter))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7903f20..39223a5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -47,6 +47,13 @@
 #include "xattr.h"
 #include "acl.h"
 
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+		   int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+		 "Default number of entries saved for mb_history");
+
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
@@ -1042,7 +1049,7 @@ enum {
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1088,6 +1095,7 @@ static const match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_err_abort, "data_err=abort"},
 	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_mb_history_length, "mb_history_length=%u"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_data_err_ignore:
 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
 			break;
+		case Opt_mb_history_length:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_mb_history_max = option;
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+	sbi->s_mb_history_max = default_mb_history_length;
 
 	set_opt(sbi->s_mount_opt, BARRIER);
 
-- 
cgit v0.10.2


From abc8746eb91fb01e8d411896f80f7687c0d8372e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 2 May 2009 22:54:32 -0400
Subject: ext4: hook fiemap operation for directories

Add fiemap callback for directories

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c9690b2..f2bc160 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2534,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fiemap         = ext4_fiemap,
 };
 
 const struct inode_operations ext4_special_inode_operations = {
-- 
cgit v0.10.2


From 19ba0559f9ce104171ab16706893ce01f03ef116 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 May 2009 18:12:05 -0400
Subject: vfs: Enable FS_IOC_FIEMAP and FIGETBSZ for all filetypes

The fiemap and get_blk_size ioctls should be enabled even for
directories.  So move it outisde file_ioctl.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42..286f38d 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
-	case FS_IOC_FIEMAP:
-		return ioctl_fiemap(filp, arg);
-	case FIGETBSZ:
-		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD:
 		return put_user(i_size_read(inode) - filp->f_pos, p);
 	}
@@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
+
+	case FIGETBSZ:
+	{
+		struct inode *inode = filp->f_path.dentry->d_inode;
+		int __user *p = (int __user *)arg;
+		return put_user(inode->i_sb->s_blocksize, p);
+	}
+
 	default:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
-- 
cgit v0.10.2


From c9877b205f6ce7943bb95281342f4001cc1c00ec Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 May 2009 23:32:06 -0400
Subject: ext4: fix for fiemap last-block test

Carl Henrik Lunde reported and debugged this; the test for the
last allocated block was comparing bytes to blocks in this test:

	if (logical + length - 1 == EXT_MAX_BLOCK ||
	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
		flags |= FIEMAP_EXTENT_LAST;

so any extent which ended right at 4G was stopping the extent
walk.  Just replacing these values with the extent block &
length should fix it.

Also give blksize_bits a saner type, and reverse the order
of the tests to make the more likely case tested first.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reported-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Tested-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea5c476..5f72952 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3196,7 +3196,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		       void *data)
 {
 	struct fiemap_extent_info *fieinfo = data;
-	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
@@ -3243,8 +3243,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	 *
 	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
 	 */
-	if (logical + length - 1 == EXT_MAX_BLOCK ||
-	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
+	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK)
 		flags |= FIEMAP_EXTENT_LAST;
 
 	error = fiemap_fill_next_extent(fieinfo, logical, physical,
-- 
cgit v0.10.2


From a25cbd045a2ffc42787d4dbcbb9c7118f5f42732 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Fri, 1 May 2009 14:45:46 +0900
Subject: clocksource: setup mult_orig in clocksource_enable()

Setup clocksource mult_orig in clocksource_enable().

Clocksource drivers can save power by using keeping the
device clock disabled while the clocksource is unused.

In practice this means that the enable() and disable()
callbacks perform clk_enable() and clk_disable().

The enable() callback may also use clk_get_rate() to get
the clock rate from the clock framework. This information
can then be used to calculate the shift and mult variables.

Currently the mult_orig variable is setup from mult at
registration time only. This is conflicting with the above
case since the clock is disabled and the mult variable is
not yet calculated at the time of registration.

Moving the mult_orig setup code to clocksource_enable()
allows us to both handle the common case with no enable()
callback and the mult-changed-after-enable() case.

[ Impact: allow dynamic clock source usage ]

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090501054546.8193.10688.sendpatchset@rx1.opensource.se>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 5a40d14..c56457c 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -288,7 +288,15 @@ static inline cycle_t clocksource_read(struct clocksource *cs)
  */
 static inline int clocksource_enable(struct clocksource *cs)
 {
-	return cs->enable ? cs->enable(cs) : 0;
+	int ret = 0;
+
+	if (cs->enable)
+		ret = cs->enable(cs);
+
+	/* save mult_orig on enable */
+	cs->mult_orig = cs->mult;
+
+	return ret;
 }
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5..80189f6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
 	unsigned long flags;
 	int ret;
 
-	/* save mult_orig on registration */
-	c->mult_orig = c->mult;
-
 	spin_lock_irqsave(&clocksource_lock, flags);
 	ret = clocksource_enqueue(c);
 	if (!ret)
-- 
cgit v0.10.2


From 7d27558c4138ac6b3684dea35c2f4379b940a7dd Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 1 May 2009 13:10:26 -0700
Subject: timekeeping: create arch_gettimeoffset infrastructure

Some arches don't supply their own clocksource. This is mainly the
case in architectures that get their inter-tick times by reading the
counter on their interval timer.  Since these timers wrap every tick,
they're not really useful as clocksources.  Wrapping them to act like
one is possible but not very efficient. So we provide a callout these
arches can implement for use with the jiffies clocksource to provide
finer then tick granular time.

[ Impact: ease the migration to generic time keeping ]

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/time.h b/include/linux/time.h
index 242f624..ea16c1a 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -113,6 +113,21 @@ struct timespec current_kernel_time(void);
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
 
+/* Some architectures do not supply their own clocksource.
+ * This is mainly the case in architectures that get their
+ * inter-tick times by reading the counter on their interval
+ * timer. Since these timers wrap every tick, they're not really
+ * useful as clocksources. Wrapping them to act like one is possible
+ * but not very efficient. So we provide a callout these arches
+ * can implement for use with the jiffies clocksource to provide
+ * finer then tick granular time.
+ */
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+extern u32 arch_gettimeoffset(void);
+#else
+static inline u32 arch_gettimeoffset(void) { return 0; }
+#endif
+
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff4..e97c50f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
 	clock->cycle_last = cycle_now;
 
 	nsec = cyc2ns(clock, cycle_delta);
+
+	/* If arch requires, add in gettimeoffset() */
+	nsec += arch_gettimeoffset();
+
 	timespec_add_ns(&xtime, nsec);
 
 	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
 		/* convert to nanoseconds: */
 		nsecs = cyc2ns(clock, cycle_delta);
 
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
+
 	} while (read_seqretry(&xtime_lock, seq));
 
 	timespec_add_ns(ts, nsecs);
-- 
cgit v0.10.2


From eefd7f03b86b8a319890e7fac5a6fcc7f8694b76 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 2 May 2009 19:05:37 -0400
Subject: ext4: fix the length returned by fiemap for an unallocated extent

If the file's blocks have not yet been allocated because of delayed
allocation, the length of the extent returned by fiemap is incorrect.
This commit fixes this bug.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5f72952..4fec6b7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3244,8 +3244,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
 	 */
 	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK)
+	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
+		loff_t size = i_size_read(inode);
+		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
+
 		flags |= FIEMAP_EXTENT_LAST;
+		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
+		    logical+length > size)
+			length = (size - logical + bs - 1) & ~(bs-1);
+	}
 
 	error = fiemap_fill_next_extent(fieinfo, logical, physical,
 					length, flags);
-- 
cgit v0.10.2


From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 2 May 2009 20:35:09 -0400
Subject: ext4: Convert ext4_lock_group to use sb_bgl_lock

We have sb_bgl_lock() and ext4_group_info.bb_state
bit spinlock to protech group information. The later is only
used within mballoc code. Consolidate them to use sb_bgl_lock().
This makes the mballoc.c code much simpler and also avoid
confusion with two locks protecting same info.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92f557d..e2126d7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	down_write(&grp->alloc_sem);
 	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			blocks_freed++;
 		}
 	}
-	spin_lock(sb_bgl_lock(sbi, block_group));
+	ext4_lock_group(sb, block_group);
 	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
 	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
 	if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5973f32..149e02d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1568,33 +1562,31 @@ struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
-#define EXT4_GROUP_INFO_LOCKED_BIT	1
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+					      ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
+}
 
-	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	spin_lock(ext4_group_lock_ptr(sb, group));
 }
 
 static inline void ext4_unlock_group(struct super_block *sb,
 					ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+	spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
 static inline int ext4_is_group_locked(struct super_block *sb,
 					ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-						&(grinfo->bb_state));
+	return spin_is_locked(ext4_group_lock_ptr(sb, group));
 }
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 916d05c..82f7d1d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	spin_lock(sb_bgl_lock(sbi, block_group));
-	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+					bit, bitmap_bh->b_data);
 	if (!cleared)
 		ext4_error(sb, "ext4_free_inode",
 			   "bit already cleared for inode %lu", ino);
@@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		if (fatal) goto error_return;
 
 		if (gdp) {
-			spin_lock(sb_bgl_lock(sbi, block_group));
+			ext4_lock_group(sb, block_group);
 			count = ext4_free_inodes_count(sb, gdp) + 1;
 			ext4_free_inodes_set(sb, gdp, count);
 			if (is_directory) {
@@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
-			spin_unlock(sb_bgl_lock(sbi, block_group));
+			ext4_unlock_group(sb, block_group);
 			percpu_counter_inc(&sbi->s_freeinodes_counter);
 			if (is_directory)
 				percpu_counter_dec(&sbi->s_dirs_counter);
@@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 
 /*
  * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
+ * is uninit we need to take the groups's ext4_group_lock
  * and clear the uninit flag. The inode bitmap update
  * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * after holding ext4_group_lock so that ext4_read_inode_bitmap
  * doesn't race with the ext4_claim_inode
  */
 static int ext4_claim_inode(struct super_block *sb,
@@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
-	spin_lock(sb_bgl_lock(sbi, group));
+	ext4_lock_group(sb, group);
 	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
 		/* not a free inode */
 		retval = 1;
@@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	ino++;
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
-		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_unlock_group(sb, group);
 		ext4_error(sb, __func__,
 			   "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
@@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
-	spin_unlock(sb_bgl_lock(sbi, group));
+	ext4_unlock_group(sb, group);
 	return retval;
 }
 
@@ -935,7 +934,7 @@ got:
 		}
 
 		free = 0;
-		spin_lock(sb_bgl_lock(sbi, group));
+		ext4_lock_group(sb, group);
 		/* recheck and clear flag under lock if we still need to */
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -944,7 +943,7 @@ got:
 			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 								gdp);
 		}
-		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_unlock_group(sb, group);
 
 		/* Don't need to dirty bitmap block if we didn't change it */
 		if (free) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index df75855..e76459c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
 	ext4_set_bit(bit, addr);
 }
 
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-	addr = mb_correct_addr_and_bit(&bit, addr);
-	ext4_set_bit_atomic(lock, bit, addr);
-}
-
 static inline void mb_clear_bit(int bit, void *addr)
 {
 	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_clear_bit(bit, addr);
 }
 
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-	addr = mb_correct_addr_and_bit(&bit, addr);
-	ext4_clear_bit_atomic(lock, bit, addr);
-}
-
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
 	int fix = 0, ret, tmpmax;
@@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			unlock_buffer(bh[i]);
 			continue;
 		}
-		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		ext4_lock_group(sb, first_group + i);
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
 			set_bitmap_uptodate(bh[i]);
 			set_buffer_uptodate(bh[i]);
-			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+			ext4_unlock_group(sb, first_group + i);
 			unlock_buffer(bh[i]);
 			continue;
 		}
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		ext4_unlock_group(sb, first_group + i);
 		if (buffer_uptodate(bh[i])) {
 			/*
 			 * if not uninit if bh is uptodate,
@@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 	return 0;
 }
 
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
 
@@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		if (lock)
-			mb_clear_bit_atomic(lock, cur, bm);
-		else
-			mb_clear_bit(cur, bm);
+		mb_clear_bit(cur, bm);
 		cur++;
 	}
 }
 
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_set_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
 
@@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		if (lock)
-			mb_set_bit_atomic(lock, cur, bm);
-		else
-			mb_set_bit(cur, bm);
+		mb_set_bit(cur, bm);
 		cur++;
 	}
 }
@@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 	}
 
-	mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
-			EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
 
 	return ret;
@@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	return 0;
 }
 
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* need to called with the ext4 group lock held */
 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 {
 	struct ext4_prealloc_space *pa;
@@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
 		 */
-		mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
-				bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-				ac->ac_b_ex.fe_len);
+		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+		mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+			    ac->ac_b_ex.fe_len);
+		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!err)
 			err = -EAGAIN;
 		goto out_err;
 	}
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
 #ifdef AGGRESSIVE_CHECK
 	{
 		int i;
@@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		}
 	}
 #endif
-	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-	mb_set_bits(NULL, bitmap_bh->b_data,
-				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+	mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		ext4_free_blks_set(sb, gdp,
@@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
 	ext4_free_blks_set(sb, gdp, len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
 	 * Now reduce the dirty block count also. Should not go negative
@@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
  * the function goes through all block freed in the group
  * but not yet committed and marks them used in in-core bitmap.
  * buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with the ext4 group lock held
  */
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group)
@@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 
 	while (n) {
 		entry = rb_entry(n, struct ext4_free_data, node);
-		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-				bitmap, entry->start_blk,
-				entry->count);
+		mb_set_bits(bitmap, entry->start_blk, entry->count);
 		n = rb_next(n);
 	}
 	return;
@@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 /*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
  */
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group)
@@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		if (unlikely(len == 0))
 			continue;
 		BUG_ON(groupnr != group);
-		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-						bitmap, start, len);
+		mb_set_bits(bitmap, start, len);
 		preallocated += len;
 		count++;
 	}
@@ -4856,29 +4836,25 @@ do_more:
 		new_entry->group  = block_group;
 		new_entry->count = count;
 		new_entry->t_tid = handle->h_transaction->t_tid;
+
 		ext4_lock_group(sb, block_group);
-		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-				bit, count);
+		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		ext4_mb_free_metadata(handle, &e4b, new_entry);
-		ext4_unlock_group(sb, block_group);
 	} else {
-		ext4_lock_group(sb, block_group);
 		/* need to update group_info->bb_free and bitmap
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
-		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-				bit, count);
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-		ext4_unlock_group(sb, block_group);
 	}
 
-	spin_lock(sb_bgl_lock(sbi, block_group));
 	ret = ext4_free_blks_count(sb, gdp) + count;
 	ext4_free_blks_set(sb, gdp, ret);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 39223a5..dc34ed3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb)
 			       "(block %llu)!\n", i, inode_table);
 			return 0;
 		}
-		spin_lock(sb_bgl_lock(sbi, i));
+		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Checksum for group %u failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
-				spin_unlock(sb_bgl_lock(sbi, i));
+				ext4_unlock_group(sb, i);
 				return 0;
 			}
 		}
-		spin_unlock(sb_bgl_lock(sbi, i));
+		ext4_unlock_group(sb, i);
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
-- 
cgit v0.10.2


From dab6f6a3401f596fe934f41fc5da3f401adfdfb1 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sat, 2 May 2009 08:02:36 +0200
Subject: perf_counter tools: fix build error

ctype.h crawled out of the bit bucket :)

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 3a3deb3..ddfdcf8 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -15,7 +15,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
-- 
cgit v0.10.2


From a454ab3110175d710f4f9a96226a26ce4d5d5de2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 3 May 2009 10:09:03 +0200
Subject: x86, mm: fault.c, use printk_once() in is_errata93()

Andrew pointed out that the 'once' variable has a needlessly
function-global scope. We can in fact eliminate it completely,
via the use of printk_once().

[ Impact: cleanup ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 24a36a6..b9ca6d7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -514,8 +514,6 @@ bad:
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_64
-	static int once;
-
 	if (address != regs->ip)
 		return 0;
 
@@ -525,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 	address |= 0xffffffffUL << 32;
 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
-		if (!once) {
-			printk(errata93_warning);
-			once = 1;
-		}
+		printk_once(errata93_warning);
 		regs->ip = address;
 		return 1;
 	}
-- 
cgit v0.10.2


From 0cd5f7b0c7fd96f9f00187ef2ad2328de28ae326 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:00:44 +0000
Subject: sh: remove obsolete no_irq_type

Impact: cleanup

convert the last remaining users to no_irq_chip

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c
index 528de29..2d868c6 100644
--- a/arch/sh/kernel/sh_ksyms_32.c
+++ b/arch/sh/kernel/sh_ksyms_32.c
@@ -19,13 +19,12 @@
 #include <asm/ftrace.h>
 
 extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
-extern struct hw_interrupt_type no_irq_type;
 
 /* platform dependent support */
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(irq_desc);
-EXPORT_SYMBOL(no_irq_type);
+EXPORT_SYMBOL(no_irq_chip);
 
 EXPORT_SYMBOL(strlen);
 
-- 
cgit v0.10.2


From d80498398276ca8eee7ebdbe0d47e06d01317439 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:01:20 +0000
Subject: sh: remove obsolete hw_interrupt_type

Impact: cleanup

Convert the last remaining users to struct irq_chip and remove the
define.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index 301b505..f43753b 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -39,7 +39,7 @@ static unsigned int startup_imask_irq(unsigned int irq)
 	return 0; /* never anything pending */
 }
 
-static struct hw_interrupt_type imask_irq_type = {
+static struct irq_chip imask_irq_type = {
 	.typename = "SR.IMASK",
 	.startup = startup_imask_irq,
 	.shutdown = shutdown_imask_irq,
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index 726f033..d8679a4 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -84,7 +84,7 @@ static void disable_intc_irq(unsigned int irq);
 static void mask_and_ack_intc(unsigned int);
 static void end_intc_irq(unsigned int irq);
 
-static struct hw_interrupt_type intc_irq_type = {
+static struct irq_chip intc_irq_type = {
 	.typename = "INTC",
 	.startup = startup_intc_irq,
 	.shutdown = shutdown_intc_irq,
-- 
cgit v0.10.2


From 7563431107f6debf57c1dbecfb9498cf31a1c036 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 1 May 2009 13:10:28 -0700
Subject: time: sh: convert to use arch_getoffset() infrastructure

Convert sh to use GENERIC_TIME via the arch_getoffset() infrastructure.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7e96ade..6f91478 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -75,7 +75,7 @@ config GENERIC_IOMAP
 	bool
 
 config GENERIC_TIME
-	def_bool n
+	def_bool y
 
 config GENERIC_CLOCKEVENTS
 	def_bool n
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 8f80a55..9c968d1 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -9,7 +9,7 @@ struct sys_timer_ops {
 	int (*init)(void);
 	int (*start)(void);
 	int (*stop)(void);
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 	unsigned long (*get_offset)(void);
 #endif
 };
@@ -26,7 +26,7 @@ struct sys_timer {
 extern struct sys_timer tmu_timer, mtu2_timer;
 extern struct sys_timer *sys_timer;
 
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 static inline unsigned long get_timer_offset(void)
 {
 	return sys_timer->ops->get_offset();
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 4573321..9d34dff 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -83,65 +83,12 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifndef CONFIG_GENERIC_TIME
-void do_gettimeofday(struct timeval *tv)
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+u32 arch_gettimeoffset(void)
 {
-	unsigned long flags;
-	unsigned long seq;
-	unsigned long usec, sec;
-
-	do {
-		/*
-		 * Turn off IRQs when grabbing xtime_lock, so that
-		 * the sys_timer get_offset code doesn't have to handle it.
-		 */
-		seq = read_seqbegin_irqsave(&xtime_lock, flags);
-		usec = get_timer_offset();
-		sec = xtime.tv_sec;
-		usec += xtime.tv_nsec / NSEC_PER_USEC;
-	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * made, and then undo it!
-	 */
-	nsec -= get_timer_offset() * NSEC_PER_USEC;
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-
-	return 0;
+	return get_timer_offset() * 1000;
 }
-EXPORT_SYMBOL(do_settimeofday);
-#endif /* !CONFIG_GENERIC_TIME */
+#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
 
 /* last time the RTC clock got updated */
 static long last_rtc_update;
@@ -238,7 +185,7 @@ struct clocksource clocksource_sh = {
 	.name		= "SuperH",
 };
 
-#ifdef CONFIG_GENERIC_TIME
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
 unsigned long long sched_clock(void)
 {
 	unsigned long long cycles;
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index 988c77c..f4f5e8a 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -144,59 +144,10 @@ static unsigned long usecs_since_tick(void)
 	return result;
 }
 
-void do_gettimeofday(struct timeval *tv)
+u32 arch_gettimeoffset(void)
 {
-	unsigned long flags;
-	unsigned long seq;
-	unsigned long usec, sec;
-
-	do {
-		seq = read_seqbegin_irqsave(&xtime_lock, flags);
-		usec = usecs_since_tick();
-		sec = xtime.tv_sec;
-		usec += xtime.tv_nsec / 1000;
-	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * made, and then undo it!
-	 */
-	nsec -= 1000 * usecs_since_tick();
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-
-	return 0;
+	return usecs_since_tick() * 1000;
 }
-EXPORT_SYMBOL(do_settimeofday);
 
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
index 9b0ef01..8a1dcc2 100644
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ b/arch/sh/kernel/timers/timer-mtu2.c
@@ -191,7 +191,7 @@ struct sys_timer_ops mtu2_timer_ops = {
 	.init		= mtu2_timer_init,
 	.start		= mtu2_timer_start,
 	.stop		= mtu2_timer_stop,
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 	.get_offset	= mtu2_timer_get_offset,
 #endif
 };
-- 
cgit v0.10.2


From d5ed4c2e5ce9f5f6fd6a5a39ee1196a1f8a46eed Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:02:49 +0000
Subject: clocksource: SuperH MTU2 Timer driver

This patch adds a MTU2 driver for the SuperH architecture.

The MTU2 driver is a platform driver with early platform
support to allow using a MTU2 channel as only clockevent
during system bootup.

Clocksource on sh2a is currently unsupported due to code
generation issues with 64-bit math, so at this point only
periodic clockevent support is in place.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6f91478..6d0dd37 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -113,6 +113,9 @@ config SYS_SUPPORTS_PCI
 config SYS_SUPPORTS_CMT
 	bool
 
+config SYS_SUPPORTS_MTU2
+	bool
+
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -478,6 +481,14 @@ config SH_MTU2
 	help
 	  This enables the use of the MTU2 as the system timer.
 
+config SH_TIMER_MTU2
+	bool "MTU2 timer driver"
+	depends on SYS_SUPPORTS_MTU2 && !SH_MTU2
+	default y
+	select GENERIC_CLOCKEVENTS
+	help
+	  This enables build of the MTU2 timer driver.
+
 config SH_TIMER_IRQ
 	int
 	default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 1efb287..9785586 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_X86_CYCLONE_TIMER)	+= cyclone.o
 obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
+obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
new file mode 100644
index 0000000..420566f
--- /dev/null
+++ b/drivers/clocksource/sh_mtu2.c
@@ -0,0 +1,357 @@
+/*
+ * SuperH Timer Support - MTU2
+ *
+ *  Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/clockchips.h>
+#include <linux/sh_mtu2.h>
+
+struct sh_mtu2_priv {
+	void __iomem *mapbase;
+	struct clk *clk;
+	struct irqaction irqaction;
+	struct platform_device *pdev;
+	unsigned long rate;
+	unsigned long periodic;
+	struct clock_event_device ced;
+};
+
+static DEFINE_SPINLOCK(sh_mtu2_lock);
+
+#define TSTR -1 /* shared register */
+#define TCR  0 /* channel register */
+#define TMDR 1 /* channel register */
+#define TIOR 2 /* channel register */
+#define TIER 3 /* channel register */
+#define TSR  4 /* channel register */
+#define TCNT 5 /* channel register */
+#define TGR  6 /* channel register */
+
+static unsigned long mtu2_reg_offs[] = {
+	[TCR] = 0,
+	[TMDR] = 1,
+	[TIOR] = 2,
+	[TIER] = 4,
+	[TSR] = 5,
+	[TCNT] = 6,
+	[TGR] = 8,
+};
+
+static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR)
+		return ioread8(base + cfg->channel_offset);
+
+	offs = mtu2_reg_offs[reg_nr];
+
+	if ((reg_nr == TCNT) || (reg_nr == TGR))
+		return ioread16(base + offs);
+	else
+		return ioread8(base + offs);
+}
+
+static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
+				unsigned long value)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR) {
+		iowrite8(value, base + cfg->channel_offset);
+		return;
+	}
+
+	offs = mtu2_reg_offs[reg_nr];
+
+	if ((reg_nr == TCNT) || (reg_nr == TGR))
+		iowrite16(value, base + offs);
+	else
+		iowrite8(value, base + offs);
+}
+
+static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	unsigned long flags, value;
+
+	/* start stop register shared by multiple timer channels */
+	spin_lock_irqsave(&sh_mtu2_lock, flags);
+	value = sh_mtu2_read(p, TSTR);
+
+	if (start)
+		value |= 1 << cfg->timer_bit;
+	else
+		value &= ~(1 << cfg->timer_bit);
+
+	sh_mtu2_write(p, TSTR, value);
+	spin_unlock_irqrestore(&sh_mtu2_lock, flags);
+}
+
+static int sh_mtu2_enable(struct sh_mtu2_priv *p)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	int ret;
+
+	/* enable clock */
+	ret = clk_enable(p->clk);
+	if (ret) {
+		pr_err("sh_mtu2: cannot enable clock \"%s\"\n", cfg->clk);
+		return ret;
+	}
+
+	/* make sure channel is disabled */
+	sh_mtu2_start_stop_ch(p, 0);
+
+	p->rate = clk_get_rate(p->clk) / 64;
+	p->periodic = (p->rate + HZ/2) / HZ;
+
+	/* "Periodic Counter Operation" */
+	sh_mtu2_write(p, TCR, 0x23); /* TGRA clear, divide clock by 64 */
+	sh_mtu2_write(p, TIOR, 0);
+	sh_mtu2_write(p, TGR, p->periodic);
+	sh_mtu2_write(p, TCNT, 0);
+	sh_mtu2_write(p, TMDR, 0);
+	sh_mtu2_write(p, TIER, 0x01);
+
+	/* enable channel */
+	sh_mtu2_start_stop_ch(p, 1);
+
+	return 0;
+}
+
+static void sh_mtu2_disable(struct sh_mtu2_priv *p)
+{
+	/* disable channel */
+	sh_mtu2_start_stop_ch(p, 0);
+
+	/* stop clock */
+	clk_disable(p->clk);
+}
+
+static irqreturn_t sh_mtu2_interrupt(int irq, void *dev_id)
+{
+	struct sh_mtu2_priv *p = dev_id;
+
+	/* acknowledge interrupt */
+	sh_mtu2_read(p, TSR);
+	sh_mtu2_write(p, TSR, 0xfe);
+
+	/* notify clockevent layer */
+	p->ced.event_handler(&p->ced);
+	return IRQ_HANDLED;
+}
+
+static struct sh_mtu2_priv *ced_to_sh_mtu2(struct clock_event_device *ced)
+{
+	return container_of(ced, struct sh_mtu2_priv, ced);
+}
+
+static void sh_mtu2_clock_event_mode(enum clock_event_mode mode,
+				    struct clock_event_device *ced)
+{
+	struct sh_mtu2_priv *p = ced_to_sh_mtu2(ced);
+	int disabled = 0;
+
+	/* deal with old setting first */
+	switch (ced->mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		sh_mtu2_disable(p);
+		disabled = 1;
+		break;
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		pr_info("sh_mtu2: %s used for periodic clock events\n",
+			ced->name);
+		sh_mtu2_enable(p);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+		if (!disabled)
+			sh_mtu2_disable(p);
+		break;
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	default:
+		break;
+	}
+}
+
+static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clock_event_device *ced = &p->ced;
+	int ret;
+
+	memset(ced, 0, sizeof(*ced));
+
+	ced->name = name;
+	ced->features = CLOCK_EVT_FEAT_PERIODIC;
+	ced->rating = rating;
+	ced->cpumask = cpumask_of(0);
+	ced->set_mode = sh_mtu2_clock_event_mode;
+
+	ret = setup_irq(p->irqaction.irq, &p->irqaction);
+	if (ret) {
+		pr_err("sh_mtu2: failed to request irq %d\n",
+		       p->irqaction.irq);
+		return;
+	}
+
+	pr_info("sh_mtu2: %s used for clock events\n", ced->name);
+	clockevents_register_device(ced);
+}
+
+int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
+		     unsigned long clockevent_rating)
+{
+	if (clockevent_rating)
+		sh_mtu2_register_clockevent(p, name, clockevent_rating);
+
+	return 0;
+}
+
+static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
+{
+	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct resource *res;
+	int irq, ret;
+	ret = -ENXIO;
+
+	memset(p, 0, sizeof(*p));
+	p->pdev = pdev;
+
+	if (!cfg) {
+		dev_err(&p->pdev->dev, "missing platform data\n");
+		goto err0;
+	}
+
+	platform_set_drvdata(pdev, p);
+
+	res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&p->pdev->dev, "failed to get I/O memory\n");
+		goto err0;
+	}
+
+	irq = platform_get_irq(p->pdev, 0);
+	if (irq < 0) {
+		dev_err(&p->pdev->dev, "failed to get irq\n");
+		goto err0;
+	}
+
+	/* map memory, let mapbase point to our channel */
+	p->mapbase = ioremap_nocache(res->start, resource_size(res));
+	if (p->mapbase == NULL) {
+		pr_err("sh_mtu2: failed to remap I/O memory\n");
+		goto err0;
+	}
+
+	/* setup data for setup_irq() (too early for request_irq()) */
+	p->irqaction.name = cfg->name;
+	p->irqaction.handler = sh_mtu2_interrupt;
+	p->irqaction.dev_id = p;
+	p->irqaction.irq = irq;
+	p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
+	p->irqaction.mask = CPU_MASK_NONE;
+
+	/* get hold of clock */
+	p->clk = clk_get(&p->pdev->dev, cfg->clk);
+	if (IS_ERR(p->clk)) {
+		pr_err("sh_mtu2: cannot get clock \"%s\"\n", cfg->clk);
+		ret = PTR_ERR(p->clk);
+		goto err1;
+	}
+
+	return sh_mtu2_register(p, cfg->name, cfg->clockevent_rating);
+ err1:
+	iounmap(p->mapbase);
+ err0:
+	return ret;
+}
+
+static int __devinit sh_mtu2_probe(struct platform_device *pdev)
+{
+	struct sh_mtu2_priv *p = platform_get_drvdata(pdev);
+	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	int ret;
+
+	if (p) {
+		pr_info("sh_mtu2: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL) {
+		dev_err(&pdev->dev, "failed to allocate driver data\n");
+		return -ENOMEM;
+	}
+
+	ret = sh_mtu2_setup(p, pdev);
+	if (ret) {
+		kfree(p);
+		platform_set_drvdata(pdev, NULL);
+	}
+	return ret;
+}
+
+static int __devexit sh_mtu2_remove(struct platform_device *pdev)
+{
+	return -EBUSY; /* cannot unregister clockevent */
+}
+
+static struct platform_driver sh_mtu2_device_driver = {
+	.probe		= sh_mtu2_probe,
+	.remove		= __devexit_p(sh_mtu2_remove),
+	.driver		= {
+		.name	= "sh_mtu2",
+	}
+};
+
+static int __init sh_mtu2_init(void)
+{
+	return platform_driver_register(&sh_mtu2_device_driver);
+}
+
+static void __exit sh_mtu2_exit(void)
+{
+	platform_driver_unregister(&sh_mtu2_device_driver);
+}
+
+early_platform_init("earlytimer", &sh_mtu2_device_driver);
+module_init(sh_mtu2_init);
+module_exit(sh_mtu2_exit);
+
+MODULE_AUTHOR("Magnus Damm");
+MODULE_DESCRIPTION("SuperH MTU2 Timer Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h
new file mode 100644
index 0000000..a988763
--- /dev/null
+++ b/include/linux/sh_mtu2.h
@@ -0,0 +1,12 @@
+#ifndef __SH_MTU2_H__
+#define __SH_MTU2_H__
+
+struct sh_mtu2_config {
+	char *name;
+	int channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+};
+
+#endif /* __SH_MTU2_H__ */
-- 
cgit v0.10.2


From da107c6ef919b3afd9c9b405a4f71e03b5725b04 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:06:26 +0000
Subject: sh: sh2a MTU2 platform data

This patch adds MTU2 platform data for the following cpus:
 - sh7201 (3/5 channels)
 - sh7203/sh7263 (2/4 channels)
 - sh7206 (3/5 channels)
 - MXG (3/5 channels)

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6d0dd37..2061488 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -214,27 +214,32 @@ config CPU_SUBTYPE_SH7201
 	bool "Support SH7201 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
+	select SYS_SUPPORTS_MTU2
  
 config CPU_SUBTYPE_SH7203
 	bool "Support SH7203 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_SH7206
 	bool "Support SH7206 processor"
 	select CPU_SH2A
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_SH7263
 	bool "Support SH7263 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_MXG
 	bool "Support MX-G processor"
 	select CPU_SH2A
+	select SYS_SUPPORTS_MTU2
 	help
 	  Select MX-G if running on an R8A03022BG part.
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 8442937..870030a 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_mtu2.h>
 
 enum {
 	UNUSED = 0,
@@ -24,7 +25,7 @@ enum {
 
 	SCIF0, SCIF1,
 
-	MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5
+	MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5,
 	MTU2_TGI3B, MTU2_TGI3C,
 
 	/* interrupt groups */
@@ -113,6 +114,99 @@ static struct intc_mask_reg mask_registers[] __initdata = {
 static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups,
 			 mask_registers, prio_registers, NULL);
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xff801300,
+		.end	= 0xff801326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 228,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xff801380,
+		.end	= 0xff801390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 234,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xff801000,
+		.end	= 0xff80100a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 240,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xff804000,
@@ -134,6 +228,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *mxg_devices[] __initdata = {
 	&sci_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init mxg_devices_setup(void)
@@ -147,3 +244,15 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *mxg_early_devices[] __initdata = {
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(mxg_early_devices,
+				   ARRAY_SIZE(mxg_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 00f42f9..074aa90 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_mtu2.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -249,9 +251,105 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 108,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 116,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xfffe4000,
+		.end	= 0xfffe400a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 124,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct platform_device *sh7201_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init sh7201_devices_setup(void)
@@ -265,3 +363,20 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7201_early_devices[] __initdata = {
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
+};
+
+#define STBCR3 0xfffe0408
+
+void __init plat_early_device_setup(void)
+{
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
+	early_platform_add_devices(sh7201_early_devices,
+				   ARRAY_SIZE(sh7201_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 0836ace..3448164 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_mtu2.h>
 #include <linux/io.h>
 
 enum {
@@ -271,6 +272,68 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 146,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 153,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= 0xffff2000,
@@ -295,6 +358,8 @@ static struct platform_device *sh7203_devices[] __initdata = {
 	&sci_device,
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
 	&rtc_device,
 };
 
@@ -313,8 +378,11 @@ void __init plat_irq_setup(void)
 static struct platform_device *sh7203_early_devices[] __initdata = {
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
 };
 
+#define STBCR3 0xfffe0408
 #define STBCR4 0xfffe040c
 
 void __init plat_early_device_setup(void)
@@ -322,6 +390,9 @@ void __init plat_early_device_setup(void)
 	/* enable CMT clock */
 	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
 
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
 	early_platform_add_devices(sh7203_early_devices,
 				   ARRAY_SIZE(sh7203_early_devices));
 }
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index f9606e3..e700559 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -13,6 +13,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_mtu2.h>
 #include <linux/io.h>
 
 enum {
@@ -231,10 +232,106 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 156,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 164,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xfffe4000,
+		.end	= 0xfffe400a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 180,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct platform_device *sh7206_devices[] __initdata = {
 	&sci_device,
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init sh7206_devices_setup(void)
@@ -252,8 +349,12 @@ void __init plat_irq_setup(void)
 static struct platform_device *sh7206_early_devices[] __initdata = {
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
+#define STBCR3 0xfffe0408
 #define STBCR4 0xfffe040c
 
 void __init plat_early_device_setup(void)
@@ -261,6 +362,9 @@ void __init plat_early_device_setup(void)
 	/* enable CMT clock */
 	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
 
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
 	early_platform_add_devices(sh7206_early_devices,
 				   ARRAY_SIZE(sh7206_early_devices));
 }
-- 
cgit v0.10.2


From 1cbac972ba28e706fa9ce4d4c81830040bc811ee Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 2 May 2009 13:39:56 +0400
Subject: x86: uv io-apic - use BUILD_BUG_ON instead of BUG_ON

The expression is known to be true/false at compilation
time so we're allowed to use build-time instead of
run-time check. Also align 'entry' items assignment.

[ Impact: shrink kernel a bit, cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090502093956.GB4791@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8aef5f9..a80335b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3749,6 +3749,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	unsigned long flags;
 	int err;
 
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
 	cfg = irq_cfg(irq);
 
 	err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3762,15 +3764,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
-	entry->vector = cfg->vector;
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode = apic->irq_dest_mode;
-	entry->polarity = 0;
-	entry->trigger = 0;
-	entry->mask = 0;
-	entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+	entry->vector		= cfg->vector;
+	entry->delivery_mode	= apic->irq_delivery_mode;
+	entry->dest_mode	= apic->irq_dest_mode;
+	entry->polarity		= 0;
+	entry->trigger		= 0;
+	entry->mask		= 0;
+	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu);
 
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
 	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3788,10 +3788,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
 	struct uv_IO_APIC_route_entry *entry;
 	int mmr_pnode;
 
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
 	entry->mask = 1;
 
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-- 
cgit v0.10.2


From 3280c8865e1b738604bacdea54738acef31e8c12 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:12:09 +0000
Subject: sh: remove old MTU2 driver

This patch removes the old MTU2 driver (CONFIG_SH_MTU2/timer-mtu2.c)

As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_MTU2
and configure timer channel using platform data.

If multiple MTU channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_mtu2.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2061488..bda3dc1 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -479,16 +479,9 @@ config SH_TIMER_CMT
 	help
 	  This enables build of the CMT timer driver.
 
-config SH_MTU2
-	bool "MTU2 timer support"
-	depends on CPU_SH2A && !GENERIC_TIME
-	default y
-	help
-	  This enables the use of the MTU2 as the system timer.
-
 config SH_TIMER_MTU2
 	bool "MTU2 timer driver"
-	depends on SYS_SUPPORTS_MTU2 && !SH_MTU2
+	depends on SYS_SUPPORTS_MTU2
 	default y
 	select GENERIC_CLOCKEVENTS
 	help
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 9c968d1..581e3fe 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -23,7 +23,7 @@ struct sys_timer {
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-extern struct sys_timer tmu_timer, mtu2_timer;
+extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 1e9a104..44156df 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -5,6 +5,5 @@
 obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
-obj-$(CONFIG_SH_MTU2)		+= timer-mtu2.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
deleted file mode 100644
index 8a1dcc2..0000000
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-mtu2.c - MTU2 Timer Support
- *
- *  Copyright (C) 2005  Paul Mundt
- *
- * Based off of arch/sh/kernel/timers/timer-tmu.c
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-/*
- * We use channel 1 for our lowly system timer. Channel 2 would be the other
- * likely candidate, but we leave it alone as it has higher divisors that
- * would be of more use to other more interesting applications.
- *
- * TODO: Presently we only implement a 16-bit single-channel system timer.
- * However, we can implement channel cascade if we go the overflow route and
- * get away with using 2 MTU2 channels as a 32-bit timer.
- */
-#define MTU2_TSTR	0xfffe4280
-#define MTU2_TCR_1	0xfffe4380
-#define MTU2_TMDR_1	0xfffe4381
-#define MTU2_TIOR_1	0xfffe4382
-#define MTU2_TIER_1	0xfffe4384
-#define MTU2_TSR_1	0xfffe4385
-#define MTU2_TCNT_1	0xfffe4386	/* 16-bit counter */
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7201) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7203)
-#define MTU2_TGRA_1	0xfffe4388
-#else
-#define MTU2_TGRA_1	0xfffe438a
-#endif
-
-#define STBCR3		0xfffe0408
-
-#define MTU2_TSTR_CST1	(1 << 1)	/* Counter Start 1 */
-
-#define MTU2_TSR_TGFA	(1 << 0)	/* GRA compare match */
-
-#define MTU2_TIER_TGIEA	(1 << 0)	/* GRA compare match  interrupt enable */
-
-#define MTU2_TCR_INIT	0x22
-
-#define MTU2_TCR_CALIB  0x00
-
-static unsigned long mtu2_timer_get_offset(void)
-{
-	int count;
-	static int count_p = 0x7fff;	/* for the first call after boot */
-	static unsigned long jiffies_p = 0;
-
-	/*
-	 * cache volatile jiffies temporarily; we have IRQs turned off.
-	 */
-	unsigned long jiffies_t;
-
-	/* timer count may underflow right here */
-	count = ctrl_inw(MTU2_TCNT_1);	/* read the latched count */
-
-	jiffies_t = jiffies;
-
-	/*
-	 * avoiding timer inconsistencies (they are rare, but they happen)...
-	 * there is one kind of problem that must be avoided here:
-	 *  1. the timer counter underflows
-	 */
-
-	if (jiffies_t == jiffies_p) {
-		if (count > count_p) {
-			if (ctrl_inb(MTU2_TSR_1) & MTU2_TSR_TGFA) {
-				count -= LATCH;
-			} else {
-				printk("%s (): hardware timer problem?\n",
-				       __func__);
-			}
-		}
-	} else
-		jiffies_p = jiffies_t;
-
-	count_p = count;
-
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	count = (count + LATCH/2) / LATCH;
-
-	return count;
-}
-
-static irqreturn_t mtu2_timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear TGFA bit */
-	timer_status = ctrl_inb(MTU2_TSR_1);
-	timer_status &= ~MTU2_TSR_TGFA;
-	ctrl_outb(timer_status, MTU2_TSR_1);
-
-	/* Do timer tick */
-	handle_timer_tick();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction mtu2_irq = {
-	.name		= "timer",
-	.handler	= mtu2_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static unsigned int divisors[] = { 1, 4, 16, 64, 1, 1, 256 };
-
-static void mtu2_clk_init(struct clk *clk)
-{
-	u8 idx = MTU2_TCR_INIT & 0x7;
-
-	clk->rate = clk->parent->rate / divisors[idx];
-	/* Start TCNT counting */
-	ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR);
-
-}
-
-static void mtu2_clk_recalc(struct clk *clk)
-{
-	u8 idx = ctrl_inb(MTU2_TCR_1) & 0x7;
-	clk->rate = clk->parent->rate / divisors[idx];
-}
-
-static struct clk_ops mtu2_clk_ops = {
-	.init		= mtu2_clk_init,
-	.recalc		= mtu2_clk_recalc,
-};
-
-static struct clk mtu2_clk1 = {
-	.name		= "mtu2_clk1",
-	.ops		= &mtu2_clk_ops,
-};
-
-static int mtu2_timer_start(void)
-{
-	ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR);
-	return 0;
-}
-
-static int mtu2_timer_stop(void)
-{
-	ctrl_outb(ctrl_inb(MTU2_TSTR) & ~MTU2_TSTR_CST1, MTU2_TSTR);
-	return 0;
-}
-
-static int mtu2_timer_init(void)
-{
-	unsigned long interval;
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &mtu2_irq);
-
-	mtu2_clk1.parent = clk_get(NULL, "module_clk");
-
-	ctrl_outb(ctrl_inb(STBCR3) & (~0x20), STBCR3);
-
-	/* Normal operation */
-	ctrl_outb(0, MTU2_TMDR_1);
-	ctrl_outb(MTU2_TCR_INIT, MTU2_TCR_1);
-	ctrl_outb(0x01, MTU2_TIOR_1);
-
-	/* Enable underflow interrupt */
-	ctrl_outb(ctrl_inb(MTU2_TIER_1) | MTU2_TIER_TGIEA, MTU2_TIER_1);
-
-	interval = CONFIG_SH_PCLK_FREQ / 16 / HZ;
-	printk(KERN_INFO "Interval = %ld\n", interval);
-
-	ctrl_outw(interval, MTU2_TGRA_1);
-	ctrl_outw(0, MTU2_TCNT_1);
-
-	clk_register(&mtu2_clk1);
-	clk_enable(&mtu2_clk1);
-
-	return 0;
-}
-
-struct sys_timer_ops mtu2_timer_ops = {
-	.init		= mtu2_timer_init,
-	.start		= mtu2_timer_start,
-	.stop		= mtu2_timer_stop,
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-	.get_offset	= mtu2_timer_get_offset,
-#endif
-};
-
-struct sys_timer mtu2_timer = {
-	.name	= "mtu2",
-	.ops	= &mtu2_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index f3bd141..920891a 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -17,9 +17,6 @@ static struct sys_timer *sys_timers[] = {
 #ifdef CONFIG_SH_TMU
 	&tmu_timer,
 #endif
-#ifdef CONFIG_SH_MTU2
-	&mtu2_timer,
-#endif
 	NULL,
 };
 
-- 
cgit v0.10.2


From 3969c52d4d2fef5a4b9e3ab0e51b3901e1cc8b83 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 3 May 2009 11:11:35 +0530
Subject: x86: cpufeature.h fix name for X86_FEATURE_MCE

X86_FEATURE_MCE = Machine Check Exception
X86_FEATURE_MCA = Machine Check Architecture

[ Impact: cleanup ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1241329295.6321.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c..ccc1061 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
 #define X86_FEATURE_TSC		(0*32+ 4) /* Time Stamp Counter */
 #define X86_FEATURE_MSR		(0*32+ 5) /* Model-Specific Registers */
 #define X86_FEATURE_PAE		(0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE		(0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE		(0*32+ 7) /* Machine Check Exception */
 #define X86_FEATURE_CX8		(0*32+ 8) /* CMPXCHG8 instruction */
 #define X86_FEATURE_APIC	(0*32+ 9) /* Onboard APIC */
 #define X86_FEATURE_SEP		(0*32+11) /* SYSENTER/SYSEXIT */
-- 
cgit v0.10.2


From 9570ef20423b549757aa484ad388f9a7d5bdc4d9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 1 May 2009 06:51:00 +0000
Subject: clocksource: SuperH TMU Timer driver

This patch adds a TMU driver for the SuperH architecture.

The TMU driver is a platform driver with early platform
support to allow using a TMU channel as clockevent or
clocksource during system bootup or later.

Clocksource or clockevent can be selected.
Both periodic and oneshot clockevents are supported.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index bda3dc1..1f74737 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -116,6 +116,9 @@ config SYS_SUPPORTS_CMT
 config SYS_SUPPORTS_MTU2
 	bool
 
+config SYS_SUPPORTS_TMU
+	bool
+
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -470,6 +473,15 @@ config SH_TMU
 	help
 	  This enables the use of the TMU as the system timer.
 
+config SH_TIMER_TMU
+	bool "TMU timer driver"
+	depends on !SH_TMU && SYS_SUPPORTS_TMU
+	default y
+	select GENERIC_TIME
+	select GENERIC_CLOCKEVENTS
+	help
+	  This enables the build of the TMU timer driver.
+
 config SH_TIMER_CMT
 	bool "CMT timer driver"
 	depends on SYS_SUPPORTS_CMT
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 9785586..eef216f 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
 obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
+obj-$(CONFIG_SH_TIMER_TMU)	+= sh_tmu.o
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
new file mode 100644
index 0000000..21bd77a
--- /dev/null
+++ b/drivers/clocksource/sh_tmu.c
@@ -0,0 +1,461 @@
+/*
+ * SuperH Timer Support - TMU
+ *
+ *  Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/sh_tmu.h>
+
+struct sh_tmu_priv {
+	void __iomem *mapbase;
+	struct clk *clk;
+	struct irqaction irqaction;
+	struct platform_device *pdev;
+	unsigned long rate;
+	unsigned long periodic;
+	struct clock_event_device ced;
+	struct clocksource cs;
+};
+
+static DEFINE_SPINLOCK(sh_tmu_lock);
+
+#define TSTR -1 /* shared register */
+#define TCOR  0 /* channel register */
+#define TCNT 1 /* channel register */
+#define TCR 2 /* channel register */
+
+static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR)
+		return ioread8(base - cfg->channel_offset);
+
+	offs = reg_nr << 2;
+
+	if (reg_nr == TCR)
+		return ioread16(base + offs);
+	else
+		return ioread32(base + offs);
+}
+
+static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
+				unsigned long value)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR) {
+		iowrite8(value, base - cfg->channel_offset);
+		return;
+	}
+
+	offs = reg_nr << 2;
+
+	if (reg_nr == TCR)
+		iowrite16(value, base + offs);
+	else
+		iowrite32(value, base + offs);
+}
+
+static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	unsigned long flags, value;
+
+	/* start stop register shared by multiple timer channels */
+	spin_lock_irqsave(&sh_tmu_lock, flags);
+	value = sh_tmu_read(p, TSTR);
+
+	if (start)
+		value |= 1 << cfg->timer_bit;
+	else
+		value &= ~(1 << cfg->timer_bit);
+
+	sh_tmu_write(p, TSTR, value);
+	spin_unlock_irqrestore(&sh_tmu_lock, flags);
+}
+
+static int sh_tmu_enable(struct sh_tmu_priv *p)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	int ret;
+
+	/* enable clock */
+	ret = clk_enable(p->clk);
+	if (ret) {
+		pr_err("sh_tmu: cannot enable clock \"%s\"\n", cfg->clk);
+		return ret;
+	}
+
+	/* make sure channel is disabled */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* maximum timeout */
+	sh_tmu_write(p, TCOR, 0xffffffff);
+	sh_tmu_write(p, TCNT, 0xffffffff);
+
+	/* configure channel to parent clock / 4, irq off */
+	p->rate = clk_get_rate(p->clk) / 4;
+	sh_tmu_write(p, TCR, 0x0000);
+
+	/* enable channel */
+	sh_tmu_start_stop_ch(p, 1);
+
+	return 0;
+}
+
+static void sh_tmu_disable(struct sh_tmu_priv *p)
+{
+	/* disable channel */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* stop clock */
+	clk_disable(p->clk);
+}
+
+static void sh_tmu_set_next(struct sh_tmu_priv *p, unsigned long delta,
+			    int periodic)
+{
+	/* stop timer */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* acknowledge interrupt */
+	sh_tmu_read(p, TCR);
+
+	/* enable interrupt */
+	sh_tmu_write(p, TCR, 0x0020);
+
+	/* reload delta value in case of periodic timer */
+	if (periodic)
+		sh_tmu_write(p, TCOR, delta);
+	else
+		sh_tmu_write(p, TCOR, 0);
+
+	sh_tmu_write(p, TCNT, delta);
+
+	/* start timer */
+	sh_tmu_start_stop_ch(p, 1);
+}
+
+static irqreturn_t sh_tmu_interrupt(int irq, void *dev_id)
+{
+	struct sh_tmu_priv *p = dev_id;
+
+	/* disable or acknowledge interrupt */
+	if (p->ced.mode == CLOCK_EVT_MODE_ONESHOT)
+		sh_tmu_write(p, TCR, 0x0000);
+	else
+		sh_tmu_write(p, TCR, 0x0020);
+
+	/* notify clockevent layer */
+	p->ced.event_handler(&p->ced);
+	return IRQ_HANDLED;
+}
+
+static struct sh_tmu_priv *cs_to_sh_tmu(struct clocksource *cs)
+{
+	return container_of(cs, struct sh_tmu_priv, cs);
+}
+
+static cycle_t sh_tmu_clocksource_read(struct clocksource *cs)
+{
+	struct sh_tmu_priv *p = cs_to_sh_tmu(cs);
+
+	return sh_tmu_read(p, TCNT) ^ 0xffffffff;
+}
+
+static int sh_tmu_clocksource_enable(struct clocksource *cs)
+{
+	struct sh_tmu_priv *p = cs_to_sh_tmu(cs);
+	int ret;
+
+	ret = sh_tmu_enable(p);
+	if (ret)
+		return ret;
+
+	/* TODO: calculate good shift from rate and counter bit width */
+	cs->shift = 10;
+	cs->mult = clocksource_hz2mult(p->rate, cs->shift);
+	return 0;
+}
+
+static void sh_tmu_clocksource_disable(struct clocksource *cs)
+{
+	sh_tmu_disable(cs_to_sh_tmu(cs));
+}
+
+static int sh_tmu_register_clocksource(struct sh_tmu_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clocksource *cs = &p->cs;
+
+	cs->name = name;
+	cs->rating = rating;
+	cs->read = sh_tmu_clocksource_read;
+	cs->enable = sh_tmu_clocksource_enable;
+	cs->disable = sh_tmu_clocksource_disable;
+	cs->mask = CLOCKSOURCE_MASK(32);
+	cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	pr_info("sh_tmu: %s used as clock source\n", cs->name);
+	clocksource_register(cs);
+	return 0;
+}
+
+static struct sh_tmu_priv *ced_to_sh_tmu(struct clock_event_device *ced)
+{
+	return container_of(ced, struct sh_tmu_priv, ced);
+}
+
+static void sh_tmu_clock_event_start(struct sh_tmu_priv *p, int periodic)
+{
+	struct clock_event_device *ced = &p->ced;
+
+	sh_tmu_enable(p);
+
+	/* TODO: calculate good shift from rate and counter bit width */
+
+	ced->shift = 32;
+	ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift);
+	ced->max_delta_ns = clockevent_delta2ns(0xffffffff, ced);
+	ced->min_delta_ns = 5000;
+
+	if (periodic) {
+		p->periodic = (p->rate + HZ/2) / HZ;
+		sh_tmu_set_next(p, p->periodic, 1);
+	}
+}
+
+static void sh_tmu_clock_event_mode(enum clock_event_mode mode,
+				    struct clock_event_device *ced)
+{
+	struct sh_tmu_priv *p = ced_to_sh_tmu(ced);
+	int disabled = 0;
+
+	/* deal with old setting first */
+	switch (ced->mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+		sh_tmu_disable(p);
+		disabled = 1;
+		break;
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		pr_info("sh_tmu: %s used for periodic clock events\n",
+			ced->name);
+		sh_tmu_clock_event_start(p, 1);
+		break;
+	case CLOCK_EVT_MODE_ONESHOT:
+		pr_info("sh_tmu: %s used for oneshot clock events\n",
+			ced->name);
+		sh_tmu_clock_event_start(p, 0);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+		if (!disabled)
+			sh_tmu_disable(p);
+		break;
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	default:
+		break;
+	}
+}
+
+static int sh_tmu_clock_event_next(unsigned long delta,
+				   struct clock_event_device *ced)
+{
+	struct sh_tmu_priv *p = ced_to_sh_tmu(ced);
+
+	BUG_ON(ced->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	/* program new delta value */
+	sh_tmu_set_next(p, delta, 0);
+	return 0;
+}
+
+static void sh_tmu_register_clockevent(struct sh_tmu_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clock_event_device *ced = &p->ced;
+	int ret;
+
+	memset(ced, 0, sizeof(*ced));
+
+	ced->name = name;
+	ced->features = CLOCK_EVT_FEAT_PERIODIC;
+	ced->features |= CLOCK_EVT_FEAT_ONESHOT;
+	ced->rating = rating;
+	ced->cpumask = cpumask_of(0);
+	ced->set_next_event = sh_tmu_clock_event_next;
+	ced->set_mode = sh_tmu_clock_event_mode;
+
+	ret = setup_irq(p->irqaction.irq, &p->irqaction);
+	if (ret) {
+		pr_err("sh_tmu: failed to request irq %d\n",
+		       p->irqaction.irq);
+		return;
+	}
+
+	pr_info("sh_tmu: %s used for clock events\n", ced->name);
+	clockevents_register_device(ced);
+}
+
+static int sh_tmu_register(struct sh_tmu_priv *p, char *name,
+		    unsigned long clockevent_rating,
+		    unsigned long clocksource_rating)
+{
+	if (clockevent_rating)
+		sh_tmu_register_clockevent(p, name, clockevent_rating);
+	else if (clocksource_rating)
+		sh_tmu_register_clocksource(p, name, clocksource_rating);
+
+	return 0;
+}
+
+static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
+{
+	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct resource *res;
+	int irq, ret;
+	ret = -ENXIO;
+
+	memset(p, 0, sizeof(*p));
+	p->pdev = pdev;
+
+	if (!cfg) {
+		dev_err(&p->pdev->dev, "missing platform data\n");
+		goto err0;
+	}
+
+	platform_set_drvdata(pdev, p);
+
+	res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&p->pdev->dev, "failed to get I/O memory\n");
+		goto err0;
+	}
+
+	irq = platform_get_irq(p->pdev, 0);
+	if (irq < 0) {
+		dev_err(&p->pdev->dev, "failed to get irq\n");
+		goto err0;
+	}
+
+	/* map memory, let mapbase point to our channel */
+	p->mapbase = ioremap_nocache(res->start, resource_size(res));
+	if (p->mapbase == NULL) {
+		pr_err("sh_tmu: failed to remap I/O memory\n");
+		goto err0;
+	}
+
+	/* setup data for setup_irq() (too early for request_irq()) */
+	p->irqaction.name = cfg->name;
+	p->irqaction.handler = sh_tmu_interrupt;
+	p->irqaction.dev_id = p;
+	p->irqaction.irq = irq;
+	p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
+	p->irqaction.mask = CPU_MASK_NONE;
+
+	/* get hold of clock */
+	p->clk = clk_get(&p->pdev->dev, cfg->clk);
+	if (IS_ERR(p->clk)) {
+		pr_err("sh_tmu: cannot get clock \"%s\"\n", cfg->clk);
+		ret = PTR_ERR(p->clk);
+		goto err1;
+	}
+
+	return sh_tmu_register(p, cfg->name,
+			       cfg->clockevent_rating,
+			       cfg->clocksource_rating);
+ err1:
+	iounmap(p->mapbase);
+ err0:
+	return ret;
+}
+
+static int __devinit sh_tmu_probe(struct platform_device *pdev)
+{
+	struct sh_tmu_priv *p = platform_get_drvdata(pdev);
+	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	int ret;
+
+	if (p) {
+		pr_info("sh_tmu: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL) {
+		dev_err(&pdev->dev, "failed to allocate driver data\n");
+		return -ENOMEM;
+	}
+
+	ret = sh_tmu_setup(p, pdev);
+	if (ret) {
+		kfree(p);
+		platform_set_drvdata(pdev, NULL);
+	}
+	return ret;
+}
+
+static int __devexit sh_tmu_remove(struct platform_device *pdev)
+{
+	return -EBUSY; /* cannot unregister clockevent and clocksource */
+}
+
+static struct platform_driver sh_tmu_device_driver = {
+	.probe		= sh_tmu_probe,
+	.remove		= __devexit_p(sh_tmu_remove),
+	.driver		= {
+		.name	= "sh_tmu",
+	}
+};
+
+static int __init sh_tmu_init(void)
+{
+	return platform_driver_register(&sh_tmu_device_driver);
+}
+
+static void __exit sh_tmu_exit(void)
+{
+	platform_driver_unregister(&sh_tmu_device_driver);
+}
+
+early_platform_init("earlytimer", &sh_tmu_device_driver);
+module_init(sh_tmu_init);
+module_exit(sh_tmu_exit);
+
+MODULE_AUTHOR("Magnus Damm");
+MODULE_DESCRIPTION("SuperH TMU Timer Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h
new file mode 100644
index 0000000..b1195e8
--- /dev/null
+++ b/include/linux/sh_tmu.h
@@ -0,0 +1,13 @@
+#ifndef __SH_TMU_H__
+#define __SH_TMU_H__
+
+struct sh_tmu_config {
+	char *name;
+	unsigned long channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+	unsigned long clocksource_rating;
+};
+
+#endif /* __SH_TMU_H__ */
-- 
cgit v0.10.2


From d43a41bf8b504a1d9f0b4ce7e17d803f4ef39d84 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 1 May 2009 06:58:52 +0000
Subject: sh: TMU platform data for sh7722

This patch adds TMU platform data for sh7722. Only clockevent
mode is enabled for now, clocksource requires this patch:
"clocksource: setup mult_orig in clocksource_enable()"

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 1f74737..c3e9455 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -423,6 +423,7 @@ config CPU_SUBTYPE_SH7722
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7366
 	bool "Support SH7366 processor"
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 793c50d..512735c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/uio_driver.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_tmu.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -209,6 +210,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_tmu_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_tmu_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 0, /* disabled for now */
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_tmu_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -243,6 +336,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7722_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&rtc_device,
 	&usbf_device,
 	&iic_device,
@@ -271,6 +367,9 @@ __initcall(sh7722_devices_setup);
 
 static struct platform_device *sh7722_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
cgit v0.10.2


From 9a8709d44139748fe2e0ab56d20d8c384c8b65ad Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 2 May 2009 00:25:11 +0400
Subject: x86: uv - prevent NULL dereference in uv_system_init()

We may reach NULL dereference oops if kmalloc failed.
Prevent it with explicit BUG_ON.

[ Impact: more controlled assert in 'impossible' scenario ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090501202511.GE4633@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 873bf71..9d9e228 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -569,15 +569,18 @@ void __init uv_system_init(void)
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_blade_info);
 
 	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
 
 	bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
 	uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_node_to_blade);
 	memset(uv_node_to_blade, 255, bytes);
 
 	bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
 	uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_cpu_to_blade);
 	memset(uv_cpu_to_blade, 255, bytes);
 
 	blade = 0;
-- 
cgit v0.10.2


From 46a12f7426d71cabc08972cf8d3ffdd441d26a3a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 17:57:17 +0900
Subject: sh: Consolidate MTU2/CMT/TMU timer platform data.

All of the SH timers use a roughly identical structure for platform data,
which presently is broken out for each block. Consolidate all of these
definitions, as there is no reason for them to be broken out in the first
place.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index d70c263..94ac27f 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -111,7 +111,7 @@ static struct platform_device eth_device = {
 	.resource = eth_resources,
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -143,7 +143,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 870030a..a452d96 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -11,7 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 
 enum {
 	UNUSED = 0,
@@ -114,7 +114,7 @@ static struct intc_mask_reg mask_registers[] __initdata = {
 static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups,
 			 mask_registers, prio_registers, NULL);
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -145,7 +145,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -176,7 +176,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 074aa90..772358b 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -251,7 +251,7 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -282,7 +282,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -313,7 +313,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 3448164..d749341 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -11,8 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -208,7 +207,7 @@ static struct platform_device sci_device = {
 	},
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -240,7 +239,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
@@ -272,7 +271,7 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -303,7 +302,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index e700559..2fc6bff 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -12,8 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -168,7 +167,7 @@ static struct platform_device sci_device = {
 	},
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -200,7 +199,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
@@ -232,7 +231,7 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -263,7 +262,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -294,7 +293,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index cb5b4db..f327b7e 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -12,7 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 
 static struct resource iic0_resources[] = {
@@ -141,7 +141,7 @@ static struct platform_device jpu_device = {
 	.num_resources	= ARRAY_SIZE(jpu_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 2a771f4..7361ea9 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -14,7 +14,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 
 static struct resource iic_resources[] = {
@@ -148,7 +148,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 512735c..624e8e5 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -13,8 +13,7 @@
 #include <linux/serial_sci.h>
 #include <linux/mm.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_tmu.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -178,7 +177,7 @@ static struct platform_device jpu_device = {
 	.num_resources	= ARRAY_SIZE(jpu_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
@@ -210,7 +209,7 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
-static struct sh_tmu_config tmu0_platform_data = {
+static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
@@ -241,7 +240,7 @@ static struct platform_device tmu0_device = {
 	.num_resources	= ARRAY_SIZE(tmu0_resources),
 };
 
-static struct sh_tmu_config tmu1_platform_data = {
+static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
@@ -272,7 +271,7 @@ static struct platform_device tmu1_device = {
 	.num_resources	= ARRAY_SIZE(tmu1_resources),
 };
 
-static struct sh_tmu_config tmu2_platform_data = {
+static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index dbb4494..fb9b542 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -13,7 +13,7 @@
 #include <linux/mm.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -101,7 +101,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 8429396..e074951 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -18,7 +18,7 @@
 #include <linux/mm.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
@@ -230,7 +230,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 4ff1508..aeb8c9b 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -28,7 +28,7 @@
 #include <linux/err.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 
 struct sh_cmt_priv {
 	void __iomem *mapbase;
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(sh_cmt_lock);
 
 static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -83,7 +83,7 @@ static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr)
 static inline void sh_cmt_write(struct sh_cmt_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -131,7 +131,7 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p,
 
 static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -149,7 +149,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start)
 
 static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -560,7 +560,7 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name,
 
 static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 {
-	struct sh_cmt_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -638,7 +638,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 static int __devinit sh_cmt_probe(struct platform_device *pdev)
 {
 	struct sh_cmt_priv *p = platform_get_drvdata(pdev);
-	struct sh_cmt_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
index 420566f..ef02185 100644
--- a/drivers/clocksource/sh_mtu2.c
+++ b/drivers/clocksource/sh_mtu2.c
@@ -28,7 +28,7 @@
 #include <linux/irq.h>
 #include <linux/err.h>
 #include <linux/clockchips.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 
 struct sh_mtu2_priv {
 	void __iomem *mapbase;
@@ -63,7 +63,7 @@ static unsigned long mtu2_reg_offs[] = {
 
 static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -81,7 +81,7 @@ static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
 static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -100,7 +100,7 @@ static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
 
 static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -118,7 +118,7 @@ static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
 
 static int sh_mtu2_enable(struct sh_mtu2_priv *p)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -243,7 +243,7 @@ int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
 
 static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
 {
-	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -303,7 +303,7 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
 static int __devinit sh_mtu2_probe(struct platform_device *pdev)
 {
 	struct sh_mtu2_priv *p = platform_get_drvdata(pdev);
-	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
index 21bd77a..d6ea439 100644
--- a/drivers/clocksource/sh_tmu.c
+++ b/drivers/clocksource/sh_tmu.c
@@ -29,7 +29,7 @@
 #include <linux/err.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
-#include <linux/sh_tmu.h>
+#include <linux/sh_timer.h>
 
 struct sh_tmu_priv {
 	void __iomem *mapbase;
@@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(sh_tmu_lock);
 
 static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -69,7 +69,7 @@ static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
 static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -88,7 +88,7 @@ static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
 
 static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -106,7 +106,7 @@ static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
 
 static int sh_tmu_enable(struct sh_tmu_priv *p)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -345,7 +345,7 @@ static int sh_tmu_register(struct sh_tmu_priv *p, char *name,
 
 static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
 {
-	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -407,7 +407,7 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
 static int __devinit sh_tmu_probe(struct platform_device *pdev)
 {
 	struct sh_tmu_priv *p = platform_get_drvdata(pdev);
-	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/include/linux/sh_cmt.h b/include/linux/sh_cmt.h
deleted file mode 100644
index 68cacde..0000000
--- a/include/linux/sh_cmt.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __SH_CMT_H__
-#define __SH_CMT_H__
-
-struct sh_cmt_config {
-	char *name;
-	unsigned long channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-	unsigned long clocksource_rating;
-};
-
-#endif /* __SH_CMT_H__ */
diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h
deleted file mode 100644
index a988763..0000000
--- a/include/linux/sh_mtu2.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __SH_MTU2_H__
-#define __SH_MTU2_H__
-
-struct sh_mtu2_config {
-	char *name;
-	int channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-};
-
-#endif /* __SH_MTU2_H__ */
diff --git a/include/linux/sh_timer.h b/include/linux/sh_timer.h
new file mode 100644
index 0000000..864bd56
--- /dev/null
+++ b/include/linux/sh_timer.h
@@ -0,0 +1,13 @@
+#ifndef __SH_TIMER_H__
+#define __SH_TIMER_H__
+
+struct sh_timer_config {
+	char *name;
+	long channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+	unsigned long clocksource_rating;
+};
+
+#endif /* __SH_TIMER_H__ */
diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h
deleted file mode 100644
index b1195e8..0000000
--- a/include/linux/sh_tmu.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __SH_TMU_H__
-#define __SH_TMU_H__
-
-struct sh_tmu_config {
-	char *name;
-	unsigned long channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-	unsigned long clocksource_rating;
-};
-
-#endif /* __SH_TMU_H__ */
-- 
cgit v0.10.2


From d1fcc0a8db5e47c1abaa783a3e83dbf5f2184969 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:05:42 +0900
Subject: clocksource: sh_mtu2/cmt_register() should be static.

Neither of these need to be exported, so just make them static.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index aeb8c9b..cf56a2a 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -537,9 +537,9 @@ static void sh_cmt_register_clockevent(struct sh_cmt_priv *p,
 	clockevents_register_device(ced);
 }
 
-int sh_cmt_register(struct sh_cmt_priv *p, char *name,
-		    unsigned long clockevent_rating,
-		    unsigned long clocksource_rating)
+static int sh_cmt_register(struct sh_cmt_priv *p, char *name,
+			   unsigned long clockevent_rating,
+			   unsigned long clocksource_rating)
 {
 	if (p->width == (sizeof(p->max_match_value) * 8))
 		p->max_match_value = ~0;
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
index ef02185..d1ae754 100644
--- a/drivers/clocksource/sh_mtu2.c
+++ b/drivers/clocksource/sh_mtu2.c
@@ -232,8 +232,8 @@ static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p,
 	clockevents_register_device(ced);
 }
 
-int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
-		     unsigned long clockevent_rating)
+static int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
+			    unsigned long clockevent_rating)
 {
 	if (clockevent_rating)
 		sh_mtu2_register_clockevent(p, name, clockevent_rating);
-- 
cgit v0.10.2


From 938edae11ee3a7b20b6d754074a0f2c2edc4534b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:12:26 +0900
Subject: sh: select both GENERIC_TIME and GENERIC_CLOCKEVENTS.

Now that the rest of the timers that didn't support clockevents have been
rewritten, both of these can be enabled by default.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c3e9455..565f390 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -78,7 +78,7 @@ config GENERIC_TIME
 	def_bool y
 
 config GENERIC_CLOCKEVENTS
-	def_bool n
+	def_bool y
 
 config GENERIC_CLOCKEVENTS_BROADCAST
 	bool
@@ -469,8 +469,6 @@ config SH_TMU
 	bool "TMU timer support"
 	depends on CPU_SH3 || CPU_SH4
 	default y
-	select GENERIC_TIME
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables the use of the TMU as the system timer.
 
@@ -478,8 +476,6 @@ config SH_TIMER_TMU
 	bool "TMU timer driver"
 	depends on !SH_TMU && SYS_SUPPORTS_TMU
 	default y
-	select GENERIC_TIME
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables the build of the TMU timer driver.
 
@@ -487,8 +483,6 @@ config SH_TIMER_CMT
 	bool "CMT timer driver"
 	depends on SYS_SUPPORTS_CMT
 	default y
-	select GENERIC_CLOCKEVENTS
-	select GENERIC_TIME
 	help
 	  This enables build of the CMT timer driver.
 
@@ -496,7 +490,6 @@ config SH_TIMER_MTU2
 	bool "MTU2 timer driver"
 	depends on SYS_SUPPORTS_MTU2
 	default y
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables build of the MTU2 timer driver.
 
-- 
cgit v0.10.2


From dec56e6312434b2536fedf9d7e9e73d666aaf0a8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:18:14 +0900
Subject: sh: Kill off the now unused ARCH_USES_GETTIMEOFFSET code.

Now that the stragglers (MTU2/CMT/etc.) have been rewritten and we are
selecting both GENERIC_TIME and GENERIC_CLOCKEVENTS, the get_offset()
timer op is completely unused. As a result, we are now able to kill off
the ARCH_USES_GETTIMEOFFSET references.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 581e3fe..cb16645 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -9,9 +9,6 @@ struct sys_timer_ops {
 	int (*init)(void);
 	int (*start)(void);
 	int (*stop)(void);
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-	unsigned long (*get_offset)(void);
-#endif
 };
 
 struct sys_timer {
@@ -26,13 +23,6 @@ struct sys_timer {
 extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-static inline unsigned long get_timer_offset(void)
-{
-	return sys_timer->ops->get_offset();
-}
-#endif
-
 /* arch/sh/kernel/timers/timer.c */
 struct sys_timer *get_sys_timer(void);
 
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 9d34dff..d41ca4c 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -83,13 +83,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-u32 arch_gettimeoffset(void)
-{
-	return get_timer_offset() * 1000;
-}
-#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
-
 /* last time the RTC clock got updated */
 static long last_rtc_update;
 
@@ -185,7 +178,6 @@ struct clocksource clocksource_sh = {
 	.name		= "SuperH",
 };
 
-#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
 unsigned long long sched_clock(void)
 {
 	unsigned long long cycles;
@@ -197,7 +189,6 @@ unsigned long long sched_clock(void)
 	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
 }
-#endif
 
 static void __init sh_late_time_init(void)
 {
-- 
cgit v0.10.2


From 25483efeb2e56521e418a59fa93401be156dc3bb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:29:27 +0900
Subject: sh: Move dummy clockevents broadcast timer to its new home.

The old arch/sh/kernel/timers/ directly will be going away completely
once the rest of the TMU users are migrated, so move the dummy broadcast
driver up a level in preparation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index 82a3a15..5b1d4d0 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -32,4 +32,6 @@ obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_DUMP_CODE)		+= disassemble.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o
 
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= localtimer.o
+
 EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index fe425d7..d256c77 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -17,4 +17,6 @@ obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_IO_TRAPPED)	+= io_trapped.o
 obj-$(CONFIG_GENERIC_GPIO)	+= gpio.o
 
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= localtimer.o
+
 EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/kernel/localtimer.c b/arch/sh/kernel/localtimer.c
new file mode 100644
index 0000000..96e8eae
--- /dev/null
+++ b/arch/sh/kernel/localtimer.c
@@ -0,0 +1,57 @@
+/*
+ * Dummy local timer
+ *
+ * Copyright (C) 2008  Paul Mundt
+ *
+ * cloned from:
+ *
+ *  linux/arch/arm/mach-realview/localtimer.c
+ *
+ *  Copyright (C) 2002 ARM Ltd.
+ *  All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/smp.h>
+#include <linux/jiffies.h>
+#include <linux/percpu.h>
+#include <linux/clockchips.h>
+#include <linux/irq.h>
+
+static DEFINE_PER_CPU(struct clock_event_device, local_clockevent);
+
+/*
+ * Used on SMP for either the local timer or SMP_MSG_TIMER
+ */
+void local_timer_interrupt(void)
+{
+	struct clock_event_device *clk = &__get_cpu_var(local_clockevent);
+
+	clk->event_handler(clk);
+}
+
+static void dummy_timer_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *clk)
+{
+}
+
+void __cpuinit local_timer_setup(unsigned int cpu)
+{
+	struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
+
+	clk->name		= "dummy_timer";
+	clk->features		= CLOCK_EVT_FEAT_DUMMY;
+	clk->rating		= 200;
+	clk->mult		= 1;
+	clk->set_mode		= dummy_timer_set_mode;
+	clk->broadcast		= smp_timer_broadcast;
+	clk->cpumask		= cpumask_of(cpu);
+
+	clockevents_register_device(clk);
+}
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 44156df..8bceba5 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -5,5 +5,3 @@
 obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
-
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
diff --git a/arch/sh/kernel/timers/timer-broadcast.c b/arch/sh/kernel/timers/timer-broadcast.c
deleted file mode 100644
index 96e8eae..0000000
--- a/arch/sh/kernel/timers/timer-broadcast.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Dummy local timer
- *
- * Copyright (C) 2008  Paul Mundt
- *
- * cloned from:
- *
- *  linux/arch/arm/mach-realview/localtimer.c
- *
- *  Copyright (C) 2002 ARM Ltd.
- *  All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/smp.h>
-#include <linux/jiffies.h>
-#include <linux/percpu.h>
-#include <linux/clockchips.h>
-#include <linux/irq.h>
-
-static DEFINE_PER_CPU(struct clock_event_device, local_clockevent);
-
-/*
- * Used on SMP for either the local timer or SMP_MSG_TIMER
- */
-void local_timer_interrupt(void)
-{
-	struct clock_event_device *clk = &__get_cpu_var(local_clockevent);
-
-	clk->event_handler(clk);
-}
-
-static void dummy_timer_set_mode(enum clock_event_mode mode,
-				 struct clock_event_device *clk)
-{
-}
-
-void __cpuinit local_timer_setup(unsigned int cpu)
-{
-	struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
-
-	clk->name		= "dummy_timer";
-	clk->features		= CLOCK_EVT_FEAT_DUMMY;
-	clk->rating		= 200;
-	clk->mult		= 1;
-	clk->set_mode		= dummy_timer_set_mode;
-	clk->broadcast		= smp_timer_broadcast;
-	clk->cpumask		= cpumask_of(cpu);
-
-	clockevents_register_device(clk);
-}
-- 
cgit v0.10.2


From b82914ce33146186d554b0f5c41e4e13693614ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 18:54:32 +0200
Subject: perf_counter: round-robin per-CPU counters too

This used to be unstable when we had the rq->lock dependencies,
but now that they are that of the past we can turn on percpu
counter RR too.

[ Impact: handle counter over-commit for per-CPU counters too ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8660ae5..b9679c3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1069,18 +1069,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
-	const int rotate_percpu = 0;
 
-	if (rotate_percpu)
-		perf_counter_cpu_sched_out(cpuctx);
+	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
 
-	if (rotate_percpu)
-		rotate_ctx(&cpuctx->ctx);
+	rotate_ctx(&cpuctx->ctx);
 	rotate_ctx(ctx);
 
-	if (rotate_percpu)
-		perf_counter_cpu_sched_in(cpuctx, cpu);
+	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
 }
 
-- 
cgit v0.10.2


From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 May 2009 18:47:44 +0200
Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo

Invert the atomic_inc_not_zero() test so that we will indeed detect the
first activation.

Also rename the global num_counters, since its easy to confuse with
x86_pmu.num_counters.

[ Impact: fix non-working perfcounters on AMD CPUs, cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241455664.7620.4938.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4c0cc9..196b58f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -171,7 +171,7 @@ again:
 	return new_raw_count;
 }
 
-static atomic_t num_counters;
+static atomic_t active_counters;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
@@ -224,7 +224,7 @@ static void release_pmc_hardware(void)
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
-	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
@@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -ENODEV;
 
 	err = 0;
-	if (atomic_inc_not_zero(&num_counters)) {
+	if (!atomic_inc_not_zero(&active_counters)) {
 		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
 			err = -EBUSY;
 		else
-			atomic_inc(&num_counters);
+			atomic_inc(&active_counters);
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 	if (err)
@@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
 		hwc->nmi = 1;
 
-	hwc->irq_period		= hw_event->irq_period;
+	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
 		hwc->irq_period = x86_pmu.max_period;
 
@@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
-	if (!atomic_read(&num_counters))
+	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
 
 	switch (cmd) {
-- 
cgit v0.10.2


From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:13:30 +0200
Subject: perf_counter: initialize the per-cpu context earlier

percpu scheduling for perfcounters wants to take the context lock,
but that lock first needs to be initialized. Currently it is an
early_initcall() - but that is too late, the task tick runs much
sooner than that.

Call it explicitly from the scheduler init sequence instead.

[ Impact: fix access-before-init crash ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f776851..a356fa6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
 
+extern void perf_counter_init(void);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len,
 
 static inline void
 perf_counter_munmap(unsigned long addr, unsigned long len,
-		    unsigned long pgoff, struct file *file) 		{ }
+		    unsigned long pgoff, struct file *file)		{ }
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
+static inline void perf_counter_init(void)				{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b9679c3..fcdafa2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 	.notifier_call		= perf_cpu_notify,
 };
 
-static int __init perf_counter_init(void)
+void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
-
-	return 0;
 }
-early_initcall(perf_counter_init);
 
 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f600e3..a728976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -8996,7 +8997,7 @@ void __init sched_init(void)
 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
-		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
 		 * We achieve this by letting init_task_group's tasks sit
 		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9097,6 +9098,8 @@ void __init sched_init(void)
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 
-- 
cgit v0.10.2


From 1dce8d99b85aba6eddb8b8260baea944922e6fe7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:23:18 +0200
Subject: perf_counter: convert perf_resource_mutex to a spinlock

Now percpu counters can be initialized very early. But the init
sequence uses mutex_lock(). Fortunately, perf_resource_mutex should
be a spinlock anyway, so convert it.

[ Impact: fix crash due to early init mutex use ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fcdafa2..5f86a11 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,9 +46,9 @@ static atomic_t nr_comm_tracking __read_mostly;
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 
 /*
- * Mutex for (sysadmin-configurable) counter reservations:
+ * Lock for (sysadmin-configurable) counter reservations:
  */
-static DEFINE_MUTEX(perf_resource_mutex);
+static DEFINE_SPINLOCK(perf_resource_lock);
 
 /*
  * Architecture provided APIs - weak aliases:
@@ -3207,9 +3207,9 @@ static void __cpuinit perf_counter_init_cpu(int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	__perf_counter_init_context(&cpuctx->ctx, NULL);
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	hw_perf_counter_setup(cpu);
 }
@@ -3292,7 +3292,7 @@ perf_set_reserve_percpu(struct sysdev_class *class,
 	if (val > perf_max_counters)
 		return -EINVAL;
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	perf_reserved_percpu = val;
 	for_each_online_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -3302,7 +3302,7 @@ perf_set_reserve_percpu(struct sysdev_class *class,
 		cpuctx->max_pertask = mpt;
 		spin_unlock_irq(&cpuctx->ctx.lock);
 	}
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	return count;
 }
@@ -3324,9 +3324,9 @@ perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
 	if (val > 1)
 		return -EINVAL;
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	perf_overcommit = val;
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	return count;
 }
-- 
cgit v0.10.2


From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:04:09 +0200
Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON

Fixed-purpose counters stopped working in a simple 'perf stat ls' run:

   <not counted>  cache references
   <not counted>  cache misses

Due to:

  ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx()

Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the
fixed-purpose counters are utilized.

But on v2 perfmon this field is not set (despite there being
fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose
counters to at least three.

[ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ]

Cc: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 196b58f..a6878b0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -962,7 +962,13 @@ static int intel_pmu_init(void)
 	x86_pmu = intel_pmu;
 	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
-	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
+	 * assume at least 3 counters:
+	 */
+	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
-- 
cgit v0.10.2


From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 01:15:24 -0400
Subject: ring-buffer: export symbols

I'm adding a module to do a series of tests on the ring buffer as well
as benchmarks. This module needs to have more of the ring buffer API
exported. There's nothing wrong with reading the ring buffer from a
module.

[ Impact: allow modules to read pages from the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f4cc590..3e86da9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
 	return bpage;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
 
 /**
  * ring_buffer_free_read_page - free an allocated read page
@@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
 {
 	free_page((unsigned long)data);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
 /**
  * ring_buffer_read_page - extract a page from the ring buffer
@@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
  out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
-- 
cgit v0.10.2


From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 13:43:37 -0400
Subject: ring-buffer: add counters for commit overrun and nmi dropped entries

The WARN_ON in the ring buffer when a commit is preempted and the
buffer is filled by preceding writes can happen in normal operations.
The WARN_ON makes it look like a bug, not to mention, because
it does not stop tracing and calls printk which can also recurse, this
is prone to deadlock (the WARN_ON is not in a position to recurse).

This patch removes the WARN_ON and replaces it with a counter that
can be retrieved by a tracer. This counter is called commit_overrun.

While at it, I added a nmi_dropped counter to count any time an NMI entry
is dropped because the NMI could not take the spinlock.

[ Impact: prevent deadlock by printing normal case warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 1c2f809..f134582 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3e86da9..26e1359 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -402,6 +402,8 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
+	unsigned long			nmi_dropped;
+	unsigned long			commit_overrun;
 	unsigned long			overrun;
 	unsigned long			entries;
 	u64				write_stamp;
@@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * simply fail.
 		 */
 		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock))
+			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+				cpu_buffer->nmi_dropped++;
 				goto out_reset;
+			}
 		} else
 			__raw_spin_lock(&cpu_buffer->lock);
 
@@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			/* This can easily happen on small ring buffers */
-			WARN_ON_ONCE(buffer->pages > 2);
+			cpu_buffer->commit_overrun++;
 			goto out_reset;
 		}
 
@@ -1926,6 +1929,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
 /**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->nmi_dropped;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->commit_overrun;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
+/**
  * ring_buffer_entries - get the number of entries in a buffer
  * @buffer: The ring buffer
  *
@@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
+	cpu_buffer->nmi_dropped = 0;
+	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
 
-- 
cgit v0.10.2


From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 18:03:45 -0400
Subject: tracing: export stats of ring buffers to userspace

This patch adds stats to the ftrace ring buffers:

 # cat /debugfs/tracing/per_cpu/cpu0/stats
 entries: 42360
 overrun: 30509326
 commit overrun: 0
 nmi dropped: 0

Where entries are the total number of data entries in the buffer.

overrun is the number of entries not consumed and were overwritten by
the writer.

commit overrun is the number of entries dropped due to nested writers
wrapping the buffer before the initial writer finished the commit.

nmi dropped is the number of entries dropped due to the ring buffer
lock being held when an nmi was going to write to the ring buffer.
Note, this field will be meaningless and will go away when the ring
buffer becomes lockless.

[ Impact: let userspace know what is happening in the ring buffers ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f5427e0..74df029 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = {
 	.llseek		= no_llseek,
 };
 
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	unsigned long cpu = (unsigned long)filp->private_data;
+	struct trace_array *tr = &global_trace;
+	struct trace_seq *s;
+	unsigned long cnt;
+
+	s = kmalloc(sizeof(*s), GFP_ATOMIC);
+	if (!s)
+		return ENOMEM;
+
+	trace_seq_init(s);
+
+	cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "entries: %ld\n", cnt);
+
+	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_stats_read,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu)
 
 	trace_create_file("trace_pipe_raw", 0444, d_cpu,
 			(void *) cpu, &tracing_buffers_fops);
+
+	trace_create_file("stats", 0444, d_cpu,
+			(void *) cpu, &tracing_stats_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
cgit v0.10.2


From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:21 +0200
Subject: sched: rt: document the risk of small values in the bandwidth
 settings

Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for
(!RT_GROUP) since the root group always has some RT tasks in it.

Further, update the documentation to inspire clue.

[ Impact: exclude corner-case sysctl_sched_rt_runtime value ]

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090505155436.863098054@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3f..eb74b01 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
 CONTENTS
 ========
 
+0. WARNING
 1. Overview
   1.1 The problem
   1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
 3. Future plans
 
 
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system, the knobs are
+ root only and assumes root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
 1. Overview
 ===========
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 54d67b9..2a43a58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-- 
cgit v0.10.2


From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:22 +0200
Subject: perf_counter: uncouple data_head updates from wakeups

Keep data_head up-to-date irrespective of notifications. This fixes
the case where you disable a counter and don't get a notification for
the last few pending events, and it also allows polling usage.

[ Impact: increase precision of perfcounter mmap-ed fields ]

Suggested-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090505155436.925084300@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a356fa6..17b6310 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -362,9 +362,11 @@ struct perf_mmap_data {
 	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
-	atomic_t			wakeup_head;	/* completed head    */
+	atomic_t			done_head;	/* completed head    */
 	atomic_t			lock;		/* concurrent writes */
 
+	atomic_t			wakeup;		/* needs a wakeup    */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5f86a11..ba5e921 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1696,7 +1696,6 @@ struct perf_output_handle {
 	struct perf_mmap_data	*data;
 	unsigned int		offset;
 	unsigned int		head;
-	int			wakeup;
 	int			nmi;
 	int			overflow;
 	int			locked;
@@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	struct perf_mmap_data *data = handle->data;
 	int head, cpu;
 
-	if (handle->wakeup)
-		data->wakeup_head = data->head;
+	data->done_head = data->head;
 
 	if (!handle->locked)
 		goto out;
@@ -1764,13 +1762,11 @@ again:
 	 * before we publish the new head, matched by a rmb() in userspace when
 	 * reading this position.
 	 */
-	while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+	while ((head = atomic_xchg(&data->done_head, 0)))
 		data->user_page->data_head = head;
-		handle->wakeup = 1;
-	}
 
 	/*
-	 * NMI can happen here, which means we can miss a wakeup_head update.
+	 * NMI can happen here, which means we can miss a done_head update.
 	 */
 
 	cpu = atomic_xchg(&data->lock, 0);
@@ -1779,7 +1775,7 @@ again:
 	/*
 	 * Therefore we have to validate we did not indeed do so.
 	 */
-	if (unlikely(atomic_read(&data->wakeup_head))) {
+	if (unlikely(atomic_read(&data->done_head))) {
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
@@ -1789,7 +1785,7 @@ again:
 		goto again;
 	}
 
-	if (handle->wakeup)
+	if (atomic_xchg(&data->wakeup, 0))
 		perf_output_wakeup(handle);
 out:
 	local_irq_restore(handle->flags);
@@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle,
 
 	handle->offset	= offset;
 	handle->head	= head;
-	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
+
+	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+		atomic_set(&data->wakeup, 1);
 
 	return 0;
 
@@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
-			handle->wakeup = 1;
+			atomic_set(&data->wakeup, 1);
 		}
 	}
 
-- 
cgit v0.10.2


From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:23 +0200
Subject: perf_counter: add ioctl(PERF_COUNTER_IOC_RESET)

Provide a way to reset an existing counter - this eases PAPI
libraries around perfcounters.

Similar to read() it doesn't collapse pending child counters.

[ Impact: new perfcounter fd ioctl method to reset counters ]

Suggested-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090505155437.022272933@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 17b6310..0fcbf34 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -160,6 +160,7 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
 
 /*
  * Structure of the page that can be mapped via mmap
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ba5e921..6e6834e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	return events;
 }
 
+static void perf_counter_reset(struct perf_counter *counter)
+{
+	atomic_set(&counter->count, 0);
+}
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_REFRESH:
 		perf_counter_refresh(counter, arg);
 		break;
+	case PERF_COUNTER_IOC_RESET:
+		perf_counter_reset(counter);
+		break;
 	default:
 		err = -ENOTTY;
 	}
-- 
cgit v0.10.2


From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:24 +0200
Subject: perf_counter: provide an mlock threshold

Provide a threshold to relax the mlock accounting, increasing usability.

Each counter gets perf_counter_mlock_kb for free.

[ Impact: allow more mmap buffering ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.112113632@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0fcbf34..00081d8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -358,6 +358,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages  */
+	int				nr_locked;	/* nr pages mlocked  */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 	atomic_t			head;		/* write position    */
@@ -575,6 +576,7 @@ struct perf_callchain_entry {
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
+extern int sysctl_perf_counter_mlock;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6e6834e..2d13427 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
-		vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
+		vma->vm_mm->locked_vm -= counter->data->nr_locked;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
 	}
@@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long nr_pages;
 	unsigned long locked, lock_limit;
 	int ret = 0;
+	long extra;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
@@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	locked = vma->vm_mm->locked_vm;
-	locked += nr_pages + 1;
+	extra = nr_pages /* + 1 only account the data pages */;
+	extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+	if (extra < 0)
+		extra = 0;
+
+	locked = vma->vm_mm->locked_vm + extra;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
@@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 
 	atomic_set(&counter->mmap_count, 1);
-	vma->vm_mm->locked_vm += nr_pages + 1;
+	vma->vm_mm->locked_vm += extra;
+	counter->data->nr_locked = extra;
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8203d70..3b05c2b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_mlock_kb",
+		.data		= &sysctl_perf_counter_mlock,
+		.maxlen		= sizeof(sysctl_perf_counter_mlock),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 /*
  * NOTE: do not add new entries to this table unless you have read
-- 
cgit v0.10.2


From 22c1558e51c210787c6cf75d8905246fc91ec030 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:25 +0200
Subject: perf_counter: fix the output lock

Use -1 instead of 0 as unlocked, since 0 is a valid cpu number.

( This is not an issue right now but will be once we allow multiple
  counters to output to the same mmap area. )

[ Impact: prepare code for multi-counter profile output ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.232686598@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2d13427..c881afe 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1409,6 +1409,7 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 	}
 
 	data->nr_pages = nr_pages;
+	atomic_set(&data->lock, -1);
 
 	rcu_assign_pointer(counter->data, data);
 
@@ -1755,7 +1756,7 @@ static void perf_output_lock(struct perf_output_handle *handle)
 	if (in_nmi() && atomic_read(&data->lock) == cpu)
 		return;
 
-	while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 		cpu_relax();
 
 	handle->locked = 1;
@@ -1784,7 +1785,7 @@ again:
 	 * NMI can happen here, which means we can miss a done_head update.
 	 */
 
-	cpu = atomic_xchg(&data->lock, 0);
+	cpu = atomic_xchg(&data->lock, -1);
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
 	/*
@@ -1794,7 +1795,7 @@ again:
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
-		while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 			cpu_relax();
 
 		goto again;
-- 
cgit v0.10.2


From 2023b359214bbc5bad31571cf50d7fb83b535c0a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:26 +0200
Subject: perf_counter: inheritable sample counters

Redirect the output to the parent counter and put in some sanity checks.

[ Impact: new perfcounter feature - inherited sampling counters ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.331556171@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c881afe..60e55f0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -738,10 +738,18 @@ static void perf_counter_enable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-static void perf_counter_refresh(struct perf_counter *counter, int refresh)
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 {
+	/*
+	 * not supported on inherited counters
+	 */
+	if (counter->hw_event.inherit)
+		return -EINVAL;
+
 	atomic_add(refresh, &counter->event_limit);
 	perf_counter_enable(counter);
+
+	return 0;
 }
 
 /*
@@ -1307,7 +1315,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		perf_counter_disable_family(counter);
 		break;
 	case PERF_COUNTER_IOC_REFRESH:
-		perf_counter_refresh(counter, arg);
+		err = perf_counter_refresh(counter, arg);
 		break;
 	case PERF_COUNTER_IOC_RESET:
 		perf_counter_reset(counter);
@@ -1814,6 +1822,12 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 
+	/*
+	 * For inherited counters we send all the output towards the parent.
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1995,6 +2009,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	/*
+	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
+	 */
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -2281,6 +2298,11 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	/*
+	 * XXX event_limit might not quite work as expected on inherited
+	 * counters
+	 */
+
 	counter->pending_kill = POLL_IN;
 	if (events && atomic_dec_and_test(&counter->event_limit)) {
 		ret = 1;
@@ -2801,6 +2823,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	pmu = NULL;
 
+	/*
+	 * we currently do not support PERF_RECORD_GROUP on inherited counters
+	 */
+	if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
+		goto done;
+
 	if (perf_event_raw(hw_event)) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
-- 
cgit v0.10.2


From 16c8a10932aef971292c9570eb5f60b5d4e83ed2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:27 +0200
Subject: perf_counter: tools: update the tools to support process and
 inherited counters

"perf record":
 - per task counter
 - inherit switch
 - nmi switch

"perf report":
 - userspace/kernel filter

"perf stat":
 - userspace/kernel filter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.389163017@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ddfdcf8..5f5e6df 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -45,7 +45,10 @@ static unsigned int		mmap_pages			= 16;
 static int			output;
 static char 			*output_name			= "output.perf";
 static int			group				= 0;
-static unsigned int		realtime_prio			=  0;
+static unsigned int		realtime_prio			= 0;
+static int			system_wide			= 0;
+static int			inherit				= 1;
+static int			nmi				= 1;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -167,7 +170,7 @@ static void display_events_help(void)
 static void display_help(void)
 {
 	printf(
-	"Usage: perf-record [<options>]\n"
+	"Usage: perf-record [<options>] <cmd>\n"
 	"perf-record Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
 
@@ -178,12 +181,13 @@ static void display_help(void)
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	" -o file   --output=<file>      # output file\n"
 	" -r prio   --realtime=<prio>    # use RT prio\n"
+	" -s        --system             # system wide profiling\n"
 	);
 
 	exit(0);
 }
 
-static void process_options(int argc, char *argv[])
+static void process_options(int argc, const char *argv[])
 {
 	int error = 0, counter;
 
@@ -196,9 +200,12 @@ static void process_options(int argc, char *argv[])
 			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{"output",	required_argument,	NULL, 'o'},
 			{"realtime",	required_argument,	NULL, 'r'},
+			{"system",	no_argument,		NULL, 's'},
+			{"inherit",	no_argument,		NULL, 'i'},
+			{"nmi",		no_argument,		NULL, 'n'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
+		int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -209,9 +216,16 @@ static void process_options(int argc, char *argv[])
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'o': output_name			= strdup(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
+		case 's': system_wide                   ^=             1; break;
+		case 'i': inherit			^=	       1; break;
+		case 'n': nmi				^=	       1; break;
 		default: error = 1; break;
 		}
 	}
+
+	if (argc - optind == 0)
+		error = 1;
+
 	if (error)
 		display_help();
 
@@ -325,18 +339,82 @@ static void mmap_read(struct mmap_data *md)
 
 static volatile int done = 0;
 
-static void sigchld_handler(int sig)
+static void sig_handler(int sig)
 {
-	if (sig == SIGCHLD)
-		done = 1;
+	done = 1;
 }
 
-int cmd_record(int argc, char **argv)
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int nr_poll;
+static int nr_cpu;
+
+static void open_counters(int cpu)
 {
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int i, counter, group_fd, nr_poll = 0;
+	int counter, group_fd;
+	int track = 1;
+	pid_t pid = -1;
+
+	if (cpu < 0)
+		pid = 0;
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++) {
+
+		memset(&hw_event, 0, sizeof(hw_event));
+		hw_event.config		= event_id[counter];
+		hw_event.irq_period	= event_count[counter];
+		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+		hw_event.nmi		= nmi;
+		hw_event.mmap		= track;
+		hw_event.comm		= track;
+		hw_event.inherit	= (cpu < 0) && inherit;
+
+		track = 0; // only the first counter needs these
+
+		fd[nr_cpu][counter] =
+			sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
+
+		if (fd[nr_cpu][counter] < 0) {
+			int err = errno;
+			printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[nr_cpu][counter], strerror(err));
+			if (err == EPERM)
+				printf("Are you root?\n");
+			exit(-1);
+		}
+		assert(fd[nr_cpu][counter] >= 0);
+		fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+
+		/*
+		 * First counter acts as the group leader:
+		 */
+		if (group && group_fd == -1)
+			group_fd = fd[nr_cpu][counter];
+
+		event_array[nr_poll].fd = fd[nr_cpu][counter];
+		event_array[nr_poll].events = POLLIN;
+		nr_poll++;
+
+		mmap_array[nr_cpu][counter].counter = counter;
+		mmap_array[nr_cpu][counter].prev = 0;
+		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+			printf("kerneltop error: failed to mmap with %d (%s)\n",
+					errno, strerror(errno));
+			exit(-1);
+		}
+	}
+	nr_cpu++;
+}
+
+int cmd_record(int argc, const char **argv)
+{
+	int i, counter;
 	pid_t pid;
 	int ret;
 
@@ -357,54 +435,13 @@ int cmd_record(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= 1;
-			hw_event.mmap		= 1;
-			hw_event.comm		= 1;
-
-			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
-		}
-	}
+	if (!system_wide)
+		open_counters(-1);
+	else for (i = 0; i < nr_cpus; i++)
+		open_counters(i);
 
-	signal(SIGCHLD, sigchld_handler);
+	signal(SIGCHLD, sig_handler);
+	signal(SIGINT, sig_handler);
 
 	pid = fork();
 	if (pid < 0)
@@ -434,7 +471,7 @@ int cmd_record(int argc, char **argv)
 	while (!done) {
 		int hits = events;
 
-		for (i = 0; i < nr_cpus; i++) {
+		for (i = 0; i < nr_cpu; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 6de38d2..e2fa117 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -87,6 +87,9 @@
 
 #include "perf.h"
 
+#define EVENT_MASK_KERNEL		1
+#define EVENT_MASK_USER			2
+
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
@@ -104,6 +107,7 @@ static __u64			event_id[MAX_COUNTERS]		= {
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			event_mask[MAX_COUNTERS];
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -258,12 +262,23 @@ static __u64 match_event_symbols(char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
+	char mask_str[4];
 
 	if (sscanf(str, "r%llx", &config) == 1)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
+	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
+		case 3:
+			if (strchr(mask_str, 'u'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+		case 2:
+			return EID(type, id);
+
+		default:
+			break;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -313,6 +328,11 @@ static void create_perfstat_counter(int counter)
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
 	hw_event.nmi		= 0;
+	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
+	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+
+printf("exclude: %d\n", event_mask[counter]);
+
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 911d7f3..8855107 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -33,8 +33,13 @@
 #include <string>
 
 
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
 static char 		const *input_name = "output.perf";
 static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -359,15 +364,21 @@ static void process_options(int argc, char *argv[])
 		/** Options for getopt */
 		static struct option long_options[] = {
 			{"input",	required_argument,	NULL, 'i'},
+			{"no-user",	no_argument,		NULL, 'u'},
+			{"no-kernel",	no_argument,		NULL, 'k'},
+			{"no-hv",	no_argument,		NULL, 'h'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:i:",
+		int c = getopt_long(argc, argv, "+:i:kuh",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
 
 		switch (c) {
 		case 'i': input_name			= strdup(optarg); break;
+		case 'k': show_mask &= ~SHOW_KERNEL; break;
+		case 'u': show_mask &= ~SHOW_USER; break;
+		case 'h': show_mask &= ~SHOW_HV; break;
 		default: error = 1; break;
 		}
 	}
@@ -443,22 +454,28 @@ more:
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		std::string comm, sym, level;
+		int show = 0;
 		char output[1024];
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			show |= SHOW_KERNEL;
 			level = " [k] ";
 			sym = resolve_kernel_symbol(event->ip.ip);
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			show |= SHOW_USER;
 			level = " [.] ";
 			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
 		} else {
+			show |= SHOW_HV;
 			level = " [H] ";
 		}
-		comm = resolve_comm(event->ip.pid);
 
-		snprintf(output, sizeof(output), "%16s %s %s",
-				comm.c_str(), level.c_str(), sym.c_str());
-		hist[output]++;
+		if (show & show_mask) {
+			comm = resolve_comm(event->ip.pid);
+			snprintf(output, sizeof(output), "%16s %s %s",
+					comm.c_str(), level.c_str(), sym.c_str());
+			hist[output]++;
+		}
 
 		total++;
 
-- 
cgit v0.10.2


From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 30 Apr 2009 20:49:44 -0400
Subject: ring-buffer: convert cpu buffer entries to local_t

The entries counter in cpu buffer is not atomic. It can be updated by
other interrupts or from another CPU (readers).

But making entries into "atomic_t" causes an atomic operation that can
hurt performance. Instead we convert it to a local_t that will increment
a counter with a local CPU atomic operation (if the arch supports it).

Instead of fighting with readers and overwrites that decrement the counter,
I added a "read" counter. Every time a reader reads an entry it is
incremented.

We already have a overrun counter and with that, the entries counter and
the read counter, we can calculate the total number of entries in the
buffer with:

  (entries - overrun) - read

As long as the total number of entries in the ring buffer is less than
the word size, this will work. But since the entries counter was previously
a long, this is no different than what we had before.

Thanks to Andrew Morton for pointing out in the first version that
atomic_t does not replace unsigned long. I switched to atomic_long_t
even though it is signed. A negative count is most likely a bug.

[ Impact: keep accurate count of cpu buffer entries ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 26e1359..c792ea8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,8 @@ struct ring_buffer_per_cpu {
 	unsigned long			nmi_dropped;
 	unsigned long			commit_overrun;
 	unsigned long			overrun;
-	unsigned long			entries;
+	unsigned long			read;
+	local_t				entries;
 	u64				write_stamp;
 	u64				read_stamp;
 	atomic_t			record_disabled;
@@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
-		cpu_buffer->entries--;
 	}
 }
 
@@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
 
 	/* Only process further if we own the commit */
 	if (!rb_is_commit(cpu_buffer, event))
@@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * The commit is still visible by the reader, so we
 	 * must increment entries.
 	 */
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
  out:
 	/*
 	 * If a write came in and pushed the tail page
@@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
-	ret = cpu_buffer->entries;
+	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+		- cpu_buffer->read;
 
 	return ret;
 }
@@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		entries += cpu_buffer->entries;
+		entries += (local_read(&cpu_buffer->entries) -
+			    cpu_buffer->overrun) - cpu_buffer->read;
 	}
 
 	return entries;
@@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 			|| rb_discarded_event(event))
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 
 	rb_update_read_stamp(cpu_buffer, event);
 
@@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->nmi_dropped = 0;
 	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
-	cpu_buffer->entries = 0;
+	cpu_buffer->read = 0;
+	local_set(&cpu_buffer->entries, 0);
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Only count data entries */
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 	}
 	__raw_spin_unlock(&cpu_buffer->lock);
 }
-- 
cgit v0.10.2


From 41c51c98f588edcdf6141cff1895df738e03ddd4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 3 May 2009 23:11:18 +0200
Subject: rcu: rcu_sched_grace_period(): kill the bogus flush_signals()

As a kernel thread, rcu_sched_grace_period() runs with all signals ignored.
It can never receive a signal even if it sleeps in TASK_INTERRUPTIBLE, it
needs the explicit allow_signal() to be visible for signals.

[ Impact: reduce kernel size, remove dead code ]

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090503211118.GA22973@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4d..beb0e65 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
 
 		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
 		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-		ret = 0;
+		ret = 0; /* unused */
 		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
 			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
 			ret);
 
-		/*
-		 * Signals would prevent us from sleeping, and we cannot
-		 * do much with them in any case.  So flush them.
-		 */
-		if (ret)
-			flush_signals(current);
 		couldsleepnext = 0;
 
 	} while (!kthread_should_stop());
-- 
cgit v0.10.2


From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 18:44:45 -0400
Subject: ring-buffer: record page entries in buffer page descriptor

Currently, when the ring buffer writer overflows the buffer and must
write over non consumed data, we increment the overrun counter by
reading the entries on the page we are about to overwrite. This reads
the entries one by one.

This is not very effecient. This patch adds another entry counter
into each buffer page descriptor that keeps track of the number of
entries on the page. Now on overwrite, the overrun counter simply
needs to add the number of entries that is on the page it is about
to overwrite.

[ Impact: speed up of ring buffer in overwrite mode ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c792ea8..342eacc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -321,9 +321,10 @@ struct buffer_data_page {
 };
 
 struct buffer_page {
+	struct list_head list;		/* list of buffer pages */
 	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
-	struct list_head list;		/* list of free pages */
+	local_t		 entries;	/* entries on this page */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
 	return rb_page_commit(cpu_buffer->head_page);
 }
 
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	for (head = 0; head < rb_head_size(cpu_buffer);
-	     head += rb_event_length(event)) {
-
-		event = __rb_page_index(cpu_buffer->head_page, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->overrun++;
-	}
-}
-
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
 			       struct buffer_page **bpage)
 {
@@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
 				/* count overflows */
-				rb_update_overflow(cpu_buffer);
+				cpu_buffer->overrun +=
+					local_read(&head_page->entries);
 
 				rb_inc_page(cpu_buffer, &head_page);
 				cpu_buffer->head_page = head_page;
@@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (tail_page == cpu_buffer->tail_page) {
 			local_set(&next_page->write, 0);
+			local_set(&next_page->entries, 0);
 			local_set(&next_page->page->commit, 0);
 			cpu_buffer->tail_page = next_page;
 
@@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
 
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
 	/*
 	 * If this is a commit and the tail is zero, then update
 	 * this page's time stamp.
@@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->list.prev = reader->list.prev;
 
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 
 	/* Make the reader page now replace the head */
@@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
 	local_set(&cpu_buffer->head_page->write, 0);
+	local_set(&cpu_buffer->head_page->entries, 0);
 	local_set(&cpu_buffer->head_page->page->commit, 0);
 
 	cpu_buffer->head_page->read = 0;
@@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
@@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		bpage = reader->page;
 		reader->page = *data_page;
 		local_set(&reader->write, 0);
+		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
 
-- 
cgit v0.10.2


From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 19:40:05 -0400
Subject: ring-buffer: have read page swap increment counter with page entries

In the swap page ring buffer code that is used by the ftrace splice code,
we scan the page to increment the counter of entries read.

With the number of entries already in the page we simply need to add it.

[ Impact: speed up reading page from ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 342eacc..9e42a74 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2785,28 +2785,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *bpage,
-			      unsigned int offset)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = offset; head < local_read(&bpage->commit);
-	     head += rb_event_length(event)) {
-
-		event = __rb_data_page_index(bpage, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->read++;
-	}
-	__raw_spin_unlock(&cpu_buffer->lock);
-}
-
 /**
  * ring_buffer_alloc_read_page - allocate a page to read from buffer
  * @buffer: the buffer to allocate for.
@@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		/* we copied everything to the beginning */
 		read = 0;
 	} else {
+		/* update the entry counter */
+		cpu_buffer->read += local_read(&reader->entries);
+
 		/* swap the pages */
 		rb_init_page(bpage);
 		bpage = reader->page;
@@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
-
-		/* update the entry counter */
-		rb_remove_entries(cpu_buffer, bpage, read);
 	}
 	ret = read;
 
-- 
cgit v0.10.2


From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 20:26:54 -0400
Subject: ring-buffer: disable writers when resetting buffers

As a precaution, it is best to disable writing to the ring buffers
when reseting them.

[ Impact: prevent weird things if write happens during reset ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9e42a74..7876df0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
+	atomic_inc(&cpu_buffer->record_disabled);
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	__raw_spin_lock(&cpu_buffer->lock);
@@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	atomic_dec(&cpu_buffer->record_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
-- 
cgit v0.10.2


From 31b6e76e21b2ffd3cb2f6fe4149790a9fdadce2d Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@MIT.EDU>
Date: Thu, 30 Apr 2009 20:06:11 -0400
Subject: ftrace: use .sched.text, not .text.sched in recordmcount.pl

The only references in the kernel to the .text.sched section are in
recordmcount.pl.  Since the code it has is intended to be example code
it should refer to real kernel sections.  So change it to .sched.text
instead.

[ Impact: consistency in comments ]

Signed-off-by: Tim Abbott <tabbott@mit.edu>
LKML-Reference: <1241136371-10768-1-git-send-email-tabbott@mit.edu>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 409596e..0fae7da 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -26,7 +26,7 @@
 # which will also be the location of that section after final link.
 # e.g.
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  .globl my_func
 #  my_func:
 #        [...]
@@ -39,7 +39,7 @@
 #        [...]
 #
 # Both relocation offsets for the mcounts in the above example will be
-# offset from .text.sched. If we make another file called tmp.s with:
+# offset from .sched.text. If we make another file called tmp.s with:
 #
 #  .section __mcount_loc
 #  .quad  my_func + 0x5
@@ -51,7 +51,7 @@
 # But this gets hard if my_func is not globl (a static function).
 # In such a case we have:
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  my_func:
 #        [...]
 #        call mcount  (offset: 0x5)
-- 
cgit v0.10.2


From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 19:22:53 -0400
Subject: tracing: use proper export symbol for tracing api

When adding the EXPORT_SYMBOL to some of the tracing API, I accidently
used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes
that mistake.

[ Impact: export the tracing code only for GPL modules ]

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 74df029..4164a34 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
-EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
-EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
-- 
cgit v0.10.2


From 53fc0e2259f261602a2750dcc82b8d7bf04d3c35 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:12:48 -0400
Subject: integrity: lsm audit rule matching fix

An audit subsystem change replaced AUDIT_EQUAL with Audit_equal.
Update calls to security_filter_rule_init()/match() to reflect
the change.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index b5291ad..b168c1d 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -96,7 +96,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 	if ((rule->flags & IMA_UID) && rule->uid != tsk->cred->uid)
 		return false;
 	for (i = 0; i < MAX_LSM_RULES; i++) {
-		int rc;
+		int rc = 0;
 		u32 osid, sid;
 
 		if (!rule->lsm[i].rule)
@@ -109,7 +109,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 			security_inode_getsecid(inode, &osid);
 			rc = security_filter_rule_match(osid,
 							rule->lsm[i].type,
-							AUDIT_EQUAL,
+							Audit_equal,
 							rule->lsm[i].rule,
 							NULL);
 			break;
@@ -119,7 +119,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 			security_task_getsecid(tsk, &sid);
 			rc = security_filter_rule_match(sid,
 							rule->lsm[i].type,
-							AUDIT_EQUAL,
+							Audit_equal,
 							rule->lsm[i].rule,
 							NULL);
 		default:
@@ -227,7 +227,7 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry,
 
 	entry->lsm[lsm_rule].type = audit_type;
 	result = security_filter_rule_init(entry->lsm[lsm_rule].type,
-					   AUDIT_EQUAL, args,
+					   Audit_equal, args,
 					   &entry->lsm[lsm_rule].rule);
 	return result;
 }
-- 
cgit v0.10.2


From e5e520a715dcea6b72f6b9417b203a4b1e813a8b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:13:00 -0400
Subject: integrity: use audit_log_string

Based on a request from Eric Paris to simplify parsing, replace
audit_log_format statements containing "%s" with audit_log_string().

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index 1e082bb..c146115 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -54,19 +54,10 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
 			 audit_get_loginuid(current),
 			 audit_get_sessionid(current));
 	audit_log_task_context(ab);
-	switch (audit_msgno) {
-	case AUDIT_INTEGRITY_DATA:
-	case AUDIT_INTEGRITY_METADATA:
-	case AUDIT_INTEGRITY_PCR:
-	case AUDIT_INTEGRITY_STATUS:
-		audit_log_format(ab, " op=%s cause=%s", op, cause);
-		break;
-	case AUDIT_INTEGRITY_HASH:
-		audit_log_format(ab, " op=%s hash=%s", op, cause);
-		break;
-	default:
-		audit_log_format(ab, " op=%s", op);
-	}
+	audit_log_format(ab, " op=");
+	audit_log_string(ab, op);
+	audit_log_format(ab, " cause=");
+	audit_log_string(ab, cause);
 	audit_log_format(ab, " comm=");
 	audit_log_untrustedstring(ab, current->comm);
 	if (fname) {
-- 
cgit v0.10.2


From 07ff7a0b187f3951788f64ae1f30e8109bc8e9eb Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:13:10 -0400
Subject: integrity: remove __setup auditing msgs

Remove integrity audit messages from __setup()

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index c146115..b628eea 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -22,18 +22,9 @@ static int ima_audit;
 static int __init ima_audit_setup(char *str)
 {
 	unsigned long audit;
-	int rc, result = 0;
-	char *op = "ima_audit";
-	char *cause;
 
-	rc = strict_strtoul(str, 0, &audit);
-	if (rc || audit > 1)
-		result = 1;
-	else
-		ima_audit = audit;
-	cause = ima_audit ? "enabled" : "not_enabled";
-	integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL,
-			    op, cause, result, 0);
+	if (!strict_strtoul(str, 0, &audit))
+		ima_audit = audit ? 1 : 0;
 	return 1;
 }
 __setup("ima_audit=", ima_audit_setup);
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index f4e7266..122f17fc 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -29,20 +29,8 @@ int ima_initialized;
 char *ima_hash = "sha1";
 static int __init hash_setup(char *str)
 {
-	const char *op = "hash_setup";
-	const char *hash = "sha1";
-	int result = 0;
-	int audit_info = 0;
-
-	if (strncmp(str, "md5", 3) == 0) {
-		hash = "md5";
-		ima_hash = str;
-	} else if (strncmp(str, "sha1", 4) != 0) {
-		hash = "invalid_hash_type";
-		result = 1;
-	}
-	integrity_audit_msg(AUDIT_INTEGRITY_HASH, NULL, NULL, op, hash,
-			    result, audit_info);
+	if (strncmp(str, "md5", 3) == 0)
+		ima_hash = "md5";
 	return 1;
 }
 __setup("ima_hash=", hash_setup);
-- 
cgit v0.10.2


From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 21:16:11 -0400
Subject: ring-buffer: move big if statement down

In the hot path of the ring buffer "__rb_reserve_next" there's a big
if statement that does not even return back to the work flow.

	code;

	if (cross to next page) {

		[ lots of code ]

		return;
	}

	more code;

The condition is even the unlikely path, although we do not denote it
with an unlikely because gcc is fine with it. The condition is true when
the write crosses a page boundary, and we need to start at a new page.

Having this if statement makes it hard to read, but calling another
function to do the work is also not appropriate, because we are using a lot
of variables that were set before the if statement, and we do not want to
send them as parameters.

This patch changes it to a goto:

	code;

	if (cross to next page)
		goto next_page;

	more code;

	return;

next_page:

	[ lots of code]

This makes the code easier to understand, and a bit more obvious.

The output from gcc is practically identical. For some reason, gcc decided
to use different registers when I switched it to a goto. But other than that,
the logic is the same.

[ Impact: easier to read code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7876df0..424129e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
 	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
+	struct buffer_page *next_page;
 	unsigned long tail, write;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
@@ -1173,137 +1174,140 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE) {
-		struct buffer_page *next_page = tail_page;
+	if (write > BUF_PAGE_SIZE)
+		goto next_page;
 
-		local_irq_save(flags);
-		/*
-		 * Since the write to the buffer is still not
-		 * fully lockless, we must be careful with NMIs.
-		 * The locks in the writers are taken when a write
-		 * crosses to a new page. The locks protect against
-		 * races with the readers (this will soon be fixed
-		 * with a lockless solution).
-		 *
-		 * Because we can not protect against NMIs, and we
-		 * want to keep traces reentrant, we need to manage
-		 * what happens when we are in an NMI.
-		 *
-		 * NMIs can happen after we take the lock.
-		 * If we are in an NMI, only take the lock
-		 * if it is not already taken. Otherwise
-		 * simply fail.
-		 */
-		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
-				cpu_buffer->nmi_dropped++;
-				goto out_reset;
-			}
-		} else
-			__raw_spin_lock(&cpu_buffer->lock);
-
-		lock_taken = true;
+	/* We reserved something on the buffer */
 
-		rb_inc_page(cpu_buffer, &next_page);
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
 
-		head_page = cpu_buffer->head_page;
-		reader_page = cpu_buffer->reader_page;
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
 
-		/* we grabbed the lock before incrementing */
-		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-			goto out_reset;
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
 
-		/*
-		 * If for some reason, we had an interrupt storm that made
-		 * it all the way around the buffer, bail, and warn
-		 * about it.
-		 */
-		if (unlikely(next_page == commit_page)) {
-			cpu_buffer->commit_overrun++;
-			goto out_reset;
-		}
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
 
-		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE))
-				goto out_reset;
+	return event;
 
-			/* tail_page has not moved yet? */
-			if (tail_page == cpu_buffer->tail_page) {
-				/* count overflows */
-				cpu_buffer->overrun +=
-					local_read(&head_page->entries);
+ next_page:
 
-				rb_inc_page(cpu_buffer, &head_page);
-				cpu_buffer->head_page = head_page;
-				cpu_buffer->head_page->read = 0;
-			}
-		}
+	next_page = tail_page;
 
-		/*
-		 * If the tail page is still the same as what we think
-		 * it is, then it is up to us to update the tail
-		 * pointer.
-		 */
-		if (tail_page == cpu_buffer->tail_page) {
-			local_set(&next_page->write, 0);
-			local_set(&next_page->entries, 0);
-			local_set(&next_page->page->commit, 0);
-			cpu_buffer->tail_page = next_page;
-
-			/* reread the time stamp */
-			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-			cpu_buffer->tail_page->page->time_stamp = *ts;
+	local_irq_save(flags);
+	/*
+	 * Since the write to the buffer is still not
+	 * fully lockless, we must be careful with NMIs.
+	 * The locks in the writers are taken when a write
+	 * crosses to a new page. The locks protect against
+	 * races with the readers (this will soon be fixed
+	 * with a lockless solution).
+	 *
+	 * Because we can not protect against NMIs, and we
+	 * want to keep traces reentrant, we need to manage
+	 * what happens when we are in an NMI.
+	 *
+	 * NMIs can happen after we take the lock.
+	 * If we are in an NMI, only take the lock
+	 * if it is not already taken. Otherwise
+	 * simply fail.
+	 */
+	if (unlikely(in_nmi())) {
+		if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+			cpu_buffer->nmi_dropped++;
+			goto out_reset;
 		}
+	} else
+		__raw_spin_lock(&cpu_buffer->lock);
 
-		/*
-		 * The actual tail page has moved forward.
-		 */
-		if (tail < BUF_PAGE_SIZE) {
-			/* Mark the rest of the page with padding */
-			event = __rb_page_index(tail_page, tail);
-			rb_event_set_padding(event);
-		}
+	lock_taken = true;
 
-		if (tail <= BUF_PAGE_SIZE)
-			/* Set the write back to the previous setting */
-			local_set(&tail_page->write, tail);
+	rb_inc_page(cpu_buffer, &next_page);
 
-		/*
-		 * If this was a commit entry that failed,
-		 * increment that too
-		 */
-		if (tail_page == cpu_buffer->commit_page &&
-		    tail == rb_commit_index(cpu_buffer)) {
-			rb_set_commit_to_write(cpu_buffer);
-		}
+	head_page = cpu_buffer->head_page;
+	reader_page = cpu_buffer->reader_page;
 
-		__raw_spin_unlock(&cpu_buffer->lock);
-		local_irq_restore(flags);
+	/* we grabbed the lock before incrementing */
+	if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+		goto out_reset;
 
-		/* fail and let the caller try again */
-		return ERR_PTR(-EAGAIN);
+	/*
+	 * If for some reason, we had an interrupt storm that made
+	 * it all the way around the buffer, bail, and warn
+	 * about it.
+	 */
+	if (unlikely(next_page == commit_page)) {
+		cpu_buffer->commit_overrun++;
+		goto out_reset;
 	}
 
-	/* We reserved something on the buffer */
+	if (next_page == head_page) {
+		if (!(buffer->flags & RB_FL_OVERWRITE))
+			goto out_reset;
 
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
+		/* tail_page has not moved yet? */
+		if (tail_page == cpu_buffer->tail_page) {
+			/* count overflows */
+			cpu_buffer->overrun +=
+				local_read(&head_page->entries);
 
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
+			rb_inc_page(cpu_buffer, &head_page);
+			cpu_buffer->head_page = head_page;
+			cpu_buffer->head_page->read = 0;
+		}
+	}
 
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
+	/*
+	 * If the tail page is still the same as what we think
+	 * it is, then it is up to us to update the tail
+	 * pointer.
+	 */
+	if (tail_page == cpu_buffer->tail_page) {
+		local_set(&next_page->write, 0);
+		local_set(&next_page->entries, 0);
+		local_set(&next_page->page->commit, 0);
+		cpu_buffer->tail_page = next_page;
+
+		/* reread the time stamp */
+		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		cpu_buffer->tail_page->page->time_stamp = *ts;
+	}
 
 	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
+	 * The actual tail page has moved forward.
 	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
+	if (tail < BUF_PAGE_SIZE) {
+		/* Mark the rest of the page with padding */
+		event = __rb_page_index(tail_page, tail);
+		rb_event_set_padding(event);
+	}
 
-	return event;
+	if (tail <= BUF_PAGE_SIZE)
+		/* Set the write back to the previous setting */
+		local_set(&tail_page->write, tail);
+
+	/*
+	 * If this was a commit entry that failed,
+	 * increment that too
+	 */
+	if (tail_page == cpu_buffer->commit_page &&
+	    tail == rb_commit_index(cpu_buffer)) {
+		rb_set_commit_to_write(cpu_buffer);
+	}
+
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
+
+	/* fail and let the caller try again */
+	return ERR_PTR(-EAGAIN);
 
  out_reset:
 	/* reset write */
-- 
cgit v0.10.2


From c898faf91b3ec6b0f6efa35831b3984fa3331db0 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 5 May 2009 17:28:56 -0400
Subject: x86: 46 bit physical address support on 64 bits

Extend the maximum addressable memory on x86-64 from 2^44 to
2^46 bytes. This requires some shuffling around of the vmalloc
and virtual memmap memory areas, to keep them away from the
direct mapping of up to 64TB of physical memory.

This patch also introduces a guard hole between the vmalloc
area and the virtual memory map space.  There's really no
good reason why we wouldn't have a guard hole there.

[ Impact: future hardware enablement ]

Signed-off-by: Rik van Riel <riel@redhat.com>
LKML-Reference: <20090505172856.6820db22@cuia.bos.redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 29b52b1..5394132 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory
-ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
-ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
+ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory
+ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
+ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
+ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 3f58718..6fadb02 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -47,7 +47,7 @@
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	48
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8..766ea16 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define MAXMEM		 _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START    _AC(0xffffc20000000000, UL)
-#define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START	 _AC(0xffffe20000000000, UL)
+#define VMALLOC_START    _AC(0xffffc90000000000, UL)
+#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START	 _AC(0xffffea0000000000, UL)
 #define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c0..4517d6b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
 # define MAX_PHYSADDR_BITS	44
-# define MAX_PHYSMEM_BITS	44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS	46
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
-- 
cgit v0.10.2


From 2feceeff1e771850e49f9074307f071964fd9e3e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 19:07:07 -0700
Subject: x86: fix typo in address space documentation

Fix a trivial typo in Documentation/x86/x86_64/mm.txt.

[ Impact: documentation only ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Rik van Riel <riel@redhat.com>

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5394132..d6498e3 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory
+ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
 ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
 ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-- 
cgit v0.10.2


From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 22:47:18 -0400
Subject: ring-buffer: add benchmark and tester

This patch adds code that can benchmark the ring buffer as well as
test it. This code can be compiled into the kernel (not recommended)
or as a module.

A separate ring buffer is used to not interfer with other users, like
ftrace. It creates a producer and a consumer (option to disable creation
of the consumer) and will run for 10 seconds, then sleep for 10 seconds
and then repeat.

While running, the producer will write 10 byte loads into the ring
buffer with just putting in the current CPU number. The reader will
continually try to read the buffer. The reader will alternate from reading
the buffer via event by event, or by full pages.

The output is a pr_info, thus it will fill up the syslogs.

  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9000349 (usecs)
  Overruns: 12578640
  Read:     5358440  (by events)
  Entries:  0
  Total:    17937080
  Missed:   0
  Hit:      17937080
  Entries per millisec: 1993
  501 ns per entry
  Sleeping for 10 secs
  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9936350 (usecs)
  Overruns: 0
  Read:     28146644  (by pages)
  Entries:  74
  Total:    28146718
  Missed:   0
  Hit:      28146718
  Entries per millisec: 2832
  353 ns per entry
  Sleeping for 10 secs

Time:      is the time the test ran
Overruns:  the number of events that were overwritten and not read
Read:      the number of events read (either by pages or events)
Entries:   the number of entries left in the buffer
                 (the by pages will only read full pages)
Total:     Entries + Read + Overruns
Missed:    the number of entries that failed to write
Hit:       the number of entries that were written

The above example shows that it takes ~353 nanosecs per entry when
there is a reader, reading by pages (and no overruns)

The event by event reader slowed the producer down to 501 nanosecs.

[ Impact: see how changes to the ring buffer affect stability and performance ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 450d3c2..50f62a2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -471,6 +471,22 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
+config RING_BUFFER_BENCHMARK
+	tristate "Ring buffer benchmark stress tester"
+	depends on RING_BUFFER
+	help
+	  This option creates a test to stress the ring buffer and bench mark it.
+	  It creates its own ring buffer such that it will not interfer with
+	  any other users of the ring buffer (such as ftrace). It then creates
+	  a producer and consumer that will run for 10 seconds and sleep for
+	  10 seconds. Each interval it will print out the number of events
+	  it recorded and give a rough estimate of how long each iteration took.
+
+	  It does not disable interrupts or raise its priority, so it may be
+	  affected by processes that are running.
+
+	  If unsure, say N
+
 endif # FTRACE
 
 endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index fb9d7f9..7c34cbf 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ endif
 
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 0000000..747244a
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,379 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+	u64		ts;
+	local_t		commit;
+	char		data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME	10
+#define SLEEP_TIME	10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST()				\
+	do {					\
+		if (!kill_test) {		\
+			kill_test = 1;		\
+			WARN_ON(1);		\
+		}				\
+	} while (0)
+
+enum event_status {
+	EVENT_FOUND,
+	EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+	struct ring_buffer_event *event;
+	int *entry;
+	u64 ts;
+
+	event = ring_buffer_consume(buffer, cpu, &ts);
+	if (!event)
+		return EVENT_DROPPED;
+
+	entry = ring_buffer_event_data(event);
+	if (*entry != cpu) {
+		KILL_TEST();
+		return EVENT_DROPPED;
+	}
+
+	read++;
+	return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+	struct ring_buffer_event *event;
+	struct rb_page *rpage;
+	unsigned long commit;
+	void *bpage;
+	int *entry;
+	int ret;
+	int inc;
+	int i;
+
+	bpage = ring_buffer_alloc_read_page(buffer);
+	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	if (ret >= 0) {
+		rpage = bpage;
+		commit = local_read(&rpage->commit);
+		for (i = 0; i < commit && !kill_test; i += inc) {
+
+			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+				KILL_TEST();
+				break;
+			}
+
+			inc = -1;
+			event = (void *)&rpage->data[i];
+			switch (event->type_len) {
+			case RINGBUF_TYPE_PADDING:
+				/* We don't expect any padding */
+				KILL_TEST();
+				break;
+			case RINGBUF_TYPE_TIME_EXTEND:
+				inc = 8;
+				break;
+			case 0:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				if (!event->array[0]) {
+					KILL_TEST();
+					break;
+				}
+				inc = event->array[0];
+				break;
+			default:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				inc = ((event->type_len + 1) * 4);
+			}
+			if (kill_test)
+				break;
+
+			if (inc <= 0) {
+				KILL_TEST();
+				break;
+			}
+		}
+	}
+	ring_buffer_free_read_page(buffer, bpage);
+
+	if (ret < 0)
+		return EVENT_DROPPED;
+	return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+	/* toggle between reading pages and events */
+	read_events ^= 1;
+
+	read = 0;
+	while (!reader_finish && !kill_test) {
+		int found;
+
+		do {
+			int cpu;
+
+			found = 0;
+			for_each_online_cpu(cpu) {
+				enum event_status stat;
+
+				if (read_events)
+					stat = read_event(cpu);
+				else
+					stat = read_page(cpu);
+
+				if (kill_test)
+					break;
+				if (stat == EVENT_FOUND)
+					found = 1;
+			}
+		} while (found && !kill_test);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (reader_finish)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	reader_finish = 0;
+	complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+	struct timeval start_tv;
+	struct timeval end_tv;
+	unsigned long long time;
+	unsigned long long entries;
+	unsigned long long overruns;
+	unsigned long missed = 0;
+	unsigned long hit = 0;
+	unsigned long avg;
+	int cnt = 0;
+
+	/*
+	 * Hammer the buffer for 10 secs (this may
+	 * make the system stall)
+	 */
+	pr_info("Starting ring buffer hammer\n");
+	do_gettimeofday(&start_tv);
+	do {
+		struct ring_buffer_event *event;
+		int *entry;
+
+		event = ring_buffer_lock_reserve(buffer, 10);
+		if (!event) {
+			missed++;
+		} else {
+			hit++;
+			entry = ring_buffer_event_data(event);
+			*entry = smp_processor_id();
+			ring_buffer_unlock_commit(buffer, event);
+		}
+		do_gettimeofday(&end_tv);
+
+		if (consumer && !(++cnt % wakeup_interval))
+			wake_up_process(consumer);
+
+	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+	pr_info("End ring buffer hammer\n");
+
+	if (consumer) {
+		/* Init both completions here to avoid races */
+		init_completion(&read_start);
+		init_completion(&read_done);
+		/* the completions must be visible before the finish var */
+		smp_wmb();
+		reader_finish = 1;
+		/* finish var visible before waking up the consumer */
+		smp_wmb();
+		wake_up_process(consumer);
+		wait_for_completion(&read_done);
+	}
+
+	time = end_tv.tv_sec - start_tv.tv_sec;
+	time *= 1000000;
+	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+	entries = ring_buffer_entries(buffer);
+	overruns = ring_buffer_overruns(buffer);
+
+	if (kill_test)
+		pr_info("ERROR!\n");
+	pr_info("Time:     %lld (usecs)\n", time);
+	pr_info("Overruns: %lld\n", overruns);
+	if (disable_reader)
+		pr_info("Read:     (reader disabled)\n");
+	else
+		pr_info("Read:     %ld  (by %s)\n", read,
+			read_events ? "events" : "pages");
+	pr_info("Entries:  %lld\n", entries);
+	pr_info("Total:    %lld\n", entries + overruns + read);
+	pr_info("Missed:   %ld\n", missed);
+	pr_info("Hit:      %ld\n", hit);
+
+	do_div(time, 1000);
+	if (time)
+		hit /= (long)time;
+	else
+		pr_info("TIME IS ZERO??\n");
+
+	pr_info("Entries per millisec: %ld\n", hit);
+
+	if (hit) {
+		avg = 1000000 / hit;
+		pr_info("%ld ns per entry\n", avg);
+	}
+}
+
+static void wait_to_die(void)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+	while (!kthread_should_stop() && !kill_test) {
+		complete(&read_start);
+
+		ring_buffer_consumer();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop() || kill_test)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+	init_completion(&read_start);
+
+	while (!kthread_should_stop() && !kill_test) {
+		ring_buffer_reset(buffer);
+
+		if (consumer) {
+			smp_wmb();
+			wake_up_process(consumer);
+			wait_for_completion(&read_start);
+		}
+
+		ring_buffer_producer();
+
+		pr_info("Sleeping for 10 secs\n");
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ * SLEEP_TIME);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+	int ret;
+
+	/* make a one meg buffer in overwite mode */
+	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (!disable_reader) {
+		consumer = kthread_create(ring_buffer_consumer_thread,
+					  NULL, "rb_consumer");
+		ret = PTR_ERR(consumer);
+		if (IS_ERR(consumer))
+			goto out_fail;
+	}
+
+	producer = kthread_run(ring_buffer_producer_thread,
+			       NULL, "rb_producer");
+	ret = PTR_ERR(producer);
+
+	if (IS_ERR(producer))
+		goto out_kill;
+
+	return 0;
+
+ out_kill:
+	if (consumer)
+		kthread_stop(consumer);
+
+ out_fail:
+	ring_buffer_free(buffer);
+	return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+	kthread_stop(producer);
+	if (consumer)
+		kthread_stop(consumer);
+	ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From b2e5d8588de0b5341eddad87dbe48d2185eaa3dd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 May 2009 07:55:33 +0200
Subject: irq: change ->set_affinity() to return status, fix

This build failure:

 arch/powerpc/sysdev/mpic.c:810: error: conflicting types for 'mpic_set_affinity'
 arch/powerpc/sysdev/mpic.h:39: error: previous declaration of 'mpic_set_affinity' was here
 make[2]: *** [arch/powerpc/sysdev/mpic.o] Error 1
 make[2]: *** Waiting for unfinished jobs....

Triggers because the function prototype was not updated when the
function call signature got changed by:

   d5dedd4: irq: change ->set_affinity() to return status

[ Impact: build fix on powerpc ]

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-arch@vger.kernel.org
LKML-Reference: <49F654E9.4070809@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 3cef2af..eff433c 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
-- 
cgit v0.10.2


From fd6da10a617f483348ee32bcfe53fd20c302eca1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:32:13 +0800
Subject: tracing/events: don't say hi when loading the trace event sample

The sample is useful for testing, and I'm using it. But after
loading the module, it keeps saying hi every 10 seconds, this may
be disturbing.

Also Steven said commenting out the "hi" helped in causing races. :)

[ Impact: make testing a bit easier ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A00F6AD.2070008@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index f33b3ba..aabc4e9 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -16,10 +16,6 @@ static void simple_thread_func(int cnt)
 	set_current_state(TASK_INTERRUPTIBLE);
 	schedule_timeout(HZ);
 	trace_foo_bar("hello", cnt);
-
-	if (!(cnt % 10))
-		/* It is really important that I say "hi!" */
-		printk(KERN_EMERG "hi!\n");
 }
 
 static int simple_thread(void *arg)
-- 
cgit v0.10.2


From 96d17980fabeb757706d2d6db5a28580a6156bfc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:32:32 +0800
Subject: tracing/events: make SAMPLE_TRACE_EVENTS default to n

Normally a config should be default to n. This patch also makes the
sample module-only, like SAMPLE_MARKERS and SAMPLE_TRACEPOINTS.

[ Impact: don't build trace event sample by default ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A00F6C0.8090803@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/samples/Kconfig b/samples/Kconfig
index 93f41c0..b75d28c 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -20,9 +20,8 @@ config SAMPLE_TRACEPOINTS
 	  This build tracepoints example modules.
 
 config SAMPLE_TRACE_EVENTS
-	tristate "Build trace_events examples"
-	depends on EVENT_TRACING
-	default m
+	tristate "Build trace_events examples -- loadable modules only"
+	depends on EVENT_TRACING && m
 	help
 	  This build trace event example modules.
 
-- 
cgit v0.10.2


From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:04 +0800
Subject: tracing/events: fix memory leak when unloading module

When unloading a module, memory allocated by init_preds() and
trace_define_field() is not freed.

[ Impact: fix memory leak ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 5fff40c..662c1be 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -116,6 +116,7 @@ struct ftrace_event_call {
 #define MAX_FILTER_STR_VAL	128
 
 extern int init_preds(struct ftrace_event_call *call);
+extern void destroy_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern int filter_current_check_discard(struct ftrace_event_call *call,
 					void *rec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f789ca5..f251a15 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,6 +60,22 @@ err:
 }
 EXPORT_SYMBOL_GPL(trace_define_field);
 
+#ifdef CONFIG_MODULES
+
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+	struct ftrace_event_field *field, *next;
+
+	list_for_each_entry_safe(field, next, &call->fields, link) {
+		list_del(&field->link);
+		kfree(field->type);
+		kfree(field->name);
+		kfree(field);
+	}
+}
+
+#endif /* CONFIG_MODULES */
+
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
@@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
+			trace_destroy_fields(call);
+			destroy_preds(call);
 		}
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f494866..ce07b81 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
+void destroy_preds(struct ftrace_event_call *call)
+{
+	struct event_filter *filter = call->filter;
+	int i;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
+	}
+	kfree(filter->preds);
+	kfree(filter);
+	call->filter = NULL;
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call)
 	return 0;
 
 oom:
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (filter->preds[i])
-			filter_free_pred(filter->preds[i]);
-	}
-	kfree(filter->preds);
-	kfree(call->filter);
-	call->filter = NULL;
+	destroy_preds(call);
 
 	return -ENOMEM;
 }
-- 
cgit v0.10.2


From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:45 +0800
Subject: tracing/events: fix concurrent access to ftrace_events list

A module will add/remove its trace events when it gets loaded/unloaded, so
the ftrace_events list is not "const", and concurrent access needs to be
protected.

This patch thus fixes races between loading/unloding modules and read
'available_events' or read/write 'set_event', etc.

Below shows how to reproduce the race:

 # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null &
 # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } &

After a while:

BUG: unable to handle kernel paging request at 0010011c
IP: [<c1080f27>] t_next+0x1b/0x2d
...
Call Trace:
 [<c10c90e6>] ? seq_read+0x217/0x30d
 [<c10c8ecf>] ? seq_read+0x0/0x30d
 [<c10b4c19>] ? vfs_read+0x8f/0x136
 [<c10b4fc3>] ? sys_read+0x40/0x65
 [<c1002a68>] ? sysenter_do_call+0x12/0x36

[ Impact: fix races when concurrent accessing ftrace_events list ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4A00F709.3080800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7736fe8..777c6c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event,	\
 	return match;							\
 }
 
+extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 7bf2ad6..5b5895a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,21 +10,30 @@
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
+	int ret = -EINVAL;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_enable(event);
+		if (event->id == event_id) {
+			ret = event->profile_enable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 
-	return -EINVAL;
+	return ret;
 }
 
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_disable(event);
+		if (event->id == event_id) {
+			event->profile_disable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f251a15..8d579ff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,7 +21,7 @@
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
@@ -80,6 +80,7 @@ static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
@@ -87,6 +88,7 @@ static void ftrace_clear_events(void)
 			call->unregfunc();
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
@@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return t_next(m, NULL, pos);
 }
 
@@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return s_next(m, NULL, pos);
 }
 
@@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v)
 
 static void t_stop(struct seq_file *m, void *p)
 {
+	mutex_unlock(&event_mutex);
 }
 
 static int
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	const struct seq_operations *seq_ops;
 
 	if ((file->f_mode & FMODE_WRITE) &&
@@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 		ftrace_clear_events();
 
 	seq_ops = inode->i_private;
-	ret = seq_open(file, seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-
-		m->private = ftrace_events.next;
-	}
-	return ret;
+	return seq_open(file, seq_ops);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ce07b81..7ac6910 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 		filter->n_preds = 0;
 	}
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
@@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 			remove_filter_string(call->filter);
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
+	int err = 0;
 
 	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
@@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 	filter->preds[filter->n_preds] = pred;
 	filter->n_preds++;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-		int err;
 
 		if (!call->define_fields)
 			continue;
@@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 		if (err) {
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			return err;
+			break;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
+	mutex_unlock(&event_mutex);
 
-	return 0;
+	return err;
 }
 
 static void parse_init(struct filter_parse_state *ps,
-- 
cgit v0.10.2


From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 5 May 2009 16:49:59 +0800
Subject: tracepoint: trace_sched_migrate_task(): remove parameter

The orig_cpu parameter in trace_sched_migrate_task() is not necessary,
it can be got by using task_cpu(p) in the probe.

[ Impact: micro-optimization ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
[ modified from Mathieu's patch. The original patch is at:
  http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ]
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: zhaolei@cn.fujitsu.com
Cc: laijs@cn.fujitsu.com
LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ffa1cab..dd4033c 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -180,9 +180,9 @@ TRACE_EVENT(sched_switch,
  */
 TRACE_EVENT(sched_migrate_task,
 
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+	TP_PROTO(struct task_struct *p, int dest_cpu),
 
-	TP_ARGS(p, orig_cpu, dest_cpu),
+	TP_ARGS(p, dest_cpu),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -196,7 +196,7 @@ TRACE_EVENT(sched_migrate_task,
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
+		__entry->orig_cpu	= task_cpu(p);
 		__entry->dest_cpu	= dest_cpu;
 	),
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 9f7ffd0..9cdedbd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
-- 
cgit v0.10.2


From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:27:26 -0400
Subject: blktrace: correct remap names

This attempts to clarify names utilized during block I/O remap
operations (partition, volume manager). It correctly matches up the
/from/ information for both device & sector. This takes in the concept
from Kosaki Motohiro and extends it to include better naming for the
"device_from" field.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF4FAE.3000301@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 62763c9..82b4636 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -116,9 +116,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-	__be32 device;
 	__be32 device_from;
-	__be64 sector;
+	__be32 device_to;
+	__be64 sector_from;
 };
 
 enum {
diff --git a/include/trace/block.h b/include/trace/block.h
index 25b7068..87f6456 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from, sector_t to),
-	      TP_ARGS(q, bio, dev, from, to));
+		 sector_t to, sector_t from),
+	      TP_ARGS(q, bio, dev, to, from));
 
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c32062b..f8d46d6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @from:	source sector
  * @to:		target sector
+ * @from:	source sector
  *
  * Description:
  *     Device mapper or raid target sometimes need to split a bio because
@@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
+				       dev_t dev, sector_t to, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 		return;
 
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
 			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
@@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
 			  struct blk_io_trace_remap *r)
 {
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
+	__u64 sector_from = __r->sector_from;
 
-	r->device = be32_to_cpu(__r->device);
 	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
+	r->device_to   = be32_to_cpu(__r->device_to);
+	r->sector_from = be64_to_cpu(sector_from);
 }
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s,
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
-	struct blk_io_trace_remap r = { .device = 0, };
+	struct blk_io_trace_remap r = { .device_from = 0, };
 
 	get_pdu_remap(ent, &r);
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-			       t_sector(ent),
-			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
-			       (unsigned long long)r.sector);
+				t_sector(ent), t_sec(ent),
+				MAJOR(r.device_from), MINOR(r.device_from),
+				(unsigned long long)r.sector_from);
 }
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
-- 
cgit v0.10.2


From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:35:08 -0400
Subject: blktrace: from-sector redundant in trace_block_remap

Remove redundant from-sector parameter: it's /always/ the bio's sector
passed in.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF517C.7000503@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/block/blk-core.c b/block/blk-core.c
index 07ab754..a5f747a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1275,7 +1275,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_bdev = bdev->bd_contains;
 
 		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
+				    bdev->bd_dev,
 				    bio->bi_sector - p->start_sect);
 	}
 }
@@ -1444,8 +1444,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, bio->bi_sector,
-					    old_sector);
+			trace_block_remap(q, bio, old_dev, old_sector);
 
 		trace_block_bio_queue(q, bio);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a994be..b01514a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* the bio has been remapped so dispatch it */
 
 		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+				    tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
diff --git a/include/trace/block.h b/include/trace/block.h
index 87f6456..8ac945b 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t to, sector_t from),
-	      TP_ARGS(q, bio, dev, to, from));
+		 sector_t to),
+	      TP_ARGS(q, bio, dev, to));
 
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f8d46d6..e099f8c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @to:		target sector
  * @from:	source sector
  *
  * Description:
@@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t to, sector_t from)
+				       dev_t dev, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
 	r.sector_from = cpu_to_be64(from);
 
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+			sizeof(r), &r);
 }
 
 /**
-- 
cgit v0.10.2


From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 6 May 2009 15:45:45 +0530
Subject: tracing: trace_output.c, fix false positive compiler warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This compiler warning:

  CC      kernel/trace/trace_output.o
 kernel/trace/trace_output.c: In function ‘register_ftrace_event’:
 kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function

Is wrong as 'list' is always initialized - but GCC (4.3.2) does not
recognize this relationship properly.

Work around the warning by initializing the variable to NULL.

[ Impact: fix false positive compiler warning ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5fc51f0..8bd9a2c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event)
 	INIT_LIST_HEAD(&event->list);
 
 	if (!event->type) {
-		struct list_head *list;
+		struct list_head *list = NULL;
 
 		if (next_event_type > FTRACE_MAX_EVENT) {
 
-- 
cgit v0.10.2


From 35cf723e99c0e26ddf51f037dffaa4ff2c2c9106 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 May 2009 12:33:38 +0200
Subject: tracing: small trave_events sample Makefile cleanup

Use -I$(src) to add the current directory the include path.

[ Impact: cleanup ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
index 06c6dea..0d428dc 100644
--- a/samples/trace_events/Makefile
+++ b/samples/trace_events/Makefile
@@ -1,8 +1,6 @@
 # builds the trace events example kernel modules;
 # then to use one (as root):  insmod <module_name.ko>
 
-PWD := $(shell pwd)
-
-CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+CFLAGS_trace-events-sample.o := -I$(src)
 
 obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
-- 
cgit v0.10.2


From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 10:26:45 -0400
Subject: ring-buffer: remove unneeded conditional in rb_reserve_next

The code in __rb_reserve_next checks on page overflow if it is the
original commiter and then resets the page back to the original
setting.  Although this is fine, and the code is correct, it is
a bit fragil. Some experimental work I did breaks it easily.

The better and more robust solution is to have all commiters that
overflow the page, simply subtract what they added.

[ Impact: more robust ring buffer account management ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 424129e..03ed52b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_event_set_padding(event);
 	}
 
-	if (tail <= BUF_PAGE_SIZE)
-		/* Set the write back to the previous setting */
-		local_set(&tail_page->write, tail);
+	/* Set the write back to the previous setting */
+	local_sub(length, &tail_page->write);
 
 	/*
 	 * If this was a commit entry that failed,
@@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_reset:
 	/* reset write */
-	if (tail <= BUF_PAGE_SIZE)
-		local_set(&tail_page->write, tail);
+	local_sub(length, &tail_page->write);
 
 	if (likely(lock_taken))
 		__raw_spin_unlock(&cpu_buffer->lock);
-- 
cgit v0.10.2


From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 12:40:51 -0400
Subject: ring-buffer: check for failed allocation in ring buffer benchmark

The result of the allocation of the ring buffer read page in the
ring buffer bench mark does not check the return to see if a page
was actually allocated. This patch fixes that.

[ Impact: avoid NULL dereference ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 747244a..dcd75e9 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -84,6 +84,9 @@ static enum event_status read_page(int cpu)
 	int i;
 
 	bpage = ring_buffer_alloc_read_page(buffer);
+	if (!bpage)
+		return EVENT_DROPPED;
+
 	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
 	if (ret >= 0) {
 		rpage = bpage;
-- 
cgit v0.10.2


From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 15:30:07 -0400
Subject: ring-buffer: make moving the tail page a separate function

Ingo Molnar thought the code would be cleaner if we used a function call
instead of a goto for moving the tail page. After implementing this,
it seems that gcc still inlines the result and the output is pretty much
the same. Since this is considered a cleaner approach, might as well
implement it.

[ Impact: code clean up ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 03ed52b..3ae5ccf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
+
 static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	     unsigned long length, unsigned long tail,
+	     struct buffer_page *commit_page,
+	     struct buffer_page *tail_page, u64 *ts)
 {
-	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-	struct buffer_page *next_page;
-	unsigned long tail, write;
+	struct buffer_page *next_page, *head_page, *reader_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
-	unsigned long flags;
 	bool lock_taken = false;
-
-	commit_page = cpu_buffer->commit_page;
-	/* we just need to protect against interrupts */
-	barrier();
-	tail_page = cpu_buffer->tail_page;
-	write = local_add_return(length, &tail_page->write);
-	tail = write - length;
-
-	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE)
-		goto next_page;
-
-	/* We reserved something on the buffer */
-
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
-
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
-
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
-
-	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
-	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
-
-	return event;
-
- next_page:
+	unsigned long flags;
 
 	next_page = tail_page;
 
@@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+		  unsigned type, unsigned long length, u64 *ts)
+{
+	struct buffer_page *tail_page, *commit_page;
+	struct ring_buffer_event *event;
+	unsigned long tail, write;
+
+	commit_page = cpu_buffer->commit_page;
+	/* we just need to protect against interrupts */
+	barrier();
+	tail_page = cpu_buffer->tail_page;
+	write = local_add_return(length, &tail_page->write);
+	tail = write - length;
+
+	/* See if we shot pass the end of this buffer page */
+	if (write > BUF_PAGE_SIZE)
+		return rb_move_tail(cpu_buffer, length, tail,
+				    commit_page, tail_page, ts);
+
+	/* We reserved something on the buffer */
+
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
+
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
+
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
+
+	return event;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
-- 
cgit v0.10.2


From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 18:36:59 -0400
Subject: ring-buffer: change test to be more latency friendly

The ring buffer benchmark/test runs a producer for 10 seconds.
This is done with preemption and interrupts enabled. But if the kernel
is not compiled with CONFIG_PREEMPT, it basically stops everything
but interrupts for 10 seconds.

Although this is just a test and is not for production, this attribute
can be quite annoying. It can also spawn badness elsewhere.

This patch solves the issues by calling "cond_resched" when the system
is not compiled with CONFIG_PREEMPT. It also keeps track of the time
spent to call cond_resched such that it does not go against the
time calculations. That is, if the task schedules away, the time scheduled
out is removed from the test data. Note, this only works for non PREEMPT
because we do not know when the task is scheduled out if we have PREEMPT
enabled.

[ Impact: prevent test from stopping the world for 10 seconds ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index dcd75e9..a26fc67 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,6 +185,35 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
+/*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call cond_resched
+ * and also add any time that was lost by a rescedule.
+ */
+#ifdef CONFIG_PREEMPT
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+}
+#else
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+	struct timeval tv;
+
+	cond_resched();
+	do_gettimeofday(&tv);
+	if (tv.tv_usec < end_tv->tv_usec) {
+		tv.tv_usec += 1000000;
+		tv.tv_sec--;
+	}
+	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
+	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
+	if (start_tv->tv_usec > 1000000) {
+		start_tv->tv_usec -= 1000000;
+		start_tv->tv_sec++;
+	}
+}
+#endif
+
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -221,6 +250,8 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+		sched_if_needed(&start_tv, &end_tv);
+
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
 
-- 
cgit v0.10.2


From 71e1c8ac42ae4038ddb1367cce7097ab868dc532 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 21:20:39 -0400
Subject: tracing: update sample with TRACE_INCLUDE_FILE

When creating trace events for ftrace, the header file with the TRACE_EVENT
macros must also have a macro called TRACE_SYSTEM. This macro describes
the name of the system the TRACE_EVENTS are defined for. It also doubles
as a way for the define_trace.h file to include the file that included
it.

For example:

in irq.h

 #define TRACE_SYSTEM irq

[...]

 #include <trace/define_trace.h>

The define_trace will use TRACE_SYSTEM to include irq.h. But if the name
of the trace system does not match the name of the trace header file,
one can override it with:

Which will change define_trace.h to inclued foo_trace.h instead of foo.h

The sample comments this, but people that use the sample code will more
likely use the code and not read the comments. This patch changes the
sample code to use the TRACE_INCLUDE_FILE to better show developers how to
use it.

[ Impact: make sample less confusing to developers ]

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index eab4644..128a897 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -31,7 +31,7 @@
  *
  */
 #undef TRACE_SYSTEM
-#define TRACE_SYSTEM trace-events-sample
+#define TRACE_SYSTEM sample
 
 /*
  * The TRACE_EVENT macro is broken up into 5 parts.
@@ -120,5 +120,10 @@ TRACE_EVENT(foo_bar,
  * result.
  */
 #undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
 #define TRACE_INCLUDE_PATH .
+/*
+ * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
+ */
+#define TRACE_INCLUDE_FILE trace-events-sample
 #include <trace/define_trace.h>
-- 
cgit v0.10.2


From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 21:54:09 -0400
Subject: tracing: reset ring buffer when removing modules with events

Li Zefan found that there's a race using the event ids of events and
modules. When a module is loaded, an event id is incremented. We only
have 16 bits for event ids (65536) and there is a possible (but highly
unlikely) race that we could load and unload a module that registers
events so many times that the event id counter overflows.

When it overflows, it then restarts and goes looking for available
ids. An id is available if it was added by a module and released.

The race is if you have one module add an id, and then is removed.
Another module loaded can use that same event id. But if the old module
still had events in the ring buffer, the new module's call back would
get bogus data.  At best (and most likely) the output would just be
garbage. But if the module for some reason used pointers (not recommended)
then this could potentially crash.

The safest thing to do is just reset the ring buffer if a module that
registered events is removed.

[ Impact: prevent unpredictable results of event id overflows ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4164a34..dd40d23 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 }
 
+void tracing_reset_current(int cpu)
+{
+	tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+	tracing_reset_online_cpus(&global_trace);
+}
+
 #define SAVED_CMDLINES 128
 #define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 777c6c3..ba25793 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -409,6 +409,8 @@ int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
 				 mode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d579ff..6d2c842 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
+	bool found = false;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
+			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
 				call->unregfunc();
@@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod)
 		list_del(&file_ops->list);
 		kfree(file_ops);
 	}
+
+	/*
+	 * It is safest to reset the ring buffer if the module being unloaded
+	 * registered any events.
+	 */
+	if (found)
+		tracing_reset_current_online_cpus();
 }
 
 static int trace_module_notify(struct notifier_block *self,
-- 
cgit v0.10.2


From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 22:52:15 -0400
Subject: tracing: add hierarchical enabling of events

With the current event directory, you can only enable individual events.
The file debugfs/tracing/set_event is used to be able to enable or
disable several events at once. But that can still be awkward.

This patch adds hierarchical enabling of events. That is, each directory
in debugfs/tracing/events has an "enable" file. This file can enable
or disable all events within the directory and below.

 # echo 1 > /debugfs/tracing/events/enable

will enable all events.

 # echo 1 > /debugfs/tracing/events/sched/enable

will enable all events in the sched subsystem.

 # echo 1 > /debugfs/tracing/events/enable
 # echo 0 > /debugfs/tracing/events/irq/enable

will enable all events, but then disable just the irq subsystem events.

When reading one of these enable files, there are four results:

 0 - all events this file affects are disabled
 1 - all events this file affects are enabled
 X - there is a mixture of events enabled and disabled
 ? - this file does not affect any event

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6d2c842..87feb01 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	struct ftrace_event_call *call;
+	char buf[2];
+	int set = -1;
+	int all = 0;
+	int ret;
+
+	if (system[0] == '*')
+		all = 1;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!all && strcmp(call->system, system) != 0)
+			continue;
+
+		/*
+		 * We need to find out if all the events are set
+		 * or if all events or cleared, or if we have
+		 * a mixture.
+		 */
+		if (call->enabled) {
+			switch (set) {
+			case -1:
+				set = 1;
+				break;
+			case 0:
+				set = 2;
+				break;
+			}
+		} else {
+			switch (set) {
+			case -1:
+				set = 0;
+				break;
+			case 1:
+				set = 2;
+				break;
+			}
+		}
+		/*
+		 * If we have a mixture, no need to look further.
+		 */
+		if (set == 2)
+			break;
+	}
+	mutex_unlock(&event_mutex);
+
+	buf[1] = '\n';
+	switch (set) {
+	case 0:
+		buf[0] = '0';
+		break;
+	case 1:
+		buf[0] = '1';
+		break;
+	case 2:
+		buf[0] = 'X';
+		break;
+	default:
+		buf[0] = '?';
+	}
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+	return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		    loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	unsigned long val;
+	char *command;
+	char buf[64];
+	ssize_t ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+	case 1:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	command = kstrdup(system, GFP_KERNEL);
+	if (!command)
+		return -ENOMEM;
+
+	ret = ftrace_set_clr_event(command, val);
+	if (ret)
+		goto out_free;
+
+	ret = cnt;
+
+ out_free:
+	kfree(command);
+
+	*ppos += cnt;
+
+	return ret;
+}
+
 extern char *__bad_type_size(void);
 
 #undef FIELD
@@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_system_enable_fops = {
+	.open = tracing_open_generic,
+	.read = system_enable_read,
+	.write = system_enable_write,
+};
+
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
@@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 			   "'%s/filter' entry\n", name);
 	}
 
+	entry = trace_create_file("enable", 0644, system->entry,
+				  (void *)system->name,
+				  &ftrace_system_enable_fops);
+
 	return system->entry;
 }
 
@@ -1041,6 +1178,9 @@ static __init int event_trace_init(void)
 			  ring_buffer_print_entry_header,
 			  &ftrace_show_header_fops);
 
+	trace_create_file("enable", 0644, d_events,
+			  "*:*", &ftrace_system_enable_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
cgit v0.10.2


From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001
From: Samuel Bronson <naesten@gmail.com>
Date: Wed, 6 May 2009 22:27:55 -0400
Subject: x86: use symbolic name for VM86_SIGNAL when used as vm86 default
 return

This code has apparently used "0" and not VM86_SIGNAL since Linux
1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the
code to use the symbolic name.

The magic 0 tripped me up in trying to extend the vm86(2) manpage to
actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up
fruitless.

[ Impact: cleanup; no object code change ]

Signed-off-by: Samuel Bronson <naesten@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e..b8035a0 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	}
 
 /*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
-	info->regs32->ax = 0;
+	info->regs32->ax = VM86_SIGNAL;
 	tsk->thread.saved_sp0 = tsk->thread.sp0;
 	tsk->thread.saved_fs = info->regs32->fs;
 	tsk->thread.saved_gs = get_user_gs(info->regs32);
-- 
cgit v0.10.2


From 40c8bca76ecaa6b663d403d34f0fcd422bbdbffd Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 15:24:36 +0900
Subject: sh: Flag IRQSTACKS as BROKEN for now.

There still seems to be some stack corruption to sort out here, so flag
this as BROKEN until this issue is sorted out.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index 0d62681..fdb0493 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -92,7 +92,7 @@ config 4KSTACKS
 
 config IRQSTACKS
 	bool "Use separate kernel stacks when processing interrupts"
-	depends on DEBUG_KERNEL && SUPERH32
+	depends on DEBUG_KERNEL && SUPERH32 && BROKEN
 	help
 	  If you say Y here the kernel will use separate kernel stacks
 	  for handling hard and soft interrupts.  This can help avoid
-- 
cgit v0.10.2


From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 7 May 2009 09:12:50 +0200
Subject: x86: clean up arch/x86/kernel/tsc_sync.c a bit

 - remove unused define
 - make the lock variable definition stand out some more
 - convert KERN_* to pr_info() / pr_warning()

[ Impact: cleanup ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328..027b5b4 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
  * of a critical section, to be able to prove TSC time-warps:
  */
 static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
 static __cpuinitdata cycles_t last_tsc;
 static __cpuinitdata cycles_t max_warp;
 static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
 		return;
 
 	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
-		printk(KERN_INFO
-		       "Skipping synchronization checks as TSC is reliable.\n");
+		pr_info("Skipping synchronization checks as TSC is reliable.\n");
 		return;
 	}
 
-	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
-			  smp_processor_id(), cpu);
+	pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+		smp_processor_id(), cpu);
 
 	/*
 	 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
 
 	if (nr_warps) {
 		printk("\n");
-		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
-				    " turning off TSC clock.\n", max_warp);
+		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+			   "turning off TSC clock.\n", max_warp);
 		mark_tsc_unstable("check_tsc_sync_source failed");
 	} else {
 		printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
 	while (atomic_read(&stop_count) != cpus)
 		cpu_relax();
 }
-#undef NR_LOOPS
-
-- 
cgit v0.10.2


From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 4 May 2009 01:38:05 -0700
Subject: sched: emit thread info flags with stack trace

When a thread is oom killed and fails to exit, it's helpful to know which
threads have access to memory reserves if the machine livelocks.  This is
done by testing for the TIF_MEMDIE thread info flag and should be
displayed alongside stack traces to identify tasks that have access to
such reserves but are still stuck allocating pages, for instance.

It would probably be helpful in other cases as well, so all thread info
flags are emitted when showing a task.

( v2: fix warning reported by Stephen Rothwell )

[ Impact: extend debug printout info ]

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
LKML-Reference: <alpine.DEB.2.00.0905040136390.15831@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 2a43a58..5aa63f5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
-- 
cgit v0.10.2


From ee1acbfabd5270b40ce2cfdc202070b7ca91cdff Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 16:38:16 +0900
Subject: sh: Handle shm_align_mask also for HAVE_ARCH_UNMAPPED_AREA_TOPDOWN.

Presently shm_align_mask is only looked at for the bottom up case, but we
still want this for proper colouring constraints in the topdown case.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index 09acbc3..4c5462d 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -75,7 +75,5 @@ extern void copy_from_user_page(struct vm_area_struct *vma,
 #define flush_cache_vmap(start, end)		flush_cache_all()
 #define flush_cache_vunmap(start, end)		flush_cache_all()
 
-#define HAVE_ARCH_UNMAPPED_AREA
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_SH_CACHEFLUSH_H */
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index b517ae0..2a011b1 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -154,6 +154,10 @@ extern void kmap_coherent_init(void);
 #define kmap_coherent_init()	do { } while (0)
 #endif
 
+/* arch/sh/mm/mmap.c */
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
 #include <asm-generic/pgtable.h>
 
 #endif /* __ASM_SH_PGTABLE_H */
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 931f4d0..1b5fdfb 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/mm/mmap.c
  *
- * Copyright (C) 2008  Paul Mundt
+ * Copyright (C) 2008 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -21,9 +21,26 @@ EXPORT_SYMBOL(shm_align_mask);
 /*
  * To avoid cache aliases, we map the shared page with same color.
  */
-#define COLOUR_ALIGN(addr, pgoff)				\
-	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
-	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
+static inline unsigned long COLOUR_ALIGN(unsigned long addr,
+					 unsigned long pgoff)
+{
+	unsigned long base = (addr + shm_align_mask) & ~shm_align_mask;
+	unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
+
+	return base + off;
+}
+
+static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
+					      unsigned long pgoff)
+{
+	unsigned long base = addr & ~shm_align_mask;
+	unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
+
+	if (base + off <= addr)
+		return base + off;
+
+	return base - off;
+}
 
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -103,6 +120,117 @@ full_search:
 			addr = COLOUR_ALIGN(addr, pgoff);
 	}
 }
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = addr0;
+	int do_colour_align;
+
+	if (flags & MAP_FIXED) {
+		/* We do not accept a shared mapping if it would violate
+		 * cache aliasing constraints.
+		 */
+		if ((flags & MAP_SHARED) &&
+		    ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (unlikely(len > TASK_SIZE))
+		return -ENOMEM;
+
+	do_colour_align = 0;
+	if (filp || (flags & MAP_SHARED))
+		do_colour_align = 1;
+
+	/* requesting a specific address */
+	if (addr) {
+		if (do_colour_align)
+			addr = COLOUR_ALIGN(addr, pgoff);
+		else
+			addr = PAGE_ALIGN(addr);
+
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = mm->mmap_base;
+	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache;
+	if (do_colour_align) {
+		unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+
+		addr = base + len;
+	}
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = mm->mmap_base-len;
+	if (do_colour_align)
+		addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+		if (do_colour_align)
+			addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
 #endif /* CONFIG_MMU */
 
 /*
-- 
cgit v0.10.2


From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 02:48:52 +0200
Subject: tracing/filters: support for filters of dynamic sized arrays

Currently the filtering infrastructure supports well the
numeric types and fixed sized array types.

But the recently added __string() field uses a specific
indirect offset mechanism which requires a specific
predicate. Until now it wasn't supported.

This patch adds this support and implies very few changes,
only a new predicate is needed, the management of this specific
field can be done through the usual string helpers in the
filtering infrastructure.

[ Impact: support all kinds of strings in the tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7ac6910..01c76eb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
 	return val1 || val2;
 }
 
+/* Filter predicate for fixed sized arrays of characters */
 static int filter_pred_string(struct filter_pred *pred, void *event,
 			      int val1, int val2)
 {
@@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 	return match;
 }
 
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+			      int val1, int val2)
+{
+	int str_loc = *(int *)(event + pred->offset);
+	char *addr = (char *)(event + str_loc);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
 static int filter_pred_none(struct filter_pred *pred, void *event,
 			    int val1, int val2)
 {
@@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 	return 0;
 }
 
+enum {
+	FILTER_STATIC_STRING = 1,
+	FILTER_DYN_STRING
+};
+
 static int is_string_field(const char *type)
 {
 	if (strchr(type, '[') && strstr(type, "char"))
-		return 1;
+		return FILTER_STATIC_STRING;
+
+	if (!strcmp(type, "__str_loc"))
+		return FILTER_DYN_STRING;
 
 	return 0;
 }
@@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
+	int string_type;
 
 	pred->fn = filter_pred_none;
 
@@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
 		return -EINVAL;
 	}
 
-	if (is_string_field(field->type)) {
-		fn = filter_pred_string;
+	string_type = is_string_field(field->type);
+	if (string_type) {
+		if (string_type == FILTER_STATIC_STRING)
+			fn = filter_pred_string;
+		else
+			fn = filter_pred_strloc;
 		pred->str_len = field->size;
 		if (pred->op == OP_NE)
 			pred->not = 1;
-- 
cgit v0.10.2


From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 03:03:57 +0200
Subject: tracing/filters: support for operator reserved characters in strings

When we set a filter for an event, such as:

echo "name == my_lock_name" > \
	/debug/tracing/events/lockdep/lock_acquired/filter

then the following order of token type is parsed:

- space
- operator
- parentheses
- operand

Because the operators and parentheses have a higher precedence
than the operand characters, which is normal, then we can't
use any string containing such special characters:

()=<>!&|

To get this support and also avoid ambiguous intepretation from
the parser or the human, we can do it using double quotes so that
we keep the usual languages habits.

Then after this patch you can still declare string condition like
before:

echo name == myname

But if you want to compare against a string containing an operator
character, you can use double quotes:

echo 'name == "&myname"'

Don't forget to include the whole expression into single quotes or
the double ones will be eaten by echo.

[ Impact: support strings with special characters for tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 01c76eb..8c62e5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps)
 
 static int filter_parse(struct filter_parse_state *ps)
 {
+	int in_string = 0;
 	int op, top_op;
 	char ch;
 
 	while ((ch = infix_next(ps))) {
+		if (ch == '"') {
+			in_string ^= 1;
+			continue;
+		}
+
+		if (in_string)
+			goto parse_operand;
+
 		if (isspace(ch))
 			continue;
 
@@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps)
 			}
 			continue;
 		}
+parse_operand:
 		if (append_operand_char(ps, ch)) {
 			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
 			return -EINVAL;
-- 
cgit v0.10.2


From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 7 May 2009 15:11:15 +0800
Subject: tracing/events: fix concurrent access to ftrace_events list, fix

In filter_add_subsystem_pred() we should release event_mutex before
calling filter_free_subsystem_preds(), since both functions hold
event_mutex.

[ Impact: fix deadlock when writing invalid pred into subsystem filter ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: tzanussi@gmail.com
Cc: a.p.zijlstra@chello.nl
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
LKML-Reference: <4A028993.7020509@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8c62e5b..85ad6a8 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 
 		err = filter_add_pred(ps, call, pred);
 		if (err) {
+			mutex_unlock(&event_mutex);
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			break;
+			goto out;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
 	mutex_unlock(&event_mutex);
-
+out:
 	return err;
 }
 
-- 
cgit v0.10.2


From ae318a148e4d255dfbc87d963fdd6031c2af9c46 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 17:55:09 +0900
Subject: sh: sh64 still needs ARCH_USES_GETTIMEOFFSET temporarily.

sh64 is still using this, so re-enable it temporarily while SH-5 gets
converted to use the generic code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 565f390..02688d7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -141,6 +141,10 @@ config ARCH_NO_VIRT_TO_BUS
 config ARCH_HAS_DEFAULT_IDLE
 	def_bool y
 
+config ARCH_USES_GETTIMEOFFSET
+	def_bool y 
+	depends on SUPERH64
+
 config IO_TRAPPED
 	bool
 
-- 
cgit v0.10.2


From 0fb849b9d743a20056f2418cd955e5c650658663 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 18:10:27 +0900
Subject: sh: Integrate the SH-5 onchip_remap() more coherently.

Presently this is special-cased for early initialization. While there are
situations where these static early initializations are still necessary,
with minor changes it is possible to use this for the regular ioremap
implementation as well. This allows us to kill off the special-casing for
the remap completely and to start tidying up all of the SH-5
special-casing in drivers.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c
index da62ad5..dbb00c9 100644
--- a/arch/sh/boards/mach-cayman/irq.c
+++ b/arch/sh/boards/mach-cayman/irq.c
@@ -161,7 +161,7 @@ void init_cayman_irq(void)
 {
 	int i;
 
-	epld_virt = onchip_remap(EPLD_BASE, 1024, "EPLD");
+	epld_virt = (unsigned long)ioremap_nocache(EPLD_BASE, 1024);
 	if (!epld_virt) {
 		printk(KERN_ERR "Cayman IRQ: Unable to remap EPLD\n");
 		return;
diff --git a/arch/sh/boards/mach-cayman/setup.c b/arch/sh/boards/mach-cayman/setup.c
index e7f9cc5..7e8216a 100644
--- a/arch/sh/boards/mach-cayman/setup.c
+++ b/arch/sh/boards/mach-cayman/setup.c
@@ -102,7 +102,7 @@ static int __init smsc_superio_setup(void)
 {
 	unsigned char devid, devrev;
 
-	smsc_superio_virt = onchip_remap(SMSC_SUPERIO_BASE, 1024, "SMSC SuperIO");
+	smsc_superio_virt = (unsigned long)ioremap_nocache(SMSC_SUPERIO_BASE, 1024);
 	if (!smsc_superio_virt) {
 		panic("Unable to remap SMSC SuperIO\n");
 	}
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index cf43185..873ed2b 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -119,12 +119,12 @@ static int __init sh5pci_init(void)
                 return -EINVAL;
         }
 
-	pcicr_virt = onchip_remap(SH5PCI_ICR_BASE, 1024, "PCICR");
+	pcicr_virt = (unsigned long)ioremap_nocache(SH5PCI_ICR_BASE, 1024);
 	if (!pcicr_virt) {
 		panic("Unable to remap PCICR\n");
 	}
 
-	PCI_IO_AREA = onchip_remap(SH5PCI_IO_BASE, 0x10000, "PCIIO");
+	PCI_IO_AREA = (unsigned long)ioremap_nocache(SH5PCI_IO_BASE, 0x10000);
 	if (!PCI_IO_AREA) {
 		panic("Unable to remap PCIIO\n");
 	}
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index ef21734..c7c360b 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -224,11 +224,6 @@ void __iomem *__ioremap(unsigned long offset, unsigned long size,
 			unsigned long flags);
 void __iounmap(void __iomem *addr);
 
-/* arch/sh/mm/ioremap_64.c */
-unsigned long onchip_remap(unsigned long addr, unsigned long size,
-			   const char *name);
-extern void onchip_unmap(unsigned long vaddr);
-
 static inline void __iomem *
 __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 {
@@ -263,8 +258,6 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 	return __ioremap(offset, size, flags);
 }
 #else
-#define onchip_remap(addr, size, name)		(addr)
-#define onchip_unmap(addr)			do { } while (0)
 #define __ioremap_mode(offset, size, flags)	((void __iomem *)(offset))
 #define __iounmap(addr)				do { } while (0)
 #endif /* CONFIG_MMU */
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index d8679a4..02d8137 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -188,7 +188,7 @@ void __init plat_irq_setup(void)
 	unsigned long reg;
 	int i;
 
-	intc_virt = onchip_remap(INTC_BASE, 1024, "INTC");
+	intc_virt = (unsigned long)ioremap_nocache(INTC_BASE, 1024);
 	if (!intc_virt) {
 		panic("Unable to remap INTC\n");
 	}
diff --git a/arch/sh/kernel/cpu/sh5/Makefile b/arch/sh/kernel/cpu/sh5/Makefile
index ce4602e..a184a31 100644
--- a/arch/sh/kernel/cpu/sh5/Makefile
+++ b/arch/sh/kernel/cpu/sh5/Makefile
@@ -6,6 +6,9 @@ obj-y := entry.o probe.o switchto.o
 obj-$(CONFIG_SH_FPU)		+= fpu.o
 obj-$(CONFIG_KALLSYMS)		+= unwind.o
 
+# CPU subtype setup
+obj-$(CONFIG_CPU_SH5)		+= setup-sh5.o
+
 # Primary on-chip clocks (common)
 clock-$(CONFIG_CPU_SH5)		:= clock-sh5.o
 
diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c
index 52c4924..5486324 100644
--- a/arch/sh/kernel/cpu/sh5/clock-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c
@@ -71,7 +71,7 @@ static struct clk_ops *sh5_clk_ops[] = {
 
 void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 {
-	cprc_base = onchip_remap(CPRC_BASE, 1024, "CPRC");
+	cprc_base = (unsigned long)ioremap_nocache(CPRC_BASE, 1024);
 	BUG_ON(!cprc_base);
 
 	if (idx < ARRAY_SIZE(sh5_clk_ops))
diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S
index 7e49cb8..5162975 100644
--- a/arch/sh/kernel/cpu/sh5/entry.S
+++ b/arch/sh/kernel/cpu/sh5/entry.S
@@ -1410,8 +1410,8 @@ peek_real_address_q:
 	   r2(out) : result quadword
 
 	   This is provided as a cheapskate way of manipulating device
-	   registers for debugging (to avoid the need to onchip_remap the debug
-	   module, and to avoid the need to onchip_remap the watchpoint
+	   registers for debugging (to avoid the need to ioremap the debug
+	   module, and to avoid the need to ioremap the watchpoint
 	   controller in a way that identity maps sufficient bits to avoid the
 	   SH5-101 cut2 silicon defect).
 
@@ -1459,8 +1459,8 @@ poke_real_address_q:
 	   r3 : quadword value to write.
 
 	   This is provided as a cheapskate way of manipulating device
-	   registers for debugging (to avoid the need to onchip_remap the debug
-	   module, and to avoid the need to onchip_remap the watchpoint
+	   registers for debugging (to avoid the need to ioremap the debug
+	   module, and to avoid the need to ioremap the watchpoint
 	   controller in a way that identity maps sufficient bits to avoid the
 	   SH5-101 cut2 silicon defect).
 
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
new file mode 100644
index 0000000..d8d59fe
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -0,0 +1,46 @@
+/*
+ * SH5-101/SH5-103 CPU Setup
+ *
+ *  Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/serial_sci.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <asm/addrspace.h>
+
+static struct plat_sci_port sci_platform_data[] = {
+	{
+		.mapbase	= PHYS_PERIPHERAL_BLOCK + 0x01030000,
+		.flags		= UPF_BOOT_AUTOCONF | UPF_IOREMAP,
+		.type		= PORT_SCIF,
+		.irqs		= { 39, 40, 42, 0 },
+	}, {
+		.flags = 0,
+	}
+};
+
+static struct platform_device sci_device = {
+	.name		= "sh-sci",
+	.id		= -1,
+	.dev		= {
+		.platform_data	= sci_platform_data,
+	},
+};
+
+static struct platform_device *sh5_devices[] __initdata = {
+	&sci_device,
+};
+
+static int __init sh5_devices_setup(void)
+{
+	return platform_add_devices(sh5_devices,
+				    ARRAY_SIZE(sh5_devices));
+}
+__initcall(sh5_devices_setup);
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index f4f5e8a..7bfeaa0 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -243,12 +243,12 @@ void __init time_init(void)
 	unsigned long interval;
 	struct clk *clk;
 
-	tmu_base = onchip_remap(TMU_BASE, 1024, "TMU");
+	tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024);
 	if (!tmu_base) {
 		panic("Unable to remap TMU\n");
 	}
 
-	rtc_base = onchip_remap(RTC_BASE, 1024, "RTC");
+	rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024);
 	if (!rtc_base) {
 		panic("Unable to remap RTC\n");
 	}
diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c
index 31e1bb5..2331229 100644
--- a/arch/sh/mm/ioremap_64.c
+++ b/arch/sh/mm/ioremap_64.c
@@ -27,88 +27,17 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
-static void shmedia_mapioaddr(unsigned long, unsigned long);
-static unsigned long shmedia_ioremap(struct resource *, u32, int);
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void *__ioremap(unsigned long phys_addr, unsigned long size,
-		unsigned long flags)
-{
-	void * addr;
-	struct vm_struct * area;
-	unsigned long offset, last_addr;
-	pgprot_t pgprot;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	pgprot = __pgprot(_PAGE_PRESENT  | _PAGE_READ   |
-			  _PAGE_WRITE    | _PAGE_DIRTY  |
-			  _PAGE_ACCESSED | _PAGE_SHARED | flags);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
-	/*
-	 * Ok, go for it..
-	 */
-	area = get_vm_area(size, VM_IOREMAP);
-	if (!area)
-		return NULL;
-	pr_debug("Get vm_area returns %p addr %p\n", area, area->addr);
-	area->phys_addr = phys_addr;
-	addr = area->addr;
-	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, pgprot)) {
-		vunmap(addr);
-		return NULL;
-	}
-	return (void *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-void __iounmap(void *addr)
-{
-	struct vm_struct *area;
-
-	vfree((void *) (PAGE_MASK & (unsigned long) addr));
-	area = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr));
-	if (!area) {
-		printk(KERN_ERR "iounmap: bad address %p\n", addr);
-		return;
-	}
-
-	kfree(area);
-}
-EXPORT_SYMBOL(__iounmap);
-
 static struct resource shmedia_iomap = {
 	.name	= "shmedia_iomap",
 	.start	= IOBASE_VADDR + PAGE_SIZE,
 	.end	= IOBASE_END - 1,
 };
 
-static void shmedia_mapioaddr(unsigned long pa, unsigned long va);
+static void shmedia_mapioaddr(unsigned long pa, unsigned long va,
+			      unsigned long flags);
 static void shmedia_unmapioaddr(unsigned long vaddr);
-static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz);
+static void __iomem *shmedia_ioremap(struct resource *res, u32 pa,
+				     int sz, unsigned long flags);
 
 /*
  * We have the same problem as the SPARC, so lets have the same comment:
@@ -130,18 +59,18 @@ static struct xresource xresv[XNRES];
 
 static struct xresource *xres_alloc(void)
 {
-        struct xresource *xrp;
-        int n;
-
-        xrp = xresv;
-        for (n = 0; n < XNRES; n++) {
-                if (xrp->xflag == 0) {
-                        xrp->xflag = 1;
-                        return xrp;
-                }
-                xrp++;
-        }
-        return NULL;
+	struct xresource *xrp;
+	int n;
+
+	xrp = xresv;
+	for (n = 0; n < XNRES; n++) {
+		if (xrp->xflag == 0) {
+			xrp->xflag = 1;
+			return xrp;
+		}
+		xrp++;
+	}
+	return NULL;
 }
 
 static void xres_free(struct xresource *xrp)
@@ -161,76 +90,71 @@ static struct resource *shmedia_find_resource(struct resource *root,
 	return NULL;
 }
 
-static unsigned long shmedia_alloc_io(unsigned long phys, unsigned long size,
-				      const char *name)
+static void __iomem *shmedia_alloc_io(unsigned long phys, unsigned long size,
+				      const char *name, unsigned long flags)
 {
-        static int printed_full = 0;
-        struct xresource *xres;
-        struct resource *res;
-        char *tack;
-        int tlen;
-
-        if (name == NULL) name = "???";
-
-        if ((xres = xres_alloc()) != 0) {
-                tack = xres->xname;
-                res = &xres->xres;
-        } else {
-                if (!printed_full) {
-                        printk("%s: done with statics, switching to kmalloc\n",
-			       __func__);
-                        printed_full = 1;
-                }
-                tlen = strlen(name);
-                tack = kmalloc(sizeof (struct resource) + tlen + 1, GFP_KERNEL);
-                if (!tack)
-			return -ENOMEM;
-                memset(tack, 0, sizeof(struct resource));
-                res = (struct resource *) tack;
-                tack += sizeof (struct resource);
-        }
-
-        strncpy(tack, name, XNMLN);
-        tack[XNMLN] = 0;
-        res->name = tack;
-
-        return shmedia_ioremap(res, phys, size);
+	static int printed_full;
+	struct xresource *xres;
+	struct resource *res;
+	char *tack;
+	int tlen;
+
+	if (name == NULL)
+		name = "???";
+
+	xres = xres_alloc();
+	if (xres != 0) {
+		tack = xres->xname;
+		res = &xres->xres;
+	} else {
+		if (!printed_full) {
+			printk(KERN_NOTICE "%s: done with statics, "
+			       "switching to kmalloc\n", __func__);
+			printed_full = 1;
+		}
+		tlen = strlen(name);
+		tack = kmalloc(sizeof(struct resource) + tlen + 1, GFP_KERNEL);
+		if (!tack)
+			return NULL;
+		memset(tack, 0, sizeof(struct resource));
+		res = (struct resource *) tack;
+		tack += sizeof(struct resource);
+	}
+
+	strncpy(tack, name, XNMLN);
+	tack[XNMLN] = 0;
+	res->name = tack;
+
+	return shmedia_ioremap(res, phys, size, flags);
 }
 
-static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz)
+static void __iomem *shmedia_ioremap(struct resource *res, u32 pa, int sz,
+				     unsigned long flags)
 {
-        unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK);
+	unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK);
 	unsigned long round_sz = (offset + sz + PAGE_SIZE-1) & PAGE_MASK;
-        unsigned long va;
-        unsigned int psz;
+	unsigned long va;
+	unsigned int psz;
 
-        if (allocate_resource(&shmedia_iomap, res, round_sz,
+	if (allocate_resource(&shmedia_iomap, res, round_sz,
 			      shmedia_iomap.start, shmedia_iomap.end,
 			      PAGE_SIZE, NULL, NULL) != 0) {
-                panic("alloc_io_res(%s): cannot occupy\n",
-                    (res->name != NULL)? res->name: "???");
-        }
+		panic("alloc_io_res(%s): cannot occupy\n",
+		      (res->name != NULL) ? res->name : "???");
+	}
 
-        va = res->start;
-        pa &= PAGE_MASK;
+	va = res->start;
+	pa &= PAGE_MASK;
 
 	psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
 
-	/* log at boot time ... */
-	printk("mapioaddr: %6s  [%2d page%s]  va 0x%08lx   pa 0x%08x\n",
-	       ((res->name != NULL) ? res->name : "???"),
-	       psz, psz == 1 ? " " : "s", va, pa);
-
-        for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) {
-                shmedia_mapioaddr(pa, va);
-                va += PAGE_SIZE;
-                pa += PAGE_SIZE;
-        }
-
-        res->start += offset;
-        res->end = res->start + sz - 1;         /* not strictly necessary.. */
+	for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) {
+		shmedia_mapioaddr(pa, va, flags);
+		va += PAGE_SIZE;
+		pa += PAGE_SIZE;
+	}
 
-        return res->start;
+	return (void __iomem *)(unsigned long)(res->start + offset);
 }
 
 static void shmedia_free_io(struct resource *res)
@@ -249,14 +173,12 @@ static void shmedia_free_io(struct resource *res)
 
 static __init_refok void *sh64_get_page(void)
 {
-	extern int after_bootmem;
 	void *page;
 
-	if (after_bootmem) {
-		page = (void *)get_zeroed_page(GFP_ATOMIC);
-	} else {
+	if (after_bootmem)
+		page = (void *)get_zeroed_page(GFP_KERNEL);
+	else
 		page = alloc_bootmem_pages(PAGE_SIZE);
-	}
 
 	if (!page || ((unsigned long)page & ~PAGE_MASK))
 		panic("sh64_get_page: Out of memory already?\n");
@@ -264,17 +186,20 @@ static __init_refok void *sh64_get_page(void)
 	return page;
 }
 
-static void shmedia_mapioaddr(unsigned long pa, unsigned long va)
+static void shmedia_mapioaddr(unsigned long pa, unsigned long va,
+			      unsigned long flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep, pte;
 	pgprot_t prot;
-	unsigned long flags = 1; /* 1 = CB0-1 device */
 
 	pr_debug("shmedia_mapiopage pa %08lx va %08lx\n",  pa, va);
 
+	if (!flags)
+		flags = 1; /* 1 = CB0-1 device */
+
 	pgdp = pgd_offset_k(va);
 	if (pgd_none(*pgdp) || !pgd_present(*pgdp)) {
 		pudp = (pud_t *)sh64_get_page();
@@ -288,7 +213,7 @@ static void shmedia_mapioaddr(unsigned long pa, unsigned long va)
 	}
 
 	pmdp = pmd_offset(pudp, va);
-	if (pmd_none(*pmdp) || !pmd_present(*pmdp) ) {
+	if (pmd_none(*pmdp) || !pmd_present(*pmdp)) {
 		ptep = (pte_t *)sh64_get_page();
 		set_pmd(pmdp, __pmd((unsigned long)ptep + _PAGE_TABLE));
 	}
@@ -336,17 +261,19 @@ static void shmedia_unmapioaddr(unsigned long vaddr)
 	pte_clear(&init_mm, vaddr, ptep);
 }
 
-unsigned long onchip_remap(unsigned long phys, unsigned long size, const char *name)
+void __iomem *__ioremap(unsigned long offset, unsigned long size,
+			unsigned long flags)
 {
-	if (size < PAGE_SIZE)
-		size = PAGE_SIZE;
+	char name[14];
 
-	return shmedia_alloc_io(phys, size, name);
+	sprintf(name, "phys_%08x", (u32)offset);
+	return shmedia_alloc_io(offset, size, name, flags);
 }
-EXPORT_SYMBOL(onchip_remap);
+EXPORT_SYMBOL(__ioremap);
 
-void onchip_unmap(unsigned long vaddr)
+void __iounmap(void __iomem *virtual)
 {
+	unsigned long vaddr = (unsigned long)virtual & PAGE_MASK;
 	struct resource *res;
 	unsigned int psz;
 
@@ -357,10 +284,7 @@ void onchip_unmap(unsigned long vaddr)
 		return;
 	}
 
-        psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
-
-        printk(KERN_DEBUG "unmapioaddr: %6s  [%2d page%s] freed\n",
-	       res->name, psz, psz == 1 ? " " : "s");
+	psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
 
 	shmedia_free_io(res);
 
@@ -371,9 +295,8 @@ void onchip_unmap(unsigned long vaddr)
 		kfree(res);
 	}
 }
-EXPORT_SYMBOL(onchip_unmap);
+EXPORT_SYMBOL(__iounmap);
 
-#ifdef CONFIG_PROC_FS
 static int
 ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 		  void *data)
@@ -385,7 +308,10 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 	for (r = ((struct resource *)data)->child; r != NULL; r = r->sibling) {
 		if (p + 32 >= e)        /* Better than nothing */
 			break;
-		if ((nm = r->name) == 0) nm = "???";
+		nm = r->name;
+		if (nm == NULL)
+			nm = "???";
+
 		p += sprintf(p, "%08lx-%08lx: %s\n",
 			     (unsigned long)r->start,
 			     (unsigned long)r->end, nm);
@@ -393,14 +319,11 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 
 	return p-buf;
 }
-#endif /* CONFIG_PROC_FS */
 
 static int __init register_proc_onchip(void)
 {
-#ifdef CONFIG_PROC_FS
-	create_proc_read_entry("io_map",0,0, ioremap_proc_info, &shmedia_iomap);
-#endif
+	create_proc_read_entry("io_map", 0, 0, ioremap_proc_info,
+			       &shmedia_iomap);
 	return 0;
 }
-
-__initcall(register_proc_onchip);
+late_initcall(register_proc_onchip);
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index dbf5357..728d6a0 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -985,13 +985,7 @@ static void sci_config_port(struct uart_port *port, int flags)
 	port->type = s->type;
 
 	if (port->flags & UPF_IOREMAP && !port->membase) {
-#if defined(CONFIG_SUPERH64)
-		port->mapbase = onchip_remap(SCIF_ADDR_SH5, 1024, "SCIF");
-		port->membase = (void __iomem *)port->mapbase;
-#else
 		port->membase = ioremap_nocache(port->mapbase, 0x40);
-#endif
-
 		dev_err(port->dev, "can't remap port#%d\n", port->line);
 	}
 }
-- 
cgit v0.10.2


From c51279ec0d2d959e13831ae84b714301f0494f27 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 18:17:20 +0900
Subject: sh: Kill off unused SH-5 irq_describe cruft.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c
index dbb00c9..33f77085 100644
--- a/arch/sh/boards/mach-cayman/irq.c
+++ b/arch/sh/boards/mach-cayman/irq.c
@@ -142,21 +142,6 @@ int cayman_irq_demux(int evt)
 	return irq;
 }
 
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL)
-int cayman_irq_describe(char* p, int irq)
-{
-	if (irq < NR_INTC_IRQS) {
-		return intc_irq_describe(p, irq);
-	} else if (irq < NR_INTC_IRQS + 8) {
-		return sprintf(p, "(SMSC %d)", irq - NR_INTC_IRQS);
-	} else if ((irq >= NR_INTC_IRQS + 24) && (irq < NR_INTC_IRQS + 32)) {
-		return sprintf(p, "(PCI2 %d)", irq - (NR_INTC_IRQS + 24));
-	}
-
-	return 0;
-}
-#endif
-
 void init_cayman_irq(void)
 {
 	int i;
diff --git a/arch/sh/include/cpu-sh5/cpu/irq.h b/arch/sh/include/cpu-sh5/cpu/irq.h
index f0f0756e..0ccf257 100644
--- a/arch/sh/include/cpu-sh5/cpu/irq.h
+++ b/arch/sh/include/cpu-sh5/cpu/irq.h
@@ -111,7 +111,6 @@
 #define TOP_PRIORITY	15
 
 extern int intc_evt_to_irq[(0xE20/0x20)+1];
-int intc_irq_describe(char* p, int irq);
 extern int platform_int_priority[NR_INTC_IRQS];
 
 #endif /* __ASM_SH_CPU_SH5_IRQ_H */
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index 02d8137..a619eaf 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -160,28 +160,6 @@ void make_intc_irq(unsigned int irq)
 	disable_intc_irq(irq);
 }
 
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL)
-static int IRQ_to_vectorN[NR_INTC_IRQS] = {
-	0x12, 0x15, 0x18, 0x1B, 0x40, 0x41, 0x42, 0x43, /*  0- 7 */
-	  -1,   -1,   -1,   -1, 0x50, 0x51, 0x52, 0x53,	/*  8-15 */
-	0x54, 0x55, 0x32, 0x33, 0x34, 0x35, 0x36,   -1, /* 16-23 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, /* 24-31 */
-	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x38,	/* 32-39 */
-        0x39, 0x3A, 0x3B,   -1,   -1,   -1,   -1,   -1, /* 40-47 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, /* 48-55 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1, 0x2B, /* 56-63 */
-
-};
-
-int intc_irq_describe(char* p, int irq)
-{
-	if (irq < NR_INTC_IRQS)
-		return sprintf(p, "(0x%3x)", IRQ_to_vectorN[irq]*0x20);
-	else
-		return 0;
-}
-#endif
-
 void __init plat_irq_setup(void)
 {
 	unsigned long long __dummy0, __dummy1=~0x00000000100000f0;
-- 
cgit v0.10.2


From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:13:42 -0400
Subject: ring-buffer: remove complex calculations in ring-buffer-test

Ingo Molnar thought that the code to calculate the time in cond_resched
is a bit too ugly and is not needed. This patch removes it and replaces
it with a simple call to cond_resched. I kept the comment that explains
the reason for the cond_resched.

[ Impact: remove ugly code ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a26fc67..f4ceb45 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,35 +185,6 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
-/*
- * If we are a non preempt kernel, the 10 second run will
- * stop everything while it runs. Instead, we will call cond_resched
- * and also add any time that was lost by a rescedule.
- */
-#ifdef CONFIG_PREEMPT
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-}
-#else
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-	struct timeval tv;
-
-	cond_resched();
-	do_gettimeofday(&tv);
-	if (tv.tv_usec < end_tv->tv_usec) {
-		tv.tv_usec += 1000000;
-		tv.tv_sec--;
-	}
-	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
-	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
-	if (start_tv->tv_usec > 1000000) {
-		start_tv->tv_usec -= 1000000;
-		start_tv->tv_sec++;
-	}
-}
-#endif
-
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -250,7 +221,13 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
-		sched_if_needed(&start_tv, &end_tv);
+		/*
+		 * If we are a non preempt kernel, the 10 second run will
+		 * stop everything while it runs. Instead, we will call
+		 * cond_resched and also add any time that was lost by a
+		 * rescedule.
+		 */
+		cond_resched();
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
cgit v0.10.2


From 54f2c841fa0007e5fee3b7d01a911c774f0a6cda Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 7 May 2009 17:28:59 +0200
Subject: oprofile: fix cpu buffer size

The unit of oprofile_cpu_buffer_size is in samples, but was allocated
in bytes. This led to the allocation of too small cpu buffers. This
patch recalculates the buffer size in bytes taking also the
ring_buffer_event header size into account.

Reported-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index f0e99d4..242257b 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -78,16 +78,20 @@ void free_cpu_buffers(void)
 	op_ring_buffer_write = NULL;
 }
 
+#define RB_EVENT_HDR_SIZE 4
+
 int alloc_cpu_buffers(void)
 {
 	int i;
 
 	unsigned long buffer_size = oprofile_cpu_buffer_size;
+	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
+						 RB_EVENT_HDR_SIZE);
 
-	op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS);
+	op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
 	if (!op_ring_buffer_read)
 		goto fail;
-	op_ring_buffer_write = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS);
+	op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
 	if (!op_ring_buffer_write)
 		goto fail;
 
-- 
cgit v0.10.2


From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:49:35 -0400
Subject: tracing: append ":*" to internal setting of system events

The system enabling of events uses the same code as the set_event file.
It passes in the name of the system to the parser and that will enable
all the events that has that system as a name.

The problem is that it will also enable events with the same name as the
system.

If you have system name foo, and system name bar, but within the system
bar, there exists an event called foo. By setting the system name foo,
you will also be enabling the event foo in the system bar. This is not
an expected result.

The solution is to pass in "foo:*", which will only enable the system
foo and not events called foo.

[ Impact: prevent accidental enabling of events with same name as a system ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87feb01..8d0fae3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return -EINVAL;
 	}
 
-	command = kstrdup(system, GFP_KERNEL);
+	/* +3 for the ":*\0" */
+	command = kmalloc(strlen(system)+3, GFP_KERNEL);
 	if (!command)
 		return -ENOMEM;
+	sprintf(command, "%s:*", system);
 
 	ret = ftrace_set_clr_event(command, val);
 	if (ret)
@@ -1179,7 +1181,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*:*", &ftrace_system_enable_fops);
+			  "*", &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
-- 
cgit v0.10.2


From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 12:49:27 -0400
Subject: tracing: have menu default enabled when kernel debug is configured

Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is
enabled it is nice to enable the trace menu as well.

This patch only make the tracing menu enabled by default, it does not
make any of the tracers enabled. And the menu is only enabled by
default if DEBUG_KERNEL is enabled.

[ Impact: show tracing options to those debugging the kernel ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 50f62a2..f61be30 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -79,6 +79,7 @@ if TRACING_SUPPORT
 
 menuconfig FTRACE
 	bool "Tracers"
+	default y if DEBUG_KERNEL
 	help
 	 Enable the kernel tracing infrastructure.
 
-- 
cgit v0.10.2


From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 14:20:28 -0400
Subject: ring-buffer: only periodically call cond_resched to
 ring-buffer-benchmark

Calling cond_resched at every iteration of the loop adds a bit of
overhead to the benchmark.

This patch does two things.

1) only calls cond-resched when CONFIG_PREEMPT is not enabled
2) only calls cond-resched after so many traces has been performed.

[ Impact: less overhead to the ring-buffer-benchmark ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index f4ceb45..a7c048b 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -218,16 +218,23 @@ static void ring_buffer_producer(void)
 		}
 		do_gettimeofday(&end_tv);
 
-		if (consumer && !(++cnt % wakeup_interval))
+		cnt++;
+		if (consumer && !(cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+#ifndef CONFIG_PREEMPT
 		/*
 		 * If we are a non preempt kernel, the 10 second run will
 		 * stop everything while it runs. Instead, we will call
 		 * cond_resched and also add any time that was lost by a
 		 * rescedule.
+		 *
+		 * Do a cond resched at the same frequency we would wake up
+		 * the reader.
 		 */
-		cond_resched();
+		if (cnt % wakeup_interval)
+			cond_resched();
+#endif
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
cgit v0.10.2


From d3584183d2f40f40371e288ceef187d04da213b5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 7 May 2009 15:36:13 -0700
Subject: sparc64: Fix SET_PERSONALITY to not clip bits outside of PER_MASK.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h
index 425c2f9..d42e393 100644
--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -208,8 +208,9 @@ do {	unsigned long new_flags = current_thread_info()->flags; \
 	else						\
 		clear_thread_flag(TIF_ABI_PENDING);	\
 	/* flush_thread will update pgd cache */	\
-	if (current->personality != PER_LINUX32)	\
-		set_personality(PER_LINUX);		\
+	if (personality(current->personality) != PER_LINUX32)	\
+		set_personality(PER_LINUX |		\
+			(current->personality & (~PER_MASK)));	\
 } while (0)
 
 #endif /* !(__ASM_SPARC64_ELF_H) */
-- 
cgit v0.10.2


From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:52:20 -0400
Subject: ring-buffer: add total count in ring-buffer-benchmark

It is nice to see the overhead of the benchmark test when tracing is
disabled. That is, we turn off the ring buffer just to see what the
cost of running the loop that calls into the ring buffer is.

Currently, if no entries wer made, we get 0. This is not informative.
This patch changes it to check if we had any "missed" (non recorded)
events. If so, a total count is also reported.

[ Impact: evaluate the over head of the ring buffer benchmark test ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a7c048b..a21aa7b 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -285,6 +285,17 @@ static void ring_buffer_producer(void)
 		avg = 1000000 / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
+
+
+	if (missed) {
+		if (time)
+			missed /= (long)time;
+
+		pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+		avg = 1000000 / (hit + missed);
+		pr_info("%ld ns per entry\n", avg);
+	}
 }
 
 static void wait_to_die(void)
-- 
cgit v0.10.2


From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:58:55 -0400
Subject: ring-buffer: change WARN_ON from checking preempt_count to
 preemptible

There's a WARN_ON in the ring buffer code that makes sure preemption
is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not
enabled, preempt_count() is always zero, and this will trigger the warning.

[ Impact: prevent false warning on non preemptible kernels ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ae5ccf..3611706 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * committed yet. Thus we can assume that preemption
 	 * is still disabled.
 	 */
-	RB_WARN_ON(buffer, !preempt_count());
+	RB_WARN_ON(buffer, preemptible());
 
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
-- 
cgit v0.10.2


From 5dafc91fca9135a7b0acb76d9709f4abfad92d6e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:17:44 +0000
Subject: sh: sh7785 early scif fix

This patch moves the SH4 case of EARLY_SCIF_CONSOLE_PORT
so the SH7785 default value gets used. Without this patch
the value for SH7785 is set to 0xffe80000.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index fdb0493..fafa907 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -38,10 +38,10 @@ config EARLY_SCIF_CONSOLE_PORT
 	default "0xffe00000" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7763 || \
 				CPU_SUBTYPE_SH7722 || CPU_SUBTYPE_SH7366 || \
 				CPU_SUBTYPE_SH7343
-	default "0xffe80000" if CPU_SH4
 	default "0xffea0000" if CPU_SUBTYPE_SH7785
 	default "0xfffe8000" if CPU_SUBTYPE_SH7203
 	default "0xfffe9800" if CPU_SUBTYPE_SH7206 || CPU_SUBTYPE_SH7263
+	default "0xffe80000" if CPU_SH4
 	default "0x00000000"
 
 config EARLY_PRINTK
-- 
cgit v0.10.2


From b3cacf318172757783d8272fc333284dd4f50140 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:31:39 +0000
Subject: sh: call clock framework init() callback once

Make sure that clk->ops->init() only gets called once in
the case of CLK_ALWAYS_ENABLED. Without this patch the
init() callback may be called multiple times.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 1dc8964..099373a 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -99,15 +99,18 @@ static int __clk_enable(struct clk *clk)
 	 * changes and the clock needs to hunt for the proper set of
 	 * divisors to use before it can effectively recalc.
 	 */
+
+	if (clk->flags & CLK_ALWAYS_ENABLED) {
+		kref_get(&clk->kref);
+		return 0;
+	}
+
 	if (unlikely(atomic_read(&clk->kref.refcount) == 1))
 		if (clk->ops && clk->ops->init)
 			clk->ops->init(clk);
 
 	kref_get(&clk->kref);
 
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return 0;
-
 	if (likely(clk->ops && clk->ops->enable))
 		clk->ops->enable(clk);
 
-- 
cgit v0.10.2


From 06ee846a25642c434c55209151c633f538c12022 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:44:55 +0000
Subject: sh: r7785 highlander clock fixes

Update the r7785 highlander defconfig to fix
PCLK value and that mode4 is set high.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 890fe3c..5e677a8 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -269,7 +269,7 @@ CONFIG_SH_R7785RP=y
 #
 CONFIG_SH_TMU=y
 CONFIG_SH_TIMER_IRQ=28
-CONFIG_SH_PCLK_FREQ=50000000
+CONFIG_SH_PCLK_FREQ=33333333
 CONFIG_TICK_ONESHOT=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
@@ -333,7 +333,7 @@ CONFIG_GUSA=y
 CONFIG_ZERO_PAGE_OFFSET=0x00001000
 CONFIG_BOOT_LINK_OFFSET=0x00800000
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
+CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high"
 
 #
 # Bus options
-- 
cgit v0.10.2


From e367592cc93ac653e7bc0bebbc9bb713a77e2696 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:55:37 +0000
Subject: sh: TMU platform data for sh7785

This patch adds TMU platform data for sh7785. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 02688d7..b773e7d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -393,6 +393,7 @@ config CPU_SUBTYPE_SH7785
 	select CPU_SHX2
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
+	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7786
 	bool "Support SH7786 processor"
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index d80802a..dc5d3e5 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -13,8 +13,191 @@
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffea0000,
@@ -60,6 +243,12 @@ static struct platform_device sci_device = {
 };
 
 static struct platform_device *sh7785_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 };
 
@@ -70,6 +259,21 @@ static int __init sh7785_devices_setup(void)
 }
 __initcall(sh7785_devices_setup);
 
+static struct platform_device *sh7785_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7785_early_devices,
+				   ARRAY_SIZE(sh7785_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 4fa48e1774992fd03a4cd83a0f2adecb288c46f8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 15:28:15 +0900
Subject: sh: Enable new TMU driver support for all SH-3 and SH-4 CPUs.

The TMU block is supported on all SH-3 and SH-4 subtypes, so just select
it there, rather than conditionalizing it per subtype.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b773e7d..88eb118 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -168,6 +168,7 @@ config CPU_SH3
 	bool
 	select CPU_HAS_INTEVT
 	select CPU_HAS_SR_RB
+	select SYS_SUPPORTS_TMU
 
 config CPU_SH4
 	bool
@@ -175,6 +176,7 @@ config CPU_SH4
 	select CPU_HAS_SR_RB
 	select CPU_HAS_PTEA if !CPU_SH4A || CPU_SHX2
 	select CPU_HAS_FPU if !CPU_SH4AL_DSP
+	select SYS_SUPPORTS_TMU
 
 config CPU_SH4A
 	bool
@@ -393,7 +395,6 @@ config CPU_SUBTYPE_SH7785
 	select CPU_SHX2
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
-	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7786
 	bool "Support SH7786 processor"
@@ -428,7 +429,6 @@ config CPU_SUBTYPE_SH7722
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
 	select SYS_SUPPORTS_CMT
-	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7366
 	bool "Support SH7366 processor"
-- 
cgit v0.10.2


From c2ecb4c4a7da16288062a057b693b7b1e16aaf88 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 15:39:25 +0900
Subject: sh: Move out rtc-sh registration from time_64.c to setup-sh5.c

Now that the onchip_remap() mess is sorted out, the rtc-sh support code
for SH-5 can follow the same approach as the other CPUs.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index d8d59fe..9a362c8 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -34,8 +34,39 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct resource rtc_resources[] = {
+	[0] = {
+		.start	= PHYS_PERIPHERAL_BLOCK + 0x01040000,
+		.end	= PHYS_PERIPHERAL_BLOCK + 0x01040000 + 0x58 - 1,
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		/* Period IRQ */
+		.start	= IRQ_PRI,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* Carry IRQ */
+		.start	= IRQ_CUI,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[3] = {
+		/* Alarm IRQ */
+		.start	= IRQ_ATI,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device rtc_device = {
+	.name		= "sh-rtc",
+	.id		= -1,
+	.num_resources	= ARRAY_SIZE(rtc_resources),
+	.resource	= rtc_resources,
+};
+
 static struct platform_device *sh5_devices[] __initdata = {
 	&sci_device,
+	&rtc_device,
 };
 
 static int __init sh5_devices_setup(void)
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index 7bfeaa0..b4fe770 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -46,12 +46,6 @@
 #define TMU_TSTR_INIT	1
 #define TMU_TSTR_OFF	0
 
-/* Real Time Clock */
-#define	RTC_BLOCK_OFF	0x01040000
-#define RTC_BASE	PHYS_PERIPHERAL_BLOCK + RTC_BLOCK_OFF
-#define RTC_RCR1_CIE	0x10	/* Carry Interrupt Enable */
-#define RTC_RCR1	(rtc_base + 0x38)
-
 /* Time Management Unit */
 #define	TMU_BLOCK_OFF	0x01020000
 #define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
@@ -68,8 +62,7 @@
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-static unsigned long tmu_base, rtc_base;
-unsigned long cprc_base;
+static unsigned long tmu_base;
 
 /* Variables to allow interpolation of time of day to resolution better than a
  * jiffy. */
@@ -248,11 +241,6 @@ void __init time_init(void)
 		panic("Unable to remap TMU\n");
 	}
 
-	rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024);
-	if (!rtc_base) {
-		panic("Unable to remap RTC\n");
-	}
-
 	clk = clk_get(NULL, "cpu_clk");
 	scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) /
 			(unsigned long long)(clk_get_rate(clk) / HZ));
@@ -274,41 +262,3 @@ void __init time_init(void)
 	ctrl_outl(interval, TMU0_TCNT);
 	ctrl_outb(TMU_TSTR_INIT, TMU_TSTR);
 }
-
-static struct resource rtc_resources[] = {
-	[0] = {
-		/* RTC base, filled in by rtc_init */
-		.flags	= IORESOURCE_IO,
-	},
-	[1] = {
-		/* Period IRQ */
-		.start	= IRQ_PRI,
-		.flags	= IORESOURCE_IRQ,
-	},
-	[2] = {
-		/* Carry IRQ */
-		.start	= IRQ_CUI,
-		.flags	= IORESOURCE_IRQ,
-	},
-	[3] = {
-		/* Alarm IRQ */
-		.start	= IRQ_ATI,
-		.flags	= IORESOURCE_IRQ,
-	},
-};
-
-static struct platform_device rtc_device = {
-	.name		= "sh-rtc",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(rtc_resources),
-	.resource	= rtc_resources,
-};
-
-static int __init rtc_init(void)
-{
-	rtc_resources[0].start	= rtc_base;
-	rtc_resources[0].end	= rtc_resources[0].start + 0x58 - 1;
-
-	return platform_device_register(&rtc_device);
-}
-device_initcall(rtc_init);
-- 
cgit v0.10.2


From add47067a8ca324e9d26c4373350dc3cce7f1e7f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:12:17 +0900
Subject: sh: Finish the sh64 migration off of ARCH_USES_GETTIMEOFFSET.

This adds sh_tmu support to the SH-5 subtypes, which subsequently allows
us to kill off time_64.c and use the now generic time_32.c. As a bonus,
SH-5 now supports highres timers and tickless for the first time.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 88eb118..ca5c09b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -141,10 +141,6 @@ config ARCH_NO_VIRT_TO_BUS
 config ARCH_HAS_DEFAULT_IDLE
 	def_bool y
 
-config ARCH_USES_GETTIMEOFFSET
-	def_bool y 
-	depends on SUPERH64
-
 config IO_TRAPPED
 	bool
 
@@ -190,6 +186,7 @@ config CPU_SH4AL_DSP
 config CPU_SH5
 	bool
 	select CPU_HAS_FPU
+	select SYS_SUPPORTS_TMU
 
 config CPU_SHX2
 	bool
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index d256c77..4667d4c 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -2,7 +2,7 @@ extra-y	:= head_64.o init_task.o vmlinux.lds
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
-	   syscalls_64.o time_64.o topology.o traps.o traps_64.o
+	   syscalls_64.o time_32.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index 9a362c8..678d69b 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -13,6 +13,7 @@
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/sh_timer.h>
 #include <asm/addrspace.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -64,6 +65,110 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+#define	TMU_BLOCK_OFF	0x01020000
+#define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
+#define TMU0_BASE	(TMU_BASE + 0x8 + (0xc * 0x0))
+#define TMU1_BASE	(TMU_BASE + 0x8 + (0xc * 0x1))
+#define TMU2_BASE	(TMU_BASE + 0x8 + (0xc * 0x2))
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= TMU0_BASE,
+		.end	= TMU0_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI0,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= TMU1_BASE,
+		.end	= TMU1_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI1,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= TMU2_BASE,
+		.end	= TMU2_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI2,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct platform_device *sh5_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
 static struct platform_device *sh5_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
@@ -71,7 +176,20 @@ static struct platform_device *sh5_devices[] __initdata = {
 
 static int __init sh5_devices_setup(void)
 {
+	int ret;
+
+	ret = platform_add_devices(sh5_early_devices,
+				   ARRAY_SIZE(sh5_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(sh5_devices,
 				    ARRAY_SIZE(sh5_devices));
 }
 __initcall(sh5_devices_setup);
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh5_early_devices,
+				   ARRAY_SIZE(sh5_early_devices));
+}
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
deleted file mode 100644
index b4fe770..0000000
--- a/arch/sh/kernel/time_64.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * arch/sh/kernel/time_64.c
- *
- * Copyright (C) 2000, 2001  Paolo Alberelli
- * Copyright (C) 2003 - 2007  Paul Mundt
- * Copyright (C) 2003  Richard Curnow
- *
- *    Original TMU/RTC code taken from sh version.
- *    Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
- *      Some code taken from i386 version.
- *      Copyright (C) 1991, 1992, 1995  Linus Torvalds
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/errno.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/profile.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/bcd.h>
-#include <linux/timex.h>
-#include <linux/irq.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <cpu/registers.h>	 /* required by inline __asm__ stmt. */
-#include <cpu/irq.h>
-#include <asm/addrspace.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-#include <asm/delay.h>
-#include <asm/clock.h>
-
-#define TMU_TOCR_INIT	0x00
-#define TMU0_TCR_INIT	0x0020
-#define TMU_TSTR_INIT	1
-#define TMU_TSTR_OFF	0
-
-/* Time Management Unit */
-#define	TMU_BLOCK_OFF	0x01020000
-#define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
-#define TMU0_BASE	tmu_base + 0x8 + (0xc * 0x0)
-#define TMU1_BASE	tmu_base + 0x8 + (0xc * 0x1)
-#define TMU2_BASE	tmu_base + 0x8 + (0xc * 0x2)
-
-#define TMU_TOCR	tmu_base+0x0	/* Byte access */
-#define TMU_TSTR	tmu_base+0x4	/* Byte access */
-
-#define TMU0_TCOR	TMU0_BASE+0x0	/* Long access */
-#define TMU0_TCNT	TMU0_BASE+0x4	/* Long access */
-#define TMU0_TCR	TMU0_BASE+0x8	/* Word access */
-
-#define TICK_SIZE (tick_nsec / 1000)
-
-static unsigned long tmu_base;
-
-/* Variables to allow interpolation of time of day to resolution better than a
- * jiffy. */
-
-/* This is effectively protected by xtime_lock */
-static unsigned long ctc_last_interrupt;
-static unsigned long long usecs_per_jiffy = 1000000/HZ; /* Approximation */
-
-#define CTC_JIFFY_SCALE_SHIFT 40
-
-/* 2**CTC_JIFFY_SCALE_SHIFT / ctc_ticks_per_jiffy */
-static unsigned long long scaled_recip_ctc_ticks_per_jiffy;
-
-/* Estimate number of microseconds that have elapsed since the last timer tick,
-   by scaling the delta that has occurred in the CTC register.
-
-   WARNING WARNING WARNING : This algorithm relies on the CTC decrementing at
-   the CPU clock rate.  If the CPU sleeps, the CTC stops counting.  Bear this
-   in mind if enabling SLEEP_WORKS in process.c.  In that case, this algorithm
-   probably needs to use TMU.TCNT0 instead.  This will work even if the CPU is
-   sleeping, though will be coarser.
-
-   FIXME : What if usecs_per_tick is moving around too much, e.g. if an adjtime
-   is running or if the freq or tick arguments of adjtimex are modified after
-   we have calibrated the scaling factor?  This will result in either a jump at
-   the end of a tick period, or a wrap backwards at the start of the next one,
-   if the application is reading the time of day often enough.  I think we
-   ought to do better than this.  For this reason, usecs_per_jiffy is left
-   separated out in the calculation below.  This allows some future hook into
-   the adjtime-related stuff in kernel/timer.c to remove this hazard.
-
-*/
-
-static unsigned long usecs_since_tick(void)
-{
-	unsigned long long current_ctc;
-	long ctc_ticks_since_interrupt;
-	unsigned long long ull_ctc_ticks_since_interrupt;
-	unsigned long result;
-
-	unsigned long long mul1_out;
-	unsigned long long mul1_out_high;
-	unsigned long long mul2_out_low, mul2_out_high;
-
-	/* Read CTC register */
-	asm ("getcon cr62, %0" : "=r" (current_ctc));
-	/* Note, the CTC counts down on each CPU clock, not up.
-	   Note(2), use long type to get correct wraparound arithmetic when
-	   the counter crosses zero. */
-	ctc_ticks_since_interrupt = (long) ctc_last_interrupt - (long) current_ctc;
-	ull_ctc_ticks_since_interrupt = (unsigned long long) ctc_ticks_since_interrupt;
-
-	/* Inline assembly to do 32x32x32->64 multiplier */
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul1_out) :
-	     "r" (ull_ctc_ticks_since_interrupt), "r" (usecs_per_jiffy));
-
-	mul1_out_high = mul1_out >> 32;
-
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul2_out_low) :
-	     "r" (mul1_out), "r" (scaled_recip_ctc_ticks_per_jiffy));
-
-#if 1
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul2_out_high) :
-	     "r" (mul1_out_high), "r" (scaled_recip_ctc_ticks_per_jiffy));
-#endif
-
-	result = (unsigned long) (((mul2_out_high << 32) + mul2_out_low) >> CTC_JIFFY_SCALE_SHIFT);
-
-	return result;
-}
-
-u32 arch_gettimeoffset(void)
-{
-	return usecs_since_tick() * 1000;
-}
-
-/* Dummy RTC ops */
-static void null_rtc_get_time(struct timespec *tv)
-{
-	tv->tv_sec = mktime(2000, 1, 1, 0, 0, 0);
-	tv->tv_nsec = 0;
-}
-
-static int null_rtc_set_time(const time_t secs)
-{
-	return 0;
-}
-
-void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
-int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
-
-/* last time the RTC clock got updated */
-static long last_rtc_update;
-
-/*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-static inline void do_timer_interrupt(void)
-{
-	unsigned long long current_ctc;
-
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-	asm ("getcon cr62, %0" : "=r" (current_ctc));
-	ctc_last_interrupt = (unsigned long) current_ctc;
-
-	do_timer(1);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
-	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
-		if (rtc_sh_set_time(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* do it again in 60 s */
-			last_rtc_update = xtime.tv_sec - 600;
-	}
-	write_sequnlock(&xtime_lock);
-
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-}
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear UNF bit */
-	timer_status = ctrl_inw(TMU0_TCR);
-	timer_status &= ~0x100;
-	ctrl_outw(timer_status, TMU0_TCR);
-
-	do_timer_interrupt();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction irq0  = {
-	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED,
-	.name = "timer",
-};
-
-void __init time_init(void)
-{
-	unsigned long interval;
-	struct clk *clk;
-
-	tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024);
-	if (!tmu_base) {
-		panic("Unable to remap TMU\n");
-	}
-
-	clk = clk_get(NULL, "cpu_clk");
-	scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) /
-			(unsigned long long)(clk_get_rate(clk) / HZ));
-
-	rtc_sh_get_time(&xtime);
-
-	setup_irq(TIMER_IRQ, &irq0);
-
-	clk = clk_get(NULL, "module_clk");
-	interval = (clk_get_rate(clk)/(HZ*4));
-
-	printk("Interval = %ld\n", interval);
-
-	/* Start TMU0 */
-	ctrl_outb(TMU_TSTR_OFF, TMU_TSTR);
-	ctrl_outb(TMU_TOCR_INIT, TMU_TOCR);
-	ctrl_outw(TMU0_TCR_INIT, TMU0_TCR);
-	ctrl_outl(interval, TMU0_TCOR);
-	ctrl_outl(interval, TMU0_TCNT);
-	ctrl_outb(TMU_TSTR_INIT, TMU_TSTR);
-}
-- 
cgit v0.10.2


From b179b72fad5c88c3b616fb88a9ae7cbbc1a750d3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:17:36 +0900
Subject: sh: Rename arch/sh/kernel/time_32.c to arch/sh/kernel/time.c.

This is now fully generic, and used both by _32 and _64 variants.
Rename it accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index 5b1d4d0..aee08af 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -11,7 +11,7 @@ endif
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o			\
 	   machvec.o process_32.o ptrace_32.o setup.o signal_32.o	\
-	   sys_sh.o sys_sh32.o syscalls_32.o time_32.o topology.o	\
+	   sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o	\
 	   traps.o traps_32.o
 
 obj-y				+= cpu/ timers/
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index 4667d4c..31f6ebf 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -2,7 +2,7 @@ extra-y	:= head_64.o init_task.o vmlinux.lds
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
-	   syscalls_64.o time_32.o topology.o traps.o traps_64.o
+	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
new file mode 100644
index 0000000..d41ca4c
--- /dev/null
+++ b/arch/sh/kernel/time.c
@@ -0,0 +1,231 @@
+/*
+ *  arch/sh/kernel/time_32.c
+ *
+ *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
+ *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
+ *  Copyright (C) 2002 - 2009  Paul Mundt
+ *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
+ *
+ *  Some code taken from i386 version.
+ *    Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/profile.h>
+#include <linux/timex.h>
+#include <linux/sched.h>
+#include <linux/clockchips.h>
+#include <linux/mc146818rtc.h>	/* for rtc_lock */
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+#include <linux/rtc.h>
+#include <asm/clock.h>
+#include <asm/rtc.h>
+#include <asm/timer.h>
+#include <asm/kgdb.h>
+
+struct sys_timer *sys_timer;
+
+/* Move this somewhere more sensible.. */
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/* Dummy RTC ops */
+static void null_rtc_get_time(struct timespec *tv)
+{
+	tv->tv_sec = mktime(2000, 1, 1, 0, 0, 0);
+	tv->tv_nsec = 0;
+}
+
+static int null_rtc_set_time(const time_t secs)
+{
+	return 0;
+}
+
+void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
+int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
+
+unsigned int get_rtc_time(struct rtc_time *tm)
+{
+	if (rtc_sh_get_time != null_rtc_get_time) {
+		struct timespec tv;
+
+		rtc_sh_get_time(&tv);
+		rtc_time_to_tm(tv.tv_sec, tm);
+	}
+
+	return RTC_24H;
+}
+EXPORT_SYMBOL(get_rtc_time);
+
+int set_rtc_time(struct rtc_time *tm)
+{
+	unsigned long secs;
+
+	rtc_tm_to_time(tm, &secs);
+	return rtc_sh_set_time(secs);
+}
+EXPORT_SYMBOL(set_rtc_time);
+
+static int __init rtc_generic_init(void)
+{
+	struct platform_device *pdev;
+
+	if (rtc_sh_get_time == null_rtc_get_time)
+		return -ENODEV;
+
+	pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	return 0;
+}
+module_init(rtc_generic_init);
+
+/* last time the RTC clock got updated */
+static long last_rtc_update;
+
+/*
+ * handle_timer_tick() needs to keep up the real-time clock,
+ * as well as call the "do_timer()" routine every clocktick
+ */
+void handle_timer_tick(void)
+{
+	if (current->pid)
+		profile_tick(CPU_PROFILING);
+
+	/*
+	 * Here we are in the timer irq handler. We just have irqs locally
+	 * disabled but we don't know if the timer_bh is running on the other
+	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
+	 * the irq version of write_lock because as just said we have irq
+	 * locally disabled. -arca
+	 */
+	write_seqlock(&xtime_lock);
+	do_timer(1);
+
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+	 * called as close as possible to 500 ms before the new second starts.
+	 */
+	if (ntp_synced() &&
+	    xtime.tv_sec > last_rtc_update + 660 &&
+	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
+	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
+		if (rtc_sh_set_time(xtime.tv_sec) == 0)
+			last_rtc_update = xtime.tv_sec;
+		else
+			/* do it again in 60s */
+			last_rtc_update = xtime.tv_sec - 600;
+	}
+	write_sequnlock(&xtime_lock);
+
+#ifndef CONFIG_SMP
+	update_process_times(user_mode(get_irq_regs()));
+#endif
+}
+
+#ifdef CONFIG_PM
+int timer_suspend(struct sys_device *dev, pm_message_t state)
+{
+	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
+
+	sys_timer->ops->stop();
+
+	return 0;
+}
+
+int timer_resume(struct sys_device *dev)
+{
+	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
+
+	sys_timer->ops->start();
+
+	return 0;
+}
+#else
+#define timer_suspend NULL
+#define timer_resume NULL
+#endif
+
+static struct sysdev_class timer_sysclass = {
+	.name	 = "timer",
+	.suspend = timer_suspend,
+	.resume	 = timer_resume,
+};
+
+static int __init timer_init_sysfs(void)
+{
+	int ret;
+
+	if (!sys_timer)
+		return 0;
+
+	ret = sysdev_class_register(&timer_sysclass);
+	if (ret != 0)
+		return ret;
+
+	sys_timer->dev.cls = &timer_sysclass;
+	return sysdev_register(&sys_timer->dev);
+}
+device_initcall(timer_init_sysfs);
+
+void (*board_time_init)(void);
+
+struct clocksource clocksource_sh = {
+	.name		= "SuperH",
+};
+
+unsigned long long sched_clock(void)
+{
+	unsigned long long cycles;
+
+	/* jiffies based sched_clock if no clocksource is installed */
+	if (!clocksource_sh.rating)
+		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+
+	cycles = clocksource_sh.read(&clocksource_sh);
+	return cyc2ns(&clocksource_sh, cycles);
+}
+
+static void __init sh_late_time_init(void)
+{
+	/*
+	 * Make sure all compiled-in early timers register themselves.
+	 * Run probe() for one "earlytimer" device.
+	 */
+	early_platform_driver_register_all("earlytimer");
+	if (early_platform_driver_probe("earlytimer", 1, 0))
+		return;
+
+	/*
+	 * Find the timer to use as the system timer, it will be
+	 * initialized for us.
+	 */
+	sys_timer = get_sys_timer();
+	if (unlikely(!sys_timer))
+		panic("System timer missing.\n");
+
+	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
+}
+
+void __init time_init(void)
+{
+	if (board_time_init)
+		board_time_init();
+
+	clk_init();
+
+	rtc_sh_get_time(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+	local_timer_setup(smp_processor_id());
+#endif
+
+	late_time_init = sh_late_time_init;
+}
+
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
deleted file mode 100644
index d41ca4c..0000000
--- a/arch/sh/kernel/time_32.c
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- *  arch/sh/kernel/time_32.c
- *
- *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
- *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
- *  Copyright (C) 2002 - 2009  Paul Mundt
- *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
- *
- *  Some code taken from i386 version.
- *    Copyright (C) 1991, 1992, 1995  Linus Torvalds
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/profile.h>
-#include <linux/timex.h>
-#include <linux/sched.h>
-#include <linux/clockchips.h>
-#include <linux/mc146818rtc.h>	/* for rtc_lock */
-#include <linux/platform_device.h>
-#include <linux/smp.h>
-#include <linux/rtc.h>
-#include <asm/clock.h>
-#include <asm/rtc.h>
-#include <asm/timer.h>
-#include <asm/kgdb.h>
-
-struct sys_timer *sys_timer;
-
-/* Move this somewhere more sensible.. */
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
-/* Dummy RTC ops */
-static void null_rtc_get_time(struct timespec *tv)
-{
-	tv->tv_sec = mktime(2000, 1, 1, 0, 0, 0);
-	tv->tv_nsec = 0;
-}
-
-static int null_rtc_set_time(const time_t secs)
-{
-	return 0;
-}
-
-void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
-int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
-
-unsigned int get_rtc_time(struct rtc_time *tm)
-{
-	if (rtc_sh_get_time != null_rtc_get_time) {
-		struct timespec tv;
-
-		rtc_sh_get_time(&tv);
-		rtc_time_to_tm(tv.tv_sec, tm);
-	}
-
-	return RTC_24H;
-}
-EXPORT_SYMBOL(get_rtc_time);
-
-int set_rtc_time(struct rtc_time *tm)
-{
-	unsigned long secs;
-
-	rtc_tm_to_time(tm, &secs);
-	return rtc_sh_set_time(secs);
-}
-EXPORT_SYMBOL(set_rtc_time);
-
-static int __init rtc_generic_init(void)
-{
-	struct platform_device *pdev;
-
-	if (rtc_sh_get_time == null_rtc_get_time)
-		return -ENODEV;
-
-	pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
-	if (IS_ERR(pdev))
-		return PTR_ERR(pdev);
-
-	return 0;
-}
-module_init(rtc_generic_init);
-
-/* last time the RTC clock got updated */
-static long last_rtc_update;
-
-/*
- * handle_timer_tick() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-void handle_timer_tick(void)
-{
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
-	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
-		if (rtc_sh_set_time(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* do it again in 60s */
-			last_rtc_update = xtime.tv_sec - 600;
-	}
-	write_sequnlock(&xtime_lock);
-
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-}
-
-#ifdef CONFIG_PM
-int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->stop();
-
-	return 0;
-}
-
-int timer_resume(struct sys_device *dev)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->start();
-
-	return 0;
-}
-#else
-#define timer_suspend NULL
-#define timer_resume NULL
-#endif
-
-static struct sysdev_class timer_sysclass = {
-	.name	 = "timer",
-	.suspend = timer_suspend,
-	.resume	 = timer_resume,
-};
-
-static int __init timer_init_sysfs(void)
-{
-	int ret;
-
-	if (!sys_timer)
-		return 0;
-
-	ret = sysdev_class_register(&timer_sysclass);
-	if (ret != 0)
-		return ret;
-
-	sys_timer->dev.cls = &timer_sysclass;
-	return sysdev_register(&sys_timer->dev);
-}
-device_initcall(timer_init_sysfs);
-
-void (*board_time_init)(void);
-
-struct clocksource clocksource_sh = {
-	.name		= "SuperH",
-};
-
-unsigned long long sched_clock(void)
-{
-	unsigned long long cycles;
-
-	/* jiffies based sched_clock if no clocksource is installed */
-	if (!clocksource_sh.rating)
-		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-
-	cycles = clocksource_sh.read(&clocksource_sh);
-	return cyc2ns(&clocksource_sh, cycles);
-}
-
-static void __init sh_late_time_init(void)
-{
-	/*
-	 * Make sure all compiled-in early timers register themselves.
-	 * Run probe() for one "earlytimer" device.
-	 */
-	early_platform_driver_register_all("earlytimer");
-	if (early_platform_driver_probe("earlytimer", 1, 0))
-		return;
-
-	/*
-	 * Find the timer to use as the system timer, it will be
-	 * initialized for us.
-	 */
-	sys_timer = get_sys_timer();
-	if (unlikely(!sys_timer))
-		panic("System timer missing.\n");
-
-	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
-}
-
-void __init time_init(void)
-{
-	if (board_time_init)
-		board_time_init();
-
-	clk_init();
-
-	rtc_sh_get_time(&xtime);
-	set_normalized_timespec(&wall_to_monotonic,
-				-xtime.tv_sec, -xtime.tv_nsec);
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-	local_timer_setup(smp_processor_id());
-#endif
-
-	late_time_init = sh_late_time_init;
-}
-
-- 
cgit v0.10.2


From 6d134b9e8d3f32331ad2faca2db8186f54198931 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:36:13 +0900
Subject: sh: Wire up GENERIC_CMOS_UPDATE for the platforms that need it.

Now that everything has converted over to generic timekeeping, we need an
alternate method for keeping the RTC updated for those platforms that are
still using the rtc_sh_get/set_time pairs, presently limited to SH-03 and
the Dreamcast. This wires up the GENERIC_CMOS_UPDATE hooks for those to
maintain the same behaviour.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index ca5c09b..d88a61b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -83,6 +83,10 @@ config GENERIC_CLOCKEVENTS
 config GENERIC_CLOCKEVENTS_BROADCAST
 	bool
 
+config GENERIC_CMOS_UPDATE
+	def_bool y
+	depends on SH_SH03 || SH_DREAMCAST
+
 config GENERIC_LOCKBREAK
 	def_bool y
 	depends on SMP && PREEMPT
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index d41ca4c..77b841a 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -46,6 +46,20 @@ static int null_rtc_set_time(const time_t secs)
 void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
 int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
 
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+unsigned long read_persistent_clock(void)
+{
+	struct timespec tv;
+	rtc_sh_get_time(&tv);
+	return tv.tv_sec;
+}
+
+int update_persistent_clock(struct timespec now)
+{
+	return rtc_sh_set_time(now.tv_sec);
+}
+#endif
+
 unsigned int get_rtc_time(struct rtc_time *tm)
 {
 	if (rtc_sh_get_time != null_rtc_get_time) {
-- 
cgit v0.10.2


From 5ac5496411b30d41945a996fe7a7fb5abccf2aaa Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:44:00 +0900
Subject: sh: Kill off dead handle_timer_tick() code.

Nothing is using this anymore now that we have fully converted to generic
time, so kill it off completely.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index cb16645..6c08511 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -18,17 +18,12 @@ struct sys_timer {
 	struct sys_timer_ops	*ops;
 };
 
-#define TICK_SIZE (tick_nsec / 1000)
-
 extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 /* arch/sh/kernel/timers/timer.c */
 struct sys_timer *get_sys_timer(void);
 
-/* arch/sh/kernel/time.c */
-void handle_timer_tick(void);
-
 extern struct clocksource clocksource_sh;
 
 #endif /* __ASM_SH_TIMER_H */
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 77b841a..f4c304a 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -1,13 +1,14 @@
 /*
- *  arch/sh/kernel/time_32.c
+ *  arch/sh/kernel/time.c
  *
  *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
  *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
  *  Copyright (C) 2002 - 2009  Paul Mundt
  *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
  *
- *  Some code taken from i386 version.
- *    Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -23,7 +24,6 @@
 #include <asm/clock.h>
 #include <asm/rtc.h>
 #include <asm/timer.h>
-#include <asm/kgdb.h>
 
 struct sys_timer *sys_timer;
 
@@ -97,50 +97,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-/* last time the RTC clock got updated */
-static long last_rtc_update;
-
-/*
- * handle_timer_tick() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-void handle_timer_tick(void)
-{
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
-	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
-		if (rtc_sh_set_time(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* do it again in 60s */
-			last_rtc_update = xtime.tv_sec - 600;
-	}
-	write_sequnlock(&xtime_lock);
-
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-}
-
 #ifdef CONFIG_PM
 int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
@@ -242,4 +198,3 @@ void __init time_init(void)
 
 	late_time_init = sh_late_time_init;
 }
-
-- 
cgit v0.10.2


From 6459d7bb72e9767bc7d22f2ee44aab35188e4b8a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:47:48 +0900
Subject: sh: Kill off dead timer sysclass pm hooks.

With the conversion to generic clockevents these are completely unused,
so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index f4c304a..c26576a 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -97,51 +97,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifdef CONFIG_PM
-int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->stop();
-
-	return 0;
-}
-
-int timer_resume(struct sys_device *dev)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->start();
-
-	return 0;
-}
-#else
-#define timer_suspend NULL
-#define timer_resume NULL
-#endif
-
-static struct sysdev_class timer_sysclass = {
-	.name	 = "timer",
-	.suspend = timer_suspend,
-	.resume	 = timer_resume,
-};
-
-static int __init timer_init_sysfs(void)
-{
-	int ret;
-
-	if (!sys_timer)
-		return 0;
-
-	ret = sysdev_class_register(&timer_sysclass);
-	if (ret != 0)
-		return ret;
-
-	sys_timer->dev.cls = &timer_sysclass;
-	return sysdev_register(&sys_timer->dev);
-}
-device_initcall(timer_init_sysfs);
-
 void (*board_time_init)(void);
 
 struct clocksource clocksource_sh = {
-- 
cgit v0.10.2


From cd1408f22d2fd8f0d09c082b07cf0b9bcfb6eac9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:57:35 +0900
Subject: sh: mach-sh03: Give the sh03 rtc its own spinlock.

This converts the sh03 rtc code off of using the global rtc_lock and on
to its own spinlock. There are no other possible users of the rtc_lock,
so serializing with it is not necessary.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-sh03/rtc.c b/arch/sh/boards/mach-sh03/rtc.c
index 0a9266b..a8b9f84 100644
--- a/arch/sh/boards/mach-sh03/rtc.c
+++ b/arch/sh/boards/mach-sh03/rtc.c
@@ -35,13 +35,13 @@
 #define RTC_BUSY	1
 #define RTC_STOP	2
 
-extern spinlock_t rtc_lock;
+static DEFINE_SPINLOCK(sh03_rtc_lock);
 
 unsigned long get_cmos_time(void)
 {
 	unsigned int year, mon, day, hour, min, sec;
 
-	spin_lock(&rtc_lock);
+	spin_lock(&sh03_rtc_lock);
  again:
 	do {
 		sec  = (ctrl_inb(RTC_SEC1) & 0xf) + (ctrl_inb(RTC_SEC10) & 0x7) * 10;
@@ -73,7 +73,7 @@ unsigned long get_cmos_time(void)
 		goto again;
 	}
 
-	spin_unlock(&rtc_lock);
+	spin_unlock(&sh03_rtc_lock);
 	return mktime(year, mon, day, hour, min, sec);
 }
 
@@ -91,7 +91,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 	int i;
 
 	/* gets recalled with irq locally disabled */
-	spin_lock(&rtc_lock);
+	spin_lock(&sh03_rtc_lock);
 	for (i = 0 ; i < 1000000 ; i++)	/* may take up to 1 second... */
 		if (!(ctrl_inb(RTC_CTL) & RTC_BUSY))
 			break;
@@ -113,7 +113,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 		       cmos_minutes, real_minutes);
 		retval = -1;
 	}
-	spin_unlock(&rtc_lock);
+	spin_unlock(&sh03_rtc_lock);
 
 	return retval;
 }
-- 
cgit v0.10.2


From 1af2fe45fec15bdba446c22b9b602699cdabfc9f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:59:05 +0900
Subject: sh: Kill off the global rtc_lock with extreme prejudice.

Now that all of the possible users for rtc_lock have gone away, it is no
longer necessary to keep this lock definition around.

This follows several other architectures that have either recently
dropped it or never supported it in the first place.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index c26576a..e0aa769 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -17,7 +17,6 @@
 #include <linux/timex.h>
 #include <linux/sched.h>
 #include <linux/clockchips.h>
-#include <linux/mc146818rtc.h>	/* for rtc_lock */
 #include <linux/platform_device.h>
 #include <linux/smp.h>
 #include <linux/rtc.h>
@@ -27,10 +26,6 @@
 
 struct sys_timer *sys_timer;
 
-/* Move this somewhere more sensible.. */
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
 {
-- 
cgit v0.10.2


From cb3a86c89ebdf917b88665f70e06863986fbab5c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:25:35 +0900
Subject: sh: Kill off sh64's hand-rolled syscall tracer.

This is no longer necessary, as there are now sufficient generic
alternatives available.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index fafa907..44627ef 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -137,12 +137,4 @@ config SH64_SR_WATCH
 	bool "Debug: set SR.WATCH to enable hardware watchpoints and trace"
 	depends on SUPERH64
 
-config POOR_MANS_STRACE
-	bool "Debug: enable rudimentary strace facility"
-	depends on SUPERH64
-	help
-	  This option allows system calls to be traced to the console.  It also
-	  aids in detecting kernel stack underflow.  It is useful for debugging
-	  early-userland problems (e.g. init incurring fatal exceptions.)
-
 endmenu
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index bece1f7..493da97 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -193,7 +193,7 @@ zImage uImage uImage.srec vmlinux.srec: vmlinux
 
 compressed: zImage
 
-archprepare: maketools arch/sh/lib64/syscalltab.h
+archprepare: maketools
 
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
@@ -205,34 +205,4 @@ define archhelp
 	@echo '  uImage.srec  	           - Create an S-record for U-Boot'
 endef
 
-define filechk_gen-syscalltab
-       (set -e; \
-	echo "/*"; \
-	echo " * DO NOT MODIFY."; \
-	echo " *"; \
-	echo " * This file was generated by arch/sh/Makefile"; \
-	echo " * Any changes will be reverted at build time."; \
-	echo " */"; \
-	echo ""; \
-	echo "#ifndef __SYSCALLTAB_H"; \
-	echo "#define __SYSCALLTAB_H"; \
-	echo ""; \
-	echo "#include <linux/kernel.h>"; \
-	echo ""; \
-	echo "struct syscall_info {"; \
-	echo "	const char *name;"; \
-	echo "} syscall_info_table[] = {"; \
-	sed -e '/^.*\.long /!d;s//	{ "/;s/\(\([^/]*\)\/\)\{1\}.*/\2/; \
-		s/[ \t]*$$//g;s/$$/" },/;s/\("\)sys_/\1/g'; \
-	echo "};"; \
-	echo ""; \
-	echo "#define NUM_SYSCALL_INFO_ENTRIES ARRAY_SIZE(syscall_info_table)";\
-	echo ""; \
-	echo "#endif /* __SYSCALLTAB_H */" )
-endef
-
-arch/sh/lib64/syscalltab.h: arch/sh/kernel/syscalls_64.S
-	$(call filechk,gen-syscalltab)
-
-CLEAN_FILES += arch/sh/lib64/syscalltab.h \
-	       include/asm-sh/machtypes.h
+CLEAN_FILES += include/asm-sh/machtypes.h
diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S
index 5162975..b0aacf6 100644
--- a/arch/sh/kernel/cpu/sh5/entry.S
+++ b/arch/sh/kernel/cpu/sh5/entry.S
@@ -812,27 +812,6 @@ no_underflow:
 	! exceptions
 	add	SP, ZERO, r14
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* We've pushed all the registers now, so only r2-r4 hold anything
-	 * useful. Move them into callee save registers */
-	or	r2, ZERO, r28
-	or	r3, ZERO, r29
-	or	r4, ZERO, r30
-
-	/* Preserve r2 as the event code */
-	movi	evt_debug, r3
-	ori	r3, 1, r3
-	ptabs	r3, tr0
-
-	or	SP, ZERO, r6
-	getcon	TRA, r5
-	blink	tr0, LINK
-
-	or	r28, ZERO, r2
-	or	r29, ZERO, r3
-	or	r30, ZERO, r4
-#endif
-
 	/* For syscall and debug race condition, get TRA now */
 	getcon	TRA, r5
 
@@ -887,11 +866,6 @@ no_underflow:
  */
 	.global ret_from_irq
 ret_from_irq:
-#ifdef CONFIG_POOR_MANS_STRACE
-	pta	evt_debug_ret_from_irq, tr0
-	ori	SP, 0, r2
-	blink	tr0, LINK
-#endif
 	ld.q	SP, FRAME_S(FSSR), r6
 	shlri	r6, 30, r6
 	andi	r6, 1, r6
@@ -905,12 +879,6 @@ ret_from_irq:
 ret_from_exception:
 	preempt_stop()
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	pta	evt_debug_ret_from_exc, tr0
-	ori	SP, 0, r2
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSSR), r6
 	shlri	r6, 30, r6
 	andi	r6, 1, r6
@@ -1236,18 +1204,6 @@ syscall_bad:
 	.global syscall_ret
 syscall_ret:
 	st.q	SP, FRAME_R(9), r2	/* Expecting SP back to BASIC frame */
-
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* nothing useful in registers at this point */
-
-	movi	evt_debug2, r5
-	ori	r5, 1, r5
-	ptabs	r5, tr0
-	ld.q	SP, FRAME_R(9), r2
-	or	SP, ZERO, r3
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSPC), r2
 	addi	r2, 4, r2		/* Move PC, being pre-execution event */
 	st.q	SP, FRAME_S(FSPC), r2
@@ -1268,25 +1224,12 @@ ret_from_fork:
 	ptabs	r5, tr0
 	blink	tr0, LINK
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* nothing useful in registers at this point */
-
-	movi	evt_debug2, r5
-	ori	r5, 1, r5
-	ptabs	r5, tr0
-	ld.q	SP, FRAME_R(9), r2
-	or	SP, ZERO, r3
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSPC), r2
 	addi	r2, 4, r2		/* Move PC, being pre-execution event */
 	st.q	SP, FRAME_S(FSPC), r2
 	pta	ret_from_syscall, tr0
 	blink	tr0, ZERO
 
-
-
 syscall_allowed:
 	/* Use LINK to deflect the exit point, default is syscall_ret */
 	pta	syscall_ret, tr0
diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore
deleted file mode 100644
index 3508c2c..0000000
--- a/arch/sh/lib64/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-syscalltab.h
diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
index 2fb8eaf..25f2481 100644
--- a/arch/sh/lib64/dbg.c
+++ b/arch/sh/lib64/dbg.c
@@ -135,140 +135,6 @@ void print_itlb(void)
 	    (" =============================================================\n");
 }
 
-/* ======================================================================= */
-
-#ifdef CONFIG_POOR_MANS_STRACE
-
-#include "syscalltab.h"
-
-struct ring_node {
-	int evt;
-	int ret_addr;
-	int event;
-	int tra;
-	int pid;
-	unsigned long sp;
-	unsigned long pc;
-};
-
-static struct ring_node event_ring[16];
-static int event_ptr = 0;
-
-struct stored_syscall_data {
-	int pid;
-	int syscall_number;
-};
-
-#define N_STORED_SYSCALLS 16
-
-static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS];
-static int syscall_next=0;
-static int syscall_next_print=0;
-
-void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs)
-{
-	int syscallno = tra & 0xff;
-	unsigned long sp;
-	unsigned long stack_bottom;
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	stack_bottom = (unsigned long) task_stack_page(current);
-	asm volatile("ori r15, 0, %0" : "=r" (sp));
-	rr = event_ring + event_ptr;
-	rr->evt = evt;
-	rr->ret_addr = ret_addr;
-	rr->event = event;
-	rr->tra = tra;
-	rr->pid = pid;
-	rr->sp = sp;
-	rr->pc = regs->pc;
-
-	if (sp < stack_bottom + 3092) {
-		int i, j;
-		printk("evt_debug : stack underflow report\n");
-		for (j=0, i = event_ptr; j<16; j++) {
-			rr = event_ring + i;
-			printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n",
-				rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc);
-			i--;
-			i &= 15;
-		}
-		panic("STACK UNDERFLOW\n");
-	}
-
-	event_ptr = (event_ptr + 1) & 15;
-
-	if ((event == 2) && (evt == 0x160)) {
-		if (syscallno < NUM_SYSCALL_INFO_ENTRIES) {
-			/* Store the syscall information to print later.  We
-			 * can't print this now - currently we're running with
-			 * SR.BL=1, so we can't take a tlbmiss (which could occur
-			 * in the console drivers under printk).
-			 *
-			 * Just overwrite old entries on ring overflow - this
-			 * is only for last-hope debugging. */
-			stored_syscalls[syscall_next].pid = current->pid;
-			stored_syscalls[syscall_next].syscall_number = syscallno;
-			syscall_next++;
-			syscall_next &= (N_STORED_SYSCALLS - 1);
-		}
-	}
-}
-
-static void drain_syscalls(void) {
-	while (syscall_next_print != syscall_next) {
-		printk("Task %d: %s()\n",
-			stored_syscalls[syscall_next_print].pid,
-			syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name);
-			syscall_next_print++;
-			syscall_next_print &= (N_STORED_SYSCALLS - 1);
-	}
-}
-
-void evt_debug2(unsigned int ret)
-{
-	drain_syscalls();
-	printk("Task %d: syscall returns %08x\n", current->pid, ret);
-}
-
-void evt_debug_ret_from_irq(struct pt_regs *regs)
-{
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	rr = event_ring + event_ptr;
-	rr->evt = 0xffff;
-	rr->ret_addr = 0;
-	rr->event = 0;
-	rr->tra = 0;
-	rr->pid = pid;
-	rr->pc = regs->pc;
-	event_ptr = (event_ptr + 1) & 15;
-}
-
-void evt_debug_ret_from_exc(struct pt_regs *regs)
-{
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	rr = event_ring + event_ptr;
-	rr->evt = 0xfffe;
-	rr->ret_addr = 0;
-	rr->event = 0;
-	rr->tra = 0;
-	rr->pid = pid;
-	rr->pc = regs->pc;
-	event_ptr = (event_ptr + 1) & 15;
-}
-
-#endif /* CONFIG_POOR_MANS_STRACE */
-
-/* ======================================================================= */
-
 void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
 {
 
-- 
cgit v0.10.2


From ef9f89996e450e9686816b6fc1e34d7b9a65766c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:36:57 +0900
Subject: sh: Kill off unused sh64 debug code.

None of the print_page() code and associated helpers are presently used
by anything in-tree, so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
index 25f2481..6152a6a 100644
--- a/arch/sh/lib64/dbg.c
+++ b/arch/sh/lib64/dbg.c
@@ -246,51 +246,3 @@ void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
 	print_dtlb();
 	print_itlb();
 }
-
-/* ======================================================================= */
-
-/*
-** Depending on <base> scan the MMU, Data or Instruction side
-** looking for a valid mapping matching Eaddr & asid.
-** Return -1 if not found or the TLB id entry otherwise.
-** Note: it works only for 4k pages!
-*/
-static unsigned long
-lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid)
-{
-	regType_t pteH;
-	unsigned long epn;
-	int count;
-
-	epn = Eaddr & 0xfffff000;
-
-	for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) {
-		pteH = getConfigReg(base);
-		if (GET_VALID(pteH))
-			if ((unsigned long) GET_EPN(pteH) == epn)
-				if ((unsigned long) GET_ASID(pteH) == asid)
-					break;
-	}
-	return ((unsigned long) ((count < MAX_TLBs) ? base : -1));
-}
-
-unsigned long lookup_dtlb(unsigned long Eaddr)
-{
-	unsigned long asid = get_asid();
-	return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid));
-}
-
-unsigned long lookup_itlb(unsigned long Eaddr)
-{
-	unsigned long asid = get_asid();
-	return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid));
-}
-
-void print_page(struct page *page)
-{
-	printk("  page[%p] -> index 0x%lx,  count 0x%x,  flags 0x%lx\n",
-	       page, page->index, page_count(page), page->flags);
-	printk("       address_space = %p, pages =%ld\n", page->mapping,
-	       page->mapping->nrpages);
-
-}
diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c
index d76bd80..f215b06 100644
--- a/arch/sh/lib64/udelay.c
+++ b/arch/sh/lib64/udelay.c
@@ -33,7 +33,7 @@ void __delay(unsigned long loops)
 			     :"0"(loops));
 }
 
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
 {
 	__delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy));
 }
-- 
cgit v0.10.2


From 7d170b1bc540a1d83098a9f27cf4939e026fda81 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:41:59 +0900
Subject: sh: Move out cayman-specific panic handler code to its own file.

This moves out the cayman-specific panic handler code to a better
location, and leaves the generic implementation a simple stub that is
still used under emulation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-cayman/Makefile b/arch/sh/boards/mach-cayman/Makefile
index cafe1ac3..00fa3ea 100644
--- a/arch/sh/boards/mach-cayman/Makefile
+++ b/arch/sh/boards/mach-cayman/Makefile
@@ -1,4 +1,4 @@
 #
 # Makefile for the Hitachi Cayman specific parts of the kernel
 #
-obj-y := setup.o irq.o
+obj-y := setup.o irq.o panic.o
diff --git a/arch/sh/boards/mach-cayman/panic.c b/arch/sh/boards/mach-cayman/panic.c
new file mode 100644
index 0000000..d1e6730
--- /dev/null
+++ b/arch/sh/boards/mach-cayman/panic.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2003  Richard Curnow, SuperH UK Limited
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <cpu/registers.h>
+
+/* THIS IS A PHYSICAL ADDRESS */
+#define HDSP2534_ADDR (0x04002100)
+
+static void poor_mans_delay(void)
+{
+	int i;
+
+	for (i = 0; i < 2500000; i++)
+		cpu_relax();
+}
+
+static void show_value(unsigned long x)
+{
+	int i;
+	unsigned nibble;
+	for (i = 0; i < 8; i++) {
+		nibble = ((x >> (i * 4)) & 0xf);
+
+		__raw_writeb(nibble + ((nibble > 9) ? 55 : 48),
+			  HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
+	}
+}
+
+void
+panic_handler(unsigned long panicPC, unsigned long panicSSR,
+	      unsigned long panicEXPEVT)
+{
+	while (1) {
+		/* This piece of code displays the PC on the LED display */
+		show_value(panicPC);
+		poor_mans_delay();
+		show_value(panicSSR);
+		poor_mans_delay();
+		show_value(panicEXPEVT);
+		poor_mans_delay();
+	}
+}
diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c
index da32ba7..38c954e 100644
--- a/arch/sh/lib64/panic.c
+++ b/arch/sh/lib64/panic.c
@@ -6,53 +6,10 @@
  * for more details.
  */
 
-#include <linux/kernel.h>
-#include <asm/io.h>
-#include <cpu/registers.h>
-
-/* THIS IS A PHYSICAL ADDRESS */
-#define HDSP2534_ADDR (0x04002100)
-
-#ifdef CONFIG_SH_CAYMAN
-
-static void poor_mans_delay(void)
-{
-	int i;
-	for (i = 0; i < 2500000; i++) {
-	}		/* poor man's delay */
-}
-
-static void show_value(unsigned long x)
-{
-	int i;
-	unsigned nibble;
-	for (i = 0; i < 8; i++) {
-		nibble = ((x >> (i * 4)) & 0xf);
-
-		ctrl_outb(nibble + ((nibble > 9) ? 55 : 48),
-			  HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
-	}
-}
-
-#endif
-
 void
 panic_handler(unsigned long panicPC, unsigned long panicSSR,
 	      unsigned long panicEXPEVT)
 {
-#ifdef CONFIG_SH_CAYMAN
-	while (1) {
-		/* This piece of code displays the PC on the LED display */
-		show_value(panicPC);
-		poor_mans_delay();
-		show_value(panicSSR);
-		poor_mans_delay();
-		show_value(panicEXPEVT);
-		poor_mans_delay();
-	}
-#endif
-
 	/* Never return from the panic handler */
 	for (;;) ;
-
 }
-- 
cgit v0.10.2


From 4f5ecaa05493dfddf155b40224b951592bfce325 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:23:29 +0000
Subject: sh: clock framework update, fix count and kill off kref

This patch updates the clock framework use count code.
With this patch the enable() and disable() callbacks
only get called when counting from and to zero.
While at it the kref stuff gets replaced with an int.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 2f6c962..b1f2919 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_CLOCK_H
 #define __ASM_SH_CLOCK_H
 
-#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/seq_file.h>
 #include <linux/clk.h>
@@ -28,7 +27,7 @@ struct clk {
 	struct clk		*parent;
 	struct clk_ops		*ops;
 
-	struct kref		kref;
+	int			usecount;
 
 	unsigned long		rate;
 	unsigned long		flags;
@@ -37,6 +36,7 @@ struct clk {
 
 #define CLK_ALWAYS_ENABLED	(1 << 0)
 #define CLK_RATE_PROPAGATES	(1 << 1)
+#define CLK_NEEDS_INIT		(1 << 2)
 
 /* Should be defined by processor-specific code */
 void arch_init_clk_ops(struct clk_ops **, int type);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 099373a..133dbe4 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -19,7 +19,6 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/kref.h>
 #include <linux/kobject.h>
 #include <linux/sysdev.h>
 #include <linux/seq_file.h>
@@ -90,7 +89,7 @@ static void propagate_rate(struct clk *clk)
 	}
 }
 
-static int __clk_enable(struct clk *clk)
+static void __clk_init(struct clk *clk)
 {
 	/*
 	 * See if this is the first time we're enabling the clock, some
@@ -100,19 +99,33 @@ static int __clk_enable(struct clk *clk)
 	 * divisors to use before it can effectively recalc.
 	 */
 
-	if (clk->flags & CLK_ALWAYS_ENABLED) {
-		kref_get(&clk->kref);
-		return 0;
-	}
-
-	if (unlikely(atomic_read(&clk->kref.refcount) == 1))
+	if (clk->flags & CLK_NEEDS_INIT) {
 		if (clk->ops && clk->ops->init)
 			clk->ops->init(clk);
 
-	kref_get(&clk->kref);
+		clk->flags &= ~CLK_NEEDS_INIT;
+	}
+}
+
+static int __clk_enable(struct clk *clk)
+{
+	if (!clk)
+		return -EINVAL;
+
+	clk->usecount++;
+
+	/* nothing to do if always enabled */
+	if (clk->flags & CLK_ALWAYS_ENABLED)
+		return 0;
+
+	if (clk->usecount == 1) {
+		__clk_init(clk);
 
-	if (likely(clk->ops && clk->ops->enable))
-		clk->ops->enable(clk);
+		__clk_enable(clk->parent);
+
+		if (clk->ops && clk->ops->enable)
+			clk->ops->enable(clk);
+	}
 
 	return 0;
 }
@@ -122,11 +135,6 @@ int clk_enable(struct clk *clk)
 	unsigned long flags;
 	int ret;
 
-	if (!clk)
-		return -EINVAL;
-
-	clk_enable(clk->parent);
-
 	spin_lock_irqsave(&clock_lock, flags);
 	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -135,21 +143,23 @@ int clk_enable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_enable);
 
-static void clk_kref_release(struct kref *kref)
-{
-	/* Nothing to do */
-}
-
 static void __clk_disable(struct clk *clk)
 {
-	int count = kref_put(&clk->kref, clk_kref_release);
+	if (!clk)
+		return;
+
+	clk->usecount--;
+
+	WARN_ON(clk->usecount < 0);
 
 	if (clk->flags & CLK_ALWAYS_ENABLED)
 		return;
 
-	if (!count) {	/* count reaches zero, disable the clock */
+	if (clk->usecount == 0) {
 		if (likely(clk->ops && clk->ops->disable))
 			clk->ops->disable(clk);
+
+		__clk_disable(clk->parent);
 	}
 }
 
@@ -157,14 +167,9 @@ void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
 
-	if (!clk)
-		return;
-
 	spin_lock_irqsave(&clock_lock, flags);
 	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
-
-	clk_disable(clk->parent);
 }
 EXPORT_SYMBOL_GPL(clk_disable);
 
@@ -173,14 +178,14 @@ int clk_register(struct clk *clk)
 	mutex_lock(&clock_list_sem);
 
 	list_add(&clk->node, &clock_list);
-	kref_init(&clk->kref);
+	clk->usecount = 0;
+	clk->flags |= CLK_NEEDS_INIT;
 
 	mutex_unlock(&clock_list_sem);
 
 	if (clk->flags & CLK_ALWAYS_ENABLED) {
+		__clk_init(clk);
 		pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name);
-		if (clk->ops && clk->ops->init)
-			clk->ops->init(clk);
 		if (clk->ops && clk->ops->enable)
 			clk->ops->enable(clk);
 		pr_debug( "Enabled.");
@@ -356,7 +361,7 @@ static int show_clocks(char *buf, char **start, off_t off,
 		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
 			     rate / 1000000, (rate % 1000000) / 10000,
 			     ((clk->flags & CLK_ALWAYS_ENABLED) ||
-			      (atomic_read(&clk->kref.refcount) != 1)) ?
+			      clk->usecount > 0) ?
 			     "enabled" : "disabled");
 	}
 
-- 
cgit v0.10.2


From 583d1d549ff107500481461cec0325189c5349ec Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:27:19 +0000
Subject: sh: enable TMU clocksource on sh7722

This patch enables the TMU clocksource on sh7722.
To prioritize TMU over CMT we also adjust the CMT
clock source rating.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 624e8e5..b1c3e7f 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -183,7 +183,7 @@ static struct sh_timer_config cmt_platform_data = {
 	.timer_bit = 5,
 	.clk = "cmt0",
 	.clockevent_rating = 125,
-	.clocksource_rating = 200,
+	.clocksource_rating = 125,
 };
 
 static struct resource cmt_resources[] = {
@@ -245,7 +245,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.channel_offset = 0x10,
 	.timer_bit = 1,
 	.clk = "tmu0",
-	.clocksource_rating = 0, /* disabled for now */
+	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
-- 
cgit v0.10.2


From 47dd6f4439811c6c83d9abf2f364bcb6d2684640 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:32:18 +0000
Subject: sh: TMU platform data for sh7723

This patch adds TMU platform data for sh7723. Both clockevent
and clocksource support is enabled. While at it, adjust the
CMT clocksource rating to prioritize the TMU.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index fb9b542..484ca2d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -107,7 +107,7 @@ static struct sh_timer_config cmt_platform_data = {
 	.timer_bit = 5,
 	.clk = "cmt0",
 	.clockevent_rating = 125,
-	.clocksource_rating = 200,
+	.clocksource_rating = 125,
 };
 
 static struct resource cmt_resources[] = {
@@ -133,6 +133,188 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu1",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd90008,
+		.end	= 0xffd90013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu1",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd90014,
+		.end	= 0xffd9001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 58,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu1",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd90020,
+		.end	= 0xffd9002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase        = 0xffe00000,
@@ -255,6 +437,12 @@ static struct platform_device iic_device = {
 
 static struct platform_device *sh7723_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 	&rtc_device,
 	&iic_device,
@@ -282,6 +470,12 @@ __initcall(sh7723_devices_setup);
 
 static struct platform_device *sh7723_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
cgit v0.10.2


From 6b2e8523df148c15ea5abf13075026fb8bdb3f86 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 May 2009 11:56:49 -0700
Subject: xen: reserve Xen start_info rather than e820 reserving

Use reserve_early rather than e820 reservations for Xen start info and mfn->pfn
table, so that the memory use is a bit more self-documenting.

[ Impact: cleanup ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4A032EF1.6070708@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68..ad0047f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
 	 *  - xen_start_info
 	 * See comment above "struct start_info" in <xen/interface/xen.h>
 	 */
-	e820_add_region(__pa(xen_start_info->mfn_list),
-			xen_start_info->pt_base - xen_start_info->mfn_list,
-			E820_RESERVED);
+	reserve_early(__pa(xen_start_info->mfn_list),
+		      __pa(xen_start_info->pt_base),
+			"XEN START INFO");
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
-- 
cgit v0.10.2


From b366328335b98373325977e116d2423f500708ac Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 18:01:03 +0900
Subject: sh: Drop dead rules from arch/sh/kernel/Makefile_64.

Several of these options are specific to the SHcompact ISA and will need
to be rewritten for SHmedia if they are to be supported at all. Drop
the impossible rules for now.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index 31f6ebf..c845a0d 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -5,13 +5,10 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
-obj-$(CONFIG_VSYSCALL)		+= vsyscall/
 obj-$(CONFIG_SMP)		+= smp.o
-obj-$(CONFIG_SH_STANDARD_BIOS)	+= sh_bios.o
 obj-$(CONFIG_SH_CPU_FREQ)	+= cpufreq.o
 obj-$(CONFIG_MODULES)		+= sh_ksyms_64.o module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_IO_TRAPPED)	+= io_trapped.o
-- 
cgit v0.10.2


From 1dcdb5a9e7c235e6e80f1f4d5b8247b3e5347e48 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:11 +0200
Subject: oprofile: re-add force_arch_perfmon option

This re-adds the force_arch_perfmon option that was in the original
arch perfmon patchkit. Originally this was rejected in favour
of a generalized perfmon=name option, but it turned out implementing
the later in a reliable way is hard (and it would have been easy
to crash the kernel if a user gets it wrong)

But now Atom and Core i7 support being readded a user would
need to update their oprofile userland to beyond 0.9.4 to use oprofile again
on Atom or Core i7.

To avoid this problem readd the force_arch_perfmon option.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 90b3924..9b9566b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1650,6 +1650,12 @@ and is between 256 and 4096 characters. It is defined in the file
 	oprofile.timer=	[HW]
 			Use timer interrupt instead of performance counters
 
+	oprofile.force_arch_perfmon=1	[X86]
+			Force use of architectural perfmon instead of
+			the CPU specific event set.
+			This might be useful if you have older oprofile
+			userland or if you want common events over Intel CPUs.
+
 	osst=		[HW,SCSI] SCSI Tape Driver
 			Format: <buffer_size>,<write_threshold>
 			See also Documentation/scsi/st.txt.
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864a..e5171c9 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -389,10 +389,16 @@ static int __init p4_init(char **cpu_type)
 	return 0;
 }
 
+int force_arch_perfmon;
+module_param(force_arch_perfmon, int, 0);
+
 static int __init ppro_init(char **cpu_type)
 {
 	__u8 cpu_model = boot_cpu_data.x86_model;
 
+	if (force_arch_perfmon && cpu_has_arch_perfmon)
+		return 0;
+
 	switch (cpu_model) {
 	case 0 ... 2:
 		*cpu_type = "i386/ppro";
-- 
cgit v0.10.2


From 1f3d7b60691993d8d368d8dd7d5d85871d41e8f5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:12 +0200
Subject: oprofile: remove undocumented oprofile.p4force option

There are no new P4s and the oprofile code knows about all existing
ones, so we don't really need the p4force option anymore.

Remove it.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index e5171c9..f472c0c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -356,14 +356,11 @@ static void exit_sysfs(void)
 #define exit_sysfs() do { } while (0)
 #endif /* CONFIG_PM */
 
-static int p4force;
-module_param(p4force, int, 0);
-
 static int __init p4_init(char **cpu_type)
 {
 	__u8 cpu_model = boot_cpu_data.x86_model;
 
-	if (!p4force && (cpu_model > 6 || cpu_model == 5))
+	if (cpu_model > 6 || cpu_model == 5)
 		return 0;
 
 #ifndef CONFIG_SMP
-- 
cgit v0.10.2


From 6adf406f0a0eaf37251018d15f51e93f5b538ee6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:13 +0200
Subject: oprofile: add support for Core i7 and Atom

The registers are about the same as other Family 6 CPUs
so we only need to add detection.

I'm not completely happy with calling Nehalem Core i7 because
there will be undoubtedly other Nehalem based CPUs
in the future with different marketing names, but it's
the best we got for now.

Requires updated oprofile userland for the new event files.

If you don't want to update right now you can also use
oprofile.force_arch_perfmon=1 (added in the next patch) with 0.9.4

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f472c0c..3308147 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -417,6 +417,13 @@ static int __init ppro_init(char **cpu_type)
 	case 15: case 23:
 		*cpu_type = "i386/core_2";
 		break;
+	case 26:
+		arch_perfmon_setup_counters();
+		*cpu_type = "i386/core_i7";
+		break;
+	case 28:
+		*cpu_type = "i386/atom";
+		break;
 	default:
 		/* Unknown */
 		return 0;
-- 
cgit v0.10.2


From 7e4e0bd50e80df2fe5501f48f872448376cdd997 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 6 May 2009 12:10:23 +0200
Subject: oprofile: introduce module_param oprofile.cpu_type

This patch removes module_param oprofile.force_arch_perfmon and
introduces oprofile.cpu_type=archperfmon instead. This new parameter
can be reused for other models and architectures.

Currently only archperfmon is supported.

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9b9566b..6ce5f48 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1650,11 +1650,13 @@ and is between 256 and 4096 characters. It is defined in the file
 	oprofile.timer=	[HW]
 			Use timer interrupt instead of performance counters
 
-	oprofile.force_arch_perfmon=1	[X86]
-			Force use of architectural perfmon instead of
-			the CPU specific event set.
-			This might be useful if you have older oprofile
-			userland or if you want common events over Intel CPUs.
+	oprofile.cpu_type=	Force an oprofile cpu type
+			This might be useful if you have an older oprofile
+			userland or if you want common events.
+			Format: { archperfmon }
+			archperfmon: [X86] Force use of architectural
+				perfmon on Intel CPUs instead of the
+				CPU specific event set.
 
 	osst=		[HW,SCSI] SCSI Tape Driver
 			Format: <buffer_size>,<write_threshold>
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3308147..3b285e6 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -386,8 +386,17 @@ static int __init p4_init(char **cpu_type)
 	return 0;
 }
 
-int force_arch_perfmon;
-module_param(force_arch_perfmon, int, 0);
+static int force_arch_perfmon;
+static int force_cpu_type(const char *str, struct kernel_param *kp)
+{
+	if (!strcmp(str, "archperfmon")) {
+		force_arch_perfmon = 1;
+		printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+	}
+
+	return 0;
+}
+module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
 
 static int __init ppro_init(char **cpu_type)
 {
-- 
cgit v0.10.2


From 6eac1af01112d1919b1c432f6591e6368e8d538f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 19:48:47 +0900
Subject: sh: Always select RTC_LIB, not just for SUPERH32.

The RTC_LIB helpers are used in arch/sh/kernel/time.c, which was
previously only the case for the 32-bit variant. Now that this has
become the common implementation, move the RTC_LIB select to reflect
that. Fixes up the sh64 build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d88a61b..6ed16c4 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -15,6 +15,7 @@ config SUPERH
 	select HAVE_IOREMAP_PROT if MMU
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
+	select RTC_LIB
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
@@ -30,7 +31,6 @@ config SUPERH32
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_ARCH_KGDB
 	select ARCH_HIBERNATION_POSSIBLE if MMU
-	select RTC_LIB
 
 config SUPERH64
 	def_bool ARCH = "sh64"
-- 
cgit v0.10.2


From 30d88cf52f229c3e0c90b8d71036960ccc2db4e2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 20:20:56 +0900
Subject: sh: Kill off extra cflags Kconfig entry.

There is no real reason to use this anymore, as the build system
generally knows what it is doing with regards to cflags mangling.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index 44627ef..8179cc9 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -122,17 +122,6 @@ config SH_NO_BSS_INIT
 	  For all other cases, say N. If this option seems perplexing, or
 	  you aren't sure, say N.
 
-config MORE_COMPILE_OPTIONS
-	bool "Add any additional compile options"
-	help
-	  If you want to add additional CFLAGS to the kernel build, enable this
-	  option and then enter what you would like to add in the next question.
-	  Note however that -g is already appended with the selection of KGDB.
-
-config COMPILE_OPTIONS
-	string "Additional compile arguments"
-	depends on MORE_COMPILE_OPTIONS
-
 config SH64_SR_WATCH
 	bool "Debug: set SR.WATCH to enable hardware watchpoints and trace"
 	depends on SUPERH64
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 493da97..68348e7 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -70,9 +70,6 @@ cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -ml
 cflags-y	+= $(call cc-option,-mno-fdpic)
 cflags-y	+= $(isaflags-y) -ffreestanding
 
-cflags-$(CONFIG_MORE_COMPILE_OPTIONS)	+= \
-	$(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g')
-
 OBJCOPYFLAGS	:= -O binary -R .note -R .note.gnu.build-id -R .comment \
 		   -R .stab -R .stabstr -S
 
-- 
cgit v0.10.2


From c29418c2ae15ee9171bc136ad261c497b95a46fc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 20:32:56 +0900
Subject: sh: Always fixup unaligned userspace accesses on sh64.

sh64 has traditionally had this configurable via a Kconfig option
(CONFIG_SH64_USER_MISALIGNED_FIXUP). In practice it has never really been
terribly useful to turn this off, so just get rid of the option entirely.

We leave the sysctl around so we don't end up breaking existing root
file systems, and to allow folks that really want this off to do so at
their own risk.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu
index c7d7043..9eb1712 100644
--- a/arch/sh/Kconfig.cpu
+++ b/arch/sh/Kconfig.cpu
@@ -76,11 +76,6 @@ config SPECULATIVE_EXECUTION
 
 	  If unsure, say N.
 
-config SH64_USER_MISALIGNED_FIXUP
-	def_bool y
-	prompt "Fixup misaligned loads/stores occurring in user mode"
-	depends on SUPERH64
-
 config SH64_ID2815_WORKAROUND
 	bool "Include workaround for SH5-101 cut2 silicon defect ID2815"
 	depends on CPU_SUBTYPE_SH5_101
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index a85831c..267e5eb 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -370,7 +370,6 @@ static int generate_and_check_address(struct pt_regs *regs,
 		return -1;
 	}
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	/* Check accessible.  For misaligned access in the kernel, assume the
 	   address is always accessible (and if not, just fault when the
 	   load/store gets done.) */
@@ -380,18 +379,13 @@ static int generate_and_check_address(struct pt_regs *regs,
 		}
 		/* Do access_ok check later - it depends on whether it's a load or a store. */
 	}
-#endif
 
 	*address = addr;
 	return 0;
 }
 
-/* Default value as for sh */
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 static int user_mode_unaligned_fixup_count = 10;
 static int user_mode_unaligned_fixup_enable = 1;
-#endif
-
 static int kernel_mode_unaligned_fixup_count = 32;
 
 static void misaligned_kernel_word_load(__u64 address, int do_sign_extend, __u64 *result)
@@ -440,7 +434,6 @@ static int misaligned_load(struct pt_regs *regs,
 	}
 
 	destreg = (opcode >> 4) & 0x3f;
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs)) {
 		__u64 buffer;
 
@@ -470,9 +463,7 @@ static int misaligned_load(struct pt_regs *regs,
 				width_shift, (unsigned long) regs->pc);
 			break;
 		}
-	} else
-#endif
-	{
+	} else {
 		/* kernel mode - we can take short cuts since if we fault, it's a genuine bug */
 		__u64 lo, hi;
 
@@ -519,7 +510,6 @@ static int misaligned_store(struct pt_regs *regs,
 	}
 
 	srcreg = (opcode >> 4) & 0x3f;
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs)) {
 		__u64 buffer;
 
@@ -546,9 +536,7 @@ static int misaligned_store(struct pt_regs *regs,
 		if (__copy_user((void *)(int)address, &buffer, (1 << width_shift)) > 0) {
 			return -1; /* fault */
 		}
-	} else
-#endif
-	{
+	} else {
 		/* kernel mode - we can take short cuts since if we fault, it's a genuine bug */
 		__u64 val = regs->regs[srcreg];
 
@@ -576,7 +564,6 @@ static int misaligned_store(struct pt_regs *regs,
 
 }
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 /* Never need to fix up misaligned FPU accesses within the kernel since that's a real
    error. */
 static int misaligned_fpu_load(struct pt_regs *regs,
@@ -727,7 +714,6 @@ static int misaligned_fpu_store(struct pt_regs *regs,
 		return -1;
 	}
 }
-#endif
 
 static int misaligned_fixup(struct pt_regs *regs)
 {
@@ -735,12 +721,8 @@ static int misaligned_fixup(struct pt_regs *regs)
 	int error;
 	int major, minor;
 
-#if !defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
-	/* Never fixup user mode misaligned accesses without this option enabled. */
-	return -1;
-#else
-	if (!user_mode_unaligned_fixup_enable) return -1;
-#endif
+	if (!user_mode_unaligned_fixup_enable)
+		return -1;
 
 	error = read_opcode(regs->pc, &opcode, user_mode(regs));
 	if (error < 0) {
@@ -749,15 +731,12 @@ static int misaligned_fixup(struct pt_regs *regs)
 	major = (opcode >> 26) & 0x3f;
 	minor = (opcode >> 16) & 0xf;
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs) && (user_mode_unaligned_fixup_count > 0)) {
 		--user_mode_unaligned_fixup_count;
 		/* Only do 'count' worth of these reports, to remove a potential DoS against syslog */
 		printk("Fixing up unaligned userspace access in \"%s\" pid=%d pc=0x%08x ins=0x%08lx\n",
 		       current->comm, task_pid_nr(current), (__u32)regs->pc, opcode);
-	} else
-#endif
-	if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) {
+	} else if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) {
 		--kernel_mode_unaligned_fixup_count;
 		if (in_interrupt()) {
 			printk("Fixing up unaligned kernelspace access in interrupt pc=0x%08x ins=0x%08lx\n",
@@ -830,7 +809,6 @@ static int misaligned_fixup(struct pt_regs *regs)
 			}
 			break;
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 		case (0x94>>2): /* FLD.S */
 			error = misaligned_fpu_load(regs, opcode, 1, 2, 0);
 			break;
@@ -881,7 +859,6 @@ static int misaligned_fixup(struct pt_regs *regs)
 				break;
 			}
 			break;
-#endif
 
 		default:
 			/* Fault */
@@ -907,7 +884,6 @@ static ctl_table unaligned_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "user_reports",
@@ -923,7 +899,6 @@ static ctl_table unaligned_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec},
-#endif
 	{}
 };
 
-- 
cgit v0.10.2


From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:31:42 +0800
Subject: tracing/events: clean up for ftrace_set_clr_event()

Add a helper function __ftrace_set_clr_event(), and replace some
ftrace_set_clr_event() calls with this helper, thus we don't need any
kstrdup() or kmalloc().

As a side effect, this patch fixes an issue in self tests code, which is
similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96
("tracing: append ":*" to internal setting of system events")

It's a small issue and won't cause any bug in fact, but we should do things
right anyway.

[ Impact: prevent spurious event-enabling in tracing self-tests ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A03998E.3020503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d0fae3..45f1099 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -111,11 +111,44 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
-static int ftrace_set_clr_event(char *buf, int set)
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
+	int ret;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
+			continue;
+
+		ftrace_event_enable_disable(call, set);
+
+		ret = 0;
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
 	char *event = NULL, *sub = NULL, *match;
-	int ret = -EINVAL;
 
 	/*
 	 * The buf format can be <subsystem>:<event-name>
@@ -141,30 +174,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (!call->name || !call->regfunc)
-			continue;
-
-		if (match &&
-		    strcmp(match, call->name) != 0 &&
-		    strcmp(match, call->system) != 0)
-			continue;
-
-		if (sub && strcmp(sub, call->system) != 0)
-			continue;
-
-		if (event && strcmp(event, call->name) != 0)
-			continue;
-
-		ftrace_event_enable_disable(call, set);
-
-		ret = 0;
-	}
-	mutex_unlock(&event_mutex);
-
-	return ret;
+	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
 /* 128 should be much more than enough */
@@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call;
 	char buf[2];
 	int set = -1;
-	int all = 0;
 	int ret;
 
-	if (system[0] == '*')
-		all = 1;
-
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->name || !call->regfunc)
 			continue;
 
-		if (!all && strcmp(call->system, system) != 0)
+		if (system && strcmp(call->system, system) != 0)
 			continue;
 
 		/*
@@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 {
 	const char *system = filp->private_data;
 	unsigned long val;
-	char *command;
 	char buf[64];
 	ssize_t ret;
 
@@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret < 0)
 		return ret;
 
-	switch (val) {
-	case 0:
-	case 1:
-		break;
-
-	default:
+	if (val != 0 && val != 1)
 		return -EINVAL;
-	}
 
-	/* +3 for the ":*\0" */
-	command = kmalloc(strlen(system)+3, GFP_KERNEL);
-	if (!command)
-		return -ENOMEM;
-	sprintf(command, "%s:*", system);
-
-	ret = ftrace_set_clr_event(command, val);
+	ret = __ftrace_set_clr_event(NULL, system, NULL, val);
 	if (ret)
-		goto out_free;
+		goto out;
 
 	ret = cnt;
 
- out_free:
-	kfree(command);
-
+out:
 	*ppos += cnt;
 
 	return ret;
@@ -1181,7 +1172,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*", &ftrace_system_enable_fops);
+			  NULL, &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
@@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
-	char *sysname;
 	int ret;
 
 	pr_info("Running tests on trace events:\n");
@@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void)
 
 		pr_info("Testing event system %s: ", system->name);
 
-		/* ftrace_set_clr_event can modify the name passed in. */
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 1);
-		kfree(sysname);
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error enabling system %s\n",
 				   system->name);
@@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void)
 
 		event_test_stuff();
 
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 0);
-		kfree(sysname);
-
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
 		if (WARN_ON_ONCE(ret))
 			pr_warning("error disabling system %s\n",
 				   system->name);
@@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void)
 	pr_info("Running tests on all trace events:\n");
 	pr_info("Testing all events: ");
 
-	sysname = kmalloc(4, GFP_KERNEL);
-	if (WARN_ON(!sysname)) {
-		pr_warning("Can't allocate memory, giving up!\n");
-		return;
-	}
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 1);
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
 	if (WARN_ON_ONCE(ret)) {
-		kfree(sysname);
 		pr_warning("error enabling all events\n");
 		return;
 	}
@@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void)
 	event_test_stuff();
 
 	/* reset sysname */
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 0);
-	kfree(sysname);
-
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
 		return;
-- 
cgit v0.10.2


From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:32:05 +0800
Subject: tracing/events: simplify system_enable_read()

A smarter way to figure out the output of an enable file.

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f1099..df394bc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -414,10 +414,11 @@ static ssize_t
 system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
+	const char set_to_char[4] = { '?', '0', '1', 'X' };
 	const char *system = filp->private_data;
 	struct ftrace_event_call *call;
 	char buf[2];
-	int set = -1;
+	int set = 0;
 	int ret;
 
 	mutex_lock(&event_mutex);
@@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		if (call->enabled) {
-			switch (set) {
-			case -1:
-				set = 1;
-				break;
-			case 0:
-				set = 2;
-				break;
-			}
-		} else {
-			switch (set) {
-			case -1:
-				set = 0;
-				break;
-			case 1:
-				set = 2;
-				break;
-			}
-		}
+		set |= (1 << !!call->enabled);
+
 		/*
 		 * If we have a mixture, no need to look further.
 		 */
-		if (set == 2)
+		if (set == 3)
 			break;
 	}
 	mutex_unlock(&event_mutex);
 
+	buf[0] = set_to_char[set];
 	buf[1] = '\n';
-	switch (set) {
-	case 0:
-		buf[0] = '0';
-		break;
-	case 1:
-		buf[0] = '1';
-		break;
-	case 2:
-		buf[0] = 'X';
-		break;
-	default:
-		buf[0] = '?';
-	}
 
 	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
 
-- 
cgit v0.10.2


From c3d480ded1584dc17f6e82f49af4155380a51dda Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 21:57:04 +0900
Subject: sh: TMU platform data for SH7786.

Wires up all 12 TMU channels, with TMU0 and 1 used as clockevent and
clocksource respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index 90e8cff..2c464bf 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2009  Renesas Solutions Corp.
  * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ * Paul Mundt <paul.mundt@renesas.com>
  *
  * Based on SH7785 Setup
  *
@@ -19,6 +20,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -69,6 +71,368 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffda0008,
+		.end	= 0xffda0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffda0014,
+		.end	= 0xffda001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffda0020,
+		.end	= 0xffda002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 22,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct sh_timer_config tmu6_platform_data = {
+	.name = "TMU6",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu6_resources[] = {
+	[0] = {
+		.name	= "TMU6",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu6_device = {
+	.name		= "sh_tmu",
+	.id		= 6,
+	.dev = {
+		.platform_data	= &tmu6_platform_data,
+	},
+	.resource	= tmu6_resources,
+	.num_resources	= ARRAY_SIZE(tmu6_resources),
+};
+
+static struct sh_timer_config tmu7_platform_data = {
+	.name = "TMU7",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu7_resources[] = {
+	[0] = {
+		.name	= "TMU7",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu7_device = {
+	.name		= "sh_tmu",
+	.id		= 7,
+	.dev = {
+		.platform_data	= &tmu7_platform_data,
+	},
+	.resource	= tmu7_resources,
+	.num_resources	= ARRAY_SIZE(tmu7_resources),
+};
+
+static struct sh_timer_config tmu8_platform_data = {
+	.name = "TMU8",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu8_resources[] = {
+	[0] = {
+		.name	= "TMU8",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu8_device = {
+	.name		= "sh_tmu",
+	.id		= 8,
+	.dev = {
+		.platform_data	= &tmu8_platform_data,
+	},
+	.resource	= tmu8_resources,
+	.num_resources	= ARRAY_SIZE(tmu8_resources),
+};
+
+static struct sh_timer_config tmu9_platform_data = {
+	.name = "TMU9",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu9_resources[] = {
+	[0] = {
+		.name	= "TMU9",
+		.start	= 0xffde0008,
+		.end	= 0xffde0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu9_device = {
+	.name		= "sh_tmu",
+	.id		= 9,
+	.dev = {
+		.platform_data	= &tmu9_platform_data,
+	},
+	.resource	= tmu9_resources,
+	.num_resources	= ARRAY_SIZE(tmu9_resources),
+};
+
+static struct sh_timer_config tmu10_platform_data = {
+	.name = "TMU10",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu10_resources[] = {
+	[0] = {
+		.name	= "TMU10",
+		.start	= 0xffde0014,
+		.end	= 0xffde001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu10_device = {
+	.name		= "sh_tmu",
+	.id		= 10,
+	.dev = {
+		.platform_data	= &tmu10_platform_data,
+	},
+	.resource	= tmu10_resources,
+	.num_resources	= ARRAY_SIZE(tmu10_resources),
+};
+
+static struct sh_timer_config tmu11_platform_data = {
+	.name = "TMU11",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu11_resources[] = {
+	[0] = {
+		.name	= "TMU11",
+		.start	= 0xffde0020,
+		.end	= 0xffde002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu11_device = {
+	.name		= "sh_tmu",
+	.id		= 11,
+	.dev = {
+		.platform_data	= &tmu11_platform_data,
+	},
+	.resource	= tmu11_resources,
+	.num_resources	= ARRAY_SIZE(tmu11_resources),
+};
+
 static struct resource usb_ohci_resources[] = {
 	[0] = {
 		.start	= 0xffe70400,
@@ -94,6 +458,21 @@ static struct platform_device usb_ohci_device = {
 	.resource	= usb_ohci_resources,
 };
 
+static struct platform_device *sh7786_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
+	&tmu9_device,
+	&tmu10_device,
+	&tmu11_device,
+};
+
 static struct platform_device *sh7786_devices[] __initdata = {
 	&sci_device,
 	&usb_ohci_device,
@@ -156,12 +535,26 @@ static void __init sh7786_usb_setup(void)
 
 static int __init sh7786_devices_setup(void)
 {
+	int ret;
+
 	sh7786_usb_setup();
+
+	ret = platform_add_devices(sh7786_early_devices,
+				   ARRAY_SIZE(sh7786_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(sh7786_devices,
 				    ARRAY_SIZE(sh7786_devices));
 }
 device_initcall(sh7786_devices_setup);
 
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7786_early_devices,
+				   ARRAY_SIZE(sh7786_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From ccdaeb4c8f982900a2abf722e34b60131394d8eb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 22:09:30 +0900
Subject: sh: TMU platform data for SH-X3 proto CPU.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index bd35f3253..9d5185b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -1,7 +1,7 @@
 /*
- * SH-X3 Setup
+ * SH-X3 Prototype Setup
  *
- *  Copyright (C) 2007  Paul Mundt
+ *  Copyright (C) 2007 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/io.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -48,17 +49,221 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffc10008,
+		.end	= 0xffc10013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffc10014,
+		.end	= 0xffc1001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffc10020,
+		.end	= 0xffc1002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffc20008,
+		.end	= 0xffc20013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 19,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffc20014,
+		.end	= 0xffc2001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffc20020,
+		.end	= 0xffc2002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct platform_device *shx3_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
 static struct platform_device *shx3_devices[] __initdata = {
 	&sci_device,
 };
 
 static int __init shx3_devices_setup(void)
 {
+	int ret;
+
+	ret = platform_add_devices(shx3_early_devices,
+				   ARRAY_SIZE(shx3_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(shx3_devices,
 				    ARRAY_SIZE(shx3_devices));
 }
 __initcall(shx3_devices_setup);
 
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(shx3_early_devices,
+				   ARRAY_SIZE(shx3_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 7b551f9daa9bd9533ba4ce31622ed4be1dd97d3e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 22:14:01 +0900
Subject: sh: Kill off the GENERIC_CALIBRATE_DELAY ifndef.

Now that everyone is using the clock framework directly and we
unconditionally provide our own calibrate_delay() function, having it
wrapped in an ifndef is no longer useful. So, kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 38515a7..00086b2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -156,7 +156,7 @@ static void __init reserve_crashkernel(void)
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size) {
 		if (crash_base <= 0) {
-			vp = alloc_bootmem_nopanic(crash_size); 
+			vp = alloc_bootmem_nopanic(crash_size);
 			if (!vp) {
 				printk(KERN_INFO "crashkernel allocation "
 				       "failed\n");
@@ -185,7 +185,6 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-#ifndef CONFIG_GENERIC_CALIBRATE_DELAY
 void __cpuinit calibrate_delay(void)
 {
 	struct clk *clk = clk_get(NULL, "cpu_clk");
@@ -201,7 +200,6 @@ void __cpuinit calibrate_delay(void)
 			 (loops_per_jiffy/(5000/HZ)) % 100,
 			 loops_per_jiffy);
 }
-#endif
 
 void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
 						unsigned long end_pfn)
-- 
cgit v0.10.2


From e552de2413edad1a7b0c7f82a2f2753e4f905d93 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:42 +0000
Subject: sh-sci: add platform device private data

This patch adds per-platform private data to the sh-sci driver.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 728d6a0..af5da11 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -47,6 +47,7 @@
 #include <linux/clk.h>
 #include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/list.h>
 
 #ifdef CONFIG_SUPERH
 #include <asm/clock.h>
@@ -78,6 +79,16 @@ struct sci_port {
 	/* Port clock */
 	struct clk		*clk;
 #endif
+	struct list_head	node;
+};
+
+struct sh_sci_priv {
+	spinlock_t lock;
+	struct list_head ports;
+
+#ifdef CONFIG_HAVE_CLK
+	struct notifier_block clk_nb;
+#endif
 };
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -726,19 +737,22 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
 static int sci_notifier(struct notifier_block *self,
 			unsigned long phase, void *p)
 {
-	int i;
+	struct sh_sci_priv *priv = container_of(self,
+						struct sh_sci_priv, clk_nb);
+	struct sci_port *sci_port;
+	unsigned long flags;
 
 	if ((phase == CPUFREQ_POSTCHANGE) ||
-	    (phase == CPUFREQ_RESUMECHANGE))
-		for (i = 0; i < SCI_NPORTS; i++) {
-			struct sci_port *s = &sci_ports[i];
-			s->port.uartclk = clk_get_rate(s->clk);
-		}
+	    (phase == CPUFREQ_RESUMECHANGE)) {
+		spin_lock_irqsave(&priv->lock, flags);
+		list_for_each_entry(sci_port, &priv->ports, node)
+			sci_port->port.uartclk = clk_get_rate(sci_port->clk);
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
 
 	return NOTIFY_OK;
 }
-
-static struct notifier_block sci_nb = { &sci_notifier, NULL, 0 };
 #endif
 
 static int sci_request_irq(struct sci_port *port)
@@ -1201,6 +1215,27 @@ static struct uart_driver sci_uart_driver = {
 	.cons		= SCI_CONSOLE,
 };
 
+
+static int __devexit sci_remove(struct platform_device *dev)
+{
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
+
+#ifdef CONFIG_HAVE_CLK
+	cpufreq_unregister_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_remove_one_port(&sci_uart_driver, &p->port);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	kfree(priv);
+	return 0;
+}
+
 /*
  * Register a set of serial devices attached to a platform device.  The
  * list is terminated with a zero flags entry, which means we expect
@@ -1210,7 +1245,22 @@ static struct uart_driver sci_uart_driver = {
 static int __devinit sci_probe(struct platform_device *dev)
 {
 	struct plat_sci_port *p = dev->dev.platform_data;
+	struct sh_sci_priv *priv;
 	int i, ret = -EINVAL;
+	unsigned long flags;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->ports);
+	spin_lock_init(&priv->lock);
+	platform_set_drvdata(dev, priv);
+
+#ifdef CONFIG_HAVE_CLK
+	priv->clk_nb.notifier_call = sci_notifier;
+	cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
+#endif
 
 	for (i = 0; p && p->flags != 0; p++, i++) {
 		struct sci_port *sciport = &sci_ports[i];
@@ -1254,12 +1304,19 @@ static int __devinit sci_probe(struct platform_device *dev)
 
 		memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
 
-		uart_add_one_port(&sci_uart_driver, &sciport->port);
-	}
+		ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 
-#ifdef CONFIG_HAVE_CLK
-	cpufreq_register_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER);
-#endif
+		if (ret && (p->flags & UPF_IOREMAP)) {
+			iounmap(p->membase);
+			goto err_unreg;
+		}
+
+		INIT_LIST_HEAD(&sciport->node);
+
+		spin_lock_irqsave(&priv->lock, flags);
+		list_add(&sciport->node, &priv->ports);
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
 
 #ifdef CONFIG_SH_STANDARD_BIOS
 	sh_bios_gdb_detach();
@@ -1268,50 +1325,36 @@ static int __devinit sci_probe(struct platform_device *dev)
 	return 0;
 
 err_unreg:
-	for (i = i - 1; i >= 0; i--)
-		uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port);
-
+	sci_remove(dev);
 	return ret;
 }
 
-static int __devexit sci_remove(struct platform_device *dev)
-{
-	int i;
-
-#ifdef CONFIG_HAVE_CLK
-	cpufreq_unregister_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER);
-#endif
-
-	for (i = 0; i < SCI_NPORTS; i++)
-		uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port);
-
-	return 0;
-}
-
 static int sci_suspend(struct platform_device *dev, pm_message_t state)
 {
-	int i;
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
 
-	for (i = 0; i < SCI_NPORTS; i++) {
-		struct sci_port *p = &sci_ports[i];
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_suspend_port(&sci_uart_driver, &p->port);
 
-		if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev)
-			uart_suspend_port(&sci_uart_driver, &p->port);
-	}
+	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return 0;
 }
 
 static int sci_resume(struct platform_device *dev)
 {
-	int i;
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
 
-	for (i = 0; i < SCI_NPORTS; i++) {
-		struct sci_port *p = &sci_ports[i];
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_resume_port(&sci_uart_driver, &p->port);
 
-		if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev)
-			uart_resume_port(&sci_uart_driver, &p->port);
-	}
+	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 9080b72819650c3a757d173a19bc930d603b79d6 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:58 +0000
Subject: sh-sci: remove early_sci_setup()

Remove unused early_sci_setup() function from sh-sci.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index af5da11..5f37f7d 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1082,20 +1082,6 @@ static void __init sci_init_ports(void)
 	}
 }
 
-int __init early_sci_setup(struct uart_port *port)
-{
-	if (unlikely(port->line > SCI_NPORTS))
-		return -ENODEV;
-
-	sci_init_ports();
-
-	sci_ports[port->line].port.membase	= port->membase;
-	sci_ports[port->line].port.mapbase	= port->mapbase;
-	sci_ports[port->line].port.type		= port->type;
-
-	return 0;
-}
-
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
 /*
  *	Print a string to the serial port trying not to disturb
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 893cc53..717059d 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -27,6 +27,4 @@ struct plat_sci_port {
 	upf_t		flags;			/* UPF_* flags */
 };
 
-int early_sci_setup(struct uart_port *port);
-
 #endif /* __LINUX_SERIAL_SCI_H */
-- 
cgit v0.10.2


From dc8e6f5bfcd6a307a8196d3e41fd9798be5a1c76 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:06 +0000
Subject: sh-sci: rework serial console support

Rework sh-sci serial console code.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 5f37f7d..d2cd1a4 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -91,10 +91,6 @@ struct sh_sci_priv {
 #endif
 };
 
-#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
-static struct sci_port *serial_console_port;
-#endif
-
 /* Function prototypes */
 static void sci_stop_tx(struct uart_port *port);
 
@@ -1083,6 +1079,18 @@ static void __init sci_init_ports(void)
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+static struct tty_driver *serial_console_device(struct console *co, int *index)
+{
+	struct uart_driver *p = &sci_uart_driver;
+	*index = co->index;
+	return p->tty_driver;
+}
+
+static void serial_console_putchar(struct uart_port *port, int ch)
+{
+	sci_poll_put_char(port, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -1090,16 +1098,10 @@ static void __init sci_init_ports(void)
 static void serial_console_write(struct console *co, const char *s,
 				 unsigned count)
 {
-	struct uart_port *port = &serial_console_port->port;
+	struct uart_port *port = co->data;
 	unsigned short bits;
-	int i;
 
-	for (i = 0; i < count; i++) {
-		if (*s == 10)
-			sci_poll_put_char(port, '\r');
-
-		sci_poll_put_char(port, *s++);
-	}
+	uart_console_write(co->data, s, count, serial_console_putchar);
 
 	/* wait until fifo is empty and last bit has been transmitted */
 	bits = SCxSR_TDxE(port) | SCxSR_TEND(port);
@@ -1109,6 +1111,7 @@ static void serial_console_write(struct console *co, const char *s,
 
 static int __init serial_console_setup(struct console *co, char *options)
 {
+	struct sci_port *sci_port;
 	struct uart_port *port;
 	int baud = 115200;
 	int bits = 8;
@@ -1124,8 +1127,9 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (co->index >= SCI_NPORTS)
 		co->index = 0;
 
-	serial_console_port = &sci_ports[co->index];
-	port = &serial_console_port->port;
+	sci_port = &sci_ports[co->index];
+	port = &sci_port->port;
+	co->data = port;
 
 	/*
 	 * Also need to check port->type, we don't actually have any
@@ -1135,21 +1139,17 @@ static int __init serial_console_setup(struct console *co, char *options)
 	 */
 	if (!port->type)
 		return -ENODEV;
-	if (!port->membase || !port->mapbase)
-		return -ENODEV;
-
-	port->type = serial_console_port->type;
 
 #ifdef CONFIG_HAVE_CLK
-	if (!serial_console_port->clk)
-		serial_console_port->clk = clk_get(NULL, "module_clk");
+	if (!sci_port->clk)
+		sci_port->clk = clk_get(NULL, "module_clk");
 #endif
 
 	if (port->flags & UPF_IOREMAP)
 		sci_config_port(port, 0);
 
-	if (serial_console_port->enable)
-		serial_console_port->enable(port);
+	if (sci_port->enable)
+		sci_port->enable(port);
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
@@ -1165,12 +1165,11 @@ static int __init serial_console_setup(struct console *co, char *options)
 
 static struct console serial_console = {
 	.name		= "ttySC",
-	.device		= uart_console_device,
+	.device		= serial_console_device,
 	.write		= serial_console_write,
 	.setup		= serial_console_setup,
 	.flags		= CON_PRINTBUFFER,
 	.index		= -1,
-	.data		= &sci_uart_driver,
 };
 
 static int __init sci_console_init(void)
-- 
cgit v0.10.2


From a5660adae85918f2ab6b10ab58e2f574c1bd5ce1 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:38 +0000
Subject: sh-sci: use to_sci_port() if possible

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index d2cd1a4..17fa7f1 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -618,7 +618,7 @@ static inline int sci_handle_breaks(struct uart_port *port)
 	int copied = 0;
 	unsigned short status = sci_in(port, SCxSR);
 	struct tty_struct *tty = port->info->port.tty;
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (uart_handle_break(port))
 		return 0;
@@ -875,7 +875,7 @@ static void sci_break_ctl(struct uart_port *port, int break_state)
 
 static int sci_startup(struct uart_port *port)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (s->enable)
 		s->enable(port);
@@ -893,7 +893,7 @@ static int sci_startup(struct uart_port *port)
 
 static void sci_shutdown(struct uart_port *port)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	sci_stop_rx(port);
 	sci_stop_tx(port);
@@ -990,7 +990,7 @@ static int sci_request_port(struct uart_port *port)
 
 static void sci_config_port(struct uart_port *port, int flags)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	port->type = s->type;
 
@@ -1002,7 +1002,7 @@ static void sci_config_port(struct uart_port *port, int flags)
 
 static int sci_verify_port(struct uart_port *port, struct serial_struct *ser)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > nr_irqs)
 		return -EINVAL;
-- 
cgit v0.10.2


From 0ee70712922c15252183db8b50a7e369c96017c0 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:50 +0000
Subject: sh-sci: allow single port platform devices

Allow registration of single port sh-sci platform devices.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 17fa7f1..e9b350c 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1221,6 +1221,70 @@ static int __devexit sci_remove(struct platform_device *dev)
 	return 0;
 }
 
+static int __devinit sci_probe_single(struct platform_device *dev,
+				      unsigned int index,
+				      struct plat_sci_port *p,
+				      struct sci_port *sciport)
+{
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	unsigned long flags;
+	int ret;
+
+	/* Sanity check */
+	if (unlikely(index >= SCI_NPORTS)) {
+		dev_notice(&dev->dev, "Attempting to register port "
+			   "%d when only %d are available.\n",
+			   index+1, SCI_NPORTS);
+		dev_notice(&dev->dev, "Consider bumping "
+			   "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n");
+		return 0;
+	}
+
+	sciport->port.mapbase	= p->mapbase;
+
+	if (p->mapbase && !p->membase) {
+		if (p->flags & UPF_IOREMAP) {
+			p->membase = ioremap_nocache(p->mapbase, 0x40);
+			if (IS_ERR(p->membase))
+				return PTR_ERR(p->membase);
+		} else {
+			/*
+			 * For the simple (and majority of) cases
+			 * where we don't need to do any remapping,
+			 * just cast the cookie directly.
+			 */
+			p->membase = (void __iomem *)p->mapbase;
+		}
+	}
+
+	sciport->port.membase	= p->membase;
+
+	sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
+	sciport->port.flags	= p->flags;
+	sciport->port.dev	= &dev->dev;
+
+	sciport->type		= sciport->port.type = p->type;
+
+	memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
+
+	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
+
+	if (ret) {
+		if (p->flags & UPF_IOREMAP)
+			iounmap(p->membase);
+
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&sciport->node);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_add(&sciport->node, &priv->ports);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return 0;
+}
+
 /*
  * Register a set of serial devices attached to a platform device.  The
  * list is terminated with a zero flags entry, which means we expect
@@ -1232,7 +1296,6 @@ static int __devinit sci_probe(struct platform_device *dev)
 	struct plat_sci_port *p = dev->dev.platform_data;
 	struct sh_sci_priv *priv;
 	int i, ret = -EINVAL;
-	unsigned long flags;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
@@ -1247,60 +1310,16 @@ static int __devinit sci_probe(struct platform_device *dev)
 	cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
 #endif
 
-	for (i = 0; p && p->flags != 0; p++, i++) {
-		struct sci_port *sciport = &sci_ports[i];
-
-		/* Sanity check */
-		if (unlikely(i == SCI_NPORTS)) {
-			dev_notice(&dev->dev, "Attempting to register port "
-				   "%d when only %d are available.\n",
-				   i+1, SCI_NPORTS);
-			dev_notice(&dev->dev, "Consider bumping "
-				   "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n");
-			break;
-		}
-
-		sciport->port.mapbase	= p->mapbase;
-
-		if (p->mapbase && !p->membase) {
-			if (p->flags & UPF_IOREMAP) {
-				p->membase = ioremap_nocache(p->mapbase, 0x40);
-				if (IS_ERR(p->membase)) {
-					ret = PTR_ERR(p->membase);
-					goto err_unreg;
-				}
-			} else {
-				/*
-				 * For the simple (and majority of) cases
-				 * where we don't need to do any remapping,
-				 * just cast the cookie directly.
-				 */
-				p->membase = (void __iomem *)p->mapbase;
-			}
-		}
-
-		sciport->port.membase	= p->membase;
-
-		sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
-		sciport->port.flags	= p->flags;
-		sciport->port.dev	= &dev->dev;
-
-		sciport->type		= sciport->port.type = p->type;
-
-		memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
-
-		ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
-
-		if (ret && (p->flags & UPF_IOREMAP)) {
-			iounmap(p->membase);
+	if (dev->id != -1) {
+		ret = sci_probe_single(dev, dev->id, p, &sci_ports[dev->id]);
+		if (ret)
 			goto err_unreg;
+	} else {
+		for (i = 0; p && p->flags != 0; p++, i++) {
+			ret = sci_probe_single(dev, i, p, &sci_ports[i]);
+			if (ret)
+				goto err_unreg;
 		}
-
-		INIT_LIST_HEAD(&sciport->node);
-
-		spin_lock_irqsave(&priv->lock, flags);
-		list_add(&sciport->node, &priv->ports);
-		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
 #ifdef CONFIG_SH_STANDARD_BIOS
-- 
cgit v0.10.2


From 7ed7e0711b3ff85b3e15591081b42f2af96d584b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:14 +0000
Subject: sh-sci: replace sci_init_ports()

Replace sci_init_ports() with sci_init_single().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index e9b350c..039c700 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1036,46 +1036,64 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static void __init sci_init_ports(void)
+static int __devinit sci_init_single(struct sci_port *sci_port,
+				     unsigned int index,
+				     struct plat_sci_port *p)
 {
-	static int first = 1;
-	int i;
-
-	if (!first)
-		return;
-
-	first = 0;
-
-	for (i = 0; i < SCI_NPORTS; i++) {
-		sci_ports[i].port.ops		= &sci_uart_ops;
-		sci_ports[i].port.iotype	= UPIO_MEM;
-		sci_ports[i].port.line		= i;
-		sci_ports[i].port.fifosize	= 1;
+	sci_port->port.ops	= &sci_uart_ops;
+	sci_port->port.iotype	= UPIO_MEM;
+	sci_port->port.line	= index;
+	sci_port->port.fifosize	= 1;
 
 #if defined(__H8300H__) || defined(__H8300S__)
 #ifdef __H8300S__
-		sci_ports[i].enable	= h8300_sci_enable;
-		sci_ports[i].disable	= h8300_sci_disable;
+	sci_port->enable	= h8300_sci_enable;
+	sci_port->disable	= h8300_sci_disable;
 #endif
-		sci_ports[i].port.uartclk = CONFIG_CPU_CLOCK;
+	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
-		/*
-		 * XXX: We should use a proper SCI/SCIF clock
-		 */
-		{
-			struct clk *clk = clk_get(NULL, "module_clk");
-			sci_ports[i].port.uartclk = clk_get_rate(clk);
-			clk_put(clk);
-		}
+	/*
+	 * XXX: We should use a proper SCI/SCIF clock
+	 */
+	{
+		struct clk *clk = clk_get(NULL, "module_clk");
+		sci_port->port.uartclk = clk_get_rate(clk);
+		clk_put(clk);
+	}
 #else
 #error "Need a valid uartclk"
 #endif
 
-		sci_ports[i].break_timer.data = (unsigned long)&sci_ports[i];
-		sci_ports[i].break_timer.function = sci_break_timer;
+	sci_port->break_timer.data = (unsigned long)sci_port;
+	sci_port->break_timer.function = sci_break_timer;
+	init_timer(&sci_port->break_timer);
+
+	sci_port->port.mapbase	= p->mapbase;
 
-		init_timer(&sci_ports[i].break_timer);
+	if (p->mapbase && !p->membase) {
+		if (p->flags & UPF_IOREMAP) {
+			p->membase = ioremap_nocache(p->mapbase, 0x40);
+			if (IS_ERR(p->membase))
+				return PTR_ERR(p->membase);
+		} else {
+			/*
+			 * For the simple (and majority of) cases
+			 * where we don't need to do any remapping,
+			 * just cast the cookie directly.
+			 */
+			p->membase = (void __iomem *)p->mapbase;
+		}
 	}
+
+	sci_port->port.membase	= p->membase;
+
+	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
+	sci_port->port.flags	= p->flags;
+	sci_port->type		= sci_port->port.type = p->type;
+
+	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
+
+	return 0;
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1174,7 +1192,6 @@ static struct console serial_console = {
 
 static int __init sci_console_init(void)
 {
-	sci_init_ports();
 	register_console(&serial_console);
 	return 0;
 }
@@ -1240,32 +1257,10 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 		return 0;
 	}
 
-	sciport->port.mapbase	= p->mapbase;
-
-	if (p->mapbase && !p->membase) {
-		if (p->flags & UPF_IOREMAP) {
-			p->membase = ioremap_nocache(p->mapbase, 0x40);
-			if (IS_ERR(p->membase))
-				return PTR_ERR(p->membase);
-		} else {
-			/*
-			 * For the simple (and majority of) cases
-			 * where we don't need to do any remapping,
-			 * just cast the cookie directly.
-			 */
-			p->membase = (void __iomem *)p->mapbase;
-		}
-	}
-
-	sciport->port.membase	= p->membase;
-
-	sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
-	sciport->port.flags	= p->flags;
-	sciport->port.dev	= &dev->dev;
-
-	sciport->type		= sciport->port.type = p->type;
-
-	memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
+	sciport->port.dev = &dev->dev;
+	ret = sci_init_single(sciport, index, p);
+	if (ret)
+		return ret;
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 
@@ -1380,8 +1375,6 @@ static int __init sci_init(void)
 
 	printk(banner);
 
-	sci_init_ports();
-
 	ret = uart_register_driver(&sci_uart_driver);
 	if (likely(ret == 0)) {
 		ret = platform_driver_register(&sci_driver);
-- 
cgit v0.10.2


From 08f8cb315fdf9195b472aeb440ae65b189b151da Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:22 +0000
Subject: sh-sci: ioremap() in a single place

Handle ioremap() in sci_config_port only.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 039c700..408624a 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -994,9 +994,21 @@ static void sci_config_port(struct uart_port *port, int flags)
 
 	port->type = s->type;
 
-	if (port->flags & UPF_IOREMAP && !port->membase) {
+	if (port->membase)
+		return;
+
+	if (port->flags & UPF_IOREMAP) {
 		port->membase = ioremap_nocache(port->mapbase, 0x40);
-		dev_err(port->dev, "can't remap port#%d\n", port->line);
+
+		if (IS_ERR(port->membase))
+			dev_err(port->dev, "can't remap port#%d\n", port->line);
+	} else {
+		/*
+		 * For the simple (and majority of) cases where we don't
+		 * need to do any remapping, just cast the cookie
+		 * directly.
+		 */
+		port->membase = (void __iomem *)port->mapbase;
 	}
 }
 
@@ -1036,9 +1048,9 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static int __devinit sci_init_single(struct sci_port *sci_port,
-				     unsigned int index,
-				     struct plat_sci_port *p)
+static void __devinit sci_init_single(struct sci_port *sci_port,
+				      unsigned int index,
+				      struct plat_sci_port *p)
 {
 	sci_port->port.ops	= &sci_uart_ops;
 	sci_port->port.iotype	= UPIO_MEM;
@@ -1069,22 +1081,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port,
 	init_timer(&sci_port->break_timer);
 
 	sci_port->port.mapbase	= p->mapbase;
-
-	if (p->mapbase && !p->membase) {
-		if (p->flags & UPF_IOREMAP) {
-			p->membase = ioremap_nocache(p->mapbase, 0x40);
-			if (IS_ERR(p->membase))
-				return PTR_ERR(p->membase);
-		} else {
-			/*
-			 * For the simple (and majority of) cases
-			 * where we don't need to do any remapping,
-			 * just cast the cookie directly.
-			 */
-			p->membase = (void __iomem *)p->mapbase;
-		}
-	}
-
 	sci_port->port.membase	= p->membase;
 
 	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
@@ -1092,8 +1088,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port,
 	sci_port->type		= sci_port->port.type = p->type;
 
 	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
-
-	return 0;
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1163,8 +1157,7 @@ static int __init serial_console_setup(struct console *co, char *options)
 		sci_port->clk = clk_get(NULL, "module_clk");
 #endif
 
-	if (port->flags & UPF_IOREMAP)
-		sci_config_port(port, 0);
+	sci_config_port(port, 0);
 
 	if (sci_port->enable)
 		sci_port->enable(port);
@@ -1258,18 +1251,11 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 	}
 
 	sciport->port.dev = &dev->dev;
-	ret = sci_init_single(sciport, index, p);
-	if (ret)
-		return ret;
+	sci_init_single(sciport, index, p);
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
-
-	if (ret) {
-		if (p->flags & UPF_IOREMAP)
-			iounmap(p->membase);
-
+	if (ret)
 		return ret;
-	}
 
 	INIT_LIST_HEAD(&sciport->node);
 
-- 
cgit v0.10.2


From 501b825d01efb93766c87d29f299851152cf4eb0 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:30 +0000
Subject: sh-sci: improve clock framework support

Use enable/disable hooks for clock framework integration.
Make sure we control the clock for the serial console as well.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 408624a..3daf767 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -76,8 +76,10 @@ struct sci_port {
 	int			break_flag;
 
 #ifdef CONFIG_HAVE_CLK
-	/* Port clock */
-	struct clk		*clk;
+	/* Interface clock */
+	struct clk		*iclk;
+	/* Data clock */
+	struct clk		*dclk;
 #endif
 	struct list_head	node;
 };
@@ -166,12 +168,12 @@ static void h8300_sci_config(struct uart_port *port, unsigned int ctrl)
 		*mstpcrl &= ~mask;
 }
 
-static inline void h8300_sci_enable(struct uart_port *port)
+static void h8300_sci_enable(struct uart_port *port)
 {
 	h8300_sci_config(port, sci_enable);
 }
 
-static inline void h8300_sci_disable(struct uart_port *port)
+static void h8300_sci_disable(struct uart_port *port)
 {
 	h8300_sci_config(port, sci_disable);
 }
@@ -742,13 +744,34 @@ static int sci_notifier(struct notifier_block *self,
 	    (phase == CPUFREQ_RESUMECHANGE)) {
 		spin_lock_irqsave(&priv->lock, flags);
 		list_for_each_entry(sci_port, &priv->ports, node)
-			sci_port->port.uartclk = clk_get_rate(sci_port->clk);
+			sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
 
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
 	return NOTIFY_OK;
 }
+
+static void sci_clk_enable(struct uart_port *port)
+{
+	struct sci_port *sci_port = to_sci_port(port);
+
+	clk_enable(sci_port->dclk);
+	sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
+
+	if (sci_port->iclk)
+		clk_enable(sci_port->iclk);
+}
+
+static void sci_clk_disable(struct uart_port *port)
+{
+	struct sci_port *sci_port = to_sci_port(port);
+
+	if (sci_port->iclk)
+		clk_disable(sci_port->iclk);
+
+	clk_disable(sci_port->dclk);
+}
 #endif
 
 static int sci_request_irq(struct sci_port *port)
@@ -880,10 +903,6 @@ static int sci_startup(struct uart_port *port)
 	if (s->enable)
 		s->enable(port);
 
-#ifdef CONFIG_HAVE_CLK
-	s->clk = clk_get(NULL, "module_clk");
-#endif
-
 	sci_request_irq(s);
 	sci_start_tx(port);
 	sci_start_rx(port, 1);
@@ -901,11 +920,6 @@ static void sci_shutdown(struct uart_port *port)
 
 	if (s->disable)
 		s->disable(port);
-
-#ifdef CONFIG_HAVE_CLK
-	clk_put(s->clk);
-	s->clk = NULL;
-#endif
 }
 
 static void sci_set_termios(struct uart_port *port, struct ktermios *termios,
@@ -1048,7 +1062,8 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static void __devinit sci_init_single(struct sci_port *sci_port,
+static void __devinit sci_init_single(struct platform_device *dev,
+				      struct sci_port *sci_port,
 				      unsigned int index,
 				      struct plat_sci_port *p)
 {
@@ -1064,14 +1079,10 @@ static void __devinit sci_init_single(struct sci_port *sci_port,
 #endif
 	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
-	/*
-	 * XXX: We should use a proper SCI/SCIF clock
-	 */
-	{
-		struct clk *clk = clk_get(NULL, "module_clk");
-		sci_port->port.uartclk = clk_get_rate(clk);
-		clk_put(clk);
-	}
+	sci_port->iclk		= p->clk ? clk_get(&dev->dev, p->clk) : NULL;
+	sci_port->dclk		= clk_get(&dev->dev, "module_clk");
+	sci_port->enable	= sci_clk_enable;
+	sci_port->disable	= sci_clk_disable;
 #else
 #error "Need a valid uartclk"
 #endif
@@ -1085,9 +1096,11 @@ static void __devinit sci_init_single(struct sci_port *sci_port,
 
 	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
 	sci_port->port.flags	= p->flags;
+	sci_port->port.dev	= &dev->dev;
 	sci_port->type		= sci_port->port.type = p->type;
 
 	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
+
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1111,14 +1124,21 @@ static void serial_console_write(struct console *co, const char *s,
 				 unsigned count)
 {
 	struct uart_port *port = co->data;
+	struct sci_port *sci_port = to_sci_port(port);
 	unsigned short bits;
 
-	uart_console_write(co->data, s, count, serial_console_putchar);
+	if (sci_port->enable)
+		sci_port->enable(port);
+
+	uart_console_write(port, s, count, serial_console_putchar);
 
 	/* wait until fifo is empty and last bit has been transmitted */
 	bits = SCxSR_TDxE(port) | SCxSR_TEND(port);
 	while ((sci_in(port, SCxSR) & bits) != bits)
 		cpu_relax();
+
+	if (sci_port->disable);
+		sci_port->disable(port);
 }
 
 static int __init serial_console_setup(struct console *co, char *options)
@@ -1152,11 +1172,6 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (!port->type)
 		return -ENODEV;
 
-#ifdef CONFIG_HAVE_CLK
-	if (!sci_port->clk)
-		sci_port->clk = clk_get(NULL, "module_clk");
-#endif
-
 	sci_config_port(port, 0);
 
 	if (sci_port->enable)
@@ -1171,6 +1186,7 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (ret == 0)
 		sci_stop_rx(port);
 #endif
+	/* TODO: disable clock */
 	return ret;
 }
 
@@ -1250,8 +1266,7 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 		return 0;
 	}
 
-	sciport->port.dev = &dev->dev;
-	sci_init_single(sciport, index, p);
+	sci_init_single(dev, sciport, index, p);
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 	if (ret)
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 717059d..1c297dd 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -25,6 +25,7 @@ struct plat_sci_port {
 	unsigned int	irqs[SCIx_NR_IRQS];	/* ERI, RXI, TXI, BRI */
 	unsigned int	type;			/* SCI / SCIF / IRDA */
 	upf_t		flags;			/* UPF_* flags */
+	char		*clk;			/* clock string */
 };
 
 #endif /* __LINUX_SERIAL_SCI_H */
-- 
cgit v0.10.2


From 3b226e15beb5ecf068738e796811afd1e5b3f81f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 23:28:54 +0900
Subject: sh: Add clock id to sh-sci platform data on SH-Mobile CPUs.

This adds the clock specifier to all of the SH-Mobile sh-sci ports.
Impacted CPUs are SH7343/SH7366/SH7722/SH7723/SH7724.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index f327b7e..b9c361e 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -179,21 +179,25 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.mapbase	= 0xffe10000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	}, {
 		.mapbase	= 0xffe20000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	}, {
 		.mapbase	= 0xffe30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 83, 83, 83, 83 },
+		.clk		= "scif3",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 7361ea9..d4a994b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -186,6 +186,7 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index b1c3e7f..5d6247f 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -307,18 +307,21 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	},
 	{
 		.mapbase	= 0xffe10000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	},
 	{
 		.mapbase	= 0xffe20000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	},
 	{
 		.flags = 0,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 484ca2d..1429fc5 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -321,31 +321,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	},{
 		.mapbase        = 0xffe10000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	},{
 		.mapbase        = 0xffe20000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	},{
 		.mapbase	= 0xa4e30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 56, 56, 56, 56 },
+		.clk		= "scif3",
 	},{
 		.mapbase	= 0xa4e40000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 88, 88, 88, 88 },
+		.clk		= "scif4",
 	},{
 		.mapbase	= 0xa4e50000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 109, 109, 109, 109 },
+		.clk		= "scif5",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index e074951..a3039ce 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -30,31 +30,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.mapbase        = 0xffe10000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	}, {
 		.mapbase        = 0xffe20000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	}, {
 		.mapbase	= 0xa4e30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 56, 56, 56, 56 },
+		.clk		= "scif3",
 	}, {
 		.mapbase	= 0xa4e40000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 88, 88, 88, 88 },
+		.clk		= "scif4",
 	}, {
 		.mapbase	= 0xa4e50000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 109, 109, 109, 109 },
+		.clk		= "scif5",
 	}, {
 		.flags = 0,
 	}
-- 
cgit v0.10.2


From 54507f6ee99778a727ff1b38a1f4050fe6479835 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 23:48:33 +0900
Subject: serial: sh-sci: Fix up section mismatch in error path.

The sci_probe_single() path attempts to use sci_remove() for the error
path, while sci_remove() is still flagged as __devexit. So, we simply
discard the section annotation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 3daf767..686e4a4 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1227,7 +1227,7 @@ static struct uart_driver sci_uart_driver = {
 };
 
 
-static int __devexit sci_remove(struct platform_device *dev)
+static int sci_remove(struct platform_device *dev)
 {
 	struct sh_sci_priv *priv = platform_get_drvdata(dev);
 	struct sci_port *p;
-- 
cgit v0.10.2


From 168f36237b16e2b3159e24c7d3b658e3c912d149 Mon Sep 17 00:00:00 2001
From: Yoshinori Sato <ysato@users.sourceforge.jp>
Date: Tue, 28 Apr 2009 04:40:15 +0000
Subject: serial: sh-sci: Fix up h8300 support.

- Dummy SCIF functions define.
- h8300 specific header include.

Signed-off-by: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 686e4a4..4e3248d 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -54,6 +54,10 @@
 #include <asm/sh_bios.h>
 #endif
 
+#ifdef CONFIG_H8300
+#include <asm/gpio.h>
+#endif
+
 #include "sh-sci.h"
 
 struct sci_port {
diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index 84cc651..e3eae4e 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -317,7 +317,18 @@
     }									\
   }
 
-#define CPU_SCIF_FNS(name, scif_offset, scif_size)				\
+#ifdef CONFIG_H8300
+/* h8300 don't have SCIF */
+#define CPU_SCIF_FNS(name)						\
+  static inline unsigned int sci_##name##_in(struct uart_port *port)	\
+  {									\
+    return 0;								\
+  }									\
+  static inline void sci_##name##_out(struct uart_port *port, unsigned int value) \
+  {									\
+  }
+#else
+#define CPU_SCIF_FNS(name, scif_offset, scif_size)			\
   static inline unsigned int sci_##name##_in(struct uart_port *port)	\
   {									\
     SCI_IN(scif_size, scif_offset);					\
@@ -326,6 +337,7 @@
   {									\
     SCI_OUT(scif_size, scif_offset, value);				\
   }
+#endif
 
 #define CPU_SCI_FNS(name, sci_offset, sci_size)				\
   static inline unsigned int sci_##name##_in(struct uart_port* port)	\
@@ -363,7 +375,8 @@
 		 sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size, \
                  h8_sci_offset, h8_sci_size) \
   CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size)
-#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size)
+#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size) \
+  CPU_SCIF_FNS(name)
 #elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
       defined(CONFIG_CPU_SUBTYPE_SH7724)
         #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \
-- 
cgit v0.10.2


From be6514c6295cc79498eeb9a8f933451082ca9e69 Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieranbingham@gmail.com>
Date: Fri, 8 May 2009 15:48:15 +0100
Subject: sh: Add in some ptrace definitions from GDB.

Plugs in PT_TEXT_END_ADDR/PT_TEXT_ADDR/PT_DATA_ADDR/PT_TEXT_LEN
definitions.

Signed-off-by: Kieran Bingham <kieranbingham@gmail.com>
Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/ptrace.h b/arch/sh/include/asm/ptrace.h
index 68e20ff..1dc12cb 100644
--- a/arch/sh/include/asm/ptrace.h
+++ b/arch/sh/include/asm/ptrace.h
@@ -102,6 +102,11 @@ struct pt_dspregs {
 #define	PTRACE_GETDSPREGS	55	/* DSP registers */
 #define	PTRACE_SETDSPREGS	56
 
+#define PT_TEXT_END_ADDR 	240
+#define PT_TEXT_ADDR 		244	/* &(struct user)->start_code */
+#define PT_DATA_ADDR 		248	/* &(struct user)->start_data */
+#define PT_TEXT_LEN		252
+
 #ifdef __KERNEL__
 #include <asm/addrspace.h>
 
-- 
cgit v0.10.2


From e73173dbe55e5b4c2306728aad50c8e42194f6d5 Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieranbingham@gmail.com>
Date: Fri, 8 May 2009 15:49:50 +0100
Subject: sh: Fix UBC setup and registers for SH2A

Signed-off-by: Kieran Bingham <kieranbingham@gmail.com>
Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/ubc.h b/arch/sh/include/asm/ubc.h
index a7b9028..4ca4b77 100644
--- a/arch/sh/include/asm/ubc.h
+++ b/arch/sh/include/asm/ubc.h
@@ -42,12 +42,23 @@
 
 #define BRCR_CMFA		(1 << 15)
 #define BRCR_CMFB		(1 << 14)
+
+#if defined CONFIG_CPU_SH2A
+#define BRCR_CMFCA		(1 << 15)
+#define BRCR_CMFCB		(1 << 14)
+#define BRCR_CMFDA		(1 << 13)
+#define BRCR_CMFDB		(1 << 12)
+#define BRCR_PCBB		(1 << 6)	/* 1: after execution */
+#define BRCR_PCBA		(1 << 5)	/* 1: after execution */
+#define BRCR_PCTE		0
+#else
 #define BRCR_PCTE		(1 << 11)
 #define BRCR_PCBA		(1 << 10)	/* 1: after execution */
 #define BRCR_DBEB		(1 << 7)
 #define BRCR_PCBB		(1 << 6)
 #define BRCR_SEQ		(1 << 3)
 #define BRCR_UBDE		(1 << 0)
+#endif
 
 #ifndef __ASSEMBLY__
 /* arch/sh/kernel/cpu/ubc.S */
diff --git a/arch/sh/include/cpu-sh2a/cpu/ubc.h b/arch/sh/include/cpu-sh2a/cpu/ubc.h
index 8ce2fc1c..1192e1c 100644
--- a/arch/sh/include/cpu-sh2a/cpu/ubc.h
+++ b/arch/sh/include/cpu-sh2a/cpu/ubc.h
@@ -1 +1,28 @@
-#include <cpu-sh2/cpu/ubc.h>
+/*
+ * SH-2A UBC definitions
+ *
+ * Copyright (C) 2008 Kieran Bingham
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#ifndef __ASM_CPU_SH2A_UBC_H
+#define __ASM_CPU_SH2A_UBC_H
+
+#define UBC_BARA                0xfffc0400
+#define UBC_BAMRA               0xfffc0404
+#define UBC_BBRA                0xfffc04a0	/* 16 bit access */
+#define UBC_BDRA                0xfffc0408
+#define UBC_BDMRA               0xfffc040c
+
+#define UBC_BARB                0xfffc0410
+#define UBC_BAMRB               0xfffc0414
+#define UBC_BBRB                0xfffc04b0	/* 16 bit access */
+#define UBC_BDRB                0xfffc0418
+#define UBC_BDMRB               0xfffc041c
+
+#define UBC_BRCR                0xfffc04c0
+
+#endif /* __ASM_CPU_SH2A_UBC_H */
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 6d94725..9289ede 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -251,7 +251,8 @@ static void ubc_set_tracing(int asid, unsigned long pc)
 
 	if (current_cpu_data.type == CPU_SH7729 ||
 	    current_cpu_data.type == CPU_SH7710 ||
-	    current_cpu_data.type == CPU_SH7712) {
+	    current_cpu_data.type == CPU_SH7712 ||
+	    current_cpu_data.type == CPU_SH7203){
 		ctrl_outw(BBR_INST | BBR_READ | BBR_CPU, UBC_BBRA);
 		ctrl_outl(BRCR_PCBA | BRCR_PCTE, UBC_BRCR);
 	} else {
@@ -407,6 +408,7 @@ asmlinkage void break_point_trap(void)
 #else
 	ctrl_outw(0, UBC_BBRA);
 	ctrl_outw(0, UBC_BBRB);
+	ctrl_outl(0, UBC_BRCR);
 #endif
 	current->thread.ubc_pc = 0;
 	ubc_usercnt -= 1;
-- 
cgit v0.10.2


From ba0d474082dfa37fb0efd439b4d0c0234ceb1e80 Mon Sep 17 00:00:00 2001
From: Peter Griffin <pgriffin@mpc-data.co.uk>
Date: Fri, 8 May 2009 15:50:54 +0100
Subject: sh: Add ptrace support for NOMMU debugging

Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index f7b22dd..3392e83 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -334,6 +334,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					[(addr - (long)&dummy->fpu) >> 2];
 		} else if (addr == (long) &dummy->u_fpvalid)
 			tmp = !!tsk_used_math(child);
+		else if (addr == PT_TEXT_ADDR)
+			tmp = child->mm->start_code;
+		else if (addr == PT_DATA_ADDR)
+			tmp = child->mm->start_data;
+		else if (addr == PT_TEXT_END_ADDR)
+			tmp = child->mm->end_code;
+		else if (addr == PT_TEXT_LEN)
+			tmp = child->mm->end_code - child->mm->start_code;
 		else
 			tmp = 0;
 		ret = put_user(tmp, datap);
-- 
cgit v0.10.2


From cd89436e54b29a07a383ee82f2f718d8c9d24cc4 Mon Sep 17 00:00:00 2001
From: Peter Griffin <pgriffin@mpc-data.co.uk>
Date: Fri, 8 May 2009 15:51:51 +0100
Subject: sh: Add UBC trap vector for SH2A

Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 30ca9c5..67550d8 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -34,6 +34,7 @@
 # define TRAP_ILLEGAL_SLOT_INST	6
 # define TRAP_ADDRESS_ERROR	9
 # ifdef CONFIG_CPU_SH2A
+#  define TRAP_UBC		12
 #  define TRAP_FPU_ERROR	13
 #  define TRAP_DIVZERO_ERROR	17
 #  define TRAP_DIVOVF_ERROR	18
@@ -849,6 +850,10 @@ void __init trap_init(void)
 #endif
 #endif
 
+#ifdef TRAP_UBC
+	set_exception_table_vec(TRAP_UBC, break_point_trap);
+#endif
+
 	/* Setup VBR for boot cpu */
 	per_cpu_trap_init();
 }
-- 
cgit v0.10.2


From bf8b9a63c18a1a7777571650de0c9f4fd4368ca0 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 8 May 2009 20:53:58 +0530
Subject: x86: msr-index.h remove duplicate MSR C001_0015 declaration

MSRC001_0015 Hardware Configuration Register (HWCR) is already defined
as MSR_K7_HWCR.

And HWCR is available for >= K7.

So MSR_K8_HWCR is not required and no-one is using it.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc1..4d58d04 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
 #define MSR_K8_TOP_MEM1			0xc001001a
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_K8_SYSCFG			0xc0010010
-#define MSR_K8_HWCR			0xc0010015
 #define MSR_K8_INT_PENDING_MSG		0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK		0x18000000
-- 
cgit v0.10.2


From 7fc23a5380797012e92a9633169440f2f4a21253 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:21 +0200
Subject: perf_counter: optimize perf_counter_task_tick()

perf_counter_task_tick() does way too much work to find out
there's nothing to do. Provide an easy short-circuit for the
normal case where there are no counters on the system.

[ Impact: micro-optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.750619201@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 60e55f0..fdb0d24 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
@@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	if (!atomic_read(&nr_counters))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
@@ -1197,6 +1204,7 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
+	atomic_dec(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_dec(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
@@ -2861,6 +2869,7 @@ done:
 
 	counter->pmu = pmu;
 
+	atomic_inc(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
-- 
cgit v0.10.2


From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:22 +0200
Subject: perf_counter: rework ioctl()s

Corey noticed that ioctl()s on grouped counters didn't work on
the whole group. This extends the ioctl() interface to take a
second argument that is interpreted as a flags field. We then
provide PERF_IOC_FLAG_GROUP to toggle the behaviour.

Having this flag gives the greatest flexibility, allowing you
to individually enable/disable/reset counters in a group, or
all together.

[ Impact: fix group counter enable/disable semantics ]

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090508170028.837558214@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 00081d8..88f863e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -157,10 +157,14 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
+#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
 
 /*
  * Structure of the page that can be mapped via mmap
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fdb0d24..f4883f1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 * add it straight to the context's counter list, or to the group
 	 * leader's sibling list:
 	 */
-	if (counter->group_leader == counter)
+	if (group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
@@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Disable a counter and all its children.
- */
-static void perf_counter_disable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_disable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_disable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 	return 0;
 }
 
-/*
- * Enable a counter and all its children.
- */
-static void perf_counter_enable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_enable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_enable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
@@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 
 static void perf_counter_reset(struct perf_counter *counter)
 {
+	(void)perf_counter_read(counter);
 	atomic_set(&counter->count, 0);
+	perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+					  void (*func)(struct perf_counter *))
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
+
+	spin_lock_irq(&ctx->lock);
+	counter = counter->group_leader;
+
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		func(sibling);
+	spin_unlock_irq(&ctx->lock);
+}
+
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
-	int err = 0;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
 
 	switch (cmd) {
 	case PERF_COUNTER_IOC_ENABLE:
-		perf_counter_enable_family(counter);
+		func = perf_counter_enable;
 		break;
 	case PERF_COUNTER_IOC_DISABLE:
-		perf_counter_disable_family(counter);
-		break;
-	case PERF_COUNTER_IOC_REFRESH:
-		err = perf_counter_refresh(counter, arg);
+		func = perf_counter_disable;
 		break;
 	case PERF_COUNTER_IOC_RESET:
-		perf_counter_reset(counter);
+		func = perf_counter_reset;
 		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
 	default:
-		err = -ENOTTY;
+		return -ENOTTY;
 	}
-	return err;
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
 }
 
 /*
-- 
cgit v0.10.2


From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:23 +0200
Subject: perf_counter: add PERF_RECORD_CONFIG

Much like CONFIG_RECORD_GROUP records the hw_event.config to
identify the values, allow to record this for all counters.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.923228280@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 88f863e..0e6303d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -104,6 +104,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_ADDR	= 1U << 3,
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
+	PERF_RECORD_CONFIG	= 1U << 6,
 };
 
 /*
@@ -258,6 +259,7 @@ enum perf_event_type {
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
+	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f4883f1..c615f52 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CONFIG) {
+		header.type |= PERF_RECORD_CONFIG;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	if (record_type & PERF_RECORD_CONFIG)
+		perf_output_put(&handle, counter->hw_event.config);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */
-- 
cgit v0.10.2


From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:24 +0200
Subject: perf_counter: add PERF_RECORD_CPU

Allow recording the CPU number the event was generated on.

RFC: this leaves a u32 as reserved, should we fill in the
     node_id() there, or leave this open for future extention,
     as userspace can already easily do the cpu->node mapping
     if needed.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170029.008627711@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e6303d..614f921 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -105,6 +105,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
 	PERF_RECORD_CONFIG	= 1U << 6,
+	PERF_RECORD_CPU		= 1U << 7,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
+	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c615f52..d850a1f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
+	struct {
+		u32 cpu, reserved;
+	} cpu_entry;
 
 	header.type = 0;
 	header.size = sizeof(header);
@@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CPU) {
+		header.type |= PERF_RECORD_CPU;
+		header.size += sizeof(cpu_entry);
+
+		cpu_entry.cpu = raw_smp_processor_id();
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
+	if (record_type & PERF_RECORD_CPU)
+		perf_output_put(&handle, cpu_entry);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */
-- 
cgit v0.10.2


From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:06:47 -0400
Subject: tracing: initialize return value for __ftrace_set_clr_event

Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2
tracing/events: clean up for ftrace_set_clr_event()

Moved out the code for ftrace_set_clr_event into a helper funciton but
did not initialize the return value. As a result, we do not warn about
a typo in the echoing of events in set_event.

This patch restores the old warning:

 # echo foobar > set_event
-bash: echo: write error: Invalid argument

[ Impact: restore warning of invalid entries to set_event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index df394bc..2eecb87 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
-	int ret;
+	int ret = -EINVAL;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-- 
cgit v0.10.2


From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:27:41 -0400
Subject: tracing: add trace_set_clr_event to export event enabling function

Other parts of the kernel may need to be able to enable or disable
specific events. Especially parts that create trace events.

[ Impact: allow enabling of trace events by those that create the event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 662c1be..bae51dd 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type,
 
 #define is_signed_type(type)	(((type)(-1)) < 0)
 
+int trace_set_clr_event(const char *system, const char *event, int set);
+
 /*
  * The double __builtin_constant_p is because gcc will give us an error
  * if we try to allocate the static variable to fmt if it is not a
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2eecb87..0eec0c5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set)
 	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+	return __ftrace_set_clr_event(NULL, system, event, set);
+}
+
 /* 128 should be much more than enough */
 #define EVENT_BUF_SIZE		127
 
-- 
cgit v0.10.2


From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 29 Mar 2009 19:56:29 -0700
Subject: xen/x86-64: fix breakpoints and hardware watchpoints

Native x86-64 uses the IST mechanism to run int3 and debug traps on
an alternative stack.  Xen does not do this, and so the frames were
being misinterpreted by the ptrace code.  This change special-cases
these two exceptions by using Xen variants which run on the normal
kernel stack properly.

Impact: avoid crash or bad data when IST trap is invoked under Xen
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d53425..c44e500 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,6 +13,9 @@ asmlinkage void divide_error(void);
 asmlinkage void debug(void);
 asmlinkage void nmi(void);
 asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
 asmlinkage void overflow(void);
 asmlinkage void bounds(void);
 asmlinkage void invalid_op(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6..bb01ce0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1379,6 +1379,11 @@ END(xen_failsafe_callback)
 paranoidzeroentry_ist debug do_debug DEBUG_STACK
 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
 paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
 #ifdef CONFIG_X86_MCE
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12a3159..7566e13 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
 #include <linux/sched.h>
+#include <linux/kprobes.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -44,6 +45,7 @@
 #include <asm/processor.h>
 #include <asm/proto.h>
 #include <asm/msr-index.h>
+#include <asm/traps.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
@@ -428,11 +430,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
+	unsigned long addr;
+
 	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
-	info->address = gate_offset(*val);
+
+	addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+	if (addr == (unsigned long)debug)
+		addr = (unsigned long)xen_debug;
+	else if (addr == (unsigned long)int3)
+		addr = (unsigned long)xen_int3;
+	else if (addr == (unsigned long)stack_segment)
+		addr = (unsigned long)xen_stack_segment;
+	else
+		WARN_ON(val->ist != 0);
+#endif	/* CONFIG_X86_64 */
+	info->address = addr;
+
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-- 
cgit v0.10.2


From b80119bb35a49a4e8dbfb9708872adfd5cf38dee Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 24 Apr 2009 00:22:08 -0700
Subject: xen/x86-64: clean up warnings about IST-using traps

Ignore known IST-using traps.  Aside from the debugger traps, they're
low-level faults which Xen will handle for us, so the kernel needn't
worry about them.  Keep warning in case unknown trap starts using IST.

Impact: suppress spurious warnings
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7566e13..e9df942 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -439,14 +439,32 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 
 	addr = gate_offset(*val);
 #ifdef CONFIG_X86_64
+	/*
+	 * Look for known traps using IST, and substitute them
+	 * appropriately.  The debugger ones are the only ones we care
+	 * about.  Xen will handle faults like double_fault and
+	 * machine_check, so we should never see them.  Warn if
+	 * there's an unexpected IST-using fault handler.
+	 */
 	if (addr == (unsigned long)debug)
 		addr = (unsigned long)xen_debug;
 	else if (addr == (unsigned long)int3)
 		addr = (unsigned long)xen_int3;
 	else if (addr == (unsigned long)stack_segment)
 		addr = (unsigned long)xen_stack_segment;
-	else
-		WARN_ON(val->ist != 0);
+	else if (addr == (unsigned long)double_fault ||
+		 addr == (unsigned long)nmi) {
+		/* Don't need to handle these */
+		return 0;
+#ifdef CONFIG_X86_MCE
+	} else if (addr == (unsigned long)machine_check) {
+		return 0;
+#endif
+	} else {
+		/* Some other trap using IST? */
+		if (WARN_ON(val->ist != 0))
+			return 0;
+	}
 #endif	/* CONFIG_X86_64 */
 	info->address = addr;
 
-- 
cgit v0.10.2


From a789ed5fb6d0256c4177c2cc27e06520ddbe4d4c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 24 Apr 2009 00:26:50 -0700
Subject: xen: cache cr0 value to avoid trap'n'emulate for read_cr0

stts() is implemented in terms of read_cr0/write_cr0 to update the
state of the TS bit.  This happens during context switch, and so
is fairly performance critical.  Rather than falling back to
a trap-and-emulate native read_cr0, implement our own by caching
the last-written value from write_cr0 (the TS bit is the only one
we really care about).

Impact: optimise Xen context switches
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e9df942..0a1700a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -658,10 +658,26 @@ static void xen_clts(void)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+	unsigned long cr0 = percpu_read(xen_cr0_value);
+
+	if (unlikely(cr0 == 0)) {
+		cr0 = native_read_cr0();
+		percpu_write(xen_cr0_value, cr0);
+	}
+
+	return cr0;
+}
+
 static void xen_write_cr0(unsigned long cr0)
 {
 	struct multicall_space mcs;
 
+	percpu_write(xen_cr0_value, cr0);
+
 	/* Only pay attention to cr0.TS; everything else is
 	   ignored. */
 	mcs = xen_mc_entry(0);
@@ -847,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.clts = xen_clts,
 
-	.read_cr0 = native_read_cr0,
+	.read_cr0 = xen_read_cr0,
 	.write_cr0 = xen_write_cr0,
 
 	.read_cr4 = native_read_cr4,
-- 
cgit v0.10.2


From 0b4eb462da10f832b28d518abffa4d77805928a0 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 30 Apr 2009 17:59:36 -0700
Subject: x86, boot: align the .bss section in the decompressor

Aligning the .bss section makes it trivial to use large operation
sizes for moving the initialized sections and clearing the .bss.
The alignment chosen (L1 cache) is somewhat arbitrary, but should be
large enough to avoid all known performance traps and small enough to
not cause troubles.

[ Impact: trivial performance enhancement, future patch prep	]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 0d26c92..dbe515e 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -42,6 +42,7 @@ SECTIONS
 		*(.data.*)
 		_edata = . ;
 	}
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	.bss : {
 		_bss = . ;
 		*(.bss)
-- 
cgit v0.10.2


From d3dd3b5a29bb9582957451531fed461628dfc834 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 21:17:15 -0700
Subject: kbuild: allow compressors (gzip, bzip2, lzma) to take multiple inputs

Allow the compression commands in Kbuild (i.e. gzip, bzip2, lzma) to
take multiple input files and emit the concatenated compressed
output.  This avoids an intermediate step when a kernel image is built
from multiple components, such as the relocatable x86-32 kernel.

Sam Ravnborg integrated the bin_size script into the Makefile.

[ Impact: new build feature, not yet used ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>

diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 9796195..f8cf938 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -183,20 +183,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
 # ---------------------------------------------------------------------------
 
 quiet_cmd_gzip = GZIP    $@
-cmd_gzip = gzip -f -9 < $< > $@
+cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \
+	(rm -f $@ ; false)
 
 
 # Bzip2
 # ---------------------------------------------------------------------------
 
-# Bzip2 does not include size in file... so we have to fake that
-size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
-
-quiet_cmd_bzip2 = BZIP2    $@
-cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+# Bzip2 and LZMA do not include size in file... so we have to fake that;
+# append the size as a 32-bit littleendian number as gzip does.
+size_append = echo -ne $(shell						\
+dec_size=0;								\
+for F in $1; do								\
+	fsize=$$(stat -c "%s" $$F);					\
+	dec_size=$$(expr $$dec_size + $$fsize);				\
+done;									\
+printf "%08x" $$dec_size |						\
+	sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'	\
+)
+
+quiet_cmd_bzip2 = BZIP2   $@
+cmd_bzip2 = (cat $(filter-out FORCE,$^) | \
+	bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
 
 # Lzma
 # ---------------------------------------------------------------------------
 
 quiet_cmd_lzma = LZMA    $@
-cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
+cmd_lzma = (cat $(filter-out FORCE,$^) | \
+	lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size
deleted file mode 100644
index 43e1b36..0000000
--- a/scripts/bin_size
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-
-if [ $# = 0 ] ; then
-   echo Usage: $0 file
-fi
-
-size_dec=`stat -c "%s" $1`
-size_hex_echo_string=`printf "%08x" $size_dec |
-     sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
-/bin/echo -ne $size_hex_echo_string
-- 
cgit v0.10.2


From 845adf7266a7ba6970bf982ffd96abc60d2018ab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 21:20:51 -0700
Subject: x86: add a Kconfig symbol for when relocations are needed

We only need to build relocations when we are building a 32-bit
relocatable kernel.  Rather than unnecessarily complicating the
Makefiles, make an explicit Kbuild symbol for this.

[ Impact: permits future cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Sam Ravnborg <sam@ravnborg.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 039c3f0..5aee453 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1513,6 +1513,11 @@ config RELOCATABLE
 	  it has been loaded at and the compile time physical address
 	  (CONFIG_PHYSICAL_START) is ignored.
 
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+	def_bool y
+	depends on X86_32 && RELOCATABLE
+
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-- 
cgit v0.10.2


From 5f11e02019ef44f041e6e38a1363fa2fd4b8785d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 22:53:11 -0700
Subject: x86, boot: simplify arch/x86/boot/compressed/Makefile

Simplify the arch/x86/boot/compressed/Makefile, by using the new
capability of specifying multiple inputs to a compressor, and the
CONFIG_X86_NEED_RELOCS Kconfig symbol.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0f4b5e2..b35c3bb 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -29,7 +29,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 
 targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +37,19 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
 	$(call if_changed,relocs)
 
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD   $@
-      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,relocbin)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
-ifeq ($(CONFIG_X86_32),y)
-
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
-
-else
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
 
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
-
 suffix_$(CONFIG_KERNEL_GZIP)  = gz
 suffix_$(CONFIG_KERNEL_BZIP2) = bz2
 suffix_$(CONFIG_KERNEL_LZMA)  = lzma
 
+LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
 	$(call if_changed,ld)
-- 
cgit v0.10.2


From 283ab1c0bd462dd0b179393fb081a626f6687413 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:32:47 -0700
Subject: x86, boot: follow standard Kbuild style for compression suffix

When generating the compression suffix in
arch/x86/boot/compressed/Makefile, follow standard Kbuild
conventions, that is:

- Use a dash not underscore before y/m/n endings
- Use := whenever possible.

Requested-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index b35c3bb..7f24fdb 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -46,10 +46,10 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
 
-suffix_$(CONFIG_KERNEL_GZIP)  = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA)  = lzma
+suffix-$(CONFIG_KERNEL_GZIP)	:= gz
+suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
+suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 
 LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
 	$(call if_changed,ld)
-- 
cgit v0.10.2


From bd2a36984c50bb546a7d04cb395fddcf98a1092c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 23:24:50 -0700
Subject: x86, boot: use BP_scratch in arch/x86/boot/compressed/head_*.S

Use the BP_scratch symbol from asm-offsets.h instead of hard-coding
the location.

[ Impact: cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 85bd328..e3398f3 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -53,7 +53,7 @@ ENTRY(startup_32)
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal (0x1e4+4)(%esi), %esp
+	leal (BP_scratch+4)(%esi), %esp
 	call 1f
 1:	popl %ebp
 	subl $1b, %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a829..06cc7e5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -56,7 +56,7 @@ ENTRY(startup_32)
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal	(0x1e4+4)(%esi), %esp
+	leal	(BP_scratch+4)(%esi), %esp
 	call	1f
 1:	popl	%ebp
 	subl	$1b, %ebp
-- 
cgit v0.10.2


From 5f64ec64e7f9b246c0a94f34cdf7782f98a6e55d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:45:17 -0700
Subject: x86, boot: stylistic cleanups for boot/compressed/head_32.S

Reformat arch/x86/boot/compressed/head_32.S to be closer to currently
preferred kernel assembly style, that is:

- opcode and operand separated by tab
- operands separated by ", "
- C-style comments

This also makes it more similar to head_64.S.

[ Impact: cleanup, no object code change ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index e3398f3..7bd7766 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
  * the page directory. [According to comments etc elsewhere on a compressed
  * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
  *
- * Page 0 is deliberately kept safe, since System Management Mode code in 
+ * Page 0 is deliberately kept safe, since System Management Mode code in
  * laptops may need to access the BIOS data stored there.  This is also
- * useful for future device drivers that either access the BIOS via VM86 
+ * useful for future device drivers that either access the BIOS via VM86
  * mode.
  */
 
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.text
+	.text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -29,75 +29,80 @@
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head","ax",@progbits
+	.section ".text.head","ax",@progbits
 ENTRY(startup_32)
 	cld
-	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
-	 * us to not reload segments */
-	testb $(1<<6), BP_loadflags(%esi)
-	jnz 1f
+	/*
+	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+	 * us to not reload segments
+	 */
+	testb	$(1<<6), BP_loadflags(%esi)
+	jnz	1f
 
 	cli
-	movl $(__BOOT_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	movl %eax,%fs
-	movl %eax,%gs
-	movl %eax,%ss
+	movl	$__BOOT_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal (BP_scratch+4)(%esi), %esp
-	call 1f
-1:	popl %ebp
-	subl $1b, %ebp
+	leal	(BP_scratch+4)(%esi), %esp
+	call	1f
+1:	popl	%ebp
+	subl	$1b, %ebp
 
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
 
 #ifdef CONFIG_RELOCATABLE
-	movl 	%ebp, %ebx
+	movl	%ebp, %ebx
 	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
 	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
 #else
-	movl $LOAD_PHYSICAL_ADDR, %ebx
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
 	/* Replace the compressed data size with the uncompressed size */
-	subl input_len(%ebp), %ebx
-	movl output_len(%ebp), %eax
-	addl %eax, %ebx
+	subl	input_len(%ebp), %ebx
+	movl	output_len(%ebp), %eax
+	addl	%eax, %ebx
 	/* Add 8 bytes for every 32K input block */
-	shrl $12, %eax
-	addl %eax, %ebx
+	shrl	$12, %eax
+	addl	%eax, %ebx
 	/* Add 32K + 18 bytes of extra slack */
-	addl $(32768 + 18), %ebx
+	addl	$(32768 + 18), %ebx
 	/* Align on a 4K boundary */
-	addl $4095, %ebx
-	andl $~4095, %ebx
+	addl	$4095, %ebx
+	andl	$~4095, %ebx
 
-/* Copy the compressed kernel to the end of our buffer
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	pushl %esi
-	leal _ebss(%ebp), %esi
-	leal _ebss(%ebx), %edi
-	movl $(_ebss - startup_32), %ecx
+	pushl	%esi
+	leal	_ebss(%ebp), %esi
+	leal	_ebss(%ebx), %edi
+	movl	$(_ebss - startup_32), %ecx
 	std
-	rep
-	movsb
+	rep	movsb
 	cld
-	popl %esi
+	popl	%esi
 
-/* Compute the kernel start address.
+/*
+ * Compute the kernel start address.
  */
 #ifdef CONFIG_RELOCATABLE
 	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
@@ -109,81 +114,84 @@ ENTRY(startup_32)
 /*
  * Jump to the relocated address.
  */
-	leal relocated(%ebx), %eax
-	jmp *%eax
+	leal	relocated(%ebx), %eax
+	jmp	*%eax
 ENDPROC(startup_32)
 
-.section ".text"
+	.text
 relocated:
 
 /*
  * Clear BSS
  */
-	xorl %eax,%eax
-	leal _edata(%ebx),%edi
-	leal _ebss(%ebx), %ecx
-	subl %edi,%ecx
+	xorl	%eax, %eax
+	leal	_edata(%ebx), %edi
+	leal	_ebss(%ebx), %ecx
+	subl	%edi, %ecx
 	cld
-	rep
-	stosb
+	rep	stosb
 
 /*
  * Setup the stack for the decompressor
  */
-	leal boot_stack_end(%ebx), %esp
+	leal	boot_stack_end(%ebx), %esp
 
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	movl output_len(%ebx), %eax
-	pushl %eax
-			# push arguments for decompress_kernel:
-	pushl %ebp	# output address
-	movl input_len(%ebx), %eax
-	pushl %eax	# input_len
-	leal input_data(%ebx), %eax
-	pushl %eax	# input_data
-	leal boot_heap(%ebx), %eax
-	pushl %eax	# heap area
-	pushl %esi	# real mode pointer
-	call decompress_kernel
-	addl $20, %esp
-	popl %ecx
+	movl	output_len(%ebx), %eax
+	pushl	%eax
+				/* push arguments for decompress_kernel: */
+	pushl	%ebp		/* output address */
+	movl	input_len(%ebx), %eax
+	pushl	%eax		/* input_len */
+	leal	input_data(%ebx), %eax
+	pushl	%eax		/* input_data */
+	leal	boot_heap(%ebx), %eax
+	pushl	%eax		/* heap area */
+	pushl	%esi		/* real mode pointer */
+	call	decompress_kernel
+	addl	$20, %esp
+	popl	%ecx
 
 #if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
  */
-	movl %ebp, %edi
-	addl %ecx, %edi
+	movl	%ebp, %edi
+	addl	%ecx, %edi
 
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
  * and where it was actually loaded.
  */
-	movl %ebp, %ebx
-	subl $LOAD_PHYSICAL_ADDR, %ebx
-	jz   2f		/* Nothing to be done if loaded at compiled addr. */
+	movl	%ebp, %ebx
+	subl	$LOAD_PHYSICAL_ADDR, %ebx
+	jz	2f	/* Nothing to be done if loaded at compiled addr. */
 /*
  * Process relocations.
  */
 
-1:	subl $4, %edi
-	movl 0(%edi), %ecx
-	testl %ecx, %ecx
-	jz 2f
-	addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
-	jmp 1b
+1:	subl	$4, %edi
+	movl	(%edi), %ecx
+	testl	%ecx, %ecx
+	jz	2f
+	addl	%ebx, -__PAGE_OFFSET(%ebx, %ecx)
+	jmp	1b
 2:
 #endif
 
 /*
  * Jump to the decompressed kernel.
  */
-	xorl %ebx,%ebx
-	jmp *%ebp
+	xorl	%ebx, %ebx
+	jmp	*%ebp
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+	.bss
+	.balign 4
 boot_heap:
 	.fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
-- 
cgit v0.10.2


From b40d68d5b5b799caaf99d2e073e62962e6d917ce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:59:13 -0700
Subject: x86, boot: stylistic cleanups for boot/compressed/head_64.S

Clean up style issues in arch/x86/boot/compressed/head_64.S.  This
file had a lot fewer style issues than its 32-bit cousin, but the ones
it has are worth fixing, especially since it makes the two files more
similar.

[ Impact: cleanup, no object code change ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 06cc7e5..26c3def 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.code32
-.text
+	.code32
+	.text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -33,12 +33,14 @@
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head"
+	.section ".text.head"
 	.code32
 ENTRY(startup_32)
 	cld
-	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
-	 * us to not reload segments */
+	/*
+	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+	 * us to not reload segments
+	 */
 	testb $(1<<6), BP_loadflags(%esi)
 	jnz 1f
 
@@ -49,7 +51,8 @@ ENTRY(startup_32)
 	movl	%eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
@@ -70,10 +73,11 @@ ENTRY(startup_32)
 	testl	%eax, %eax
 	jnz	no_longmode
 
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
  * and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
@@ -114,7 +118,7 @@ ENTRY(startup_32)
  /*
   * Build early 4G boot pagetable
   */
-	/* Initialize Page tables to 0*/
+	/* Initialize Page tables to 0 */
 	leal	pgtable(%ebx), %edi
 	xorl	%eax, %eax
 	movl	$((4096*6)/4), %ecx
@@ -155,7 +159,8 @@ ENTRY(startup_32)
 	btsl	$_EFER_LME, %eax
 	wrmsr
 
-	/* Setup for the jump to 64bit mode
+	/*
+	 * Setup for the jump to 64bit mode
 	 *
 	 * When the jump is performend we will be in long mode but
 	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +189,8 @@ no_longmode:
 
 #include "../../kernel/verify_cpu_64.S"
 
-	/* Be careful here startup_64 needs to be at a predictable
+	/*
+	 * Be careful here startup_64 needs to be at a predictable
 	 * address so I can export it in an ELF header.  Bootloaders
 	 * should look at the ELF header to find this address, as
 	 * it may change in the future.
@@ -192,7 +198,8 @@ no_longmode:
 	.code64
 	.org 0x200
 ENTRY(startup_64)
-	/* We come here either from startup_32 or directly from a
+	/*
+	 * We come here either from startup_32 or directly from a
 	 * 64bit bootloader.  If we come here from a bootloader we depend on
 	 * an identity mapped page table being provied that maps our
 	 * entire text+data+bss and hopefully all of memory.
@@ -209,7 +216,8 @@ ENTRY(startup_64)
 	movl    $0x20, %eax
 	ltr	%ax
 
-	/* Compute the decompressed kernel start address.  It is where
+	/*
+	 * Compute the decompressed kernel start address.  It is where
 	 * we were loaded at aligned to a 2M boundary. %rbp contains the
 	 * decompressed kernel start address.
 	 *
@@ -241,7 +249,8 @@ ENTRY(startup_64)
 	addq	$(32768 + 18 + 4095), %rbx
 	andq	$~4095, %rbx
 
-/* Copy the compressed kernel to the end of our buffer
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
 	leaq	_end_before_pgt(%rip), %r8
@@ -260,7 +269,7 @@ ENTRY(startup_64)
 	leaq	relocated(%rbx), %rax
 	jmp	*%rax
 
-.section ".text"
+	.text
 relocated:
 
 /*
@@ -271,8 +280,7 @@ relocated:
 	leaq    _end_before_pgt(%rbx), %rcx
 	subq	%rdi, %rcx
 	cld
-	rep
-	stosb
+	rep	stosb
 
 	/* Setup the stack */
 	leaq	boot_stack_end(%rip), %rsp
@@ -311,9 +319,11 @@ gdt:
 	.quad   0x0000000000000000	/* TS continued */
 gdt_end:
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+	.bss
+	.balign 4
 boot_heap:
 	.fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
-- 
cgit v0.10.2


From 5b11f1cee5797b38d16b94d8745b12b6727a8373 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:20:34 -0700
Subject: x86, boot: straighten out ranges to copy/zero in compressed/head*.S

Both on 32 and 64 bits, we copy all the way up to the end of bss,
except that on 64 bits there is a hack to avoid copying on top of the
page tables.  There is no point in copying bss at all, especially
since we are just about to zero it all anyway.

To clean up and unify the handling, we now do:

  - copy from startup_32 to _bss.
  - zero from _bss to _ebss.
  - the _ebss symbol is aligned to an 8-byte boundary.
  - the page tables are moved to a separate section.

Use _bss as the copy endpoint since _edata may be misaligned.

[ Impact: cleanup, trivial performance improvement ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 7bd7766..59425e1 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -93,9 +93,9 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl	%esi
-	leal	_ebss(%ebp), %esi
-	leal	_ebss(%ebx), %edi
-	movl	$(_ebss - startup_32), %ecx
+	leal	_bss(%ebp), %esi
+	leal	_bss(%ebx), %edi
+	movl	$(_bss - startup_32), %ecx
 	std
 	rep	movsb
 	cld
@@ -125,7 +125,7 @@ relocated:
  * Clear BSS
  */
 	xorl	%eax, %eax
-	leal	_edata(%ebx), %edi
+	leal	_bss(%ebx), %edi
 	leal	_ebss(%ebx), %ecx
 	subl	%edi, %ecx
 	cld
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 26c3def..5bc9052 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -253,9 +253,9 @@ ENTRY(startup_64)
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	leaq	_end_before_pgt(%rip), %r8
-	leaq	_end_before_pgt(%rbx), %r9
-	movq	$_end_before_pgt /* - $startup_32 */, %rcx
+	leaq	_bss(%rip), %r8
+	leaq	_bss(%rbx), %r9
+	movq	$_bss /* - $startup_32 */, %rcx
 1:	subq	$8, %r8
 	subq	$8, %r9
 	movq	0(%r8), %rax
@@ -276,8 +276,8 @@ relocated:
  * Clear BSS
  */
 	xorq	%rax, %rax
-	leaq    _edata(%rbx), %rdi
-	leaq    _end_before_pgt(%rbx), %rcx
+	leaq    _bss(%rbx), %rdi
+	leaq    _ebss(%rbx), %rcx
 	subq	%rdi, %rcx
 	cld
 	rep	stosb
@@ -329,3 +329,11 @@ boot_heap:
 boot_stack:
 	.fill BOOT_STACK_SIZE, 1, 0
 boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+	.section ".pgtable","a",@nobits
+	.balign 4096
+pgtable:
+	.fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index dbe515e..cc353e1 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -2,6 +2,8 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
 
 #undef i386
 
+#include <asm/page_types.h>
+
 #ifdef CONFIG_X86_64
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
@@ -48,13 +50,16 @@ SECTIONS
 		*(.bss)
 		*(.bss.*)
 		*(COMMON)
-#ifdef CONFIG_X86_64
-		. = ALIGN(8);
-		_end_before_pgt = . ;
-		. = ALIGN(4096);
-		pgtable = . ;
-		. = . + 4096 * 6;
-#endif
+		. = ALIGN(8);	/* For convenience during zeroing */
 		_ebss = .;
 	}
+#ifdef CONFIG_X86_64
+       . = ALIGN(PAGE_SIZE);
+       .pgtable : {
+		_pgtable = . ;
+		*(.pgtable)
+		_epgtable = . ;
+	}
+#endif
+	_end = .;
 }
-- 
cgit v0.10.2


From 0a137736704ef9af719409933b3c33e138461786 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:27:41 -0700
Subject: x86, boot: set up the decompression stack as early as possible

Set up the decompression stack as soon as we know where it needs to
go.  That way we have a full-service stack as soon as possible, rather
than relying on the BP_scratch field.

Note that the stack does need to be empty during bss zeroing (or
else the stack needs to be moved out of the bss segment, which is also
an option.)

[ Impact: cleanup, minor paranoia ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 59425e1..d7245cf 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -88,6 +88,9 @@ ENTRY(startup_32)
 	addl	$4095, %ebx
 	andl	$~4095, %ebx
 
+	/* Set up the stack */
+	leal	boot_stack_end(%ebx), %esp
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -122,7 +125,7 @@ ENDPROC(startup_32)
 relocated:
 
 /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
  */
 	xorl	%eax, %eax
 	leal	_bss(%ebx), %edi
@@ -132,11 +135,6 @@ relocated:
 	rep	stosb
 
 /*
- * Setup the stack for the decompressor
- */
-	leal	boot_stack_end(%ebx), %esp
-
-/*
  * Do the decompression, and jump to the new kernel..
  */
 	movl	output_len(%ebx), %eax
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 5bc9052..a0b1842 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -249,6 +249,13 @@ ENTRY(startup_64)
 	addq	$(32768 + 18 + 4095), %rbx
 	andq	$~4095, %rbx
 
+	/* Set up the stack */
+	leaq	boot_stack_end(%rbx), %rsp
+
+	/* Zero EFLAGS */
+	pushq	$0
+	popfq
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -273,7 +280,7 @@ ENTRY(startup_64)
 relocated:
 
 /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
  */
 	xorq	%rax, %rax
 	leaq    _bss(%rbx), %rdi
@@ -282,13 +289,6 @@ relocated:
 	cld
 	rep	stosb
 
-	/* Setup the stack */
-	leaq	boot_stack_end(%rip), %rsp
-
-	/* zero EFLAGS after setting rsp */
-	pushq	$0
-	popfq
-
 /*
  * Do the decompression, and jump to the new kernel..
  */
-- 
cgit v0.10.2


From 97541912785369925723b6255438ad9fce2ddf04 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 6 May 2009 17:56:51 -0700
Subject: x86, boot: zero EFLAGS on 32 bits

The 64-bit code already clears EFLAGS as soon as it has a stack.  This
seems like a reasonable precaution, so do it on 32 bits as well.

[ Impact: extra paranoia ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d7245cf..d02a4f0 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -91,6 +91,10 @@ ENTRY(startup_32)
 	/* Set up the stack */
 	leal	boot_stack_end(%ebx), %esp
 
+	/* Zero EFLAGS */
+	pushl	$0
+	popfl
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
-- 
cgit v0.10.2


From 36d3793c947f1ef7ba3d24eeeddc1be41adc5ab4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:45:15 -0700
Subject: x86, boot: use appropriate rep string for move and clear

In the pre-decompression code, use the appropriate largest possible
rep movs and rep stos to move code and clear bss, respectively.  For
reverse copy, do note that the initial values are supposed to be the
address of the first (highest) copy datum, not one byte beyond the end
of the buffer.

rep strings are not necessarily the fastest way to perform these
operations on all current processors, but are likely to be in the
future, and perhaps more importantly, we want to encourage the
architecturally right thing to do here.

This also fixes a couple of trivial inefficiencies on 64 bits.

[ Impact: trivial performance enhancement, increase code similarity ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d02a4f0..6710dc7 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -100,11 +100,12 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl	%esi
-	leal	_bss(%ebp), %esi
-	leal	_bss(%ebx), %edi
+	leal	(_bss-4)(%ebp), %esi
+	leal	(_bss-4)(%ebx), %edi
 	movl	$(_bss - startup_32), %ecx
+	shrl	$2, %ecx
 	std
-	rep	movsb
+	rep	movsl
 	cld
 	popl	%esi
 
@@ -135,8 +136,8 @@ relocated:
 	leal	_bss(%ebx), %edi
 	leal	_ebss(%ebx), %ecx
 	subl	%edi, %ecx
-	cld
-	rep	stosb
+	shrl	$2, %ecx
+	rep	stosl
 
 /*
  * Do the decompression, and jump to the new kernel..
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index a0b1842..723c72d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -260,15 +260,15 @@ ENTRY(startup_64)
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	leaq	_bss(%rip), %r8
-	leaq	_bss(%rbx), %r9
+	pushq	%rsi
+	leaq	(_bss-8)(%rip), %rsi
+	leaq	(_bss-8)(%rbx), %rdi
 	movq	$_bss /* - $startup_32 */, %rcx
-1:	subq	$8, %r8
-	subq	$8, %r9
-	movq	0(%r8), %rax
-	movq	%rax, 0(%r9)
-	subq	$8, %rcx
-	jnz	1b
+	shrq	$3, %rcx
+	std
+	rep	movsq
+	cld
+	popq	%rsi
 
 /*
  * Jump to the relocated address.
@@ -282,12 +282,12 @@ relocated:
 /*
  * Clear BSS (stack is currently empty)
  */
-	xorq	%rax, %rax
-	leaq    _bss(%rbx), %rdi
-	leaq    _ebss(%rbx), %rcx
+	xorl	%eax, %eax
+	leaq    _bss(%rip), %rdi
+	leaq    _ebss(%rip), %rcx
 	subq	%rdi, %rcx
-	cld
-	rep	stosb
+	shrq	$3, %rcx
+	rep	stosq
 
 /*
  * Do the decompression, and jump to the new kernel..
-- 
cgit v0.10.2


From 02a884c0fe7ec8459d00d34b7d4101af21fc4a86 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 17:42:16 -0700
Subject: x86, boot: determine compressed code offset at compile time

Determine the compressed code offset (from the kernel runtime address)
at compile time.  This allows some minor optimizations in
arch/x86/boot/compressed/head_*.S, but more importantly it makes this
value available to the build process, which will enable a future patch
to export the necessary linear memory footprint into the bzImage
header.

[ Impact: cleanup, future patch enabling ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 7f24fdb..49c8a4c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,8 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
+hostprogs-y	:= mkpiggy
+
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
@@ -50,6 +52,9 @@ suffix-$(CONFIG_KERNEL_GZIP)	:= gz
 suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
 suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 
-LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
-	$(call if_changed,ld)
+quiet_cmd_mkpiggy = MKPIGGY $@
+      cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+	$(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 6710dc7..470474b 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -75,18 +75,8 @@ ENTRY(startup_32)
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	subl	input_len(%ebp), %ebx
-	movl	output_len(%ebp), %eax
-	addl	%eax, %ebx
-	/* Add 8 bytes for every 32K input block */
-	shrl	$12, %eax
-	addl	%eax, %ebx
-	/* Add 32K + 18 bytes of extra slack */
-	addl	$(32768 + 18), %ebx
-	/* Align on a 4K boundary */
-	addl	$4095, %ebx
-	andl	$~4095, %ebx
+	/* Target address to relocate to for decompression */
+	addl	$z_extract_offset, %ebx
 
 	/* Set up the stack */
 	leal	boot_stack_end(%ebx), %esp
@@ -142,12 +132,10 @@ relocated:
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	movl	output_len(%ebx), %eax
-	pushl	%eax
+	leal	z_extract_offset_negative(%ebx), %ebp
 				/* push arguments for decompress_kernel: */
 	pushl	%ebp		/* output address */
-	movl	input_len(%ebx), %eax
-	pushl	%eax		/* input_len */
+	pushl	$z_input_len	/* input_len */
 	leal	input_data(%ebx), %eax
 	pushl	%eax		/* input_data */
 	leal	boot_heap(%ebx), %eax
@@ -155,14 +143,12 @@ relocated:
 	pushl	%esi		/* real mode pointer */
 	call	decompress_kernel
 	addl	$20, %esp
-	popl	%ecx
 
 #if CONFIG_RELOCATABLE
 /*
  * Find the address of the relocations.
  */
-	movl	%ebp, %edi
-	addl	%ecx, %edi
+	leal	z_output_len(%ebp), %edi
 
 /*
  * Calculate the delta between where vmlinux was compiled to run
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 723c72d..2b9f251 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -90,16 +90,8 @@ ENTRY(startup_32)
 	movl	$CONFIG_PHYSICAL_START, %ebx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	subl	input_len(%ebp), %ebx
-	movl	output_len(%ebp), %eax
-	addl	%eax, %ebx
-	/* Add 8 bytes for every 32K input block */
-	shrl	$12, %eax
-	addl	%eax, %ebx
-	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-	addl	$(32768 + 18 + 4095), %ebx
-	andl	$~4095, %ebx
+	/* Target address to relocate to for decompression */
+	addl	$z_extract_offset, %ebx
 
 /*
  * Prepare for entering 64 bit mode
@@ -224,6 +216,9 @@ ENTRY(startup_64)
 	 * If it is a relocatable kernel then decompress and run the kernel
 	 * from load address aligned to 2MB addr, otherwise decompress and
 	 * run the kernel from CONFIG_PHYSICAL_START
+	 *
+	 * We cannot rely on the calculation done in 32-bit mode, since we
+	 * may have been invoked via the 64-bit entry point.
 	 */
 
 	/* Start with the delta to where the kernel will run at. */
@@ -237,17 +232,8 @@ ENTRY(startup_64)
 	movq	%rbp, %rbx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	movl	input_len(%rip), %eax
-	subq	%rax, %rbx
-	movl	output_len(%rip), %eax
-	addq	%rax, %rbx
-	/* Add 8 bytes for every 32K input block */
-	shrq	$12, %rax
-	addq	%rax, %rbx
-	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-	addq	$(32768 + 18 + 4095), %rbx
-	andq	$~4095, %rbx
+	/* Target address to relocate to for decompression */
+	leaq	z_extract_offset(%rbp), %rbx
 
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
@@ -292,13 +278,12 @@ relocated:
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	pushq	%rsi			# Save the real mode argument
-	movq	%rsi, %rdi		# real mode address
-	leaq	boot_heap(%rip), %rsi	# malloc area for uncompression
-	leaq	input_data(%rip), %rdx  # input_data
-	movl	input_len(%rip), %eax
-	movq	%rax, %rcx		# input_len
-	movq	%rbp, %r8		# output
+	pushq	%rsi			/* Save the real mode argument */
+	movq	%rsi, %rdi		/* real mode address */
+	leaq	boot_heap(%rip), %rsi	/* malloc area for uncompression */
+	leaq	input_data(%rip), %rdx  /* input_data */
+	movl	$z_input_len, %ecx	/* input_len */
+	movq	%rbp, %r8		/* output target address */
 	call	decompress_kernel
 	popq	%rsi
 
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 0000000..bcbd36c
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *  Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License version
+ *  2 as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ *  02110-1301, USA.
+ *
+ *  H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+	const uint8_t *cp = p;
+
+	return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
+		((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+int main(int argc, char *argv[])
+{
+	uint32_t olen;
+	long ilen;
+	unsigned long offs;
+	FILE *f;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+		return 1;
+	}
+
+	/* Get the information for the compressed kernel image first */
+
+	f = fopen(argv[1], "r");
+	if (!f) {
+		perror(argv[1]);
+		return 1;
+	}
+
+
+	if (fseek(f, -4L, SEEK_END)) {
+		perror(argv[1]);
+	}
+	fread(&olen, sizeof olen, 1, f);
+	ilen = ftell(f);
+	olen = getle32(&olen);
+	fclose(f);
+
+	/*
+	 * Now we have the input (compressed) and output (uncompressed)
+	 * sizes, compute the necessary decompression offset...
+	 */
+
+	offs = (olen > ilen) ? olen - ilen : 0;
+	offs += olen >> 12;	/* Add 8 bytes for each 32K block */
+	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
+	offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+	printf(".globl z_input_len\n");
+	printf("z_input_len = %lu\n", ilen);
+	printf(".globl z_output_len\n");
+	printf("z_output_len = %lu\n", (unsigned long)olen);
+	printf(".globl z_extract_offset\n");
+	printf("z_extract_offset = 0x%lx\n", offs);
+	/* z_extract_offset_negative allows simplification of head_32.S */
+	printf(".globl z_extract_offset_negative\n");
+	printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+	printf(".globl input_data, input_data_end\n");
+	printf("input_data:\n");
+	printf(".incbin \"%s\"\n", argv[1]);
+	printf("input_data_end:\n");
+
+	return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382a..0000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
-  .rodata.compressed : {
-	input_len = .;
-	LONG(input_data_end - input_data) input_data = .;
-	*(.data)
-	output_len = . - 4;
-	input_data_end = .;
-	}
-}
-- 
cgit v0.10.2


From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sat, 9 May 2009 12:54:34 +0800
Subject: x86: mce: remove duplicated #include

Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c.

[ Impact: cleanup ]

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df..aedf976 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
-#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
-- 
cgit v0.10.2


From b30505c81a9d4adea8b70ecff512b0216929b797 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 7 May 2009 15:40:14 -0700
Subject: futex: add requeue-pi documentation

Add Documentation/futex-requeue-pi.txt describing the motivation for the
newly added FUTEX_*REQUEUE_PI op codes and their implementation.

[ Impact: add documentation ]

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Sripathi Kodi <sripathik@in.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Jakub Jelinek <jakub@redhat.com>
LKML-Reference: <4A03634E.3080609@us.ibm.com>
[ reformatted the file ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644
index 0000000..9dc1ff4
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
+Futex Requeue PI
+----------------
+
+Requeueing of tasks from a non-PI futex to a PI futex requires
+special handling in order to ensure the underlying rt_mutex is never
+left without an owner if it has waiters; doing so would break the PI
+boosting logic [see rt-mutex-desgin.txt] For the purposes of
+brevity, this action will be referred to as "requeue_pi" throughout
+this document.  Priority inheritance is abbreviated throughout as
+"PI".
+
+Motivation
+----------
+
+Without requeue_pi, the glibc implementation of
+pthread_cond_broadcast() must resort to waking all the tasks waiting
+on a pthread_condvar and letting them try to sort out which task
+gets to run first in classic thundering-herd formation.  An ideal
+implementation would wake the highest-priority waiter, and leave the
+rest to the natural wakeup inherent in unlocking the mutex
+associated with the condvar.
+
+Consider the simplified glibc calls:
+
+/* caller must lock mutex */
+pthread_cond_wait(cond, mutex)
+{
+	lock(cond->__data.__lock);
+	unlock(mutex);
+	do {
+	   unlock(cond->__data.__lock);
+	   futex_wait(cond->__data.__futex);
+	   lock(cond->__data.__lock);
+	} while(...)
+	unlock(cond->__data.__lock);
+	lock(mutex);
+}
+
+pthread_cond_broadcast(cond)
+{
+	lock(cond->__data.__lock);
+	unlock(cond->__data.__lock);
+	futex_requeue(cond->data.__futex, cond->mutex);
+}
+
+Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
+has waiters. Note that pthread_cond_wait() attempts to lock the
+mutex only after it has returned to user space.  This will leave the
+underlying rt_mutex with waiters, and no owner, breaking the
+previously mentioned PI-boosting algorithms.
+
+In order to support PI-aware pthread_condvar's, the kernel needs to
+be able to requeue tasks to PI futexes.  This support implies that
+upon a successful futex_wait system call, the caller would return to
+user space already holding the PI futex.  The glibc implementation
+would be modified as follows:
+
+
+/* caller must lock mutex */
+pthread_cond_wait_pi(cond, mutex)
+{
+	lock(cond->__data.__lock);
+	unlock(mutex);
+	do {
+	   unlock(cond->__data.__lock);
+	   futex_wait_requeue_pi(cond->__data.__futex);
+	   lock(cond->__data.__lock);
+	} while(...)
+	unlock(cond->__data.__lock);
+        /* the kernel acquired the the mutex for us */
+}
+
+pthread_cond_broadcast_pi(cond)
+{
+	lock(cond->__data.__lock);
+	unlock(cond->__data.__lock);
+	futex_requeue_pi(cond->data.__futex, cond->mutex);
+}
+
+The actual glibc implementation will likely test for PI and make the
+necessary changes inside the existing calls rather than creating new
+calls for the PI cases.  Similar changes are needed for
+pthread_cond_timedwait() and pthread_cond_signal().
+
+Implementation
+--------------
+
+In order to ensure the rt_mutex has an owner if it has waiters, it
+is necessary for both the requeue code, as well as the waiting code,
+to be able to acquire the rt_mutex before returning to user space.
+The requeue code cannot simply wake the waiter and leave it to
+acquire the rt_mutex as it would open a race window between the
+requeue call returning to user space and the waiter waking and
+starting to run.  This is especially true in the uncontended case.
+
+The solution involves two new rt_mutex helper routines,
+rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
+allow the requeue code to acquire an uncontended rt_mutex on behalf
+of the waiter and to enqueue the waiter on a contended rt_mutex.
+Two new system calls provide the kernel<->user interface to
+requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI.
+
+FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
+and pthread_cond_timedwait()) to block on the initial futex and wait
+to be requeued to a PI-aware futex.  The implementation is the
+result of a high-speed collision between futex_wait() and
+futex_lock_pi(), with some extra logic to check for the additional
+wake-up scenarios.
+
+FUTEX_REQUEUE_CMP_PI is called by the waker
+(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
+possibly wake the waiting tasks. Internally, this system call is
+still handled by futex_requeue (by passing requeue_pi=1).  Before
+requeueing, futex_requeue() attempts to acquire the requeue target
+PI futex on behalf of the top waiter.  If it can, this waiter is
+woken.  futex_requeue() then proceeds to requeue the remaining
+nr_wake+nr_requeue tasks to the PI futex, calling
+rt_mutex_start_proxy_lock() prior to each requeue to prepare the
+task as a waiter on the underlying rt_mutex.  It is possible that
+the lock can be acquired at this stage as well, if so, the next
+waiter is woken to finish the acquisition of the lock.
+
+FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
+their sum is all that really matters.  futex_requeue() will wake or
+requeue up to nr_wake + nr_requeue tasks.  It will wake only as many
+tasks as it can acquire the lock for, which in the majority of cases
+should be 0 as good programming practice dictates that the caller of
+either pthread_cond_broadcast() or pthread_cond_signal() acquire the
+mutex prior to making the call. FUTEX_REQUEUE_PI requires that
+nr_wake=1.  nr_requeue should be INT_MAX for broadcast and 0 for
+signal.
-- 
cgit v0.10.2


From e9e8b1fb995543c4cc3930f465923a552b2e48b7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:31:37 +0900
Subject: sh: Fix up the sh64 earlyprintk build.

sci_rxd_in() on SH-5 wants to be using SCSPTR, not SCSPTR2.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index e3eae4e..38072c1 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -636,7 +636,7 @@ static inline int sci_rxd_in(struct uart_port *port)
 #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103)
 static inline int sci_rxd_in(struct uart_port *port)
 {
-         return sci_in(port, SCSPTR2)&0x0001 ? 1 : 0; /* SCIF */
+         return sci_in(port, SCSPTR)&0x0001 ? 1 : 0; /* SCIF */
 }
 #elif defined(__H8300H__) || defined(__H8300S__)
 static inline int sci_rxd_in(struct uart_port *port)
-- 
cgit v0.10.2


From 2fedaacdc07e053d93e0607047a96d282f62aca2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:38:49 +0900
Subject: sh: Cleanup irqflags size mismatch on SH-5 build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c
index 9e277ec..8676209 100644
--- a/arch/sh/mm/cache-sh5.c
+++ b/arch/sh/mm/cache-sh5.c
@@ -60,7 +60,7 @@ static inline void sh64_teardown_dtlb_cache_slot(void)
 static inline void sh64_icache_inv_all(void)
 {
 	unsigned long long addr, flag, data;
-	unsigned int flags;
+	unsigned long flags;
 
 	addr = ICCR0;
 	flag = ICCR0_ICI;
@@ -172,7 +172,7 @@ static void sh64_icache_inv_user_page_range(struct mm_struct *mm,
 		unsigned long eaddr;
 		unsigned long after_last_page_start;
 		unsigned long mm_asid, current_asid;
-		unsigned long long flags = 0ULL;
+		unsigned long flags = 0;
 
 		mm_asid = cpu_asid(smp_processor_id(), mm);
 		current_asid = get_asid();
@@ -236,7 +236,7 @@ static void sh64_icache_inv_user_small_range(struct mm_struct *mm,
 	unsigned long long eaddr = start;
 	unsigned long long eaddr_end = start + len;
 	unsigned long current_asid, mm_asid;
-	unsigned long long flags;
+	unsigned long flags;
 	unsigned long long epage_start;
 
 	/*
@@ -342,7 +342,7 @@ static void inline sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets)
 			 * alloco is a NOP if the cache is write-through.
 			 */
 			if (test_bit(SH_CACHE_MODE_WT, &(cpu_data->dcache.flags)))
-				ctrl_inb(eaddr);
+				__raw_readb((unsigned long)eaddr);
 		}
 	}
 
-- 
cgit v0.10.2


From 6dbe47a170f80159f23c856ad4e02f2685c6f6cb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:44:30 +0900
Subject: sh: Provide __read_{read,write}sl() definitions for sh64.

These are presently only defined for sh32, use the plain unoptimized
versions for sh64. Fixes up smsc911x build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index c7c360b..2534814 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -123,10 +123,15 @@ static inline void __raw_reads##bwlq(volatile void __iomem *mem,	\
 
 __BUILD_MEMORY_STRING(b, u8)
 __BUILD_MEMORY_STRING(w, u16)
-__BUILD_MEMORY_STRING(q, u64)
 
+#ifdef CONFIG_SUPERH32
 void __raw_writesl(void __iomem *addr, const void *data, int longlen);
 void __raw_readsl(const void __iomem *addr, void *data, int longlen);
+#else
+__BUILD_MEMORY_STRING(l, u32)
+#endif
+
+__BUILD_MEMORY_STRING(q, u64)
 
 #define writesb			__raw_writesb
 #define writesw			__raw_writesw
-- 
cgit v0.10.2


From 2bcfffa42309b6f73042c62459bf5207762a271d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 16:02:08 +0900
Subject: sh: Rename opcode_t to insn_size_t.

This is now clashing with a driver, so just rename it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h
index 613644a..036c331 100644
--- a/arch/sh/include/asm/kprobes.h
+++ b/arch/sh/include/asm/kprobes.h
@@ -6,7 +6,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
-typedef u16 kprobe_opcode_t;
+typedef insn_size_t kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xc33a
 
 #define MAX_INSN_SIZE 16
diff --git a/arch/sh/include/asm/system_32.h b/arch/sh/include/asm/system_32.h
index 240b31e..6c68a51 100644
--- a/arch/sh/include/asm/system_32.h
+++ b/arch/sh/include/asm/system_32.h
@@ -198,7 +198,7 @@ do {							\
 })
 #endif
 
-int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs,
+int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
 			    struct mem_access *ma);
 
 asmlinkage void do_address_error(struct pt_regs *regs,
diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h
index beea4e6..b13caca 100644
--- a/arch/sh/include/asm/types.h
+++ b/arch/sh/include/asm/types.h
@@ -23,9 +23,9 @@ typedef unsigned short umode_t;
 typedef u32 dma_addr_t;
 
 #ifdef CONFIG_SUPERH32
-typedef u16 opcode_t;
+typedef u16 insn_size_t;
 #else
-typedef u32 opcode_t;
+typedef u32 insn_size_t;
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c
index c22853b..77dfecb 100644
--- a/arch/sh/kernel/io_trapped.c
+++ b/arch/sh/kernel/io_trapped.c
@@ -267,7 +267,7 @@ static struct mem_access trapped_io_access = {
 int handle_trapped_io(struct pt_regs *regs, unsigned long address)
 {
 	mm_segment_t oldfs;
-	opcode_t instruction;
+	insn_size_t instruction;
 	int tmp;
 
 	if (!lookup_tiop(address))
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index 7c747e7..305aad7 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -47,7 +47,7 @@ char in_nmi = 0;	/* Set during NMI to prevent re-entry */
 /* Calculate the new address for after a step */
 static short *get_step_address(struct pt_regs *linux_regs)
 {
-	opcode_t op = __raw_readw(linux_regs->pc);
+	insn_size_t op = __raw_readw(linux_regs->pc);
 	long addr;
 
 	/* BT */
@@ -134,7 +134,7 @@ static short *get_step_address(struct pt_regs *linux_regs)
  */
 
 static unsigned long stepped_address;
-static opcode_t stepped_opcode;
+static insn_size_t stepped_opcode;
 
 static void do_single_step(struct pt_regs *linux_regs)
 {
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 438f1eb..46348ed0 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -22,11 +22,11 @@ static void handle_BUG(struct pt_regs *regs)
 
 int is_valid_bugaddr(unsigned long addr)
 {
-	unsigned short opcode;
+	insn_size_t opcode;
 
 	if (addr < PAGE_OFFSET)
 		return 0;
-	if (probe_kernel_address((u16 *)addr, opcode))
+	if (probe_kernel_address((insn_size_t *)addr, opcode))
 		return 0;
 
 	return opcode == TRAPA_BUG_OPCODE;
@@ -66,7 +66,7 @@ BUILD_TRAP_HANDLER(bug)
 
 #ifdef CONFIG_BUG
 	if (__kernel_text_address(instruction_pointer(regs))) {
-		opcode_t insn = *(opcode_t *)instruction_pointer(regs);
+		insn_size_t insn = *(insn_size_t *)instruction_pointer(regs);
 		if (insn == TRAPA_BUG_OPCODE)
 			handle_BUG(regs);
 	}
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 67550d8..2b77277 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -177,7 +177,7 @@ static struct mem_access user_mem_access = {
  *   (if that instruction is in a branch delay slot)
  * - return 0 if emulation okay, -EFAULT on existential error
  */
-static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs,
+static int handle_unaligned_ins(insn_size_t instruction, struct pt_regs *regs,
 				struct mem_access *ma)
 {
 	int ret, index, count;
@@ -322,10 +322,10 @@ static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs,
  * - fetches the instruction from PC+2
  */
 static inline int handle_delayslot(struct pt_regs *regs,
-				   opcode_t old_instruction,
+				   insn_size_t old_instruction,
 				   struct mem_access *ma)
 {
-	opcode_t instruction;
+	insn_size_t instruction;
 	void __user *addr = (void __user *)(regs->pc +
 		instruction_size(old_instruction));
 
@@ -365,7 +365,7 @@ static inline int handle_delayslot(struct pt_regs *regs,
 
 static int handle_unaligned_notify_count = 10;
 
-int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs,
+int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
 			    struct mem_access *ma)
 {
 	u_int rm;
@@ -523,7 +523,7 @@ asmlinkage void do_address_error(struct pt_regs *regs,
 	unsigned long error_code = 0;
 	mm_segment_t oldfs;
 	siginfo_t info;
-	opcode_t instruction;
+	insn_size_t instruction;
 	int tmp;
 
 	/* Intentional ifdef */
-- 
cgit v0.10.2


From f15b18d0755b3ee4b29991fc2fde535ee41df53c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 9 May 2009 10:04:22 +0200
Subject: perf_counter tools: remove debug code from builtin-stat.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index e2fa117..cf575c3 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -331,8 +331,6 @@ static void create_perfstat_counter(int counter)
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
-printf("exclude: %d\n", event_mask[counter]);
-
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-- 
cgit v0.10.2


From 1031a3a57ed76f5cb8754a89ddb22f3d9ebe8861 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 17:57:21 +0900
Subject: sh: Provide an __sdivsi3_2 export for sh64.

Newer code paths that are heavier in 64-bit math manage to get this
generated by newer compilers, provide a definition and export
accordingly. This is trivially wrapped around the existing __sdivsi3
code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c
index 0d74d6b..8f54ef0 100644
--- a/arch/sh/kernel/sh_ksyms_64.c
+++ b/arch/sh/kernel/sh_ksyms_64.c
@@ -76,5 +76,7 @@ EXPORT_SYMBOL(strcpy);
 #define DECLARE_EXPORT(name) extern void name(void);EXPORT_SYMBOL(name)
 
 DECLARE_EXPORT(__sdivsi3);
+DECLARE_EXPORT(__sdivsi3_1);
+DECLARE_EXPORT(__sdivsi3_2);
 DECLARE_EXPORT(__udivsi3);
 DECLARE_EXPORT(__div_table);
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
index 6a800c6..1963bbd 100644
--- a/arch/sh/lib64/sdivsi3.S
+++ b/arch/sh/lib64/sdivsi3.S
@@ -1,4 +1,6 @@
 	.global	__sdivsi3
+	.global	__sdivsi3_1
+	.global	__sdivsi3_2
 	.section	.text..SHmedia32,"ax"
 	.align	2
 
@@ -6,13 +8,15 @@
 	/* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
 	/* result in r0 */
 __sdivsi3:
+__sdivsi3_1:
 	ptb __div_table,tr0
+	gettr tr0,r20
 
+__sdivsi3_2:
 	nsb r5, r1
 	shlld r5, r1, r25    /* normalize; [-2 ..1, 1..2) in s2.62 */
 	shari r25, 58, r21   /* extract 5(6) bit index (s2.4 with hole -1..1) */
 	/* bubble */
-	gettr tr0,r20
 	ldx.ub r20, r21, r19 /* u0.8 */
 	shari r25, 32, r25   /* normalize to s2.30 */
 	shlli r21, 1, r21
-- 
cgit v0.10.2


From 7cd0378ef4c0d7b05e7dcb1daa115b841ffdc31a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 18:03:37 +0900
Subject: sh: Fix up SHmedia module ELF relocations.

This fixes up the LSB setting for SHmedia branching in updated symbols
when processing module relocations.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c
index c430810..c19b0f7 100644
--- a/arch/sh/kernel/module.c
+++ b/arch/sh/kernel/module.c
@@ -90,7 +90,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 		 * SHmedia, the LSB of the symbol needs to be asserted
 		 * for the CPU to be in SHmedia mode when it starts executing
 		 * the branch target. */
-		relocation |= (sym->st_other & 4);
+		relocation |= !!(sym->st_other & 4);
 #endif
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
-- 
cgit v0.10.2


From c3e2586b794b12ffcdf69b4e547030b51e18e6d9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 23:33:02 +0900
Subject: sh: Integrate sh64 bits in vmlinux_32.lds.S.

This adds all of the requisite bits from vmlinux_64.lds.S in to the _32
variant, resulting in a unified and generic linker script that can be
shared across both.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S
index dd9b2ee..8d97f52 100644
--- a/arch/sh/kernel/vmlinux_32.lds.S
+++ b/arch/sh/kernel/vmlinux_32.lds.S
@@ -1,17 +1,23 @@
 /*
  * ld script to make SuperH Linux kernel
- * Written by Niibe Yutaka
+ * Written by Niibe Yutaka and Paul Mundt
  */
-#include <asm/thread_info.h>
-#include <asm/cache.h>
-#include <asm-generic/vmlinux.lds.h>
-
+#ifdef CONFIG_SUPERH64
+#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
+OUTPUT_ARCH(sh:sh5)
+#else
+OUTPUT_ARCH(sh)
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
 #else
 OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
 #endif
-OUTPUT_ARCH(sh)
+#endif
+
+#include <asm/thread_info.h>
+#include <asm/cache.h>
+#include <asm-generic/vmlinux.lds.h>
+
 ENTRY(_start)
 SECTIONS
 {
@@ -24,28 +30,35 @@ SECTIONS
 	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
 #endif
 
-	_text = .;			/* Text and read-only data */
-
-	.empty_zero_page : {
+	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
 		*(.empty_zero_page)
 	} = 0
 
-	.text : {
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;		/* Text and read-only data */
 		HEAD_TEXT
 		TEXT_TEXT
+
+#ifdef CONFIG_SUPERH64
+		*(.text64)
+		*(.text..SHmedia32)
+#endif
+
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
+		_etext = .;		/* End of text section */
 	} = 0x0009
 
 	. = ALIGN(16);		/* Exception table */
-	__start___ex_table = .;
-	__ex_table : { *(__ex_table) }
-	__stop___ex_table = .;
-
-	_etext = .;			/* End of text section */
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	}
 
 	NOTES
 	RO_DATA(PAGE_SIZE)
@@ -54,13 +67,15 @@ SECTIONS
 	 * Code which must be executed uncached and the associated data
 	 */
 	. = ALIGN(PAGE_SIZE);
-	__uncached_start = .;
-	.uncached.text : { *(.uncached.text) }
-	.uncached.data : { *(.uncached.data) }
-	__uncached_end = .;
+	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
+		__uncached_start = .;
+		*(.uncached.text)
+		*(.uncached.data)
+		__uncached_end = .;
+	}
 
 	. = ALIGN(THREAD_SIZE);
-	.data : {			/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
 		*(.data.init_task)
 
 		. = ALIGN(L1_CACHE_BYTES);
@@ -84,39 +99,51 @@ SECTIONS
 	_edata = .;			/* End of data section */
 
 	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	__init_begin = .;
-	_sinittext = .;
-	.init.text : { INIT_TEXT }
-	_einittext = .;
-	.init.data : { INIT_DATA }
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
 
 	. = ALIGN(16);
-	__setup_start = .;
-	.init.setup : { *(.init.setup) }
-	__setup_end = .;
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
 
-	__initcall_start = .;
-	.initcall.init : {
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
 		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
 	}
-	__initcall_end = .;
-	__con_initcall_start = .;
-	.con_initcall.init : { *(.con_initcall.init) }
-	__con_initcall_end = .;
 
 	SECURITY_INIT
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	. = ALIGN(PAGE_SIZE);
-	__initramfs_start = .;
-	.init.ramfs : { *(.init.ramfs) }
-	__initramfs_end = .;
+	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
 
 	. = ALIGN(4);
-	__machvec_start = .;
-	.machvec.init : { *(.machvec.init) }
-	__machvec_end = .;
+	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
+		__machvec_start = .;
+		*(.machvec.init)
+		__machvec_end = .;
+	}
 
 	PERCPU(PAGE_SIZE)
 
@@ -124,11 +151,11 @@ SECTIONS
 	 * .exit.text is discarded at runtime, not link time, to deal with
 	 * references from __bug_table
 	 */
-	.exit.text : { EXIT_TEXT }
-	.exit.data : { EXIT_DATA }
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
 
 	. = ALIGN(PAGE_SIZE);
-	.bss : {
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__init_end = .;
 		__bss_start = .;		/* BSS */
 		*(.bss.page_aligned)
-- 
cgit v0.10.2


From dce97c8cb2ad365fa87dc6924d99528c21c63912 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 23:36:10 +0900
Subject: sh: Move the unified linker script in place, kill off old _64 one.

Just forcefully rename the _32 variant overtop, and kill off the now
unused _64 version.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index d7d4991..8d97f52 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -1,5 +1,181 @@
-#ifdef CONFIG_SUPERH32
-# include "vmlinux_32.lds.S"
+/*
+ * ld script to make SuperH Linux kernel
+ * Written by Niibe Yutaka and Paul Mundt
+ */
+#ifdef CONFIG_SUPERH64
+#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
+OUTPUT_ARCH(sh:sh5)
 #else
-# include "vmlinux_64.lds.S"
+OUTPUT_ARCH(sh)
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
+#else
+OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
+#endif
+#endif
+
+#include <asm/thread_info.h>
+#include <asm/cache.h>
+#include <asm-generic/vmlinux.lds.h>
+
+ENTRY(_start)
+SECTIONS
+{
+#ifdef CONFIG_PMB_FIXED
+	. = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) +
+	    CONFIG_ZERO_PAGE_OFFSET;
+#elif defined(CONFIG_32BIT)
+	. = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET;
+#else
+	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
+#endif
+
+	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
+		*(.empty_zero_page)
+	} = 0
+
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;		/* Text and read-only data */
+		HEAD_TEXT
+		TEXT_TEXT
+
+#ifdef CONFIG_SUPERH64
+		*(.text64)
+		*(.text..SHmedia32)
+#endif
+
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		_etext = .;		/* End of text section */
+	} = 0x0009
+
+	. = ALIGN(16);		/* Exception table */
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	}
+
+	NOTES
+	RO_DATA(PAGE_SIZE)
+
+	/*
+	 * Code which must be executed uncached and the associated data
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
+		__uncached_start = .;
+		*(.uncached.text)
+		*(.uncached.data)
+		__uncached_end = .;
+	}
+
+	. = ALIGN(THREAD_SIZE);
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
+		*(.data.init_task)
+
+		. = ALIGN(L1_CACHE_BYTES);
+		*(.data.cacheline_aligned)
+
+		. = ALIGN(L1_CACHE_BYTES);
+		*(.data.read_mostly)
+
+		. = ALIGN(PAGE_SIZE);
+		*(.data.page_aligned)
+
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+
+		DATA_DATA
+		CONSTRUCTORS
+	}
+
+	_edata = .;			/* End of data section */
+
+	. = ALIGN(PAGE_SIZE);		/* Init code and data */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	SECURITY_INIT
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
+
+	. = ALIGN(4);
+	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
+		__machvec_start = .;
+		*(.machvec.init)
+		__machvec_end = .;
+	}
+
+	PERCPU(PAGE_SIZE)
+
+	/*
+	 * .exit.text is discarded at runtime, not link time, to deal with
+	 * references from __bug_table
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
+
+	. = ALIGN(PAGE_SIZE);
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__init_end = .;
+		__bss_start = .;		/* BSS */
+		*(.bss.page_aligned)
+		*(.bss)
+		*(COMMON)
+		. = ALIGN(4);
+		_ebss = .;			/* uClinux MTD sucks */
+		_end = . ;
+	}
+
+	/*
+	 * When something in the kernel is NOT compiled as a module, the
+	 * module cleanup code and data are put into these segments. Both
+	 * can then be thrown away, as cleanup code is never called unless
+	 * it's a module.
+	 */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+	}
+
+	STABS_DEBUG
+	DWARF_DEBUG
+}
diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 8d97f52..0000000
--- a/arch/sh/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * ld script to make SuperH Linux kernel
- * Written by Niibe Yutaka and Paul Mundt
- */
-#ifdef CONFIG_SUPERH64
-#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
-OUTPUT_ARCH(sh:sh5)
-#else
-OUTPUT_ARCH(sh)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
-#else
-OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
-#endif
-#endif
-
-#include <asm/thread_info.h>
-#include <asm/cache.h>
-#include <asm-generic/vmlinux.lds.h>
-
-ENTRY(_start)
-SECTIONS
-{
-#ifdef CONFIG_PMB_FIXED
-	. = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) +
-	    CONFIG_ZERO_PAGE_OFFSET;
-#elif defined(CONFIG_32BIT)
-	. = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET;
-#else
-	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
-#endif
-
-	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
-		*(.empty_zero_page)
-	} = 0
-
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;		/* Text and read-only data */
-		HEAD_TEXT
-		TEXT_TEXT
-
-#ifdef CONFIG_SUPERH64
-		*(.text64)
-		*(.text..SHmedia32)
-#endif
-
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		_etext = .;		/* End of text section */
-	} = 0x0009
-
-	. = ALIGN(16);		/* Exception table */
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		*(__ex_table)
-		__stop___ex_table = .;
-	}
-
-	NOTES
-	RO_DATA(PAGE_SIZE)
-
-	/*
-	 * Code which must be executed uncached and the associated data
-	 */
-	. = ALIGN(PAGE_SIZE);
-	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
-		__uncached_start = .;
-		*(.uncached.text)
-		*(.uncached.data)
-		__uncached_end = .;
-	}
-
-	. = ALIGN(THREAD_SIZE);
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
-		*(.data.init_task)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.read_mostly)
-
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-
-		DATA_DATA
-		CONSTRUCTORS
-	}
-
-	_edata = .;			/* End of data section */
-
-	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .;
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
-
-	. = ALIGN(16);
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	SECURITY_INIT
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
-	. = ALIGN(4);
-	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
-		__machvec_start = .;
-		*(.machvec.init)
-		__machvec_end = .;
-	}
-
-	PERCPU(PAGE_SIZE)
-
-	/*
-	 * .exit.text is discarded at runtime, not link time, to deal with
-	 * references from __bug_table
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
-
-	. = ALIGN(PAGE_SIZE);
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__init_end = .;
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		*(COMMON)
-		. = ALIGN(4);
-		_ebss = .;			/* uClinux MTD sucks */
-		_end = . ;
-	}
-
-	/*
-	 * When something in the kernel is NOT compiled as a module, the
-	 * module cleanup code and data are put into these segments. Both
-	 * can then be thrown away, as cleanup code is never called unless
-	 * it's a module.
-	 */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S
deleted file mode 100644
index 6966446..0000000
--- a/arch/sh/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * ld script to make SH64 Linux kernel
- *
- * Copyright (C) 2000, 2001  Paolo Alberelli
- *
- * benedict.gaster@superh.com:	 2nd May 2002
- *    Add definition of empty_zero_page to be the first page of kernel image.
- *
- * benedict.gaster@superh.com:	 3rd May 2002
- *    Added support for ramdisk, removing statically linked romfs at the
- *    same time.
- *
- * lethal@linux-sh.org:          9th May 2003
- *    Kill off GLOBAL_NAME() usage and other CDC-isms.
- *
- * lethal@linux-sh.org:         19th May 2003
- *    Remove support for ancient toolchains.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <asm/page.h>
-#include <asm/cache.h>
-#include <asm/thread_info.h>
-
-#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
-#include <asm-generic/vmlinux.lds.h>
-
-OUTPUT_ARCH(sh:sh5)
-
-#define C_PHYS(x) AT (ADDR(x) - LOAD_OFFSET)
-
-ENTRY(__start)
-SECTIONS
-{
-	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + PAGE_SIZE;
-	_text = .;			/* Text and read-only data */
-
-	.empty_zero_page : C_PHYS(.empty_zero_page) {
-		*(.empty_zero_page)
-	} = 0
-
-	.text : C_PHYS(.text) {
-		HEAD_TEXT
-		TEXT_TEXT
-		*(.text64)
-		*(.text..SHmedia32)
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-	} = 0x6ff0fff0
-#else
-	} = 0xf0fff06f
-#endif
-
-	/* We likely want __ex_table to be Cache Line aligned */
-	. = ALIGN(L1_CACHE_BYTES);		/* Exception table */
-	__start___ex_table = .;
-	__ex_table : C_PHYS(__ex_table) { *(__ex_table) }
-	__stop___ex_table = .;
-
-	_etext = .;			/* End of text section */
-
-	NOTES 
-	RO_DATA(PAGE_SIZE)
-
-	. = ALIGN(THREAD_SIZE);
-	.data : C_PHYS(.data) {			/* Data */
-		*(.data.init_task)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.read_mostly)
-
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-
-		DATA_DATA
-		CONSTRUCTORS
-	}
-
-	_edata = .;			/* End of data section */
-
-	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	__init_begin = .;
-	_sinittext = .;
-	.init.text : C_PHYS(.init.text) { INIT_TEXT }
-	_einittext = .;
-	.init.data : C_PHYS(.init.data) { INIT_DATA }
-	. = ALIGN(L1_CACHE_BYTES);	/* Better if Cache Line aligned */
-	__setup_start = .;
-	.init.setup : C_PHYS(.init.setup) { *(.init.setup) }
-	__setup_end = .;
-	__initcall_start = .;
-	.initcall.init : C_PHYS(.initcall.init) {
-		INITCALLS
-	}
-	__initcall_end = .;
-	__con_initcall_start = .;
-	.con_initcall.init : C_PHYS(.con_initcall.init) {
-		*(.con_initcall.init)
-	}
-	__con_initcall_end = .;
-
-	SECURITY_INIT
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	__initramfs_start = .;
-	.init.ramfs : C_PHYS(.init.ramfs) { *(.init.ramfs) }
-	__initramfs_end = .;
-#endif
-
-	. = ALIGN(8);
-	__machvec_start = .;
-	.machvec.init : C_PHYS(.machvec.init) { *(.machvec.init) }
-	__machvec_end = .;
-
-	PERCPU(PAGE_SIZE)
-
-	/*
-	 * .exit.text is discarded at runtime, not link time, to deal with
-	 * references from __bug_table
-	 */
-	.exit.text : C_PHYS(.exit.text) { EXIT_TEXT }
-	.exit.data : C_PHYS(.exit.data) { EXIT_DATA }
-
-	. = ALIGN(PAGE_SIZE);
-	.bss : C_PHYS(.bss) {
-		__init_end = .;
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		*(COMMON)
-		. = ALIGN(4);
-		_ebss = .;			/* uClinux MTD sucks */
-		_end = . ;
-	}
-
-	/*
-	 * When something in the kernel is NOT compiled as a module, the
-	 * module cleanup code and data are put into these segments. Both
-	 * can then be thrown away, as cleanup code is never called unless
-	 * it's a module.
-	 */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
-- 
cgit v0.10.2


From 7b022d07a0fd2ce02d4456b732c674ff1d16f5ce Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:25:08 +0900
Subject: sh: Tidy up the ldscript output format specifier.

Tie this in to the Makefile directly, where we already know what we are
running on. This tidies up the linker script a bit, and is prep work for
unifying the arch/sh/boot/compressed linker scripts.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 68348e7..b941dc9 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -100,13 +100,17 @@ LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 endif
 
 ifdef CONFIG_CPU_LITTLE_ENDIAN
-LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64'
+ld-bfd			:= elf32-$(UTS_MACHINE)-linux
+LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd)
 LDFLAGS			+= -EL
 else
-LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4'
+ld-bfd			:= elf32-$(UTS_MACHINE)big-linux
+LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd)
 LDFLAGS			+= -EB
 endif
 
+export ld-bfd
+
 head-y			:= arch/sh/kernel/init_task.o
 head-$(CONFIG_SUPERH32)	+= arch/sh/kernel/head_32.o
 head-$(CONFIG_SUPERH64)	+= arch/sh/kernel/head_64.o
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
index b96a055..2492557 100644
--- a/arch/sh/boot/compressed/Makefile_32
+++ b/arch/sh/boot/compressed/Makefile_32
@@ -28,7 +28,8 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
 endif
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup -T $(obj)/../../kernel/vmlinux.lds
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
 	$(call if_changed,ld)
diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
index 658d4f9..541a529 100644
--- a/arch/sh/boot/compressed/Makefile_64
+++ b/arch/sh/boot/compressed/Makefile_64
@@ -24,8 +24,8 @@ OBJECTS		:= $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \
 ZIMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]')
 
-LDFLAGS_vmlinux := -Ttext $(ZIMAGE_OFFSET) -e startup \
-		    -T $(obj)/../../kernel/vmlinux.lds
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 8d97f52..1a2c8db 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -6,12 +6,8 @@
 #define LOAD_OFFSET	CONFIG_PAGE_OFFSET
 OUTPUT_ARCH(sh:sh5)
 #else
+#define LOAD_OFFSET	0
 OUTPUT_ARCH(sh)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
-#else
-OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
-#endif
 #endif
 
 #include <asm/thread_info.h>
-- 
cgit v0.10.2


From 20b27fa33743c6ef77a1248421fab51e8bf21a25 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:36:22 +0900
Subject: sh: Fix up the sh64 zImage build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
index 541a529..7093255 100644
--- a/arch/sh/boot/compressed/Makefile_64
+++ b/arch/sh/boot/compressed/Makefile_64
@@ -14,8 +14,7 @@
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
 		   head_64.o misc_64.o cache.o piggy.o
 
-OBJECTS		:= $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \
-		   $(obj)/cache.o
+OBJECTS		:= $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o
 
 #
 # ZIMAGE_OFFSET is the load offset of the compression loader
diff --git a/arch/sh/boot/compressed/head_64.S b/arch/sh/boot/compressed/head_64.S
index 622eac3..9993113 100644
--- a/arch/sh/boot/compressed/head_64.S
+++ b/arch/sh/boot/compressed/head_64.S
@@ -14,6 +14,7 @@
  *   Copyright (C) 2002 Stuart Menefy (stuart.menefy@st.com)
  */
 #include <asm/cache.h>
+#include <asm/tlb.h>
 #include <cpu/mmu_context.h>
 #include <cpu/registers.h>
 
@@ -33,11 +34,7 @@
 #define	ICCR0_INIT_VAL	ICCR0_ON | ICCR0_ICI		/* ICE + ICI */
 #define	ICCR1_INIT_VAL	ICCR1_NOLOCK			/* No locking */
 
-#if 1
 #define	OCCR0_INIT_VAL	OCCR0_ON | OCCR0_OCI | OCCR0_WB	/* OCE + OCI + WB */
-#else
-#define	OCCR0_INIT_VAL	OCCR0_OFF
-#endif
 #define	OCCR1_INIT_VAL	OCCR1_NOLOCK			/* No locking */
 
 	.text
diff --git a/arch/sh/boot/compressed/vmlinux_64.lds b/arch/sh/boot/compressed/vmlinux_64.lds
deleted file mode 100644
index 59c2ef4..0000000
--- a/arch/sh/boot/compressed/vmlinux_64.lds
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * ld script to make compressed SuperH/shmedia Linux kernel+decompression
- *		bootstrap
- * Modified by Stuart Menefy from arch/sh/vmlinux.lds.S written by Niibe Yutaka
- */
-
-
-#ifdef CONFIG_LITTLE_ENDIAN
-/* OUTPUT_FORMAT("elf32-sh64l-linux", "elf32-sh64l-linux", "elf32-sh64l-linux") */
-#define NOP 0x6ff0fff0
-#else
-/* OUTPUT_FORMAT("elf32-sh64", "elf32-sh64", "elf32-sh64") */
-#define NOP 0xf0fff06f
-#endif
-
-OUTPUT_FORMAT("elf32-sh64-linux")
-OUTPUT_ARCH(sh)
-ENTRY(_start)
-
-#define ALIGNED_GAP(section, align) (((ADDR(section)+SIZEOF(section)+(align)-1) & ~((align)-1))-ADDR(section))
-#define FOLLOWING(section, align) AT (LOADADDR(section) + ALIGNED_GAP(section,align))
-
-SECTIONS
-{
-  _text = .;			/* Text and read-only data */
-
-  .text : {
-	*(.text)
-	*(.text64)
-	*(.text..SHmedia32)
-	*(.fixup)
-	*(.gnu.warning)
-	} = NOP
-  . = ALIGN(4);
-  .rodata : { *(.rodata) }
-
-  /* There is no 'real' reason for eight byte alignment, four would work
-   * as well, but gdb downloads much (*4) faster with this.
-   */
-  . = ALIGN(8);
-  .image : { *(.image) }
-  . = ALIGN(4);
-  _etext = .;			/* End of text section */
-
-  .data :			/* Data */
-	FOLLOWING(.image, 4)
-	{
-	_data = .;
-	*(.data)
-	}
-  _data_image = LOADADDR(.data);/* Address of data section in ROM */
-
-  _edata = .;			/* End of data section */
-
-  .stack : { stack = .;  _stack = .; }
-
-  . = ALIGN(4);
-  __bss_start = .;		/* BSS */
-  .bss : {
-	*(.bss)
-	}
-  . = ALIGN(4);
-  _end = . ;
-}
-- 
cgit v0.10.2


From a2e76c80d93ec3c59a030b6ca37b9087033565c1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:54:39 +0900
Subject: sh: Provide a tighter BOOT_LINK_OFFSET definition for the Cayman
 board.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6ed16c4..bca6a2a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -704,6 +704,7 @@ config ZERO_PAGE_OFFSET
 
 config BOOT_LINK_OFFSET
 	hex "Link address offset for booting"
+	default "0x00400000" if SH_CAYMAN
 	default "0x00800000"
 	help
 	  This option allows you to set the link address offset of the zImage.
-- 
cgit v0.10.2


From b20883562455060272126c36563a7d8edafc30d3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:55:45 +0900
Subject: sh: Provide a BITS definition, use it in the arch/sh/boot/ Makefiles.

This introduces a BITS export that can handily be picked up by Makefiles
for cleaner sharing. Reflect its use in arch/sh/boot/compressed/ in
preparation for unifying the Makefiles.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index b941dc9..c1bbae1 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -91,9 +91,11 @@ KBUILD_IMAGE		:= $(defaultimage-y)
 #
 ifdef CONFIG_SUPERH32
 UTS_MACHINE	:= sh
+BITS		:= 32
 LDFLAGS_vmlinux	+= -e _stext
 else
 UTS_MACHINE	:= sh64
+BITS		:= 64
 LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 		   --defsym phys_stext_shmedia=phys_stext+1 \
 		   -e phys_stext_shmedia
@@ -109,11 +111,9 @@ LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd)
 LDFLAGS			+= -EB
 endif
 
-export ld-bfd
+export ld-bfd BITS
 
-head-y			:= arch/sh/kernel/init_task.o
-head-$(CONFIG_SUPERH32)	+= arch/sh/kernel/head_32.o
-head-$(CONFIG_SUPERH64)	+= arch/sh/kernel/head_64.o
+head-y	:= arch/sh/kernel/init_task.o arch/sh/kernel/head_$(BITS).o
 
 core-y				+= arch/sh/kernel/ arch/sh/mm/ arch/sh/boards/
 core-$(CONFIG_SH_FPU_EMU)	+= arch/sh/math-emu/
diff --git a/arch/sh/boot/Makefile b/arch/sh/boot/Makefile
index 95483d1..78efb04 100644
--- a/arch/sh/boot/Makefile
+++ b/arch/sh/boot/Makefile
@@ -20,9 +20,6 @@ CONFIG_BOOT_LINK_OFFSET	?= 0x00800000
 CONFIG_ZERO_PAGE_OFFSET	?= 0x00001000
 CONFIG_ENTRY_OFFSET	?= 0x00001000
 
-export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \
-       CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET
-
 targets := zImage vmlinux.srec uImage uImage.srec
 subdir- := compressed
 
@@ -43,6 +40,9 @@ KERNEL_MEMORY := $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_MEMORY_START)]')
 endif
 
+export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \
+       CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET KERNEL_MEMORY
+
 KERNEL_LOAD	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)  + \
 			$(KERNEL_MEMORY) + \
diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index efb01dc..f0a71df 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -1,5 +1 @@
-ifeq ($(CONFIG_SUPERH32),y)
 include ${srctree}/arch/sh/boot/compressed/Makefile_32
-else
-include ${srctree}/arch/sh/boot/compressed/Makefile_64
-endif
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
index 2492557..9531bf1 100644
--- a/arch/sh/boot/compressed/Makefile_32
+++ b/arch/sh/boot/compressed/Makefile_32
@@ -5,9 +5,9 @@
 #
 
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_32.o misc_32.o piggy.o
+		   head_$(BITS).o misc_$(BITS).o piggy.o
 
-OBJECTS = $(obj)/head_32.o $(obj)/misc_32.o
+OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
 
 ifdef CONFIG_SH_STANDARD_BIOS
 OBJECTS += $(obj)/../../kernel/sh_bios.o
@@ -18,7 +18,7 @@ endif
 #
 IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)  + \
-			$(CONFIG_MEMORY_START) + \
+			$(KERNEL_MEMORY) + \
 			$(CONFIG_BOOT_LINK_OFFSET)]')
 
 LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
-- 
cgit v0.10.2


From 1eca133cc9f978a8c44788fc5b2fe54219f9425c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:58:21 +0900
Subject: sh: Merge the split arch/sh/boot/compressed/ Makefiles.

This kills off the _64 variant and moves the _32 one over as the generic
one to use.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index f0a71df..9531bf1 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -1 +1,47 @@
-include ${srctree}/arch/sh/boot/compressed/Makefile_32
+#
+# linux/arch/sh/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+
+targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
+		   head_$(BITS).o misc_$(BITS).o piggy.o
+
+OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
+
+ifdef CONFIG_SH_STANDARD_BIOS
+OBJECTS += $(obj)/../../kernel/sh_bios.o
+endif
+
+#
+# IMAGE_OFFSET is the load offset of the compression loader
+#
+IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
+		     $$[$(CONFIG_PAGE_OFFSET)  + \
+			$(KERNEL_MEMORY) + \
+			$(CONFIG_BOOT_LINK_OFFSET)]')
+
+LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+
+ifeq ($(CONFIG_FUNCTION_TRACER),y)
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
+endif
+
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
+
+$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
+	$(call if_changed,ld)
+	@:
+
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+
+OBJCOPYFLAGS += -R .empty_zero_page
+
+$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
+	$(call if_changed,as_o_S)
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
deleted file mode 100644
index 9531bf1..0000000
--- a/arch/sh/boot/compressed/Makefile_32
+++ /dev/null
@@ -1,47 +0,0 @@
-#
-# linux/arch/sh/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_$(BITS).o misc_$(BITS).o piggy.o
-
-OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
-
-ifdef CONFIG_SH_STANDARD_BIOS
-OBJECTS += $(obj)/../../kernel/sh_bios.o
-endif
-
-#
-# IMAGE_OFFSET is the load offset of the compression loader
-#
-IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
-		     $$[$(CONFIG_PAGE_OFFSET)  + \
-			$(KERNEL_MEMORY) + \
-			$(CONFIG_BOOT_LINK_OFFSET)]')
-
-LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
-
-ifeq ($(CONFIG_FUNCTION_TRACER),y)
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
-endif
-
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
-		   -T $(obj)/../../kernel/vmlinux.lds
-
-$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
-	$(call if_changed,ld)
-	@:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-OBJCOPYFLAGS += -R .empty_zero_page
-
-$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
-	$(call if_changed,as_o_S)
diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
deleted file mode 100644
index 7093255..0000000
--- a/arch/sh/boot/compressed/Makefile_64
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# arch/sh/boot/compressed/Makefile_64
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-# Copyright (C) 2002 Stuart Menefy
-# Copyright (C) 2004 Paul Mundt
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_64.o misc_64.o cache.o piggy.o
-
-OBJECTS		:= $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o
-
-#
-# ZIMAGE_OFFSET is the load offset of the compression loader
-# (4M for the kernel plus 64K for this loader)
-#
-ZIMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
-		     $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]')
-
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \
-		   -T $(obj)/../../kernel/vmlinux.lds
-
-$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE
-	$(call if_changed,ld)
-	@:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-OBJCOPYFLAGS += -R .empty_zero_page
-
-$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
-	$(call if_changed,as_o_S)
-- 
cgit v0.10.2


From b412a49af970e703124fa3259bf24165c0c74024 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 01:23:25 +0900
Subject: sh: Consolidate the boot link and entry offset definitions.

Consolidate these in a single place in the Kconfig menus. At the same
time, disable their interactivity and set them according to the board
config defaults.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index bca6a2a..a9dee13 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -694,23 +694,37 @@ endmenu
 menu "Boot options"
 
 config ZERO_PAGE_OFFSET
-	hex "Zero page offset"
-	default "0x00004000" if SH_SH03
-	default "0x00010000" if PAGE_SIZE_64KB
+	hex
+	default "0x00010000" if PAGE_SIZE_64KB || SH_RTS7751R2D || \
+				SH_7751_SOLUTION_ENGINE
+	default "0x00004000" if PAGE_SIZE_16KB || SH_SH03
 	default "0x00002000" if PAGE_SIZE_8KB
 	default "0x00001000"
 	help
 	  This sets the default offset of zero page.
 
 config BOOT_LINK_OFFSET
-	hex "Link address offset for booting"
+	hex
+	default "0x00210000" if SH_SHMIN
 	default "0x00400000" if SH_CAYMAN
+	default "0x00810000" if SH_7780_SOLUTION_ENGINE
+	default "0x009e0000" if SH_TITAN
+	default "0x01800000" if SH_SDK7780
+	default "0x02000000" if SH_EDOSK7760
 	default "0x00800000"
 	help
 	  This option allows you to set the link address offset of the zImage.
 	  This can be useful if you are on a board which has a small amount of
 	  memory.
 
+config ENTRY_OFFSET
+	hex
+	default "0x00001000" if PAGE_SIZE_4KB
+	default "0x00002000" if PAGE_SIZE_8KB
+	default "0x00004000" if PAGE_SIZE_16KB
+	default "0x00010000" if PAGE_SIZE_64KB
+	default "0x00000000"
+
 config UBC_WAKEUP
 	bool "Wakeup UBC on startup"
 	depends on CPU_SH4 && !CPU_SH4A
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index d4079ca..b900d2cd 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -201,14 +201,6 @@ config PAGE_SIZE_64KB
 
 endchoice
 
-config ENTRY_OFFSET
-	hex
-	default "0x00001000" if PAGE_SIZE_4KB
-	default "0x00002000" if PAGE_SIZE_8KB
-	default "0x00004000" if PAGE_SIZE_16KB
-	default "0x00010000" if PAGE_SIZE_64KB
-	default "0x00000000"
-
 choice
 	prompt "HugeTLB page size"
 	depends on HUGETLB_PAGE && (CPU_SH4 || CPU_SH5) && MMU
-- 
cgit v0.10.2


From 457daa2b66e07bbd2280b9f8d2b03e800f357243 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 01:28:01 +0900
Subject: sh: Hook up cc-cross-prefix support.

This implements a simple case that just iterates through the common
cases, looking at UTS_MACHINE for hints.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index c1bbae1..a47fbd2 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -101,6 +101,12 @@ LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 		   -e phys_stext_shmedia
 endif
 
+ifneq ($(SUBARCH),$(ARCH))
+  ifeq ($(CROSS_COMPILE),)
+    CROSS_COMPILE := $(call cc-cross-prefix, $(UTS_MACHINE)-linux-  $(UTS_MACHINE)-linux-gnu-  $(UTS_MACHINE)-unknown-linux-gnu-)
+  endif
+endif
+
 ifdef CONFIG_CPU_LITTLE_ENDIAN
 ld-bfd			:= elf32-$(UTS_MACHINE)-linux
 LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd)
-- 
cgit v0.10.2


From 567bb8fd47624cb9f894c64ce9530d43d5862a71 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 14:25:39 +0900
Subject: sh: Fix up R0 dependence in __arch_swab16/32.

There is nothing in these routines that inherently depends on R0 use.
Given that these routines are inlined, it is rather easy to blow up the
compiler by exhausting the spill class when performing a 64-bit swab.

This presently manifests itself as the following:

CC      fs/ocfs2/suballoc.o
fs/ocfs2/suballoc.c: In function 'ocfs2_reserve_suballoc_bits':
fs/ocfs2/suballoc.c:638: error: unrecognizable insn:
(insn 2793 1230 1231 103 arch/sh/include/asm/swab.h:33 (set (reg:HI 853)
        (subreg:HI (reg:SI 149 macl) 2)) -1 (expr_list:REG_DEAD (reg:SI 149 macl)
        (nil)))
fs/ocfs2/suballoc.c:638: internal compiler error: in extract_insn, at recog.c:1991

This patch switches over to using an arbitrarily assigned register instead.

While the same issue does not exist in the SH-5 case, there is likewise no harm
in having an alternate register used for the byterev/shari pair.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/swab.h b/arch/sh/include/asm/swab.h
index e693159..0e08fe5 100644
--- a/arch/sh/include/asm/swab.h
+++ b/arch/sh/include/asm/swab.h
@@ -14,15 +14,15 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 x)
 {
 	__asm__(
 #ifdef __SH5__
-		"byterev	%0, %0\n\t"
+		"byterev	%1, %0\n\t"
 		"shari		%0, 32, %0"
 #else
-		"swap.b		%0, %0\n\t"
+		"swap.b		%1, %0\n\t"
 		"swap.w		%0, %0\n\t"
 		"swap.b		%0, %0"
 #endif
 		: "=r" (x)
-		: "0" (x));
+		: "r" (x));
 
 	return x;
 }
@@ -32,13 +32,13 @@ static inline __attribute_const__ __u16 __arch_swab16(__u16 x)
 {
 	__asm__(
 #ifdef __SH5__
-		"byterev	%0, %0\n\t"
+		"byterev	%1, %0\n\t"
 		"shari		%0, 32, %0"
 #else
-		"swap.b		%0, %0"
+		"swap.b		%1, %0"
 #endif
 		: "=r" (x)
-		:  "0" (x));
+		:  "r" (x));
 
 	return x;
 }
-- 
cgit v0.10.2


From 82afae6016b672acb90ceb8e773bba0bd977d2a3 Mon Sep 17 00:00:00 2001
From: Erdem Aktas <eaktas1@gmail.com>
Date: Sun, 10 May 2009 02:13:19 -0400
Subject: perf_counter tools: fix buffer overwrite problem for perf top command

There is a buffer overwrite problem in builtin-top.c line 526, When I
tried to use ./perf top command, it was giving memory corruption
problem.

[ Impact: fix 'perf top' crash ]

LKML-Reference: <3fee128b0905092313x608e65e0l7b1116d86914114f@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index cd6f61d..b1549dd 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -523,7 +523,7 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
 		return 1;
 
-	s->sym = malloc(strlen(str));
+	s->sym = malloc(strlen(str)+1);
 	assert(s->sym);
 
 	strcpy((char *)s->sym, str);
-- 
cgit v0.10.2


From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 May 2009 13:55:22 +0100
Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against
 ptrace

Rename cred_exec_mutex to reflect that it's a guard against foreign
intervention on a process's credential state, such as is made by ptrace().  The
attachment of a debugger to a process affects execve()'s calculation of the new
credential state - _and_ also setprocattr()'s calculation of that state.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/compat.c b/fs/compat.c
index 681ed81..bb2a9b2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&current->cred_guard_mutex);
 	if (retval < 0)
 		goto out_free;
 	current->in_execve = 1;
@@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1573,7 +1573,7 @@ out_unmark:
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/fs/exec.c b/fs/exec.c
index 639177b..998e856 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm)
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
 
-	/* cred_exec_mutex must be held at least to this point to prevent
+	/* cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
 	 * credentials; any time after this it may be unlocked */
 
@@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1297,7 +1297,7 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&current->cred_guard_mutex);
 	if (retval < 0)
 		goto out_free;
 	current->in_execve = 1;
@@ -1360,7 +1360,7 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1383,7 +1383,7 @@ out_unmark:
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d..7f54ba9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -145,8 +145,8 @@ extern struct cred init_cred;
 	.group_leader	= &tsk,						\
 	.real_cred	= &init_cred,					\
 	.cred		= &init_cred,					\
-	.cred_exec_mutex =						\
-		 __MUTEX_INITIALIZER(tsk.cred_exec_mutex),		\
+	.cred_guard_mutex =						\
+		 __MUTEX_INITIALIZER(tsk.cred_guard_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3fa82b3..5932ace 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1247,7 +1247,9 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a03918..1bb4d7e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
 
 /*
  * Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold current->cred_guard_mutex
  */
 struct cred *prepare_exec_creds(void)
 {
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
-	mutex_init(&p->cred_exec_mutex);
+	mutex_init(&p->cred_guard_mutex);
 
 	if (
 #ifdef CONFIG_KEYS
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0692ab5..27ac802 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,10 +185,11 @@ int ptrace_attach(struct task_struct *task)
 	if (same_thread_group(task, current))
 		goto out;
 
-	/* Protect exec's credential calculations against our interference;
-	 * SUID, SGID and LSM creds get determined differently under ptrace.
+	/* Protect the target's credential calculations against our
+	 * interference; SUID, SGID and LSM creds get determined differently
+	 * under ptrace.
 	 */
-	retval = mutex_lock_interruptible(&task->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&task->cred_guard_mutex);
 	if (retval  < 0)
 		goto out;
 
@@ -232,7 +233,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
-	mutex_unlock(&task->cred_exec_mutex);
+	mutex_unlock(&task->cred_guard_mutex);
 out:
 	return retval;
 }
-- 
cgit v0.10.2


From 107db7c7dd137aeb7361b8c2606ac936c0be58ff Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 May 2009 13:55:27 +0100
Subject: CRED: Guard the setprocattr security hook against ptrace

Guard the setprocattr security hook against ptrace by taking the target task's
cred_guard_mutex around it.  The problem is that setprocattr() may otherwise
note the lack of a debugger, and then perform an action on that basis whilst
letting a debugger attach between the two points.  Holding cred_guard_mutex
across the test and the action prevents ptrace_attach() from doing that.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fb45615..23342e1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (copy_from_user(page, buf, count))
 		goto out_free;
 
+	/* Guard against adverse ptrace interaction */
+	length = mutex_lock_interruptible(&task->cred_guard_mutex);
+	if (length < 0)
+		goto out_free;
+
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
 				      (void*)page, count);
+	mutex_unlock(&task->cred_guard_mutex);
 out_free:
 	free_page((unsigned long) page);
 out:
-- 
cgit v0.10.2


From d9d674e50007362567edb5df65c05dd56a5964bf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 11 May 2009 12:12:38 +0900
Subject: sh: Fix up typo in arch/sh/kernel/vmlinux.lds.S

.init_ramfs ought to be .init.ramfs, fix it up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 1a2c8db..d737230 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -127,7 +127,7 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
 		__initramfs_start = .;
 		*(.init.ramfs)
 		__initramfs_end = .;
-- 
cgit v0.10.2


From 780f98ff1fa9cfcab177f6b5ab09b11321f1e5c8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 11 May 2009 12:15:14 +0900
Subject: sh: Account for INITIAL_JIFFIES when using jiffies clocksource.

In the case where we fall back on the generic jiffies clocksource,
INITIAL_JIFFIES needs to be accounted for so that printk times aren't
completely skewed.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index e0aa769..a77838f 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -104,7 +104,7 @@ unsigned long long sched_clock(void)
 
 	/* jiffies based sched_clock if no clocksource is installed */
 	if (!clocksource_sh.rating)
-		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+		return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
 
 	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
-- 
cgit v0.10.2


From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 6 May 2009 08:06:44 -0700
Subject: x86, e820, pci: reserve extra free space near end of RAM

The point is to take all RAM resources we have, and
_after_ we've added all the resources we've seen in
the E820 tree, we then _also_ try to add fake reserved
entries for any "round up to X" at the end of the RAM
resources.

[ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ]

Reported-by: Yannick Roehlly <yannick.roehlly@free.fr>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: yannick.roehlly@free.fr
LKML-Reference: <4A01A784.2050407@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0062813..a2335d9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+	unsigned long mb = pos >> 20;
+
+	/* To 64kB in the first megabyte */
+	if (!mb)
+		return 64*1024;
+
+	/* To 1MB in the first 16MB */
+	if (mb < 16)
+		return 1024*1024;
+
+	/* To 32MB for anything above that */
+	return 32*1024*1024;
+}
+
 void __init e820_reserve_resources_late(void)
 {
 	int i;
@@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void)
 			insert_resource_expand_to_fit(&iomem_resource, res);
 		res++;
 	}
+
+	/*
+	 * Try to bump up RAM regions to reasonable boundaries to
+	 * avoid stolen RAM:
+	 */
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *entry = &e820_saved.map[i];
+		resource_size_t start, end;
+
+		if (entry->type != E820_RAM)
+			continue;
+		start = entry->addr + entry->size;
+		end = round_up(start, ram_alignment(start));
+		if (start == end)
+			continue;
+		reserve_region_with_split(&iomem_resource, start,
+						  end - 1, "RAM buffer");
+	}
 }
 
 char *__init default_machine_specific_memory_setup(void)
-- 
cgit v0.10.2


From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 08:07:52 -0700
Subject: x86/pci: remove rounding quirk from e820_setup_gap()

Now that the e820 code explicitly reserves 'potentially dangerous'
free physical memory address space to protect ACPI stolen RAM,
there's no need for the rounding quirk in the PCI allocator anymore.

Also, this quirk was open-ended iteration that could end up reserving
a lot of free space and potentially breaking drivers - such as the one
reported by Yannick Roehlly <yannick.roehlly@free.fr> where there's
a PCI device with a large memory resource.

So remove it.

[ Impact: make more of the PCI hole available for assigning pci devices ]

Reported-by: Yannick Roehlly <yannick.roehlly@free.fr>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01A7C8.5090701@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a2335d9..7271fa3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
  */
 __init void e820_setup_gap(void)
 {
-	unsigned long gapstart, gapsize, round;
+	unsigned long gapstart, gapsize;
 	int found;
 
 	gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
 #endif
 
 	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
+	 * e820_reserve_resources_late protect stolen RAM already
 	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
+	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
 	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-- 
cgit v0.10.2


From 53d6979ab6747e758207e8ac861b96d0da0d3332 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:35 +0900
Subject: nbd: don't clear rq->sector and nr_sectors unnecessarily

There's no reason to clear rq->sector and nr_sectors after calling
blk_rq_init().  They're guaranteed to be clear.  Drop unnecessary
clearing.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4d6de4f..a9ab8be 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -580,13 +580,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		blk_rq_init(NULL, &sreq);
 		sreq.cmd_type = REQ_TYPE_SPECIAL;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		/*
-		 * Set these to sane values in case server implementation
-		 * fails to check the request type first and also to keep
-		 * debugging output cleaner.
-		 */
-		sreq.sector = 0;
-		sreq.nr_sectors = 0;
 		if (!lo->sock)
 			return -EINVAL;
 		nbd_send_req(lo, &sreq);
-- 
cgit v0.10.2


From 9720aef2539c10e3a872e9a92beec225030d99db Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:36 +0900
Subject: ide-tape: don't initialize rq->sector for rw requests

rq->sector is set to the tape->first_frame but it's never actually
used and not even in the correct unit (512 byte sectors).  Don't set
it.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Borislav Petkov <petkovbb@gmail.com>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8dfc688..7149224 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -892,7 +892,6 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->sector = tape->first_frame;
 
 	if (size) {
 		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-- 
cgit v0.10.2


From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:37 +0900
Subject: block: add rq->resid_len

rq->data_len served two purposes - the length of data buffer on issue
and the residual count on completion.  This duality creates some
headaches.

First of all, block layer and low level drivers can't really determine
what rq->data_len contains while a request is executing.  It could be
the total request length or it coulde be anything else one of the
lower layers is using to keep track of residual count.  This
complicates things because blk_rq_bytes() and thus
[__]blk_end_request_all() relies on rq->data_len for PC commands.
Drivers which want to report residual count should first cache the
total request length, update rq->data_len and then complete the
request with the cached data length.

Secondly, it makes requests default to reporting full residual count,
ie. reporting that no data transfer occurred.  The residual count is
an exception not the norm; however, the driver should clear
rq->data_len to zero to signify the normal cases while leaving it
alone means no data transfer occurred at all.  This reverse default
behavior complicates code unnecessarily and renders block PC on some
drivers (ide-tape/floppy) unuseable.

This patch adds rq->resid_len which is used only for residual count.

While at it, remove now unnecessasry blk_rq_bytes() caching in
ide_pc_intr() as rq->data_len is not changed anymore.

Boaz	: spotted missing conversion in osd
Sergei	: spotted too early conversion to blk_rq_bytes() in ide-tape

[ Impact: cleanup residual count handling, report 0 resid by default ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Doug Gilbert <dgilbert@interlog.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/bsg.c b/block/bsg.c
index 206060e..2d746e3 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -445,14 +445,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	}
 
 	if (rq->next_rq) {
-		hdr->dout_resid = rq->data_len;
-		hdr->din_resid = rq->next_rq->data_len;
+		hdr->dout_resid = rq->resid_len;
+		hdr->din_resid = rq->next_rq->resid_len;
 		blk_rq_unmap_user(bidi_bio);
 		blk_put_request(rq->next_rq);
 	} else if (rq_data_dir(rq) == READ)
-		hdr->din_resid = rq->data_len;
+		hdr->din_resid = rq->resid_len;
 	else
-		hdr->dout_resid = rq->data_len;
+		hdr->dout_resid = rq->resid_len;
 
 	/*
 	 * If the request generated a negative error number, return it
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 58cf456..a9670dd 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -230,7 +230,7 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 	hdr->info = 0;
 	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
 		hdr->info |= SG_INFO_CHECK;
-	hdr->resid = rq->data_len;
+	hdr->resid = rq->resid_len;
 	hdr->sb_len_wr = 0;
 
 	if (rq->sense_len && hdr->sbp) {
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4d4d5e0..f22d493 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1299,7 +1299,6 @@ static void cciss_softirq_done(struct request *rq)
 {
 	CommandList_struct *cmd = rq->completion_data;
 	ctlr_info_t *h = hba[cmd->ctlr];
-	unsigned int nr_bytes;
 	unsigned long flags;
 	u64bit temp64;
 	int i, ddir;
@@ -1321,15 +1320,11 @@ static void cciss_softirq_done(struct request *rq)
 	printk("Done with %p\n", rq);
 #endif				/* CCISS_DEBUG */
 
-	/*
-	 * Store the full size and set the residual count for pc requests
-	 */
-	nr_bytes = blk_rq_bytes(rq);
+	/* set the residual count for pc requests */
 	if (blk_pc_request(rq))
-		rq->data_len = cmd->err_info->ResidualCnt;
+		rq->resid_len = cmd->err_info->ResidualCnt;
 
-	if (blk_end_request(rq, (rq->errors == 0) ? 0 : -EIO, nr_bytes))
-		BUG();
+	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
 
 	spin_lock_irqsave(&h->lock, flags);
 	cmd_free(h, cmd, 1);
@@ -2691,7 +2686,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 			printk(KERN_WARNING "cciss: cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
-			cmd->rq->data_len = cmd->err_info->ResidualCnt;
+			cmd->rq->resid_len = cmd->err_info->ResidualCnt;
 		}
 		break;
 	case CMD_DATA_OVERRUN:
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 689cd27..8c2cc71 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -783,10 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len >= rq->data_len)
-				rq->data_len = 0;
-			else
-				rq->data_len -= cmd->act_len;
+			if (cmd->act_len < rq->data_len)
+				rq->resid_len = rq->data_len - cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index afe5a43..e4a02a0 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -367,7 +367,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	/* No more interrupts */
 	if ((stat & ATA_DRQ) == 0) {
 		int uptodate, error;
-		unsigned int done;
 
 		debug_log("Packet command completed, %d bytes transferred\n",
 			  pc->xferred);
@@ -406,12 +405,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
-		/*
-		 * ->pc_callback() might change rq->data_len for
-		 * residual count, cache total length.
-		 */
-		done = blk_rq_bytes(rq);
-
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -431,7 +424,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			error = uptodate ? 0 : -EIO;
 		}
 
-		ide_complete_rq(drive, error, done);
+		ide_complete_rq(drive, error, blk_rq_bytes(rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 6736287..8bbe222 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -519,7 +519,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
 
 		if (buffer)
-			*bufflen = rq->data_len;
+			*bufflen = rq->resid_len;
 
 		flags = rq->cmd_flags;
 		blk_put_request(rq);
@@ -707,11 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
-		unsigned int dlen = rq->data_len;
-
-		rq->data_len = 0;
-
-		if (blk_end_request(rq, 0, dlen))
+		if (blk_end_request(rq, 0, rq->data_len))
 			BUG();
 
 		hwif->rq = NULL;
@@ -740,9 +736,10 @@ out_end:
 			nsectors = 1;
 
 		if (blk_fs_request(rq) == 0) {
-			rq->data_len -= (cmd->nbytes - cmd->nleft);
+			rq->resid_len = rq->data_len -
+				(cmd->nbytes - cmd->nleft);
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
-				rq->data_len += cmd->last_xfer_len;
+				rq->resid_len += cmd->last_xfer_len;
 		}
 
 		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 7149224..65c5b70 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->data_len -= blocks * tape->blk_size;
+		rq->resid_len = rq->data_len - blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -903,7 +903,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
 	/* calculate the number of transferred bytes and update buffer state */
-	size -= rq->data_len;
+	size -= rq->resid_len;
 	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
 		tape->valid = size;
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index a9019f0..5d5f347 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1357,8 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		req->data_len = 0;
-		rsp->data_len -= smprep->ResponseDataLength;
+		rsp->resid_len = rsp->data_len - smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 3da02e4..6605ec9 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1936,12 +1936,8 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 			       bio_data(rsp->bio), rsp->data_len);
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
-		rsp->data_len = ret;
-		req->data_len = 0;
+		rsp->resid_len = ret;
 		ret = 0;
-	} else if (ret == 0) {
-		rsp->data_len = 0;
-		req->data_len = 0;
 	}
 
 	return ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index d110a36..89952ed 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -134,7 +134,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 {
 	u8 *req_data = NULL, *resp_data = NULL, *buf;
 	struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost);
-	int error = -EINVAL, resp_data_len = rsp->data_len;
+	int error = -EINVAL;
 
 	/* eight is the minimum size for request and response frames */
 	if (req->data_len < 8 || rsp->data_len < 8)
@@ -176,17 +176,20 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
+	req->resid_len = req->data_len;
+	rsp->resid_len = rsp->data_len;
+
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
-		req->data_len -= 8;
-		resp_data_len -= 32;
+		req->resid_len -= 8;
+		rsp->resid_len -= 32;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		resp_data[9] = sas_ha->num_phys;
 		break;
 
 	case SMP_REPORT_MANUF_INFO:
-		req->data_len -= 8;
-		resp_data_len -= 64;
+		req->resid_len -= 8;
+		rsp->resid_len -= 64;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		memcpy(resp_data + 12, shost->hostt->name,
 		       SAS_EXPANDER_VENDOR_ID_LEN);
@@ -199,13 +202,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_DISCOVER:
-		req->data_len -= 16;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 16;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 56;
+		rsp->resid_len -= 56;
 		sas_host_smp_discover(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -215,13 +218,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_REPORT_PHY_SATA:
-		req->data_len -= 16;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 16;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 60;
+		rsp->resid_len -= 60;
 		sas_report_phy_sata(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -238,13 +241,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_PHY_CONTROL:
-		req->data_len -= 44;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 44;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 8;
+		rsp->resid_len -= 8;
 		sas_phy_control(sas_ha, req_data[9], req_data[10],
 				req_data[32] >> 4, req_data[33] >> 4,
 				resp_data);
@@ -265,7 +268,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	flush_kernel_dcache_page(bio_page(rsp->bio));
 	kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0);
 	local_irq_enable();
-	rsp->data_len = resp_data_len;
 
  out:
 	kfree(req_data);
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index e03dc0b..53759c5 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1170,9 +1170,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		req->data_len = 0;
-		rsp->data_len -= mpi_reply->ResponseDataLength;
-
+		rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index aa9fc57..7d49ef5 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -240,11 +240,11 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	 * is invalid.  Prevent the garbage from being misinterpreted
 	 * and prevent security leaks by zeroing out the excess data.
 	 */
-	if (unlikely(req->data_len > 0 && req->data_len <= bufflen))
-		memset(buffer + (bufflen - req->data_len), 0, req->data_len);
+	if (unlikely(req->resid_len > 0 && req->resid_len <= bufflen))
+		memset(buffer + (bufflen - req->resid_len), 0, req->resid_len);
 
 	if (resid)
-		*resid = req->data_len;
+		*resid = req->resid_len;
 	ret = req->errors;
  out:
 	blk_put_request(req);
@@ -549,7 +549,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 		int leftover = (req->hard_nr_sectors << 9);
 
 		if (blk_pc_request(req))
-			leftover = req->data_len;
+			leftover = req->resid_len;
 
 		/* kill remainder if no retrys */
 		if (error && scsi_noretry_cmd(cmd))
@@ -673,11 +673,11 @@ void scsi_release_buffers(struct scsi_cmnd *cmd)
 EXPORT_SYMBOL(scsi_release_buffers);
 
 /*
- * Bidi commands Must be complete as a whole, both sides at once.
- * If part of the bytes were written and lld returned
- * scsi_in()->resid and/or scsi_out()->resid this information will be left
- * in req->data_len and req->next_rq->data_len. The upper-layer driver can
- * decide what to do with this information.
+ * Bidi commands Must be complete as a whole, both sides at once.  If
+ * part of the bytes were written and lld returned scsi_in()->resid
+ * and/or scsi_out()->resid this information will be left in
+ * req->resid_len and req->next_rq->resid_len. The upper-layer driver
+ * can decide what to do with this information.
  */
 static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 {
@@ -685,8 +685,8 @@ static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 	unsigned int dlen = req->data_len;
 	unsigned int next_dlen = req->next_rq->data_len;
 
-	req->data_len = scsi_out(cmd)->resid;
-	req->next_rq->data_len = scsi_in(cmd)->resid;
+	req->resid_len = scsi_out(cmd)->resid;
+	req->next_rq->resid_len = scsi_in(cmd)->resid;
 
 	/* The req and req->next_rq have not been completed */
 	BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
@@ -778,7 +778,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			scsi_end_bidi_request(cmd);
 			return;
 		}
-		req->data_len = scsi_get_resid(cmd);
+		req->resid_len = scsi_get_resid(cmd);
 	}
 
 	BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 82312df..dec4c70 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1260,7 +1260,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 
 	sense = rq->sense;
 	result = rq->errors;
-	resid = rq->data_len;
+	resid = rq->resid_len;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_cmd_done: %s, pack_id=%d, res=0x%x\n",
 		sdp->disk->disk_name, srp->header.pack_id, result));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index eb24efe..8681b7083 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -463,7 +463,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
 	struct scsi_tape *STp = SRpnt->stp;
 
 	STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
-	STp->buffer->cmdstat.residual = req->data_len;
+	STp->buffer->cmdstat.residual = req->resid_len;
 
 	if (SRpnt->waiting)
 		complete(SRpnt->waiting);
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae9..06ca926 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
 
 	/* FIXME: should be include in osd_sense_info */
 	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->data_len : 0;
+		*in_resid = or->in.req ? or->in.req->resid_len : 0;
 
 	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->data_len : 0;
+		*out_resid = or->out.req ? or->out.req->resid_len : 0;
 
 	return ret;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3a5b1bd..6a967ca 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -229,6 +229,7 @@ struct request {
 	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
+	unsigned int resid_len;	/* residual count */
 	void *sense;
 
 	unsigned long deadline;
-- 
cgit v0.10.2


From 5b93629b4509c03ffa87a9316412fedf6f58cb37 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:38 +0900
Subject: block: implement blk_rq_pos/[cur_]sectors() and convert obvious ones

Implement accessors - blk_rq_pos(), blk_rq_sectors() and
blk_rq_cur_sectors() which return rq->hard_sector, rq->hard_nr_sectors
and rq->hard_cur_sectors respectively and convert direct references of
the said fields to the accessors.

This is in preparation of request data length handling cleanup.

Geert	: suggested adding const to struct request * parameter to accessors
Sergei	: spotted error in patch description

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Tested-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Ackec-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c8d0876..c167de5 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -163,7 +163,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * For an empty barrier, there's no actual BAR request, which
 	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
-	if (!rq->hard_nr_sectors) {
+	if (!blk_rq_sectors(rq)) {
 		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 				QUEUE_ORDERED_DO_POSTFLUSH);
 		/*
diff --git a/block/blk-core.c b/block/blk-core.c
index 394c5bd..895e55b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1683,7 +1683,7 @@ static void blk_account_io_done(struct request *req)
 unsigned int blk_rq_bytes(struct request *rq)
 {
 	if (blk_fs_request(rq))
-		return rq->hard_nr_sectors << 9;
+		return blk_rq_sectors(rq) << 9;
 
 	return rq->data_len;
 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index def0c69..575083a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -760,7 +760,7 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						cfqd->rq_in_driver);
 
-	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
+	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
 
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index f6586e4..c238867 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -136,7 +136,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core,
 		"%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n",
 		__func__, __LINE__, op, n, req->nr_sectors,
-		req->hard_nr_sectors);
+		blk_rq_sectors(req));
 #endif
 
 	start_sector = req->sector * priv->blocking_factor;
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index ecccf65..e821eed 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -368,12 +368,12 @@ static void do_viodasd_request(struct request_queue *q)
 		blkdev_dequeue_request(req);
 		/* check that request contains a valid command */
 		if (!blk_fs_request(req)) {
-			viodasd_end_request(req, -EIO, req->hard_nr_sectors);
+			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
 			continue;
 		}
 		/* Try sending the request */
 		if (send_request(req) != 0)
-			viodasd_end_request(req, -EIO, req->hard_nr_sectors);
+			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
 	}
 }
 
@@ -590,7 +590,7 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
 		err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
 		printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n",
 				event->xRc, bevent->sub_result, err->msg);
-		num_sect = req->hard_nr_sectors;
+		num_sect = blk_rq_sectors(req);
 	}
 	qlock = req->q->queue_lock;
 	spin_lock_irqsave(qlock, irq_flags);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index b1e1d7e..5722931 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -645,8 +645,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
-			"request: sec=%llx hcnt=%lx, ccnt=%x, dir=%i\n",
-			(unsigned long long) req->sector, req->hard_nr_sectors,
+			"request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n",
+			(unsigned long long) req->sector, blk_rq_sectors(req),
 			req->current_nr_sectors, rq_data_dir(req));
 
 		ace->req = req;
@@ -654,7 +654,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR;
 		ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF);
 
-		count = req->hard_nr_sectors;
+		count = blk_rq_sectors(req);
 		if (rq_data_dir(req)) {
 			/* Kick off write request */
 			dev_dbg(ace->dev, "write data\n");
@@ -719,8 +719,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		/* bio finished; is there another one? */
 		if (__blk_end_request(ace->req, 0,
 					blk_rq_cur_bytes(ace->req))) {
-			/* dev_dbg(ace->dev, "next block; h=%li c=%i\n",
-			 *      ace->req->hard_nr_sectors,
+			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
+			 *      blk_rq_sectors(ace->req),
 			 *      ace->req->current_nr_sectors);
 			 */
 			ace->data_ptr = ace->req->buffer;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 8bbe222..182320d 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -730,7 +730,7 @@ out_end:
 		if (blk_pc_request(rq))
 			nsectors = (rq->data_len + 511) >> 9;
 		else
-			nsectors = rq->hard_nr_sectors;
+			nsectors = blk_rq_sectors(rq);
 
 		if (nsectors == 0)
 			nsectors = 1;
@@ -875,7 +875,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	return ide_issue_pc(drive, &cmd);
 out_end:
-	nsectors = rq->hard_nr_sectors;
+	nsectors = blk_rq_sectors(rq);
 
 	if (nsectors == 0)
 		nsectors = 1;
@@ -1359,8 +1359,8 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
 	int hard_sect = queue_hardsect_size(q);
-	long block = (long)rq->hard_sector / (hard_sect >> 9);
-	unsigned long blocks = rq->hard_nr_sectors / (hard_sect >> 9);
+	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
+	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
 
 	memset(rq->cmd, 0, BLK_MAX_CDB);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index a0309ea..df23bcb 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -118,7 +118,7 @@ unsigned int ide_rq_bytes(struct request *rq)
 	if (blk_pc_request(rq))
 		return rq->data_len;
 	else
-		return rq->hard_cur_sectors << 9;
+		return blk_rq_cur_sectors(rq) << 9;
 }
 EXPORT_SYMBOL_GPL(ide_rq_bytes);
 
@@ -133,7 +133,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes)
 	 * and complete the whole request right now
 	 */
 	if (blk_noretry_request(rq) && error <= 0)
-		nr_bytes = rq->hard_nr_sectors << 9;
+		nr_bytes = blk_rq_sectors(rq) << 9;
 
 	rc = ide_end_rq(drive, rq, error, nr_bytes);
 	if (rc == 0)
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 221317e..56e60f0 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -427,7 +427,7 @@ static void i2o_block_end_request(struct request *req, int error,
 	unsigned long flags;
 
 	if (blk_end_request(req, error, nr_bytes)) {
-		int leftover = (req->hard_nr_sectors << KERNEL_SECTOR_SHIFT);
+		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 
 		if (blk_pc_request(req))
 			leftover = req->data_len;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7d49ef5..9ff0ca9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = (req->hard_nr_sectors << 9);
+		int leftover = blk_rq_sectors(req) << 9;
 
 		if (blk_pc_request(req))
 			leftover = req->resid_len;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6a967ca..4e5f855 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -832,13 +832,30 @@ static inline void blk_run_address_space(struct address_space *mapping)
 extern void blkdev_dequeue_request(struct request *req);
 
 /*
- * blk_end_request() takes bytes instead of sectors as a complete size.
- * blk_rq_bytes() returns bytes left to complete in the entire request.
- * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
+ * blk_rq_pos()		: the current sector
+ * blk_rq_bytes()	: bytes left in the entire request
+ * blk_rq_cur_bytes()	: bytes left in the current segment
+ * blk_rq_sectors()	: sectors left in the entire request
+ * blk_rq_cur_sectors()	: sectors left in the current segment
  */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+	return rq->hard_sector;
+}
+
 extern unsigned int blk_rq_bytes(struct request *rq);
 extern unsigned int blk_rq_cur_bytes(struct request *rq);
 
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+	return rq->hard_nr_sectors;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+	return rq->hard_cur_sectors;
+}
+
 /*
  * Request completion related functions.
  *
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d..42f1c11 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -646,7 +646,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 				rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
 				rw, what, rq->errors, 0, NULL);
 	}
 }
@@ -857,7 +857,7 @@ void blk_add_driver_data(struct request_queue *q,
 		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
 				rq->errors, len, data);
 	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
 				0, BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
-- 
cgit v0.10.2


From 83096ebf1263b2c1ee5e653ba37d993d02e3eb7b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:39 +0900
Subject: block: convert to pos and nr_sectors accessors

With recent cleanups, there is no place where low level driver
directly manipulates request fields.  This means that the 'hard'
request fields always equal the !hard fields.  Convert all
rq->sectors, nr_sectors and current_nr_sectors references to
accessors.

While at it, drop superflous blk_rq_pos() < 0 test in swim.c.

[ Impact: use pos and nr_sectors accessors ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Tested-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Tested-by: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Acked-by: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Acked-by: Mike Miller <mike.miller@hp.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Dario Ballabio <ballabio_dario@emc.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: unsik Kim <donari75@gmail.com>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 4330127..402ba8f 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1239,7 +1239,7 @@ static void do_ubd_request(struct request_queue *q)
 		}
 
 		req = dev->request;
-		sector = req->sector;
+		sector = blk_rq_pos(req);
 		while(dev->start_sg < dev->end_sg){
 			struct scatterlist *sg = &dev->sg[dev->start_sg];
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 45bd070..7a12cf6 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -306,8 +306,8 @@ as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
 	data_dir = rq_is_sync(rq1);
 
 	last = ad->last_sector[data_dir];
-	s1 = rq1->sector;
-	s2 = rq2->sector;
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
 
 	BUG_ON(data_dir != rq_is_sync(rq2));
 
@@ -566,13 +566,15 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
 			as_update_thinktime(ad, aic, thinktime);
 
 			/* Calculate read -> read seek distance */
-			if (aic->last_request_pos < rq->sector)
-				seek_dist = rq->sector - aic->last_request_pos;
+			if (aic->last_request_pos < blk_rq_pos(rq))
+				seek_dist = blk_rq_pos(rq) -
+					    aic->last_request_pos;
 			else
-				seek_dist = aic->last_request_pos - rq->sector;
+				seek_dist = aic->last_request_pos -
+					    blk_rq_pos(rq);
 			as_update_seekdist(ad, aic, seek_dist);
 		}
-		aic->last_request_pos = rq->sector + rq->nr_sectors;
+		aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 		set_bit(AS_TASK_IOSTARTED, &aic->state);
 		spin_unlock(&aic->lock);
 	}
@@ -587,7 +589,7 @@ static int as_close_req(struct as_data *ad, struct as_io_context *aic,
 {
 	unsigned long delay;	/* jiffies */
 	sector_t last = ad->last_sector[ad->batch_data_dir];
-	sector_t next = rq->sector;
+	sector_t next = blk_rq_pos(rq);
 	sector_t delta; /* acceptable close offset (in sectors) */
 	sector_t s;
 
@@ -981,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
 	 * This has to be set in order to be correctly updated by
 	 * as_find_next_rq
 	 */
-	ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
+	ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (data_dir == BLK_RW_SYNC) {
 		struct io_context *ioc = RQ_IOC(rq);
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c167de5..8713c2f 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -324,7 +324,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	/*
 	 * The driver must store the error location in ->bi_sector, if
 	 * it supports it. For non-stacked drivers, this should be copied
-	 * from rq->sector.
+	 * from blk_rq_pos(rq).
 	 */
 	if (error_sector)
 		*error_sector = bio->bi_sector;
diff --git a/block/blk-core.c b/block/blk-core.c
index 895e55b..82dc206 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -72,7 +72,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
+	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
 	if (!new_io)
 		part_stat_inc(cpu, part, merges[rw]);
@@ -185,10 +185,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
 		rq->cmd_flags);
 
-	printk(KERN_INFO "  sector %llu, nr/cnr %lu/%u\n",
-						(unsigned long long)rq->sector,
-						rq->nr_sectors,
-						rq->current_nr_sectors);
+	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
+	       (unsigned long long)blk_rq_pos(rq),
+	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 						rq->bio, rq->biotail,
 						rq->buffer, rq->data_len);
@@ -1557,7 +1556,7 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-	if (rq->nr_sectors > q->max_sectors ||
+	if (blk_rq_sectors(rq) > q->max_sectors ||
 	    rq->data_len > q->max_hw_sectors << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
@@ -1645,7 +1644,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1665,7 +1664,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
@@ -1846,7 +1845,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
 		printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
 				req->rq_disk ? req->rq_disk->disk_name : "?",
-				(unsigned long long)req->sector);
+				(unsigned long long)blk_rq_pos(req));
 	}
 
 	blk_account_io_completion(req, nr_bytes);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 23d2a6f..bf62a87 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -259,7 +259,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	else
 		max_sectors = q->max_sectors;
 
-	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -284,7 +284,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		max_sectors = q->max_sectors;
 
 
-	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -315,7 +315,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	/*
 	 * Will it become too large?
 	 */
-	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
+	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors)
 		return 0;
 
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -345,7 +345,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part);
@@ -366,7 +366,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	/*
 	 * not contiguous
 	 */
-	if (req->sector + req->nr_sectors != next->sector)
+	if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
 		return 0;
 
 	if (rq_data_dir(req) != rq_data_dir(next)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 575083a..db4d990 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -349,8 +349,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
 		return rq2;
 
-	s1 = rq1->sector;
-	s2 = rq2->sector;
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
 
 	last = cfqd->last_position;
 
@@ -949,10 +949,10 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 					  struct request *rq)
 {
-	if (rq->sector >= cfqd->last_position)
-		return rq->sector - cfqd->last_position;
+	if (blk_rq_pos(rq) >= cfqd->last_position)
+		return blk_rq_pos(rq) - cfqd->last_position;
 	else
-		return cfqd->last_position - rq->sector;
+		return cfqd->last_position - blk_rq_pos(rq);
 }
 
 #define CIC_SEEK_THR	8 * 1024
@@ -1918,10 +1918,10 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
 
 	if (!cic->last_request_pos)
 		sdist = 0;
-	else if (cic->last_request_pos < rq->sector)
-		sdist = rq->sector - cic->last_request_pos;
+	else if (cic->last_request_pos < blk_rq_pos(rq))
+		sdist = blk_rq_pos(rq) - cic->last_request_pos;
 	else
-		sdist = cic->last_request_pos - rq->sector;
+		sdist = cic->last_request_pos - blk_rq_pos(rq);
 
 	/*
 	 * Don't allow the seek distance to get too large from the
@@ -2071,7 +2071,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfq_update_io_seektime(cfqd, cic, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
-	cic->last_request_pos = rq->sector + rq->nr_sectors;
+	cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (cfqq == cfqd->active_queue) {
 		/*
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index c4d991d..b547cbc 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -138,7 +138,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
 
 		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
 		if (__rq) {
-			BUG_ON(sector != __rq->sector);
+			BUG_ON(sector != blk_rq_pos(__rq));
 
 			if (elv_rq_merge_ok(__rq, bio)) {
 				ret = ELEVATOR_FRONT_MERGE;
diff --git a/block/elevator.c b/block/elevator.c
index 1af5d9f..9189200 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -52,7 +52,7 @@ static const int elv_hash_shift = 6;
 #define ELV_HASH_FN(sec)	\
 		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
 #define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
-#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
+#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
 
 DEFINE_TRACE(block_rq_insert);
 DEFINE_TRACE(block_rq_issue);
@@ -119,9 +119,9 @@ static inline int elv_try_merge(struct request *__rq, struct bio *bio)
 	 * we can merge and sequence is ok, check if it's possible
 	 */
 	if (elv_rq_merge_ok(__rq, bio)) {
-		if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
+		if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
 			ret = ELEVATOR_BACK_MERGE;
-		else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
+		else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
 			ret = ELEVATOR_FRONT_MERGE;
 	}
 
@@ -370,9 +370,9 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq)
 		parent = *p;
 		__rq = rb_entry(parent, struct request, rb_node);
 
-		if (rq->sector < __rq->sector)
+		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
 			p = &(*p)->rb_left;
-		else if (rq->sector > __rq->sector)
+		else if (blk_rq_pos(rq) > blk_rq_pos(__rq))
 			p = &(*p)->rb_right;
 		else
 			return __rq;
@@ -400,9 +400,9 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
 	while (n) {
 		rq = rb_entry(n, struct request, rb_node);
 
-		if (sector < rq->sector)
+		if (sector < blk_rq_pos(rq))
 			n = n->rb_left;
-		else if (sector > rq->sector)
+		else if (sector > blk_rq_pos(rq))
 			n = n->rb_right;
 		else
 			return rq;
@@ -441,14 +441,14 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 			break;
 		if (pos->cmd_flags & stop_flags)
 			break;
-		if (rq->sector >= boundary) {
-			if (pos->sector < boundary)
+		if (blk_rq_pos(rq) >= boundary) {
+			if (blk_rq_pos(pos) < boundary)
 				continue;
 		} else {
-			if (pos->sector >= boundary)
+			if (blk_rq_pos(pos) >= boundary)
 				break;
 		}
-		if (rq->sector >= pos->sector)
+		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
 			break;
 	}
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f22ed6c..774ab05 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3338,8 +3338,8 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	}
 	Command->Completion = Request->end_io_data;
 	Command->LogicalDriveNumber = (long)Request->rq_disk->private_data;
-	Command->BlockNumber = Request->sector;
-	Command->BlockCount = Request->nr_sectors;
+	Command->BlockNumber = blk_rq_pos(Request);
+	Command->BlockCount = blk_rq_sectors(Request);
 	Command->Request = Request;
 	blkdev_dequeue_request(Request);
 	Command->SegmentCount = blk_rq_map_sg(req_q,
@@ -3431,7 +3431,7 @@ static void DAC960_queue_partial_rw(DAC960_Command_T *Command)
    * successfully as possible.
    */
   Command->SegmentCount = 1;
-  Command->BlockNumber = Request->sector;
+  Command->BlockNumber = blk_rq_pos(Request);
   Command->BlockCount = 1;
   DAC960_QueueReadWriteCommand(Command);
   return;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8ff95f2..e4a14b9 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1351,13 +1351,13 @@ static void redo_fd_request(void)
 	drive = floppy - unit;
 
 	/* Here someone could investigate to be more efficient */
-	for (cnt = 0; cnt < CURRENT->current_nr_sectors; cnt++) { 
+	for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) {
 #ifdef DEBUG
 		printk("fd: sector %ld + %d requested for %s\n",
-		       CURRENT->sector,cnt,
+		       blk_rq_pos(CURRENT), cnt,
 		       (rq_data_dir(CURRENT) == READ) ? "read" : "write");
 #endif
-		block = CURRENT->sector + cnt;
+		block = blk_rq_pos(CURRENT) + cnt;
 		if ((int)block > floppy->blocks) {
 			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 2506728..234024c 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -725,7 +725,7 @@ static void do_fd_action( int drive )
 	    if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
 		if (ReqCmd == READ) {
 		    copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
-		    if (++ReqCnt < CURRENT->current_nr_sectors) {
+		    if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
 			/* read next sector */
 			setup_req_params( drive );
 			goto repeat;
@@ -1130,7 +1130,7 @@ static void fd_rwsec_done1(int status)
 		}
 	}
   
-	if (++ReqCnt < CURRENT->current_nr_sectors) {
+	if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
 		/* read next sector */
 		setup_req_params( SelectedDrive );
 		do_fd_action( SelectedDrive );
@@ -1394,7 +1394,7 @@ static void redo_fd_request(void)
 
 	DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n",
 		CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "",
-		CURRENT ? CURRENT->sector : 0 ));
+		CURRENT ? blk_rq_pos(CURRENT) : 0 ));
 
 	IsFormatting = 0;
 
@@ -1440,7 +1440,7 @@ repeat:
 		UD.autoprobe = 0;
 	}
 	
-	if (CURRENT->sector + 1 > UDT->blocks) {
+	if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) {
 		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
@@ -1450,7 +1450,7 @@ repeat:
 		
 	ReqCnt = 0;
 	ReqCmd = rq_data_dir(CURRENT);
-	ReqBlock = CURRENT->sector;
+	ReqBlock = blk_rq_pos(CURRENT);
 	ReqBuffer = CURRENT->buffer;
 	setup_req_params( drive );
 	do_fd_action( drive );
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index f22d493..ab7b04c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2835,10 +2835,10 @@ static void do_cciss_request(struct request_queue *q)
 	c->Request.Timeout = 0;	// Don't time out
 	c->Request.CDB[0] =
 	    (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write;
-	start_blk = creq->sector;
+	start_blk = blk_rq_pos(creq);
 #ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n", (int)creq->sector,
-	       (int)creq->nr_sectors);
+	printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n",
+	       (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
 #endif				/* CCISS_DEBUG */
 
 	sg_init_table(tmp_sg, MAXSGENTRIES);
@@ -2864,8 +2864,8 @@ static void do_cciss_request(struct request_queue *q)
 		h->maxSG = seg;
 
 #ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: Submitting %lu sectors in %d segments\n",
-	       creq->nr_sectors, seg);
+	printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
+	       blk_rq_sectors(creq), seg);
 #endif				/* CCISS_DEBUG */
 
 	c->Header.SGList = c->Header.SGTotal = seg;
@@ -2877,8 +2877,8 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[4] = (start_blk >> 8) & 0xff;
 			c->Request.CDB[5] = start_blk & 0xff;
 			c->Request.CDB[6] = 0;	// (sect >> 24) & 0xff; MSB
-			c->Request.CDB[7] = (creq->nr_sectors >> 8) & 0xff;
-			c->Request.CDB[8] = creq->nr_sectors & 0xff;
+			c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff;
+			c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0;
 		} else {
 			u32 upper32 = upper_32_bits(start_blk);
@@ -2893,10 +2893,10 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[7]= (start_blk >> 16) & 0xff;
 			c->Request.CDB[8]= (start_blk >>  8) & 0xff;
 			c->Request.CDB[9]= start_blk & 0xff;
-			c->Request.CDB[10]= (creq->nr_sectors >>  24) & 0xff;
-			c->Request.CDB[11]= (creq->nr_sectors >>  16) & 0xff;
-			c->Request.CDB[12]= (creq->nr_sectors >>  8) & 0xff;
-			c->Request.CDB[13]= creq->nr_sectors & 0xff;
+			c->Request.CDB[10]= (blk_rq_sectors(creq) >> 24) & 0xff;
+			c->Request.CDB[11]= (blk_rq_sectors(creq) >> 16) & 0xff;
+			c->Request.CDB[12]= (blk_rq_sectors(creq) >>  8) & 0xff;
+			c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
 	} else if (blk_pc_request(creq)) {
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 488a8f4..a5caeff 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -919,10 +919,11 @@ queue_next:
 	c->hdr.size = sizeof(rblk_t) >> 2;
 	c->size += sizeof(rblk_t);
 
-	c->req.hdr.blk = creq->sector;
+	c->req.hdr.blk = blk_rq_pos(creq);
 	c->rq = creq;
 DBGPX(
-	printk("sector=%d, nr_sectors=%d\n", creq->sector, creq->nr_sectors);
+	printk("sector=%d, nr_sectors=%u\n",
+	       blk_rq_pos(creq), blk_rq_sectors(creq));
 );
 	sg_init_table(tmp_sg, SG_MAX);
 	seg = blk_rq_map_sg(q, creq, tmp_sg);
@@ -940,9 +941,9 @@ DBGPX(
 						 tmp_sg[i].offset,
 						 tmp_sg[i].length, dir);
 	}
-DBGPX(	printk("Submitting %d sectors in %d segments\n", creq->nr_sectors, seg); );
+DBGPX(	printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); );
 	c->req.hdr.sg_cnt = seg;
-	c->req.hdr.blk_cnt = creq->nr_sectors;
+	c->req.hdr.blk_cnt = blk_rq_sectors(creq);
 	c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE;
 	c->type = CMD_RWREQ;
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 1300df6..4524862 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2303,7 +2303,7 @@ static void floppy_end_request(struct request *req, int error)
 
 	/* current_count_sectors can be zero if transfer failed */
 	if (error)
-		nr_sectors = req->current_nr_sectors;
+		nr_sectors = blk_rq_cur_sectors(req);
 	if (__blk_end_request(req, error, nr_sectors << 9))
 		return;
 
@@ -2332,7 +2332,7 @@ static void request_done(int uptodate)
 	if (uptodate) {
 		/* maintain values for invalidation on geometry
 		 * change */
-		block = current_count_sectors + req->sector;
+		block = current_count_sectors + blk_rq_pos(req);
 		INFBOUND(DRS->maxblock, block);
 		if (block > _floppy->sect)
 			DRS->maxtrack = 1;
@@ -2346,10 +2346,10 @@ static void request_done(int uptodate)
 			/* record write error information */
 			DRWE->write_errors++;
 			if (DRWE->write_errors == 1) {
-				DRWE->first_error_sector = req->sector;
+				DRWE->first_error_sector = blk_rq_pos(req);
 				DRWE->first_error_generation = DRS->generation;
 			}
-			DRWE->last_error_sector = req->sector;
+			DRWE->last_error_sector = blk_rq_pos(req);
 			DRWE->last_error_generation = DRS->generation;
 		}
 		spin_lock_irqsave(q->queue_lock, flags);
@@ -2503,24 +2503,24 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	max_sector = transfer_size(ssize,
 				   min(max_sector, max_sector_2),
-				   current_req->nr_sectors);
+				   blk_rq_sectors(current_req));
 
 	if (current_count_sectors <= 0 && CT(COMMAND) == FD_WRITE &&
-	    buffer_max > fsector_t + current_req->nr_sectors)
+	    buffer_max > fsector_t + blk_rq_sectors(current_req))
 		current_count_sectors = min_t(int, buffer_max - fsector_t,
-					      current_req->nr_sectors);
+					      blk_rq_sectors(current_req));
 
 	remaining = current_count_sectors << 9;
 #ifdef FLOPPY_SANITY_CHECK
-	if ((remaining >> 9) > current_req->nr_sectors &&
+	if ((remaining >> 9) > blk_rq_sectors(current_req) &&
 	    CT(COMMAND) == FD_WRITE) {
 		DPRINT("in copy buffer\n");
 		printk("current_count_sectors=%ld\n", current_count_sectors);
 		printk("remaining=%d\n", remaining >> 9);
-		printk("current_req->nr_sectors=%ld\n",
-		       current_req->nr_sectors);
+		printk("current_req->nr_sectors=%u\n",
+		       blk_rq_sectors(current_req));
 		printk("current_req->current_nr_sectors=%u\n",
-		       current_req->current_nr_sectors);
+		       blk_rq_cur_sectors(current_req));
 		printk("max_sector=%d\n", max_sector);
 		printk("ssize=%d\n", ssize);
 	}
@@ -2530,7 +2530,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
 
-	size = current_req->current_nr_sectors << 9;
+	size = blk_rq_cur_sectors(current_req) << 9;
 
 	rq_for_each_segment(bv, current_req, iter) {
 		if (!remaining)
@@ -2648,10 +2648,10 @@ static int make_raw_rw_request(void)
 
 	max_sector = _floppy->sect * _floppy->head;
 
-	TRACK = (int)current_req->sector / max_sector;
-	fsector_t = (int)current_req->sector % max_sector;
+	TRACK = (int)blk_rq_pos(current_req) / max_sector;
+	fsector_t = (int)blk_rq_pos(current_req) % max_sector;
 	if (_floppy->track && TRACK >= _floppy->track) {
-		if (current_req->current_nr_sectors & 1) {
+		if (blk_rq_cur_sectors(current_req) & 1) {
 			current_count_sectors = 1;
 			return 1;
 		} else
@@ -2669,7 +2669,7 @@ static int make_raw_rw_request(void)
 		if (fsector_t >= max_sector) {
 			current_count_sectors =
 			    min_t(int, _floppy->sect - fsector_t,
-				  current_req->nr_sectors);
+				  blk_rq_sectors(current_req));
 			return 1;
 		}
 		SIZECODE = 2;
@@ -2720,7 +2720,7 @@ static int make_raw_rw_request(void)
 
 	in_sector_offset = (fsector_t % _floppy->sect) % ssize;
 	aligned_sector_t = fsector_t - in_sector_offset;
-	max_size = current_req->nr_sectors;
+	max_size = blk_rq_sectors(current_req);
 	if ((raw_cmd->track == buffer_track) &&
 	    (current_drive == buffer_drive) &&
 	    (fsector_t >= buffer_min) && (fsector_t < buffer_max)) {
@@ -2729,10 +2729,10 @@ static int make_raw_rw_request(void)
 			copy_buffer(1, max_sector, buffer_max);
 			return 1;
 		}
-	} else if (in_sector_offset || current_req->nr_sectors < ssize) {
+	} else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
 		if (CT(COMMAND) == FD_WRITE) {
-			if (fsector_t + current_req->nr_sectors > ssize &&
-			    fsector_t + current_req->nr_sectors < ssize + ssize)
+			if (fsector_t + blk_rq_sectors(current_req) > ssize &&
+			    fsector_t + blk_rq_sectors(current_req) < ssize + ssize)
 				max_size = ssize + ssize;
 			else
 				max_size = ssize;
@@ -2776,7 +2776,7 @@ static int make_raw_rw_request(void)
 		    (indirect * 2 > direct * 3 &&
 		     *errors < DP->max_errors.read_track && ((!probing
 		       || (DP->read_track & (1 << DRS->probed_format)))))) {
-			max_size = current_req->nr_sectors;
+			max_size = blk_rq_sectors(current_req);
 		} else {
 			raw_cmd->kernel_data = current_req->buffer;
 			raw_cmd->length = current_count_sectors << 9;
@@ -2801,7 +2801,7 @@ static int make_raw_rw_request(void)
 	    fsector_t > buffer_max ||
 	    fsector_t < buffer_min ||
 	    ((CT(COMMAND) == FD_READ ||
-	      (!in_sector_offset && current_req->nr_sectors >= ssize)) &&
+	      (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
 	     max_sector > 2 * max_buffer_sectors + buffer_min &&
 	     max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)
 	    /* not enough space */
@@ -2879,8 +2879,8 @@ static int make_raw_rw_request(void)
 				printk("write\n");
 			return 0;
 		}
-	} else if (raw_cmd->length > current_req->nr_sectors << 9 ||
-		   current_count_sectors > current_req->nr_sectors) {
+	} else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 ||
+		   current_count_sectors > blk_rq_sectors(current_req)) {
 		DPRINT("buffer overrun in direct transfer\n");
 		return 0;
 	} else if (raw_cmd->length < current_count_sectors << 9) {
@@ -2990,8 +2990,9 @@ static void do_fd_request(struct request_queue * q)
 	if (usage_count == 0) {
 		printk("warning: usage count=0, current_req=%p exiting\n",
 		       current_req);
-		printk("sect=%ld type=%x flags=%x\n", (long)current_req->sector,
-		       current_req->cmd_type, current_req->cmd_flags);
+		printk("sect=%ld type=%x flags=%x\n",
+		       (long)blk_rq_pos(current_req), current_req->cmd_type,
+		       current_req->cmd_flags);
 		return;
 	}
 	if (test_bit(0, &fdc_busy)) {
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 75b9ca9..a3b3994 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -228,7 +228,7 @@ static void dump_status(const char *msg, unsigned int stat)
 			printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
 				inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
 			if (CURRENT)
-				printk(", sector=%ld", CURRENT->sector);
+				printk(", sector=%ld", blk_rq_pos(CURRENT));
 		}
 		printk("\n");
 	}
@@ -457,9 +457,9 @@ ok_to_read:
 	req = CURRENT;
 	insw(HD_DATA, req->buffer, 256);
 #ifdef DEBUG
-	printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n",
-		req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1,
-		req->buffer+512);
+	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
+	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
+	       blk_rq_sectors(req) - 1, req->buffer+512);
 #endif
 	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&read_intr);
@@ -485,7 +485,7 @@ static void write_intr(void)
 			continue;
 		if (!OK_STATUS(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & DRQ_STAT))
+		if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
 			goto ok_to_write;
 	} while (--retries > 0);
 	dump_status("write_intr", i);
@@ -589,8 +589,8 @@ repeat:
 		return;
 	}
 	disk = req->rq_disk->private_data;
-	block = req->sector;
-	nsect = req->nr_sectors;
+	block = blk_rq_pos(req);
+	nsect = blk_rq_sectors(req);
 	if (block >= get_capacity(req->rq_disk) ||
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 71e56cc..826c349 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -220,7 +220,8 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
-					printk(", sector=%u", (u32)req->sector);
+					printk(", sector=%u",
+					       (unsigned int)blk_rq_pos(req));
 			}
 
 		}
@@ -493,12 +494,12 @@ static void mg_read(struct request *req)
 	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
-			MG_ERR_NONE)
+	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+		   MG_CMD_RD, NULL) != MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       req->nr_sectors, req->sector, req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
 
 	do {
 		u16 *buff = (u16 *)req->buffer;
@@ -522,14 +523,14 @@ static void mg_write(struct request *req)
 	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
-			MG_ERR_NONE) {
+	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+		   MG_CMD_WR, NULL) != MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
 	}
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       req->nr_sectors, req->sector, req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
 
 	do {
 		u16 *buff = (u16 *)req->buffer;
@@ -579,7 +580,7 @@ ok_to_read:
 			      (i << 1));
 
 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-			req->sector, req->nr_sectors - 1, req->buffer);
+	       blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
 
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -609,7 +610,7 @@ static void mg_write_intr(struct mg_host *host)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & ATA_DRQ))
+		if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
 			goto ok_to_write;
 	} while (0);
 	mg_dump_status("mg_write_intr", i, host);
@@ -627,7 +628,7 @@ ok_to_write:
 			buff++;
 		}
 		MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-				req->sector, req->nr_sectors, req->buffer);
+		       blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
 		host->mg_do_intr = mg_write_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 	}
@@ -749,9 +750,9 @@ static void mg_request(struct request_queue *q)
 
 		del_timer(&host->timer);
 
-		sect_num = req->sector;
+		sect_num = blk_rq_pos(req);
 		/* deal whole segments */
-		sect_cnt = req->nr_sectors;
+		sect_cnt = blk_rq_sectors(req);
 
 		/* sanity check */
 		if (sect_num >= get_capacity(req->rq_disk) ||
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a9ab8be..977a573 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req)
 			req, error ? "failed" : "done");
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_end_request(req, error, req->nr_sectors << 9);
+	__blk_end_request(req, error, blk_rq_sectors(req) << 9);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -231,19 +231,19 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
-	unsigned long size = req->nr_sectors << 9;
+	unsigned long size = blk_rq_sectors(req) << 9;
 
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
-	request.from = cpu_to_be64((u64) req->sector << 9);
+	request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 	request.len = htonl(size);
 	memcpy(request.handle, &req, sizeof(req));
 
-	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
+	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
-			(unsigned long long)req->sector << 9,
-			req->nr_sectors << 9);
+			(unsigned long long)blk_rq_pos(req) << 9,
+			blk_rq_sectors(req) << 9);
 	result = sock_xmit(lo, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 9fd57c2..2d5dc0a 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -728,8 +728,8 @@ static void do_pcd_request(struct request_queue * q)
 			if (cd != pcd_current)
 				pcd_bufblk = -1;
 			pcd_current = cd;
-			pcd_sector = pcd_req->sector;
-			pcd_count = pcd_req->current_nr_sectors;
+			pcd_sector = blk_rq_pos(pcd_req);
+			pcd_count = blk_rq_cur_sectors(pcd_req);
 			pcd_buf = pcd_req->buffer;
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 0732df4..9ec5d4a 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -444,11 +444,11 @@ static enum action do_pd_io_start(void)
 
 	pd_cmd = rq_data_dir(pd_req);
 	if (pd_cmd == READ || pd_cmd == WRITE) {
-		pd_block = pd_req->sector;
-		pd_count = pd_req->current_nr_sectors;
+		pd_block = blk_rq_pos(pd_req);
+		pd_count = blk_rq_cur_sectors(pd_req);
 		if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
 			return Fail;
-		pd_run = pd_req->nr_sectors;
+		pd_run = blk_rq_sectors(pd_req);
 		pd_buf = pd_req->buffer;
 		pd_retries = 0;
 		if (pd_cmd == READ)
@@ -479,7 +479,7 @@ static int pd_next_buf(void)
 		return 0;
 	spin_lock_irqsave(&pd_lock, saved_flags);
 	__blk_end_request_cur(pd_req, 0);
-	pd_count = pd_req->current_nr_sectors;
+	pd_count = blk_rq_cur_sectors(pd_req);
 	pd_buf = pd_req->buffer;
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
 	return 0;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 3871e35..e88c889 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -768,9 +768,9 @@ repeat:
 		return;
 
 	pf_current = pf_req->rq_disk->private_data;
-	pf_block = pf_req->sector;
-	pf_run = pf_req->nr_sectors;
-	pf_count = pf_req->current_nr_sectors;
+	pf_block = blk_rq_pos(pf_req);
+	pf_run = blk_rq_sectors(pf_req);
+	pf_count = blk_rq_cur_sectors(pf_req);
 
 	if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) {
 		pf_end_request(-EIO);
@@ -810,7 +810,7 @@ static int pf_next_buf(void)
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
 			return 1;
-		pf_count = pf_req->current_nr_sectors;
+		pf_count = blk_rq_cur_sectors(pf_req);
 		pf_buf = pf_req->buffer;
 	}
 	return 0;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index c238867..8d58308 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -134,13 +134,12 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	rq_for_each_segment(bv, req, iter)
 		n++;
 	dev_dbg(&dev->sbd.core,
-		"%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n",
-		__func__, __LINE__, op, n, req->nr_sectors,
-		blk_rq_sectors(req));
+		"%s:%u: %s req has %u bvecs for %u sectors\n",
+		__func__, __LINE__, op, n, blk_rq_sectors(req));
 #endif
 
-	start_sector = req->sector * priv->blocking_factor;
-	sectors = req->nr_sectors * priv->blocking_factor;
+	start_sector = blk_rq_pos(req) * priv->blocking_factor;
+	sectors = blk_rq_sectors(req) * priv->blocking_factor;
 	dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
 		__func__, __LINE__, op, sectors, start_sector);
 
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index f59887c..9f351bf 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -416,7 +416,7 @@ static int __send_request(struct request *req)
 		desc->slice = 0;
 	}
 	desc->status = ~0;
-	desc->offset = (req->sector << 9) / port->vdisk_block_size;
+	desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size;
 	desc->size = len;
 	desc->ncookies = err;
 
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 97ef426..fc6a1c3 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -531,7 +531,7 @@ static void redo_fd_request(struct request_queue *q)
 	while ((req = elv_next_request(q))) {
 
 		fs = req->rq_disk->private_data;
-		if (req->sector < 0 || req->sector >= fs->total_secs) {
+		if (blk_rq_pos(req) >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
@@ -551,8 +551,8 @@ static void redo_fd_request(struct request_queue *q)
 			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
-			if (floppy_read_sectors(fs, req->sector,
-						req->current_nr_sectors,
+			if (floppy_read_sectors(fs, blk_rq_pos(req),
+						blk_rq_cur_sectors(req),
 						req->buffer)) {
 				__blk_end_request_cur(req, -EIO);
 				continue;
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 4248559..c1b9a4d 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -312,14 +312,14 @@ static void start_request(struct floppy_state *fs)
 	}
 	while (fs->state == idle && (req = elv_next_request(swim3_queue))) {
 #if 0
-		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n",
+		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
 		       req->rq_disk->disk_name, req->cmd,
-		       (long)req->sector, req->nr_sectors, req->buffer);
-		printk("           errors=%d current_nr_sectors=%ld\n",
-		       req->errors, req->current_nr_sectors);
+		       (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
+		printk("           errors=%d current_nr_sectors=%u\n",
+		       req->errors, blk_rq_cur_sectors(req));
 #endif
 
-		if (req->sector >= fs->total_secs) {
+		if (blk_rq_pos(req) >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
@@ -337,13 +337,14 @@ static void start_request(struct floppy_state *fs)
 			}
 		}
 
-		/* Do not remove the cast. req->sector is now a sector_t and
-		 * can be 64 bits, but it will never go past 32 bits for this
-		 * driver anyway, so we can safely cast it down and not have
-		 * to do a 64/32 division
+		/* Do not remove the cast. blk_rq_pos(req) is now a
+		 * sector_t and can be 64 bits, but it will never go
+		 * past 32 bits for this driver anyway, so we can
+		 * safely cast it down and not have to do a 64/32
+		 * division
 		 */
-		fs->req_cyl = ((long)req->sector) / fs->secpercyl;
-		x = ((long)req->sector) % fs->secpercyl;
+		fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl;
+		x = ((long)blk_rq_pos(req)) % fs->secpercyl;
 		fs->head = x / fs->secpertrack;
 		fs->req_sector = x % fs->secpertrack + 1;
 		fd_req = req;
@@ -420,7 +421,7 @@ static inline void setup_transfer(struct floppy_state *fs)
 	struct dbdma_cmd *cp = fs->dma_cmd;
 	struct dbdma_regs __iomem *dr = fs->dma;
 
-	if (fd_req->current_nr_sectors <= 0) {
+	if (blk_rq_cur_sectors(fd_req) <= 0) {
 		printk(KERN_ERR "swim3: transfer 0 sectors?\n");
 		return;
 	}
@@ -428,8 +429,8 @@ static inline void setup_transfer(struct floppy_state *fs)
 		n = 1;
 	else {
 		n = fs->secpertrack - fs->req_sector + 1;
-		if (n > fd_req->current_nr_sectors)
-			n = fd_req->current_nr_sectors;
+		if (n > blk_rq_cur_sectors(fd_req))
+			n = blk_rq_cur_sectors(fd_req);
 	}
 	fs->scount = n;
 	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
@@ -600,7 +601,8 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
-	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
+	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fd_req));
 	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
@@ -714,7 +716,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			} else {
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
-				       (long)fd_req->sector, err);
+				       (long)blk_rq_pos(fd_req), err);
 				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 			}
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 60e85bb..087c94c 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -903,10 +903,10 @@ queue_one_request:
 	msg->sg_count	= n_elem;
 	msg->sg_type	= SGT_32BIT;
 	msg->handle	= cpu_to_le32(TAG_ENCODE(crq->tag));
-	msg->lba	= cpu_to_le32(rq->sector & 0xffffffff);
-	tmp		= (rq->sector >> 16) >> 16;
+	msg->lba	= cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
+	tmp		= (blk_rq_pos(rq) >> 16) >> 16;
 	msg->lba_high	= cpu_to_le16( (u16) tmp );
-	msg->lba_count	= cpu_to_le16(rq->nr_sectors);
+	msg->lba_count	= cpu_to_le16(blk_rq_sectors(rq));
 
 	msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg);
 	for (i = 0; i < n_elem; i++) {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 8c2cc71..dc3b899 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -726,8 +726,8 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	 * The call to blk_queue_hardsect_size() guarantees that request
 	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
-	block = rq->sector >> lun->capacity.bshift;
-	nblks = rq->nr_sectors >> lun->capacity.bshift;
+	block = blk_rq_pos(rq) >> lun->capacity.bshift;
+	nblks = blk_rq_sectors(rq) >> lun->capacity.bshift;
 
 	cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10;
 	/* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */
@@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	cmd->cdb[8] = nblks;
 	cmd->cdb_len = 10;
 
-	cmd->len = rq->nr_sectors * 512;
+	cmd->len = blk_rq_sectors(rq) * 512;
 }
 
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index e821eed..2086cb1 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -252,7 +252,7 @@ static int send_request(struct request *req)
 	struct viodasd_device *d;
 	unsigned long flags;
 
-	start = (u64)req->sector << 9;
+	start = (u64)blk_rq_pos(req) << 9;
 
 	if (rq_data_dir(req) == READ) {
 		direction = DMA_FROM_DEVICE;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 50745e6..1980ab4 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -85,7 +85,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	vbr->req = req;
 	if (blk_fs_request(vbr->req)) {
 		vbr->out_hdr.type = 0;
-		vbr->out_hdr.sector = vbr->req->sector;
+		vbr->out_hdr.sector = blk_rq_pos(vbr->req);
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 	} else if (blk_pc_request(vbr->req)) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 14be4c1..4ef8801 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -306,8 +306,8 @@ static void do_xd_request (struct request_queue * q)
 		return;
 
 	while ((req = elv_next_request(q)) != NULL) {
-		unsigned block = req->sector;
-		unsigned count = req->nr_sectors;
+		unsigned block = blk_rq_pos(req);
+		unsigned count = blk_rq_sectors(req);
 		XD_INFO *disk = req->rq_disk->private_data;
 		int res = 0;
 		int retry;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b456447..91fc565 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -231,7 +231,7 @@ static int blkif_queue_request(struct request *req)
 	info->shadow[id].request = (unsigned long)req;
 
 	ring_req->id = id;
-	ring_req->sector_number = (blkif_sector_t)req->sector;
+	ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
 	ring_req->handle = info->handle;
 
 	ring_req->operation = rq_data_dir(req) ?
@@ -310,11 +310,10 @@ static void do_blkif_request(struct request_queue *rq)
 			goto wait;
 
 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-			 "(%u/%li) buffer:%p [%s]\n",
-			 req, req->cmd, (unsigned long)req->sector,
-			 req->current_nr_sectors,
-			 req->nr_sectors, req->buffer,
-			 rq_data_dir(req) ? "write" : "read");
+			 "(%u/%u) buffer:%p [%s]\n",
+			 req, req->cmd, (unsigned long)blk_rq_pos(req),
+			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
+			 req->buffer, rq_data_dir(req) ? "write" : "read");
 
 
 		blkdev_dequeue_request(req);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 5722931..97c99b43 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -646,13 +646,14 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
 			"request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n",
-			(unsigned long long) req->sector, blk_rq_sectors(req),
-			req->current_nr_sectors, rq_data_dir(req));
+			(unsigned long long)blk_rq_pos(req),
+			blk_rq_sectors(req), blk_rq_cur_sectors(req),
+			rq_data_dir(req));
 
 		ace->req = req;
 		ace->data_ptr = req->buffer;
-		ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR;
-		ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF);
+		ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
+		ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
 
 		count = blk_rq_sectors(req);
 		if (rq_data_dir(req)) {
@@ -688,7 +689,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			dev_dbg(ace->dev,
 				"CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n",
 				ace->fsm_task, ace->fsm_iter_num,
-				ace->req->current_nr_sectors * 16,
+				blk_rq_cur_sectors(ace->req) * 16,
 				ace->data_count, ace->in_irq);
 			ace_fsm_yield(ace);	/* need to poll CFBSY bit */
 			break;
@@ -697,7 +698,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			dev_dbg(ace->dev,
 				"DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n",
 				ace->fsm_task, ace->fsm_iter_num,
-				ace->req->current_nr_sectors * 16,
+				blk_rq_cur_sectors(ace->req) * 16,
 				ace->data_count, ace->in_irq);
 			ace_fsm_yieldirq(ace);
 			break;
@@ -721,10 +722,10 @@ static void ace_fsm_dostate(struct ace_device *ace)
 					blk_rq_cur_bytes(ace->req))) {
 			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
 			 *      blk_rq_sectors(ace->req),
-			 *      ace->req->current_nr_sectors);
+			 *      blk_rq_cur_sectors(ace->req));
 			 */
 			ace->data_ptr = ace->req->buffer;
-			ace->data_count = ace->req->current_nr_sectors * 16;
+			ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
 			ace_fsm_yieldirq(ace);
 			break;
 		}
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index b66ad58..d4e6b71 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -71,12 +71,12 @@ static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
 	while ((req = elv_next_request(q)) != NULL) {
-		unsigned long start = req->sector << 9;
-		unsigned long len  = req->current_nr_sectors << 9;
+		unsigned long start = blk_rq_pos(req) << 9;
+		unsigned long len  = blk_rq_cur_sectors(req) << 9;
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
-				req->sector, req->current_nr_sectors);
+				blk_rq_pos(req), blk_rq_cur_sectors(req));
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index cab2b1f..488423c 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -584,8 +584,8 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 	list_for_each_safe(elem, next, &gdrom_deferred) {
 		req = list_entry(elem, struct request, queuelist);
 		spin_unlock(&gdrom_lock);
-		block = req->sector/GD_TO_BLK + GD_SESSION_OFFSET;
-		block_cnt = req->nr_sectors/GD_TO_BLK;
+		block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
+		block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
 		ctrl_outl(PHYSADDR(req->buffer), GDROM_DMA_STARTADDR_REG);
 		ctrl_outl(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
 		ctrl_outl(1, GDROM_DMA_DIRECTION_REG);
@@ -661,7 +661,7 @@ static void gdrom_request(struct request_queue *rq)
 			printk(" write request ignored\n");
 			__blk_end_request_cur(req, -EIO);
 		}
-		if (req->nr_sectors)
+		if (blk_rq_sectors(req))
 			gdrom_request_handler_dma(req);
 		else
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index cc3efa0..6e190a9 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -282,7 +282,7 @@ static int send_request(struct request *req)
 			viopath_targetinst(viopath_hostLp),
 			(u64)req, VIOVERSION << 16,
 			((u64)DEVICE_NR(diskinfo) << 48) | dmaaddr,
-			(u64)req->sector * 512, len, 0);
+			(u64)blk_rq_pos(req) * 512, len, 0);
 	if (hvrc != HvLpEvent_Rc_Good) {
 		printk(VIOCD_KERN_WARNING "hv error on op %d\n", (int)hvrc);
 		return -1;
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index a416346..9e600d2 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -677,10 +677,10 @@ try_again:
 			continue;
 		}
 
-		t_sec = msb->block_req->sector << 9;
+		t_sec = blk_rq_pos(msb->block_req) << 9;
 		sector_div(t_sec, msb->page_size);
 
-		count = msb->block_req->nr_sectors << 9;
+		count = blk_rq_sectors(msb->block_req) << 9;
 		count /= msb->page_size;
 
 		param.system = msb->system;
@@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
 					t_len *= msb->page_size;
 			}
 		} else
-			t_len = msb->block_req->nr_sectors << 9;
+			t_len = blk_rq_sectors(msb->block_req) << 9;
 
 		dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error);
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 56e60f0..510eb47 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -761,7 +761,7 @@ static int i2o_block_transfer(struct request *req)
 			break;
 
 		case CACHE_SMARTFETCH:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x201F0008;
 			else
 				ctl_flags = 0x001F0000;
@@ -781,13 +781,13 @@ static int i2o_block_transfer(struct request *req)
 			ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTBACK:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x001F0004;
 			else
 				ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTTHROUGH:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x001F0004;
 			else
 				ctl_flags = 0x001F0010;
@@ -827,22 +827,24 @@ static int i2o_block_transfer(struct request *req)
 
 		*mptr++ = cpu_to_le32(scsi_flags);
 
-		*((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
-		*((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
+		*((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec);
+		*((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec);
 
 		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 	} else
 #endif
 	{
 		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
 		*mptr++ = cpu_to_le32(ctl_flags);
-		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
 		*mptr++ =
-		    cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT));
+		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
 		*mptr++ =
-		    cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT));
+		    cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT));
 	}
 
 	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index fe8041e..949e997 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -243,7 +243,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		brq.mrq.cmd = &brq.cmd;
 		brq.mrq.data = &brq.data;
 
-		brq.cmd.arg = req->sector;
+		brq.cmd.arg = blk_rq_pos(req);
 		if (!mmc_card_blockaddr(card))
 			brq.cmd.arg <<= 9;
 		brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
@@ -251,7 +251,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		brq.stop.opcode = MMC_STOP_TRANSMISSION;
 		brq.stop.arg = 0;
 		brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
-		brq.data.blocks = req->nr_sectors;
+		brq.data.blocks = blk_rq_sectors(req);
 
 		/*
 		 * After a read error, we redo the request one sector at a time
@@ -293,7 +293,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		 * Adjust the sg list so it is the same size as the
 		 * request.
 		 */
-		if (brq.data.blocks != req->nr_sectors) {
+		if (brq.data.blocks != blk_rq_sectors(req)) {
 			int i, data_size = brq.data.blocks << 9;
 			struct scatterlist *sg;
 
@@ -344,8 +344,8 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			printk(KERN_ERR "%s: error %d transferring data,"
 			       " sector %u, nr %u, card status %#x\n",
 			       req->rq_disk->disk_name, brq.data.error,
-			       (unsigned)req->sector,
-			       (unsigned)req->nr_sectors, status);
+			       (unsigned)blk_rq_pos(req),
+			       (unsigned)blk_rq_sectors(req), status);
 		}
 
 		if (brq.stop.error) {
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 76c4c8d..4ea2e67 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -47,8 +47,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	unsigned long block, nsect;
 	char *buf;
 
-	block = req->sector << 9 >> tr->blkshift;
-	nsect = req->current_nr_sectors << 9 >> tr->blkshift;
+	block = blk_rq_pos(req) << 9 >> tr->blkshift;
+	nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift;
 
 	buf = req->buffer;
 
@@ -59,7 +59,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	if (!blk_fs_request(req))
 		return -EIO;
 
-	if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk))
+	if (blk_rq_pos(req) + blk_rq_cur_sectors(req) >
+	    get_capacity(req->rq_disk))
 		return -EIO;
 
 	switch(rq_data_dir(req)) {
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index fabec95..7df03c7a 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -603,7 +603,7 @@ static void dasd_profile_end(struct dasd_block *block,
 	if (dasd_profile_level != DASD_PROFILE_ON)
 		return;
 
-	sectors = req->nr_sectors;
+	sectors = blk_rq_sectors(req);
 	if (!cqr->buildclk || !cqr->startclk ||
 	    !cqr->stopclk || !cqr->endclk ||
 	    !sectors)
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index b9a7f77..2efaddf 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -505,8 +505,9 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
 		return ERR_PTR(-EINVAL);
 	blksize = block->bp_block;
 	/* Calculate record id of first and last block. */
-	first_rec = req->sector >> block->s2b_shift;
-	last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+	first_rec = blk_rq_pos(req) >> block->s2b_shift;
+	last_rec =
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	/* Check struct bio and count the number of blocks for the request. */
 	count = 0;
 	rq_for_each_segment(bv, req, iter) {
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index cb52da0..a41c940 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -2354,10 +2354,10 @@ static struct dasd_ccw_req *dasd_eckd_build_cp(struct dasd_device *startdev,
 	blksize = block->bp_block;
 	blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize);
 	/* Calculate record id of first and last block. */
-	first_rec = first_trk = req->sector >> block->s2b_shift;
+	first_rec = first_trk = blk_rq_pos(req) >> block->s2b_shift;
 	first_offs = sector_div(first_trk, blk_per_trk);
 	last_rec = last_trk =
-		(req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	last_offs = sector_div(last_trk, blk_per_trk);
 	cdlspecial = (private->uses_cdl && first_rec < 2*blk_per_trk);
 
@@ -2420,7 +2420,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req)
 	private = (struct dasd_eckd_private *) cqr->block->base->private;
 	blksize = cqr->block->bp_block;
 	blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize);
-	recid = req->sector >> cqr->block->s2b_shift;
+	recid = blk_rq_pos(req) >> cqr->block->s2b_shift;
 	ccw = cqr->cpaddr;
 	/* Skip over define extent & locate record. */
 	ccw++;
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index a3eb6fd..8912358 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -270,8 +270,9 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev,
 		return ERR_PTR(-EINVAL);
 	blksize = block->bp_block;
 	/* Calculate record id of first and last block. */
-	first_rec = req->sector >> block->s2b_shift;
-	last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+	first_rec = blk_rq_pos(req) >> block->s2b_shift;
+	last_rec =
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	/* Check struct bio and count the number of blocks for the request. */
 	count = 0;
 	cidaw = 0;
@@ -309,7 +310,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev,
 	ccw = cqr->cpaddr;
 	/* First ccw is define extent. */
 	define_extent(ccw++, cqr->data, rq_data_dir(req),
-		      block->bp_block, req->sector, req->nr_sectors);
+		      block->bp_block, blk_rq_pos(req), blk_rq_sectors(req));
 	/* Build locate_record + read/write ccws. */
 	idaws = (unsigned long *) (cqr->data + sizeof(struct DE_fba_data));
 	LO_data = (struct LO_fba_data *) (idaws + cidaw);
diff --git a/drivers/s390/char/tape_34xx.c b/drivers/s390/char/tape_34xx.c
index 5f8e8ef..2d00a38 100644
--- a/drivers/s390/char/tape_34xx.c
+++ b/drivers/s390/char/tape_34xx.c
@@ -1134,7 +1134,7 @@ tape_34xx_bread(struct tape_device *device, struct request *req)
 	/* Setup ccws. */
 	request->op = TO_BLOCK;
 	start_block = (struct tape_34xx_block_id *) request->cpdata;
-	start_block->block = req->sector >> TAPEBLOCK_HSEC_S2B;
+	start_block->block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B;
 	DBF_EVENT(6, "start_block = %i\n", start_block->block);
 
 	ccw = request->cpaddr;
diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c
index 823b05b..c453b2f 100644
--- a/drivers/s390/char/tape_3590.c
+++ b/drivers/s390/char/tape_3590.c
@@ -633,7 +633,7 @@ tape_3590_bread(struct tape_device *device, struct request *req)
 	struct req_iterator iter;
 
 	DBF_EVENT(6, "xBREDid:");
-	start_block = req->sector >> TAPEBLOCK_HSEC_S2B;
+	start_block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B;
 	DBF_EVENT(6, "start_block = %i\n", start_block);
 
 	rq_for_each_segment(bv, req, iter)
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 86596d3..5d035e4 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -87,7 +87,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 	if (ccw_req->rc == 0)
 		/* Update position. */
 		device->blk_data.block_position =
-			(req->sector + req->nr_sectors) >> TAPEBLOCK_HSEC_S2B;
+		  (blk_rq_pos(req) + blk_rq_sectors(req)) >> TAPEBLOCK_HSEC_S2B;
 	else
 		/* We lost the position information due to an error. */
 		device->blk_data.block_position = -1;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 0961788..2132c90 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -188,8 +188,8 @@ static void jsfd_do_request(struct request_queue *q)
 
 	while ((req = elv_next_request(q)) != NULL) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
-		unsigned long offset = req->sector << 9;
-		size_t len = req->current_nr_sectors << 9;
+		unsigned long offset = blk_rq_pos(req) << 9;
+		size_t len = blk_rq_cur_sectors(req) << 9;
 
 		if ((offset + len) > jdp->dsize) {
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index be5099d..c7076ce 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -1825,7 +1825,7 @@ static int eata2x_queuecommand(struct scsi_cmnd *SCpnt,
 	if (linked_comm && SCpnt->device->queue_depth > 2
 	    && TLDEV(SCpnt->device->type)) {
 		ha->cp_stat[i] = READY;
-		flush_dev(SCpnt->device, SCpnt->request->sector, ha, 0);
+		flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 0);
 		return 0;
 	}
 
@@ -2144,13 +2144,13 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 		if (!cpp->din)
 			input_only = 0;
 
-		if (SCpnt->request->sector < minsec)
-			minsec = SCpnt->request->sector;
-		if (SCpnt->request->sector > maxsec)
-			maxsec = SCpnt->request->sector;
+		if (blk_rq_pos(SCpnt->request) < minsec)
+			minsec = blk_rq_pos(SCpnt->request);
+		if (blk_rq_pos(SCpnt->request) > maxsec)
+			maxsec = blk_rq_pos(SCpnt->request);
 
-		sl[n] = SCpnt->request->sector;
-		ioseek += SCpnt->request->nr_sectors;
+		sl[n] = blk_rq_pos(SCpnt->request);
+		ioseek += blk_rq_sectors(SCpnt->request);
 
 		if (!n)
 			continue;
@@ -2190,7 +2190,7 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 			k = il[n];
 			cpp = &ha->cp[k];
 			SCpnt = cpp->SCpnt;
-			ll[n] = SCpnt->request->nr_sectors;
+			ll[n] = blk_rq_sectors(SCpnt->request);
 			pl[n] = SCpnt->serial_number;
 
 			if (!n)
@@ -2236,12 +2236,12 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 			cpp = &ha->cp[k];
 			SCpnt = cpp->SCpnt;
 			scmd_printk(KERN_INFO, SCpnt,
-			    "%s pid %ld mb %d fc %d nr %d sec %ld ns %ld"
+			    "%s pid %ld mb %d fc %d nr %d sec %ld ns %u"
 			     " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n",
 			     (ihdlr ? "ihdlr" : "qcomm"),
 			     SCpnt->serial_number, k, flushcount,
-			     n_ready, SCpnt->request->sector,
-			     SCpnt->request->nr_sectors, cursec, YESNO(s),
+			     n_ready, blk_rq_pos(SCpnt->request),
+			     blk_rq_sectors(SCpnt->request), cursec, YESNO(s),
 			     YESNO(r), YESNO(rev), YESNO(input_only),
 			     YESNO(overlap), cpp->din);
 		}
@@ -2408,7 +2408,7 @@ static irqreturn_t ihdlr(struct Scsi_Host *shost)
 
 	if (linked_comm && SCpnt->device->queue_depth > 2
 	    && TLDEV(SCpnt->device->type))
-		flush_dev(SCpnt->device, SCpnt->request->sector, ha, 1);
+		flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 1);
 
 	tstatus = status_byte(spp->target_status);
 
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 36fd2e7..a8fab39 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1313,10 +1313,10 @@ lpfc_parse_bg_err(struct lpfc_hba *phba, struct lpfc_scsi_buf *lpfc_cmd,
 	uint32_t bgstat = bgf->bgstat;
 	uint64_t failing_sector = 0;
 
-	printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%lx "
+	printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%x "
 			"bgstat=0x%x bghm=0x%x\n",
 			cmd->cmnd[0], (unsigned long long)scsi_get_lba(cmd),
-			cmd->request->nr_sectors, bgstat, bghm);
+			blk_rq_sectors(cmd->request), bgstat, bghm);
 
 	spin_lock(&_dump_buf_lock);
 	if (!_dump_buf_done) {
@@ -2375,15 +2375,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *))
 		if (cmnd->cmnd[0] == READ_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					"9035 BLKGRD: READ @ sector %llu, "
-					 "count %lu\n",
-					 (unsigned long long)scsi_get_lba(cmnd),
-					cmnd->request->nr_sectors);
+					"count %u\n",
+					(unsigned long long)scsi_get_lba(cmnd),
+					blk_rq_sectors(cmnd->request));
 		else if (cmnd->cmnd[0] == WRITE_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					"9036 BLKGRD: WRITE @ sector %llu, "
-					"count %lu cmd=%p\n",
+					"count %u cmd=%p\n",
 					(unsigned long long)scsi_get_lba(cmnd),
-					cmnd->request->nr_sectors,
+					blk_rq_sectors(cmnd->request),
 					cmnd);
 
 		err = lpfc_bg_scsi_prep_dma_buf(phba, lpfc_cmd);
@@ -2403,15 +2403,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *))
 		if (cmnd->cmnd[0] == READ_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9040 dbg: READ @ sector %llu, "
-					 "count %lu\n",
+					 "count %u\n",
 					 (unsigned long long)scsi_get_lba(cmnd),
-					 cmnd->request->nr_sectors);
+					 blk_rq_sectors(cmnd->request));
 		else if (cmnd->cmnd[0] == WRITE_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9041 dbg: WRITE @ sector %llu, "
-					 "count %lu cmd=%p\n",
+					 "count %u cmd=%p\n",
 					 (unsigned long long)scsi_get_lba(cmnd),
-					 cmnd->request->nr_sectors, cmnd);
+					 blk_rq_sectors(cmnd->request), cmnd);
 		else
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9042 dbg: parser not implemented\n");
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9ff0ca9..39b3acf 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -787,9 +787,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	 * Next deal with any sectors which we were able to correctly
 	 * handle.
 	 */
-	SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
+	SCSI_LOG_HLCOMPLETE(1, printk("%u sectors total, "
 				      "%d bytes done.\n",
-				      req->nr_sectors, good_bytes));
+				      blk_rq_sectors(req), good_bytes));
 
 	/*
 	 * Recovered errors need reporting, but they're always treated
@@ -968,7 +968,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	if (blk_pc_request(req))
 		sdb->length = req->data_len;
 	else
-		sdb->length = req->nr_sectors << 9;
+		sdb->length = blk_rq_sectors(req) << 9;
 	return BLKPREP_OK;
 }
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3fcb64b..70c4dd9 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -383,9 +383,9 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	struct scsi_device *sdp = q->queuedata;
 	struct gendisk *disk = rq->rq_disk;
 	struct scsi_disk *sdkp;
-	sector_t block = rq->sector;
+	sector_t block = blk_rq_pos(rq);
 	sector_t threshold;
-	unsigned int this_count = rq->nr_sectors;
+	unsigned int this_count = blk_rq_sectors(rq);
 	int ret, host_dif;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -412,10 +412,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 					this_count));
 
 	if (!sdp || !scsi_device_online(sdp) ||
- 	    block + rq->nr_sectors > get_capacity(disk)) {
+	    block + blk_rq_sectors(rq) > get_capacity(disk)) {
 		SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
-						"Finishing %ld sectors\n",
-						rq->nr_sectors));
+						"Finishing %u sectors\n",
+						blk_rq_sectors(rq)));
 		SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
 						"Retry with 0x%p\n", SCpnt));
 		goto out;
@@ -462,7 +462,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	 * for this.
 	 */
 	if (sdp->sector_size == 1024) {
-		if ((block & 1) || (rq->nr_sectors & 1)) {
+		if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -472,7 +472,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		}
 	}
 	if (sdp->sector_size == 2048) {
-		if ((block & 3) || (rq->nr_sectors & 3)) {
+		if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -482,7 +482,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		}
 	}
 	if (sdp->sector_size == 4096) {
-		if ((block & 7) || (rq->nr_sectors & 7)) {
+		if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -511,10 +511,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	}
 
 	SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
-					"%s %d/%ld 512 byte blocks.\n",
+					"%s %d/%u 512 byte blocks.\n",
 					(rq_data_dir(rq) == WRITE) ?
 					"writing" : "reading", this_count,
-					rq->nr_sectors));
+					blk_rq_sectors(rq)));
 
 	/* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */
 	host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
@@ -970,8 +970,8 @@ static struct block_device_operations sd_fops = {
 
 static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
 {
-	u64 start_lba = scmd->request->sector;
-	u64 end_lba = scmd->request->sector + (scsi_bufflen(scmd) / 512);
+	u64 start_lba = blk_rq_pos(scmd->request);
+	u64 end_lba = blk_rq_pos(scmd->request) + (scsi_bufflen(scmd) / 512);
 	u64 bad_lba;
 	int info_valid;
 
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 184dff4..82f14a9 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -507,7 +507,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
 	sector_sz = scmd->device->sector_size;
 	sectors = good_bytes / sector_sz;
 
-	phys = scmd->request->sector & 0xffffffff;
+	phys = blk_rq_pos(scmd->request) & 0xffffffff;
 	if (sector_sz == 4096)
 		phys >>= 3;
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0e1a0f2..fddba53 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -292,7 +292,8 @@ static int sr_done(struct scsi_cmnd *SCpnt)
 			if (cd->device->sector_size == 2048)
 				error_sector <<= 2;
 			error_sector &= ~(block_sectors - 1);
-			good_bytes = (error_sector - SCpnt->request->sector) << 9;
+			good_bytes = (error_sector -
+				      blk_rq_pos(SCpnt->request)) << 9;
 			if (good_bytes < 0 || good_bytes >= this_count)
 				good_bytes = 0;
 			/*
@@ -349,8 +350,8 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 				cd->disk->disk_name, block));
 
 	if (!cd->device || !scsi_device_online(cd->device)) {
-		SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n",
-					rq->nr_sectors));
+		SCSI_LOG_HLQUEUE(2, printk("Finishing %u sectors\n",
+					   blk_rq_sectors(rq)));
 		SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt));
 		goto out;
 	}
@@ -413,7 +414,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 	/*
 	 * request doesn't start on hw block boundary, add scatter pads
 	 */
-	if (((unsigned int)rq->sector % (s_size >> 9)) ||
+	if (((unsigned int)blk_rq_pos(rq) % (s_size >> 9)) ||
 	    (scsi_bufflen(SCpnt) % s_size)) {
 		scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
 		goto out;
@@ -422,14 +423,14 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 	this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
 
 
-	SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
+	SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%u 512 byte blocks.\n",
 				cd->cdi.name,
 				(rq_data_dir(rq) == WRITE) ?
 					"writing" : "reading",
-				this_count, rq->nr_sectors));
+				this_count, blk_rq_sectors(rq)));
 
 	SCpnt->cmnd[1] = 0;
-	block = (unsigned int)rq->sector / (s_size >> 9);
+	block = (unsigned int)blk_rq_pos(rq) / (s_size >> 9);
 
 	if (this_count > 0xffff) {
 		this_count = 0xffff;
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 601e951..54023d41 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -1306,7 +1306,7 @@ static int u14_34f_queuecommand(struct scsi_cmnd *SCpnt, void (*done)(struct scs
    if (linked_comm && SCpnt->device->queue_depth > 2
                                      && TLDEV(SCpnt->device->type)) {
       HD(j)->cp_stat[i] = READY;
-      flush_dev(SCpnt->device, SCpnt->request->sector, j, FALSE);
+      flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, FALSE);
       return 0;
       }
 
@@ -1610,11 +1610,13 @@ static int reorder(unsigned int j, unsigned long cursec,
 
       if (!(cpp->xdir == DTD_IN)) input_only = FALSE;
 
-      if (SCpnt->request->sector < minsec) minsec = SCpnt->request->sector;
-      if (SCpnt->request->sector > maxsec) maxsec = SCpnt->request->sector;
+      if (blk_rq_pos(SCpnt->request) < minsec)
+	 minsec = blk_rq_pos(SCpnt->request);
+      if (blk_rq_pos(SCpnt->request) > maxsec)
+	 maxsec = blk_rq_pos(SCpnt->request);
 
-      sl[n] = SCpnt->request->sector;
-      ioseek += SCpnt->request->nr_sectors;
+      sl[n] = blk_rq_pos(SCpnt->request);
+      ioseek += blk_rq_sectors(SCpnt->request);
 
       if (!n) continue;
 
@@ -1642,7 +1644,7 @@ static int reorder(unsigned int j, unsigned long cursec,
 
    if (!input_only) for (n = 0; n < n_ready; n++) {
       k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt;
-      ll[n] = SCpnt->request->nr_sectors; pl[n] = SCpnt->serial_number;
+      ll[n] = blk_rq_sectors(SCpnt->request); pl[n] = SCpnt->serial_number;
 
       if (!n) continue;
 
@@ -1666,12 +1668,12 @@ static int reorder(unsigned int j, unsigned long cursec,
    if (link_statistics && (overlap || !(flushcount % link_statistics)))
       for (n = 0; n < n_ready; n++) {
          k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt;
-         printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %ld"\
+         printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %u"\
                 " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n",
                 (ihdlr ? "ihdlr" : "qcomm"), SCpnt->channel, SCpnt->target,
                 SCpnt->lun, SCpnt->serial_number, k, flushcount, n_ready,
-                SCpnt->request->sector, SCpnt->request->nr_sectors, cursec,
-                YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only),
+                blk_rq_pos(SCpnt->request), blk_rq_sectors(SCpnt->request),
+		cursec, YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only),
                 YESNO(overlap), cpp->xdir);
          }
 #endif
@@ -1799,7 +1801,7 @@ static irqreturn_t ihdlr(unsigned int j)
 
    if (linked_comm && SCpnt->device->queue_depth > 2
                                      && TLDEV(SCpnt->device->type))
-      flush_dev(SCpnt->device, SCpnt->request->sector, j, TRUE);
+      flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, TRUE);
 
    tstatus = status_byte(spp->target_status);
 
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 43b50d3..3878d1d 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -270,7 +270,7 @@ static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd)
 
 static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
 {
-	return scmd->request->sector;
+	return blk_rq_pos(scmd->request);
 }
 
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
-- 
cgit v0.10.2


From 9780e2dd8254351f6cbe11304849126b51dbd561 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:40 +0900
Subject: ide: convert to rq pos and nr_sectors accessors

ide doesn't manipulate request fields anymore and thus all hard and
their soft equivalents are always equal.  Convert all references to
accessors.

[ Impact: use pos and nr_sectors accessors ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 182320d..eb4f3dc 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -775,8 +775,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	}
 
 	/* fs requests *must* be hardware frame aligned */
-	if ((rq->nr_sectors & (sectors_per_frame - 1)) ||
-	    (rq->sector & (sectors_per_frame - 1)))
+	if ((blk_rq_sectors(rq) & (sectors_per_frame - 1)) ||
+	    (blk_rq_pos(rq) & (sectors_per_frame - 1)))
 		return ide_stopped;
 
 	/* use DMA, if possible */
@@ -868,8 +868,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	cmd.rq = rq;
 
 	if (blk_fs_request(rq) || rq->data_len) {
-		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? (rq->nr_sectors << 9)
-							 : rq->data_len);
+		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ?
+				(blk_rq_sectors(rq) << 9) : rq->data_len);
 		ide_map_sg(drive, &cmd);
 	}
 
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index c243880..ad18e14 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -82,7 +82,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 					sector_t block)
 {
 	ide_hwif_t *hwif	= drive->hwif;
-	u16 nsectors		= (u16)rq->nr_sectors;
+	u16 nsectors		= (u16)blk_rq_sectors(rq);
 	u8 lba48		= !!(drive->dev_flags & IDE_DFLAG_LBA48);
 	u8 dma			= !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 	struct ide_cmd		cmd;
@@ -90,7 +90,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 	ide_startstop_t		rc;
 
 	if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) {
-		if (block + rq->nr_sectors > 1ULL << 28)
+		if (block + blk_rq_sectors(rq) > 1ULL << 28)
 			dma = 0;
 		else
 			lba48 = 0;
@@ -195,9 +195,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 
 	ledtrig_ide_activity();
 
-	pr_debug("%s: %sing: block=%llu, sectors=%lu, buffer=0x%08lx\n",
+	pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n",
 		 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
-		 (unsigned long long)block, rq->nr_sectors,
+		 (unsigned long long)block, blk_rq_sectors(rq),
 		 (unsigned long)rq->buffer);
 
 	if (hwif->rw_disk)
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index d9123ec..001f68f 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -103,7 +103,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive)
 				ide_finish_cmd(drive, cmd, stat);
 			else
 				ide_complete_rq(drive, 0,
-						cmd->rq->nr_sectors << 9);
+						blk_rq_sectors(cmd->rq) << 9);
 			return ide_stopped;
 		}
 		printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n",
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 537b7c5..1c460bd 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -194,7 +194,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive,
 {
 	struct ide_disk_obj *floppy = drive->driver_data;
 	int block = sector / floppy->bs_factor;
-	int blocks = rq->nr_sectors / floppy->bs_factor;
+	int blocks = blk_rq_sectors(rq) / floppy->bs_factor;
 	int cmd = rq_data_dir(rq);
 
 	ide_debug_log(IDE_DBG_FUNC, "block: %d, blocks: %d", block, blocks);
@@ -259,8 +259,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 			goto out_end;
 	}
 	if (blk_fs_request(rq)) {
-		if (((long)rq->sector % floppy->bs_factor) ||
-		    (rq->nr_sectors % floppy->bs_factor)) {
+		if (((long)blk_rq_pos(rq) % floppy->bs_factor) ||
+		    (blk_rq_sectors(rq) % floppy->bs_factor)) {
 			printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
 				drive->name);
 			goto out_end;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index df23bcb..59799ca 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -279,7 +279,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 
 	if (cmd) {
 		if (cmd->protocol == ATA_PROT_PIO) {
-			ide_init_sg_cmd(cmd, rq->nr_sectors << 9);
+			ide_init_sg_cmd(cmd, blk_rq_sectors(rq) << 9);
 			ide_map_sg(drive, cmd);
 		}
 
@@ -387,7 +387,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 
 		drv = *(struct ide_driver **)rq->rq_disk->private_data;
 
-		return drv->do_request(drive, rq, rq->sector);
+		return drv->do_request(drive, rq, blk_rq_pos(rq));
 	}
 	return do_special(drive);
 kill_rq:
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 56ff8c4..05b7fbc 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -114,7 +114,7 @@ static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
 
 		if (rq)
 			printk(KERN_CONT ", sector=%llu",
-			       (unsigned long long)rq->sector);
+			       (unsigned long long)blk_rq_pos(rq));
 	}
 	printk(KERN_CONT "\n");
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 65c5b70..c6cf3a9 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -586,7 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	unsigned int length = rq->nr_sectors;
+	unsigned int length = blk_rq_sectors(rq);
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -617,8 +617,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
-		  (unsigned long long)rq->sector, rq->nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n"
+		  (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq));
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index f400eb4..a0c3e1b 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -385,7 +385,7 @@ out_end:
 	if ((cmd->tf_flags & IDE_TFLAG_FS) == 0)
 		ide_finish_cmd(drive, cmd, stat);
 	else
-		ide_complete_rq(drive, 0, cmd->rq->nr_sectors << 9);
+		ide_complete_rq(drive, 0, blk_rq_sectors(cmd->rq) << 9);
 	return ide_stopped;
 out_err:
 	ide_error_cmd(drive, cmd);
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index 248a54b..c2a16a8 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -177,7 +177,7 @@ static void pdc202xx_dma_start(ide_drive_t *drive)
 		u8 clock = inb(high_16 + 0x11);
 
 		outb(clock | (hwif->channel ? 0x08 : 0x02), high_16 + 0x11);
-		word_count = (rq->nr_sectors << 8);
+		word_count = (blk_rq_sectors(rq) << 8);
 		word_count = (rq_data_dir(rq) == READ) ?
 					word_count | 0x05000000 :
 					word_count | 0x06000000;
diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c
index b4cf42d..05a93d6b 100644
--- a/drivers/ide/tc86c001.c
+++ b/drivers/ide/tc86c001.c
@@ -112,7 +112,7 @@ static void tc86c001_dma_start(ide_drive_t *drive)
 	ide_hwif_t *hwif	= drive->hwif;
 	unsigned long sc_base	= hwif->config_data;
 	unsigned long twcr_port	= sc_base + (drive->dn ? 0x06 : 0x04);
-	unsigned long nsectors	= hwif->rq->nr_sectors;
+	unsigned long nsectors	= blk_rq_sectors(hwif->rq);
 
 	/*
 	 * We have to manually load the sector count and size into
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 564422d..5ca7622 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -307,7 +307,7 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ?
 			 TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1);
 
-	tx4939ide_writew(cmd->rq->nr_sectors, base, TX4939IDE_Sec_Cnt);
+	tx4939ide_writew(blk_rq_sectors(cmd->rq), base, TX4939IDE_Sec_Cnt);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 2e46e8b27aa57c6bd34b3102b40ee4d0144b4fab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:41 +0900
Subject: block: drop request->hard_* and *nr_sectors

struct request has had a few different ways to represent some
properties of a request.  ->hard_* represent block layer's view of the
request progress (completion cursor) and the ones without the prefix
are supposed to represent the issue cursor and allowed to be updated
as necessary by the low level drivers.  The thing is that as block
layer supports partial completion, the two cursors really aren't
necessary and only cause confusion.  In addition, manual management of
request detail from low level drivers is cumbersome and error-prone at
the very least.

Another interesting duplicate fields are rq->[hard_]nr_sectors and
rq->{hard_cur|current}_nr_sectors against rq->data_len and
rq->bio->bi_size.  This is more convoluted than the hard_ case.

rq->[hard_]nr_sectors are initialized for requests with bio but
blk_rq_bytes() uses it only for !pc requests.  rq->data_len is
initialized for all request but blk_rq_bytes() uses it only for pc
requests.  This causes good amount of confusion throughout block layer
and its drivers and determining the request length has been a bit of
black magic which may or may not work depending on circumstances and
what the specific LLD is actually doing.

rq->{hard_cur|current}_nr_sectors represent the number of sectors in
the contiguous data area at the front.  This is mainly used by drivers
which transfers data by walking request segment-by-segment.  This
value always equals rq->bio->bi_size >> 9.  However, data length for
pc requests may not be multiple of 512 bytes and using this field
becomes a bit confusing.

In general, having multiple fields to represent the same property
leads only to confusion and subtle bugs.  With recent block low level
driver cleanups, no driver is accessing or manipulating these
duplicate fields directly.  Drop all the duplicates.  Now rq->sector
means the current sector, rq->data_len the current total length and
rq->bio->bi_size the current segment length.  Everything else is
defined in terms of these three and available only through accessors.

* blk_recalc_rq_sectors() is collapsed into blk_update_request() and
  now handles pc and fs requests equally other than rq->sector update.
  This means that now pc requests can use partial completion too (no
  in-kernel user yet tho).

* bio_cur_sectors() is replaced with bio_cur_bytes() as block layer
  now uses byte count as the primary data length.

* blk_rq_pos() is now guranteed to be always correct.  In-block users
  converted.

* blk_rq_bytes() is now guaranteed to be always valid as is
  blk_rq_sectors().  In-block users converted.

* blk_rq_sectors() is now guaranteed to equal blk_rq_bytes() >> 9.
  More convenient one is used.

* blk_rq_bytes() and blk_rq_cur_bytes() are now inlined and take const
  pointer to request.

[ Impact: API cleanup, single way to represent one property of a request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 82dc206..3596ca7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->cpu = -1;
 	rq->q = q;
-	rq->sector = rq->hard_sector = (sector_t) -1;
+	rq->sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->cmd = rq->__cmd;
@@ -189,8 +189,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 	       (unsigned long long)blk_rq_pos(rq),
 	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
-						rq->bio, rq->biotail,
-						rq->buffer, rq->data_len);
+	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
 
 	if (blk_pc_request(rq)) {
 		printk(KERN_INFO "  cdb: ");
@@ -1096,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
-	req->hard_sector = req->sector = bio->bi_sector;
+	req->sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
 	blk_rq_bio_prep(req->q, req, bio);
 }
@@ -1113,14 +1112,13 @@ static inline bool queue_should_plug(struct request_queue *q)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors;
+	int el_ret;
+	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	const int unplug = bio_unplug(bio);
 	int rw_flags;
 
-	nr_sectors = bio_sectors(bio);
-
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1145,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
-		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+		req->data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1171,10 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		 * not touch req->buffer either...
 		 */
 		req->buffer = bio_data(bio);
-		req->current_nr_sectors = bio_cur_sectors(bio);
-		req->hard_cur_sectors = req->current_nr_sectors;
-		req->sector = req->hard_sector = bio->bi_sector;
-		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+		req->sector = bio->bi_sector;
+		req->data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1557,7 +1553,7 @@ EXPORT_SYMBOL(submit_bio);
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
 	if (blk_rq_sectors(rq) > q->max_sectors ||
-	    rq->data_len > q->max_hw_sectors << 9) {
+	    blk_rq_bytes(rq) > q->max_hw_sectors << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
 	}
@@ -1675,35 +1671,6 @@ static void blk_account_io_done(struct request *req)
 	}
 }
 
-/**
- * blk_rq_bytes - Returns bytes left to complete in the entire request
- * @rq: the request being processed
- **/
-unsigned int blk_rq_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return blk_rq_sectors(rq) << 9;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_bytes);
-
-/**
- * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
- * @rq: the request being processed
- **/
-unsigned int blk_rq_cur_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->current_nr_sectors << 9;
-
-	if (rq->bio)
-		return rq->bio->bi_size;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
-
 struct request *elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -1736,7 +1703,7 @@ struct request *elv_next_request(struct request_queue *q)
 		if (rq->cmd_flags & REQ_DONTPREP)
 			break;
 
-		if (q->dma_drain_size && rq->data_len) {
+		if (q->dma_drain_size && blk_rq_bytes(rq)) {
 			/*
 			 * make sure space for the drain appears we
 			 * know we can do this because max_hw_segments
@@ -1759,7 +1726,7 @@ struct request *elv_next_request(struct request_queue *q)
 			 * avoid resource deadlock.  REQ_STARTED will
 			 * prevent other fs requests from passing this one.
 			 */
-			if (q->dma_drain_size && rq->data_len &&
+			if (q->dma_drain_size && blk_rq_bytes(rq) &&
 			    !(rq->cmd_flags & REQ_DONTPREP)) {
 				/*
 				 * remove the space for the drain we added
@@ -1911,8 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		 * can find how many bytes remain in the request
 		 * later.
 		 */
-		req->nr_sectors = req->hard_nr_sectors = 0;
-		req->current_nr_sectors = req->hard_cur_sectors = 0;
+		req->data_len = 0;
 		return false;
 	}
 
@@ -1926,8 +1892,25 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		bio_iovec(bio)->bv_len -= nr_bytes;
 	}
 
-	blk_recalc_rq_sectors(req, total_bytes >> 9);
+	req->data_len -= total_bytes;
+	req->buffer = bio_data(req->bio);
+
+	/* update sector only for requests with clear definition of sector */
+	if (blk_fs_request(req) || blk_discard_rq(req))
+		req->sector += total_bytes >> 9;
+
+	/*
+	 * If total number of sectors is less than the first segment
+	 * size, something has gone terribly wrong.
+	 */
+	if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+		printk(KERN_ERR "blk: request botched\n");
+		req->data_len = blk_rq_cur_bytes(req);
+	}
+
+	/* recalculate the number of segments */
 	blk_recalc_rq_segments(req);
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
@@ -2049,11 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 		rq->buffer = bio_data(bio);
 	}
-	rq->current_nr_sectors = bio_cur_sectors(bio);
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
 	rq->data_len = bio->bi_size;
-
 	rq->bio = rq->biotail = bio;
 
 	if (bio->bi_bdev)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index bf62a87..b8df66a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,35 +9,6 @@
 
 #include "blk.h"
 
-void blk_recalc_rq_sectors(struct request *rq, int nsect)
-{
-	if (blk_fs_request(rq) || blk_discard_rq(rq)) {
-		rq->hard_sector += nsect;
-		rq->hard_nr_sectors -= nsect;
-
-		/*
-		 * Move the I/O submission pointers ahead if required.
-		 */
-		if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
-		    (rq->sector <= rq->hard_sector)) {
-			rq->sector = rq->hard_sector;
-			rq->nr_sectors = rq->hard_nr_sectors;
-			rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
-			rq->current_nr_sectors = rq->hard_cur_sectors;
-			rq->buffer = bio_data(rq->bio);
-		}
-
-		/*
-		 * if total number of sectors is less than the first segment
-		 * size, something has gone terribly wrong
-		 */
-		if (rq->nr_sectors < rq->current_nr_sectors) {
-			printk(KERN_ERR "blk: request botched\n");
-			rq->nr_sectors = rq->current_nr_sectors;
-		}
-	}
-}
-
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio)
 {
@@ -199,8 +170,9 @@ new_segment:
 
 
 	if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
-	    (rq->data_len & q->dma_pad_mask)) {
-		unsigned int pad_len = (q->dma_pad_mask & ~rq->data_len) + 1;
+	    (blk_rq_bytes(rq) & q->dma_pad_mask)) {
+		unsigned int pad_len =
+			(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
 
 		sg->length += pad_len;
 		rq->extra_len += pad_len;
@@ -398,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
-	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
+	req->data_len += blk_rq_bytes(next);
 
 	elv_merge_requests(q, req, next);
 
diff --git a/block/blk.h b/block/blk.h
index 5111559..ab54529 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -101,7 +101,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
 void blk_recalc_rq_segments(struct request *rq);
-void blk_recalc_rq_sectors(struct request *rq, int nsect);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index db4d990..99ac430 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -579,9 +579,9 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
 		 * Sort strictly based on sector.  Smallest to the left,
 		 * largest to the right.
 		 */
-		if (sector > cfqq->next_rq->sector)
+		if (sector > blk_rq_pos(cfqq->next_rq))
 			n = &(*p)->rb_right;
-		else if (sector < cfqq->next_rq->sector)
+		else if (sector < blk_rq_pos(cfqq->next_rq))
 			n = &(*p)->rb_left;
 		else
 			break;
@@ -611,8 +611,8 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		return;
 
 	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
-	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector,
-					 &parent, &p);
+	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
+				      blk_rq_pos(cfqq->next_rq), &parent, &p);
 	if (!__cfqq) {
 		rb_link_node(&cfqq->p_node, parent, p);
 		rb_insert_color(&cfqq->p_node, cfqq->p_root);
@@ -996,7 +996,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 	if (cfq_rq_close(cfqd, __cfqq->next_rq))
 		return __cfqq;
 
-	if (__cfqq->next_rq->sector < sector)
+	if (blk_rq_pos(__cfqq->next_rq) < sector)
 		node = rb_next(&__cfqq->p_node);
 	else
 		node = rb_prev(&__cfqq->p_node);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f37ca8c..d30ec6f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -218,12 +218,12 @@ struct bio {
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
 
-static inline unsigned int bio_cur_sectors(struct bio *bio)
+static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
 	if (bio->bi_vcnt)
-		return bio_iovec(bio)->bv_len >> 9;
+		return bio_iovec(bio)->bv_len;
 	else /* dataless requests such as discard */
-		return bio->bi_size >> 9;
+		return bio->bi_size;
 }
 
 static inline void *bio_data(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e5f855..ce2bf5e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -166,19 +166,8 @@ struct request {
 	enum rq_cmd_type_bits cmd_type;
 	unsigned long atomic_flags;
 
-	/* Maintain bio traversal state for part by part I/O submission.
-	 * hard_* are block layer internals, no driver should touch them!
-	 */
-
-	sector_t sector;		/* next sector to submit */
-	sector_t hard_sector;		/* next sector to complete */
-	unsigned long nr_sectors;	/* no. of sectors left to submit */
-	unsigned long hard_nr_sectors;	/* no. of sectors left to complete */
-	/* no. of sectors left to submit in the current segment */
-	unsigned int current_nr_sectors;
-
-	/* no. of sectors left to complete in the current segment */
-	unsigned int hard_cur_sectors;
+	sector_t sector;	/* sector cursor */
+	unsigned int data_len;	/* total data len, don't access directly */
 
 	struct bio *bio;
 	struct bio *biotail;
@@ -226,7 +215,6 @@ struct request {
 	unsigned char __cmd[BLK_MAX_CDB];
 	unsigned char *cmd;
 
-	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
 	unsigned int resid_len;	/* residual count */
@@ -840,20 +828,27 @@ extern void blkdev_dequeue_request(struct request *req);
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
-	return rq->hard_sector;
+	return rq->sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+	return rq->data_len;
 }
 
-extern unsigned int blk_rq_bytes(struct request *rq);
-extern unsigned int blk_rq_cur_bytes(struct request *rq);
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
+}
 
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
-	return rq->hard_nr_sectors;
+	return blk_rq_bytes(rq) >> 9;
 }
 
 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 {
-	return rq->hard_cur_sectors;
+	return blk_rq_cur_bytes(rq) >> 9;
 }
 
 /*
@@ -928,7 +923,7 @@ static inline void blk_end_request_all(struct request *rq, int error)
  */
 static inline bool blk_end_request_cur(struct request *rq, int error)
 {
-	return blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
 }
 
 /**
@@ -981,7 +976,7 @@ static inline void __blk_end_request_all(struct request *rq, int error)
  */
 static inline bool __blk_end_request_cur(struct request *rq, int error)
 {
-	return __blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
 }
 
 extern void blk_complete_request(struct request *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index c59b769..4e46287 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -171,7 +171,7 @@ enum {
 	ELV_MQUEUE_MUST,
 };
 
-#define rq_end_sector(rq)	((rq)->sector + (rq)->nr_sectors)
+#define rq_end_sector(rq)	(blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 42f1c11..5708a14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -642,12 +642,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				rq->cmd_len, rq->cmd);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
-				rw, what, rq->errors, 0, NULL);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
+				what, rq->errors, 0, NULL);
 	}
 }
 
@@ -854,11 +854,11 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
-- 
cgit v0.10.2


From b0790410300abaaf4f25f702803beff701baebf1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:42 +0900
Subject: block: cleanup rq->data_len usages

With recent unification of fields, it's now guaranteed that
rq->data_len always equals blk_rq_bytes().  Convert all non-IDE direct
users to accessors.  IDE will be converted in a separate patch.

Boaz: spotted incorrect data_len/resid_len conversion in osd.

[ Impact: convert direct rq->data_len usages to blk_rq_bytes() ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 2733b0c..6e4c600 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1084,7 +1084,7 @@ static int atapi_drain_needed(struct request *rq)
 	if (likely(!blk_pc_request(rq)))
 		return 0;
 
-	if (!rq->data_len || (rq->cmd_flags & REQ_RW))
+	if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW))
 		return 0;
 
 	return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index dc3b899..1591f61 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -747,7 +747,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
 {
 	struct request *rq = urq->rq;
 
-	if (rq->data_len == 0) {
+	if (blk_rq_bytes(rq) == 0) {
 		cmd->dir = UB_DIR_NONE;
 	} else {
 		if (rq_data_dir(rq) == WRITE)
@@ -762,7 +762,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
 	memcpy(&cmd->cdb, rq->cmd, rq->cmd_len);
 	cmd->cdb_len = rq->cmd_len;
 
-	cmd->len = rq->data_len;
+	cmd->len = blk_rq_bytes(rq);
 
 	/*
 	 * To reapply this to every URB is not as incorrect as it looks.
@@ -783,8 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len < rq->data_len)
-				rq->resid_len = rq->data_len - cmd->act_len;
+			if (cmd->act_len < blk_rq_bytes(rq))
+				rq->resid_len = blk_rq_bytes(rq) - cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 5d5f347..4e6fcf0 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1277,8 +1277,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	/* do we need to support multiple segments? */
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-		    ioc->name, __func__, req->bio->bi_vcnt, req->data_len,
-		    rsp->bio->bi_vcnt, rsp->data_len);
+		    ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
+		    rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
@@ -1295,7 +1295,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	smpreq = (SmpPassthroughRequest_t *)mf;
 	memset(smpreq, 0, sizeof(*smpreq));
 
-	smpreq->RequestDataLength = cpu_to_le16(req->data_len - 4);
+	smpreq->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4);
 	smpreq->Function = MPI_FUNCTION_SMP_PASSTHROUGH;
 
 	if (rphy)
@@ -1321,10 +1321,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		       MPI_SGE_FLAGS_END_OF_BUFFER |
 		       MPI_SGE_FLAGS_DIRECTION |
 		       mpt_addr_size()) << MPI_SGE_FLAGS_SHIFT;
-	flagsLength |= (req->data_len - 4);
+	flagsLength |= (blk_rq_bytes(req) - 4);
 
 	dma_addr_out = pci_map_single(ioc->pcidev, bio_data(req->bio),
-				      req->data_len, PCI_DMA_BIDIRECTIONAL);
+				      blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_out)
 		goto put_mf;
 	mpt_add_sge(psge, flagsLength, dma_addr_out);
@@ -1332,9 +1332,9 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 	/* response */
 	flagsLength = MPT_SGE_FLAGS_SSIMPLE_READ;
-	flagsLength |= rsp->data_len + 4;
+	flagsLength |= blk_rq_bytes(rsp) + 4;
 	dma_addr_in =  pci_map_single(ioc->pcidev, bio_data(rsp->bio),
-				      rsp->data_len, PCI_DMA_BIDIRECTIONAL);
+				      blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_in)
 		goto unmap;
 	mpt_add_sge(psge, flagsLength, dma_addr_in);
@@ -1357,7 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		rsp->resid_len = rsp->data_len - smprep->ResponseDataLength;
+		rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
@@ -1365,10 +1365,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	}
 unmap:
 	if (dma_addr_out)
-		pci_unmap_single(ioc->pcidev, dma_addr_out, req->data_len,
+		pci_unmap_single(ioc->pcidev, dma_addr_out, blk_rq_bytes(req),
 				 PCI_DMA_BIDIRECTIONAL);
 	if (dma_addr_in)
-		pci_unmap_single(ioc->pcidev, dma_addr_in, rsp->data_len,
+		pci_unmap_single(ioc->pcidev, dma_addr_in, blk_rq_bytes(rsp),
 				 PCI_DMA_BIDIRECTIONAL);
 put_mf:
 	if (mf)
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 510eb47..6b61d28 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -430,7 +430,7 @@ static void i2o_block_end_request(struct request *req, int error,
 		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 
 		if (blk_pc_request(req))
-			leftover = req->data_len;
+			leftover = blk_rq_bytes(req);
 
 		if (error)
 			blk_end_request(req, -EIO, leftover);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 6605ec9..531af9e 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1927,13 +1927,13 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	/* do we need to support multiple segments? */
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk("%s: multiple segments req %u %u, rsp %u %u\n",
-		       __func__, req->bio->bi_vcnt, req->data_len,
-		       rsp->bio->bi_vcnt, rsp->data_len);
+		       __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
+		       rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
-	ret = smp_execute_task(dev, bio_data(req->bio), req->data_len,
-			       bio_data(rsp->bio), rsp->data_len);
+	ret = smp_execute_task(dev, bio_data(req->bio), blk_rq_bytes(req),
+			       bio_data(rsp->bio), blk_rq_bytes(rsp));
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
 		rsp->resid_len = ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index 89952ed..be9a951 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -137,21 +137,21 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	int error = -EINVAL;
 
 	/* eight is the minimum size for request and response frames */
-	if (req->data_len < 8 || rsp->data_len < 8)
+	if (blk_rq_bytes(req) < 8 || blk_rq_bytes(rsp) < 8)
 		goto out;
 
-	if (bio_offset(req->bio) + req->data_len > PAGE_SIZE ||
-	    bio_offset(rsp->bio) + rsp->data_len > PAGE_SIZE) {
+	if (bio_offset(req->bio) + blk_rq_bytes(req) > PAGE_SIZE ||
+	    bio_offset(rsp->bio) + blk_rq_bytes(rsp) > PAGE_SIZE) {
 		shost_printk(KERN_ERR, shost,
 			"SMP request/response frame crosses page boundary");
 		goto out;
 	}
 
-	req_data = kzalloc(req->data_len, GFP_KERNEL);
+	req_data = kzalloc(blk_rq_bytes(req), GFP_KERNEL);
 
 	/* make sure frame can always be built ... we copy
 	 * back only the requested length */
-	resp_data = kzalloc(max(rsp->data_len, 128U), GFP_KERNEL);
+	resp_data = kzalloc(max(blk_rq_bytes(rsp), 128U), GFP_KERNEL);
 
 	if (!req_data || !resp_data) {
 		error = -ENOMEM;
@@ -160,7 +160,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 
 	local_irq_disable();
 	buf = kmap_atomic(bio_page(req->bio), KM_USER0) + bio_offset(req->bio);
-	memcpy(req_data, buf, req->data_len);
+	memcpy(req_data, buf, blk_rq_bytes(req));
 	kunmap_atomic(buf - bio_offset(req->bio), KM_USER0);
 	local_irq_enable();
 
@@ -176,8 +176,8 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
-	req->resid_len = req->data_len;
-	rsp->resid_len = rsp->data_len;
+	req->resid_len = blk_rq_bytes(req);
+	rsp->resid_len = blk_rq_bytes(rsp);
 
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
@@ -264,7 +264,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 
 	local_irq_disable();
 	buf = kmap_atomic(bio_page(rsp->bio), KM_USER0) + bio_offset(rsp->bio);
-	memcpy(buf, resp_data, rsp->data_len);
+	memcpy(buf, resp_data, blk_rq_bytes(rsp));
 	flush_kernel_dcache_page(bio_page(rsp->bio));
 	kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0);
 	local_irq_enable();
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index 53759c5..af95a44 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1041,7 +1041,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk(MPT2SAS_ERR_FMT "%s: multiple segments req %u %u, "
 		    "rsp %u %u\n", ioc->name, __func__, req->bio->bi_vcnt,
-		    req->data_len, rsp->bio->bi_vcnt, rsp->data_len);
+		    blk_rq_bytes(req), rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
@@ -1104,7 +1104,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	*((u64 *)&mpi_request->SASAddress) = (rphy) ?
 	    cpu_to_le64(rphy->identify.sas_address) :
 	    cpu_to_le64(ioc->sas_hba.sas_address);
-	mpi_request->RequestDataLength = cpu_to_le16(req->data_len - 4);
+	mpi_request->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4);
 	psge = &mpi_request->SGL;
 
 	/* WRITE sgel first */
@@ -1112,13 +1112,13 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	    MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC);
 	sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
 	dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio),
-	      req->data_len, PCI_DMA_BIDIRECTIONAL);
+		blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_out) {
 		mpt2sas_base_free_smid(ioc, le16_to_cpu(smid));
 		goto unmap;
 	}
 
-	ioc->base_add_sg_single(psge, sgl_flags | (req->data_len - 4),
+	ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(req) - 4),
 	    dma_addr_out);
 
 	/* incr sgel */
@@ -1129,14 +1129,14 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	    MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER |
 	    MPI2_SGE_FLAGS_END_OF_LIST);
 	sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-	dma_addr_in =  pci_map_single(ioc->pdev, bio_data(rsp->bio),
-	      rsp->data_len, PCI_DMA_BIDIRECTIONAL);
+	dma_addr_in = pci_map_single(ioc->pdev, bio_data(rsp->bio),
+				     blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_in) {
 		mpt2sas_base_free_smid(ioc, le16_to_cpu(smid));
 		goto unmap;
 	}
 
-	ioc->base_add_sg_single(psge, sgl_flags | (rsp->data_len + 4),
+	ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(rsp) + 4),
 	    dma_addr_in);
 
 	dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - "
@@ -1170,7 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength;
+		rsp->resid_len = blk_rq_bytes(rsp) -
+				 mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
@@ -1186,10 +1187,10 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
  unmap:
 	if (dma_addr_out)
-		pci_unmap_single(ioc->pdev, dma_addr_out, req->data_len,
+		pci_unmap_single(ioc->pdev, dma_addr_out, blk_rq_bytes(req),
 		    PCI_DMA_BIDIRECTIONAL);
 	if (dma_addr_in)
-		pci_unmap_single(ioc->pdev, dma_addr_in, rsp->data_len,
+		pci_unmap_single(ioc->pdev, dma_addr_in, blk_rq_bytes(rsp),
 		    PCI_DMA_BIDIRECTIONAL);
 
  out:
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 2a5f077..d178a8b 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1299,7 +1299,7 @@ int osd_finalize_request(struct osd_request *or,
 			return ret;
 		}
 		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->out.total_bytes), or->out.req->data_len);
+			_LLU(or->out.total_bytes), blk_rq_bytes(or->out.req));
 	}
 	if (or->in.bio) {
 		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
@@ -1308,7 +1308,7 @@ int osd_finalize_request(struct osd_request *or,
 			return ret;
 		}
 		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->in.total_bytes), or->in.req->data_len);
+			_LLU(or->in.total_bytes), blk_rq_bytes(or->in.req));
 	}
 
 	or->out.pad_buff = sg_out_pad_buffer;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 39b3acf..3d16c70 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -682,14 +682,13 @@ EXPORT_SYMBOL(scsi_release_buffers);
 static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 {
 	struct request *req = cmd->request;
-	unsigned int dlen = req->data_len;
-	unsigned int next_dlen = req->next_rq->data_len;
 
 	req->resid_len = scsi_out(cmd)->resid;
 	req->next_rq->resid_len = scsi_in(cmd)->resid;
 
 	/* The req and req->next_rq have not been completed */
-	BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
+	BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req),
+				    blk_rq_bytes(req->next_rq)));
 
 	scsi_release_buffers(cmd);
 
@@ -966,7 +965,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
 	if (blk_pc_request(req))
-		sdb->length = req->data_len;
+		sdb->length = blk_rq_bytes(req);
 	else
 		sdb->length = blk_rq_sectors(req) << 9;
 	return BLKPREP_OK;
@@ -1087,21 +1086,21 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 		if (unlikely(ret))
 			return ret;
 	} else {
-		BUG_ON(req->data_len);
+		BUG_ON(blk_rq_bytes(req));
 
 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
 		req->buffer = NULL;
 	}
 
 	cmd->cmd_len = req->cmd_len;
-	if (!req->data_len)
+	if (!blk_rq_bytes(req))
 		cmd->sc_data_direction = DMA_NONE;
 	else if (rq_data_dir(req) == WRITE)
 		cmd->sc_data_direction = DMA_TO_DEVICE;
 	else
 		cmd->sc_data_direction = DMA_FROM_DEVICE;
 	
-	cmd->transfersize = req->data_len;
+	cmd->transfersize = blk_rq_bytes(req);
 	cmd->allowed = req->retries;
 	return BLKPREP_OK;
 }
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 48ba413..1030327 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -387,7 +387,7 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
 	 * we use REQ_TYPE_BLOCK_PC so scsi_init_io doesn't set the
 	 * length for us.
 	 */
-	cmd->sdb.length = rq->data_len;
+	cmd->sdb.length = blk_rq_bytes(rq);
 
 	return 0;
 
-- 
cgit v0.10.2


From 34b7d2c957199834c474c9d46739265643f4d9c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:43 +0900
Subject: ide: cleanup rq->data_len usages

With recent unification of fields, it's now guaranteed that
rq->data_len always equals blk_rq_bytes().  Convert all direct users
to accessors.

[ Impact: convert direct rq->data_len usages to blk_rq_bytes() ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index e4a02a0..792534d 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -255,7 +255,7 @@ void ide_retry_pc(ide_drive_t *drive)
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
 	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
-	pc->req_xfer = sense_rq->data_len;
+	pc->req_xfer = blk_rq_bytes(sense_rq);
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
@@ -303,7 +303,7 @@ int ide_cd_get_xferlen(struct request *rq)
 		return 32768;
 	else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 			 rq->cmd_type == REQ_TYPE_ATA_PC)
-		return rq->data_len;
+		return blk_rq_bytes(rq);
 	else
 		return 0;
 }
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index eb4f3dc..2eadc9d 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -577,7 +577,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	struct request *rq = hwif->rq;
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
-	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0, nsectors;
+	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
 	int sense = blk_sense_request(rq);
 	unsigned int timeout;
 	u16 len;
@@ -707,9 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
-		if (blk_end_request(rq, 0, rq->data_len))
-			BUG();
-
+		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
 	} else {
 		if (sense && uptodate)
@@ -727,22 +725,14 @@ out_end:
 			ide_cd_error_cmd(drive, cmd);
 
 		/* make sure it's fully ended */
-		if (blk_pc_request(rq))
-			nsectors = (rq->data_len + 511) >> 9;
-		else
-			nsectors = blk_rq_sectors(rq);
-
-		if (nsectors == 0)
-			nsectors = 1;
-
 		if (blk_fs_request(rq) == 0) {
-			rq->resid_len = rq->data_len -
+			rq->resid_len = blk_rq_bytes(rq) -
 				(cmd->nbytes - cmd->nleft);
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				rq->resid_len += cmd->last_xfer_len;
 		}
 
-		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
+		ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq));
 
 		if (sense && rc == 2)
 			ide_error(drive, "request sense failure", stat);
@@ -819,7 +809,7 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 		 */
 		alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 		if ((unsigned long)buf & alignment
-		    || rq->data_len & q->dma_pad_mask
+		    || blk_rq_bytes(rq) & q->dma_pad_mask
 		    || object_is_on_stack(buf))
 			drive->dma = 0;
 	}
@@ -867,9 +857,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
-	if (blk_fs_request(rq) || rq->data_len) {
-		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ?
-				(blk_rq_sectors(rq) << 9) : rq->data_len);
+	if (blk_fs_request(rq) || blk_rq_bytes(rq)) {
+		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 1c460bd..6509817 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -220,14 +220,14 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len) {
+	if (blk_rq_bytes(rq)) {
 		pc->flags |= PC_FLAG_DMA_OK;
 		if (rq_data_dir(rq) == WRITE)
 			pc->flags |= PC_FLAG_WRITING;
 	}
 	/* pio will be performed by ide_pio_bytes() which handles sg fine */
 	pc->buf = NULL;
-	pc->req_xfer = pc->buf_size = rq->data_len;
+	pc->req_xfer = pc->buf_size = blk_rq_bytes(rq);
 }
 
 static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 59799ca..ca2519d 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -116,7 +116,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 unsigned int ide_rq_bytes(struct request *rq)
 {
 	if (blk_pc_request(rq))
-		return rq->data_len;
+		return blk_rq_bytes(rq);
 	else
 		return blk_rq_cur_sectors(rq) << 9;
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index c6cf3a9..e166045 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->resid_len = rq->data_len - blocks * tape->blk_size;
+		rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
-- 
cgit v0.10.2


From a2dec7b36364a5cc564c4d76cf16d2e7d33f5c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:44 +0900
Subject: block: hide request sector and data_len

Block low level drivers for some reason have been pretty good at
abusing block layer API.  Especially struct request's fields tend to
get violated in all possible ways.  Make it clear that low level
drivers MUST NOT access or manipulate rq->sector and rq->data_len
directly by prefixing them with double underscores.

This change is also necessary to break build of out-of-tree codes
which assume the previous block API where internal fields can be
manipulated and rq->data_len carries residual count on completion.

[ Impact: hide internal fields, block API change ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 3596ca7..6226a38 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->cpu = -1;
 	rq->q = q;
-	rq->sector = (sector_t) -1;
+	rq->__sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->cmd = rq->__cmd;
@@ -1095,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
-	req->sector = bio->bi_sector;
+	req->__sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
 	blk_rq_bio_prep(req->q, req, bio);
 }
@@ -1143,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
-		req->data_len += bytes;
+		req->__data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1169,8 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		 * not touch req->buffer either...
 		 */
 		req->buffer = bio_data(bio);
-		req->sector = bio->bi_sector;
-		req->data_len += bytes;
+		req->__sector = bio->bi_sector;
+		req->__data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1878,7 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		 * can find how many bytes remain in the request
 		 * later.
 		 */
-		req->data_len = 0;
+		req->__data_len = 0;
 		return false;
 	}
 
@@ -1892,12 +1892,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		bio_iovec(bio)->bv_len -= nr_bytes;
 	}
 
-	req->data_len -= total_bytes;
+	req->__data_len -= total_bytes;
 	req->buffer = bio_data(req->bio);
 
 	/* update sector only for requests with clear definition of sector */
 	if (blk_fs_request(req) || blk_discard_rq(req))
-		req->sector += total_bytes >> 9;
+		req->__sector += total_bytes >> 9;
 
 	/*
 	 * If total number of sectors is less than the first segment
@@ -1905,7 +1905,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	 */
 	if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
 		printk(KERN_ERR "blk: request botched\n");
-		req->data_len = blk_rq_cur_bytes(req);
+		req->__data_len = blk_rq_cur_bytes(req);
 	}
 
 	/* recalculate the number of segments */
@@ -2032,7 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 		rq->buffer = bio_data(bio);
 	}
-	rq->data_len = bio->bi_size;
+	rq->__data_len = bio->bi_size;
 	rq->bio = rq->biotail = bio;
 
 	if (bio->bi_bdev)
diff --git a/block/blk-map.c b/block/blk-map.c
index 694fefa..56082be 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		rq->biotail->bi_next = bio;
 		rq->biotail = bio;
 
-		rq->data_len += bio->bi_size;
+		rq->__data_len += bio->bi_size;
 	}
 	return 0;
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b8df66a..4974dd5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -370,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
-	req->data_len += blk_rq_bytes(next);
+	req->__data_len += blk_rq_bytes(next);
 
 	elv_merge_requests(q, req, next);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ce2bf5e..c755803 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -166,8 +166,9 @@ struct request {
 	enum rq_cmd_type_bits cmd_type;
 	unsigned long atomic_flags;
 
-	sector_t sector;	/* sector cursor */
-	unsigned int data_len;	/* total data len, don't access directly */
+	/* the following two fields are internal, NEVER access directly */
+	sector_t __sector;		/* sector cursor */
+	unsigned int __data_len;	/* total data len */
 
 	struct bio *bio;
 	struct bio *biotail;
@@ -828,12 +829,12 @@ extern void blkdev_dequeue_request(struct request *req);
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
-	return rq->sector;
+	return rq->__sector;
 }
 
 static inline unsigned int blk_rq_bytes(const struct request *rq)
 {
-	return rq->data_len;
+	return rq->__data_len;
 }
 
 static inline int blk_rq_cur_bytes(const struct request *rq)
-- 
cgit v0.10.2


From 1011c1b9f2e45ce7c6e38888d2b83936aec38771 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:45 +0900
Subject: block: blk_rq_[cur_]_{sectors|bytes}() usage cleanup

With the previous changes, the followings are now guaranteed for all
requests in any valid state.

* blk_rq_sectors() == blk_rq_bytes() >> 9
* blk_rq_cur_sectors() == blk_rq_cur_bytes() >> 9

Clean up accessor usages.  Notable changes are

* nbd,i2o_block: end_all used instead of explicit byte count
* scsi_lib: unnecessary conditional on request type removed

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 4524862..1e27ed9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2512,8 +2512,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	remaining = current_count_sectors << 9;
 #ifdef FLOPPY_SANITY_CHECK
-	if ((remaining >> 9) > blk_rq_sectors(current_req) &&
-	    CT(COMMAND) == FD_WRITE) {
+	if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) {
 		DPRINT("in copy buffer\n");
 		printk("current_count_sectors=%ld\n", current_count_sectors);
 		printk("remaining=%d\n", remaining >> 9);
@@ -2530,7 +2529,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
 
-	size = blk_rq_cur_sectors(current_req) << 9;
+	size = blk_rq_cur_bytes(current_req);
 
 	rq_for_each_segment(bv, current_req, iter) {
 		if (!remaining)
@@ -2879,7 +2878,7 @@ static int make_raw_rw_request(void)
 				printk("write\n");
 			return 0;
 		}
-	} else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 ||
+	} else if (raw_cmd->length > blk_rq_bytes(current_req) ||
 		   current_count_sectors > blk_rq_sectors(current_req)) {
 		DPRINT("buffer overrun in direct transfer\n");
 		return 0;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 977a573..fad167de 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req)
 			req, error ? "failed" : "done");
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_end_request(req, error, blk_rq_sectors(req) << 9);
+	__blk_end_request_all(req, error);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -231,7 +231,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
-	unsigned long size = blk_rq_sectors(req) << 9;
+	unsigned long size = blk_rq_bytes(req);
 
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
@@ -243,7 +243,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
-			blk_rq_sectors(req) << 9);
+			blk_rq_bytes(req));
 	result = sock_xmit(lo, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 1591f61..40d03cf 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	cmd->cdb[8] = nblks;
 	cmd->cdb_len = 10;
 
-	cmd->len = blk_rq_sectors(rq) * 512;
+	cmd->len = blk_rq_bytes(rq);
 }
 
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d4e6b71..6a13838 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -72,7 +72,7 @@ static void do_z2_request(struct request_queue *q)
 	struct request *req;
 	while ((req = elv_next_request(q)) != NULL) {
 		unsigned long start = blk_rq_pos(req) << 9;
-		unsigned long len  = blk_rq_cur_sectors(req) << 9;
+		unsigned long len  = blk_rq_cur_bytes(req);
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 9e600d2..93b2c61 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -680,7 +680,7 @@ try_again:
 		t_sec = blk_rq_pos(msb->block_req) << 9;
 		sector_div(t_sec, msb->page_size);
 
-		count = blk_rq_sectors(msb->block_req) << 9;
+		count = blk_rq_bytes(msb->block_req);
 		count /= msb->page_size;
 
 		param.system = msb->system;
@@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
 					t_len *= msb->page_size;
 			}
 		} else
-			t_len = blk_rq_sectors(msb->block_req) << 9;
+			t_len = blk_rq_bytes(msb->block_req);
 
 		dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error);
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 6b61d28..e153f5d 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -426,15 +426,9 @@ static void i2o_block_end_request(struct request *req, int error,
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
-	if (blk_end_request(req, error, nr_bytes)) {
-		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
-
-		if (blk_pc_request(req))
-			leftover = blk_rq_bytes(req);
-
+	if (blk_end_request(req, error, nr_bytes))
 		if (error)
-			blk_end_request(req, -EIO, leftover);
-	}
+			blk_end_request_all(req, -EIO);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
@@ -832,15 +826,13 @@ static int i2o_block_transfer(struct request *req)
 
 		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		*mptr++ =
-		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
 	} else
 #endif
 	{
 		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
 		*mptr++ = cpu_to_le32(ctl_flags);
-		*mptr++ =
-		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
 		*mptr++ =
 		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
 		*mptr++ =
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 4ea2e67..50c76a2 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -48,7 +48,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	char *buf;
 
 	block = blk_rq_pos(req) << 9 >> tr->blkshift;
-	nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift;
+	nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
 
 	buf = req->buffer;
 
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 2132c90..d56ddaa 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -189,7 +189,7 @@ static void jsfd_do_request(struct request_queue *q)
 	while ((req = elv_next_request(q)) != NULL) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
-		size_t len = blk_rq_cur_sectors(req) << 9;
+		size_t len = blk_rq_cur_bytes(req);
 
 		if ((offset + len) > jdp->dsize) {
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 3d16c70..ee308f6 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = blk_rq_sectors(req) << 9;
+		int leftover = blk_rq_bytes(req);
 
 		if (blk_pc_request(req))
 			leftover = req->resid_len;
@@ -964,10 +964,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
-	if (blk_pc_request(req))
-		sdb->length = blk_rq_bytes(req);
-	else
-		sdb->length = blk_rq_sectors(req) << 9;
+	sdb->length = blk_rq_bytes(req);
 	return BLKPREP_OK;
 }
 
-- 
cgit v0.10.2


From 8f6205cd572fece673da0255d74843680f67f879 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:53:59 +0900
Subject: ide: dequeue in-flight request

ide generally has single request in flight and tracks it using
hwif->rq and all state handlers follow the following convention.

* ide_started is returned if the request is in flight.

* ide_stopped is returned if the queue needs to be restarted.  The
  request might or might not have been processed fully or partially.

* hwif->rq is set to NULL, when an issued request completes.

So, dequeueing model can be implemented by dequeueing after fetch,
requeueing if hwif->rq isn't NULL on ide_stopped return and doing
about the same thing on completion / port unlock paths.  These changes
can be made in ide-io proper.

In addition to the above main changes, the following updates are
necessary.

* ide-cd shouldn't dequeue a request when issuing REQUEST SENSE for it
  as the request is already dequeued.

* ide-atapi uses request queue as stack when issuing REQUEST SENSE to
  put the REQUEST SENSE in front of the failed request.  This now
  needs to be done using requeueing.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 792534d..2874c3d 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -246,6 +246,7 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
  */
 void ide_retry_pc(ide_drive_t *drive)
 {
+	struct request *failed_rq = drive->hwif->rq;
 	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
@@ -260,8 +261,17 @@ void ide_retry_pc(ide_drive_t *drive)
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	if (ide_queue_sense_rq(drive, pc))
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+	/*
+	 * Push back the failed request and put request sense on top
+	 * of it.  The failed command will be retried after sense data
+	 * is acquired.
+	 */
+	blk_requeue_request(failed_rq->q, failed_rq);
+	drive->hwif->rq = NULL;
+	if (ide_queue_sense_rq(drive, pc)) {
+		blkdev_dequeue_request(failed_rq);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq));
+	}
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 2eadc9d..4c7792f 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -405,15 +405,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 end_request:
 	if (stat & ATA_ERR) {
-		struct request_queue *q = drive->queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blkdev_dequeue_request(rq);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
 		hwif->rq = NULL;
-
 		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ca2519d..abda733 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -487,10 +487,10 @@ void do_ide_request(struct request_queue *q)
 
 	if (!ide_lock_port(hwif)) {
 		ide_hwif_t *prev_port;
+
+		WARN_ON_ONCE(hwif->rq);
 repeat:
 		prev_port = hwif->host->cur_port;
-		hwif->rq = NULL;
-
 		if (drive->dev_flags & IDE_DFLAG_SLEEPING &&
 		    time_after(drive->sleep, jiffies)) {
 			ide_unlock_port(hwif);
@@ -519,7 +519,12 @@ repeat:
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
-		rq = elv_next_request(drive->queue);
+		if (!rq) {
+			rq = elv_next_request(drive->queue);
+			if (rq)
+				blkdev_dequeue_request(rq);
+		}
+
 		spin_unlock_irq(q->queue_lock);
 		spin_lock_irq(&hwif->lock);
 
@@ -555,8 +560,11 @@ repeat:
 		startstop = start_request(drive, rq);
 		spin_lock_irq(&hwif->lock);
 
-		if (startstop == ide_stopped)
+		if (startstop == ide_stopped) {
+			rq = hwif->rq;
+			hwif->rq = NULL;
 			goto repeat;
+		}
 	} else
 		goto plug_device;
 out:
@@ -572,18 +580,24 @@ plug_device:
 plug_device_2:
 	spin_lock_irq(q->queue_lock);
 
+	if (rq)
+		blk_requeue_request(q, rq);
 	if (!elv_queue_empty(q))
 		blk_plug_device(q);
 }
 
-static void ide_plug_device(ide_drive_t *drive)
+static void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (rq)
+		blk_requeue_request(q, rq);
 	if (!elv_queue_empty(q))
 		blk_plug_device(q);
+
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -632,6 +646,7 @@ void ide_timer_expiry (unsigned long data)
 	unsigned long	flags;
 	int		wait = -1;
 	int		plug_device = 0;
+	struct request	*uninitialized_var(rq_in_flight);
 
 	spin_lock_irqsave(&hwif->lock, flags);
 
@@ -693,6 +708,8 @@ void ide_timer_expiry (unsigned long data)
 		spin_lock_irq(&hwif->lock);
 		enable_irq(hwif->irq);
 		if (startstop == ide_stopped) {
+			rq_in_flight = hwif->rq;
+			hwif->rq = NULL;
 			ide_unlock_port(hwif);
 			plug_device = 1;
 		}
@@ -701,7 +718,7 @@ void ide_timer_expiry (unsigned long data)
 
 	if (plug_device) {
 		ide_unlock_host(hwif->host);
-		ide_plug_device(drive);
+		ide_requeue_and_plug(drive, rq_in_flight);
 	}
 }
 
@@ -787,6 +804,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	ide_startstop_t startstop;
 	irqreturn_t irq_ret = IRQ_NONE;
 	int plug_device = 0;
+	struct request *uninitialized_var(rq_in_flight);
 
 	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
 		if (hwif != host->cur_port)
@@ -866,6 +884,8 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	 */
 	if (startstop == ide_stopped) {
 		BUG_ON(hwif->handler);
+		rq_in_flight = hwif->rq;
+		hwif->rq = NULL;
 		ide_unlock_port(hwif);
 		plug_device = 1;
 	}
@@ -875,7 +895,7 @@ out:
 out_early:
 	if (plug_device) {
 		ide_unlock_host(hwif->host);
-		ide_plug_device(drive);
+		ide_requeue_and_plug(drive, rq_in_flight);
 	}
 
 	return irq_ret;
-- 
cgit v0.10.2


From 9a8d23d8855e554fc5887f14cb008b55c4300ccc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:00 +0900
Subject: mg_disk: fix queue hang / infinite retry on !fs requests

Both request functions in mg_disk simply return when they encounter a
!fs request, which means the request will never be cleared from the
queue causing queue hang and indefinite retry of the request.  Fix it.

While at it, flatten condition checks and add unlikely to !fs tests.

[ Impact: fix possible queue hang / infinite retry of !fs requests ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 826c349..be32388 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -672,16 +672,16 @@ static void mg_request_poll(struct request_queue *q)
 
 	while ((req = elv_next_request(q)) != NULL) {
 		host = req->rq_disk->private_data;
-		if (blk_fs_request(req)) {
-			switch (rq_data_dir(req)) {
-			case READ:
-				mg_read(req);
-				break;
-			case WRITE:
-				mg_write(req);
-				break;
-			}
+
+		if (unlikely(!blk_fs_request(req))) {
+			__blk_end_request_cur(req, -EIO);
+			continue;
 		}
+
+		if (rq_data_dir(req) == READ)
+			mg_read(req);
+		else
+			mg_write(req);
 	}
 }
 
@@ -766,8 +766,10 @@ static void mg_request(struct request_queue *q)
 			continue;
 		}
 
-		if (!blk_fs_request(req))
-			return;
+		if (unlikely(!blk_fs_request(req))) {
+			__blk_end_request_cur(req, -EIO);
+			continue;
+		}
 
 		if (!mg_issue_req(req, host, sect_num, sect_cnt))
 			return;
-- 
cgit v0.10.2


From 5b36ad6000ddea390aca3c3b67ead7616ace2ffc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:01 +0900
Subject: mg_disk: dequeue and track in-flight request

mg_disk has at most single request in flight per device.  Till now,
whenever it needs to access the in-flight request it called
elv_next_request().  This patch makes mg_disk track the in-flight
request directly using mg_host->req and dequeue it when processing
starts.

q->queuedata is set to mg_host so that mg_host can be determined
without fetching request from the queue.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index be32388..1ca5d14 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -135,6 +135,7 @@ struct mg_host {
 	struct device *dev;
 
 	struct request_queue *breq;
+	struct request *req;
 	spinlock_t lock;
 	struct gendisk *gd;
 
@@ -171,17 +172,27 @@ struct mg_host {
 
 static void mg_request(struct request_queue *);
 
+static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
+{
+	if (__blk_end_request(host->req, err, nr_bytes))
+		return true;
+
+	host->req = NULL;
+	return false;
+}
+
+static bool mg_end_request_cur(struct mg_host *host, int err)
+{
+	return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
+}
+
 static void mg_dump_status(const char *msg, unsigned int stat,
 		struct mg_host *host)
 {
 	char *name = MG_DISK_NAME;
-	struct request *req;
 
-	if (host->breq) {
-		req = elv_next_request(host->breq);
-		if (req)
-			name = req->rq_disk->disk_name;
-	}
+	if (host->req)
+		name = host->req->rq_disk->disk_name;
 
 	printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
 	if (stat & ATA_BUSY)
@@ -217,13 +228,9 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			printk("AddrMarkNotFound ");
 		printk("}");
 		if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
-			if (host->breq) {
-				req = elv_next_request(host->breq);
-				if (req)
-					printk(", sector=%u",
-					       (unsigned int)blk_rq_pos(req));
-			}
-
+			if (host->req)
+				printk(", sector=%u",
+				       (unsigned int)blk_rq_pos(host->req));
 		}
 		printk("\n");
 	}
@@ -453,11 +460,10 @@ static int mg_disk_init(struct mg_host *host)
 
 static void mg_bad_rw_intr(struct mg_host *host)
 {
-	struct request *req = elv_next_request(host->breq);
-	if (req != NULL)
-		if (++req->errors >= MG_MAX_ERRORS ||
-				host->error == MG_ERR_TIMEOUT)
-			__blk_end_request_cur(req, -EIO);
+	if (host->req)
+		if (++host->req->errors >= MG_MAX_ERRORS ||
+		    host->error == MG_ERR_TIMEOUT)
+			mg_end_request_cur(host, -EIO);
 }
 
 static unsigned int mg_out(struct mg_host *host,
@@ -515,7 +521,7 @@ static void mg_read(struct request *req)
 
 		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
+	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_write(struct request *req)
@@ -545,14 +551,14 @@ static void mg_write(struct request *req)
 
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
+	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_read_intr(struct mg_host *host)
 {
+	struct request *req = host->req;
 	u32 i;
 	u16 *buff;
-	struct request *req;
 
 	/* check status */
 	do {
@@ -571,7 +577,6 @@ static void mg_read_intr(struct mg_host *host)
 
 ok_to_read:
 	/* get current segment of request */
-	req = elv_next_request(host->breq);
 	buff = (u16 *)req->buffer;
 
 	/* read 1 sector */
@@ -585,7 +590,7 @@ ok_to_read:
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) {
+	if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
 		/* set handler if read remains */
 		host->mg_do_intr = mg_read_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
@@ -595,14 +600,11 @@ ok_to_read:
 
 static void mg_write_intr(struct mg_host *host)
 {
+	struct request *req = host->req;
 	u32 i, j;
 	u16 *buff;
-	struct request *req;
 	bool rem;
 
-	/* get current segment of request */
-	req = elv_next_request(host->breq);
-
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
@@ -619,7 +621,7 @@ static void mg_write_intr(struct mg_host *host)
 	return;
 
 ok_to_write:
-	if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) {
+	if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
 		/* write 1 sector and set handler if remains */
 		buff = (u16 *)req->buffer;
 		for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
@@ -644,44 +646,47 @@ void mg_times_out(unsigned long data)
 {
 	struct mg_host *host = (struct mg_host *)data;
 	char *name;
-	struct request *req;
 
 	spin_lock_irq(&host->lock);
 
-	req = elv_next_request(host->breq);
-	if (!req)
+	if (!host->req)
 		goto out_unlock;
 
 	host->mg_do_intr = NULL;
 
-	name = req->rq_disk->disk_name;
+	name = host->req->rq_disk->disk_name;
 	printk(KERN_DEBUG "%s: timeout\n", name);
 
 	host->error = MG_ERR_TIMEOUT;
 	mg_bad_rw_intr(host);
 
-	mg_request(host->breq);
 out_unlock:
+	mg_request(host->breq);
 	spin_unlock_irq(&host->lock);
 }
 
 static void mg_request_poll(struct request_queue *q)
 {
-	struct request *req;
-	struct mg_host *host;
+	struct mg_host *host = q->queuedata;
 
-	while ((req = elv_next_request(q)) != NULL) {
-		host = req->rq_disk->private_data;
+	while (1) {
+		if (!host->req) {
+			host->req = elv_next_request(q);
+			if (host->req)
+				blkdev_dequeue_request(host->req);
+			else
+				break;
+		}
 
-		if (unlikely(!blk_fs_request(req))) {
-			__blk_end_request_cur(req, -EIO);
+		if (unlikely(!blk_fs_request(host->req))) {
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
-		if (rq_data_dir(req) == READ)
-			mg_read(req);
+		if (rq_data_dir(host->req) == READ)
+			mg_read(host->req);
 		else
-			mg_write(req);
+			mg_write(host->req);
 	}
 }
 
@@ -733,16 +738,19 @@ static unsigned int mg_issue_req(struct request *req,
 /* This function also called from IRQ context */
 static void mg_request(struct request_queue *q)
 {
+	struct mg_host *host = q->queuedata;
 	struct request *req;
-	struct mg_host *host;
 	u32 sect_num, sect_cnt;
 
 	while (1) {
-		req = elv_next_request(q);
-		if (!req)
-			return;
-
-		host = req->rq_disk->private_data;
+		if (!host->req) {
+			host->req = elv_next_request(q);
+			if (host->req)
+				blkdev_dequeue_request(host->req);
+			else
+				break;
+		}
+		req = host->req;
 
 		/* check unwanted request call */
 		if (host->mg_do_intr)
@@ -762,12 +770,12 @@ static void mg_request(struct request_queue *q)
 					"%s: bad access: sector=%d, count=%d\n",
 					req->rq_disk->disk_name,
 					sect_num, sect_cnt);
-			__blk_end_request_cur(req, -EIO);
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
 		if (unlikely(!blk_fs_request(req))) {
-			__blk_end_request_cur(req, -EIO);
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
@@ -981,6 +989,7 @@ static int mg_probe(struct platform_device *plat_dev)
 				__func__, __LINE__);
 		goto probe_err_5;
 	}
+	host->breq->queuedata = host;
 
 	/* mflash is random device, thanx for the noop */
 	elevator_exit(host->breq->elevator);
-- 
cgit v0.10.2


From 8a12c4a456c7ee261a85c2cbec835601e6aae05a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:02 +0900
Subject: hd: dequeue and track in-flight request

hd has at most single request in flight.  Till now, whenever it needs
to access the in-flight request it called elv_next_request().  This
patch makes hd track the in-flight request directly and dequeue it
when processing starts.  The added complexity is minimal and this will
help future block layer changes.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index a3b3994..288ab63 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -98,10 +98,9 @@
 
 static DEFINE_SPINLOCK(hd_lock);
 static struct request_queue *hd_queue;
+static struct request *hd_req;
 
 #define MAJOR_NR HD_MAJOR
-#define QUEUE (hd_queue)
-#define CURRENT elv_next_request(hd_queue)
 
 #define TIMEOUT_VALUE	(6*HZ)
 #define	HD_DELAY	0
@@ -195,11 +194,24 @@ static void __init hd_setup(char *str, int *ints)
 	NR_HD = hdind+1;
 }
 
+static bool hd_end_request(int err, unsigned int bytes)
+{
+	if (__blk_end_request(hd_req, err, bytes))
+		return true;
+	hd_req = NULL;
+	return false;
+}
+
+static bool hd_end_request_cur(int err)
+{
+	return hd_end_request(err, blk_rq_cur_bytes(hd_req));
+}
+
 static void dump_status(const char *msg, unsigned int stat)
 {
 	char *name = "hd?";
-	if (CURRENT)
-		name = CURRENT->rq_disk->disk_name;
+	if (hd_req)
+		name = hd_req->rq_disk->disk_name;
 
 #ifdef VERBOSE_ERRORS
 	printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
@@ -227,8 +239,8 @@ static void dump_status(const char *msg, unsigned int stat)
 		if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
 			printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
 				inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
-			if (CURRENT)
-				printk(", sector=%ld", blk_rq_pos(CURRENT));
+			if (hd_req)
+				printk(", sector=%ld", blk_rq_pos(hd_req));
 		}
 		printk("\n");
 	}
@@ -406,11 +418,12 @@ static void unexpected_hd_interrupt(void)
  */
 static void bad_rw_intr(void)
 {
-	struct request *req = CURRENT;
+	struct request *req = hd_req;
+
 	if (req != NULL) {
 		struct hd_i_struct *disk = req->rq_disk->private_data;
 		if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
-			__blk_end_request_cur(req, -EIO);
+			hd_end_request_cur(-EIO);
 			disk->special_op = disk->recalibrate = 1;
 		} else if (req->errors % RESET_FREQ == 0)
 			reset = 1;
@@ -454,14 +467,14 @@ static void read_intr(void)
 	return;
 
 ok_to_read:
-	req = CURRENT;
+	req = hd_req;
 	insw(HD_DATA, req->buffer, 256);
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
 	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
 	       blk_rq_sectors(req) - 1, req->buffer+512);
 #endif
-	if (__blk_end_request(req, 0, 512)) {
+	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&read_intr);
 		return;
 	}
@@ -475,7 +488,7 @@ ok_to_read:
 
 static void write_intr(void)
 {
-	struct request *req = CURRENT;
+	struct request *req = hd_req;
 	int i;
 	int retries = 100000;
 
@@ -494,7 +507,7 @@ static void write_intr(void)
 	return;
 
 ok_to_write:
-	if (__blk_end_request(req, 0, 512)) {
+	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
 		return;
@@ -525,18 +538,18 @@ static void hd_times_out(unsigned long dummy)
 
 	do_hd = NULL;
 
-	if (!CURRENT)
+	if (!hd_req)
 		return;
 
 	spin_lock_irq(hd_queue->queue_lock);
 	reset = 1;
-	name = CURRENT->rq_disk->disk_name;
+	name = hd_req->rq_disk->disk_name;
 	printk("%s: timeout\n", name);
-	if (++CURRENT->errors >= MAX_ERRORS) {
+	if (++hd_req->errors >= MAX_ERRORS) {
 #ifdef DEBUG
 		printk("%s: too many errors\n", name);
 #endif
-		__blk_end_request_cur(CURRENT, -EIO);
+		hd_end_request_cur(-EIO);
 	}
 	hd_request();
 	spin_unlock_irq(hd_queue->queue_lock);
@@ -551,7 +564,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req)
 	}
 	if (disk->head > 16) {
 		printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
-		__blk_end_request_cur(req, -EIO);
+		hd_end_request_cur(-EIO);
 	}
 	disk->special_op = 0;
 	return 1;
@@ -578,11 +591,15 @@ static void hd_request(void)
 repeat:
 	del_timer(&device_timer);
 
-	req = CURRENT;
-	if (!req) {
-		do_hd = NULL;
-		return;
+	if (!hd_req) {
+		hd_req = elv_next_request(hd_queue);
+		if (!hd_req) {
+			do_hd = NULL;
+			return;
+		}
+		blkdev_dequeue_request(hd_req);
 	}
+	req = hd_req;
 
 	if (reset) {
 		reset_hd();
@@ -595,7 +612,7 @@ repeat:
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
 			req->rq_disk->disk_name, block, nsect);
-		__blk_end_request_cur(req, -EIO);
+		hd_end_request_cur(-EIO);
 		goto repeat;
 	}
 
@@ -635,7 +652,7 @@ repeat:
 			break;
 		default:
 			printk("unknown hd-command\n");
-			__blk_end_request_cur(req, -EIO);
+			hd_end_request_cur(-EIO);
 			break;
 		}
 	}
-- 
cgit v0.10.2


From a336ca6fe6e0c46b2eef0e520951acb4e6cb2976 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:03 +0900
Subject: ataflop: dequeue and track in-flight request

ataflop has single request in flight.  Till now, whenever it needs to
access the in-flight request it called elv_next_request().  This patch
makes ataflop track the in-flight request directly and dequeue it when
processing starts.  The added complexity is minimal and this will help
future block layer changes.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 234024c..89a591d 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -79,9 +79,7 @@
 #undef DEBUG
 
 static struct request_queue *floppy_queue;
-
-#define QUEUE (floppy_queue)
-#define CURRENT elv_next_request(floppy_queue)
+static struct request *fd_request;
 
 /* Disk types: DD, HD, ED */
 static struct atari_disk_type {
@@ -376,6 +374,12 @@ static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0);
 static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0);
 static DEFINE_TIMER(fd_timer, check_change, 0, 0);
 	
+static void fd_end_request_cur(int err)
+{
+	if (!__blk_end_request_cur(fd_request, err))
+		fd_request = NULL;
+}
+
 static inline void start_motor_off_timer(void)
 {
 	mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY);
@@ -606,15 +610,15 @@ static void fd_error( void )
 		return;
 	}
 
-	if (!CURRENT)
+	if (!fd_request)
 		return;
 
-	CURRENT->errors++;
-	if (CURRENT->errors >= MAX_ERRORS) {
+	fd_request->errors++;
+	if (fd_request->errors >= MAX_ERRORS) {
 		printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
-		__blk_end_request_cur(CURRENT, -EIO);
+		fd_end_request_cur(-EIO);
 	}
-	else if (CURRENT->errors == RECALIBRATE_ERRORS) {
+	else if (fd_request->errors == RECALIBRATE_ERRORS) {
 		printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
 		if (SelectedDrive != -1)
 			SUD.track = -1;
@@ -725,14 +729,14 @@ static void do_fd_action( int drive )
 	    if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
 		if (ReqCmd == READ) {
 		    copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
-		    if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
+		    if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
 			/* read next sector */
 			setup_req_params( drive );
 			goto repeat;
 		    }
 		    else {
 			/* all sectors finished */
-			__blk_end_request_cur(CURRENT, 0);
+			fd_end_request_cur(0);
 			redo_fd_request();
 			return;
 		    }
@@ -1130,14 +1134,14 @@ static void fd_rwsec_done1(int status)
 		}
 	}
   
-	if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
+	if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
 		/* read next sector */
 		setup_req_params( SelectedDrive );
 		do_fd_action( SelectedDrive );
 	}
 	else {
 		/* all sectors finished */
-		__blk_end_request_cur(CURRENT, 0);
+		fd_end_request_cur(0);
 		redo_fd_request();
 	}
 	return;
@@ -1378,7 +1382,7 @@ static void setup_req_params( int drive )
 	ReqData = ReqBuffer + 512 * ReqCnt;
 
 	if (UseTrackbuffer)
-		read_track = (ReqCmd == READ && CURRENT->errors == 0);
+		read_track = (ReqCmd == READ && fd_request->errors == 0);
 	else
 		read_track = 0;
 
@@ -1392,25 +1396,28 @@ static void redo_fd_request(void)
 	int drive, type;
 	struct atari_floppy_struct *floppy;
 
-	DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n",
-		CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "",
-		CURRENT ? blk_rq_pos(CURRENT) : 0 ));
+	DPRINT(("redo_fd_request: fd_request=%p dev=%s fd_request->sector=%ld\n",
+		fd_request, fd_request ? fd_request->rq_disk->disk_name : "",
+		fd_request ? blk_rq_pos(fd_request) : 0 ));
 
 	IsFormatting = 0;
 
 repeat:
+	if (!fd_request) {
+		fd_request = elv_next_request(floppy_queue);
+		if (!fd_request)
+			goto the_end;
+		blkdev_dequeue_request(fd_request);
+	}
 
-	if (!CURRENT)
-		goto the_end;
-
-	floppy = CURRENT->rq_disk->private_data;
+	floppy = fd_request->rq_disk->private_data;
 	drive = floppy - unit;
 	type = floppy->type;
 	
 	if (!UD.connected) {
 		/* drive not connected */
 		printk(KERN_ERR "Unknown Device: fd%d\n", drive );
-		__blk_end_request_cur(CURRENT, -EIO);
+		fd_end_request_cur(-EIO);
 		goto repeat;
 	}
 		
@@ -1426,12 +1433,12 @@ repeat:
 		/* user supplied disk type */
 		if (--type >= NUM_DISK_MINORS) {
 			printk(KERN_WARNING "fd%d: invalid disk format", drive );
-			__blk_end_request_cur(CURRENT, -EIO);
+			fd_end_request_cur(-EIO);
 			goto repeat;
 		}
 		if (minor2disktype[type].drive_types > DriveType)  {
 			printk(KERN_WARNING "fd%d: unsupported disk format", drive );
-			__blk_end_request_cur(CURRENT, -EIO);
+			fd_end_request_cur(-EIO);
 			goto repeat;
 		}
 		type = minor2disktype[type].index;
@@ -1440,8 +1447,8 @@ repeat:
 		UD.autoprobe = 0;
 	}
 	
-	if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) {
-		__blk_end_request_cur(CURRENT, -EIO);
+	if (blk_rq_pos(fd_request) + 1 > UDT->blocks) {
+		fd_end_request_cur(-EIO);
 		goto repeat;
 	}
 
@@ -1449,9 +1456,9 @@ repeat:
 	del_timer( &motor_off_timer );
 		
 	ReqCnt = 0;
-	ReqCmd = rq_data_dir(CURRENT);
-	ReqBlock = blk_rq_pos(CURRENT);
-	ReqBuffer = CURRENT->buffer;
+	ReqCmd = rq_data_dir(fd_request);
+	ReqBlock = blk_rq_pos(fd_request);
+	ReqBuffer = fd_request->buffer;
 	setup_req_params( drive );
 	do_fd_action( drive );
 
-- 
cgit v0.10.2


From f4bd4b90bfa86e867111aa182895cb5ac203aa7a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:04 +0900
Subject: swim3: dequeue in-flight request

swim3 has at most single request in flight and already tracks it using
fd_req.  Convert it to dequeuing model by updating request fetching
and wrapping completion function.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c1b9a4d..f48c6dd 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -251,6 +251,20 @@ static int floppy_release(struct gendisk *disk, fmode_t mode);
 static int floppy_check_change(struct gendisk *disk);
 static int floppy_revalidate(struct gendisk *disk);
 
+static bool swim3_end_request(int err, unsigned int nr_bytes)
+{
+	if (__blk_end_request(fd_req, err, nr_bytes))
+		return true;
+
+	fd_req = NULL;
+	return false;
+}
+
+static bool swim3_end_request_cur(int err)
+{
+	return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+}
+
 static void swim3_select(struct floppy_state *fs, int sel)
 {
 	struct swim3 __iomem *sw = fs->swim3;
@@ -310,7 +324,14 @@ static void start_request(struct floppy_state *fs)
 		wake_up(&fs->wait);
 		return;
 	}
-	while (fs->state == idle && (req = elv_next_request(swim3_queue))) {
+	while (fs->state == idle) {
+		if (!fd_req) {
+			fd_req = elv_next_request(swim3_queue);
+			if (!fd_req)
+				break;
+			blkdev_dequeue_request(fd_req);
+		}
+		req = fd_req;
 #if 0
 		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
 		       req->rq_disk->disk_name, req->cmd,
@@ -320,11 +341,11 @@ static void start_request(struct floppy_state *fs)
 #endif
 
 		if (blk_rq_pos(req) >= fs->total_secs) {
-			__blk_end_request_cur(req, -EIO);
+			swim3_end_request_cur(-EIO);
 			continue;
 		}
 		if (fs->ejected) {
-			__blk_end_request_cur(req, -EIO);
+			swim3_end_request_cur(-EIO);
 			continue;
 		}
 
@@ -332,7 +353,7 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				__blk_end_request_cur(req, -EIO);
+				swim3_end_request_cur(-EIO);
 				continue;
 			}
 		}
@@ -505,7 +526,7 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					__blk_end_request_cur(fd_req, -EIO);
+					swim3_end_request_cur(-EIO);
 					fs->state = idle;
 					return;
 				}
@@ -537,7 +558,7 @@ static void scan_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		__blk_end_request_cur(fd_req, -EIO);
+		swim3_end_request_cur(-EIO);
 		fs->state = idle;
 		start_request(fs);
 	} else {
@@ -556,7 +577,7 @@ static void seek_timeout(unsigned long data)
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	printk(KERN_ERR "swim3: seek timeout\n");
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -580,7 +601,7 @@ static void settle_timeout(unsigned long data)
 		return;
 	}
 	printk(KERN_ERR "swim3: seek settle timeout\n");
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -603,7 +624,7 @@ static void xfer_timeout(unsigned long data)
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
 	       (long)blk_rq_pos(fd_req));
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -634,7 +655,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					__blk_end_request_cur(fd_req, -EIO);
+					swim3_end_request_cur(-EIO);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -717,7 +738,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
 				       (long)blk_rq_pos(fd_req), err);
-				__blk_end_request_cur(fd_req, -EIO);
+				swim3_end_request_cur(-EIO);
 				fs->state = idle;
 			}
 		} else {
@@ -726,12 +747,12 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
 				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
 				       fs->state, rq_data_dir(fd_req), intr, err);
-				__blk_end_request_cur(fd_req, -EIO);
+				swim3_end_request_cur(-EIO);
 				fs->state = idle;
 				start_request(fs);
 				break;
 			}
-			if (__blk_end_request(fd_req, 0, fs->scount << 9)) {
+			if (swim3_end_request(0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
-- 
cgit v0.10.2


From 2d75ce084eec2a8154225c7e978191b364029003 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:05 +0900
Subject: xsysace: dequeue in-flight request

xsysace already tracks in-flight request using ace->req.  Converting
to dequeueing model is mostly a matter of adding dequeueing call after
request fetching.  The only tricky part is handling CF removal which
should complete both in flight and on queue requests.  Convert to
dequeueing model.

While at it, remove explicit blk_rq_cur_bytes() and use
__blk_end_request_cur() instead.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 97c99b43..edf137b 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -466,7 +466,8 @@ struct request *ace_get_next_request(struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		__blk_end_request_cur(req, -EIO);
+		blkdev_dequeue_request(req);
+		__blk_end_request_all(req, -EIO);
 	}
 	return req;
 }
@@ -492,9 +493,15 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		set_capacity(ace->gd, 0);
 		dev_info(ace->dev, "No CF in slot\n");
 
-		/* Drop all pending requests */
-		while ((req = elv_next_request(ace->queue)) != NULL)
-			__blk_end_request_cur(req, -EIO);
+		/* Drop all in-flight and pending requests */
+		if (ace->req) {
+			__blk_end_request_all(ace->req, -EIO);
+			ace->req = NULL;
+		}
+		while ((req = elv_next_request(ace->queue)) != NULL) {
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
+		}
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
@@ -642,6 +649,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			ace->fsm_state = ACE_FSM_STATE_IDLE;
 			break;
 		}
+		blkdev_dequeue_request(req);
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
@@ -718,8 +726,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		}
 
 		/* bio finished; is there another one? */
-		if (__blk_end_request(ace->req, 0,
-					blk_rq_cur_bytes(ace->req))) {
+		if (__blk_end_request_cur(ace->req, 0)) {
 			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
 			 *      blk_rq_sectors(ace->req),
 			 *      blk_rq_cur_sectors(ace->req));
-- 
cgit v0.10.2


From b12d4f82c1a3cdcb2441c803a3368a9426f2f47f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:06 +0900
Subject: paride: dequeue in-flight request

pd/pf/pcd have track in-flight request by pd/pf/pcd_req.  They can be
converted to dequeueing model by updating fetching and completion
paths.  Convert them.

Note that removal of elv_next_request() call from pf_next_buf()
doesn't make any functional difference.  The path is traveled only
during partial completion of a request and elv_next_request() call
must return the same request anyway.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Tim Waugh <tim@cyberelk.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 2d5dc0a..425f815 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -719,9 +719,12 @@ static void do_pcd_request(struct request_queue * q)
 	if (pcd_busy)
 		return;
 	while (1) {
-		pcd_req = elv_next_request(q);
-		if (!pcd_req)
-			return;
+		if (!pcd_req) {
+			pcd_req = elv_next_request(q);
+			if (!pcd_req)
+				return;
+			blkdev_dequeue_request(pcd_req);
+		}
 
 		if (rq_data_dir(pcd_req) == READ) {
 			struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -734,8 +737,10 @@ static void do_pcd_request(struct request_queue * q)
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
-		} else
-			__blk_end_request_cur(pcd_req, -EIO);
+		} else {
+			__blk_end_request_all(pcd_req, -EIO);
+			pcd_req = NULL;
+		}
 	}
 }
 
@@ -744,7 +749,8 @@ static inline void next_request(int err)
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pcd_lock, saved_flags);
-	__blk_end_request_cur(pcd_req, err);
+	if (!__blk_end_request_cur(pcd_req, err))
+		pcd_req = NULL;
 	pcd_busy = 0;
 	do_pcd_request(pcd_queue);
 	spin_unlock_irqrestore(&pcd_lock, saved_flags);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9ec5d4a..d2ca3f5 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -410,11 +410,14 @@ static void run_fsm(void)
 				pd_claimed = 0;
 				phase = NULL;
 				spin_lock_irqsave(&pd_lock, saved_flags);
-				__blk_end_request_cur(pd_req,
-						      res == Ok ? 0 : -EIO);
-				pd_req = elv_next_request(pd_queue);
-				if (!pd_req)
-					stop = 1;
+				if (!__blk_end_request_cur(pd_req,
+						res == Ok ? 0 : -EIO)) {
+					pd_req = elv_next_request(pd_queue);
+					if (!pd_req)
+						stop = 1;
+					else
+						blkdev_dequeue_request(pd_req);
+				}
 				spin_unlock_irqrestore(&pd_lock, saved_flags);
 				if (stop)
 					return;
@@ -706,6 +709,7 @@ static void do_pd_request(struct request_queue * q)
 	pd_req = elv_next_request(q);
 	if (!pd_req)
 		return;
+	blkdev_dequeue_request(pd_req);
 
 	schedule_fsm();
 }
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e88c889..d6f7bd8 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -752,10 +752,8 @@ static struct request_queue *pf_queue;
 
 static void pf_end_request(int err)
 {
-	if (pf_req) {
-		__blk_end_request_cur(pf_req, err);
+	if (pf_req && !__blk_end_request_cur(pf_req, err))
 		pf_req = NULL;
-	}
 }
 
 static void do_pf_request(struct request_queue * q)
@@ -763,9 +761,12 @@ static void do_pf_request(struct request_queue * q)
 	if (pf_busy)
 		return;
 repeat:
-	pf_req = elv_next_request(q);
-	if (!pf_req)
-		return;
+	if (!pf_req) {
+		pf_req = elv_next_request(q);
+		if (!pf_req)
+			return;
+		blkdev_dequeue_request(pf_req);
+	}
 
 	pf_current = pf_req->rq_disk->private_data;
 	pf_block = blk_rq_pos(pf_req);
@@ -806,7 +807,6 @@ static int pf_next_buf(void)
 	if (!pf_count) {
 		spin_lock_irqsave(&pf_spin_lock, saved_flags);
 		pf_end_request(0);
-		pf_req = elv_next_request(pf_queue);
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
 			return 1;
-- 
cgit v0.10.2


From 10e1e629b386aef97bf66de6ef28d450bec06ee3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:07 +0900
Subject: ps3disk: dequeue in-flight request

Other than in issue error paths, ps3disk always completely finishes
fetched requests.  With full completion on error paths, it can be
easily converted to dequeueing model.

* After L1 r/w call failure, ps3disk_submit_request_sg() now fails the
  whole request.  Issue failure isn't likely to benefit from partial
  retry anyway and ps3disk uses full failure in completion error path
  too, so I don't think this amounts to any meaningful functionality
  loss.

* flush completion is converted to _all for consistency.  It doesn't
  make any functional difference.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 8d58308..f4d8db9 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -157,7 +157,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
 			__LINE__, op, res);
-		__blk_end_request_cur(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 		return 0;
 	}
 
@@ -179,7 +179,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
 			__func__, __LINE__, res);
-		__blk_end_request_cur(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 		return 0;
 	}
 
@@ -195,6 +195,8 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = elv_next_request(q))) {
+		blkdev_dequeue_request(req);
+
 		if (blk_fs_request(req)) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
@@ -204,7 +206,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 				break;
 		} else {
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 	}
-- 
cgit v0.10.2


From 9e31bebee2d8b5f8abe9677f2a29b90cba848657 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:08 +0900
Subject: amiflop: dequeue in-flight request

Request processing in amiflop is done sequentially in
redo_fd_request() proper and redo_fd_request() can easily be converted
to track in-flight request.  Remove CURRENT, track in-flight request
directly and dequeue it when processing starts.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index e4a14b9..80a68b2 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -112,8 +112,6 @@ module_param(fd_def_df0, ulong, 0);
 MODULE_LICENSE("GPL");
 
 static struct request_queue *floppy_queue;
-#define QUEUE (floppy_queue)
-#define CURRENT elv_next_request(floppy_queue)
 
 /*
  *  Macros
@@ -1335,59 +1333,61 @@ static int get_track(int drive, int track)
 
 static void redo_fd_request(void)
 {
+	struct request *rq;
 	unsigned int cnt, block, track, sector;
 	int drive;
 	struct amiga_floppy_struct *floppy;
 	char *data;
 	unsigned long flags;
+	int err;
 
- repeat:
-	if (!CURRENT) {
+next_req:
+	rq = elv_next_request(floppy_queue);
+	if (!rq) {
 		/* Nothing left to do */
 		return;
 	}
+	blkdev_dequeue_request(rq);
 
-	floppy = CURRENT->rq_disk->private_data;
+	floppy = rq->rq_disk->private_data;
 	drive = floppy - unit;
 
+next_segment:
 	/* Here someone could investigate to be more efficient */
-	for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) {
+	for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) {
 #ifdef DEBUG
 		printk("fd: sector %ld + %d requested for %s\n",
-		       blk_rq_pos(CURRENT), cnt,
-		       (rq_data_dir(CURRENT) == READ) ? "read" : "write");
+		       blk_rq_pos(rq), cnt,
+		       (rq_data_dir(rq) == READ) ? "read" : "write");
 #endif
-		block = blk_rq_pos(CURRENT) + cnt;
+		block = blk_rq_pos(rq) + cnt;
 		if ((int)block > floppy->blocks) {
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
+			err = -EIO;
+			break;
 		}
 
 		track = block / (floppy->dtype->sects * floppy->type->sect_mult);
 		sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
-		data = CURRENT->buffer + 512 * cnt;
+		data = rq->buffer + 512 * cnt;
 #ifdef DEBUG
 		printk("access to track %d, sector %d, with buffer at "
 		       "0x%08lx\n", track, sector, data);
 #endif
 
 		if (get_track(drive, track) == -1) {
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
+			err = -EIO;
+			break;
 		}
 
-		switch (rq_data_dir(CURRENT)) {
-		case READ:
+		if (rq_data_dir(rq) == READ) {
 			memcpy(data, floppy->trackbuf + sector * 512, 512);
-			break;
-
-		case WRITE:
+		} else {
 			memcpy(floppy->trackbuf + sector * 512, data, 512);
 
 			/* keep the drive spinning while writes are scheduled */
 			if (!fd_motor_on(drive)) {
-				__blk_end_request_cur(CURRENT, -EIO);
-				goto repeat;
+				err = -EIO;
+				break;
 			}
 			/*
 			 * setup a callback to write the track buffer
@@ -1399,12 +1399,12 @@ static void redo_fd_request(void)
 		        /* reset the timer */
 			mod_timer (flush_track_timer + drive, jiffies + 1);
 			local_irq_restore(flags);
-			break;
 		}
 	}
 
-	__blk_end_request_cur(CURRENT, 0);
-	goto repeat;
+	if (__blk_end_request_cur(rq, err))
+		goto next_segment;
+	goto next_req;
 }
 
 static void do_fd_request(struct request_queue * q)
-- 
cgit v0.10.2


From 06b0608e2b46465e8e663214e7db982ddb000346 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:09 +0900
Subject: swim: dequeue in-flight request

swim processes requests one-by-one synchronously and can easily be
converted to dequeuing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fc6a1c3..dedd489 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -514,7 +514,7 @@ static int floppy_read_sectors(struct floppy_state *fs,
 			ret = swim_read_sector(fs, side, track, sector,
 						buffer);
 			if (try-- == 0)
-				return -1;
+				return -EIO;
 		} while (ret != 512);
 
 		buffer += ret;
@@ -528,38 +528,37 @@ static void redo_fd_request(struct request_queue *q)
 	struct request *req;
 	struct floppy_state *fs;
 
-	while ((req = elv_next_request(q))) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
+		int err = -EIO;
 
 		fs = req->rq_disk->private_data;
-		if (blk_rq_pos(req) >= fs->total_secs) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (!fs->disk_in) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (rq_data_dir(req) == WRITE) {
-			if (fs->write_protected) {
-				__blk_end_request_cur(req, -EIO);
-				continue;
-			}
-		}
+		if (blk_rq_pos(req) >= fs->total_secs)
+			goto done;
+		if (!fs->disk_in)
+			goto done;
+		if (rq_data_dir(req) == WRITE && fs->write_protected)
+			goto done;
+
 		switch (rq_data_dir(req)) {
 		case WRITE:
 			/* NOT IMPLEMENTED */
-			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
-			if (floppy_read_sectors(fs, blk_rq_pos(req),
-						blk_rq_cur_sectors(req),
-						req->buffer)) {
-				__blk_end_request_cur(req, -EIO);
-				continue;
-			}
-			__blk_end_request_cur(req, 0);
+			err = floppy_read_sectors(fs, blk_rq_pos(req),
+						  blk_rq_cur_sectors(req),
+						  req->buffer);
 			break;
 		}
+	done:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From bab2a807a489822ded0c9d4a5344c80bcac10b0a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:10 +0900
Subject: xd: dequeue in-flight request

xd processes requests one-by-one synchronously and can be easily
converted to dequeueing model.  Convert it.

While at it, use rq_cur_bytes instead of rq_bytes when checking for
sector overflow.  This is for for consistency and better behavior for
merged requests.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4ef8801..d4c4352 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -305,26 +305,31 @@ static void do_xd_request (struct request_queue * q)
 	if (xdc_busy)
 		return;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		unsigned block = blk_rq_pos(req);
-		unsigned count = blk_rq_sectors(req);
+		unsigned count = blk_rq_cur_sectors(req);
 		XD_INFO *disk = req->rq_disk->private_data;
-		int res = 0;
+		int res = -EIO;
 		int retry;
 
-		if (!blk_fs_request(req)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (block + count > get_capacity(req->rq_disk)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
+		if (!blk_fs_request(req))
+			goto done;
+		if (block + count > get_capacity(req->rq_disk))
+			goto done;
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
 			res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
 					   block, count);
+	done:
 		/* wrap up, 0 = success, -errno = fail */
-		__blk_end_request_cur(req, res);
+		if (!__blk_end_request_cur(req, res)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From 1498ada7a8e80afe324f2b8d86158925d0096ec9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:11 +0900
Subject: mtd_blkdevs: dequeue in-flight request

mtd_blkdevs processes requests one-by-one synchronously from a kthread
and can be easily converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 50c76a2..3e10442 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -89,18 +89,22 @@ static int mtd_blktrans_thread(void *arg)
 {
 	struct mtd_blktrans_ops *tr = arg;
 	struct request_queue *rq = tr->blkcore_priv->rq;
+	struct request *req = NULL;
 
 	/* we might get involved when memory gets low, so use PF_MEMALLOC */
 	current->flags |= PF_MEMALLOC;
 
 	spin_lock_irq(rq->queue_lock);
+
 	while (!kthread_should_stop()) {
-		struct request *req;
 		struct mtd_blktrans_dev *dev;
 		int res;
 
-		req = elv_next_request(rq);
-
+		if (!req) {
+			req = elv_next_request(rq);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 		if (!req) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(rq->queue_lock);
@@ -120,8 +124,13 @@ static int mtd_blktrans_thread(void *arg)
 
 		spin_lock_irq(rq->queue_lock);
 
-		__blk_end_request_cur(req, res);
+		if (!__blk_end_request_cur(req, res))
+			req = NULL;
 	}
+
+	if (req)
+		__blk_end_request_all(req, -EIO);
+
 	spin_unlock_irq(rq->queue_lock);
 
 	return 0;
-- 
cgit v0.10.2


From 6b0bf407b586b6ba8e060ad9979cb2bc3370b7eb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:12 +0900
Subject: jsflash: dequeue in-flight request

jsflash processes requests one-by-one synchronously from a kthread and
can be easily converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index d56ddaa..f572a4a 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -186,31 +186,37 @@ static void jsfd_do_request(struct request_queue *q)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
 		size_t len = blk_rq_cur_bytes(req);
+		int err = -EIO;
 
-		if ((offset + len) > jdp->dsize) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
+		if ((offset + len) > jdp->dsize)
+			goto end;
 
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_ERR "jsfd: write\n");
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			goto end;
 		}
 
 		if ((jdp->dbase & 0xff000000) != 0x20000000) {
 			printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			goto end;
 		}
 
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
-
-		__blk_end_request_cur(req, 0);
+		err = 0;
+	end:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From fb3ac7f6b811eac8e0aafa3df1c16ed872e898a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:13 +0900
Subject: z2ram: dequeue in-flight request

z2ram processes requests one-by-one synchronously and can be easily
converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 6a13838..c909c1a 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -70,15 +70,21 @@ static struct gendisk *z2ram_gendisk;
 static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
-	while ((req = elv_next_request(q)) != NULL) {
+
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		unsigned long start = blk_rq_pos(req) << 9;
 		unsigned long len  = blk_rq_cur_bytes(req);
+		int err = 0;
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
 				blk_rq_pos(req), blk_rq_cur_sectors(req));
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			err = -EIO;
+			goto done;
 		}
 		while (len) {
 			unsigned long addr = start & Z2RAM_CHUNKMASK;
@@ -93,7 +99,12 @@ static void do_z2_request(struct request_queue *q)
 			start += size;
 			len -= size;
 		}
-		__blk_end_request_cur(req, 0);
+	done:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From 296b2f6ae654581adc27f0d6f0af454c7f3d06ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:15 +0900
Subject: block: convert to dequeueing model (easy ones)

plat-omap/mailbox, floppy, viocd, mspro_block, i2o_block and
mmc/card/queue are already pretty close to dequeueing model and can be
converted with simple changes.  Convert them.

While at it,

* xen-blkfront: !fs check moved downwards to share dequeue call with
  normal path.

* mspro_block: __blk_end_request(..., blk_rq_cur_byte()) converted to
  __blk_end_request_cur()

* mmc/card/queue: loop of __blk_end_request() converted to
  __blk_end_request_all()

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 538ba75..7a1f5c2 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -198,6 +198,8 @@ static void mbox_tx_work(struct work_struct *work)
 
 		spin_lock(q->queue_lock);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock(q->queue_lock);
 
 		if (!rq)
@@ -208,6 +210,9 @@ static void mbox_tx_work(struct work_struct *work)
 		ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg);
 		if (ret) {
 			enable_mbox_irq(mbox, IRQ_TX);
+			spin_lock(q->queue_lock);
+			blk_requeue_request(q, rq);
+			spin_unlock(q->queue_lock);
 			return;
 		}
 
@@ -238,6 +243,8 @@ static void mbox_rx_work(struct work_struct *work)
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 		if (!rq)
 			break;
@@ -345,6 +352,8 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 
 		if (!rq)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 1e27ed9..e2c70d2 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -931,7 +931,7 @@ static inline void unlock_fdc(void)
 	del_timer(&fd_timeout);
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (elv_next_request(floppy_queue))
+	if (current_req || elv_next_request(floppy_queue))
 		do_fd_request(floppy_queue);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
@@ -2913,6 +2913,8 @@ static void redo_fd_request(void)
 
 			spin_lock_irq(floppy_queue->queue_lock);
 			req = elv_next_request(floppy_queue);
+			if (req)
+				blkdev_dequeue_request(req);
 			spin_unlock_irq(floppy_queue->queue_lock);
 			if (!req) {
 				do_floppy = NULL;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 91fc565..66f8345 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -301,22 +301,23 @@ static void do_blkif_request(struct request_queue *rq)
 
 	while ((req = elv_next_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
-		if (!blk_fs_request(req)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
 
 		if (RING_FULL(&info->ring))
 			goto wait;
 
+		blkdev_dequeue_request(req);
+
+		if (!blk_fs_request(req)) {
+			__blk_end_request_all(req, -EIO);
+			continue;
+		}
+
 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
 			 "(%u/%u) buffer:%p [%s]\n",
 			 req, req->cmd, (unsigned long)blk_rq_pos(req),
 			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
 			 req->buffer, rq_data_dir(req) ? "write" : "read");
 
-
-		blkdev_dequeue_request(req);
 		if (blkif_queue_request(req)) {
 			blk_requeue_request(rq, req);
 wait:
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 6e190a9..bbe9f08 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -298,6 +298,8 @@ static void do_viocd_request(struct request_queue *q)
 	struct request *req;
 
 	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
+		blkdev_dequeue_request(req);
+
 		if (!blk_fs_request(req))
 			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 93b2c61..58f5be8 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -672,8 +672,7 @@ try_again:
 					       msb->req_sg);
 
 		if (!msb->seg_count) {
-			chunk = __blk_end_request(msb->block_req, -ENOMEM,
-					blk_rq_cur_bytes(msb->block_req));
+			chunk = __blk_end_request_cur(msb->block_req, -ENOMEM);
 			continue;
 		}
 
@@ -711,6 +710,7 @@ try_again:
 		dev_dbg(&card->dev, "issue end\n");
 		return -EAGAIN;
 	}
+	blkdev_dequeue_request(msb->block_req);
 
 	dev_dbg(&card->dev, "trying again\n");
 	chunk = 1;
@@ -825,8 +825,10 @@ static void mspro_block_submit_req(struct request_queue *q)
 		return;
 
 	if (msb->eject) {
-		while ((req = elv_next_request(q)) != NULL)
+		while ((req = elv_next_request(q)) != NULL) {
+			blkdev_dequeue_request(req);
 			__blk_end_request_all(req, -ENODEV);
+		}
 
 		return;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index e153f5d..8b5cbfc 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -916,8 +916,10 @@ static void i2o_block_request_fn(struct request_queue *q)
 				blk_stop_queue(q);
 				break;
 			}
-		} else
-			__blk_end_request_cur(req, -EIO);
+		} else {
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
+		}
 	}
 };
 
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 7a72e75..4b70f1e 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -54,8 +54,11 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q))
+		if (!blk_queue_plugged(q)) {
 			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
@@ -88,15 +91,12 @@ static void mmc_request(struct request_queue *q)
 {
 	struct mmc_queue *mq = q->queuedata;
 	struct request *req;
-	int ret;
 
 	if (!mq) {
 		printk(KERN_ERR "MMC: killing requests for dead queue\n");
 		while ((req = elv_next_request(q)) != NULL) {
-			do {
-				ret = __blk_end_request(req, -EIO,
-							blk_rq_cur_bytes(req));
-			} while (ret);
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
 		}
 		return;
 	}
-- 
cgit v0.10.2


From 2343046826a8ca426b07601d9593ee046c298b68 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:14 +0900
Subject: gdrom: dequeue in-flight request

gdrom already dequeues and fully completes requests on normal path and
the error paths can be easily converted to do so too.  Clean it up and
dequeue requests on error paths too.

While at it remove superflous blk_fs_request() && !blk_rq_sectors()
condition check.

[ Impact: dequeue in-flight request, cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 488423c..3cc02bf 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -638,33 +638,31 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 	kfree(read_command);
 }
 
-static void gdrom_request_handler_dma(struct request *req)
-{
-	/* dequeue, add to list of deferred work
-	* and then schedule workqueue */
-	blkdev_dequeue_request(req);
-	list_add_tail(&req->queuelist, &gdrom_deferred);
-	schedule_work(&work);
-}
-
 static void gdrom_request(struct request_queue *rq)
 {
 	struct request *req;
 
 	while ((req = elv_next_request(rq)) != NULL) {
+		blkdev_dequeue_request(req);
+
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
+			continue;
 		}
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_NOTICE "GDROM: Read only device -");
 			printk(" write request ignored\n");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
+			continue;
 		}
-		if (blk_rq_sectors(req))
-			gdrom_request_handler_dma(req);
-		else
-			__blk_end_request_cur(req, -EIO);
+
+		/*
+		 * Add to list of deferred work and then schedule
+		 * workqueue.
+		 */
+		list_add_tail(&req->queuelist, &gdrom_deferred);
+		schedule_work(&work);
 	}
 }
 
-- 
cgit v0.10.2


From 9934c8c04561413609d2bc38c6b9f268cba774a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:16 +0900
Subject: block: implement and enforce request peek/start/fetch

Till now block layer allowed two separate modes of request execution.
A request is always acquired from the request queue via
elv_next_request().  After that, drivers are free to either dequeue it
or process it without dequeueing.  Dequeue allows elv_next_request()
to return the next request so that multiple requests can be in flight.

Executing requests without dequeueing has its merits mostly in
allowing drivers for simpler devices which can't do sg to deal with
segments only without considering request boundary.  However, the
benefit this brings is dubious and declining while the cost of the API
ambiguity is increasing.  Segment based drivers are usually for very
old or limited devices and as converting to dequeueing model isn't
difficult, it doesn't justify the API overhead it puts on block layer
and its more modern users.

Previous patches converted all block low level drivers to dequeueing
model.  This patch completes the API transition by...

* renaming elv_next_request() to blk_peek_request()

* renaming blkdev_dequeue_request() to blk_start_request()

* adding blk_fetch_request() which is combination of peek and start

* disallowing completion of queued (not started) requests

* applying new API to all LLDs

Renamings are for consistency and to break out of tree code so that
it's apparent that out of tree drivers need updating.

[ Impact: block request issue API cleanup, no functional change ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: unsik Kim <donari75@gmail.com>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Laurent Vivier <Laurent@lvivier.info>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 7a1f5c2..40424ed 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -197,9 +197,7 @@ static void mbox_tx_work(struct work_struct *work)
 		struct omap_msg_tx_data *tx_data;
 
 		spin_lock(q->queue_lock);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock(q->queue_lock);
 
 		if (!rq)
@@ -242,9 +240,7 @@ static void mbox_rx_work(struct work_struct *work)
 
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 		if (!rq)
 			break;
@@ -351,9 +347,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 
 		if (!rq)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 402ba8f..aa9e926e 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1228,12 +1228,11 @@ static void do_ubd_request(struct request_queue *q)
 	while(1){
 		struct ubd *dev = q->queuedata;
 		if(dev->end_sg == 0){
-			struct request *req = elv_next_request(q);
+			struct request *req = blk_fetch_request(q);
 			if(req == NULL)
 				return;
 
 			dev->request = req;
-			blkdev_dequeue_request(req);
 			dev->start_sg = 0;
 			dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
 		}
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8713c2f..0ab81a0 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -180,7 +180,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	}
 
 	/* stash away the original request */
-	elv_dequeue_request(q, rq);
+	blk_dequeue_request(rq);
 	q->orig_bar_rq = rq;
 	rq = NULL;
 
@@ -248,7 +248,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 * Queue ordering not supported.  Terminate
 			 * with prejudice.
 			 */
-			elv_dequeue_request(q, rq);
+			blk_dequeue_request(rq);
 			__blk_end_request_all(rq, -EOPNOTSUPP);
 			*rqp = NULL;
 			return false;
diff --git a/block/blk-core.c b/block/blk-core.c
index 6226a38..93691d2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -902,6 +902,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
+	BUG_ON(blk_queued_rq(rq));
+
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
@@ -1610,28 +1612,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
-/**
- * blkdev_dequeue_request - dequeue request and start timeout timer
- * @req: request to dequeue
- *
- * Dequeue @req and start timeout timer on it.  This hands off the
- * request to the driver.
- *
- * Block internal functions which don't want to start timer should
- * call elv_dequeue_request().
- */
-void blkdev_dequeue_request(struct request *req)
-{
-	elv_dequeue_request(req->q, req);
-
-	/*
-	 * We are now handing the request to the hardware, add the
-	 * timeout handler.
-	 */
-	blk_add_timer(req);
-}
-EXPORT_SYMBOL(blkdev_dequeue_request);
-
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
@@ -1671,7 +1651,23 @@ static void blk_account_io_done(struct request *req)
 	}
 }
 
-struct request *elv_next_request(struct request_queue *q)
+/**
+ * blk_peek_request - peek at the top of a request queue
+ * @q: request queue to peek at
+ *
+ * Description:
+ *     Return the request at the top of @q.  The returned request
+ *     should be started using blk_start_request() before LLD starts
+ *     processing it.
+ *
+ * Return:
+ *     Pointer to the request at the top of @q if available.  Null
+ *     otherwise.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+struct request *blk_peek_request(struct request_queue *q)
 {
 	struct request *rq;
 	int ret;
@@ -1748,10 +1744,12 @@ struct request *elv_next_request(struct request_queue *q)
 
 	return rq;
 }
-EXPORT_SYMBOL(elv_next_request);
+EXPORT_SYMBOL(blk_peek_request);
 
-void elv_dequeue_request(struct request_queue *q, struct request *rq)
+void blk_dequeue_request(struct request *rq)
 {
+	struct request_queue *q = rq->q;
+
 	BUG_ON(list_empty(&rq->queuelist));
 	BUG_ON(ELV_ON_HASH(rq));
 
@@ -1767,6 +1765,58 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 }
 
 /**
+ * blk_start_request - start request processing on the driver
+ * @req: request to dequeue
+ *
+ * Description:
+ *     Dequeue @req and start timeout timer on it.  This hands off the
+ *     request to the driver.
+ *
+ *     Block internal functions which don't want to start timer should
+ *     call blk_dequeue_request().
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+void blk_start_request(struct request *req)
+{
+	blk_dequeue_request(req);
+
+	/*
+	 * We are now handing the request to the hardware, add the
+	 * timeout handler.
+	 */
+	blk_add_timer(req);
+}
+EXPORT_SYMBOL(blk_start_request);
+
+/**
+ * blk_fetch_request - fetch a request from a request queue
+ * @q: request queue to fetch a request from
+ *
+ * Description:
+ *     Return the request at the top of @q.  The request is started on
+ *     return and LLD can start processing it immediately.
+ *
+ * Return:
+ *     Pointer to the request at the top of @q if available.  Null
+ *     otherwise.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+struct request *blk_fetch_request(struct request_queue *q)
+{
+	struct request *rq;
+
+	rq = blk_peek_request(q);
+	if (rq)
+		blk_start_request(rq);
+	return rq;
+}
+EXPORT_SYMBOL(blk_fetch_request);
+
+/**
  * blk_update_request - Special helper function for request stacking drivers
  * @rq:	      the request being processed
  * @error:    %0 for success, < %0 for error
@@ -1937,12 +1987,11 @@ static bool blk_update_bidi_request(struct request *rq, int error,
  */
 static void blk_finish_request(struct request *req, int error)
 {
+	BUG_ON(blk_queued_rq(req));
+
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
-	if (blk_queued_rq(req))
-		elv_dequeue_request(req->q, req);
-
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 3c518e3..c260f7c 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -374,7 +374,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	rq->cmd_flags |= REQ_QUEUED;
 	rq->tag = tag;
 	bqt->tag_index[tag] = rq;
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 	list_add(&rq->queuelist, &q->tag_busy_list);
 	return 0;
 }
diff --git a/block/blk.h b/block/blk.h
index ab54529..9e0042c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,7 @@ extern struct kobj_type blk_queue_ktype;
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
+void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 
 void blk_unplug_work(struct work_struct *work);
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 774ab05..668dc23 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3321,7 +3321,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	DAC960_Command_T *Command;
 
    while(1) {
-	Request = elv_next_request(req_q);
+	Request = blk_peek_request(req_q);
 	if (!Request)
 		return 1;
 
@@ -3341,7 +3341,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	Command->BlockNumber = blk_rq_pos(Request);
 	Command->BlockCount = blk_rq_sectors(Request);
 	Command->Request = Request;
-	blkdev_dequeue_request(Request);
+	blk_start_request(Request);
 	Command->SegmentCount = blk_rq_map_sg(req_q,
 		  Command->Request, Command->cmd_sglist);
 	/* pci_map_sg MAY change the value of SegCount */
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 80a68b2..9c6e5b0 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1342,12 +1342,11 @@ static void redo_fd_request(void)
 	int err;
 
 next_req:
-	rq = elv_next_request(floppy_queue);
+	rq = blk_fetch_request(floppy_queue);
 	if (!rq) {
 		/* Nothing left to do */
 		return;
 	}
-	blkdev_dequeue_request(rq);
 
 	floppy = rq->rq_disk->private_data;
 	drive = floppy - unit;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 89a591d..f5e7180 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1404,10 +1404,9 @@ static void redo_fd_request(void)
 
 repeat:
 	if (!fd_request) {
-		fd_request = elv_next_request(floppy_queue);
+		fd_request = blk_fetch_request(floppy_queue);
 		if (!fd_request)
 			goto the_end;
-		blkdev_dequeue_request(fd_request);
 	}
 
 	floppy = fd_request->rq_disk->private_data;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index ab7b04c..e714e7c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2801,7 +2801,7 @@ static void do_cciss_request(struct request_queue *q)
 		goto startio;
 
       queue:
-	creq = elv_next_request(q);
+	creq = blk_peek_request(q);
 	if (!creq)
 		goto startio;
 
@@ -2810,7 +2810,7 @@ static void do_cciss_request(struct request_queue *q)
 	if ((c = cmd_alloc(h, 1)) == NULL)
 		goto full;
 
-	blkdev_dequeue_request(creq);
+	blk_start_request(creq);
 
 	spin_unlock_irq(q->queue_lock);
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index a5caeff..a02dcfc 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -903,7 +903,7 @@ static void do_ida_request(struct request_queue *q)
 		goto startio;
 
 queue_next:
-	creq = elv_next_request(q);
+	creq = blk_peek_request(q);
 	if (!creq)
 		goto startio;
 
@@ -912,7 +912,7 @@ queue_next:
 	if ((c = cmd_alloc(h,1)) == NULL)
 		goto startio;
 
-	blkdev_dequeue_request(creq);
+	blk_start_request(creq);
 
 	c->ctlr = h->ctlr;
 	c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e2c70d2..90877fe 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -931,7 +931,7 @@ static inline void unlock_fdc(void)
 	del_timer(&fd_timeout);
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || elv_next_request(floppy_queue))
+	if (current_req || blk_peek_request(floppy_queue))
 		do_fd_request(floppy_queue);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
@@ -2912,9 +2912,7 @@ static void redo_fd_request(void)
 			struct request *req;
 
 			spin_lock_irq(floppy_queue->queue_lock);
-			req = elv_next_request(floppy_queue);
-			if (req)
-				blkdev_dequeue_request(req);
+			req = blk_fetch_request(floppy_queue);
 			spin_unlock_irq(floppy_queue->queue_lock);
 			if (!req) {
 				do_floppy = NULL;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 288ab63..961de56 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -592,12 +592,11 @@ repeat:
 	del_timer(&device_timer);
 
 	if (!hd_req) {
-		hd_req = elv_next_request(hd_queue);
+		hd_req = blk_fetch_request(hd_queue);
 		if (!hd_req) {
 			do_hd = NULL;
 			return;
 		}
-		blkdev_dequeue_request(hd_req);
 	}
 	req = hd_req;
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 1ca5d14..c0cd0a0 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -671,10 +671,8 @@ static void mg_request_poll(struct request_queue *q)
 
 	while (1) {
 		if (!host->req) {
-			host->req = elv_next_request(q);
-			if (host->req)
-				blkdev_dequeue_request(host->req);
-			else
+			host->req = blk_fetch_request(q);
+			if (!host->req)
 				break;
 		}
 
@@ -744,10 +742,8 @@ static void mg_request(struct request_queue *q)
 
 	while (1) {
 		if (!host->req) {
-			host->req = elv_next_request(q);
-			if (host->req)
-				blkdev_dequeue_request(host->req);
-			else
+			host->req = blk_fetch_request(q);
+			if (!host->req)
 				break;
 		}
 		req = host->req;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index fad167de..5d23ffa 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -533,11 +533,9 @@ static void do_nbd_request(struct request_queue *q)
 {
 	struct request *req;
 	
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_fetch_request(q)) != NULL) {
 		struct nbd_device *lo;
 
-		blkdev_dequeue_request(req);
-
 		spin_unlock_irq(q->queue_lock);
 
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 425f815..911dfd9 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -720,10 +720,9 @@ static void do_pcd_request(struct request_queue * q)
 		return;
 	while (1) {
 		if (!pcd_req) {
-			pcd_req = elv_next_request(q);
+			pcd_req = blk_fetch_request(q);
 			if (!pcd_req)
 				return;
-			blkdev_dequeue_request(pcd_req);
 		}
 
 		if (rq_data_dir(pcd_req) == READ) {
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index d2ca3f5..bf5955b 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -412,11 +412,9 @@ static void run_fsm(void)
 				spin_lock_irqsave(&pd_lock, saved_flags);
 				if (!__blk_end_request_cur(pd_req,
 						res == Ok ? 0 : -EIO)) {
-					pd_req = elv_next_request(pd_queue);
+					pd_req = blk_fetch_request(pd_queue);
 					if (!pd_req)
 						stop = 1;
-					else
-						blkdev_dequeue_request(pd_req);
 				}
 				spin_unlock_irqrestore(&pd_lock, saved_flags);
 				if (stop)
@@ -706,10 +704,9 @@ static void do_pd_request(struct request_queue * q)
 {
 	if (pd_req)
 		return;
-	pd_req = elv_next_request(q);
+	pd_req = blk_fetch_request(q);
 	if (!pd_req)
 		return;
-	blkdev_dequeue_request(pd_req);
 
 	schedule_fsm();
 }
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index d6f7bd8..68a90834 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -762,10 +762,9 @@ static void do_pf_request(struct request_queue * q)
 		return;
 repeat:
 	if (!pf_req) {
-		pf_req = elv_next_request(q);
+		pf_req = blk_fetch_request(q);
 		if (!pf_req)
 			return;
-		blkdev_dequeue_request(pf_req);
 	}
 
 	pf_current = pf_req->rq_disk->private_data;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index f4d8db9..338cee4 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -194,9 +194,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
-	while ((req = elv_next_request(q))) {
-		blkdev_dequeue_request(req);
-
+	while ((req = blk_fetch_request(q))) {
 		if (blk_fs_request(req)) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 9f351bf..cbfd9c0 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -441,12 +441,11 @@ out:
 static void do_vdc_request(struct request_queue *q)
 {
 	while (1) {
-		struct request *req = elv_next_request(q);
+		struct request *req = blk_fetch_request(q);
 
 		if (!req)
 			break;
 
-		blkdev_dequeue_request(req);
 		if (__send_request(req) < 0)
 			__blk_end_request_all(req, -EIO);
 	}
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index dedd489..cf7877f 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -528,10 +528,7 @@ static void redo_fd_request(struct request_queue *q)
 	struct request *req;
 	struct floppy_state *fs;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		int err = -EIO;
 
@@ -554,11 +551,8 @@ static void redo_fd_request(struct request_queue *q)
 			break;
 		}
 	done:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index f48c6dd..80df93e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -326,10 +326,9 @@ static void start_request(struct floppy_state *fs)
 	}
 	while (fs->state == idle) {
 		if (!fd_req) {
-			fd_req = elv_next_request(swim3_queue);
+			fd_req = blk_fetch_request(swim3_queue);
 			if (!fd_req)
 				break;
-			blkdev_dequeue_request(fd_req);
 		}
 		req = fd_req;
 #if 0
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 087c94c..da403b6 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -810,12 +810,10 @@ static void carm_oob_rq_fn(struct request_queue *q)
 
 	while (1) {
 		DPRINTK("get req\n");
-		rq = elv_next_request(q);
+		rq = blk_fetch_request(q);
 		if (!rq)
 			break;
 
-		blkdev_dequeue_request(rq);
-
 		crq = rq->special;
 		assert(crq != NULL);
 		assert(crq->rq == rq);
@@ -846,7 +844,7 @@ static void carm_rq_fn(struct request_queue *q)
 
 queue_one_request:
 	VPRINTK("get req\n");
-	rq = elv_next_request(q);
+	rq = blk_peek_request(q);
 	if (!rq)
 		return;
 
@@ -857,7 +855,7 @@ queue_one_request:
 	}
 	crq->rq = rq;
 
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 
 	if (rq_data_dir(rq) == WRITE) {
 		writing = 1;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 40d03cf..178f459 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -627,7 +627,7 @@ static void ub_request_fn(struct request_queue *q)
 	struct ub_lun *lun = q->queuedata;
 	struct request *rq;
 
-	while ((rq = elv_next_request(q)) != NULL) {
+	while ((rq = blk_peek_request(q)) != NULL) {
 		if (ub_request_fn_1(lun, rq) != 0) {
 			blk_stop_queue(q);
 			break;
@@ -643,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 	int n_elem;
 
 	if (atomic_read(&sc->poison)) {
-		blkdev_dequeue_request(rq);
+		blk_start_request(rq);
 		ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq));
 		return 0;
 	}
 
 	if (lun->changed && !blk_pc_request(rq)) {
-		blkdev_dequeue_request(rq);
+		blk_start_request(rq);
 		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq));
 		return 0;
 	}
@@ -660,7 +660,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 		return -1;
 	memset(cmd, 0, sizeof(struct ub_scsi_cmd));
 
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 
 	urq = &lun->urq;
 	memset(urq, 0, sizeof(struct ub_request));
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2086cb1..390d69b 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -361,11 +361,9 @@ static void do_viodasd_request(struct request_queue *q)
 	 * back later.
 	 */
 	while (num_req_outstanding < VIOMAXREQ) {
-		req = elv_next_request(q);
+		req = blk_fetch_request(q);
 		if (req == NULL)
 			return;
-		/* dequeue the current request from the queue */
-		blkdev_dequeue_request(req);
 		/* check that request contains a valid command */
 		if (!blk_fs_request(req)) {
 			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1980ab4..29a9daf 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -128,7 +128,7 @@ static void do_virtblk_request(struct request_queue *q)
 	struct request *req;
 	unsigned int issued = 0;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_peek_request(q)) != NULL) {
 		vblk = req->rq_disk->private_data;
 		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
@@ -138,7 +138,7 @@ static void do_virtblk_request(struct request_queue *q)
 			blk_stop_queue(q);
 			break;
 		}
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		issued++;
 	}
 
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d4c4352..ce24292 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -305,10 +305,7 @@ static void do_xd_request (struct request_queue * q)
 	if (xdc_busy)
 		return;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		unsigned block = blk_rq_pos(req);
 		unsigned count = blk_rq_cur_sectors(req);
@@ -325,11 +322,8 @@ static void do_xd_request (struct request_queue * q)
 					   block, count);
 	done:
 		/* wrap up, 0 = success, -errno = fail */
-		if (!__blk_end_request_cur(req, res)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, res))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 66f8345..6d4ac76 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -299,13 +299,13 @@ static void do_blkif_request(struct request_queue *rq)
 
 	queued = 0;
 
-	while ((req = elv_next_request(rq)) != NULL) {
+	while ((req = blk_peek_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
 
 		if (RING_FULL(&info->ring))
 			goto wait;
 
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 
 		if (!blk_fs_request(req)) {
 			__blk_end_request_all(req, -EIO);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index edf137b..3a4397e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -463,10 +463,10 @@ struct request *ace_get_next_request(struct request_queue * q)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_peek_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		__blk_end_request_all(req, -EIO);
 	}
 	return req;
@@ -498,10 +498,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			__blk_end_request_all(ace->req, -EIO);
 			ace->req = NULL;
 		}
-		while ((req = elv_next_request(ace->queue)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(ace->queue)) != NULL)
 			__blk_end_request_all(req, -EIO);
-		}
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
@@ -649,7 +647,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			ace->fsm_state = ACE_FSM_STATE_IDLE;
 			break;
 		}
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index c909c1a..4575171 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -71,10 +71,7 @@ static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		unsigned long start = blk_rq_pos(req) << 9;
 		unsigned long len  = blk_rq_cur_bytes(req);
@@ -100,11 +97,8 @@ static void do_z2_request(struct request_queue *q)
 			len -= size;
 		}
 	done:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 3cc02bf..1e366ad 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -642,9 +642,7 @@ static void gdrom_request(struct request_queue *rq)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(rq)) != NULL) {
-		blkdev_dequeue_request(req);
-
+	while ((req = blk_fetch_request(rq)) != NULL) {
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
 			__blk_end_request_all(req, -EIO);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index bbe9f08..ca741c2 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -297,9 +297,7 @@ static void do_viocd_request(struct request_queue *q)
 {
 	struct request *req;
 
-	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
-		blkdev_dequeue_request(req);
-
+	while ((rwreq == 0) && ((req = blk_fetch_request(q)) != NULL)) {
 		if (!blk_fs_request(req))
 			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2874c3d..8a894fa 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -269,7 +269,7 @@ void ide_retry_pc(ide_drive_t *drive)
 	blk_requeue_request(failed_rq->q, failed_rq);
 	drive->hwif->rq = NULL;
 	if (ide_queue_sense_rq(drive, pc)) {
-		blkdev_dequeue_request(failed_rq);
+		blk_start_request(failed_rq);
 		ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq));
 	}
 }
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index abda733..e4e3a0e3 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -519,11 +519,8 @@ repeat:
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
-		if (!rq) {
-			rq = elv_next_request(drive->queue);
-			if (rq)
-				blkdev_dequeue_request(rq);
-		}
+		if (!rq)
+			rq = blk_fetch_request(drive->queue);
 
 		spin_unlock_irq(q->queue_lock);
 		spin_lock_irq(&hwif->lock);
@@ -536,7 +533,7 @@ repeat:
 		/*
 		 * Sanity: don't accept a request that isn't a PM request
 		 * if we are currently power managed. This is very important as
-		 * blk_stop_queue() doesn't prevent the elv_next_request()
+		 * blk_stop_queue() doesn't prevent the blk_fetch_request()
 		 * above to return us whatever is in the queue. Since we call
 		 * ide_do_request() ourselves, we end up taking requests while
 		 * the queue is blocked...
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 58f5be8..c0bebc6 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -704,13 +704,12 @@ try_again:
 		return 0;
 	}
 
-	dev_dbg(&card->dev, "elv_next\n");
-	msb->block_req = elv_next_request(msb->queue);
+	dev_dbg(&card->dev, "blk_fetch\n");
+	msb->block_req = blk_fetch_request(msb->queue);
 	if (!msb->block_req) {
 		dev_dbg(&card->dev, "issue end\n");
 		return -EAGAIN;
 	}
-	blkdev_dequeue_request(msb->block_req);
 
 	dev_dbg(&card->dev, "trying again\n");
 	chunk = 1;
@@ -825,10 +824,8 @@ static void mspro_block_submit_req(struct request_queue *q)
 		return;
 
 	if (msb->eject) {
-		while ((req = elv_next_request(q)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(q)) != NULL)
 			__blk_end_request_all(req, -ENODEV);
-		}
 
 		return;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 8b5cbfc..6573ef4 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -877,7 +877,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 	struct request *req;
 
 	while (!blk_queue_plugged(q)) {
-		req = elv_next_request(q);
+		req = blk_peek_request(q);
 		if (!req)
 			break;
 
@@ -890,7 +890,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 
 			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
 				if (!i2o_block_transfer(req)) {
-					blkdev_dequeue_request(req);
+					blk_start_request(req);
 					continue;
 				} else
 					osm_info("transfer error\n");
@@ -917,7 +917,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 				break;
 			}
 		} else {
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 		}
 	}
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 4b70f1e..49e5823 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -54,11 +54,8 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!blk_queue_plugged(q))
+			req = blk_fetch_request(q);
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
@@ -94,10 +91,8 @@ static void mmc_request(struct request_queue *q)
 
 	if (!mq) {
 		printk(KERN_ERR "MMC: killing requests for dead queue\n");
-		while ((req = elv_next_request(q)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(q)) != NULL)
 			__blk_end_request_all(req, -EIO);
-		}
 		return;
 	}
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 3e10442..502622f 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -100,12 +100,7 @@ static int mtd_blktrans_thread(void *arg)
 		struct mtd_blktrans_dev *dev;
 		int res;
 
-		if (!req) {
-			req = elv_next_request(rq);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
-		if (!req) {
+		if (!req && !(req = blk_fetch_request(rq))) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(rq->queue_lock);
 			schedule();
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7df03c7a..e64f62d 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1656,17 +1656,13 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 	if (basedev->state < DASD_STATE_READY)
 		return;
 	/* Now we try to fetch requests from the request queue */
-	while (!blk_queue_plugged(queue) &&
-	       elv_next_request(queue)) {
-
-		req = elv_next_request(queue);
-
+	while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) {
 		if (basedev->features & DASD_FEATURE_READONLY &&
 		    rq_data_dir(req) == WRITE) {
 			DBF_DEV_EVENT(DBF_ERR, basedev,
 				      "Rejecting write request %p",
 				      req);
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
@@ -1695,7 +1691,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "CCW creation failed (rc=%ld) "
 				      "on request %p",
 				      PTR_ERR(cqr), req);
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
@@ -1705,7 +1701,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 		 */
 		cqr->callback_data = (void *) req;
 		cqr->status = DASD_CQR_FILLED;
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		list_add_tail(&cqr->blocklist, &block->ccw_queue);
 		dasd_profile_start(block, cqr, req);
 	}
@@ -2029,10 +2025,8 @@ static void dasd_flush_request_queue(struct dasd_block *block)
 		return;
 
 	spin_lock_irq(&block->request_queue_lock);
-	while ((req = elv_next_request(block->request_queue))) {
-		blkdev_dequeue_request(req);
+	while ((req = blk_fetch_request(block->request_queue)))
 		__blk_end_request_all(req, -EIO);
-	}
 	spin_unlock_irq(&block->request_queue_lock);
 }
 
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 5d035e4..1e79676 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -93,7 +93,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 		device->blk_data.block_position = -1;
 	device->discipline->free_bread(ccw_req);
 	if (!list_empty(&device->req_queue) ||
-	    elv_next_request(device->blk_data.request_queue))
+	    blk_peek_request(device->blk_data.request_queue))
 		tapeblock_trigger_requeue(device);
 }
 
@@ -162,19 +162,16 @@ tapeblock_requeue(struct work_struct *work) {
 	spin_lock_irq(&device->blk_data.request_queue_lock);
 	while (
 		!blk_queue_plugged(queue) &&
-		elv_next_request(queue)   &&
+		(req = blk_fetch_request(queue)) &&
 		nr_queued < TAPEBLOCK_MIN_REQUEUE
 	) {
-		req = elv_next_request(queue);
 		if (rq_data_dir(req) == WRITE) {
 			DBF_EVENT(1, "TBLOCK: Rejecting write request\n");
-			blkdev_dequeue_request(req);
 			spin_unlock_irq(&device->blk_data.request_queue_lock);
 			blk_end_request_all(req, -EIO);
 			spin_lock_irq(&device->blk_data.request_queue_lock);
 			continue;
 		}
-		blkdev_dequeue_request(req);
 		nr_queued++;
 		spin_unlock_irq(&device->blk_data.request_queue_lock);
 		rc = tapeblock_start_request(device, req);
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index f572a4a..6d46516 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -186,10 +186,7 @@ static void jsfd_do_request(struct request_queue *q)
 {
 	struct request *req;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
@@ -212,11 +209,8 @@ static void jsfd_do_request(struct request_queue *q)
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
 		err = 0;
 	end:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ee308f6..b12750f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1207,7 +1207,7 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 		break;
 	case BLKPREP_DEFER:
 		/*
-		 * If we defer, the elv_next_request() returns NULL, but the
+		 * If we defer, the blk_peek_request() returns NULL, but the
 		 * queue must be restarted, so we plug here if no returning
 		 * command will automatically do that.
 		 */
@@ -1385,7 +1385,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	struct scsi_target *starget = scsi_target(sdev);
 	struct Scsi_Host *shost = sdev->host;
 
-	blkdev_dequeue_request(req);
+	blk_start_request(req);
 
 	if (unlikely(cmd == NULL)) {
 		printk(KERN_CRIT "impossible request in %s.\n",
@@ -1477,7 +1477,7 @@ static void scsi_request_fn(struct request_queue *q)
 
 	if (!sdev) {
 		printk("scsi: killing requests for dead queue\n");
-		while ((req = elv_next_request(q)) != NULL)
+		while ((req = blk_peek_request(q)) != NULL)
 			scsi_kill_request(req, q);
 		return;
 	}
@@ -1498,7 +1498,7 @@ static void scsi_request_fn(struct request_queue *q)
 		 * that the request is fully prepared even if we cannot 
 		 * accept it.
 		 */
-		req = elv_next_request(q);
+		req = blk_peek_request(q);
 		if (!req || !scsi_dev_queue_ready(q, sdev))
 			break;
 
@@ -1514,7 +1514,7 @@ static void scsi_request_fn(struct request_queue *q)
 		 * Remove the request from the request list.
 		 */
 		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 		sdev->device_busy++;
 
 		spin_unlock(q->queue_lock);
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 50988cb..d606452 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -163,12 +163,10 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 	int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *);
 
 	while (!blk_queue_plugged(q)) {
-		req = elv_next_request(q);
+		req = blk_fetch_request(q);
 		if (!req)
 			break;
 
-		blkdev_dequeue_request(req);
-
 		spin_unlock_irq(q->queue_lock);
 
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c755803..6e59d3b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -818,8 +818,6 @@ static inline void blk_run_address_space(struct address_space *mapping)
 		blk_run_backing_dev(mapping->backing_dev_info, NULL);
 }
 
-extern void blkdev_dequeue_request(struct request *req);
-
 /*
  * blk_rq_pos()		: the current sector
  * blk_rq_bytes()	: bytes left in the entire request
@@ -853,6 +851,13 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 }
 
 /*
+ * Request issue related functions.
+ */
+extern struct request *blk_peek_request(struct request_queue *q);
+extern void blk_start_request(struct request *rq);
+extern struct request *blk_fetch_request(struct request_queue *q);
+
+/*
  * Request completion related functions.
  *
  * blk_update_request() completes given number of bytes and updates
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4e46287..1cb3372e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -103,10 +103,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
-extern void elv_dequeue_request(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern int elv_queue_empty(struct request_queue *);
-extern struct request *elv_next_request(struct request_queue *q);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
-- 
cgit v0.10.2


From 7bce6c2740fab36708233e998a9e53115649b193 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Date: Mon, 11 May 2009 06:51:28 +0000
Subject: sh: sh7785lcr: fix I2C device address map for 32-bit mode

This fixes up the broken I2C offset in 32-bit mode.
The cause is because the board datasheet had a mistake.

Signed-off-by: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/mach-common/mach/sh7785lcr.h b/arch/sh/include/mach-common/mach/sh7785lcr.h
index 1ce27d5..90011d4 100644
--- a/arch/sh/include/mach-common/mach/sh7785lcr.h
+++ b/arch/sh/include/mach-common/mach/sh7785lcr.h
@@ -9,11 +9,11 @@
  * -----------------------------+---------------+---------------
  * 0x00000000 - 0x03ffffff(CS0)	| NOR Flash	| NOR Flash
  * 0x04000000 - 0x05ffffff(CS1)	| PLD		| PLD
- * 0x06000000 - 0x07ffffff(CS1)	| reserved	| I2C
+ * 0x06000000 - 0x07ffffff(CS1)	| I2C		| I2C
  * 0x08000000 - 0x0bffffff(CS2)	| USB		| DDR SDRAM
  * 0x0c000000 - 0x0fffffff(CS3)	| SD		| DDR SDRAM
  * 0x10000000 - 0x13ffffff(CS4)	| SM107		| SM107
- * 0x14000000 - 0x17ffffff(CS5)	| I2C		| USB
+ * 0x14000000 - 0x17ffffff(CS5)	| reserved	| USB
  * 0x18000000 - 0x1bffffff(CS6)	| reserved	| SD
  * 0x40000000 - 0x5fffffff	| DDR SDRAM	| (cannot use)
  *
@@ -32,6 +32,9 @@
 #define PLD_VERSR		(PLD_BASE_ADDR + 0x0c)
 #define PLD_MMSR		(PLD_BASE_ADDR + 0x0e)
 
+#define PCA9564_ADDR		0x06000000	/* I2C */
+#define PCA9564_SIZE		0x00000100
+
 #define SM107_MEM_ADDR		0x10000000
 #define SM107_MEM_SIZE		0x00e00000
 #define SM107_REG_ADDR		0x13e00000
@@ -40,16 +43,13 @@
 #if defined(CONFIG_SH_SH7785LCR_29BIT_PHYSMAPS)
 #define R8A66597_ADDR		0x14000000	/* USB */
 #define CG200_ADDR		0x18000000	/* SD */
-#define PCA9564_ADDR		0x06000000	/* I2C */
 #else
 #define R8A66597_ADDR		0x08000000
 #define CG200_ADDR		0x0c000000
-#define PCA9564_ADDR		0x14000000
 #endif
 
 #define R8A66597_SIZE		0x00000100
 #define CG200_SIZE		0x00010000
-#define PCA9564_SIZE		0x00000100
 
 #endif  /* __ASM_SH_RENESAS_SH7785LCR_H */
 
-- 
cgit v0.10.2


From b9e0353fc85dab4ef5ebcef2bd09ebc4ce6d5a7b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:05:32 -0700
Subject: x86/acpi: remove irq-compression trick on 32-bit

We already have a per cpu vector on 32-bit via recent changes, and
don't need this trick any more (which trick obfuscates the real GSI
mappings and which only triggers on larger systems to begin with):

On 3 ioapic system (24 per ioapic) before patch I got:

ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71
IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 64 Mode:1 Active:1)
pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 64
ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67
IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 65 Mode:1 Active:1)
pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65
ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66
IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1)
pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65
IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 67 Mode:1 Active:1)
pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64
IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 68 Mode:1 Active:1)
pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65
pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65

after the patch we get:

ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71
IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 71 Mode:1 Active:1)
pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 71
ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67
IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 67 Mode:1 Active:1)
pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67
ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66
IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1)
pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65
IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 65 Mode:1 Active:1)
pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64
IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 64 Mode:1 Active:1)
pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67
pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67

As it can be seen that GSIs now get mapped lineary.

[ Impact: simplify irq number mapping on bigger 32-bit systems ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C35C.7060207@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6ee96b5..fb5e882 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1162,22 +1162,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM	4096
-#define IRQ_COMPRESSION_START	64
-
-	static int pci_irq = IRQ_COMPRESSION_START;
-	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, and IRQs
-	 * assigned to actual devices.
-	 */
-	static int gsi_to_irq[MAX_GSI_NUM];
-#else
 
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
-#endif
 
 	/* Don't set up the ACPI SCI because it's already set up */
 	if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,66 +1183,28 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		gsi = ioapic_renumber_irq(ioapic, gsi);
 #endif
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
 		       ioapic_pin);
 		return gsi;
 	}
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
 		return gsi;
-#endif
 	}
-
 	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
-	/*
-	 * For GSI >= 64, use IRQ compression
-	 */
-	if ((gsi >= IRQ_COMPRESSION_START)
-	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_gbl_FADT.sci_interrupt)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-#endif
 	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
 	return gsi;
 }
 
-- 
cgit v0.10.2


From ee214558c2e959781a406e76c5b34364da638e1d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:07:07 -0700
Subject: x86: fix alloc_mptable()

Fix the conditions when we stop updating the mptable due to
running out of slots.

[ Impact: fix memory corruption / non-working update_mptable boot parameter ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C3BB.1000609@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e4..cd2a41a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -870,24 +870,17 @@ static
 inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
 #endif /* CONFIG_X86_IO_APIC */
 
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
-		      int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
-	if (!mpc_new_phys) {
-		pr_info("No spare slots, try to append...take your risk, "
-			"new mpc_length %x\n", count);
-	} else {
-		if (count <= mpc_new_length)
-			pr_info("No spare slots, try to append..., "
-				"new mpc_length %x\n", count);
-		else {
-			pr_err("mpc_new_length %lx is too small\n",
-				mpc_new_length);
-			return -1;
-		}
+	int ret = 0;
+
+	if (!mpc_new_phys || count <= mpc_new_length) {
+		WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+		return -1;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +939,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 		} else {
 			struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
 			count += sizeof(struct mpc_intsrc);
-			if (!check_slot(mpc_new_phys, mpc_new_length, count))
+			if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
 				goto out;
 			assign_to_mpc_intsrc(&mp_irqs[i], m);
 			mpc->length = count;
-- 
cgit v0.10.2


From a31f82057ce6f7ced578d64c07a72ccbdc7336e4 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:06:15 -0700
Subject: x86/acpi: call mp_config_acpi_gsi() in mp_register_gsi()

The patch to call mp_config_acpi_gsi() from the ACPI IRQ registration
code never got mainline because there were open discussions about it.

This call is needed to properly update the kernel's copy of the mptable,
when the update_mptable boot parameter is needed.

Now that the dust has settled with the APIC unification, and since there
were no objections when the patch was re-submitted, try this again.

[ Impact: fix the update_mptable boot parameter ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C387.7090103@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3ea1f53..c34961a 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -77,17 +77,8 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
 extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-				u32 gsi, int triggering, int polarity);
 extern int mp_find_ioapic(int gsi);
 extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-		   u32 gsi, int triggering, int polarity)
-{
-	return 0;
-}
 #endif
 #else /* !CONFIG_ACPI: */
 static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fb5e882..8019ecf 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
+#include <linux/pci.h>
 
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -1158,6 +1159,44 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
+			int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mpc_intsrc mp_irq;
+	struct pci_dev *pdev;
+	unsigned char number;
+	unsigned int devfn;
+	int ioapic;
+	u8 pin;
+
+	if (!acpi_ioapic)
+		return 0;
+	if (!dev)
+		return 0;
+	if (dev->bus != &pci_bus_type)
+		return 0;
+
+	pdev = to_pci_dev(dev);
+	number = pdev->bus->number;
+	devfn = pdev->devfn;
+	pin = pdev->pin;
+	/* print the entry should happen on mptable identically */
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.srcbus = number;
+	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+	save_mp_irq(&mp_irq);
+#endif
+	return 0;
+}
+
 int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
@@ -1189,6 +1228,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
+	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
 
 	/*
 	 * Avoid pin reprogramming.  PRTs typically include entries
@@ -1208,32 +1248,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 	return gsi;
 }
 
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-			u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-	struct mpc_intsrc mp_irq;
-	int ioapic;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	/* print the entry should happen on mptable identically */
-	mp_irq.type = MP_INTSRC;
-	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-	mp_irq.srcbus = number;
-	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
-	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-	save_mp_irq(&mp_irq);
-#endif
-	return 0;
-}
-
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
-- 
cgit v0.10.2


From bdfe8ac153546537ed24de69610ea781a734f785 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:07:41 -0700
Subject: x86/acpi: move pin_programmed bit map to io_apic.c

Prepare to call setup_io_apic_routing() in pcibios_irq_enable()
also remove not needed member apic_id.

[ Impact: clean up, prepare for future change ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C3DD.3050104@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8019ecf..dcfbc3a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -904,10 +904,8 @@ extern int es7000_plat;
 #endif
 
 static struct {
-	int apic_id;
 	int gsi_base;
 	int gsi_end;
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
 int mp_find_ioapic(int gsi)
@@ -996,7 +994,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
 	mp_ioapic_routing[idx].gsi_base = gsi_base;
 	mp_ioapic_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx);
@@ -1189,7 +1186,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.dstapic = mp_ioapics[ioapic].apicid;
 	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
 
 	save_mp_irq(&mp_irq);
@@ -1224,23 +1221,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       "%d-%d\n", mp_ioapics[ioapic].apicid,
 		       ioapic_pin);
 		return gsi;
 	}
 	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-		return gsi;
-	}
-	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
 	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 21c30e1..e279ae3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3922,7 +3922,7 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 				 int triggering, int polarity)
 {
 	struct irq_desc *desc;
@@ -3959,6 +3959,29 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 	return 0;
 }
 
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapics[ioapic].apicid, pin);
+		return 0;
+	}
+	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
+					 triggering, polarity);
+}
 
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
-- 
cgit v0.10.2


From e20c06fd6950265a899edd96a02dc2e6ae2d1ce5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:08:22 -0700
Subject: x86/pci: add 4 more return parameters to IO_APIC_get_PCI_irq_vector()

To prepare those params for pcibios_irq_enable() to call setup_io_apic_routing().

[ Impact: extend function call API to prepare for new functionality ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01C406.2040303@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea4..26a40ab 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -63,7 +63,9 @@ extern unsigned long io_apic_irqs;
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+					int *ioapic, int *ioapic_pin,
+					int *trigger, int *polarity);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e279ae3..caf9dbd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -873,54 +873,6 @@ static int __init find_isa_irq_apic(int irq, int type)
 	return -1;
 }
 
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-	int apic, i, best_guess = -1;
-
-	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
-		bus, slot, pin);
-	if (test_bit(bus, mp_bus_not_pci)) {
-		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-		return -1;
-	}
-	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].srcbus;
-
-		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
-			    mp_irqs[i].dstapic == MP_APIC_ALL)
-				break;
-
-		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].irqtype &&
-		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
-			if (!(apic || IO_APIC_IRQ(irq)))
-				continue;
-
-			if (pin == (mp_irqs[i].srcbusirq & 3))
-				return irq;
-			/*
-			 * Use the first all-but-pin matching entry as a
-			 * best-guess fuzzy result for broken mptables.
-			 */
-			if (best_guess < 0)
-				best_guess = irq;
-		}
-	}
-	return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 /*
  * EISA Edge/Level control register, ELCR
@@ -1139,6 +1091,65 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+				int *ioapic, int *ioapic_pin,
+				int *trigger, int *polarity)
+{
+	int apic, i, best_guess = -1;
+
+	apic_printk(APIC_DEBUG,
+		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+		    bus, slot, pin);
+	if (test_bit(bus, mp_bus_not_pci)) {
+		apic_printk(APIC_VERBOSE,
+			    "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+		return -1;
+	}
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].srcbus;
+
+		for (apic = 0; apic < nr_ioapics; apic++)
+			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+			    mp_irqs[i].dstapic == MP_APIC_ALL)
+				break;
+
+		if (!test_bit(lbus, mp_bus_not_pci) &&
+		    !mp_irqs[i].irqtype &&
+		    (bus == lbus) &&
+		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+			if (!(apic || IO_APIC_IRQ(irq)))
+				continue;
+
+			if (pin == (mp_irqs[i].srcbusirq & 3)) {
+				*ioapic = apic;
+				*ioapic_pin = mp_irqs[i].dstirq;
+				*trigger = irq_trigger(i);
+				*polarity = irq_polarity(i);
+				return irq;
+			}
+			/*
+			 * Use the first all-but-pin matching entry as a
+			 * best-guess fuzzy result for broken mptables.
+			 */
+			if (best_guess < 0) {
+				*ioapic = apic;
+				*ioapic_pin = mp_irqs[i].dstirq;
+				*trigger = irq_trigger(i);
+				*polarity = irq_polarity(i);
+				best_guess = irq;
+			}
+		}
+	}
+	return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
 void lock_vector_lock(void)
 {
 	/* Used to the online set of cpus does not change
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6..a2f6bde 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1051,12 +1051,16 @@ static void __init pcibios_fixup_irqs(void)
 		 */
 		if (io_apic_assign_pci_irqs) {
 			int irq;
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
 
 			/*
 			 * interrupt pins are numbered starting from 1
 			 */
 			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-				PCI_SLOT(dev->devfn), pin - 1);
+						PCI_SLOT(dev->devfn), pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 			/*
 			 * Busses behind bridges are typically not listed in the
 			 * MP-table.  In this case we have to look up the IRQ
@@ -1072,7 +1076,10 @@ static void __init pcibios_fixup_irqs(void)
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				bus = bridge->bus->number;
 				irq = IO_APIC_get_PCI_irq_vector(bus,
-						PCI_SLOT(bridge->devfn), pin - 1);
+						PCI_SLOT(bridge->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 				if (irq >= 0)
 					dev_warn(&dev->dev,
 						"using bridge %s INT %c to "
@@ -1221,8 +1228,14 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
 		if (io_apic_assign_pci_irqs) {
 			int irq;
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
 
-			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+						PCI_SLOT(dev->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 			/*
 			 * Busses behind bridges are typically not listed in the MP-table.
 			 * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1248,10 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
-						PCI_SLOT(bridge->devfn), pin - 1);
+						PCI_SLOT(bridge->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 				if (irq >= 0)
 					dev_warn(&dev->dev, "using bridge %s "
 						 "INT %c to get IRQ %d\n",
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index dd18f85..ef53b05 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -153,45 +153,49 @@ int ibmphp_init_devno(struct slot **cur_slot)
 		return -1;
 	}
 	for (loop = 0; loop < len; loop++) {
-		if ((*cur_slot)->number == rtable->slots[loop].slot) {
-		if ((*cur_slot)->bus == rtable->slots[loop].bus) {
+		if ((*cur_slot)->number == rtable->slots[loop].slot &&
+		    (*cur_slot)->bus == rtable->slots[loop].bus) {
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
+
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-						(int) (*cur_slot)->device, i);
-
-				debug("(*cur_slot)->irq[0] = %x\n",
-						(*cur_slot)->irq[0]);
-				debug("(*cur_slot)->irq[1] = %x\n",
-						(*cur_slot)->irq[1]);
-				debug("(*cur_slot)->irq[2] = %x\n",
-						(*cur_slot)->irq[2]);
-				debug("(*cur_slot)->irq[3] = %x\n",
-						(*cur_slot)->irq[3]);
-
-				debug("rtable->exlusive_irqs = %x\n",
+						(int) (*cur_slot)->device, i.
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
+
+			debug("(*cur_slot)->irq[0] = %x\n",
+					(*cur_slot)->irq[0]);
+			debug("(*cur_slot)->irq[1] = %x\n",
+					(*cur_slot)->irq[1]);
+			debug("(*cur_slot)->irq[2] = %x\n",
+					(*cur_slot)->irq[2]);
+			debug("(*cur_slot)->irq[3] = %x\n",
+					(*cur_slot)->irq[3]);
+
+			debug("rtable->exlusive_irqs = %x\n",
 					rtable->exclusive_irqs);
-				debug("rtable->slots[loop].irq[0].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[0].bitmap = %x\n",
 					rtable->slots[loop].irq[0].bitmap);
-				debug("rtable->slots[loop].irq[1].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[1].bitmap = %x\n",
 					rtable->slots[loop].irq[1].bitmap);
-				debug("rtable->slots[loop].irq[2].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[2].bitmap = %x\n",
 					rtable->slots[loop].irq[2].bitmap);
-				debug("rtable->slots[loop].irq[3].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[3].bitmap = %x\n",
 					rtable->slots[loop].irq[3].bitmap);
 
-				debug("rtable->slots[loop].irq[0].link = %x\n",
+			debug("rtable->slots[loop].irq[0].link = %x\n",
 					rtable->slots[loop].irq[0].link);
-				debug("rtable->slots[loop].irq[1].link = %x\n",
+			debug("rtable->slots[loop].irq[1].link = %x\n",
 					rtable->slots[loop].irq[1].link);
-				debug("rtable->slots[loop].irq[2].link = %x\n",
+			debug("rtable->slots[loop].irq[2].link = %x\n",
 					rtable->slots[loop].irq[2].link);
-				debug("rtable->slots[loop].irq[3].link = %x\n",
+			debug("rtable->slots[loop].irq[3].link = %x\n",
 					rtable->slots[loop].irq[3].link);
-				debug("end of init_devno\n");
-				kfree(rtable);
-				return 0;
-			}
+			debug("end of init_devno\n");
+			kfree(rtable);
+			return 0;
 		}
 	}
 
-- 
cgit v0.10.2


From 5ef2183768bb7d64b85eccbfa1537a61cbefa97c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:08:50 -0700
Subject: x86/acpi: move setup io apic routing out of CONFIG_ACPI scope

So we could set io apic routing when ACPI is not enabled.

[ Impact: prepare for new functionality ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C422.5070400@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 27bd2fd..6fd99f9 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,10 +154,10 @@ extern int timer_through_8259;
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
-				  int irq, int edge_level, int active_high_low);
 #endif /* CONFIG_ACPI */
 
+extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
+				  int irq, int edge_level, int active_high_low);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index caf9dbd..3a68dae 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3839,6 +3839,67 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
+static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int node;
+
+	if (!IO_APIC_IRQ(irq)) {
+		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+			ioapic);
+		return -EINVAL;
+	}
+
+	if (dev)
+		node = dev_to_node(dev);
+	else
+		node = cpu_to_node(boot_cpu_id);
+
+	desc = irq_to_desc_alloc_node(irq, node);
+	if (!desc) {
+		printk(KERN_INFO "can not get irq_desc %d\n", irq);
+		return 0;
+	}
+
+	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= NR_IRQS_LEGACY) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_node(cfg, node, ioapic, pin);
+	}
+
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
+
+	return 0;
+}
+
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapics[ioapic].apicid, pin);
+		return 0;
+	}
+	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
+					 triggering, polarity);
+}
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -3933,67 +3994,6 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
-{
-	struct irq_desc *desc;
-	struct irq_cfg *cfg;
-	int node;
-
-	if (!IO_APIC_IRQ(irq)) {
-		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			ioapic);
-		return -EINVAL;
-	}
-
-	if (dev)
-		node = dev_to_node(dev);
-	else
-		node = cpu_to_node(boot_cpu_id);
-
-	desc = irq_to_desc_alloc_node(irq, node);
-	if (!desc) {
-		printk(KERN_INFO "can not get irq_desc %d\n", irq);
-		return 0;
-	}
-
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= NR_IRQS_LEGACY) {
-		cfg = desc->chip_data;
-		add_pin_to_irq_node(cfg, node, ioapic, pin);
-	}
-
-	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
-	return 0;
-}
-
-static struct {
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
-{
-
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapics[ioapic].apicid, pin);
-		return 0;
-	}
-	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
-
-	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
-					 triggering, polarity);
-}
-
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
 	int i;
-- 
cgit v0.10.2


From b9c61b70075c87a8612624736faf4a2de5b1ed30 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:10:06 -0700
Subject: x86/pci: update pirq_enable_irq() to setup io apic routing

So we can set io apic routing only when enabling the device irq.

This is advantageous for IRQ descriptor allocation affinity: if we set up
the IO-APIC entry later, we have a chance to allocate the IRQ descriptor
later and know which device it is on and can set affinity accordingly.

[ Impact: standardize/enhance irq-enabling sequence for mptable irqs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01C46E.8000501@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3a68dae..5d5f412 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1480,9 +1480,13 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 	ioapic_write_entry(apic_id, pin, entry);
 }
 
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
 static void __init setup_IO_APIC_irqs(void)
 {
-	int apic_id, pin, idx, irq;
+	int apic_id = 0, pin, idx, irq;
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
@@ -1490,48 +1494,53 @@ static void __init setup_IO_APIC_irqs(void)
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-		for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
-			idx = find_irq_entry(apic_id, pin, mp_INT);
-			if (idx == -1) {
-				if (!notcon) {
-					notcon = 1;
-					apic_printk(APIC_VERBOSE,
-						KERN_DEBUG " %d-%d",
-						mp_ioapics[apic_id].apicid, pin);
-				} else
-					apic_printk(APIC_VERBOSE, " %d-%d",
-						mp_ioapics[apic_id].apicid, pin);
-				continue;
-			}
-			if (notcon) {
-				apic_printk(APIC_VERBOSE,
-					" (apicid-pin) not connected\n");
-				notcon = 0;
-			}
+#ifdef CONFIG_ACPI
+	if (!acpi_disabled && acpi_ioapic) {
+		apic_id = mp_find_ioapic(0);
+		if (apic_id < 0)
+			apic_id = 0;
+	}
+#endif
 
-			irq = pin_2_irq(idx, apic_id, pin);
+	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+		idx = find_irq_entry(apic_id, pin, mp_INT);
+		if (idx == -1) {
+			if (!notcon) {
+				notcon = 1;
+				apic_printk(APIC_VERBOSE,
+					KERN_DEBUG " %d-%d",
+					mp_ioapics[apic_id].apicid, pin);
+			} else
+				apic_printk(APIC_VERBOSE, " %d-%d",
+					mp_ioapics[apic_id].apicid, pin);
+			continue;
+		}
+		if (notcon) {
+			apic_printk(APIC_VERBOSE,
+				" (apicid-pin) not connected\n");
+			notcon = 0;
+		}
 
-			/*
-			 * Skip the timer IRQ if there's a quirk handler
-			 * installed and if it returns 1:
-			 */
-			if (apic->multi_timer_check &&
-					apic->multi_timer_check(apic_id, irq))
-				continue;
+		irq = pin_2_irq(idx, apic_id, pin);
 
-			desc = irq_to_desc_alloc_node(irq, node);
-			if (!desc) {
-				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-				continue;
-			}
-			cfg = desc->chip_data;
-			add_pin_to_irq_node(cfg, node, apic_id, pin);
+		/*
+		 * Skip the timer IRQ if there's a quirk handler
+		 * installed and if it returns 1:
+		 */
+		if (apic->multi_timer_check &&
+				apic->multi_timer_check(apic_id, irq))
+			continue;
 
-			setup_IO_APIC_irq(apic_id, pin, irq, desc,
-					irq_trigger(idx), irq_polarity(idx));
+		desc = irq_to_desc_alloc_node(irq, node);
+		if (!desc) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+			continue;
 		}
+		cfg = desc->chip_data;
+		add_pin_to_irq_node(cfg, node, apic_id, pin);
+		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+		setup_IO_APIC_irq(apic_id, pin, irq, desc,
+				irq_trigger(idx), irq_polarity(idx));
 	}
 
 	if (notcon)
@@ -3876,10 +3885,6 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 	return 0;
 }
 
-static struct {
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
 int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 				 int triggering, int polarity)
 {
@@ -4023,51 +4028,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 #ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
-	int pin, ioapic, irq, irq_entry;
+	int pin, ioapic = 0, irq, irq_entry;
 	struct irq_desc *desc;
-	struct irq_cfg *cfg;
 	const struct cpumask *mask;
 
 	if (skip_ioapic_setup == 1)
 		return;
 
-	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-			irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-			if (irq_entry == -1)
-				continue;
-			irq = pin_2_irq(irq_entry, ioapic, pin);
-
-			/* setup_IO_APIC_irqs could fail to get vector for some device
-			 * when you have too many devices, because at that time only boot
-			 * cpu is online.
-			 */
-			desc = irq_to_desc(irq);
-			cfg = desc->chip_data;
-			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq, desc,
-						  irq_trigger(irq_entry),
-						  irq_polarity(irq_entry));
-				continue;
+#ifdef CONFIG_ACPI
+	if (!acpi_disabled && acpi_ioapic) {
+		ioapic = mp_find_ioapic(0);
+		if (ioapic < 0)
+			ioapic = 0;
+	}
+#endif
 
-			}
+	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+		if (irq_entry == -1)
+			continue;
+		irq = pin_2_irq(irq_entry, ioapic, pin);
 
-			/*
-			 * Honour affinities which have been set in early boot
-			 */
-			if (desc->status &
-			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-				mask = desc->affinity;
-			else
-				mask = apic->target_cpus();
+		desc = irq_to_desc(irq);
 
-			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq_desc(desc, mask);
-			else
-				set_ioapic_affinity_irq_desc(desc, mask);
-		}
+		/*
+		 * Honour affinities which have been set in early boot
+		 */
+		if (desc->status &
+		    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+			mask = desc->affinity;
+		else
+			mask = apic->target_cpus();
 
+		if (intr_remapping_enabled)
+			set_ir_ioapic_affinity_irq_desc(desc, mask);
+		else
+			set_ioapic_affinity_irq_desc(desc, mask);
 	}
+
 }
 #endif
 
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index a2f6bde..2f3e192 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 		return 0;
 	}
 
+	if (io_apic_assign_pci_irqs)
+		return 0;
+
 	/* Find IRQ routing entry */
 
 	if (!pirq_table)
@@ -1039,63 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
 		pirq_penalty[dev->irq]++;
 	}
 
+	if (io_apic_assign_pci_irqs)
+		return;
+
 	dev = NULL;
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
 		pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
 		if (!pin)
 			continue;
 
-#ifdef CONFIG_X86_IO_APIC
-		/*
-		 * Recalculate IRQ numbers if we use the I/O APIC.
-		 */
-		if (io_apic_assign_pci_irqs) {
-			int irq;
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
-
-			/*
-			 * interrupt pins are numbered starting from 1
-			 */
-			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-						PCI_SLOT(dev->devfn), pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
-			/*
-			 * Busses behind bridges are typically not listed in the
-			 * MP-table.  In this case we have to look up the IRQ
-			 * based on the parent bus, parent slot, and pin number.
-			 * The SMP code detects such bridged busses itself so we
-			 * should get into this branch reliably.
-			 */
-			if (irq < 0 && dev->bus->parent) {
-				/* go back to the bridge */
-				struct pci_dev *bridge = dev->bus->self;
-				int bus;
-
-				pin = pci_swizzle_interrupt_pin(dev, pin);
-				bus = bridge->bus->number;
-				irq = IO_APIC_get_PCI_irq_vector(bus,
-						PCI_SLOT(bridge->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
-				if (irq >= 0)
-					dev_warn(&dev->dev,
-						"using bridge %s INT %c to "
-							"get IRQ %d\n",
-						 pci_name(bridge),
-						 'A' + pin - 1, irq);
-			}
-			if (irq >= 0) {
-				dev_info(&dev->dev,
-					"PCI->APIC IRQ transform: INT %c "
-						"-> IRQ %d\n",
-					'A' + pin - 1, irq);
-				dev->irq = irq;
-			}
-		}
-#endif
 		/*
 		 * Still no IRQ? Try to lookup one...
 		 */
@@ -1190,6 +1145,19 @@ int __init pcibios_irq_init(void)
 	pcibios_enable_irq = pirq_enable_irq;
 
 	pcibios_fixup_irqs();
+
+	if (io_apic_assign_pci_irqs && pci_routeirq) {
+		struct pci_dev *dev = NULL;
+		/*
+		 * PCI IRQ routing is set up by pci_enable_device(), but we
+		 * also do it here in case there are still broken drivers that
+		 * don't use pci_enable_device().
+		 */
+		printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+		for_each_pci_dev(dev)
+			pirq_enable_irq(dev);
+	}
+
 	return 0;
 }
 
@@ -1220,13 +1188,17 @@ void pcibios_penalize_isa_irq(int irq, int active)
 static int pirq_enable_irq(struct pci_dev *dev)
 {
 	u8 pin;
-	struct pci_dev *temp_dev;
 
 	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
-	if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+	if (pin && !pcibios_lookup_irq(dev, 1)) {
 		char *msg = "";
 
+		if (!io_apic_assign_pci_irqs && dev->irq)
+			return 0;
+
 		if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+			struct pci_dev *temp_dev;
 			int irq;
 			int ioapic = -1, ioapic_pin = -1;
 			int triggering, polarity;
@@ -1261,12 +1233,16 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
+				io_apic_set_pci_routing(&dev->dev, ioapic,
+							ioapic_pin, irq,
+							triggering, polarity);
+				dev->irq = irq;
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
-				dev->irq = irq;
 				return 0;
 			} else
 				msg = "; probably buggy MP table";
+#endif
 		} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
 			msg = "";
 		else
-- 
cgit v0.10.2


From 61fe91e1319556f32bebfd7ed2c68ef02e2c17f7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: x86: apic: Check rev 3 fadt correctly for physical_apic bit

Impact: fix fadt version checking

FADT2_REVISION_ID has value 3 aka rev 3 FADT. So need to use >= instead
of >, as other places in the code do.

[ Impact: extend scope of APIC boot quirk ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e8..744e6d8 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	 * regardless of how many processors are present (x86_64 ES7000
 	 * is an example).
 	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
 		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "system APIC only can use physical flat");
 		return 1;
-- 
cgit v0.10.2


From 3e0c373749d7eb5b354ac0b043f2b2cdf84eefef Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: x86: clean up and fix setup_clear/force_cpu_cap handling

setup_force_cpu_cap() only have one user (Xen guest code),
but it should not reuse cleared_cpu_cpus, otherwise it
will have problems on SMP.

Need to have a separate cpu_cpus_set array too, for forced-on
flags, beyond the forced-off flags.

Also need to setup handling before all cpus caps are combined.

[ Impact: fix the forced-set CPU feature flag logic ]

Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index ccc1061..13cc6a5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -192,11 +192,11 @@ extern const char * const x86_power_flags[32];
 #define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
 	clear_cpu_cap(&boot_cpu_data, bit);	\
-	set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+	set_bit(bit, (unsigned long *)cpu_caps_cleared); \
 } while (0)
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
-	clear_bit(bit, (unsigned long *)cleared_cpu_caps);	\
+	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
 #define cpu_has_fpu		boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae..fed93fe 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
 extern struct tss_struct	doublefault_tss;
-extern __u32			cleared_cpu_caps[NCAPINTS];
+extern __u32			cpu_caps_cleared[NCAPINTS];
+extern __u32			cpu_caps_set[NCAPINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f6678..e7fd5c4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -292,7 +292,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
 	return NULL;		/* Not found */
 }
 
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
 
 void load_percpu_segment(int cpu)
 {
@@ -806,6 +807,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 #endif
 
 	init_hypervisor(c);
+
+	/*
+	 * Clear/Set all flags overriden by options, need do it
+	 * before following smp all cpus cap AND.
+	 */
+	for (i = 0; i < NCAPINTS; i++) {
+		c->x86_capability[i] &= ~cpu_caps_cleared[i];
+		c->x86_capability[i] |= cpu_caps_set[i];
+	}
+
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -818,10 +829,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
-	/* Clear all flags overriden by options */
-	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
 #ifdef CONFIG_X86_MCE
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
-- 
cgit v0.10.2


From 1822952ba2b9f22f79019d07ebbeca31dc14b718 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:07 +0900
Subject: block: let blk_end_request_all handle bidi requests

blk_end_request_all() and __blk_end_request_all() should finish all
bytes including bidi, by definition. That's what all bidi users need ,
bidi requests must be complete as a whole (partial completion is
impossible).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6e59d3b..1069f44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,8 +910,12 @@ static inline bool blk_end_request(struct request *rq, int error,
 static inline void blk_end_request_all(struct request *rq, int error)
 {
 	bool pending;
+	unsigned int bidi_bytes = 0;
 
-	pending = blk_end_request(rq, error, blk_rq_bytes(rq));
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
 	BUG_ON(pending);
 }
 
@@ -962,8 +966,12 @@ static inline bool __blk_end_request(struct request *rq, int error,
 static inline void __blk_end_request_all(struct request *rq, int error)
 {
 	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
 
-	pending = __blk_end_request(rq, error, blk_rq_bytes(rq));
+	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
 	BUG_ON(pending);
 }
 
-- 
cgit v0.10.2


From e6bb7a96c2c36f20c05ef648f15bd3c2b1834c78 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:08 +0900
Subject: scsi: simplify the bidi completion

Let's use blk_end_request_all() instead of blk_end_bidi_request().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b12750f..a54bec9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -673,33 +673,6 @@ void scsi_release_buffers(struct scsi_cmnd *cmd)
 EXPORT_SYMBOL(scsi_release_buffers);
 
 /*
- * Bidi commands Must be complete as a whole, both sides at once.  If
- * part of the bytes were written and lld returned scsi_in()->resid
- * and/or scsi_out()->resid this information will be left in
- * req->resid_len and req->next_rq->resid_len. The upper-layer driver
- * can decide what to do with this information.
- */
-static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
-{
-	struct request *req = cmd->request;
-
-	req->resid_len = scsi_out(cmd)->resid;
-	req->next_rq->resid_len = scsi_in(cmd)->resid;
-
-	/* The req and req->next_rq have not been completed */
-	BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req),
-				    blk_rq_bytes(req->next_rq)));
-
-	scsi_release_buffers(cmd);
-
-	/*
-	 * This will goose the queue request function at the end, so we don't
-	 * need to worry about launching another command.
-	 */
-	scsi_next_command(cmd);
-}
-
-/*
  * Function:    scsi_io_completion()
  *
  * Purpose:     Completion processing for block device I/O requests.
@@ -772,12 +745,22 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			if (!sense_deferred)
 				error = -EIO;
 		}
+
+		req->resid_len = scsi_get_resid(cmd);
+
 		if (scsi_bidi_cmnd(cmd)) {
-			/* will also release_buffers */
-			scsi_end_bidi_request(cmd);
+			/*
+			 * Bidi commands Must be complete as a whole,
+			 * both sides at once.
+			 */
+			req->next_rq->resid_len = scsi_in(cmd)->resid;
+
+			blk_end_request_all(req, 0);
+
+			scsi_release_buffers(cmd);
+			scsi_next_command(cmd);
 			return;
 		}
-		req->resid_len = scsi_get_resid(cmd);
 	}
 
 	BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
-- 
cgit v0.10.2


From b1f744937f1be3e6d3009382a755679133cf782d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:09 +0900
Subject: block: move completion related functions back to blk-core.c

Let's put the completion related functions back to block/blk-core.c
where they have lived. We can also unexport blk_end_bidi_request() and
__blk_end_bidi_request(), which nobody uses.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 93691d2..a2d97de 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2026,8 +2026,8 @@ static void blk_finish_request(struct request *req, int error)
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool blk_end_bidi_request(struct request *rq, int error,
-			  unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool blk_end_bidi_request(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
@@ -2041,7 +2041,6 @@ bool blk_end_bidi_request(struct request *rq, int error,
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
  * __blk_end_bidi_request - Complete a bidi request with queue lock held
@@ -2058,8 +2057,8 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool __blk_end_bidi_request(struct request *rq, int error,
-			    unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool __blk_end_bidi_request(struct request *rq, int error,
+				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
 		return true;
@@ -2068,7 +2067,124 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(__blk_end_bidi_request);
+
+/**
+ * blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	return blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+EXPORT_SYMBOL_GPL(blk_end_request);
+
+/**
+ * blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.
+ */
+void blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+	BUG_ON(pending);
+}
+EXPORT_SYMBOL_GPL(blk_end_request_all);
+
+/**
+ * blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool blk_end_request_cur(struct request *rq, int error)
+{
+	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_cur);
+
+/**
+ * __blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Must be called with queue lock held unlike blk_end_request().
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+EXPORT_SYMBOL_GPL(__blk_end_request);
+
+/**
+ * __blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.  Must be called with queue lock held.
+ */
+void __blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+	BUG_ON(pending);
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_all);
+
+/**
+ * __blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.  Must
+ *     be called with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool __blk_end_request_cur(struct request *rq, int error)
+{
+	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_cur);
 
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1069f44..f9d60a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -872,126 +872,14 @@ extern struct request *blk_fetch_request(struct request_queue *q);
  */
 extern bool blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
-extern bool blk_end_bidi_request(struct request *rq, int error,
-				 unsigned int nr_bytes,
-				 unsigned int bidi_bytes);
-extern bool __blk_end_bidi_request(struct request *rq, int error,
-				   unsigned int nr_bytes,
-				   unsigned int bidi_bytes);
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static inline bool blk_end_request(struct request *rq, int error,
-				   unsigned int nr_bytes)
-{
-	return blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-
-/**
- * blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Completely finish @rq.
- */
-static inline void blk_end_request_all(struct request *rq, int error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-
-/**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-static inline bool blk_end_request_cur(struct request *rq, int error)
-{
-	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static inline bool __blk_end_request(struct request *rq, int error,
-				     unsigned int nr_bytes)
-{
-	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-
-/**
- * __blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Completely finish @rq.  Must be called with queue lock held.
- */
-static inline void __blk_end_request_all(struct request *rq, int error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-
-/**
- * __blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.  Must
- *     be called with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-static inline bool __blk_end_request_cur(struct request *rq, int error)
-{
-	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
+extern bool blk_end_request(struct request *rq, int error,
+			    unsigned int nr_bytes);
+extern void blk_end_request_all(struct request *rq, int error);
+extern bool blk_end_request_cur(struct request *rq, int error);
+extern bool __blk_end_request(struct request *rq, int error,
+			      unsigned int nr_bytes);
+extern void __blk_end_request_all(struct request *rq, int error);
+extern bool __blk_end_request_cur(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
-- 
cgit v0.10.2


From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: x86: clean up and and print out initial max_pfn_mapped

Do this so we can check the range that is mapped before
init_memory_mapping().

To be able to print out meaningful info, we first have to fix
64-bit to have max_pfn_mapped assigned before that call. This
also unifies the code-path a bit.

[ Impact: print more debug info, cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49BF0978.40605@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0d77e56..4031d6c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p)
 		max_low_pfn = max_pfn;
 
 	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 #endif
 
 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
 	setup_bios_corruption_check();
 #endif
 
+	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+			max_pfn_mapped<<PAGE_SHIFT);
+
 	reserve_brk();
 
 	/* max_pfn_mapped is updated here */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 95f5ecf..92d2108 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -132,12 +132,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	 */
 #ifdef CONFIG_X86_32
 	start = 0x7000;
-	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
 	start = 0x8000;
-	e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
 #endif
+	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
 	if (e820_table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
-- 
cgit v0.10.2


From 4401da6111ac58f94234417427d06a72c4048c74 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 2 May 2009 10:40:57 -0700
Subject: x86: read apic ID in the !acpi_lapic case

Ed found that on 32-bit, boot_cpu_physical_apicid is not read right,
when the mptable is broken.

Interestingly, actually three paths use/set it:

 1. acpi: at that time that is already read from reg
 2. mptable: only read from mptable
 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit

so we could read the apic id for the 2/3 path. We trust the hardware
register more than we trust a BIOS data structure (the mptable).

We can also avoid the double set_fixmap() when acpi_lapic
is used, and also need to move cpu_has_apic earlier and
call apic_disable().

Also when need to update the apic id, we'd better read and
set the apic version as well - so that quirks are applied precisely.

v2: make path 3 with 64bit, use -1 as apic id, so could read it later.
v3: fix whitespace problem pointed out by Ed Swierk

[ Impact: get correct apic id for bsp other than acpi path ]

Reported-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <49FC85A9.2070702@kernel.org>
[ v4: sanity-check in the ACPI case too ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e258bed..1ee966f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1456,7 +1456,6 @@ static int __init detect_init_APIC(void)
 	}
 
 	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-	boot_cpu_physical_apicid = 0;
 	return 0;
 }
 #else
@@ -1570,6 +1569,8 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
+	unsigned int new_apicid;
+
 	if (x2apic_mode) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
@@ -1586,21 +1587,31 @@ void __init init_apic_mappings(void)
 	} else
 		apic_phys = mp_lapic_addr;
 
-	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+	/* lets check if we may NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+		return;
+	}
+
+	/*
+	 * acpi lapic path already maps that address in
+	 * acpi_register_lapic_address()
+	 */
+	if (!acpi_lapic)
+		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
 	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
 				APIC_BASE, apic_phys);
-
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
-	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = read_apic_id();
-
-	/* lets check if we may to NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
+	new_apicid = read_apic_id();
+	if (boot_cpu_physical_apicid != new_apicid) {
+		boot_cpu_physical_apicid = new_apicid;
+		apic_version[new_apicid] =
+			 GET_APIC_VERSION(apic_read(APIC_LVR));
 	}
 }
 
-- 
cgit v0.10.2


From b37ab91907e9002925f4217e3bbd496aa12c2fa3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 8 May 2009 00:36:44 -0700
Subject: x86: Sanity check the e820 against the SRAT table using e820 map only

node_cover_memory() sanity checks the SRAT table by ensuring that all
PXMs cover the memory reported in the e820.

However, when calculating the size of the holes in the e820, it uses
the early_node_map[] which contains information taken from both SRAT
and e820. If the SRAT is missing an entry, then it is not detected
that the SRAT table is incorrect and missing entries.

This patch uses the e820 map to calculate the holes instead of
early_node_map[].

comment is from Mel.

[ Impact: reject incorrect SRAT tables ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A03E10C.60906@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 0176595..c7a18aa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -345,7 +345,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 			pxmram = 0;
 	}
 
-	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+	e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
 	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
 	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
 		printk(KERN_ERR
-- 
cgit v0.10.2


From 0964b0562bb9c93194e852b47bab2397b9e11c18 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 8 May 2009 00:37:34 -0700
Subject: x86: Allow 1MB of slack between the e820 map and SRAT, not 4GB

It is expected that there might be slight differences between the e820
map and the SRAT table and the intention was that 1MB of slack be allowed.

The calculation comparing e820ram and pxmram assumes the units are bytes,
when they are in fact pages. This means 4GB of slack is being allowed,
not 1MB. This patch makes the correct comparison.

comment is from Mel.

[ Impact: don't accept buggy SRATs that could dump up to 4G of RAM ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A03E13E.6050107@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c7a18aa..87b45bf 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -346,8 +346,8 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	}
 
 	e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
-	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
 		printk(KERN_ERR
 	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
 			(pxmram << PAGE_SHIFT) >> 20,
-- 
cgit v0.10.2


From 53c0054c3f11b49fc09f24e46f58661def952728 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 08:45:27 +0000
Subject: sh: include empty_zero_page in text

Include empty_zero_page in _text. This fixes a problem
introduced by c3e2586b794b12ffcdf69b4e547030b51e18e6d9
which results in broken boot on R2D-Plus.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index d737230..f53c76a 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -26,12 +26,13 @@ SECTIONS
 	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
 #endif
 
+	_text = .;		/* Text and read-only data */
+
 	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
 		*(.empty_zero_page)
 	} = 0
 
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;		/* Text and read-only data */
 		HEAD_TEXT
 		TEXT_TEXT
 
-- 
cgit v0.10.2


From 03f408f1aad8d0f5eb6380732bffc0f72b250fa7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 08:54:54 +0000
Subject: sh: TMU platform data for sh775x

This patch adds TMU platform data for sh775x. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index a1c80d9..09da0c1 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/io.h>
+#include <linux/sh_timer.h>
 #include <linux/serial_sci.h>
 
 static struct resource rtc_resources[] = {
@@ -60,9 +61,177 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+/* SH7750R, SH7751 and SH7751R all have two extra timer channels */
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xfe100008,
+		.end	= 0xfe100013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 72,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xfe100014,
+		.end	= 0xfe10001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 76,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+#endif
+
 static struct platform_device *sh7750_devices[] __initdata = {
 	&rtc_device,
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+	&tmu3_device,
+	&tmu4_device,
+#endif
 };
 
 static int __init sh7750_devices_setup(void)
@@ -72,6 +241,24 @@ static int __init sh7750_devices_setup(void)
 }
 __initcall(sh7750_devices_setup);
 
+static struct platform_device *sh7750_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+	&tmu3_device,
+	&tmu4_device,
+#endif
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7750_early_devices,
+				   ARRAY_SIZE(sh7750_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From c42f32dca3855d8f867387ec6993d9b62516a00e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 09:06:24 +0000
Subject: sh: TMU platform data for sh7760

This patch adds TMU platform data for sh7760. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index d9bdc93..4a6fd0d 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -10,6 +10,7 @@
 #include <linux/platform_device.h>
 #include <linux/init.h>
 #include <linux/serial.h>
+#include <linux/sh_timer.h>
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 
@@ -168,8 +169,104 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+
 static struct platform_device *sh7760_devices[] __initdata = {
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 static int __init sh7760_devices_setup(void)
@@ -179,6 +276,18 @@ static int __init sh7760_devices_setup(void)
 }
 __initcall(sh7760_devices_setup);
 
+static struct platform_device *sh7760_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7760_early_devices,
+				   ARRAY_SIZE(sh7760_early_devices));
+}
+
 #define INTC_ICR	0xffd00000UL
 #define INTC_ICR_IRLM	(1 << 7)
 
-- 
cgit v0.10.2


From 3551f88f6439cf4da3f5a3747b320280e30500de Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 7 May 2009 15:35:41 +0300
Subject: x86: unify 64-bit UMA and NUMA paging_init()

64-bit UMA and NUMA versions of paging_init() are almost identical.
Therefore, merge the copy in mm/numa_64.c to mm/init_64.c to remove
duplicate code.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1241699741.17846.30.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6a1a573..be7e127 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -585,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
 	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 }
+#endif
 
 void __init paging_init(void)
 {
@@ -595,11 +596,14 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
+#ifdef CONFIG_NUMA
+	sparse_memory_present_with_active_regions(MAX_NUMNODES);
+#else
 	memory_present(0, 0, max_pfn);
+#endif
 	sparse_init();
 	free_area_init_nodes(max_zone_pfns);
 }
-#endif
 
 /*
  * Memory hotplug specific functions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12..fb61d81 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -578,21 +578,6 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
-void __init paging_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-	max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
-	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-	sparse_init();
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 static __init int numa_setup(char *opt)
 {
 	if (!opt)
-- 
cgit v0.10.2


From 087fa4e964e04371a67f340e1f6ac92c5db5e0a6 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 7 May 2009 15:35:42 +0300
Subject: x86: use sparse_memory_present_with_active_regions() on UMA

There's no need to use call memory_present() manually on UMA because
initmem_init() sets up early_node_map by calling
e820_register_active_regions().

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1241699742.17846.31.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index be7e127..52bb951 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,11 +596,7 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
-#ifdef CONFIG_NUMA
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-#else
-	memory_present(0, 0, max_pfn);
-#endif
 	sparse_init();
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v0.10.2


From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 10 May 2009 10:53:05 +0200
Subject: perf_counter, x86: clean up throttling printk

s/PERFMON/perfcounters for perfcounter interrupt throttling warning.

'perfmon' is the CPU feature name that is Intel-only, while we do
throttling in a generic way.

[ Impact: cleanup ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a6878b0..da27419 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -814,7 +814,7 @@ void perf_counter_unthrottle(void)
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
+			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
 		hw_perf_restore(cpuc->throttle_ctrl);
 	}
 	cpuc->interrupts = 0;
-- 
cgit v0.10.2


From 6751b71ea2c7ab8c0d65f01973a3fc8ea16992f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 12:08:02 +1000
Subject: perf_counter: Put whole group on when enabling group leader

Currently, if you have a group where the leader is disabled and there
are siblings that are enabled, and then you enable the leader, we only
put the leader on the PMU, and not its enabled siblings.  This is
incorrect, since the enabled group members should be all on or all off
at any given point.

This fixes it by adding a call to group_sched_in in
__perf_counter_enable in the case where we're enabling a group leader.

To avoid the need for a forward declaration this also moves
group_sched_in up before __perf_counter_enable.  The actual content of
group_sched_in is unchanged by this patch.

[ Impact: fix bug in counter enable code ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.34946.451546.691693@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d850a1f..a5bdc93 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -419,6 +419,54 @@ counter_sched_in(struct perf_counter *counter,
 	return 0;
 }
 
+static int
+group_sched_in(struct perf_counter *group_counter,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx,
+	       int cpu)
+{
+	struct perf_counter *counter, *partial_group;
+	int ret;
+
+	if (group_counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
+
+	group_counter->prev_state = group_counter->state;
+	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+		return -EAGAIN;
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		counter->prev_state = counter->state;
+		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+			partial_group = counter;
+			goto group_error;
+		}
+	}
+
+	return 0;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter == partial_group)
+			break;
+		counter_sched_out(counter, cpuctx, ctx);
+	}
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	return -EAGAIN;
+}
+
 /*
  * Return 1 for a group consisting entirely of software counters,
  * 0 if the group contains any hardware counters.
@@ -643,6 +691,9 @@ static void __perf_counter_enable(void *info)
 
 	if (!group_can_go_on(counter, cpuctx, 1))
 		err = -EEXIST;
+	else if (counter == leader)
+		err = group_sched_in(counter, cpuctx, ctx,
+				     smp_processor_id());
 	else
 		err = counter_sched_in(counter, cpuctx, ctx,
 				       smp_processor_id());
@@ -791,54 +842,6 @@ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
-static int
-group_sched_in(struct perf_counter *group_counter,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx,
-	       int cpu)
-{
-	struct perf_counter *counter, *partial_group;
-	int ret;
-
-	if (group_counter->state == PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
-	if (ret)
-		return ret < 0 ? ret : 0;
-
-	group_counter->prev_state = group_counter->state;
-	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
-		return -EAGAIN;
-
-	/*
-	 * Schedule in siblings as one group (if any):
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter->prev_state = counter->state;
-		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
-			partial_group = counter;
-			goto group_error;
-		}
-	}
-
-	return 0;
-
-group_error:
-	/*
-	 * Groups can be scheduled in as one unit only, so undo any
-	 * partial group before returning:
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		if (counter == partial_group)
-			break;
-		counter_sched_out(counter, cpuctx, ctx);
-	}
-	counter_sched_out(group_counter, cpuctx, ctx);
-
-	return -EAGAIN;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
-- 
cgit v0.10.2


From a08b159fc243dbfe415250466d24cfc5010deee5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 15:46:10 +1000
Subject: perf_counter: don't count scheduler ticks as context switches

The context-switch software counter gives inflated values at present
because each scheduler tick and each process-wide counter
enable/disable prctl gets counted as a context switch.

This happens because perf_counter_task_tick, perf_counter_task_disable
and perf_counter_task_enable all call perf_counter_task_sched_out,
which calls perf_swcounter_event to record a context switch event.

This fixes it by introducing a variant of perf_counter_task_sched_out
with two underscores in front for internal use within the perf_counter
code, and makes perf_counter_task_{tick,disable,enable} call it.  This
variant doesn't record a context switch event, and takes a struct
perf_counter_context *.  This adds the new variant rather than
changing the behaviour or interface of perf_counter_task_sched_out
because that is called from other code.

[ Impact: fix inflated context-switch event counts ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.48034.485580.498953@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a5bdc93..7373b96 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -837,6 +837,14 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = NULL;
 }
 
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	__perf_counter_sched_out(ctx, cpuctx);
+	cpuctx->task_ctx = NULL;
+}
+
 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
@@ -943,15 +951,13 @@ int perf_counter_task_disable(void)
 	struct perf_counter *counter;
 	unsigned long flags;
 	u64 perf_flags;
-	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
 	local_irq_save(flags);
-	cpu = smp_processor_id();
 
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	spin_lock(&ctx->lock);
 
@@ -989,7 +995,7 @@ int perf_counter_task_enable(void)
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	spin_lock(&ctx->lock);
 
@@ -1054,7 +1060,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
 	rotate_ctx(ctx);
-- 
cgit v0.10.2


From 615a3f1e055ac9b0ae74e1f935a12ea2cfe2a2ad Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 15:50:21 +1000
Subject: perf_counter: call atomic64_set for counter->count

A compile warning triggered because we are calling
atomic_set(&counter->count). But since counter->count
is an atomic64_t, we have to use atomic64_set.

So the count can be set short, resulting in the reset ioctl
only resetting the low word.

[ Impact: clear counter properly during the reset ioctl ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.48285.270311.981806@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7373b96..5ea0240 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1281,7 +1281,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 static void perf_counter_reset(struct perf_counter *counter)
 {
 	(void)perf_counter_read(counter);
-	atomic_set(&counter->count, 0);
+	atomic64_set(&counter->count, 0);
 	perf_counter_update_userpage(counter);
 }
 
-- 
cgit v0.10.2


From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 11 May 2009 14:33:23 +0800
Subject: blktrace: pdu_buf of pc events should be unsigned

I got this:
  8,0    1   305.417782332  2037  I   R 32 (ffffff9e 10 00 ...) [bash]

It should be:
  8,0    1   305.417782332  2037  I   R 32 (9e 10 00 ...) [bash]

[ Impact: fix output of pc events ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e099f8c..05b4747 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 
 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 {
-	const char *pdu_buf;
+	const unsigned char *pdu_buf;
 	int pdu_len;
 	int i, end, ret;
 
-- 
cgit v0.10.2


From 79c5d3ce614d8fe706545c7bca2158b63db6bb5e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 11 May 2009 15:06:46 +0800
Subject: blktrace: from-sector redundant in trace_block_remap, cleanup

The last argument of block_remap prober is the original sector
before remap, so it should be 'from', not 'to'.

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
LKML-Reference: <4A07CE86.5090301@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/trace/block.h b/include/trace/block.h
index 8ac945b..5b12efa 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t to),
-	      TP_ARGS(q, bio, dev, to));
+		 sector_t from),
+	      TP_ARGS(q, bio, dev, from));
 
 #endif
-- 
cgit v0.10.2


From c969f58ca43fc403c75f5d3da4cf1e21de7afaa0 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 7 Apr 2009 14:13:01 +0100
Subject: GFS2: Update the rw flags

After Jens recent updates:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a1f242524c3c1f5d40f1c9c343427e34d1aadd6e
et al. this is a patch to bring gfs2 uptodate with the core
code. Also I've managed to squash another call to ll_rw_block()
along the way.

There is still one part of the GFS2 I/O paths which are not correctly
annotated and that is due to the sharing of the writeback code between
the data and metadata address spaces. I would like to change that too,
but this patch is still worth doing on its own, I think.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a7..aa62cf5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock)
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(WRITE_SYNC_PLUG, bh);
 			} else {
 				unlock_buffer(bh);
 				brelse(bh);
@@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
 	wait_on_buffer(bh);
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
@@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 		lock_buffer(bh);
 		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
 			bh->b_end_io = end_buffer_write_sync;
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 		} else {
 			unlock_buffer(bh);
 			brelse(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f..00315f5 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		gfs2_log_unlock(sdp);
-		submit_bh(WRITE, bh);
+		submit_bh(WRITE_SYNC_PLUG, bh);
 		gfs2_log_lock(sdp);
 
 		n = 0;
@@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 			gfs2_log_unlock(sdp);
 			lock_buffer(bd2->bd_bh);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
@@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 
 			bh = gfs2_log_get_buf(sdp);
 			mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 
-	submit_bh(WRITE, bh);
+	submit_bh(WRITE_SYNC_PLUG, bh);
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	ptr = bh_log_ptr(bh);
 	
 	get_bh(bh);
-	submit_bh(WRITE, bh);
+	submit_bh(WRITE_SYNC_PLUG, bh);
 	gfs2_log_lock(sdp);
 	while(!list_empty(list)) {
 		bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +588,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		} else {
 			bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
 		}
-		submit_bh(WRITE, bh1);
+		submit_bh(WRITE_SYNC_PLUG, bh1);
 		gfs2_log_lock(sdp);
 		ptr += 2;
 	}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f132..75b2aec 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -201,16 +201,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   struct buffer_head **bhp)
 {
-	*bhp = gfs2_getbuf(gl, blkno, CREATE);
-	if (!buffer_uptodate(*bhp)) {
-		ll_rw_block(READ_META, 1, bhp);
-		if (flags & DIO_WAIT) {
-			int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
-			if (error) {
-				brelse(*bhp);
-				return error;
-			}
-		}
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct buffer_head *bh;
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	*bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
+
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return 0;
+	}
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
+	if (!(flags & DIO_WAIT))
+		return 0;
+
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh))) {
+		struct gfs2_trans *tr = current->journal_info;
+		if (tr && tr->tr_touched)
+			gfs2_io_error_bh(sdp, bh);
+		brelse(bh);
+		return -EIO;
 	}
 
 	return 0;
@@ -404,7 +420,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(READ_META, 1, &first_bh);
+		ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
 
 	dblock++;
 	extlen--;
-- 
cgit v0.10.2


From 4a0f9a321a113392b448e477018311d14fba2b34 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Apr 2009 08:16:26 +0100
Subject: GFS2: Optimise writepage for metadata

This adds a GFS2 specific writepage for metadata, rather than
continuing to use the VFS function. As a result we now tag all
our metadata I/O with the correct flag so that blktraces will
now be less confusing.

Also, the generic function was checking for a number of corner
cases which cannot happen on the metadata address spaces so that
this should be faster too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 75b2aec..78a5f43 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -33,17 +33,65 @@
 #include "util.h"
 #include "ops_address.h"
 
-static int aspace_get_block(struct inode *inode, sector_t lblock,
-			    struct buffer_head *bh_result, int create)
+static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
-	gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
-	return -EOPNOTSUPP;
-}
+	int err;
+	struct buffer_head *bh, *head;
+	int nr_underway = 0;
+	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE));
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!page_has_buffers(page));
+
+	head = page_buffers(page);
+	bh = head;
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else if (!trylock_buffer(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(write_op, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
 
-static int gfs2_aspace_writepage(struct page *page,
-				 struct writeback_control *wbc)
-{
-	return block_write_full_page(page, aspace_get_block, wbc);
+	err = 0;
+	if (nr_underway == 0)
+		end_page_writeback(page);
+
+	return err;
 }
 
 static const struct address_space_operations aspace_aops = {
-- 
cgit v0.10.2


From 48bf2b1711dc498494e77705c415ee46bb508fd9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Apr 2009 13:59:35 +0100
Subject: GFS2: Something nonlinear this way comes!

For some reason GFS2 has been missing support for non-linear
mappings. This patch fixes that, and also avoids taking any
locks for mmap in the O_NOATIME case. In fact we don't actually need
to take the lock here at all - just doing file_accessed() would be
enough, but we have to take the lock eventually and this helps
it hit disk (and thus be seen by other nodes) faster.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 5d82e91..0ee7bd2 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -425,33 +425,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
 	.page_mkwrite = gfs2_page_mkwrite,
 };
 
-
 /**
  * gfs2_mmap -
  * @file: The file to map
  * @vma: The VMA which described the mapping
  *
- * Returns: 0 or error code
+ * There is no need to get a lock here unless we should be updating
+ * atime. We ignore any locking errors since the only consequence is
+ * a missed atime update (which will just be deferred until later).
+ *
+ * Returns: 0
  */
 
 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_holder i_gh;
-	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-	error = gfs2_glock_nq(&i_gh);
-	if (error) {
-		gfs2_holder_uninit(&i_gh);
-		return error;
-	}
+	if (!(file->f_flags & O_NOATIME)) {
+		struct gfs2_holder i_gh;
+		int error;
 
+		gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+		error = gfs2_glock_nq(&i_gh);
+		file_accessed(file);
+		if (error == 0)
+			gfs2_glock_dq_uninit(&i_gh);
+	}
 	vma->vm_ops = &gfs2_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
+	return 0;
 }
 
 /**
-- 
cgit v0.10.2


From 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:35 +0200
Subject: splice: implement pipe to pipe splicing

Allow splice(2) to work when both the input and the output is a pipe.

Based on the impementation of the tee(2) syscall, but instead of
duplicating the buffer references move the buffers from the input pipe
to the output pipe.

Moving the whole buffer only succeeds if the full length of the buffer
is spliced.  Otherwise duplicate the buffer, just like tee(2), set the
length of the output buffer and advance the offset on the input
buffer.

Since splice is operating on two pipes, special care needs to be taken
with locking to prevent AN ABBA deadlock.  Again this is done
similarly to the tee(2) syscall, first preparing the input and output
pipes so there's data to consume and space for that data, and then
doing the move operation while holding both locks.

If other processes are doing I/O on the same pipes parallel to the
splice, then by the time both inodes are locked there might be no
buffers left to move, or no space to move them to.  In this case retry
the whole operation, including the preparation phase.  This could lead
to starvation, but I'm not sure if that's serious enough to worry
about.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index 666953d5..e405cf5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1112,6 +1112,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	return ret;
 }
 
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags);
 /*
  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
  * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1135,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		      struct file *out, loff_t __user *off_out,
 		      size_t len, unsigned int flags)
 {
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *ipipe;
+	struct pipe_inode_info *opipe;
 	loff_t offset, *off;
 	long ret;
 
-	pipe = pipe_info(in->f_path.dentry->d_inode);
-	if (pipe) {
+	ipipe = pipe_info(in->f_path.dentry->d_inode);
+	opipe = pipe_info(out->f_path.dentry->d_inode);
+
+	if (ipipe && opipe) {
+		if (off_in || off_out)
+			return -ESPIPE;
+
+		if (!(in->f_mode & FMODE_READ))
+			return -EBADF;
+
+		if (!(out->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Splicing to self would be fun, but... */
+		if (ipipe == opipe)
+			return -EINVAL;
+
+		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+	}
+
+	if (ipipe) {
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
@@ -1149,7 +1172,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &out->f_pos;
 
-		ret = do_splice_from(pipe, out, off, len, flags);
+		ret = do_splice_from(ipipe, out, off, len, flags);
 
 		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1157,8 +1180,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		return ret;
 	}
 
-	pipe = pipe_info(out->f_path.dentry->d_inode);
-	if (pipe) {
+	if (opipe) {
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
@@ -1170,7 +1192,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &in->f_pos;
 
-		ret = do_splice_to(in, off, pipe, len, flags);
+		ret = do_splice_to(in, off, opipe, len, flags);
 
 		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1511,7 +1533,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
  * Make sure there's data to read. Wait for input if we can, otherwise
  * return an appropriate error.
  */
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1549,7 +1571,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
  * Make sure there's writeable room. Wait for room if we can, otherwise
  * return an appropriate error.
  */
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1587,6 +1609,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 }
 
 /*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, nbuf;
+	bool input_wakeup = false;
+
+
+retry:
+	ret = ipipe_prep(ipipe, flags);
+	if (ret)
+		return ret;
+
+	ret = opipe_prep(opipe, flags);
+	if (ret)
+		return ret;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (!ipipe->nrbufs && !ipipe->writers)
+			break;
+
+		/*
+		 * Cannot make any progress, because either the input
+		 * pipe is empty or the output pipe is full.
+		 */
+		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+			/* Already processed some buffers, break */
+			if (ret)
+				break;
+
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * We raced with another reader/writer and haven't
+			 * managed to process any buffers.  A zero return
+			 * value means EOF, so retry instead.
+			 */
+			pipe_unlock(ipipe);
+			pipe_unlock(opipe);
+			goto retry;
+		}
+
+		ibuf = ipipe->bufs + ipipe->curbuf;
+		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		obuf = opipe->bufs + nbuf;
+
+		if (len >= ibuf->len) {
+			/*
+			 * Simply move the whole buffer from ipipe to opipe
+			 */
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			opipe->nrbufs++;
+			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->nrbufs--;
+			input_wakeup = true;
+		} else {
+			/*
+			 * Get a reference to this pipe buffer,
+			 * so we can copy the contents over.
+			 */
+			ibuf->ops->get(ipipe, ibuf);
+			*obuf = *ibuf;
+
+			/*
+			 * Don't inherit the gift flag, we need to
+			 * prevent multiple steals of this page.
+			 */
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+			obuf->len = len;
+			opipe->nrbufs++;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		ret += obuf->len;
+		len -= obuf->len;
+	} while (len);
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+	if (input_wakeup)
+		wakeup_pipe_writers(ipipe);
+
+	return ret;
+}
+
+/*
  * Link contents of ipipe to opipe.
  */
 static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1690,9 +1830,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 		 * Keep going, unless we encounter an error. The ipipe/opipe
 		 * ordering doesn't really matter.
 		 */
-		ret = link_ipipe_prep(ipipe, flags);
+		ret = ipipe_prep(ipipe, flags);
 		if (!ret) {
-			ret = link_opipe_prep(opipe, flags);
+			ret = opipe_prep(opipe, flags);
 			if (!ret)
 				ret = link_pipe(ipipe, opipe, len, flags);
 		}
-- 
cgit v0.10.2


From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:36 +0200
Subject: splice: implement default splice_read method

If f_op->splice_read() is not implemented, fall back to a plain read.
Use vfs_readv() to read into previously allocated pages.

This will allow splice and functions using splice, such as the loop
device, to work on all filesystems.  This includes "direct_io" files
in fuse which bypass the page cache.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9ca4bb0..801f4ab 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -709,10 +709,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 		goto out_putf;
 
-	/* new backing store needs to support loop (eg splice_read) */
-	if (!inode->i_fop->splice_read)
-		goto out_putf;
-
 	/* size of the new backing store needs to be the same */
 	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
 		goto out_putf;
@@ -788,12 +784,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	error = -EINVAL;
 	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		const struct address_space_operations *aops = mapping->a_ops;
-		/*
-		 * If we can't read - sorry. If we only can't write - well,
-		 * it's going to be read-only.
-		 */
-		if (!file->f_op->splice_read)
-			goto out_putf;
+
 		if (aops->write_begin)
 			lo_flags |= LO_FLAGS_USE_AOPS;
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fb..ffd4281 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
 		      struct pipe_inode_info *pipe, size_t count,
 		      unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	struct coda_file_info *cfi;
 	struct file *host_file;
 
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->splice_read)
-		return -EINVAL;
+	splice_read = host_file->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
 
-	return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags);
+	return splice_read(host_file, ppos, pipe, count, flags);
 }
 
 static ssize_t
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec..f7dd21a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 	return 0;
 }
 
+/**
+ * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
+ * @pipe:	the pipe that the buffer belongs to
+ * @buf:	the buffer to put a reference to
+ *
+ * Description:
+ *	This function releases a reference to @buf.
+ */
+void generic_pipe_buf_release(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+}
+
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
 	.map = generic_pipe_buf_map,
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76b..6c8c55d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto out;
 	if (!(in_file->f_mode & FMODE_READ))
 		goto fput_in;
-	retval = -EINVAL;
-	in_inode = in_file->f_path.dentry->d_inode;
-	if (!in_inode)
-		goto fput_in;
-	if (!in_file->f_op || !in_file->f_op->splice_read)
-		goto fput_in;
 	retval = -ESPIPE;
 	if (!ppos)
 		ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	retval = -EINVAL;
 	if (!out_file->f_op || !out_file->f_op->sendpage)
 		goto fput_out;
+	in_inode = in_file->f_path.dentry->d_inode;
 	out_inode = out_file->f_path.dentry->d_inode;
 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 	if (retval < 0)
diff --git a/fs/splice.c b/fs/splice.c
index e405cf5..3bd9cb2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_splice_read);
 
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t offset)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	unsigned int nr_pages;
+	unsigned int nr_freed;
+	size_t offset;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct iovec vec[PIPE_BUFFERS];
+	pgoff_t index;
+	ssize_t res;
+	size_t this_len;
+	int error;
+	int i;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &default_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+		struct page *page;
+
+		page = alloc_page(GFP_HIGHUSER);
+		error = -ENOMEM;
+		if (!page)
+			goto err;
+
+		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		vec[i].iov_base = (void __user *) kmap(page);
+		vec[i].iov_len = this_len;
+		pages[i] = page;
+		spd.nr_pages++;
+		len -= this_len;
+		offset = 0;
+	}
+
+	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+	if (res < 0)
+		goto err;
+
+	error = 0;
+	if (!res)
+		goto err;
+
+	nr_freed = 0;
+	for (i = 0; i < spd.nr_pages; i++) {
+		kunmap(pages[i]);
+		this_len = min_t(size_t, vec[i].iov_len, res);
+		partial[i].offset = 0;
+		partial[i].len = this_len;
+		if (!this_len) {
+			__free_page(pages[i]);
+			pages[i] = NULL;
+			nr_freed++;
+		}
+		res -= this_len;
+	}
+	spd.nr_pages -= nr_freed;
+
+	res = splice_to_pipe(pipe, &spd);
+	if (res > 0)
+		*ppos += res;
+
+	return res;
+
+err:
+	for (i = 0; i < spd.nr_pages; i++) {
+		kunmap(pages[i]);
+		__free_page(pages[i]);
+	}
+	return error;
+}
+EXPORT_SYMBOL(default_file_splice_read);
+
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
@@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
 			 unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!in->f_op || !in->f_op->splice_read))
-		return -EINVAL;
-
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
@@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return in->f_op->splice_read(in, ppos, pipe, len, flags);
+	splice_read = in->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
+
+	return splice_read(in, ppos, pipe, len, flags);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bed436..d926c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2204,6 +2204,8 @@ extern int generic_segment_checks(const struct iovec *iov,
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
 		struct pipe_inode_info *, size_t, unsigned int);
+extern ssize_t default_file_splice_read(struct file *, loff_t *,
+		struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index c8f0385..b43a9e0 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -152,5 +152,6 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void
 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
+void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
 
 #endif
-- 
cgit v0.10.2


From 0b0a47f5c4a30b58432e20ae1706a27baea91a88 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:37 +0200
Subject: splice: implement default splice_write method

If f_op->splice_write() is not implemented, fall back to a plain write.
Use vfs_writev() to write from the pipe buffers.

This will allow splice on all filesystems and file types.  This
includes "direct_io" files in fuse which bypass the page cache.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index 3bd9cb2..eefd96b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -535,6 +535,21 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	return res;
 }
 
+static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t *ppos)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos);
+	set_fs(old_fs);
+
+	return res;
+}
+
 ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 				 struct pipe_inode_info *pipe, size_t len,
 				 unsigned int flags)
@@ -988,6 +1003,122 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
+static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n)
+{
+	return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS];
+}
+
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags)
+{
+	ssize_t ret = 0;
+	ssize_t total_len = 0;
+	int do_wakeup = 0;
+
+	pipe_lock(pipe);
+	while (len) {
+		struct pipe_buffer *buf;
+		void *data[PIPE_BUFFERS];
+		struct iovec vec[PIPE_BUFFERS];
+		unsigned int nr_pages = 0;
+		unsigned int write_len = 0;
+		unsigned int now_len = len;
+		unsigned int this_len;
+		int i;
+
+		BUG_ON(pipe->nrbufs > PIPE_BUFFERS);
+		for (i = 0; i < pipe->nrbufs && now_len; i++) {
+			buf = nth_pipe_buf(pipe, i);
+
+			ret = buf->ops->confirm(pipe, buf);
+			if (ret)
+				break;
+
+			data[i] = buf->ops->map(pipe, buf, 0);
+			this_len = min(buf->len, now_len);
+			vec[i].iov_base = (void __user *) data[i] + buf->offset;
+			vec[i].iov_len = this_len;
+			now_len -= this_len;
+			write_len += this_len;
+			nr_pages++;
+		}
+
+		if (nr_pages) {
+			ret = kernel_writev(out, vec, nr_pages, ppos);
+			if (ret == 0)
+				ret = -EIO;
+			if (ret > 0) {
+				len -= ret;
+				total_len += ret;
+			}
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			buf = nth_pipe_buf(pipe, i);
+			buf->ops->unmap(pipe, buf, data[i]);
+
+			if (ret > 0) {
+				this_len = min_t(unsigned, vec[i].iov_len, ret);
+				buf->offset += this_len;
+				buf->len -= this_len;
+				ret -= this_len;
+			}
+		}
+
+		if (ret < 0)
+			break;
+
+		while (pipe->nrbufs) {
+			const struct pipe_buf_operations *ops;
+
+			buf = nth_pipe_buf(pipe, 0);
+			if (buf->len)
+				break;
+
+			ops = buf->ops;
+			buf->ops = NULL;
+			ops->release(pipe, buf);
+			pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS;
+			pipe->nrbufs--;
+			if (pipe->inode)
+				do_wakeup = 1;
+		}
+
+		if (pipe->nrbufs)
+			continue;
+		if (!pipe->writers)
+			break;
+		if (!pipe->waiting_writers) {
+			if (total_len)
+				break;
+		}
+
+		if (flags & SPLICE_F_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wakeup_pipe_writers(pipe);
+			do_wakeup = 0;
+		}
+
+		pipe_wait(pipe);
+	}
+	pipe_unlock(pipe);
+
+	if (do_wakeup)
+		wakeup_pipe_writers(pipe);
+
+	return total_len ? total_len : ret;
+}
+
 /**
  * generic_splice_sendpage - splice data from a pipe to a socket
  * @pipe:	pipe to splice from
@@ -1015,11 +1146,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!out->f_op || !out->f_op->splice_write))
-		return -EINVAL;
-
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
@@ -1030,7 +1160,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return out->f_op->splice_write(pipe, out, ppos, len, flags);
+	splice_write = out->f_op->splice_write;
+	if (!splice_write)
+		splice_write = default_file_splice_write;
+
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
-- 
cgit v0.10.2


From 97a52714658cd959a3cfa35c5b6f489859f0204b Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 8 May 2009 18:23:50 +0200
Subject: x86: display extended apic registers with print_local_APIC and
 cpu_debug code

Both print_local_APIC (used when apic=debug kernel param is set) and
cpu_debug code missed support for some extended APIC registers that
I'd like to see.

This adds support to show:

 - extended APIC feature register
 - extended APIC control register
 - extended LVT registers

[ Impact: print more debug info ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090508162350.GO29045@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514f..7ddb36a 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
 #  define	APIC_INTEGRATED(x)	(1)
 #endif
 #define		APIC_XAPIC(x)		((x) >= 0x14)
+#define		APIC_EXT_SPACE(x)	((x) & 0x80000000)
 #define	APIC_TASKPRI	0x80
 #define		APIC_TPRI_MASK		0xFFu
 #define	APIC_ARBPRI	0x90
@@ -116,7 +117,9 @@
 #define		APIC_TDR_DIV_32		0x8
 #define		APIC_TDR_DIV_64		0x9
 #define		APIC_TDR_DIV_128	0xA
-#define	APIC_EILVT0     0x500
+#define	APIC_EFEAT	0x400
+#define	APIC_ECTRL	0x410
+#define APIC_EILVTn(n)	(0x500 + 0x10 * n)
 #define		APIC_EILVT_NR_AMD_K8	1	/* # of extended interrupts */
 #define		APIC_EILVT_NR_AMD_10H	4
 #define		APIC_EILVT_LVTOFF(x)	(((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
 #define		APIC_EILVT_MSG_NMI	0x4
 #define		APIC_EILVT_MSG_EXT	0x7
 #define		APIC_EILVT_MASKED	(1 << 16)
-#define	APIC_EILVT1     0x510
-#define	APIC_EILVT2     0x520
-#define	APIC_EILVT3     0x530
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 #define APIC_BASE_MSR	0x800
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1ee966f..0e6543f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -395,7 +395,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 
 static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
 {
-	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+	unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
 	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
 
 	apic_write(reg, v);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2afe145..65b824c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1739,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
 {
-	unsigned int v, ver, maxlvt;
+	unsigned int i, v, ver, maxlvt;
 	u64 icr;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1827,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
 	v = apic_read(APIC_TDCR);
 	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+		v = apic_read(APIC_EFEAT);
+		maxlvt = (v >> 16) & 0xff;
+		printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+		v = apic_read(APIC_ECTRL);
+		printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+		for (i = 0; i < maxlvt; i++) {
+			v = apic_read(APIC_EILVTn(i));
+			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+		}
+	}
 	printk("\n");
 }
 
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab..2fc4f6b 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -588,8 +588,20 @@ static void print_apic(void *arg)
 	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
 	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
 	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+		unsigned int i, v, maxeilvt;
+
+		v = apic_read(APIC_EFEAT);
+		maxeilvt = (v >> 16) & 0xff;
+		seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+		seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
 
+		for (i = 0; i < maxeilvt; i++) {
+			v = apic_read(APIC_EILVTn(i));
+			seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+		}
+	}
+#endif /* CONFIG_X86_LOCAL_APIC */
 	seq_printf(seq, "\n MSR\t:\n");
 }
 
-- 
cgit v0.10.2


From 3d6ad460214cc72b93333f51f498441a56d622e9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 09:01:08 +0000
Subject: sh: multiple vectors per irq - sh7760

Update intc tables and platform data to use one linux irq
per maskable interrupt source instead of keeping the one-to-one
mapping between vectors and linux irqs.

This fixes potential irq masking issues for sh7760 hardware
blocks such as DMAC/TMU2/REF.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig
index 666713a..63e9dd3 100644
--- a/arch/sh/drivers/dma/Kconfig
+++ b/arch/sh/drivers/dma/Kconfig
@@ -16,7 +16,8 @@ config SH_DMA_IRQ_MULTI
 		     CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \
 		     CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091  || \
 		     CPU_SUBTYPE_SH7763  || CPU_SUBTYPE_SH7764  || \
-		     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785
+		     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785  || \
+		     CPU_SUBTYPE_SH7760
 
 config NR_ONCHIP_DMA_CHANNELS
 	int
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index 4a6fd0d..cd09733 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -19,10 +19,7 @@ enum {
 
 	/* interrupt sources */
 	IRL0, IRL1, IRL2, IRL3,
-	HUDI, GPIOI,
-	DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2, DMAC_DMTE3,
-	DMAC_DMTE4, DMAC_DMTE5, DMAC_DMTE6, DMAC_DMTE7,
-	DMAC_DMAE,
+	HUDI, GPIOI, DMAC,
 	IRQ4, IRQ5, IRQ6, IRQ7,
 	HCAN20, HCAN21,
 	SSI0, SSI1,
@@ -37,21 +34,20 @@ enum {
 	HSPI,
 	MMCIF0, MMCIF1, MMCIF2, MMCIF3,
 	MFI, ADC, CMT,
-	TMU0, TMU1, TMU2_TUNI, TMU2_TICPI,
-	WDT,
-	REF_RCMI, REF_ROVI,
+	TMU0, TMU1, TMU2,
+	WDT, REF,
 
 	/* interrupt groups */
-	DMAC, DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF, TMU2, REF,
+	DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF,
 };
 
 static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(HUDI, 0x600), INTC_VECT(GPIOI, 0x620),
-	INTC_VECT(DMAC_DMTE0, 0x640), INTC_VECT(DMAC_DMTE1, 0x660),
-	INTC_VECT(DMAC_DMTE2, 0x680), INTC_VECT(DMAC_DMTE3, 0x6a0),
-	INTC_VECT(DMAC_DMTE4, 0x780), INTC_VECT(DMAC_DMTE5, 0x7a0),
-	INTC_VECT(DMAC_DMTE6, 0x7c0), INTC_VECT(DMAC_DMTE7, 0x7e0),
-	INTC_VECT(DMAC_DMAE, 0x6c0),
+	INTC_VECT(DMAC, 0x640), INTC_VECT(DMAC, 0x660),
+	INTC_VECT(DMAC, 0x680), INTC_VECT(DMAC, 0x6a0),
+	INTC_VECT(DMAC, 0x780), INTC_VECT(DMAC, 0x7a0),
+	INTC_VECT(DMAC, 0x7c0), INTC_VECT(DMAC, 0x7e0),
+	INTC_VECT(DMAC, 0x6c0),
 	INTC_VECT(IRQ4, 0x800), INTC_VECT(IRQ5, 0x820),
 	INTC_VECT(IRQ6, 0x840), INTC_VECT(IRQ6, 0x860),
 	INTC_VECT(HCAN20, 0x900), INTC_VECT(HCAN21, 0x920),
@@ -75,23 +71,18 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(MFI, 0xe80), /* 0xf80 according to data sheet */
 	INTC_VECT(ADC, 0xf80), INTC_VECT(CMT, 0xfa0),
 	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
-	INTC_VECT(TMU2_TUNI, 0x440), INTC_VECT(TMU2_TICPI, 0x460),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460),
 	INTC_VECT(WDT, 0x560),
-	INTC_VECT(REF_RCMI, 0x580), INTC_VECT(REF_ROVI, 0x5a0),
+	INTC_VECT(REF, 0x580), INTC_VECT(REF, 0x5a0),
 };
 
 static struct intc_group groups[] __initdata = {
-	INTC_GROUP(DMAC, DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2,
-		   DMAC_DMTE3, DMAC_DMTE4, DMAC_DMTE5,
-		   DMAC_DMTE6, DMAC_DMTE7, DMAC_DMAE),
 	INTC_GROUP(DMABRG, DMABRG0, DMABRG1, DMABRG2),
 	INTC_GROUP(SCIF0, SCIF0_ERI, SCIF0_RXI, SCIF0_BRI, SCIF0_TXI),
 	INTC_GROUP(SCIF1, SCIF1_ERI, SCIF1_RXI, SCIF1_BRI, SCIF1_TXI),
 	INTC_GROUP(SCIF2, SCIF2_ERI, SCIF2_RXI, SCIF2_BRI, SCIF2_TXI),
 	INTC_GROUP(SIM, SIM_ERI, SIM_RXI, SIM_TXI, SIM_TEI),
 	INTC_GROUP(MMCIF, MMCIF0, MMCIF1, MMCIF2, MMCIF3),
-	INTC_GROUP(TMU2, TMU2_TUNI, TMU2_TICPI),
-	INTC_GROUP(REF, REF_RCMI, REF_ROVI),
 };
 
 static struct intc_mask_reg mask_registers[] __initdata = {
-- 
cgit v0.10.2


From 50e2d0d3b4ffc945a6c27f16d4e5e557e725ec67 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 11:34:16 +0000
Subject: sh: r7780 highlander clock fixes

Update the r7780 highlander defconfig to fix
PCLK value, while at it fix cmdline on r7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index 68e5eff..943da63 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -260,7 +260,7 @@ CONFIG_SH_R7780MP=y
 #
 CONFIG_SH_TMU=y
 CONFIG_SH_TIMER_IRQ=28
-CONFIG_SH_PCLK_FREQ=32000000
+CONFIG_SH_PCLK_FREQ=33333333
 # CONFIG_NO_HZ is not set
 # CONFIG_HIGH_RES_TIMERS is not set
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 5e677a8..82658f6 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -333,7 +333,7 @@ CONFIG_GUSA=y
 CONFIG_ZERO_PAGE_OFFSET=0x00001000
 CONFIG_BOOT_LINK_OFFSET=0x00800000
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high"
+CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 
 #
 # Bus options
-- 
cgit v0.10.2


From ccc195655fb25d7d967b278c4a4725dc5e7a6bf4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 11:37:16 +0000
Subject: sh: TMU platform data for sh7780

This patch adds TMU platform data for sh7780. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index 6f7227c..f1df020 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -12,6 +12,189 @@
 #include <linux/serial.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
 
 static struct resource rtc_resources[] = {
 	[0] = {
@@ -58,6 +241,12 @@ static struct platform_device sci_device = {
 };
 
 static struct platform_device *sh7780_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&rtc_device,
 	&sci_device,
 };
@@ -69,6 +258,21 @@ static int __init sh7780_devices_setup(void)
 }
 __initcall(sh7780_devices_setup);
 
+static struct platform_device *sh7780_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7780_early_devices,
+				   ARRAY_SIZE(sh7780_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From cec6be6d1069d697beb490bbb40a290d5ff554a2 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Mon, 11 May 2009 17:41:40 +0400
Subject: x86: apic: Fixmap apic address even if apic disabled

In case if apic were disabled by boot option
we still need read_apic operation. So fixmap
a fake apic area if needed.

[ Impact: fix boot crash ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: yinghai@kernel.org
Cc: eswierk@aristanetworks.com
LKML-Reference: <20090511134140.GH4624@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0e6543f..07cffc1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1587,13 +1587,6 @@ void __init init_apic_mappings(void)
 	} else
 		apic_phys = mp_lapic_addr;
 
-	/* lets check if we may NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
-		return;
-	}
-
 	/*
 	 * acpi lapic path already maps that address in
 	 * acpi_register_lapic_address()
@@ -1602,7 +1595,15 @@ void __init init_apic_mappings(void)
 		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
 	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-				APIC_BASE, apic_phys);
+			APIC_BASE, apic_phys);
+
+	/* lets check if we may NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+		return;
+	}
+
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
-- 
cgit v0.10.2


From d756f4adb9d8a86e347a2d5435bb5cc95744733e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:29:52 +0400
Subject: x86, 32-bit: ifdef out struct thread_struct::fs

After commit 464d1a78fbf8cf6c7fd970e7b3e2db50a320ce28 aka
"[PATCH] i386: Convert i386 PDA code to use %fs"
%fs saved during context switch moved from thread_struct to pt_regs
and value on thread_struct became unused.

[ Impact: reduce thread_struct size on 32-bit ]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: containers@lists.linux-foundation.org
LKML-Reference: <20090503232952.GI16631@x200.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae..a6732ff 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -428,7 +428,9 @@ struct thread_struct {
 	unsigned short		gsindex;
 #endif
 	unsigned long		ip;
+#ifdef CONFIG_X86_64
 	unsigned long		fs;
+#endif
 	unsigned long		gs;
 	/* Hardware debugging registers: */
 	unsigned long		debugreg0;
@@ -874,7 +876,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
-	.fs			= __KERNEL_PERCPU,			  \
 }
 
 /*
-- 
cgit v0.10.2


From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:30:15 +0400
Subject: x86, 64-bit: ifdef out struct thread_struct::ip

struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used
instead.

kgdb should be reading 0 always, but I can't check it.

[ Impact: (potentially) reduce thread_struct size on 64-bit ]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: containers@lists.linux-foundation.org
LKML-Reference: <20090503233015.GJ16631@x200.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a6732ff..a9ba743 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -427,7 +427,9 @@ struct thread_struct {
 	unsigned short		fsindex;
 	unsigned short		gsindex;
 #endif
+#ifdef CONFIG_X86_32
 	unsigned long		ip;
+#endif
 #ifdef CONFIG_X86_64
 	unsigned long		fs;
 #endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index eedfaeb..d07706f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	gdb_regs32[GDB_PS]	= *(unsigned long *)(p->thread.sp + 8);
 	gdb_regs32[GDB_CS]	= __KERNEL_CS;
 	gdb_regs32[GDB_SS]	= __KERNEL_DS;
-	gdb_regs[GDB_PC]	= p->thread.ip;
+	gdb_regs[GDB_PC]	= 0;
 	gdb_regs[GDB_R8]	= 0;
 	gdb_regs[GDB_R9]	= 0;
 	gdb_regs[GDB_R10]	= 0;
-- 
cgit v0.10.2


From 72af2b363107df9cd67985ecb8495bbdff3c9857 Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@nokia.com>
Date: Mon, 11 May 2009 09:58:19 -0700
Subject: ARM: OMAP: Fix printing of reserved memory for frambuffer

Print reserved memory only if it was actually reserved.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@nokia.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/plat-omap/fb.c b/arch/arm/plat-omap/fb.c
index ce6b4ba..3746222 100644
--- a/arch/arm/plat-omap/fb.c
+++ b/arch/arm/plat-omap/fb.c
@@ -206,9 +206,10 @@ void __init omapfb_reserve_sdram(void)
 			config_invalid = 1;
 			return;
 		}
-		if (rg.paddr)
+		if (rg.paddr) {
 			reserve_bootmem(rg.paddr, rg.size, BOOTMEM_DEFAULT);
-		reserved += rg.size;
+			reserved += rg.size;
+		}
 		omapfb_config.mem_desc.region[i] = rg;
 		configured_regions++;
 	}
-- 
cgit v0.10.2


From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 10:56:33 -0400
Subject: ring-buffer: replace constants with time macros in
 ring-buffer-benchmark

The use of numeric constants is discouraged. It is cleaner and more
descriptive to use macros for constant time conversions.

This patch also removes an extra new line.

[ Impact: more descriptive time conversions ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a21aa7b..7d3aef9 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -253,7 +253,7 @@ static void ring_buffer_producer(void)
 	}
 
 	time = end_tv.tv_sec - start_tv.tv_sec;
-	time *= 1000000;
+	time *= USEC_PER_SEC;
 	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
 
 	entries = ring_buffer_entries(buffer);
@@ -273,7 +273,8 @@ static void ring_buffer_producer(void)
 	pr_info("Missed:   %ld\n", missed);
 	pr_info("Hit:      %ld\n", hit);
 
-	do_div(time, 1000);
+	/* Convert time from usecs to millisecs */
+	do_div(time, USEC_PER_MSEC);
 	if (time)
 		hit /= (long)time;
 	else
@@ -282,18 +283,19 @@ static void ring_buffer_producer(void)
 	pr_info("Entries per millisec: %ld\n", hit);
 
 	if (hit) {
-		avg = 1000000 / hit;
+		/* Calculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
 
-
 	if (missed) {
 		if (time)
 			missed /= (long)time;
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
-		avg = 1000000 / (hit + missed);
+		/* Caculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
 	}
 }
-- 
cgit v0.10.2


From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 11:03:57 -0400
Subject: ring-buffer: check for divide by zero in ring-buffer-benchmark

Although we check if "missed" is not zero, we divide by hit + missed,
and the addition can possible overflow and become a divide by zero.

This patch checks for this case, and will report it when it happens
then modify "hit" to make the calculation be non zero.

[ Impact: prevent possible divide by zero in ring-buffer-benchmark ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 7d3aef9..8d68e14 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -294,6 +294,12 @@ static void ring_buffer_producer(void)
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
+		/* it is possible that hit + missed will overflow and be zero */
+		if (!(hit + missed)) {
+			pr_info("hit + missed overflowed and totalled zero!\n");
+			hit--; /* make it non zero */
+		}
+
 		/* Caculate the average time in nanosecs */
 		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
-- 
cgit v0.10.2


From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:08:09 -0400
Subject: ring-buffer: remove type parameter from rb_reserve_next_event

The rb_reserve_next_event is only called for the data type (type = 0).
There is no reason to pass in the type to the function.

Before:
   text    data     bss     dec     hex filename
  16554      24      12   16590    40ce kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

[ Impact: cleaner, smaller and slightly more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3611706..fe40f6c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-		      unsigned type, unsigned long length)
+		      unsigned long length)
 {
 	struct ring_buffer_event *event;
 	u64 ts, delta;
@@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Non commits have zero deltas */
 		delta = 0;
 
-	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
 	if (PTR_ERR(event) == -EAGAIN)
 		goto again;
 
@@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, 0, length);
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
@@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
+	event = rb_reserve_next_event(cpu_buffer, event_length);
 	if (!event)
 		goto out;
 
-- 
cgit v0.10.2


From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:42:53 -0400
Subject: ring-buffer: move calculation of event length

The event length is calculated and passed in to rb_reserve_next_event
in two different locations. Having rb_reserve_next_event do the
calculations directly makes only one location to do the change and
causes the calculation to be inlined by gcc.

Before:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16490      24      12   16526    408e kernel/trace/ring_buffer.o

[ Impact: smaller more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fe40f6c..493cba4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	int commit = 0;
 	int nr_loops = 0;
 
+	length = rb_calculate_event_length(length);
  again:
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
@@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	length = rb_calculate_event_length(length);
-	if (length > BUF_PAGE_SIZE)
+	if (length > BUF_MAX_DATA_SIZE)
 		goto out;
 
 	event = rb_reserve_next_event(cpu_buffer, length);
@@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	unsigned long event_length;
 	void *body;
 	int ret = -EBUSY;
 	int cpu, resched;
@@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, event_length);
+	if (length > BUF_MAX_DATA_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
-- 
cgit v0.10.2


From b68d8201433a91cabbcbeae48b53d8c1c426433a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 03:45:08 +0900
Subject: sh: clkfwk: Make recalc return an unsigned long.

This is prep work for cleaning up some of the rate propagation bits.
Trivial conversion.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index b1f2919..d63352b 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -12,7 +12,7 @@ struct clk_ops {
 	void (*init)(struct clk *clk);
 	void (*enable)(struct clk *clk);
 	void (*disable)(struct clk *clk);
-	void (*recalc)(struct clk *clk);
+	unsigned long (*recalc)(struct clk *clk);
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
@@ -96,4 +96,5 @@ enum clk_sh_algo_id {
 
 	IP_N1,
 };
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 133dbe4..b022aff 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -75,6 +75,7 @@ static struct clk *onchip_clocks[] = {
 	&cpu_clk,
 };
 
+/* Propagate rate to children */
 static void propagate_rate(struct clk *clk)
 {
 	struct clk *clkp;
@@ -83,7 +84,7 @@ static void propagate_rate(struct clk *clk)
 		if (likely(clkp->parent != clk))
 			continue;
 		if (likely(clkp->ops && clkp->ops->recalc))
-			clkp->ops->recalc(clkp);
+			clkp->rate = clkp->ops->recalc(clkp);
 		if (unlikely(clkp->flags & CLK_RATE_PROPAGATES))
 			propagate_rate(clkp);
 	}
@@ -240,7 +241,7 @@ void clk_recalc_rate(struct clk *clk)
 		unsigned long flags;
 
 		spin_lock_irqsave(&clock_lock, flags);
-		clk->ops->recalc(clk);
+		clk->rate = clk->ops->recalc(clk);
 		spin_unlock_irqrestore(&clock_lock, flags);
 	}
 
@@ -377,20 +378,22 @@ static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state)
 	switch (state.event) {
 	case PM_EVENT_ON:
 		/* Resumeing from hibernation */
-		if (prev_state.event == PM_EVENT_FREEZE) {
-			list_for_each_entry(clkp, &clock_list, node)
-				if (likely(clkp->ops)) {
-					unsigned long rate = clkp->rate;
-
-					if (likely(clkp->ops->set_parent))
-						clkp->ops->set_parent(clkp,
-							clkp->parent);
-					if (likely(clkp->ops->set_rate))
-						clkp->ops->set_rate(clkp,
-							rate, NO_CHANGE);
-					else if (likely(clkp->ops->recalc))
-						clkp->ops->recalc(clkp);
-					}
+		if (prev_state.event != PM_EVENT_FREEZE)
+			break;
+
+		list_for_each_entry(clkp, &clock_list, node) {
+			if (likely(clkp->ops)) {
+				unsigned long rate = clkp->rate;
+
+				if (likely(clkp->ops->set_parent))
+					clkp->ops->set_parent(clkp,
+						clkp->parent);
+				if (likely(clkp->ops->set_rate))
+					clkp->ops->set_rate(clkp,
+						rate, NO_CHANGE);
+				else if (likely(clkp->ops->recalc))
+					clkp->rate = clkp->ops->recalc(clkp);
+			}
 		}
 		break;
 	case PM_EVENT_FREEZE:
diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
index d2c1579..2679913 100644
--- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
@@ -38,28 +38,28 @@ static struct clk_ops sh7619_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7619_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7];
+	return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7];
 }
 
 static struct clk_ops sh7619_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7619_cpu_clk_ops = {
@@ -78,4 +78,3 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 	if (idx < ARRAY_SIZE(sh7619_clk_ops))
 		*ops = sh7619_clk_ops[idx];
 }
-
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
index 4a5e597..7814c76 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
@@ -34,37 +34,37 @@ static const int pfc_divisors[]={1,2,3,4,6,8,12};
 
 static void master_clk_init(struct clk *clk)
 {
-	clk->rate = 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
+	return 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
 }
 
 static struct clk_ops sh7201_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7201_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7201_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inw(FREQCR) >> 4) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7201_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
index fb78132..f8c6933 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
@@ -46,29 +46,29 @@ static struct clk_ops sh7203_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7203_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx-2];
+	return clk->parent->rate / pfc_divisors[idx-2];
 }
 
 static struct clk_ops sh7203_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7203_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
index 82d7f99..c2268bd 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
@@ -41,29 +41,29 @@ static struct clk_ops sh7206_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7206_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
+	return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
 }
 
 static struct clk_ops sh7206_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7206_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh3.c b/arch/sh/kernel/cpu/sh3/clock-sh3.c
index c3c9459..27b8738 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh3.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh3.c
@@ -38,36 +38,36 @@ static struct clk_ops sh3_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh3_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4);
 
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh3_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh3_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7705.c b/arch/sh/kernel/cpu/sh3/clock-sh7705.c
index dfdbf32..0ca8f2c 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7705.c
@@ -39,30 +39,30 @@ static struct clk_ops sh7705_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ctrl_inw(FRQCR) & 0x0003;
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7705_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0300) >> 8;
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh7705_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0030) >> 4;
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7705_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7706.c b/arch/sh/kernel/cpu/sh3/clock-sh7706.c
index 0cf96f9..4bf7887 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7706.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7706.c
@@ -34,36 +34,36 @@ static struct clk_ops sh7706_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7706_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4);
 
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh7706_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7706_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7709.c b/arch/sh/kernel/cpu/sh3/clock-sh7709.c
index b791a29..fa30b60 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7709.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7709.c
@@ -41,12 +41,12 @@ static struct clk_ops sh7709_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7709_module_clk_ops = {
@@ -56,25 +56,25 @@ static struct clk_ops sh7709_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = (frqcr & 0x0080) ?
 		((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4) : 1;
 
-	clk->rate = clk->parent->rate * stc_multipliers[idx];
+	return clk->parent->rate * stc_multipliers[idx];
 }
 
 static struct clk_ops sh7709_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7709_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7710.c b/arch/sh/kernel/cpu/sh3/clock-sh7710.c
index 4744c50..030a58b 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7710.c
@@ -33,30 +33,30 @@ static struct clk_ops sh7710_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0007);
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0700) >> 8;
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0070) >> 4;
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7712.c b/arch/sh/kernel/cpu/sh3/clock-sh7712.c
index 54f54df..6428ee6 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7712.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7712.c
@@ -33,24 +33,24 @@ static struct clk_ops sh7712_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = frqcr & 0x0007;
 
-	clk->rate = clk->parent->rate / divisors[idx];
+	return clk->parent->rate / divisors[idx];
 }
 
 static struct clk_ops sh7712_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = (frqcr & 0x0030) >> 4;
 
-	clk->rate = clk->parent->rate / divisors[idx];
+	return clk->parent->rate / divisors[idx];
 }
 
 static struct clk_ops sh7712_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index a334294..628d50e 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -21,10 +21,10 @@
 static int frqcr3_divisors[] = { 1, 2, 3, 4, 6, 8, 16 };
 static int frqcr3_values[]   = { 0, 1, 2, 3, 4, 5, 6  };
 
-static void emi_clk_recalc(struct clk *clk)
+static unsigned long emi_clk_recalc(struct clk *clk)
 {
 	int idx = ctrl_inl(CPG2_FRQCR3) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static inline int frqcr3_lookup(struct clk *clk, unsigned long rate)
@@ -50,10 +50,10 @@ static struct clk sh4202_emi_clk = {
 	.ops		= &sh4202_emi_clk_ops,
 };
 
-static void femi_clk_recalc(struct clk *clk)
+static unsigned long femi_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(CPG2_FRQCR3) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static struct clk_ops sh4202_femi_clk_ops = {
@@ -90,10 +90,10 @@ static void shoc_clk_init(struct clk *clk)
 	WARN_ON(i == ARRAY_SIZE(frqcr3_divisors));	/* Undefined clock */
 }
 
-static void shoc_clk_recalc(struct clk *clk)
+static unsigned long shoc_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(CPG2_FRQCR3) >> 6) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static int shoc_clk_verify_rate(struct clk *clk, unsigned long rate)
@@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	frqcr3 |= tmp << 6;
 	ctrl_outl(frqcr3, CPG2_FRQCR3);
 
-	clk->rate = clk->parent->rate / frqcr3_divisors[tmp];
+	return clk->parent->rate / frqcr3_divisors[tmp];
 
 	return 0;
 }
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4.c b/arch/sh/kernel/cpu/sh4/clock-sh4.c
index dca9f87..73294d9 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4.c
@@ -35,30 +35,30 @@ static struct clk_ops sh4_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh4_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh4_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) >> 6) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh4_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 1ccdfc5..5b1427f 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -151,11 +151,11 @@ static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
 #endif
 
-static void master_clk_recalc(struct clk *clk)
+static unsigned long master_clk_recalc(struct clk *clk)
 {
 	unsigned frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
+	return CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
 }
 
 static void master_clk_init(struct clk *clk)
@@ -166,12 +166,11 @@ static void master_clk_init(struct clk *clk)
 	master_clk_recalc(clk);
 }
 
-
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	unsigned long frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = clk->parent->rate / STCPLL(frqcr);
+	return clk->parent->rate / STCPLL(frqcr);
 }
 
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
@@ -283,14 +282,14 @@ static int sh7722_find_div_index(unsigned long parent_rate, unsigned rate)
 	return index;
 }
 
-static void sh7722_frqcr_recalc(struct clk *clk)
+static unsigned long sh7722_frqcr_recalc(struct clk *clk)
 {
 	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
 	unsigned long frqcr = ctrl_inl(FRQCR);
 	int index;
 
 	index = (frqcr >> ctx.shift) & ctx.mask;
-	clk->rate = clk->parent->rate * 2 / divisors2[index];
+	return clk->parent->rate * 2 / divisors2[index];
 }
 
 static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
@@ -439,11 +438,8 @@ static struct clk_ops sh7722_frqcr_clk_ops = {
 
 /*
  * clock ops methods for SIU A/B and IrDA clock
- *
  */
-
 #ifndef CONFIG_CPU_SUBTYPE_SH7343
-
 static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 {
 	unsigned long r;
@@ -458,12 +454,12 @@ static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	return 0;
 }
 
-static void sh7722_siu_recalc(struct clk *clk)
+static unsigned long sh7722_siu_recalc(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(clk->arch_flags);
-	clk->rate = clk->parent->rate * 2 / divisors2[r & 0xF];
+	return clk->parent->rate * 2 / divisors2[r & 0xF];
 }
 
 static int sh7722_siu_start_stop(struct clk *clk, int enable)
@@ -525,12 +521,12 @@ static int sh7722_video_set_rate(struct clk *clk, unsigned long rate,
 	return 0;
 }
 
-static void sh7722_video_recalc(struct clk *clk)
+static unsigned long sh7722_video_recalc(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(VCLKCR);
-	clk->rate = clk->parent->rate / ((r & 0x3F) + 1);
+	return clk->parent->rate / ((r & 0x3F) + 1);
 }
 
 static struct clk_ops sh7722_video_clk_ops = {
@@ -627,7 +623,7 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
 		break;
 	default:
 		return -EINVAL;
-	}  
+	}
 
 	r = ctrl_inl(reg);
 
@@ -650,10 +646,9 @@ static void sh7722_mstpcr_disable(struct clk *clk)
 	sh7722_mstpcr_start_stop(clk, 0);
 }
 
-static void sh7722_mstpcr_recalc(struct clk *clk)
+static unsigned long sh7722_mstpcr_recalc(struct clk *clk)
 {
-	if (clk->parent)
-		clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7722_mstpcr_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 3177d0d..26630fb 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -29,29 +29,29 @@ static struct clk_ops sh7763_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 4) & 0x07);
-	clk->rate = clk->parent->rate / p0fc_divisors[idx];
+	return clk->parent->rate / p0fc_divisors[idx];
 }
 
 static struct clk_ops sh7763_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 16) & 0x07);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7763_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7763_cpu_clk_ops = {
@@ -71,10 +71,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7763_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 20) & 0x07);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops sh7763_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
index 8e23606..e0b8967 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
@@ -28,30 +28,30 @@ static struct clk_ops sh7770_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 28) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7770_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQCR) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7770_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 24) & 0x000f);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7770_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 01f3da6..ba8dacc 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -29,30 +29,30 @@ static struct clk_ops sh7780_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQCR) & 0x0003);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 16) & 0x0007);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 24) & 0x0001);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7780_cpu_clk_ops = {
@@ -72,10 +72,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7780_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 20) & 0x0007);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 27fa81b..52691ea 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -33,30 +33,30 @@ static struct clk_ops sh7785_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7785_cpu_clk_ops = {
@@ -76,10 +76,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7785_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	clk->rate = clk->parent->rate / sfc_divisors[idx];
+	return clk->parent->rate / sfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_shyway_clk_ops = {
@@ -92,10 +92,10 @@ static struct clk sh7785_shyway_clk = {
 	.ops		= &sh7785_shyway_clk_ops,
 };
 
-static void ddr_clk_recalc(struct clk *clk)
+static unsigned long ddr_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	clk->rate = clk->parent->rate / mfc_divisors[idx];
+	return clk->parent->rate / mfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_ddr_clk_ops = {
@@ -108,10 +108,10 @@ static struct clk sh7785_ddr_clk = {
 	.ops		= &sh7785_ddr_clk_ops,
 };
 
-static void ram_clk_recalc(struct clk *clk)
+static unsigned long ram_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003);
-	clk->rate = clk->parent->rate / ufc_divisors[idx];
+	return clk->parent->rate / ufc_divisors[idx];
 }
 
 static struct clk_ops sh7785_ram_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index f84a9c1..2e00ff4 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -36,30 +36,30 @@ static struct clk_ops sh7786_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7786_cpu_clk_ops = {
@@ -79,10 +79,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7786_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	clk->rate = clk->parent->rate / sfc_divisors[idx];
+	return clk->parent->rate / sfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_shyway_clk_ops = {
@@ -95,10 +95,10 @@ static struct clk sh7786_shyway_clk = {
 	.ops		= &sh7786_shyway_clk_ops,
 };
 
-static void ddr_clk_recalc(struct clk *clk)
+static unsigned long ddr_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	clk->rate = clk->parent->rate / mfc_divisors[idx];
+	return clk->parent->rate / mfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_ddr_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index c630b29..770934e 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -40,30 +40,30 @@ static struct clk_ops shx3_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> PFC_POS) & PFC_MSK);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops shx3_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> BFC_POS) & BFC_MSK);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops shx3_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> IFC_POS) & IFC_MSK);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops shx3_cpu_clk_ops = {
@@ -83,10 +83,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = shx3_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> CFC_POS) & CFC_MSK);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops shx3_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c
index 5486324..7f864eb 100644
--- a/arch/sh/kernel/cpu/sh5/clock-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c
@@ -32,30 +32,30 @@ static struct clk_ops sh5_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) >> 12) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_cpu_clk_ops = {
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index fe8d893..a693dcd 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -172,22 +172,19 @@ static void __init tmu_clk_init(struct clk *clk)
 	clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
 }
 
-static void tmu_clk_recalc(struct clk *clk)
+static unsigned long tmu_clk_recalc(struct clk *clk)
 {
 	int tmu_num = clk->name[3]-'0';
-	unsigned long prev_rate = clk_get_rate(clk);
+	unsigned long new_rate;
 	unsigned long flags;
 	u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7;
-	clk->rate  = clk_get_rate(clk->parent) / (4 << (divisor << 1));
 
-	if(prev_rate==clk_get_rate(clk))
-		return;
-
-	if(tmu_num)
-		return; /* No more work on TMU1 */
+	new_rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
+	if (clk->rate == new_rate || tmu_num)
+		return clk->rate; /* No more work on TMU1 */
 
 	local_irq_save(flags);
-	tmus_are_scaled = (prev_rate > clk->rate);
+	tmus_are_scaled = (clk->rate > new_rate);
 
 	_tmu_stop(TMU0);
 
@@ -210,6 +207,7 @@ static void tmu_clk_recalc(struct clk *clk)
 	_tmu_start(TMU0);
 
 	local_irq_restore(flags);
+	return new_rate;
 }
 
 static struct clk_ops tmu_clk_ops = {
-- 
cgit v0.10.2


From a02cb230bb4fca04f091746c593de720a0e3a94a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 03:50:44 +0900
Subject: sh: clkfwk: Add a followparent_recalc() helper.

This adds a followparent_recalc() helper for clocks that just follow the
parent's rate. Switch over the few CPUs that use this scheme for some of
their clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index d63352b..241f1c1 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -44,7 +44,7 @@ int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
 int clk_init(void);
-
+unsigned long followparent_recalc(struct clk *clk);
 void clk_recalc_rate(struct clk *);
 
 int clk_register(struct clk *);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index b022aff..17f6c07 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -90,6 +90,12 @@ static void propagate_rate(struct clk *clk)
 	}
 }
 
+/* Used for clocks that always have same value as the parent clock */
+unsigned long followparent_recalc(struct clk *clk)
+{
+	return clk->parent->rate;
+}
+
 static void __clk_init(struct clk *clk)
 {
 	/*
diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
index 2679913..4fe8631 100644
--- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
@@ -57,13 +57,8 @@ static struct clk_ops sh7619_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7619_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7619_clk_ops[] = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
index f8c6933..9409869 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
@@ -66,13 +66,8 @@ static struct clk_ops sh7203_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7203_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7203_clk_ops[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 5b1427f..4bdae84 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -646,15 +646,10 @@ static void sh7722_mstpcr_disable(struct clk *clk)
 	sh7722_mstpcr_start_stop(clk, 0);
 }
 
-static unsigned long sh7722_mstpcr_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7722_mstpcr_clk_ops = {
 	.enable = sh7722_mstpcr_enable,
 	.disable = sh7722_mstpcr_disable,
-	.recalc = sh7722_mstpcr_recalc,
+	.recalc = followparent_recalc,
 };
 
 #define MSTPCR(_name, _parent, regnr, bitnr) \
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 26630fb..db51cff 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -49,13 +49,8 @@ static struct clk_ops sh7763_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7763_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7763_clk_ops[] = {
-- 
cgit v0.10.2


From b1f6cfe48c3cb1dfa77db3d2f42f765febaef9bc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 04:27:43 +0900
Subject: sh: clkfwk: refactor rate propagation.

This resyncs the rate propagation strategy with the scheme used by the
OMAP clock framework. Child clocks are tracked on a list under each
parent and propagation happens there specifically rather than constantly
iterating over the global clock list.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 241f1c1..5dc8b73 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -27,6 +27,9 @@ struct clk {
 	struct clk		*parent;
 	struct clk_ops		*ops;
 
+	struct list_head	children;
+	struct list_head	sibling;	/* node for children */
+
 	int			usecount;
 
 	unsigned long		rate;
@@ -35,7 +38,6 @@ struct clk {
 };
 
 #define CLK_ALWAYS_ENABLED	(1 << 0)
-#define CLK_RATE_PROPAGATES	(1 << 1)
 #define CLK_NEEDS_INIT		(1 << 2)
 
 /* Should be defined by processor-specific code */
@@ -44,9 +46,10 @@ int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
 int clk_init(void);
-unsigned long followparent_recalc(struct clk *clk);
+unsigned long followparent_recalc(struct clk *);
+void recalculate_root_clocks(void);
+void propagate_rate(struct clk *);
 void clk_recalc_rate(struct clk *);
-
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 17f6c07..0a06df8 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -1,11 +1,11 @@
 /*
  * arch/sh/kernel/cpu/clock.c - SuperH clock framework
  *
- *  Copyright (C) 2005, 2006, 2007  Paul Mundt
+ *  Copyright (C) 2005 - 2009  Paul Mundt
  *
  * This clock framework is derived from the OMAP version by:
  *
- *	Copyright (C) 2004 - 2005 Nokia Corporation
+ *	Copyright (C) 2004 - 2008 Nokia Corporation
  *	Written by Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>
  *
  *  Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com>
@@ -43,20 +43,20 @@ static DEFINE_MUTEX(clock_list_sem);
  */
 static struct clk master_clk = {
 	.name		= "master_clk",
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
 static struct clk module_clk = {
 	.name		= "module_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 };
 
 static struct clk bus_clk = {
 	.name		= "bus_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 };
 
 static struct clk cpu_clk = {
@@ -75,27 +75,24 @@ static struct clk *onchip_clocks[] = {
 	&cpu_clk,
 };
 
+/* Used for clocks that always have same value as the parent clock */
+unsigned long followparent_recalc(struct clk *clk)
+{
+	return clk->parent->rate;
+}
+
 /* Propagate rate to children */
-static void propagate_rate(struct clk *clk)
+void propagate_rate(struct clk *tclk)
 {
 	struct clk *clkp;
 
-	list_for_each_entry(clkp, &clock_list, node) {
-		if (likely(clkp->parent != clk))
-			continue;
-		if (likely(clkp->ops && clkp->ops->recalc))
+	list_for_each_entry(clkp, &tclk->children, sibling) {
+		if (clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
-		if (unlikely(clkp->flags & CLK_RATE_PROPAGATES))
-			propagate_rate(clkp);
+		propagate_rate(clkp);
 	}
 }
 
-/* Used for clocks that always have same value as the parent clock */
-unsigned long followparent_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static void __clk_init(struct clk *clk)
 {
 	/*
@@ -180,10 +177,46 @@ void clk_disable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_disable);
 
+static LIST_HEAD(root_clks);
+
+/**
+ * recalculate_root_clocks - recalculate and propagate all root clocks
+ *
+ * Recalculates all root clocks (clocks with no parent), which if the
+ * clock's .recalc is set correctly, should also propagate their rates.
+ * Called at init.
+ */
+void recalculate_root_clocks(void)
+{
+	struct clk *clkp;
+
+	list_for_each_entry(clkp, &root_clks, sibling) {
+		if (clkp->ops->recalc)
+			clkp->rate = clkp->ops->recalc(clkp);
+		propagate_rate(clkp);
+	}
+}
+
 int clk_register(struct clk *clk)
 {
+	if (clk == NULL || IS_ERR(clk))
+		return -EINVAL;
+
+	/*
+	 * trap out already registered clocks
+	 */
+	if (clk->node.next || clk->node.prev)
+		return 0;
+
 	mutex_lock(&clock_list_sem);
 
+	INIT_LIST_HEAD(&clk->children);
+
+	if (clk->parent)
+		list_add(&clk->sibling, &clk->parent->children);
+	else
+		list_add(&clk->sibling, &root_clks);
+
 	list_add(&clk->node, &clock_list);
 	clk->usecount = 0;
 	clk->flags |= CLK_NEEDS_INIT;
@@ -205,6 +238,7 @@ EXPORT_SYMBOL_GPL(clk_register);
 void clk_unregister(struct clk *clk)
 {
 	mutex_lock(&clock_list_sem);
+	list_del(&clk->sibling);
 	list_del(&clk->node);
 	mutex_unlock(&clock_list_sem);
 }
@@ -231,50 +265,53 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 
 		spin_lock_irqsave(&clock_lock, flags);
 		ret = clk->ops->set_rate(clk, rate, algo_id);
+		if (ret == 0) {
+			if (clk->ops->recalc)
+				clk->rate = clk->ops->recalc(clk);
+			propagate_rate(clk);
+		}
 		spin_unlock_irqrestore(&clock_lock, flags);
 	}
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
 
 void clk_recalc_rate(struct clk *clk)
 {
-	if (likely(clk->ops && clk->ops->recalc)) {
-		unsigned long flags;
+	unsigned long flags;
 
-		spin_lock_irqsave(&clock_lock, flags);
-		clk->rate = clk->ops->recalc(clk);
-		spin_unlock_irqrestore(&clock_lock, flags);
-	}
+	if (!clk->ops->recalc)
+		return;
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
+	spin_lock_irqsave(&clock_lock, flags);
+	clk->rate = clk->ops->recalc(clk);
+	propagate_rate(clk);
+	spin_unlock_irqrestore(&clock_lock, flags);
 }
 EXPORT_SYMBOL_GPL(clk_recalc_rate);
 
 int clk_set_parent(struct clk *clk, struct clk *parent)
 {
+	unsigned long flags;
 	int ret = -EINVAL;
-	struct clk *old;
 
 	if (!parent || !clk)
 		return ret;
 
-	old = clk->parent;
-	if (likely(clk->ops && clk->ops->set_parent)) {
-		unsigned long flags;
-		spin_lock_irqsave(&clock_lock, flags);
-		ret = clk->ops->set_parent(clk, parent);
-		spin_unlock_irqrestore(&clock_lock, flags);
-		clk->parent = (ret ? old : parent);
-	}
+	spin_lock_irqsave(&clock_lock, flags);
+	if (clk->usecount == 0) {
+		if (clk->ops->set_parent)
+			ret = clk->ops->set_parent(clk, parent);
+		if (ret == 0) {
+			if (clk->ops->recalc)
+				clk->rate = clk->ops->recalc(clk);
+			propagate_rate(clk);
+		}
+	} else
+		ret = -EBUSY;
+	spin_unlock_irqrestore(&clock_lock, flags);
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_parent);
@@ -457,8 +494,7 @@ int __init clk_init(void)
 	ret |= arch_clk_init();
 
 	/* Kick the child clocks.. */
-	propagate_rate(&master_clk);
-	propagate_rate(&bus_clk);
+	recalculate_root_clocks();
 
 	return ret;
 }
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 4bdae84..8e53829 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -161,9 +161,7 @@ static unsigned long master_clk_recalc(struct clk *clk)
 static void master_clk_init(struct clk *clk)
 {
 	clk->parent = NULL;
-	clk->flags |= CLK_RATE_PROPAGATES;
-	clk->rate = CONFIG_SH_PCLK_FREQ;
-	master_clk_recalc(clk);
+	clk->rate = master_clk_recalc(clk);
 }
 
 static unsigned long module_clk_recalc(struct clk *clk)
@@ -541,19 +539,16 @@ static struct clk_ops sh7722_video_clk_ops = {
 static struct clk sh7722_umem_clock = {
 	.name = "umem_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_sh_clock = {
 	.name = "sh_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_peripheral_clock = {
 	.name = "peripheral_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_sdram_clock = {
@@ -564,7 +559,6 @@ static struct clk sh7722_sdram_clock = {
 static struct clk sh7722_r_clock = {
 	.name = "r_clk",
 	.rate = 32768,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 #if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-- 
cgit v0.10.2


From 4ff29ff8e8723a41e7defd8bc78a7b16cbf940a2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:14:53 +0900
Subject: sh: clkfwk: Consolidate the ALWAYS_ENABLED / NEEDS_INIT mess.

There is no real distinction here in behaviour, either a clock needs to
be enabled on initialiation or not. The ALWAYS_ENABLED flag was always
intended to only apply to clocks that were physically always on and could
simply not be disabled at all from software. Unfortunately over time this
was abused and the meaning became a bit blurry.

So, we kill off both of all of those paths now, as well as the newer
NEEDS_INIT flag, and consolidate on a CLK_ENABLE_ON_INIT. Clocks that
need to be enabled on initialization can set this, and it will purposely
enable them and bump the refcount up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 5dc8b73..246f9eb 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -37,8 +37,7 @@ struct clk {
 	unsigned long		arch_flags;
 };
 
-#define CLK_ALWAYS_ENABLED	(1 << 0)
-#define CLK_NEEDS_INIT		(1 << 2)
+#define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
 void arch_init_clk_ops(struct clk_ops **, int type);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0a06df8..c683be5 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -43,26 +43,26 @@ static DEFINE_MUTEX(clock_list_sem);
  */
 static struct clk master_clk = {
 	.name		= "master_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
 static struct clk module_clk = {
 	.name		= "module_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 static struct clk bus_clk = {
 	.name		= "bus_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 static struct clk cpu_clk = {
 	.name		= "cpu_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 /*
@@ -93,39 +93,11 @@ void propagate_rate(struct clk *tclk)
 	}
 }
 
-static void __clk_init(struct clk *clk)
-{
-	/*
-	 * See if this is the first time we're enabling the clock, some
-	 * clocks that are always enabled still require "special"
-	 * initialization. This is especially true if the clock mode
-	 * changes and the clock needs to hunt for the proper set of
-	 * divisors to use before it can effectively recalc.
-	 */
-
-	if (clk->flags & CLK_NEEDS_INIT) {
-		if (clk->ops && clk->ops->init)
-			clk->ops->init(clk);
-
-		clk->flags &= ~CLK_NEEDS_INIT;
-	}
-}
-
 static int __clk_enable(struct clk *clk)
 {
-	if (!clk)
-		return -EINVAL;
-
-	clk->usecount++;
-
-	/* nothing to do if always enabled */
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return 0;
-
-	if (clk->usecount == 1) {
-		__clk_init(clk);
-
-		__clk_enable(clk->parent);
+	if (clk->usecount++ == 0) {
+		if (clk->parent)
+			__clk_enable(clk->parent);
 
 		if (clk->ops && clk->ops->enable)
 			clk->ops->enable(clk);
@@ -139,6 +111,9 @@ int clk_enable(struct clk *clk)
 	unsigned long flags;
 	int ret;
 
+	if (!clk)
+		return -EINVAL;
+
 	spin_lock_irqsave(&clock_lock, flags);
 	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -149,21 +124,11 @@ EXPORT_SYMBOL_GPL(clk_enable);
 
 static void __clk_disable(struct clk *clk)
 {
-	if (!clk)
-		return;
-
-	clk->usecount--;
-
-	WARN_ON(clk->usecount < 0);
-
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return;
-
-	if (clk->usecount == 0) {
+	if (clk->usecount > 0 && !(--clk->usecount)) {
 		if (likely(clk->ops && clk->ops->disable))
 			clk->ops->disable(clk);
-
-		__clk_disable(clk->parent);
+		if (likely(clk->parent))
+			__clk_disable(clk->parent);
 	}
 }
 
@@ -171,6 +136,9 @@ void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
 
+	if (!clk)
+		return;
+
 	spin_lock_irqsave(&clock_lock, flags);
 	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -211,6 +179,7 @@ int clk_register(struct clk *clk)
 	mutex_lock(&clock_list_sem);
 
 	INIT_LIST_HEAD(&clk->children);
+	clk->usecount = 0;
 
 	if (clk->parent)
 		list_add(&clk->sibling, &clk->parent->children);
@@ -218,19 +187,10 @@ int clk_register(struct clk *clk)
 		list_add(&clk->sibling, &root_clks);
 
 	list_add(&clk->node, &clock_list);
-	clk->usecount = 0;
-	clk->flags |= CLK_NEEDS_INIT;
-
+	if (clk->ops->init)
+		clk->ops->init(clk);
 	mutex_unlock(&clock_list_sem);
 
-	if (clk->flags & CLK_ALWAYS_ENABLED) {
-		__clk_init(clk);
-		pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name);
-		if (clk->ops && clk->ops->enable)
-			clk->ops->enable(clk);
-		pr_debug( "Enabled.");
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(clk_register);
@@ -244,6 +204,15 @@ void clk_unregister(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_unregister);
 
+static void clk_enable_init_clocks(void)
+{
+	struct clk *clkp;
+
+	list_for_each_entry(clkp, &clock_list, node)
+		if (clkp->flags & CLK_ENABLE_ON_INIT)
+			clk_enable(clkp);
+}
+
 unsigned long clk_get_rate(struct clk *clk)
 {
 	return clk->rate;
@@ -404,9 +373,7 @@ static int show_clocks(char *buf, char **start, off_t off,
 
 		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
 			     rate / 1000000, (rate % 1000000) / 10000,
-			     ((clk->flags & CLK_ALWAYS_ENABLED) ||
-			      clk->usecount > 0) ?
-			     "enabled" : "disabled");
+			      (clk->usecount > 0) ?  "enabled" : "disabled");
 	}
 
 	return p - buf;
@@ -496,6 +463,9 @@ int __init clk_init(void)
 	/* Kick the child clocks.. */
 	recalculate_root_clocks();
 
+	/* Enable the necessary init clocks */
+	clk_enable_init_clocks();
+
 	return ret;
 }
 
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 628d50e..0caca9f 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -46,7 +46,7 @@ static struct clk_ops sh4202_emi_clk_ops = {
 
 static struct clk sh4202_emi_clk = {
 	.name		= "emi_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_emi_clk_ops,
 };
 
@@ -62,7 +62,7 @@ static struct clk_ops sh4202_femi_clk_ops = {
 
 static struct clk sh4202_femi_clk = {
 	.name		= "femi_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_femi_clk_ops,
 };
 
@@ -140,7 +140,7 @@ static struct clk_ops sh4202_shoc_clk_ops = {
 
 static struct clk sh4202_shoc_clk = {
 	.name		= "shoc_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_shoc_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index db51cff..21bd70f 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -78,7 +78,7 @@ static struct clk_ops sh7763_shyway_clk_ops = {
 
 static struct clk sh7763_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7763_shyway_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index ba8dacc..4c11f89 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -84,7 +84,7 @@ static struct clk_ops sh7780_shyway_clk_ops = {
 
 static struct clk sh7780_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7780_shyway_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 52691ea..edd4328 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -88,7 +88,7 @@ static struct clk_ops sh7785_shyway_clk_ops = {
 
 static struct clk sh7785_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_shyway_clk_ops,
 };
 
@@ -104,7 +104,7 @@ static struct clk_ops sh7785_ddr_clk_ops = {
 
 static struct clk sh7785_ddr_clk = {
 	.name		= "ddr_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_ddr_clk_ops,
 };
 
@@ -120,7 +120,7 @@ static struct clk_ops sh7785_ram_clk_ops = {
 
 static struct clk sh7785_ram_clk = {
 	.name		= "ram_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_ram_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 2e00ff4..2825494 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -91,7 +91,7 @@ static struct clk_ops sh7786_shyway_clk_ops = {
 
 static struct clk sh7786_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7786_shyway_clk_ops,
 };
 
@@ -107,7 +107,7 @@ static struct clk_ops sh7786_ddr_clk_ops = {
 
 static struct clk sh7786_ddr_clk = {
 	.name		= "ddr_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7786_ddr_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 770934e..6e5c864 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -95,7 +95,7 @@ static struct clk_ops shx3_shyway_clk_ops = {
 
 static struct clk shx3_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &shx3_shyway_clk_ops,
 };
 
-- 
cgit v0.10.2


From 154502e160e02dee7b00ec2149762ae5d48e0bb4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:18:13 +0900
Subject: sh: clkfwk: Convert SH-Mobile CPUs to use CLK_ENABLE_ON_INIT.

Kill off all of the clk_always_enabled leftovers and use the new flag
directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 246f9eb..e9fced9 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -52,22 +52,6 @@ void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
-static inline int clk_always_enable(const char *id)
-{
-	struct clk *clk;
-	int ret;
-
-	clk = clk_get(NULL, id);
-	if (IS_ERR(clk))
-		return PTR_ERR(clk);
-
-	ret = clk_enable(clk);
-	if (ret)
-		clk_put(clk);
-
-	return ret;
-}
-
 /* the exported API, in addition to clk_set_rate */
 /**
  * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 8e53829..f777d00 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -646,207 +646,208 @@ static struct clk_ops sh7722_mstpcr_clk_ops = {
 	.recalc = followparent_recalc,
 };
 
-#define MSTPCR(_name, _parent, regnr, bitnr) \
+#define MSTPCR(_name, _parent, regnr, bitnr, _flags) \
 {						\
 	.name = _name,				\
+	.flags = _flags,			\
 	.arch_flags = MSTPCR_ARCH_FLAGS(regnr, bitnr),	\
 	.ops = (void *)_parent,		\
 }
 
 static struct clk sh7722_mstpcr_clocks[] = {
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	MSTPCR("uram0", "umem_clk", 0, 28),
-	MSTPCR("xymem0", "bus_clk", 0, 26),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "peripheral_clk", 0, 6),
-	MSTPCR("scif2", "peripheral_clk", 0, 5),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("rtc0", "r_clk", 1, 8),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11),
-	MSTPCR("2dg0", "bus_clk", 2, 9),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("rtc0", "r_clk", 1, 8, 0),
+	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 9, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7723)
 	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("l2c0", "sh_clk", 0, 28),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27),
-	MSTPCR("fpu0", "cpu_clk", 0, 24),
-	MSTPCR("intc0", "cpu_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("dmac1", "bus_clk", 0, 12),
-	MSTPCR("tmu1", "peripheral_clk", 0, 11),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 9),
-	MSTPCR("scif1", "peripheral_clk", 0, 8),
-	MSTPCR("scif2", "peripheral_clk", 0, 7),
-	MSTPCR("scif3", "bus_clk", 0, 6),
-	MSTPCR("scif4", "bus_clk", 0, 5),
-	MSTPCR("scif5", "bus_clk", 0, 4),
-	MSTPCR("msiof0", "bus_clk", 0, 2),
-	MSTPCR("msiof1", "bus_clk", 0, 1),
-	MSTPCR("meram0", "sh_clk", 0, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("rtc0", "r_clk", 1, 8),
-	MSTPCR("atapi0", "sh_clk", 2, 28),
-	MSTPCR("adc0", "peripheral_clk", 2, 28),
-	MSTPCR("tpu0", "bus_clk", 2, 25),
-	MSTPCR("irda0", "peripheral_clk", 2, 24),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("icb0", "bus_clk", 2, 21),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("sdhi1", "bus_clk", 2, 17),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usb0", "bus_clk", 2, 11),
-	MSTPCR("2dg0", "bus_clk", 2, 10),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("veu1", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("l2c0", "sh_clk", 0, 28, 0),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
+	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
+	MSTPCR("intc0", "cpu_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
+	MSTPCR("tmu1", "peripheral_clk", 0, 11, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif3", "bus_clk", 0, 6, 0),
+	MSTPCR("scif4", "bus_clk", 0, 5, 0),
+	MSTPCR("scif5", "bus_clk", 0, 4, 0),
+	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
+	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
+	MSTPCR("meram0", "sh_clk", 0, 0, CLK_ENABLE_ON_INIT),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("rtc0", "r_clk", 1, 8, 0),
+	MSTPCR("atapi0", "sh_clk", 2, 28, 0),
+	MSTPCR("adc0", "peripheral_clk", 2, 28, 0),
+	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
+	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("icb0", "bus_clk", 2, 21, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usb0", "bus_clk", 2, 11, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 10, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("veu1", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
 	/* See Datasheet : Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("rs0", "bus_clk", 0, 28),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27),
-	MSTPCR("l2c0", "sh_clk", 0, 26),
-	MSTPCR("fpu0", "cpu_clk", 0, 24),
-	MSTPCR("intc0", "peripheral_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("dmac1", "bus_clk", 0, 12),
-	MSTPCR("tmu1", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 9),
-	MSTPCR("scif1", "peripheral_clk", 0, 8),
-	MSTPCR("scif2", "peripheral_clk", 0, 7),
-	MSTPCR("scif3", "bus_clk", 0, 6),
-	MSTPCR("scif4", "bus_clk", 0, 5),
-	MSTPCR("scif5", "bus_clk", 0, 4),
-	MSTPCR("msiof0", "bus_clk", 0, 2),
-	MSTPCR("msiof1", "bus_clk", 0, 1),
-	MSTPCR("keysc0", "r_clk", 1, 12),
-	MSTPCR("rtc0", "r_clk", 1, 11),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8),
-	MSTPCR("mmc0", "bus_clk", 2, 29),
-	MSTPCR("eth0", "bus_clk", 2, 28),
-	MSTPCR("atapi0", "bus_clk", 2, 26),
-	MSTPCR("tpu0", "bus_clk", 2, 25),
-	MSTPCR("irda0", "peripheral_clk", 2, 24),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("usb1", "bus_clk", 2, 21),
-	MSTPCR("usb0", "bus_clk", 2, 20),
-	MSTPCR("2dg0", "bus_clk", 2, 19),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("sdhi1", "bus_clk", 2, 17),
-	MSTPCR("veu1", "bus_clk", 2, 15),
-	MSTPCR("ceu1", "bus_clk", 2, 13),
-	MSTPCR("beu1", "bus_clk", 2, 12),
-	MSTPCR("2ddmac0", "sh_clk", 2, 10),
-	MSTPCR("spu0", "bus_clk", 2, 9),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("rs0", "bus_clk", 0, 28, 0),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
+	MSTPCR("l2c0", "sh_clk", 0, 26, 0),
+	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
+	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
+	MSTPCR("tmu1", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif3", "bus_clk", 0, 6, 0),
+	MSTPCR("scif4", "bus_clk", 0, 5, 0),
+	MSTPCR("scif5", "bus_clk", 0, 4, 0),
+	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
+	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
+	MSTPCR("keysc0", "r_clk", 1, 12, 0),
+	MSTPCR("rtc0", "r_clk", 1, 11, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
+	MSTPCR("mmc0", "bus_clk", 2, 29, 0),
+	MSTPCR("eth0", "bus_clk", 2, 28, 0),
+	MSTPCR("atapi0", "bus_clk", 2, 26, 0),
+	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
+	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("usb1", "bus_clk", 2, 21, 0),
+	MSTPCR("usb0", "bus_clk", 2, 20, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 19, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
+	MSTPCR("veu1", "bus_clk", 2, 15, CLK_ENABLE_ON_INIT),
+	MSTPCR("ceu1", "bus_clk", 2, 13, 0),
+	MSTPCR("beu1", "bus_clk", 2, 12, 0),
+	MSTPCR("2ddmac0", "sh_clk", 2, 10, 0),
+	MSTPCR("spu0", "bus_clk", 2, 9, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, 0),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
-	MSTPCR("uram0", "umem_clk", 0, 28),
-	MSTPCR("xymem0", "bus_clk", 0, 26),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "peripheral_clk", 0, 6),
-	MSTPCR("scif2", "peripheral_clk", 0, 5),
-	MSTPCR("scif3", "peripheral_clk", 0, 4),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
+	MSTPCR("scif3", "peripheral_clk", 0, 4, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
+	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7366)
 	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("rsmem0", "sh_clk", 0, 28),
-	MSTPCR("xymem0", "cpu_clk", 0, 26),
-	MSTPCR("intc30", "peripheral_clk", 0, 23),
-	MSTPCR("intc0", "peripheral_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "bus_clk", 0, 6),
-	MSTPCR("scif2", "bus_clk", 0, 5),
-	MSTPCR("msiof0", "peripheral_clk", 0, 2),
-	MSTPCR("sbr0", "peripheral_clk", 0, 1),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("icb0", "bus_clk", 2, 27),
-	MSTPCR("meram0", "sh_clk", 2, 26),
-	MSTPCR("dacc0", "peripheral_clk", 2, 24),
-	MSTPCR("dacy0", "peripheral_clk", 2, 23),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("mmcif0", "bus_clk", 2, 17),
-	MSTPCR("usb0", "bus_clk", 2, 11),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("veu1", "bus_clk", 2, 7),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("rsmem0", "sh_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "cpu_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("intc30", "peripheral_clk", 0, 23, 0),
+	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "bus_clk", 0, 6, 0),
+	MSTPCR("scif2", "bus_clk", 0, 5, 0),
+	MSTPCR("msiof0", "peripheral_clk", 0, 2, 0),
+	MSTPCR("sbr0", "peripheral_clk", 0, 1, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("icb0", "bus_clk", 2, 27, 0),
+	MSTPCR("meram0", "sh_clk", 2, 26, 0),
+	MSTPCR("dacc0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("dacy0", "peripheral_clk", 2, 23, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("mmcif0", "bus_clk", 2, 17, 0),
+	MSTPCR("usb0", "bus_clk", 2, 11, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("veu1", "bus_clk", 2, 7, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index b9c361e..42ce5fc 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -223,12 +223,6 @@ static struct platform_device *sh7343_devices[] __initdata = {
 
 static int __init sh7343_devices_setup(void)
 {
-	clk_always_enable("uram0"); /* URAM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu0"); /* VEU */
-	clk_always_enable("vpu0"); /* VPU */
-	clk_always_enable("jpu0"); /* JPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20);
 	platform_resource_setup_memory(&veu_device, "veu", 2 << 20);
 	platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index d4a994b..d0172af 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -212,12 +212,6 @@ static struct platform_device *sh7366_devices[] __initdata = {
 
 static int __init sh7366_devices_setup(void)
 {
-	clk_always_enable("rsmem0"); /* RSMEM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu1"); /* VEU-2 */
-	clk_always_enable("veu0"); /* VEU-1 */
-	clk_always_enable("vpu0"); /* VPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 5d6247f..ea524a2 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -352,12 +352,6 @@ static struct platform_device *sh7722_devices[] __initdata = {
 
 static int __init sh7722_devices_setup(void)
 {
-	clk_always_enable("uram0"); /* URAM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu0"); /* VEU */
-	clk_always_enable("vpu0"); /* VPU */
-	clk_always_enable("jpu0"); /* JPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20);
 	platform_resource_setup_memory(&veu_device, "veu", 2 << 20);
 	platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 1429fc5..04cb4aa 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -460,11 +460,6 @@ static struct platform_device *sh7723_devices[] __initdata = {
 
 static int __init sh7723_devices_setup(void)
 {
-	clk_always_enable("meram0"); /* MERAM */
-	clk_always_enable("veu1"); /* VEU2H1 */
-	clk_always_enable("veu0"); /* VEU2H0 */
-	clk_always_enable("vpu0"); /* VPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index a3039ce..080f541 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -281,10 +281,6 @@ static struct platform_device *sh7724_devices[] __initdata = {
 
 static int __init sh7724_devices_setup(void)
 {
-	clk_always_enable("vpu0");   /* VPU */
-	clk_always_enable("veu1");   /* VEU3F1 */
-	clk_always_enable("veu0");   /* VEU3F0 */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
-- 
cgit v0.10.2


From ae891a4264c91246c0b4c22be68b9838747ae48d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:30:10 +0900
Subject: sh: clkfwk: Fix up the clk_enable() error path.

There are a couple of instances where a clk_enable() can fail, which the
SH-Mobile code presently handles, but doesn't get reported all the way
back up. This fixes up the return type so the errors make it all the way
down to the drivers.

Additionally, we now also error out properly if the parent enable fails.
Prep work for aggressively turning off unused clocks on boot.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index e9fced9..5de72ee 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -10,7 +10,7 @@ struct clk;
 
 struct clk_ops {
 	void (*init)(struct clk *clk);
-	void (*enable)(struct clk *clk);
+	int (*enable)(struct clk *clk);
 	void (*disable)(struct clk *clk);
 	unsigned long (*recalc)(struct clk *clk);
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index c683be5..e027fe5 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -93,57 +93,78 @@ void propagate_rate(struct clk *tclk)
 	}
 }
 
-static int __clk_enable(struct clk *clk)
+static void __clk_disable(struct clk *clk)
 {
-	if (clk->usecount++ == 0) {
-		if (clk->parent)
-			__clk_enable(clk->parent);
-
-		if (clk->ops && clk->ops->enable)
-			clk->ops->enable(clk);
+	if (clk->usecount == 0) {
+		printk(KERN_ERR "Trying disable clock %s with 0 usecount\n",
+		       clk->name);
+		WARN_ON(1);
+		return;
 	}
 
-	return 0;
+	if (!(--clk->usecount)) {
+		if (likely(clk->ops && clk->ops->disable))
+			clk->ops->disable(clk);
+		if (likely(clk->parent))
+			__clk_disable(clk->parent);
+	}
 }
 
-int clk_enable(struct clk *clk)
+void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
-	int ret;
 
 	if (!clk)
-		return -EINVAL;
+		return;
 
 	spin_lock_irqsave(&clock_lock, flags);
-	ret = __clk_enable(clk);
+	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
-
-	return ret;
 }
-EXPORT_SYMBOL_GPL(clk_enable);
+EXPORT_SYMBOL_GPL(clk_disable);
 
-static void __clk_disable(struct clk *clk)
+static int __clk_enable(struct clk *clk)
 {
-	if (clk->usecount > 0 && !(--clk->usecount)) {
-		if (likely(clk->ops && clk->ops->disable))
-			clk->ops->disable(clk);
-		if (likely(clk->parent))
-			__clk_disable(clk->parent);
+	int ret = 0;
+
+	if (clk->usecount++ == 0) {
+		if (clk->parent) {
+			ret = __clk_enable(clk->parent);
+			if (unlikely(ret))
+				goto err;
+		}
+
+		if (clk->ops && clk->ops->enable) {
+			ret = clk->ops->enable(clk);
+			if (ret) {
+				if (clk->parent)
+					__clk_disable(clk->parent);
+				goto err;
+			}
+		}
 	}
+
+	return ret;
+err:
+	clk->usecount--;
+	return ret;
 }
 
-void clk_disable(struct clk *clk)
+int clk_enable(struct clk *clk)
 {
 	unsigned long flags;
+	int ret;
 
 	if (!clk)
-		return;
+		return -EINVAL;
 
 	spin_lock_irqsave(&clock_lock, flags);
-	__clk_disable(clk);
+	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
+
+	return ret;
 }
-EXPORT_SYMBOL_GPL(clk_disable);
+EXPORT_SYMBOL_GPL(clk_enable);
 
 static LIST_HEAD(root_clks);
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index f777d00..1c41db4 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -472,9 +472,9 @@ static int sh7722_siu_start_stop(struct clk *clk, int enable)
 	return 0;
 }
 
-static void sh7722_siu_enable(struct clk *clk)
+static int sh7722_siu_enable(struct clk *clk)
 {
-	sh7722_siu_start_stop(clk, 1);
+	return sh7722_siu_start_stop(clk, 1);
 }
 
 static void sh7722_siu_disable(struct clk *clk)
@@ -491,12 +491,13 @@ static struct clk_ops sh7722_siu_clk_ops = {
 
 #endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
-static void sh7722_video_enable(struct clk *clk)
+static int sh7722_video_enable(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(VCLKCR);
 	ctrl_outl( r & ~(1<<8), VCLKCR);
+	return 0;
 }
 
 static void sh7722_video_disable(struct clk *clk)
@@ -630,9 +631,9 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
 	return 0;
 }
 
-static void sh7722_mstpcr_enable(struct clk *clk)
+static int sh7722_mstpcr_enable(struct clk *clk)
 {
-	sh7722_mstpcr_start_stop(clk, 1);
+	return sh7722_mstpcr_start_stop(clk, 1);
 }
 
 static void sh7722_mstpcr_disable(struct clk *clk)
-- 
cgit v0.10.2


From aa87aa343f2cd236b5eccd643abd4df918ed5c4f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:51:05 +0900
Subject: sh: clkfwk: Improve the generic clk_set_parent() implementation.

This causes the generic clk_set_parent() implementation to be a bit more
intelligent. A clk_reparent() is added to move the clock over to the new
parent's sibling list, which then allows the generic rate propagation
code to succeed. This also becomes a nop if the new and old parents are
unchanged.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 5de72ee..fdb9156 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -48,6 +48,7 @@ int clk_init(void);
 unsigned long followparent_recalc(struct clk *);
 void recalculate_root_clocks(void);
 void propagate_rate(struct clk *);
+int clk_reparent(struct clk *child, struct clk *parent);
 void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index e027fe5..e3d1de8 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -81,6 +81,19 @@ unsigned long followparent_recalc(struct clk *clk)
 	return clk->parent->rate;
 }
 
+int clk_reparent(struct clk *child, struct clk *parent)
+{
+	list_del_init(&child->sibling);
+	if (parent)
+		list_add(&child->sibling, &parent->children);
+	child->parent = parent;
+
+	/* now do the debugfs renaming to reattach the child
+	   to the proper parent */
+
+	return 0;
+}
+
 /* Propagate rate to children */
 void propagate_rate(struct clk *tclk)
 {
@@ -288,12 +301,19 @@ int clk_set_parent(struct clk *clk, struct clk *parent)
 
 	if (!parent || !clk)
 		return ret;
+	if (clk->parent == parent)
+		return 0;
 
 	spin_lock_irqsave(&clock_lock, flags);
 	if (clk->usecount == 0) {
 		if (clk->ops->set_parent)
 			ret = clk->ops->set_parent(clk, parent);
+		else
+			ret = clk_reparent(clk, parent);
+
 		if (ret == 0) {
+			pr_debug("clock: set parent of %s to %s (new rate %ld)\n",
+				 clk->name, clk->parent->name, clk->rate);
 			if (clk->ops->recalc)
 				clk->rate = clk->ops->recalc(clk);
 			propagate_rate(clk);
-- 
cgit v0.10.2


From f5c84cf50812c80133e64683d0500b2416d55cb3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:59:27 +0900
Subject: sh: clkfwk: Tidy up on-chip clock registration and rate propagation.

This tidies up the set_rate hack that the on-chip clocks were abusing to
trigger rate propagation, which is now handled generically.

Additionally, now that CLK_ENABLE_ON_INIT is wired up where it needs to
be for these clocks, the clk_enable() can go away. In some cases this was
bumping up the refcount higher than it should have been.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 0caca9f..435f4f1 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	frqcr3 |= tmp << 6;
 	ctrl_outl(frqcr3, CPG2_FRQCR3);
 
-	return clk->parent->rate / frqcr3_divisors[tmp];
+	clk->rate = clk->parent->rate / frqcr3_divisors[tmp];
 
 	return 0;
 }
@@ -153,28 +153,17 @@ static struct clk *sh4202_onchip_clocks[] = {
 static int __init sh4202_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) {
 		struct clk *clkp = sh4202_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh4202_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 21bd70f..0110da6 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -93,28 +93,17 @@ static struct clk *sh7763_onchip_clocks[] = {
 static int __init sh7763_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) {
 		struct clk *clkp = sh7763_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh7763_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 4c11f89..0a22d50 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -99,28 +99,17 @@ static struct clk *sh7780_onchip_clocks[] = {
 static int __init sh7780_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) {
 		struct clk *clkp = sh7780_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh7780_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index edd4328..4dcd1f6 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -137,26 +137,17 @@ static struct clk *sh7785_onchip_clocks[] = {
 static int __init sh7785_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
 		struct clk *clkp = sh7785_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(sh7785_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 2825494..825556f 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -123,26 +123,17 @@ static struct clk *sh7786_onchip_clocks[] = {
 static int __init sh7786_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) {
 		struct clk *clkp = sh7786_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(sh7786_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 6e5c864..1eb149b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -110,26 +110,17 @@ static struct clk *shx3_onchip_clocks[] = {
 static int __init shx3_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) {
 		struct clk *clkp = shx3_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(shx3_clk_init);
-- 
cgit v0.10.2


From 007e8363b656768fe3f59c180824ff704680bb25 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 06:05:32 +0900
Subject: sh: clkfwk: Kill off clk_recalc_rate().

The only user for this is the SH-Mobile r_clk, which is now added as a
root clock and can be kicked via propagate_rate() as usual. Given that,
there is no longer any need for the special clk_recalc_rate(), so we kill
it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index fdb9156..6a2f463 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -49,7 +49,6 @@ unsigned long followparent_recalc(struct clk *);
 void recalculate_root_clocks(void);
 void propagate_rate(struct clk *);
 int clk_reparent(struct clk *child, struct clk *parent);
-void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index e3d1de8..7a356de9 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -280,20 +280,6 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
 
-void clk_recalc_rate(struct clk *clk)
-{
-	unsigned long flags;
-
-	if (!clk->ops->recalc)
-		return;
-
-	spin_lock_irqsave(&clock_lock, flags);
-	clk->rate = clk->ops->recalc(clk);
-	propagate_rate(clk);
-	spin_unlock_irqrestore(&clock_lock, flags);
-}
-EXPORT_SYMBOL_GPL(clk_recalc_rate);
-
 int clk_set_parent(struct clk *clk, struct clk *parent)
 {
 	unsigned long flags;
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 1c41db4..426065b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -912,7 +912,7 @@ int __init arch_clk_init(void)
 		clk_put(clk);
 	}
 
-	clk_recalc_rate(&sh7722_r_clock); /* make sure rate gets propagated */
+	propagate_rate(&sh7722_r_clock); /* make sure rate gets propagated */
 
 	return 0;
 }
-- 
cgit v0.10.2


From 0dae89572cbcd5f676ea52a9448d9639d97a53d6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 06:18:09 +0900
Subject: sh: clkfwk: Wire up clk_get_sys() support.

This stubs in clk_get_sys() from the ARM clkdev implementation.
Tentatively conver the clk_get() lookup code to use this, and once the
rest of the in-tree users are happy with this, it can replace the
fallback lookups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 6a2f463..e2f5bf1 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -37,6 +37,13 @@ struct clk {
 	unsigned long		arch_flags;
 };
 
+struct clk_lookup {
+	struct list_head	node;
+	const char		*dev_id;
+	const char		*con_id;
+	struct clk		*clk;
+};
+
 #define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 7a356de9..34707b8 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -10,6 +10,10 @@
  *
  *  Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com>
  *
+ *  With clkdev bits:
+ *
+ *	Copyright (C) 2008 Russell King.
+ *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
@@ -335,14 +339,69 @@ long clk_round_rate(struct clk *clk, unsigned long rate)
 EXPORT_SYMBOL_GPL(clk_round_rate);
 
 /*
+ * Find the correct struct clk for the device and connection ID.
+ * We do slightly fuzzy matching here:
+ *  An entry with a NULL ID is assumed to be a wildcard.
+ *  If an entry has a device ID, it must match
+ *  If an entry has a connection ID, it must match
+ * Then we take the most specific entry - with the following
+ * order of precidence: dev+con > dev only > con only.
+ */
+static struct clk *clk_find(const char *dev_id, const char *con_id)
+{
+	struct clk_lookup *p;
+	struct clk *clk = NULL;
+	int match, best = 0;
+
+	list_for_each_entry(p, &clock_list, node) {
+		match = 0;
+		if (p->dev_id) {
+			if (!dev_id || strcmp(p->dev_id, dev_id))
+				continue;
+			match += 2;
+		}
+		if (p->con_id) {
+			if (!con_id || strcmp(p->con_id, con_id))
+				continue;
+			match += 1;
+		}
+		if (match == 0)
+			continue;
+
+		if (match > best) {
+			clk = p->clk;
+			best = match;
+		}
+	}
+	return clk;
+}
+
+struct clk *clk_get_sys(const char *dev_id, const char *con_id)
+{
+	struct clk *clk;
+
+	mutex_lock(&clock_list_sem);
+	clk = clk_find(dev_id, con_id);
+	mutex_unlock(&clock_list_sem);
+
+	return clk ? clk : ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(clk_get_sys);
+
+/*
  * Returns a clock. Note that we first try to use device id on the bus
  * and clock name. If this fails, we try to use clock name only.
  */
 struct clk *clk_get(struct device *dev, const char *id)
 {
+	const char *dev_id = dev ? dev_name(dev) : NULL;
 	struct clk *p, *clk = ERR_PTR(-ENOENT);
 	int idno;
 
+	clk = clk_get_sys(dev_id, id);
+	if (clk)
+		return clk;
+
 	if (dev == NULL || dev->bus != &platform_bus_type)
 		idno = -1;
 	else
-- 
cgit v0.10.2


From 77d1a4999502c260df0eb2de437d320bf8c64b36 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 14:21:12 -0700
Subject: x86, boot: make symbols from the main vmlinux available

Make symbols from the main vmlinux, as opposed to just
compressed/vmlinux, available to header.S.  Also, export a few
additional symbols.

This will be used in a subsequent patch to export the total memory
footprint of the kernel.

[ Impact: enable future enhancement ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e..75e0301 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -86,19 +86,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-offsets := -e 's/^00*/0/' \
-        -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
 
-quiet_cmd_offsets = OFFSETS $@
-      cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
 
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
-	$(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+	$(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+      cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+	$(call if_changed,zoffset)
 
-targets += offsets.h
 
 AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
 
 LDFLAGS_setup.elf	:= -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c..2728514 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
 #include <asm/page_types.h>
 #include <asm/setup.h>
 #include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
 
 BOOTSEG		= 0x07C0		/* original address of boot-sector */
 SYSSEG		= 0x1000		/* historical load address >> 4 */
@@ -212,8 +213,8 @@ hardware_subarch:	.long 0			# subarchitecture, added with 2.07
 
 hardware_subarch_data:	.quad 0
 
-payload_offset:		.long input_data
-payload_length:		.long input_data_end-input_data
+payload_offset:		.long ZO_input_data
+payload_length:		.long ZO_z_input_len
 
 setup_data:		.quad 0			# 64-bit physical pointer to
 						# single linked list of
-- 
cgit v0.10.2


From 40b387a8a9a821878ecdf9fb117958c426fc1385 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 14:41:55 -0700
Subject: x86, boot: use LOAD_PHYSICAL_ADDR on 64 bits

Use LOAD_PHYSICAL_ADDR instead of CONFIG_PHYSICAL_START in the 64-bit
decompression code, for equivalence with the 32-bit code.

[ Impact: cleanup, increases code similarity ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2b9f251..4135d43 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -87,7 +87,7 @@ ENTRY(startup_32)
 	addl	$(PMD_PAGE_SIZE -1), %ebx
 	andl	$PMD_PAGE_MASK, %ebx
 #else
-	movl	$CONFIG_PHYSICAL_START, %ebx
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
 	/* Target address to relocate to for decompression */
@@ -215,7 +215,7 @@ ENTRY(startup_64)
 	 *
 	 * If it is a relocatable kernel then decompress and run the kernel
 	 * from load address aligned to 2MB addr, otherwise decompress and
-	 * run the kernel from CONFIG_PHYSICAL_START
+	 * run the kernel from LOAD_PHYSICAL_ADDR
 	 *
 	 * We cannot rely on the calculation done in 32-bit mode, since we
 	 * may have been invoked via the 64-bit entry point.
@@ -228,7 +228,7 @@ ENTRY(startup_64)
 	andq	$PMD_PAGE_MASK, %rbp
 	movq	%rbp, %rbx
 #else
-	movq	$CONFIG_PHYSICAL_START, %rbp
+	movq	$LOAD_PHYSICAL_ADDR, %rbp
 	movq	%rbp, %rbx
 #endif
 
-- 
cgit v0.10.2


From 2ff799d3cff1ecb274049378b28120ee5c1c5e5f Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Mon, 11 May 2009 20:09:14 +0530
Subject: sched: Don't export sched_mc_power_savings on multi-socket single
 core system

Fix to prevent sched_mc_power_saving from being exported through sysfs
for multi-scoket single core system. Max cores should be always greater than
one (1). My earlier patch that introduced fix for not exporting
'sched_mc_power_saving' on laptops  broke it on multi-socket single core
system. This fix addresses issue on both laptop and multi-socket single
core system.
Below are the Test results:

1. Single socket - multi-core
       Before Patch: Does not export 'sched_mc_power_saving'
       After Patch: Does not export 'sched_mc_power_saving'
       Result: Pass

2. Multi Socket - single core
      Before Patch: exports 'sched_mc_power_saving'
      After Patch: Does not export 'sched_mc_power_saving'
      Result: Pass

3. Multi Socket - Multi core
      Before Patch: exports 'sched_mc_power_saving'
      After Patch: exports 'sched_mc_power_saving'

[ Impact: make the sched_mc_power_saving control available more consistently ]

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Suresh B Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090511143914.GB4853@dirshya.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49a..066ef59 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
 void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
+			(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
 #define smt_capable()			(smp_num_siblings > 1)
 #endif
 
-- 
cgit v0.10.2


From 99aa45595f45603526513d5e29fc00f8afbf3913 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:02:10 -0700
Subject: x86, boot: remove dead code from boot/compressed/head_*.S

Remove a couple of lines of dead code from
arch/x86/boot/compressed/head_*.S; all of these update registers that
are dead in the current code.

[ Impact: cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 470474b..2b8e0df 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -100,16 +100,6 @@ ENTRY(startup_32)
 	popl	%esi
 
 /*
- * Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
-	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
-	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
-	movl	$LOAD_PHYSICAL_ADDR, %ebp
-#endif
-
-/*
  * Jump to the relocated address.
  */
 	leal	relocated(%ebx), %eax
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 4135d43..2bb500a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -226,10 +226,8 @@ ENTRY(startup_64)
 	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
 	addq	$(PMD_PAGE_SIZE - 1), %rbp
 	andq	$PMD_PAGE_MASK, %rbp
-	movq	%rbp, %rbx
 #else
 	movq	$LOAD_PHYSICAL_ADDR, %rbp
-	movq	%rbp, %rbx
 #endif
 
 	/* Target address to relocate to for decompression */
-- 
cgit v0.10.2


From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 15:56:08 -0700
Subject: x86, boot: make kernel_alignment adjustable; new bzImage fields

Make the kernel_alignment field adjustable; this allows us to set it
to a large value (intended to be 16 MB to avoid ZONE_DMA contention,
memory holes and other weirdness) while a smart bootloader can still
force a loading at a lesser alignment if absolutely necessary.

Also export pref_address (preferred loading address, corresponding to
the link-time address) and init_size, the total amount of linear
memory the kernel will require during initialization.

[ Impact: allows better kernel placement, gives bootloader more info ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 2b8e0df..75e4f00 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -69,8 +69,11 @@ ENTRY(startup_32)
 
 #ifdef CONFIG_RELOCATABLE
 	movl	%ebp, %ebx
-	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
-	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+	movl	BP_kernel_alignment(%esi), %eax
+	decl	%eax
+	addl    %eax, %ebx
+	notl	%eax
+	andl    %eax, %ebx
 #else
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2bb500a..f62c284 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -84,8 +84,11 @@ ENTRY(startup_32)
 
 #ifdef CONFIG_RELOCATABLE
 	movl	%ebp, %ebx
-	addl	$(PMD_PAGE_SIZE -1), %ebx
-	andl	$PMD_PAGE_MASK, %ebx
+	movl	BP_kernel_alignment(%esi), %eax
+	decl	%eax
+	addl	%eax, %ebx
+	notl	%eax
+	andl	%eax, %ebx
 #else
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
@@ -224,8 +227,11 @@ ENTRY(startup_64)
 	/* Start with the delta to where the kernel will run at. */
 #ifdef CONFIG_RELOCATABLE
 	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
-	addq	$(PMD_PAGE_SIZE - 1), %rbp
-	andq	$PMD_PAGE_MASK, %rbp
+	movl	BP_kernel_alignment(%rsi), %eax
+	decl	%eax
+	addq	%rax, %rbp
+	notq	%rax
+	andq	%rax, %rbp
 #else
 	movq	$LOAD_PHYSICAL_ADDR, %rbp
 #endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2728514..a0b4269 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -116,7 +116,7 @@ _start:
 	# Part 2 of the header, from the old setup.S
 
 		.ascii	"HdrS"		# header signature
-		.word	0x0209		# header version number (>= 0x0105)
+		.word	0x020a		# header version number (>= 0x0105)
 					# or else old loadlin-1.5 will fail)
 		.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
@@ -201,7 +201,7 @@ relocatable_kernel:    .byte 1
 #else
 relocatable_kernel:    .byte 0
 #endif
-pad2:			.byte 0
+min_alignment:		.byte MIN_KERNEL_ALIGN_LG2	# minimum alignment
 pad3:			.word 0
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
@@ -220,6 +220,17 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 						# single linked list of
 						# struct setup_data
 
+pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
+
+#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_extract_offset)
+#define VO_INIT_SIZE	(VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size:		.long INIT_SIZE		# kernel initialization size
+
 # End of setup header #####################################################
 
 	.section ".inittext", "ax"
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd..418e632 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
 
 #ifdef __KERNEL__
 
+#include <asm/page_types.h>
+
 /* Physical address where kernel should be loaded. */
 #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
 				+ (CONFIG_PHYSICAL_ALIGN - 1)) \
 				& ~(CONFIG_PHYSICAL_ALIGN - 1))
 
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_x86_64
+#define MIN_KERNEL_ALIGN_LG2	PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2	(PAGE_SHIFT+1)
+#endif
+#define MIN_KERNEL_ALIGN	(_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+	(CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
 #ifdef CONFIG_KERNEL_BZIP2
 #define BOOT_HEAP_SIZE             0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c..1a830cb 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
+	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062..898ecc4 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
+	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 
 	BLANK();
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-- 
cgit v0.10.2


From d297366ba692faf1f0384811a6ff0b20c3470b1b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:06:23 -0700
Subject: x86: document new bzImage fields

Document the new bzImage fields for kernel memory placement.

[ Impact: adds documentation ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index e020366..cf8dfc7 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,11 @@ Protocol 2.08:	(Kernel 2.6.26) Added crc32 checksum and ELF format
 Protocol 2.09:	(Kernel 2.6.26) Added a field of 64-bit physical
 		pointer to single linked list of struct	setup_data.
 
+Protocol 2.10:	(Kernel 2.6.31) A protocol for relaxed alignment
+		beyond the kernel_alignment added, new init_size and
+		pref_address fields.
+
+
 **** MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -173,7 +178,7 @@ Offset	Proto	Name		Meaning
 022C/4	2.03+	ramdisk_max	Highest legal initrd address
 0230/4	2.05+	kernel_alignment Physical addr alignment required for kernel
 0234/1	2.05+	relocatable_kernel Whether kernel is relocatable or not
-0235/1	N/A	pad2		Unused
+0235/1	2.10+	min_alignment	Minimum alignment, as a power of two
 0236/2	N/A	pad3		Unused
 0238/4	2.06+	cmdline_size	Maximum size of the kernel command line
 023C/4	2.07+	hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset	Proto	Name		Meaning
 024C/4	2.08+	payload_length	Length of kernel payload
 0250/8	2.09+	setup_data	64-bit physical pointer to linked list
 				of struct setup_data
+0258/8	2.10+	pref_address	Preferred loading address
+0260/4	2.10+	init_size	Linear memory required during initialization
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
@@ -482,11 +489,19 @@ Protocol:	2.03+
   0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
 
 Field name:	kernel_alignment
-Type:		read (reloc)
+Type:		read/modify (reloc)
 Offset/size:	0x230/4
-Protocol:	2.05+
+Protocol:	2.05+ (read), 2.10+ (modify)
+
+  Alignment unit required by the kernel (if relocatable_kernel is
+  true.)  A relocatable kernel that is loaded at an alignment
+  incompatible with the value in this field will be realigned during
+  kernel initialization.
 
-  Alignment unit required by the kernel (if relocatable_kernel is true.)
+  Starting with protocol version 2.10, this reflects the kernel
+  alignment preferred for optimal performance; it is possible for the
+  loader to modify this field to permit a lesser alignment.  See the
+  min_alignment and pref_address field below.
 
 Field name:	relocatable_kernel
 Type:		read (reloc)
@@ -498,6 +513,22 @@ Protocol:	2.05+
   After loading, the boot loader must set the code32_start field to
   point to the loaded code, or to a boot loader hook.
 
+Field name:	min_alignment
+Type:		read (reloc)
+Offset/size:	0x235/1
+Protocol:	2.10+
+
+  This field, if nonzero, indicates as a power of two the minimum
+  alignment required, as opposed to preferred, by the kernel to boot.
+  If a boot loader makes use of this field, it should update the
+  kernel_alignment field with the alignment unit desired; typically:
+
+	kernel_alignment = 1 << min_alignment
+
+  There may be a considerable performance cost with an excessively
+  misaligned kernel.  Therefore, a loader should typically try each
+  power-of-two alignment from kernel_alignment down to this alignment.
+
 Field name:	cmdline_size
 Type:		read
 Offset/size:	0x238/4
@@ -582,6 +613,36 @@ Protocol:	2.09+
   sure to consider the case where the linked list already contains
   entries.
 
+Field name:	pref_address
+Type:		read (reloc)
+Offset/size:	0x258/8
+Protocol:	2.10+
+
+  This field, if nonzero, represents a preferred load address for the
+  kernel.  A relocating bootloader should attempt to load at this
+  address if possible.
+
+  A non-relocatable kernel will unconditionally move itself and to run
+  at this address.
+
+Field name:	init_size
+Type:		read
+Offset/size:	0x25c/4
+
+  This field indicates the amount of linear contiguous memory starting
+  at the kernel runtime start address that the kernel needs before it
+  is capable of examining its memory map.  This is not the same thing
+  as the total amount of memory the kernel needs to boot, but it can
+  be used by a relocating boot loader to help select a safe load
+  address for the kernel.
+
+  The kernel runtime start address is determined by the following algorithm:
+
+  if (relocatable_kernel)
+	runtime_start = align_up(load_address, kernel_alignment)
+  else
+	runtime_start = pref_address
+
 
 **** THE IMAGE CHECKSUM
 
-- 
cgit v0.10.2


From ceefccc93932b920a8ec6f35f596db05202a12fe Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:12:16 -0700
Subject: x86: default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to 16 MB

Default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN each to 16 MB,
so that both non-relocatable and relocatable kernels are loaded at
16 MB by a non-relocating bootloader.  This is somewhat hacky, but it
appears to be the only way to do this that does not break some some
set of existing bootloaders.

We want to avoid the bottom 16 MB because of large page breakup,
memory holes, and ZONE_DMA.  Embedded systems may need to reduce this,
or update their bootloaders to be aware of the new min_alignment field.

[ Impact: performance improvement, avoids problems on some systems ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5aee453..50fbb47 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1455,9 +1455,7 @@ config KEXEC_JUMP
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if X86_NUMAQ
-	default "0x200000" if X86_64
-	default "0x100000"
+	default "0x1000000"
 	---help---
 	  This gives the physical address where the kernel is loaded.
 
@@ -1476,15 +1474,15 @@ config PHYSICAL_START
 	  to be specifically compiled to run from a specific memory area
 	  (normally a reserved region) and this option comes handy.
 
-	  So if you are using bzImage for capturing the crash dump, leave
-	  the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
-	  Otherwise if you plan to use vmlinux for capturing the crash dump
-	  change this value to start of the reserved region (Typically 16MB
-	  0x1000000). In other words, it can be set based on the "X" value as
-	  specified in the "crashkernel=YM@XM" command line boot parameter
-	  passed to the panic-ed kernel. Typically this parameter is set as
-	  crashkernel=64M@16M. Please take a look at
-	  Documentation/kdump/kdump.txt for more details about crash dumps.
+	  So if you are using bzImage for capturing the crash dump,
+	  leave the value here unchanged to 0x1000000 and set
+	  CONFIG_RELOCATABLE=y.  Otherwise if you plan to use vmlinux
+	  for capturing the crash dump change this value to start of
+	  the reserved region.  In other words, it can be set based on
+	  the "X" value as specified in the "crashkernel=YM@XM"
+	  command line boot parameter passed to the panic-ed
+	  kernel. Please take a look at Documentation/kdump/kdump.txt
+	  for more details about crash dumps.
 
 	  Usage of bzImage for capturing the crash dump is recommended as
 	  one does not have to build two kernels. Same kernel can be used
@@ -1521,9 +1519,8 @@ config X86_NEED_RELOCS
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-	default "0x100000" if X86_32
-	default "0x200000" if X86_64
-	range 0x2000 0x400000
+	default "0x1000000"
+	range 0x2000 0x1000000
 	---help---
 	  This value puts the alignment restrictions on physical address
 	  where kernel is loaded and run from. Kernel is compiled for an
-- 
cgit v0.10.2


From 26717808f93a27c22d4853c4fb17fa225f4ccc68 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 7 May 2009 14:19:34 -0700
Subject: x86: make CONFIG_RELOCATABLE the default

Remove the EXPERIMENTAL tag from CONFIG_RELOCATABLE and make it the
default.  Relocatable kernels have been used for a while now, and
should now have identical semantics to non-relocatable kernels when
loaded by a non-relocating bootloader.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 50fbb47..3e0f80a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1495,8 +1495,8 @@ config PHYSICAL_START
 	  Don't change this unless you know what you are doing.
 
 config RELOCATABLE
-	bool "Build a relocatable kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Build a relocatable kernel"
+	default y
 	---help---
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded someplace besides the default 1MB.
-- 
cgit v0.10.2


From c4a994645d04d5fa6bfa52a3204af87dd92168d5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:20:12 -0700
Subject: x86, defconfig: update to current, no material changes

Update defconfigs to reflect current configuration files.  No other
changes.

[ Impact: updates defconfigs to match what "make defconfig" generates ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d..70036a7 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:19:47 2009
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
 # CONFIG_X86_64 is not set
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 # CONFIG_AUDIT_ARCH is not set
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_32_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
 CONFIG_KTIME_SCALAR=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
 # CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
 # CONFIG_GENERIC_CPU is not set
 CONFIG_X86_GENERIC=y
 CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_X86_XADD=y
 # CONFIG_X86_PPRO_FENCE is not set
 CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_CYRIX_32=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_CPU_SUP_TRANSMETA_32=y
 CONFIG_CPU_SUP_UMC_32=y
 CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 # CONFIG_NOHIGHMEM is not set
 CONFIG_HIGHMEM4G=y
 # CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_HIGHPTE=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -363,7 +384,6 @@ CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -425,6 +445,7 @@ CONFIG_PCI_BIOS=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
 CONFIG_PCIEPORTBUS=y
 # CONFIG_HOTPLUG_PCI_PCIE is not set
 CONFIG_PCIEAER=y
@@ -435,6 +456,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 # CONFIG_ISA is not set
 # CONFIG_MCA is not set
@@ -481,7 +503,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +660,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +718,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +729,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +810,7 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
 # CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -842,6 +864,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +963,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +1001,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1052,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1067,7 @@ CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1077,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1087,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_IBMLS is not set
@@ -1073,8 +1103,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1084,21 +1114,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1239,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1335,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 CONFIG_HW_RANDOM_INTEL=y
 CONFIG_HW_RANDOM_AMD=y
 CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1423,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1456,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1466,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1482,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1680,6 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_3DFX is not set
 # CONFIG_FB_VOODOO1 is not set
 # CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
 # CONFIG_FB_TRIDENT is not set
 # CONFIG_FB_ARK is not set
 # CONFIG_FB_PM3 is not set
@@ -1652,6 +1688,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1775,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1850,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1926,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1972,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1987,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1958,8 +1999,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1969,6 +2012,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2037,6 +2084,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2119,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2150,11 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=y
@@ -2151,6 +2205,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2164,7 +2219,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2251,6 +2305,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2266,6 +2321,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_HIGHMEM is not set
 CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2345,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2305,13 +2367,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2321,7 +2391,6 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2398,7 @@ CONFIG_DEBUG_RODATA=y
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_4KSTACKS is not set
 CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2434,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2380,10 +2451,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2529,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2467,11 +2541,13 @@ CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_GEODE is not set
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_LGUEST is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2489,7 +2565,10 @@ CONFIG_CRC32=y
 # CONFIG_LIBCRC32C is not set
 CONFIG_AUDIT_GENERIC=y
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d21..f3e53e2 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:19:24 2009
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
 CONFIG_X86_64=y
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_AUDIT_ARCH=y
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_64_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
 # CONFIG_KTIME_SCALAR is not set
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 CONFIG_BLOCK_COMPAT=y
@@ -196,11 +210,10 @@ CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
 # CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,10 +243,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_MCORE2 is not set
 CONFIG_GENERIC_CPU=y
 CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
 CONFIG_X86_WP_WORKS_OK=y
 CONFIG_X86_TSC=y
 CONFIG_X86_CMPXCHG64=y
@@ -242,7 +255,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
 CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_X86_DS=y
 CONFIG_X86_PTRACE_BTS=y
 CONFIG_HPET_TIMER=y
@@ -269,6 +282,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
 CONFIG_X86_MCE=y
 CONFIG_X86_MCE_INTEL=y
 CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
 # CONFIG_I8K is not set
 CONFIG_MICROCODE=y
 CONFIG_MICROCODE_INTEL=y
@@ -276,6 +290,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
 CONFIG_DIRECT_GBPAGES=y
 CONFIG_NUMA=y
@@ -309,6 +324,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
 CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +334,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -325,9 +343,10 @@ CONFIG_HZ=1000
 CONFIG_SCHED_HRTICK=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
 # CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +389,6 @@ CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -436,6 +454,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 CONFIG_K8_NB=y
 CONFIG_PCCARD=y
@@ -481,7 +500,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +657,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +715,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +726,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +806,8 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_TIFM_CORE is not set
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
 # CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -844,6 +861,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +958,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +996,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1047,7 @@ CONFIG_E1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1062,7 @@ CONFIG_TIGON3=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1072,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1082,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_3C359 is not set
@@ -1072,8 +1097,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1083,21 +1108,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1233,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1328,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 # CONFIG_HW_RANDOM_INTEL is not set
 # CONFIG_HW_RANDOM_AMD is not set
 CONFIG_NVRAM=y
@@ -1382,7 +1410,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1443,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1453,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1469,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1667,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1753,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1827,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1903,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1949,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1964,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1939,8 +1976,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1950,6 +1989,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2018,6 +2061,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2095,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2127,11 @@ CONFIG_AUTOFS4_FS=y
 CONFIG_GENERIC_ACL=y
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=y
@@ -2132,6 +2182,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2145,7 +2196,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2232,6 +2282,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2247,6 +2298,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2321,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2285,13 +2343,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2301,14 +2367,13 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2409,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2359,10 +2426,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2483,7 @@ CONFIG_CRYPTO_SHA1=y
 #
 CONFIG_CRYPTO_AES=y
 # CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
 # CONFIG_CRYPTO_ANUBIS is not set
 CONFIG_CRYPTO_ARC4=y
 # CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2505,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2444,10 +2515,12 @@ CONFIG_CRYPTO_DES=y
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2464,7 +2537,10 @@ CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
 # CONFIG_LIBCRC32C is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
-- 
cgit v0.10.2


From fe83fcc0a14dcf71996de5eb84771b2369ba7abc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:23:16 -0700
Subject: x86, defconfig: update kernel position parameters

Update CONFIG_RELOCATABLE, CONFIG_PHYSICAL_START and
CONFIG_PHYSICAL_ALIGN to reflect the current defaults.

[ Impact: make defconfig match Kconfig defaults ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 70036a7..edb992e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:19:47 2009
+# Mon May 11 16:21:55 2009
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
@@ -343,8 +343,9 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f3e53e2..4ba7d4e 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:19:24 2009
+# Mon May 11 16:22:00 2009
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
@@ -345,7 +345,7 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
+CONFIG_RELOCATABLE=y
 CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
-- 
cgit v0.10.2


From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 7 May 2009 16:54:11 -0700
Subject: x86: add extension fields for bootloader type and version

A long ago, in days of yore, it all began with a god named Thor.
There were vikings and boats and some plans for a Linux kernel
header.  Unfortunately, a single 8-bit field was used for bootloader
type and version.  This has generally worked without *too* much pain,
but we're getting close to flat running out of ID fields.

Add extension fields for both type and version.  The type will be
extended if it the old field is 0xE; the version is a simple MSB
extension.

Keep /proc/sys/kernel/bootloader_type containing
(type << 4) + (ver & 0xf) for backwards compatiblity, but also add
/proc/sys/kernel/bootloader_version which contains the full version
number.

[ Impact: new feature to support more bootloaders ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index cf8dfc7..8da3a79 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,10 +50,9 @@ Protocol 2.08:	(Kernel 2.6.26) Added crc32 checksum and ELF format
 Protocol 2.09:	(Kernel 2.6.26) Added a field of 64-bit physical
 		pointer to single linked list of struct	setup_data.
 
-Protocol 2.10:	(Kernel 2.6.31) A protocol for relaxed alignment
+Protocol 2.10:	(Kernel 2.6.31) Added a protocol for relaxed alignment
 		beyond the kernel_alignment added, new init_size and
-		pref_address fields.
-
+		pref_address fields.  Added extended boot loader IDs.
 
 **** MEMORY LAYOUT
 
@@ -173,7 +172,8 @@ Offset	Proto	Name		Meaning
 021C/4	2.00+	ramdisk_size	initrd size (set by boot loader)
 0220/4	2.00+	bootsect_kludge	DO NOT USE - for bootsect.S use only
 0224/2	2.01+	heap_end_ptr	Free memory after setup end
-0226/2	N/A	pad1		Unused
+0226/1	2.02+(3 ext_loader_ver	Extended boot loader version
+0227/1	2.02+(3	ext_loader_type	Extended boot loader ID
 0228/4	2.02+	cmd_line_ptr	32-bit pointer to the kernel command line
 022C/4	2.03+	ramdisk_max	Highest legal initrd address
 0230/4	2.05+	kernel_alignment Physical addr alignment required for kernel
@@ -197,6 +197,8 @@ Offset	Proto	Name		Meaning
     field are unusable, which means the size of a bzImage kernel
     cannot be determined.
 
+(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
+
 If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
 the boot protocol version is "old".  Loading an old kernel, the
 following parameters should be assumed:
@@ -350,18 +352,32 @@ Protocol:	2.00+
   0xTV here, where T is an identifier for the boot loader and V is
   a version number.  Otherwise, enter 0xFF here.
 
+  For boot loader IDs above T = 0xD, write T = 0xE to this field and
+  write the extended ID minus 0x10 to the ext_loader_type field.
+  Similarly, the ext_loader_ver field can be used to provide more than
+  four bits for the bootloader version.
+
+  For example, for T = 0x15, V = 0x234, write:
+
+  type_of_loader  <- 0xE4
+  ext_loader_type <- 0x05
+  ext_loader_ver  <- 0x23
+
   Assigned boot loader ids:
 	0  LILO			(0x00 reserved for pre-2.00 bootloader)
 	1  Loadlin
 	2  bootsect-loader	(0x20, all other values reserved)
-	3  SYSLINUX
-	4  EtherBoot
+	3  Syslinux
+	4  Etherboot/gPXE
 	5  ELILO
 	7  GRUB
-	8  U-BOOT
+	8  U-Boot
 	9  Xen
 	A  Gujin
 	B  Qemu
+	C  Arcturus Networks uCbootloader
+	E  Extended		(see ext_loader_type)
+	F  Special		(0xFF = undefined)
 
   Please contact <hpa@zytor.com> if you need a bootloader ID
   value assigned.
@@ -460,6 +476,35 @@ Protocol:	2.01+
   Set this field to the offset (from the beginning of the real-mode
   code) of the end of the setup stack/heap, minus 0x0200.
 
+Field name:	ext_loader_ver
+Type:		write (optional)
+Offset/size:	0x226/1
+Protocol:	2.02+
+
+  This field is used as an extension of the version number in the
+  type_of_loader field.  The total version number is considered to be
+  (type_of_loader & 0x0f) + (ext_loader_ver << 4).
+
+  The use of this field is boot loader specific.  If not written, it
+  is zero.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
+Field name:	ext_loader_type
+Type:		write (obligatory if (type_of_loader & 0xf0) == 0xe0)
+Offset/size:	0x227/1
+Protocol:	2.02+
+
+  This field is used as an extension of the type number in
+  type_of_loader field.  If the type in type_of_loader is 0xE, then
+  the actual type is (ext_loader_type + 0x10).
+
+  This field is ignored if the type in type_of_loader is not 0xE.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
 Field name:	cmd_line_ptr
 Type:		write (obligatory)
 Offset/size:	0x228/4
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index a0b4269..68c3bfb 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -169,7 +169,11 @@ heap_end_ptr:	.word	_end+STACK_SIZE-512
 					# end of setup code can be used by setup
 					# for local heap purposes.
 
-pad1:		.word	0
+ext_loader_ver:
+		.byte	0		# Extended boot loader version
+ext_loader_type:
+		.byte	0		# Extended boot loader type
+
 cmd_line_ptr:	.long	0		# (Header version 0x0202 or later)
 					# If nonzero, a 32-bit pointer
 					# to the kernel command line.
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adae..1724e8d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
 	__u32	ramdisk_size;
 	__u32	bootsect_kludge;
 	__u16	heap_end_ptr;
-	__u16	_pad1;
+	__u8	ext_loader_ver;
+	__u8	ext_loader_type;
 	__u32	cmd_line_ptr;
 	__u32	initrd_addr_max;
 	__u32	kernel_alignment;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fcf4d92..6384d25 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -814,6 +814,7 @@ extern unsigned int		BIOS_revision;
 
 /* Boot loader type from the setup header: */
 extern int			bootloader_type;
+extern int			bootloader_version;
 
 extern char			ignore_fpu_irq;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b415843..2b09345 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -214,8 +214,8 @@ unsigned long mmu_cr4_features;
 unsigned long mmu_cr4_features = X86_CR4_PAE;
 #endif
 
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
 
 /*
  * Setup options
@@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
+	if ((bootloader_type >> 4) == 0xe) {
+		bootloader_type &= 0xf;
+		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+	}
+	bootloader_version  = bootloader_type & 0xf;
+	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
 
 #ifdef CONFIG_BLK_DEV_RAM
 	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7d..cf91c93 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -729,6 +729,14 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "bootloader_version",
+		.data		= &bootloader_version,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "kstack_depth_to_print",
 		.data		= &kstack_depth_to_print,
 		.maxlen		= sizeof(int),
-- 
cgit v0.10.2


From 37bcbf13d32e4e453e9def79ee72bd953b88302f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:10 -0400
Subject: IMA: use current_cred() instead of current->cred

Proper invocation of the current credentials is to use current_cred() not
current->cred.  This patches makes IMA use the new method.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index b628eea..ff513ff 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -41,7 +41,7 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
 
 	ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno);
 	audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u ses=%u",
-			 current->pid, current->cred->uid,
+			 current->pid, current_cred()->uid,
 			 audit_get_loginuid(current),
 			 audit_get_sessionid(current));
 	audit_log_task_context(ab);
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 122f17fc..cdae13c 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -184,7 +184,7 @@ int ima_path_check(struct path *path, int mask)
 		struct dentry *dentry = dget(path->dentry);
 		struct vfsmount *mnt = mntget(path->mnt);
 
-		file = dentry_open(dentry, mnt, O_RDONLY, current->cred);
+		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
 		rc = get_path_measurement(iint, file, dentry->d_name.name);
 	}
 out:
-- 
cgit v0.10.2


From f06dd16a03f6f7f72fab4db03be36e28c28c6fd6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:16 -0400
Subject: IMA: Handle dentry_open failures

Currently IMA does not handle failures from dentry_open().  This means that we
leave a pointer set to ERR_PTR(errno) and then try to use it just a few lines
later in fput().  Oops.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index cdae13c..1987424 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -116,10 +116,6 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
 {
 	int rc = 0;
 
-	if (IS_ERR(file)) {
-		pr_info("%s dentry_open failed\n", filename);
-		return rc;
-	}
 	iint->opencount++;
 	iint->readcount++;
 
@@ -185,6 +181,12 @@ int ima_path_check(struct path *path, int mask)
 		struct vfsmount *mnt = mntget(path->mnt);
 
 		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+		if (IS_ERR(file)) {
+			pr_info("%s dentry_open failed\n", dentry->d_name.name);
+			rc = PTR_ERR(file);
+			file = NULL;
+			goto out;
+		}
 		rc = get_path_measurement(iint, file, dentry->d_name.name);
 	}
 out:
-- 
cgit v0.10.2


From 1a62e958fa4aaeeb752311b4f5e16b2a86737b23 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:22 -0400
Subject: IMA: open all files O_LARGEFILE

If IMA tried to measure a file which was larger than 4G dentry_open would fail
with -EOVERFLOW since IMA wasn't passing O_LARGEFILE.  This patch passes
O_LARGEFILE to all IMA opens to avoid this problem.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 1987424..c4228c0 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -180,7 +180,8 @@ int ima_path_check(struct path *path, int mask)
 		struct dentry *dentry = dget(path->dentry);
 		struct vfsmount *mnt = mntget(path->mnt);
 
-		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+		file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE,
+				   current_cred());
 		if (IS_ERR(file)) {
 			pr_info("%s dentry_open failed\n", dentry->d_name.name);
 			rc = PTR_ERR(file);
-- 
cgit v0.10.2


From d93e4c940f51ae06b59c14523c4d55947f9597d6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 20:47:15 -0400
Subject: securityfs: securityfs_remove should handle IS_ERR pointers

Both of the securityfs users (TPM and IMA) can call securityfs_remove and pass
an IS_ERR(dentry) in their failure paths.  This patch handles those rather
than panicing when it tries to start deferencing some negative memory.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/inode.c b/security/inode.c
index f3b91bf..f7496c6 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -287,7 +287,7 @@ void securityfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
 
-	if (!dentry)
+	if (!dentry || IS_ERR(dentry))
 		return;
 
 	parent = dentry->d_parent;
-- 
cgit v0.10.2


From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:08:00 -0400
Subject: ring-buffer: small optimizations

Doing some small changes in the fast path of the ring buffer recording
saves over 3% in the ring-buffer-benchmark test.

[ Impact: a little faster ring buffer recording ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 493cba4..f452de2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static int
+static inline int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	     struct ring_buffer_event *event)
 {
@@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * also be made. But only the entry that did the actual
 	 * commit will be something other than zero.
 	 */
-	if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
-	    rb_page_write(cpu_buffer->tail_page) ==
-	    rb_commit_index(cpu_buffer)) {
+	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+		   rb_page_write(cpu_buffer->tail_page) ==
+		   rb_commit_index(cpu_buffer))) {
 
 		delta = ts - cpu_buffer->write_stamp;
 
@@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		if (unlikely(ts < cpu_buffer->write_stamp))
 			delta = 0;
 
-		if (test_time_stamp(delta)) {
+		else if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
 
@@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If the timestamp was commited, make the commit our entry
 	 * now so that we will update it when needed.
 	 */
-	if (commit)
+	if (unlikely(commit))
 		rb_set_commit_event(cpu_buffer, event);
 	else if (!rb_is_commit(cpu_buffer, event))
 		delta = 0;
-- 
cgit v0.10.2


From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:28:23 -0400
Subject: ring-buffer: use internal time stamp function

The ring_buffer_time_stamp that is exported adds a little more overhead
than is needed for using it internally. This patch adds an internal
timestamp function that can be inlined (a single line function)
and used internally for the ring buffer.

[ Impact: a little less overhead to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f452de2..a9e645a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -454,13 +454,18 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	return buffer->clock() << DEBUG_SHIFT;
+}
+
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 {
 	u64 time;
 
 	preempt_disable_notrace();
-	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = buffer->clock() << DEBUG_SHIFT;
+	time = rb_time_stamp(buffer, cpu);
 	preempt_enable_no_resched_notrace();
 
 	return time;
@@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->tail_page = next_page;
 
 		/* reread the time stamp */
-		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		*ts = rb_time_stamp(buffer, cpu_buffer->cpu);
 		cpu_buffer->tail_page->page->time_stamp = *ts;
 	}
 
@@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
 
-	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
 	/*
 	 * Only the first commit can update the timestamp.
-- 
cgit v0.10.2


From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 22:11:05 -0400
Subject: ring-buffer: move code around to remove some branches

This is a bit of micro-optimizations. But since the ring buffer is used
in tracing every function call, it is an extreme hot path. Every nanosecond
counts.

This change shows over 5% improvement in the ring-buffer-benchmark.

[ Impact: more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9e645a..16b24d4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		      unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta;
+	u64 ts, delta = 0;
 	int commit = 0;
 	int nr_loops = 0;
 
@@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
 		   rb_page_write(cpu_buffer->tail_page) ==
 		   rb_commit_index(cpu_buffer))) {
+		u64 diff;
 
-		delta = ts - cpu_buffer->write_stamp;
+		diff = ts - cpu_buffer->write_stamp;
 
-		/* make sure this delta is calculated here */
+		/* make sure this diff is calculated here */
 		barrier();
 
 		/* Did the write stamp get updated already? */
 		if (unlikely(ts < cpu_buffer->write_stamp))
-			delta = 0;
+			goto get_event;
 
-		else if (unlikely(test_time_stamp(delta))) {
+		delta = diff;
+		if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
 			if (commit == -EBUSY)
 				return NULL;
 
@@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 			RB_WARN_ON(cpu_buffer, commit < 0);
 		}
-	} else
-		/* Non commits have zero deltas */
-		delta = 0;
+	}
 
+ get_event:
 	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
-	if (PTR_ERR(event) == -EAGAIN)
+	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
 	if (!event) {
-- 
cgit v0.10.2


From bc8e67409ccdcff72c3f1656b1fb1aad7ff396db Mon Sep 17 00:00:00 2001
From: Vincent Minet <vincent@vincent-minet.net>
Date: Fri, 15 May 2009 08:33:18 -0400
Subject: ext4: Fix spinlock assertions on UP systems

On UP systems without DEBUG_SPINLOCK, ext4_is_group_locked always fails
which triggers a BUG_ON() call.
This patch fixes it by using assert_spin_locked instead.

Signed-off-by: Vincent Minet <vincent@vincent-minet.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 149e02d..89190ae 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1583,12 +1583,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
 	spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
-static inline int ext4_is_group_locked(struct super_block *sb,
-					ext4_group_t group)
-{
-	return spin_is_locked(ext4_group_lock_ptr(sb, group));
-}
-
 /*
  * Inodes and files operations
  */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e76459c..541bd9a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -436,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 
 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 		return;
-	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 	for (i = 0; i < count; i++) {
 		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 			ext4_fsblk_t blocknr;
@@ -460,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 
 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 		return;
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	for (i = 0; i < count; i++) {
 		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -1115,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	struct super_block *sb = e4b->bd_sb;
 
 	BUG_ON(first + count > (sb->s_blocksize << 3));
-	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
 
@@ -1196,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
 	int ord;
 	void *buddy;
 
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	BUG_ON(ex == NULL);
 
 	buddy = mb_find_buddy(e4b, order, &max);
@@ -1260,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 
 	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
 	BUG_ON(e4b->bd_group != ex->fe_group);
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_mark_used_double(e4b, start, len);
 
-- 
cgit v0.10.2


From f888e652d758bfe0c04c209b72a05972daeba386 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 00:21:29 -0400
Subject: ext4: Simplify function signature for ext4_da_get_block_write()

The function ext4_da_get_block_write() is called in exactly one write,
and the last argument, create, is always 1.  Remove it to simplify the
code slightly.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4e7f363..476d843 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2000,7 +2000,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 
 #define		EXT4_DELALLOC_RSVED	1
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
+				   struct buffer_head *bh_result)
 {
 	int ret;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -2010,7 +2010,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+				   bh_result, 1, 0, EXT4_DELALLOC_RSVED);
 	if (ret <= 0)
 		return ret;
 
@@ -2088,7 +2088,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (!new.b_size)
 		return 0;
 
-	err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+	err = ext4_da_get_block_write(mpd->inode, next, &new);
 	if (err) {
 		/*
 		 * If get block returns with error we simply
-- 
cgit v0.10.2


From e4d996ca806e93dddb5d76c0d3d859b494c559f6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 00:25:28 -0400
Subject: ext4: Rename ext4_get_blocks_handle() to be ext4_ind_get_blocks()

The static function ext4_get_blocks_handle() is badly named.  Of
*course* it takes a handle.  Since its counterpart for extent-based
file is ext4_ext_get_blocks(), rename it to be ext4_ind_get_blocks().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 476d843..f758e80 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -914,7 +914,7 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
-static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
 				  struct buffer_head *bh_result,
 				  int create, int extend_disksize)
@@ -1129,7 +1129,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * mapped.
  *
  * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1160,8 +1160,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, 0, 0);
 	} else {
-		retval = ext4_get_blocks_handle(handle,
-				inode, block, max_blocks, bh, 0, 0);
+		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+					     bh, 0, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
@@ -1215,7 +1215,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, create, extend_disksize);
 	} else {
-		retval = ext4_get_blocks_handle(handle, inode, block,
+		retval = ext4_ind_get_blocks(handle, inode, block,
 				max_blocks, bh, create, extend_disksize);
 
 		if (retval > 0 && buffer_new(bh)) {
@@ -1297,7 +1297,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	err = ext4_get_blocks_wrap(handle, inode, block, 1,
 					&dummy, create, 1, 0);
 	/*
-	 * ext4_get_blocks_handle() returns number of blocks
+	 * ext4_get_blocks_wrap() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
-- 
cgit v0.10.2


From e458824f9d32e9bf7700d1eb0d201749af48eee0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 May 2009 08:49:32 +0200
Subject: scsi: fix resid_len mis-conversion in scsi_end_request()

Commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f introduced
rq->data_len and converted residual count users to it.  While
converting, it mistakenly converted scsi_end_request() to finish
requests with residual count when it wants to do is fully complete the
request.  Fix it by using blk_end_request_all() instead.

This bug was spotted by Boaz Harrosh.

Signed-off-by: Tejun Heo <tj@kernel.org>
Spotted-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a54bec9..e410d66 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,14 +546,9 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = blk_rq_bytes(req);
-
-		if (blk_pc_request(req))
-			leftover = req->resid_len;
-
 		/* kill remainder if no retrys */
 		if (error && scsi_noretry_cmd(cmd))
-			blk_end_request(req, error, leftover);
+			blk_end_request_all(req, error);
 		else {
 			if (requeue) {
 				/*
-- 
cgit v0.10.2


From f3f8290cb3fa4aa627321530bb85d5f35e487433 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 16:07:40 +0900
Subject: sh: clkfwk: Handle clk_get_sys() returning an ERR_PTR.

clk_get() needs to also perform an IS_ERR() check to see whether
clk_get_sys() failed or not.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 34707b8..9e1fc13 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -399,7 +399,7 @@ struct clk *clk_get(struct device *dev, const char *id)
 	int idno;
 
 	clk = clk_get_sys(dev_id, id);
-	if (clk)
+	if (clk && !IS_ERR(clk))
 		return clk;
 
 	if (dev == NULL || dev->bus != &platform_bus_type)
-- 
cgit v0.10.2


From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Mon, 11 May 2009 23:48:27 +0200
Subject: x86: microcode: use smp_call_function_single instead of
 set_cpus_allowed, cleanup of synchronization logic

* Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1
  in a way that doesn't resort to set_cpus_allowed();

* in fact, only collect_cpu_info and apply_microcode callbacks
  must run on a target cpu, others will do just fine on any other.
  smp_call_function_single() (as suggested by Ingo) is used to run
  these callbacks on a target cpu.

* cleanup of synchronization logic of the 'microcode_core' part

  The generic 'microcode_core' part guarantees that only a single cpu
  (be it a full-fledged cpu, one of the cores or HT)
  is being updated at any particular moment of time.

  In general, there is no need for any additional sync. mechanism in
  arch-specific parts (the patch removes existing spinlocks).

  See also the "Synchronization" section in microcode_core.c.

* return -EINVAL instead of -1 (which is translated into -EPERM) in
  microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions
  for an error code?

* use 'enum ucode_state' as return value of request_microcode_{fw, user}
  to gain more flexibility by distinguishing between real error cases
  and situations when an appropriate ucode was not found (which is not an
  error per-se).

* some minor cleanups

Thanks a lot to Hugh Dickins for review/suggestions/testing!

   Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2

[ Impact: refactor and clean up microcode driver locking code ]

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Oruba <peter.oruba@amd.com>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <1242078507.5560.9.camel@earth>
[ did some more cleanups ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
 arch/x86/include/asm/microcode.h  |   25 ++
 arch/x86/kernel/microcode_amd.c   |   58 ++----
 arch/x86/kernel/microcode_core.c  |  326 +++++++++++++++++++++-----------------
 arch/x86/kernel/microcode_intel.c |   92 +++-------
 4 files changed, 261 insertions(+), 240 deletions(-)

(~20 new comment lines)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664..ef51b50 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
 
 struct device;
 
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
 struct microcode_ops {
-	int  (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
-	int  (*request_microcode_fw) (int cpu, struct device *device);
+	enum ucode_state (*request_microcode_user) (int cpu,
+				const void __user *buf, size_t size);
 
-	void (*apply_microcode) (int cpu);
+	enum ucode_state (*request_microcode_fw) (int cpu,
+				struct device *device);
 
-	int  (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 	void (*microcode_fini_cpu) (int cpu);
+
+	/*
+	 * The generic 'microcode_core' part guarantees that
+	 * the callbacks below run on a target cpu when they
+	 * are being called.
+	 * See also the "Synchronization" section in microcode_core.c.
+	 */
+	int (*apply_microcode) (int cpu);
+	int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 };
 
 struct ucode_cpu_info {
-	struct cpu_signature cpu_sig;
-	int valid;
-	void *mc;
+	struct cpu_signature	cpu_sig;
+	int			valid;
+	void			*mc;
 };
 extern struct ucode_cpu_info ucode_cpu_info[];
 
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b579..c8be20f 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
  *  Licensed under the terms of the GNU General Public
  *  License version 2. See file COPYING for details.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
 #include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
 #define UCODE_CONTAINER_SECTION_HDR	8
 #define UCODE_CONTAINER_HEADER_SIZE	12
 
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static struct equiv_cpu_entry *equiv_cpu_table;
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 	return 1;
 }
 
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
-	unsigned long flags;
 	u32 rev, dummy;
 	int cpu_num = raw_smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
 	BUG_ON(cpu_num != cpu);
 
 	if (mc_amd == NULL)
-		return;
+		return 0;
 
-	spin_lock_irqsave(&microcode_update_lock, flags);
 	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 	/* get patch id after patching */
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 
 	/* check current patch id and patch's id for match */
 	if (rev != mc_amd->hdr.patch_id) {
 		printk(KERN_ERR "microcode: CPU%d: update failed "
 		       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
-		return;
+		return -1;
 	}
 
 	printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
 	       cpu, rev);
 
 	uci->cpu_sig.rev = rev;
+
+	return 0;
 }
 
 static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void)
 	}
 }
 
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	const u8 *ucode_ptr = data;
@@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover;
 	unsigned long offset;
+	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
 	if (!offset) {
 		printk(KERN_ERR "microcode: failed to create "
 		       "equivalent cpu table\n");
-		return -EINVAL;
+		return UCODE_ERROR;
 	}
 
 	ucode_ptr += offset;
@@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 			pr_debug("microcode: CPU%d found a matching microcode "
 				 "update with version 0x%x (current=0x%x)\n",
 				 cpu, new_rev, uci->cpu_sig.rev);
-		} else
+		} else {
 			vfree(new_mc);
-	}
+			state = UCODE_ERROR;
+		}
+	} else
+		state = UCODE_NFOUND;
 
 	free_equiv_cpu_table();
 
-	return (int)leftover;
+	return state;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
 	const struct firmware *firmware;
-	int ret;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
+	enum ucode_state ret;
 
-	ret = request_firmware(&firmware, fw_name, device);
-	if (ret) {
+	if (request_firmware(&firmware, fw_name, device)) {
 		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-		return ret;
+		return UCODE_NFOUND;
 	}
 
 	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device)
 	return ret;
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
 	printk(KERN_INFO "microcode: AMD microcode update via "
 	       "/dev/cpu/microcode not supported\n");
-	return -1;
+	return UCODE_ERROR;
 }
 
 static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c..9c44615 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
  *		Thanks to Stuart Swales for pointing out this bug.
  */
 #include <linux/platform_device.h>
-#include <linux/capability.h>
 #include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
 #include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
-#include <asm/msr.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
 
 static struct microcode_ops	*microcode_ops;
 
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
 static DEFINE_MUTEX(microcode_mutex);
 
 struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+	struct cpu_signature	*cpu_sig;
+	int			err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+	struct cpu_info_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+						   ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	int ret;
+
+	memset(uci, 0, sizeof(*uci));
+
+	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+	if (!ret)
+		uci->valid = 1;
+
+	return ret;
+}
+
+struct apply_microcode_ctx {
+	int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+	struct apply_microcode_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+	struct apply_microcode_ctx ctx = { .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
 static int do_microcode_update(const void __user *buf, size_t size)
 {
-	cpumask_t old;
 	int error = 0;
 	int cpu;
 
-	old = current->cpus_allowed;
-
 	for_each_online_cpu(cpu) {
 		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+		enum ucode_state ustate;
 
 		if (!uci->valid)
 			continue;
 
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-		error = microcode_ops->request_microcode_user(cpu, buf, size);
-		if (error < 0)
-			goto out;
-		if (!error)
-			microcode_ops->apply_microcode(cpu);
+		ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+		if (ustate == UCODE_ERROR) {
+			error = -1;
+			break;
+		} else if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
 	}
-out:
-	set_cpus_allowed_ptr(current, &old);
+
 	return error;
 }
 
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
 static ssize_t microcode_write(struct file *file, const char __user *buf,
 			       size_t len, loff_t *ppos)
 {
-	ssize_t ret;
+	ssize_t ret = -EINVAL;
 
 	if ((len >> PAGE_SHIFT) > num_physpages) {
-		printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
-		       num_physpages);
-		return -EINVAL;
+		pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+		return ret;
 	}
 
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	ret = do_microcode_update(buf, len);
-	if (!ret)
+	if (do_microcode_update(buf, len) == 0)
 		ret = (ssize_t)len;
 
 	mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 }
 
 static const struct file_operations microcode_fops = {
-	.owner		= THIS_MODULE,
-	.write		= microcode_write,
-	.open		= microcode_open,
+	.owner			= THIS_MODULE,
+	.write			= microcode_write,
+	.open			= microcode_open,
 };
 
 static struct miscdevice microcode_dev = {
-	.minor		= MICROCODE_MINOR,
-	.name		= "microcode",
-	.fops		= &microcode_fops,
+	.minor			= MICROCODE_MINOR,
+	.name			= "microcode",
+	.fops			= &microcode_fops,
 };
 
 static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
 
 	error = misc_register(&microcode_dev);
 	if (error) {
-		printk(KERN_ERR
-			"microcode: can't misc_register on minor=%d\n",
-			MICROCODE_MINOR);
+		pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
 		return error;
 	}
 
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
 /* fake device for request_firmware */
 static struct platform_device	*microcode_pdev;
 
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	int err = 0;
 
 	mutex_lock(&microcode_mutex);
 	if (uci->valid) {
-		err = microcode_ops->request_microcode_fw(smp_processor_id(),
-							  &microcode_pdev->dev);
-		if (!err)
-			microcode_ops->apply_microcode(smp_processor_id());
+		enum ucode_state ustate;
+
+		ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+		if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
+		else
+			if (ustate == UCODE_ERROR)
+				err = -EINVAL;
 	}
 	mutex_unlock(&microcode_mutex);
+
 	return err;
 }
 
 static ssize_t reload_store(struct sys_device *dev,
 			    struct sysdev_attribute *attr,
-			    const char *buf, size_t sz)
+			    const char *buf, size_t size)
 {
-	char *end;
-	unsigned long val = simple_strtoul(buf, &end, 0);
-	int err = 0;
+	unsigned long val;
 	int cpu = dev->id;
+	int ret = 0;
+	char *end;
 
+	val = simple_strtoul(buf, &end, 0);
 	if (end == buf)
 		return -EINVAL;
+
 	if (val == 1) {
 		get_online_cpus();
 		if (cpu_online(cpu))
-			err = work_on_cpu(cpu, reload_for_cpu, NULL);
+			ret = reload_for_cpu(cpu);
 		put_online_cpus();
 	}
-	if (err)
-		return err;
-	return sz;
+
+	if (!ret)
+		ret = size;
+
+	return ret;
 }
 
 static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
 };
 
 static struct attribute_group mc_attr_group = {
-	.attrs		= mc_default_attrs,
-	.name		= "microcode",
+	.attrs			= mc_default_attrs,
+	.name			= "microcode",
 };
 
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
 	uci->valid = 0;
 }
 
-static void microcode_fini_cpu(int cpu)
-{
-	mutex_lock(&microcode_mutex);
-	__microcode_fini_cpu(cpu);
-	mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-	memset(uci, 0, sizeof(*uci));
-	if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
-		uci->valid = 1;
+	if (!uci->mc)
+		return UCODE_NFOUND;
+
+	pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+	apply_microcode_on_target(cpu);
+
+	return UCODE_OK;
 }
 
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	struct cpu_signature nsig;
+	enum ucode_state ustate;
 
-	pr_debug("microcode: CPU%d resumed\n", cpu);
+	if (collect_cpu_info(cpu))
+		return UCODE_ERROR;
 
-	if (!uci->mc)
-		return 1;
+	/* --dimm. Trigger a delayed update? */
+	if (system_state != SYSTEM_RUNNING)
+		return UCODE_NFOUND;
 
-	/*
-	 * Let's verify that the 'cached' ucode does belong
-	 * to this cpu (a bit of paranoia):
-	 */
-	if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
-		__microcode_fini_cpu(cpu);
-		printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
-				cpu);
-		return -1;
-	}
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
 
-	if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
-		__microcode_fini_cpu(cpu);
-		printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
-				cpu);
-		/* Should we look for a new ucode here? */
-		return 1;
+	if (ustate == UCODE_OK) {
+		pr_debug("microcode: CPU%d updated upon init\n", cpu);
+		apply_microcode_on_target(cpu);
 	}
 
-	return 0;
+	return ustate;
 }
 
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
-	int err = 0;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	enum ucode_state ustate;
 
-	/*
-	 * Check if the system resume is in progress (uci->valid != NULL),
-	 * otherwise just request a firmware:
-	 */
-	if (uci->valid) {
-		err = microcode_resume_cpu(smp_processor_id());
-	} else {
-		collect_cpu_info(smp_processor_id());
-		if (uci->valid && system_state == SYSTEM_RUNNING)
-			err = microcode_ops->request_microcode_fw(
-					smp_processor_id(),
-					&microcode_pdev->dev);
-	}
-	if (!err)
-		microcode_ops->apply_microcode(smp_processor_id());
-	return err;
-}
+	if (uci->valid)
+		ustate = microcode_resume_cpu(cpu);
+	else
+		ustate = microcode_init_cpu(cpu);
 
-static int microcode_init_cpu(int cpu)
-{
-	int err;
-	mutex_lock(&microcode_mutex);
-	err = work_on_cpu(cpu, microcode_update_cpu, NULL);
-	mutex_unlock(&microcode_mutex);
-
-	return err;
+	return ustate;
 }
 
 static int mc_sysdev_add(struct sys_device *sys_dev)
 {
 	int err, cpu = sys_dev->id;
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("microcode: CPU%d added\n", cpu);
-	memset(uci, 0, sizeof(*uci));
 
 	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
 	if (err)
 		return err;
 
-	err = microcode_init_cpu(cpu);
+	if (microcode_init_cpu(cpu) == UCODE_ERROR)
+		err = -EINVAL;
 
 	return err;
 }
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
 static int mc_sysdev_resume(struct sys_device *dev)
 {
 	int cpu = dev->id;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 	if (!cpu_online(cpu))
 		return 0;
 
-	/* only CPU 0 will apply ucode here */
-	microcode_update_cpu(NULL);
+	/*
+	 * All non-bootup cpus are still disabled,
+	 * so only CPU 0 will apply ucode here.
+	 *
+	 * Moreover, there can be no concurrent
+	 * updates from any other places at this point.
+	 */
+	WARN_ON(cpu != 0);
+
+	if (uci->valid && uci->mc)
+		microcode_ops->apply_microcode(cpu);
+
 	return 0;
 }
 
 static struct sysdev_driver mc_sysdev_driver = {
-	.add		= mc_sysdev_add,
-	.remove		= mc_sysdev_remove,
-	.resume		= mc_sysdev_resume,
+	.add			= mc_sysdev_add,
+	.remove			= mc_sysdev_remove,
+	.resume			= mc_sysdev_resume,
 };
 
 static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		if (microcode_init_cpu(cpu))
-			printk(KERN_ERR "microcode: failed to init CPU%d\n",
-			       cpu);
+		microcode_update_cpu(cpu);
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("microcode: CPU%d added\n", cpu);
 		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-			printk(KERN_ERR "microcode: Failed to create the sysfs "
-				"group for CPU%d\n", cpu);
+			pr_err("microcode: Failed to create group for CPU%d\n", cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
 		microcode_ops = init_amd_microcode();
 
 	if (!microcode_ops) {
-		printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+		pr_err("microcode: no support for this CPU vendor\n");
 		return -ENODEV;
 	}
 
-	error = microcode_dev_init();
-	if (error)
-		return error;
 	microcode_pdev = platform_device_register_simple("microcode", -1,
 							 NULL, 0);
 	if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
 	}
 
 	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
 	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
+
 	if (error) {
-		microcode_dev_exit();
 		platform_device_unregister(microcode_pdev);
 		return error;
 	}
 
+	error = microcode_dev_init();
+	if (error)
+		return error;
+
 	register_hotcpu_notifier(&mc_cpu_notifier);
 
-	printk(KERN_INFO
-	       "Microcode Update Driver: v" MICROCODE_VERSION
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
 	       " <tigran@aivazian.fsnet.co.uk>,"
 	       " Peter Oruba\n");
 
 	return 0;
 }
+module_init(microcode_init);
 
 static void __exit microcode_exit(void)
 {
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
 
 	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
 	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
 
 	platform_device_unregister(microcode_pdev);
 
 	microcode_ops = NULL;
 
-	printk(KERN_INFO
-	       "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
 }
-
-module_init(microcode_init);
 module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec..0d334dd 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
  *		Thanks to Stuart Swales for pointing out this bug.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
 
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-	unsigned long flags;
 	unsigned int val[2];
 
 	memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 		csig->pf = 1 << ((val[1] >> 18) & 7);
 	}
 
-	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);
-
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
 	/* see notes above for revision 1.07.  Apparent chip bug */
 	sync_core();
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 
-	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-			csig->sig, csig->pf, csig->rev);
+	printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+			cpu_num, csig->sig, csig->pf, csig->rev);
 
 	return 0;
 }
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
 	return 0;
 }
 
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
 {
 	struct microcode_intel *mc_intel;
 	struct ucode_cpu_info *uci;
-	unsigned long flags;
 	unsigned int val[2];
 	int cpu_num;
 
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
 	BUG_ON(cpu_num != cpu);
 
 	if (mc_intel == NULL)
-		return;
-
-	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);
+		return 0;
 
 	/* write microcode via MSR 0x79 */
 	wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 	if (val[1] != mc_intel->hdr.rev) {
-		printk(KERN_ERR "microcode: CPU%d update from revision "
-				"0x%x to 0x%x failed\n",
-			cpu_num, uci->cpu_sig.rev, val[1]);
-		return;
+		printk(KERN_ERR "microcode: CPU%d update "
+				"to revision 0x%x failed\n",
+			cpu_num, mc_intel->hdr.rev);
+		return -1;
 	}
-	printk(KERN_INFO "microcode: CPU%d updated from revision "
-			 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
-		cpu_num, uci->cpu_sig.rev, val[1],
+	printk(KERN_INFO "microcode: CPU%d updated to revision "
+			 "0x%x, date = %04x-%02x-%02x \n",
+		cpu_num, val[1],
 		mc_intel->hdr.date & 0xffff,
 		mc_intel->hdr.date >> 24,
 		(mc_intel->hdr.date >> 16) & 0xff);
 
 	uci->cpu_sig.rev = val[1];
+
+	return 0;
 }
 
-static int generic_load_microcode(int cpu, void *data, size_t size,
-		int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
+	enum ucode_state state = UCODE_OK;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
 		leftover  -= mc_size;
 	}
 
-	if (!new_mc)
+	if (leftover) {
+		if (new_mc)
+			vfree(new_mc);
+		state = UCODE_ERROR;
 		goto out;
+	}
 
-	if (leftover) {
-		vfree(new_mc);
+	if (!new_mc) {
+		state = UCODE_NFOUND;
 		goto out;
 	}
 
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
 	pr_debug("microcode: CPU%d found a matching microcode update with"
 		 " version 0x%x (current=0x%x)\n",
 			cpu, new_rev, uci->cpu_sig.rev);
-
- out:
-	return (int)leftover;
+out:
+	return state;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
 	return 0;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
 	char name[30];
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	const struct firmware *firmware;
-	int ret;
+	enum ucode_state ret;
 
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
-	ret = request_firmware(&firmware, name, device);
-	if (ret) {
+
+	if (request_firmware(&firmware, name, device)) {
 		pr_debug("microcode: data file %s load failed\n", name);
-		return ret;
+		return UCODE_NFOUND;
 	}
 
 	ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
 	return copy_from_user(to, from, n);
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
-
 	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
-- 
cgit v0.10.2


From 2b1ccc0ee918a653d0483fdad9dd6112ce8e9043 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 12 May 2009 11:11:48 +0200
Subject: splice: fix misleading comment

Splice is tied to pipes by design, it'll not change. And now that
the splice stuff is in splice.h (and note pipe.h), the rest of the comment
is out-of-date as well.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/include/linux/splice.h b/include/linux/splice.h
index 5f3faa9..18e7c7c 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -11,8 +11,7 @@
 #include <linux/pipe_fs_i.h>
 
 /*
- * splice is tied to pipes as a transport (at least for now), so we'll just
- * add the splice flags here.
+ * Flags passed in from splice/tee/vmsplice
  */
 #define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
 #define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */
-- 
cgit v0.10.2


From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 11 May 2009 22:05:28 -0400
Subject: x86: merge process.c a bit

Merge arch_align_stack() and arch_randomize_brk(), since
they are the same.

Tested on x86_64.

[ Impact: cleanup ]

Signed-off-by: Amerigo Wang <amwang@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca98915..2b9a8d0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/random.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -613,3 +614,16 @@ static int __init idle_setup(char *str)
 }
 early_param("idle", idle_setup);
 
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		sp -= get_random_int() % 8192;
+	return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84..a3bb049 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -33,7 +33,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/personality.h>
 #include <linux/tick.h>
 #include <linux/percpu.h>
@@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-		sp -= get_random_int() % 8192;
-	return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long range_end = mm->brk + 0x02000000;
-	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41..34386f7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -32,7 +32,6 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
@@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr)
 	return do_arch_prctl(current, code, addr);
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-		sp -= get_random_int() % 8192;
-	return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long range_end = mm->brk + 0x02000000;
-	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
-- 
cgit v0.10.2


From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 11 May 2009 23:29:09 -0400
Subject: x86: process.c, remove useless headers

<stdarg.h> is not needed by these files, remove them.

[ Impact: cleanup ]

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: akpm@linux-foundation.org
LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3bb049..56d50b7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 34386f7..9d6b20e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
-- 
cgit v0.10.2


From ed077b58f6146684069975122b1728a9d248a501 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 12 May 2009 16:40:00 +0800
Subject: x86: make sparse mem work in non-NUMA mode

With sparse memory, holes should not be marked present for memmap.
This patch makes sure sparsemem really works on SMP mode (!NUMA).

[ Impact: use less memory to map fragmented RAM, avoid boot-OOM/crash ]

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
LKML-Reference: <1242117600.22431.0.camel@sli10-desk.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index fef1d90..949708d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -706,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > max_low_pfn)
 		highstart_pfn = max_low_pfn;
-	memory_present(0, 0, highend_pfn);
 	e820_register_active_regions(0, 0, highend_pfn);
+	sparse_memory_present_with_active_regions(0);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	memory_present(0, 0, max_low_pfn);
 	e820_register_active_regions(0, 0, max_low_pfn);
+	sparse_memory_present_with_active_regions(0);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
-- 
cgit v0.10.2


From 4797f6b021a3fa399942245d07a1feb30df81bb8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 2 May 2009 10:40:57 -0700
Subject: x86: read apic ID in the !acpi_lapic case

Ed found that on 32-bit, boot_cpu_physical_apicid is not read right,
when the mptable is broken.

Interestingly, actually three paths use/set it:

 1. acpi: at that time that is already read from reg
 2. mptable: only read from mptable
 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit

so we could read the apic id for the 2/3 path. We trust the hardware
register more than we trust a BIOS data structure (the mptable).

We can also avoid the double set_fixmap() when acpi_lapic
is used, and also need to move cpu_has_apic earlier and
call apic_disable().

Also when need to update the apic id, we'd better read and
set the apic version as well - so that quirks are applied precisely.

v2: make path 3 with 64bit, use -1 as apic id, so could read it later.
v3: fix whitespace problem pointed out by Ed Swierk
v5: fix boot crash

[ Impact: get correct apic id for bsp other than acpi path ]

Reported-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <49FC85A9.2070702@kernel.org>
[ v4: sanity-check in the ACPI case too ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88..6a84ed1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
 static inline int logical_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+	return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
 #endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 07cffc1..b0fd264 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -242,17 +242,24 @@ static int modern_apic(void)
  * bare function to substitute write operation
  * and it's _that_ fast :)
  */
-void native_apic_write_dummy(u32 reg, u32 v)
+static void native_apic_write_dummy(u32 reg, u32 v)
 {
 	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
 }
 
+static u32 native_apic_read_dummy(u32 reg)
+{
+	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+	return 0;
+}
+
 /*
- * right after this call apic->write doesn't do anything
+ * right after this call apic->write/read doesn't do anything
  * note that there is no restore operation it works one way
  */
 void apic_disable(void)
 {
+	apic->read = native_apic_read_dummy;
 	apic->write = native_apic_write_dummy;
 }
 
@@ -1576,32 +1583,23 @@ void __init init_apic_mappings(void)
 		return;
 	}
 
-	/*
-	 * If no local APIC can be found then set up a fake all
-	 * zeroes page to simulate the local APIC and another
-	 * one for the IO-APIC.
-	 */
+	/* If no local APIC can be found return early */
 	if (!smp_found_config && detect_init_APIC()) {
-		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-		apic_phys = __pa(apic_phys);
-	} else
+		/* lets NOP'ify apic operations */
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+	} else {
 		apic_phys = mp_lapic_addr;
 
-	/*
-	 * acpi lapic path already maps that address in
-	 * acpi_register_lapic_address()
-	 */
-	if (!acpi_lapic)
-		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-
-	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-			APIC_BASE, apic_phys);
+		/*
+		 * acpi lapic path already maps that address in
+		 * acpi_register_lapic_address()
+		 */
+		if (!acpi_lapic)
+			set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
-	/* lets check if we may NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
-		return;
+		apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+					APIC_BASE, apic_phys);
 	}
 
 	/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1f3d366..74d2b48 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1878,6 +1878,11 @@ __apicdebuginit(void) print_PIC(void)
 __apicdebuginit(int) print_all_ICs(void)
 {
 	print_PIC();
+
+	/* don't print out if apic is not there */
+	if (!cpu_has_apic || disable_apic)
+		return 0;
+
 	print_all_local_APICs();
 	print_IO_APIC();
 
-- 
cgit v0.10.2


From 9fe5ee0efb1b1d4a0939bc4252a8427e3337d96a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 19:29:04 +0900
Subject: sh: clkfwk: Use arch_clk_init() for on-chip clock registration.

CPUs registering on-chip clocks should be using arch_clk_init() with the
new scheme so that the CPUs have the opportunity to establish the
topology prior to the initial root clock rate propagation. This ensures
that CPUs with on-chip clocks that use CLK_ENABLE_ON_INIT are properly
enabled at the initial propagation time, without having to further poke
the root clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 9e1fc13..f833843 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -437,13 +437,7 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-void __init __attribute__ ((weak))
-arch_init_clk_ops(struct clk_ops **ops, int type)
-{
-}
-
-int __init __attribute__ ((weak))
-arch_clk_init(void)
+int __init __weak arch_clk_init(void)
 {
 	return 0;
 }
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 435f4f1..a72dc32 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -150,7 +150,7 @@ static struct clk *sh4202_onchip_clocks[] = {
 	&sh4202_shoc_clk,
 };
 
-static int __init sh4202_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -166,4 +166,3 @@ static int __init sh4202_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh4202_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 0110da6..ce3d4e6 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -90,7 +90,7 @@ static struct clk *sh7763_onchip_clocks[] = {
 	&sh7763_shyway_clk,
 };
 
-static int __init sh7763_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -106,4 +106,3 @@ static int __init sh7763_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7763_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 0a22d50..38b8b1d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -96,7 +96,7 @@ static struct clk *sh7780_onchip_clocks[] = {
 	&sh7780_shyway_clk,
 };
 
-static int __init sh7780_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -112,4 +112,3 @@ static int __init sh7780_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7780_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 4dcd1f6..fa14f35 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -134,7 +134,7 @@ static struct clk *sh7785_onchip_clocks[] = {
 	&sh7785_ram_clk,
 };
 
-static int __init sh7785_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -150,4 +150,3 @@ static int __init sh7785_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7785_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 825556f..7907002 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -120,7 +120,7 @@ static struct clk *sh7786_onchip_clocks[] = {
 	&sh7786_ddr_clk,
 };
 
-static int __init sh7786_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -136,4 +136,3 @@ static int __init sh7786_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7786_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 1eb149b..c14553e 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -107,7 +107,7 @@ static struct clk *shx3_onchip_clocks[] = {
 	&shx3_shyway_clk,
 };
 
-static int __init shx3_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -123,4 +123,3 @@ static int __init shx3_clk_init(void)
 
 	return ret;
 }
-arch_initcall(shx3_clk_init);
-- 
cgit v0.10.2


From 0ee8b4d7c7d39d9a9913e27686ef786642c3ccbb Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:13:59 +0000
Subject: sh: TMU platform data for sh7763

This patch adds TMU platform data for sh7763. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index bdf0f61..c91f34c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -12,6 +12,7 @@
 #include <linux/platform_device.h>
 #include <linux/init.h>
 #include <linux/serial.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
 
@@ -113,7 +114,195 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd88008,
+		.end	= 0xffd88013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd88014,
+		.end	= 0xffd8801f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd88020,
+		.end	= 0xffd8802b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct platform_device *sh7763_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&rtc_device,
 	&sci_device,
 	&usb_ohci_device,
@@ -127,6 +316,21 @@ static int __init sh7763_devices_setup(void)
 }
 __initcall(sh7763_devices_setup);
 
+static struct platform_device *sh7763_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7763_early_devices,
+				   ARRAY_SIZE(sh7763_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 5d8728a71f416fd38a0945271c71cb2933fc5734 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:18:13 +0000
Subject: sh: add sh7770_generic_defconfig

This patch adds a generic sh7770 defconfig file.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig
new file mode 100644
index 0000000..d6489b4
--- /dev/null
+++ b/arch/sh/configs/sh7770_generic_defconfig
@@ -0,0 +1,700 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc4
+# Tue May 12 14:48:21 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+# CONFIG_GENERIC_GPIO is not set
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+# CONFIG_ARCH_SUSPEND_POSSIBLE is not set
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_TMU=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+# CONFIG_CGROUP_NS is not set
+# CONFIG_CGROUP_FREEZER is not set
+# CONFIG_CGROUP_DEVICE is not set
+# CONFIG_CPUSETS is not set
+# CONFIG_CGROUP_CPUACCT is not set
+# CONFIG_RESOURCE_COUNTERS is not set
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+# CONFIG_UID16 is not set
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
+# CONFIG_MARKERS is not set
+CONFIG_OPROFILE=y
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+# CONFIG_MODULES is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+CONFIG_FREEZER=y
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+CONFIG_CPU_SUBTYPE_SH7770=y
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x04000000
+CONFIG_29BIT=y
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+CONFIG_SPARSEMEM_STATIC=y
+
+#
+# Memory hotplug is currently incompatible with Software Suspend
+#
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TMU=y
+CONFIG_SH_TIMER_IRQ=16
+CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_TICK_ONESHOT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_SH_CPU_FREQ=y
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+# CONFIG_HEARTBEAT is not set
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+CONFIG_SCHED_HRTICK=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_KEXEC_JUMP=y
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+CONFIG_ENTRY_OFFSET=0x00001000
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_SLEEP=y
+CONFIG_HIBERNATION=y
+CONFIG_PM_STD_PARTITION=""
+CONFIG_CPU_IDLE=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPU_IDLE_GOV_MENU=y
+# CONFIG_NET is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_MTD is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_BLK_DEV_HD is not set
+# CONFIG_MISC_DEVICES is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_DEVKMEM is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_UNIX98_PTYS is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_IPMI_HANDLER is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+# CONFIG_SPI is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_VIDEO_MEDIA is not set
+
+#
+# Multimedia drivers
+#
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+# CONFIG_FB is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+# CONFIG_SOUND is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_MMC is not set
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+# CONFIG_RTC_DRV_PCF8563 is not set
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+CONFIG_RTC_DRV_SH=y
+# CONFIG_RTC_DRV_GENERIC is not set
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+# CONFIG_EXT2_FS is not set
+# CONFIG_EXT3_FS is not set
+# CONFIG_EXT4_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_BTRFS_FS is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_MSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+# CONFIG_PROC_FS is not set
+# CONFIG_SYSFS is not set
+# CONFIG_TMPFS is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_NLS is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+CONFIG_STACKTRACE=y
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_SYSCTL_SYSCALL_CHECK is not set
+CONFIG_NOP_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+# CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
+
+#
+# Library routines
+#
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC_ITU_T is not set
+# CONFIG_CRC32 is not set
+# CONFIG_CRC7 is not set
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
-- 
cgit v0.10.2


From f251935e02e89aa203e0729bfd727175018cec6f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:20:08 +0000
Subject: sh: TMU platform data for sh7770

This patch adds TMU platform data for sh7770. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b73578e..b61d614 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -76,7 +77,288 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd81008,
+		.end	= 0xffd81013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 19,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd81014,
+		.end	= 0xffd8101f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd81020,
+		.end	= 0xffd8102f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct sh_timer_config tmu6_platform_data = {
+	.name = "TMU6",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu6_resources[] = {
+	[0] = {
+		.name	= "TMU6",
+		.start	= 0xffd82008,
+		.end	= 0xffd82013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 22,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu6_device = {
+	.name		= "sh_tmu",
+	.id		= 6,
+	.dev = {
+		.platform_data	= &tmu6_platform_data,
+	},
+	.resource	= tmu6_resources,
+	.num_resources	= ARRAY_SIZE(tmu6_resources),
+};
+
+static struct sh_timer_config tmu7_platform_data = {
+	.name = "TMU7",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu7_resources[] = {
+	[0] = {
+		.name	= "TMU7",
+		.start	= 0xffd82014,
+		.end	= 0xffd8201f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 23,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu7_device = {
+	.name		= "sh_tmu",
+	.id		= 7,
+	.dev = {
+		.platform_data	= &tmu7_platform_data,
+	},
+	.resource	= tmu7_resources,
+	.num_resources	= ARRAY_SIZE(tmu7_resources),
+};
+
+static struct sh_timer_config tmu8_platform_data = {
+	.name = "TMU8",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu8_resources[] = {
+	[0] = {
+		.name	= "TMU8",
+		.start	= 0xffd82020,
+		.end	= 0xffd8202b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 24,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu8_device = {
+	.name		= "sh_tmu",
+	.id		= 8,
+	.dev = {
+		.platform_data	= &tmu8_platform_data,
+	},
+	.resource	= tmu8_resources,
+	.num_resources	= ARRAY_SIZE(tmu8_resources),
+};
+
 static struct platform_device *sh7770_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
 	&sci_device,
 };
 
@@ -87,6 +369,24 @@ static int __init sh7770_devices_setup(void)
 }
 __initcall(sh7770_devices_setup);
 
+static struct platform_device *sh7770_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7770_early_devices,
+				   ARRAY_SIZE(sh7770_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 }
-- 
cgit v0.10.2


From 5f8a29ba39d52b2eaaed907b3cb3016b949a8f9b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:28:25 +0000
Subject: sh: TMU platform data for sh4-202

This patch adds TMU platform data for sh4-202. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index 7371abf..034c069 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -31,8 +32,103 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh4202_devices[] __initdata = {
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 static int __init sh4202_devices_setup(void)
@@ -42,6 +138,18 @@ static int __init sh4202_devices_setup(void)
 }
 __initcall(sh4202_devices_setup);
 
+static struct platform_device *sh4202_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh4202_early_devices,
+				   ARRAY_SIZE(sh4202_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	/* do nothing - all IRL interrupts are handled by the board code */
-- 
cgit v0.10.2


From 67d889bd828a3cbf36ef27bd4a5c5f7ec76e20b9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:34:26 +0000
Subject: sh: add sh4-202 INTC tables

This patch adds INTC tables for sh4-202 with support
for HUDI, TMU0, TMU1, TMU2, RTC, SCIF and WDT.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index 034c069..be79fa13 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -2,6 +2,7 @@
  * SH4-202 Setup
  *
  *  Copyright (C) 2006  Paul Mundt
+ *  Copyright (C) 2009  Magnus Damm
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -12,6 +13,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -150,7 +152,59 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh4202_early_devices));
 }
 
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRL0, IRL1, IRL2, IRL3, /* only IRLM mode supported */
+	HUDI, TMU0, TMU1, TMU2, RTC, SCIF, WDT,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(HUDI, 0x600),
+	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460),
+	INTC_VECT(RTC, 0x480), INTC_VECT(RTC, 0x4a0),
+	INTC_VECT(RTC, 0x4c0),
+	INTC_VECT(SCIF, 0x700), INTC_VECT(SCIF, 0x720),
+	INTC_VECT(SCIF, 0x740), INTC_VECT(SCIF, 0x760),
+	INTC_VECT(WDT, 0x560),
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xffd00004, 0, 16, 4, /* IPRA */ { TMU0, TMU1, TMU2, RTC } },
+	{ 0xffd00008, 0, 16, 4, /* IPRB */ { WDT, 0, 0, 0 } },
+	{ 0xffd0000c, 0, 16, 4, /* IPRC */ { 0, 0, SCIF, HUDI } },
+	{ 0xffd00010, 0, 16, 4, /* IPRD */ { IRL0, IRL1, IRL2, IRL3 } },
+};
+
+static DECLARE_INTC_DESC(intc_desc, "sh4-202", vectors, NULL,
+			 NULL, prio_registers, NULL);
+
+static struct intc_vect vectors_irlm[] __initdata = {
+	INTC_VECT(IRL0, 0x240), INTC_VECT(IRL1, 0x2a0),
+	INTC_VECT(IRL2, 0x300), INTC_VECT(IRL3, 0x360),
+};
+
+static DECLARE_INTC_DESC(intc_desc_irlm, "sh4-202_irlm", vectors_irlm, NULL,
+			 NULL, prio_registers, NULL);
+
 void __init plat_irq_setup(void)
 {
-	/* do nothing - all IRL interrupts are handled by the board code */
+	register_intc_controller(&intc_desc);
+}
+
+#define INTC_ICR	0xffd00000UL
+#define INTC_ICR_IRLM   (1<<7)
+
+void __init plat_irq_setup_pins(int mode)
+{
+	switch (mode) {
+	case IRQ_MODE_IRQ: /* individual interrupt mode for IRL3-0 */
+		ctrl_outw(ctrl_inw(INTC_ICR) | INTC_ICR_IRLM, INTC_ICR);
+		register_intc_controller(&intc_desc_irlm);
+		break;
+	default:
+		BUG();
+	}
 }
-- 
cgit v0.10.2


From e37677a429a67cb45004b060ea3376e500bc73c4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:39:06 +0000
Subject: sh: TMU platform data for sh7343

This patch adds TMU platform data for sh7343. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index b9c361e..51204dc 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -173,6 +173,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -213,6 +305,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7343_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&iic0_device,
 	&iic1_device,
 	&sci_device,
@@ -240,6 +335,9 @@ __initcall(sh7343_devices_setup);
 
 static struct platform_device *sh7343_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
cgit v0.10.2


From f2710ebcd0a0404ccca3bbad68fd3639fe87ba6f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:48:21 +0000
Subject: sh: TMU platform data for sh7366

This patch adds TMU platform data for sh7366. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index d4a994b..04de0fa 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -180,6 +180,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -202,6 +294,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7366_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&iic_device,
 	&sci_device,
 	&usb_host_device,
@@ -229,6 +324,9 @@ __initcall(sh7366_devices_setup);
 
 static struct platform_device *sh7366_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
cgit v0.10.2


From 6a3501b63d30643120566cc1efba5acd0e64b12d Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:50:46 +0000
Subject: sh: TMU platform data for sh7724

This patch adds TMU platform data for sh7724. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index a3039ce..852f810 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -268,8 +268,197 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu1",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd90008,
+		.end	= 0xffd90013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu1",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd90014,
+		.end	= 0xffd9001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 58,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu1",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd90020,
+		.end	= 0xffd9002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
@@ -296,6 +485,12 @@ device_initcall(sh7724_devices_setup);
 
 static struct platform_device *sh7724_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
cgit v0.10.2


From acd664ab54a449a7257dcb3c7ef9cd1e310b4c4f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:01:01 +0000
Subject: sh: TMU platform data for sh7705

This patch adds TMU platform data for sh7705. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 63b67ba..3951366 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 enum {
@@ -116,7 +117,102 @@ static struct platform_device rtc_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xfffffe94,
+		.end	= 0xfffffe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xfffffea0,
+		.end	= 0xfffffeab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xfffffeac,
+		.end	= 0xfffffebb,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7705_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -128,6 +224,18 @@ static int __init sh7705_devices_setup(void)
 }
 __initcall(sh7705_devices_setup);
 
+static struct platform_device *sh7705_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7705_early_devices,
+				   ARRAY_SIZE(sh7705_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
cgit v0.10.2


From c8a9011bce6c43f4988cbcf1db8fc31433e69964 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:04:49 +0000
Subject: sh: TMU platform data for sh7706/sh7707/sh7708/sh7709

Add TMU platform data for sh7706/sh7707/sh7708/sh7709.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index a74f960..9412d91 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -18,6 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 enum {
 	UNUSED = 0,
@@ -144,7 +145,102 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xfffffe94,
+		.end	= 0xfffffe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xfffffea0,
+		.end	= 0xfffffeab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xfffffeac,
+		.end	= 0xfffffebb,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh770x_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -156,6 +252,18 @@ static int __init sh770x_devices_setup(void)
 }
 __initcall(sh770x_devices_setup);
 
+static struct platform_device *sh770x_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh770x_early_devices,
+				   ARRAY_SIZE(sh770x_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
cgit v0.10.2


From e5ad00896a381937326ac55fc173630fe731d041 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:15:30 +0000
Subject: sh: TMU platform data for sh7710/sh7712

This patch adds TMU platform data for sh7710 and sh7712.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index 335098b..07ff38d 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 enum {
@@ -120,7 +121,102 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xa412fe94,
+		.end	= 0xa412fe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xa412fea0,
+		.end	= 0xa412feab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xa412feac,
+		.end	= 0xa412feb5,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7710_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -132,6 +228,18 @@ static int __init sh7710_devices_setup(void)
 }
 __initcall(sh7710_devices_setup);
 
+static struct platform_device *sh7710_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7710_early_devices,
+				   ARRAY_SIZE(sh7710_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
cgit v0.10.2


From 4a1a5a2f60ceabc026ba28cdbf81d7d47603b480 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:17:52 +0000
Subject: sh: TMU platform data for sh7720/sh7721

This patch adds TMU platform data for sh7720 and sh7721.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index 003874a..c09619c 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -18,6 +18,7 @@
 #include <linux/serial.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 static struct resource rtc_resources[] = {
@@ -123,7 +124,103 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xa412fe94,
+		.end	= 0xa412fe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xa412fea0,
+		.end	= 0xa412feab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xa412feac,
+		.end	= 0xa412feb5,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7720_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&rtc_device,
 	&sci_device,
 	&usb_ohci_device,
@@ -137,6 +234,18 @@ static int __init sh7720_devices_setup(void)
 }
 __initcall(sh7720_devices_setup);
 
+static struct platform_device *sh7720_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7720_early_devices,
+				   ARRAY_SIZE(sh7720_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 2b23a8826a60268ec52302729911dd7ac6b10776 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:21:11 +0000
Subject: sh: CMT platform data for sh7720/sh7721

This patch adds CMT platform data for sh7720 and sh7721.
All 5 32-bit CMT channels unfortunately share a single IRQ.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a9dee13..5391746 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -302,6 +302,7 @@ config CPU_SUBTYPE_SH7720
 	bool "Support SH7720 processor"
 	select CPU_SH3
 	select CPU_HAS_DSP
+	select SYS_SUPPORTS_CMT
 	help
 	  Select SH7720 if you have a SH3-DSP SH7720 CPU.
 
@@ -309,6 +310,7 @@ config CPU_SUBTYPE_SH7721
 	bool "Support SH7721 processor"
 	select CPU_SH3
 	select CPU_HAS_DSP
+	select SYS_SUPPORTS_CMT
 	help
 	  Select SH7721 if you have a SH3-DSP SH7721 CPU.
 
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index c09619c..d8b46f5 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -124,6 +124,157 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+static struct sh_timer_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x10,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 125,
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0x044a0010,
+		.end	= 0x044a001b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_timer_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x20,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0x044a0020,
+		.end	= 0x044a002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
+static struct sh_timer_config cmt2_platform_data = {
+	.name = "CMT2",
+	.channel_offset = 0x30,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource cmt2_resources[] = {
+	[0] = {
+		.name	= "CMT2",
+		.start	= 0x044a0030,
+		.end	= 0x044a003b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt2_device = {
+	.name		= "sh_cmt",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &cmt2_platform_data,
+	},
+	.resource	= cmt2_resources,
+	.num_resources	= ARRAY_SIZE(cmt2_resources),
+};
+
+static struct sh_timer_config cmt3_platform_data = {
+	.name = "CMT3",
+	.channel_offset = 0x40,
+	.timer_bit = 3,
+	.clk = "module_clk",
+};
+
+static struct resource cmt3_resources[] = {
+	[0] = {
+		.name	= "CMT3",
+		.start	= 0x044a0040,
+		.end	= 0x044a004b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt3_device = {
+	.name		= "sh_cmt",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &cmt3_platform_data,
+	},
+	.resource	= cmt3_resources,
+	.num_resources	= ARRAY_SIZE(cmt3_resources),
+};
+
+static struct sh_timer_config cmt4_platform_data = {
+	.name = "CMT4",
+	.channel_offset = 0x50,
+	.timer_bit = 4,
+	.clk = "module_clk",
+};
+
+static struct resource cmt4_resources[] = {
+	[0] = {
+		.name	= "CMT4",
+		.start	= 0x044a0050,
+		.end	= 0x044a005b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt4_device = {
+	.name		= "sh_cmt",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &cmt4_platform_data,
+	},
+	.resource	= cmt4_resources,
+	.num_resources	= ARRAY_SIZE(cmt4_resources),
+};
 
 static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
@@ -218,6 +369,11 @@ static struct platform_device tmu2_device = {
 };
 
 static struct platform_device *sh7720_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+	&cmt2_device,
+	&cmt3_device,
+	&cmt4_device,
 	&tmu0_device,
 	&tmu1_device,
 	&tmu2_device,
@@ -235,6 +391,11 @@ static int __init sh7720_devices_setup(void)
 __initcall(sh7720_devices_setup);
 
 static struct platform_device *sh7720_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+	&cmt2_device,
+	&cmt3_device,
+	&cmt4_device,
 	&tmu0_device,
 	&tmu1_device,
 	&tmu2_device,
-- 
cgit v0.10.2


From f19900b2e608b604777a74d6d711bbf744657756 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:25:54 +0000
Subject: sh: remove old TMU driver

This patch removes the old TMU driver (CONFIG_SH_TMU/timer-tmu.c)

As replacement, select the sh_tmu driver with CONFIG_SH_TIMER_TMU
and configure timer channel using platform data.

If multiple TMU channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_tmu.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5391746..e7b6406 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -473,16 +473,9 @@ source "arch/sh/boards/Kconfig"
 
 menu "Timer and clock configuration"
 
-config SH_TMU
-	bool "TMU timer support"
-	depends on CPU_SH3 || CPU_SH4
-	default y
-	help
-	  This enables the use of the TMU as the system timer.
-
 config SH_TIMER_TMU
 	bool "TMU timer driver"
-	depends on !SH_TMU && SYS_SUPPORTS_TMU
+	depends on SYS_SUPPORTS_TMU
 	default y
 	help
 	  This enables the build of the TMU timer driver.
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 6c08511..f27a88b 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -18,7 +18,6 @@ struct sys_timer {
 	struct sys_timer_ops	*ops;
 };
 
-extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 /* arch/sh/kernel/timers/timer.c */
diff --git a/arch/sh/include/cpu-sh3/cpu/timer.h b/arch/sh/include/cpu-sh3/cpu/timer.h
deleted file mode 100644
index 793acf1..0000000
--- a/arch/sh/include/cpu-sh3/cpu/timer.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * include/asm-sh/cpu-sh3/timer.h
- *
- * Copyright (C) 2004 Lineo Solutions, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __ASM_CPU_SH3_TIMER_H
-#define __ASM_CPU_SH3_TIMER_H
-
-/*
- * ---------------------------------------------------------------------------
- * TMU Common definitions for SH3 processors
- *	SH7706
- *	SH7709S
- *	SH7727
- *	SH7729R
- *	SH7710
- *	SH7720
- *	SH7710
- * ---------------------------------------------------------------------------
- */
-
-#if  !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU_TOCR	0xfffffe90	/* Byte access */
-#endif
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7710) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7720) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU_012_TSTR	0xa412fe92	/* Byte access */
-
-#define TMU0_TCOR	0xa412fe94	/* Long access */
-#define TMU0_TCNT	0xa412fe98	/* Long access */
-#define TMU0_TCR	0xa412fe9c	/* Word access */
-
-#define TMU1_TCOR	0xa412fea0	/* Long access */
-#define TMU1_TCNT	0xa412fea4	/* Long access */
-#define TMU1_TCR	0xa412fea8	/* Word access */
-
-#define TMU2_TCOR	0xa412feac	/* Long access */
-#define TMU2_TCNT	0xa412feb0	/* Long access */
-#define TMU2_TCR	0xa412feb4	/* Word access */
-
-#else
-#define TMU_012_TSTR	0xfffffe92	/* Byte access */
-
-#define TMU0_TCOR	0xfffffe94	/* Long access */
-#define TMU0_TCNT	0xfffffe98	/* Long access */
-#define TMU0_TCR	0xfffffe9c	/* Word access */
-
-#define TMU1_TCOR	0xfffffea0	/* Long access */
-#define TMU1_TCNT	0xfffffea4	/* Long access */
-#define TMU1_TCR	0xfffffea8	/* Word access */
-
-#define TMU2_TCOR	0xfffffeac	/* Long access */
-#define TMU2_TCNT	0xfffffeb0	/* Long access */
-#define TMU2_TCR	0xfffffeb4	/* Word access */
-#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU2_TCPR2	0xfffffeb8	/* Long access */
-#endif
-#endif
-
-#endif /* __ASM_CPU_SH3_TIMER_H */
-
diff --git a/arch/sh/include/cpu-sh4/cpu/timer.h b/arch/sh/include/cpu-sh4/cpu/timer.h
deleted file mode 100644
index d1e796b..0000000
--- a/arch/sh/include/cpu-sh4/cpu/timer.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * include/asm-sh/cpu-sh4/timer.h
- *
- * Copyright (C) 2004 Lineo Solutions, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __ASM_CPU_SH4_TIMER_H
-#define __ASM_CPU_SH4_TIMER_H
-
-/*
- * ---------------------------------------------------------------------------
- * TMU Common definitions for SH4 processors
- *	SH7750S/SH7750R
- *	SH7751/SH7751R
- *	SH7760
- *	SH-X3
- * ---------------------------------------------------------------------------
- */
-#ifdef CONFIG_CPU_SUBTYPE_SHX3
-#define TMU_012_BASE	0xffc10000
-#define TMU_345_BASE	0xffc20000
-#else
-#define TMU_012_BASE	0xffd80000
-#define TMU_345_BASE	0xfe100000
-#endif
-
-#define TMU_TOCR	TMU_012_BASE	/* Not supported on all CPUs */
-
-#define TMU_012_TSTR	(TMU_012_BASE + 0x04)
-#define TMU_345_TSTR	(TMU_345_BASE + 0x04)
-
-#define TMU0_TCOR	(TMU_012_BASE + 0x08)
-#define TMU0_TCNT	(TMU_012_BASE + 0x0c)
-#define TMU0_TCR	(TMU_012_BASE + 0x10)
-
-#define TMU1_TCOR       (TMU_012_BASE + 0x14)
-#define TMU1_TCNT       (TMU_012_BASE + 0x18)
-#define TMU1_TCR        (TMU_012_BASE + 0x1c)
-
-#define TMU2_TCOR       (TMU_012_BASE + 0x20)
-#define TMU2_TCNT       (TMU_012_BASE + 0x24)
-#define TMU2_TCR	(TMU_012_BASE + 0x28)
-#define TMU2_TCPR	(TMU_012_BASE + 0x2c)
-
-#define TMU3_TCOR	(TMU_345_BASE + 0x08)
-#define TMU3_TCNT	(TMU_345_BASE + 0x0c)
-#define TMU3_TCR	(TMU_345_BASE + 0x10)
-
-#define TMU4_TCOR	(TMU_345_BASE + 0x14)
-#define TMU4_TCNT	(TMU_345_BASE + 0x18)
-#define TMU4_TCR	(TMU_345_BASE + 0x1c)
-
-#define TMU5_TCOR	(TMU_345_BASE + 0x20)
-#define TMU5_TCNT	(TMU_345_BASE + 0x24)
-#define TMU5_TCR	(TMU_345_BASE + 0x28)
-
-#endif /* __ASM_CPU_SH4_TIMER_H */
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 8bceba5..fefd7ed 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -3,5 +3,3 @@
 #
 
 obj-y	:= timer.o
-
-obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
deleted file mode 100644
index fe8d893..0000000
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-tmu.c - TMU Timer Support
- *
- *  Copyright (C) 2005 - 2007  Paul Mundt
- *
- * TMU handling code hacked out of arch/sh/kernel/time.c
- *
- *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
- *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
- *  Copyright (C) 2002, 2003, 2004  Paul Mundt
- *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <linux/clockchips.h>
-#include <asm/timer.h>
-#include <asm/rtc.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-#define TMU_TOCR_INIT	0x00
-#define TMU_TCR_INIT	0x0020
-
-#define TMU0		(0)
-#define TMU1		(1)
-
-static inline void _tmu_start(int tmu_num)
-{
-	ctrl_outb(ctrl_inb(TMU_012_TSTR) | (0x1<<tmu_num), TMU_012_TSTR);
-}
-
-static inline void _tmu_set_irq(int tmu_num, int enabled)
-{
-	register unsigned long tmu_tcr = TMU0_TCR + (0xc*tmu_num);
-	ctrl_outw( (enabled ? ctrl_inw(tmu_tcr) | (1<<5) : ctrl_inw(tmu_tcr) & ~(1<<5)), tmu_tcr);
-}
-
-static inline void _tmu_stop(int tmu_num)
-{
-	ctrl_outb(ctrl_inb(TMU_012_TSTR) & ~(0x1<<tmu_num), TMU_012_TSTR);
-}
-
-static inline void _tmu_clear_status(int tmu_num)
-{
-	register unsigned long tmu_tcr = TMU0_TCR + (0xc*tmu_num);
-	/* Clear UNF bit */
-	ctrl_outw(ctrl_inw(tmu_tcr) & ~0x100, tmu_tcr);
-}
-
-static inline unsigned long _tmu_read(int tmu_num)
-{
-        return ctrl_inl(TMU0_TCNT+0xC*tmu_num);
-}
-
-static int tmu_timer_start(void)
-{
-	_tmu_start(TMU0);
-	_tmu_start(TMU1);
-	_tmu_set_irq(TMU0,1);
-	return 0;
-}
-
-static int tmu_timer_stop(void)
-{
-	_tmu_stop(TMU0);
-	_tmu_stop(TMU1);
-	_tmu_clear_status(TMU0);
-	return 0;
-}
-
-/*
- * also when the module_clk is scaled the TMU1
- * will show the same frequency
- */
-static int tmus_are_scaled;
-
-static cycle_t tmu_timer_read(struct clocksource *cs)
-{
-	return ((cycle_t)(~_tmu_read(TMU1)))<<tmus_are_scaled;
-}
-
-
-static unsigned long tmu_latest_interval[3];
-static void tmu_timer_set_interval(int tmu_num, unsigned long interval, unsigned int reload)
-{
-	unsigned long tmu_tcnt = TMU0_TCNT + tmu_num*0xC;
-	unsigned long tmu_tcor = TMU0_TCOR + tmu_num*0xC;
-
-	_tmu_stop(tmu_num);
-
-	ctrl_outl(interval, tmu_tcnt);
-	tmu_latest_interval[tmu_num] = interval;
-
-	/*
-	 * TCNT reloads from TCOR on underflow, clear it if we don't
-	 * intend to auto-reload
-	 */
-	ctrl_outl( reload ? interval : 0 , tmu_tcor);
-
-	_tmu_start(tmu_num);
-}
-
-static int tmu_set_next_event(unsigned long cycles,
-			      struct clock_event_device *evt)
-{
-	tmu_timer_set_interval(TMU0,cycles, evt->mode == CLOCK_EVT_MODE_PERIODIC);
-	_tmu_set_irq(TMU0,1);
-	return 0;
-}
-
-static void tmu_set_mode(enum clock_event_mode mode,
-			 struct clock_event_device *evt)
-{
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		ctrl_outl(tmu_latest_interval[TMU0], TMU0_TCOR);
-		break;
-	case CLOCK_EVT_MODE_ONESHOT:
-		ctrl_outl(0, TMU0_TCOR);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	}
-}
-
-static struct clock_event_device tmu0_clockevent = {
-	.name		= "tmu0",
-	.shift		= 32,
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= tmu_set_mode,
-	.set_next_event	= tmu_set_next_event,
-};
-
-static irqreturn_t tmu_timer_interrupt(int irq, void *dummy)
-{
-	struct clock_event_device *evt = &tmu0_clockevent;
-	_tmu_clear_status(TMU0);
-	_tmu_set_irq(TMU0,tmu0_clockevent.mode != CLOCK_EVT_MODE_ONESHOT);
-
-	switch (tmu0_clockevent.mode) {
-	case CLOCK_EVT_MODE_ONESHOT:
-	case CLOCK_EVT_MODE_PERIODIC:
-		evt->event_handler(evt);
-		break;
-	default:
-		break;
-	}
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction tmu0_irq = {
-	.name		= "periodic/oneshot timer",
-	.handler	= tmu_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static void __init tmu_clk_init(struct clk *clk)
-{
-	u8 divisor  = TMU_TCR_INIT & 0x7;
-	int tmu_num = clk->name[3]-'0';
-	ctrl_outw(TMU_TCR_INIT, TMU0_TCR+(tmu_num*0xC));
-	clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
-}
-
-static void tmu_clk_recalc(struct clk *clk)
-{
-	int tmu_num = clk->name[3]-'0';
-	unsigned long prev_rate = clk_get_rate(clk);
-	unsigned long flags;
-	u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7;
-	clk->rate  = clk_get_rate(clk->parent) / (4 << (divisor << 1));
-
-	if(prev_rate==clk_get_rate(clk))
-		return;
-
-	if(tmu_num)
-		return; /* No more work on TMU1 */
-
-	local_irq_save(flags);
-	tmus_are_scaled = (prev_rate > clk->rate);
-
-	_tmu_stop(TMU0);
-
-	tmu0_clockevent.mult = div_sc(clk->rate, NSEC_PER_SEC,
-				tmu0_clockevent.shift);
-	tmu0_clockevent.max_delta_ns =
-			clockevent_delta2ns(-1, &tmu0_clockevent);
-	tmu0_clockevent.min_delta_ns =
-			clockevent_delta2ns(1, &tmu0_clockevent);
-
-	if (tmus_are_scaled)
-		tmu_latest_interval[TMU0] >>= 1;
-	else
-		tmu_latest_interval[TMU0] <<= 1;
-
-	tmu_timer_set_interval(TMU0,
-		tmu_latest_interval[TMU0],
-		tmu0_clockevent.mode == CLOCK_EVT_MODE_PERIODIC);
-
-	_tmu_start(TMU0);
-
-	local_irq_restore(flags);
-}
-
-static struct clk_ops tmu_clk_ops = {
-	.init		= tmu_clk_init,
-	.recalc		= tmu_clk_recalc,
-};
-
-static struct clk tmu0_clk = {
-	.name		= "tmu0_clk",
-	.ops		= &tmu_clk_ops,
-};
-
-static struct clk tmu1_clk = {
-	.name		= "tmu1_clk",
-	.ops		= &tmu_clk_ops,
-};
-
-static int tmu_timer_init(void)
-{
-	unsigned long interval;
-	unsigned long frequency;
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &tmu0_irq);
-
-	tmu0_clk.parent = clk_get(NULL, "module_clk");
-	tmu1_clk.parent = clk_get(NULL, "module_clk");
-
-	tmu_timer_stop();
-
-#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7721) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7760) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7785) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7786) && \
-    !defined(CONFIG_CPU_SUBTYPE_SHX3)
-	ctrl_outb(TMU_TOCR_INIT, TMU_TOCR);
-#endif
-
-	clk_register(&tmu0_clk);
-	clk_register(&tmu1_clk);
-	clk_enable(&tmu0_clk);
-	clk_enable(&tmu1_clk);
-
-	frequency = clk_get_rate(&tmu0_clk);
-	interval = (frequency + HZ / 2) / HZ;
-
-	tmu_timer_set_interval(TMU0,interval, 1);
-	tmu_timer_set_interval(TMU1,~0,1);
-
-	_tmu_start(TMU1);
-
-	clocksource_sh.rating = 200;
-	clocksource_sh.mask = CLOCKSOURCE_MASK(32);
-	clocksource_sh.read = tmu_timer_read;
-	clocksource_sh.shift = 10;
-	clocksource_sh.mult = clocksource_hz2mult(clk_get_rate(&tmu1_clk),
-						  clocksource_sh.shift);
-	clocksource_sh.flags = CLOCK_SOURCE_IS_CONTINUOUS;
-	clocksource_register(&clocksource_sh);
-
-	tmu0_clockevent.mult = div_sc(frequency, NSEC_PER_SEC,
-				      tmu0_clockevent.shift);
-	tmu0_clockevent.max_delta_ns =
-			clockevent_delta2ns(-1, &tmu0_clockevent);
-	tmu0_clockevent.min_delta_ns =
-			clockevent_delta2ns(1, &tmu0_clockevent);
-
-	tmu0_clockevent.cpumask = cpumask_of(0);
-	tmu0_clockevent.rating = 100;
-
-	clockevents_register_device(&tmu0_clockevent);
-
-	return 0;
-}
-
-static struct sys_timer_ops tmu_timer_ops = {
-	.init		= tmu_timer_init,
-	.start		= tmu_timer_start,
-	.stop		= tmu_timer_stop,
-};
-
-struct sys_timer tmu_timer = {
-	.name	= "tmu",
-	.ops	= &tmu_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index 920891a..f8812bd 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -14,9 +14,6 @@
 #include <asm/timer.h>
 
 static struct sys_timer *sys_timers[] = {
-#ifdef CONFIG_SH_TMU
-	&tmu_timer,
-#endif
 	NULL,
 };
 
-- 
cgit v0.10.2


From 8be5f1a68f2c14082939dd54e7037dcee2eb54f8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 19:53:55 +0900
Subject: sh: Kill off the remnants of the old timer code.

Now with all of the TMU users moved over to the new TMU driver, and the
old TMU driver killed off, the left-over infrastructure can go along
with it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e7b6406..fb75c2d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -494,17 +494,6 @@ config SH_TIMER_MTU2
 	help
 	  This enables build of the MTU2 timer driver.
 
-config SH_TIMER_IRQ
-	int
-	default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \
-			CPU_SUBTYPE_SH7763
-	default "86" if CPU_SUBTYPE_SH7619
-	default "140" if CPU_SUBTYPE_SH7206
-	default "142" if CPU_SUBTYPE_SH7203 && SH_CMT
-	default "153" if CPU_SUBTYPE_SH7203 && SH_MTU2
-	default "238" if CPU_SUBTYPE_MXG
-	default "16"
-
 config SH_PCLK_FREQ
 	int "Peripheral clock frequency (in Hz)"
 	default "27000000" if CPU_SUBTYPE_SH7343
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
deleted file mode 100644
index f27a88b..0000000
--- a/arch/sh/include/asm/timer.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __ASM_SH_TIMER_H
-#define __ASM_SH_TIMER_H
-
-#include <linux/sysdev.h>
-#include <linux/clocksource.h>
-#include <cpu/timer.h>
-
-struct sys_timer_ops {
-	int (*init)(void);
-	int (*start)(void);
-	int (*stop)(void);
-};
-
-struct sys_timer {
-	const char		*name;
-
-	struct sys_device	dev;
-	struct sys_timer_ops	*ops;
-};
-
-extern struct sys_timer *sys_timer;
-
-/* arch/sh/kernel/timers/timer.c */
-struct sys_timer *get_sys_timer(void);
-
-extern struct clocksource clocksource_sh;
-
-#endif /* __ASM_SH_TIMER_H */
diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index aee08af..9411e3e 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -14,7 +14,7 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o			\
 	   sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o	\
 	   traps.o traps_32.o
 
-obj-y				+= cpu/ timers/
+obj-y				+= cpu/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SH_STANDARD_BIOS)	+= sh_bios.o
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index c845a0d..67b9f6c 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -4,7 +4,7 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
 	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
-obj-y				+= cpu/ timers/
+obj-y				+= cpu/
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SH_CPU_FREQ)	+= cpufreq.o
 obj-$(CONFIG_MODULES)		+= sh_ksyms_64.o module.o
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 133dbe4..f54769f 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -26,7 +26,6 @@
 #include <linux/platform_device.h>
 #include <linux/proc_fs.h>
 #include <asm/clock.h>
-#include <asm/timer.h>
 
 static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index a77838f..2edde32 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -22,9 +22,6 @@
 #include <linux/rtc.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
-#include <asm/timer.h>
-
-struct sys_timer *sys_timer;
 
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
@@ -94,20 +91,9 @@ module_init(rtc_generic_init);
 
 void (*board_time_init)(void);
 
-struct clocksource clocksource_sh = {
-	.name		= "SuperH",
-};
-
 unsigned long long sched_clock(void)
 {
-	unsigned long long cycles;
-
-	/* jiffies based sched_clock if no clocksource is installed */
-	if (!clocksource_sh.rating)
-		return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
-
-	cycles = clocksource_sh.read(&clocksource_sh);
-	return cyc2ns(&clocksource_sh, cycles);
+	return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
 }
 
 static void __init sh_late_time_init(void)
@@ -117,18 +103,7 @@ static void __init sh_late_time_init(void)
 	 * Run probe() for one "earlytimer" device.
 	 */
 	early_platform_driver_register_all("earlytimer");
-	if (early_platform_driver_probe("earlytimer", 1, 0))
-		return;
-
-	/*
-	 * Find the timer to use as the system timer, it will be
-	 * initialized for us.
-	 */
-	sys_timer = get_sys_timer();
-	if (unlikely(!sys_timer))
-		panic("System timer missing.\n");
-
-	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
+	early_platform_driver_probe("earlytimer", 1, 0);
 }
 
 void __init time_init(void)
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
deleted file mode 100644
index fefd7ed..0000000
--- a/arch/sh/kernel/timers/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the various Linux/SuperH timers
-#
-
-obj-y	:= timer.o
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
deleted file mode 100644
index f8812bd..0000000
--- a/arch/sh/kernel/timers/timer.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer.c - Common timer code
- *
- *  Copyright (C) 2005  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <asm/timer.h>
-
-static struct sys_timer *sys_timers[] = {
-	NULL,
-};
-
-static char timer_override[10];
-static int __init timer_setup(char *str)
-{
-	if (str)
-		strlcpy(timer_override, str, sizeof(timer_override));
-	return 1;
-}
-__setup("timer=", timer_setup);
-
-struct sys_timer *get_sys_timer(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sys_timers); i++) {
-		struct sys_timer *t = sys_timers[i];
-
-		if (unlikely(!t))
-			break;
-		if (unlikely(timer_override[0]))
-			if ((strcmp(timer_override, t->name) != 0))
-				continue;
-		if (likely(t->ops->init() == 0))
-			return t;
-	}
-
-	return NULL;
-}
-- 
cgit v0.10.2


From e758a33d6fc5b9d6a3ae489863d04fcecad8120b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 12 May 2009 21:59:01 +1000
Subject: perf_counter: call hw_perf_save_disable/restore around group_sched_in

I noticed that when enabling a group via the PERF_COUNTER_IOC_ENABLE
ioctl on the group leader, the counters weren't enabled and counting
immediately on return from the ioctl, but did start counting a little
while later (presumably after a context switch).

The reason was that __perf_counter_enable calls group_sched_in which
calls hw_perf_group_sched_in, which on powerpc assumes that the caller
has called hw_perf_save_disable already.  Until commit 46d686c6
("perf_counter: put whole group on when enabling group leader") it was
true that all callers of group_sched_in had called
hw_perf_save_disable first, and the powerpc hw_perf_group_sched_in
relies on that (there isn't an x86 version).

This fixes the problem by putting calls to hw_perf_save_disable /
hw_perf_restore around the calls to group_sched_in and
counter_sched_in in __perf_counter_enable.  Having the calls to
hw_perf_save_disable/restore around the counter_sched_in call is
harmless and makes this call consistent with the other call sites
of counter_sched_in, which have all called hw_perf_save_disable first.

[ Impact: more precise counter group disable/enable functionality ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18953.25733.53359.147452@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5ea0240..ff166c1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -663,6 +663,7 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
+	unsigned long pmuflags;
 	unsigned long flags;
 	int err;
 
@@ -689,14 +690,18 @@ static void __perf_counter_enable(void *info)
 	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 		goto unlock;
 
-	if (!group_can_go_on(counter, cpuctx, 1))
+	if (!group_can_go_on(counter, cpuctx, 1)) {
 		err = -EEXIST;
-	else if (counter == leader)
-		err = group_sched_in(counter, cpuctx, ctx,
-				     smp_processor_id());
-	else
-		err = counter_sched_in(counter, cpuctx, ctx,
-				       smp_processor_id());
+	} else {
+		pmuflags = hw_perf_save_disable();
+		if (counter == leader)
+			err = group_sched_in(counter, cpuctx, ctx,
+					     smp_processor_id());
+		else
+			err = counter_sched_in(counter, cpuctx, ctx,
+					       smp_processor_id());
+		hw_perf_restore(pmuflags);
+	}
 
 	if (err) {
 		/*
-- 
cgit v0.10.2


From b5710ce92a8cf8e3fc0ffc230cfdbfa23463f1c8 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue, 12 May 2009 18:51:28 +0400
Subject: x86/pci: add 4 more return parameters to
 IO_APIC_get_PCI_irq_vector(), fix

Fix trivial typo in the drivers/pci/hotplug/ibmphp_core.c changes.

[ Impact: build fix ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Yinghai Lu <yinghai@kernel.org>
Cc: eswierk@aristanetworks.com
LKML-Reference: <20090512145128.GA10220@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index ef53b05..79901a0 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -161,7 +161,7 @@ int ibmphp_init_devno(struct slot **cur_slot)
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-						(int) (*cur_slot)->device, i.
+						(int) (*cur_slot)->device, i,
 						&ioapic, &ioapic_pin,
 						&triggering, &polarity);
 
-- 
cgit v0.10.2


From 7537d81aa7b7cd31b0caeac8091456e93d96fa8d Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Tue, 12 May 2009 11:16:20 -0500
Subject: GFS2: Fix timestamps on write

This patch copies the timestamps from the vfs inode into gfs2 and syncs
it to the disk inode during writes.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index a6dde17..e566421 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -781,10 +781,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	unlock_page(page);
 	page_cache_release(page);
 
-	if (inode->i_size < to) {
-		i_size_write(inode, to);
-		ip->i_disksize = inode->i_size;
-		di->di_size = cpu_to_be64(inode->i_size);
+	if (copied) {
+		if (inode->i_size < to) {
+			i_size_write(inode, to);
+			ip->i_disksize = inode->i_size;
+		}
+		gfs2_dinode_out(ip, di);
 		mark_inode_dirty(inode);
 	}
 
@@ -824,7 +826,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_dinode *di;
 	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int to = from + len;
 	int ret;
@@ -847,11 +848,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		gfs2_page_add_databufs(ip, page, from, to);
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
-		di = (struct gfs2_dinode *)dibh->b_data;
-		ip->i_disksize = inode->i_size;
-		di->di_size = cpu_to_be64(inode->i_size);
+	if (ret > 0) {
+		if (inode->i_size > ip->i_disksize)
+			ip->i_disksize = inode->i_size;
+		gfs2_dinode_out(ip, dibh->b_data);
 		mark_inode_dirty(inode);
 	}
 
-- 
cgit v0.10.2


From 12b7ac176831df1aa58a787e67c3e5d698b30163 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:57:44 -0400
Subject: ext4: Rename ext4_get_blocks_wrap() to be ext4_get_blocks()

Another function rename for clarity's sake.  The _wrap prefix simply
confuses people, and didn't add much people trying to follow the code
paths.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b647899..052d637 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
-						0, 0, 0);
+		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 89190ae..5dc8368 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1617,10 +1617,10 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-			sector_t block, unsigned int max_blocks,
-			struct buffer_head *bh, int create,
-			int extend_disksize, int flag);
+extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
+			   sector_t block, unsigned int max_blocks,
+			   struct buffer_head *bh, int create,
+			   int extend_disksize, int flag);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4fec6b7..7e7d02d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3151,9 +3151,9 @@ retry:
 			break;
 		}
 		map_bh.b_state = 0;
-		ret = ext4_get_blocks_wrap(handle, inode, block,
-					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+		ret = ext4_get_blocks(handle, inode, block,
+				      max_blocks, &map_bh,
+				      EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f758e80..a9a9b9b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1121,7 +1121,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 }
 
 /*
- * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1142,9 +1142,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-			unsigned int max_blocks, struct buffer_head *bh,
-			int create, int extend_disksize, int flag)
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+		    unsigned int max_blocks, struct buffer_head *bh,
+		    int create, int extend_disksize, int flag)
 {
 	int retval;
 
@@ -1268,8 +1268,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 		started = 1;
 	}
 
-	ret = ext4_get_blocks_wrap(handle, inode, iblock,
-					max_blocks, bh_result, create, 0, 0);
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+			      create, 0, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1294,10 +1294,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	err = ext4_get_blocks_wrap(handle, inode, block, 1,
-					&dummy, create, 1, 0);
+	err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0);
 	/*
-	 * ext4_get_blocks_wrap() returns number of blocks
+	 * ext4_get_blocks() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
@@ -2009,8 +2008,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
-	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, 1, 0, EXT4_DELALLOC_RSVED);
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
+			      bh_result, 1, 0, EXT4_DELALLOC_RSVED);
 	if (ret <= 0)
 		return ret;
 
@@ -2067,11 +2066,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
 	 * ext4_da_get_block_write(), since it calls
-	 * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag.
-	 * This flag causes ext4_get_blocks_wrap() to call
+	 * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag.
+	 * This flag causes ext4_get_blocks() to call
 	 * ext4_da_update_reserve_space() if the passed buffer head
 	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks_wrap(), we should pass in
+	 * the interfaces to ext4_get_blocks(), we should pass in
 	 * a separate flag which requests that the delayed allocation
 	 * statistics should be updated, instead of depending on the
 	 * state information getting passed down via the map_bh's
@@ -2363,7 +2362,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
 	if ((ret == 0) && !buffer_delay(bh_result)) {
 		/* the block isn't (pre)allocated yet, let's reserve space */
 		/*
@@ -2407,8 +2406,8 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
-	ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
-				   bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks,
+			      bh_result, 0, 0, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -5034,7 +5033,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
-- 
cgit v0.10.2


From c21770573319922e3f3fcb331cfaa290c49f1c81 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:58:52 -0400
Subject: ext4: Define a new set of flags for ext4_get_blocks()

The functions ext4_get_blocks(), ext4_ext_get_blocks(), and
ext4_ind_get_blocks() used an ad-hoc set of integer variables used as
boolean flags passed in as arguments.  Use a single flags parameter
and a setandard set of bitfield flags instead.  This saves space on
the call stack, and it also makes the code a bit more understandable.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 052d637..9dc93168 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,7 +131,7 @@ static int ext4_readdir(struct file *filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0);
+		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5dc8368..17feb4a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -314,10 +314,20 @@ struct ext4_new_group_data {
 };
 
 /*
- * Following is used by preallocation code to tell get_blocks() that we
- * want uninitialzed extents.
+ * Flags used by ext4_get_blocks()
  */
-#define EXT4_CREATE_UNINITIALIZED_EXT		2
+	/* Allocate any needed blocks and/or convert an unitialized
+	   extent to be an initialized ext4 */
+#define EXT4_GET_BLOCKS_CREATE			1
+	/* Request the creation of an unitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT		2
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
+						 EXT4_GET_BLOCKS_CREATE)
+	/* Update the ext4_inode_info i_disksize field */
+#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		4
+	/* Caller is from the delayed allocation writeout path,
+	   so the filesystem blocks have already been accounted for */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	8
 
 /*
  * ioctl commands
@@ -1610,8 +1620,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t iblock, unsigned int max_blocks,
-			       struct buffer_head *bh_result,
-			       int create, int extend_disksize);
+			       struct buffer_head *bh_result, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
@@ -1619,8 +1628,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
-			   struct buffer_head *bh, int create,
-			   int extend_disksize, int flag);
+			   struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7e7d02d..27c383c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2784,7 +2784,7 @@ fix_extent_len:
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned int max_blocks, struct buffer_head *bh_result,
-			int create, int extend_disksize)
+			int flags)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
@@ -2803,7 +2803,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
 	if (cache_type) {
 		if (cache_type == EXT4_EXT_CACHE_GAP) {
-			if (!create) {
+			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
 				 * block isn't allocated yet and
 				 * user doesn't want to allocate it
@@ -2869,9 +2869,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 							EXT4_EXT_CACHE_EXTENT);
 				goto out;
 			}
-			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+			if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
 				goto out;
-			if (!create) {
+			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				if (allocated > max_blocks)
 					allocated = max_blocks;
 				/*
@@ -2903,7 +2903,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * requested block isn't allocated yet;
 	 * we couldn't try to create block if create flag is zero
 	 */
-	if (!create) {
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 		/*
 		 * put just found gap into cache to speed up
 		 * subsequent requests
@@ -2932,10 +2932,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * EXT_UNINIT_MAX_LEN.
 	 */
 	if (max_blocks > EXT_INIT_MAX_LEN &&
-	    create != EXT4_CREATE_UNINITIALIZED_EXT)
+	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
 		max_blocks = EXT_INIT_MAX_LEN;
 	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
-		 create == EXT4_CREATE_UNINITIALIZED_EXT)
+		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
 		max_blocks = EXT_UNINIT_MAX_LEN;
 
 	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2966,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(ar.len);
-	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
 		ext4_ext_mark_uninitialized(&newex);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
 	if (err) {
@@ -2983,7 +2983,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	if (extend_disksize) {
+	if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) {
 		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
@@ -2994,7 +2994,7 @@ outnew:
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
-	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
 		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
 out:
@@ -3153,7 +3153,7 @@ retry:
 		map_bh.b_state = 0;
 		ret = ext4_get_blocks(handle, inode, block,
 				      max_blocks, &map_bh,
-				      EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a9a9b9b..8b7564d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -917,7 +917,7 @@ err_out:
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
 				  struct buffer_head *bh_result,
-				  int create, int extend_disksize)
+				  int flags)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
@@ -934,7 +934,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
-	J_ASSERT(handle != NULL || create == 0);
+	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, iblock, offsets,
 					&blocks_to_boundary);
 
@@ -963,7 +963,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
-	if (!create || err == -EIO)
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
 		goto cleanup;
 
 	/*
@@ -1002,7 +1002,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * protect it if you're about to implement concurrent
 	 * ext4_get_block() -bzzz
 	*/
-	if (!err && extend_disksize) {
+	if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) {
 		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
@@ -1144,7 +1144,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  */
 int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		    unsigned int max_blocks, struct buffer_head *bh,
-		    int create, int extend_disksize, int flag)
+		    int flags)
 {
 	int retval;
 
@@ -1158,15 +1158,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, 0, 0);
+				bh, 0);
 	} else {
 		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-					     bh, 0, 0);
+					     bh, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	/* If it is only a block(s) look up */
-	if (!create)
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 		return retval;
 
 	/*
@@ -1205,7 +1205,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * let the underlying get_block() function know to
 	 * avoid double accounting
 	 */
-	if (flag)
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
 	/*
 	 * We need to check for EXT4 here because migrate
@@ -1213,10 +1213,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, create, extend_disksize);
+					      bh, flags);
 	} else {
 		retval = ext4_ind_get_blocks(handle, inode, block,
-				max_blocks, bh, create, extend_disksize);
+					     max_blocks, bh, flags);
 
 		if (retval > 0 && buffer_new(bh)) {
 			/*
@@ -1229,7 +1229,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		}
 	}
 
-	if (flag) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 		/*
 		 * Update reserved blocks/metadata blocks
@@ -1269,7 +1269,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create, 0, 0);
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1288,16 +1288,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
+	int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE;
 
 	J_ASSERT(handle != NULL || create == 0);
 
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0);
+	if (create)
+		flags |= EXT4_GET_BLOCKS_CREATE;
+	err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
 	/*
-	 * ext4_get_blocks() returns number of blocks
-	 * mapped. 0 in case of a HOLE.
+	 * ext4_get_blocks() returns number of blocks mapped. 0 in
+	 * case of a HOLE.
 	 */
 	if (err > 0) {
 		if (err > 1)
@@ -1997,7 +2000,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-#define		EXT4_DELALLOC_RSVED	1
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result)
 {
@@ -2009,7 +2011,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
-			      bh_result, 1, 0, EXT4_DELALLOC_RSVED);
+			      bh_result, EXT4_GET_BLOCKS_CREATE|
+			      EXT4_GET_BLOCKS_DELALLOC_RESERVE);
 	if (ret <= 0)
 		return ret;
 
@@ -2065,16 +2068,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		return 0;
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
-	 * ext4_da_get_block_write(), since it calls
-	 * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag.
-	 * This flag causes ext4_get_blocks() to call
+	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
+	 * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.  This flag
+	 * causes ext4_get_blocks() to call
 	 * ext4_da_update_reserve_space() if the passed buffer head
 	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks(), we should pass in
-	 * a separate flag which requests that the delayed allocation
+	 * the interfaces to ext4_get_blocks(), we should pass in a
+	 * separate flag which requests that the delayed allocation
 	 * statistics should be updated, instead of depending on the
 	 * state information getting passed down via the map_bh's
-	 * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag.
+	 * state bitmasks plus the magic
+	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
 	 */
 	new.b_state = mpd->b_state & (1 << BH_Delay);
 	new.b_blocknr = 0;
@@ -2362,7 +2366,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
 	if ((ret == 0) && !buffer_delay(bh_result)) {
 		/* the block isn't (pre)allocated yet, let's reserve space */
 		/*
@@ -2406,8 +2410,7 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks,
-			      bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
-- 
cgit v0.10.2


From b920c75502cb2c48654ef196d647c8eb81ab608a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:54:29 -0400
Subject: ext4: Add documentation to the ext4_*get_block* functions

This adds more documentation to various internal functions in
fs/ext4/inode.c, most notably ext4_ind_get_blocks(),
ext4_da_get_block_write(), ext4_da_get_block_prep(),
ext4_normal_get_block_write().

In addition, the static function ext4_normal_get_block_write() has
been renamed noalloc_get_block_write(), since it is used in many
places far beyond ext4_normal_writepage().

Plenty of warnings have been added to the noalloc_get_block_write()
function, since the way it is used is amazingly fragile.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8b7564d..fd5f27a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,6 +892,10 @@ err_out:
 }
 
 /*
+ * The ext4_ind_get_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_get_blocks().
+ *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
  * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,10 +913,11 @@ err_out:
  * return = 0, if plain lookup failed.
  * return < 0, error case.
  *
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
  */
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
@@ -1152,8 +1157,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	clear_buffer_unwritten(bh);
 
 	/*
-	 * Try to see if we can get  the block without requesting
-	 * for new file system block.
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -2000,6 +2005,12 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
+/*
+ * This function is used by mpage_da_map_blocks().  We separate it out
+ * as a separate function just to make life easier, and because
+ * mpage_da_map_blocks() used to be a generic function that took a
+ * get_block_t.
+ */
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result)
 {
@@ -2031,8 +2042,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 
 	/*
 	 * Update on-disk size along with block allocation we don't
-	 * use 'extend_disksize' as size may change within already
-	 * allocated block -bzzz
+	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
+	 * within already allocated block -bzzz
 	 */
 	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
 	if (disksize > i_size_read(inode))
@@ -2338,8 +2349,9 @@ static int __mpage_da_writepage(struct page *page,
 }
 
 /*
- * this is a special callback for ->write_begin() only
- * it's intention is to return mapped block or reserve space
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
  *
  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
  * We also have b_blocknr = -1 and b_bdev initialized properly
@@ -2347,7 +2359,6 @@ static int __mpage_da_writepage(struct page *page,
  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
  * initialized properly.
- *
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 				  struct buffer_head *bh_result, int create)
@@ -2400,7 +2411,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return ret;
 }
 
-static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+/*
+ * This function is used as a standard get_block_t calback function
+ * when there is no desire to allocate any blocks.  It is used as a
+ * callback function for block_prepare_write(), nobh_writepage(), and
+ * block_write_full_page().  These functions should only try to map a
+ * single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads are returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling nobh_writepage() or
+ * block_write_full_page().  Otherwise, b_blocknr could be left
+ * unitialized, and the page write functions will be taken by
+ * surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	int ret = 0;
@@ -2419,10 +2446,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 }
 
 /*
- * get called vi ext4_da_writepages after taking page lock (have journal handle)
- * get called via journal_submit_inode_data_buffers (no journal handle)
- * get called via shrink_page_list via pdflush (no journal handle)
- * or grab_page_cache when doing write_begin (have journal handle)
+ * This function can get called via...
+ *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
+ *   - shrink_page_list via pdflush (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
  */
 static int ext4_da_writepage(struct page *page,
 				struct writeback_control *wbc)
@@ -2473,7 +2501,7 @@ static int ext4_da_writepage(struct page *page,
 		 * do block allocation here.
 		 */
 		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-						ext4_normal_get_block_write);
+					  noalloc_get_block_write);
 		if (!ret) {
 			page_bufs = page_buffers(page);
 			/* check whether all are mapped and non delay */
@@ -2498,11 +2526,10 @@ static int ext4_da_writepage(struct page *page,
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
-		ret = block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		ret = block_write_full_page(page, noalloc_get_block_write,
+					    wbc);
 
 	return ret;
 }
@@ -2814,7 +2841,7 @@ retry:
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext4_da_get_block_prep);
+				ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
@@ -3122,12 +3149,10 @@ static int __ext4_normal_writepage(struct page *page,
 	struct inode *inode = page->mapping->host;
 
 	if (test_opt(inode->i_sb, NOBH))
-		return nobh_writepage(page,
-					ext4_normal_get_block_write, wbc);
+		return nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
-		return block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		return block_write_full_page(page, noalloc_get_block_write,
+					     wbc);
 }
 
 static int ext4_normal_writepage(struct page *page,
@@ -3179,7 +3204,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	int err;
 
 	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext4_normal_get_block_write);
+				  noalloc_get_block_write);
 	if (ret != 0)
 		goto out_unlock;
 
@@ -3264,9 +3289,8 @@ static int ext4_journalled_writepage(struct page *page,
 		 * really know unless we go poke around in the buffer_heads.
 		 * But block_write_full_page will do the right thing.
 		 */
-		return block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		return block_write_full_page(page, noalloc_get_block_write,
+					     wbc);
 	}
 no_write:
 	redirty_page_for_writepage(wbc, page);
-- 
cgit v0.10.2


From a2dc52b5d1d8cc280b3e795abf1c80ac8c49f30c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 13:51:29 -0400
Subject: ext4: Add BUG_ON debugging checks to noalloc_get_block_write()

Enforce that noalloc_get_block_write() is only called to map one block
at a time, and that it always is successful in finding a mapping for
given an inode's logical block block number if it is called with
create == 1.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fd5f27a..e6113c3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2433,11 +2433,14 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 	int ret = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
 	/*
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
 	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
+	BUG_ON(create && ret == 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
-- 
cgit v0.10.2


From 6f733a349c2b70a83b5451fbe61ef2cf67d41cdf Mon Sep 17 00:00:00 2001
From: Paul Walmsley <paul@pwsan.com>
Date: Mon, 11 May 2009 09:58:19 -0700
Subject: ARM: OMAP: GPIO de-bounce clocks don't affect module idle state

GPIO de-bounce clocks don't have any impact on the module idle state, so
the clock code should not wait for the module to enable after the de-bounce
clocks are enabled.

Problem found by Kevin Hilman <khilman@deeprootsystems.com>.

Signed-off-by: Paul Walmsley <paul@pwsan.com>
Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/clock34xx.h b/arch/arm/mach-omap2/clock34xx.h
index 6763b8f..017a30e 100644
--- a/arch/arm/mach-omap2/clock34xx.h
+++ b/arch/arm/mach-omap2/clock34xx.h
@@ -2182,7 +2182,7 @@ static struct clk wkup_32k_fck = {
 
 static struct clk gpio1_dbck = {
 	.name		= "gpio1_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &wkup_32k_fck,
 	.enable_reg	= OMAP_CM_REGADDR(WKUP_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO1_SHIFT,
@@ -2427,7 +2427,7 @@ static struct clk per_32k_alwon_fck = {
 
 static struct clk gpio6_dbck = {
 	.name		= "gpio6_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &per_32k_alwon_fck,
 	.enable_reg	= OMAP_CM_REGADDR(OMAP3430_PER_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO6_SHIFT,
@@ -2437,7 +2437,7 @@ static struct clk gpio6_dbck = {
 
 static struct clk gpio5_dbck = {
 	.name		= "gpio5_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &per_32k_alwon_fck,
 	.enable_reg	= OMAP_CM_REGADDR(OMAP3430_PER_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO5_SHIFT,
@@ -2447,7 +2447,7 @@ static struct clk gpio5_dbck = {
 
 static struct clk gpio4_dbck = {
 	.name		= "gpio4_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &per_32k_alwon_fck,
 	.enable_reg	= OMAP_CM_REGADDR(OMAP3430_PER_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO4_SHIFT,
@@ -2457,7 +2457,7 @@ static struct clk gpio4_dbck = {
 
 static struct clk gpio3_dbck = {
 	.name		= "gpio3_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &per_32k_alwon_fck,
 	.enable_reg	= OMAP_CM_REGADDR(OMAP3430_PER_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO3_SHIFT,
@@ -2467,7 +2467,7 @@ static struct clk gpio3_dbck = {
 
 static struct clk gpio2_dbck = {
 	.name		= "gpio2_dbck",
-	.ops		= &clkops_omap2_dflt_wait,
+	.ops		= &clkops_omap2_dflt,
 	.parent		= &per_32k_alwon_fck,
 	.enable_reg	= OMAP_CM_REGADDR(OMAP3430_PER_MOD, CM_FCLKEN),
 	.enable_bit	= OMAP3430_EN_GPIO2_SHIFT,
-- 
cgit v0.10.2


From eaf9393bb79a9c7c8e708669f9581ca466122430 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@nokia.com>
Date: Tue, 12 May 2009 11:20:02 -0700
Subject: ARM: OMAP2: Fix tusb6010 init error and compilation warning

Fix "tusb6010 init error 5, -19" and compilation warning from function
tusb6010_platform_retime "warning: 'sysclk_ps' is used uninitialized in this
function".

I suppose commit c094ba34b8f780885d029ce3c2715a194b780e5d was meant to test
for zero fclk_ps instead of sysclk_ps.

Signed-off-by: Jarkko Nikula <jarkko.nikula@nokia.com>
Cc: Roel Kluin <roel.kluin@gmail.com>
Tested-by: Kalle Valo <kalle.valo@iki.fi>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/usb-tusb6010.c b/arch/arm/mach-omap2/usb-tusb6010.c
index 8df55f4..8622c24 100644
--- a/arch/arm/mach-omap2/usb-tusb6010.c
+++ b/arch/arm/mach-omap2/usb-tusb6010.c
@@ -187,7 +187,7 @@ int tusb6010_platform_retime(unsigned is_refclk)
 	unsigned	sysclk_ps;
 	int		status;
 
-	if (!refclk_psec || sysclk_ps == 0)
+	if (!refclk_psec || fclk_ps == 0)
 		return -ENODEV;
 
 	sysclk_ps = is_refclk ? refclk_psec : TUSB6010_OSCCLK_60;
-- 
cgit v0.10.2


From c8a799b03a981b4b27d8fd7320c96cfbe781b2fb Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jhnikula@gmail.com>
Date: Tue, 12 May 2009 11:20:02 -0700
Subject: ARM: OMAP2: Fix SPI driver failure on 2420 when running multi-omap
 config

SPI driver will do unhandled fault on OMAP2420 if trying to probe
non-existing SPI busses. Register those additional busses runtime only
for cpus having them.

Signed-off-by: Jarkko Nikula <jhnikula@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/devices.c b/arch/arm/mach-omap2/devices.c
index 496983a..894cc35 100644
--- a/arch/arm/mach-omap2/devices.c
+++ b/arch/arm/mach-omap2/devices.c
@@ -354,10 +354,12 @@ static void omap_init_mcspi(void)
 	platform_device_register(&omap2_mcspi1);
 	platform_device_register(&omap2_mcspi2);
 #if defined(CONFIG_ARCH_OMAP2430) || defined(CONFIG_ARCH_OMAP3)
-	platform_device_register(&omap2_mcspi3);
+	if (cpu_is_omap2430() || cpu_is_omap343x())
+		platform_device_register(&omap2_mcspi3);
 #endif
 #ifdef CONFIG_ARCH_OMAP3
-	platform_device_register(&omap2_mcspi4);
+	if (cpu_is_omap343x())
+		platform_device_register(&omap2_mcspi4);
 #endif
 }
 
-- 
cgit v0.10.2


From 4ea60b0c7a8487af5de736d394b147baf7691f3c Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 12 May 2009 11:20:03 -0700
Subject: ARM: OMAP2/3: Add name for musb clocks

With the clkdev, musb_core.c needs to register clock with name "ick".

Once all the platforms using the musb driver have been converted
to use clockdev, the clock name does not need to be passed
from the low-level init code.

Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/clock24xx.c b/arch/arm/mach-omap2/clock24xx.c
index efc59c4..ff9b4c0 100644
--- a/arch/arm/mach-omap2/clock24xx.c
+++ b/arch/arm/mach-omap2/clock24xx.c
@@ -206,7 +206,7 @@ static struct omap_clk omap24xx_clks[] = {
 	CLK(NULL,	"aes_ick",	&aes_ick,	CK_243X | CK_242X),
 	CLK(NULL,	"pka_ick",	&pka_ick,	CK_243X | CK_242X),
 	CLK(NULL,	"usb_fck",	&usb_fck,	CK_243X | CK_242X),
-	CLK(NULL,	"usbhs_ick",	&usbhs_ick,	CK_243X),
+	CLK("musb_hdrc",	"ick",	&usbhs_ick,	CK_243X),
 	CLK("mmci-omap-hs.0", "ick",	&mmchs1_ick,	CK_243X),
 	CLK("mmci-omap-hs.0", "fck",	&mmchs1_fck,	CK_243X),
 	CLK("mmci-omap-hs.1", "ick",	&mmchs2_ick,	CK_243X),
diff --git a/arch/arm/mach-omap2/clock34xx.c b/arch/arm/mach-omap2/clock34xx.c
index 0a14dca..a057539 100644
--- a/arch/arm/mach-omap2/clock34xx.c
+++ b/arch/arm/mach-omap2/clock34xx.c
@@ -157,7 +157,7 @@ static struct omap_clk omap34xx_clks[] = {
 	CLK(NULL,	"ssi_ssr_fck",	&ssi_ssr_fck,	CK_343X),
 	CLK(NULL,	"ssi_sst_fck",	&ssi_sst_fck,	CK_343X),
 	CLK(NULL,	"core_l3_ick",	&core_l3_ick,	CK_343X),
-	CLK(NULL,	"hsotgusb_ick",	&hsotgusb_ick,	CK_343X),
+	CLK("musb_hdrc",	"ick",	&hsotgusb_ick,	CK_343X),
 	CLK(NULL,	"sdrc_ick",	&sdrc_ick,	CK_343X),
 	CLK(NULL,	"gpmc_fck",	&gpmc_fck,	CK_343X),
 	CLK(NULL,	"security_l3_ick", &security_l3_ick, CK_343X),
-- 
cgit v0.10.2


From 7ed42a28b269f8682eefae27f5c11187eb56e63b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 12 May 2009 11:33:08 -0700
Subject: x86, boot: correct sanity checks in boot/compressed/misc.c

arch/x86/boot/compressed/misc.c contains several sanity checks on the
output address.  Correct constraints that are no longer correct:

- the alignment test should be MIN_KERNEL_ALIGN on both 32 and 64
  bits.
- the 64 bit maximum address was set to 2^40, which was the limit of
  one specific x86-64 implementation.  Change the test to 2^46, the
  current Linux limit, and at least try to test the end rather than
  the beginning.
- for non-relocatable kernels, test against LOAD_PHYSICAL_ADDR on both
  32 and 64 bits.

[ Impact: fix potential boot failure due to invalid tests ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73..842b2a3 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+	if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+		error("Destination address inappropriately aligned");
 #ifdef CONFIG_X86_64
-	if ((unsigned long)output & (__KERNEL_ALIGN - 1))
-		error("Destination address not 2M aligned");
-	if ((unsigned long)output >= 0xffffffffffUL)
+	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
 #else
-	if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
-		error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
 	if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
 		error("Destination address too large");
+#endif
 #ifndef CONFIG_RELOCATABLE
-	if ((u32)output != LOAD_PHYSICAL_ADDR)
+	if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
 		error("Wrong destination address");
 #endif
-#endif
 
 	if (!quiet)
 		putstr("\nDecompressing Linux... ");
-- 
cgit v0.10.2


From c4f68236e41641494f9c8a418ccc0678c335bbb5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 12 May 2009 11:37:34 -0700
Subject: x86-64: align __PHYSICAL_START, remove __KERNEL_ALIGN

Handle the misconfiguration where CONFIG_PHYSICAL_START is
incompatible with CONFIG_PHYSICAL_ALIGN.  This is a configuration
error, but one which arises easily since Kconfig doesn't have the
smarts to express the true relationship between these two variables.
Hence, align __PHYSICAL_START the same way we align LOAD_PHYSICAL_ADDR
in <asm/boot.h>.

For non-relocatable kernels, this would cause the boot to fail.

[ Impact: fix boot failures for non-relocatable kernels ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b..e11900f 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,17 +32,9 @@
  */
 #define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
 
-#define __PHYSICAL_START	CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN		0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START	((CONFIG_PHYSICAL_START +	 	\
+				  (CONFIG_PHYSICAL_ALIGN - 1)) &	\
+				 ~(CONFIG_PHYSICAL_ALIGN - 1))
 
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
-- 
cgit v0.10.2


From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 08:12:51 +0200
Subject: perf_counter: fix print debug irq disable

inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (sysrq_key_table_lock){?.....},

Don't unconditionally enable interrupts in the perf_counter_print_debug()
path.

[ Impact: fix potential deadlock pointed out by lockdep ]

LKML-Reference: <new-submission>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da27419..f7772ff 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -621,12 +621,13 @@ void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
 	int cpu, idx;
 
 	if (!x86_pmu.num_counters)
 		return;
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -664,7 +665,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void x86_pmu_disable(struct perf_counter *counter)
-- 
cgit v0.10.2


From 4f23122858a27ba97444b9b37a066d83edebd4c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 13 May 2009 08:35:35 +0200
Subject: splice: fix repeated kmap()'s in default_file_splice_read()

We cannot reliably map more than one page at the time, or we risk
deadlocking. Just allocate the pages from low mem instead.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index eefd96b..c5e3c79 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -580,13 +580,13 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
 		struct page *page;
 
-		page = alloc_page(GFP_HIGHUSER);
+		page = alloc_page(GFP_USER);
 		error = -ENOMEM;
 		if (!page)
 			goto err;
 
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
-		vec[i].iov_base = (void __user *) kmap(page);
+		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
 		pages[i] = page;
 		spd.nr_pages++;
@@ -604,7 +604,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
-		kunmap(pages[i]);
 		this_len = min_t(size_t, vec[i].iov_len, res);
 		partial[i].offset = 0;
 		partial[i].len = this_len;
@@ -624,10 +623,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	return res;
 
 err:
-	for (i = 0; i < spd.nr_pages; i++) {
-		kunmap(pages[i]);
+	for (i = 0; i < spd.nr_pages; i++)
 		__free_page(pages[i]);
-	}
+
 	return error;
 }
 EXPORT_SYMBOL(default_file_splice_read);
-- 
cgit v0.10.2


From af777ce42d3d51cdef353ce296d6f99dc503feef Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 16:59:40 +0900
Subject: sh: clkfwk: module_clk -> peripheral_clk rename.

For consistenct naming, and to allow us to fix up some confusion in the
SH-Mobile clock framework, amongst other places.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 61ff227..0eedf93 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -50,8 +50,8 @@ static struct clk master_clk = {
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
-static struct clk module_clk = {
-	.name		= "module_clk",
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",
 	.parent		= &master_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 };
@@ -73,7 +73,7 @@ static struct clk cpu_clk = {
  */
 static struct clk *onchip_clocks[] = {
 	&master_clk,
-	&module_clk,
+	&peripheral_clk,
 	&bus_clk,
 	&cpu_clk,
 };
diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 94ac27f..1379873 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -115,7 +115,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -147,7 +147,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index a452d96..869c2da 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -118,7 +118,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -149,7 +149,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 772358b..d8febe1 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -255,7 +255,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -286,7 +286,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -317,7 +317,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index d749341..62e3039 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -211,7 +211,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -243,7 +243,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -275,7 +275,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -306,7 +306,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index 2fc6bff..3e6f3d7 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -171,7 +171,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -203,7 +203,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -235,7 +235,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -266,7 +266,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -297,7 +297,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 3951366..88f742f 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -121,7 +121,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -152,7 +152,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -183,7 +183,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index 9412d91..c563067 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -149,7 +149,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -211,7 +211,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index 07ff38d..efa76c8 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -125,7 +125,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -156,7 +156,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -187,7 +187,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index d8b46f5..5b21077 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -128,7 +128,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x10,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 125,
 };
@@ -160,7 +160,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x20,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt1_resources[] = {
@@ -190,7 +190,7 @@ static struct sh_timer_config cmt2_platform_data = {
 	.name = "CMT2",
 	.channel_offset = 0x30,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt2_resources[] = {
@@ -220,7 +220,7 @@ static struct sh_timer_config cmt3_platform_data = {
 	.name = "CMT3",
 	.channel_offset = 0x40,
 	.timer_bit = 3,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt3_resources[] = {
@@ -250,7 +250,7 @@ static struct sh_timer_config cmt4_platform_data = {
 	.name = "CMT4",
 	.channel_offset = 0x50,
 	.timer_bit = 4,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt4_resources[] = {
@@ -280,7 +280,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -311,7 +311,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -342,7 +342,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index be79fa13..6d088d1 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -38,7 +38,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -69,7 +69,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -100,7 +100,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index 09da0c1..851672d 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -65,7 +65,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -96,7 +96,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -127,7 +127,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -162,7 +162,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -192,7 +192,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index cd09733..5b82251 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -164,7 +164,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -195,7 +195,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -226,7 +226,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index c91f34c..f1e0c0d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -118,7 +118,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -149,7 +149,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -210,7 +210,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -240,7 +240,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -270,7 +270,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b61d614..12fb875 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -81,7 +81,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -143,7 +143,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -173,7 +173,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -203,7 +203,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -233,7 +233,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
@@ -263,7 +263,7 @@ static struct sh_timer_config tmu6_platform_data = {
 	.name = "TMU6",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu6_resources[] = {
@@ -293,7 +293,7 @@ static struct sh_timer_config tmu7_platform_data = {
 	.name = "TMU7",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu7_resources[] = {
@@ -323,7 +323,7 @@ static struct sh_timer_config tmu8_platform_data = {
 	.name = "TMU8",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu8_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index f1df020..715e05b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -18,7 +18,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -49,7 +49,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -80,7 +80,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -110,7 +110,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -140,7 +140,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -170,7 +170,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index dc5d3e5..d7e77bc 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index 2c464bf..93e0d2c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -167,7 +167,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -197,7 +197,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -227,7 +227,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
@@ -257,7 +257,7 @@ static struct sh_timer_config tmu6_platform_data = {
 	.name = "TMU6",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu6_resources[] = {
@@ -287,7 +287,7 @@ static struct sh_timer_config tmu7_platform_data = {
 	.name = "TMU7",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu7_resources[] = {
@@ -317,7 +317,7 @@ static struct sh_timer_config tmu8_platform_data = {
 	.name = "TMU8",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu8_resources[] = {
@@ -347,7 +347,7 @@ static struct sh_timer_config tmu9_platform_data = {
 	.name = "TMU9",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu9_resources[] = {
@@ -377,7 +377,7 @@ static struct sh_timer_config tmu10_platform_data = {
 	.name = "TMU10",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu10_resources[] = {
@@ -407,7 +407,7 @@ static struct sh_timer_config tmu11_platform_data = {
 	.name = "TMU11",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu11_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index 9d5185b..53c65fd 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -53,7 +53,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -84,7 +84,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -115,7 +115,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -145,7 +145,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -175,7 +175,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -205,7 +205,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index 678d69b..f5ff1ac 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/drivers/i2c/busses/i2c-sh7760.c b/drivers/i2c/busses/i2c-sh7760.c
index baa28b7..b9680f5 100644
--- a/drivers/i2c/busses/i2c-sh7760.c
+++ b/drivers/i2c/busses/i2c-sh7760.c
@@ -396,7 +396,7 @@ static int __devinit calc_CCR(unsigned long scl_hz)
 	signed char cdf, cdfm;
 	int scgd, scgdm, scgds;
 
-	mclk = clk_get(NULL, "module_clk");
+	mclk = clk_get(NULL, "peripheral_clk");
 	if (IS_ERR(mclk)) {
 		return PTR_ERR(mclk);
 	} else {
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 4e3248d..fa4d52a 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1084,7 +1084,7 @@ static void __devinit sci_init_single(struct platform_device *dev,
 	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
 	sci_port->iclk		= p->clk ? clk_get(&dev->dev, p->clk) : NULL;
-	sci_port->dclk		= clk_get(&dev->dev, "module_clk");
+	sci_port->dclk		= clk_get(&dev->dev, "peripheral_clk");
 	sci_port->enable	= sci_clk_enable;
 	sci_port->disable	= sci_clk_disable;
 #else
diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c
index 78cfb66..6fa3140 100644
--- a/sound/oss/sh_dac_audio.c
+++ b/sound/oss/sh_dac_audio.c
@@ -95,18 +95,18 @@ static void dac_audio_stop(void)
 		outw(v, HD64461_GPADR);
 	}
 
- 	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
+	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 	sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 }
 
 static void dac_audio_set_rate(void)
 {
 	unsigned long interval;
- 	struct clk *clk;
+	struct clk *clk;
 
- 	clk = clk_get(NULL, "module_clk");
- 	interval = (clk_get_rate(clk) / 4) / rate;
- 	clk_put(clk);
+	clk = clk_get(NULL, "peripheral_clk");
+	interval = (clk_get_rate(clk) / 4) / rate;
+	clk_put(clk);
 	ctrl_outl(interval, TMU1_TCOR);
 	ctrl_outl(interval, TMU1_TCNT);
 }
-- 
cgit v0.10.2


From d672fef02738582bdeae6e77176e141eeb9169bc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:03:09 +0900
Subject: sh: clkfwk: Handle NULL clkops for root clocks.

root clocks may simply be placeholders for rate and ancestry information,
and have no real associated operations of their own. Account for this,
so we are still able to use these sorts of clocks for rate propagation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0eedf93..2ced20f 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -103,7 +103,7 @@ void propagate_rate(struct clk *tclk)
 	struct clk *clkp;
 
 	list_for_each_entry(clkp, &tclk->children, sibling) {
-		if (clkp->ops->recalc)
+		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
 		propagate_rate(clkp);
 	}
@@ -196,7 +196,7 @@ void recalculate_root_clocks(void)
 	struct clk *clkp;
 
 	list_for_each_entry(clkp, &root_clks, sibling) {
-		if (clkp->ops->recalc)
+		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
 		propagate_rate(clkp);
 	}
@@ -224,7 +224,7 @@ int clk_register(struct clk *clk)
 		list_add(&clk->sibling, &root_clks);
 
 	list_add(&clk->node, &clock_list);
-	if (clk->ops->init)
+	if (clk->ops && clk->ops->init)
 		clk->ops->init(clk);
 	mutex_unlock(&clock_list_sem);
 
-- 
cgit v0.10.2


From 100890c55e326a9acb4429593c5ad2012c194564 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:05:51 +0900
Subject: sh: clkfwk: Provide a generic clk_set_rate_ex() path for root clocks.

In the case of root clocks (such as clkin oscillators, extal, etc.), the
rate information is entirely platform dependent and needs to be lazily
set and propagated from the platform code. This provides a method for
establishing the rate update on these types of clocks that define no
set_rate() op of their own.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 2ced20f..0a7755c 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -265,20 +265,27 @@ EXPORT_SYMBOL_GPL(clk_set_rate);
 int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 {
 	int ret = -EOPNOTSUPP;
+	unsigned long flags;
 
-	if (likely(clk->ops && clk->ops->set_rate)) {
-		unsigned long flags;
+	spin_lock_irqsave(&clock_lock, flags);
 
-		spin_lock_irqsave(&clock_lock, flags);
+	if (likely(clk->ops && clk->ops->set_rate)) {
 		ret = clk->ops->set_rate(clk, rate, algo_id);
-		if (ret == 0) {
-			if (clk->ops->recalc)
-				clk->rate = clk->ops->recalc(clk);
-			propagate_rate(clk);
-		}
-		spin_unlock_irqrestore(&clock_lock, flags);
+		if (ret != 0)
+			goto out_unlock;
+	} else {
+		clk->rate = rate;
+		ret = 0;
 	}
 
+	if (clk->ops && clk->ops->recalc)
+		clk->rate = clk->ops->recalc(clk);
+
+	propagate_rate(clk);
+
+out_unlock:
+	spin_unlock_irqrestore(&clock_lock, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
-- 
cgit v0.10.2


From 253b0887b3736160feac9ccdcf146a2073e41463 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:38:11 +0900
Subject: sh: clkfwk: Rework legacy CPG clock handling.

This moves out the old legacy CPG clocks to their own file, and converts
over the existing users. With these clocks going away and each CPU
dealing with them on their own, CPUs can gradually move over to the new
interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index fb75c2d..d7990cd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -513,6 +513,9 @@ config SH_PCLK_FREQ
 	  This is necessary for determining the reference clock value on
 	  platforms lacking an RTC.
 
+config SH_CLK_CPG_LEGACY
+	def_bool y
+
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
 	depends on CPU_SH2
diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index e2f5bf1..c27e844 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -47,7 +47,7 @@ struct clk_lookup {
 #define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
-void arch_init_clk_ops(struct clk_ops **, int type);
+void __deprecated arch_init_clk_ops(struct clk_ops **, int type);
 int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
@@ -59,6 +59,9 @@ int clk_reparent(struct clk *child, struct clk *parent);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
+/* arch/sh/kernel/cpu/clock-cpg.c */
+int __init __deprecated cpg_clk_init(void);
+
 /* the exported API, in addition to clk_set_rate */
 /**
  * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter
diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h
index 64b1c16..73d6d16 100644
--- a/arch/sh/include/asm/machvec.h
+++ b/arch/sh/include/asm/machvec.h
@@ -46,6 +46,8 @@ struct sh_machine_vector {
 
 	void __iomem *(*mv_ioport_map)(unsigned long port, unsigned int size);
 	void (*mv_ioport_unmap)(void __iomem *);
+
+	int (*mv_clk_init)(void);
 };
 
 extern struct sh_machine_vector sh_mv;
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index 2600641..05105b9 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -17,5 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE)	+= shmobile/
 
 obj-$(CONFIG_UBC_WAKEUP)	+= ubc.o
 obj-$(CONFIG_SH_ADC)		+= adc.o
+obj-$(CONFIG_SH_CLK_CPG_LEGACY)	+= clock-cpg.o
 
 obj-y	+= irq/ init.o clock.o
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
new file mode 100644
index 0000000..b4ca200
--- /dev/null
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -0,0 +1,60 @@
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <asm/clock.h>
+
+static struct clk master_clk = {
+	.name		= "master_clk",
+	.flags		= CLK_ENABLE_ON_INIT,
+	.rate		= CONFIG_SH_PCLK_FREQ,
+};
+
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static struct clk bus_clk = {
+	.name		= "bus_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static struct clk cpu_clk = {
+	.name		= "cpu_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+/*
+ * The ordering of these clocks matters, do not change it.
+ */
+static struct clk *onchip_clocks[] = {
+	&master_clk,
+	&peripheral_clk,
+	&bus_clk,
+	&cpu_clk,
+};
+
+int __init __deprecated cpg_clk_init(void)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) {
+		struct clk *clk = onchip_clocks[i];
+		arch_init_clk_ops(&clk->ops, i);
+		if (clk->ops)
+			ret |= clk_register(clk);
+	}
+
+	return ret;
+}
+
+/*
+ * Placeholder for compatability, until the lazy CPUs do this
+ * on their own.
+ */
+int __init __weak arch_clk_init(void)
+{
+	return cpg_clk_init();
+}
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0a7755c..033f466 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -30,54 +30,12 @@
 #include <linux/platform_device.h>
 #include <linux/proc_fs.h>
 #include <asm/clock.h>
+#include <asm/machvec.h>
 
 static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
 static DEFINE_MUTEX(clock_list_sem);
 
-/*
- * Each subtype is expected to define the init routines for these clocks,
- * as each subtype (or processor family) will have these clocks at the
- * very least. These are all provided through the CPG, which even some of
- * the more quirky parts (such as ST40, SH4-202, etc.) still have.
- *
- * The processor-specific code is expected to register any additional
- * clock sources that are of interest.
- */
-static struct clk master_clk = {
-	.name		= "master_clk",
-	.flags		= CLK_ENABLE_ON_INIT,
-	.rate		= CONFIG_SH_PCLK_FREQ,
-};
-
-static struct clk peripheral_clk = {
-	.name		= "peripheral_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-static struct clk bus_clk = {
-	.name		= "bus_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-static struct clk cpu_clk = {
-	.name		= "cpu_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-/*
- * The ordering of these clocks matters, do not change it.
- */
-static struct clk *onchip_clocks[] = {
-	&master_clk,
-	&peripheral_clk,
-	&bus_clk,
-	&cpu_clk,
-};
-
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
@@ -443,10 +401,6 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-int __init __weak arch_clk_init(void)
-{
-	return 0;
-}
 
 static int show_clocks(char *buf, char **start, off_t off,
 		       int len, int *eof, void *data)
@@ -533,18 +487,22 @@ subsys_initcall(clk_sysdev_init);
 
 int __init clk_init(void)
 {
-	int i, ret = 0;
-
-	BUG_ON(!master_clk.rate);
-
-	for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) {
-		struct clk *clk = onchip_clocks[i];
+	int ret;
 
-		arch_init_clk_ops(&clk->ops, i);
-		ret |= clk_register(clk);
+	ret = arch_clk_init();
+	if (unlikely(ret)) {
+		pr_err("%s: CPU clock registration failed.\n", __func__);
+		return ret;
 	}
 
-	ret |= arch_clk_init();
+	if (sh_mv.mv_clk_init) {
+		ret = sh_mv.mv_clk_init();
+		if (unlikely(ret)) {
+			pr_err("%s: machvec clock initialization failed.\n",
+			       __func__);
+			return ret;
+		}
+	}
 
 	/* Kick the child clocks.. */
 	recalculate_root_clocks();
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index a72dc32..21421e3 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -152,9 +152,12 @@ static struct clk *sh4202_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) {
 		struct clk *clkp = sh4202_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 426065b..a98d7da 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -892,6 +892,8 @@ int __init arch_clk_init(void)
 	struct clk *clk;
 	int i;
 
+	clk_cpg_init();
+
 	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
 		pr_debug( "Registering clock '%s'\n", sh7722_clocks[i]->name);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index ce3d4e6..370cd47 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -92,9 +92,12 @@ static struct clk *sh7763_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) {
 		struct clk *clkp = sh7763_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 38b8b1d..a249d82 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -98,9 +98,12 @@ static struct clk *sh7780_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) {
 		struct clk *clkp = sh7780_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index fa14f35..bf5a0da 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -136,9 +136,12 @@ static struct clk *sh7785_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
 		struct clk *clkp = sh7785_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 7907002..a0e8869 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -122,9 +122,12 @@ static struct clk *sh7786_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) {
 		struct clk *clkp = sh7786_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index c14553e..23c27d3 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -109,9 +109,12 @@ static struct clk *shx3_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) {
 		struct clk *clkp = shx3_onchip_clocks[i];
 
-- 
cgit v0.10.2


From a77b5ac0ea8e47c77008d3a9a9976dcfbc01c42a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:55:00 +0900
Subject: sh: clkfwk: Update SH7785 for refactored clock framework.

This updates the SH7785 CPU code as well as the SH7785LCR board support
code for making use of the newly refactored clock framework. Support for
the legacy CPG clocks is dropped at this point, with the extal frequency
fed in from the board code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d7990cd..df764c5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -514,7 +514,7 @@ config SH_PCLK_FREQ
 	  platforms lacking an RTC.
 
 config SH_CLK_CPG_LEGACY
-	def_bool y
+	def_bool y if !CPU_SUBTYPE_SH7785
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index 6f94f17..33b194b 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -2,12 +2,12 @@
  * Renesas Technology Corp. R0P7785LC0011RL Support.
  *
  * Copyright (C) 2008  Yoshihiro Shimoda
+ * Copyright (C) 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
-
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/sm501.h>
@@ -19,8 +19,11 @@
 #include <linux/i2c-pca-platform.h>
 #include <linux/i2c-algo-pca.h>
 #include <linux/irq.h>
-#include <asm/heartbeat.h>
+#include <linux/clk.h>
+#include <linux/errno.h>
 #include <mach/sh7785lcr.h>
+#include <asm/heartbeat.h>
+#include <asm/clock.h>
 
 /*
  * NOTE: This board has 2 physical memory maps.
@@ -273,6 +276,20 @@ void __init init_sh7785lcr_IRQ(void)
 	plat_irq_setup_pins(IRQ_MODE_IRQ3210);
 }
 
+static int sh7785lcr_clk_init(void)
+{
+	struct clk *clk;
+	int ret;
+
+	clk = clk_get(NULL, "extal");
+	if (!clk || IS_ERR(clk))
+		return PTR_ERR(clk);
+	ret = clk_set_rate(clk, 33333333);
+	clk_put(clk);
+
+	return ret;
+}
+
 static void sh7785lcr_power_off(void)
 {
 	unsigned char *p;
@@ -309,6 +326,7 @@ static void __init sh7785lcr_setup(char **cmdline_p)
 static struct sh_machine_vector mv_sh7785lcr __initmv = {
 	.mv_name		= "SH7785LCR",
 	.mv_setup		= sh7785lcr_setup,
+	.mv_clk_init		= sh7785lcr_clk_init,
 	.mv_init_irq		= init_sh7785lcr_IRQ,
 };
 
diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index c27e844..40cf3c0 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -34,7 +34,9 @@ struct clk {
 
 	unsigned long		rate;
 	unsigned long		flags;
+
 	unsigned long		arch_flags;
+	void			*priv;
 };
 
 struct clk_lookup {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index bf5a0da..87584dc 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -3,7 +3,7 @@
  *
  * SH7785 support for the clock framework
  *
- *  Copyright (C) 2007  Paul Mundt
+ *  Copyright (C) 2007 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -11,145 +11,146 @@
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
-#include <asm/io.h>
-
-static int ifc_divisors[] = { 1, 2, 4, 6 };
-static int ufc_divisors[] = { 1, 1, 4, 6 };
-static int sfc_divisors[] = { 1, 1, 4, 6 };
-static int bfc_divisors[] = { 1, 1, 1, 1, 1, 12, 16, 18,
-			     24, 32, 36, 48, 1, 1, 1, 1 };
-static int mfc_divisors[] = { 1, 1, 4, 6 };
-static int pfc_divisors[] = { 1, 1, 1, 1, 1, 1, 1, 18,
-			      24, 32, 36, 48, 1, 1, 1, 1 };
-
-static void master_clk_init(struct clk *clk)
-{
-	clk->rate *= pfc_divisors[ctrl_inl(FRQMR1) & 0x000f];
-}
 
-static struct clk_ops sh7785_master_clk_ops = {
-	.init		= master_clk_init,
+static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
+			       24, 32, 36, 48 };
+struct clk_priv {
+	unsigned int shift;
 };
 
-static unsigned long module_clk_recalc(struct clk *clk)
-{
-	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	return clk->parent->rate / pfc_divisors[idx];
-}
+#define FRQMR_CLK_DATA(_name, _shift)	\
+static struct clk_priv _name##_data = {	.shift = _shift, }
 
-static struct clk_ops sh7785_module_clk_ops = {
-	.recalc		= module_clk_recalc,
-};
+FRQMR_CLK_DATA(pfc,  0);
+FRQMR_CLK_DATA(s3fc, 4);
+FRQMR_CLK_DATA(s2fc, 8);
+FRQMR_CLK_DATA(mfc, 12);
+FRQMR_CLK_DATA(bfc, 16);
+FRQMR_CLK_DATA(sfc, 20);
+FRQMR_CLK_DATA(ufc, 24);
+FRQMR_CLK_DATA(ifc, 28);
 
-static unsigned long bus_clk_recalc(struct clk *clk)
+static unsigned long frqmr_clk_recalc(struct clk *clk)
 {
-	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	return clk->parent->rate / bfc_divisors[idx];
-}
+	struct clk_priv *data = clk->priv;
+	unsigned int idx;
 
-static struct clk_ops sh7785_bus_clk_ops = {
-	.recalc		= bus_clk_recalc,
-};
+	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	return clk->parent->rate / ifc_divisors[idx];
+	/*
+	 * XXX: PLL1 multiplier is locked for the default clock mode,
+	 * when mode pin detection and configuration support is added,
+	 * select the multiplier dynamically.
+	 */
+	return clk->parent->rate * 36 / div2[idx];
 }
 
-static struct clk_ops sh7785_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+static struct clk_ops frqmr_clk_ops = {
+	.recalc		= frqmr_clk_recalc,
 };
 
-static struct clk_ops *sh7785_clk_ops[] = {
-	&sh7785_master_clk_ops,
-	&sh7785_module_clk_ops,
-	&sh7785_bus_clk_ops,
-	&sh7785_cpu_clk_ops,
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+static struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
-void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
-{
-	if (idx < ARRAY_SIZE(sh7785_clk_ops))
-		*ops = sh7785_clk_ops[idx];
-}
-
-static unsigned long shyway_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	return clk->parent->rate / sfc_divisors[idx];
-}
-
-static struct clk_ops sh7785_shyway_clk_ops = {
-	.recalc		= shyway_clk_recalc,
+static struct clk cpu_clk = {
+	.name		= "cpu_clk",		/* Ick */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &ifc_data,
 };
 
-static struct clk sh7785_shyway_clk = {
-	.name		= "shyway_clk",
+static struct clk shyway_clk = {
+	.name		= "shyway_clk",		/* SHck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_shyway_clk_ops,
+	.priv		= &sfc_data,
 };
 
-static unsigned long ddr_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	return clk->parent->rate / mfc_divisors[idx];
-}
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",	/* Pck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &pfc_data,
+};
 
-static struct clk_ops sh7785_ddr_clk_ops = {
-	.recalc		= ddr_clk_recalc,
+static struct clk ddr_clk = {
+	.name		= "ddr_clk",		/* DDRck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &mfc_data,
 };
 
-static struct clk sh7785_ddr_clk = {
-	.name		= "ddr_clk",
+static struct clk bus_clk = {
+	.name		= "bus_clk",		/* Bck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_ddr_clk_ops,
+	.priv		= &bfc_data,
 };
 
-static unsigned long ram_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003);
-	return clk->parent->rate / ufc_divisors[idx];
-}
+static struct clk ga_clk = {
+	.name		= "ga_clk",		/* GAck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.priv		= &s2fc_data,
+};
 
-static struct clk_ops sh7785_ram_clk_ops = {
-	.recalc		= ram_clk_recalc,
+static struct clk du_clk = {
+	.name		= "du_clk",		/* DUck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.priv		= &s3fc_data,
 };
 
-static struct clk sh7785_ram_clk = {
-	.name		= "ram_clk",
+static struct clk umem_clk = {
+	.name		= "umem_clk",		/* uck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_ram_clk_ops,
+	.priv		= &ufc_data,
 };
 
-/*
- * Additional SH7785-specific on-chip clocks that aren't already part of the
- * clock framework
- */
-static struct clk *sh7785_onchip_clocks[] = {
-	&sh7785_shyway_clk,
-	&sh7785_ddr_clk,
-	&sh7785_ram_clk,
+static struct clk *clks[] = {
+	&extal_clk,
+	&cpu_clk,
+	&shyway_clk,
+	&peripheral_clk,
+	&ddr_clk,
+	&bus_clk,
+	&ga_clk,
+	&du_clk,
+	&umem_clk,
 };
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk;
 	int i, ret = 0;
 
-	cpg_clk_init();
-
-	clk = clk_get(NULL, "master_clk");
-	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
-		struct clk *clkp = sh7785_onchip_clocks[i];
-
-		clkp->parent = clk;
-		ret |= clk_register(clkp);
-	}
-
-	clk_put(clk);
+	for (i = 0; i < ARRAY_SIZE(clks); i++)
+		ret |= clk_register(clks[i]);
 
 	return ret;
 }
-- 
cgit v0.10.2


From a1c0643ff9f360a30644f6e3cd643ca2a5083aea Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 10:56:52 +0100
Subject: GFS2: Move journal live test at transaction start

There seems little point grabbing the transaction glock
only to have to release it again if the journal isn't
live. This moves the test earlier to avoid grabbing the lock
when we don't need it in the first place.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d..4ef0e9f 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	BUG_ON(current->journal_info);
 	BUG_ON(blocks == 0 && revokes == 0);
 
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
 	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
 	if (!tr)
 		return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	if (error)
 		goto fail_holder_uninit;
 
-	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		tr->tr_t_gh.gh_flags |= GL_NOCACHE;
-		error = -EROFS;
-		goto fail_gunlock;
-	}
-
 	error = gfs2_log_reserve(sdp, tr->tr_reserved);
 	if (error)
 		goto fail_gunlock;
-- 
cgit v0.10.2


From cc96eace48fdf0f8925a74c6c1f7ffa512e458d2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 20:28:15 +0900
Subject: sh: clkfwk: rate table construction and rounding for SH7785.

This adds support for constructing a rate table by looking at potential
divisors for a specified clock. Each FQRMR clock is given its own table.
Presently each table is rebuilt when the parent propagates down a new
rate, so some more logic needs to be added to do this more intelligently.

Additionally, a fairly generic round_rate() implementation is then
layered on top of it, which subsequently provides us with cpufreq support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 40cf3c0..da681de 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -16,6 +16,7 @@ struct clk_ops {
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
+	void (*build_rate_table)(struct clk *clk);
 };
 
 struct clk {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 033f466..56c6e11 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -63,6 +63,9 @@ void propagate_rate(struct clk *tclk)
 	list_for_each_entry(clkp, &tclk->children, sibling) {
 		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
+		if (clkp->ops && clkp->ops->build_rate_table)
+			clkp->ops->build_rate_table(clkp);
+
 		propagate_rate(clkp);
 	}
 }
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 87584dc..b7a32dd 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -13,28 +13,43 @@
 #include <linux/kernel.h>
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
 struct clk_priv {
-	unsigned int shift;
+	unsigned int			shift;
+
+	/* allowable divisor bitmap */
+	unsigned long			div_bitmap;
+
+	/* Supportable frequencies + termination entry */
+	struct cpufreq_frequency_table	freq_table[ARRAY_SIZE(div2)+1];
 };
 
-#define FRQMR_CLK_DATA(_name, _shift)	\
-static struct clk_priv _name##_data = {	.shift = _shift, }
+#define FRQMR_CLK_DATA(_name, _shift, _div_bitmap)	\
+static struct clk_priv _name##_data = {			\
+	.shift		= _shift,			\
+	.div_bitmap	= _div_bitmap,			\
+							\
+	.freq_table[0]	= {				\
+		.index = 0,				\
+		.frequency = CPUFREQ_TABLE_END,		\
+	},						\
+}
 
-FRQMR_CLK_DATA(pfc,  0);
-FRQMR_CLK_DATA(s3fc, 4);
-FRQMR_CLK_DATA(s2fc, 8);
-FRQMR_CLK_DATA(mfc, 12);
-FRQMR_CLK_DATA(bfc, 16);
-FRQMR_CLK_DATA(sfc, 20);
-FRQMR_CLK_DATA(ufc, 24);
-FRQMR_CLK_DATA(ifc, 28);
+FRQMR_CLK_DATA(pfc,  0, 0x0f80);
+FRQMR_CLK_DATA(s3fc, 4, 0x0ff0);
+FRQMR_CLK_DATA(s2fc, 8, 0x0030);
+FRQMR_CLK_DATA(mfc, 12, 0x000c);
+FRQMR_CLK_DATA(bfc, 16, 0x0fe0);
+FRQMR_CLK_DATA(sfc, 20, 0x000c);
+FRQMR_CLK_DATA(ufc, 24, 0x000c);
+FRQMR_CLK_DATA(ifc, 28, 0x000e);
 
-static unsigned long frqmr_clk_recalc(struct clk *clk)
+static unsigned long frqmr_recalc(struct clk *clk)
 {
 	struct clk_priv *data = clk->priv;
 	unsigned int idx;
@@ -49,8 +64,76 @@ static unsigned long frqmr_clk_recalc(struct clk *clk)
 	return clk->parent->rate * 36 / div2[idx];
 }
 
+static void frqmr_build_rate_table(struct clk *clk)
+{
+	struct clk_priv *data = clk->priv;
+	int i, entry;
+
+	for (i = entry = 0; i < ARRAY_SIZE(div2); i++) {
+		if ((data->div_bitmap & (1 << i)) == 0)
+			continue;
+
+		data->freq_table[entry].index = entry;
+		data->freq_table[entry].frequency =
+			clk->parent->rate * 36 / div2[i];
+
+		entry++;
+	}
+
+	if (entry == 0) {
+		pr_warning("clkfwk: failed to build frequency table "
+			   "for \"%s\" clk!\n", clk->name);
+		return;
+	}
+
+	/* Termination entry */
+	data->freq_table[entry].index = entry;
+	data->freq_table[entry].frequency = CPUFREQ_TABLE_END;
+}
+
+static long frqmr_round_rate(struct clk *clk, unsigned long rate)
+{
+	struct clk_priv *data = clk->priv;
+	unsigned long rate_error, rate_error_prev = ~0UL;
+	unsigned long rate_best_fit = rate;
+	unsigned long highest, lowest;
+	int i;
+
+	highest = lowest = 0;
+
+	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = data->freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq > highest)
+			highest = freq;
+		if (freq < lowest)
+			lowest = freq;
+
+		rate_error = abs(freq - rate);
+		if (rate_error < rate_error_prev) {
+			rate_best_fit = freq;
+			rate_error_prev = rate_error;
+		}
+
+		if (rate_error == 0)
+			break;
+	}
+
+	if (rate >= highest)
+		rate_best_fit = highest;
+	if (rate <= lowest)
+		rate_best_fit = lowest;
+
+	return rate_best_fit;
+}
+
 static struct clk_ops frqmr_clk_ops = {
-	.recalc		= frqmr_clk_recalc,
+	.recalc			= frqmr_recalc,
+	.build_rate_table	= frqmr_build_rate_table,
+	.round_rate		= frqmr_round_rate,
 };
 
 /*
-- 
cgit v0.10.2


From cedcf3366f2191885aff92d33d6078ef08203e52 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 21:51:28 +0900
Subject: sh: clkfwk: Map tree hierarchy in debugfs.

This adopts the OMAP clock framework debugfs bits and replaces the aging
procfs bits. The procfs clocks entry was primarily a debugging aid, and
used to be tied in to cpuinfo before the clock list grew too unweildly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index da681de..c499d47 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -38,6 +38,7 @@ struct clk {
 
 	unsigned long		arch_flags;
 	void			*priv;
+	struct dentry		*dentry;
 };
 
 struct clk_lookup {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 56c6e11..686477f 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -28,7 +28,7 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
-#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <asm/clock.h>
 #include <asm/machvec.h>
 
@@ -404,24 +404,6 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-
-static int show_clocks(char *buf, char **start, off_t off,
-		       int len, int *eof, void *data)
-{
-	struct clk *clk;
-	char *p = buf;
-
-	list_for_each_entry_reverse(clk, &clock_list, node) {
-		unsigned long rate = clk_get_rate(clk);
-
-		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
-			     rate / 1000000, (rate % 1000000) / 10000,
-			      (clk->usecount > 0) ?  "enabled" : "disabled");
-	}
-
-	return p - buf;
-}
-
 #ifdef CONFIG_PM
 static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state)
 {
@@ -516,14 +498,90 @@ int __init clk_init(void)
 	return ret;
 }
 
-static int __init clk_proc_init(void)
+/*
+ *	debugfs support to trace clock tree hierarchy and attributes
+ */
+static struct dentry *clk_debugfs_root;
+
+static int clk_debugfs_register_one(struct clk *c)
 {
-	struct proc_dir_entry *p;
-	p = create_proc_read_entry("clocks", S_IRUSR, NULL,
-				   show_clocks, NULL);
-	if (unlikely(!p))
-		return -EINVAL;
+	int err;
+	struct dentry *d, *child;
+	struct clk *pa = c->parent;
+	char s[255];
+	char *p = s;
+
+	p += sprintf(p, "%s", c->name);
+	if (c->id > 0)
+		sprintf(p, ":%d", c->id);
+	d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root);
+	if (!d)
+		return -ENOMEM;
+	c->dentry = d;
+
+	d = debugfs_create_u8("usecount", S_IRUGO, c->dentry, (u8 *)&c->usecount);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	d = debugfs_create_u32("rate", S_IRUGO, c->dentry, (u32 *)&c->rate);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	d = debugfs_create_x32("flags", S_IRUGO, c->dentry, (u32 *)&c->flags);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	return 0;
+
+err_out:
+	d = c->dentry;
+	list_for_each_entry(child, &d->d_subdirs, d_u.d_child)
+		debugfs_remove(child);
+	debugfs_remove(c->dentry);
+	return err;
+}
+
+static int clk_debugfs_register(struct clk *c)
+{
+	int err;
+	struct clk *pa = c->parent;
+
+	if (pa && !pa->dentry) {
+		err = clk_debugfs_register(pa);
+		if (err)
+			return err;
+	}
 
+	if (!c->dentry) {
+		err = clk_debugfs_register_one(c);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int __init clk_debugfs_init(void)
+{
+	struct clk *c;
+	struct dentry *d;
+	int err;
+
+	d = debugfs_create_dir("clock", NULL);
+	if (!d)
+		return -ENOMEM;
+	clk_debugfs_root = d;
+
+	list_for_each_entry(c, &clock_list, node) {
+		err = clk_debugfs_register(c);
+		if (err)
+			goto err_out;
+	}
 	return 0;
+err_out:
+	debugfs_remove(clk_debugfs_root); /* REVISIT: Cleanup correctly */
+	return err;
 }
-subsys_initcall(clk_proc_init);
+late_initcall(clk_debugfs_init);
-- 
cgit v0.10.2


From 48c2b613616235d7c97fda5982f50100a6c79166 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 14:49:48 +0100
Subject: GFS2: Add commit= mount option

It has always been possible to adjust the gfs2 log commit
interval, but only from the sysfs interface. This adds a
mount option, commit=<nn>, which will be familar to ext3
users.

The sysfs interface continues to be available as well, although
this might be removed in the future.

Also this patch cleans up some duplicated structures in the GFS2
sysfs code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b9..65f438e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -418,6 +418,7 @@ struct gfs2_args {
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
 	unsigned int ar_discard:1;		/* discard requests */
+	int ar_commit;				/* Commit interval */
 };
 
 struct gfs2_tune {
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f7e8527..947af15 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -45,6 +45,7 @@ enum {
 	Opt_meta,
 	Opt_discard,
 	Opt_nodiscard,
+	Opt_commit,
 	Opt_err,
 };
 
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
 	{Opt_meta, "meta"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_commit, "commit=%d"},
 	{Opt_err, NULL}
 };
 
@@ -89,6 +91,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 	char *o;
 	int token;
 	substring_t tmp[MAX_OPT_ARGS];
+	int rv;
 
 	/* Split the options into tokens with the "," character and
 	   process them */
@@ -173,6 +176,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 		case Opt_nodiscard:
 			args->ar_discard = 0;
 			break;
+		case Opt_commit:
+			rv = match_int(&tmp[0], &args->ar_commit);
+			if (rv || args->ar_commit <= 0) {
+				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
 		case Opt_err:
 		default:
 			fs_info(sdp, "invalid mount option: %s\n", o);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473..7981fbc 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -55,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	spin_lock_init(&gt->gt_spin);
 
 	gt->gt_incore_log_blocks = 1024;
-	gt->gt_log_flush_secs = 60;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
@@ -1165,6 +1164,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 
 	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
 	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
+	sdp->sd_args.ar_commit = 60;
 
 	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
@@ -1191,6 +1191,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                                GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
+	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+
 	error = init_names(sdp, silent);
 	if (error)
 		goto fail;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 4580195..0677a83 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -436,8 +436,12 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+	struct gfs2_tune *gt = &sdp->sd_tune;
 	int error;
 
+	spin_lock(&gt->gt_spin);
+	args.ar_commit = gt->gt_log_flush_secs;
+	spin_unlock(&gt->gt_spin);
 	error = gfs2_mount_args(sdp, &args, data);
 	if (error)
 		return error;
@@ -473,6 +477,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		sb->s_flags |= MS_POSIXACL;
 	else
 		sb->s_flags &= ~MS_POSIXACL;
+	spin_lock(&gt->gt_spin);
+	gt->gt_log_flush_secs = args.ar_commit;
+	spin_unlock(&gt->gt_spin);
+
 	return 0;
 }
 
@@ -550,6 +558,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
+	int lfsecs;
 
 	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
 		seq_printf(s, ",meta");
@@ -610,7 +619,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-
+	lfsecs = sdp->sd_tune.gt_log_flush_secs;
+	if (lfsecs != 60)
+		seq_printf(s, ",commit=%d", lfsecs);
 	return 0;
 }
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f502..d53b22e 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
 #include "util.h"
 #include "glops.h"
 
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+	.show  = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+
+static struct kset *gfs2_kset;
+
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	return len;
 }
 
-struct gfs2_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 
 #define GFS2_ATTR(name, mode, show, store) \
 static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,49 +271,21 @@ static struct attribute *gfs2_attrs[] = {
 	NULL,
 };
 
-static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->show ? a->show(sdp, buf) : 0;
-}
-
-static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->store ? a->store(sdp, buf, len) : len;
-}
-
-static struct sysfs_ops gfs2_attr_ops = {
-	.show  = gfs2_attr_show,
-	.store = gfs2_attr_store,
-};
-
 static struct kobj_type gfs2_ktype = {
 	.default_attrs = gfs2_attrs,
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-static struct kset *gfs2_kset;
-
 /*
  * display struct lm_lockstruct fields
  */
 
-struct lockstruct_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
 #define LOCKSTRUCT_ATTR(name, fmt)                                          \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 {                                                                           \
 	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
 }                                                                           \
-static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
+static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name)
 
 LOCKSTRUCT_ATTR(jid,      "%u\n");
 LOCKSTRUCT_ATTR(first,    "%u\n");
@@ -401,14 +398,8 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
-	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
-};
-
 #define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
 GDLM_ATTR(block,          0644, block_show,          block_store);
@@ -434,21 +425,12 @@ static struct attribute *lock_module_attrs[] = {
 	NULL,
 };
 
-/*
- * display struct gfs2_args fields
- */
-
-struct args_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
 #define ARGS_ATTR(name, fmt)                                                \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 {                                                                           \
 	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
 }                                                                           \
-static struct args_attr args_attr_##name = __ATTR_RO(name)
+static struct gfs2_attr args_attr_##name = __ATTR_RO(name)
 
 ARGS_ATTR(lockproto,       "%s\n");
 ARGS_ATTR(locktable,       "%s\n");
@@ -531,14 +513,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 	return len;
 }
 
-struct tune_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
-
 #define TUNE_ATTR_3(name, show, store)                                        \
-static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
 
 #define TUNE_ATTR_2(name, store)                                              \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
-- 
cgit v0.10.2


From 9582d41135c0d362f04ed6bf3dc8d693a7eafee2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 15:06:25 +0100
Subject: GFS2: Remove a couple of unused sysfs entries

These two tunables are pointless and would never need to be
changed anyway. There is also a race between them and umount
as the deamons which they refer to might have gone away. The
easiest way to fix the race is to remove the interface.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d53b22e..894bf77 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -530,15 +530,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-#define TUNE_ATTR_DAEMON(name, process)                                       \
-static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
-{                                                                             \
-	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len);      \
-	wake_up_process(sdp->sd_##process);                                   \
-	return r;                                                             \
-}                                                                             \
-TUNE_ATTR_2(name, name##_store)
-
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -550,8 +541,6 @@ TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
-TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
@@ -565,8 +554,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
-	&tune_attr_recoverd_secs.attr,
-	&tune_attr_logd_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
-- 
cgit v0.10.2


From b103387037cea2ba0f04b44d408d54c53f678061 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 13 May 2009 12:50:40 -0400
Subject: TPM: get_event_name stack corruption

get_event_name uses sprintf to fill a buffer declared on the stack.  It fills
the buffer 2 bytes at a time.  What the code doesn't take into account is that
sprintf(buf, "%02x", data) actually writes 3 bytes.  2 bytes for the data and
then it nul terminates the string.  Since we declare buf to be 40 characters
long and then we write 40 bytes of data into buf sprintf is going to write 41
characters.  The fix is to leave room in buf for the nul terminator.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c
index ed306eb..0c2f55a 100644
--- a/drivers/char/tpm/tpm_bios.c
+++ b/drivers/char/tpm/tpm_bios.c
@@ -212,7 +212,8 @@ static int get_event_name(char *dest, struct tcpa_event *event,
 			unsigned char * event_entry)
 {
 	const char *name = "";
-	char data[40] = "";
+	/* 41 so there is room for 40 data and 1 nul */
+	char data[41] = "";
 	int i, n_len = 0, d_len = 0;
 	struct tcpa_pc_event *pc_event;
 
-- 
cgit v0.10.2


From 77f6bf57ba9d2c50173536dbfdacdab27cb867ca Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 14 May 2009 09:49:44 +0200
Subject: splice: fix error return code

fs/splice.c: In function 'default_file_splice_read':
fs/splice.c:566: warning: 'error' may be used uninitialized in this function

which is sort-of true.  The code will in fact return -ENOMEM instead of the
kernel_readv() return value.

Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index c5e3c79..41179c0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -595,8 +595,10 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	}
 
 	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
-	if (res < 0)
+	if (res < 0) {
+		error = res;
 		goto err;
+	}
 
 	error = 0;
 	if (!res)
-- 
cgit v0.10.2


From 549b5e358d17a8c04953ed80896ce07d37722451 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 14 May 2009 17:38:46 +0900
Subject: sh: clkfwk: Add MSTP bits to SH7785 clock framework.

This plugs in all of the MSTP functions in to the clock framework,
and hands them off to the platform devices that want them.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index c499d47..64c93cb 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -36,6 +36,9 @@ struct clk {
 	unsigned long		rate;
 	unsigned long		flags;
 
+	void __iomem		*enable_reg;
+	unsigned int		enable_bit;
+
 	unsigned long		arch_flags;
 	void			*priv;
 	struct dentry		*dentry;
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 686477f..012d234 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -39,7 +39,7 @@ static DEFINE_MUTEX(clock_list_sem);
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-	return clk->parent->rate;
+	return clk->parent ? clk->parent->rate : 0;
 }
 
 int clk_reparent(struct clk *child, struct clk *parent)
@@ -512,7 +512,7 @@ static int clk_debugfs_register_one(struct clk *c)
 	char *p = s;
 
 	p += sprintf(p, "%s", c->name);
-	if (c->id > 0)
+	if (c->id >= 0)
 		sprintf(p, ":%d", c->id);
 	d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root);
 	if (!d)
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index b7a32dd..cf042b5 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -228,12 +228,75 @@ static struct clk *clks[] = {
 	&umem_clk,
 };
 
+static int mstpcr_clk_enable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
+		     clk->enable_reg);
+	return 0;
+}
+
+static void mstpcr_clk_disable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
+		     clk->enable_reg);
+}
+
+static struct clk_ops mstpcr_clk_ops = {
+	.enable		= mstpcr_clk_enable,
+	.disable	= mstpcr_clk_disable,
+	.recalc		= followparent_recalc,
+};
+
+#define MSTPCR0		0xffc80030
+#define MSTPCR1		0xffc80034
+
+#define CLK(_name, _id, _parent, _enable_reg,		\
+	    _enable_bit, _flags)			\
+{							\
+	.name		= _name,			\
+	.id		= _id,				\
+	.parent		= _parent,			\
+	.enable_reg	= (void __iomem *)_enable_reg,	\
+	.enable_bit	= _enable_bit,			\
+	.flags		= _flags,			\
+	.ops		= &mstpcr_clk_ops,		\
+}
+
+static struct clk mstpcr_clks[] = {
+	/* MSTPCR0 */
+	CLK("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
+	CLK("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
+	CLK("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
+	CLK("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
+	CLK("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
+	CLK("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
+	CLK("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
+	CLK("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
+	CLK("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
+	CLK("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
+	CLK("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
+	CLK("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
+	CLK("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
+	CLK("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
+	CLK("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
+	CLK("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+
+	/* MSTPCR1 */
+	CLK("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
+	CLK("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
+	CLK("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
+	CLK("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
+	CLK("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
+};
+
 int __init arch_clk_init(void)
 {
 	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
+	for (i = 0; i < ARRAY_SIZE(mstpcr_clks); i++)
+		ret |= clk_register(&mstpcr_clks[i]);
 
 	return ret;
 }
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index d7e77bc..af56140 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 	.clockevent_rating = 200,
 };
 
@@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 	.clocksource_rating = 200,
 };
 
@@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 };
 
 static struct resource tmu2_resources[] = {
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu3_resources[] = {
@@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu4_resources[] = {
@@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu5_resources[] = {
@@ -204,31 +204,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 40, 40, 40, 40 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffeb0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 44, 44, 44, 44 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffec0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 60, 60, 60, 60 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffed0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 61, 61, 61, 61 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffee0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 62, 62, 62, 62 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffef0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 63, 63, 63, 63 },
+		.clk		= "scif_fck",
 	}, {
 		.flags = 0,
 	}
-- 
cgit v0.10.2


From ad3256e361663923342bff7f292dd289f794aa33 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 14 May 2009 17:40:08 +0900
Subject: sh: Provide FORCE_MAX_ZONEORDER.

Several platforms want to be able to do large physically contiguous
allocations (primarily nommu and video codecs on SH-Mobile), provide a
MAX_ORDER override for those cases.

Tested-by: Conrad Parker <conrad@metadecks.org>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index b900d2cd..2795618 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -21,6 +21,29 @@ config PAGE_OFFSET
 	default "0x20000000" if MMU && SUPERH64
 	default "0x00000000"
 
+config FORCE_MAX_ZONEORDER
+	int "Maximum zone order"
+	range 9 64 if PAGE_SIZE_16KB
+	default "9" if PAGE_SIZE_16KB
+	range 7 64 if PAGE_SIZE_64KB
+	default "7" if PAGE_SIZE_64KB
+	range 11 64
+	default "14" if !MMU
+	default "11"
+	help
+	  The kernel memory allocator divides physically contiguous memory
+	  blocks into "zones", where each zone is a power of two number of
+	  pages.  This option selects the largest power of two that the kernel
+	  keeps in the memory allocator.  If you need to allocate very large
+	  blocks of physically contiguous memory, then you may need to
+	  increase this value.
+
+	  This config option is actually maximum order plus one. For example,
+	  a value of 11 means that the largest free memory block is 2^10 pages.
+
+	  The page size is not necessarily 4KB. Keep this in mind when
+	  choosing a value for this option.
+
 config MEMORY_START
 	hex "Physical memory start address"
 	default "0x08000000"
-- 
cgit v0.10.2


From 9304d0ccf1922b9b954b6e9223d1b2bc83198ad2 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 14 May 2009 07:53:39 +0000
Subject: sh: intc tables for sh7770

This patch adds INTC tables for sh7770, thanks
goes to Paul for the first prototype version.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b61d614..0feba41 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -387,6 +388,251 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7770_early_devices));
 }
 
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	IRL_HHLL, IRL_HHLH, IRL_HHHL,
+
+	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5,
+
+	GPIO,
+	TMU0, TMU1, TMU2, TMU2_TICPI,
+	TMU3, TMU4, TMU5, TMU5_TICPI,
+	TMU6, TMU7, TMU8,
+	HAC, IPI, SPDIF, HUDI, I2C,
+	DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2,
+	I2S0, I2S1, I2S2, I2S3,
+	SRC_RX, SRC_TX, SRC_SPDIF,
+	DU, VIDEO_IN, REMOTE, YUV, USB, ATAPI, CAN, GPS, GFX2D,
+	GFX3D_MBX, GFX3D_DMAC,
+	EXBUS_ATA,
+	SPI0, SPI1,
+	SCIF089, SCIF1234, SCIF567,
+	ADC,
+	BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14,
+	BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27,
+	BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31,
+
+	/* interrupt groups */
+	TMU, DMAC, I2S, SRC, GFX3D, SPI, SCIF, BBDMAC,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(GPIO, 0x3e0),
+	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2_TICPI, 0x460),
+	INTC_VECT(TMU3, 0x480), INTC_VECT(TMU4, 0x4a0),
+	INTC_VECT(TMU5, 0x4c0), INTC_VECT(TMU5_TICPI, 0x4e0),
+	INTC_VECT(TMU6, 0x500), INTC_VECT(TMU7, 0x520),
+	INTC_VECT(TMU8, 0x540),
+	INTC_VECT(HAC, 0x580), INTC_VECT(IPI, 0x5c0),
+	INTC_VECT(SPDIF, 0x5e0),
+	INTC_VECT(HUDI, 0x600), INTC_VECT(I2C, 0x620),
+	INTC_VECT(DMAC0_DMINT0, 0x640), INTC_VECT(DMAC0_DMINT1, 0x660),
+	INTC_VECT(DMAC0_DMINT2, 0x680),
+	INTC_VECT(I2S0, 0x6a0), INTC_VECT(I2S1, 0x6c0),
+	INTC_VECT(I2S2, 0x6e0), INTC_VECT(I2S3, 0x700),
+	INTC_VECT(SRC_RX, 0x720), INTC_VECT(SRC_TX, 0x740),
+	INTC_VECT(SRC_SPDIF, 0x760),
+	INTC_VECT(DU, 0x780), INTC_VECT(VIDEO_IN, 0x7a0),
+	INTC_VECT(REMOTE, 0x7c0), INTC_VECT(YUV, 0x7e0),
+	INTC_VECT(USB, 0x840), INTC_VECT(ATAPI, 0x860),
+	INTC_VECT(CAN, 0x880), INTC_VECT(GPS, 0x8a0),
+	INTC_VECT(GFX2D, 0x8c0),
+	INTC_VECT(GFX3D_MBX, 0x900), INTC_VECT(GFX3D_DMAC, 0x920),
+	INTC_VECT(EXBUS_ATA, 0x940),
+	INTC_VECT(SPI0, 0x960), INTC_VECT(SPI1, 0x980),
+	INTC_VECT(SCIF089, 0x9a0), INTC_VECT(SCIF1234, 0x9c0),
+	INTC_VECT(SCIF1234, 0x9e0), INTC_VECT(SCIF1234, 0xa00),
+	INTC_VECT(SCIF1234, 0xa20), INTC_VECT(SCIF567, 0xa40),
+	INTC_VECT(SCIF567, 0xa60), INTC_VECT(SCIF567, 0xa80),
+	INTC_VECT(SCIF089, 0xaa0), INTC_VECT(SCIF089, 0xac0),
+	INTC_VECT(ADC, 0xb20),
+	INTC_VECT(BBDMAC_0_3, 0xba0), INTC_VECT(BBDMAC_0_3, 0xbc0),
+	INTC_VECT(BBDMAC_0_3, 0xbe0), INTC_VECT(BBDMAC_0_3, 0xc00),
+	INTC_VECT(BBDMAC_4_7, 0xc20), INTC_VECT(BBDMAC_4_7, 0xc40),
+	INTC_VECT(BBDMAC_4_7, 0xc60), INTC_VECT(BBDMAC_4_7, 0xc80),
+	INTC_VECT(BBDMAC_8_10, 0xca0), INTC_VECT(BBDMAC_8_10, 0xcc0),
+	INTC_VECT(BBDMAC_8_10, 0xce0), INTC_VECT(BBDMAC_11_14, 0xd00),
+	INTC_VECT(BBDMAC_11_14, 0xd20), INTC_VECT(BBDMAC_11_14, 0xd40),
+	INTC_VECT(BBDMAC_11_14, 0xd60), INTC_VECT(BBDMAC_15_18, 0xd80),
+	INTC_VECT(BBDMAC_15_18, 0xda0), INTC_VECT(BBDMAC_15_18, 0xdc0),
+	INTC_VECT(BBDMAC_15_18, 0xde0), INTC_VECT(BBDMAC_19_22, 0xe00),
+	INTC_VECT(BBDMAC_19_22, 0xe20), INTC_VECT(BBDMAC_19_22, 0xe40),
+	INTC_VECT(BBDMAC_19_22, 0xe60), INTC_VECT(BBDMAC_23_26, 0xe80),
+	INTC_VECT(BBDMAC_23_26, 0xea0), INTC_VECT(BBDMAC_23_26, 0xec0),
+	INTC_VECT(BBDMAC_23_26, 0xee0), INTC_VECT(BBDMAC_27, 0xf00),
+	INTC_VECT(BBDMAC_28, 0xf20), INTC_VECT(BBDMAC_29, 0xf40),
+	INTC_VECT(BBDMAC_30, 0xf60), INTC_VECT(BBDMAC_31, 0xf80),
+};
+
+static struct intc_group groups[] __initdata = {
+	INTC_GROUP(TMU, TMU0, TMU1, TMU2, TMU2_TICPI, TMU3, TMU4, TMU5,
+		   TMU5_TICPI, TMU6, TMU7, TMU8),
+	INTC_GROUP(DMAC, DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2),
+	INTC_GROUP(I2S, I2S0, I2S1, I2S2, I2S3),
+	INTC_GROUP(SRC, SRC_RX, SRC_TX, SRC_SPDIF),
+	INTC_GROUP(GFX3D, GFX3D_MBX, GFX3D_DMAC),
+	INTC_GROUP(SPI, SPI0, SPI1),
+	INTC_GROUP(SCIF, SCIF089, SCIF1234, SCIF567),
+	INTC_GROUP(BBDMAC,
+		   BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14,
+		   BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27,
+		   BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31),
+};
+
+static struct intc_mask_reg mask_registers[] __initdata = {
+	{ 0xffe00040, 0xffe00044, 32, /* INT2MSKR / INT2MSKCR */
+	  { 0, BBDMAC, ADC, SCIF, SPI, EXBUS_ATA, GFX3D, GFX2D,
+	    GPS, CAN, ATAPI, USB, YUV, REMOTE, VIDEO_IN, DU, SRC, I2S,
+	    DMAC, I2C, HUDI, SPDIF, IPI, HAC, TMU, GPIO } },
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xffe00000, 0, 32, 8, /* INT2PRI0 */ { GPIO, TMU0, 0, HAC } },
+	{ 0xffe00004, 0, 32, 8, /* INT2PRI1 */ { IPI, SPDIF, HUDI, I2C } },
+	{ 0xffe00008, 0, 32, 8, /* INT2PRI2 */ { DMAC, I2S, SRC, DU } },
+	{ 0xffe0000c, 0, 32, 8, /* INT2PRI3 */ { VIDEO_IN, REMOTE, YUV, USB } },
+	{ 0xffe00010, 0, 32, 8, /* INT2PRI4 */ { ATAPI, CAN, GPS, GFX2D } },
+	{ 0xffe00014, 0, 32, 8, /* INT2PRI5 */ { 0, GFX3D, EXBUS_ATA, SPI } },
+	{ 0xffe00018, 0, 32, 8, /* INT2PRI6 */ { SCIF1234, SCIF567, SCIF089 } },
+	{ 0xffe0001c, 0, 32, 8, /* INT2PRI7 */ { ADC, 0, 0, BBDMAC_0_3 } },
+	{ 0xffe00020, 0, 32, 8, /* INT2PRI8 */
+	  { BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14, BBDMAC_15_18 } },
+	{ 0xffe00024, 0, 32, 8, /* INT2PRI9 */
+	  { BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27, BBDMAC_28 } },
+	{ 0xffe00028, 0, 32, 8, /* INT2PRI10 */
+	  { BBDMAC_29, BBDMAC_30, BBDMAC_31 } },
+	{ 0xffe0002c, 0, 32, 8, /* INT2PRI11 */
+	  { TMU1, TMU2, TMU2_TICPI, TMU3 } },
+	{ 0xffe00030, 0, 32, 8, /* INT2PRI12 */
+	  { TMU4, TMU5, TMU5_TICPI, TMU6 } },
+	{ 0xffe00034, 0, 32, 8, /* INT2PRI13 */
+	  { TMU7, TMU8 } },
+};
+
+static DECLARE_INTC_DESC(intc_desc, "sh7770", vectors, groups,
+			 mask_registers, prio_registers, NULL);
+
+/* Support for external interrupt pins in IRQ mode */
+static struct intc_vect irq_vectors[] __initdata = {
+	INTC_VECT(IRQ0, 0x240), INTC_VECT(IRQ1, 0x280),
+	INTC_VECT(IRQ2, 0x2c0), INTC_VECT(IRQ3, 0x300),
+	INTC_VECT(IRQ4, 0x340), INTC_VECT(IRQ5, 0x380),
+};
+
+static struct intc_mask_reg irq_mask_registers[] __initdata = {
+	{ 0xffd00044, 0xffd00064, 32, /* INTMSK0 / INTMSKCLR0 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, } },
+};
+
+static struct intc_prio_reg irq_prio_registers[] __initdata = {
+	{ 0xffd00010, 0, 32, 4, /* INTPRI */ { IRQ0, IRQ1, IRQ2, IRQ3,
+					       IRQ4, IRQ5, } },
+};
+
+static struct intc_sense_reg irq_sense_registers[] __initdata = {
+	{ 0xffd0001c, 32, 2, /* ICR1 */   { IRQ0, IRQ1, IRQ2, IRQ3,
+					    IRQ4, IRQ5, } },
+};
+
+static DECLARE_INTC_DESC(intc_irq_desc, "sh7770-irq", irq_vectors,
+			 NULL, irq_mask_registers, irq_prio_registers,
+			 irq_sense_registers);
+
+/* External interrupt pins in IRL mode */
+static struct intc_vect irl_vectors[] __initdata = {
+	INTC_VECT(IRL_LLLL, 0x200), INTC_VECT(IRL_LLLH, 0x220),
+	INTC_VECT(IRL_LLHL, 0x240), INTC_VECT(IRL_LLHH, 0x260),
+	INTC_VECT(IRL_LHLL, 0x280), INTC_VECT(IRL_LHLH, 0x2a0),
+	INTC_VECT(IRL_LHHL, 0x2c0), INTC_VECT(IRL_LHHH, 0x2e0),
+	INTC_VECT(IRL_HLLL, 0x300), INTC_VECT(IRL_HLLH, 0x320),
+	INTC_VECT(IRL_HLHL, 0x340), INTC_VECT(IRL_HLHH, 0x360),
+	INTC_VECT(IRL_HHLL, 0x380), INTC_VECT(IRL_HHLH, 0x3a0),
+	INTC_VECT(IRL_HHHL, 0x3c0),
+};
+
+static struct intc_mask_reg irl3210_mask_registers[] __initdata = {
+	{ 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */
+	  { IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	    IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	    IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	    IRL_HHLL, IRL_HHLH, IRL_HHHL, } },
+};
+
+static struct intc_mask_reg irl7654_mask_registers[] __initdata = {
+	{ 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */
+	  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	    IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	    IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	    IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	    IRL_HHLL, IRL_HHLH, IRL_HHHL, } },
+};
+
+static DECLARE_INTC_DESC(intc_irl7654_desc, "sh7780-irl7654", irl_vectors,
+			 NULL, irl7654_mask_registers, NULL, NULL);
+
+static DECLARE_INTC_DESC(intc_irl3210_desc, "sh7780-irl3210", irl_vectors,
+			 NULL, irl3210_mask_registers, NULL, NULL);
+
+#define INTC_ICR0	0xffd00000
+#define INTC_INTMSK0	0xffd00044
+#define INTC_INTMSK1	0xffd00048
+#define INTC_INTMSK2	0xffd40080
+#define INTC_INTMSKCLR1	0xffd00068
+#define INTC_INTMSKCLR2	0xffd40084
+
 void __init plat_irq_setup(void)
 {
+	/* disable IRQ7-0 */
+	ctrl_outl(0xff000000, INTC_INTMSK0);
+
+	/* disable IRL3-0 + IRL7-4 */
+	ctrl_outl(0xc0000000, INTC_INTMSK1);
+	ctrl_outl(0xfffefffe, INTC_INTMSK2);
+
+	/* select IRL mode for IRL3-0 + IRL7-4 */
+	ctrl_outl(ctrl_inl(INTC_ICR0) & ~0x00c00000, INTC_ICR0);
+
+	/* disable holding function, ie enable "SH-4 Mode" */
+	ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00200000, INTC_ICR0);
+
+	register_intc_controller(&intc_desc);
+}
+
+void __init plat_irq_setup_pins(int mode)
+{
+	switch (mode) {
+	case IRQ_MODE_IRQ:
+		/* select IRQ mode for IRL3-0 + IRL7-4 */
+		ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00c00000, INTC_ICR0);
+		register_intc_controller(&intc_irq_desc);
+		break;
+	case IRQ_MODE_IRL7654:
+		/* enable IRL7-4 but don't provide any masking */
+		ctrl_outl(0x40000000, INTC_INTMSKCLR1);
+		ctrl_outl(0x0000fffe, INTC_INTMSKCLR2);
+		break;
+	case IRQ_MODE_IRL3210:
+		/* enable IRL0-3 but don't provide any masking */
+		ctrl_outl(0x80000000, INTC_INTMSKCLR1);
+		ctrl_outl(0xfffe0000, INTC_INTMSKCLR2);
+		break;
+	case IRQ_MODE_IRL7654_MASK:
+		/* enable IRL7-4 and mask using cpu intc controller */
+		ctrl_outl(0x40000000, INTC_INTMSKCLR1);
+		register_intc_controller(&intc_irl7654_desc);
+		break;
+	case IRQ_MODE_IRL3210_MASK:
+		/* enable IRL0-3 and mask using cpu intc controller */
+		ctrl_outl(0x80000000, INTC_INTMSKCLR1);
+		register_intc_controller(&intc_irl3210_desc);
+		break;
+	default:
+		BUG();
+	}
 }
-- 
cgit v0.10.2


From 2fa3cdfb319055fd8b25abdafa413e16f00ad493 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 09:29:45 -0400
Subject: ext4: Merge ext4_da_get_block_write() into mpage_da_map_blocks()

The static function ext4_da_get_block_write() was only used by
mpage_da_map_blocks().  So to simplify the code, merge that function
into mpage_da_map_blocks().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e6113c3..bfe50a2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2006,57 +2006,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * This function is used by mpage_da_map_blocks().  We separate it out
- * as a separate function just to make life easier, and because
- * mpage_da_map_blocks() used to be a generic function that took a
- * get_block_t.
- */
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result)
-{
-	int ret;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	loff_t disksize = EXT4_I(inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
-			      bh_result, EXT4_GET_BLOCKS_CREATE|
-			      EXT4_GET_BLOCKS_DELALLOC_RESERVE);
-	if (ret <= 0)
-		return ret;
-
-	bh_result->b_size = (ret << inode->i_blkbits);
-
-	if (ext4_should_order_data(inode)) {
-		int retval;
-		retval = ext4_jbd2_file_inode(handle, inode);
-		if (retval)
-			/*
-			 * Failed to add inode for ordered mode. Don't
-			 * update file size
-			 */
-			return retval;
-	}
-
-	/*
-	 * Update on-disk size along with block allocation we don't
-	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
-	 * within already allocated block -bzzz
-	 */
-	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-	if (disksize > i_size_read(inode))
-		disksize = i_size_read(inode);
-	if (disksize > EXT4_I(inode)->i_disksize) {
-		ext4_update_i_disksize(inode, disksize);
-		ret = ext4_mark_inode_dirty(handle, inode);
-		return ret;
-	}
-	return 0;
-}
-
-/*
  * mpage_da_map_blocks - go through given space
  *
  * @mpd - bh describing space
@@ -2066,9 +2015,12 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
  */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-	int err = 0;
+	int err, blks;
 	struct buffer_head new;
-	sector_t next;
+	sector_t next = mpd->b_blocknr;
+	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+	handle_t *handle = NULL;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
@@ -2077,6 +2029,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		!(mpd->b_state & (1 << BH_Delay)) &&
 		!(mpd->b_state & (1 << BH_Unwritten)))
 		return 0;
+
+	/*
+	 * If we didn't accumulate anything to write simply return
+	 */
+	if (!mpd->b_size)
+		return 0;
+
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
 	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
@@ -2092,18 +2054,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
 	 */
 	new.b_state = mpd->b_state & (1 << BH_Delay);
-	new.b_blocknr = 0;
-	new.b_size = mpd->b_size;
-	next = mpd->b_blocknr;
-	/*
-	 * If we didn't accumulate anything
-	 * to write simply return
-	 */
-	if (!new.b_size)
-		return 0;
-
-	err = ext4_da_get_block_write(mpd->inode, next, &new);
-	if (err) {
+	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+			       &new, EXT4_GET_BLOCKS_CREATE|
+			       EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+	if (blks < 0) {
+		err = blks;
 		/*
 		 * If get block returns with error we simply
 		 * return. Later writepage will redirty the page and
@@ -2136,12 +2091,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		if (err == -ENOSPC) {
 			ext4_print_free_blocks(mpd->inode);
 		}
-		/* invlaidate all the pages */
+		/* invalidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
 				mpd->b_size >> mpd->inode->i_blkbits);
 		return err;
 	}
-	BUG_ON(new.b_size == 0);
+	BUG_ON(blks == 0);
+
+	new.b_size = (blks << mpd->inode->i_blkbits);
 
 	if (buffer_new(&new))
 		__unmap_underlying_blocks(mpd->inode, &new);
@@ -2154,6 +2111,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
+	if (ext4_should_order_data(mpd->inode)) {
+		err = ext4_jbd2_file_inode(handle, mpd->inode);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * Update on-disk size along with block allocation we don't
+	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
+	 * within already allocated block -bzzz
+	 */
+	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+	if (disksize > i_size_read(mpd->inode))
+		disksize = i_size_read(mpd->inode);
+	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+		ext4_update_i_disksize(mpd->inode, disksize);
+		return ext4_mark_inode_dirty(handle, mpd->inode);
+	}
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 2ac3b6e00acb46406c993d57921f86a594aafe08 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 13:57:08 -0400
Subject: ext4: Clean up ext4_get_blocks() so it does not depend on
 bh_result->b_state

The ext4_get_blocks() function was depending on the value of
bh_result->b_state as an input parameter to decide whether or not
update the delalloc accounting statistics by calling
ext4_da_update_reserve_space().  We now use a separate flag,
EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE, to requests this update, so that
all callers of ext4_get_blocks() can clear map_bh.b_state before
calling ext4_get_blocks() without worrying about any consistency
issues.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 17feb4a..d164f12 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -318,16 +318,21 @@ struct ext4_new_group_data {
  */
 	/* Allocate any needed blocks and/or convert an unitialized
 	   extent to be an initialized ext4 */
-#define EXT4_GET_BLOCKS_CREATE			1
+#define EXT4_GET_BLOCKS_CREATE			0x0001
 	/* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT		2
+#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
 	/* Update the ext4_inode_info i_disksize field */
-#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		4
+#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		0x0004
 	/* Caller is from the delayed allocation writeout path,
-	   so the filesystem blocks have already been accounted for */
-#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	8
+	   so set the magic i_delalloc_reserve_flag after taking the 
+	   inode allocation semaphore for */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0008
+	/* Call ext4_da_update_reserve_space() after successfully 
+	   allocating the blocks */
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0010
+
 
 /*
  * ioctl commands
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bfe50a2..d7b7480 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1234,16 +1234,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		}
 	}
 
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
-		/*
-		 * Update reserved blocks/metadata blocks
-		 * after successful block allocation
-		 * which were deferred till now
-		 */
-		if ((retval > 0) && buffer_delay(bh))
-			ext4_da_update_reserve_space(inode, retval);
-	}
+
+	/*
+	 * Update reserved blocks/metadata blocks after successful
+	 * block allocation which had been deferred till now.
+	 */
+	if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+		ext4_da_update_reserve_space(inode, retval);
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
@@ -2015,7 +2014,7 @@ static void ext4_print_free_blocks(struct inode *inode)
  */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-	int err, blks;
+	int err, blks, get_blocks_flags;
 	struct buffer_head new;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
@@ -2040,23 +2039,30 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	BUG_ON(!handle);
 
 	/*
-	 * We need to make sure the BH_Delay flag is passed down to
-	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
-	 * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.  This flag
-	 * causes ext4_get_blocks() to call
-	 * ext4_da_update_reserve_space() if the passed buffer head
-	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks(), we should pass in a
-	 * separate flag which requests that the delayed allocation
-	 * statistics should be updated, instead of depending on the
-	 * state information getting passed down via the map_bh's
-	 * state bitmasks plus the magic
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
+	 * Call ext4_get_blocks() to allocate any delayed allocation
+	 * blocks, or to convert an uninitialized extent to be
+	 * initialized (in the case where we have written into
+	 * one or more preallocated blocks).
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+	 * indicate that we are on the delayed allocation path.  This
+	 * affects functions in many different parts of the allocation
+	 * call path.  This flag exists primarily because we don't
+	 * want to change *many* call functions, so ext4_get_blocks()
+	 * will set the magic i_delalloc_reserved_flag once the
+	 * inode's allocation semaphore is taken.
+	 *
+	 * If the blocks in questions were delalloc blocks, set
+	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
+	 * variables are updated after the blocks have been allocated.
 	 */
-	new.b_state = mpd->b_state & (1 << BH_Delay);
+	new.b_state = 0;
+	get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+			    EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+	if (mpd->b_state & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
 	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-			       &new, EXT4_GET_BLOCKS_CREATE|
-			       EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+			       &new, get_blocks_flags);
 	if (blks < 0) {
 		err = blks;
 		/*
-- 
cgit v0.10.2


From f850a7c040d9faafb41bceb0a05d6bb7432c8c7a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 12 May 2009 15:13:55 -0400
Subject: IMA: remove read permissions on the ima policy file

The IMA policy file does not implement read.  Trying to just open/read/close
the file will load a blank policy and you cannot then change the policy
without a reboot.  This removes the read permission from the file so one must
at least be attempting to write...

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index ffbe259..3305a96 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -15,6 +15,7 @@
  *	implemenents security file system for reporting
  *	current measurement list and IMA statistics
  */
+#include <linux/fcntl.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/rculist.h>
@@ -283,6 +284,9 @@ static atomic_t policy_opencount = ATOMIC_INIT(1);
  */
 int ima_open_policy(struct inode * inode, struct file * filp)
 {
+	/* No point in being allowed to open it if you aren't going to write */
+	if (!(filp->f_flags & O_WRONLY))
+		return -EACCES;
 	if (atomic_dec_and_test(&policy_opencount))
 		return 0;
 	return -EBUSY;
@@ -349,7 +353,7 @@ int ima_fs_init(void)
 		goto out;
 
 	ima_policy = securityfs_create_file("policy",
-					    S_IRUSR | S_IRGRP | S_IWUSR,
+					    S_IWUSR,
 					    ima_dir, NULL,
 					    &ima_measure_policy_ops);
 	if (IS_ERR(ima_policy))
-- 
cgit v0.10.2


From c3d20103d08e5c0b6738fbd0acf3ca004e5356c5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 12 May 2009 15:14:23 -0400
Subject: IMA: do not measure everything opened by root by default

The IMA default policy measures every single file opened by root.  This is
terrible for most users.  Consider a system (like mine) with virtual machine
images.  When those images are touched (which happens at boot for me) those
images are measured.  This is just way too much for the default case.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index b168c1d..dec6dcb 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -61,8 +61,6 @@ static struct ima_measure_rule_entry default_rules[] = {
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
-	{.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0,
-	 .flags = IMA_FUNC | IMA_MASK | IMA_UID}
 };
 
 static LIST_HEAD(measure_default_rules);
-- 
cgit v0.10.2


From 4e5b6d006b61b6b998e15da850d959520f811b4c Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Fri, 8 May 2009 14:02:32 -0700
Subject: wimax/i2400m: fix device crash: fix optimization in
 _roq_queue_update_ws

When the i2400m receives data and the device indicates there has to be
reordering, we keep an sliding window implementation to sort the
packets before sending them to the network stack.

One of the "operations" that the device indicates is "queue a packet
and update the window start". When the queue is empty, this is
equivalent to "deliver the packet and update the window start".

That case was optimized in i2400m_roq_queue_update_ws() so that we
would not pointlessly queue and dequeue a packet. However, when the
optimization was active, it wasn't updating the window start. That
caused the reorder management code to get confused later on with what
seemed to be wrong reorder requests from the device.

Thus the fix implemented is to do the right thing and update the
window start in both cases, when the queue is empty (and the
optimization is done) and when not.

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>

diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c
index 02419bf..f9fc389 100644
--- a/drivers/net/wimax/i2400m/rx.c
+++ b/drivers/net/wimax/i2400m/rx.c
@@ -819,10 +819,9 @@ void i2400m_roq_queue_update_ws(struct i2400m *i2400m, struct i2400m_roq *roq,
 			roq_data = (struct i2400m_roq_data *) &skb->cb;
 			i2400m_net_erx(i2400m, skb, roq_data->cs);
 		}
-		else {
+		else
 			__i2400m_roq_queue(i2400m, roq, skb, sn, nsn);
-			__i2400m_roq_update_ws(i2400m, roq, sn + 1);
-		}
+		__i2400m_roq_update_ws(i2400m, roq, sn + 1);
 		i2400m_roq_log_add(i2400m, roq, I2400M_RO_TYPE_PACKET_WS,
 				   old_ws, len, sn, nsn, roq->ws);
 	}
-- 
cgit v0.10.2


From bec36eca6f5d1d83a9c3733fc40ba173ad849df2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 15 May 2009 12:03:04 +0900
Subject: sh: hd64461: Fix up I/O base register offsets.

hd64461 is mapped in a fixed location, so the I/O base itself is fairly
meaningless as a configuration item. Additionally, this makes it
impossible to share hd64461 code alongside generic drivers (in the case
of sh_dac_audio), so simply make it commonly defined and permit the
mach_is_foo() logic to work out the proper semantics.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/cchips/Kconfig b/arch/sh/cchips/Kconfig
index f43d183..a5ab2ec 100644
--- a/arch/sh/cchips/Kconfig
+++ b/arch/sh/cchips/Kconfig
@@ -34,11 +34,6 @@ config HD64461_IRQ
 
 	  Do not change this unless you know what you are doing.
 
-config HD64461_IOBASE
-	hex "HD64461 start address"
-	depends on HD64461
-	default "0xb0000000"
-
 config HD64461_ENABLER
 	bool "HD64461 PCMCIA enabler"
 	depends on HD64461
diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h
index 52b4b62..c16d545 100644
--- a/arch/sh/include/asm/hd64461.h
+++ b/arch/sh/include/asm/hd64461.h
@@ -13,13 +13,15 @@
 #define	HD64461_PCC_WINDOW	0x01000000
 
 /* Area 6 - Slot 0 - memory and/or IO card */
-#define	HD64461_PCC0_BASE	(CONFIG_HD64461_IOBASE + 0x8000000)
+#define HD64461_IOBASE		0xb0000000
+#define HD64461_IO_OFFSET(x)	(HD64461_IOBASE + (x))
+#define	HD64461_PCC0_BASE	HD64461_IO_OFFSET(0x8000000)
 #define	HD64461_PCC0_ATTR	(HD64461_PCC0_BASE)				/* 0xb80000000 */
 #define	HD64461_PCC0_COMM	(HD64461_PCC0_BASE+HD64461_PCC_WINDOW)		/* 0xb90000000 */
 #define	HD64461_PCC0_IO		(HD64461_PCC0_BASE+2*HD64461_PCC_WINDOW)	/* 0xba0000000 */
 
 /* Area 5 - Slot 1 - memory card only */
-#define	HD64461_PCC1_BASE	(CONFIG_HD64461_IOBASE + 0x4000000)
+#define	HD64461_PCC1_BASE	HD64461_IO_OFFSET(0x4000000)
 #define	HD64461_PCC1_ATTR	(HD64461_PCC1_BASE)				/* 0xb4000000 */
 #define	HD64461_PCC1_COMM	(HD64461_PCC1_BASE+HD64461_PCC_WINDOW)		/* 0xb5000000 */
 
@@ -41,19 +43,19 @@
 #define	HD64461_STBCR_SURTST		0x0001
 
 /* System Configuration Register */
-#define	HD64461_SYSCR		(CONFIG_HD64461_IOBASE + 0x02)
+#define	HD64461_SYSCR		HD64461_IO_OFFSET(0x02)
 
 /* CPU Data Bus Control Register */
-#define	HD64461_SCPUCR		(CONFIG_HD64461_IOBASE + 0x04)
+#define	HD64461_SCPUCR		HD64461_IO_OFFSET(0x04)
 
 /* Base Address Register */
-#define	HD64461_LCDCBAR		(CONFIG_HD64461_IOBASE + 0x1000)
+#define	HD64461_LCDCBAR		HD64461_IO_OFFSET(0x1000)
 
 /* Line increment address */
-#define	HD64461_LCDCLOR		(CONFIG_HD64461_IOBASE + 0x1002)
+#define	HD64461_LCDCLOR		HD64461_IO_OFFSET(0x1002)
 
 /* Controls LCD controller */
-#define	HD64461_LCDCCR		(CONFIG_HD64461_IOBASE + 0x1004)
+#define	HD64461_LCDCCR		HD64461_IO_OFFSET(0x1004)
 
 /* LCCDR control bits */
 #define	HD64461_LCDCCR_STBACK	0x0400	/* Standby Back */
@@ -64,30 +66,30 @@
 #define	HD64461_LCDCCR_SPON	0x0010	/* Start Power On */
 
 /* Controls LCD (1) */
-#define	HD64461_LDR1		(CONFIG_HD64461_IOBASE + 0x1010)
+#define	HD64461_LDR1		HD64461_IO_OFFSET(0x1010)
 #define	HD64461_LDR1_DON	0x01	/* Display On */
 #define	HD64461_LDR1_DINV	0x80	/* Display Invert */
 
 /* Controls LCD (2) */
-#define	HD64461_LDR2		(CONFIG_HD64461_IOBASE + 0x1012)
-#define	HD64461_LDHNCR		(CONFIG_HD64461_IOBASE + 0x1014)	/* Number of horizontal characters */
-#define	HD64461_LDHNSR		(CONFIG_HD64461_IOBASE + 0x1016)	/* Specify output start position + width of CL1 */
-#define	HD64461_LDVNTR		(CONFIG_HD64461_IOBASE + 0x1018)	/* Specify total vertical lines */
-#define	HD64461_LDVNDR		(CONFIG_HD64461_IOBASE + 0x101a)	/* specify number of display vertical lines */
-#define	HD64461_LDVSPR		(CONFIG_HD64461_IOBASE + 0x101c)	/* specify vertical synchronization pos and AC nr */
+#define	HD64461_LDR2		HD64461_IO_OFFSET(0x1012)
+#define	HD64461_LDHNCR		HD64461_IO_OFFSET(0x1014)	/* Number of horizontal characters */
+#define	HD64461_LDHNSR		HD64461_IO_OFFSET(0x1016)	/* Specify output start position + width of CL1 */
+#define	HD64461_LDVNTR		HD64461_IO_OFFSET(0x1018)	/* Specify total vertical lines */
+#define	HD64461_LDVNDR		HD64461_IO_OFFSET(0x101a)	/* specify number of display vertical lines */
+#define	HD64461_LDVSPR		HD64461_IO_OFFSET(0x101c)	/* specify vertical synchronization pos and AC nr */
 
 /* Controls LCD (3) */
-#define	HD64461_LDR3		(CONFIG_HD64461_IOBASE + 0x101e)
+#define	HD64461_LDR3		HD64461_IO_OFFSET(0x101e)
 
 /* Palette Registers */
-#define	HD64461_CPTWAR		(CONFIG_HD64461_IOBASE + 0x1030)	/* Color Palette Write Address Register */
-#define	HD64461_CPTWDR		(CONFIG_HD64461_IOBASE + 0x1032)	/* Color Palette Write Data Register */
-#define	HD64461_CPTRAR		(CONFIG_HD64461_IOBASE + 0x1034)	/* Color Palette Read Address Register */
-#define	HD64461_CPTRDR		(CONFIG_HD64461_IOBASE + 0x1036)	/* Color Palette Read Data Register */
+#define	HD64461_CPTWAR		HD64461_IO_OFFSET(0x1030)	/* Color Palette Write Address Register */
+#define	HD64461_CPTWDR		HD64461_IO_OFFSET(0x1032)	/* Color Palette Write Data Register */
+#define	HD64461_CPTRAR		HD64461_IO_OFFSET(0x1034)	/* Color Palette Read Address Register */
+#define	HD64461_CPTRDR		HD64461_IO_OFFSET(0x1036)	/* Color Palette Read Data Register */
 
-#define	HD64461_GRDOR		(CONFIG_HD64461_IOBASE + 0x1040)	/* Display Resolution Offset Register */
-#define	HD64461_GRSCR		(CONFIG_HD64461_IOBASE + 0x1042)	/* Solid Color Register */
-#define	HD64461_GRCFGR		(CONFIG_HD64461_IOBASE + 0x1044)	/* Accelerator Configuration Register */
+#define	HD64461_GRDOR		HD64461_IO_OFFSET(0x1040)	/* Display Resolution Offset Register */
+#define	HD64461_GRSCR		HD64461_IO_OFFSET(0x1042)	/* Solid Color Register */
+#define	HD64461_GRCFGR		HD64461_IO_OFFSET(0x1044)	/* Accelerator Configuration Register */
 
 #define	HD64461_GRCFGR_ACCSTATUS	0x10	/* Accelerator Status */
 #define	HD64461_GRCFGR_ACCRESET		0x08	/* Accelerator Reset */
@@ -97,41 +99,41 @@
 #define	HD64461_GRCFGR_COLORDEPTH8	0x01	/* Sets Colordepth 8 for Accelerator */
 
 /* Line Drawing Registers */
-#define	HD64461_LNSARH		(CONFIG_HD64461_IOBASE + 0x1046)	/* Line Start Address Register (H) */
-#define	HD64461_LNSARL		(CONFIG_HD64461_IOBASE + 0x1048)	/* Line Start Address Register (L) */
-#define	HD64461_LNAXLR		(CONFIG_HD64461_IOBASE + 0x104a)	/* Axis Pixel Length Register */
-#define	HD64461_LNDGR		(CONFIG_HD64461_IOBASE + 0x104c)	/* Diagonal Register */
-#define	HD64461_LNAXR		(CONFIG_HD64461_IOBASE + 0x104e)	/* Axial Register */
-#define	HD64461_LNERTR		(CONFIG_HD64461_IOBASE + 0x1050)	/* Start Error Term Register */
-#define	HD64461_LNMDR		(CONFIG_HD64461_IOBASE + 0x1052)	/* Line Mode Register */
+#define	HD64461_LNSARH		HD64461_IO_OFFSET(0x1046)	/* Line Start Address Register (H) */
+#define	HD64461_LNSARL		HD64461_IO_OFFSET(0x1048)	/* Line Start Address Register (L) */
+#define	HD64461_LNAXLR		HD64461_IO_OFFSET(0x104a)	/* Axis Pixel Length Register */
+#define	HD64461_LNDGR		HD64461_IO_OFFSET(0x104c)	/* Diagonal Register */
+#define	HD64461_LNAXR		HD64461_IO_OFFSET(0x104e)	/* Axial Register */
+#define	HD64461_LNERTR		HD64461_IO_OFFSET(0x1050)	/* Start Error Term Register */
+#define	HD64461_LNMDR		HD64461_IO_OFFSET(0x1052)	/* Line Mode Register */
 
 /* BitBLT Registers */
-#define	HD64461_BBTSSARH	(CONFIG_HD64461_IOBASE + 0x1054)	/* Source Start Address Register (H) */
-#define	HD64461_BBTSSARL	(CONFIG_HD64461_IOBASE + 0x1056)	/* Source Start Address Register (L) */
-#define	HD64461_BBTDSARH	(CONFIG_HD64461_IOBASE + 0x1058)	/* Destination Start Address Register (H) */
-#define	HD64461_BBTDSARL	(CONFIG_HD64461_IOBASE + 0x105a)	/* Destination Start Address Register (L) */
-#define	HD64461_BBTDWR		(CONFIG_HD64461_IOBASE + 0x105c)	/* Destination Block Width Register */
-#define	HD64461_BBTDHR		(CONFIG_HD64461_IOBASE + 0x105e)	/* Destination Block Height Register */
-#define	HD64461_BBTPARH		(CONFIG_HD64461_IOBASE + 0x1060)	/* Pattern Start Address Register (H) */
-#define	HD64461_BBTPARL		(CONFIG_HD64461_IOBASE + 0x1062)	/* Pattern Start Address Register (L) */
-#define	HD64461_BBTMARH		(CONFIG_HD64461_IOBASE + 0x1064)	/* Mask Start Address Register (H) */
-#define	HD64461_BBTMARL		(CONFIG_HD64461_IOBASE + 0x1066)	/* Mask Start Address Register (L) */
-#define	HD64461_BBTROPR		(CONFIG_HD64461_IOBASE + 0x1068)	/* ROP Register */
-#define	HD64461_BBTMDR		(CONFIG_HD64461_IOBASE + 0x106a)	/* BitBLT Mode Register */
+#define	HD64461_BBTSSARH	HD64461_IO_OFFSET(0x1054)	/* Source Start Address Register (H) */
+#define	HD64461_BBTSSARL	HD64461_IO_OFFSET(0x1056)	/* Source Start Address Register (L) */
+#define	HD64461_BBTDSARH	HD64461_IO_OFFSET(0x1058)	/* Destination Start Address Register (H) */
+#define	HD64461_BBTDSARL	HD64461_IO_OFFSET(0x105a)	/* Destination Start Address Register (L) */
+#define	HD64461_BBTDWR		HD64461_IO_OFFSET(0x105c)	/* Destination Block Width Register */
+#define	HD64461_BBTDHR		HD64461_IO_OFFSET(0x105e)	/* Destination Block Height Register */
+#define	HD64461_BBTPARH		HD64461_IO_OFFSET(0x1060)	/* Pattern Start Address Register (H) */
+#define	HD64461_BBTPARL		HD64461_IO_OFFSET(0x1062)	/* Pattern Start Address Register (L) */
+#define	HD64461_BBTMARH		HD64461_IO_OFFSET(0x1064)	/* Mask Start Address Register (H) */
+#define	HD64461_BBTMARL		HD64461_IO_OFFSET(0x1066)	/* Mask Start Address Register (L) */
+#define	HD64461_BBTROPR		HD64461_IO_OFFSET(0x1068)	/* ROP Register */
+#define	HD64461_BBTMDR		HD64461_IO_OFFSET(0x106a)	/* BitBLT Mode Register */
 
 /* PC Card Controller Registers */
 /* Maps to Physical Area 6 */
-#define	HD64461_PCC0ISR		(CONFIG_HD64461_IOBASE + 0x2000)	/* socket 0 interface status */
-#define	HD64461_PCC0GCR		(CONFIG_HD64461_IOBASE + 0x2002)	/* socket 0 general control */
-#define	HD64461_PCC0CSCR	(CONFIG_HD64461_IOBASE + 0x2004)	/* socket 0 card status change */
-#define	HD64461_PCC0CSCIER	(CONFIG_HD64461_IOBASE + 0x2006)	/* socket 0 card status change interrupt enable */
-#define	HD64461_PCC0SCR		(CONFIG_HD64461_IOBASE + 0x2008)	/* socket 0 software control */
+#define	HD64461_PCC0ISR		HD64461_IO_OFFSET(0x2000)	/* socket 0 interface status */
+#define	HD64461_PCC0GCR		HD64461_IO_OFFSET(0x2002)	/* socket 0 general control */
+#define	HD64461_PCC0CSCR	HD64461_IO_OFFSET(0x2004)	/* socket 0 card status change */
+#define	HD64461_PCC0CSCIER	HD64461_IO_OFFSET(0x2006)	/* socket 0 card status change interrupt enable */
+#define	HD64461_PCC0SCR		HD64461_IO_OFFSET(0x2008)	/* socket 0 software control */
 /* Maps to Physical Area 5 */
-#define	HD64461_PCC1ISR		(CONFIG_HD64461_IOBASE + 0x2010)	/* socket 1 interface status */
-#define	HD64461_PCC1GCR		(CONFIG_HD64461_IOBASE + 0x2012)	/* socket 1 general control */
-#define	HD64461_PCC1CSCR	(CONFIG_HD64461_IOBASE + 0x2014)	/* socket 1 card status change */
-#define	HD64461_PCC1CSCIER	(CONFIG_HD64461_IOBASE + 0x2016)	/* socket 1 card status change interrupt enable */
-#define	HD64461_PCC1SCR		(CONFIG_HD64461_IOBASE + 0x2018)	/* socket 1 software control */
+#define	HD64461_PCC1ISR		HD64461_IO_OFFSET(0x2010)	/* socket 1 interface status */
+#define	HD64461_PCC1GCR		HD64461_IO_OFFSET(0x2012)	/* socket 1 general control */
+#define	HD64461_PCC1CSCR	HD64461_IO_OFFSET(0x2014)	/* socket 1 card status change */
+#define	HD64461_PCC1CSCIER	HD64461_IO_OFFSET(0x2016)	/* socket 1 card status change interrupt enable */
+#define	HD64461_PCC1SCR		HD64461_IO_OFFSET(0x2018)	/* socket 1 software control */
 
 /* PCC Interface Status Register */
 #define	HD64461_PCCISR_READY		0x80	/* card ready */
@@ -189,41 +191,41 @@
 #define	HD64461_PCCSCR_SWP		0x01	/* write protect */
 
 /* PCC0 Output Pins Control Register */
-#define	HD64461_P0OCR		(CONFIG_HD64461_IOBASE + 0x202a)
+#define	HD64461_P0OCR		HD64461_IO_OFFSET(0x202a)
 
 /* PCC1 Output Pins Control Register */
-#define	HD64461_P1OCR		(CONFIG_HD64461_IOBASE + 0x202c)
+#define	HD64461_P1OCR		HD64461_IO_OFFSET(0x202c)
 
 /* PC Card General Control Register */
-#define	HD64461_PGCR		(CONFIG_HD64461_IOBASE + 0x202e)
+#define	HD64461_PGCR		HD64461_IO_OFFSET(0x202e)
 
 /* Port Control Registers */
-#define	HD64461_GPACR		(CONFIG_HD64461_IOBASE + 0x4000)	/* Port A - Handles IRDA/TIMER */
-#define	HD64461_GPBCR		(CONFIG_HD64461_IOBASE + 0x4002)	/* Port B - Handles UART */
-#define	HD64461_GPCCR		(CONFIG_HD64461_IOBASE + 0x4004)	/* Port C - Handles PCMCIA 1 */
-#define	HD64461_GPDCR		(CONFIG_HD64461_IOBASE + 0x4006)	/* Port D - Handles PCMCIA 1 */
+#define	HD64461_GPACR		HD64461_IO_OFFSET(0x4000)	/* Port A - Handles IRDA/TIMER */
+#define	HD64461_GPBCR		HD64461_IO_OFFSET(0x4002)	/* Port B - Handles UART */
+#define	HD64461_GPCCR		HD64461_IO_OFFSET(0x4004)	/* Port C - Handles PCMCIA 1 */
+#define	HD64461_GPDCR		HD64461_IO_OFFSET(0x4006)	/* Port D - Handles PCMCIA 1 */
 
 /* Port Control Data Registers */
-#define	HD64461_GPADR		(CONFIG_HD64461_IOBASE + 0x4010)	/* A */
-#define	HD64461_GPBDR		(CONFIG_HD64461_IOBASE + 0x4012)	/* B */
-#define	HD64461_GPCDR		(CONFIG_HD64461_IOBASE + 0x4014)	/* C */
-#define	HD64461_GPDDR		(CONFIG_HD64461_IOBASE + 0x4016)	/* D */
+#define	HD64461_GPADR		HD64461_IO_OFFSET(0x4010)	/* A */
+#define	HD64461_GPBDR		HD64461_IO_OFFSET(0x4012)	/* B */
+#define	HD64461_GPCDR		HD64461_IO_OFFSET(0x4014)	/* C */
+#define	HD64461_GPDDR		HD64461_IO_OFFSET(0x4016)	/* D */
 
 /* Interrupt Control Registers */
-#define	HD64461_GPAICR		(CONFIG_HD64461_IOBASE + 0x4020)	/* A */
-#define	HD64461_GPBICR		(CONFIG_HD64461_IOBASE + 0x4022)	/* B */
-#define	HD64461_GPCICR		(CONFIG_HD64461_IOBASE + 0x4024)	/* C */
-#define	HD64461_GPDICR		(CONFIG_HD64461_IOBASE + 0x4026)	/* D */
+#define	HD64461_GPAICR		HD64461_IO_OFFSET(0x4020)	/* A */
+#define	HD64461_GPBICR		HD64461_IO_OFFSET(0x4022)	/* B */
+#define	HD64461_GPCICR		HD64461_IO_OFFSET(0x4024)	/* C */
+#define	HD64461_GPDICR		HD64461_IO_OFFSET(0x4026)	/* D */
 
 /* Interrupt Status Registers */
-#define	HD64461_GPAISR		(CONFIG_HD64461_IOBASE + 0x4040)	/* A */
-#define	HD64461_GPBISR		(CONFIG_HD64461_IOBASE + 0x4042)	/* B */
-#define	HD64461_GPCISR		(CONFIG_HD64461_IOBASE + 0x4044)	/* C */
-#define	HD64461_GPDISR		(CONFIG_HD64461_IOBASE + 0x4046)	/* D */
+#define	HD64461_GPAISR		HD64461_IO_OFFSET(0x4040)	/* A */
+#define	HD64461_GPBISR		HD64461_IO_OFFSET(0x4042)	/* B */
+#define	HD64461_GPCISR		HD64461_IO_OFFSET(0x4044)	/* C */
+#define	HD64461_GPDISR		HD64461_IO_OFFSET(0x4046)	/* D */
 
 /* Interrupt Request Register & Interrupt Mask Register */
-#define	HD64461_NIRR		(CONFIG_HD64461_IOBASE + 0x5000)
-#define	HD64461_NIMR		(CONFIG_HD64461_IOBASE + 0x5002)
+#define	HD64461_NIRR		HD64461_IO_OFFSET(0x5000)
+#define	HD64461_NIMR		HD64461_IO_OFFSET(0x5002)
 
 #define	HD64461_IRQBASE		OFFCHIP_IRQ_BASE
 #define	OFFCHIP_IRQ_BASE	64
-- 
cgit v0.10.2


From 639138a991aaf1a3f65cc66700d097edc5f602fe Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 15 May 2009 12:07:17 +0900
Subject: sound: oss: sh_dac_audio timer fixes.

This patch modifies sh_dac_audio in the following ways:

- use high resolution timer instead of TMU1
- fix cpu/dac.h include
- add future rewrite comment

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/sound/oss/Kconfig b/sound/oss/Kconfig
index 1ca7427..bcf2a06 100644
--- a/sound/oss/Kconfig
+++ b/sound/oss/Kconfig
@@ -561,7 +561,7 @@ endif	# SOUND_OSS
 
 config SOUND_SH_DAC_AUDIO
 	tristate "SuperH DAC audio support"
-	depends on CPU_SH3
+	depends on CPU_SH3 && HIGH_RES_TIMERS
 
 config SOUND_SH_DAC_AUDIO_CHANNEL
 	int "DAC channel"
diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c
index 78cfb66..b2ed875 100644
--- a/sound/oss/sh_dac_audio.c
+++ b/sound/oss/sh_dac_audio.c
@@ -18,47 +18,36 @@
 #include <linux/sound.h>
 #include <linux/soundcard.h>
 #include <linux/interrupt.h>
+#include <linux/hrtimer.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
 #include <asm/clock.h>
-#include <asm/cpu/dac.h>
-#include <asm/cpu/timer.h>
+#include <cpu/dac.h>
 #include <asm/machvec.h>
 #include <mach/hp6xx.h>
 #include <asm/hd64461.h>
 
 #define MODNAME "sh_dac_audio"
 
-#define TMU_TOCR_INIT	0x00
-
-#define TMU1_TCR_INIT	0x0020	/* Clock/4, rising edge; interrupt on */
-#define TMU1_TSTR_INIT  0x02	/* Bit to turn on TMU1 */
-
 #define BUFFER_SIZE 48000
 
 static int rate;
 static int empty;
 static char *data_buffer, *buffer_begin, *buffer_end;
 static int in_use, device_major;
+static struct hrtimer hrtimer;
+static ktime_t wakeups_per_second;
 
 static void dac_audio_start_timer(void)
 {
-	u8 tstr;
-
-	tstr = ctrl_inb(TMU_TSTR);
-	tstr |= TMU1_TSTR_INIT;
-	ctrl_outb(tstr, TMU_TSTR);
+	hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL);
 }
 
 static void dac_audio_stop_timer(void)
 {
-	u8 tstr;
-
-	tstr = ctrl_inb(TMU_TSTR);
-	tstr &= ~TMU1_TSTR_INIT;
-	ctrl_outb(tstr, TMU_TSTR);
+	hrtimer_cancel(&hrtimer);
 }
 
 static void dac_audio_reset(void)
@@ -77,38 +66,30 @@ static void dac_audio_sync(void)
 static void dac_audio_start(void)
 {
 	if (mach_is_hp6xx()) {
-		u16 v = inw(HD64461_GPADR);
+		u16 v = __raw_readw(HD64461_GPADR);
 		v &= ~HD64461_GPADR_SPEAKER;
-		outw(v, HD64461_GPADR);
+		__raw_writew(v, HD64461_GPADR);
 	}
 
 	sh_dac_enable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
-	ctrl_outw(TMU1_TCR_INIT, TMU1_TCR);
 }
 static void dac_audio_stop(void)
 {
 	dac_audio_stop_timer();
 
 	if (mach_is_hp6xx()) {
-		u16 v = inw(HD64461_GPADR);
+		u16 v = __raw_readw(HD64461_GPADR);
 		v |= HD64461_GPADR_SPEAKER;
-		outw(v, HD64461_GPADR);
+		__raw_writew(v, HD64461_GPADR);
 	}
 
- 	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
+	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 	sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 }
 
 static void dac_audio_set_rate(void)
 {
-	unsigned long interval;
- 	struct clk *clk;
-
- 	clk = clk_get(NULL, "module_clk");
- 	interval = (clk_get_rate(clk) / 4) / rate;
- 	clk_put(clk);
-	ctrl_outl(interval, TMU1_TCOR);
-	ctrl_outl(interval, TMU1_TCNT);
+	wakeups_per_second = ktime_set(0, 1000000000 / rate);
 }
 
 static int dac_audio_ioctl(struct inode *inode, struct file *file,
@@ -265,32 +246,26 @@ const struct file_operations dac_audio_fops = {
       .release =	dac_audio_release,
 };
 
-static irqreturn_t timer1_interrupt(int irq, void *dev)
+static enum hrtimer_restart sh_dac_audio_timer(struct hrtimer *handle)
 {
-	unsigned long timer_status;
-
-	timer_status = ctrl_inw(TMU1_TCR);
-	timer_status &= ~0x100;
-	ctrl_outw(timer_status, TMU1_TCR);
-
 	if (!empty) {
 		sh_dac_output(*buffer_begin, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 		buffer_begin++;
 
 		if (buffer_begin == data_buffer + BUFFER_SIZE)
 			buffer_begin = data_buffer;
-		if (buffer_begin == buffer_end) {
+		if (buffer_begin == buffer_end)
 			empty = 1;
-			dac_audio_stop_timer();
-		}
 	}
-	return IRQ_HANDLED;
+
+	if (!empty)
+		hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL);
+
+	return HRTIMER_NORESTART;
 }
 
 static int __init dac_audio_init(void)
 {
-	int retval;
-
 	if ((device_major = register_sound_dsp(&dac_audio_fops, -1)) < 0) {
 		printk(KERN_ERR "Cannot register dsp device");
 		return device_major;
@@ -306,21 +281,25 @@ static int __init dac_audio_init(void)
 	rate = 8000;
 	dac_audio_set_rate();
 
-	retval =
-	    request_irq(TIMER1_IRQ, timer1_interrupt, IRQF_DISABLED, MODNAME, 0);
-	if (retval < 0) {
-		printk(KERN_ERR "sh_dac_audio: IRQ %d request failed\n",
-		       TIMER1_IRQ);
-		return retval;
-	}
+	/* Today: High Resolution Timer driven DAC playback.
+	 * The timer callback gets called once per sample. Ouch.
+	 *
+	 * Future: A much better approach would be to use the
+	 * SH7720 CMT+DMAC+DAC hardware combination like this:
+	 * - Program sample rate using CMT0 or CMT1
+	 * - Program DMAC to use CMT for timing and output to DAC
+	 * - Play sound using DMAC, let CPU sleep.
+	 * - While at it, rewrite this driver to use ALSA.
+	 */
+
+	hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer.function = sh_dac_audio_timer;
 
 	return 0;
 }
 
 static void __exit dac_audio_exit(void)
 {
-	free_irq(TIMER1_IRQ, 0);
-
 	unregister_sound_dsp(device_major);
 	kfree((void *)data_buffer);
 }
-- 
cgit v0.10.2


From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 May 2009 23:19:09 -0400
Subject: x86/stacktrace: return 0 instead of -1 for stack ops

If we return -1 in the ops->stack for the stacktrace saving, we end up
breaking out of the loop if the stack we are tracing is in the exception
stack. This causes traces like:

          <idle>-0     [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request
          <idle>-0     [002] 34263.745826:
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0

By returning "0" instead, the irq stack is saved as well, and we see:

          <idle>-0     [003]   883.280992: raise_softirq_irqoff <-__hrtimer_star
t_range_ns
          <idle>-0     [003]   883.280992:
 <= hrtimer_start_range_ns
 <= tick_nohz_restart_sched_tick
 <= cpu_idle
 <= start_secondary
 <=
 <= 0
 <= 0

[ Impact: record stacks from interrupts ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2..4aaf7e4 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 
 static int save_stack_stack(void *data, char *name)
 {
-	return -1;
+	return 0;
 }
 
 static void save_stack_address(void *data, unsigned long addr, int reliable)
-- 
cgit v0.10.2


From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 May 2009 23:40:06 -0400
Subject: tracing: stop stack trace on first empty entry

The stack tracer stores eight entries in the ring buffer when an event
traces the stack. The output outputs all eight entries regardless of
how many entries were recorded.

This patch breaks out of the loop when a null entry is discovered.

[ Impact: only print the stack that is recorded ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8bd9a2c..489c0e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 	trace_assign_type(field, iter->ent);
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		if (!field->caller[i])
+			break;
 		if (i) {
 			if (!trace_seq_puts(s, " <= "))
 				goto partial;
-- 
cgit v0.10.2


From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:27 +0800
Subject: tracing/filters: add missing unlock in a failure path

[ Impact: fix deadlock in a rare case we fail to allocate memory ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 85ad6a8..22c2998 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_disable_preds(call);
 	replace_filter_string(call->filter, filter_string);
@@ -1101,7 +1102,7 @@ out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
@@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_free_subsystem_preds(system);
 	replace_filter_string(system->filter, filter_string);
@@ -1145,7 +1147,7 @@ out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
-- 
cgit v0.10.2


From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:56 +0800
Subject: tracing/filters: fix off-by-one bug

We should leave the last slot for the ending '\0'.

[ Impact: fix possible crash when the length of an operand is 128 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 22c2998..a7430b1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps)
 
 static inline int append_operand_char(struct filter_parse_state *ps, char c)
 {
-	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
 		return -EINVAL;
 
 	ps->operand.string[ps->operand.tail++] = c;
-- 
cgit v0.10.2


From 1a853e36871b533ccc3f3c5bdd5cd0d867043a00 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 May 2009 22:50:46 -0300
Subject: perf record: Allow specifying a pid to record

Allow specifying a pid instead of always fork+exec'ing a command.

Because the PERF_EVENT_COMM and PERF_EVENT_MMAP events happened before
we connected, we must synthesize them so that 'perf report' can get what
it needs.

[ Impact: add new command line option ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090515015046.GA13664@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 5f5e6df..efb8759 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -34,6 +34,9 @@
 
 #include "perf.h"
 
+#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
 static int			default_interval = 100000;
@@ -47,6 +50,7 @@ static char 			*output_name			= "output.perf";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
+static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			nmi				= 1;
 
@@ -180,6 +184,7 @@ static void display_help(void)
 	" -c CNT    --count=CNT          # event period to sample\n"
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	" -o file   --output=<file>      # output file\n"
+	" -p pid    --pid=<pid>		 # record events on existing pid\n"
 	" -r prio   --realtime=<prio>    # use RT prio\n"
 	" -s        --system             # system wide profiling\n"
 	);
@@ -199,13 +204,14 @@ static void process_options(int argc, const char *argv[])
 			{"event",	required_argument,	NULL, 'e'},
 			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{"output",	required_argument,	NULL, 'o'},
+			{"pid",		required_argument,	NULL, 'p'},
 			{"realtime",	required_argument,	NULL, 'r'},
 			{"system",	no_argument,		NULL, 's'},
 			{"inherit",	no_argument,		NULL, 'i'},
 			{"nmi",		no_argument,		NULL, 'n'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin",
+		int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -215,6 +221,7 @@ static void process_options(int argc, const char *argv[])
 		case 'e': error				= parse_events(optarg); break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'o': output_name			= strdup(optarg); break;
+		case 'p': target_pid			=   atoi(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': system_wide                   ^=             1; break;
 		case 'i': inherit			^=	       1; break;
@@ -223,7 +230,7 @@ static void process_options(int argc, const char *argv[])
 		}
 	}
 
-	if (argc - optind == 0)
+	if (argc - optind == 0 && target_pid == -1)
 		error = 1;
 
 	if (error)
@@ -350,15 +357,135 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 static int nr_poll;
 static int nr_cpu;
 
-static void open_counters(int cpu)
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+static pid_t pid_synthesize_comm_event(pid_t pid)
+{
+	char filename[PATH_MAX];
+	char bf[BUFSIZ];
+	struct comm_event comm_ev;
+	size_t size;
+	int fd;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	if (read(fd, bf, sizeof(bf)) < 0) {
+		fprintf(stderr, "couldn't read %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	close(fd);
+
+	pid_t spid, ppid;
+	char state;
+	char comm[18];
+
+	memset(&comm_ev, 0, sizeof(comm_ev));
+        int nr = sscanf(bf, "%d %s %c %d %d ",
+			&spid, comm, &state, &ppid, &comm_ev.pid);
+	if (nr != 5) {
+		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+			filename);
+		exit(EXIT_FAILURE);
+	}
+	comm_ev.header.type = PERF_EVENT_COMM;
+	comm_ev.tid = pid;
+	size = strlen(comm);
+	comm[--size] = '\0'; /* Remove the ')' at the end */
+	--size; /* Remove the '(' at the begin */
+	memcpy(comm_ev.comm, comm + 1, size);
+	size = ALIGN(size, sizeof(uint64_t));
+	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
+	int ret = write(output, &comm_ev, comm_ev.header.size);
+	if (ret < 0) {
+		perror("failed to write");
+		exit(-1);
+	}
+	return comm_ev.pid;
+}
+
+static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
+{
+	char filename[PATH_MAX];
+	FILE *fp;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	while (1) {
+		char bf[BUFSIZ];
+		unsigned char vm_read, vm_write, vm_exec, vm_mayshare;
+		struct mmap_event mmap_ev = {
+			.header.type = PERF_EVENT_MMAP,
+		};
+		unsigned long ino;
+		int major, minor;
+		size_t size;
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			break;
+
+		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+		sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu",
+			&mmap_ev.start, &mmap_ev.len,
+                        &vm_read, &vm_write, &vm_exec, &vm_mayshare,
+                        &mmap_ev.pgoff, &major, &minor, &ino);
+		if (vm_exec == 'x') {
+			char *execname = strrchr(bf, ' ');
+
+			if (execname == NULL || execname[1] != '/')
+				continue;
+
+			execname += 1;
+			size = strlen(execname);
+			execname[size - 1] = '\0'; /* Remove \n */
+			memcpy(mmap_ev.filename, execname, size);
+			size = ALIGN(size, sizeof(uint64_t));
+			mmap_ev.len -= mmap_ev.start;
+			mmap_ev.header.size = (sizeof(mmap_ev) -
+					       (sizeof(mmap_ev.filename) - size));
+			mmap_ev.pid = pgid;
+			mmap_ev.tid = pid;
+
+			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+		}
+	}
+
+	fclose(fp);
+}
+
+static void open_counters(int cpu, pid_t pid)
 {
 	struct perf_counter_hw_event hw_event;
 	int counter, group_fd;
 	int track = 1;
-	pid_t pid = -1;
 
-	if (cpu < 0)
-		pid = 0;
+	if (pid > 0) {
+		pid_t pgid = pid_synthesize_comm_event(pid);
+		pid_synthesize_mmap_events(pid, pgid);
+	}
 
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++) {
@@ -435,22 +562,24 @@ int cmd_record(int argc, const char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (!system_wide)
-		open_counters(-1);
-	else for (i = 0; i < nr_cpus; i++)
-		open_counters(i);
+	if (!system_wide) {
+		open_counters(-1, target_pid != -1 ? target_pid : 0);
+	} else for (i = 0; i < nr_cpus; i++)
+		open_counters(i, target_pid);
 
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
-	pid = fork();
-	if (pid < 0)
-		perror("failed to fork");
+	if (target_pid == -1) {
+		pid = fork();
+		if (pid < 0)
+			perror("failed to fork");
 
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
+		if (!pid) {
+			if (execvp(argv[0], argv)) {
+				perror(argv[0]);
+				exit(-1);
+			}
 		}
 	}
 
-- 
cgit v0.10.2


From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 09:45:19 +0200
Subject: perf_counter: x86: More accurate counter update

Take the counter width into account instead of assuming 32 bits.

In particular Nehalem has 44 bit wide counters, and all
arithmetics should happen on a 44-bit signed integer basis.

[ Impact: fix rare event imprecision, warning message on Nehalem ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7772ff..3a92a2b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -138,7 +138,9 @@ static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
-	u64 prev_raw_count, new_raw_count, delta;
+	int shift = 64 - x86_pmu.counter_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
 
 	/*
 	 * Careful: an NMI might modify the previous counter value.
@@ -161,9 +163,10 @@ again:
 	 * (counter-)time and add that to the generic counter.
 	 *
 	 * Careful, not all hw sign-extends above the physical width
-	 * of the count, so we do that by clipping the delta to 32 bits:
+	 * of the count.
 	 */
-	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
-- 
cgit v0.10.2


From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 May 2009 12:54:01 +0200
Subject: perf_counter: x86: Fix throttling

If counters are disabled globally when a perfcounter IRQ/NMI hits,
and if we throttle in that case, we'll promote the '0' value to
the next lapic IRQ and disable all perfcounters at that point,
permanently ...

Fix it.

[ Impact: fix hung perfcounters under load ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3a92a2b..88ae8ce 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -765,8 +765,13 @@ out:
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		intel_pmu_restore_all(cpuc->throttle_ctrl);
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
+			intel_pmu_restore_all(cpuc->throttle_ctrl);
+		} else {
+			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
+		}
+	}
 
 	return ret;
 }
@@ -817,11 +822,16 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
+		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
+
+		/*
+		 * Clear them before re-enabling irqs/NMIs again:
+		 */
+		cpuc->interrupts = 0;
 		hw_perf_restore(cpuc->throttle_ctrl);
+	} else {
+		cpuc->interrupts = 0;
 	}
-	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
cgit v0.10.2


From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 10:02:57 +0200
Subject: perf_counter: x86: Allow unpriviliged use of NMIs

Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter
creation instead of silently down-grading to regular interrupts.

[ Impact: allow wider perf-counter usage ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 88ae8ce..c19e927 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * If privileged enough, allow NMI events:
 	 */
 	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+	if (hw_event->nmi) {
+		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 		hwc->nmi = 1;
+	}
 
 	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-- 
cgit v0.10.2


From 53020fe81eecd0b7be295868ce5850ef8f41074e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 21:26:19 +0200
Subject: perf_counter: Fix perf_output_copy() WARN to account for overflow

The simple reservation test in perf_output_copy() failed to take
unsigned int overflow into account, fix this.

[ Impact: fix false positive warning with more than 4GB of profiling data ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff166c1..985be0b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1927,7 +1927,11 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 	handle->offset = offset;
 
-	WARN_ON_ONCE(handle->offset > handle->head);
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
 }
 
 #define perf_output_put(handle, x) \
-- 
cgit v0.10.2


From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 13:21:36 +0200
Subject: perf_counter: x86: Fix up the amd NMI/INT throttle

perf_counter_unthrottle() restores throttle_ctrl, buts its never set.
Also, we fail to disable all counters when throttling.

[ Impact: fix rare stuck perf-counters when they are throttled ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c19e927..7601c01 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
+	if (!enabled)
+		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void)
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
+out:
 	return enabled;
 }
 
@@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int handled = 0;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx;
+	int idx, throttle = 0;
+
+	cpuc->throttle_ctrl = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
+
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			throttle = 1;
+	}
 
-	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		int disable = 0;
+
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
+
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			continue;
+			goto next;
+
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		if (perf_counter_overflow(counter, nmi, regs, 0))
-			amd_pmu_disable_counter(hwc, idx);
-		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			/*
-			 * do not reenable when throttled, but reload
-			 * the register
-			 */
+		disable = perf_counter_overflow(counter, nmi, regs, 0);
+
+next:
+		if (disable || throttle)
 			amd_pmu_disable_counter(hwc, idx);
-		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-			amd_pmu_enable_counter(hwc, idx);
 	}
+
+	if (cpuc->throttle_ctrl && !throttle)
+		cpuc->enabled = 1;
+
 	return handled;
 }
 
-- 
cgit v0.10.2


From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 16:21:38 +0200
Subject: perf_counter: Rework the perf counter disable/enable

The current disable/enable mechanism is:

	token = hw_perf_save_disable();
	...
	/* do bits */
	...
	hw_perf_restore(token);

This works well, provided that the use nests properly. Except we don't.

x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore
provide a reference counter disable/enable interface, where the first disable
disables the hardware, and the last enable enables the hardware again.

[ Impact: refactor, simplify the PMU disable/enable logic ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 15cdc8e..bb1b463 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
  */
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long ret;
@@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void)
 		mb();
 	}
 	local_irq_restore(flags);
-	return ret;
 }
 
 /*
@@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void)
  * If we were previously disabled and counters were added, then
  * put the new config on the PMU.
  */
-void hw_perf_restore(u64 disable)
+void hw_perf_enable(void)
 {
 	struct perf_counter *counter;
 	struct cpu_hw_counters *cpuhw;
@@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable)
 	int n_lim;
 	int idx;
 
-	if (disable)
-		return;
 	local_irq_save(flags);
+	if (!cpuhw->disabled) {
+		local_irq_restore(flags);
+		return;
+	}
+
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
@@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 /*
  * Add a counter to the PMU.
  * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_restore to do the
+ * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
 static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
-	u64 pmudis;
 	int n0;
 	int ret = -EAGAIN;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	/*
 	 * Add the counter to the list (if there is room)
@@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter)
 
 	ret = 0;
  out:
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 	return ret;
 }
@@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
-	u64 pmudis;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	power_pmu_read(counter);
 
@@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter)
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7601c01..313638c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -31,7 +31,6 @@ struct cpu_hw_counters {
 	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			throttle_ctrl;
 	int			enabled;
 };
 
@@ -42,8 +41,8 @@ struct x86_pmu {
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
-	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64);
+	void		(*disable_all)(void);
+	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
@@ -56,6 +55,7 @@ struct x86_pmu {
 	int		counter_bits;
 	u64		counter_mask;
 	u64		max_period;
+	u64		intel_ctrl;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 intel_pmu_save_disable_all(void)
+static void intel_pmu_disable_all(void)
 {
-	u64 ctrl;
-
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-	return ctrl;
 }
 
-static u64 amd_pmu_save_disable_all(void)
+static void amd_pmu_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	int enabled, idx;
+	int idx;
+
+	if (!cpuc->enabled)
+		return;
 
-	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
@@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
-	if (!enabled)
-		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void)
 		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
-
-out:
-	return enabled;
 }
 
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	if (!x86_pmu_initialized())
-		return 0;
-	return x86_pmu.save_disable_all();
+		return;
+	return x86_pmu.disable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void intel_pmu_restore_all(u64 ctrl)
+static void intel_pmu_enable_all(void)
 {
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 }
 
-static void amd_pmu_restore_all(u64 ctrl)
+static void amd_pmu_enable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
-	cpuc->enabled = ctrl;
-	barrier();
-	if (!ctrl)
+	if (cpuc->enabled)
 		return;
 
+	cpuc->enabled = 1;
+	barrier();
+
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
@@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl)
 	}
 }
 
-void hw_perf_restore(u64 ctrl)
+void hw_perf_enable(void)
 {
 	if (!x86_pmu_initialized())
 		return;
-	x86_pmu.restore_all(ctrl);
+	x86_pmu.enable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline u64 intel_pmu_get_status(void)
 {
@@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	int ret = 0;
-
-	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
+	perf_disable();
 	status = intel_pmu_get_status();
-	if (!status)
-		goto out;
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
 
-	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -767,19 +751,11 @@ again:
 	status = intel_pmu_get_status();
 	if (status)
 		goto again;
-out:
-	/*
-	 * Restore - do not reenable when global enable is off or throttled:
-	 */
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
-			intel_pmu_restore_all(cpuc->throttle_ctrl);
-		} else {
-			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
-		}
-	}
 
-	return ret;
+	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
+		perf_enable();
+
+	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
@@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct hw_perf_counter *hwc;
 	int idx, throttle = 0;
 
-	cpuc->throttle_ctrl = cpuc->enabled;
-	cpuc->enabled = 0;
-	barrier();
-
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			throttle = 1;
+	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
+		throttle = 1;
+		__perf_disable();
+		cpuc->enabled = 0;
+		barrier();
 	}
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -824,9 +798,6 @@ next:
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
-	if (cpuc->throttle_ctrl && !throttle)
-		cpuc->enabled = 1;
-
 	return handled;
 }
 
@@ -839,13 +810,11 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
-
 		/*
 		 * Clear them before re-enabling irqs/NMIs again:
 		 */
 		cpuc->interrupts = 0;
-		hw_perf_restore(cpuc->throttle_ctrl);
+		perf_enable();
 	} else {
 		cpuc->interrupts = 0;
 	}
@@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
-	.save_disable_all	= intel_pmu_save_disable_all,
-	.restore_all		= intel_pmu_restore_all,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = {
 static struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
-	.save_disable_all	= amd_pmu_save_disable_all,
-	.restore_all		= amd_pmu_restore_all,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
@@ -1003,6 +972,8 @@ static int intel_pmu_init(void)
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
 	return 0;
 }
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index d2830f3..9645758 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void)
  */
 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
-	u64 perf_flags;
-
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 614f921..e543ecc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
-extern u64 hw_perf_save_disable(void);
-extern void hw_perf_restore(u64 ctrl);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
-static inline void hw_perf_restore(u64 ctrl)				{ }
-static inline u64 hw_perf_save_disable(void)		      { return 0; }
+static inline void perf_disable(void)					{ }
+static inline void perf_enable(void)					{ }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 985be0b..e814ff0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte
 	return NULL;
 }
 
-u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
+void __weak hw_perf_disable(void)		{ barrier(); }
+void __weak hw_perf_enable(void)		{ barrier(); }
+
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
@@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 
 void __weak perf_counter_print_debug(void)	{ }
 
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+	__get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+	return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+	__perf_disable();
+	hw_perf_disable();
+}
+EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */
+
+void perf_enable(void)
+{
+	if (__perf_enable())
+		hw_perf_enable();
+}
+EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
-	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	list_del_counter(counter, ctx);
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	if (!ctx->task) {
 		/*
@@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
 	unsigned long flags;
-	u64 perf_flags;
 	int err;
 
 	/*
@@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	add_counter_to_ctx(counter, ctx);
 
@@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info)
 		cpuctx->max_pertask--;
 
  unlock:
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
 }
@@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
-	unsigned long pmuflags;
 	unsigned long flags;
 	int err;
 
@@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info)
 	if (!group_can_go_on(counter, cpuctx, 1)) {
 		err = -EEXIST;
 	} else {
-		pmuflags = hw_perf_save_disable();
+		perf_disable();
 		if (counter == leader)
 			err = group_sched_in(counter, cpuctx, ctx,
 					     smp_processor_id());
 		else
 			err = counter_sched_in(counter, cpuctx, ctx,
 					       smp_processor_id());
-		hw_perf_restore(pmuflags);
+		perf_enable();
 	}
 
 	if (err) {
@@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
 	struct perf_counter *counter;
-	u64 flags;
 
 	spin_lock(&ctx->lock);
 	ctx->is_active = 0;
@@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 		goto out;
 	update_context_time(ctx);
 
-	flags = hw_perf_save_disable();
+	perf_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
-	hw_perf_restore(flags);
+	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
 }
@@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
-	u64 flags;
 	int can_add_hw = 1;
 
 	spin_lock(&ctx->lock);
@@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	ctx->timestamp = perf_clock();
 
-	flags = hw_perf_save_disable();
+	perf_disable();
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 				can_add_hw = 0;
 		}
 	}
-	hw_perf_restore(flags);
+	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
 }
@@ -955,7 +977,6 @@ int perf_counter_task_disable(void)
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	unsigned long flags;
-	u64 perf_flags;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
@@ -969,7 +990,7 @@ int perf_counter_task_disable(void)
 	/*
 	 * Disable all the counters:
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state != PERF_COUNTER_STATE_ERROR) {
@@ -978,7 +999,7 @@ int perf_counter_task_disable(void)
 		}
 	}
 
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
 
@@ -991,7 +1012,6 @@ int perf_counter_task_enable(void)
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	unsigned long flags;
-	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
@@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void)
 	/*
 	 * Disable all the counters:
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state > PERF_COUNTER_STATE_OFF)
@@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void)
 			ctx->time - counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock(&ctx->lock);
 
@@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void)
 static void rotate_ctx(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 perf_flags;
 
 	if (!ctx->nr_counters)
 		return;
@@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_move_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock(&ctx->lock);
 }
@@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child,
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
-		u64 perf_flags;
 
 		/*
 		 * Disable and unlink this counter.
@@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		 * could still be processing it:
 		 */
 		local_irq_save(flags);
-		perf_flags = hw_perf_save_disable();
+		perf_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
@@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		child_ctx->nr_counters--;
 
-		hw_perf_restore(perf_flags);
+		perf_enable();
 		local_irq_restore(flags);
 	}
 
-- 
cgit v0.10.2


From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 May 2009 14:52:17 +0200
Subject: perf_counter: x86: Robustify interrupt handling

Two consecutive NMIs could daze and confuse the machine when the
first would handle the overflow of both counters.

[ Impact: fix false-positive syslog messages under multi-session profiling ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 313638c..1dcf670 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
+
+		if (counter->hw_event.nmi != nmi)
+			goto next;
+
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			goto next;
@@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
-	int ret;
 
 	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
@@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu.handle_irq(regs, 1);
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs, 1);
 
-	return ret ? NOTIFY_STOP : NOTIFY_OK;
+	return NOTIFY_STOP;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-- 
cgit v0.10.2


From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:25:22 +0200
Subject: perf_counter: x86: Disallow interval of 1

On certain CPUs i have observed a stuck PMU if interval was set to
1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS,
but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL,
and the NMI loop got stuck infinitely.

[ Impact: fix rare hangs during high perfcounter load ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1dcf670..46a82d1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 		left += period;
 		atomic64_set(&hwc->period_left, left);
 	}
+	/*
+	 * Quirk: certain CPUs dont like it if just 1 event is left:
+	 */
+	if (unlikely(left < 2))
+		left = 2;
 
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
-- 
cgit v0.10.2


From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:26:20 +0200
Subject: perf_counter: x86: Protect against infinite loops in
 intel_pmu_handle_irq()

intel_pmu_handle_irq() can lock up in an infinite loop if the hardware
does not allow the acking of irqs. Alas, this happened in testing so
make this robust and emit a warning if it happens in the future.

Also, clean up the IRQ handlers a bit.

[ Impact: improve perfcounter irq/nmi handling robustness ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46a82d1..5a7f718 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	struct cpu_hw_counters;
+	int bit, cpu, loops;
 	u64 ack, status;
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	perf_disable();
 	status = intel_pmu_get_status();
@@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		return 0;
 	}
 
+	loops = 0;
 again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		return 1;
+	}
+
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
@@ -765,13 +775,14 @@ again:
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu = smp_processor_id();
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	u64 val;
-	int handled = 0;
+	int cpu, idx, throttle = 0, handled = 0;
+	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx, throttle = 0;
+	u64 val;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
 		throttle = 1;
-- 
cgit v0.10.2


From 251e8e3c7235f5944805a64f24c79fc4696793f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 May 2009 05:16:59 +0200
Subject: perf_counter: Remove ACPI quirk

We had a disable/enable around acpi_idle_do_entry() due to an erratum
in an early prototype CPU i had access to. That erratum has been fixed
in the BIOS so remove the quirk.

The quirk also kept us from profiling interrupts that hit the ACPI idle
instruction - so this is an improvement as well, beyond a cleanup and
a micro-optimization.

[ Impact: improve profiling scope, cleanup, micro-optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 9645758..f7ca8c5 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -765,7 +765,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	perf_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -780,7 +779,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	perf_enable();
 	start_critical_timings();
 }
 
-- 
cgit v0.10.2


From 58d7e993b16b62d30f8ef27757614056fe4def11 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 11:03:23 +0200
Subject: perf stat: handle Ctrl-C

Before this change, if a long-running perf stat workload was Ctrl-C-ed,
the utility exited without displaying statistics.

After the change, the Ctrl-C gets propagated into the workload (and
causes its early exit there), but perf stat itself will still continue
to run and will display counter results.

This is useful to run open-ended workloads, let them run for
a while, then Ctrl-C them to get the stats.

[ Impact: extend perf stat with new functionality ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index cf575c3..03518d7 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -538,8 +538,14 @@ static void process_options(int argc, char **argv)
 	}
 }
 
+static void skip_signal(int signo)
+{
+}
+
 int cmd_stat(int argc, char **argv, const char *prefix)
 {
+	sigset_t blocked;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	process_options(argc, argv);
@@ -548,5 +554,15 @@ int cmd_stat(int argc, char **argv, const char *prefix)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
+	/*
+	 * We dont want to block the signals - that would cause
+	 * child tasks to inherit that and Ctrl-C would not work.
+	 * What we want is for Ctrl-C to work in the exec()-ed
+	 * task, but being ignored by perf stat itself:
+	 */
+	signal(SIGINT,  skip_signal);
+	signal(SIGALRM, skip_signal);
+	signal(SIGABRT, skip_signal);
+
 	return do_perfstat(argc, argv);
 }
-- 
cgit v0.10.2


From ceec1c33c36e4783cba41416f327f4bbc05218c3 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 17 Apr 2009 12:29:22 +0100
Subject: [ARM] nwfpe: Add decleration for ExtendedCPDO

Add header file decleration for 'ExtendedCPDO' in fpa11.h
to stop the following sparse warning:

extended_cpdo.c:90:14: warning: symbol 'ExtendedCPDO' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/nwfpe/fpa11.h b/arch/arm/nwfpe/fpa11.h
index 386cbd1..d3a6f92 100644
--- a/arch/arm/nwfpe/fpa11.h
+++ b/arch/arm/nwfpe/fpa11.h
@@ -114,4 +114,8 @@ extern unsigned int SingleCPDO(struct roundingData *roundData,
 extern unsigned int DoubleCPDO(struct roundingData *roundData,
 			       const unsigned int opcode, FPREG * rFd);
 
+/* extneded_cpdo.c */
+extern unsigned int ExtendedCPDO(struct roundingData *roundData,
+				 const unsigned int opcode, FPREG * rFd);
+
 #endif
-- 
cgit v0.10.2


From 3ea385f061d08a9509a884e347ad1007bb6c3c66 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 17 Apr 2009 12:21:56 +0100
Subject: [ARM] nwfpe: fix 'floatx80_is_nan' sparse warning

The symbol 'floatx80_is_nan' prototype was defined
locally in fpa11_cprt.c when it was built outside the
file in softfloat-specialisze.

Move this into softfloat.h to fix the following sparse
warning:

softfloat-specialize:276:6: warning: symbol 'floatx80_is_nan' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/nwfpe/fpa11_cprt.c b/arch/arm/nwfpe/fpa11_cprt.c
index 9843dc5..31c4eee 100644
--- a/arch/arm/nwfpe/fpa11_cprt.c
+++ b/arch/arm/nwfpe/fpa11_cprt.c
@@ -27,10 +27,6 @@
 #include "fpmodule.inl"
 #include "softfloat.h"
 
-#ifdef CONFIG_FPE_NWFPE_XP
-extern flag floatx80_is_nan(floatx80);
-#endif
-
 unsigned int PerformFLT(const unsigned int opcode);
 unsigned int PerformFIX(const unsigned int opcode);
 
diff --git a/arch/arm/nwfpe/softfloat.h b/arch/arm/nwfpe/softfloat.h
index 260fe29..13e479c 100644
--- a/arch/arm/nwfpe/softfloat.h
+++ b/arch/arm/nwfpe/softfloat.h
@@ -226,6 +226,8 @@ char floatx80_le_quiet( floatx80, floatx80 );
 char floatx80_lt_quiet( floatx80, floatx80 );
 char floatx80_is_signaling_nan( floatx80 );
 
+extern flag floatx80_is_nan(floatx80);
+
 #endif
 
 static inline flag extractFloat32Sign(float32 a)
-- 
cgit v0.10.2


From f1a11e0576c7a73d759d05d776692b2b2d37172b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 May 2009 19:21:40 +0200
Subject: futex: remove the wait queue

The waitqueue which is used in struct futex_q is a leftover from the
futexfd implementation. There is no need to use a waitqueue at all, as
the waiting task is the only user of it. The waitqueue just adds
additional locking and a loop in the wake up path which both can be
avoided.

We have already a task reference in struct futex_q which is used for
PI futexes. Use it for normal futexes as well and just wake up the
task directly.

The logic of signalling the futex wakeup via setting q->lock_ptr to
NULL is kept with the difference that we set it NULL before doing the
wakeup. This opens an exit race window vs. a non futex wake up of the
to be woken up task, which we prevent with get_task_struct /
put_task_struct on the waiter.

[ Impact: simplification ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index aec8bf8..157bfcd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -100,8 +100,8 @@ struct futex_pi_state {
  */
 struct futex_q {
 	struct plist_node list;
-	/* There can only be a single waiter */
-	wait_queue_head_t waiter;
+	/* Waiter reference */
+	struct task_struct *task;
 
 	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
@@ -111,7 +111,6 @@ struct futex_q {
 
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
-	struct task_struct *task;
 
 	/* rt_waiter storage for requeue_pi: */
 	struct rt_mutex_waiter *rt_waiter;
@@ -694,22 +693,29 @@ retry:
  */
 static void wake_futex(struct futex_q *q)
 {
-	plist_del(&q->list, &q->list.plist);
+	struct task_struct *p = q->task;
+
 	/*
-	 * The lock in wake_up_all() is a crucial memory barrier after the
-	 * plist_del() and also before assigning to q->lock_ptr.
+	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
+	 * a non futex wake up happens on another CPU then the task
+	 * might exit and p would dereference a non existing task
+	 * struct. Prevent this by holding a reference on p across the
+	 * wake up.
 	 */
-	wake_up(&q->waiter);
+	get_task_struct(p);
+
+	plist_del(&q->list, &q->list.plist);
 	/*
-	 * The waiting task can free the futex_q as soon as this is written,
-	 * without taking any locks.  This must come last.
-	 *
-	 * A memory barrier is required here to prevent the following store to
-	 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-	 * end of wake_up() does not prevent this store from moving.
+	 * The waiting task can free the futex_q as soon as
+	 * q->lock_ptr = NULL is written, without taking any locks. A
+	 * memory barrier is required here to prevent the following
+	 * store to lock_ptr from getting ahead of the plist_del.
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
+
+	wake_up_state(p, TASK_NORMAL);
+	put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1003,7 +1009,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
 	WARN_ON(!q->rt_waiter);
 	q->rt_waiter = NULL;
 
-	wake_up(&q->waiter);
+	wake_up_state(q->task, TASK_NORMAL);
 }
 
 /**
@@ -1280,8 +1286,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	init_waitqueue_head(&q->waiter);
-
 	get_futex_key_refs(&q->key);
 	hb = hash_futex(&q->key);
 	q->lock_ptr = &hb->lock;
@@ -1575,11 +1579,9 @@ out:
  * @hb:		the futex hash bucket, must be locked by the caller
  * @q:		the futex_q to queue up on
  * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
- * @wait:	the wait_queue to add to the futex_q after queueing in the hb
  */
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
-				struct hrtimer_sleeper *timeout,
-				wait_queue_t *wait)
+				struct hrtimer_sleeper *timeout)
 {
 	queue_me(q, hb);
 
@@ -1587,19 +1589,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	 * There might have been scheduling since the queue_me(), as we
 	 * cannot hold a spinlock across the get_user() in case it
 	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-	 * queueing ourselves into the futex hash.  This code thus has to
+	 * queueing ourselves into the futex hash. This code thus has to
 	 * rely on the futex_wake() code removing us from hash when it
 	 * wakes us up.
 	 */
-
-	/* add_wait_queue is the barrier after __set_current_state. */
-	__set_current_state(TASK_INTERRUPTIBLE);
-
-	/*
-	 * Add current as the futex_q waiter.  We don't remove ourselves from
-	 * the wait_queue because we are the only user of it.
-	 */
-	add_wait_queue(&q->waiter, wait);
+	set_current_state(TASK_INTERRUPTIBLE);
 
 	/* Arm the timer */
 	if (timeout) {
@@ -1704,7 +1698,6 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
@@ -1733,7 +1726,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		goto out;
 
 	/* queue_me and wait for wakeup, timeout, or a signal. */
-	futex_wait_queue_me(hb, &q, to, &wait);
+	futex_wait_queue_me(hb, &q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	ret = 0;
@@ -2147,7 +2140,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct rt_mutex *pi_mutex = NULL;
-	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	union futex_key key2;
@@ -2191,7 +2183,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	}
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
-	futex_wait_queue_me(hb, &q, to, &wait);
+	futex_wait_queue_me(hb, &q, to);
 
 	spin_lock(&hb->lock);
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
-- 
cgit v0.10.2


From 548e1ddf255b4ebfb4ef20c08936fd8d4deb3bd9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:26 +0200
Subject: perf_counter: remove perf_disable/enable exports

Now that ACPI idle doesn't use it anymore, remove the exports.

[ Impact: remove dead code/data ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.429826617@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e814ff0..0173738 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -90,14 +90,12 @@ void perf_disable(void)
 	__perf_disable();
 	hw_perf_disable();
 }
-EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */
 
 void perf_enable(void)
 {
 	if (__perf_enable())
 		hw_perf_enable();
 }
-EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-- 
cgit v0.10.2


From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:27 +0200
Subject: perf_counter: per user mlock gift

Instead of a per-process mlock gift for perf-counters, use a
per-user gift so that there is less of a DoS potential.

[ Impact: allow less worst-case unprivileged memory consumption ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.496182835@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d185758..ff59d123 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -674,6 +674,10 @@ struct user_struct {
 	struct work_struct work;
 #endif
 #endif
+
+#ifdef CONFIG_PERF_COUNTERS
+	atomic_long_t locked_vm;
+#endif
 };
 
 extern int uids_sysfs_init(void);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0173738..93f4a0e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
-int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
+		struct user_struct *user = current_user();
+
+		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= counter->data->nr_locked;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
@@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
+	struct user_struct *user = current_user();
 	unsigned long vma_size;
 	unsigned long nr_pages;
+	unsigned long user_locked, user_lock_limit;
 	unsigned long locked, lock_limit;
+	long user_extra, extra;
 	int ret = 0;
-	long extra;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
@@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	extra = nr_pages /* + 1 only account the data pages */;
-	extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-	if (extra < 0)
-		extra = 0;
+	user_extra = nr_pages + 1;
+	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	locked = vma->vm_mm->locked_vm + extra;
+	extra = 0;
+	if (user_locked > user_lock_limit)
+		extra = user_locked - user_lock_limit;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
+	locked = vma->vm_mm->locked_vm + extra;
 
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
@@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 
 	atomic_set(&counter->mmap_count, 1);
+	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
 unlock:
-- 
cgit v0.10.2


From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:28 +0200
Subject: perf_counter: frequency based adaptive irq_period

Instead of specifying the irq_period for a counter, provide a target interrupt
frequency and dynamically adapt the irq_period to match this frequency.

[ Impact: new perf-counter attribute/feature ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.646195868@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bb1b463..db8d5ca 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -534,7 +534,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw_event.irq_period) {
+		if (counter->hw.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if ((s64)counter->hw_event.irq_period < 0)
-		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
@@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
+	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
 
@@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	val = 0;
 	left = atomic64_read(&counter->hw.period_left) - delta;
-	if (counter->hw_event.irq_period) {
+	if (period) {
 		if (left <= 0) {
-			left += counter->hw_event.irq_period;
+			left += period;
 			if (left <= 0)
-				left = counter->hw_event.irq_period;
+				left = period;
 			record = 1;
 		}
 		if (left < 0x80000000L)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a7f718..886dcf3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
-	hwc->irq_period	= hw_event->irq_period;
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-		hwc->irq_period = x86_pmu.max_period;
-
-	atomic64_set(&hwc->period_left, hwc->irq_period);
+	atomic64_set(&hwc->period_left,
+			min(x86_pmu.max_period, hwc->irq_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->irq_period;
+	s64 period = min(x86_pmu.max_period, hwc->irq_period);
 	int err;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e543ecc..004b6e1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -130,7 +130,11 @@ struct perf_counter_hw_event {
 	 */
 	__u64			config;
 
-	__u64			irq_period;
+	union {
+		__u64		irq_period;
+		__u64		irq_freq;
+	};
+
 	__u32			record_type;
 	__u32			read_format;
 
@@ -146,8 +150,9 @@ struct perf_counter_hw_event {
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
 				comm	       :  1, /* include comm data     */
+				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 52;
+				__reserved_1   : 51;
 
 	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
@@ -337,6 +342,7 @@ struct hw_perf_counter {
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
+	u64				interrupts;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 93f4a0e..0ad1db4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
+void perf_adjust_freq(struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+	u64 irq_period;
+	u64 events, period;
+	s64 delta;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+			continue;
+
+		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+			continue;
+
+		events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+		period = div64_u64(events, counter->hw_event.irq_freq);
+
+		delta = (s64)(1 + period - counter->hw.irq_period);
+		delta >>= 1;
+
+		irq_period = counter->hw.irq_period + delta;
+
+		if (!irq_period)
+			irq_period = 1;
+
+		counter->hw.irq_period = irq_period;
+		counter->hw.interrupts = 0;
+	}
+	spin_unlock(&ctx->lock);
+}
+
 /*
  * Round-robin a context's counters:
  */
@@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	ctx = &curr->perf_counter_ctx;
 
+	perf_adjust_freq(&cpuctx->ctx);
+	perf_adjust_freq(ctx);
+
 	perf_counter_cpu_sched_out(cpuctx);
 	__perf_counter_task_sched_out(ctx);
 
@@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	counter->hw.interrupts++;
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * counters
@@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_counter *counter;
 	struct pt_regs *regs;
+	u64 period;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
 	counter->pmu->read(counter);
@@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			ret = HRTIMER_NORESTART;
 	}
 
-	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+	period = max_t(u64, 10000, counter->hw.irq_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
 	return ret;
 }
@@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
+		u64 period = max_t(u64, 10000, hwc->irq_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(hwc->irq_period), 0,
+				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
 	}
 
@@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
+		u64 period = max_t(u64, 10000, hwc->irq_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(hwc->irq_period), 0,
+				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
 	}
 
@@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 
 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
-	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	const struct pmu *pmu = NULL;
-	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
 	 * Software counters (currently) can't in general distinguish
@@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
-		if (hw_event->irq_period && hw_event->irq_period < 10000)
-			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
 		/*
@@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 		else
 			pmu = &perf_ops_cpu_clock;
 
-		if (hw_event->irq_period && hw_event->irq_period < 10000)
-			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
@@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 		break;
 	}
 
-	if (pmu)
-		hwc->irq_period = hw_event->irq_period;
-
 	return pmu;
 }
 
@@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
 	const struct pmu *pmu;
 	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
 	long err;
 
 	counter = kzalloc(sizeof(*counter), gfpflags);
@@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	pmu = NULL;
 
+	hwc = &counter->hw;
+	if (hw_event->freq && hw_event->irq_freq)
+		hwc->irq_period = TICK_NSEC / hw_event->irq_freq;
+	else
+		hwc->irq_period = hw_event->irq_period;
+
 	/*
 	 * we currently do not support PERF_RECORD_GROUP on inherited counters
 	 */
-- 
cgit v0.10.2


From f5456a6b056b709282e87a68b4c1b81ac2e866fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:29 +0200
Subject: perf top: update to use the new freq interface

Provide perf top -F as alternative to -c.

[ Impact: new 'perf top' feature ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.707922166@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b1549dd..814b2e4 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -98,6 +98,7 @@ static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
 static int			use_mmap			= 0;
 static int			use_munmap			= 0;
+static int			freq				= 0;
 
 static char			*vmlinux;
 
@@ -846,9 +847,10 @@ static void process_options(int argc, char **argv)
 			{"stat",	no_argument,		NULL, 'S'},
 			{"vmlinux",	required_argument,	NULL, 'x'},
 			{"zero",	no_argument,		NULL, 'z'},
+			{"freq",	required_argument,	NULL, 'F'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -889,6 +891,7 @@ static void process_options(int argc, char **argv)
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'M': use_mmap			=              1; break;
 		case 'U': use_munmap			=              1; break;
+		case 'F': freq = 1; default_interval	=   atoi(optarg); break;
 		default: error = 1; break;
 		}
 	}
@@ -1075,6 +1078,7 @@ int cmd_top(int argc, char **argv, const char *prefix)
 			hw_event.nmi		= nmi;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
+			hw_event.freq		= freq;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
cgit v0.10.2


From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 11 Apr 2009 10:43:41 +0200
Subject: sched, timers: move calc_load() to scheduler

Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.

The load average calculation is an rough estimate anyway so there is
no real need to protect the readers vs. the update. It's not a problem
when the avenrun array is updated while a reader copies the values.

Instead of iterating over all online CPUs let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().

[ Impact: reduce xtime_lock write locked section ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..6eb4892 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 8908d19..f4eb881 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	return running + uninterruptible;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e..499672c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff4..52a8bf8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad..6a21d7a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v0.10.2


From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:08:52 +0200
Subject: sched, timers: cleanup avenrun users

avenrun is an rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.

[ Impact: cleanup ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39c..1afa4dd 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-	int a, b, c;
-	unsigned long seq;
+	unsigned long avnrun[3];
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
+	get_avenrun(avnrun, FIXED_1/200, 0);
 
-	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eb4892..de7b3b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
  *    11 bit fractions.
  */
 extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
diff --git a/kernel/sched.c b/kernel/sched.c
index f4eb881..497c09b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2868,6 +2868,21 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 6a21d7a..a26ed29 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1356,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying.  The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);
-- 
cgit v0.10.2


From 2e569d36729c8105ae066a9b105068305442cc77 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:37:47 +0200
Subject: perf_counter: frequency based adaptive irq_period, 32-bit fix

fix:

  kernel/built-in.o: In function `perf_counter_alloc':
  perf_counter.c:(.text+0x7ddc7): undefined reference to `__udivdi3'

[ Impact: build fix on 32-bit systems ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1242394667.6642.1887.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0ad1db4..728a595 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2942,7 +2942,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	hwc = &counter->hw;
 	if (hw_event->freq && hw_event->irq_freq)
-		hwc->irq_period = TICK_NSEC / hw_event->irq_freq;
+		hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
 	else
 		hwc->irq_period = hw_event->irq_period;
 
-- 
cgit v0.10.2


From f36dd6e7c0c04a056d3441185a17e5d4e3723192 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 7 May 2009 15:07:47 +0200
Subject: [ARM] S3C64XX: GPIO include cleanup

Cleanup arm/plat-s3c64xx/include/plat/gpio-bank-h.h include file.
Using shift-left operation with value >32 is a bad habit.

Reviewed-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/plat-s3c64xx/include/plat/gpio-bank-h.h b/arch/arm/plat-s3c64xx/include/plat/gpio-bank-h.h
index 8154951..2ba1767 100644
--- a/arch/arm/plat-s3c64xx/include/plat/gpio-bank-h.h
+++ b/arch/arm/plat-s3c64xx/include/plat/gpio-bank-h.h
@@ -61,14 +61,14 @@
 #define S3C64XX_GPH7_ADDR_CF1		(0x06 << 28)
 #define S3C64XX_GPH7_EINT_G6_7		(0x07 << 28)
 
-#define S3C64XX_GPH8_MMC1_DATA6		(0x02 << 32)
-#define S3C64XX_GPH8_MMC2_DATA2		(0x03 << 32)
-#define S3C64XX_GPH8_I2S_V40_LRCLK	(0x05 << 32)
-#define S3C64XX_GPH8_ADDR_CF2		(0x06 << 32)
-#define S3C64XX_GPH8_EINT_G6_8		(0x07 << 32)
-
-#define S3C64XX_GPH9_MMC1_DATA7		(0x02 << 36)
-#define S3C64XX_GPH9_MMC2_DATA3		(0x03 << 36)
-#define S3C64XX_GPH9_I2S_V40_DI		(0x05 << 36)
-#define S3C64XX_GPH9_EINT_G6_9		(0x07 << 36)
+#define S3C64XX_GPH8_MMC1_DATA6		(0x02 <<  0)
+#define S3C64XX_GPH8_MMC2_DATA2		(0x03 <<  0)
+#define S3C64XX_GPH8_I2S_V40_LRCLK	(0x05 <<  0)
+#define S3C64XX_GPH8_ADDR_CF2		(0x06 <<  0)
+#define S3C64XX_GPH8_EINT_G6_8		(0x07 <<  0)
 
+#define S3C64XX_GPH9_OUTPUT		(0x01 <<  4)
+#define S3C64XX_GPH9_MMC1_DATA7		(0x02 <<  4)
+#define S3C64XX_GPH9_MMC2_DATA3		(0x03 <<  4)
+#define S3C64XX_GPH9_I2S_V40_DI		(0x05 <<  4)
+#define S3C64XX_GPH9_EINT_G6_9		(0x07 <<  4)
-- 
cgit v0.10.2


From beb9f4ed22ee618ffcf5064ff7d5ca4cfc83104b Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 7 May 2009 15:07:41 +0200
Subject: [ARM] S3C64XX: fix GPIO debug

Fix compilation bug when debug was enabled

Reviewed-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/plat-s3c64xx/gpiolib.c b/arch/arm/plat-s3c64xx/gpiolib.c
index ee9188a..78ee52c 100644
--- a/arch/arm/plat-s3c64xx/gpiolib.c
+++ b/arch/arm/plat-s3c64xx/gpiolib.c
@@ -57,7 +57,7 @@
 #if 1
 #define gpio_dbg(x...) do { } while(0)
 #else
-#define gpio_dbg(x...) printk(KERN_DEBUG ## x)
+#define gpio_dbg(x...) printk(KERN_DEBUG x)
 #endif
 
 /* The s3c64xx_gpiolib_4bit routines are to control the gpio banks where
-- 
cgit v0.10.2


From 871fcd7cf791b853ae044f813db94caefbf66725 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 15 May 2009 14:20:53 +0100
Subject: [ARM] S3C24XX: Fix unused code warning in arch/arm/plat-s3c24xx/dma.c

Fix unused code warning in arch/arm/plat-s3c24xx/dma.c if there
is no PM support enabled. The function to_dma_chan() should
be marked inline so that the compiler will eliminate it without
warning if it isn't used.

arch/arm/plat-s3c24xx/dma.c:1239: warning: 'to_dma_chan' defined but not used

Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c
index aee2aeb..07326f6 100644
--- a/arch/arm/plat-s3c24xx/dma.c
+++ b/arch/arm/plat-s3c24xx/dma.c
@@ -1235,7 +1235,7 @@ int s3c2410_dma_getposition(unsigned int channel, dma_addr_t *src, dma_addr_t *d
 
 EXPORT_SYMBOL(s3c2410_dma_getposition);
 
-static struct s3c2410_dma_chan *to_dma_chan(struct sys_device *dev)
+static inline struct s3c2410_dma_chan *to_dma_chan(struct sys_device *dev)
 {
 	return container_of(dev, struct s3c2410_dma_chan, dev);
 }
-- 
cgit v0.10.2


From a8af6de00fafed316fea8f39d87c7e8e19ec1ea0 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 15 May 2009 14:57:09 +0100
Subject: [ARM] S3C2410: mach-bast.c registering i2c data too early

The BAST support code is calling s3c_i2c0_set_platdata() from
the map_io() entry, instead of the bast_init() code. This causes
the registration to fail due to kmalloc() not being available
at the time.

This fixes the following error:
s3c_i2c0_set_platdata: no memory for platform data

Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/mach-s3c2410/mach-bast.c b/arch/arm/mach-s3c2410/mach-bast.c
index 4389c16..8637dea 100644
--- a/arch/arm/mach-s3c2410/mach-bast.c
+++ b/arch/arm/mach-s3c2410/mach-bast.c
@@ -588,8 +588,6 @@ static void __init bast_map_io(void)
 
 	s3c_device_nand.dev.platform_data = &bast_nand_info;
 
-	s3c_i2c0_set_platdata(&bast_i2c_info);
-
 	s3c24xx_init_io(bast_iodesc, ARRAY_SIZE(bast_iodesc));
 	s3c24xx_init_clocks(0);
 	s3c24xx_init_uarts(bast_uartcfgs, ARRAY_SIZE(bast_uartcfgs));
@@ -602,6 +600,7 @@ static void __init bast_init(void)
 	sysdev_class_register(&bast_pm_sysclass);
 	sysdev_register(&bast_pm_sysdev);
 
+	s3c_i2c0_set_platdata(&bast_i2c_info);
 	s3c24xx_fb_set_platdata(&bast_fb_info);
 	platform_add_devices(bast_devices, ARRAY_SIZE(bast_devices));
 
-- 
cgit v0.10.2


From 3ac19bb443255579b30f05af5e688b6c73b1bb91 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 15 May 2009 15:21:57 +0100
Subject: [ARM] S3C: Do not set clk->owner field if unset

The s3c24xx_register_clock() function has been doing a test
on clk->owner to see if it is NULL, and then setting itself
as the owner if clk->owner == NULL.

This is not needed, arch/arm/plat-s3c/clock.c cannot be
compiled as a module, and even if it was, it should not be
playing with this field if it being registered from somewhere
else.

The best course of action is to remove this bit of
code completely.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>

diff --git a/arch/arm/plat-s3c/clock.c b/arch/arm/plat-s3c/clock.c
index b6be76e..4d01ef1 100644
--- a/arch/arm/plat-s3c/clock.c
+++ b/arch/arm/plat-s3c/clock.c
@@ -306,8 +306,6 @@ struct clk s3c24xx_uclk = {
 
 int s3c24xx_register_clock(struct clk *clk)
 {
-	clk->owner = THIS_MODULE;
-
 	if (clk->enable == NULL)
 		clk->enable = clk_null_enable;
 
-- 
cgit v0.10.2


From ef923214a4816c289e4af2d67a9ebb1a31e4ac61 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:29:14 +1000
Subject: perf_counter: powerpc: use u64 for event codes internally

Although the perf_counter API allows 63-bit raw event codes,
internally in the powerpc back-end we had been using 32-bit
event codes.  This expands them to 64 bits so that we can add
bits for specifying threshold start/stop events and instruction
sampling modes later.

This also corrects the return value of can_go_on_limited_pmc;
we were returning an event code rather than just a 0/1 value in
some circumstances. That didn't particularly matter while event
codes were 32-bit, but now that event codes are 64-bit it
might, so this fixes it.

[ Impact: extend PowerPC perfcounter interfaces from u32 to u64 ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.36874.472452.353104@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 56d66c3..ceea76a 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -23,13 +23,13 @@ struct power_pmu {
 	int	max_alternatives;
 	u64	add_fields;
 	u64	test_adder;
-	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+	int	(*compute_mmcr)(u64 events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
-	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int flags,
-				    unsigned int alt[]);
+	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(u64 event, unsigned int flags,
+				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
-	int	(*limited_pmc_event)(unsigned int event);
+	int	(*limited_pmc_event)(u64 event);
 	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index db8d5ca..8d4cafc 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -26,7 +26,7 @@ struct cpu_hw_counters {
 	int n_limited;
 	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
 	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
@@ -131,11 +131,11 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+static int power_check_constraints(u64 event[], unsigned int cflags[],
 				   int n_ev)
 {
 	u64 mask, value, nv;
-	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
@@ -564,7 +564,7 @@ void hw_perf_enable(void)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events,
+			  struct perf_counter *ctrs[], u64 *events,
 			  unsigned int *flags)
 {
 	int n = 0;
@@ -752,11 +752,11 @@ struct pmu power_pmu = {
  * that a limited PMC can count, doesn't require interrupts, and
  * doesn't exclude any processor mode.
  */
-static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 				 unsigned int flags)
 {
 	int n;
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
@@ -776,10 +776,8 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
 
 	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
 	n = ppmu->get_alternatives(ev, flags, alt);
-	if (n)
-		return alt[0];
 
-	return 0;
+	return n > 0;
 }
 
 /*
@@ -787,10 +785,9 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
  * and return the event code, or 0 if there is no such alternative.
  * (Note: event code 0 is "don't count" on all machines.)
  */
-static unsigned long normal_pmc_alternative(unsigned long ev,
-					    unsigned long flags)
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
 {
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 	int n;
 
 	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
@@ -820,9 +817,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev, flags;
+	u64 ev;
+	unsigned long flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 744a275..836fa11 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -213,7 +213,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p4_marked_instr_event(unsigned int event)
+static int p4_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -249,7 +249,7 @@ static int p4_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, lower, sh;
 	u64 mask = 0, value = 0;
@@ -320,8 +320,7 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, na;
 
@@ -353,7 +352,7 @@ static int p4_get_alternatives(unsigned int event, unsigned int flags,
 	return na;
 }
 
-static int p4_compute_mmcr(unsigned int event[], int n_ev,
+static int p4_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8154eaa..3ac0654 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -135,7 +135,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
 };
 
-static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -188,7 +188,7 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int power5p_limited_pmc_event(unsigned int event)
+static int power5p_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -273,11 +273,11 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int flags,
-				    unsigned int alt[])
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
 	int nlim;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -402,7 +402,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5p_marked_instr_event(unsigned int event)
+static int power5p_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -451,7 +451,7 @@ static int power5p_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+static int power5p_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 6e667dc..d534496 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -139,7 +139,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
 };
 
-static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -224,7 +224,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * Scan the alternatives table for a match and return the
  * index into the alternatives table if found, else -1.
  */
-static int find_alternative(unsigned int event)
+static int find_alternative(u64 event)
 {
 	int i, j;
 
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static int find_alternative_bdecode(unsigned int event)
+static u64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -269,10 +269,10 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int flags,
-				   unsigned int alt[])
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -338,7 +338,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5_marked_instr_event(unsigned int event)
+static int power5_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -382,7 +382,7 @@ static int power5_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5_compute_mmcr(unsigned int event[], int n_ev,
+static int power5_compute_mmcr(u64 event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index d44049f..ab7c615 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -134,7 +134,7 @@ static u32 marked_bus_events[16] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power6_marked_instr_event(unsigned int event)
+static int power6_marked_instr_event(u64 event)
 {
 	int pmc, psel, ptype;
 	int bit, byte, unit;
@@ -172,7 +172,7 @@ static int power6_marked_instr_event(unsigned int event)
 /*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
-static int p6_compute_mmcr(unsigned int event[], int n_ev,
+static int p6_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
@@ -265,7 +265,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
  *	32-34	select field: nest (subunit) event selector
  */
-static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, sh, subunit;
 	u64 mask = 0, value = 0;
@@ -298,7 +298,7 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p6_limited_pmc_event(unsigned int event)
+static int p6_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -337,7 +337,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * This could be made more efficient with a binary search on
  * a presorted list, if necessary
  */
-static int find_alternatives_list(unsigned int event)
+static int find_alternatives_list(u64 event)
 {
 	int i, j;
 	unsigned int alt;
@@ -356,12 +356,12 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nlim;
-	unsigned int aevent, psel, pmc;
+	unsigned int psel, pmc;
 	unsigned int nalt = 1;
+	u64 aevent;
 
 	alt[0] = event;
 	nlim = p6_limited_pmc_event(event);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index af2d188..eed47c4 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -147,7 +147,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p970_marked_instr_event(unsigned int event)
+static int p970_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -192,7 +192,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
 };
 
-static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
@@ -243,8 +243,7 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int flags,
-				 unsigned int alt[])
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	alt[0] = event;
 
@@ -257,7 +256,7 @@ static int p970_get_alternatives(unsigned int event, unsigned int flags,
 	return 1;
 }
 
-static int p970_compute_mmcr(unsigned int event[], int n_ev,
+static int p970_compute_mmcr(u64 event[], int n_ev,
 			     unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
-- 
cgit v0.10.2


From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 21:48:08 +1000
Subject: perf_counter: allow arch to supply event misc flags and instruction
 pointer

At present the values we put in overflow events for the misc
flags indicating processor mode and the instruction pointer are
obtained using the standard user_mode() and
instruction_pointer() functions. Those functions tell you where
the performance monitor interrupt was taken, which might not be
exactly where the counter overflow occurred, for example
because interrupts were disabled at the point where the
overflow occurred, or because the processor had many
instructions in flight and chose to complete some more
instructions beyond the one that caused the counter overflow.

Some architectures (e.g. powerpc) can supply more precise
information about where the counter overflow occurred and the
processor mode at that point.  This introduces new functions,
perf_misc_flags() and perf_instruction_pointer(), which arch
code can override to provide more precise information if
available.  They have default implementations which are
identical to the existing code.

This also adds a new misc flag value,
PERF_EVENT_MISC_HYPERVISOR, for the case where a counter
overflow occurred in the hypervisor.  We encode the processor
mode in the 2 bits previously used to indicate user or kernel
mode; the values for user and kernel mode are unchanged and
hypervisor mode is indicated by both bits being set.

[ Impact: generalize perfcounter core facilities ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 004b6e1..c8c1dfc2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -215,8 +215,11 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+#define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_EVENT_MISC_KERNEL		(1 << 0)
-#define PERF_EVENT_MISC_USER		(1 << 1)
+#define PERF_EVENT_MISC_USER		(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR	(3 << 0)
 #define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
 
 struct perf_event_header {
@@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock;
 
 extern void perf_counter_init(void);
 
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
+				 PERF_EVENT_MISC_KERNEL)
+#define perf_instruction_pointer(regs)	instruction_pointer(regs)
+#endif
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 728a595..57840a9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.size = sizeof(header);
 
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
-	header.misc |= user_mode(regs) ?
-		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+	header.misc |= perf_misc_flags(regs);
 
 	if (record_type & PERF_RECORD_IP) {
-		ip = instruction_pointer(regs);
+		ip = perf_instruction_pointer(regs);
 		header.type |= PERF_RECORD_IP;
 		header.size += sizeof(ip);
 	}
-- 
cgit v0.10.2


From 0bbd0d4be8d5d3676c126e06e3c75c16def00441 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:31:48 +1000
Subject: perf_counter: powerpc: supply more precise information on counter
 overflow events

This uses values from the MMCRA, SIAR and SDAR registers on
powerpc to supply more precise information for overflow events,
including a data address when PERF_RECORD_ADDR is specified.

Since POWER6 uses different bit positions in MMCRA from earlier
processors, this converts the struct power_pmu limited_pmc5_6
field, which only had 0/1 values, into a flags field and
defines bit values for its previous use (PPMU_LIMITED_PMC5_6)
and a new flag (PPMU_ALT_SIPR) to indicate that the processor
uses the POWER6 bit positions rather than the earlier
positions.  It also adds definitions in reg.h for the new and
old positions of the bit that indicates that the SIAR and SDAR
values come from the same instruction.

For the data address, the SDAR value is supplied if we are not
doing instruction sampling.  In that case there is no guarantee
that the address given in the PERF_RECORD_ADDR subrecord will
correspond to the instruction whose address is given in the
PERF_RECORD_IP subrecord.

If instruction sampling is enabled (e.g. because this counter
is counting a marked instruction event), then we only supply
the SDAR value for the PERF_RECORD_ADDR subrecord if it
corresponds to the instruction whose address is in the
PERF_RECORD_IP subrecord.  Otherwise we supply 0.

[ Impact: support more PMU hardware features on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.37028.48861.555309@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index ceea76a..1c60f0c 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -30,7 +30,7 @@ struct power_pmu {
 				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
 	int	(*limited_pmc_event)(u64 event);
-	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
+	u32	flags;
 	int	n_generic;
 	int	*generic_events;
 };
@@ -38,12 +38,24 @@ struct power_pmu {
 extern struct power_pmu *ppmu;
 
 /*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
+
+/*
  * Values for flags to get_alternatives()
  */
 #define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
 #define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
 
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d5..fb359b0 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
 #define   MMCR0_FCHV	0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1	798
 #define SPRN_MMCRA	0x312
+#define   MMCRA_SDSYNC	0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SIHV	0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR	0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */
 #define   MMCRA_SLOT_SHIFT	24
 #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */
 #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
 #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
 #define   POWER6_MMCRA_THRM	0x00000020UL
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 8d4cafc..6baae5a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -17,6 +17,7 @@
 #include <asm/pmc.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/ptrace.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -310,7 +311,8 @@ static void power_pmu_read(struct perf_counter *counter)
  */
 static int is_limited_pmc(int pmcnum)
 {
-	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+		&& (pmcnum == 5 || pmcnum == 6);
 }
 
 static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
@@ -860,7 +862,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * If this machine has limited counters, check whether this
 	 * event could go on a limited counter.
 	 */
-	if (ppmu->limited_pmc5_6) {
+	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
 		if (can_go_on_limited_pmc(counter, ev, flags)) {
 			flags |= PPMU_LIMITED_PMC_OK;
 		} else if (ppmu->limited_pmc_event(ev)) {
@@ -933,6 +935,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
+	u64 addr, mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);
@@ -963,8 +966,76 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record)
-		perf_counter_overflow(counter, nmi, regs, 0);
+	if (record) {
+		addr = 0;
+		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+			/*
+			 * The user wants a data address recorded.
+			 * If we're not doing instruction sampling,
+			 * give them the SDAR (sampled data address).
+			 * If we are doing instruction sampling, then only
+			 * give them the SDAR if it corresponds to the
+			 * instruction pointed to by SIAR; this is indicated
+			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+			 */
+			mmcra = regs->dsisr;
+			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+				addr = mfspr(SPRN_SDAR);
+		}
+		perf_counter_overflow(counter, nmi, regs, addr);
+	}
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+
+	if (TRAP(regs) != 0xf00) {
+		/* not a PMU interrupt */
+		return user_mode(regs) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+
+	mmcra = regs->dsisr;
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_EVENT_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_EVENT_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+	unsigned long ip;
+	unsigned long slot;
+
+	if (TRAP(regs) != 0xf00)
+		return regs->nip;	/* not a PMU interrupt */
+
+	ip = mfspr(SPRN_SIAR);
+	mmcra = regs->dsisr;
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			ip += 4 * (slot - 1);
+	}
+	return ip;
 }
 
 /*
@@ -984,6 +1055,11 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 					mfspr(SPRN_PMC6));
 
 	/*
+	 * Overload regs->dsisr to store MMCRA so we only need to read it once.
+	 */
+	regs->dsisr = mfspr(SPRN_MMCRA);
+
+	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
 	 */
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 3ac0654..c6cdfc1 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -625,6 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index ab7c615..cd4fbe0 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -485,6 +485,6 @@ struct power_pmu power6_pmu = {
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.limited_pmc_event = p6_limited_pmc_event,
 };
-- 
cgit v0.10.2


From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:06:12 +0530
Subject: x86, mtrr: replace MTRRcap_MSR with msr-index's MSR_MTRRcap

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c0..de9c20f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -306,7 +306,7 @@ void __init get_mtrr_state(void)
 
 	vrs = mtrr_state.var_ranges;
 
-	rdmsr(MTRRcap_MSR, lo, dummy);
+	rdmsr(MSR_MTRRcap, lo, dummy);
 	mtrr_state.have_fixed = (lo >> 8) & 1;
 
 	for (i = 0; i < num_var_ranges; i++)
@@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 static int generic_have_wrcomb(void)
 {
 	unsigned long config, dummy;
-	rdmsr(MTRRcap_MSR, config, dummy);
+	rdmsr(MSR_MTRRcap, config, dummy);
 	return (config & (1 << 10));
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01..8fc248b 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
 	unsigned long config = 0, dummy;
 
 	if (use_intel()) {
-		rdmsr(MTRRcap_MSR, config, dummy);
+		rdmsr(MSR_MTRRcap, config, dummy);
 	} else if (is_cpu(AMD))
 		config = 2;
 	else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7..5d37fb1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
-#define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
 
 #define MTRRfix64K_00000_MSR 0x250
-- 
cgit v0.10.2


From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:10:43 +0530
Subject: x86, mtrr: replace MTRRfix64K_00000_MSR with msr-index's
 MSR_MTRRfix64K_00000

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index de9c20f..8b115c0 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,7 +20,7 @@ struct fixed_range_block {
 };
 
 static struct fixed_range_block fixed_range_blocks[] = {
-	{ MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */
+	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
 	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
 	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
 	{}
@@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs)
 
 	k8_check_syscfg_dram_mod_en();
 
-	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
 		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5d37fb1..7f23cae 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,7 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
-- 
cgit v0.10.2


From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:15:32 +0530
Subject: x86, mtrr: replace MTRRfix16K_80000_MSR with msr-index's
 MSR_MTRRfix16K_80000

Use standard msr-index.h's MSR declaration and no need to declare again

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 8b115c0..00437c2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -21,7 +21,7 @@ struct fixed_range_block {
 
 static struct fixed_range_block fixed_range_blocks[] = {
 	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
-	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
+	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
 	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
 	{}
 };
@@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs)
 	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
-		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
 	for (i = 0; i < 8; i++)
 		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
 }
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 7f23cae..712b605 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,7 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
 #define MTRRfix4K_C8000_MSR 0x269
-- 
cgit v0.10.2


From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:21:54 +0530
Subject: x86, mtrr: remove mtrr MSRs double declaration

Removed MTRR MSR from mtrr/mtrr.h as these are already declared in
msr-index.h and nobody is using them:
 MTRRfix16K_A0000_MSR
 MTRRfix4K_C8000_MSR
 MTRRfix4K_D0000_MSR
 MTRRfix4K_D8000_MSR
 MTRRfix4K_E0000_MSR
 MTRRfix4K_E8000_MSR
 MTRRfix4K_F0000_MSR
 MTRRfix4K_F8000_MSR

Use standard msr-index.h's MSR declaration and no need to declare again

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 712b605..5053793 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,15 +7,7 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
 
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
-- 
cgit v0.10.2


From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:29:25 +0530
Subject: x86, mtrr: replace MTRRfix4K_C0000_MSR with msr-index's
 MSR_MTRRfix4K_C0000

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 00437c2..3cf58e2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -22,7 +22,7 @@ struct fixed_range_block {
 static struct fixed_range_block fixed_range_blocks[] = {
 	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
 	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
-	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
+	{ MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */
 	{}
 };
 
@@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs)
 	for (i = 0; i < 2; i++)
 		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
 	for (i = 0; i < 8; i++)
-		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+		rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
 void mtrr_save_fixed_ranges(void *info)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5053793..e5ee686 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,8 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix4K_C0000_MSR 0x268
-
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
-- 
cgit v0.10.2


From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:35:46 +0530
Subject: x86, mtrr: replace MTRRdefType_MSR with msr-index's MSR_MTRRdefType

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b..1d584a1 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
 
 	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
 		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
+	rdmsr(MSR_MTRRdefType, def, dummy);
 	def &= 0xff;
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	 */
 	if (!is_cpu(INTEL) || disable_mtrr_trim)
 		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
+	rdmsr(MSR_MTRRdefType, def, dummy);
 	def &= 0xff;
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3cf58e2..e930a31 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -314,7 +314,7 @@ void __init get_mtrr_state(void)
 	if (mtrr_state.have_fixed)
 		get_fixed_ranges(mtrr_state.fixed_ranges);
 
-	rdmsr(MTRRdefType_MSR, lo, dummy);
+	rdmsr(MSR_MTRRdefType, lo, dummy);
 	mtrr_state.def_type = (lo & 0xff);
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
@@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	__flush_tlb();
 
 	/*  Save MTRR state */
-	rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
 
 	/*  Disable MTRRs, and set the default type to uncached  */
-	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
@@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
-	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
 		
 	/*  Enable caches  */
 	write_cr0(read_cr0() & 0xbfffffff);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index e5ee686..7538b76 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,8 +5,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
-#define MTRRdefType_MSR 0x2ff
-
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e275..1f5fb15 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
 
 		if (use_intel())
 			/*  Save MTRR state */
-			rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+			rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
 		else
 			/* Cyrix ARRs - everything else were excluded at the top */
 			ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
 {
 	if (use_intel())
 		/*  Disable MTRRs, and set the default type to uncached  */
-		mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+		mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
 		      ctxt->deftype_hi);
 	else if (is_cpu(CYRIX))
 		/* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
 		/*  Restore MTRRdefType  */
 		if (use_intel())
 			/* Intel (P6) standard MTRRs */
-			mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+			mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
 		else
 			/* Cyrix ARRs - everything else was excluded at the top */
 			setCx86(CX86_CCR3, ctxt->ccr3);
-- 
cgit v0.10.2


From e102657ed16bbed49820d9c58509220fc8d9289a Mon Sep 17 00:00:00 2001
From: Vikram Pandita <vikram.pandita@ti.com>
Date: Sat, 16 May 2009 08:28:16 -0700
Subject: ARM: OMAP3: Fix number of GPIO lines for 34xx

As per 3430 TRM, there are 6 banks [0 to 191]

Signed-off-by: Tom Rix <Tom.Rix@windriver.com>
Signed-off-by: Vikram Pandita <vikram.pandita@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c
index 17d7afe..ee0b21f 100644
--- a/arch/arm/plat-omap/gpio.c
+++ b/arch/arm/plat-omap/gpio.c
@@ -307,7 +307,7 @@ static inline int gpio_valid(int gpio)
 		return 0;
 	if (cpu_is_omap24xx() && gpio < 128)
 		return 0;
-	if (cpu_is_omap34xx() && gpio < 160)
+	if (cpu_is_omap34xx() && gpio < 192)
 		return 0;
 	return -1;
 }
-- 
cgit v0.10.2


From 8dbe43930a4e9bede88eb67a9c613773a2747caf Mon Sep 17 00:00:00 2001
From: Kalle Jokiniemi <kalle.jokiniemi@digia.com>
Date: Sat, 16 May 2009 08:28:17 -0700
Subject: ARM: OMAP3: Fix HW SAVEANDRESTORE shift define

The OMAP3430ES2_SAVEANDRESTORE_SHIFT macro is used
by powerdomain code in
"1 << OMAP3430ES2_SAVEANDRESTORE_SHIFT" manner, but
the definition was also (1 << 4), meaning we actually
modified bit 16. So the definition needs to be 4.

This fixes also a cold reset HW bug in OMAP3430 ES3.x
where some of the efuse bits are not isolated during
wake-up from off mode. This can cause randomish
cold resets with off mode. Enabling the USBTLL hardware
SAVEANDRESTORE causes the core power up assert to be
delayed in a way that we will not get faulty values
when boot ROM is reading the unisolated registers.

Signed-off-by: Kalle Jokiniemi <kalle.jokiniemi@digia.com>
Acked-by: Kevin Hilman <khilman@deeprootsystems.com>
Acked-by: Paul Walmsley <paul@pwsan.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/prm-regbits-34xx.h b/arch/arm/mach-omap2/prm-regbits-34xx.h
index c6a7940..9fd03a2 100644
--- a/arch/arm/mach-omap2/prm-regbits-34xx.h
+++ b/arch/arm/mach-omap2/prm-regbits-34xx.h
@@ -409,7 +409,7 @@
 /* PM_PREPWSTST_CAM specific bits */
 
 /* PM_PWSTCTRL_USBHOST specific bits */
-#define OMAP3430ES2_SAVEANDRESTORE_SHIFT		(1 << 4)
+#define OMAP3430ES2_SAVEANDRESTORE_SHIFT		4
 
 /* RM_RSTST_PER specific bits */
 
-- 
cgit v0.10.2


From 005187eecaa400b4b43d9f640fbde9fcc50f37c1 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Sat, 16 May 2009 08:28:17 -0700
Subject: ARM: OMAP2/3: Change omapfb to use clkdev for dispc and rfbi, v2

This makes the framebuffer work on omap3.

Also fix the clk_get usage for checkpatch.pl
"ERROR: do not use assignment in if condition".

Cc: Imre Deak <imre.deak@nokia.com>
Cc: linux-fbdev-devel@lists.sourceforge.net
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Tony Lindgren <tony@atomide.com>

diff --git a/arch/arm/mach-omap2/clock24xx.c b/arch/arm/mach-omap2/clock24xx.c
index ff9b4c0..e4cef33 100644
--- a/arch/arm/mach-omap2/clock24xx.c
+++ b/arch/arm/mach-omap2/clock24xx.c
@@ -103,10 +103,10 @@ static struct omap_clk omap24xx_clks[] = {
 	CLK(NULL,	"mdm_ick",	&mdm_ick,	CK_243X),
 	CLK(NULL,	"mdm_osc_ck",	&mdm_osc_ck,	CK_243X),
 	/* DSS domain clocks */
-	CLK(NULL,	"dss_ick",	&dss_ick,	CK_243X | CK_242X),
-	CLK(NULL,	"dss1_fck",	&dss1_fck,	CK_243X | CK_242X),
-	CLK(NULL,	"dss2_fck",	&dss2_fck,	CK_243X | CK_242X),
-	CLK(NULL,	"dss_54m_fck",	&dss_54m_fck,	CK_243X | CK_242X),
+	CLK("omapfb",	"ick",		&dss_ick,	CK_243X | CK_242X),
+	CLK("omapfb",	"dss1_fck",	&dss1_fck,	CK_243X | CK_242X),
+	CLK("omapfb",	"dss2_fck",	&dss2_fck,	CK_243X | CK_242X),
+	CLK("omapfb",	"tv_fck",	&dss_54m_fck,	CK_243X | CK_242X),
 	/* L3 domain clocks */
 	CLK(NULL,	"core_l3_ck",	&core_l3_ck,	CK_243X | CK_242X),
 	CLK(NULL,	"ssi_fck",	&ssi_ssr_sst_fck, CK_243X | CK_242X),
diff --git a/arch/arm/mach-omap2/clock34xx.c b/arch/arm/mach-omap2/clock34xx.c
index a057539..ba05aa4 100644
--- a/arch/arm/mach-omap2/clock34xx.c
+++ b/arch/arm/mach-omap2/clock34xx.c
@@ -197,11 +197,11 @@ static struct omap_clk omap34xx_clks[] = {
 	CLK("omap_rng",	"ick",		&rng_ick,	CK_343X),
 	CLK(NULL,	"sha11_ick",	&sha11_ick,	CK_343X),
 	CLK(NULL,	"des1_ick",	&des1_ick,	CK_343X),
-	CLK(NULL,	"dss1_alwon_fck", &dss1_alwon_fck, CK_343X),
-	CLK(NULL,	"dss_tv_fck",	&dss_tv_fck,	CK_343X),
-	CLK(NULL,	"dss_96m_fck",	&dss_96m_fck,	CK_343X),
-	CLK(NULL,	"dss2_alwon_fck", &dss2_alwon_fck, CK_343X),
-	CLK(NULL,	"dss_ick",	&dss_ick,	CK_343X),
+	CLK("omapfb",	"dss1_fck",	&dss1_alwon_fck, CK_343X),
+	CLK("omapfb",	"tv_fck",	&dss_tv_fck,	CK_343X),
+	CLK("omapfb",	"video_fck",	&dss_96m_fck,	CK_343X),
+	CLK("omapfb",	"dss2_fck",	&dss2_alwon_fck, CK_343X),
+	CLK("omapfb",	"ick",		&dss_ick,	CK_343X),
 	CLK(NULL,	"cam_mclk",	&cam_mclk,	CK_343X),
 	CLK(NULL,	"cam_ick",	&cam_ick,	CK_343X),
 	CLK(NULL,	"csi2_96m_fck",	&csi2_96m_fck,	CK_343X),
diff --git a/drivers/video/omap/dispc.c b/drivers/video/omap/dispc.c
index dfb72f5e..148cbcc 100644
--- a/drivers/video/omap/dispc.c
+++ b/drivers/video/omap/dispc.c
@@ -880,20 +880,22 @@ static irqreturn_t omap_dispc_irq_handler(int irq, void *dev)
 
 static int get_dss_clocks(void)
 {
-	if (IS_ERR((dispc.dss_ick = clk_get(dispc.fbdev->dev, "dss_ick")))) {
-		dev_err(dispc.fbdev->dev, "can't get dss_ick\n");
+	dispc.dss_ick = clk_get(dispc.fbdev->dev, "ick");
+	if (IS_ERR(dispc.dss_ick)) {
+		dev_err(dispc.fbdev->dev, "can't get ick\n");
 		return PTR_ERR(dispc.dss_ick);
 	}
 
-	if (IS_ERR((dispc.dss1_fck = clk_get(dispc.fbdev->dev, "dss1_fck")))) {
+	dispc.dss1_fck = clk_get(dispc.fbdev->dev, "dss1_fck");
+	if (IS_ERR(dispc.dss1_fck)) {
 		dev_err(dispc.fbdev->dev, "can't get dss1_fck\n");
 		clk_put(dispc.dss_ick);
 		return PTR_ERR(dispc.dss1_fck);
 	}
 
-	if (IS_ERR((dispc.dss_54m_fck =
-				clk_get(dispc.fbdev->dev, "dss_54m_fck")))) {
-		dev_err(dispc.fbdev->dev, "can't get dss_54m_fck\n");
+	dispc.dss_54m_fck = clk_get(dispc.fbdev->dev, "tv_fck");
+	if (IS_ERR(dispc.dss_54m_fck)) {
+		dev_err(dispc.fbdev->dev, "can't get tv_fck\n");
 		clk_put(dispc.dss_ick);
 		clk_put(dispc.dss1_fck);
 		return PTR_ERR(dispc.dss_54m_fck);
diff --git a/drivers/video/omap/rfbi.c b/drivers/video/omap/rfbi.c
index a13c8dc..9332d6c 100644
--- a/drivers/video/omap/rfbi.c
+++ b/drivers/video/omap/rfbi.c
@@ -83,12 +83,14 @@ static inline u32 rfbi_read_reg(int idx)
 
 static int rfbi_get_clocks(void)
 {
-	if (IS_ERR((rfbi.dss_ick = clk_get(rfbi.fbdev->dev, "dss_ick")))) {
-		dev_err(rfbi.fbdev->dev, "can't get dss_ick\n");
+	rfbi.dss_ick = clk_get(rfbi.fbdev->dev, "ick");
+	if (IS_ERR(rfbi.dss_ick)) {
+		dev_err(rfbi.fbdev->dev, "can't get ick\n");
 		return PTR_ERR(rfbi.dss_ick);
 	}
 
-	if (IS_ERR((rfbi.dss1_fck = clk_get(rfbi.fbdev->dev, "dss1_fck")))) {
+	rfbi.dss1_fck = clk_get(rfbi.fbdev->dev, "dss1_fck");
+	if (IS_ERR(rfbi.dss1_fck)) {
 		dev_err(rfbi.fbdev->dev, "can't get dss1_fck\n");
 		clk_put(rfbi.dss_ick);
 		return PTR_ERR(rfbi.dss1_fck);
-- 
cgit v0.10.2


From ff05c0330b9880f9ccbb7fa40f2ed3b5842f5693 Mon Sep 17 00:00:00 2001
From: Hartley Sweeten <hartleys@visionengravers.com>
Date: Thu, 7 May 2009 18:41:47 +0100
Subject: [ARM] 5509/1: ep93xx: clkdev enable UARTS

Fix the clkdev API support for the ep93xx uart clocks.

The uarts available in the ep93xx have individual clock controls.
The current implementation assumes that the bootloader has enabled
the clocks before the kernel has booted. It also assumes that the
bootloader has set the UARTBAUD bit indicating that the uarts are
running off the 14.7456MHz external crystal.

This fixes both issues. It also allows the uart clocks to be stopped
when there are no users.

Tested-by: Matthias Kaehlcke <matthias@kaehlcke.net>

Cc: Ryan Mallon <ryan@bluewatersys.com>
Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c
index e8ebeae..b2eede5 100644
--- a/arch/arm/mach-ep93xx/clock.c
+++ b/arch/arm/mach-ep93xx/clock.c
@@ -21,15 +21,50 @@
 #include <asm/div64.h>
 #include <mach/hardware.h>
 
+
+/*
+ * The EP93xx has two external crystal oscillators.  To generate the
+ * required high-frequency clocks, the processor uses two phase-locked-
+ * loops (PLLs) to multiply the incoming external clock signal to much
+ * higher frequencies that are then divided down by programmable dividers
+ * to produce the needed clocks.  The PLLs operate independently of one
+ * another.
+ */
+#define EP93XX_EXT_CLK_RATE	14745600
+#define EP93XX_EXT_RTC_RATE	32768
+
+
 struct clk {
 	unsigned long	rate;
 	int		users;
+	int		sw_locked;
 	u32		enable_reg;
 	u32		enable_mask;
+
+	unsigned long	(*get_rate)(struct clk *clk);
 };
 
-static struct clk clk_uart = {
-	.rate		= 14745600,
+
+static unsigned long get_uart_rate(struct clk *clk);
+
+
+static struct clk clk_uart1 = {
+	.sw_locked	= 1,
+	.enable_reg	= EP93XX_SYSCON_DEVICE_CONFIG,
+	.enable_mask	= EP93XX_SYSCON_DEVICE_CONFIG_U1EN,
+	.get_rate	= get_uart_rate,
+};
+static struct clk clk_uart2 = {
+	.sw_locked	= 1,
+	.enable_reg	= EP93XX_SYSCON_DEVICE_CONFIG,
+	.enable_mask	= EP93XX_SYSCON_DEVICE_CONFIG_U2EN,
+	.get_rate	= get_uart_rate,
+};
+static struct clk clk_uart3 = {
+	.sw_locked	= 1,
+	.enable_reg	= EP93XX_SYSCON_DEVICE_CONFIG,
+	.enable_mask	= EP93XX_SYSCON_DEVICE_CONFIG_U3EN,
+	.get_rate	= get_uart_rate,
 };
 static struct clk clk_pll1;
 static struct clk clk_f;
@@ -95,9 +130,9 @@ static struct clk clk_m2m1 = {
 	{ .dev_id = dev, .con_id = con, .clk = ck }
 
 static struct clk_lookup clocks[] = {
-	INIT_CK("apb:uart1", NULL, &clk_uart),
-	INIT_CK("apb:uart2", NULL, &clk_uart),
-	INIT_CK("apb:uart3", NULL, &clk_uart),
+	INIT_CK("apb:uart1", NULL, &clk_uart1),
+	INIT_CK("apb:uart2", NULL, &clk_uart2),
+	INIT_CK("apb:uart3", NULL, &clk_uart3),
 	INIT_CK(NULL, "pll1", &clk_pll1),
 	INIT_CK(NULL, "fclk", &clk_f),
 	INIT_CK(NULL, "hclk", &clk_h),
@@ -125,6 +160,8 @@ int clk_enable(struct clk *clk)
 		u32 value;
 
 		value = __raw_readl(clk->enable_reg);
+		if (clk->sw_locked)
+			__raw_writel(0xaa, EP93XX_SYSCON_SWLOCK);
 		__raw_writel(value | clk->enable_mask, clk->enable_reg);
 	}
 
@@ -138,13 +175,29 @@ void clk_disable(struct clk *clk)
 		u32 value;
 
 		value = __raw_readl(clk->enable_reg);
+		if (clk->sw_locked)
+			__raw_writel(0xaa, EP93XX_SYSCON_SWLOCK);
 		__raw_writel(value & ~clk->enable_mask, clk->enable_reg);
 	}
 }
 EXPORT_SYMBOL(clk_disable);
 
+static unsigned long get_uart_rate(struct clk *clk)
+{
+	u32 value;
+
+	value = __raw_readl(EP93XX_SYSCON_CLOCK_CONTROL);
+	if (value & EP93XX_SYSCON_CLOCK_UARTBAUD)
+		return EP93XX_EXT_CLK_RATE;
+	else
+		return EP93XX_EXT_CLK_RATE / 2;
+}
+
 unsigned long clk_get_rate(struct clk *clk)
 {
+	if (clk->get_rate)
+		return clk->get_rate(clk);
+
 	return clk->rate;
 }
 EXPORT_SYMBOL(clk_get_rate);
@@ -162,7 +215,7 @@ static unsigned long calc_pll_rate(u32 config_word)
 	unsigned long long rate;
 	int i;
 
-	rate = 14745600;
+	rate = EP93XX_EXT_CLK_RATE;
 	rate *= ((config_word >> 11) & 0x1f) + 1;		/* X1FBD */
 	rate *= ((config_word >> 5) & 0x3f) + 1;		/* X2FBD */
 	do_div(rate, (config_word & 0x1f) + 1);			/* X2IPD */
@@ -195,7 +248,7 @@ static int __init ep93xx_clock_init(void)
 
 	value = __raw_readl(EP93XX_SYSCON_CLOCK_SET1);
 	if (!(value & 0x00800000)) {			/* PLL1 bypassed?  */
-		clk_pll1.rate = 14745600;
+		clk_pll1.rate = EP93XX_EXT_CLK_RATE;
 	} else {
 		clk_pll1.rate = calc_pll_rate(value);
 	}
@@ -206,7 +259,7 @@ static int __init ep93xx_clock_init(void)
 
 	value = __raw_readl(EP93XX_SYSCON_CLOCK_SET2);
 	if (!(value & 0x00080000)) {			/* PLL2 bypassed?  */
-		clk_pll2.rate = 14745600;
+		clk_pll2.rate = EP93XX_EXT_CLK_RATE;
 	} else if (value & 0x00040000) {		/* PLL2 enabled?  */
 		clk_pll2.rate = calc_pll_rate(value);
 	} else {
diff --git a/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h b/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h
index f66be12..1732de7 100644
--- a/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h
+++ b/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h
@@ -159,7 +159,10 @@
 #define EP93XX_SYSCON_CLOCK_SET1	EP93XX_SYSCON_REG(0x20)
 #define EP93XX_SYSCON_CLOCK_SET2	EP93XX_SYSCON_REG(0x24)
 #define EP93XX_SYSCON_DEVICE_CONFIG	EP93XX_SYSCON_REG(0x80)
-#define EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE	0x00800000
+#define EP93XX_SYSCON_DEVICE_CONFIG_U3EN		(1<<24)
+#define EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE	(1<<23)
+#define EP93XX_SYSCON_DEVICE_CONFIG_U2EN		(1<<20)
+#define EP93XX_SYSCON_DEVICE_CONFIG_U1EN		(1<<18)
 #define EP93XX_SYSCON_SWLOCK		EP93XX_SYSCON_REG(0xc0)
 
 #define EP93XX_WATCHDOG_BASE		(EP93XX_APB_VIRT_BASE + 0x00140000)
-- 
cgit v0.10.2


From 776abac81764847338a6a02c34609d4b8dfb4918 Mon Sep 17 00:00:00 2001
From: Ricardo Martins <rasm@fe.up.pt>
Date: Mon, 11 May 2009 00:15:08 +0100
Subject: [ARM] 5513/1: Eurotech VIPER SBC: fix compilation error

Compilation for this board yields the following errors:

arch/arm/mach-pxa/viper.c:511: error: 'FFUART' undeclared here (not in a function)
arch/arm/mach-pxa/viper.c:520: error: 'BTUART' undeclared here (not in a function)
arch/arm/mach-pxa/viper.c:529: error: 'STUART' undeclared here (not in a function)

Fix them by including the necessary header.

Signed-off-by: Ricardo Martins <rasm@fe.up.pt>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c
index 0e65344..dd031cc 100644
--- a/arch/arm/mach-pxa/viper.c
+++ b/arch/arm/mach-pxa/viper.c
@@ -46,6 +46,7 @@
 #include <mach/audio.h>
 #include <mach/pxafb.h>
 #include <mach/i2c.h>
+#include <mach/regs-uart.h>
 #include <mach/viper.h>
 
 #include <asm/setup.h>
-- 
cgit v0.10.2


From 8bc2095951517e2c74b8aeeca4685ddd6b16ed4b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 20:45:59 +0200
Subject: perf_counter: Fix inheritance cleanup code

Clean up code that open-coded the list_{add,del}_counter() code in
__perf_counter_exit_task() which consequently diverged. This could
lead to software counter crashes.

Also, fold the ctx->nr_counter inc/dec into those functions and clean
up some of the related code.

[ Impact: fix potential sw counter crash, cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 57840a9..59a926d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
+	ctx->nr_counters++;
 }
 
 static void
@@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
 	struct perf_counter *sibling, *tmp;
 
+	ctx->nr_counters--;
+
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
 
@@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info)
 	counter_sched_out(counter, cpuctx, ctx);
 
 	counter->task = NULL;
-	ctx->nr_counters--;
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -276,7 +278,6 @@ retry:
 	 * succeed.
 	 */
 	if (!list_empty(&counter->list_entry)) {
-		ctx->nr_counters--;
 		list_del_counter(counter, ctx);
 		counter->task = NULL;
 	}
@@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 			       struct perf_counter_context *ctx)
 {
 	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
 	counter->prev_state = PERF_COUNTER_STATE_OFF;
 	counter->tstamp_enabled = ctx->time;
 	counter->tstamp_running = ctx->time;
@@ -3206,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter,
 static void sync_child_counter(struct perf_counter *child_counter,
 			       struct perf_counter *parent_counter)
 {
-	u64 parent_val, child_val;
+	u64 child_val;
 
-	parent_val = atomic64_read(&parent_counter->count);
 	child_val = atomic64_read(&child_counter->count);
 
 	/*
@@ -3240,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
-	struct perf_counter *sub, *tmp;
 
 	/*
 	 * If we do not self-reap then we have to wait for the
@@ -3252,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child,
 	 */
 	if (child != current) {
 		wait_task_inactive(child, 0);
-		list_del_init(&child_counter->list_entry);
 		update_counter_times(child_counter);
+		list_del_counter(child_counter, child_ctx);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -3272,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		group_sched_out(child_counter, cpuctx, child_ctx);
 		update_counter_times(child_counter);
 
-		list_del_init(&child_counter->list_entry);
-
-		child_ctx->nr_counters--;
+		list_del_counter(child_counter, child_ctx);
 
 		perf_enable();
 		local_irq_restore(flags);
@@ -3288,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child,
 	 */
 	if (parent_counter) {
 		sync_child_counter(child_counter, parent_counter);
-		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
-					 list_entry) {
-			if (sub->parent) {
-				sync_child_counter(sub, sub->parent);
-				free_counter(sub);
-			}
-		}
 		free_counter(child_counter);
 	}
 }
@@ -3315,9 +3304,18 @@ void perf_counter_exit_task(struct task_struct *child)
 	if (likely(!child_ctx->nr_counters))
 		return;
 
+again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
 		__perf_counter_exit_task(child, child_counter, child_ctx);
+
+	/*
+	 * If the last counter was a group counter, it will have appended all
+	 * its siblings to the list, but we obtained 'tmp' before that which
+	 * will still point to the list head terminating the iteration.
+	 */
+	if (!list_empty(&child_ctx->counter_list))
+		goto again;
 }
 
 /*
-- 
cgit v0.10.2


From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 20:45:59 +0200
Subject: perf_counter: Fix counter inheritance

Srivatsa Vaddagiri reported that a Java workload triggers this
warning in kernel/exit.c:

   WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));

Add the inherited counter propagation on self-detach, this could
cause counter leaks and incomplete stats in threaded code like
the below:

  #include <pthread.h>
  #include <unistd.h>

  void *thread(void *arg)
  {
          sleep(5);
          return NULL;
  }

  void main(void)
  {
          pthread_t thr;
          pthread_create(&thr, NULL, thread, NULL);
  }

Reported-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/exit.c b/kernel/exit.c
index 4741376..16d74f1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	__unhash_process(tsk);
 
 	/*
-- 
cgit v0.10.2


From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:24:08 +0200
Subject: perf_counter: fix threaded task exit

Flushing counters in __exit_signal() with irqs disabled is not
a good idea as perf_counter_exit_task() acquires mutexes. So
flush it before acquiring the tasklist lock.

(Note, we still need a fix for when the PID has been unhashed.)

[ Impact: fix crash with inherited counters ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/exit.c b/kernel/exit.c
index 16d74f1..73affd3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	__unhash_process(tsk);
 
 	/*
@@ -183,6 +177,13 @@ repeat:
 	atomic_dec(&__task_cred(p)->user->processes);
 
 	proc_flush_task(p);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
-- 
cgit v0.10.2


From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 10:04:45 +0200
Subject: perf_counter, x86: fix zero irq_period counters

The quirk to irq_period unearthed an unrobustness we had in the
hw_counter initialization sequence: we left irq_period at 0, which
was then quirked up to 2 ... which then generated a _lot_ of
interrupts during 'perf stat' runs, slowed them down and skewed
the counter results in general.

Initialize irq_period to the maximum instead.

[ Impact: fix perf stat results ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 886dcf3..5bfd30a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
+	if (!hwc->irq_period)
+		hwc->irq_period = x86_pmu.max_period;
+
 	atomic64_set(&hwc->period_left,
 			min(x86_pmu.max_period, hwc->irq_period));
 
-- 
cgit v0.10.2


From 826681043d7184b4d650cab5b007b9a86b628eb5 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 17 May 2009 16:20:18 +0100
Subject: [ARM] smp: fix cpumask usage in ARM SMP code

The ARM SMP code wasn't properly updated for the cpumask changes, which
results in smp_timer_broadcast() broadcasting ticks to non-online CPUs.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index c6884ba..3e1714c 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -253,9 +253,9 @@ void __cpuinit gic_cpu_init(unsigned int gic_nr, void __iomem *base)
 }
 
 #ifdef CONFIG_SMP
-void gic_raise_softirq(cpumask_t cpumask, unsigned int irq)
+void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 {
-	unsigned long map = *cpus_addr(cpumask);
+	unsigned long map = *cpus_addr(*mask);
 
 	/* this always happens on GIC0 */
 	writel(map << 16 | irq, gic_data[0].dist_base + GIC_DIST_SOFTINT);
diff --git a/arch/arm/include/asm/hardware/gic.h b/arch/arm/include/asm/hardware/gic.h
index 4924914..7f34333b 100644
--- a/arch/arm/include/asm/hardware/gic.h
+++ b/arch/arm/include/asm/hardware/gic.h
@@ -36,7 +36,7 @@
 void gic_dist_init(unsigned int gic_nr, void __iomem *base, unsigned int irq_start);
 void gic_cpu_init(unsigned int gic_nr, void __iomem *base);
 void gic_cascade_irq(unsigned int gic_nr, unsigned int irq);
-void gic_raise_softirq(cpumask_t cpumask, unsigned int irq);
+void gic_raise_softirq(const struct cpumask *mask, unsigned int irq);
 #endif
 
 #endif
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index fad70da..5995935 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -53,17 +53,12 @@ extern void smp_store_cpu_info(unsigned int cpuid);
 /*
  * Raise an IPI cross call on CPUs in callmap.
  */
-extern void smp_cross_call(cpumask_t callmap);
-
-/*
- * Broadcast a timer interrupt to the other CPUs.
- */
-extern void smp_send_timer(void);
+extern void smp_cross_call(const struct cpumask *mask);
 
 /*
  * Broadcast a clock event to other CPUs.
  */
-extern void smp_timer_broadcast(cpumask_t mask);
+extern void smp_timer_broadcast(const struct cpumask *mask);
 
 /*
  * Boot a secondary CPU, and assign it the specified idle task.
@@ -102,7 +97,8 @@ extern int platform_cpu_kill(unsigned int cpu);
 extern void platform_cpu_enable(unsigned int cpu);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 /*
  * Local timer interrupt handling function (can be IPI'ed).
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 7801aac..6014dfd 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -326,14 +326,14 @@ void __init smp_prepare_boot_cpu(void)
 	per_cpu(cpu_data, cpu).idle = current;
 }
 
-static void send_ipi_message(cpumask_t callmap, enum ipi_msg_type msg)
+static void send_ipi_message(const struct cpumask *mask, enum ipi_msg_type msg)
 {
 	unsigned long flags;
 	unsigned int cpu;
 
 	local_irq_save(flags);
 
-	for_each_cpu_mask(cpu, callmap) {
+	for_each_cpu(cpu, mask) {
 		struct ipi_data *ipi = &per_cpu(ipi_data, cpu);
 
 		spin_lock(&ipi->lock);
@@ -344,19 +344,19 @@ static void send_ipi_message(cpumask_t callmap, enum ipi_msg_type msg)
 	/*
 	 * Call the platform specific cross-CPU call function.
 	 */
-	smp_cross_call(callmap);
+	smp_cross_call(mask);
 
 	local_irq_restore(flags);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	send_ipi_message(mask, IPI_CALL_FUNC);
 }
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	send_ipi_message(cpumask_of_cpu(cpu), IPI_CALL_FUNC_SINGLE);
+	send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
 }
 
 void show_ipi_list(struct seq_file *p)
@@ -498,17 +498,10 @@ asmlinkage void __exception do_IPI(struct pt_regs *regs)
 
 void smp_send_reschedule(int cpu)
 {
-	send_ipi_message(cpumask_of_cpu(cpu), IPI_RESCHEDULE);
+	send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
 }
 
-void smp_send_timer(void)
-{
-	cpumask_t mask = cpu_online_map;
-	cpu_clear(smp_processor_id(), mask);
-	send_ipi_message(mask, IPI_TIMER);
-}
-
-void smp_timer_broadcast(cpumask_t mask)
+void smp_timer_broadcast(const struct cpumask *mask)
 {
 	send_ipi_message(mask, IPI_TIMER);
 }
@@ -517,7 +510,7 @@ void smp_send_stop(void)
 {
 	cpumask_t mask = cpu_online_map;
 	cpu_clear(smp_processor_id(), mask);
-	send_ipi_message(mask, IPI_CPU_STOP);
+	send_ipi_message(&mask, IPI_CPU_STOP);
 }
 
 /*
@@ -528,20 +521,17 @@ int setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-static int
-on_each_cpu_mask(void (*func)(void *), void *info, int wait, cpumask_t mask)
+static void
+on_each_cpu_mask(void (*func)(void *), void *info, int wait,
+		const struct cpumask *mask)
 {
-	int ret = 0;
-
 	preempt_disable();
 
-	ret = smp_call_function_mask(mask, func, info, wait);
-	if (cpu_isset(smp_processor_id(), mask))
+	smp_call_function_many(mask, func, info, wait);
+	if (cpumask_test_cpu(smp_processor_id(), mask))
 		func(info);
 
 	preempt_enable();
-
-	return ret;
 }
 
 /**********************************************************************/
@@ -602,20 +592,17 @@ void flush_tlb_all(void)
 
 void flush_tlb_mm(struct mm_struct *mm)
 {
-	cpumask_t mask = mm->cpu_vm_mask;
-
-	on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mask);
+	on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, &mm->cpu_vm_mask);
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 {
-	cpumask_t mask = vma->vm_mm->cpu_vm_mask;
 	struct tlb_args ta;
 
 	ta.ta_vma = vma;
 	ta.ta_start = uaddr;
 
-	on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mask);
+	on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, &vma->vm_mm->cpu_vm_mask);
 }
 
 void flush_tlb_kernel_page(unsigned long kaddr)
@@ -630,14 +617,13 @@ void flush_tlb_kernel_page(unsigned long kaddr)
 void flush_tlb_range(struct vm_area_struct *vma,
                      unsigned long start, unsigned long end)
 {
-	cpumask_t mask = vma->vm_mm->cpu_vm_mask;
 	struct tlb_args ta;
 
 	ta.ta_vma = vma;
 	ta.ta_start = start;
 	ta.ta_end = end;
 
-	on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mask);
+	on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, &vma->vm_mm->cpu_vm_mask);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
diff --git a/arch/arm/mach-realview/include/mach/smp.h b/arch/arm/mach-realview/include/mach/smp.h
index 515819e..e2ff534 100644
--- a/arch/arm/mach-realview/include/mach/smp.h
+++ b/arch/arm/mach-realview/include/mach/smp.h
@@ -15,15 +15,15 @@
 /*
  * We use IRQ1 as the IPI
  */
-static inline void smp_cross_call(cpumask_t callmap)
+static inline void smp_cross_call(const struct cpumask *mask)
 {
-	gic_raise_softirq(callmap, 1);
+	gic_raise_softirq(mask, 1);
 }
 
 /*
  * Do nothing on MPcore.
  */
-static inline void smp_cross_call_done(cpumask_t callmap)
+static inline void smp_cross_call_done(const struct cpumask *mask)
 {
 }
 
diff --git a/arch/arm/mach-realview/platsmp.c b/arch/arm/mach-realview/platsmp.c
index ea3c755..a573046 100644
--- a/arch/arm/mach-realview/platsmp.c
+++ b/arch/arm/mach-realview/platsmp.c
@@ -82,7 +82,7 @@ void __cpuinit platform_secondary_init(unsigned int cpu)
 	 * to get this processor out of WFI in the BootMonitor - make
 	 * sure that we are no longer being sent this soft interrupt
 	 */
-	smp_cross_call_done(cpumask_of_cpu(cpu));
+	smp_cross_call_done(cpumask_of(cpu));
 
 	/*
 	 * if any interrupts are already enabled for the primary
@@ -136,7 +136,7 @@ int __cpuinit boot_secondary(unsigned int cpu, struct task_struct *idle)
 	 * Use smp_cross_call() for this, since there's little
 	 * point duplicating the code here
 	 */
-	smp_cross_call(cpumask_of_cpu(cpu));
+	smp_cross_call(cpumask_of(cpu));
 
 	timeout = jiffies + (1 * HZ);
 	while (time_before(jiffies, timeout)) {
-- 
cgit v0.10.2


From e8e7526c3c0863be25ab03a0871ee0978de5ba50 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Sun, 17 May 2009 17:22:53 +0200
Subject: ide-tape: fix debug call

This error only occurs when IDETAPE_DEBUG_LOG is enabled.

Signed-off-by: Mark de Wever <koraq@xs4all.nl>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8dfc688..203bbea 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -617,7 +617,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n",
 		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
-- 
cgit v0.10.2


From 78d236c2b30d4712c1fd8c9768b163c94b39e77d Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 17 May 2009 16:23:45 +0100
Subject: [ARM] realview: remove useless smp_cross_call_done()

smp_cross_call_done() is a no-op for MPCore, and since it's only
used by platform code, there's no point in having it unless it's
doing something.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-realview/include/mach/smp.h b/arch/arm/mach-realview/include/mach/smp.h
index e2ff534..dd53892 100644
--- a/arch/arm/mach-realview/include/mach/smp.h
+++ b/arch/arm/mach-realview/include/mach/smp.h
@@ -20,11 +20,4 @@ static inline void smp_cross_call(const struct cpumask *mask)
 	gic_raise_softirq(mask, 1);
 }
 
-/*
- * Do nothing on MPcore.
- */
-static inline void smp_cross_call_done(const struct cpumask *mask)
-{
-}
-
 #endif
diff --git a/arch/arm/mach-realview/platsmp.c b/arch/arm/mach-realview/platsmp.c
index a573046..b037fd6 100644
--- a/arch/arm/mach-realview/platsmp.c
+++ b/arch/arm/mach-realview/platsmp.c
@@ -78,13 +78,6 @@ void __cpuinit platform_secondary_init(unsigned int cpu)
 	trace_hardirqs_off();
 
 	/*
-	 * the primary core may have used a "cross call" soft interrupt
-	 * to get this processor out of WFI in the BootMonitor - make
-	 * sure that we are no longer being sent this soft interrupt
-	 */
-	smp_cross_call_done(cpumask_of(cpu));
-
-	/*
 	 * if any interrupts are already enabled for the primary
 	 * core (e.g. timer irq), then they will not have been enabled
 	 * for us: do so
-- 
cgit v0.10.2


From ee348d5a1d810bc9958cabb7c27302aab235d36e Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 17 May 2009 17:00:47 +0100
Subject: [ARM] realview: fix broadcast tick support

Having discussed broadcast tick support with Thomas Glexiner, the
broadcast tick devices should be registered with a higher rating
than the global tick device, and it should have the ONESHOT and
PERIODIC feature flags set.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Thomas Glexiner <tglx@linutronix.de>

diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index 942e1a7..076acbc 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -750,14 +750,6 @@ void __init realview_timer_init(unsigned int timer_irq)
 {
 	u32 val;
 
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-	/*
-	 * The dummy clock device has to be registered before the main device
-	 * so that the latter will broadcast the clock events
-	 */
-	local_timer_setup();
-#endif
-
 	/* 
 	 * set clock frequency: 
 	 *	REALVIEW_REFCLK is 32KHz
diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c
index d0d39ad..1c01d13 100644
--- a/arch/arm/mach-realview/localtimer.c
+++ b/arch/arm/mach-realview/localtimer.c
@@ -189,8 +189,10 @@ void __cpuinit local_timer_setup(void)
 	struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
 
 	clk->name		= "dummy_timer";
-	clk->features		= CLOCK_EVT_FEAT_DUMMY;
-	clk->rating		= 200;
+	clk->features		= CLOCK_EVT_FEAT_ONESHOT |
+				  CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_DUMMY;
+	clk->rating		= 400;
 	clk->mult               = 1;
 	clk->set_mode		= dummy_timer_set_mode;
 	clk->broadcast		= smp_timer_broadcast;
diff --git a/arch/arm/mach-realview/platsmp.c b/arch/arm/mach-realview/platsmp.c
index b037fd6..30a9c68 100644
--- a/arch/arm/mach-realview/platsmp.c
+++ b/arch/arm/mach-realview/platsmp.c
@@ -217,11 +217,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	if (max_cpus > ncores)
 		max_cpus = ncores;
 
-#ifdef CONFIG_LOCAL_TIMERS
+#if defined(CONFIG_LOCAL_TIMERS) || defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)
 	/*
-	 * Enable the local timer for primary CPU. If the device is
-	 * dummy (!CONFIG_LOCAL_TIMERS), it was already registers in
-	 * realview_timer_init
+	 * Enable the local timer or broadcast device for the boot CPU.
 	 */
 	local_timer_setup();
 #endif
-- 
cgit v0.10.2


From 6fd058f7791087648c683eb8572edf3be3c4c23c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 17 May 2009 15:38:01 -0400
Subject: ext4: Add a comprehensive block validity check to ext4_get_blocks()

To catch filesystem bugs or corruption which could lead to the
filesystem getting severly damaged, this patch adds a facility for
tracking all of the filesystem metadata blocks by contiguous regions
in a red-black tree.  This allows quick searching of the tree to
locate extents which might overlap with filesystem metadata blocks.

This facility is also used by the multi-block allocator to assure that
it is not allocating blocks out of the system zone, as well as by the
routines used when reading indirect blocks and extents information
from disk to make sure their contents are valid.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003..8a34710 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o migrate.o mballoc.o
+		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 0000000..50784ef
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4.h"
+
+struct ext4_system_zone {
+	struct rb_node	node;
+	ext4_fsblk_t	start_blk;
+	unsigned int	count;
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+int __init init_ext4_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+					     SLAB_RECLAIM_ACCOUNT);
+	if (ext4_system_zone_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void exit_ext4_system_zone(void)
+{
+	kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+static inline int can_merge(struct ext4_system_zone *entry1,
+		     struct ext4_system_zone *entry2)
+{
+	if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+		return 1;
+	return 0;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+			   ext4_fsblk_t start_blk,
+			   unsigned int count)
+{
+	struct ext4_system_zone *new_entry = NULL, *entry;
+	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node = NULL;
+
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_system_zone, node);
+		if (start_blk < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {
+			if (start_blk + count > (entry->start_blk + 
+						 entry->count))
+				entry->count = (start_blk + count - 
+						entry->start_blk);
+			new_node = *n;
+			new_entry = rb_entry(new_node, struct ext4_system_zone,
+					     node);
+			break;
+		}
+	}
+
+	if (!new_entry) {
+		new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+					     GFP_KERNEL);
+		if (!new_entry)
+			return -ENOMEM;
+		new_entry->start_blk = start_blk;
+		new_entry->count = count;
+		new_node = &new_entry->node;
+
+		rb_link_node(new_node, parent, n);
+		rb_insert_color(new_node, &sbi->system_blks);
+	}
+
+	/* Can we merge to the left? */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(entry, new_entry)) {
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+
+	/* Can we merge to the right? */
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+	return 0;
+}
+
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct rb_node *node;
+	struct ext4_system_zone *entry;
+	int first = 1;
+
+	printk(KERN_INFO "System zones: ");
+	node = rb_first(&sbi->system_blks);
+	while (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		printk("%s%llu-%llu", first ? "" : ", ",
+		       entry->start_blk, entry->start_blk + entry->count - 1);
+		first = 0;
+		node = rb_next(node);
+	}
+	printk("\n");
+}
+
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (EXT4_SB(sb)->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (EXT4_SB(sb)->system_blks.rb_node)
+		return 0;
+
+	for (i=0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0)))
+			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+					sbi->s_gdb_count + 1);
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			return ret;
+	}
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(EXT4_SB(sb));
+	return 0;
+}
+
+/* Called when the filesystem is unmounted */
+void ext4_release_system_zone(struct super_block *sb)
+{
+	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node;
+	struct rb_node	*parent;
+	struct ext4_system_zone	*entry;
+
+	while (n) {
+		/* Do the node's children first */
+		if (n->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		/*
+		 * The node has no children; free it, and then zero
+		 * out parent's link to it.  Finally go to the
+		 * beginning of the loop and try to free the parent
+		 * node.
+		 */
+		parent = rb_parent(n);
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+		if (!parent)
+			EXT4_SB(sb)->system_blks.rb_node = NULL;
+		else if (parent->rb_left == n)
+			parent->rb_left = NULL;
+		else if (parent->rb_right == n)
+			parent->rb_right = NULL;
+		n = parent;
+	}
+	EXT4_SB(sb)->system_blks.rb_node = NULL;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n = sbi->system_blks.rb_node;
+
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
+		return 0;
+	while (n) {
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else
+			return 0;
+	}
+	return 1;
+}
+
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d164f12..4311cc8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -696,6 +696,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -887,6 +888,7 @@ struct ext4_sb_info {
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+	struct rb_root system_blks;
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
@@ -1618,6 +1620,15 @@ extern struct dentry *ext4_get_parent(struct dentry *child);
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init init_ext4_system_zone(void);
+extern void exit_ext4_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+				 ext4_fsblk_t start_blk,
+				 unsigned int count);
+
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 27c383c..d04b779 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext), valid_block;
+	ext4_fsblk_t block = ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
-	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
-	valid_block = le32_to_cpu(es->s_first_data_block) +
-		EXT4_SB(inode->i_sb)->s_gdb_count;
-	if (unlikely(block <= valid_block ||
-		     ((block + len) > ext4_blocks_count(es))))
-		return 0;
-	else
-		return 1;
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
-	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	ext4_fsblk_t block = idx_pblock(ext_idx);
 
-	valid_block = le32_to_cpu(es->s_first_data_block) +
-		EXT4_SB(inode->i_sb)->s_gdb_count;
-	if (unlikely(block <= valid_block ||
-		     (block >= ext4_blocks_count(es))))
-		return 0;
-	else
-		return 1;
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
 
 static int ext4_valid_extent_entries(struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d7b7480..dadd3f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode,
 }
 
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-				 __le32 *p, unsigned int max) {
-
-	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+				 __le32 *p, unsigned int max)
+{
 	__le32 *bref = p;
+	unsigned int blk;
+
 	while (bref < p+max) {
-		if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
+		blk = le32_to_cpu(*bref++);
+		if (blk && 
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 
+						    blk, 1))) {
 			ext4_error(inode->i_sb, function,
-				   "block reference %u >= max (%u) "
-				   "in inode #%lu, offset=%d",
-				   le32_to_cpu(*bref), maxblocks,
-				   inode->i_ino, (int)(bref-p));
+				   "invalid block reference %u "
+				   "in inode #%lu", blk, inode->i_ino);
  			return -EIO;
  		}
-		bref++;
  	}
  	return 0;
 }
@@ -1125,6 +1126,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 		ext4_discard_preallocations(inode);
 }
 
+static int check_block_validity(struct inode *inode, sector_t logical,
+				sector_t phys, int len)
+{
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+		ext4_error(inode->i_sb, "check_block_validity",
+			   "inode #%lu logical block %llu mapped to %llu "
+			   "(size %d)", inode->i_ino,
+			   (unsigned long long) logical,
+			   (unsigned long long) phys, len);
+		WARN_ON(1);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
  * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -1170,6 +1186,13 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
+	if (retval > 0 && buffer_mapped(bh)) {
+		int ret = check_block_validity(inode, block, 
+					       bh->b_blocknr, retval);
+		if (ret != 0)
+			return ret;
+	}
+
 	/* If it is only a block(s) look up */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 		return retval;
@@ -1245,6 +1268,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		ext4_da_update_reserve_space(inode, retval);
 
 	up_write((&EXT4_I(inode)->i_data_sem));
+	if (retval > 0 && buffer_mapped(bh)) {
+		int ret = check_block_validity(inode, block, 
+					       bh->b_blocknr, retval);
+		if (ret != 0)
+			return ret;
+	}
 	return retval;
 }
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 541bd9a..ed8482e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2961,15 +2961,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		+ le32_to_cpu(es->s_first_data_block);
 
 	len = ac->ac_b_ex.fe_len;
-	if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
-	    in_range(block, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group) ||
-	    in_range(block + len - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group)) {
+	if (!ext4_data_block_valid(sbi, block, len)) {
 		ext4_error(sb, __func__,
-			   "Allocating block %llu in system zone of %d group\n",
-			   block, ac->ac_b_ex.fe_group);
+			   "Allocating blocks %llu-%llu which overlap "
+			   "fs metadata\n", block, block+len);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc34ed3..600b7ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -568,6 +568,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
@@ -1055,6 +1056,7 @@ enum {
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
 	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
@@ -1114,6 +1116,8 @@ static const match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_block_validity, "block_validity"},
+	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1508,6 +1512,12 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_block_validity:
+			set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
+		case Opt_noblock_validity:
+			clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
 		case Opt_inode_readahead_blks:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -2826,6 +2836,13 @@ no_journal:
 	} else if (test_opt(sb, DELALLOC))
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
+	err = ext4_setup_system_zone(sb);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initialize system "
+		       "zone (%d)\n", err);
+		goto failed_mount4;
+	}
+
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
@@ -2875,6 +2892,7 @@ cantfind_ext4:
 
 failed_mount4:
 	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -3515,6 +3533,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
+	ext4_setup_system_zone(sb);
 	if (sbi->s_journal == NULL)
 		ext4_commit_super(sb, 1);
 
@@ -3927,13 +3946,16 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	err = init_ext4_system_zone();
+	if (err)
+		return err;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
-		return -ENOMEM;
+		goto out4;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
-		return err;
+		goto out3;
 
 	err = init_ext4_xattr();
 	if (err)
@@ -3958,6 +3980,11 @@ out1:
 	exit_ext4_xattr();
 out2:
 	exit_ext4_mballoc();
+out3:
+	remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
+out4:
+	exit_ext4_system_zone();
 	return err;
 }
 
@@ -3972,6 +3999,7 @@ static void __exit exit_ext4_fs(void)
 	exit_ext4_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
+	exit_ext4_system_zone();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
cgit v0.10.2


From 0568c518937ee3a9b6a94d18bae9c150fe5d6832 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 17 May 2009 23:31:23 -0400
Subject: ext4: down i_data_sem only for read when walking tree for fiemap

Not sure why I put this in as down_write originally; all we are
doing is walking the tree, nothing will change under us and
concurrent reads should be no problem.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d04b779..d4e99e9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3312,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 * Walk the extent tree gathering extent information.
 		 * ext4_ext_fiemap_cb will push extents back to user.
 		 */
-		down_write(&EXT4_I(inode)->i_data_sem);
+		down_read(&EXT4_I(inode)->i_data_sem);
 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
 					  ext4_ext_fiemap_cb, fieinfo);
-		up_write(&EXT4_I(inode)->i_data_sem);
+		up_read(&EXT4_I(inode)->i_data_sem);
 	}
 
 	return error;
-- 
cgit v0.10.2


From 5e392739d6ab72f7c35040aa07f4097904bce6e7 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 11 May 2009 00:36:35 +0000
Subject: netpoll: don't dereference NULL dev from np

It looks like the dev in netpoll_poll can be NULL - at lease it's
checked at the function beginning. Thus the dev->netde_ops dereference
looks dangerous.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..64f51ee 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -175,9 +175,13 @@ static void service_arp_queue(struct netpoll_info *npi)
 void netpoll_poll(struct netpoll *np)
 {
 	struct net_device *dev = np->dev;
-	const struct net_device_ops *ops = dev->netdev_ops;
+	const struct net_device_ops *ops;
+
+	if (!dev || !netif_running(dev))
+		return;
 
-	if (!dev || !netif_running(dev) || !ops->ndo_poll_controller)
+	ops = dev->netdev_ops;
+	if (!ops->ndo_poll_controller)
 		return;
 
 	/* Process pending work on NIC */
-- 
cgit v0.10.2


From 2513dfb83fc775364fe85803d3a84d7ebe5763a5 Mon Sep 17 00:00:00 2001
From: Chris Friesen <cfriesen@nortel.com>
Date: Sun, 17 May 2009 20:39:33 -0700
Subject: ipconfig: handle case of delayed DHCP server

If a DHCP server is delayed, it's possible for the client to receive the
DHCPOFFER after it has already sent out a new DHCPDISCOVER message from
a second interface.  The client then sends out a DHCPREQUEST from the
second interface, but the server doesn't recognize the device and
rejects the request.

This patch simply tracks the current device being configured and throws
away the OFFER if it is not intended for the current device.  A more
sophisticated approach would be to put the OFFER information into the
struct ic_device rather than storing it globally.

Signed-off-by: Chris Friesen <cfriesen@nortel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 90d22ae..88bf051 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -139,6 +139,8 @@ __be32 ic_servaddr = NONE;	/* Boot server IP address */
 __be32 root_server_addr = NONE;	/* Address of NFS server */
 u8 root_server_path[256] = { 0, };	/* Path to mount as root */
 
+u32 ic_dev_xid;		/* Device under configuration */
+
 /* vendor class identifier */
 static char vendor_class_identifier[253] __initdata;
 
@@ -932,6 +934,13 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
 		goto drop_unlock;
 	}
 
+	/* Is it a reply for the device we are configuring? */
+	if (b->xid != ic_dev_xid) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n");
+		goto drop_unlock;
+	}
+
 	/* Parse extensions */
 	if (ext_len >= 4 &&
 	    !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
@@ -1115,6 +1124,9 @@ static int __init ic_dynamic(void)
 	get_random_bytes(&timeout, sizeof(timeout));
 	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
 	for (;;) {
+		/* Track the device we are configuring */
+		ic_dev_xid = d->xid;
+
 #ifdef IPCONFIG_BOOTP
 		if (do_bootp && (d->able & IC_BOOTP))
 			ic_bootp_send_if(d, jiffies - start_jiffies);
-- 
cgit v0.10.2


From 72876a603422d3767273ffb3918033fa1bfda0f3 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Sun, 17 May 2009 20:48:59 -0700
Subject: mlx4_en: Fix not deleted napi structures

Napi structures are being created each time we open a port, but when
the port is closed the napi structure is only disabled but not removed.
This bug caused hang while removing the driver.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/mlx4/en_cq.c b/drivers/net/mlx4/en_cq.c
index 91f50de..a276125 100644
--- a/drivers/net/mlx4/en_cq.c
+++ b/drivers/net/mlx4/en_cq.c
@@ -125,8 +125,10 @@ void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
 
 	if (cq->is_tx)
 		del_timer(&cq->timer);
-	else
+	else {
 		napi_disable(&cq->napi);
+		netif_napi_del(&cq->napi);
+	}
 
 	mlx4_cq_free(mdev->dev, &cq->mcq);
 }
-- 
cgit v0.10.2


From f68301656b5f5d2de104f2687add6beeb8f3c3b9 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:44 -0400
Subject: ext4: Fix memory leak in ext4_fill_super() in case of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 600b7ad..eca6c05 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2924,6 +2924,7 @@ failed_mount:
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
-- 
cgit v0.10.2


From de5ce037304f2c88a319b1c3b808ab0c4c618c1c Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:47 -0400
Subject: ext3: Fix memory leak in ext3_fill_super() in case of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe..d8b73d4a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2021,6 +2021,7 @@ failed_mount:
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
-- 
cgit v0.10.2


From 0f7ee7c17241915fdaff49d1a36f5aafd80a7dce Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:51 -0400
Subject: ext2: Fix memory leak in ext2_fill_super() in case of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe6..e3c748f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1093,6 +1093,7 @@ failed_mount:
 	brelse(bh);
 failed_sbi:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return ret;
 }
-- 
cgit v0.10.2


From a8f492c6ac6072d6c5bce4b2f13dc44612991051 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Sat, 16 May 2009 01:21:58 +0000
Subject: NET: Meth: Fix unsafe mix of irq and non-irq spinlocks.

Mixing of normal and irq spinlocks results in the following lockdep messages
on bootup on IP32:

[...]
Sending DHCP requests .
======================================================
[ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]
2.6.30-rc5-00164-g41baeef #30
------------------------------------------------------
swapper/1 [HC0[0]:SC0[1]:HE0:SE0] is trying to acquire:
 (&priv->meth_lock){+.+...}, at: [<ffffffff8026388c>] meth_tx+0x48/0x43c

and this task is already holding:
 (_xmit_ETHER#2){+.-...}, at: [<ffffffff802d3a00>] __qdisc_run+0x118/0x30c
which would create a new lock dependency:
 (_xmit_ETHER#2){+.-...} -> (&priv->meth_lock){+.+...}

but this new dependency connects a SOFTIRQ-irq-safe lock:
 (_xmit_ETHER#2){+.-...}
... which became SOFTIRQ-irq-safe at:
  [<ffffffff80061458>] __lock_acquire+0x784/0x1a14
  [<ffffffff800627e0>] lock_acquire+0xf8/0x150
  [<ffffffff800128d0>] _spin_lock+0x30/0x44
  [<ffffffff802d2b88>] dev_watchdog+0x70/0x398
  [<ffffffff800433b8>] run_timer_softirq+0x1a8/0x248
  [<ffffffff8003da5c>] __do_softirq+0xec/0x208
  [<ffffffff8003dbd8>] do_softirq+0x60/0xe4
  [<ffffffff8003dda0>] irq_exit+0x54/0x9c
  [<ffffffff80004420>] ret_from_irq+0x0/0x4
  [<ffffffff80004720>] r4k_wait+0x20/0x40
  [<ffffffff80015418>] cpu_idle+0x30/0x60
  [<ffffffff804cd934>] start_kernel+0x3ec/0x404

to a SOFTIRQ-irq-unsafe lock:
 (&priv->meth_lock){+.+...}
... which became SOFTIRQ-irq-unsafe at:
...  [<ffffffff800614f8>] __lock_acquire+0x824/0x1a14
  [<ffffffff800627e0>] lock_acquire+0xf8/0x150
  [<ffffffff800128d0>] _spin_lock+0x30/0x44
  [<ffffffff80263f20>] meth_reset+0x118/0x2d8
  [<ffffffff8026424c>] meth_open+0x28/0x140
  [<ffffffff802c1ae8>] dev_open+0xe0/0x18c
  [<ffffffff802c1268>] dev_change_flags+0xd8/0x1d4
  [<ffffffff804e7770>] ip_auto_config+0x1d4/0xf28
  [<ffffffff80012e68>] do_one_initcall+0x58/0x170
  [<ffffffff804cd190>] kernel_init+0x98/0x104
  [<ffffffff8001520c>] kernel_thread_helper+0x10/0x18

other info that might help us debug this:

2 locks held by swapper/1:
 #0:  (rcu_read_lock){.+.+..}, at: [<ffffffff802c0954>] dev_queue_xmit+0x1e0/0x4b0
 #1:  (_xmit_ETHER#2){+.-...}, at: [<ffffffff802d3a00>] __qdisc_run+0x118/0x30c

the SOFTIRQ-irq-safe lock's dependencies:
-> (_xmit_ETHER#2){+.-...} ops: 0 {
   HARDIRQ-ON-W at:
                        [<ffffffff800614d0>] __lock_acquire+0x7fc/0x1a14
                        [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                        [<ffffffff800128d0>] _spin_lock+0x30/0x44
                        [<ffffffff802d2b88>] dev_watchdog+0x70/0x398
                        [<ffffffff800433b8>] run_timer_softirq+0x1a8/0x248
                        [<ffffffff8003da5c>] __do_softirq+0xec/0x208
                        [<ffffffff8003dbd8>] do_softirq+0x60/0xe4
                        [<ffffffff8003dda0>] irq_exit+0x54/0x9c
                        [<ffffffff80004420>] ret_from_irq+0x0/0x4
                        [<ffffffff80004720>] r4k_wait+0x20/0x40
                        [<ffffffff80015418>] cpu_idle+0x30/0x60
                        [<ffffffff804cd934>] start_kernel+0x3ec/0x404
   IN-SOFTIRQ-W at:
                        [<ffffffff80061458>] __lock_acquire+0x784/0x1a14
                        [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                        [<ffffffff800128d0>] _spin_lock+0x30/0x44
                        [<ffffffff802d2b88>] dev_watchdog+0x70/0x398
                        [<ffffffff800433b8>] run_timer_softirq+0x1a8/0x248
                        [<ffffffff8003da5c>] __do_softirq+0xec/0x208
                        [<ffffffff8003dbd8>] do_softirq+0x60/0xe4
                        [<ffffffff8003dda0>] irq_exit+0x54/0x9c
                        [<ffffffff80004420>] ret_from_irq+0x0/0x4
                        [<ffffffff80004720>] r4k_wait+0x20/0x40
                        [<ffffffff80015418>] cpu_idle+0x30/0x60
                        [<ffffffff804cd934>] start_kernel+0x3ec/0x404
   INITIAL USE at:
                       [<ffffffff80061570>] __lock_acquire+0x89c/0x1a14
                       [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                       [<ffffffff800128d0>] _spin_lock+0x30/0x44
                       [<ffffffff802d2b88>] dev_watchdog+0x70/0x398
                       [<ffffffff800433b8>] run_timer_softirq+0x1a8/0x248
                       [<ffffffff8003da5c>] __do_softirq+0xec/0x208
                       [<ffffffff8003dbd8>] do_softirq+0x60/0xe4
                       [<ffffffff8003dda0>] irq_exit+0x54/0x9c
                       [<ffffffff80004420>] ret_from_irq+0x0/0x4
                       [<ffffffff80004720>] r4k_wait+0x20/0x40
                       [<ffffffff80015418>] cpu_idle+0x30/0x60
                       [<ffffffff804cd934>] start_kernel+0x3ec/0x404
 }
 ... key      at: [<ffffffff80cf93f0>] netdev_xmit_lock_key+0x8/0x1c8

the SOFTIRQ-irq-unsafe lock's dependencies:
-> (&priv->meth_lock){+.+...} ops: 0 {
   HARDIRQ-ON-W at:
                        [<ffffffff800614d0>] __lock_acquire+0x7fc/0x1a14
                        [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                        [<ffffffff800128d0>] _spin_lock+0x30/0x44
                        [<ffffffff80263f20>] meth_reset+0x118/0x2d8
                        [<ffffffff8026424c>] meth_open+0x28/0x140
                        [<ffffffff802c1ae8>] dev_open+0xe0/0x18c
                        [<ffffffff802c1268>] dev_change_flags+0xd8/0x1d4
                        [<ffffffff804e7770>] ip_auto_config+0x1d4/0xf28
                        [<ffffffff80012e68>] do_one_initcall+0x58/0x170
                        [<ffffffff804cd190>] kernel_init+0x98/0x104
                        [<ffffffff8001520c>] kernel_thread_helper+0x10/0x18
   SOFTIRQ-ON-W at:
                        [<ffffffff800614f8>] __lock_acquire+0x824/0x1a14
                        [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                        [<ffffffff800128d0>] _spin_lock+0x30/0x44
                        [<ffffffff80263f20>] meth_reset+0x118/0x2d8
                        [<ffffffff8026424c>] meth_open+0x28/0x140
                        [<ffffffff802c1ae8>] dev_open+0xe0/0x18c
                        [<ffffffff802c1268>] dev_change_flags+0xd8/0x1d4
                        [<ffffffff804e7770>] ip_auto_config+0x1d4/0xf28
                        [<ffffffff80012e68>] do_one_initcall+0x58/0x170
                        [<ffffffff804cd190>] kernel_init+0x98/0x104
                        [<ffffffff8001520c>] kernel_thread_helper+0x10/0x18
   INITIAL USE at:
                       [<ffffffff80061570>] __lock_acquire+0x89c/0x1a14
                       [<ffffffff800627e0>] lock_acquire+0xf8/0x150
                       [<ffffffff800128d0>] _spin_lock+0x30/0x44
                       [<ffffffff80263f20>] meth_reset+0x118/0x2d8
                       [<ffffffff8026424c>] meth_open+0x28/0x140
                       [<ffffffff802c1ae8>] dev_open+0xe0/0x18c
                       [<ffffffff802c1268>] dev_change_flags+0xd8/0x1d4
                       [<ffffffff804e7770>] ip_auto_config+0x1d4/0xf28
                       [<ffffffff80012e68>] do_one_initcall+0x58/0x170
                       [<ffffffff804cd190>] kernel_init+0x98/0x104
                       [<ffffffff8001520c>] kernel_thread_helper+0x10/0x18
 }
 ... key      at: [<ffffffff80cf6ce8>] __key.32424+0x0/0x8

stack backtrace:
Call Trace:
[<ffffffff8000ed0c>] dump_stack+0x8/0x34
[<ffffffff80060b74>] check_usage+0x470/0x4a0
[<ffffffff80060c34>] check_irq_usage+0x90/0x130
[<ffffffff80061f78>] __lock_acquire+0x12a4/0x1a14
[<ffffffff800627e0>] lock_acquire+0xf8/0x150
[<ffffffff80012a0c>] _spin_lock_irqsave+0x60/0x84
[<ffffffff8026388c>] meth_tx+0x48/0x43c
[<ffffffff802d3a38>] __qdisc_run+0x150/0x30c
[<ffffffff802c0aa8>] dev_queue_xmit+0x334/0x4b0
[<ffffffff804e7e6c>] ip_auto_config+0x8d0/0xf28
[<ffffffff80012e68>] do_one_initcall+0x58/0x170
[<ffffffff804cd190>] kernel_init+0x98/0x104
[<ffffffff8001520c>] kernel_thread_helper+0x10/0x18

..... timed out!
IP-Config: Retrying forever (NFS root)...
Sending DHCP requests ., OK
[...]

Fixed by converting all locks to irq locks.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Tested-by: Andrew Randrianasulu <randrik_a@yahoo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/meth.c b/drivers/net/meth.c
index aa08987..dbd3436 100644
--- a/drivers/net/meth.c
+++ b/drivers/net/meth.c
@@ -127,11 +127,11 @@ static unsigned long mdio_read(struct meth_private *priv, unsigned long phyreg)
 static int mdio_probe(struct meth_private *priv)
 {
 	int i;
-	unsigned long p2, p3;
+	unsigned long p2, p3, flags;
 	/* check if phy is detected already */
 	if(priv->phy_addr>=0&&priv->phy_addr<32)
 		return 0;
-	spin_lock(&priv->meth_lock);
+	spin_lock_irqsave(&priv->meth_lock, flags);
 	for (i=0;i<32;++i){
 		priv->phy_addr=i;
 		p2=mdio_read(priv,2);
@@ -157,7 +157,7 @@ static int mdio_probe(struct meth_private *priv)
 			break;
 		}
 	}
-	spin_unlock(&priv->meth_lock);
+	spin_unlock_irqrestore(&priv->meth_lock, flags);
 	if(priv->phy_addr<32) {
 		return 0;
 	}
@@ -373,14 +373,14 @@ static int meth_release(struct net_device *dev)
 static void meth_rx(struct net_device* dev, unsigned long int_status)
 {
 	struct sk_buff *skb;
-	unsigned long status;
+	unsigned long status, flags;
 	struct meth_private *priv = netdev_priv(dev);
 	unsigned long fifo_rptr = (int_status & METH_INT_RX_RPTR_MASK) >> 8;
 
-	spin_lock(&priv->meth_lock);
+	spin_lock_irqsave(&priv->meth_lock, flags);
 	priv->dma_ctrl &= ~METH_DMA_RX_INT_EN;
 	mace->eth.dma_ctrl = priv->dma_ctrl;
-	spin_unlock(&priv->meth_lock);
+	spin_unlock_irqrestore(&priv->meth_lock, flags);
 
 	if (int_status & METH_INT_RX_UNDERFLOW) {
 		fifo_rptr = (fifo_rptr - 1) & 0x0f;
@@ -452,12 +452,12 @@ static void meth_rx(struct net_device* dev, unsigned long int_status)
 		mace->eth.rx_fifo = priv->rx_ring_dmas[priv->rx_write];
 		ADVANCE_RX_PTR(priv->rx_write);
 	}
-	spin_lock(&priv->meth_lock);
+	spin_lock_irqsave(&priv->meth_lock, flags);
 	/* In case there was underflow, and Rx DMA was disabled */
 	priv->dma_ctrl |= METH_DMA_RX_INT_EN | METH_DMA_RX_EN;
 	mace->eth.dma_ctrl = priv->dma_ctrl;
 	mace->eth.int_stat = METH_INT_RX_THRESHOLD;
-	spin_unlock(&priv->meth_lock);
+	spin_unlock_irqrestore(&priv->meth_lock, flags);
 }
 
 static int meth_tx_full(struct net_device *dev)
@@ -470,11 +470,11 @@ static int meth_tx_full(struct net_device *dev)
 static void meth_tx_cleanup(struct net_device* dev, unsigned long int_status)
 {
 	struct meth_private *priv = netdev_priv(dev);
-	unsigned long status;
+	unsigned long status, flags;
 	struct sk_buff *skb;
 	unsigned long rptr = (int_status&TX_INFO_RPTR) >> 16;
 
-	spin_lock(&priv->meth_lock);
+	spin_lock_irqsave(&priv->meth_lock, flags);
 
 	/* Stop DMA notification */
 	priv->dma_ctrl &= ~(METH_DMA_TX_INT_EN);
@@ -527,12 +527,13 @@ static void meth_tx_cleanup(struct net_device* dev, unsigned long int_status)
 	}
 
 	mace->eth.int_stat = METH_INT_TX_EMPTY | METH_INT_TX_PKT;
-	spin_unlock(&priv->meth_lock);
+	spin_unlock_irqrestore(&priv->meth_lock, flags);
 }
 
 static void meth_error(struct net_device* dev, unsigned status)
 {
 	struct meth_private *priv = netdev_priv(dev);
+	unsigned long flags;
 
 	printk(KERN_WARNING "meth: error status: 0x%08x\n",status);
 	/* check for errors too... */
@@ -547,7 +548,7 @@ static void meth_error(struct net_device* dev, unsigned status)
 		printk(KERN_WARNING "meth: Rx overflow\n");
 	if (status & (METH_INT_RX_UNDERFLOW)) {
 		printk(KERN_WARNING "meth: Rx underflow\n");
-		spin_lock(&priv->meth_lock);
+		spin_lock_irqsave(&priv->meth_lock, flags);
 		mace->eth.int_stat = METH_INT_RX_UNDERFLOW;
 		/* more underflow interrupts will be delivered,
 		 * effectively throwing us into an infinite loop.
@@ -555,7 +556,7 @@ static void meth_error(struct net_device* dev, unsigned status)
 		priv->dma_ctrl &= ~METH_DMA_RX_EN;
 		mace->eth.dma_ctrl = priv->dma_ctrl;
 		DPRINTK("Disabled meth Rx DMA temporarily\n");
-		spin_unlock(&priv->meth_lock);
+		spin_unlock_irqrestore(&priv->meth_lock, flags);
 	}
 	mace->eth.int_stat = METH_INT_ERROR;
 }
-- 
cgit v0.10.2


From a598f6aebea2481531b0757ed90cfb0d8cf1d8f5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 15 May 2009 06:10:13 +0000
Subject: bridge: relay bridge multicast pkgs if !STP

Currently the bridge catches all STP packets; even if STP is turned
off.  This prevents other systems (which do have STP turned on)
from being able to detect loops in the network.

With this patch, if STP is off, then any packet sent to the STP
multicast group address is forwarded to all ports.

Based on earlier patch by Joakim Tjernlund with changes
to go through forwarding (not local chain), and optimization
that only last octet needs to be checked.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 30b8877..5ee1a36 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -134,6 +134,10 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
 		if (skb->protocol == htons(ETH_P_PAUSE))
 			goto drop;
 
+		/* If STP is turned off, then forward */
+		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
+			goto forward;
+
 		if (NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
 			    NULL, br_handle_local_finish))
 			return NULL;	/* frame consumed by filter */
@@ -141,6 +145,7 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
 			return skb;	/* continue processing */
 	}
 
+forward:
 	switch (p->state) {
 	case BR_STATE_FORWARDING:
 		rhook = rcu_dereference(br_should_route_hook);
-- 
cgit v0.10.2


From 4f0611af47e25807cf18cd2b4d4e94206c75b29e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 15 May 2009 06:11:58 +0000
Subject: bridge: fix initial packet flood if !STP

If bridge is configured with no STP and forwarding delay of 0 (which
is typical for virtualization) then when link starts it will flood all
packets for the first 20 seconds.

This bug was introduced by a combination of earlier changes:
  * forwarding database uses hold time of zero to indicate
    user wants to always flood packets
  * optimzation of the case of forwarding delay of 0 avoids the initial
    timer tick

The fix is to just skip all the topology change detection code if
kernel STP is not being used.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 6e63ec3..0660515 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -297,6 +297,9 @@ void br_topology_change_detection(struct net_bridge *br)
 {
 	int isroot = br_is_root_bridge(br);
 
+	if (br->stp_enabled != BR_KERNEL_STP)
+		return;
+
 	pr_info("%s: topology change detected, %s\n", br->dev->name,
 		isroot ? "propagating" : "sending tcn bpdu");
 
-- 
cgit v0.10.2


From 4cd6fe1c6483cde93e2ec91f58b7af9c9eea51ad Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 15 May 2009 08:44:32 +0000
Subject: bonding: fix link down handling in 802.3ad mode

One of the purposes of bonding is to allow for redundant links, and failover
correctly if the cable is pulled. If all the members of a bonded device have
no carrier present, the bonded device itself needs to report no carrier present
to user space so management tools (like routing daemons) can respond.

Bonding in 802.3ad mode does not work correctly for this because it incorrectly
chooses a link that is down as a possible aggregator.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 8c2e5ab..faf094a 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1465,6 +1465,12 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best,
 	return best;
 }
 
+static int agg_device_up(const struct aggregator *agg)
+{
+	return (netif_running(agg->slave->dev) &&
+		netif_carrier_ok(agg->slave->dev));
+}
+
 /**
  * ad_agg_selection_logic - select an aggregation group for a team
  * @aggregator: the aggregator we're looking at
@@ -1496,14 +1502,13 @@ static void ad_agg_selection_logic(struct aggregator *agg)
 	struct port *port;
 
 	origin = agg;
-
 	active = __get_active_agg(agg);
-	best = active;
+	best = (active && agg_device_up(active)) ? active : NULL;
 
 	do {
 		agg->is_active = 0;
 
-		if (agg->num_of_ports)
+		if (agg->num_of_ports && agg_device_up(agg))
 			best = ad_agg_selection_test(best, agg);
 
 	} while ((agg = __get_next_agg(agg)));
-- 
cgit v0.10.2


From eb0519b5a1cf07d5a1b919cffa0304cdaaa97a7e Mon Sep 17 00:00:00 2001
From: Gabriel Paubert <paubert@iram.es>
Date: Sun, 17 May 2009 21:16:47 -0700
Subject: mv643xx_eth: fix PPC DMA breakage

After 2.6.29, PPC no more admits passing NULL to the dev parameter of
the DMA API. The result is a BUG followed by solid lock-up when the
mv643xx_eth driver brings an interface up. The following patch makes
the driver work on my Pegasos again; it is mostly a search and replace
of NULL by mp->dev->dev.parent in dma allocation/freeing/mapping/unmapping
functions.

Signed-off-by: Gabriel Paubert <paubert@iram.es>
Acked-by: Lennert Buytenhek <buytenh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index a400d71..6bb5af3 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -569,7 +569,7 @@ static int rxq_process(struct rx_queue *rxq, int budget)
 		if (rxq->rx_curr_desc == rxq->rx_ring_size)
 			rxq->rx_curr_desc = 0;
 
-		dma_unmap_single(NULL, rx_desc->buf_ptr,
+		dma_unmap_single(mp->dev->dev.parent, rx_desc->buf_ptr,
 				 rx_desc->buf_size, DMA_FROM_DEVICE);
 		rxq->rx_desc_count--;
 		rx++;
@@ -678,8 +678,9 @@ static int rxq_refill(struct rx_queue *rxq, int budget)
 
 		rx_desc = rxq->rx_desc_area + rx;
 
-		rx_desc->buf_ptr = dma_map_single(NULL, skb->data,
-					mp->skb_size, DMA_FROM_DEVICE);
+		rx_desc->buf_ptr = dma_map_single(mp->dev->dev.parent,
+						  skb->data, mp->skb_size,
+						  DMA_FROM_DEVICE);
 		rx_desc->buf_size = mp->skb_size;
 		rxq->rx_skb[rx] = skb;
 		wmb();
@@ -718,6 +719,7 @@ static inline unsigned int has_tiny_unaligned_frags(struct sk_buff *skb)
 
 static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb)
 {
+	struct mv643xx_eth_private *mp = txq_to_mp(txq);
 	int nr_frags = skb_shinfo(skb)->nr_frags;
 	int frag;
 
@@ -746,10 +748,10 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb)
 
 		desc->l4i_chk = 0;
 		desc->byte_cnt = this_frag->size;
-		desc->buf_ptr = dma_map_page(NULL, this_frag->page,
-						this_frag->page_offset,
-						this_frag->size,
-						DMA_TO_DEVICE);
+		desc->buf_ptr = dma_map_page(mp->dev->dev.parent,
+					     this_frag->page,
+					     this_frag->page_offset,
+					     this_frag->size, DMA_TO_DEVICE);
 	}
 }
 
@@ -826,7 +828,8 @@ no_csum:
 
 	desc->l4i_chk = l4i_chk;
 	desc->byte_cnt = length;
-	desc->buf_ptr = dma_map_single(NULL, skb->data, length, DMA_TO_DEVICE);
+	desc->buf_ptr = dma_map_single(mp->dev->dev.parent, skb->data,
+				       length, DMA_TO_DEVICE);
 
 	__skb_queue_tail(&txq->tx_skb, skb);
 
@@ -956,10 +959,10 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force)
 		}
 
 		if (cmd_sts & TX_FIRST_DESC) {
-			dma_unmap_single(NULL, desc->buf_ptr,
+			dma_unmap_single(mp->dev->dev.parent, desc->buf_ptr,
 					 desc->byte_cnt, DMA_TO_DEVICE);
 		} else {
-			dma_unmap_page(NULL, desc->buf_ptr,
+			dma_unmap_page(mp->dev->dev.parent, desc->buf_ptr,
 				       desc->byte_cnt, DMA_TO_DEVICE);
 		}
 
@@ -1894,9 +1897,9 @@ static int rxq_init(struct mv643xx_eth_private *mp, int index)
 						mp->rx_desc_sram_size);
 		rxq->rx_desc_dma = mp->rx_desc_sram_addr;
 	} else {
-		rxq->rx_desc_area = dma_alloc_coherent(NULL, size,
-							&rxq->rx_desc_dma,
-							GFP_KERNEL);
+		rxq->rx_desc_area = dma_alloc_coherent(mp->dev->dev.parent,
+						       size, &rxq->rx_desc_dma,
+						       GFP_KERNEL);
 	}
 
 	if (rxq->rx_desc_area == NULL) {
@@ -1947,7 +1950,7 @@ out_free:
 	if (index == 0 && size <= mp->rx_desc_sram_size)
 		iounmap(rxq->rx_desc_area);
 	else
-		dma_free_coherent(NULL, size,
+		dma_free_coherent(mp->dev->dev.parent, size,
 				  rxq->rx_desc_area,
 				  rxq->rx_desc_dma);
 
@@ -1979,7 +1982,7 @@ static void rxq_deinit(struct rx_queue *rxq)
 	    rxq->rx_desc_area_size <= mp->rx_desc_sram_size)
 		iounmap(rxq->rx_desc_area);
 	else
-		dma_free_coherent(NULL, rxq->rx_desc_area_size,
+		dma_free_coherent(mp->dev->dev.parent, rxq->rx_desc_area_size,
 				  rxq->rx_desc_area, rxq->rx_desc_dma);
 
 	kfree(rxq->rx_skb);
@@ -2007,9 +2010,9 @@ static int txq_init(struct mv643xx_eth_private *mp, int index)
 						mp->tx_desc_sram_size);
 		txq->tx_desc_dma = mp->tx_desc_sram_addr;
 	} else {
-		txq->tx_desc_area = dma_alloc_coherent(NULL, size,
-							&txq->tx_desc_dma,
-							GFP_KERNEL);
+		txq->tx_desc_area = dma_alloc_coherent(mp->dev->dev.parent,
+						       size, &txq->tx_desc_dma,
+						       GFP_KERNEL);
 	}
 
 	if (txq->tx_desc_area == NULL) {
@@ -2053,7 +2056,7 @@ static void txq_deinit(struct tx_queue *txq)
 	    txq->tx_desc_area_size <= mp->tx_desc_sram_size)
 		iounmap(txq->tx_desc_area);
 	else
-		dma_free_coherent(NULL, txq->tx_desc_area_size,
+		dma_free_coherent(mp->dev->dev.parent, txq->tx_desc_area_size,
 				  txq->tx_desc_area, txq->tx_desc_dma);
 }
 
-- 
cgit v0.10.2


From d77dd8d27e73a9277096453eb901aae0bfd4ca3c Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Fri, 15 May 2009 10:19:51 +0000
Subject: Neterion: *FIFO1_DMA_ERR set twice, should 2nd be *FIFO2_DMA_ERR?

FIFO1_DMA_ERR is set twice, the second should be FIFO2_DMA_ERR.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Ram Vepa <ram.vepa@neterion.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/vxge/vxge-traffic.c b/drivers/net/vxge/vxge-traffic.c
index 7be0ae10..c2eeac4 100644
--- a/drivers/net/vxge/vxge-traffic.c
+++ b/drivers/net/vxge/vxge-traffic.c
@@ -115,7 +115,7 @@ enum vxge_hw_status vxge_hw_vpath_intr_enable(struct __vxge_hw_vpath_handle *vp)
 		VXGE_HW_KDFCCTL_ERRORS_REG_KDFCCTL_FIFO1_POISON|
 		VXGE_HW_KDFCCTL_ERRORS_REG_KDFCCTL_FIFO2_POISON|
 		VXGE_HW_KDFCCTL_ERRORS_REG_KDFCCTL_FIFO1_DMA_ERR|
-		VXGE_HW_KDFCCTL_ERRORS_REG_KDFCCTL_FIFO1_DMA_ERR), 0, 32),
+		VXGE_HW_KDFCCTL_ERRORS_REG_KDFCCTL_FIFO2_DMA_ERR), 0, 32),
 		&vp_reg->kdfcctl_errors_mask);
 
 	__vxge_hw_pio_mem_write32_upper(0, &vp_reg->vpath_ppif_int_mask);
-- 
cgit v0.10.2


From 705efc3b03cbee449e4d83b230423894152f7982 Mon Sep 17 00:00:00 2001
From: Wang Tinggong <wangtinggong@gmail.com>
Date: Thu, 14 May 2009 22:49:36 +0000
Subject: Doc: fixed descriptions on /proc/sys/net/core/* and
 /proc/sys/net/unix/*

Signed-off-by: Wang Tinggong <wangtinggong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ec5de02..b121c5d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1266,13 +1266,22 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
 sctp_wmem  - vector of 3 INTEGERs: min, default, max
 	See tcp_wmem for a description.
 
-UNDOCUMENTED:
 
 /proc/sys/net/core/*
-	dev_weight FIXME
+dev_weight - INTEGER
+	The maximum number of packets that kernel can handle on a NAPI
+	interrupt, it's a Per-CPU variable.
+
+	Default: 64
 
 /proc/sys/net/unix/*
-	max_dgram_qlen FIXME
+max_dgram_qlen - INTEGER
+	The maximum length of dgram socket receive queue
+
+	Default: 10
+
+
+UNDOCUMENTED:
 
 /proc/sys/net/irda/*
 	fast_poll_increase FIXME
-- 
cgit v0.10.2


From c0daaf3f1f672defa3a45ca449b76d0e86c55892 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 18 May 2009 14:02:12 +1000
Subject: perf_counter: powerpc: initialize cpuhw pointer before use

Commit 9e35ad38 ("perf_counter: Rework the perf counter
disable/enable") added code to the powerpc hw_perf_enable (renamed
from hw_perf_restore) to test cpuhw->disabled and return immediately
if it is not set (i.e. if the PMU is already enabled).

Unfortunately the test got added before cpuhw was initialized,
resulting in an oops the first time hw_perf_enable got called.
This fixes it by moving the initialization of cpuhw to before
cpuhw->disabled is tested.

[ Impact: fix oops-causing bug on powerpc ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18960.56772.869734.304631@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6baae5a..fe21b24 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -450,12 +450,11 @@ void hw_perf_enable(void)
 	int idx;
 
 	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	if (!cpuhw->disabled) {
 		local_irq_restore(flags);
 		return;
 	}
-
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
 	/*
-- 
cgit v0.10.2


From e5198075c67a22ec9a09565b1ce88d3d3f5ba855 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: x86, apic: introduce io_apic_irq_attr

according to Ingo, io_apic irq-setup related functions have too many
parameters with a repetitive signature.

So reduce related funcs to get less params by passing a pointer
to a newly defined io_apic_irq_attr structure.

v2: io_apic_irq ==> irq_attr
    triggering ==> trigger

v3: add set_io_apic_irq_attr

[ Impact: cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A08ACD3.2070401@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 26a40ab..a7d14bb 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -63,9 +63,26 @@ extern unsigned long io_apic_irqs;
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
+
+struct io_apic_irq_attr {
+	int ioapic;
+	int ioapic_pin;
+	int trigger;
+	int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+					int ioapic, int ioapic_pin,
+					int trigger, int polarity)
+{
+	irq_attr->ioapic     = ioapic;
+	irq_attr->ioapic_pin = ioapic_pin;
+	irq_attr->trigger    = trigger;
+	irq_attr->polarity   = polarity;
+}
+
 extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
-					int *ioapic, int *ioapic_pin,
-					int *trigger, int *polarity);
+					struct io_apic_irq_attr *irq_attr);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 6fd99f9..daf866e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -156,8 +156,9 @@ extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
 #endif /* CONFIG_ACPI */
 
-extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
-				  int irq, int edge_level, int active_high_low);
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+		 struct io_apic_irq_attr *irq_attr);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dcfbc3a..4af63df 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -523,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
 	unsigned int irq;
 	unsigned int plat_gsi = gsi;
@@ -533,14 +533,14 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		if (triggering == ACPI_LEVEL_SENSITIVE)
+		if (trigger == ACPI_LEVEL_SENSITIVE)
 			eisa_set_level_irq(gsi);
 	}
 #endif
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-		plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity);
+		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
 	acpi_gsi_to_irq(plat_gsi, &irq);
@@ -1156,7 +1156,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 			int polarity)
 {
 #ifdef CONFIG_X86_MPPARSE
@@ -1181,7 +1181,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	/* print the entry should happen on mptable identically */
 	mp_irq.type = MP_INTSRC;
 	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
 				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
@@ -1194,10 +1194,11 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	return 0;
 }
 
-int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
+	struct io_apic_irq_attr irq_attr;
 
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
@@ -1225,11 +1226,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
-	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
+	mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 
-	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
-				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	io_apic_set_pci_routing(dev, gsi, &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 74d2b48..ce1ac74 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1096,8 +1096,7 @@ static int pin_2_irq(int idx, int apic, int pin)
  * Not an __init, possibly needed by modules
  */
 int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
-				int *ioapic, int *ioapic_pin,
-				int *trigger, int *polarity)
+				struct io_apic_irq_attr *irq_attr)
 {
 	int apic, i, best_guess = -1;
 
@@ -1127,10 +1126,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 				continue;
 
 			if (pin == (mp_irqs[i].srcbusirq & 3)) {
-				*ioapic = apic;
-				*ioapic_pin = mp_irqs[i].dstirq;
-				*trigger = irq_trigger(i);
-				*polarity = irq_polarity(i);
+				set_io_apic_irq_attr(irq_attr, apic,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
 				return irq;
 			}
 			/*
@@ -1138,10 +1137,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 			 * best-guess fuzzy result for broken mptables.
 			 */
 			if (best_guess < 0) {
-				*ioapic = apic;
-				*ioapic_pin = mp_irqs[i].dstirq;
-				*trigger = irq_trigger(i);
-				*polarity = irq_polarity(i);
+				set_io_apic_irq_attr(irq_attr, apic,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
 				best_guess = irq;
 			}
 		}
@@ -3865,13 +3864,16 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
-static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+				struct io_apic_irq_attr *irq_attr)
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
 	int node;
+	int ioapic, pin;
+	int trigger, polarity;
 
+	ioapic = irq_attr->ioapic;
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
@@ -3889,6 +3891,10 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 		return 0;
 	}
 
+	pin = irq_attr->ioapic_pin;
+	trigger = irq_attr->trigger;
+	polarity = irq_attr->polarity;
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
@@ -3897,20 +3903,22 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 		add_pin_to_irq_node(cfg, node, ioapic, pin);
 	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
 
 	return 0;
 }
 
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
+int io_apic_set_pci_routing(struct device *dev, int irq,
+				struct io_apic_irq_attr *irq_attr)
 {
-
+	int ioapic, pin;
 	/*
 	 * Avoid pin reprogramming.  PRTs typically include entries
 	 * with redundant pin->gsi mappings (but unique PCI devices);
 	 * we only program the IOAPIC on the first.
 	 */
+	ioapic = irq_attr->ioapic;
+	pin = irq_attr->ioapic_pin;
 	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapics[ioapic].apicid, pin);
@@ -3918,8 +3926,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 	}
 	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
 
-	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
-					 triggering, polarity);
+	return __io_apic_set_pci_routing(dev, irq, irq_attr);
 }
 
 /* --------------------------------------------------------------------------
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 2f3e192..0696d50 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1200,14 +1200,11 @@ static int pirq_enable_irq(struct pci_dev *dev)
 #ifdef CONFIG_X86_IO_APIC
 			struct pci_dev *temp_dev;
 			int irq;
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
+			struct io_apic_irq_attr irq_attr;
 
 			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
 						PCI_SLOT(dev->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						pin - 1, &irq_attr);
 			/*
 			 * Busses behind bridges are typically not listed in the MP-table.
 			 * In this case we have to look up the IRQ based on the parent bus,
@@ -1221,9 +1218,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
 						PCI_SLOT(bridge->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						pin - 1, &irq_attr);
 				if (irq >= 0)
 					dev_warn(&dev->dev, "using bridge %s "
 						 "INT %c to get IRQ %d\n",
@@ -1233,9 +1228,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
-				io_apic_set_pci_routing(&dev->dev, ioapic,
-							ioapic_pin, irq,
-							triggering, polarity);
+				io_apic_set_pci_routing(&dev->dev, irq,
+							 &irq_attr);
 				dev->irq = irq;
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index 79901a0..42e4260 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -155,15 +155,13 @@ int ibmphp_init_devno(struct slot **cur_slot)
 	for (loop = 0; loop < len; loop++) {
 		if ((*cur_slot)->number == rtable->slots[loop].slot &&
 		    (*cur_slot)->bus == rtable->slots[loop].bus) {
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
+			struct io_apic_irq_attr irq_attr;
 
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
 						(int) (*cur_slot)->device, i,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						&irq_attr);
 
 			debug("(*cur_slot)->irq[0] = %x\n",
 					(*cur_slot)->irq[0]);
-- 
cgit v0.10.2


From 2759c3287de27266e06f1f4e82cbd2d65f6a044c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: x86: don't call read_apic_id if !cpu_has_apic

should not call that if apic is disabled.

[ Impact: fix crash on certain UP configs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <4A09CCBB.2000306@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 744e6d8..d0c99ab 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
 
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
-	return hard_smp_processor_id() >> index_msb;
+	return initial_apic_id >> index_msb;
 }
 
 struct apic apic_flat =  {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459..728b375 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -272,7 +272,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 	int cpu = smp_processor_id();
 	int node;
-	unsigned apicid = hard_smp_processor_id();
+	unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
 	node = c->phys_proc_id;
 	if (apicid_to_node[apicid] != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc..017c600 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -761,6 +761,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);
 
+	/* Clear/Set all flags overriden by options, after probe */
+	for (i = 0; i < NCAPINTS; i++) {
+		c->x86_capability[i] &= ~cpu_caps_cleared[i];
+		c->x86_capability[i] |= cpu_caps_set[i];
+	}
+
 #ifdef CONFIG_X86_64
 	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 #endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa1..daed39b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 }
 #endif
 
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 	unsigned node;
 	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
+	int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	}
 
 	/* Work around errata */
-	srat_detect_node();
+	srat_detect_node(c);
 
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
-- 
cgit v0.10.2


From 888a589f6be07d624e21e2174d98375e9f95911b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: mm, x86: remove MEMORY_HOTPLUG_RESERVE related code

after:

 | commit b263295dbffd33b0fbff670720fa178c30e3392a
 | Author: Christoph Lameter <clameter@sgi.com>
 | Date:   Wed Jan 30 13:30:47 2008 +0100
 |
 |    x86: 64-bit, make sparsemem vmemmap the only memory model

we don't have MEMORY_HOTPLUG_RESERVE anymore.

Historically, x86-64 had an architecture-specific method for memory hotplug
whereby it scanned the SRAT for physical memory ranges that could be
potentially used for memory hot-add later. By reserving those ranges
without physical memory, the memmap would be allocated and left dormant
until needed. This depended on the DISCONTIG memory model which has been
removed so the code implementing HOTPLUG_RESERVE is now dead.

This patch removes the dead code used by MEMORY_HOTPLUG_RESERVE.

(Changelog authored by Mel.)

v2: updated changelog, and remove hotadd= in doc

[ Impact: remove dead code ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Workflow-found-OK-by: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A0C4910.7090508@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c1304..2db5893 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -150,11 +150,6 @@ NUMA
 		Otherwise, the remaining system RAM is allocated to an
 		additional node.
 
-  numa=hotadd=percent
-		Only allow hotadd memory to preallocate page structures upto
-		percent of already available memory.
-		numa=hotadd=0 will disable hotadd memory.
-
 ACPI
 
   acpi=off	Don't enable ACPI
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6d..7feff06 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
 extern s16 apicid_to_node[MAX_LOCAL_APIC];
 
 extern unsigned long numa_free_all_bootmem(void);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index fb61d81..a6a93c3 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -272,9 +272,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
 				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
 
-#ifdef CONFIG_ACPI_NUMA
-	srat_reserve_add_area(nodeid);
-#endif
 	node_set_online(nodeid);
 }
 
@@ -591,8 +588,6 @@ static __init int numa_setup(char *opt)
 #ifdef CONFIG_ACPI_NUMA
 	if (!strncmp(opt, "noacpi", 6))
 		acpi_numa = -1;
-	if (!strncmp(opt, "hotadd=", 7))
-		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
 	return 0;
 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 87b45bf..b0dbbd4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,8 +31,6 @@ static nodemask_t nodes_parsed __initdata;
 static nodemask_t cpu_nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
 
 static int num_node_memblks __initdata;
 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
@@ -66,9 +64,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
 
-	if (found_add_area)
-		return;
-
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -86,7 +81,6 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	found_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
 	for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +176,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	       pxm, apic_id, node);
 }
 
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static inline int save_add_info(void) {return 1;}
 #else
 static inline int save_add_info(void) {return 0;}
 #endif
 /*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
  */
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
 {
 	unsigned long s_pfn = start >> PAGE_SHIFT;
 	unsigned long e_pfn = end >> PAGE_SHIFT;
-	int ret = 0, changed = 0;
+	int changed = 0;
 	struct bootnode *nd = &nodes_add[node];
 
 	/* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +201,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 	   mistakes */
 	if ((signed long)(end - start) < NODE_MIN_SIZE) {
 		printk(KERN_ERR "SRAT: Hotplug area too small\n");
-		return -1;
+		return;
 	}
 
 	/* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +209,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 		printk(KERN_ERR
 			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
 			s_pfn, e_pfn);
-		return -1;
-	}
-
-	if (!hotadd_enough_memory(&nodes_add[node]))  {
-		printk(KERN_ERR "SRAT: Hotplug area too large\n");
-		return -1;
+		return;
 	}
 
 	/* Looks good */
@@ -245,11 +231,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
 	}
 
-	ret = update_end_of_memory(nd->end);
-
 	if (changed)
-	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-	return ret;
+		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+				 nd->start, nd->end);
 }
 
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +294,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	       start, end);
 	e820_register_active_regions(node, start >> PAGE_SHIFT,
 				     end >> PAGE_SHIFT);
-	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
-						nd->end >> PAGE_SHIFT);
 
-	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
-	    (reserve_hotadd(node, start, end) < 0)) {
-		/* Ignore hotadd region. Undo damage */
-		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+		update_nodes_add(node, start, end);
+		/* restore nodes[node] */
 		*nd = oldnode;
 		if ((nd->start | nd->end) == 0)
 			node_clear(node, nodes_parsed);
@@ -510,26 +491,6 @@ static int null_slit_node_compare(int a, int b)
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init srat_reserve_add_area(int nodeid)
-{
-	if (found_add_area && nodes_add[nodeid].end) {
-		u64 total_mb;
-
-		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
-				"for node %d at %Lx-%Lx\n",
-			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
-		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
-					>> PAGE_SHIFT;
-		total_mb *= sizeof(struct page);
-		total_mb >>= 20;
-		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
-				"pre-allocated memory.\n", (unsigned long long)total_mb);
-		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-			       nodes_add[nodeid].end - nodes_add[nodeid].start,
-			       BOOTMEM_DEFAULT);
-	}
-}
-
 int __node_distance(int a, int b)
 {
 	int index;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..511b098 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1031,8 +1031,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
-					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 						unsigned long end_pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe753ec..474c7e9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
   static int __meminitdata nr_nodemap_entries;
   static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
   static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
-  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
   static unsigned long __initdata required_kernelcore;
   static unsigned long __initdata required_movablecore;
   static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -3103,64 +3099,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
 }
 
 /**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
-		unsigned long start_pfn, unsigned long end_pfn)
-{
-	mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-			"Entering push_node_boundaries(%u, %lu, %lu)\n",
-			nid, start_pfn, end_pfn);
-
-	/* Initialise the boundary for this node if necessary */
-	if (node_boundary_end_pfn[nid] == 0)
-		node_boundary_start_pfn[nid] = -1UL;
-
-	/* Update the boundaries */
-	if (node_boundary_start_pfn[nid] > start_pfn)
-		node_boundary_start_pfn[nid] = start_pfn;
-	if (node_boundary_end_pfn[nid] < end_pfn)
-		node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
-		unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-			"Entering account_node_boundary(%u, %lu, %lu)\n",
-			nid, *start_pfn, *end_pfn);
-
-	/* Return if boundary information has not been provided */
-	if (node_boundary_end_pfn[nid] == 0)
-		return;
-
-	/* Check the boundaries and update if necessary */
-	if (node_boundary_start_pfn[nid] < *start_pfn)
-		*start_pfn = node_boundary_start_pfn[nid];
-	if (node_boundary_end_pfn[nid] > *end_pfn)
-		*end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
-		unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
-		unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
-/**
  * get_pfn_range_for_nid - Return the start and end page frames for a node
  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 
 	if (*start_pfn == -1UL)
 		*start_pfn = 0;
-
-	/* Push the node boundaries out if requested */
-	account_node_boundary(nid, start_pfn, end_pfn);
 }
 
 /*
@@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void)
 {
 	memset(early_node_map, 0, sizeof(early_node_map));
 	nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
-	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 }
 
 /* Compare two active node_active_regions */
-- 
cgit v0.10.2


From 7c43769a9776141ec23ca81a1bdd5a9c0512f165 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: x86, mm: Fix node_possible_map logic

Recently there were some changes to the meaning of node_possible_map,
and it is quite strange:

- the node without memory would be set in node_possible_map
- but some node with less NODE_MIN_SIZE will be kicked out of node_possible_map.

fix it by adding strict_setup_node_bootmem().

Also, remove unparse_node().

so result will be:

1. cpu_to_node() will return online node only (nearest one)
2. apicid_to_node() still returns the node that could be not online but is set
   in node_possible_map.
3. node_possible_map will include nodes that mem on it are less NODE_MIN_SIZE

v2: after move_cpus_to_node change.

[ Impact: get node_possible_map right ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Tested-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <4A0C49BE.6080800@kernel.org>
[ v3: various small cleanups and comment clarifications ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 7feff06..c4ae822 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -24,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 			       unsigned long end);
 
 #ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
 extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a6a93c3..459913b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 }
 
 /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
-			       unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
 	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	unsigned long bootmap_start, nodedata_phys;
 	void *bootmap;
-	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	int nid;
 
 	if (!end)
 		return;
 
+	/*
+	 * Don't confuse VM with a node that doesn't have the
+	 * minimum amount of memory:
+	 */
+	if (end && (end - start) < NODE_MIN_SIZE)
+		return;
+
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index b0dbbd4..2dfcbf9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -36,10 +36,6 @@ static int num_node_memblks __initdata;
 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
 static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
 
-/* Too small nodes confuse the VM badly. Usually they result
-   from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
@@ -338,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	return 1;
 }
 
-static void __init unparse_node(int node)
-{
-	int i;
-	node_clear(node, nodes_parsed);
-	node_clear(node, cpu_nodes_parsed);
-	for (i = 0; i < MAX_LOCAL_APIC; i++) {
-		if (apicid_to_node[i] == node)
-			apicid_to_node[i] = NUMA_NO_NODE;
-	}
-}
-
 void __init acpi_numa_arch_fixup(void) {}
 
 /* Use the information discovered above to actually set up the nodes. */
@@ -360,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 
 	/* First clean up the node list */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
-		/*
-		 * don't confuse VM with a node that doesn't have the
-		 * minimum memory.
-		 */
-		if (nodes[i].end &&
-			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
-			unparse_node(i);
-			node_set_offline(i);
-		}
-	}
 
 	if (!nodes_cover_memory(nodes)) {
 		bad_srat();
@@ -404,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 		if (node == NUMA_NO_NODE)
 			continue;
-		if (!node_isset(node, node_possible_map))
+		if (!node_online(node))
 			numa_clear_node(i);
 	}
 	numa_init_array();
-- 
cgit v0.10.2


From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: x86: fix system without memory on node0

Jack found a boot crash on a system which doesn't have memory on node0.

It turns out with recent per_cpu changes, node_number for BSP will always
be 0, and it is not consistent to cpu_to_node() that might set it to a
different (nearer) node already.

aka when numa_set_node() for node0 is called early before per_cpu area is
setup:

two places touched that per_cpu(node_number,):

1. in cpu/common.c::cpu_init() and it is not for BP
| #ifdef CONFIG_NUMA
|        if (cpu != 0 && percpu_read(node_number) == 0 &&
|            cpu_to_node(cpu) != NUMA_NO_NODE)
|                percpu_write(node_number, cpu_to_node(cpu));
| #endif
for BP: traps_init ==> cpu_init
for AP: start_secondary ==> cpu_init

2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node()
for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu()
	 that is rather later before numa_node_id() is used for BP...
for AP: start_secondary => smp_callin => smp_store_cpu_info() =>
	=> identify_secondary_cpu => identify_cpu()

so try to set that for BP earlier in setup_per_cpu_areas(), and
don't bother to set that for APs there (it will be updated later
and will be used later)

(and don't mess the 0 before the copying BP per_cpu data to APs)

[ Impact: fix boot crash on memoryless node-0 ]

Reported-and-tested-by: Jack Steiner <steiner@sgi.com>
Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A0C4A02.7050401@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4c..3b5f327 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void)
 	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+	/*
+	 * make sure boot cpu node_number is right, when boot cpu is on the
+	 * node that doesn't have mem installed
+	 */
+	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
 	/* Setup node to cpumask map */
 	setup_node_to_cpumask_map();
 
-- 
cgit v0.10.2


From 629e15d245f46bef9d26199b450f882f9437a8fe Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: x86, irq: update_mptable needs pci_routeirq

To get all device irq routing and to save them.

This is basically an implicit pci=routeirq enablement if (and on if)
the update_mptable boot option (which is off by default) has been
specified.

[ Impact: extend the update_mptable boot opion's scope ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <4A0DB7B4.4060702@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index cd2a41a..e6bf9d0 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/smp.h>
+#include <linux/pci.h>
 
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -961,6 +962,9 @@ static int __initdata enable_update_mptable;
 static int __init update_mptable_setup(char *str)
 {
 	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
 	return 0;
 }
 early_param("update_mptable", update_mptable_setup);
@@ -973,6 +977,9 @@ static int __initdata alloc_mptable;
 static int __init parse_alloc_mptable_opt(char *p)
 {
 	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
 	alloc_mptable = 1;
 	if (!p)
 		return 0;
-- 
cgit v0.10.2


From f1bdb523880c7f6990e9e8e50b0fc972ca475e84 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: x86, irq: don't call mp_config_acpi_gsi() if update_mptable is not
 enabled

Len expressed concern that the update_mptable feature has
side-effects on the ACPI code.

Make it sure explicitly that the code only ever gets called if
the (default disabled) update_mptable boot quirk option is
disabled.

[ Impact: isolate the update_mptable feature from ACPI code more ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A0DC832.5090200@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c34961a..3dcbaaa 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -87,6 +87,15 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
+#ifdef CONFIG_X86_MPPARSE
+extern int enable_update_mptable;
+#else
+static inline int enable_update_mptable(void)
+{
+	return 0;
+}
+#endif
+
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4af63df..844e5e2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1226,7 +1226,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
-	mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+	if (enable_update_mptable)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e6bf9d0..651c93b 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -957,7 +957,7 @@ out:
 	return 0;
 }
 
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
 
 static int __init update_mptable_setup(char *str)
 {
-- 
cgit v0.10.2


From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 19:37:25 +0200
Subject: perf_counter, x86: speed up the scheduling fast-path

We have to set up the LVT entry only at counter init time, not at
every switch-in time.

There's friction between NMI and non-NMI use here - we'll probably
remove the per counter configurability of it - but until then, dont
slow down things ...

[ Impact: micro-optimization ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30a..c109819 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
+	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -603,8 +604,6 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
-
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init(1);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v0.10.2


From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 17 May 2009 15:31:38 +0800
Subject: tracing: fix check for return value of register_module_notifier

return zero should be correct, so fix it.

[ Impact: eliminate incorrect syslog message ]

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: rostedt@goodmis.org
LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b606f4..140699a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2826,7 +2826,7 @@ void __init ftrace_init(void)
 				  __stop_mcount_loc);
 
 	ret = register_module_notifier(&ftrace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
 	return;
-- 
cgit v0.10.2


From eb33575cf67d3f35fa2510210ef92631266e2465 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 13 May 2009 17:34:48 +0100
Subject: [ARM] Double check memmap is actually valid with a memmap has
 unexpected holes V2

pfn_valid() is meant to be able to tell if a given PFN has valid memmap
associated with it or not. In FLATMEM, it is expected that holes always
have valid memmap as long as there is valid PFNs either side of the hole.
In SPARSEMEM, it is assumed that a valid section has a memmap for the
entire section.

However, ARM and maybe other embedded architectures in the future free
memmap backing holes to save memory on the assumption the memmap is never
used. The page_zone linkages are then broken even though pfn_valid()
returns true. A walker of the full memmap must then do this additional
check to ensure the memmap they are looking at is sane by making sure the
zone and PFN linkages are still valid. This is expensive, but walkers of
the full memmap are extremely rare.

This was caught before for FLATMEM and hacked around but it hits again for
SPARSEMEM because the page_zone linkages can look ok where the PFN linkages
are totally screwed. This looks like a hatchet job but the reality is that
any clean solution would end up consumning all the memory saved by punching
these unexpected holes in the memmap. For example, we tried marking the
memmap within the section invalid but the section size exceeds the size of
the hole in most cases so pfn_valid() starts returning false where valid
memmap exists. Shrinking the size of the section would increase memory
consumption offsetting the gains.

This patch identifies when an architecture is punching unexpected holes
in the memmap that the memory model cannot automatically detect and sets
ARCH_HAS_HOLES_MEMORYMODEL. At the moment, this is restricted to EP93xx
which is the model sub-architecture this has been reported on but may expand
later. When set, walkers of the full memmap must call memmap_valid_within()
for each PFN and passing in what it expects the page and zone to be for
that PFN. If it finds the linkages to be broken, it assumes the memmap is
invalid for that PFN.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e60ec54..9d02cdb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -273,6 +273,7 @@ config ARCH_EP93XX
 	select HAVE_CLK
 	select COMMON_CLKDEV
 	select ARCH_REQUIRE_GPIOLIB
+	select ARCH_HAS_HOLES_MEMORYMODEL
 	help
 	  This enables support for the Cirrus EP93xx series of CPUs.
 
@@ -976,10 +977,9 @@ config OABI_COMPAT
 	  UNPREDICTABLE (in fact it can be predicted that it won't work
 	  at all). If in doubt say Y.
 
-config ARCH_FLATMEM_HAS_HOLES
+config ARCH_HAS_HOLES_MEMORYMODEL
 	bool
-	default y
-	depends on FLATMEM
+	default n
 
 # Discontigmem is deprecated
 config ARCH_DISCONTIGMEM_ENABLE
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 186ec6a..a47c879 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1097,6 +1097,32 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
 #define pfn_valid_within(pfn) (1)
 #endif
 
+#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
+/*
+ * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
+ * associated with it or not. In FLATMEM, it is expected that holes always
+ * have valid memmap as long as there is valid PFNs either side of the hole.
+ * In SPARSEMEM, it is assumed that a valid section has a memmap for the
+ * entire section.
+ *
+ * However, an ARM, and maybe other embedded architectures in the future
+ * free memmap backing holes to save memory on the assumption the memmap is
+ * never used. The page_zone linkages are then broken even though pfn_valid()
+ * returns true. A walker of the full memmap must then do this additional
+ * check to ensure the memmap they are looking at is sane by making sure
+ * the zone and PFN linkages are still valid. This is expensive, but walkers
+ * of the full memmap are extremely rare.
+ */
+int memmap_valid_within(unsigned long pfn,
+					struct page *page, struct zone *zone);
+#else
+static inline int memmap_valid_within(unsigned long pfn,
+					struct page *page, struct zone *zone)
+{
+	return 1;
+}
+#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
 #endif /* !__GENERATING_BOUNDS.H */
 #endif /* !__ASSEMBLY__ */
 #endif /* _LINUX_MMZONE_H */
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 16ce8b9..f5b7d17 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -6,6 +6,7 @@
 
 
 #include <linux/stddef.h>
+#include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
 
@@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
 	*zone = zonelist_zone(z);
 	return z;
 }
+
+#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
+int memmap_valid_within(unsigned long pfn,
+					struct page *page, struct zone *zone)
+{
+	if (page_to_pfn(page) != pfn)
+		return 0;
+
+	if (page_zone(page) != zone)
+		return 0;
+
+	return 1;
+}
+#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 66f6130..74d66db 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 			continue;
 
 		page = pfn_to_page(pfn);
-#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
-		/*
-		 * Ordinarily, memory holes in flatmem still have a valid
-		 * memmap for the PFN range. However, an architecture for
-		 * embedded systems (e.g. ARM) can free up the memmap backing
-		 * holes to save memory on the assumption the memmap is
-		 * never used. The page_zone linkages are then broken even
-		 * though pfn_valid() returns true. Skip the page if the
-		 * linkages are broken. Even if this test passed, the impact
-		 * is that the counters for the movable type are off but
-		 * fragmentation monitoring is likely meaningless on small
-		 * systems.
-		 */
-		if (page_zone(page) != zone)
+
+		/* Watch for unexpected holes punched in the memmap */
+		if (!memmap_valid_within(pfn, page, zone))
 			continue;
-#endif
+
 		mtype = get_pageblock_migratetype(page);
 
 		if (mtype < MIGRATE_TYPES)
-- 
cgit v0.10.2


From 8190b37f6483fedc530110d8c7d710d1c0317ba4 Mon Sep 17 00:00:00 2001
From: Pavel Roskin <proski@gnu.org>
Date: Tue, 12 May 2009 13:36:46 -0700
Subject: [ARM] mach-l7200: fix spelling of SYS_CLOCK_OFF

Signed-off-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-l7200/include/mach/sys-clock.h b/arch/arm/mach-l7200/include/mach/sys-clock.h
index 2d7722b..e9729a3 100644
--- a/arch/arm/mach-l7200/include/mach/sys-clock.h
+++ b/arch/arm/mach-l7200/include/mach/sys-clock.h
@@ -18,7 +18,7 @@
 
 /* IO_START and IO_BASE are defined in hardware.h */
 
-#define SYS_CLOCK_START (IO_START + SYS_CLCOK_OFF)  /* Physical address */
+#define SYS_CLOCK_START (IO_START + SYS_CLOCK_OFF)  /* Physical address */
 #define SYS_CLOCK_BASE  (IO_BASE  + SYS_CLOCK_OFF)  /* Virtual address  */
 
 /* Define the interface to the SYS_CLOCK */
-- 
cgit v0.10.2


From 6c3b46f74587d46e71b8c2b78fdca626a3aca280 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 May 2009 14:38:28 +0200
Subject: virtio_blk: don't blindly derefence req->rq_disk

request->rq_disk is only set for FS requests or BLOCK_PC requests
originating from the generic block layer scsi ioctls.  It's not set
for requests origination from other soures or internal cache flush
commands implemented by the patch I'll send after this.

So instead of using it to get at the private data in do_virtblk_request
setup queue->queuedata and use it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 29a9daf..dfe9ee5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -124,12 +124,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 
 static void do_virtblk_request(struct request_queue *q)
 {
-	struct virtio_blk *vblk = NULL;
+	struct virtio_blk *vblk = q->queuedata;
 	struct request *req;
 	unsigned int issued = 0;
 
 	while ((req = blk_peek_request(q)) != NULL) {
-		vblk = req->rq_disk->private_data;
 		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
 		/* If this request fails, stop queue and wait for something to
@@ -249,6 +248,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 		goto out_put_disk;
 	}
 
+	vblk->disk->queue->queuedata = vblk;
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, vblk->disk->queue);
 
 	if (index < 26) {
-- 
cgit v0.10.2


From 1cde26f928863d90e9e7c1217880c8450464d305 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 18 May 2009 14:41:30 +0200
Subject: virtio_blk: SG_IO passthru support

Add support for SG_IO passthru to virtio_blk.  We add the scsi command
block after the normal outhdr, and the scsi inhdr with full status
information aswell as the sense buffer before the regular inhdr.

[hch: forward ported, added the VIRTIO_BLK_F_SCSI flags, some comments
 and tested the whole beast]
[axboe: updated to use ->resid and not dual-path the byte count]

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ checkpatch.pl tweak)
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dfe9ee5..62275db 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -37,6 +37,7 @@ struct virtblk_req
 	struct list_head list;
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
+	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
 };
 
@@ -49,7 +50,9 @@ static void blk_done(struct virtqueue *vq)
 
 	spin_lock_irqsave(&vblk->lock, flags);
 	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
+		unsigned int nr_bytes;
 		int error;
+
 		switch (vbr->status) {
 		case VIRTIO_BLK_S_OK:
 			error = 0;
@@ -62,6 +65,12 @@ static void blk_done(struct virtqueue *vq)
 			break;
 		}
 
+		if (blk_pc_request(vbr->req)) {
+			vbr->req->resid_len = vbr->in_hdr.residual;
+			vbr->req->sense_len = vbr->in_hdr.sense_len;
+			vbr->req->errors = vbr->in_hdr.errors;
+		}
+
 		__blk_end_request_all(vbr->req, error);
 		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
@@ -74,7 +83,7 @@ static void blk_done(struct virtqueue *vq)
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		   struct request *req)
 {
-	unsigned long num, out, in;
+	unsigned long num, out = 0, in = 0;
 	struct virtblk_req *vbr;
 
 	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
@@ -99,18 +108,36 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	if (blk_barrier_rq(vbr->req))
 		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
 
-	sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
-	sg_set_buf(&vblk->sg[num+1], &vbr->status, sizeof(vbr->status));
+	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
 
-	if (rq_data_dir(vbr->req) == WRITE) {
-		vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-		out = 1 + num;
-		in = 1;
-	} else {
-		vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-		out = 1;
-		in = 1 + num;
+	/*
+	 * If this is a packet command we need a couple of additional headers.
+	 * Behind the normal outhdr we put a segment with the scsi command
+	 * block, and before the normal inhdr we put the sense data and the
+	 * inhdr with additional status information before the normal inhdr.
+	 */
+	if (blk_pc_request(vbr->req))
+		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
+
+	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
+
+	if (blk_pc_request(vbr->req)) {
+		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
+		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
+			   sizeof(vbr->in_hdr));
+	}
+
+	sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (rq_data_dir(vbr->req) == WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
+		} else {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
+		}
 	}
 
 	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
@@ -148,8 +175,16 @@ static void do_virtblk_request(struct request_queue *q)
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
-	return scsi_cmd_ioctl(bdev->bd_disk->queue,
-			      bdev->bd_disk, mode, cmd,
+	struct gendisk *disk = bdev->bd_disk;
+	struct virtio_blk *vblk = disk->private_data;
+
+	/*
+	 * Only allow the generic SCSI ioctls if the host can support it.
+	 */
+	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
+		return -ENOIOCTLCMD;
+
+	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
 			      (void __user *)data);
 }
 
@@ -356,6 +391,7 @@ static struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
 	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+	VIRTIO_BLK_F_SCSI,
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 94c56d2..4dbcbc1 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -15,6 +15,7 @@
 #define VIRTIO_BLK_F_GEOMETRY	4	/* Legacy geometry available  */
 #define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
 
 struct virtio_blk_config
 {
@@ -55,6 +56,13 @@ struct virtio_blk_outhdr
 	__u64 sector;
 };
 
+struct virtio_scsi_inhdr {
+	__u32 errors;
+	__u32 data_len;
+	__u32 sense_len;
+	__u32 residual;
+};
+
 /* And this is the final byte of the write scatter-gather list. */
 #define VIRTIO_BLK_S_OK		0
 #define VIRTIO_BLK_S_IOERR	1
-- 
cgit v0.10.2


From f831cc03490c78a76e2d35ad77ec2292d0323728 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 18 May 2009 14:44:45 +0200
Subject: virtio_blk: get rid of unused variable

drivers/block/virtio_blk.c: In function 'blk_done':
drivers/block/virtio_blk.c:53: warning: unused variable 'nr_bytes'

Leftover from commit 1cde26f928863d90e9e7c1217880c8450464d305

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 62275db..511d4ae 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -50,7 +50,6 @@ static void blk_done(struct virtqueue *vq)
 
 	spin_lock_irqsave(&vblk->lock, flags);
 	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
-		unsigned int nr_bytes;
 		int error;
 
 		switch (vbr->status) {
-- 
cgit v0.10.2


From 818bc814447a35350ae90a329133e474bf1a2bd7 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Sat, 2 May 2009 15:05:59 -0300
Subject: [ARM] pxa: save/restore PGSR on suspend/resume.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c
index 7ffb91d..6ae50604 100644
--- a/arch/arm/mach-pxa/mfp-pxa2xx.c
+++ b/arch/arm/mach-pxa/mfp-pxa2xx.c
@@ -322,6 +322,7 @@ static inline void pxa27x_mfp_init(void) {}
 #ifdef CONFIG_PM
 static unsigned long saved_gafr[2][4];
 static unsigned long saved_gpdr[4];
+static unsigned long saved_pgsr[4];
 
 static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state)
 {
@@ -332,6 +333,7 @@ static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state)
 		saved_gafr[0][i] = GAFR_L(i);
 		saved_gafr[1][i] = GAFR_U(i);
 		saved_gpdr[i] = GPDR(i * 32);
+		saved_pgsr[i] = PGSR(i);
 
 		GPDR(i * 32) = gpdr_lpm[i];
 	}
@@ -346,6 +348,7 @@ static int pxa2xx_mfp_resume(struct sys_device *d)
 		GAFR_L(i) = saved_gafr[0][i];
 		GAFR_U(i) = saved_gafr[1][i];
 		GPDR(i * 32) = saved_gpdr[i];
+		PGSR(i) = saved_pgsr[i];
 	}
 	PSSR = PSSR_RDH | PSSR_PH;
 	return 0;
-- 
cgit v0.10.2


From 216e3b7abbd05c35d2d1a3f86629ade485351f0d Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Tue, 5 May 2009 22:43:18 -0300
Subject: [ARM] pxa: allow gpio_reset drive high during normal work

I want to reuse tosa/spitz gpio_reset code, but my board needs the reset
gpio to be driven high during normal operation.

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Acked-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/include/mach/reset.h b/arch/arm/mach-pxa/include/mach/reset.h
index 31e6a7b..b6c1055 100644
--- a/arch/arm/mach-pxa/include/mach/reset.h
+++ b/arch/arm/mach-pxa/include/mach/reset.h
@@ -13,8 +13,9 @@ extern void clear_reset_status(unsigned int mask);
 /**
  * init_gpio_reset() - register GPIO as reset generator
  * @gpio: gpio nr
- * @output: set gpio as out/low instead of input during normal work
+ * @output: set gpio as output instead of input during normal work
+ * @level: output level
  */
-extern int init_gpio_reset(int gpio, int output);
+extern int init_gpio_reset(int gpio, int output, int level);
 
 #endif /* __ASM_ARCH_RESET_H */
diff --git a/arch/arm/mach-pxa/reset.c b/arch/arm/mach-pxa/reset.c
index df29d45..01e9d64 100644
--- a/arch/arm/mach-pxa/reset.c
+++ b/arch/arm/mach-pxa/reset.c
@@ -20,7 +20,7 @@ static void do_hw_reset(void);
 
 static int reset_gpio = -1;
 
-int init_gpio_reset(int gpio, int output)
+int init_gpio_reset(int gpio, int output, int level)
 {
 	int rc;
 
@@ -31,7 +31,7 @@ int init_gpio_reset(int gpio, int output)
 	}
 
 	if (output)
-		rc = gpio_direction_output(gpio, 0);
+		rc = gpio_direction_output(gpio, level);
 	else
 		rc = gpio_direction_input(gpio);
 	if (rc) {
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index c18e34a..cdacea0 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -731,7 +731,7 @@ static void spitz_restart(char mode, const char *cmd)
 
 static void __init common_init(void)
 {
-	init_gpio_reset(SPITZ_GPIO_ON_RESET, 1);
+	init_gpio_reset(SPITZ_GPIO_ON_RESET, 1, 0);
 	pm_power_off = spitz_poweroff;
 	arm_pm_restart = spitz_restart;
 
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index afac5b6..a0bd46e 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -897,7 +897,7 @@ static void __init tosa_init(void)
 	gpio_set_wake(MFP_PIN_GPIO1, 1);
 	/* We can't pass to gpio-keys since it will drop the Reset altfunc */
 
-	init_gpio_reset(TOSA_GPIO_ON_RESET, 0);
+	init_gpio_reset(TOSA_GPIO_ON_RESET, 0, 0);
 
 	pm_power_off = tosa_poweroff;
 	arm_pm_restart = tosa_restart;
-- 
cgit v0.10.2


From 866bd435819df8d97767c407f8828a7a2ff971e6 Mon Sep 17 00:00:00 2001
From: Timothy Clacy <tcl@phaseone.com>
Date: Thu, 7 May 2009 19:40:33 +0200
Subject: [ARM] pxa: enable GPIO receivers after configuring pins

'mach-pxa' platforms currently rely on a bootloader to setup GPIO pins
and clear RDH (to enable inputs).

A kernel loaded by a 'minimal' bootloader, that doesn't touch any pins,
will not function correctly; inputs will remain disabled, even after the
pins are configured. The following change fixes the issue and has been
verified on Gumstix Verdex XL6P and a custom PXA270 platform.

Signed-off-by: Timothy Clacy <tcl@phaseone.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c
index 6ae50604..cf6b720 100644
--- a/arch/arm/mach-pxa/mfp-pxa2xx.c
+++ b/arch/arm/mach-pxa/mfp-pxa2xx.c
@@ -377,6 +377,9 @@ static int __init pxa2xx_mfp_init(void)
 	if (cpu_is_pxa27x())
 		pxa27x_mfp_init();
 
+	/* clear RDH bit to enable GPIO receivers after reset/sleep exit */
+	PSSR = PSSR_RDH;
+
 	/* initialize gafr_run[], pgsr_lpm[] from existing values */
 	for (i = 0; i <= gpio_to_bank(pxa_last_gpio); i++)
 		gpdr_lpm[i] = GPDR(i * 32);
-- 
cgit v0.10.2


From a81b38688f50f51123490d45d51f4a10e8dc1184 Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Fri, 15 May 2009 10:11:22 +0400
Subject: [ARM] pxa/spitz: provide spitz_ohci_exit() that unregisters USB_HOST
 GPIO

Currently spitz_ohci_init() that requests GPIO doesn't have
corresponding spitz_ohci_exit() which will gpio_free(). This causes
minor problems e.g. during resume when the OHCI device can't be resumed.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index cdacea0..5a45fe3 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -531,9 +531,15 @@ static int spitz_ohci_init(struct device *dev)
 	return gpio_direction_output(SPITZ_GPIO_USB_HOST, 1);
 }
 
+static void spitz_ohci_exit(struct device *dev)
+{
+	gpio_free(SPITZ_GPIO_USB_HOST);
+}
+
 static struct pxaohci_platform_data spitz_ohci_platform_data = {
 	.port_mode	= PMM_NPS_MODE,
 	.init		= spitz_ohci_init,
+	.exit		= spitz_ohci_exit,
 	.flags		= ENABLE_PORT_ALL | NO_OC_PROTECTION,
 	.power_budget	= 150,
 };
-- 
cgit v0.10.2


From 982db66352d31892f624390cfb64a1cea5df765a Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Mon, 18 May 2009 17:29:30 +0100
Subject: [ARM] 5518/1: versatile: don't put clock lookups in __initdata

Remove the __initdata annotation for the clock lookups, since they
will be needed when loading modules which use clk_get().

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 1f929c3..b3bebcc 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -413,7 +413,7 @@ static struct clk ref24_clk = {
 	.rate	= 24000000,
 };
 
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 	{	/* UART0 */
 		.dev_id		= "dev:f1",
 		.clk		= &ref24_clk,
-- 
cgit v0.10.2


From a93ea9b357a4d4fce9a1f65bf9c152fb67c30716 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Mon, 18 May 2009 17:26:08 +0100
Subject: [ARM] 5517/1: integrator: don't put clock lookups in __initdata

Remove the __initdata annotation for the clock lookups, since they
will be needed when loading modules which use clk_get().

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c
index 6f88729..a0f60e5 100644
--- a/arch/arm/mach-integrator/core.c
+++ b/arch/arm/mach-integrator/core.c
@@ -121,7 +121,7 @@ static struct clk uartclk = {
 	.rate	= 14745600,
 };
 
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 	{	/* UART0 */
 		.dev_id		= "mb:16",
 		.clk		= &uartclk,
-- 
cgit v0.10.2


From 775273131810caa41dfc7f9e552ea5d8508caf40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Sun, 10 May 2009 20:32:34 +0000
Subject: tcp: fix MSG_PEEK race check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 518a09ef11 (tcp: Fix recvmsg MSG_PEEK influence of
blocking behavior) lets the loop run longer than the race check
did previously expect, so we need to be more careful with this
check and consider the work we have been doing.

I tried my best to deal with urg hole madness too which happens
here:
	if (!sock_flag(sk, SOCK_URGINLINE)) {
		++*seq;
		...
by using additional offset by one but I certainly have very
little interest in testing that part.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Tested-by: Frans Pop <elendil@planet.nl>
Tested-by: Ian Zimmermann <itz@buug.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1d7f49c..7a0f0b2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1321,6 +1321,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct task_struct *user_recv = NULL;
 	int copied_early = 0;
 	struct sk_buff *skb;
+	u32 urg_hole = 0;
 
 	lock_sock(sk);
 
@@ -1532,7 +1533,8 @@ do_prequeue:
 				}
 			}
 		}
-		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
+		if ((flags & MSG_PEEK) &&
+		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
 			if (net_ratelimit())
 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
 				       current->comm, task_pid_nr(current));
@@ -1553,6 +1555,7 @@ do_prequeue:
 				if (!urg_offset) {
 					if (!sock_flag(sk, SOCK_URGINLINE)) {
 						++*seq;
+						urg_hole++;
 						offset++;
 						used--;
 						if (!used)
-- 
cgit v0.10.2


From c0f84d0d4be3f7d818b4ffb04d27f9bae64397f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 18 May 2009 15:12:31 -0700
Subject: sch_teql: should not dereference skb after ndo_start_xmit()

It is illegal to dereference a skb after a successful ndo_start_xmit()
call. We must store skb length in a local variable instead.

Bug was introduced in 2.6.27 by commit 0abf77e55a2459aa9905be4b226e4729d5b4f0cb
(net_sched: Add accessor function for packet length for qdiscs)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index ec697ce..3b64182 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -303,6 +303,8 @@ restart:
 		switch (teql_resolve(skb, skb_res, slave)) {
 		case 0:
 			if (__netif_tx_trylock(slave_txq)) {
+				unsigned int length = qdisc_pkt_len(skb);
+
 				if (!netif_tx_queue_stopped(slave_txq) &&
 				    !netif_tx_queue_frozen(slave_txq) &&
 				    slave_ops->ndo_start_xmit(skb, slave) == 0) {
@@ -310,8 +312,7 @@ restart:
 					master->slaves = NEXT_SLAVE(q);
 					netif_wake_queue(dev);
 					master->stats.tx_packets++;
-					master->stats.tx_bytes +=
-						qdisc_pkt_len(skb);
+					master->stats.tx_bytes += length;
 					return 0;
 				}
 				__netif_tx_unlock(slave_txq);
-- 
cgit v0.10.2


From 75834fc3b6fcff00327f5d2a18760c1e8e0179c5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 18 May 2009 10:26:10 -0400
Subject: SELinux: move SELINUX_MAGIC into magic.h

The selinuxfs superblock magic is used inside the IMA code, but is being
defined in two places and could someday get out of sync.  This patch moves the
declaration into magic.h so it is only done once.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/include/linux/magic.h b/include/linux/magic.h
index 5b4e28b..927138c 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -9,6 +9,7 @@
 #define DEBUGFS_MAGIC          0x64626720
 #define SYSFS_MAGIC		0x62656572
 #define SECURITYFS_MAGIC	0x73636673
+#define SELINUX_MAGIC		0xf97cff8c
 #define TMPFS_MAGIC		0x01021994
 #define SQUASHFS_MAGIC		0x73717368
 #define EFS_SUPER_MAGIC		0x414A53
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index dec6dcb..31d677f 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -49,14 +49,12 @@ struct ima_measure_rule_entry {
  * written in terms of .action, .func, .mask, .fsmagic, and .uid
  */
 static struct ima_measure_rule_entry default_rules[] = {
-	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,
-	 .flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC},
-	{.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,
-	 .flags = IMA_FSMAGIC},
-	{.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = SELINUX_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index a7be3f0..ca83579 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -8,14 +8,13 @@
 #ifndef _SELINUX_SECURITY_H_
 #define _SELINUX_SECURITY_H_
 
+#include <linux/magic.h>
 #include "flask.h"
 
 #define SECSID_NULL			0x00000000 /* unspecified SID */
 #define SECSID_WILD			0xffffffff /* wildcard SID */
 #define SECCLASS_NULL			0x0000 /* no class */
 
-#define SELINUX_MAGIC 0xf97cff8c
-
 /* Identify specific policy version changes */
 #define POLICYDB_VERSION_BASE		15
 #define POLICYDB_VERSION_BOOL		16
-- 
cgit v0.10.2


From c4ca2374312b4de819dd700e72a68395eddb5fcb Mon Sep 17 00:00:00 2001
From: Ajit Khaparde <ajitk@serverengines.com>
Date: Mon, 18 May 2009 15:38:55 -0700
Subject: be2net: add two new pci device ids to pci device table

Signed-off-by: Ajit Khaparde <ajitk@serverengines.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h
index c49ddd0..b4bb06f 100644
--- a/drivers/net/benet/be.h
+++ b/drivers/net/benet/be.h
@@ -35,8 +35,22 @@
 #define DRV_VER			"2.0.348"
 #define DRV_NAME		"be2net"
 #define BE_NAME			"ServerEngines BladeEngine2 10Gbps NIC"
+#define OC_NAME			"Emulex OneConnect 10Gbps NIC"
 #define DRV_DESC		BE_NAME "Driver"
 
+#define BE_VENDOR_ID 		0x19a2
+#define BE_DEVICE_ID1		0x211
+#define OC_DEVICE_ID1		0x700
+#define OC_DEVICE_ID2		0x701
+
+static inline char *nic_name(struct pci_dev *pdev)
+{
+	if (pdev->device == OC_DEVICE_ID1 || pdev->device == OC_DEVICE_ID2)
+		return OC_NAME;
+	else
+		return BE_NAME;
+}
+
 /* Number of bytes of an RX frame that are copied to skb->data */
 #define BE_HDR_LEN 		64
 #define BE_MAX_JUMBO_FRAME_SIZE	9018
diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index 30d0c81..5c378b5 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -28,10 +28,10 @@ static unsigned int rx_frag_size = 2048;
 module_param(rx_frag_size, uint, S_IRUGO);
 MODULE_PARM_DESC(rx_frag_size, "Size of a fragment that holds rcvd data.");
 
-#define BE_VENDOR_ID 		0x19a2
-#define BE2_DEVICE_ID_1 	0x0211
 static DEFINE_PCI_DEVICE_TABLE(be_dev_ids) = {
-	{ PCI_DEVICE(BE_VENDOR_ID, BE2_DEVICE_ID_1) },
+	{ PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) },
+	{ PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID1) },
+	{ PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID2) },
 	{ 0 }
 };
 MODULE_DEVICE_TABLE(pci, be_dev_ids);
@@ -1859,7 +1859,7 @@ static int __devinit be_probe(struct pci_dev *pdev,
 	if (status != 0)
 		goto stats_clean;
 
-	dev_info(&pdev->dev, BE_NAME " port %d\n", adapter->port_num);
+	dev_info(&pdev->dev, "%s port %d\n", nic_name(pdev), adapter->port_num);
 	return 0;
 
 stats_clean:
@@ -1873,7 +1873,7 @@ rel_reg:
 disable_dev:
 	pci_disable_device(pdev);
 do_none:
-	dev_warn(&pdev->dev, BE_NAME " initialization failed\n");
+	dev_err(&pdev->dev, "%s initialization failed\n", nic_name(pdev));
 	return status;
 }
 
-- 
cgit v0.10.2


From 511e11e396dc596825ce04d53d7f6d579404bc01 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 18 May 2009 19:26:37 -0700
Subject: pkt_sched: gen_estimator: use 64 bit intermediate counters for bps

gen_estimator can overflow bps (bytes per second) with Gb links, while
it was designed with a u32 API, with a theorical limit of 34360Mbit
(2^32 bytes)

Using 64 bit intermediate avbps/brate counters can allow us to reach
this theorical limit.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9cc9f95..6d62d46 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -66,9 +66,9 @@
 
    NOTES.
 
-   * The stored value for avbps is scaled by 2^5, so that maximal
-     rate is ~1Gbit, avpps is scaled by 2^10.
-
+   * avbps is scaled by 2^5, avpps is scaled by 2^10.
+   * both values are reported as 32 bit unsigned values. bps can
+     overflow for fast links : max speed being 34360Mbit/sec
    * Minimal interval is HZ/4=250msec (it is the greatest common divisor
      for HZ=100 and HZ=1024 8)), maximal interval
      is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
@@ -86,9 +86,9 @@ struct gen_estimator
 	spinlock_t		*stats_lock;
 	int			ewma_log;
 	u64			last_bytes;
+	u64			avbps;
 	u32			last_packets;
 	u32			avpps;
-	u32			avbps;
 	struct rcu_head		e_rcu;
 	struct rb_node		node;
 };
@@ -115,6 +115,7 @@ static void est_timer(unsigned long arg)
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		u64 nbytes;
+		u64 brate;
 		u32 npackets;
 		u32 rate;
 
@@ -125,9 +126,9 @@ static void est_timer(unsigned long arg)
 
 		nbytes = e->bstats->bytes;
 		npackets = e->bstats->packets;
-		rate = (nbytes - e->last_bytes)<<(7 - idx);
+		brate = (nbytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = nbytes;
-		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
+		e->avbps += ((s64)(brate - e->avbps)) >> e->ewma_log;
 		e->rate_est->bps = (e->avbps+0xF)>>5;
 
 		rate = (npackets - e->last_packets)<<(12 - idx);
-- 
cgit v0.10.2


From 995b337952cdf7e05d288eede580257b632a8343 Mon Sep 17 00:00:00 2001
From: Thomas Chenault <thomas_chenault@dell.com>
Date: Mon, 18 May 2009 21:43:27 -0700
Subject: net: fix skb_seq_read returning wrong offset/length for page frag
 data

When called with a consumed value that is less than skb_headlen(skb)
bytes into a page frag, skb_seq_read() incorrectly returns an
offset/length relative to skb->data. Ensure that data which should come
from a page frag does.

Signed-off-by: Thomas Chenault <thomas_chenault@dell.com>
Tested-by: Shyam Iyer <shyam_iyer@dell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d152394..e505b53 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2288,7 +2288,7 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
 next_skb:
 	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
 
-	if (abs_offset < block_limit) {
+	if (abs_offset < block_limit && !st->frag_data) {
 		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
 		return block_limit - abs_offset;
 	}
-- 
cgit v0.10.2


From bc8a5397433e4effbaddfa7e462d10b3c060cabb Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Mon, 18 May 2009 21:48:38 -0700
Subject: ipv4: make default for INET_LRO consistent with help text

Commit e81963b1 ("ipv4: Make INET_LRO a bool instead of tristate.")
changed this config from tristate to bool.  Add default so that it is
consistent with the help text.

Signed-off-by: Frans Pop <elendil@planet.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9d26a3d..5b919f7 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -408,7 +408,7 @@ config INET_XFRM_MODE_BEET
 
 config INET_LRO
 	bool "Large Receive Offload (ipv4/tcp)"
-
+	default y
 	---help---
 	  Support for Large Receive Offload (ipv4/tcp).
 
-- 
cgit v0.10.2


From 28afea5b2ffaa48f4f43d22ae8edcc384c05df80 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 19 May 2009 08:25:48 +0200
Subject: xen/blkfront: allow xenbus state transition to Closing->Closed  when
 not Connected

This situation can occur when attempting to attach a block device whose
backend is an empty physical CD-ROM driver. The backend in this case
will go directly from the Initialising state to Closing->Closed.
Previously this would result in a NULL pointer deref on info->gd
(xenbus_dev_fatal does not return as a1a15ac5 seems to expect)

Cc: stable@kernel.org
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8f90508..6971a12 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,8 +977,10 @@ static void backend_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateClosing:
-		if (info->gd == NULL)
-			xenbus_dev_fatal(dev, -ENODEV, "gd is NULL");
+		if (info->gd == NULL) {
+			xenbus_frontend_closed(dev);
+			break;
+		}
 		bd = bdget_disk(info->gd, 0);
 		if (bd == NULL)
 			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-- 
cgit v0.10.2


From 31a14400e8f365e5d4d1a0b2a1ef4d5405d6091e Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 19 May 2009 08:27:42 +0200
Subject: xen/blkfront: fix warning when deleting gendisk on unplug/shutdown

Currently blkfront gives a warning when hot unplugging due to calling
del_gendisk() with interrupts disabled (due to blkif_io_lock).

WARNING: at kernel/softirq.c:124 local_bh_enable+0x36/0x84()
Modules linked in: xenfs xen_netfront ext3 jbd mbcache xen_blkfront
Pid: 13, comm: xenwatch Not tainted 2.6.29-xs5.5.0.13 #3
Call Trace:
 [<c012611c>] warn_slowpath+0x80/0xb6
 [<c0104cf1>] xen_sched_clock+0x16/0x63
 [<c0104710>] xen_force_evtchn_callback+0xc/0x10
 [<c0104e32>] check_events+0x8/0xe
 [<c0104d9b>] xen_restore_fl_direct_end+0x0/0x1
 [<c0103749>] xen_mc_flush+0x10a/0x13f
 [<c0105bd2>] __switch_to+0x114/0x14e
 [<c011d92b>] dequeue_task+0x62/0x70
 [<c0123b6f>] finish_task_switch+0x2b/0x84
 [<c0299877>] schedule+0x66d/0x6e7
 [<c0104710>] xen_force_evtchn_callback+0xc/0x10
 [<c0104710>] xen_force_evtchn_callback+0xc/0x10
 [<c012a642>] local_bh_enable+0x36/0x84
 [<c022f9a7>] sk_filter+0x57/0x5c
 [<c0233dae>] netlink_broadcast+0x1d5/0x315
 [<c01c6371>] kobject_uevent_env+0x28d/0x331
 [<c01e7ead>] device_del+0x10f/0x120
 [<c01e7ec6>] device_unregister+0x8/0x10
 [<c015f86d>] bdi_unregister+0x2d/0x39
 [<c01bf6f4>] unlink_gendisk+0x23/0x3e
 [<c01ac946>] del_gendisk+0x7b/0xe7
 [<d0828c19>] blkfront_closing+0x28/0x6e [xen_blkfront]
 [<d082900c>] backend_changed+0x3ad/0x41d [xen_blkfront]

We can fix this by calling del_gendisk() later in blkfront_closing, after
releasing blkif_io_lock. Since the queue is stopped during the interrupts
disabled phase I don't think there is any danger of an event occuring between
releasing the blkif_io_lock and deleting the disk.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6971a12..a6cbf7b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -934,8 +934,6 @@ static void blkfront_closing(struct xenbus_device *dev)
 
 	spin_lock_irqsave(&blkif_io_lock, flags);
 
-	del_gendisk(info->gd);
-
 	/* No more blkif_request(). */
 	blk_stop_queue(info->rq);
 
@@ -949,6 +947,8 @@ static void blkfront_closing(struct xenbus_device *dev)
 	blk_cleanup_queue(info->rq);
 	info->rq = NULL;
 
+	del_gendisk(info->gd);
+
  out:
 	xenbus_frontend_closed(dev);
 }
-- 
cgit v0.10.2


From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 May 2009 09:22:19 +0200
Subject: sched: properly define the sched_group::cpumask and
 sched_domain::span fields

Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and sched_domain, and use the field[0] GCC
extension instead of defining a vla array.

Dont use unions for this, as pointed out by Linus.

[ Impact: cleanup, un-confuse Sparse and LLVM ]

Reported-by: Jeff Garzik <jeff@garzik.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b2..dbb1043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09b..228acae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
-- 
cgit v0.10.2


From 143c145e3a475065a4be661468d0df1bd0b25f74 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 19 May 2009 14:43:15 +0800
Subject: tracing/events: Documentation updates

- fix some typos
- document the difference between '>' and '>>'
- document the 'enable' toggle
- remove section "Defining an event-enabled tracepoint", since it's
  out-dated and sample/trace_events/ already serves this purpose.

v2: add "Updated by Li Zefan"

[ Impact: make documentation up-to-date ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
LKML-Reference: <4A125503.5060406@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index abdee66..f157d75 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -1,9 +1,10 @@
 			     Event Tracing
 
 		Documentation written by Theodore Ts'o
+			Updated by Li Zefan
 
-Introduction
-============
+1. Introduction
+===============
 
 Tracepoints (see Documentation/trace/tracepoints.txt) can be used
 without creating custom kernel modules to register probe functions
@@ -12,30 +13,37 @@ using the event tracing infrastructure.
 Not all tracepoints can be traced using the event tracing system;
 the kernel developer must provide code snippets which define how the
 tracing information is saved into the tracing buffer, and how the
-the tracing information should be printed.
+tracing information should be printed.
 
-Using Event Tracing
-===================
+2. Using Event Tracing
+======================
+
+2.1 Via the 'set_event' interface
+---------------------------------
 
 The events which are available for tracing can be found in the file
-/sys/kernel/debug/tracing/available_events.
+/debug/tracing/available_events.
 
 To enable a particular event, such as 'sched_wakeup', simply echo it
-to /sys/debug/tracing/set_event. For example:
+to /debug/tracing/set_event. For example:
 
-	# echo sched_wakeup > /sys/kernel/debug/tracing/set_event
+	# echo sched_wakeup >> /debug/tracing/set_event
 
-[ Note: events can also be enabled/disabled via the 'enabled' toggle
-  found in the /sys/kernel/tracing/events/ hierarchy of directories. ]
+[ Note: '>>' is necessary, otherwise it will firstly disable
+  all the events. ]
 
 To disable an event, echo the event name to the set_event file prefixed
 with an exclamation point:
 
-	# echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
+	# echo '!sched_wakeup' >> /debug/tracing/set_event
+
+To disable all events, echo an empty line to the set_event file:
+
+	# echo > /debug/tracing/set_event
 
-To disable events, echo an empty line to the set_event file:
+To enable all events, echo '*:*' or '*:' to the set_event file:
 
-	# echo > /sys/kernel/debug/tracing/set_event
+	# echo *:* > /debug/tracing/set_event
 
 The events are organized into subsystems, such as ext4, irq, sched,
 etc., and a full event name looks like this: <subsystem>:<event>.  The
@@ -44,92 +52,39 @@ file.  All of the events in a subsystem can be specified via the syntax
 "<subsystem>:*"; for example, to enable all irq events, you can use the
 command:
 
-	# echo 'irq:*' > /sys/kernel/debug/tracing/set_event
-
-Defining an event-enabled tracepoint
-------------------------------------
-
-A kernel developer which wishes to define an event-enabled tracepoint
-must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE.
-This is done via two header files in include/trace.  For example, to
-event-enable the jbd2 subsystem, we must create two files,
-include/trace/jbd2.h and include/trace/jbd2_event_types.h.  The
-include/trace/jbd2.h file should be included by kernel source files that
-will have a tracepoint inserted, and might look like this:
-
-#ifndef _TRACE_JBD2_H
-#define _TRACE_JBD2_H
-
-#include <linux/jbd2.h>
-#include <linux/tracepoint.h>
-
-#include <trace/jbd2_event_types.h>
-
-#endif
-
-In a file that utilizes a jbd2 tracepoint, this header file would be
-included.  Note that you still have to use DEFINE_TRACE().  So for
-example, if fs/jbd2/commit.c planned to use the jbd2_start_commit
-tracepoint, it would have the following near the beginning of the file:
-
-#include <trace/jbd2.h>
-
-DEFINE_TRACE(jbd2_start_commit);
-
-Then in the function that would call the tracepoint, it would call the
-tracepoint function.  (For more information, please see the tracepoint
-documentation in Documentation/trace/tracepoints.txt):
-
-	trace_jbd2_start_commit(journal, commit_transaction);
-
-The code snippets which allow jbd2_start_commit to be an event-enabled
-tracepoint are placed in the file include/trace/jbd2_event_types.h:
-
-/* use <trace/jbd2.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM jbd2
-
-#include <linux/jbd2.h>
-
-TRACE_EVENT(jbd2_start_commit,
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-	TP_ARGS(journal, commit_transaction),
-	TP_STRUCT__entry(
-		__array(	char,	devname, BDEVNAME_SIZE+24 )
-		__field(	int,	transaction		  )
-	),
-	TP_fast_assign(
-		memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24);
-		__entry->transaction	= commit_transaction->t_tid;
-	),
-	TP_printk("dev %s transaction %d",
-		  __entry->devname, __entry->transaction)
-);
-
-The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE.  The new
-arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and
-TP_printk.
-
-TP_STRUCT__entry defines the data structure which will be stored in the
-trace buffer.  Normally, fields in __entry will be arrays or simple
-types.  It is possible to place data structures in __entry --- however,
-pointers in the data structure can not be trusted, since they will be
-accessed sometime later by TP_printk, and if the data structure contains
-fields that will not or cannot be used by TP_printk, this will waste
-space in the trace buffer.  In general, data structures should be
-avoided, unless they do only contain non-pointer types and all of the
-fields will be used by TP_printk.
-
-TP_fast_assign defines the code snippet which saves information into the
-__entry data structure, using the passed-in arguments defined in
-TP_PROTO and TP_ARGS.
-
-Finally, TP_printk will print the __entry data structure.  At the time
-when the code snippet defined by TP_printk is executed, it will not have
-access to the TP_ARGS arguments; it can only use the information saved
-in the __entry data structure.
+	# echo 'irq:*' > /debug/tracing/set_event
+
+2.2 Via the 'enable' toggle
+---------------------------
+
+The events available are also listed in /debug/tracing/events/ hierarchy
+of directories.
+
+To enable event 'sched_wakeup':
+
+	# echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To disable it:
+
+	# echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To enable all events in sched subsystem:
+
+	# echo 1 > /debug/tracing/events/sched/enable
+
+To eanble all events:
+
+	# echo 1 > /debug/tracing/events/enable
+
+When reading one of these enable files, there are four results:
+
+ 0 - all events this file affects are disabled
+ 1 - all events this file affects are enabled
+ X - there is a mixture of events enabled and disabled
+ ? - this file does not affect any event
+
+3. Defining an event-enabled tracepoint
+=======================================
+
+See The example provided in samples/trace_events
+
-- 
cgit v0.10.2


From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 09:59:08 +0200
Subject: blktrace: remove debugfs entries on bad path

debugfs directory entries for devices are not removed on some
of the failure pathes in do_blk_trace_setup().
One way to reproduce is to start blktrace on multiple devices
with insufficient Vmalloc space: Devices will fail with
a message like this:

	BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error

If so, the respective entries in debugfs
(e.g. /sys/kernel/debug/block/sdu) will remain and subsequent
attempts to start blktrace on the respective devices will not
succeed due to existing directories.

[ Impact: fix /debug/tracing file cleanup corner case ]

Signed-off-by: Stefan Raspl <stefan.raspl@linux.vnet.ibm.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: schwidefsky@de.ibm.com
Cc: heiko.carstens@de.ibm.com
LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 05b4747..e3abf55 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
+	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
-- 
cgit v0.10.2


From fe64d517df0970a68417184a12fcd4ba0589cc28 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 10:01:18 +0100
Subject: GFS2: Umount recovery race fix

This patch fixes a race condition where we can receive recovery
requests part way through processing a umount. This was causing
problems since the recovery thread had already gone away.

Looking in more detail at the recovery code, it was really trying
to implement a slight variation on a work queue, and that happens to
align nicely with the recently introduced slow-work subsystem. As a
result I've updated the code to use slow-work, rather than its own home
grown variety of work queue.

When using the wait_on_bit() function, I noticed that the wait function
that was supplied as an argument was appearing in the WCHAN field, so
I've updated the function names in order to produce more meaningful
output.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7..cad957c 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,6 +7,7 @@ config GFS2_FS
 	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
+	select SLOW_WORK
 	help
 	  A cluster filesystem.
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff49810..2bf62bc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
 	gh->gh_ip = 0;
 }
 
-static int just_schedule(void *word)
+/**
+ * gfs2_glock_holder_wait
+ * @word: unused
+ *
+ * This function and gfs2_glock_demote_wait both show up in the WCHAN
+ * field. Thus I've separated these otherwise identical functions in
+ * order to be more informative to the user.
+ */
+
+static int gfs2_glock_holder_wait(void *word)
 {
         schedule();
         return 0;
 }
 
+static int gfs2_glock_demote_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 static void wait_on_holder(struct gfs2_holder *gh)
 {
 	might_sleep();
-	wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
 }
 
 static void wait_on_demote(struct gfs2_glock *gl)
 {
 	might_sleep();
-	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 65f438e..0060e95 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
 
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/slow-work.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
 
@@ -376,11 +377,11 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
 	struct list_head jd_list;
 	struct list_head extent_list;
-
+	struct slow_work jd_work;
 	struct inode *jd_inode;
+	unsigned long jd_flags;
+#define JDF_RECOVERY 1
 	unsigned int jd_jid;
-	int jd_dirty;
-
 	unsigned int jd_blocks;
 };
 
@@ -390,9 +391,6 @@ struct gfs2_statfs_change_host {
 	s64 sc_dinodes;
 };
 
-#define GFS2_GLOCKD_DEFAULT	1
-#define GFS2_GLOCKD_MAX		16
-
 #define GFS2_QUOTA_DEFAULT	GFS2_QUOTA_OFF
 #define GFS2_QUOTA_OFF		0
 #define GFS2_QUOTA_ACCOUNT	1
@@ -427,7 +425,6 @@ struct gfs2_tune {
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
 
-	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -448,6 +445,7 @@ enum {
 	SDF_JOURNAL_LIVE	= 1,
 	SDF_SHUTDOWN		= 2,
 	SDF_NOBARRIERS		= 3,
+	SDF_NORECOVERY		= 4,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -494,7 +492,6 @@ struct lm_lockstruct {
 	unsigned long ls_flags;
 	dlm_lockspace_t *ls_dlm;
 
-	int ls_recover_jid;
 	int ls_recover_jid_done;
 	int ls_recover_jid_status;
 };
@@ -583,7 +580,6 @@ struct gfs2_sbd {
 
 	/* Daemon stuff */
 
-	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed..eacd78a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
 #include <asm/atomic.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
+	error = slow_work_register_user();
+	if (error)
+		goto fail_slow;
+
 	gfs2_register_debugfs();
 
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
 
+fail_slow:
+	unregister_filesystem(&gfs2meta_fs_type);
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
+	slow_work_unregister_user();
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7981fbc..2cd1164 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -55,7 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	spin_lock_init(&gt->gt_spin);
 
 	gt->gt_incore_log_blocks = 1024;
-	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
@@ -675,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
+		slow_work_init(&jd->jd_work, &gfs2_recover_ops);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
@@ -700,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
 	struct gfs2_holder ji_gh;
-	struct task_struct *p;
 	struct gfs2_inode *ip;
 	int jindex = 1;
 	int error = 0;
 
 	if (undo) {
 		jindex = 0;
-		goto fail_recoverd;
+		goto fail_jinode_gh;
 	}
 
 	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -800,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	gfs2_glock_dq_uninit(&ji_gh);
 	jindex = 0;
 
-	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
-	error = IS_ERR(p);
-	if (error) {
-		fs_err(sdp, "can't start recoverd thread: %d\n", error);
-		goto fail_jinode_gh;
-	}
-	sdp->sd_recoverd_process = p;
-
 	return 0;
 
-fail_recoverd:
-	kthread_stop(sdp->sd_recoverd_process);
 fail_jinode_gh:
 	if (!sdp->sd_args.ar_spectator)
 		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 	}
 
-	if (sdp->sd_args.ar_spectator)
+	if (sdp->sd_args.ar_spectator) {
                 sb->s_flags |= MS_RDONLY;
+		set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	}
 	if (sdp->sd_args.ar_posix_acl)
 		sb->s_flags |= MS_POSIXACL;
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0677a83..a3c2272 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -121,6 +121,12 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	return error;
 }
 
+static int gfs2_umount_recovery_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 /**
  * gfs2_put_super - Unmount the filesystem
  * @sb: The VFS superblock
@@ -131,6 +137,7 @@ static void gfs2_put_super(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
+	struct gfs2_jdesc *jd;
 
 	/*  Unfreeze the filesystem, if we need to  */
 
@@ -139,9 +146,25 @@ static void gfs2_put_super(struct super_block *sb)
 		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
 	mutex_unlock(&sdp->sd_freeze_lock);
 
+	/* No more recovery requests */
+	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	smp_mb();
+
+	/* Wait on outstanding recovery */
+restart:
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+			continue;
+		spin_unlock(&sdp->sd_jindex_spin);
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
-	kthread_stop(sdp->sd_recoverd_process);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7..59d2695 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
         kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-/**
- * gfs2_recover_journal - recover a given journal
- * @jd: the struct gfs2_jdesc describing the journal
- *
- * Acquire the journal's lock, check to see if the journal is clean, and
- * do recovery if necessary.
- *
- * Returns: errno
- */
+static int gfs2_recover_get_ref(struct slow_work *work)
+{
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+	return 0;
+}
 
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+static void gfs2_recover_put_ref(struct slow_work *work)
+{
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
+}
+
+static void gfs2_recover_work(struct slow_work *work)
 {
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 	struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 		gfs2_glock_dq_uninit(&j_gh);
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-	return 0;
+	return;
 
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ fail_gunlock_j:
 
 fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
-	return error;
 }
 
-static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
-	struct gfs2_jdesc *jd;
-	int found = 0;
-
-	spin_lock(&sdp->sd_jindex_spin);
+struct slow_work_ops gfs2_recover_ops = {
+	.get_ref = gfs2_recover_get_ref,
+	.put_ref = gfs2_recover_put_ref,
+	.execute = gfs2_recover_work,
+};
 
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_dirty) {
-			jd->jd_dirty = 0;
-			found = 1;
-			break;
-		}
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	if (!found)
-		jd = NULL;
 
-	return jd;
-}
-
-/**
- * gfs2_check_journals - Recover any dirty journals
- * @sdp: the filesystem
- *
- */
-
-static void gfs2_check_journals(struct gfs2_sbd *sdp)
+static int gfs2_recovery_wait(void *word)
 {
-	struct gfs2_jdesc *jd;
-
-	for (;;) {
-		jd = gfs2_jdesc_find_dirty(sdp);
-		if (!jd)
-			break;
-
-		if (jd != sdp->sd_jdesc)
-			gfs2_recover_journal(jd);
-	}
+	schedule();
+	return 0;
 }
 
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_recoverd(void *data)
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
 {
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-
-	while (!kthread_should_stop()) {
-		gfs2_check_journals(sdp);
-		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
+	int rv;
+	rv = slow_work_enqueue(&jd->jd_work);
+	if (rv)
+		return rv;
+	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
 	return 0;
 }
 
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea..1616ac2 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 		    struct gfs2_log_header_host *head);
 extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern int gfs2_recoverd(void *data);
+extern struct slow_work_ops gfs2_recover_ops;
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 894bf77..9f6d48b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -356,34 +356,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first_done);
 }
 
-static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_recover_jid);
-}
-
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
+	unsigned jid;
 	struct gfs2_jdesc *jd;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
 
+	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
+	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+		goto out;
+	rv = -EBUSY;
+	if (sdp->sd_jdesc->jd_jid == jid)
+		goto out;
+	rv = -ENOENT;
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		jd->jd_dirty = 1;
+		rv = slow_work_enqueue(&jd->jd_work);
 		break;
 	}
+out:
 	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
-	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
-	if (sdp->sd_recoverd_process)
-		wake_up_process(sdp->sd_recoverd_process);
-	return len;
+	return rv ? rv : len;
 }
 
 static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,15 +400,15 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
-GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
-GDLM_ATTR(block,          0644, block_show,          block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
-GDLM_ATTR(id,             0444, lkid_show,           NULL);
-GDLM_ATTR(first,          0444, lkfirst_show,        NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
-GDLM_ATTR(recover,        0644, recover_show,        recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(proto_name,     0444, proto_name_show,	NULL);
+GDLM_ATTR(block,          0644, block_show,		block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,		withdraw_store);
+GDLM_ATTR(id,             0444, lkid_show,		NULL);
+GDLM_ATTR(first,          0444, lkfirst_show,		NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,	NULL);
+GDLM_ATTR(recover,        0200, NULL,			recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,	NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show,	NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
-- 
cgit v0.10.2


From e9ccb73ab57ada469602506496c42e5b4468ac3e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 10:23:23 +0100
Subject: GFS2: Update docs

Update a few things which were out of date, and fix a typo.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
index 4dae9a3..0494f78 100644
--- a/Documentation/filesystems/gfs2-glocks.txt
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -60,7 +60,7 @@ go_lock          | Called for the first local holder of a lock
 go_unlock        | Called on the final local unlock of a lock
 go_dump          | Called to print content of object for debugfs file, or on
                  | error to dump glock to the log.
-go_type;         | The type of the glock, LM_TYPE_.....
+go_type          | The type of the glock, LM_TYPE_.....
 go_min_hold_time | The minimum hold time
 
 The minimum hold time for each lock is the time after a remote lock
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 593004b..5e3ab8f 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -11,18 +11,15 @@ their I/O so file system consistency is maintained.  One of the nifty
 features of GFS is perfect consistency -- changes made to the file system
 on one machine show up immediately on all other machines in the cluster.
 
-GFS uses interchangable inter-node locking mechanisms.  Different lock
-modules can plug into GFS and each file system selects the appropriate
-lock module at mount time.  Lock modules include:
+GFS uses interchangable inter-node locking mechanisms, the currently
+supported mechanisms are:
 
   lock_nolock -- allows gfs to be used as a local file system
 
   lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
   The dlm is found at linux/fs/dlm/
 
-In addition to interfacing with an external locking manager, a gfs lock
-module is responsible for interacting with external cluster management
-systems.  Lock_dlm depends on user space cluster management systems found
+Lock_dlm depends on user space cluster management systems found
 at the URL above.
 
 To use gfs as a local file system, no external clustering systems are
@@ -31,13 +28,19 @@ needed, simply:
   $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
   $ mount -t gfs2 /dev/block_device /dir
 
-GFS2 is not on-disk compatible with previous versions of GFS.
+If you are using Fedora, you need to install the gfs2-utils package
+and, for lock_dlm, you will also need to install the cman package
+and write a cluster.conf as per the documentation.
+
+GFS2 is not on-disk compatible with previous versions of GFS, but it
+is pretty close.
 
 The following man pages can be found at the URL above:
-  gfs2_fsck	to repair a filesystem
+  fsck.gfs2	to repair a filesystem
   gfs2_grow	to expand a filesystem online
   gfs2_jadd	to add journals to a filesystem online
   gfs2_tool	to manipulate, examine and tune a filesystem
   gfs2_quota	to examine and change quota values in a filesystem
+  gfs2_convert	to convert a gfs filesystem to gfs2 in-place
   mount.gfs2	to help mount(8) mount a filesystem
   mkfs.gfs2	to make a filesystem
-- 
cgit v0.10.2


From 3755100dd5f66761aaaa7ae44c70b319a7c78a56 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:04 +0900
Subject: ub: use __blk_end_request_all()

ub_end_rq() always tries to complete full request.  The @cmd_len
parameter was there because rq->data_len used to be overwritten with
residue count.  Drop @cmd_len and use __blk_end_request_all().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 178f459..f32781c 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -360,8 +360,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
     struct ub_scsi_cmd *cmd, struct ub_request *urq);
 static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_end_rq(struct request *rq, unsigned int status,
-    unsigned int cmd_len);
+static void ub_end_rq(struct request *rq, unsigned int status);
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
     struct ub_request *urq, struct ub_scsi_cmd *cmd);
 static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
@@ -644,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 
 	if (atomic_read(&sc->poison)) {
 		blk_start_request(rq);
-		ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq));
+		ub_end_rq(rq, DID_NO_CONNECT << 16);
 		return 0;
 	}
 
 	if (lun->changed && !blk_pc_request(rq)) {
 		blk_start_request(rq);
-		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq));
+		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION);
 		return 0;
 	}
 
@@ -702,7 +701,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 
 drop:
 	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, DID_ERROR << 16, blk_rq_bytes(rq));
+	ub_end_rq(rq, DID_ERROR << 16);
 	return 0;
 }
 
@@ -777,7 +776,6 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 	struct ub_request *urq = cmd->back;
 	struct request *rq;
 	unsigned int scsi_status;
-	unsigned int cmd_len;
 
 	rq = urq->rq;
 
@@ -816,17 +814,14 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	urq->rq = NULL;
 
-	cmd_len = cmd->len;
 	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, scsi_status, cmd_len);
+	ub_end_rq(rq, scsi_status);
 	blk_start_queue(lun->disk->queue);
 }
 
-static void ub_end_rq(struct request *rq, unsigned int scsi_status,
-    unsigned int cmd_len)
+static void ub_end_rq(struct request *rq, unsigned int scsi_status)
 {
 	int error;
-	long rqlen;
 
 	if (scsi_status == 0) {
 		error = 0;
@@ -834,12 +829,7 @@ static void ub_end_rq(struct request *rq, unsigned int scsi_status,
 		error = -EIO;
 		rq->errors = scsi_status;
 	}
-	rqlen = blk_rq_bytes(rq);    /* Oddly enough, this is the residue. */
-	if (__blk_end_request(rq, error, cmd_len)) {
-		printk(KERN_WARNING DRV_NAME
-		    ": __blk_end_request blew, %s-cmd total %u rqlen %ld\n",
-		    blk_pc_request(rq)? "pc": "fs", cmd_len, rqlen);
-	}
+	__blk_end_request_all(rq, error);
 }
 
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
-- 
cgit v0.10.2


From 5f49f63178360b07a095bd33b0d850d60edf7590 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:05 +0900
Subject: block: set rq->resid_len to blk_rq_bytes() on issue

In commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f, while introducing
rq->resid_len, the default value of residue count was changed from
full count to zero.  The conversion was done under the assumption that
when a request fails residue count wasn't defined.  However, Boaz and
James pointed out that this wasn't true and the residue count should
be preserved for failed requests too.

This patchset restores the original behavior by setting rq->resid_len
to blk_rq_bytes(rq) on request start and restoring explicit clearing
in affected drivers.  While at it, take advantage of the fact that
rq->resid_len is set to full count where applicable.

* ide-cd: rq->resid_len cleared on pc success

* mptsas: req->resid_len cleared on success

* sas_expander: rsp/req->resid_len cleared on success

* mpt2sas_transport: req->resid_len cleared on success

* ide-cd, ide-tape, mptsas, sas_host_smp, mpt2sas_transport, ub: take
  advantage of initial full count to simplify code

Boaz Harrosh spotted bug in resid_len initialization.  Fixed as
suggested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Borislav Petkov <petkovbb@googlemail.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index a2d97de..e3f7e6a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1783,9 +1783,10 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	/*
-	 * We are now handing the request to the hardware, add the
-	 * timeout handler.
+	 * We are now handing the request to the hardware, initialize
+	 * resid_len to full count and add the timeout handler.
 	 */
+	req->resid_len = blk_rq_bytes(req);
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index f32781c..e67bbae 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -781,8 +781,10 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len < blk_rq_bytes(rq))
-				rq->resid_len = blk_rq_bytes(rq) - cmd->act_len;
+			if (cmd->act_len >= rq->resid_len)
+				rq->resid_len = 0;
+			else
+				rq->resid_len -= cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4c7792f..081aed6 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -699,6 +699,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
+		rq->resid_len = 0;
 		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
 	} else {
@@ -718,8 +719,7 @@ out_end:
 
 		/* make sure it's fully ended */
 		if (blk_fs_request(rq) == 0) {
-			rq->resid_len = blk_rq_bytes(rq) -
-				(cmd->nbytes - cmd->nleft);
+			rq->resid_len -= cmd->nbytes - cmd->nleft;
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				rq->resid_len += cmd->last_xfer_len;
 		}
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index e166045..683ff37 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size;
+		rq->resid_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 4e6fcf0..79f5433 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1357,7 +1357,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength;
+		req->resid_len = 0;
+		rsp->resid_len -= smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 531af9e..54fa1e4 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1937,7 +1937,11 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
 		rsp->resid_len = ret;
+		req->resid_len = 0;
 		ret = 0;
+	} else if (ret == 0) {
+		rsp->resid_len = 0;
+		req->resid_len = 0;
 	}
 
 	return ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index be9a951..1bc3b75 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -176,9 +176,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
-	req->resid_len = blk_rq_bytes(req);
-	rsp->resid_len = blk_rq_bytes(rsp);
-
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
 		req->resid_len -= 8;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index af95a44..5c65da5 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1170,8 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		rsp->resid_len = blk_rq_bytes(rsp) -
-				 mpi_reply->ResponseDataLength;
+		req->resid_len = 0;
+		rsp->resid_len -= mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
-- 
cgit v0.10.2


From 4fc981ef9e7c0953d5c4896ce088b19c50cb018f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:06 +0900
Subject: bio: always copy back data for copied kernel requests

When a read bio_copy_kern() request fails, the content of the bounce
buffer is not copied back.  However, as request failure doesn't
necessarily mean complete failure, the buffer state can be useful.
This behavior is also inconsistent with the user map counterpart and
causes the subtle difference between bounced and unbounced IO causes
confusion.

This patch makes bio_copy_kern_endio() ignore @err and always copy
back data on request completion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/bio.c b/fs/bio.c
index 7bbc98f..ee3bc67 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1198,7 +1198,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 		char *addr = page_address(bvec->bv_page);
 		int len = bmd->iovecs[i].bv_len;
 
-		if (read && !err)
+		if (read)
 			memcpy(p, addr, len);
 
 		__free_page(bvec->bv_page);
-- 
cgit v0.10.2


From b2858d7d1639c04ca3c54988d76c5f7300b76f1c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 19 May 2009 11:37:46 +0200
Subject: splice: fix kmaps in default_file_splice_write()

Unfortunately multiple kmap() within a single thread are deadlockable,
so writing out multiple buffers with writev() isn't possible.

Change the implementation so that it does a separate write() for each
buffer.  This actually simplifies the code a lot since the
splice_from_pipe() helper can be used.

This limitation is caused by HIGHMEM pages, and so only affects a
subset of architectures and configurations.  In the future it may be
worth to implement default_file_splice_write() in a more efficient way
on configs that allow it.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/splice.c b/fs/splice.c
index 41179c0..73766d2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -535,8 +535,8 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	return res;
 }
 
-static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
-			    unsigned long vlen, loff_t *ppos)
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+			    loff_t pos)
 {
 	mm_segment_t old_fs;
 	ssize_t res;
@@ -544,7 +544,7 @@ static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
 	old_fs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
-	res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos);
+	res = vfs_write(file, (const char __user *)buf, count, &pos);
 	set_fs(old_fs);
 
 	return res;
@@ -1003,120 +1003,34 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
-static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n)
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			  struct splice_desc *sd)
 {
-	return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS];
+	int ret;
+	void *data;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (ret)
+		return ret;
+
+	data = buf->ops->map(pipe, buf, 0);
+	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+	buf->ops->unmap(pipe, buf, data);
+
+	return ret;
 }
 
 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 					 struct file *out, loff_t *ppos,
 					 size_t len, unsigned int flags)
 {
-	ssize_t ret = 0;
-	ssize_t total_len = 0;
-	int do_wakeup = 0;
-
-	pipe_lock(pipe);
-	while (len) {
-		struct pipe_buffer *buf;
-		void *data[PIPE_BUFFERS];
-		struct iovec vec[PIPE_BUFFERS];
-		unsigned int nr_pages = 0;
-		unsigned int write_len = 0;
-		unsigned int now_len = len;
-		unsigned int this_len;
-		int i;
-
-		BUG_ON(pipe->nrbufs > PIPE_BUFFERS);
-		for (i = 0; i < pipe->nrbufs && now_len; i++) {
-			buf = nth_pipe_buf(pipe, i);
-
-			ret = buf->ops->confirm(pipe, buf);
-			if (ret)
-				break;
-
-			data[i] = buf->ops->map(pipe, buf, 0);
-			this_len = min(buf->len, now_len);
-			vec[i].iov_base = (void __user *) data[i] + buf->offset;
-			vec[i].iov_len = this_len;
-			now_len -= this_len;
-			write_len += this_len;
-			nr_pages++;
-		}
-
-		if (nr_pages) {
-			ret = kernel_writev(out, vec, nr_pages, ppos);
-			if (ret == 0)
-				ret = -EIO;
-			if (ret > 0) {
-				len -= ret;
-				total_len += ret;
-			}
-		}
-
-		for (i = 0; i < nr_pages; i++) {
-			buf = nth_pipe_buf(pipe, i);
-			buf->ops->unmap(pipe, buf, data[i]);
-
-			if (ret > 0) {
-				this_len = min_t(unsigned, vec[i].iov_len, ret);
-				buf->offset += this_len;
-				buf->len -= this_len;
-				ret -= this_len;
-			}
-		}
-
-		if (ret < 0)
-			break;
-
-		while (pipe->nrbufs) {
-			const struct pipe_buf_operations *ops;
-
-			buf = nth_pipe_buf(pipe, 0);
-			if (buf->len)
-				break;
-
-			ops = buf->ops;
-			buf->ops = NULL;
-			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS;
-			pipe->nrbufs--;
-			if (pipe->inode)
-				do_wakeup = 1;
-		}
-
-		if (pipe->nrbufs)
-			continue;
-		if (!pipe->writers)
-			break;
-		if (!pipe->waiting_writers) {
-			if (total_len)
-				break;
-		}
-
-		if (flags & SPLICE_F_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-
-		if (do_wakeup) {
-			wakeup_pipe_writers(pipe);
-			do_wakeup = 0;
-		}
-
-		pipe_wait(pipe);
-	}
-	pipe_unlock(pipe);
+	ssize_t ret;
 
-	if (do_wakeup)
-		wakeup_pipe_writers(pipe);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+	if (ret > 0)
+		*ppos += ret;
 
-	return total_len ? total_len : ret;
+	return ret;
 }
 
 /**
-- 
cgit v0.10.2


From 3a5a39276d2a32b05b1ee25b384516805b17cf87 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sun, 17 May 2009 18:55:18 +0300
Subject: block: allow blk_rq_map_kern to append to requests

Use blk_rq_append_bio() internally instead of blk_rq_bio_prep()
so blk_rq_map_kern can be called multiple times, to map multiple
buffers.

This is in the effort to un-export blk_rq_append_bio()

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-map.c b/block/blk-map.c
index 56082be..caa05a6 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -282,7 +282,8 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
  *
  * Description:
  *    Data will be mapped directly if possible. Otherwise a bounce
- *    buffer is used.
+ *    buffer is used. Can be called multple times to append multple
+ *    buffers.
  */
 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		    unsigned int len, gfp_t gfp_mask)
@@ -290,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	int do_copy = 0;
 	struct bio *bio;
+	int ret;
 
 	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
@@ -311,7 +313,13 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (do_copy)
 		rq->cmd_flags |= REQ_COPY_USER;
 
-	blk_rq_bio_prep(q, rq, bio);
+	ret = blk_rq_append_bio(q, rq, bio);
+	if (unlikely(ret)) {
+		/* request is too big */
+		bio_put(bio);
+		return ret;
+	}
+
 	blk_queue_bounce(q, &rq->bio);
 	rq->buffer = NULL;
 	return 0;
-- 
cgit v0.10.2


From bc38bf106c967389a465d926be22c7371abba69d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:56:17 +0300
Subject: libosd: Use new blk_rq_map_kern

Now that blk_rq_map_kern will append the buffer onto the
request we can use it easily for adding extra segments
(eg. attributes)

This patch is dependent on a block layer patch titled:
   [BLOCK] allow blk_rq_map_kern to append to requests

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index d178a8b..bf66e30 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -826,26 +826,6 @@ int osd_req_add_set_attr_list(struct osd_request *or,
 }
 EXPORT_SYMBOL(osd_req_add_set_attr_list);
 
-static int _append_map_kern(struct request *req,
-	void *buff, unsigned len, gfp_t flags)
-{
-	struct bio *bio;
-	int ret;
-
-	bio = bio_map_kern(req->q, buff, len, flags);
-	if (IS_ERR(bio)) {
-		OSD_ERR("Failed bio_map_kern(%p, %d) => %ld\n", buff, len,
-			PTR_ERR(bio));
-		return PTR_ERR(bio);
-	}
-	ret = blk_rq_append_bio(req->q, req, bio);
-	if (ret) {
-		OSD_ERR("Failed blk_rq_append_bio(%p) => %d\n", bio, ret);
-		bio_put(bio);
-	}
-	return ret;
-}
-
 static int _req_append_segment(struct osd_request *or,
 	unsigned padding, struct _osd_req_data_segment *seg,
 	struct _osd_req_data_segment *last_seg, struct _osd_io_info *io)
@@ -861,14 +841,14 @@ static int _req_append_segment(struct osd_request *or,
 		else
 			pad_buff = io->pad_buff;
 
-		ret = _append_map_kern(io->req, pad_buff, padding,
+		ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding,
 				       or->alloc_flags);
 		if (ret)
 			return ret;
 		io->total_bytes += padding;
 	}
 
-	ret = _append_map_kern(io->req, seg->buff, seg->total_bytes,
+	ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes,
 			       or->alloc_flags);
 	if (ret)
 		return ret;
-- 
cgit v0.10.2


From 79eb63e9e5875b84341a3a05f8e6ae9cdb4bb6f6 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:57:15 +0300
Subject: block: Add blk_make_request(), takes bio, returns a request

New block API:
given a struct bio allocates a new request. This is the parallel of
generic_make_request for BLOCK_PC commands users.

The passed bio may be a chained-bio. The bio is bounced if needed
inside the call to this member.

This is in the effort of un-exporting blk_rq_append_bio().

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
CC: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index e3f7e6a..bec1d69 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -891,6 +891,51 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 EXPORT_SYMBOL(blk_get_request);
 
 /**
+ * blk_make_request - given a bio, allocate a corresponding struct request.
+ *
+ * @bio:  The bio describing the memory mappings that will be submitted for IO.
+ *        It may be a chained-bio properly constructed by block/bio layer.
+ *
+ * blk_make_request is the parallel of generic_make_request for BLOCK_PC
+ * type commands. Where the struct request needs to be farther initialized by
+ * the caller. It is passed a &struct bio, which describes the memory info of
+ * the I/O transfer.
+ *
+ * The caller of blk_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffers. That bio_data_dir() will return
+ * the needed direction of the request. (And all bio's in the passed bio-chain
+ * are properly set accordingly)
+ *
+ * If called under none-sleepable conditions, mapped bio buffers must not
+ * need bouncing, by calling the appropriate masked or flagged allocator,
+ * suitable for the target device. Otherwise the call to blk_queue_bounce will
+ * BUG.
+ */
+struct request *blk_make_request(struct request_queue *q, struct bio *bio,
+				 gfp_t gfp_mask)
+{
+	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
+
+	if (unlikely(!rq))
+		return ERR_PTR(-ENOMEM);
+
+	for_each_bio(bio) {
+		struct bio *bounce_bio = bio;
+		int ret;
+
+		blk_queue_bounce(q, &bounce_bio);
+		ret = blk_rq_append_bio(q, rq, bounce_bio);
+		if (unlikely(ret)) {
+			blk_put_request(rq);
+			return ERR_PTR(ret);
+		}
+	}
+
+	return rq;
+}
+EXPORT_SYMBOL(blk_make_request);
+
+/**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
  * @rq:		request to be inserted
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f9d60a7..88a83e1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -740,6 +740,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
+extern struct request *blk_make_request(struct request_queue *, struct bio *,
+					gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
-- 
cgit v0.10.2


From c29b70f6ee4f2fa3ef07f55bc9082945861e5391 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:58:41 +0300
Subject: libosd: Use of new blk_make_request

Use new blk_make_request() to allocate a request from bio
and avoid using deprecated blk_rq_append_bio().

This patch is dependent on a block layer patch titled:
    [BLOCK] New blk_make_request() takes bio returns request

This is the last usage of blk_rq_append_bio in osd, it can now
be un-exported.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
CC: Jeff Garzik <jeff@garzik.org>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index bf66e30..865ec0f 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1200,6 +1200,21 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
 /*
  * osd_finalize_request and helpers
  */
+static struct request *_make_request(struct request_queue *q, bool has_write,
+			      struct _osd_io_info *oii, gfp_t flags)
+{
+	if (oii->bio)
+		return blk_make_request(q, oii->bio, flags);
+	else {
+		struct request *req;
+
+		req = blk_get_request(q, has_write ? WRITE : READ, flags);
+		if (unlikely(!req))
+			return ERR_PTR(-ENOMEM);
+
+		return req;
+	}
+}
 
 static int _init_blk_request(struct osd_request *or,
 	bool has_in, bool has_out)
@@ -1208,11 +1223,13 @@ static int _init_blk_request(struct osd_request *or,
 	struct scsi_device *scsi_device = or->osd_dev->scsi_device;
 	struct request_queue *q = scsi_device->request_queue;
 	struct request *req;
-	int ret = -ENOMEM;
+	int ret;
 
-	req = blk_get_request(q, has_out, flags);
-	if (!req)
+	req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags);
+	if (IS_ERR(req)) {
+		ret = PTR_ERR(req);
 		goto out;
+	}
 
 	or->request = req;
 	req->cmd_type = REQ_TYPE_BLOCK_PC;
@@ -1225,9 +1242,10 @@ static int _init_blk_request(struct osd_request *or,
 		or->out.req = req;
 		if (has_in) {
 			/* allocate bidi request */
-			req = blk_get_request(q, READ, flags);
-			if (!req) {
+			req = _make_request(q, false, &or->in, flags);
+			if (IS_ERR(req)) {
 				OSD_DEBUG("blk_get_request for bidi failed\n");
+				ret = PTR_ERR(req);
 				goto out;
 			}
 			req->cmd_type = REQ_TYPE_BLOCK_PC;
@@ -1271,26 +1289,6 @@ int osd_finalize_request(struct osd_request *or,
 		return ret;
 	}
 
-	if (or->out.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->out.req,
-					or->out.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio out failed\n");
-			return ret;
-		}
-		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->out.total_bytes), blk_rq_bytes(or->out.req));
-	}
-	if (or->in.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio in failed\n");
-			return ret;
-		}
-		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->in.total_bytes), blk_rq_bytes(or->in.req));
-	}
-
 	or->out.pad_buff = sg_out_pad_buffer;
 	or->in.pad_buff = sg_in_pad_buffer;
 
-- 
cgit v0.10.2


From a411f4bbb89f1f08687b344064d6775bce1e4658 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 19:00:01 +0300
Subject: block: Un-export blk_rq_append_bio

OSD was the last in-tree user of blk_rq_append_bio(). Now
that it is fixed blk_rq_append_bio is un-exported and
is only used internally by block layer.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-map.c b/block/blk-map.c
index caa05a6..ef2492a 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -24,7 +24,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(blk_rq_append_bio);
 
 static int __blk_rq_unmap_user(struct bio *bio)
 {
diff --git a/block/blk.h b/block/blk.h
index 9e0042c..c863ec2 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,8 @@ extern struct kobj_type blk_queue_ktype;
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
+int blk_rq_append_bio(struct request_queue *q, struct request *rq,
+		      struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 88a83e1..564445b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -758,12 +758,6 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
 /*
- * Temporary export, until SCSI gets fixed up.
- */
-extern int blk_rq_append_bio(struct request_queue *q, struct request *rq,
-			     struct bio *bio);
-
-/*
  * A queue has just exitted congestion.  Note this in the global counter of
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
-- 
cgit v0.10.2


From 4aee2ad461889132bfb5a1518a9580d00e17008c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 19 May 2009 17:07:01 +0530
Subject: x86: asm/processor.h: remove double declaration

Remove double declaration of:

 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;

they are already defined in the same file.

[ Impact: cleanup ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1242733021.3377.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 34c5237..85628ea 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -403,9 +403,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
 extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
 
 struct thread_struct {
 	/* Cached TLS descriptors: */
-- 
cgit v0.10.2


From 4c6f18fc81565967da20f2d4a3922cdba33f8e2b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 18 May 2009 10:23:28 -0700
Subject: x86, io-apic: Don't mark pin_programmed early

Peter bisected that:

| commit b9c61b70075c87a8612624736faf4a2de5b1ed30
| Date:   Wed May 6 10:10:06 2009 -0700
|
|     x86/pci: update pirq_enable_irq() to setup io apic routing
|
|     So we can set io apic routing only when enabling the device irq.

wrecked his opteron box, ata1 interrupts fail to get through.

ata1 is using irq 11:

[    1.451839] sata_svw 0000:01:0e.0: version 2.3
[    1.456333] sata_svw 0000:01:0e.0: PCI INT A -> GSI 11 (level, low) -> IRQ 11
[    1.463639] scsi0 : sata_svw
[    1.466949] scsi1 : sata_svw
[    1.470022] scsi2 : sata_svw
[    1.473090] scsi3 : sata_svw
[    1.476112] ata1: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe000 irq 11
[    1.483490] ata2: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe100 irq 11
[    1.490870] ata3: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe200 irq 11
[    1.498247] ata4: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe300 irq 11

that pin is overlapped with pin with legacy ones.

We should not set bits in pin_programmed here, so that those bit could
be set later via io_apic_set_pci_routing().

[ Impact: fix boot hang on certain systems ]

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
Tested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <4A119990.9020606@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ce1ac74..ac7f3b6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1537,7 +1537,10 @@ static void __init setup_IO_APIC_irqs(void)
 		}
 		cfg = desc->chip_data;
 		add_pin_to_irq_node(cfg, node, apic_id, pin);
-		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+		/*
+		 * don't mark it in pin_programmed, so later acpi could
+		 * set it correctly when irq < 16
+		 */
 		setup_IO_APIC_irq(apic_id, pin, irq, desc,
 				irq_trigger(idx), irq_polarity(idx));
 	}
-- 
cgit v0.10.2


From ef9e8b14a5c1d0afbaf12b4c3b271188ddfc52a4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 14:25:16 +0100
Subject: GFS2: Don't warn when delete inode fails on ro filesystem

If the filesystem is read-only, then we expect that delete inode
will fail, so there is no need to warn about it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index a3c2272..2fd1dcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -714,7 +714,7 @@ out_unlock:
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
-	if (error && error != GLR_TRYFAILED)
+	if (error && error != GLR_TRYFAILED && error != -EROFS)
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
-- 
cgit v0.10.2


From c5642f4bbae30122beb696e723f6da273caa570e Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Tue, 19 May 2009 09:02:23 -0400
Subject: selinux: remove obsolete read buffer limit from sel_read_bool

On Tue, 2009-05-19 at 00:05 -0400, Eamon Walsh wrote:
> Recent versions of coreutils have bumped the read buffer size from 4K to
> 32K in several of the utilities.
>
> This means that "cat /selinux/booleans/xserver_object_manager" no longer
> works, it returns "Invalid argument" on F11.  getsebool works fine.
>
> sel_read_bool has a check for "count > PAGE_SIZE" that doesn't seem to
> be present in the other read functions.  Maybe it could be removed?

Yes, that check is obsoleted by the conversion of those functions to
using simple_read_from_buffer(), which will reduce count if necessary to
what is available in the buffer.

Signed-off-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 8d4007f..b4fc506 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -803,10 +803,6 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
 		goto out;
 	}
 
-	if (count > PAGE_SIZE) {
-		ret = -EINVAL;
-		goto out;
-	}
 	page = (char *)get_zeroed_page(GFP_KERNEL);
 	if (!page) {
 		ret = -ENOMEM;
-- 
cgit v0.10.2


From 8b6427a2a8f7dd43e9208fb33a3b116d66db4979 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 19 May 2009 09:57:03 -0400
Subject: cifs: fix pointer initialization and checks in cifs_follow_symlink
 (try #4)

This is the third respin of the patch posted yesterday to fix the error
handling in cifs_follow_symlink. It also includes a fix for a bogus NULL
pointer check in CIFSSMBQueryUnixSymLink that Jeff Moyer spotted.

It's possible for CIFSSMBQueryUnixSymLink to return without setting
target_path to a valid pointer. If that happens then the current value
to which we're initializing this pointer could cause an oops when it's
kfree'd.

This patch is a little more comprehensive than the last patches. It
reorganizes cifs_follow_link a bit for (hopefully) better readability.
It should also eliminate the uneeded allocation of full_path on servers
without unix extensions (assuming they can get to this point anyway, of
which I'm not convinced).

On a side note, I'm not sure I agree with the logic of enabling this
query even when unix extensions are disabled on the client. It seems
like that should disable this as well. But, changing that is outside the
scope of this fix, so I've left it alone for now.

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@inraded.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5759ba5..d062602 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2475,7 +2475,7 @@ querySymLinkRetry:
 			/* BB FIXME investigate remapping reserved chars here */
 			*symlinkinfo = cifs_strndup_from_ucs(data_start, count,
 						    is_unicode, nls_codepage);
-			if (!symlinkinfo)
+			if (!*symlinkinfo)
 				rc = -ENOMEM;
 		}
 	}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index ea9d11e..cd83c53 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -107,48 +107,48 @@ void *
 cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 {
 	struct inode *inode = direntry->d_inode;
-	int rc = -EACCES;
+	int rc = -ENOMEM;
 	int xid;
 	char *full_path = NULL;
-	char *target_path = ERR_PTR(-ENOMEM);
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	char *target_path = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
 
 	xid = GetXid();
 
-	full_path = build_path_from_dentry(direntry);
+	/*
+	 * For now, we just handle symlinks with unix extensions enabled.
+	 * Eventually we should handle NTFS reparse points, and MacOS
+	 * symlink support. For instance...
+	 *
+	 * rc = CIFSSMBQueryReparseLinkInfo(...)
+	 *
+	 * For now, just return -EACCES when the server doesn't support posix
+	 * extensions. Note that we still allow querying symlinks when posix
+	 * extensions are manually disabled. We could disable these as well
+	 * but there doesn't seem to be any harm in allowing the client to
+	 * read them.
+	 */
+	if (!(tcon->ses->capabilities & CAP_UNIX)) {
+		rc = -EACCES;
+		goto out;
+	}
 
+	full_path = build_path_from_dentry(direntry);
 	if (!full_path)
 		goto out;
 
 	cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
-	/* We could change this to:
-		if (pTcon->unix_ext)
-	   but there does not seem any point in refusing to
-	   get symlink info if we can, even if unix extensions
-	   turned off for this mount */
-
-	if (pTcon->ses->capabilities & CAP_UNIX)
-		rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
-					     &target_path,
-					     cifs_sb->local_nls);
-	else {
-		/* BB add read reparse point symlink code here */
-		/* rc = CIFSSMBQueryReparseLinkInfo */
-		/* BB Add code to Query ReparsePoint info */
-		/* BB Add MAC style xsymlink check here if enabled */
-	}
 
+	rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
+				     cifs_sb->local_nls);
+	kfree(full_path);
+out:
 	if (rc != 0) {
 		kfree(target_path);
 		target_path = ERR_PTR(rc);
 	}
 
-	kfree(full_path);
-out:
 	FreeXid(xid);
 	nd_set_link(nd, target_path);
 	return NULL;
-- 
cgit v0.10.2


From 8e7d2b2c6ecd3c21a54b877eae3d5be48292e6b5 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Fri, 8 May 2009 16:13:25 -0700
Subject: drm/i915: allocate large pointer arrays with vmalloc

For awhile now, many of the GEM code paths have allocated page or
object arrays with the slab allocator.  This is nice and fast, but
won't work well if memory is fragmented, since the slab allocator works
with physically contiguous memory (i.e. order > 2 allocations are
likely to fail fairly early after booting and doing some work).

This patch works around the issue by falling back to vmalloc for
>PAGE_SIZE allocations.  This is ugly, but much less work than chaining
a bunch of pages together by hand (suprisingly there's not a bunch of
generic kernel helpers for this yet afaik).  vmalloc space is somewhat
precious on 32 bit kernels, but our allocations shouldn't be big enough
to cause problems, though they're routinely more than a page.

Note that this patch doesn't address the unchecked
alloc-based-on-ioctl-args in GEM; that needs to be fixed in a separate
patch.

Also, I've deliberately ignored the DRM's "area" junk.  I don't think
anyone actually uses it anymore and I'm hoping it gets ripped out soon.

[Updated: removed size arg to new free function.  We could unify the
free functions as well once the DRM mem tracking is ripped out.]

fd.o bug #20152 (part 1/3)

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b189b49..4a24c90 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -349,7 +349,7 @@ i915_gem_shmem_pread_slow(struct drm_device *dev, struct drm_gem_object *obj,
 	last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
 	num_pages = last_data_page - first_data_page + 1;
 
-	user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+	user_pages = drm_calloc_large(num_pages, sizeof(struct page *));
 	if (user_pages == NULL)
 		return -ENOMEM;
 
@@ -429,7 +429,7 @@ fail_put_user_pages:
 		SetPageDirty(user_pages[i]);
 		page_cache_release(user_pages[i]);
 	}
-	kfree(user_pages);
+	drm_free_large(user_pages);
 
 	return ret;
 }
@@ -649,7 +649,7 @@ i915_gem_gtt_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj,
 	last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
 	num_pages = last_data_page - first_data_page + 1;
 
-	user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+	user_pages = drm_calloc_large(num_pages, sizeof(struct page *));
 	if (user_pages == NULL)
 		return -ENOMEM;
 
@@ -719,7 +719,7 @@ out_unlock:
 out_unpin_pages:
 	for (i = 0; i < pinned_pages; i++)
 		page_cache_release(user_pages[i]);
-	kfree(user_pages);
+	drm_free_large(user_pages);
 
 	return ret;
 }
@@ -824,7 +824,7 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj,
 	last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
 	num_pages = last_data_page - first_data_page + 1;
 
-	user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+	user_pages = drm_calloc_large(num_pages, sizeof(struct page *));
 	if (user_pages == NULL)
 		return -ENOMEM;
 
@@ -902,7 +902,7 @@ fail_unlock:
 fail_put_user_pages:
 	for (i = 0; i < pinned_pages; i++)
 		page_cache_release(user_pages[i]);
-	kfree(user_pages);
+	drm_free_large(user_pages);
 
 	return ret;
 }
@@ -1408,9 +1408,7 @@ i915_gem_object_put_pages(struct drm_gem_object *obj)
 		}
 	obj_priv->dirty = 0;
 
-	drm_free(obj_priv->pages,
-		 page_count * sizeof(struct page *),
-		 DRM_MEM_DRIVER);
+	drm_free_large(obj_priv->pages);
 	obj_priv->pages = NULL;
 }
 
@@ -2024,8 +2022,7 @@ i915_gem_object_get_pages(struct drm_gem_object *obj)
 	 */
 	page_count = obj->size / PAGE_SIZE;
 	BUG_ON(obj_priv->pages != NULL);
-	obj_priv->pages = drm_calloc(page_count, sizeof(struct page *),
-				     DRM_MEM_DRIVER);
+	obj_priv->pages = drm_calloc_large(page_count, sizeof(struct page *));
 	if (obj_priv->pages == NULL) {
 		DRM_ERROR("Faled to allocate page list\n");
 		obj_priv->pages_refcount--;
@@ -3111,7 +3108,7 @@ i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object *exec_list,
 		reloc_count += exec_list[i].relocation_count;
 	}
 
-	*relocs = drm_calloc(reloc_count, sizeof(**relocs), DRM_MEM_DRIVER);
+	*relocs = drm_calloc_large(reloc_count, sizeof(**relocs));
 	if (*relocs == NULL)
 		return -ENOMEM;
 
@@ -3125,8 +3122,7 @@ i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object *exec_list,
 				     exec_list[i].relocation_count *
 				     sizeof(**relocs));
 		if (ret != 0) {
-			drm_free(*relocs, reloc_count * sizeof(**relocs),
-				 DRM_MEM_DRIVER);
+			drm_free_large(*relocs);
 			*relocs = NULL;
 			return -EFAULT;
 		}
@@ -3165,7 +3161,7 @@ i915_gem_put_relocs_to_user(struct drm_i915_gem_exec_object *exec_list,
 	}
 
 err:
-	drm_free(relocs, reloc_count * sizeof(*relocs), DRM_MEM_DRIVER);
+	drm_free_large(relocs);
 
 	return ret;
 }
@@ -3198,10 +3194,8 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 	/* Copy in the exec list from userland */
-	exec_list = drm_calloc(sizeof(*exec_list), args->buffer_count,
-			       DRM_MEM_DRIVER);
-	object_list = drm_calloc(sizeof(*object_list), args->buffer_count,
-				 DRM_MEM_DRIVER);
+	exec_list = drm_calloc_large(sizeof(*exec_list), args->buffer_count);
+	object_list = drm_calloc_large(sizeof(*object_list), args->buffer_count);
 	if (exec_list == NULL || object_list == NULL) {
 		DRM_ERROR("Failed to allocate exec or object list "
 			  "for %d buffers\n",
@@ -3462,10 +3456,8 @@ err:
 	}
 
 pre_mutex_err:
-	drm_free(object_list, sizeof(*object_list) * args->buffer_count,
-		 DRM_MEM_DRIVER);
-	drm_free(exec_list, sizeof(*exec_list) * args->buffer_count,
-		 DRM_MEM_DRIVER);
+	drm_free_large(object_list);
+	drm_free_large(exec_list);
 	drm_free(cliprects, sizeof(*cliprects) * args->num_cliprects,
 		 DRM_MEM_DRIVER);
 
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index c8c4221..b84d8ae 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1519,6 +1519,30 @@ static __inline__ void *drm_calloc(size_t nmemb, size_t size, int area)
 {
 	return kcalloc(nmemb, size, GFP_KERNEL);
 }
+
+static __inline__ void *drm_calloc_large(size_t nmemb, size_t size)
+{
+	u8 *addr;
+
+	if (size <= PAGE_SIZE)
+	    return kcalloc(nmemb, size, GFP_KERNEL);
+
+	addr = vmalloc(nmemb * size);
+	if (!addr)
+		return NULL;
+
+	memset(addr, 0, nmemb * size);
+
+	return addr;
+}
+
+static __inline void drm_free_large(void *ptr)
+{
+	if (!is_vmalloc_addr(ptr))
+		return kfree(ptr);
+
+	vfree(ptr);
+}
 #else
 extern void *drm_alloc(size_t size, int area);
 extern void drm_free(void *pt, size_t size, int area);
-- 
cgit v0.10.2


From 53674ac5a997a8eedbb2dfdc308b895170746c8b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 19 May 2009 19:52:35 +0200
Subject: block: add warning to blk_make_request()

Add a note about how one needs to be careful when setting up these bio
chains.

Extracted from Boaz's updated patch.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index bec1d69..4906507 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -910,6 +910,15 @@ EXPORT_SYMBOL(blk_get_request);
  * need bouncing, by calling the appropriate masked or flagged allocator,
  * suitable for the target device. Otherwise the call to blk_queue_bounce will
  * BUG.
+ *
+ * WARNING: When allocating/cloning a bio-chain, careful consideration should be
+ * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
+ * anything but the first bio in the chain. Otherwise you risk waiting for IO
+ * completion of a bio that hasn't been submitted yet, thus resulting in a
+ * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
+ * of bio_alloc(), as that avoids the mempool deadlock.
+ * If possible a big IO should be split into smaller parts when allocation
+ * fails. Partial allocation should not be an error, or you risk a live-lock.
  */
 struct request *blk_make_request(struct request_queue *q, struct bio *bio,
 				 gfp_t gfp_mask)
-- 
cgit v0.10.2


From ac36552a52a6ec8563ac0a109e2a0935673f4abb Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 19 May 2009 19:54:09 +0200
Subject: scsi_lib: remove unused variable

The last request completion cleanup in scsi_lib left an unused
this_count variable in scsi_io_completion().
(It was used before in a code segment that now uses blk_end_request_all())

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e410d66..d7c6c75 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -706,7 +706,6 @@ EXPORT_SYMBOL(scsi_release_buffers);
 void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 {
 	int result = cmd->result;
-	int this_count;
 	struct request_queue *q = cmd->device->request_queue;
 	struct request *req = cmd->request;
 	int error = 0;
@@ -789,7 +788,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	 */
 	if (scsi_end_request(cmd, error, good_bytes, result == 0) == NULL)
 		return;
-	this_count = blk_rq_bytes(req);
 
 	error = -EIO;
 
-- 
cgit v0.10.2


From b126d113484886d900179b4ae55ea7fabe15f936 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 30 Apr 2009 14:48:36 +0100
Subject: mfd: Keep a cache of WM8350 volatile values

Due to the way that the WM8350 audio driver handles CODEC_ENA many of
the WM8350 audio registers are marked as volatile when they aren't
actually so. Allow the audio driver to see a cache of these values for
inspection during interrupt context.

To do this we need to stop satisfying any bits from volatile registers
from cache - there's no real benefit from doing so anyway, we did the
read already.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index c2be308..fe24079 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -79,10 +79,6 @@ static int wm8350_phys_read(struct wm8350 *wm8350, u8 reg, int num_regs,
 		/* Cache is CPU endian */
 		dest[i - reg] = be16_to_cpu(dest[i - reg]);
 
-		/* Satisfy non-volatile bits from cache */
-		dest[i - reg] &= wm8350_reg_io_map[i].vol;
-		dest[i - reg] |= wm8350->reg_cache[i];
-
 		/* Mask out non-readable bits */
 		dest[i - reg] &= wm8350_reg_io_map[i].readable;
 	}
@@ -182,9 +178,6 @@ static int wm8350_write(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *src)
 			(wm8350->reg_cache[i] & ~wm8350_reg_io_map[i].writable)
 			| src[i - reg];
 
-		/* Don't store volatile bits */
-		wm8350->reg_cache[i] &= ~wm8350_reg_io_map[i].vol;
-
 		src[i - reg] = cpu_to_be16(src[i - reg]);
 	}
 
@@ -1261,7 +1254,6 @@ static int wm8350_create_cache(struct wm8350 *wm8350, int type, int mode)
 		    (i < WM8350_CLOCK_CONTROL_1 || i > WM8350_AIF_TEST)) {
 			value = be16_to_cpu(wm8350->reg_cache[i]);
 			value &= wm8350_reg_io_map[i].readable;
-			value &= ~wm8350_reg_io_map[i].vol;
 			wm8350->reg_cache[i] = value;
 		} else
 			wm8350->reg_cache[i] = reg_map[i];
-- 
cgit v0.10.2


From f43ab901005c2bb3c5440e91b6efae9f5db02e7c Mon Sep 17 00:00:00 2001
From: Nelson Castillo <arhuaco@freaks-unidos.net>
Date: Tue, 12 May 2009 13:26:47 -0700
Subject: mfd: pcf50633: fix unsafe disable_irq()

Without this change Openmoko Freerunner (GTA02) bootstrap will deadlock.
As pointed out in other patches this issue is in the wild since the merge
of:

: commit 3aa551c9b4c40018f0e261a178e3d25478dc04a9
: Author: Thomas Gleixner <tglx@linutronix.de>
: Date:   Mon Mar 23 18:28:15 2009 +0100
:
:    genirq: add threaded interrupt handler support
:
:    Add support for threaded interrupt handlers

Signed-off-by: Nelson Castillo <arhuaco@freaks-unidos.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <balajirrao@openmoko.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>

diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c
index 7793932..11a6248 100644
--- a/drivers/mfd/pcf50633-core.c
+++ b/drivers/mfd/pcf50633-core.c
@@ -443,7 +443,7 @@ static irqreturn_t pcf50633_irq(int irq, void *data)
 	dev_dbg(pcf->dev, "pcf50633_irq\n");
 
 	get_device(pcf->dev);
-	disable_irq(pcf->irq);
+	disable_irq_nosync(pcf->irq);
 	schedule_work(&pcf->irq_work);
 
 	return IRQ_HANDLED;
-- 
cgit v0.10.2


From 33252572e727ccdcc54efdb67157e7ab3d6942db Mon Sep 17 00:00:00 2001
From: Nico Schottelius <nico-linux-20090330@schottelius.org>
Date: Sat, 16 May 2009 14:00:56 +0200
Subject: Fix scripts/setlocalversion with tagged git commit

Produce correct output for
- tagged commit (v2.6.30-rc6)
- past tagged commit (v2.6.30-rc5-299-g7c7327d)
- no tag

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/scripts/setlocalversion b/scripts/setlocalversion
index 32c8554..0079047 100755
--- a/scripts/setlocalversion
+++ b/scripts/setlocalversion
@@ -1,5 +1,13 @@
 #!/bin/sh
-# Print additional version information for non-release trees.
+#
+# This scripts adds local version information from the version
+# control systems git, mercurial (hg) and subversion (svn).
+#
+# If something goes wrong, send a mail the kernel build mailinglist
+# (see MAINTAINERS) and CC Nico Schottelius
+# <nico-linuxsetlocalversion -at- schottelius.org>.
+#
+#
 
 usage() {
 	echo "Usage: $0 [srctree]" >&2
@@ -10,12 +18,20 @@ cd "${1:-.}" || usage
 
 # Check for git and a git repo.
 if head=`git rev-parse --verify --short HEAD 2>/dev/null`; then
-	# Do we have an untagged tag?
-	if atag=`git describe 2>/dev/null`; then
-		echo "$atag" | awk -F- '{printf("-%05d-%s", $(NF-1),$(NF))}'
-	# add -g${head}, if there is no usable tag
-	else
-		printf '%s%s' -g $head
+
+	# If we are at a tagged commit (like "v2.6.30-rc6"), we ignore it,
+	# because this version is defined in the top level Makefile.
+	if [ -z "`git describe --exact-match 2>/dev/null`" ]; then
+
+		# If we are past a tagged commit (like "v2.6.30-rc5-302-g72357d5"),
+		# we pretty print it.
+		if atag="`git describe 2>/dev/null`"; then
+			echo "$atag" | awk -F- '{printf("-%05d-%s", $(NF-1),$(NF))}'
+
+		# If we don't have a tag at all we print -g{commitish}.
+		else
+			printf '%s%s' -g $head
+		fi
 	fi
 
 	# Is this git on svn?
-- 
cgit v0.10.2


From 64d1304a64477629cb16b75491a77bafe6f86963 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 May 2009 21:20:10 +0200
Subject: futex: setup writeable mapping for futex ops which modify user space
 data

The futex code installs a read only mapping via get_user_pages_fast()
even if the futex op function has to modify user space data. The
eventual fault was fixed up by futex_handle_fault() which walked the
VMA with mmap_sem held.

After the cleanup patches which removed the mmap_sem dependency of the
futex code commit 4dc5b7a36a49eff97050894cf1b3a9a02523717 (futex:
clean up fault logic) removed the private VMA walk logic from the
futex code. This change results in a stale RO mapping which is not
fixed up.

Instead of reintroducing the previous fault logic we set up the
mapping in get_user_pages_fast() read/write for all operations which
modify user space data. Also handle private futexes in the same way
and make the current unconditional access_ok(VERIFY_WRITE) depend on
the futex op.

Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
CC: stable@kernel.org

diff --git a/kernel/futex.c b/kernel/futex.c
index eef8cd2..d546b2d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -193,6 +193,7 @@ static void drop_futex_key_refs(union futex_key *key)
  * @uaddr: virtual address of the futex
  * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
  * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
  *
  * Returns a negative error code or 0
  * The key words are stored in *key on success.
@@ -203,7 +204,8 @@ static void drop_futex_key_refs(union futex_key *key)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -226,7 +228,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 	 *        but access_ok() should be faster than find_vma()
 	 */
 	if (!fshared) {
-		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+		if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
 			return -EFAULT;
 		key->private.mm = mm;
 		key->private.address = address;
@@ -235,7 +237,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 	}
 
 again:
-	err = get_user_pages_fast(address, 1, 0, &page);
+	err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
 	if (err < 0)
 		return err;
 
@@ -677,7 +679,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -723,10 +725,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	int ret, op_ret;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -814,10 +816,10 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	int ret, drop_count = 0;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -1140,7 +1142,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	q.bitset = bitset;
 retry:
 	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1330,7 +1332,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	q.pi_state = NULL;
 retry:
 	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1594,7 +1596,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
-- 
cgit v0.10.2


From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:08:41 +0200
Subject: perf_counter: fix counter freeing logic

Fix counter lifetime bugs which explain the crashes reported by
Marcelo Tosatti and Arnaldo Carvalho de Melo.

The new rule is: flushing + freeing is only done for a task's
own counters, never for other tasks.

[ Impact: fix crashes/lockups with inherited counters ]

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/exit.c b/kernel/exit.c
index 73affd3..f9dfedd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,12 +178,6 @@ repeat:
 
 	proc_flush_task(p);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
@@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_put(tsk->mempolicy);
@@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59a926d..7af16d1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter *child_counter, *tmp;
 	struct perf_counter_context *child_ctx;
 
+	WARN_ON_ONCE(child != current);
+
 	child_ctx = &child->perf_counter_ctx;
 
 	if (likely(!child_ctx->nr_counters))
-- 
cgit v0.10.2


From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:24:08 +0200
Subject: perf_counter: fix counter inheritance race

Context rotation should not occur when we are in the middle of
walking the counter list when inheriting counters ...

[ Impact: fix occasionally incorrect perf stat results ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc2..13cb2fb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,6 +508,7 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
+	int			rr_allowed;
 	struct task_struct	*task;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7af16d1..4d8f973 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	rotate_ctx(ctx);
+	if (ctx->rr_allowed)
+		rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
@@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	INIT_LIST_HEAD(&ctx->event_list);
+	ctx->rr_allowed = 1;
 	ctx->task = task;
 }
 
@@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child)
 	 */
 	mutex_lock(&parent_ctx->mutex);
 
+	parent_ctx->rr_allowed = 0;
+	barrier(); /* irqs */
+
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
@@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child)
 			break;
 	}
 
+	barrier(); /* irqs */
+	parent_ctx->rr_allowed = 1;
+
 	mutex_unlock(&parent_ctx->mutex);
 }
 
-- 
cgit v0.10.2


From b3bad72e494fb2ff0c81be4ca2ddb94adf6a47c2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 17 May 2009 20:17:06 +0200
Subject: PCI PM: Fix initialization and kexec breakage for some devices

Recent PCI PM changes introduced a bug that causes some devices to be
mishandled after kexec and during early initialization.  The failure
scenario in the kexec case is the following:

* Assume a PCI device is not power-manageable by the platform and has
  PCI_PM_CTRL_NO_SOFT_RESET set in PMCSR.
* The device is put into D3 before kexec (using the native PCI PM).
* After kexec, pci_setup_device() sets the device's power state to
  PCI_UNKNOWN.
* pci_set_power_state(dev, PCI_D0) is called by the device's driver.
* __pci_start_power_transition(dev, PCI_D0) is called and since the
  device is not power-manageable by the platform, it causes
  pci_update_current_state(dev, PCI_D0) to be called.  As a result
  the device's current_state field is updated to PCI_D3, in
  accordance with the contents of its PCI PM registers.
* pci_raw_set_power_state() is called and it changes the device power
  state to D0.  *However*, it should also call pci_restore_bars() to
  reinitialize the device, but it doesn't, because the device's
  current_state field has been modified earlier.

To prevent this from happening, modify pci_platform_power_transition()
so that it doesn't use pci_update_current_state() to update the
current_state field for devices that aren't power-manageable by the
platform.  Instead, this field should be updated directly for devices
that don't support the native PCI PM.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 34bf0fd..1a91bf9 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -557,7 +557,8 @@ static int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state)
 	} else {
 		error = -ENODEV;
 		/* Fall back to PCI_D0 if native PM is not supported */
-		pci_update_current_state(dev, PCI_D0);
+		if (!dev->pm_cap)
+			dev->current_state = PCI_D0;
 	}
 
 	return error;
-- 
cgit v0.10.2


From fbaa58696cef848de818768783ef185bd3f05158 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 13 May 2009 12:50:40 -0400
Subject: TPM: get_event_name stack corruption

get_event_name uses sprintf to fill a buffer declared on the stack.  It fills
the buffer 2 bytes at a time.  What the code doesn't take into account is that
sprintf(buf, "%02x", data) actually writes 3 bytes.  2 bytes for the data and
then it nul terminates the string.  Since we declare buf to be 40 characters
long and then we write 40 bytes of data into buf sprintf is going to write 41
characters.  The fix is to leave room in buf for the nul terminator.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c
index ed306eb..0c2f55a 100644
--- a/drivers/char/tpm/tpm_bios.c
+++ b/drivers/char/tpm/tpm_bios.c
@@ -212,7 +212,8 @@ static int get_event_name(char *dest, struct tcpa_event *event,
 			unsigned char * event_entry)
 {
 	const char *name = "";
-	char data[40] = "";
+	/* 41 so there is room for 40 data and 1 nul */
+	char data[41] = "";
 	int i, n_len = 0, d_len = 0;
 	struct tcpa_pc_event *pc_event;
 
-- 
cgit v0.10.2


From b674137755bbe2750f997a2a1264db3cdf8abcb3 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 18 May 2009 11:56:16 +1000
Subject: drm: Round size of SHM maps to PAGE_SIZE

Currently, userspace can fail to obtain the SAREA mapping (among other
reasons) if it passes SAREA_MAX to drmAddMap without aligning it to the
page size. This breaks for example on PowerPC with 64K pages and radeon
despite the kernel radeon actually doing the right rouding in the first
place.

The way SAREA_MAX is defined with a bunch of ifdef's and duplicated
between libdrm and the X server is gross, ultimately it should be
retrieved by userspace from the kernel, but in the meantime, we have
plenty of existing userspace built with bad values that need to work.

This patch works around broken userspace by rounding the requested size
in drm_addmap_core() of any SHM map to the page size. Since the backing
memory for SHM maps is also allocated within addmap_core, there is no
danger of adjacent memory being exposed due to the increased map size.
The only side effect is that drivers that previously tried to create or
access SHM maps using a size < PAGE_SIZE and failed (getting -EINVAL),
will now succeed at the cost of a little bit more memory used if that
happens to be when the map is created.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c
index 6d80d17..0411d91 100644
--- a/drivers/gpu/drm/drm_bufs.c
+++ b/drivers/gpu/drm/drm_bufs.c
@@ -170,6 +170,14 @@ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset,
 	}
 	DRM_DEBUG("offset = 0x%08llx, size = 0x%08lx, type = %d\n",
 		  (unsigned long long)map->offset, map->size, map->type);
+
+	/* page-align _DRM_SHM maps. They are allocated here so there is no security
+	 * hole created by that and it works around various broken drivers that use
+	 * a non-aligned quantity to map the SAREA. --BenH
+	 */
+	if (map->type == _DRM_SHM)
+		map->size = PAGE_ALIGN(map->size);
+
 	if ((map->offset & (~(resource_size_t)PAGE_MASK)) || (map->size & (~PAGE_MASK))) {
 		drm_free(map, sizeof(*map), DRM_MEM_MAPS);
 		return -EINVAL;
-- 
cgit v0.10.2


From ff71338ed31398384b2e5992623d52f9aaba1da1 Mon Sep 17 00:00:00 2001
From: Daniel Ribeiro <drwyrm@gmail.com>
Date: Fri, 15 May 2009 06:33:50 -0300
Subject: [ARM] pxa/ezx: fix pin configuration for low power mode

Fix LPM configuration on ezx.c

Signed-off-by: Daniel Ribeiro <drwyrm@gmail.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/ezx.c b/arch/arm/mach-pxa/ezx.c
index 92ba16e..7db966d 100644
--- a/arch/arm/mach-pxa/ezx.c
+++ b/arch/arm/mach-pxa/ezx.c
@@ -111,9 +111,9 @@ static unsigned long ezx_pin_config[] __initdata = {
 	GPIO25_SSP1_TXD,
 	GPIO26_SSP1_RXD,
 	GPIO24_GPIO,				/* pcap chip select */
-	GPIO1_GPIO,				/* pcap interrupt */
-	GPIO4_GPIO,				/* WDI_AP */
-	GPIO55_GPIO,				/* SYS_RESTART */
+	GPIO1_GPIO | WAKEUP_ON_EDGE_RISE,	/* pcap interrupt */
+	GPIO4_GPIO | MFP_LPM_DRIVE_HIGH,	/* WDI_AP */
+	GPIO55_GPIO | MFP_LPM_DRIVE_HIGH,	/* SYS_RESTART */
 
 	/* MMC */
 	GPIO32_MMC_CLK,
@@ -144,20 +144,20 @@ static unsigned long ezx_pin_config[] __initdata = {
 #if defined(CONFIG_MACH_EZX_A780) || defined(CONFIG_MACH_EZX_E680)
 static unsigned long gen1_pin_config[] __initdata = {
 	/* flip / lockswitch */
-	GPIO12_GPIO,
+	GPIO12_GPIO | WAKEUP_ON_EDGE_BOTH,
 
 	/* bluetooth (bcm2035) */
-	GPIO14_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* HOSTWAKE */
+	GPIO14_GPIO | WAKEUP_ON_EDGE_RISE,	/* HOSTWAKE */
 	GPIO48_GPIO,				/* RESET */
 	GPIO28_GPIO,				/* WAKEUP */
 
 	/* Neptune handshake */
-	GPIO0_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* BP_RDY */
-	GPIO57_GPIO,				/* AP_RDY */
-	GPIO13_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* WDI */
-	GPIO3_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* WDI2 */
-	GPIO82_GPIO,				/* RESET */
-	GPIO99_GPIO,				/* TC_MM_EN */
+	GPIO0_GPIO | WAKEUP_ON_EDGE_FALL,	/* BP_RDY */
+	GPIO57_GPIO | MFP_LPM_DRIVE_HIGH,	/* AP_RDY */
+	GPIO13_GPIO | WAKEUP_ON_EDGE_BOTH,	/* WDI */
+	GPIO3_GPIO | WAKEUP_ON_EDGE_BOTH,	/* WDI2 */
+	GPIO82_GPIO | MFP_LPM_DRIVE_HIGH,	/* RESET */
+	GPIO99_GPIO | MFP_LPM_DRIVE_HIGH,	/* TC_MM_EN */
 
 	/* sound */
 	GPIO52_SSP3_SCLK,
@@ -199,21 +199,21 @@ static unsigned long gen1_pin_config[] __initdata = {
 	defined(CONFIG_MACH_EZX_E2) || defined(CONFIG_MACH_EZX_E6)
 static unsigned long gen2_pin_config[] __initdata = {
 	/* flip / lockswitch */
-	GPIO15_GPIO,
+	GPIO15_GPIO | WAKEUP_ON_EDGE_BOTH,
 
 	/* EOC */
-	GPIO10_GPIO,
+	GPIO10_GPIO | WAKEUP_ON_EDGE_RISE,
 
 	/* bluetooth (bcm2045) */
-	GPIO13_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* HOSTWAKE */
+	GPIO13_GPIO | WAKEUP_ON_EDGE_RISE,	/* HOSTWAKE */
 	GPIO37_GPIO,				/* RESET */
 	GPIO57_GPIO,				/* WAKEUP */
 
 	/* Neptune handshake */
-	GPIO0_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* BP_RDY */
-	GPIO96_GPIO,				/* AP_RDY */
-	GPIO3_GPIO | WAKEUP_ON_LEVEL_HIGH,	/* WDI */
-	GPIO116_GPIO,				/* RESET */
+	GPIO0_GPIO | WAKEUP_ON_EDGE_FALL,	/* BP_RDY */
+	GPIO96_GPIO | MFP_LPM_DRIVE_HIGH,	/* AP_RDY */
+	GPIO3_GPIO | WAKEUP_ON_EDGE_FALL,	/* WDI */
+	GPIO116_GPIO | MFP_LPM_DRIVE_HIGH,	/* RESET */
 	GPIO41_GPIO,				/* BP_FLASH */
 
 	/* sound */
-- 
cgit v0.10.2


From 62669e61a5f559826b1d2e863649a6005eee629b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 20 May 2009 11:27:13 +0900
Subject: sh: mach-hp6xx: Fix up the hp6xx build for hd64461 changes.

Fixes several compile errors due to the recent hd64461 I/O base changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/cchips/hd6446x/hd64461.c b/arch/sh/cchips/hd6446x/hd64461.c
index 25ef910..50aa0c1 100644
--- a/arch/sh/cchips/hd6446x/hd64461.c
+++ b/arch/sh/cchips/hd6446x/hd64461.c
@@ -80,7 +80,7 @@ int __init setup_hd64461(void)
 
 	printk(KERN_INFO
 	       "HD64461 configured at 0x%x on irq %d(mapped into %d to %d)\n",
-	       CONFIG_HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE,
+	       HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE,
 	       HD64461_IRQBASE + 15);
 
 /* Should be at processor specific part.. */
diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h
index c16d545..977355f 100644
--- a/arch/sh/include/asm/hd64461.h
+++ b/arch/sh/include/asm/hd64461.h
@@ -26,7 +26,7 @@
 #define	HD64461_PCC1_COMM	(HD64461_PCC1_BASE+HD64461_PCC_WINDOW)		/* 0xb5000000 */
 
 /* Standby Control Register for HD64461 */
-#define	HD64461_STBCR			CONFIG_HD64461_IOBASE
+#define	HD64461_STBCR			HD64461_IO_OFFSET(0x00000000)
 #define	HD64461_STBCR_CKIO_STBY		0x2000
 #define	HD64461_STBCR_SAFECKE_IST	0x1000
 #define	HD64461_STBCR_SLCKE_IST		0x0800
diff --git a/drivers/video/hitfb.c b/drivers/video/hitfb.c
index e6467cf..020db7f 100644
--- a/drivers/video/hitfb.c
+++ b/drivers/video/hitfb.c
@@ -335,9 +335,9 @@ static int __init hitfb_probe(struct platform_device *dev)
 	if (fb_get_options("hitfb", NULL))
 		return -ENODEV;
 
-	hitfb_fix.mmio_start = CONFIG_HD64461_IOBASE+0x1000;
+	hitfb_fix.mmio_start = HD64461_IO_OFFSET(0x1000);
 	hitfb_fix.mmio_len = 0x1000;
-	hitfb_fix.smem_start = CONFIG_HD64461_IOBASE + 0x02000000;
+	hitfb_fix.smem_start = HD64461_IO_OFFSET(0x02000000);
 	hitfb_fix.smem_len = 512 * 1024;
 
 	lcdclor = fb_readw(HD64461_LCDCLOR);
-- 
cgit v0.10.2


From d3375ea7213d09284ee3683d8bc075b9d3786a03 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 20 May 2009 08:42:06 +0200
Subject: cdrom: beyond ARRAY_SIZE of viocd_diskinfo

Do not go beyond ARRAY_SIZE of viocd_diskinfo

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 1392935..9b1624e 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -587,7 +587,7 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	struct device_node *node = vdev->dev.archdata.of_node;
 
 	deviceno = vdev->unit_address;
-	if (deviceno > VIOCD_MAX_CD)
+	if (deviceno >= VIOCD_MAX_CD)
 		return -ENODEV;
 	if (!node)
 		return -ENODEV;
-- 
cgit v0.10.2


From 0a7ae2ff0d29bb3b327edff4c8ab67b3834fa811 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 20 May 2009 08:54:31 +0200
Subject: block: change the tag sync vs async restriction logic

Make them fully share the tag space, but disallow async requests using
the last any two slots.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0ab81a0..0d98054 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -218,7 +218,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	} else
 		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
+	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
 		rq = NULL;
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
diff --git a/block/blk-core.c b/block/blk-core.c
index 4906507..1c74840 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1815,7 +1815,7 @@ void blk_dequeue_request(struct request *rq)
 	 * the driver side.
 	 */
 	if (blk_account_rq(rq))
-		q->in_flight++;
+		q->in_flight[rq_is_sync(rq)]++;
 }
 
 /**
diff --git a/block/blk-tag.c b/block/blk-tag.c
index c260f7c..2e5cfeb 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(blk_queue_end_tag);
 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
-	unsigned max_depth, offset;
+	unsigned max_depth;
 	int tag;
 
 	if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
@@ -355,13 +355,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 * to starve sync IO on behalf of flooding async IO.
 	 */
 	max_depth = bqt->max_depth;
-	if (rq_is_sync(rq))
-		offset = 0;
-	else
-		offset = max_depth >> 2;
+	if (!rq_is_sync(rq) && max_depth > 1) {
+		max_depth -= 2;
+		if (!max_depth)
+			max_depth = 1;
+		if (q->in_flight[0] > max_depth)
+			return 1;
+	}
 
 	do {
-		tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
+		tag = find_first_zero_bit(bqt->tag_map, max_depth);
 		if (tag >= max_depth)
 			return 1;
 
diff --git a/block/elevator.c b/block/elevator.c
index 9189200..ebee948 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -546,7 +546,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 	 * in_flight count again
 	 */
 	if (blk_account_rq(rq)) {
-		q->in_flight--;
+		q->in_flight[rq_is_sync(rq)]--;
 		if (blk_sorted_rq(rq))
 			elv_deactivate_rq(q, rq);
 	}
@@ -685,7 +685,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 
 	if (unplug_it && blk_queue_plugged(q)) {
 		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
-			- q->in_flight;
+				- queue_in_flight(q);
 
 		if (nrq >= q->unplug_thresh)
 			__generic_unplug_device(q);
@@ -823,7 +823,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
-		q->in_flight--;
+		q->in_flight[rq_is_sync(rq)]--;
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
@@ -838,7 +838,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		if (!list_empty(&q->queue_head))
 			next = list_entry_rq(q->queue_head.next);
 
-		if (!q->in_flight &&
+		if (!queue_in_flight(q) &&
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
 		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 564445b..a967dd77 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -404,7 +404,7 @@ struct request_queue
 	struct list_head	tag_busy_list;
 
 	unsigned int		nr_sorted;
-	unsigned int		in_flight;
+	unsigned int		in_flight[2];
 
 	unsigned int		rq_timeout;
 	struct timer_list	timeout;
@@ -511,6 +511,11 @@ static inline void queue_flag_clear_unlocked(unsigned int flag,
 	__clear_bit(flag, &q->queue_flags);
 }
 
+static inline int queue_in_flight(struct request_queue *q)
+{
+	return q->in_flight[0] + q->in_flight[1];
+}
+
 static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
 	WARN_ON_ONCE(!queue_is_locked(q));
-- 
cgit v0.10.2


From c8b15a706d921baed3195407e4f55270112bb3c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 May 2009 09:18:50 +0200
Subject: futex: cleanup error exit

Reuse the put_key_ref(key2) call in the exit path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 476603a..381125a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2185,10 +2185,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
-	if (ret) {
-		put_futex_key(fshared, &key2);
-		goto out;
-	}
+	if (ret)
+		goto out_key2;
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
@@ -2282,6 +2280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
 out_put_keys:
 	put_futex_key(fshared, &q.key);
+out_key2:
 	put_futex_key(fshared, &key2);
 
 out:
-- 
cgit v0.10.2


From 1c840c14906d4ddf66c1f4f5daea059aad951c82 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 May 2009 09:22:40 +0200
Subject: futex: fix restart for early wakeup in futex_wait_requeue_pi()

The futex_wait_requeue_pi op should restart unconditionally like
futex_lock_pi. The user of that function e.g. pthread_cond_wait can
not be interrupted so we do not care about the SA_RESTART flag of the
signal. Clean up the FIXMEs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 381125a..2aa216e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2060,7 +2060,7 @@ pi_faulted:
  *
  * Returns
  *  0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
  */
 static inline
 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2087,15 +2087,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
-		else {
-			/*
-			 * We expect signal_pending(current), but another
-			 * thread may have handled it for us already.
-			 */
-			/* FIXME: ERESTARTSYS or ERESTARTNOINTR?  Do we care if
-			 * the user specified SA_RESTART or not? */
-			ret = -ERESTARTSYS;
-		}
+		else
+			ret = -ERESTARTNOINTR;
 	}
 	return ret;
 }
-- 
cgit v0.10.2


From 2070887fdeacd9c13f3e805e3f0086c9f22a4d93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 May 2009 23:04:59 +0200
Subject: futex: fix restart in wait_requeue_pi

If the waiter has been requeued to the outer PI futex and is
interrupted by a signal and the thread handles the signal then
ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is
discarded. That way we return an unexcpected EINTR to user space
instead of ending up in futex_lock_pi_restart.

But we do not need to restart the syscall because we know that the
condition has changed since we have been requeued. If we would simply
restart the syscall then we would drop out via the comparison of the
user space value with EWOULDBLOCK.

The user space side needs to handle EWOULDBLOCK anyway as the
enqueueing on the inner futex can race with a requeue/wake. So we can
simply return EWOULDBLOCK to user space which also signals that we did
not take the outer futex and let user space handle it in the same way
it has to handle the requeue/wake race.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 2aa216e..80b5ce7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1507,7 +1507,6 @@ handle_fault:
 #define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
-static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -1930,21 +1929,6 @@ uaddr_faulted:
 	goto retry;
 }
 
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-	ktime_t t, *tp = NULL;
-	int fshared = restart->futex.flags & FLAGS_SHARED;
-
-	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-		t.tv64 = restart->futex.time;
-		tp = &t;
-	}
-	restart->fn = do_no_restart_syscall;
-
-	return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
-}
-
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -2141,12 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct rt_mutex *pi_mutex = NULL;
-	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	union futex_key key2;
 	struct futex_q q;
 	int res, ret;
-	u32 uval;
 
 	if (!bitset)
 		return -EINVAL;
@@ -2245,30 +2227,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 		if (rt_mutex_owner(pi_mutex) == current)
 			rt_mutex_unlock(pi_mutex);
 	} else if (ret == -EINTR) {
-		ret = -EFAULT;
-		if (get_user(uval, uaddr2))
-			goto out_put_keys;
-
 		/*
-		 * We've already been requeued, so restart by calling
-		 * futex_lock_pi() directly, rather then returning to this
-		 * function.
+		 * We've already been requeued, but we have no way to
+		 * restart by calling futex_lock_pi() directly. We
+		 * could restart the syscall, but that will look at
+		 * the user space value and return right away. So we
+		 * drop back with EWOULDBLOCK to tell user space that
+		 * "val" has been changed. That's the same what the
+		 * restart of the syscall would do in
+		 * futex_wait_setup().
 		 */
-		ret = -ERESTART_RESTARTBLOCK;
-		restart = &current_thread_info()->restart_block;
-		restart->fn = futex_lock_pi_restart;
-		restart->futex.uaddr = (u32 *)uaddr2;
-		restart->futex.val = uval;
-		restart->futex.flags = 0;
-		if (abs_time) {
-			restart->futex.flags |= FLAGS_HAS_TIMEOUT;
-			restart->futex.time = abs_time->tv64;
-		}
-
-		if (fshared)
-			restart->futex.flags |= FLAGS_SHARED;
-		if (clockrt)
-			restart->futex.flags |= FLAGS_CLOCKRT;
+		ret = -EWOULDBLOCK;
 	}
 
 out_put_keys:
-- 
cgit v0.10.2


From 09010978345e8883003bf411bb99753710eb5a3a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 20 May 2009 10:48:47 +0100
Subject: GFS2: Improve resource group error handling

This patch improves the error handling in the case where we
discover that the summary information in the resource group
doesn't match the bitmap information while in the process of
allocating blocks. Originally this resulted in a kernel bug,
but this patch changes that so that we return -EIO and print
some messages explaining what went wrong, and how to fix it.

We also remember locally not to try and allocate from the
same rgrp again, so that a subsequent allocation in a
different rgrp should succeed.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f8..253e1a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 		   and write it out to disk */
 
 		unsigned int n = 1;
-		block = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &block, &n);
+		if (error)
+			goto out_brelse;
 		if (isdir) {
 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 	blks = dblks + iblks;
 	i = sheight;
 	do {
+		int error;
 		n = blks - alloced;
-		bn = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &bn, &n);
+		if (error)
+			return error;
 		alloced += n;
 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 			gfs2_trans_add_unrevoke(sdp, bn, n);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c..297d7e5 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int n = 1;
-	u64 bn = gfs2_alloc_block(ip, &n);
-	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
+	u64 bn;
+	int error;
+	struct buffer_head *bh;
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
 	struct qstr name = { .name = "", .len = 0, .hash = 0 };
+
+	error = gfs2_alloc_block(ip, &bn, &n);
+	if (error)
+		return NULL;
+	bh = gfs2_meta_new(ip->i_gl, bn);
 	if (!bh)
 		return NULL;
+
 	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763a..07ea952 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	struct gfs2_ea_header *ea;
 	unsigned int n = 1;
 	u64 block;
+	int error;
 
-	block = gfs2_alloc_block(ip, &n);
+	error = gfs2_alloc_block(ip, &block, &n);
+	if (error)
+		return error;
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		    struct gfs2_ea_request *er)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
 
 	ea->ea_data_len = cpu_to_be32(er->er_data_len);
 	ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			int mh_size = sizeof(struct gfs2_meta_header);
 			unsigned int n = 1;
 
-			block = gfs2_alloc_block(ip, &n);
+			error = gfs2_alloc_block(ip, &block, &n);
+			if (error)
+				return error;
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	} else {
 		u64 blk;
 		unsigned int n = 1;
-		blk = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &blk, &n);
+		if (error)
+			return error;
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f4..d5e4ab1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 
 /**
- * rgrp_go_dump - print out an rgrp
- * @seq: The iterator
- * @gl: The glock in question
- *
- */
-
-static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
-{
-	const struct gfs2_rgrpd *rgd = gl->gl_object;
-	if (rgd == NULL)
-		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
-		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
-	return 0;
-}
-
-/**
  * trans_go_sync - promote/demote the transaction glock
  * @gl: the glock
  * @state: the requested state
@@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
-	.go_dump = rgrp_go_dump,
+	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_min_hold_time = HZ / 5,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0060e95..de50d86 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -92,9 +92,10 @@ struct gfs2_rgrpd {
 	unsigned int rd_bh_count;
 	u32 rd_last_alloc;
 	unsigned char rd_flags;
-#define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
-#define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
-#define GFS2_RDF_UPTODATE     0x04      /* rg is up to date */
+#define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
+#define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
+#define GFS2_RDF_ERROR		0x40000000 /* error in rg */
+#define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
 };
 
 enum gfs2_state_bits {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5650382..fbacf09 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -701,10 +701,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 	u32 rg_flags;
 
 	rg_flags = be32_to_cpu(str->rg_flags);
-	if (rg_flags & GFS2_RGF_NOALLOC)
-		rgd->rd_flags |= GFS2_RDF_NOALLOC;
-	else
-		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
+	rg_flags &= ~GFS2_RDF_MASK;
 	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +710,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
-	u32 rg_flags = 0;
 
-	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
-		rg_flags |= GFS2_RGF_NOALLOC;
-	str->rg_flags = cpu_to_be32(rg_flags);
+	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
 	str->rg_free = cpu_to_be32(rgd->rd_free);
 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
 	str->__pad = cpu_to_be32(0);
@@ -942,7 +936,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 
-	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
 
 	spin_lock(&sdp->sd_rindex_spin);
@@ -1435,13 +1429,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 }
 
 /**
- * gfs2_alloc_block - Allocate a block
+ * gfs2_rgrp_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	if (rgd == NULL)
+		return 0;
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+	return 0;
+}
+
+/**
+ * gfs2_alloc_block - Allocate one or more blocks
  * @ip: the inode to allocate the block for
+ * @bn: Used to return the starting block number
+ * @n: requested number of blocks/extent length (value/result)
  *
- * Returns: the allocated block
+ * Returns: 0 or error
  */
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
+int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
@@ -1457,7 +1471,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 		goal = rgd->rd_last_alloc;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
-	BUG_ON(blk == BFITNOENT);
+
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (blk == BFITNOENT)
+		goto rgrp_error;
 
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
@@ -1469,7 +1486,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
 		brelse(dibh);
 	}
-	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+	if (rgd->rd_free < *n)
+		goto rgrp_error;
+
 	rgd->rd_free -= *n;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1484,7 +1503,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_free_clone -= *n;
 	spin_unlock(&sdp->sd_rindex_spin);
 
-	return block;
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+	        (unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+	return -EIO;
 }
 
 /**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e..1e76ff0 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
 
-void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
 
 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
 
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
 
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
 
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
 
-struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 {
 	BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
-			 char *file, unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
+				  unsigned int line);
 #define gfs2_inplace_reserve(ip) \
 gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
 
-void gfs2_inplace_release(struct gfs2_inode *ip);
+extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
-u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
+extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
 
-void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-void gfs2_unlink_di(struct inode *inode);
+extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+extern void gfs2_unlink_di(struct inode *inode);
 
 struct gfs2_rgrp_list {
 	unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
 	struct gfs2_holder *rl_ghs;
 };
 
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
-		    u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
-void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+			   u64 block);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
 
 #endif /* __RGRP_DOT_H__ */
-- 
cgit v0.10.2


From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:19 +0200
Subject: perf_counter: Solve the rotate_ctx vs inherit race differently

Instead of disabling RR scheduling of the counters, use a different list
that does not get rotated to iterate the counters on inheritance.

[ Impact: cleanup, optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.237504544@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 13cb2fb..c8c1dfc2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,7 +508,6 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
-	int			rr_allowed;
 	struct task_struct	*task;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4d8f973..64113e6 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	if (ctx->rr_allowed)
-		rotate_ctx(ctx);
+	rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
@@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	INIT_LIST_HEAD(&ctx->event_list);
-	ctx->rr_allowed = 1;
 	ctx->task = task;
 }
 
@@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child)
 	 */
 	mutex_lock(&parent_ctx->mutex);
 
-	parent_ctx->rr_allowed = 0;
-	barrier(); /* irqs */
-
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+		if (counter != counter->group_leader)
+			continue;
+
 		if (!counter->hw_event.inherit)
 			continue;
 
@@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child)
 			break;
 	}
 
-	barrier(); /* irqs */
-	parent_ctx->rr_allowed = 1;
-
 	mutex_unlock(&parent_ctx->mutex);
 }
 
-- 
cgit v0.10.2


From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:20 +0200
Subject: perf_counter: Log irq_period changes

For the dynamic irq_period code, log whenever we change the period so that
analyzing code can normalize the event flow.

[ Impact: add new feature to allow more precise profiling ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.298769743@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc2..f612941 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -258,6 +258,14 @@ enum perf_event_type {
 	PERF_EVENT_COMM			= 3,
 
 	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				irq_period;
+	 * };
+	 */
+	PERF_EVENT_PERIOD		= 4,
+
+	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
 	 *
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 64113e6..db02eb1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
-void perf_adjust_freq(struct perf_counter_context *ctx)
+static void perf_log_period(struct perf_counter *counter, u64 period);
+
+static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
 	u64 irq_period;
@@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!irq_period)
 			irq_period = 1;
 
+		perf_log_period(counter, irq_period);
+
 		counter->hw.irq_period = irq_period;
 		counter->hw.interrupts = 0;
 	}
@@ -2407,6 +2411,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
+ *
+ */
+
+static void perf_log_period(struct perf_counter *counter, u64 period)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+		u64				period;
+	} freq_event = {
+		.header = {
+			.type = PERF_EVENT_PERIOD,
+			.misc = 0,
+			.size = sizeof(freq_event),
+		},
+		.time = sched_clock(),
+		.period = period,
+	};
+
+	if (counter->hw.irq_period == period)
+		return;
+
+	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, freq_event);
+	perf_output_end(&handle);
+}
+
+/*
  * Generic counter overflow handling.
  */
 
-- 
cgit v0.10.2


From b986d7ec0f8b7ea3cc7366d80a137fbe839df227 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:21 +0200
Subject: perf_counter: Optimize disable of time based sw counters

Currently we call hrtimer_cancel() unconditionally on disable of time based
software counters. Avoid when possible.

[ Impact: micro-optimize the code ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.388185031@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index db02eb1..473ed2c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2716,7 +2716,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	hrtimer_cancel(&counter->hw.hrtimer);
+	if (counter->hw.irq_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
 
@@ -2767,7 +2768,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	hrtimer_cancel(&counter->hw.hrtimer);
+	if (counter->hw.irq_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
 	task_clock_perf_counter_update(counter, counter->ctx->time);
 
 }
-- 
cgit v0.10.2


From afedadf23a2c90f3ba0d963282cbe6a6be129494 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:22 +0200
Subject: perf_counter: Optimize sched in/out of counters

Avoid a function call for !group counters by directly calling the counter
function.

[ Impact: micro-optimize the code ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.511933670@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 473ed2c..69d4de8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -826,8 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 
 	perf_disable();
 	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->counter_list, list_entry)
-			group_sched_out(counter, cpuctx, ctx);
+		list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+			if (counter != counter->group_leader)
+				counter_sched_out(counter, cpuctx, ctx);
+			else
+				group_sched_out(counter, cpuctx, ctx);
+		}
 	}
 	perf_enable();
  out:
@@ -903,8 +907,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		if (group_can_go_on(counter, cpuctx, 1))
-			group_sched_in(counter, cpuctx, ctx, cpu);
+		if (counter != counter->group_leader)
+			counter_sched_in(counter, cpuctx, ctx, cpu);
+		else {
+			if (group_can_go_on(counter, cpuctx, 1))
+				group_sched_in(counter, cpuctx, ctx, cpu);
+		}
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -932,9 +940,14 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
-			if (group_sched_in(counter, cpuctx, ctx, cpu))
+		if (counter != counter->group_leader) {
+			if (counter_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		} else {
+			if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+				if (group_sched_in(counter, cpuctx, ctx, cpu))
+					can_add_hw = 0;
+			}
 		}
 	}
 	perf_enable();
-- 
cgit v0.10.2


From b3b778b387ed3849ebc4a51baf8617be90df6625 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 20 May 2009 17:05:52 +0200
Subject: ALSA: pcsp - fix printk format warning again

The commit 5a641bcd6398841cc4606b0a732d41a09256fd94 changed the
printk format to '%lu', but the value passed seems to be dependent
on the architecture.  On x86-64, I got a new warning now because an
int value is passed actaully.

As a workaround, just cast the value always to unsigned long.

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/drivers/pcsp/pcsp_mixer.c b/sound/drivers/pcsp/pcsp_mixer.c
index 771955a..199b033 100644
--- a/sound/drivers/pcsp/pcsp_mixer.c
+++ b/sound/drivers/pcsp/pcsp_mixer.c
@@ -51,7 +51,7 @@ static int pcsp_treble_info(struct snd_kcontrol *kcontrol,
 	if (uinfo->value.enumerated.item > chip->max_treble)
 		uinfo->value.enumerated.item = chip->max_treble;
 	sprintf(uinfo->value.enumerated.name, "%lu",
-			PCSP_CALC_RATE(uinfo->value.enumerated.item));
+		(unsigned long)PCSP_CALC_RATE(uinfo->value.enumerated.item));
 	return 0;
 }
 
-- 
cgit v0.10.2


From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 18 May 2009 23:04:46 +0800
Subject: ftrace: fix check for return value of register_module_notifier in
 event_trace_init

register_module_notifier() returns zero in the success case.
So fix the inverted fail case check in trace events modules
handler.

[ Impact: fix spurious warning on ftrace initialization]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0eec0c5..9e91c4a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1174,7 +1174,7 @@ static __init int event_trace_init(void)
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace events module notifier\n");
 
 	return 0;
-- 
cgit v0.10.2


From 950312ce22696ddfa42a957bdadaa9c24151e679 Mon Sep 17 00:00:00 2001
From: Andrew Randrianasulu <randrik_a@yahoo.com>
Date: Thu, 14 May 2009 11:29:27 -0700
Subject: MIPS: IP32: Fix hang on shutdown in power button interrupt handler.

The hang was caused by the use of disable_irq() from the interrupt handler
itself.  Fixed by the use of disable_irq_nosync().  The issue was
triggered by:

    commit 3aa551c9b4c40018f0e261a178e3d25478dc04a9
    Author: Thomas Gleixner <tglx@linutronix.de>
    Date:   Mon Mar 23 18:28:15 2009 +0100

        genirq: add threaded interrupt handler support

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index b6cab08..667da93 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -145,7 +145,7 @@ static irqreturn_t ip32_rtc_int(int irq, void *dev_id)
 			"%s: RTC IRQ without RTC_IRQF\n", __func__);
 	}
 	/* Wait until interrupt goes away */
-	disable_irq(MACEISA_RTC_IRQ);
+	disable_irq_nosync(MACEISA_RTC_IRQ);
 	init_timer(&debounce_timer);
 	debounce_timer.function = debounce;
 	debounce_timer.expires = jiffies + 50;
-- 
cgit v0.10.2


From 7e9e05cad94217498e4d9bd6ef7137b4e9e7ed64 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Sat, 16 May 2009 12:23:45 +0100
Subject: MIPS: IP22: Fix hang in power button interrupt handler

The hang was caused by the use of disable_irq() from the interrupt handler
itself.  Fixed by the use of disable_irq_nosync().  The issue was
triggered by:

    commit 3aa551c9b4c40018f0e261a178e3d25478dc04a9
    Author: Thomas Gleixner <tglx@linutronix.de>
    Date:   Mon Mar 23 18:28:15 2009 +0100

        genirq: add threaded interrupt handler support

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index 4ad5c33..45b6694 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -148,7 +148,7 @@ static irqreturn_t panel_int(int irq, void *dev_id)
 
 	if (sgint->istat1 & SGINT_ISTAT1_PWR) {
 		/* Wait until interrupt goes away */
-		disable_irq(SGI_PANEL_IRQ);
+		disable_irq_nosync(SGI_PANEL_IRQ);
 		init_timer(&debounce_timer);
 		debounce_timer.function = debounce;
 		debounce_timer.expires = jiffies + 5;
-- 
cgit v0.10.2


From 195d1a96ae5fdfbedb8dc4b97afee578921fa99e Mon Sep 17 00:00:00 2001
From: peter fuerst <post@pfrst.de>
Date: Sun, 17 May 2009 23:49:45 +0200
Subject: MIPS: IP28: Change to build with -mr10k-cache-barrier=store

Richard Sandiford's new code for inserting the cache-barriers, for GCC
4.3 and above and already incorporated in the current GCC-release, uses
a slightly different option-syntax.

Signed-off-by: peter fuerst <post@pfrst.de>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 26947ab..c4cae9e 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -473,12 +473,12 @@ endif
 # Simplified: what IP22 does at 128MB+ in ksegN, IP28 does at 512MB+ in xkphys
 #
 ifdef CONFIG_SGI_IP28
-  ifeq ($(call cc-option-yn,-mr10k-cache-barrier=1), n)
-      $(error gcc doesn't support needed option -mr10k-cache-barrier=1)
+  ifeq ($(call cc-option-yn,-mr10k-cache-barrier=store), n)
+      $(error gcc doesn't support needed option -mr10k-cache-barrier=store)
   endif
 endif
 core-$(CONFIG_SGI_IP28)		+= arch/mips/sgi-ip22/
-cflags-$(CONFIG_SGI_IP28)	+= -mr10k-cache-barrier=1 -I$(srctree)/arch/mips/include/asm/mach-ip28
+cflags-$(CONFIG_SGI_IP28)	+= -mr10k-cache-barrier=store -I$(srctree)/arch/mips/include/asm/mach-ip28
 load-$(CONFIG_SGI_IP28)		+= 0xa800000020004000
 
 #
-- 
cgit v0.10.2


From a5e696e5d0f1377ff6beb10d2f40edb6a3d1de18 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@snapgear.com>
Date: Wed, 20 May 2009 16:12:32 +1000
Subject: MIPS: 64-bit: Fix system lockup.

The address range size calculation inside local_flush_tlb_kernel_range()
is being truncated by a too small size variable holder on 64-bit systems.
The truncated size can result in an erroneous tlbsize check that means we
sit spinning inside a loop trying to flush a hige number of TLB entries.
This is for all intents and purposes a system hang. Fix by using an
appropriately sized valiable to hold the size.

[Ralf: Greg's original patch submission identified the issue and fixed one
instance in tlb-r4k.c but there there were several more.  For consistency
I also modified tlb-r3k.c even though that file is only used on 32-bit.]

Signed-off-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c
index f0cf46a..1c0048a 100644
--- a/arch/mips/mm/tlb-r3k.c
+++ b/arch/mips/mm/tlb-r3k.c
@@ -82,8 +82,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	int cpu = smp_processor_id();
 
 	if (cpu_context(cpu, mm) != 0) {
-		unsigned long flags;
-		int size;
+		unsigned long size, flags;
 
 #ifdef DEBUG_TLB
 		printk("[tlbrange<%lu,0x%08lx,0x%08lx>]",
@@ -121,8 +120,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned long flags;
-	int size;
+	unsigned long size, flags;
 
 #ifdef DEBUG_TLB
 	printk("[tlbrange<%lu,0x%08lx,0x%08lx>]", start, end);
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 9619f66..892be42 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -117,8 +117,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	int cpu = smp_processor_id();
 
 	if (cpu_context(cpu, mm) != 0) {
-		unsigned long flags;
-		int size;
+		unsigned long size, flags;
 
 		ENTER_CRITICAL(flags);
 		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
@@ -160,8 +159,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned long flags;
-	int size;
+	unsigned long size, flags;
 
 	ENTER_CRITICAL(flags);
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
diff --git a/arch/mips/mm/tlb-r8k.c b/arch/mips/mm/tlb-r8k.c
index 4f01a3b..4ec95cc 100644
--- a/arch/mips/mm/tlb-r8k.c
+++ b/arch/mips/mm/tlb-r8k.c
@@ -111,8 +111,7 @@ out_restore:
 /* Usable for KV1 addresses only! */
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned long flags;
-	int size;
+	unsigned long size, flags;
 
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	size = (size + 1) >> 1;
-- 
cgit v0.10.2


From 2b611cb6eed04062d0a9861c82248e02c844ba3f Mon Sep 17 00:00:00 2001
From: Pavel Roskin <proski@gnu.org>
Date: Fri, 27 Mar 2009 17:47:27 -0400
Subject: ath5k: fix scanning in AR2424

AR5K_PHY_PLL_40MHZ_5413 should not be ORed with AR5K_PHY_MODE_RAD_RF5112
for 5 GHz channels.

The incorrect PLL value breaks scanning in the countries where 5 GHz
channels are allowed.

Signed-off-by: Pavel Roskin <proski@gnu.org>
Acked-by: Nick Kossifidis <mickflemm@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ath5k/reset.c b/drivers/net/wireless/ath5k/reset.c
index 7a17d31..faede82 100644
--- a/drivers/net/wireless/ath5k/reset.c
+++ b/drivers/net/wireless/ath5k/reset.c
@@ -359,7 +359,7 @@ int ath5k_hw_nic_wakeup(struct ath5k_hw *ah, int flags, bool initial)
 			mode |= AR5K_PHY_MODE_FREQ_5GHZ;
 
 			if (ah->ah_radio == AR5K_RF5413)
-				clock |= AR5K_PHY_PLL_40MHZ_5413;
+				clock = AR5K_PHY_PLL_40MHZ_5413;
 			else
 				clock |= AR5K_PHY_PLL_40MHZ;
 
-- 
cgit v0.10.2


From 88f16db7a2fa63b9242e8a0fbc40d51722f2e2f9 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 13 May 2009 12:04:30 +0200
Subject: wext: verify buffer size for SIOCSIWENCODEEXT

Another design flaw in wireless extensions (is anybody
surprised?) in the way it handles the iw_encode_ext
structure: The structure is part of the 'extra' memory
but contains the key length explicitly, instead of it
just being the length of the extra buffer - size of
the struct and using the explicit key length only for
the get operation (which only writes it).

Therefore, we have this layout:

extra: +-------------------------+
       | struct iw_encode_ext  { |
       |     ...                 |
       |     u16 key_len;        |
       |     u8 key[0];          |
       | };                      |
       +-------------------------+
       | key material            |
       +-------------------------+

Now, all drivers I checked use ext->key_len without
checking that both key_len and the struct fit into the
extra buffer that has been copied from userspace. This
leads to a buffer overrun while reading that buffer,
depending on the driver it may be possible to specify
arbitrary key_len or it may need to be a proper length
for the key algorithm specified.

Thankfully, this is only exploitable by root, but root
can actually cause a segfault or use kernel memory as
a key (which you can even get back with siocgiwencode
or siocgiwencodeext from the key buffer).

Fix this by verifying that key_len fits into the buffer
along with struct iw_encode_ext.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/net/wireless/wext.c b/net/wireless/wext.c
index cb6a5bb..0e59f9a 100644
--- a/net/wireless/wext.c
+++ b/net/wireless/wext.c
@@ -786,6 +786,13 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
 			err = -EFAULT;
 			goto out;
 		}
+
+		if (cmd == SIOCSIWENCODEEXT) {
+			struct iw_encode_ext *ee = (void *) extra;
+
+			if (iwp->length < sizeof(*ee) + ee->key_len)
+				return -EFAULT;
+		}
 	}
 
 	err = handler(dev, info, (union iwreq_data *) iwp, extra);
-- 
cgit v0.10.2


From a54be5d43aa2d6febc5a4f8dd3b87b9429b60437 Mon Sep 17 00:00:00 2001
From: Forrest Zhang <forrest@hifulltech.com>
Date: Wed, 13 May 2009 11:14:39 -0400
Subject: ath5k: fix exp off-by-one when computing OFDM delta slope

Commit e8f055f0c3b ("ath5k: Update reset code") subtly changed the
code that computes floating point values for the PHY3_TIMING register
such that the exponent is off by a decimal point, which can cause
problems with OFDM channel operation.

get_bitmask_order() actually returns the highest bit set plus one,
whereas the previous code wanted the highest bit set.  Instead, use
ilog2 which is what this code is really calculating.  Also check
coef_scaled to handle the (invalid) case where we need log2(0).

Signed-off-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ath5k/reset.c b/drivers/net/wireless/ath5k/reset.c
index faede82..5f72c11 100644
--- a/drivers/net/wireless/ath5k/reset.c
+++ b/drivers/net/wireless/ath5k/reset.c
@@ -26,7 +26,7 @@
 \*****************************/
 
 #include <linux/pci.h> 		/* To determine if a card is pci-e */
-#include <linux/bitops.h>	/* For get_bitmask_order */
+#include <linux/log2.h>
 #include "ath5k.h"
 #include "reg.h"
 #include "base.h"
@@ -69,10 +69,10 @@ static inline int ath5k_hw_write_ofdm_timings(struct ath5k_hw *ah,
 
 	/* Get exponent
 	 * ALGO: coef_exp = 14 - highest set bit position */
-	coef_exp = get_bitmask_order(coef_scaled);
+	coef_exp = ilog2(coef_scaled);
 
 	/* Doesn't make sense if it's zero*/
-	if (!coef_exp)
+	if (!coef_scaled || !coef_exp)
 		return -EINVAL;
 
 	/* Note: we've shifted coef_scaled by 24 */
-- 
cgit v0.10.2


From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 May 2009 20:13:28 +0200
Subject: perf_counter: Fix context removal deadlock

Disable the PMU globally before removing a counter from a
context. This fixes the following lockup:

[22081.741922] ------------[ cut here ]------------
[22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e()
[22081.755624] Hardware name: X8DTN
[22081.758903] perfcounters: irq loop stuck!
[22081.762985] Modules linked in:
[22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226
[22081.772432] Call Trace:
[22081.774940]  <NMI>  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.781993]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.788368]  [<ffffffff8104505c>] ? warn_slowpath_common+0x77/0xa3
[22081.794649]  [<ffffffff810450d3>] ? warn_slowpath_fmt+0x40/0x45
[22081.800696]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.807080]  [<ffffffff814d1a72>] ? perf_counter_nmi_handler+0x3f/0x4a
[22081.813751]  [<ffffffff814d2d09>] ? notifier_call_chain+0x58/0x86
[22081.819951]  [<ffffffff8105b250>] ? notify_die+0x2d/0x32
[22081.825392]  [<ffffffff814d1414>] ? do_nmi+0x8e/0x242
[22081.830538]  [<ffffffff814d0f0a>] ? nmi+0x1a/0x20
[22081.835342]  [<ffffffff8117e102>] ? selinux_file_free_security+0x0/0x1a
[22081.842105]  [<ffffffff81018793>] ? x86_pmu_disable_counter+0x15/0x41
[22081.848673]  <<EOE>>  [<ffffffff81018f3d>] ? x86_pmu_disable+0x86/0x103
[22081.855512]  [<ffffffff8108fedd>] ? __perf_counter_remove_from_context+0x0/0xfe
[22081.862926]  [<ffffffff8108fcbc>] ? counter_sched_out+0x30/0xce
[22081.868909]  [<ffffffff8108ff36>] ? __perf_counter_remove_from_context+0x59/0xfe
[22081.876382]  [<ffffffff8106808a>] ? smp_call_function_single+0x6c/0xe6
[22081.882955]  [<ffffffff81091b96>] ? perf_release+0x86/0x14c
[22081.888600]  [<ffffffff810c4c84>] ? __fput+0xe7/0x195
[22081.893718]  [<ffffffff810c213e>] ? filp_close+0x5b/0x62
[22081.899107]  [<ffffffff81046a70>] ? put_files_struct+0x64/0xc2
[22081.905031]  [<ffffffff8104841a>] ? do_exit+0x1e2/0x6ef
[22081.910360]  [<ffffffff814d0a60>] ? _spin_lock_irqsave+0x9/0xe
[22081.916292]  [<ffffffff8104898e>] ? do_group_exit+0x67/0x93
[22081.921953]  [<ffffffff810489cc>] ? sys_exit_group+0x12/0x16
[22081.927759]  [<ffffffff8100baab>] ? system_call_fastpath+0x16/0x1b
[22081.934076] ---[ end trace 3a3936ce3e1b4505 ]---

And could potentially also fix the lockup reported by Marcelo Tosatti.

Also, print more debug info in case of a detected lockup.

[ Impact: fix lockup ]

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c109819..6cc1660 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 again:
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		perf_counter_print_debug();
 		return 1;
 	}
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 69d4de8..08584c1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -208,18 +208,17 @@ static void __perf_counter_remove_from_context(void *info)
 		return;
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level.
+	 */
+	perf_disable();
 
 	counter_sched_out(counter, cpuctx, ctx);
 
 	counter->task = NULL;
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * counters on a global level. NOP for non NMI based counters.
-	 */
-	perf_disable();
 	list_del_counter(counter, ctx);
-	perf_enable();
 
 	if (!ctx->task) {
 		/*
@@ -231,6 +230,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
+	perf_enable();
 	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
-- 
cgit v0.10.2


From c6ac4c18fbc92a26df71ece609b082bc3099676b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 20 May 2009 11:26:09 -0700
Subject: x86, boot: correct the calculation of ZO_INIT_SIZE

Correct the calculation of ZO_INIT_SIZE (the amount of memory we need
during decompression).  One symbol (ZO_startup_32) was missing from
zoffset.h, and another (ZO_z_extract_offset) was misspelled.

[ Impact: build fix ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 75e0301..619d297 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -95,7 +95,7 @@ targets += voffset.h
 $(obj)/voffset.h: vmlinux FORCE
 	$(call if_changed,voffset)
 
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
       cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 68c3bfb..1040f6e 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -226,7 +226,7 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 
 pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
 
-#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_extract_offset)
+#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_z_extract_offset)
 #define VO_INIT_SIZE	(VO__end - VO__text)
 #if ZO_INIT_SIZE > VO_INIT_SIZE
 #define INIT_SIZE ZO_INIT_SIZE
-- 
cgit v0.10.2


From fbc9f97bbf5e1eaee562eba93dc60faaff3f3bfa Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Fri, 15 May 2009 16:13:46 -0700
Subject: iwlwifi: do not cancel delayed work inside spin_lock_irqsave

Calling cancel_delayed_work() from inside
spin_lock_irqsave, introduces a potential deadlock.

As explained by Johannes Berg <johannes@sipsolutions.net>

A - lock
T - timer

phase                   CPU 1           CPU 2
---------------------------------------------

some place that calls
cancel_timer_sync()
(which is the | code)
                                        lock-irq(A)
|                                       "lock-irq"(T)
|                                       "unlock"(T)
|                                       wait(T)
                                        unlock(A)

timer softirq
                        "lock"(T)
                        run(T)
                        "unlock"(T)

irq handler
          lock(A)
          unlock(A)

Now all that again, interleaved, leading to deadlock:

                                        lock-irq(A)
                        "lock"(T)
                         run(T)
IRQ during or maybe
before run(T) -->        lock(A)
                                        "lock-irq"(T)
                                        wait(T)

We fix this by moving the call to cancel_delayed_work() into workqueue.
There are cases where the work may not actually be queued or running
at the time we are trying to cancel it, but cancel_delayed_work() is
able to deal with this.

Also cleanup iwl_set_mode related to this call. This function
(iwl_set_mode) is only called when bringing interface up and there will
thus not be any scanning done. No need to try to cancel scanning.

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13224, which was also
reported at http://marc.info/?l=linux-wireless&m=124081921903223&w=2 .

Tested-by: Miles Lane <miles.lane@gmail.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Acked-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 3bb28db..f46ba24 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -669,13 +669,6 @@ static int iwl_set_mode(struct iwl_priv *priv, int mode)
 	if (!iwl_is_ready_rf(priv))
 		return -EAGAIN;
 
-	cancel_delayed_work(&priv->scan_check);
-	if (iwl_scan_cancel_timeout(priv, 100)) {
-		IWL_WARN(priv, "Aborted scan still in progress after 100ms\n");
-		IWL_DEBUG_MAC80211(priv, "leaving - scan abort failed.\n");
-		return -EAGAIN;
-	}
-
 	iwl_commit_rxon(priv);
 
 	return 0;
diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c
index e7c65c4..6330b91 100644
--- a/drivers/net/wireless/iwlwifi/iwl-scan.c
+++ b/drivers/net/wireless/iwlwifi/iwl-scan.c
@@ -227,9 +227,6 @@ static void iwl_rx_scan_complete_notif(struct iwl_priv *priv,
 	/* The HW is no longer scanning */
 	clear_bit(STATUS_SCAN_HW, &priv->status);
 
-	/* The scan completion notification came in, so kill that timer... */
-	cancel_delayed_work(&priv->scan_check);
-
 	IWL_DEBUG_INFO(priv, "Scan pass on %sGHz took %dms\n",
 		       (priv->scan_bands & BIT(IEEE80211_BAND_2GHZ)) ?
 						"2.4" : "5.2",
@@ -712,6 +709,8 @@ static void iwl_bg_request_scan(struct work_struct *data)
 
 	mutex_lock(&priv->mutex);
 
+	cancel_delayed_work(&priv->scan_check);
+
 	if (!iwl_is_ready(priv)) {
 		IWL_WARN(priv, "request scan called when driver not ready.\n");
 		goto done;
@@ -925,6 +924,8 @@ void iwl_bg_scan_completed(struct work_struct *work)
 
 	IWL_DEBUG_SCAN(priv, "SCAN complete scan\n");
 
+	cancel_delayed_work(&priv->scan_check);
+
 	ieee80211_scan_completed(priv->hw, false);
 
 	if (test_bit(STATUS_EXIT_PENDING, &priv->status))
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index 4cce661..ff4d0e4 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -782,13 +782,6 @@ static int iwl3945_set_mode(struct iwl_priv *priv, int mode)
 	if (!iwl_is_ready_rf(priv))
 		return -EAGAIN;
 
-	cancel_delayed_work(&priv->scan_check);
-	if (iwl_scan_cancel_timeout(priv, 100)) {
-		IWL_WARN(priv, "Aborted scan still in progress after 100ms\n");
-		IWL_DEBUG_MAC80211(priv, "leaving - scan abort failed.\n");
-		return -EAGAIN;
-	}
-
 	iwl3945_commit_rxon(priv);
 
 	return 0;
@@ -3298,6 +3291,8 @@ static void iwl3945_bg_request_scan(struct work_struct *data)
 
 	mutex_lock(&priv->mutex);
 
+	cancel_delayed_work(&priv->scan_check);
+
 	if (!iwl_is_ready(priv)) {
 		IWL_WARN(priv, "request scan called when driver not ready.\n");
 		goto done;
-- 
cgit v0.10.2


From 875690c378d64d9ee2de15cad8206d3f11ae5096 Mon Sep 17 00:00:00 2001
From: Fabio Rossi <rossi.f@inwind.it>
Date: Wed, 1 Apr 2009 20:37:50 +0200
Subject: ath5k: fix interpolation with equal power levels

When the EEPROM contains weird values for the power levels we have to
fix the interpolation process.

Signed-off-by: Fabio Rossi <rossi.f@inwind.it>
Acked-by: Nick Kossifidis <mickflemm@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/ath5k/phy.c b/drivers/net/wireless/ath5k/phy.c
index 9e2faae..b48b29d 100644
--- a/drivers/net/wireless/ath5k/phy.c
+++ b/drivers/net/wireless/ath5k/phy.c
@@ -1487,28 +1487,35 @@ ath5k_get_linear_pcdac_min(const u8 *stepL, const u8 *stepR,
 {
 	s8 tmp;
 	s16 min_pwrL, min_pwrR;
-	s16 pwr_i = pwrL[0];
-
-	do {
-		pwr_i--;
-		tmp = (s8) ath5k_get_interpolated_value(pwr_i,
-						pwrL[0], pwrL[1],
-						stepL[0], stepL[1]);
-
-	} while (tmp > 1);
-
-	min_pwrL = pwr_i;
-
-	pwr_i = pwrR[0];
-	do {
-		pwr_i--;
-		tmp = (s8) ath5k_get_interpolated_value(pwr_i,
-						pwrR[0], pwrR[1],
-						stepR[0], stepR[1]);
-
-	} while (tmp > 1);
+	s16 pwr_i;
+
+	if (pwrL[0] == pwrL[1])
+		min_pwrL = pwrL[0];
+	else {
+		pwr_i = pwrL[0];
+		do {
+			pwr_i--;
+			tmp = (s8) ath5k_get_interpolated_value(pwr_i,
+							pwrL[0], pwrL[1],
+							stepL[0], stepL[1]);
+		} while (tmp > 1);
+
+		min_pwrL = pwr_i;
+	}
 
-	min_pwrR = pwr_i;
+	if (pwrR[0] == pwrR[1])
+		min_pwrR = pwrR[0];
+	else {
+		pwr_i = pwrR[0];
+		do {
+			pwr_i--;
+			tmp = (s8) ath5k_get_interpolated_value(pwr_i,
+							pwrR[0], pwrR[1],
+							stepR[0], stepR[1]);
+		} while (tmp > 1);
+
+		min_pwrR = pwr_i;
+	}
 
 	/* Keep the right boundary so that it works for both curves */
 	return max(min_pwrL, min_pwrR);
-- 
cgit v0.10.2


From 267d493b322b05984048aef8ea9b5b213490bbe0 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Wed, 20 May 2009 10:51:41 -0400
Subject: airo: fix airo_get_encode{,ext} buffer overflow like I mean it...

"airo: airo_get_encode{,ext} potential buffer overflow" was actually a
no-op, due to an unrecognized type overflow in an assignment.  Oddly,
gcc only seems to tell me about it when using -Wextra...grrr...

Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index d734757..9eabf4d 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -6467,6 +6467,7 @@ static int airo_get_encode(struct net_device *dev,
 {
 	struct airo_info *local = dev->ml_priv;
 	int index = (dwrq->flags & IW_ENCODE_INDEX) - 1;
+	int wep_key_len;
 	u8 buf[16];
 
 	if (!local->wep_capable)
@@ -6500,11 +6501,13 @@ static int airo_get_encode(struct net_device *dev,
 	dwrq->flags |= index + 1;
 
 	/* Copy the key to the user buffer */
-	dwrq->length = get_wep_key(local, index, &buf[0], sizeof(buf));
-	if (dwrq->length != -1)
-		memcpy(extra, buf, dwrq->length);
-	else
+	wep_key_len = get_wep_key(local, index, &buf[0], sizeof(buf));
+	if (wep_key_len < 0) {
 		dwrq->length = 0;
+	} else {
+		dwrq->length = wep_key_len;
+		memcpy(extra, buf, dwrq->length);
+	}
 
 	return 0;
 }
@@ -6617,7 +6620,7 @@ static int airo_get_encodeext(struct net_device *dev,
 	struct airo_info *local = dev->ml_priv;
 	struct iw_point *encoding = &wrqu->encoding;
 	struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
-	int idx, max_key_len;
+	int idx, max_key_len, wep_key_len;
 	u8 buf[16];
 
 	if (!local->wep_capable)
@@ -6661,11 +6664,13 @@ static int airo_get_encodeext(struct net_device *dev,
 	memset(extra, 0, 16);
 	
 	/* Copy the key to the user buffer */
-	ext->key_len = get_wep_key(local, idx, &buf[0], sizeof(buf));
-	if (ext->key_len != -1)
-		memcpy(extra, buf, ext->key_len);
-	else
+	wep_key_len = get_wep_key(local, idx, &buf[0], sizeof(buf));
+	if (wep_key_len < 0) {
 		ext->key_len = 0;
+	} else {
+		ext->key_len = wep_key_len;
+		memcpy(extra, buf, ext->key_len);
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 5078b2e32ad4b1f753b1c837c15892202f753c97 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <lrodriguez@atheros.com>
Date: Wed, 13 May 2009 17:04:42 -0400
Subject: cfg80211: fix race between core hint and driver's custom apply

Its possible for cfg80211 to have scheduled the work and for
the global workqueue to not have kicked in prior to a cfg80211
driver's regulatory hint or wiphy_apply_custom_regulatory().

Although this is very unlikely its possible and should fix
this race. When this race would happen you are expected to have
hit a null pointer dereference panic.

Cc: stable@kernel.org
Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Tested-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 08265ca..487cb62 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1551,6 +1551,13 @@ static int regulatory_hint_core(const char *alpha2)
 
 	queue_regulatory_request(request);
 
+	/*
+	 * This ensures last_request is populated once modules
+	 * come swinging in and calling regulatory hints and
+	 * wiphy_apply_custom_regulatory().
+	 */
+	flush_scheduled_work();
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From c9d2fbf36df5e04efa226614093bb1bacc6fe131 Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@linux.intel.com>
Date: Tue, 19 May 2009 14:56:36 -0700
Subject: iwlwifi: update 5000 ucode support to version 2 of API

enable iwl driver to support 5000 ucode having version 2 of API

Signed-off-by: Jay Sternberg <jay.e.sternberg@linux.intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c
index e5ca251..9452461 100644
--- a/drivers/net/wireless/iwlwifi/iwl-5000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-5000.c
@@ -46,7 +46,7 @@
 #include "iwl-6000-hw.h"
 
 /* Highest firmware API version supported */
-#define IWL5000_UCODE_API_MAX 1
+#define IWL5000_UCODE_API_MAX 2
 #define IWL5150_UCODE_API_MAX 2
 
 /* Lowest firmware API version supported */
-- 
cgit v0.10.2


From a6c67339784db5763d6f20ae1881aeebe8c5a9f4 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 20 May 2009 02:12:56 +0200
Subject: wireless: beyond ARRAY_SIZE of intf->crypto_stats

Do not go beyond ARRAY_SIZE of intf->crypto_stats

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c
index 07d378e..7b3ee8c 100644
--- a/drivers/net/wireless/rt2x00/rt2x00debug.c
+++ b/drivers/net/wireless/rt2x00/rt2x00debug.c
@@ -138,7 +138,7 @@ void rt2x00debug_update_crypto(struct rt2x00_dev *rt2x00dev,
 
 	if (cipher == CIPHER_TKIP_NO_MIC)
 		cipher = CIPHER_TKIP;
-	if (cipher == CIPHER_NONE || cipher > CIPHER_MAX)
+	if (cipher == CIPHER_NONE || cipher >= CIPHER_MAX)
 		return;
 
 	/* Remove CIPHER_NONE index */
-- 
cgit v0.10.2


From 9b6fe313bfce27d4a261257da70196be0ac2bef5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <daenzer@vmware.com>
Date: Wed, 20 May 2009 13:32:00 +0200
Subject: drm: Copy back ioctl data to userspace regardless of return code.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes a regression from commit 9d5b3ffc42f7820e8ee07705496955e4c2c38dd9
('drm: fixup some of the ioctl function exit paths'): The vblank ioctl
needs to update the userspace parameters when interrupted by a signal,
which was prevented by the return code check. This could cause the X
server to hang in drmWaitVBlank().

Signed-off-by: Michel Dänzer <daenzer@vmware.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index f01def1..019b7c5 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -481,7 +481,7 @@ int drm_ioctl(struct inode *inode, struct file *filp,
 		}
 		retcode = func(dev, kdata, file_priv);
 
-		if ((retcode == 0) && (cmd & IOC_OUT)) {
+		if (cmd & IOC_OUT) {
 			if (copy_to_user((void __user *)arg, kdata,
 					 _IOC_SIZE(cmd)) != 0)
 				retcode = -EFAULT;
-- 
cgit v0.10.2


From 03fbdb15c14e9746c63168e3ff2c64b9c8336d33 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Wed, 20 May 2009 22:39:08 +0100
Subject: [ARM] 5519/1: amba probe: pass "struct amba_id *" instead of void *

The second argument of the probe method points to the amba_id
structure, so it's better passed with the correct type. None of the
current in-tree drivers uses the pointer, so they have only been
checked for a clean compile.

Change suggested by Russell King.

Signed-off-by: Alessandro Rubini <rubini@unipv.it>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c
index e29cdc1..a28c06d 100644
--- a/drivers/input/serio/ambakmi.c
+++ b/drivers/input/serio/ambakmi.c
@@ -107,7 +107,7 @@ static void amba_kmi_close(struct serio *io)
 	clk_disable(kmi->clk);
 }
 
-static int amba_kmi_probe(struct amba_device *dev, void *id)
+static int amba_kmi_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct amba_kmi_port *kmi;
 	struct serio *io;
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 36875dc..7d4febd 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -490,7 +490,7 @@ static void mmci_check_status(unsigned long data)
 	mod_timer(&host->timer, jiffies + HZ);
 }
 
-static int __devinit mmci_probe(struct amba_device *dev, void *id)
+static int __devinit mmci_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct mmc_platform_data *plat = dev->dev.platform_data;
 	struct mmci_host *host;
diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c
index 8261535..aaf1f75 100644
--- a/drivers/rtc/rtc-pl030.c
+++ b/drivers/rtc/rtc-pl030.c
@@ -102,7 +102,7 @@ static const struct rtc_class_ops pl030_ops = {
 	.set_alarm	= pl030_set_alarm,
 };
 
-static int pl030_probe(struct amba_device *dev, void *id)
+static int pl030_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct pl030_rtc *rtc;
 	int ret;
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index 333eec6..451fc13 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -127,7 +127,7 @@ static int pl031_remove(struct amba_device *adev)
 	return 0;
 }
 
-static int pl031_probe(struct amba_device *adev, void *id)
+static int pl031_probe(struct amba_device *adev, struct amba_id *id)
 {
 	int ret;
 	struct pl031_local *ldata;
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index e3a5ad5..cdc049d 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -665,7 +665,7 @@ static struct uart_driver amba_reg = {
 	.cons			= AMBA_CONSOLE,
 };
 
-static int pl010_probe(struct amba_device *dev, void *id)
+static int pl010_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct uart_amba_port *uap;
 	void __iomem *base;
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 8b2b970..88fdac5 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -729,7 +729,7 @@ static struct uart_driver amba_reg = {
 	.cons			= AMBA_CONSOLE,
 };
 
-static int pl011_probe(struct amba_device *dev, void *id)
+static int pl011_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct uart_amba_port *uap;
 	void __iomem *base;
diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c
index 61050ab..d1f80ba 100644
--- a/drivers/video/amba-clcd.c
+++ b/drivers/video/amba-clcd.c
@@ -437,7 +437,7 @@ static int clcdfb_register(struct clcd_fb *fb)
 	return ret;
 }
 
-static int clcdfb_probe(struct amba_device *dev, void *id)
+static int clcdfb_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct clcd_board *board = dev->dev.platform_data;
 	struct clcd_fb *fb;
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 51e6e54..9b93caf 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -28,7 +28,7 @@ struct amba_id {
 
 struct amba_driver {
 	struct device_driver	drv;
-	int			(*probe)(struct amba_device *, void *);
+	int			(*probe)(struct amba_device *, struct amba_id *);
 	int			(*remove)(struct amba_device *);
 	void			(*shutdown)(struct amba_device *);
 	int			(*suspend)(struct amba_device *, pm_message_t);
diff --git a/sound/arm/aaci.c b/sound/arm/aaci.c
index 7fbd68f..5c48e36 100644
--- a/sound/arm/aaci.c
+++ b/sound/arm/aaci.c
@@ -1074,7 +1074,7 @@ static unsigned int __devinit aaci_size_fifo(struct aaci *aaci)
 	return i;
 }
 
-static int __devinit aaci_probe(struct amba_device *dev, void *id)
+static int __devinit aaci_probe(struct amba_device *dev, struct amba_id *id)
 {
 	struct aaci *aaci;
 	int ret, i;
-- 
cgit v0.10.2


From cf8da764fc6959b7efb482f375dfef9830e98205 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 19 May 2009 18:54:22 +0000
Subject: net: fix length computation in rt_check_expire()

rt_check_expire() computes average and standard deviation of chain lengths,
but not correclty reset length to 0 at beginning of each chain.
This probably gives overflows for sum2 (and sum) on loaded machines instead
of meaningful results.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c4c60e9..869cf1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -785,7 +785,7 @@ static void rt_check_expire(void)
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
-	unsigned long length = 0, samples = 0;
+	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
 
@@ -795,9 +795,9 @@ static void rt_check_expire(void)
 	goal = (unsigned int)mult;
 	if (goal > rt_hash_mask)
 		goal = rt_hash_mask + 1;
-	length = 0;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
@@ -809,6 +809,7 @@ static void rt_check_expire(void)
 
 		if (*rthp == NULL)
 			continue;
+		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rt_is_expired(rth)) {
-- 
cgit v0.10.2


From 1ddbcb005c395518c2cd0df504cff3d4b5c85853 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 19 May 2009 20:14:28 +0000
Subject: net: fix rtable leak in net/ipv4/route.c

Alexander V. Lukyanov found a regression in 2.6.29 and made a complete
analysis found in http://bugzilla.kernel.org/show_bug.cgi?id=13339
Quoted here because its a perfect one :

begin_of_quotation
 2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately the
 patch has at least one critical flaw, and another problem.

 rt_intern_hash calculates rthi pointer, which is later used for new entry
 insertion. The same loop calculates cand pointer which is used to clean the
 list. If the pointers are the same, rtable leak occurs, as first the cand is
 removed then the new entry is appended to it.

 This leak leads to unregister_netdevice problem (usage count > 0).

 Another problem of the patch is that it tries to insert the entries in certain
 order, to facilitate counting of entries distinct by all but QoS parameters.
 Unfortunately, referencing an existing rtable entry moves it to list beginning,
 to speed up further lookups, so the carefully built order is destroyed.

 For the first problem the simplest patch it to set rthi=0 when rthi==cand, but
 it will also destroy the ordering.
end_of_quotation

Problematic commit is 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b
(net: implement emergency route cache rebulds when gc_elasticity is exceeded)

Trying to keep dst_entries ordered is too complex and breaks the fact that
order should depend on the frequency of use for garbage collection.

A possible fix is to make rt_intern_hash() simpler, and only makes
rt_check_expire() a litle bit smarter, being able to cope with an arbitrary
entries order. The added loop is running on cache hot data, while cpu
is prefetching next object, so should be unnoticied.

Reported-and-analyzed-by: Alexander V. Lukyanov <lav@yar.ru>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 869cf1c..28205e5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,7 +784,7 @@ static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct rtable *rth, *aux, **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
@@ -812,6 +812,7 @@ static void rt_check_expire(void)
 		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			prefetch(rth->u.dst.rt_next);
 			if (rt_is_expired(rth)) {
 				*rthp = rth->u.dst.rt_next;
 				rt_free(rth);
@@ -820,33 +821,30 @@ static void rt_check_expire(void)
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					/*
-					 * Only bump our length if the hash
-					 * inputs on entries n and n+1 are not
-					 * the same, we only count entries on
+					 * We only count entries on
 					 * a chain with equal hash inputs once
 					 * so that entries for different QOS
 					 * levels, and other non-hash input
 					 * attributes don't unfairly skew
 					 * the length computation
 					 */
-					if ((*rthp == NULL) ||
-					    !compare_hash_inputs(&(*rthp)->fl,
-								 &rth->fl))
-						length += ONE;
+					for (aux = rt_hash_table[i].chain;;) {
+						if (aux == rth) {
+							length += ONE;
+							break;
+						}
+						if (compare_hash_inputs(&aux->fl, &rth->fl))
+							break;
+						aux = aux->u.dst.rt_next;
+					}
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.dst.rt_next;
-				if ((*rthp == NULL) ||
-				    !compare_hash_inputs(&(*rthp)->fl,
-							 &rth->fl))
-					length += ONE;
-				continue;
-			}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
 
 			/* Cleanup aged off entries. */
 			*rthp = rth->u.dst.rt_next;
@@ -1069,7 +1067,6 @@ out:	return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
-	struct rtable	*rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -1089,7 +1086,6 @@ restart:
 	}
 
 	rthp = &rt_hash_table[hash].chain;
-	rthi = NULL;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1135,17 +1131,6 @@ restart:
 		chain_length++;
 
 		rthp = &rth->u.dst.rt_next;
-
-		/*
-		 * check to see if the next entry in the chain
-		 * contains the same hash input values as rt.  If it does
-		 * This is where we will insert into the list, instead of
-		 * at the head.  This groups entries that differ by aspects not
-		 * relvant to the hash function together, which we use to adjust
-		 * our chain length
-		 */
-		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
-			rthi = rth;
 	}
 
 	if (cand) {
@@ -1206,10 +1191,7 @@ restart:
 		}
 	}
 
-	if (rthi)
-		rt->u.dst.rt_next = rthi->u.dst.rt_next;
-	else
-		rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	rt->u.dst.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
@@ -1225,10 +1207,7 @@ restart:
 	 * previous writes to rt are comitted to memory
 	 * before making rt visible to other CPUS.
 	 */
-	if (rthi)
-		rcu_assign_pointer(rthi->u.dst.rt_next, rt);
-	else
-		rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
-- 
cgit v0.10.2


From 4f72427998b105392e60bae7a6798a0c96fe4f0a Mon Sep 17 00:00:00 2001
From: Jean-Mickael Guerin <jean-mickael.guerin@6wind.com>
Date: Wed, 20 May 2009 17:38:59 -0700
Subject: IPv6: set RTPROT_KERNEL to initial route

The use of unspecified protocol in IPv6 initial route prevents quagga to
install IPv6 default route:
# show ipv6 route
S   ::/0 [1/0] via fe80::1, eth1_0
K>* ::/0 is directly connected, lo, rej
C>* ::1/128 is directly connected, lo
C>* fe80::/64 is directly connected, eth1_0

# ip -6 route
fe80::/64 dev eth1_0  proto kernel  metric 256  mtu 1500 advmss 1440
hoplimit -1
ff00::/8 dev eth1_0  metric 256  mtu 1500 advmss 1440 hoplimit -1
unreachable default dev lo  proto none  metric -1  error -101 hoplimit 255

The attached patch ensures RTPROT_KERNEL to the default initial route
and fixes the problem for quagga.
This is similar to "ipv6: protocol for address routes"
f410a1fba7afa79d2992620e874a343fdba28332.

# show ipv6 route
S>* ::/0 [1/0] via fe80::1, eth1_0
C>* ::1/128 is directly connected, lo
C>* fe80::/64 is directly connected, eth1_0

# ip -6 route
fe80::/64 dev eth1_0  proto kernel  metric 256  mtu 1500 advmss 1440
hoplimit -1
fe80::/64 dev eth1_0  proto kernel  metric 256  mtu 1500 advmss 1440
hoplimit -1
ff00::/8 dev eth1_0  metric 256  mtu 1500 advmss 1440 hoplimit -1
default via fe80::1 dev eth1_0  proto zebra  metric 1024  mtu 1500
advmss 1440 hoplimit -1
unreachable default dev lo  proto kernel  metric -1  error -101 hoplimit 255

Signed-off-by: Jean-Mickael Guerin <jean-mickael.guerin@6wind.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1394ddb..032a5ec 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -137,6 +137,7 @@ static struct rt6_info ip6_null_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
@@ -159,6 +160,7 @@ static struct rt6_info ip6_prohibit_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
@@ -176,6 +178,7 @@ static struct rt6_info ip6_blk_hole_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
-- 
cgit v0.10.2


From 385aa9e7012d35b017981e67b3464aef4e1e7108 Mon Sep 17 00:00:00 2001
From: Thomas Reitmayr <treitmayr@devbase.at>
Date: Tue, 19 May 2009 19:35:26 +0200
Subject: [ARM] Kirkwood: Correct MPP for SATA activity/presence LEDs of QNAP
 TS-119/TS-219.

For the QNAP TS-119 and TS-219 the wrong MPPs were used for the SATA
activity/presence LEDs. The new settings make these LEDs work as
expected.

Signed-off-by: Thomas Reitmayr <treitmayr@devbase.at>
Tested-by: Martin Michlmayr <tbm@cyrius.com>
Signed-off-by: Nicolas Pitre <nico@marvell.com>

diff --git a/arch/arm/mach-kirkwood/ts219-setup.c b/arch/arm/mach-kirkwood/ts219-setup.c
index dda5743..01aa213 100644
--- a/arch/arm/mach-kirkwood/ts219-setup.c
+++ b/arch/arm/mach-kirkwood/ts219-setup.c
@@ -142,6 +142,8 @@ static unsigned int qnap_ts219_mpp_config[] __initdata = {
 	MPP1_SPI_MOSI,
 	MPP2_SPI_SCK,
 	MPP3_SPI_MISO,
+	MPP4_SATA1_ACTn,
+	MPP5_SATA0_ACTn,
 	MPP8_TW_SDA,
 	MPP9_TW_SCK,
 	MPP10_UART0_TXD,
@@ -150,10 +152,6 @@ static unsigned int qnap_ts219_mpp_config[] __initdata = {
 	MPP14_UART1_RXD,	/* PIC controller */
 	MPP15_GPIO,		/* USB Copy button */
 	MPP16_GPIO,		/* Reset button */
-	MPP20_SATA1_ACTn,
-	MPP21_SATA0_ACTn,
-	MPP22_SATA1_PRESENTn,
-	MPP23_SATA0_PRESENTn,
 	0
 };
 
-- 
cgit v0.10.2


From 60a0b8f93664621a07b93273fc8ebc29590c62f5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 21 May 2009 12:23:12 +0100
Subject: GFS2: Add a rgrp bitmap full flag

During block allocation, it is useful to know if sections of disk
are full on a finer grained basis than a single resource group.
This can make a performance difference when resource groups have
larger numbers of bitmap blocks, since we no longer have to search
them all block by block in each individual bitmap.

The full flag is set on a per-bitmap basis when it has been
searched and found to have no free space. It is then skipped in
subsequent searches until the flag is reset. The resetting
occurs if we have to drop the glock on the resource group for any
reason, or if we deallocate some blocks within that resource
group and thus free up some space.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index de50d86..dd87379 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -64,9 +64,12 @@ struct gfs2_log_element {
 	const struct gfs2_log_operations *le_ops;
 };
 
+#define GBF_FULL 1
+
 struct gfs2_bitmap {
 	struct buffer_head *bi_bh;
 	char *bi_clone;
+	unsigned long bi_flags;
 	u32 bi_offset;
 	u32 bi_start;
 	u32 bi_len;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fbacf09..23637b9 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
 
+		bi->bi_flags = 0;
 		/* small rgrp; bitmap stored completely in header block */
 		if (length == 1) {
 			bytes = bytes_left;
@@ -769,6 +770,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	}
 
 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
+		for (x = 0; x < length; x++)
+			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
 		rgd->rd_flags |= GFS2_RDF_UPTODATE;
 	}
@@ -897,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 			continue;
 		if (sdp->sd_args.ar_discard)
 			gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
+		clear_bit(GBF_FULL, &bi->bi_flags);
 		memcpy(bi->bi_clone + bi->bi_offset,
 		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 	}
@@ -1309,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 {
 	struct gfs2_bitmap *bi = NULL;
 	const u32 length = rgd->rd_length;
-	u32 blk = 0;
+	u32 blk = BFITNOENT;
 	unsigned int buf, x;
 	const unsigned int elen = *n;
-	const u8 *buffer;
+	const u8 *buffer = NULL;
 
 	*n = 0;
 	/* Find bitmap block that contains bits for goal block */
 	for (buf = 0; buf < length; buf++) {
 		bi = rgd->rd_bits + buf;
-		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
-			break;
+		/* Convert scope of "goal" from rgrp-wide to within found bit block */
+		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+			goal -= bi->bi_start * GFS2_NBBY;
+			goto do_search;
+		}
 	}
+	buf = 0;
+	goal = 0;
 
-	gfs2_assert(rgd->rd_sbd, buf < length);
-
-	/* Convert scope of "goal" from rgrp-wide to within found bit block */
-	goal -= bi->bi_start * GFS2_NBBY;
-
+do_search:
 	/* Search (up to entire) bitmap in this rgrp for allocatable block.
 	   "x <= length", instead of "x < length", because we typically start
 	   the search in the middle of a bit block, but if we can't find an
 	   allocatable block anywhere else, we want to be able wrap around and
 	   search in the first part of our first-searched bit block.  */
 	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
+		    (old_state == GFS2_BLKST_FREE))
+			goto skip;
+
 		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 		   bitmaps, so we must search the originals for that. */
 		buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1343,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 		if (blk != BFITNOENT)
 			break;
 
+		if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
+			set_bit(GBF_FULL, &bi->bi_flags);
+
 		/* Try next bitmap block (wrap back to rgrp header if at end) */
-		buf = (buf + 1) % length;
-		bi = rgd->rd_bits + buf;
+skip:
+		buf++;
+		buf %= length;
 		goal = 0;
 	}
 
-	if (blk != BFITNOENT && old_state != new_state) {
-		*n = 1;
-		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+	if (blk == BFITNOENT)
+		return blk;
+	*n = 1;
+	if (old_state == new_state)
+		goto out;
+
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+	gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
+		    bi->bi_len, blk, new_state);
+	goal = blk;
+	while (*n < elen) {
+		goal++;
+		if (goal >= (bi->bi_len * GFS2_NBBY))
+			break;
+		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+		    GFS2_BLKST_FREE)
+			break;
 		gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
-			    bi->bi_len, blk, new_state);
-		goal = blk;
-		while (*n < elen) {
-			goal++;
-			if (goal >= (bi->bi_len * GFS2_NBBY))
-				break;
-			if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
-			    GFS2_BLKST_FREE)
-				break;
-			gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
-				    bi->bi_offset, bi->bi_len, goal,
-				    new_state);
-			(*n)++;
-		}
+			    bi->bi_len, goal, new_state);
+		(*n)++;
 	}
-
-	return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
+out:
+	return (bi->bi_start * GFS2_NBBY) + blk;
 }
 
 /**
-- 
cgit v0.10.2


From 1ce97e564b628bee30b8dbb64e5e653a484308f6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 21 May 2009 15:18:19 +0100
Subject: GFS2: Be more aggressive in reclaiming unlinked inodes

This patch increases the frequency with which gfs2 looks
for unlinked, but still allocated inodes. Its the equivalent
operation to ext3's orphan list, but done with bitmaps in
the resource groups.

This also fixes a bug where a field in the rgrp was too small.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dd87379..225347f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -94,7 +94,7 @@ struct gfs2_rgrpd {
 	struct gfs2_sbd *rd_sbd;
 	unsigned int rd_bh_count;
 	u32 rd_last_alloc;
-	unsigned char rd_flags;
+	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 23637b9..ee3d5c1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -581,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 
 	rgd->rd_gl->gl_object = rgd;
 	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
-	rgd->rd_flags |= GFS2_RDF_CHECK;
 	return error;
 }
 
@@ -703,6 +702,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 
 	rg_flags = be32_to_cpu(str->rg_flags);
 	rg_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= rg_flags;
 	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -773,7 +774,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 		for (x = 0; x < length; x++)
 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
-		rgd->rd_flags |= GFS2_RDF_UPTODATE;
+		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-- 
cgit v0.10.2


From 55620c86ebc29013c0d26c5b3bc423faea299fee Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:16:17 +0900
Subject: sh: irq: Rework the SR.IMASK bitmap handling.

This tidies up how the SR.IMASK bitmap is managed, using the bitmap API
directly instead. At the same time, tidy up the irq_chip conversion a
bit.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index f43753b..3bef3c2 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -18,38 +18,17 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/irq.h>
+#include <linux/bitmap.h>
 #include <asm/system.h>
 #include <asm/irq.h>
 
 /* Bitmap of IRQ masked */
-static unsigned long imask_mask = 0x7fff;
-static int interrupt_priority = 0;
-
-static void enable_imask_irq(unsigned int irq);
-static void disable_imask_irq(unsigned int irq);
-static void shutdown_imask_irq(unsigned int irq);
-static void mask_and_ack_imask(unsigned int);
-static void end_imask_irq(unsigned int irq);
-
 #define IMASK_PRIORITY	15
 
-static unsigned int startup_imask_irq(unsigned int irq)
-{
-	/* Nothing to do */
-	return 0; /* never anything pending */
-}
+static DECLARE_BITMAP(imask_mask, IMASK_PRIORITY);
+static int interrupt_priority;
 
-static struct irq_chip imask_irq_type = {
-	.typename = "SR.IMASK",
-	.startup = startup_imask_irq,
-	.shutdown = shutdown_imask_irq,
-	.enable = enable_imask_irq,
-	.disable = disable_imask_irq,
-	.ack = mask_and_ack_imask,
-	.end = end_imask_irq
-};
-
-void static inline set_interrupt_registers(int ip)
+static inline void set_interrupt_registers(int ip)
 {
 	unsigned long __dummy;
 
@@ -72,42 +51,31 @@ void static inline set_interrupt_registers(int ip)
 		     : "t");
 }
 
-static void disable_imask_irq(unsigned int irq)
+static void mask_imask_irq(unsigned int irq)
 {
 	clear_bit(irq, &imask_mask);
 	if (interrupt_priority < IMASK_PRIORITY - irq)
 		interrupt_priority = IMASK_PRIORITY - irq;
-
 	set_interrupt_registers(interrupt_priority);
 }
 
-static void enable_imask_irq(unsigned int irq)
+static void unmask_imask_irq(unsigned int irq)
 {
 	set_bit(irq, &imask_mask);
-	interrupt_priority = IMASK_PRIORITY - ffz(imask_mask);
-
+	interrupt_priority = IMASK_PRIORITY -
+		find_first_zero_bit(imask_mask, IMASK_PRIORITY);
 	set_interrupt_registers(interrupt_priority);
 }
 
-static void mask_and_ack_imask(unsigned int irq)
-{
-	disable_imask_irq(irq);
-}
-
-static void end_imask_irq(unsigned int irq)
-{
-	if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)))
-		enable_imask_irq(irq);
-}
-
-static void shutdown_imask_irq(unsigned int irq)
-{
-	/* Nothing to do */
-}
+static struct irq_chip imask_irq_chip = {
+	.typename	= "SR.IMASK",
+	.mask		= mask_imask_irq,
+	.unmask		= unmask_imask_irq,
+	.mask_ack	= mask_imask_irq,
+};
 
 void make_imask_irq(unsigned int irq)
 {
-	disable_irq_nosync(irq);
-	irq_desc[irq].chip = &imask_irq_type;
-	enable_irq(irq);
+	set_irq_chip_and_handler_name(irq, &imask_irq_chip,
+				      handle_level_irq, "level");
 }
-- 
cgit v0.10.2


From fa1d43ab451084785153d37ae559c4fdd1546a5b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:26:16 +0900
Subject: sh: irq: Convert from irq_desc[] to irq_to_desc().

This converts a few places that were using the old irq_desc[] array over
to the shiny new irq_to_desc() helper. Preperatory work for sparse irq
support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index a619eaf..6c092f1 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -152,14 +152,6 @@ static void end_intc_irq(unsigned int irq)
 	enable_intc_irq(irq);
 }
 
-/* For future use, if we ever support IRLM=0) */
-void make_intc_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	irq_desc[irq].chip = &intc_irq_type;
-	disable_intc_irq(irq);
-}
-
 void __init plat_irq_setup(void)
 {
 	unsigned long long __dummy0, __dummy1=~0x00000000100000f0;
@@ -174,7 +166,7 @@ void __init plat_irq_setup(void)
 
 	/* Set default: per-line enable/disable, priority driven ack/eoi */
 	for (i = 0; i < NR_INTC_IRQS; i++)
-		irq_desc[i].chip = &intc_irq_type;
+		set_irq_chip_and_handler(i, &intc_irq_type, handle_level_irq);
 
 
 	/* Disable all interrupts and set all priorities to 0 to avoid trouble */
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 3f1372e..878532b 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -31,39 +31,64 @@ void ack_bad_irq(unsigned int irq)
 }
 
 #if defined(CONFIG_PROC_FS)
+/*
+ * /proc/interrupts printing:
+ */
+static int show_other_interrupts(struct seq_file *p, int prec)
+{
+	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
+	return 0;
+}
+
 int show_interrupts(struct seq_file *p, void *v)
 {
-	int i = *(loff_t *) v, j;
-	struct irqaction * action;
-	unsigned long flags;
+	unsigned long flags, any_count = 0;
+	int i = *(loff_t *)v, j, prec;
+	struct irqaction *action;
+	struct irq_desc *desc;
+
+	if (i > nr_irqs)
+		return 0;
+
+	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+		j *= 10;
+
+	if (i == nr_irqs)
+		return show_other_interrupts(p, prec);
 
 	if (i == 0) {
-		seq_puts(p, "           ");
+		seq_printf(p, "%*s", prec + 8, "");
 		for_each_online_cpu(j)
-			seq_printf(p, "CPU%d       ",j);
+			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	if (i < sh_mv.mv_nr_irqs) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
-		if (!action)
-			goto unlock;
-		seq_printf(p, "%3d: ",i);
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-		seq_printf(p, " %14s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
-		seq_printf(p, "  %s", action->name);
+	desc = irq_to_desc(i);
+	if (!desc)
+		return 0;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	for_each_online_cpu(j)
+		any_count |= kstat_irqs_cpu(i, j);
+	action = desc->action;
+	if (!action && !any_count)
+		goto out;
 
-		for (action=action->next; action; action = action->next)
+	seq_printf(p, "%*d: ", prec, i);
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+	seq_printf(p, " %14s", desc->chip->name);
+	seq_printf(p, "-%-8s", desc->name);
+
+	if (action) {
+		seq_printf(p, "  %s", action->name);
+		while ((action = action->next) != NULL)
 			seq_printf(p, ", %s", action->name);
-		seq_putc(p, '\n');
-unlock:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == sh_mv.mv_nr_irqs)
-		seq_printf(p, "Err: %10u\n", atomic_read(&irq_err_count));
+	}
 
+	seq_putc(p, '\n');
+out:
+	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
 #endif
diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c
index 2d868c6..fcc5de3 100644
--- a/arch/sh/kernel/sh_ksyms_32.c
+++ b/arch/sh/kernel/sh_ksyms_32.c
@@ -23,9 +23,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 /* platform dependent support */
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(irq_desc);
-EXPORT_SYMBOL(no_irq_chip);
-
 EXPORT_SYMBOL(strlen);
 
 /* PCI exports */
@@ -40,11 +37,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__copy_user);
-
-#ifdef CONFIG_MMU
-EXPORT_SYMBOL(get_vm_area);
-#endif
-
 EXPORT_SYMBOL(__udelay);
 EXPORT_SYMBOL(__ndelay);
 EXPORT_SYMBOL(__const_udelay);
-- 
cgit v0.10.2


From 05ff3004d278b54760abd71530506d803182c71d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:28:33 +0900
Subject: sh: irq: Teach ipr and intc about dynamically allocating irq_descs.

This hooks in irq_to_desc_alloc_cpu() to the necessary code paths in the
intc and ipr controller registration paths. As these are the primary call
paths for all SH CPUs, this alone will make all CPUs sparse IRQ ready.

There is the added benefit now that each CPU contains specific IPR and
INTC tables, so only the vectors with interrupt sources backing them will
ever see an irq_desc instantiation. This effectively packs irq_desc
down to match the CPU, rather than padding NR_IRQS out to cover the valid
vector range.

Boards with extra sources will still have to fiddle with the nr_irqs
setting, but they can continue doing so through the machvec as before.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index 3eb17ee..fa0c846 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -59,10 +59,18 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
+		struct irq_desc *irq_desc;
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
+		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
+		if (unlikely(!irq_desc)) {
+			printk(KERN_INFO "can not get irq_desc for %d\n",
+			       p->irq);
+			continue;
+		}
+
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
 				      handle_level_irq, "level");
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index 12d13d9..098b767 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -671,7 +671,7 @@ unsigned int intc_evt2irq(unsigned int vector)
 
 void __init register_intc_controller(struct intc_desc *desc)
 {
-	unsigned int i, k, smp;
+	unsigned int i, k, smp, cpu = smp_processor_id();
 	struct intc_desc_int *d;
 
 	d = alloc_bootmem(sizeof(*d));
@@ -770,11 +770,19 @@ void __init register_intc_controller(struct intc_desc *desc)
 	/* register the vectors one by one */
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
+		unsigned int irq = evt2irq(vect->vect);
+		struct irq_desc *irq_desc;
 
 		if (!vect->enum_id)
 			continue;
 
-		intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect));
+		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
+		if (unlikely(!irq_desc)) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+			continue;
+		}
+
+		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
 }
 
-- 
cgit v0.10.2


From d8586ba6e1415150e1bab89f0a05447bb6f2d6d5 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:36:13 +0900
Subject: sh: irq: Provide an arch_probe_nr_irqs() that wraps the machvec def.

This is just a simple arch_probe_nr_irqs() stub that wraps to the
platform defined number of IRQs. This can be made gradually more
intelligent based on what we can infer from the INTC tables and so on.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 878532b..3d09062 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -279,3 +279,11 @@ void __init init_IRQ(void)
 
 	irq_ctx_init(smp_processor_id());
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+	nr_irqs = sh_mv.mv_nr_irqs;
+	return 0;
+}
+#endif
-- 
cgit v0.10.2


From 30cff215b54b20d201a2bd7d50361c4c14700281 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 20 May 2009 13:44:40 +0000
Subject: sh: clkfwk branch compile fix for clock-sh7722

Fix clkfwk branch compile error in clock-sh7722.c.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index a98d7da..ae78efd 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -892,7 +892,7 @@ int __init arch_clk_init(void)
 	struct clk *clk;
 	int i;
 
-	clk_cpg_init();
+	cpg_clk_init();
 
 	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
-- 
cgit v0.10.2


From 98f32602d42951e61a059685f842aa7d778ffab0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Thu, 21 May 2009 20:33:58 +0100
Subject: hugh: update email address

My old address will shut down in a few days time: remove it from the tree,
and add a tmpfs (shmem filesystem) maintainer entry with the new address.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 222437e..3015da0 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -133,4 +133,4 @@ RAM/SWAP in 10240 inodes and it is only accessible by root.
 Author:
    Christoph Rohland <cr@sap.com>, 1.12.01
 Updated:
-   Hugh Dickins <hugh@veritas.com>, 4 June 2007
+   Hugh Dickins, 4 June 2007
diff --git a/MAINTAINERS b/MAINTAINERS
index 2b349ba..64ea80e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5579,6 +5579,14 @@ M:	ian@mnementh.co.uk
 S:	Maintained
 F:	drivers/mmc/host/tmio_mmc.*
 
+TMPFS (SHMEM FILESYSTEM)
+P:	Hugh Dickins
+M:	hugh.dickins@tiscali.co.uk
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	include/linux/shmem_fs.h
+F:	mm/shmem.c
+
 TPM DEVICE DRIVER
 P:	Debora Velarde
 M:	debora@linux.vnet.ibm.com
diff --git a/mm/rmap.c b/mm/rmap.c
index 1652166..23122af 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,7 +14,7 @@
  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
- * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ * Contributions by Hugh Dickins 2003, 2004
  */
 
 /*
-- 
cgit v0.10.2


From 85bc26211c6a2c6e82c2403697f8ce44e9587215 Mon Sep 17 00:00:00 2001
From: Martin Michlmayr <tbm@cyrius.com>
Date: Tue, 19 May 2009 12:30:52 +0200
Subject: [ARM] Orion: Remove explicit name for platform device resources

Remove explicit names from platform device resources since they will
automatically be named after the platform device they're associated
with.

Signed-off-by: Martin Michlmayr <tbm@cyrius.com>
Acked-by: Russell King <linux@arm.linux.org.uk>
Signed-off-by: Nicolas Pitre <nico@marvell.com>

diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c
index eeb0024..3fab82a 100644
--- a/arch/arm/mach-kirkwood/common.c
+++ b/arch/arm/mach-kirkwood/common.c
@@ -386,12 +386,10 @@ static struct mv64xxx_i2c_pdata kirkwood_i2c_pdata = {
 
 static struct resource kirkwood_i2c_resources[] = {
 	{
-		.name	= "i2c",
 		.start	= I2C_PHYS_BASE,
 		.end	= I2C_PHYS_BASE + 0x1f,
 		.flags	= IORESOURCE_MEM,
 	}, {
-		.name	= "i2c",
 		.start	= IRQ_KIRKWOOD_TWSI,
 		.end	= IRQ_KIRKWOOD_TWSI,
 		.flags	= IORESOURCE_IRQ,
diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c
index 9ba5950..0d88eea 100644
--- a/arch/arm/mach-mv78xx0/common.c
+++ b/arch/arm/mach-mv78xx0/common.c
@@ -532,12 +532,10 @@ static struct mv64xxx_i2c_pdata mv78xx0_i2c_0_pdata = {
 
 static struct resource mv78xx0_i2c_0_resources[] = {
 	{
-		.name   = "i2c 0 base",
 		.start  = I2C_0_PHYS_BASE,
 		.end    = I2C_0_PHYS_BASE + 0x1f,
 		.flags  = IORESOURCE_MEM,
 	}, {
-		.name   = "i2c 0 irq",
 		.start  = IRQ_MV78XX0_I2C_0,
 		.end    = IRQ_MV78XX0_I2C_0,
 		.flags  = IORESOURCE_IRQ,
@@ -567,12 +565,10 @@ static struct mv64xxx_i2c_pdata mv78xx0_i2c_1_pdata = {
 
 static struct resource mv78xx0_i2c_1_resources[] = {
 	{
-		.name   = "i2c 1 base",
 		.start  = I2C_1_PHYS_BASE,
 		.end    = I2C_1_PHYS_BASE + 0x1f,
 		.flags  = IORESOURCE_MEM,
 	}, {
-		.name   = "i2c 1 irq",
 		.start  = IRQ_MV78XX0_I2C_1,
 		.end    = IRQ_MV78XX0_I2C_1,
 		.flags  = IORESOURCE_IRQ,
diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c
index 6af99dd..a51fb9d 100644
--- a/arch/arm/mach-orion5x/common.c
+++ b/arch/arm/mach-orion5x/common.c
@@ -248,12 +248,10 @@ static struct mv64xxx_i2c_pdata orion5x_i2c_pdata = {
 
 static struct resource orion5x_i2c_resources[] = {
 	{
-		.name	= "i2c base",
 		.start	= I2C_PHYS_BASE,
 		.end	= I2C_PHYS_BASE + 0x1f,
 		.flags	= IORESOURCE_MEM,
 	}, {
-		.name	= "i2c irq",
 		.start	= IRQ_ORION5X_I2C,
 		.end	= IRQ_ORION5X_I2C,
 		.flags	= IORESOURCE_IRQ,
-- 
cgit v0.10.2


From c40499e04b2005e61f989824251f9343b55f96bb Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 21 May 2009 15:04:15 -0700
Subject: gigaset: beyond ARRAY_SIZE of iwb->data

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/isdn/gigaset/isocdata.c b/drivers/isdn/gigaset/isocdata.c
index b171e75..29808c4 100644
--- a/drivers/isdn/gigaset/isocdata.c
+++ b/drivers/isdn/gigaset/isocdata.c
@@ -175,7 +175,7 @@ int gigaset_isowbuf_getbytes(struct isowbuf_t *iwb, int size)
 		return -EINVAL;
 	}
 	src = iwb->read;
-	if (unlikely(limit > BAS_OUTBUFSIZE + BAS_OUTBUFPAD ||
+	if (unlikely(limit >= BAS_OUTBUFSIZE + BAS_OUTBUFPAD ||
 		     (read < src && limit >= src))) {
 		pr_err("isoc write buffer frame reservation violated\n");
 		return -EFAULT;
-- 
cgit v0.10.2


From 5b5f792a6a9a2f9ae812d151ed621f72e99b1725 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 May 2009 15:07:12 -0700
Subject: pktgen: do not access flows[] beyond its length

typo -- pkt_dev->nflows is for stats only, the number of concurrent
flows is stored in cflows.

Reported-By: Vladimir Ivashchenko <hazard@francoudi.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3779c14..0666a82 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2447,7 +2447,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev)
 	if (pkt_dev->cflows) {
 		/* let go of the SAs if we have them */
 		int i = 0;
-		for (;  i < pkt_dev->nflows; i++){
+		for (;  i < pkt_dev->cflows; i++) {
 			struct xfrm_state *x = pkt_dev->flows[i].x;
 			if (x) {
 				xfrm_state_put(x);
-- 
cgit v0.10.2


From 3ed18d76d959e5cbfa5d70c8f7ba95476582a556 Mon Sep 17 00:00:00 2001
From: Robert Olsson <robert.olsson@its.uu.se>
Date: Thu, 21 May 2009 15:20:59 -0700
Subject: ipv4: Fix oops with FIB_TRIE

It seems we can fix this by disabling preemption while we re-balance the
trie. This is with the CONFIG_CLASSIC_RCU. It's been stress-tested at high
loads continuesly taking a full BGP table up/down via iproute -batch.

Note. fib_trie is not updated for CONFIG_PREEMPT_RCU

Reported-by: Andrei Popa
Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec0ae49..33c7c85 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key)
 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	int wasfull;
-	t_key cindex, key = tn->key;
+	t_key cindex, key;
 	struct tnode *tp;
 
+	preempt_disable();
+	key = tn->key;
+
 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	if (IS_TNODE(tn))
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
 
+	preempt_enable();
 	return (struct node *)tn;
 }
 
-- 
cgit v0.10.2


From 0975ecba3b670df7c488a5e0e6fe9f1f370a8ad8 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 21 May 2009 15:22:02 -0700
Subject: RxRPC: Error handling for rxrpc_alloc_connection()

rxrpc_alloc_connection() doesn't return an error code on failure, it just
returns NULL.  IS_ERR(NULL) is false.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 0f1218b..67e38a0 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -343,9 +343,9 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
 		/* not yet present - create a candidate for a new connection
 		 * and then redo the check */
 		conn = rxrpc_alloc_connection(gfp);
-		if (IS_ERR(conn)) {
-			_leave(" = %ld", PTR_ERR(conn));
-			return PTR_ERR(conn);
+		if (!conn) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
 		}
 
 		conn->trans = trans;
@@ -508,9 +508,9 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
 		/* not yet present - create a candidate for a new connection and then
 		 * redo the check */
 		candidate = rxrpc_alloc_connection(gfp);
-		if (IS_ERR(candidate)) {
-			_leave(" = %ld", PTR_ERR(candidate));
-			return PTR_ERR(candidate);
+		if (!candidate) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
 		}
 
 		candidate->trans = trans;
-- 
cgit v0.10.2


From 703a3b8e5c01cf6fb33c6d8dc99905f889a4e992 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 21 May 2009 22:21:53 +0000
Subject: [CIFS] fix posix open regression

Posix open code was not properly adding the file to the
list of open files.  Fix  allocating cifsFileInfo
more than once, and adding twice to flist and tlist.
Also fix mode setting to be done in one place in these
paths.

Signed-off-by: Steve French <sfrench@us.ibm.com>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Luca Tettamanti <kronos.it@gmail.com>

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 11431ed..f49d684 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -225,6 +225,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (!(oflags & FMODE_READ))
 		write_only = true;
 
+	mode &= ~current_umask();
 	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
 			pnetfid, presp_data, &oplock, full_path,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -310,7 +311,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current_umask();
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
@@ -336,7 +336,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			else /* success, no need to query */
 				goto cifs_create_set_dentry;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
-			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			 (rc != -EOPNOTSUPP) && (rc != -EINVAL))
 			goto cifs_create_out;
 		/* else fallthrough to retry, using older open call, this is
 		   case where server does not support this SMB level, and
@@ -609,7 +609,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	int xid;
 	int rc = 0; /* to get around spurious gcc warning, set to zero here */
 	int oplock = 0;
-	int mode;
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
@@ -660,13 +659,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 	if (pTcon->unix_ext) {
 		if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
-				(nd->flags & LOOKUP_OPEN)) {
+		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) {
 			if (!((nd->intent.open.flags & O_CREAT) &&
 					(nd->intent.open.flags & O_EXCL))) {
-				mode = nd->intent.open.create_mode &
-						~current_umask();
 				rc = cifs_posix_open(full_path, &newInode,
-					parent_dir_inode->i_sb, mode,
+					parent_dir_inode->i_sb,
+					nd->intent.open.create_mode,
 					nd->intent.open.flags, &oplock,
 					&fileHandle, xid);
 				/*
@@ -681,6 +679,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 				 */
 				if ((rc != -EINVAL) && (rc != -EOPNOTSUPP))
 					posix_open = true;
+				else
+					pTcon->broken_posix_open = true;
 			}
 		}
 		if (!posix_open)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 38c06f8..302ea15 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -130,10 +130,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
 			struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
 {
 
-	file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-	if (file->private_data == NULL)
-		return -ENOMEM;
-	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
 	write_lock(&GlobalSMBSeslock);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
@@ -184,6 +180,38 @@ psx_client_can_cache:
 	return 0;
 }
 
+static struct cifsFileInfo *
+cifs_fill_filedata(struct file *file)
+{
+	struct list_head *tmp;
+	struct cifsFileInfo *pCifsFile = NULL;
+	struct cifsInodeInfo *pCifsInode = NULL;
+
+	/* search inode for this file and fill in file->private_data */
+	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+	read_lock(&GlobalSMBSeslock);
+	list_for_each(tmp, &pCifsInode->openFileList) {
+		pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
+		if ((pCifsFile->pfile == NULL) &&
+		    (pCifsFile->pid == current->tgid)) {
+			/* mode set in cifs_create */
+
+			/* needed for writepage */
+			pCifsFile->pfile = file;
+			file->private_data = pCifsFile;
+			break;
+		}
+	}
+	read_unlock(&GlobalSMBSeslock);
+
+	if (file->private_data != NULL) {
+		return pCifsFile;
+	} else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
+			cERROR(1, ("could not find file instance for "
+				   "new file %p", file));
+	return NULL;
+}
+
 /* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
@@ -258,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
-	struct list_head *tmp;
 	char *full_path = NULL;
 	int desiredAccess;
 	int disposition;
@@ -270,32 +297,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	tcon = cifs_sb->tcon;
 
-	/* search inode for this file and fill in file->private_data */
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &pCifsInode->openFileList) {
-		pCifsFile = list_entry(tmp, struct cifsFileInfo,
-				       flist);
-		if ((pCifsFile->pfile == NULL) &&
-		    (pCifsFile->pid == current->tgid)) {
-			/* mode set in cifs_create */
-
-			/* needed for writepage */
-			pCifsFile->pfile = file;
-
-			file->private_data = pCifsFile;
-			break;
-		}
-	}
-	read_unlock(&GlobalSMBSeslock);
-
-	if (file->private_data != NULL) {
-		rc = 0;
+	pCifsFile = cifs_fill_filedata(file);
+	if (pCifsFile) {
 		FreeXid(xid);
-		return rc;
-	} else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
-			cERROR(1, ("could not find file instance for "
-				   "new file %p", file));
+		return 0;
+	}
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
@@ -325,6 +332,7 @@ int cifs_open(struct inode *inode, struct file *file)
 			/* no need for special case handling of setting mode
 			   on read only files needed here */
 
+			pCifsFile = cifs_fill_filedata(file);
 			cifs_posix_open_inode_helper(inode, file, pCifsInode,
 						     pCifsFile, oplock, netfid);
 			goto out;
-- 
cgit v0.10.2


From 5789ba3bd0a3cd20df5980ebf03358f2eb44fd67 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 15:47:06 -0400
Subject: IMA: Minimal IMA policy and boot param for TCB IMA policy

The IMA TCB policy is dangerous.  A normal use can use all of a system's
memory (which cannot be freed) simply by building and running lots of
executables.  The TCB policy is also nearly useless because logging in as root
often causes a policy violation when dealing with utmp, thus rendering the
measurements meaningless.

There is no good fix for this in the kernel.  A full TCB policy would need to
be loaded in userspace using LSM rule matching to get both a protected and
useful system.  But, if too little is measured before userspace can load a real
policy one again ends up with a meaningless set of measurements.  One option
would be to put the policy load inside the initrd in order to get it early
enough in the boot sequence to be useful, but this runs into trouble with the
LSM.  For IMA to measure the LSM policy and the LSM policy loading mechanism
it needs rules to do so, but we already talked about problems with defaulting
to such broad rules....

IMA also depends on the files being measured to be on an FS which implements
and supports i_version.  Since the only FS with this support (ext4) doesn't
even use it by default it seems silly to have any IMA rules by default.

This should reduce the performance overhead of IMA to near 0 while still
letting users who choose to configure their machine as such to inclue the
ima_tcb kernel paramenter and get measurements during boot before they can
load a customized, reasonable policy in userspace.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbf..d9a24a0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -914,6 +914,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Formt: { "sha1" | "md5" }
 			default: "sha1"
 
+	ima_tcb		[IMA]
+			Load a policy which meets the needs of the Trusted
+			Computing Base.  This means IMA will measure all
+			programs exec'd, files mmap'd for exec, and all files
+			opened for read by uid=0.
+
 	in2000=		[HW,SCSI]
 			See header of drivers/scsi/in2000.c.
 
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 31d677f..4719bbf 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -45,9 +45,17 @@ struct ima_measure_rule_entry {
 	} lsm[MAX_LSM_RULES];
 };
 
-/* Without LSM specific knowledge, the default policy can only be
+/*
+ * Without LSM specific knowledge, the default policy can only be
  * written in terms of .action, .func, .mask, .fsmagic, and .uid
  */
+
+/*
+ * The minimum rule set to allow for full TCB coverage.  Measures all files
+ * opened or mmap for exec and everything read by root.  Dangerous because
+ * normal users can easily run the machine out of memory simply building
+ * and running executables.
+ */
 static struct ima_measure_rule_entry default_rules[] = {
 	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
@@ -59,6 +67,8 @@ static struct ima_measure_rule_entry default_rules[] = {
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
+	{.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0,
+	 .flags = IMA_FUNC | IMA_MASK | IMA_UID},
 };
 
 static LIST_HEAD(measure_default_rules);
@@ -67,6 +77,14 @@ static struct list_head *ima_measure;
 
 static DEFINE_MUTEX(ima_measure_mutex);
 
+static bool ima_use_tcb __initdata;
+static int __init default_policy_setup(char *str)
+{
+	ima_use_tcb = 1;
+	return 1;
+}
+__setup("ima_tcb", default_policy_setup);
+
 /**
  * ima_match_rules - determine whether an inode matches the measure rule.
  * @rule: a pointer to a rule
@@ -162,9 +180,15 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask)
  */
 void ima_init_policy(void)
 {
-	int i;
+	int i, entries;
+
+	/* if !ima_use_tcb set entries = 0 so we load NO default rules */
+	if (ima_use_tcb)
+		entries = ARRAY_SIZE(default_rules);
+	else
+		entries = 0;
 
-	for (i = 0; i < ARRAY_SIZE(default_rules); i++)
+	for (i = 0; i < entries; i++)
 		list_add_tail(&default_rules[i].list, &measure_default_rules);
 	ima_measure = &measure_default_rules;
 }
-- 
cgit v0.10.2


From 932995f0ce52525b32ff5127b522c2c164de3810 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 15:43:32 -0400
Subject: IMA: Add __init notation to ima functions

A number of IMA functions only used during init are not marked with __init.
Add those notations so they are freed automatically.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 50d572b..63003a6 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -103,7 +103,7 @@ int ima_calc_template_hash(int template_len, void *template, char *digest)
 	return rc;
 }
 
-static void ima_pcrread(int idx, u8 *pcr)
+static void __init ima_pcrread(int idx, u8 *pcr)
 {
 	if (!ima_used_chip)
 		return;
@@ -115,7 +115,7 @@ static void ima_pcrread(int idx, u8 *pcr)
 /*
  * Calculate the boot aggregate hash
  */
-int ima_calc_boot_aggregate(char *digest)
+int __init ima_calc_boot_aggregate(char *digest)
 {
 	struct hash_desc desc;
 	struct scatterlist sg;
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 3305a96..7039b14 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -319,7 +319,7 @@ static struct file_operations ima_measure_policy_ops = {
 	.release = ima_release_policy
 };
 
-int ima_fs_init(void)
+int __init ima_fs_init(void)
 {
 	ima_dir = securityfs_create_dir("ima", NULL);
 	if (IS_ERR(ima_dir))
diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c
index ec79f1e..b8dd693 100644
--- a/security/integrity/ima/ima_iint.c
+++ b/security/integrity/ima/ima_iint.c
@@ -196,7 +196,7 @@ static void init_once(void *foo)
 	kref_set(&iint->refcount, 1);
 }
 
-void ima_iintcache_init(void)
+void __init ima_iintcache_init(void)
 {
 	iint_cache =
 	    kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0,
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 0b0bb8c..a40da7a 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -38,7 +38,7 @@ int ima_used_chip;
  * a different value.) Violations add a zero entry to the measurement
  * list and extend the aggregate PCR value with ff...ff's.
  */
-static void ima_add_boot_aggregate(void)
+static void __init ima_add_boot_aggregate(void)
 {
 	struct ima_template_entry *entry;
 	const char *op = "add_boot_aggregate";
@@ -71,7 +71,7 @@ err_out:
 			    audit_cause, result, 0);
 }
 
-int ima_init(void)
+int __init ima_init(void)
 {
 	u8 pcr_i[IMA_DIGEST_SIZE];
 	int rc;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 4719bbf..e127839 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -178,7 +178,7 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask)
  * ima_measure points to either the measure_default_rules or the
  * the new measure_policy_rules.
  */
-void ima_init_policy(void)
+void __init ima_init_policy(void)
 {
 	int i, entries;
 
-- 
cgit v0.10.2


From b9fc745db833bbf74b4988493b8cd902a84c9415 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 13:25:57 -0400
Subject: integrity: path_check update

- Add support in ima_path_check() for integrity checking without
incrementing the counts. (Required for nfsd.)
- rename and export opencount_get to ima_counts_get
- replace ima_shm_check calls with ima_counts_get
- export ima_path_check

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/exec.c b/fs/exec.c
index 998e856..618d6d1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
-	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN,
+			       IMA_COUNT_UPDATE);
 	if (error)
 		goto exit;
 
@@ -680,7 +681,7 @@ struct file *open_exec(const char *name)
 	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
-	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE);
 	if (err)
 		goto out_path_put;
 
diff --git a/fs/namei.c b/fs/namei.c
index 78f253c..b05a2b1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
 		if (!err)
-			err = ima_path_check(&nd->path, MAY_EXEC);
+			err = ima_path_check(&nd->path, MAY_EXEC,
+				             IMA_COUNT_UPDATE);
  		if (err)
 			break;
 
@@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag)
 		return error;
 
 	error = ima_path_check(path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+			       IMA_COUNT_UPDATE);
 	if (error)
 		return error;
 	/*
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0e2aa45..b1b827d 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -13,14 +13,17 @@
 #include <linux/fs.h>
 struct linux_binprm;
 
+#define IMA_COUNT_UPDATE 1
+#define IMA_COUNT_LEAVE 0
+
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_inode_alloc(struct inode *inode);
 extern void ima_inode_free(struct inode *inode);
-extern int ima_path_check(struct path *path, int mask);
+extern int ima_path_check(struct path *path, int mask, int update_counts);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
-extern void ima_shm_check(struct file *file);
+extern void ima_counts_get(struct file *file);
 
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
@@ -38,7 +41,7 @@ static inline void ima_inode_free(struct inode *inode)
 	return;
 }
 
-static inline int ima_path_check(struct path *path, int mask)
+static inline int ima_path_check(struct path *path, int mask, int update_counts)
 {
 	return 0;
 }
@@ -53,7 +56,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot)
 	return 0;
 }
 
-static inline void ima_shm_check(struct file *file)
+static inline void ima_counts_get(struct file *file)
 {
 	return;
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index faa46da..47b4642 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -384,7 +384,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto no_file;
-	ima_shm_check(file);
+	ima_counts_get(file);
 
 	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (id < 0) {
@@ -891,7 +891,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
 	file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
 	if (!file)
 		goto out_free;
-	ima_shm_check(file);
+	ima_counts_get(file);
 
 	file->private_data = sfd;
 	file->f_mapping = shp->shm_file->f_mapping;
diff --git a/mm/shmem.c b/mm/shmem.c
index b25f95c..a817f75 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2684,7 +2684,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ima_shm_check(file);
+	ima_counts_get(file);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	vma->vm_file = file;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index c4228c0..a2eb233 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -125,6 +125,15 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
 	return rc;
 }
 
+static void ima_update_counts(struct ima_iint_cache *iint, int mask)
+{
+	iint->opencount++;
+	if ((mask & MAY_WRITE) || (mask == 0))
+		iint->writecount++;
+	else if (mask & (MAY_READ | MAY_EXEC))
+		iint->readcount++;
+}
+
 /**
  * ima_path_check - based on policy, collect/store measurement.
  * @path: contains a pointer to the path to be measured
@@ -143,7 +152,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
  * Return 0 on success, an error code on failure.
  * (Based on the results of appraise_measurement().)
  */
-int ima_path_check(struct path *path, int mask)
+int ima_path_check(struct path *path, int mask, int update_counts)
 {
 	struct inode *inode = path->dentry->d_inode;
 	struct ima_iint_cache *iint;
@@ -157,11 +166,8 @@ int ima_path_check(struct path *path, int mask)
 		return 0;
 
 	mutex_lock(&iint->mutex);
-	iint->opencount++;
-	if ((mask & MAY_WRITE) || (mask == 0))
-		iint->writecount++;
-	else if (mask & (MAY_READ | MAY_EXEC))
-		iint->readcount++;
+	if (update_counts)
+		ima_update_counts(iint, mask);
 
 	rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK);
 	if (rc < 0)
@@ -197,6 +203,7 @@ out:
 	kref_put(&iint->refcount, iint_free);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ima_path_check);
 
 static int process_measurement(struct file *file, const unsigned char *filename,
 			       int mask, int function)
@@ -225,7 +232,16 @@ out:
 	return rc;
 }
 
-static void opencount_get(struct file *file)
+/*
+ * ima_opens_get - increment file counts
+ *
+ * - for IPC shm and shmat file.
+ * - for nfsd exported files.
+ *
+ * Increment the counts for these files to prevent unnecessary
+ * imbalance messages.
+ */
+void ima_counts_get(struct file *file)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ima_iint_cache *iint;
@@ -237,8 +253,14 @@ static void opencount_get(struct file *file)
 		return;
 	mutex_lock(&iint->mutex);
 	iint->opencount++;
+	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		iint->readcount++;
+
+	if (file->f_mode & FMODE_WRITE)
+		iint->writecount++;
 	mutex_unlock(&iint->mutex);
 }
+EXPORT_SYMBOL_GPL(ima_counts_get);
 
 /**
  * ima_file_mmap - based on policy, collect/store measurement.
@@ -263,18 +285,6 @@ int ima_file_mmap(struct file *file, unsigned long prot)
 	return 0;
 }
 
-/*
- * ima_shm_check - IPC shm and shmat create/fput a file
- *
- * Maintain the opencount for these files to prevent unnecessary
- * imbalance messages.
- */
-void ima_shm_check(struct file *file)
-{
-	opencount_get(file);
-	return;
-}
-
 /**
  * ima_bprm_check - based on policy, collect/store measurement.
  * @bprm: contains the linux_binprm structure
-- 
cgit v0.10.2


From c9d9ac525a0285a5b5ad9c3f9aa8b7c1753e6121 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 13:25:58 -0400
Subject: integrity: move ima_counts_get

Based on discussion on lkml (Andrew Morton and Eric Paris),
move ima_counts_get down a layer into shmem/hugetlb__file_setup().
Resolves drm shmem_file_setup() usage case as well.

HD comment:
  I still think you're doing this at the wrong level, but recognize
  that you probably won't be persuaded until a few more users of
  alloc_file() emerge, all wanting your ima_counts_get().

  Resolving GEM's shmem_file_setup() is an improvement, so I'll say

Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 153d968..ccc62de 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -997,6 +998,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
+	ima_counts_get(file);
 
 	return file;
 
diff --git a/ipc/shm.c b/ipc/shm.c
index 47b4642..5608183 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -384,7 +384,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto no_file;
-	ima_counts_get(file);
 
 	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (id < 0) {
diff --git a/mm/shmem.c b/mm/shmem.c
index a817f75..0132fbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	if (error)
 		goto close_file;
 #endif
+	ima_counts_get(file);
 	return file;
 
 close_file:
@@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ima_counts_get(file);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	vma->vm_file = file;
-- 
cgit v0.10.2


From 6470c077cae12227318f40f3e6d756caadcce4b0 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 21 May 2009 18:42:54 +0200
Subject: smack: do not beyond ARRAY_SIZE of data

Do not go beyond ARRAY_SIZE of data

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 904af34..8d3c2a0 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -776,7 +776,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 	struct sockaddr_in newname;
 	char smack[SMK_LABELLEN];
 	char *sp;
-	char data[SMK_NETLBLADDRMAX];
+	char data[SMK_NETLBLADDRMAX + 1];
 	char *host = (char *)&newname.sin_addr.s_addr;
 	int rc;
 	struct netlbl_audit audit_info;
-- 
cgit v0.10.2


From 7fc1e5c15fde0fa9d2c08441f6898a9e51593d47 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 21 May 2009 18:25:23 +0000
Subject: sh: clkfwk: beyond ARRAY_SIZE of onchip_ops for sh7722.

Do not go beyond ARRAY_SIZE of onchip_ops

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index ae78efd..c090c9a 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -883,7 +883,7 @@ struct clk_ops *onchip_ops[] = {
 void __init
 arch_init_clk_ops(struct clk_ops **ops, int type)
 {
-	BUG_ON(type < 0 || type > ARRAY_SIZE(onchip_ops));
+	BUG_ON(type < 0 || type >= ARRAY_SIZE(onchip_ops));
 	*ops = onchip_ops[type];
 }
 
-- 
cgit v0.10.2


From 37869fa2da1a952ed736858f8b9e45c0b9131ae9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 20 May 2009 14:30:06 +0000
Subject: sh: ap325 camera without i2c driver fix

This patch fixes the ap325rxa ncm03j camera code to handle
the case where no i2c driver is present. Without this fix
i2c_transfer() may be passed NULL as adapter which results
in a crash.

Triggered when i2c-sh_mobile.c failed to probe() due to
missing MSTP clocks.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c
index 39e4691..f2a2964 100644
--- a/arch/sh/boards/board-ap325rxa.c
+++ b/arch/sh/boards/board-ap325rxa.c
@@ -263,6 +263,9 @@ static int camera_probe(void)
 	struct i2c_msg msg;
 	int ret;
 
+	if (!a)
+		return -ENODEV;
+
 	camera_power(1);
 	msg.addr = 0x6e;
 	msg.buf = camera_ncm03j_magic;
-- 
cgit v0.10.2


From 8e9bb19ef97d6594e735bee64b6d72103e350854 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 20 May 2009 14:34:43 +0000
Subject: video: stop sh_mobile_lcdcfb only if started

This patch fixes the LCDC driver to avoid calling the
function sh_mobile_lcdc_start_stop(priv, 0) unless the
same function has been called before to start the LCDC
hardware.

Triggered when sh_mobile_lcdcfb.c failed to probe() due to
missing MSTP clocks.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c
index 92ea0ab..f10d2fb 100644
--- a/drivers/video/sh_mobile_lcdcfb.c
+++ b/drivers/video/sh_mobile_lcdcfb.c
@@ -47,6 +47,7 @@ struct sh_mobile_lcdc_priv {
 #endif
 	unsigned long lddckr;
 	struct sh_mobile_lcdc_chan ch[2];
+	int started;
 };
 
 /* shared registers */
@@ -451,6 +452,7 @@ static int sh_mobile_lcdc_start(struct sh_mobile_lcdc_priv *priv)
 
 	/* start the lcdc */
 	sh_mobile_lcdc_start_stop(priv, 1);
+	priv->started = 1;
 
 	/* tell the board code to enable the panel */
 	for (k = 0; k < ARRAY_SIZE(priv->ch); k++) {
@@ -493,7 +495,10 @@ static void sh_mobile_lcdc_stop(struct sh_mobile_lcdc_priv *priv)
 	}
 
 	/* stop the lcdc */
-	sh_mobile_lcdc_start_stop(priv, 0);
+	if (priv->started) {
+		sh_mobile_lcdc_start_stop(priv, 0);
+		priv->started = 0;
+	}
 
 	/* stop clocks */
 	for (k = 0; k < ARRAY_SIZE(priv->ch); k++)
-- 
cgit v0.10.2


From 2f3ed17e010e8c0873094016f93c1afbb4adb666 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 13:47:52 +0900
Subject: sh: Wrap irq_to_desc_alloc_cpu() around CONFIG_SPARSE_IRQ
 temporarily.

irq_to_desc_alloc_cpu() has been renamed to irq_to_desc_alloc_node() in
-next, but as we can not presently enable SPARSE_IRQ without the early
irq_desc alloc patch, protect it with an ifdef until the interface has
settled and we are ready to enable it system-wide.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index fa0c846..6ad40db 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -59,17 +59,21 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
+#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
+#endif
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
+#ifdef CONFIG_SPARSE_IRQ
 		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n",
 			       p->irq);
 			continue;
 		}
+#endif
 
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index 098b767..caf0656 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -771,16 +771,19 @@ void __init register_intc_controller(struct intc_desc *desc)
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
 		unsigned int irq = evt2irq(vect->vect);
+#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-
+#endif
 		if (!vect->enum_id)
 			continue;
 
+#ifdef CONFIG_SPARSE_IRQ
 		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 			continue;
 		}
+#endif
 
 		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
-- 
cgit v0.10.2


From 62fad39be0662a924b60e4354b802525ceda0bb1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 13:50:18 +0900
Subject: sh: Add a NR_IRQS_LEGACY for external IRQ0-7.

This adds a NR_IRQS_LEGACY definition, which will be used by sparse irq.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h
index d319baa..a2b8c99 100644
--- a/arch/sh/include/asm/irq.h
+++ b/arch/sh/include/asm/irq.h
@@ -8,7 +8,8 @@
  * advised to cap this at the hard limit that they're interested in
  * through the machvec.
  */
-#define NR_IRQS 256
+#define NR_IRQS			256
+#define NR_IRQS_LEGACY		8	/* Legacy external IRQ0-7 */
 
 /*
  * Convert back and forth between INTEVT and IRQ values.
-- 
cgit v0.10.2


From 36aa1e32f451b664adaf3fc9a77d8279b7a833b2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 14:00:34 +0900
Subject: sh: clkfwk: Make clock-cpg usable for non-legacy platforms.

This adds a new SH_CLK_CPG for parts that have CPG support.
SH_CLK_CPG_LEGACY is made to depend on this, and still needs to be set
for platforms that want clock-cpg to register the legacy clocks. With
this new config item in place, it is now possible to start layering more
generic CPG code in place while other platforms transition off of the
legacy clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index df764c5..c815975 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -513,7 +513,11 @@ config SH_PCLK_FREQ
 	  This is necessary for determining the reference clock value on
 	  platforms lacking an RTC.
 
+config SH_CLK_CPG
+	def_bool y
+
 config SH_CLK_CPG_LEGACY
+	depends on SH_CLK_CPG
 	def_bool y if !CPU_SUBTYPE_SH7785
 
 config SH_CLK_MD
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index 05105b9..eecad7c 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -17,6 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE)	+= shmobile/
 
 obj-$(CONFIG_UBC_WAKEUP)	+= ubc.o
 obj-$(CONFIG_SH_ADC)		+= adc.o
-obj-$(CONFIG_SH_CLK_CPG_LEGACY)	+= clock-cpg.o
+obj-$(CONFIG_SH_CLK_CPG)	+= clock-cpg.o
 
 obj-y	+= irq/ init.o clock.o
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index b4ca200..b78c237a 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -2,6 +2,7 @@
 #include <linux/compiler.h>
 #include <asm/clock.h>
 
+#ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
 	.flags		= CLK_ENABLE_ON_INIT,
@@ -58,3 +59,4 @@ int __init __weak arch_clk_init(void)
 {
 	return cpg_clk_init();
 }
+#endif /* CONFIG_SH_CPG_CLK_LEGACY */
-- 
cgit v0.10.2


From 8fc40238b4ebf07cd11ca9707843338be22af72f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 14:21:03 +0900
Subject: sh: Prefer slab_is_available() over after_bootmem.

This kills off after_bootmem and switches to using slab_is_available()
instead. Presently the only place this is used is by the sh64 ioremap,
and there's not much point in keeping the reference around otherwise.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 3edf297..ee8e6bb 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -184,7 +184,6 @@ void __init paging_init(void)
 }
 
 static struct kcore_list kcore_mem, kcore_vmalloc;
-int after_bootmem = 0;
 
 void __init mem_init(void)
 {
@@ -217,8 +216,6 @@ void __init mem_init(void)
 	memset(empty_zero_page, 0, PAGE_SIZE);
 	__flush_wback_region(empty_zero_page, PAGE_SIZE);
 
-	after_bootmem = 1;
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c
index 2331229..828c859 100644
--- a/arch/sh/mm/ioremap_64.c
+++ b/arch/sh/mm/ioremap_64.c
@@ -20,6 +20,7 @@
 #include <linux/io.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/slab.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/addrspace.h>
@@ -175,7 +176,7 @@ static __init_refok void *sh64_get_page(void)
 {
 	void *page;
 
-	if (after_bootmem)
+	if (slab_is_available())
 		page = (void *)get_zeroed_page(GFP_KERNEL);
 	else
 		page = alloc_bootmem_pages(PAGE_SIZE);
-- 
cgit v0.10.2


From 6eb0ac03899a1363ba176abe0830a9e6698c0503 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Thu, 21 May 2009 19:10:23 +0000
Subject: powerpc/maple: Add a quirk to disable MSI for IPR on Bimini

Something in the HW or FW setup is busted and MSIs aren't working with
IPR on Bimini, so until we figure out exaxtly what's up, we quirk them
out

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c
index 3018552..04296ff 100644
--- a/arch/powerpc/platforms/maple/pci.c
+++ b/arch/powerpc/platforms/maple/pci.c
@@ -592,3 +592,17 @@ int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel)
 	}
 	return irq;
 }
+
+static void __devinit quirk_ipr_msi(struct pci_dev *dev)
+{
+	/* Something prevents MSIs from the IPR from working on Bimini,
+	 * and the driver has no smarts to recover. So disable MSI
+	 * on it for now. */
+
+	if (machine_is(maple)) {
+		dev->no_msi = 1;
+		dev_info(&dev->dev, "Quirk disabled MSI\n");
+	}
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_OBSIDIAN,
+			quirk_ipr_msi);
-- 
cgit v0.10.2


From 87488957a68293357a94c8142de7d0ae17914912 Mon Sep 17 00:00:00 2001
From: Adam Williamson <awilliam@redhat.com>
Date: Thu, 21 May 2009 18:32:59 -0400
Subject: ALSA: hda - fix audio on HP TX25xx series notebooks

Fixes https://bugtrack.alsa-project.org/alsa-bug/view.php?id=4121

Taken from https://bugzilla.redhat.com/show_bug.cgi?id=498060

Signed-off-by: Adam Williamson <awilliam@redhat.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index b8a0d3e..bcbb736 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -12058,6 +12058,7 @@ static struct snd_pci_quirk alc268_cfg_tbl[] = {
 	SND_PCI_QUIRK(0x1028, 0x0253, "Dell OEM", ALC268_DELL),
 	SND_PCI_QUIRK(0x1028, 0x02b0, "Dell Inspiron Mini9", ALC268_DELL),
 	SND_PCI_QUIRK(0x103c, 0x30cc, "TOSHIBA", ALC268_TOSHIBA),
+	SND_PCI_QUIRK(0x103c, 0x30f1, "HP TX25xx series", ALC268_TOSHIBA),
 	SND_PCI_QUIRK(0x1043, 0x1205, "ASUS W7J", ALC268_3ST),
 	SND_PCI_QUIRK(0x1179, 0xff10, "TOSHIBA A205", ALC268_TOSHIBA),
 	SND_PCI_QUIRK(0x1179, 0xff50, "TOSHIBA A305", ALC268_TOSHIBA),
-- 
cgit v0.10.2


From 88dff4936c0a5fa53080cca68dc963a8a2a674b0 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Fri, 22 May 2009 11:35:50 +0800
Subject: x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot

x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot,
see:

  http://bugzilla.kernel.org/show_bug.cgi?id=12901

[ Impact: fix hung reboot on certain systems ]

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <1242963350.32574.53.camel@rzhang-dt>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1340dad..667188e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -232,6 +232,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
 		},
 	},
+	{	/* Handle problems with rebooting on Sony VGN-Z540N */
+		.callback = set_bios_reboot,
+		.ident = "Sony VGN-Z540N",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v0.10.2


From e069c0cf7c169ae5a8bfdc8d083a5d66fbef73d8 Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Fri, 8 May 2009 15:51:44 -0700
Subject: wimax/i2400m: usb: fix device reset on autosuspend while not yet idle

When the i2400m is connected to a network, the host interface (USB)
cannot be suspended. For that to happen, the device has to have
negotiated with the basestation to put the link on IDLE state.

If the host tries to put the device in standby while it is connected
but not idle, the device resets, as the driver should not do that.

To avoid triggering that, when the USB susbsytem requires the driver
to autosuspend the device, the driver checks if the device is not yet
idle. If it is not, the request is rejected (will be retried again
later on after the autosuspend timeout). At some point the device will
enter idle and the request will succeed (unless of course, there is
network traffic, but at that point, there is no idle neither in the
link or the host interface).

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>

diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index ca4151a..1785132 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -505,27 +505,52 @@ int i2400mu_suspend(struct usb_interface *iface, pm_message_t pm_msg)
 #ifdef CONFIG_PM
 	struct usb_device *usb_dev = i2400mu->usb_dev;
 #endif
+	unsigned is_autosuspend = 0;
 	struct i2400m *i2400m = &i2400mu->i2400m;
 
+#ifdef CONFIG_PM
+	if (usb_dev->auto_pm > 0)
+		is_autosuspend = 1;
+#endif
+
 	d_fnstart(3, dev, "(iface %p pm_msg %u)\n", iface, pm_msg.event);
 	if (i2400m->updown == 0)
 		goto no_firmware;
-	d_printf(1, dev, "fw up, requesting standby\n");
+	if (i2400m->state == I2400M_SS_DATA_PATH_CONNECTED && is_autosuspend) {
+		/* ugh -- the device is connected and this suspend
+		 * request is an autosuspend one (not a system standby
+		 * / hibernate).
+		 *
+		 * The only way the device can go to standby is if the
+		 * link with the base station is in IDLE mode; that
+		 * were the case, we'd be in status
+		 * I2400M_SS_CONNECTED_IDLE. But we are not.
+		 *
+		 * If we *tell* him to go power save now, it'll reset
+		 * as a precautionary measure, so if this is an
+		 * autosuspend thing, say no and it'll come back
+		 * later, when the link is IDLE
+		 */
+		result = -EBADF;
+		d_printf(1, dev, "fw up, link up, not-idle, autosuspend: "
+			 "not entering powersave\n");
+		goto error_not_now;
+	}
+	d_printf(1, dev, "fw up: entering powersave\n");
 	atomic_dec(&i2400mu->do_autopm);
 	result = i2400m_cmd_enter_powersave(i2400m);
 	atomic_inc(&i2400mu->do_autopm);
-#ifdef CONFIG_PM
-	if (result < 0 && usb_dev->auto_pm == 0) {
+	if (result < 0 && !is_autosuspend) {
 		/* System suspend, can't fail */
 		dev_err(dev, "failed to suspend, will reset on resume\n");
 		result = 0;
 	}
-#endif
 	if (result < 0)
 		goto error_enter_powersave;
 	i2400mu_notification_release(i2400mu);
-	d_printf(1, dev, "fw up, got standby\n");
+	d_printf(1, dev, "powersave requested\n");
 error_enter_powersave:
+error_not_now:
 no_firmware:
 	d_fnend(3, dev, "(iface %p pm_msg %u) = %d\n",
 		iface, pm_msg.event, result);
-- 
cgit v0.10.2


From b9ed7252d219c1c663944bf03846eabb515dbe75 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Fri, 22 May 2009 09:25:32 +0200
Subject: xen-blkfront: beyond ARRAY_SIZE of info->shadow

Do not go beyond ARRAY_SIZE of info->shadow
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6d4ac76..6d59508 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(blkif_io_lock);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
-	BUG_ON(free > BLK_RING_SIZE);
+	BUG_ON(free >= BLK_RING_SIZE);
 	info->shadow_free = info->shadow[free].req.id;
 	info->shadow[free].req.id = 0x0fffffee; /* debug */
 	return free;
-- 
cgit v0.10.2


From 0899d6349c60e4021224b51c8c97f49b829dfefd Mon Sep 17 00:00:00 2001
From: Li Yang <leoli@freescale.com>
Date: Fri, 22 May 2009 16:39:59 +0800
Subject: fsldma: update mailling list address in MAINTAINERS

linuxppc-embedded has been merged into linuxppc-dev.

Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index 2b349ba..cac3e3b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2241,7 +2241,7 @@ P:	Li Yang
 M:	leoli@freescale.com
 P:	Zhang Wei
 M:	zw@zh-kernel.org
-L:	linuxppc-embedded@ozlabs.org
+L:	linuxppc-dev@ozlabs.org
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	drivers/dma/fsldma.*
-- 
cgit v0.10.2


From f47edc6dab11801c2e97088ba7bbce042ded867c Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Fri, 22 May 2009 16:46:52 +0800
Subject: fsldma: fix check on potential fdev->chan[] overflow

Fix the check of potential array overflow when using corrupted channel
device tree nodes.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index da8a8ed..391b1bd 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -830,7 +830,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
 			new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1);
 
 	new_fsl_chan->id = ((new_fsl_chan->reg.start - 0x100) & 0xfff) >> 7;
-	if (new_fsl_chan->id > FSL_DMA_MAX_CHANS_PER_DEVICE) {
+	if (new_fsl_chan->id >= FSL_DMA_MAX_CHANS_PER_DEVICE) {
 		dev_err(fdev->dev, "There is no %d channel!\n",
 				new_fsl_chan->id);
 		err = -EINVAL;
-- 
cgit v0.10.2


From 138ef0185177a6d221d24b6aa8f12d867fbbef90 Mon Sep 17 00:00:00 2001
From: Ira Snyder <iws@ovro.caltech.edu>
Date: Tue, 19 May 2009 15:42:13 -0700
Subject: fsldma: fix "DMA halt timeout!" errors

When using the DMA controller from multiple threads at the same time, it is
possible to get lots of "DMA halt timeout!" errors printed to the kernel
log.

This occurs due to a race between fsl_dma_memcpy_issue_pending() and the
interrupt handler, fsl_dma_chan_do_interrupt(). Both call the
fsl_chan_xfer_ld_queue() function, which does not protect against
concurrent accesses to dma_halt() and dma_start().

The existing spinlock is moved to cover the dma_halt() and dma_start()
functions. Testing shows that the "DMA halt timeout!" errors disappear.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 391b1bd..a4151c3 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -598,15 +598,16 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan)
 	dma_addr_t next_dest_addr;
 	unsigned long flags;
 
+	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
+
 	if (!dma_is_idle(fsl_chan))
-		return;
+		goto out_unlock;
 
 	dma_halt(fsl_chan);
 
 	/* If there are some link descriptors
 	 * not transfered in queue. We need to start it.
 	 */
-	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
 
 	/* Find the first un-transfer desciptor */
 	for (ld_node = fsl_chan->ld_queue.next;
@@ -617,8 +618,6 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan)
 				fsl_chan->common.cookie) == DMA_SUCCESS);
 		ld_node = ld_node->next);
 
-	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
-
 	if (ld_node != &fsl_chan->ld_queue) {
 		/* Get the ld start address from ld_queue */
 		next_dest_addr = to_fsl_desc(ld_node)->async_tx.phys;
@@ -630,6 +629,9 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan)
 		set_cdar(fsl_chan, 0);
 		set_ndar(fsl_chan, 0);
 	}
+
+out_unlock:
+	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
 }
 
 /**
-- 
cgit v0.10.2


From bcfb7465c03a8c62c89da374677df56f6b894d44 Mon Sep 17 00:00:00 2001
From: Ira Snyder <iws@ovro.caltech.edu>
Date: Fri, 15 May 2009 14:27:16 -0700
Subject: fsldma: fix infinite loop on multi-descriptor DMA chain completion

When creating a DMA transaction with multiple descriptors, the async_tx
cookie is set to 0 for each descriptor in the chain, excluding the last
descriptor, whose cookie is set to -EBUSY.

When fsl_dma_tx_submit() is run, it only assigns a cookie to the first
descriptor. All of the remaining descriptors keep their original value,
including the last descriptor, which is set to -EBUSY.

After the DMA completes, the driver will update the last completed cookie
to be -EBUSY, which is an error code instead of a valid cookie. This causes
dma_async_is_complete() to always return DMA_IN_PROGRESS.

This causes the fsldma driver to never cleanup the queue of link
descriptors, and the driver will re-run the DMA transaction on the hardware
each time it receives the End-of-Chain interrupt. This causes an infinite
loop.

With this patch, fsl_dma_tx_submit() is changed to assign a cookie to every
descriptor in the chain. The rest of the code then works without problems.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index a4151c3..7313a1a 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -313,8 +313,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable)
 
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
-	struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
 	struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan);
+	struct fsl_desc_sw *desc;
 	unsigned long flags;
 	dma_cookie_t cookie;
 
@@ -322,14 +322,17 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
 
 	cookie = fsl_chan->common.cookie;
-	cookie++;
-	if (cookie < 0)
-		cookie = 1;
-	desc->async_tx.cookie = cookie;
-	fsl_chan->common.cookie = desc->async_tx.cookie;
-
-	append_ld_queue(fsl_chan, desc);
-	list_splice_init(&desc->async_tx.tx_list, fsl_chan->ld_queue.prev);
+	list_for_each_entry(desc, &tx->tx_list, node) {
+		cookie++;
+		if (cookie < 0)
+			cookie = 1;
+
+		desc->async_tx.cookie = cookie;
+	}
+
+	fsl_chan->common.cookie = cookie;
+	append_ld_queue(fsl_chan, tx_to_fsl_desc(tx));
+	list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev);
 
 	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
 
-- 
cgit v0.10.2


From 776c8943f2766f2819fafd88fdfbaf418ecd6e41 Mon Sep 17 00:00:00 2001
From: Ira Snyder <iws@ovro.caltech.edu>
Date: Fri, 15 May 2009 11:33:20 -0700
Subject: fsldma: snooping is not enabled for last entry in descriptor chain

On the 83xx controller, snooping is necessary for the DMA controller to
ensure cache coherence with the CPU when transferring to/from RAM.

The last descriptor in a chain will always have the End-of-Chain interrupt
bit set, so we can set the snoop bit while adding the End-of-Chain
interrupt bit.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 7313a1a..ff9194d 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -179,9 +179,14 @@ static void dma_halt(struct fsl_dma_chan *fsl_chan)
 static void set_ld_eol(struct fsl_dma_chan *fsl_chan,
 			struct fsl_desc_sw *desc)
 {
+	u64 snoop_bits;
+
+	snoop_bits = ((fsl_chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_83XX)
+		? FSL_DMA_SNEN : 0;
+
 	desc->hw.next_ln_addr = CPU_TO_DMA(fsl_chan,
-		DMA_TO_CPU(fsl_chan, desc->hw.next_ln_addr, 64)	| FSL_DMA_EOL,
-		64);
+		DMA_TO_CPU(fsl_chan, desc->hw.next_ln_addr, 64) | FSL_DMA_EOL
+			| snoop_bits, 64);
 }
 
 static void append_ld_queue(struct fsl_dma_chan *fsl_chan,
-- 
cgit v0.10.2


From 2e077f8e8337e52eef3c39c24c31e103b11a0326 Mon Sep 17 00:00:00 2001
From: Ira Snyder <iws@ovro.caltech.edu>
Date: Fri, 15 May 2009 09:59:46 -0700
Subject: fsldma: fix memory leak on error path in fsl_dma_prep_memcpy()

When preparing a memcpy operation, if the kernel fails to allocate memory
for a link descriptor after the first link descriptor has already been
allocated, then some memory will never be released. Fix the problem by
walking the list of allocated descriptors backwards, and freeing the
allocated descriptors back into the DMA pool.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ff9194d..1578310 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -462,8 +462,8 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
 {
 	struct fsl_dma_chan *fsl_chan;
 	struct fsl_desc_sw *first = NULL, *prev = NULL, *new;
+	struct list_head *list;
 	size_t copy;
-	LIST_HEAD(link_chain);
 
 	if (!chan)
 		return NULL;
@@ -480,7 +480,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
 		if (!new) {
 			dev_err(fsl_chan->dev,
 					"No free memory for link descriptor\n");
-			return NULL;
+			goto fail;
 		}
 #ifdef FSL_DMA_LD_DEBUG
 		dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new);
@@ -515,7 +515,19 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
 	/* Set End-of-link to the last link descriptor of new list*/
 	set_ld_eol(fsl_chan, new);
 
-	return first ? &first->async_tx : NULL;
+	return &first->async_tx;
+
+fail:
+	if (!first)
+		return NULL;
+
+	list = &first->async_tx.tx_list;
+	list_for_each_entry_safe_reverse(new, prev, list, node) {
+		list_del(&new->node);
+		dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
+	}
+
+	return NULL;
 }
 
 /**
-- 
cgit v0.10.2


From b1e71b0622974953e46a284aa986504a90869a9b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:01:55 +0100
Subject: GFS2: Clean up some file names

This patch renames the ops_*.c files which have no counterpart
without the ops_ prefix in order to shorten the name and make
it more readable. In addition, ops_address.h (which was very
small) is moved into inode.h and inode.h is cleaned up by
adding extern where required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4..4f7332c 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
-	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+	mount.o aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
new file mode 100644
index 0000000..03ebb43
--- /dev/null
+++ b/fs/gfs2/aops.c
@@ -0,0 +1,1145 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/backing-dev.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "trans.h"
+#include "rgrp.h"
+#include "super.h"
+#include "util.h"
+#include "glops.h"
+
+
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to)
+{
+	struct buffer_head *head = page_buffers(page);
+	unsigned int bsize = head->b_size;
+	struct buffer_head *bh;
+	unsigned int start, end;
+
+	for (bh = head, start = 0; bh != head || !start;
+	     bh = bh->b_this_page, start = end) {
+		end = start + bsize;
+		if (end <= from || start >= to)
+			continue;
+		if (gfs2_is_jdata(ip))
+			set_buffer_uptodate(bh);
+		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+	}
+}
+
+/**
+ * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int error;
+
+	error = gfs2_block_map(inode, lblock, bh_result, 0);
+	if (error)
+		return error;
+	if (!buffer_mapped(bh_result))
+		return -EIO;
+	return 0;
+}
+
+static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
+				 struct buffer_head *bh_result, int create)
+{
+	return gfs2_block_map(inode, lblock, bh_result, 0);
+}
+
+/**
+ * gfs2_writepage_common - Common bits of writepage
+ * @page: The page to be written
+ * @wbc: The writeback control
+ *
+ * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
+ */
+
+static int gfs2_writepage_common(struct page *page,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+		goto out;
+	if (current->journal_info)
+		goto redirty;
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index > end_index || (page->index == end_index && !offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0);
+		goto out;
+	}
+	return 1;
+redirty:
+	redirty_page_for_writepage(wbc, page);
+out:
+	unlock_page(page);
+	return 0;
+}
+
+/**
+ * gfs2_writeback_writepage - Write page for writeback mappings
+ * @page: The page
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_writeback_writepage(struct page *page,
+				    struct writeback_control *wbc)
+{
+	int ret;
+
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret <= 0)
+		return ret;
+
+	ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
+	if (ret == -EAGAIN)
+		ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	return ret;
+}
+
+/**
+ * gfs2_ordered_writepage - Write page for ordered data files
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_ordered_writepage(struct page *page,
+				  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int ret;
+
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret <= 0)
+		return ret;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				     (1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
+	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * __gfs2_jdata_writepage - The core of jdata writepage
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is shared between writepage and writepages and implements the
+ * core of the writepage operation. If a transaction is required then
+ * PageChecked will have been set and the transaction will have
+ * already been started before this is called.
+ */
+
+static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, inode->i_sb->s_blocksize,
+					     (1 << BH_Dirty)|(1 << BH_Uptodate));
+		}
+		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+	}
+	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * gfs2_jdata_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ */
+
+static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int ret;
+	int done_trans = 0;
+
+	if (PageChecked(page)) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out_ignore;
+		ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+		if (ret)
+			goto out_ignore;
+		done_trans = 1;
+	}
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret > 0)
+		ret = __gfs2_jdata_writepage(page, wbc);
+	if (done_trans)
+		gfs2_trans_end(sdp);
+	return ret;
+
+out_ignore:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+/**
+ * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: Write-back control
+ *
+ * For the data=writeback case we can already ignore buffer heads
+ * and write whole extents at once. This is a big reduction in the
+ * number of I/O requests we send and the bmap calls we make in this case.
+ */
+static int gfs2_writeback_writepages(struct address_space *mapping,
+				     struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+}
+
+/**
+ * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
+ * @mapping: The mapping
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call for each page
+ * @pvec: The vector of pages
+ * @nr_pages: The number of pages to write
+ *
+ * Returns: non-zero if loop should terminate, zero otherwise
+ */
+
+static int gfs2_write_jdata_pagevec(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct pagevec *pvec,
+				    int nr_pages, pgoff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
+	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int i;
+	int ret;
+
+	ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
+	if (ret < 0)
+		return ret;
+
+	for(i = 0; i < nr_pages; i++) {
+		struct page *page = pvec->pages[i];
+
+		lock_page(page);
+
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			continue;
+		}
+
+		if (!wbc->range_cyclic && page->index > end) {
+			ret = 1;
+			unlock_page(page);
+			continue;
+		}
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+
+		if (PageWriteback(page) ||
+		    !clear_page_dirty_for_io(page)) {
+			unlock_page(page);
+			continue;
+		}
+
+		/* Is the page fully outside i_size? (truncate in progress) */
+		if (page->index > end_index || (page->index == end_index && !offset)) {
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+			continue;
+		}
+
+		ret = __gfs2_jdata_writepage(page, wbc);
+
+		if (ret || (--(wbc->nr_to_write) <= 0))
+			ret = 1;
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
+			ret = 1;
+		}
+
+	}
+	gfs2_trans_end(sdp);
+	return ret;
+}
+
+/**
+ * gfs2_write_cache_jdata - Like write_cache_pages but different
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call
+ * @data: The data to pass to writepage
+ *
+ * The reason that we use our own function here is that we need to
+ * start transactions before we grab page locks. This allows us
+ * to get the ordering right.
+ */
+
+static int gfs2_write_cache_jdata(struct address_space *mapping,
+				  struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;
+	int scanned = 0;
+	int range_whole = 0;
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+
+retry:
+	 while (!done && (index <= end) &&
+		(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					       PAGECACHE_TAG_DIRTY,
+					       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		scanned = 1;
+		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+		if (ret)
+			done = 1;
+		if (ret > 0)
+			ret = 0;
+
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+
+
+/**
+ * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * 
+ */
+
+static int gfs2_jdata_writepages(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	int ret;
+
+	ret = gfs2_write_cache_jdata(mapping, wbc);
+	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
+		gfs2_log_flush(sdp, ip->i_gl);
+		ret = gfs2_write_cache_jdata(mapping, wbc);
+	}
+	return ret;
+}
+
+/**
+ * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * @ip: the inode
+ * @page: the page
+ *
+ * Returns: errno
+ */
+
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+{
+	struct buffer_head *dibh;
+	void *kaddr;
+	int error;
+
+	/*
+	 * Due to the order of unstuffing files and ->fault(), we can be
+	 * asked for a zero page in the case of a stuffed file being extended,
+	 * so we need to supply one here. It doesn't happen often.
+	 */
+	if (unlikely(page->index)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+		return 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+	       ip->i_disksize);
+	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+	kunmap_atomic(kaddr, KM_USER0);
+	flush_dcache_page(page);
+	brelse(dibh);
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+
+/**
+ * __gfs2_readpage - readpage
+ * @file: The file to read a page for
+ * @page: The page to read
+ *
+ * This is the core of gfs2's readpage. Its used by the internal file
+ * reading code as in that case we already hold the glock. Also its
+ * called by gfs2_readpage() once the required lock has been granted.
+ *
+ */
+
+static int __gfs2_readpage(void *file, struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	int error;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = stuffed_readpage(ip, page);
+		unlock_page(page);
+	} else {
+		error = mpage_readpage(page, gfs2_block_map);
+	}
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	return error;
+}
+
+/**
+ * gfs2_readpage - read a page of a file
+ * @file: The file to read
+ * @page: The page of the file
+ *
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_holder gh;
+	int error;
+
+	unlock_page(page);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
+	if (unlikely(error))
+		goto out;
+	error = AOP_TRUNCATED_PAGE;
+	lock_page(page);
+	if (page->mapping == mapping && !PageUptodate(page))
+		error = __gfs2_readpage(file, page);
+	else
+		unlock_page(page);
+	gfs2_glock_dq(&gh);
+out:
+	gfs2_holder_uninit(&gh);
+	if (error && error != AOP_TRUNCATED_PAGE)
+		lock_page(page);
+	return error;
+}
+
+/**
+ * gfs2_internal_read - read an internal file
+ * @ip: The gfs2 inode
+ * @ra_state: The readahead state (or NULL for no readahead)
+ * @buf: The buffer to fill
+ * @pos: The file position
+ * @size: The amount to read
+ *
+ */
+
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+                       char *buf, loff_t *pos, unsigned size)
+{
+	struct address_space *mapping = ip->i_inode.i_mapping;
+	unsigned long index = *pos / PAGE_CACHE_SIZE;
+	unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+	unsigned copied = 0;
+	unsigned amt;
+	struct page *page;
+	void *p;
+
+	do {
+		amt = size - copied;
+		if (offset + size > PAGE_CACHE_SIZE)
+			amt = PAGE_CACHE_SIZE - offset;
+		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		p = kmap_atomic(page, KM_USER0);
+		memcpy(buf + copied, p + offset, amt);
+		kunmap_atomic(p, KM_USER0);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		copied += amt;
+		index++;
+		offset = 0;
+	} while(copied < size);
+	(*pos) += size;
+	return size;
+}
+
+/**
+ * gfs2_readpages - Read a bunch of pages at once
+ *
+ * Some notes:
+ * 1. This is only for readahead, so we can simply ignore any things
+ *    which are slightly inconvenient (such as locking conflicts between
+ *    the page lock and the glock) and return having done no I/O. Its
+ *    obviously not something we'd want to do on too regular a basis.
+ *    Any I/O we ignore at this time will be done via readpage later.
+ * 2. We don't handle stuffed files here we let readpage do the honours.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
+ */
+
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (unlikely(ret))
+		goto out_uninit;
+	if (!gfs2_is_stuffed(ip))
+		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret = -EIO;
+	return ret;
+}
+
+/**
+ * gfs2_write_begin - Begin to write to a file
+ * @file: The file to write to
+ * @mapping: The mapping in which to write
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused by GFS2)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	int alloc_required;
+	int error = 0;
+	struct gfs2_alloc *al;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	struct page *page;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
+	if (error)
+		goto out_unlock;
+
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+
+	if (alloc_required) {
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+
+	error = gfs2_trans_begin(sdp, rblocks,
+				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+	if (error)
+		goto out_trans_fail;
+
+	error = -ENOMEM;
+	flags |= AOP_FLAG_NOFS;
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	*pagep = page;
+	if (unlikely(!page))
+		goto out_endtrans;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = 0;
+		if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+			error = gfs2_unstuff_dinode(ip, page);
+			if (error == 0)
+				goto prepare_write;
+		} else if (!PageUptodate(page)) {
+			error = stuffed_readpage(ip, page);
+		}
+		goto out;
+	}
+
+prepare_write:
+	error = block_prepare_write(page, from, to, gfs2_block_map);
+out:
+	if (error == 0)
+		return 0;
+
+	page_cache_release(page);
+	if (pos + len > ip->i_inode.i_size)
+		vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+out_endtrans:
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
+out_qunlock:
+		gfs2_quota_unlock(ip);
+out_alloc_put:
+		gfs2_alloc_put(ip);
+	}
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+/**
+ * adjust_fs_space - Adjusts the free space available due to gfs2_grow
+ * @inode: the rindex inode
+ */
+static void adjust_fs_space(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	u64 fs_total, new_free;
+
+	/* Total up the file system space, according to the latest rindex. */
+	fs_total = gfs2_ri_total(sdp);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
+		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
+	else
+		new_free = 0;
+	spin_unlock(&sdp->sd_statfs_spin);
+	fs_warn(sdp, "File system extended by %llu blocks.\n",
+		(unsigned long long)new_free);
+	gfs2_statfs_change(sdp, new_free, new_free, 0);
+}
+
+/**
+ * gfs2_stuffed_write_end - Write end for stuffed files
+ * @inode: The inode
+ * @dibh: The buffer_head containing the on-disk inode
+ * @pos: The file position
+ * @len: The length of the write
+ * @copied: How much was actually copied by the VFS
+ * @page: The page
+ *
+ * This copies the data from the page into the inode block after
+ * the inode data structure itself.
+ *
+ * Returns: errno
+ */
+static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u64 to = pos + copied;
+	void *kaddr;
+	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
+	struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+
+	BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
+	kaddr = kmap_atomic(page, KM_USER0);
+	memcpy(buf + pos, kaddr + pos, copied);
+	memset(kaddr + pos + copied, 0, len - copied);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (copied) {
+		if (inode->i_size < to) {
+			i_size_write(inode, to);
+			ip->i_disksize = inode->i_size;
+		}
+		gfs2_dinode_out(ip, di);
+		mark_inode_dirty(inode);
+	}
+
+	if (inode == sdp->sd_rindex)
+		adjust_fs_space(inode);
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return copied;
+}
+
+/**
+ * gfs2_write_end
+ * @file: The file to write to
+ * @mapping: The address space to write to
+ * @pos: The file position
+ * @len: The length of the data
+ * @copied:
+ * @page: The page that has been written
+ * @fsdata: The fsdata (unused in GFS2)
+ *
+ * The main write_end function for GFS2. We have a separate one for
+ * stuffed files as they are slightly different, otherwise we just
+ * put our locking around the VFS provided functions.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *dibh;
+	struct gfs2_alloc *al = ip->i_alloc;
+	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int to = from + len;
+	int ret;
+
+	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
+
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(ret)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto failed;
+	}
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip))
+		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (ret > 0) {
+		if (inode->i_size > ip->i_disksize)
+			ip->i_disksize = inode->i_size;
+		gfs2_dinode_out(ip, dibh->b_data);
+		mark_inode_dirty(inode);
+	}
+
+	if (inode == sdp->sd_rindex)
+		adjust_fs_space(inode);
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+failed:
+	if (al) {
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	gfs2_glock_dq(&ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return ret;
+}
+
+/**
+ * gfs2_set_page_dirty - Page dirtying function
+ * @page: The page to dirty
+ *
+ * Returns: 1 if it dirtyed the page, or 0 otherwise
+ */
+ 
+static int gfs2_set_page_dirty(struct page *page)
+{
+	SetPageChecked(page);
+	return __set_page_dirty_buffers(page);
+}
+
+/**
+ * gfs2_bmap - Block map function
+ * @mapping: Address space info
+ * @lblock: The block to map
+ *
+ * Returns: The disk address for the block or 0 on hole or error
+ */
+
+static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_holder i_gh;
+	sector_t dblock = 0;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return 0;
+
+	if (!gfs2_is_stuffed(ip))
+		dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return dblock;
+}
+
+static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	clear_buffer_dirty(bh);
+	bd = bh->b_private;
+	if (bd) {
+		if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
+			list_del_init(&bd->bd_le.le_list);
+		else
+			gfs2_remove_from_journal(bh, current->journal_info, 0);
+	}
+	bh->b_bdev = NULL;
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	struct buffer_head *bh, *head;
+	unsigned long pos = 0;
+
+	BUG_ON(!PageLocked(page));
+	if (offset == 0)
+		ClearPageChecked(page);
+	if (!page_has_buffers(page))
+		goto out;
+
+	bh = head = page_buffers(page);
+	do {
+		if (offset <= pos)
+			gfs2_discard(sdp, bh);
+		pos += bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+out:
+	if (offset == 0)
+		try_to_release_page(page, 0);
+}
+
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+	/*
+	 * Should we return an error here? I can't see that O_DIRECT for
+	 * a stuffed file makes any sense. For now we'll silently fall
+	 * back to buffered I/O
+	 */
+	if (gfs2_is_stuffed(ip))
+		return 0;
+
+	if (offset >= i_size_read(&ip->i_inode))
+		return 0;
+	return 1;
+}
+
+
+
+static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int rv;
+
+	/*
+	 * Deferred lock, even if its a write, since we do no allocation
+	 * on this path. All we need change is atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately have the option of only flushing a range like
+	 * the VFS does.
+	 */
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	rv = gfs2_glock_nq(&gh);
+	if (rv)
+		return rv;
+	rv = gfs2_ok_for_dio(ip, rw, offset);
+	if (rv != 1)
+		goto out; /* dio not valid, fall back to buffered i/o */
+
+	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
+					   iov, offset, nr_segs,
+					   gfs2_get_block_direct, NULL);
+out:
+	gfs2_glock_dq_m(1, &gh);
+	gfs2_holder_uninit(&gh);
+	return rv;
+}
+
+/**
+ * gfs2_releasepage - free the metadata associated with a page
+ * @page: the page that's being released
+ * @gfp_mask: passed from Linux VFS, ignored by us
+ *
+ * Call try_to_free_buffers() if the buffers in this page can be
+ * released.
+ *
+ * Returns: 0
+ */
+
+int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct inode *aspace = page->mapping->host;
+	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+	struct buffer_head *bh, *head;
+	struct gfs2_bufdata *bd;
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	gfs2_log_lock(sdp);
+	head = bh = page_buffers(page);
+	do {
+		if (atomic_read(&bh->b_count))
+			goto cannot_release;
+		bd = bh->b_private;
+		if (bd && bd->bd_ail)
+			goto cannot_release;
+		gfs2_assert_warn(sdp, !buffer_pinned(bh));
+		gfs2_assert_warn(sdp, !buffer_dirty(bh));
+		bh = bh->b_this_page;
+	} while(bh != head);
+	gfs2_log_unlock(sdp);
+
+	head = bh = page_buffers(page);
+	do {
+		gfs2_log_lock(sdp);
+		bd = bh->b_private;
+		if (bd) {
+			gfs2_assert_warn(sdp, bd->bd_bh == bh);
+			gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
+			if (!list_empty(&bd->bd_le.le_list)) {
+				if (!buffer_pinned(bh))
+					list_del_init(&bd->bd_le.le_list);
+				else
+					bd = NULL;
+			}
+			if (bd)
+				bd->bd_bh = NULL;
+			bh->b_private = NULL;
+		}
+		gfs2_log_unlock(sdp);
+		if (bd)
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return try_to_free_buffers(page);
+cannot_release:
+	gfs2_log_unlock(sdp);
+	return 0;
+}
+
+static const struct address_space_operations gfs2_writeback_aops = {
+	.writepage = gfs2_writeback_writepage,
+	.writepages = gfs2_writeback_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+};
+
+static const struct address_space_operations gfs2_ordered_aops = {
+	.writepage = gfs2_ordered_writepage,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.set_page_dirty = gfs2_set_page_dirty,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
+};
+
+static const struct address_space_operations gfs2_jdata_aops = {
+	.writepage = gfs2_jdata_writepage,
+	.writepages = gfs2_jdata_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.set_page_dirty = gfs2_set_page_dirty,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.is_partially_uptodate = block_is_partially_uptodate,
+};
+
+void gfs2_set_aops(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (gfs2_is_writeback(ip))
+		inode->i_mapping->a_ops = &gfs2_writeback_aops;
+	else if (gfs2_is_ordered(ip))
+		inode->i_mapping->a_ops = &gfs2_ordered_aops;
+	else if (gfs2_is_jdata(ip))
+		inode->i_mapping->a_ops = &gfs2_jdata_aops;
+	else
+		BUG();
+}
+
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 253e1a3..1153a07 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,6 @@
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
-#include "ops_address.h"
 
 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  * block is 512, so __u16 is fine for that. It saves stack space to
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
new file mode 100644
index 0000000..022c66c
--- /dev/null
+++ b/fs/gfs2/dentry.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "super.h"
+#include "util.h"
+#include "inode.h"
+
+/**
+ * gfs2_drevalidate - Check directory lookup consistency
+ * @dentry: the mapping to check
+ * @nd:
+ *
+ * Check to make sure the lookup necessary to arrive at this inode from its
+ * parent is still good.
+ *
+ * Returns: 1 if the dentry is ok, 0 if it isn't
+ */
+
+static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+	struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_holder d_gh;
+	struct gfs2_inode *ip = NULL;
+	int error;
+	int had_lock = 0;
+
+	if (inode) {
+		if (is_bad_inode(inode))
+			goto invalid;
+		ip = GFS2_I(inode);
+	}
+
+	if (sdp->sd_args.ar_localcaching)
+		goto valid;
+
+	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
+	if (!had_lock) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+		if (error)
+			goto fail;
+	} 
+
+	error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip);
+	switch (error) {
+	case 0:
+		if (!inode)
+			goto invalid_gunlock;
+		break;
+	case -ENOENT:
+		if (!inode)
+			goto valid_gunlock;
+		goto invalid_gunlock;
+	default:
+		goto fail_gunlock;
+	}
+
+valid_gunlock:
+	if (!had_lock)
+		gfs2_glock_dq_uninit(&d_gh);
+valid:
+	dput(parent);
+	return 1;
+
+invalid_gunlock:
+	if (!had_lock)
+		gfs2_glock_dq_uninit(&d_gh);
+invalid:
+	if (inode && S_ISDIR(inode->i_mode)) {
+		if (have_submounts(dentry))
+			goto valid;
+		shrink_dcache_parent(dentry);
+	}
+	d_drop(dentry);
+	dput(parent);
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&d_gh);
+fail:
+	dput(parent);
+	return 0;
+}
+
+static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+{
+	str->hash = gfs2_disk_hash(str->name, str->len);
+	return 0;
+}
+
+const struct dentry_operations gfs2_dops = {
+	.d_revalidate = gfs2_drevalidate,
+	.d_hash = gfs2_dhash,
+};
+
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
new file mode 100644
index 0000000..9200ef2
--- /dev/null
+++ b/fs/gfs2/export.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "super.h"
+#include "rgrp.h"
+#include "util.h"
+
+#define GFS2_SMALL_FH_SIZE 4
+#define GFS2_LARGE_FH_SIZE 8
+#define GFS2_OLD_FH_SIZE 10
+
+static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
+			  int connectable)
+{
+	__be32 *fh = (__force __be32 *)p;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (*len < GFS2_SMALL_FH_SIZE ||
+	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+		return 255;
+
+	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
+	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
+	fh[2] = cpu_to_be32(ip->i_no_addr >> 32);
+	fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
+	*len = GFS2_SMALL_FH_SIZE;
+
+	if (!connectable || inode == sb->s_root->d_inode)
+		return *len;
+
+	spin_lock(&dentry->d_lock);
+	inode = dentry->d_parent->d_inode;
+	ip = GFS2_I(inode);
+	igrab(inode);
+	spin_unlock(&dentry->d_lock);
+
+	fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
+	fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
+	fh[6] = cpu_to_be32(ip->i_no_addr >> 32);
+	fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
+	*len = GFS2_LARGE_FH_SIZE;
+
+	iput(inode);
+
+	return *len;
+}
+
+struct get_name_filldir {
+	struct gfs2_inum_host inum;
+	char *name;
+};
+
+static int get_name_filldir(void *opaque, const char *name, int length,
+			    loff_t offset, u64 inum, unsigned int type)
+{
+	struct get_name_filldir *gnfd = opaque;
+
+	if (inum != gnfd->inum.no_addr)
+		return 0;
+
+	memcpy(gnfd->name, name, length);
+	gnfd->name[length] = 0;
+
+	return 1;
+}
+
+static int gfs2_get_name(struct dentry *parent, char *name,
+			 struct dentry *child)
+{
+	struct inode *dir = parent->d_inode;
+	struct inode *inode = child->d_inode;
+	struct gfs2_inode *dip, *ip;
+	struct get_name_filldir gnfd;
+	struct gfs2_holder gh;
+	u64 offset = 0;
+	int error;
+
+	if (!dir)
+		return -EINVAL;
+
+	if (!S_ISDIR(dir->i_mode) || !inode)
+		return -EINVAL;
+
+	dip = GFS2_I(dir);
+	ip = GFS2_I(inode);
+
+	*name = 0;
+	gnfd.inum.no_addr = ip->i_no_addr;
+	gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
+	gnfd.name = name;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+
+	gfs2_glock_dq_uninit(&gh);
+
+	if (!error && !*name)
+		error = -ENOENT;
+
+	return error;
+}
+
+static struct dentry *gfs2_get_parent(struct dentry *child)
+{
+	struct qstr dotdot;
+	struct dentry *dentry;
+
+	/*
+	 * XXX(hch): it would be a good idea to keep this around as a
+	 *	     static variable.
+	 */
+	gfs2_str2qstr(&dotdot, "..");
+
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
+	return dentry;
+}
+
+static struct dentry *gfs2_get_dentry(struct super_block *sb,
+		struct gfs2_inum_host *inum)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_holder i_gh, ri_gh, rgd_gh;
+	struct gfs2_rgrpd *rgd;
+	struct inode *inode;
+	struct dentry *dentry;
+	int error;
+
+	/* System files? */
+
+	inode = gfs2_ilookup(sb, inum->no_addr);
+	if (inode) {
+		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
+			iput(inode);
+			return ERR_PTR(-ESTALE);
+		}
+		goto out_inode;
+	}
+
+	error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+				  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return ERR_PTR(error);
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto fail;
+
+	error = -EINVAL;
+	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
+	if (!rgd)
+		goto fail_rindex;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail_rindex;
+
+	error = -ESTALE;
+	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
+		goto fail_rgd;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+	gfs2_glock_dq_uninit(&ri_gh);
+
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
+					inum->no_addr,
+					0, 0);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto fail;
+	}
+
+	error = gfs2_inode_refresh(GFS2_I(inode));
+	if (error) {
+		iput(inode);
+		goto fail;
+	}
+
+	/* Pick up the works we bypass in gfs2_inode_lookup */
+	if (inode->i_state & I_NEW) 
+		gfs2_set_iop(inode);
+
+	if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
+		iput(inode);
+		goto fail;
+	}
+
+	error = -EIO;
+	if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
+		iput(inode);
+		goto fail;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+out_inode:
+	dentry = d_obtain_alias(inode);
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
+	return dentry;
+
+fail_rgd:
+	gfs2_glock_dq_uninit(&rgd_gh);
+
+fail_rindex:
+	gfs2_glock_dq_uninit(&ri_gh);
+
+fail:
+	gfs2_glock_dq_uninit(&i_gh);
+	return ERR_PTR(error);
+}
+
+static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	struct gfs2_inum_host this;
+	__be32 *fh = (__force __be32 *)fid->raw;
+
+	switch (fh_type) {
+	case GFS2_SMALL_FH_SIZE:
+	case GFS2_LARGE_FH_SIZE:
+	case GFS2_OLD_FH_SIZE:
+		this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
+		this.no_formal_ino |= be32_to_cpu(fh[1]);
+		this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
+		this.no_addr |= be32_to_cpu(fh[3]);
+		return gfs2_get_dentry(sb, &this);
+	default:
+		return NULL;
+	}
+}
+
+static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	struct gfs2_inum_host parent;
+	__be32 *fh = (__force __be32 *)fid->raw;
+
+	switch (fh_type) {
+	case GFS2_LARGE_FH_SIZE:
+	case GFS2_OLD_FH_SIZE:
+		parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
+		parent.no_formal_ino |= be32_to_cpu(fh[5]);
+		parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
+		parent.no_addr |= be32_to_cpu(fh[7]);
+		return gfs2_get_dentry(sb, &parent);
+	default:
+		return NULL;
+	}
+}
+
+const struct export_operations gfs2_export_ops = {
+	.encode_fh = gfs2_encode_fh,
+	.fh_to_dentry = gfs2_fh_to_dentry,
+	.fh_to_parent = gfs2_fh_to_parent,
+	.get_name = gfs2_get_name,
+	.get_parent = gfs2_get_parent,
+};
+
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
new file mode 100644
index 0000000..73b6f55
--- /dev/null
+++ b/fs/gfs2/file.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/ext2_fs.h>
+#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "eaops.h"
+
+/**
+ * gfs2_llseek - seek to a location in a file
+ * @file: the file
+ * @offset: the offset
+ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ *
+ * SEEK_END requires the glock for the file because it references the
+ * file's size.
+ *
+ * Returns: The new offset, or errno
+ */
+
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_holder i_gh;
+	loff_t error;
+
+	if (origin == 2) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (!error) {
+			error = generic_file_llseek_unlocked(file, offset, origin);
+			gfs2_glock_dq_uninit(&i_gh);
+		}
+	} else
+		error = generic_file_llseek_unlocked(file, offset, origin);
+
+	return error;
+}
+
+/**
+ * gfs2_readdir - Read directory entries from a directory
+ * @file: The directory to read from
+ * @dirent: Buffer for dirents
+ * @filldir: Function used to do the copying
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	struct inode *dir = file->f_mapping->host;
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_holder d_gh;
+	u64 offset = file->f_pos;
+	int error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	error = gfs2_glock_nq(&d_gh);
+	if (error) {
+		gfs2_holder_uninit(&d_gh);
+		return error;
+	}
+
+	error = gfs2_dir_read(dir, &offset, dirent, filldir);
+
+	gfs2_glock_dq_uninit(&d_gh);
+
+	file->f_pos = offset;
+
+	return error;
+}
+
+/**
+ * fsflags_cvt
+ * @table: A table of 32 u32 flags
+ * @val: a 32 bit value to convert
+ *
+ * This function can be used to convert between fsflags values and
+ * GFS2's own flags values.
+ *
+ * Returns: the converted flags
+ */
+static u32 fsflags_cvt(const u32 *table, u32 val)
+{
+	u32 res = 0;
+	while(val) {
+		if (val & 1)
+			res |= *table;
+		table++;
+		val >>= 1;
+	}
+	return res;
+}
+
+static const u32 fsflags_to_gfs2[32] = {
+	[3] = GFS2_DIF_SYNC,
+	[4] = GFS2_DIF_IMMUTABLE,
+	[5] = GFS2_DIF_APPENDONLY,
+	[7] = GFS2_DIF_NOATIME,
+	[12] = GFS2_DIF_EXHASH,
+	[14] = GFS2_DIF_INHERIT_JDATA,
+};
+
+static const u32 gfs2_to_fsflags[32] = {
+	[gfs2fl_Sync] = FS_SYNC_FL,
+	[gfs2fl_Immutable] = FS_IMMUTABLE_FL,
+	[gfs2fl_AppendOnly] = FS_APPEND_FL,
+	[gfs2fl_NoAtime] = FS_NOATIME_FL,
+	[gfs2fl_ExHash] = FS_INDEX_FL,
+	[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+};
+
+static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+	u32 fsflags;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
+	if (error)
+		return error;
+
+	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
+	if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
+		fsflags |= FS_JOURNAL_DATA_FL;
+	if (put_user(fsflags, ptr))
+		error = -EFAULT;
+
+	gfs2_glock_dq(&gh);
+	gfs2_holder_uninit(&gh);
+	return error;
+}
+
+void gfs2_set_inode_flags(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int flags = inode->i_flags;
+
+	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
+		flags |= S_IMMUTABLE;
+	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
+		flags |= S_APPEND;
+	if (ip->i_diskflags & GFS2_DIF_NOATIME)
+		flags |= S_NOATIME;
+	if (ip->i_diskflags & GFS2_DIF_SYNC)
+		flags |= S_SYNC;
+	inode->i_flags = flags;
+}
+
+/* Flags that can be set by user space */
+#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|			\
+			     GFS2_DIF_IMMUTABLE|		\
+			     GFS2_DIF_APPENDONLY|		\
+			     GFS2_DIF_NOATIME|			\
+			     GFS2_DIF_SYNC|			\
+			     GFS2_DIF_SYSTEM|			\
+			     GFS2_DIF_INHERIT_JDATA)
+
+/**
+ * gfs2_set_flags - set flags on an inode
+ * @inode: The inode
+ * @flags: The flags to set
+ * @mask: Indicates which flags are valid
+ *
+ */
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder gh;
+	int error;
+	u32 new_flags, flags;
+
+	error = mnt_want_write(filp->f_path.mnt);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		goto out_drop_write;
+
+	flags = ip->i_diskflags;
+	new_flags = (flags & ~mask) | (reqflags & mask);
+	if ((new_flags ^ flags) == 0)
+		goto out;
+
+	error = -EINVAL;
+	if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
+		goto out;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
+		goto out;
+	if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
+		goto out;
+	if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto out;
+	if (!IS_IMMUTABLE(inode)) {
+		error = gfs2_permission(inode, MAY_WRITE);
+		if (error)
+			goto out;
+	}
+	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
+		if (flags & GFS2_DIF_JDATA)
+			gfs2_log_flush(sdp, ip->i_gl);
+		error = filemap_fdatawrite(inode->i_mapping);
+		if (error)
+			goto out;
+		error = filemap_fdatawait(inode->i_mapping);
+		if (error)
+			goto out;
+	}
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		goto out;
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_trans_end;
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	ip->i_diskflags = new_flags;
+	gfs2_dinode_out(ip, bh->b_data);
+	brelse(bh);
+	gfs2_set_inode_flags(inode);
+	gfs2_set_aops(inode);
+out_trans_end:
+	gfs2_trans_end(sdp);
+out:
+	gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+	mnt_drop_write(filp->f_path.mnt);
+	return error;
+}
+
+static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	u32 fsflags, gfsflags;
+	if (get_user(fsflags, ptr))
+		return -EFAULT;
+	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
+	if (!S_ISDIR(inode->i_mode)) {
+		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
+			gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
+		return do_gfs2_set_flags(filp, gfsflags, ~0);
+	}
+	return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+}
+
+static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch(cmd) {
+	case FS_IOC_GETFLAGS:
+		return gfs2_get_flags(filp, (u32 __user *)arg);
+	case FS_IOC_SETFLAGS:
+		return gfs2_set_flags(filp, (u32 __user *)arg);
+	}
+	return -ENOTTY;
+}
+
+/**
+ * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * @page: The (locked) page to allocate backing for
+ *
+ * We try to allocate all the blocks required for the page in
+ * one go. This might fail for various reasons, so we keep
+ * trying until all the blocks to back this page are allocated.
+ * If some of the blocks are already allocated, thats ok too.
+ */
+
+static int gfs2_allocate_page_backing(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head bh;
+	unsigned long size = PAGE_CACHE_SIZE;
+	u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	do {
+		bh.b_state = 0;
+		bh.b_size = size;
+		gfs2_block_map(inode, lblock, &bh, 1);
+		if (!buffer_mapped(&bh))
+			return -EIO;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> inode->i_blkbits);
+	} while(size > 0);
+	return 0;
+}
+
+/**
+ * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
+ * @vma: The virtual memory area
+ * @page: The page which is about to become writable
+ *
+ * When the page becomes writable, we need to ensure that we have
+ * blocks allocated on disk to back that page.
+ */
+
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned long last_index;
+	u64 pos = page->index << PAGE_CACHE_SHIFT;
+	unsigned int data_blocks, ind_blocks, rblocks;
+	int alloc_required = 0;
+	struct gfs2_holder gh;
+	struct gfs2_alloc *al;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out;
+
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+	set_bit(GIF_SW_PAGED, &ip->i_flags);
+
+	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
+	if (ret || !alloc_required)
+		goto out_unlock;
+	ret = -ENOMEM;
+	al = gfs2_alloc_get(ip);
+	if (al == NULL)
+		goto out_unlock;
+
+	ret = gfs2_quota_lock_check(ip);
+	if (ret)
+		goto out_alloc_put;
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+	al->al_requested = data_blocks + ind_blocks;
+	ret = gfs2_inplace_reserve(ip);
+	if (ret)
+		goto out_quota_unlock;
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+	ret = gfs2_trans_begin(sdp, rblocks, 0);
+	if (ret)
+		goto out_trans_fail;
+
+	lock_page(page);
+	ret = -EINVAL;
+	last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
+	if (page->index > last_index)
+		goto out_unlock_page;
+	ret = 0;
+	if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
+		goto out_unlock_page;
+	if (gfs2_is_stuffed(ip)) {
+		ret = gfs2_unstuff_dinode(ip, page);
+		if (ret)
+			goto out_unlock_page;
+	}
+	ret = gfs2_allocate_page_backing(page);
+
+out_unlock_page:
+	unlock_page(page);
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_quota_unlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&gh);
+out:
+	gfs2_holder_uninit(&gh);
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
+		ret = VM_FAULT_SIGBUS;
+	return ret;
+}
+
+static struct vm_operations_struct gfs2_vm_ops = {
+	.fault = filemap_fault,
+	.page_mkwrite = gfs2_page_mkwrite,
+};
+
+/**
+ * gfs2_mmap -
+ * @file: The file to map
+ * @vma: The VMA which described the mapping
+ *
+ * There is no need to get a lock here unless we should be updating
+ * atime. We ignore any locking errors since the only consequence is
+ * a missed atime update (which will just be deferred until later).
+ *
+ * Returns: 0
+ */
+
+static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+
+	if (!(file->f_flags & O_NOATIME)) {
+		struct gfs2_holder i_gh;
+		int error;
+
+		gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+		error = gfs2_glock_nq(&i_gh);
+		file_accessed(file);
+		if (error == 0)
+			gfs2_glock_dq_uninit(&i_gh);
+	}
+	vma->vm_ops = &gfs2_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+
+	return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	struct gfs2_file *fp;
+	int error;
+
+	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+	if (!fp)
+		return -ENOMEM;
+
+	mutex_init(&fp->f_fl_mutex);
+
+	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+	file->private_data = fp;
+
+	if (S_ISREG(ip->i_inode.i_mode)) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (error)
+			goto fail;
+
+		if (!(file->f_flags & O_LARGEFILE) &&
+		    ip->i_disksize > MAX_NON_LFS) {
+			error = -EOVERFLOW;
+			goto fail_gunlock;
+		}
+
+		gfs2_glock_dq_uninit(&i_gh);
+	}
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+fail:
+	file->private_data = NULL;
+	kfree(fp);
+	return error;
+}
+
+/**
+ * gfs2_close - called to close a struct file
+ * @inode: the inode the struct file belongs to
+ * @file: the struct file being closed
+ *
+ * Returns: errno
+ */
+
+static int gfs2_close(struct inode *inode, struct file *file)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_file *fp;
+
+	fp = file->private_data;
+	file->private_data = NULL;
+
+	if (gfs2_assert_warn(sdp, fp))
+		return -EIO;
+
+	kfree(fp);
+
+	return 0;
+}
+
+/**
+ * gfs2_fsync - sync the dirty data for a file (across the cluster)
+ * @file: the file that points to the dentry (we ignore this)
+ * @dentry: the dentry that points to the inode to sync
+ *
+ * The VFS will flush "normal" data for us. We only need to worry
+ * about metadata here. For journaled data, we just do a log flush
+ * as we can't avoid it. Otherwise we can just bale out if datasync
+ * is set. For stuffed inodes we must flush the log in order to
+ * ensure that all data is on disk.
+ *
+ * The call to write_inode_now() is there to write back metadata and
+ * the inode itself. It does also try and write the data, but thats
+ * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
+ * for us.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+	int ret = 0;
+
+	if (gfs2_is_jdata(GFS2_I(inode))) {
+		gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+		return 0;
+	}
+
+	if (sync_state != 0) {
+		if (!datasync)
+			ret = write_inode_now(inode, 0);
+
+		if (gfs2_is_stuffed(GFS2_I(inode)))
+			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+
+/**
+ * gfs2_setlease - acquire/release a file lease
+ * @file: the file pointer
+ * @arg: lease type
+ * @fl: file lock
+ *
+ * We don't currently have a way to enforce a lease across the whole
+ * cluster; until we do, disable leases (by just returning -EINVAL),
+ * unless the administrator has requested purely local locking.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
+{
+	return -EINVAL;
+}
+
+/**
+ * gfs2_lock - acquire/release a posix lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(&ip->i_inode))
+		return -ENOLCK;
+
+	if (cmd == F_CANCELLK) {
+		/* Hack: */
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
+	else
+		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+}
+
+static int do_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+	struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
+	struct gfs2_glock *gl;
+	unsigned int state;
+	int flags;
+	int error = 0;
+
+	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+
+	mutex_lock(&fp->f_fl_mutex);
+
+	gl = fl_gh->gh_gl;
+	if (gl) {
+		if (fl_gh->gh_state == state)
+			goto out;
+		flock_lock_file_wait(file,
+				     &(struct file_lock){.fl_type = F_UNLCK});
+		gfs2_glock_dq_wait(fl_gh);
+		gfs2_holder_reinit(state, flags, fl_gh);
+	} else {
+		error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+				       &gfs2_flock_glops, CREATE, &gl);
+		if (error)
+			goto out;
+		gfs2_holder_init(gl, state, flags, fl_gh);
+		gfs2_glock_put(gl);
+	}
+	error = gfs2_glock_nq(fl_gh);
+	if (error) {
+		gfs2_holder_uninit(fl_gh);
+		if (error == GLR_TRYFAILED)
+			error = -EAGAIN;
+	} else {
+		error = flock_lock_file_wait(file, fl);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+	}
+
+out:
+	mutex_unlock(&fp->f_fl_mutex);
+	return error;
+}
+
+static void do_unflock(struct file *file, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+
+	mutex_lock(&fp->f_fl_mutex);
+	flock_lock_file_wait(file, fl);
+	if (fl_gh->gh_gl)
+		gfs2_glock_dq_uninit(fl_gh);
+	mutex_unlock(&fp->f_fl_mutex);
+}
+
+/**
+ * gfs2_flock - acquire/release a flock lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	if (__mandatory_lock(&ip->i_inode))
+		return -ENOLCK;
+
+	if (fl->fl_type == F_UNLCK) {
+		do_unflock(file, fl);
+		return 0;
+	} else {
+		return do_flock(file, cmd, fl);
+	}
+}
+
+const struct file_operations gfs2_file_fops = {
+	.llseek		= gfs2_llseek,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.flock		= gfs2_flock,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+	.setlease	= gfs2_setlease,
+};
+
+const struct file_operations gfs2_dir_fops = {
+	.readdir	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.flock		= gfs2_flock,
+};
+
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+
+const struct file_operations gfs2_file_fops_nolock = {
+	.llseek		= gfs2_llseek,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+	.setlease	= generic_setlease,
+};
+
+const struct file_operations gfs2_dir_fops_nolock = {
+	.readdir	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+};
+
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d42..c03a1a3 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
 #include "inode.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_address.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b..2c3ec07 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
 #define __INODE_DOT_H__
 
 #include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
 #include "util.h"
 
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+			      struct file_ra_state *ra_state,
+			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
+
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
 	return !ip->i_height;
@@ -73,30 +81,31 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 }
 
 
-void gfs2_set_iop(struct inode *inode);
-struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-				u64 no_addr, u64 no_formal_ino,
-				int skip_freeing);
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
-
-int gfs2_inode_refresh(struct gfs2_inode *ip);
-
-int gfs2_dinode_dealloc(struct gfs2_inode *inode);
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
-struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
-			   int is_root);
-struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
-			   unsigned int mode, dev_t dev);
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		struct gfs2_inode *ip);
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-		   const struct gfs2_inode *ip);
-int gfs2_permission(struct inode *inode, int mask);
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-void gfs2_dinode_print(const struct gfs2_inode *ip);
+extern void gfs2_set_iop(struct inode *inode);
+extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
+				       u64 no_addr, u64 no_formal_ino,
+				       int skip_freeing);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
+
+extern int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+				  int is_root);
+extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
+				  const struct qstr *name,
+				  unsigned int mode, dev_t dev);
+extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip);
+extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip);
+extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern void gfs2_dinode_print(const struct gfs2_inode *ip);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78a5f43..cb8d7a9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,7 +31,6 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-#include "ops_address.h"
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
deleted file mode 100644
index e566421..0000000
--- a/fs/gfs2/ops_address.c
+++ /dev/null
@@ -1,1146 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/mpage.h>
-#include <linux/fs.h>
-#include <linux/writeback.h>
-#include <linux/swap.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/backing-dev.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "bmap.h"
-#include "glock.h"
-#include "inode.h"
-#include "log.h"
-#include "meta_io.h"
-#include "ops_address.h"
-#include "quota.h"
-#include "trans.h"
-#include "rgrp.h"
-#include "super.h"
-#include "util.h"
-#include "glops.h"
-
-
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int to)
-{
-	struct buffer_head *head = page_buffers(page);
-	unsigned int bsize = head->b_size;
-	struct buffer_head *bh;
-	unsigned int start, end;
-
-	for (bh = head, start = 0; bh != head || !start;
-	     bh = bh->b_this_page, start = end) {
-		end = start + bsize;
-		if (end <= from || start >= to)
-			continue;
-		if (gfs2_is_jdata(ip))
-			set_buffer_uptodate(bh);
-		gfs2_trans_add_bh(ip->i_gl, bh, 0);
-	}
-}
-
-/**
- * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
- * @inode: The inode
- * @lblock: The block number to look up
- * @bh_result: The buffer head to return the result in
- * @create: Non-zero if we may add block to the file
- *
- * Returns: errno
- */
-
-static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
-				  struct buffer_head *bh_result, int create)
-{
-	int error;
-
-	error = gfs2_block_map(inode, lblock, bh_result, 0);
-	if (error)
-		return error;
-	if (!buffer_mapped(bh_result))
-		return -EIO;
-	return 0;
-}
-
-static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
-				 struct buffer_head *bh_result, int create)
-{
-	return gfs2_block_map(inode, lblock, bh_result, 0);
-}
-
-/**
- * gfs2_writepage_common - Common bits of writepage
- * @page: The page to be written
- * @wbc: The writeback control
- *
- * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
- */
-
-static int gfs2_writepage_common(struct page *page,
-				 struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset;
-
-	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
-		goto out;
-	if (current->journal_info)
-		goto redirty;
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
-		goto out;
-	}
-	return 1;
-redirty:
-	redirty_page_for_writepage(wbc, page);
-out:
-	unlock_page(page);
-	return 0;
-}
-
-/**
- * gfs2_writeback_writepage - Write page for writeback mappings
- * @page: The page
- * @wbc: The writeback control
- *
- */
-
-static int gfs2_writeback_writepage(struct page *page,
-				    struct writeback_control *wbc)
-{
-	int ret;
-
-	ret = gfs2_writepage_common(page, wbc);
-	if (ret <= 0)
-		return ret;
-
-	ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
-	if (ret == -EAGAIN)
-		ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-	return ret;
-}
-
-/**
- * gfs2_ordered_writepage - Write page for ordered data files
- * @page: The page to write
- * @wbc: The writeback control
- *
- */
-
-static int gfs2_ordered_writepage(struct page *page,
-				  struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	int ret;
-
-	ret = gfs2_writepage_common(page, wbc);
-	if (ret <= 0)
-		return ret;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				     (1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
-	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-}
-
-/**
- * __gfs2_jdata_writepage - The core of jdata writepage
- * @page: The page to write
- * @wbc: The writeback control
- *
- * This is shared between writepage and writepages and implements the
- * core of the writepage operation. If a transaction is required then
- * PageChecked will have been set and the transaction will have
- * already been started before this is called.
- */
-
-static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-
-	if (PageChecked(page)) {
-		ClearPageChecked(page);
-		if (!page_has_buffers(page)) {
-			create_empty_buffers(page, inode->i_sb->s_blocksize,
-					     (1 << BH_Dirty)|(1 << BH_Uptodate));
-		}
-		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
-	}
-	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-}
-
-/**
- * gfs2_jdata_writepage - Write complete page
- * @page: Page to write
- *
- * Returns: errno
- *
- */
-
-static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int ret;
-	int done_trans = 0;
-
-	if (PageChecked(page)) {
-		if (wbc->sync_mode != WB_SYNC_ALL)
-			goto out_ignore;
-		ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
-		if (ret)
-			goto out_ignore;
-		done_trans = 1;
-	}
-	ret = gfs2_writepage_common(page, wbc);
-	if (ret > 0)
-		ret = __gfs2_jdata_writepage(page, wbc);
-	if (done_trans)
-		gfs2_trans_end(sdp);
-	return ret;
-
-out_ignore:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-}
-
-/**
- * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
- * @mapping: The mapping to write
- * @wbc: Write-back control
- *
- * For the data=writeback case we can already ignore buffer heads
- * and write whole extents at once. This is a big reduction in the
- * number of I/O requests we send and the bmap calls we make in this case.
- */
-static int gfs2_writeback_writepages(struct address_space *mapping,
-				     struct writeback_control *wbc)
-{
-	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
-}
-
-/**
- * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
- * @mapping: The mapping
- * @wbc: The writeback control
- * @writepage: The writepage function to call for each page
- * @pvec: The vector of pages
- * @nr_pages: The number of pages to write
- *
- * Returns: non-zero if loop should terminate, zero otherwise
- */
-
-static int gfs2_write_jdata_pagevec(struct address_space *mapping,
-				    struct writeback_control *wbc,
-				    struct pagevec *pvec,
-				    int nr_pages, pgoff_t end)
-{
-	struct inode *inode = mapping->host;
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
-	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int i;
-	int ret;
-
-	ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
-	if (ret < 0)
-		return ret;
-
-	for(i = 0; i < nr_pages; i++) {
-		struct page *page = pvec->pages[i];
-
-		lock_page(page);
-
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			continue;
-		}
-
-		if (!wbc->range_cyclic && page->index > end) {
-			ret = 1;
-			unlock_page(page);
-			continue;
-		}
-
-		if (wbc->sync_mode != WB_SYNC_NONE)
-			wait_on_page_writeback(page);
-
-		if (PageWriteback(page) ||
-		    !clear_page_dirty_for_io(page)) {
-			unlock_page(page);
-			continue;
-		}
-
-		/* Is the page fully outside i_size? (truncate in progress) */
-		if (page->index > end_index || (page->index == end_index && !offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0);
-			unlock_page(page);
-			continue;
-		}
-
-		ret = __gfs2_jdata_writepage(page, wbc);
-
-		if (ret || (--(wbc->nr_to_write) <= 0))
-			ret = 1;
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			ret = 1;
-		}
-
-	}
-	gfs2_trans_end(sdp);
-	return ret;
-}
-
-/**
- * gfs2_write_cache_jdata - Like write_cache_pages but different
- * @mapping: The mapping to write
- * @wbc: The writeback control
- * @writepage: The writepage function to call
- * @data: The data to pass to writepage
- *
- * The reason that we use our own function here is that we need to
- * start transactions before we grab page locks. This allows us
- * to get the ordering right.
- */
-
-static int gfs2_write_cache_jdata(struct address_space *mapping,
-				  struct writeback_control *wbc)
-{
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int ret = 0;
-	int done = 0;
-	struct pagevec pvec;
-	int nr_pages;
-	pgoff_t index;
-	pgoff_t end;
-	int scanned = 0;
-	int range_whole = 0;
-
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
-	pagevec_init(&pvec, 0);
-	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
-	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		scanned = 1;
-	}
-
-retry:
-	 while (!done && (index <= end) &&
-		(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					       PAGECACHE_TAG_DIRTY,
-					       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		scanned = 1;
-		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
-		if (ret)
-			done = 1;
-		if (ret > 0)
-			ret = 0;
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	if (!scanned && !done) {
-		/*
-		 * We hit the last page and there is more work to be done: wrap
-		 * back to the start of the file
-		 */
-		scanned = 1;
-		index = 0;
-		goto retry;
-	}
-
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
-	return ret;
-}
-
-
-/**
- * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
- * @mapping: The mapping to write
- * @wbc: The writeback control
- * 
- */
-
-static int gfs2_jdata_writepages(struct address_space *mapping,
-				 struct writeback_control *wbc)
-{
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-	int ret;
-
-	ret = gfs2_write_cache_jdata(mapping, wbc);
-	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
-		gfs2_log_flush(sdp, ip->i_gl);
-		ret = gfs2_write_cache_jdata(mapping, wbc);
-	}
-	return ret;
-}
-
-/**
- * stuffed_readpage - Fill in a Linux page with stuffed file data
- * @ip: the inode
- * @page: the page
- *
- * Returns: errno
- */
-
-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
-{
-	struct buffer_head *dibh;
-	void *kaddr;
-	int error;
-
-	/*
-	 * Due to the order of unstuffing files and ->fault(), we can be
-	 * asked for a zero page in the case of a stuffed file being extended,
-	 * so we need to supply one here. It doesn't happen often.
-	 */
-	if (unlikely(page->index)) {
-		zero_user(page, 0, PAGE_CACHE_SIZE);
-		SetPageUptodate(page);
-		return 0;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		return error;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_disksize);
-	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
-	kunmap_atomic(kaddr, KM_USER0);
-	flush_dcache_page(page);
-	brelse(dibh);
-	SetPageUptodate(page);
-
-	return 0;
-}
-
-
-/**
- * __gfs2_readpage - readpage
- * @file: The file to read a page for
- * @page: The page to read
- *
- * This is the core of gfs2's readpage. Its used by the internal file
- * reading code as in that case we already hold the glock. Also its
- * called by gfs2_readpage() once the required lock has been granted.
- *
- */
-
-static int __gfs2_readpage(void *file, struct page *page)
-{
-	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-	int error;
-
-	if (gfs2_is_stuffed(ip)) {
-		error = stuffed_readpage(ip, page);
-		unlock_page(page);
-	} else {
-		error = mpage_readpage(page, gfs2_block_map);
-	}
-
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		return -EIO;
-
-	return error;
-}
-
-/**
- * gfs2_readpage - read a page of a file
- * @file: The file to read
- * @page: The page of the file
- *
- * This deals with the locking required. We have to unlock and
- * relock the page in order to get the locking in the right
- * order.
- */
-
-static int gfs2_readpage(struct file *file, struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-	struct gfs2_holder gh;
-	int error;
-
-	unlock_page(page);
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-	error = gfs2_glock_nq(&gh);
-	if (unlikely(error))
-		goto out;
-	error = AOP_TRUNCATED_PAGE;
-	lock_page(page);
-	if (page->mapping == mapping && !PageUptodate(page))
-		error = __gfs2_readpage(file, page);
-	else
-		unlock_page(page);
-	gfs2_glock_dq(&gh);
-out:
-	gfs2_holder_uninit(&gh);
-	if (error && error != AOP_TRUNCATED_PAGE)
-		lock_page(page);
-	return error;
-}
-
-/**
- * gfs2_internal_read - read an internal file
- * @ip: The gfs2 inode
- * @ra_state: The readahead state (or NULL for no readahead)
- * @buf: The buffer to fill
- * @pos: The file position
- * @size: The amount to read
- *
- */
-
-int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
-                       char *buf, loff_t *pos, unsigned size)
-{
-	struct address_space *mapping = ip->i_inode.i_mapping;
-	unsigned long index = *pos / PAGE_CACHE_SIZE;
-	unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
-	unsigned copied = 0;
-	unsigned amt;
-	struct page *page;
-	void *p;
-
-	do {
-		amt = size - copied;
-		if (offset + size > PAGE_CACHE_SIZE)
-			amt = PAGE_CACHE_SIZE - offset;
-		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
-		if (IS_ERR(page))
-			return PTR_ERR(page);
-		p = kmap_atomic(page, KM_USER0);
-		memcpy(buf + copied, p + offset, amt);
-		kunmap_atomic(p, KM_USER0);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		copied += amt;
-		index++;
-		offset = 0;
-	} while(copied < size);
-	(*pos) += size;
-	return size;
-}
-
-/**
- * gfs2_readpages - Read a bunch of pages at once
- *
- * Some notes:
- * 1. This is only for readahead, so we can simply ignore any things
- *    which are slightly inconvenient (such as locking conflicts between
- *    the page lock and the glock) and return having done no I/O. Its
- *    obviously not something we'd want to do on too regular a basis.
- *    Any I/O we ignore at this time will be done via readpage later.
- * 2. We don't handle stuffed files here we let readpage do the honours.
- * 3. mpage_readpages() does most of the heavy lifting in the common case.
- * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
- */
-
-static int gfs2_readpages(struct file *file, struct address_space *mapping,
-			  struct list_head *pages, unsigned nr_pages)
-{
-	struct inode *inode = mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_holder gh;
-	int ret;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-	ret = gfs2_glock_nq(&gh);
-	if (unlikely(ret))
-		goto out_uninit;
-	if (!gfs2_is_stuffed(ip))
-		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
-	gfs2_glock_dq(&gh);
-out_uninit:
-	gfs2_holder_uninit(&gh);
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = -EIO;
-	return ret;
-}
-
-/**
- * gfs2_write_begin - Begin to write to a file
- * @file: The file to write to
- * @mapping: The mapping in which to write
- * @pos: The file offset at which to start writing
- * @len: Length of the write
- * @flags: Various flags
- * @pagep: Pointer to return the page
- * @fsdata: Pointer to return fs data (unused by GFS2)
- *
- * Returns: errno
- */
-
-static int gfs2_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-	int alloc_required;
-	int error = 0;
-	struct gfs2_alloc *al;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-	unsigned to = from + len;
-	struct page *page;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-	error = gfs2_glock_nq(&ip->i_gh);
-	if (unlikely(error))
-		goto out_uninit;
-
-	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
-	if (error)
-		goto out_unlock;
-
-	if (alloc_required || gfs2_is_jdata(ip))
-		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
-
-	if (alloc_required) {
-		al = gfs2_alloc_get(ip);
-		if (!al) {
-			error = -ENOMEM;
-			goto out_unlock;
-		}
-
-		error = gfs2_quota_lock_check(ip);
-		if (error)
-			goto out_alloc_put;
-
-		al->al_requested = data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip);
-		if (error)
-			goto out_qunlock;
-	}
-
-	rblocks = RES_DINODE + ind_blocks;
-	if (gfs2_is_jdata(ip))
-		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
-		rblocks += RES_STATFS + RES_QUOTA;
-
-	error = gfs2_trans_begin(sdp, rblocks,
-				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
-	if (error)
-		goto out_trans_fail;
-
-	error = -ENOMEM;
-	flags |= AOP_FLAG_NOFS;
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	*pagep = page;
-	if (unlikely(!page))
-		goto out_endtrans;
-
-	if (gfs2_is_stuffed(ip)) {
-		error = 0;
-		if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-			error = gfs2_unstuff_dinode(ip, page);
-			if (error == 0)
-				goto prepare_write;
-		} else if (!PageUptodate(page)) {
-			error = stuffed_readpage(ip, page);
-		}
-		goto out;
-	}
-
-prepare_write:
-	error = block_prepare_write(page, from, to, gfs2_block_map);
-out:
-	if (error == 0)
-		return 0;
-
-	page_cache_release(page);
-	if (pos + len > ip->i_inode.i_size)
-		vmtruncate(&ip->i_inode, ip->i_inode.i_size);
-out_endtrans:
-	gfs2_trans_end(sdp);
-out_trans_fail:
-	if (alloc_required) {
-		gfs2_inplace_release(ip);
-out_qunlock:
-		gfs2_quota_unlock(ip);
-out_alloc_put:
-		gfs2_alloc_put(ip);
-	}
-out_unlock:
-	gfs2_glock_dq(&ip->i_gh);
-out_uninit:
-	gfs2_holder_uninit(&ip->i_gh);
-	return error;
-}
-
-/**
- * adjust_fs_space - Adjusts the free space available due to gfs2_grow
- * @inode: the rindex inode
- */
-static void adjust_fs_space(struct inode *inode)
-{
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-	u64 fs_total, new_free;
-
-	/* Total up the file system space, according to the latest rindex. */
-	fs_total = gfs2_ri_total(sdp);
-
-	spin_lock(&sdp->sd_statfs_spin);
-	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
-		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
-	else
-		new_free = 0;
-	spin_unlock(&sdp->sd_statfs_spin);
-	fs_warn(sdp, "File system extended by %llu blocks.\n",
-		(unsigned long long)new_free);
-	gfs2_statfs_change(sdp, new_free, new_free, 0);
-}
-
-/**
- * gfs2_stuffed_write_end - Write end for stuffed files
- * @inode: The inode
- * @dibh: The buffer_head containing the on-disk inode
- * @pos: The file position
- * @len: The length of the write
- * @copied: How much was actually copied by the VFS
- * @page: The page
- *
- * This copies the data from the page into the inode block after
- * the inode data structure itself.
- *
- * Returns: errno
- */
-static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-				  loff_t pos, unsigned len, unsigned copied,
-				  struct page *page)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	u64 to = pos + copied;
-	void *kaddr;
-	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
-	struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
-
-	BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
-	kaddr = kmap_atomic(page, KM_USER0);
-	memcpy(buf + pos, kaddr + pos, copied);
-	memset(kaddr + pos + copied, 0, len - copied);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
-	unlock_page(page);
-	page_cache_release(page);
-
-	if (copied) {
-		if (inode->i_size < to) {
-			i_size_write(inode, to);
-			ip->i_disksize = inode->i_size;
-		}
-		gfs2_dinode_out(ip, di);
-		mark_inode_dirty(inode);
-	}
-
-	if (inode == sdp->sd_rindex)
-		adjust_fs_space(inode);
-
-	brelse(dibh);
-	gfs2_trans_end(sdp);
-	gfs2_glock_dq(&ip->i_gh);
-	gfs2_holder_uninit(&ip->i_gh);
-	return copied;
-}
-
-/**
- * gfs2_write_end
- * @file: The file to write to
- * @mapping: The address space to write to
- * @pos: The file position
- * @len: The length of the data
- * @copied:
- * @page: The page that has been written
- * @fsdata: The fsdata (unused in GFS2)
- *
- * The main write_end function for GFS2. We have a separate one for
- * stuffed files as they are slightly different, otherwise we just
- * put our locking around the VFS provided functions.
- *
- * Returns: errno
- */
-
-static int gfs2_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
-			  struct page *page, void *fsdata)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct buffer_head *dibh;
-	struct gfs2_alloc *al = ip->i_alloc;
-	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
-	unsigned int to = from + len;
-	int ret;
-
-	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
-
-	ret = gfs2_meta_inode_buffer(ip, &dibh);
-	if (unlikely(ret)) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto failed;
-	}
-
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-
-	if (gfs2_is_stuffed(ip))
-		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
-
-	if (!gfs2_is_writeback(ip))
-		gfs2_page_add_databufs(ip, page, from, to);
-
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (ret > 0) {
-		if (inode->i_size > ip->i_disksize)
-			ip->i_disksize = inode->i_size;
-		gfs2_dinode_out(ip, dibh->b_data);
-		mark_inode_dirty(inode);
-	}
-
-	if (inode == sdp->sd_rindex)
-		adjust_fs_space(inode);
-
-	brelse(dibh);
-	gfs2_trans_end(sdp);
-failed:
-	if (al) {
-		gfs2_inplace_release(ip);
-		gfs2_quota_unlock(ip);
-		gfs2_alloc_put(ip);
-	}
-	gfs2_glock_dq(&ip->i_gh);
-	gfs2_holder_uninit(&ip->i_gh);
-	return ret;
-}
-
-/**
- * gfs2_set_page_dirty - Page dirtying function
- * @page: The page to dirty
- *
- * Returns: 1 if it dirtyed the page, or 0 otherwise
- */
- 
-static int gfs2_set_page_dirty(struct page *page)
-{
-	SetPageChecked(page);
-	return __set_page_dirty_buffers(page);
-}
-
-/**
- * gfs2_bmap - Block map function
- * @mapping: Address space info
- * @lblock: The block to map
- *
- * Returns: The disk address for the block or 0 on hole or error
- */
-
-static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
-{
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-	struct gfs2_holder i_gh;
-	sector_t dblock = 0;
-	int error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return 0;
-
-	if (!gfs2_is_stuffed(ip))
-		dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return dblock;
-}
-
-static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
-{
-	struct gfs2_bufdata *bd;
-
-	lock_buffer(bh);
-	gfs2_log_lock(sdp);
-	clear_buffer_dirty(bh);
-	bd = bh->b_private;
-	if (bd) {
-		if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
-			list_del_init(&bd->bd_le.le_list);
-		else
-			gfs2_remove_from_journal(bh, current->journal_info, 0);
-	}
-	bh->b_bdev = NULL;
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	gfs2_log_unlock(sdp);
-	unlock_buffer(bh);
-}
-
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-	struct buffer_head *bh, *head;
-	unsigned long pos = 0;
-
-	BUG_ON(!PageLocked(page));
-	if (offset == 0)
-		ClearPageChecked(page);
-	if (!page_has_buffers(page))
-		goto out;
-
-	bh = head = page_buffers(page);
-	do {
-		if (offset <= pos)
-			gfs2_discard(sdp, bh);
-		pos += bh->b_size;
-		bh = bh->b_this_page;
-	} while (bh != head);
-out:
-	if (offset == 0)
-		try_to_release_page(page, 0);
-}
-
-/**
- * gfs2_ok_for_dio - check that dio is valid on this file
- * @ip: The inode
- * @rw: READ or WRITE
- * @offset: The offset at which we are reading or writing
- *
- * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
- *          1 (to accept the i/o request)
- */
-static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
-{
-	/*
-	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a stuffed file makes any sense. For now we'll silently fall
-	 * back to buffered I/O
-	 */
-	if (gfs2_is_stuffed(ip))
-		return 0;
-
-	if (offset >= i_size_read(&ip->i_inode))
-		return 0;
-	return 1;
-}
-
-
-
-static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
-			      const struct iovec *iov, loff_t offset,
-			      unsigned long nr_segs)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int rv;
-
-	/*
-	 * Deferred lock, even if its a write, since we do no allocation
-	 * on this path. All we need change is atime, and this lock mode
-	 * ensures that other nodes have flushed their buffered read caches
-	 * (i.e. their page cache entries for this inode). We do not,
-	 * unfortunately have the option of only flushing a range like
-	 * the VFS does.
-	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
-	rv = gfs2_glock_nq(&gh);
-	if (rv)
-		return rv;
-	rv = gfs2_ok_for_dio(ip, rw, offset);
-	if (rv != 1)
-		goto out; /* dio not valid, fall back to buffered i/o */
-
-	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-					   iov, offset, nr_segs,
-					   gfs2_get_block_direct, NULL);
-out:
-	gfs2_glock_dq_m(1, &gh);
-	gfs2_holder_uninit(&gh);
-	return rv;
-}
-
-/**
- * gfs2_releasepage - free the metadata associated with a page
- * @page: the page that's being released
- * @gfp_mask: passed from Linux VFS, ignored by us
- *
- * Call try_to_free_buffers() if the buffers in this page can be
- * released.
- *
- * Returns: 0
- */
-
-int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
-{
-	struct inode *aspace = page->mapping->host;
-	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
-	struct buffer_head *bh, *head;
-	struct gfs2_bufdata *bd;
-
-	if (!page_has_buffers(page))
-		return 0;
-
-	gfs2_log_lock(sdp);
-	head = bh = page_buffers(page);
-	do {
-		if (atomic_read(&bh->b_count))
-			goto cannot_release;
-		bd = bh->b_private;
-		if (bd && bd->bd_ail)
-			goto cannot_release;
-		gfs2_assert_warn(sdp, !buffer_pinned(bh));
-		gfs2_assert_warn(sdp, !buffer_dirty(bh));
-		bh = bh->b_this_page;
-	} while(bh != head);
-	gfs2_log_unlock(sdp);
-
-	head = bh = page_buffers(page);
-	do {
-		gfs2_log_lock(sdp);
-		bd = bh->b_private;
-		if (bd) {
-			gfs2_assert_warn(sdp, bd->bd_bh == bh);
-			gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
-			if (!list_empty(&bd->bd_le.le_list)) {
-				if (!buffer_pinned(bh))
-					list_del_init(&bd->bd_le.le_list);
-				else
-					bd = NULL;
-			}
-			if (bd)
-				bd->bd_bh = NULL;
-			bh->b_private = NULL;
-		}
-		gfs2_log_unlock(sdp);
-		if (bd)
-			kmem_cache_free(gfs2_bufdata_cachep, bd);
-
-		bh = bh->b_this_page;
-	} while (bh != head);
-
-	return try_to_free_buffers(page);
-cannot_release:
-	gfs2_log_unlock(sdp);
-	return 0;
-}
-
-static const struct address_space_operations gfs2_writeback_aops = {
-	.writepage = gfs2_writeback_writepage,
-	.writepages = gfs2_writeback_writepages,
-	.readpage = gfs2_readpage,
-	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
-	.bmap = gfs2_bmap,
-	.invalidatepage = gfs2_invalidatepage,
-	.releasepage = gfs2_releasepage,
-	.direct_IO = gfs2_direct_IO,
-	.migratepage = buffer_migrate_page,
-	.is_partially_uptodate = block_is_partially_uptodate,
-};
-
-static const struct address_space_operations gfs2_ordered_aops = {
-	.writepage = gfs2_ordered_writepage,
-	.readpage = gfs2_readpage,
-	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
-	.set_page_dirty = gfs2_set_page_dirty,
-	.bmap = gfs2_bmap,
-	.invalidatepage = gfs2_invalidatepage,
-	.releasepage = gfs2_releasepage,
-	.direct_IO = gfs2_direct_IO,
-	.migratepage = buffer_migrate_page,
-	.is_partially_uptodate = block_is_partially_uptodate,
-};
-
-static const struct address_space_operations gfs2_jdata_aops = {
-	.writepage = gfs2_jdata_writepage,
-	.writepages = gfs2_jdata_writepages,
-	.readpage = gfs2_readpage,
-	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
-	.set_page_dirty = gfs2_set_page_dirty,
-	.bmap = gfs2_bmap,
-	.invalidatepage = gfs2_invalidatepage,
-	.releasepage = gfs2_releasepage,
-	.is_partially_uptodate = block_is_partially_uptodate,
-};
-
-void gfs2_set_aops(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (gfs2_is_writeback(ip))
-		inode->i_mapping->a_ops = &gfs2_writeback_aops;
-	else if (gfs2_is_ordered(ip))
-		inode->i_mapping->a_ops = &gfs2_ordered_aops;
-	else if (gfs2_is_jdata(ip))
-		inode->i_mapping->a_ops = &gfs2_jdata_aops;
-	else
-		BUG();
-}
-
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da2128..0000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_ADDRESS_DOT_H__
-#define __OPS_ADDRESS_DOT_H__
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-
-extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
-			      struct file_ra_state *ra_state,
-			      char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
-
-#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
deleted file mode 100644
index 022c66c..0000000
--- a/fs/gfs2/ops_dentry.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "dir.h"
-#include "glock.h"
-#include "super.h"
-#include "util.h"
-#include "inode.h"
-
-/**
- * gfs2_drevalidate - Check directory lookup consistency
- * @dentry: the mapping to check
- * @nd:
- *
- * Check to make sure the lookup necessary to arrive at this inode from its
- * parent is still good.
- *
- * Returns: 1 if the dentry is ok, 0 if it isn't
- */
-
-static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct dentry *parent = dget_parent(dentry);
-	struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
-	struct gfs2_inode *dip = GFS2_I(parent->d_inode);
-	struct inode *inode = dentry->d_inode;
-	struct gfs2_holder d_gh;
-	struct gfs2_inode *ip = NULL;
-	int error;
-	int had_lock = 0;
-
-	if (inode) {
-		if (is_bad_inode(inode))
-			goto invalid;
-		ip = GFS2_I(inode);
-	}
-
-	if (sdp->sd_args.ar_localcaching)
-		goto valid;
-
-	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
-	if (!had_lock) {
-		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-		if (error)
-			goto fail;
-	} 
-
-	error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip);
-	switch (error) {
-	case 0:
-		if (!inode)
-			goto invalid_gunlock;
-		break;
-	case -ENOENT:
-		if (!inode)
-			goto valid_gunlock;
-		goto invalid_gunlock;
-	default:
-		goto fail_gunlock;
-	}
-
-valid_gunlock:
-	if (!had_lock)
-		gfs2_glock_dq_uninit(&d_gh);
-valid:
-	dput(parent);
-	return 1;
-
-invalid_gunlock:
-	if (!had_lock)
-		gfs2_glock_dq_uninit(&d_gh);
-invalid:
-	if (inode && S_ISDIR(inode->i_mode)) {
-		if (have_submounts(dentry))
-			goto valid;
-		shrink_dcache_parent(dentry);
-	}
-	d_drop(dentry);
-	dput(parent);
-	return 0;
-
-fail_gunlock:
-	gfs2_glock_dq_uninit(&d_gh);
-fail:
-	dput(parent);
-	return 0;
-}
-
-static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
-{
-	str->hash = gfs2_disk_hash(str->name, str->len);
-	return 0;
-}
-
-const struct dentry_operations gfs2_dops = {
-	.d_revalidate = gfs2_drevalidate,
-	.d_hash = gfs2_dhash,
-};
-
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
deleted file mode 100644
index 9200ef2..0000000
--- a/fs/gfs2/ops_export.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/exportfs.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "dir.h"
-#include "glock.h"
-#include "glops.h"
-#include "inode.h"
-#include "super.h"
-#include "rgrp.h"
-#include "util.h"
-
-#define GFS2_SMALL_FH_SIZE 4
-#define GFS2_LARGE_FH_SIZE 8
-#define GFS2_OLD_FH_SIZE 10
-
-static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
-			  int connectable)
-{
-	__be32 *fh = (__force __be32 *)p;
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = inode->i_sb;
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (*len < GFS2_SMALL_FH_SIZE ||
-	    (connectable && *len < GFS2_LARGE_FH_SIZE))
-		return 255;
-
-	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
-	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
-	fh[2] = cpu_to_be32(ip->i_no_addr >> 32);
-	fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
-	*len = GFS2_SMALL_FH_SIZE;
-
-	if (!connectable || inode == sb->s_root->d_inode)
-		return *len;
-
-	spin_lock(&dentry->d_lock);
-	inode = dentry->d_parent->d_inode;
-	ip = GFS2_I(inode);
-	igrab(inode);
-	spin_unlock(&dentry->d_lock);
-
-	fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
-	fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
-	fh[6] = cpu_to_be32(ip->i_no_addr >> 32);
-	fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
-	*len = GFS2_LARGE_FH_SIZE;
-
-	iput(inode);
-
-	return *len;
-}
-
-struct get_name_filldir {
-	struct gfs2_inum_host inum;
-	char *name;
-};
-
-static int get_name_filldir(void *opaque, const char *name, int length,
-			    loff_t offset, u64 inum, unsigned int type)
-{
-	struct get_name_filldir *gnfd = opaque;
-
-	if (inum != gnfd->inum.no_addr)
-		return 0;
-
-	memcpy(gnfd->name, name, length);
-	gnfd->name[length] = 0;
-
-	return 1;
-}
-
-static int gfs2_get_name(struct dentry *parent, char *name,
-			 struct dentry *child)
-{
-	struct inode *dir = parent->d_inode;
-	struct inode *inode = child->d_inode;
-	struct gfs2_inode *dip, *ip;
-	struct get_name_filldir gnfd;
-	struct gfs2_holder gh;
-	u64 offset = 0;
-	int error;
-
-	if (!dir)
-		return -EINVAL;
-
-	if (!S_ISDIR(dir->i_mode) || !inode)
-		return -EINVAL;
-
-	dip = GFS2_I(dir);
-	ip = GFS2_I(inode);
-
-	*name = 0;
-	gnfd.inum.no_addr = ip->i_no_addr;
-	gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
-	gnfd.name = name;
-
-	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
-	if (error)
-		return error;
-
-	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
-
-	gfs2_glock_dq_uninit(&gh);
-
-	if (!error && !*name)
-		error = -ENOENT;
-
-	return error;
-}
-
-static struct dentry *gfs2_get_parent(struct dentry *child)
-{
-	struct qstr dotdot;
-	struct dentry *dentry;
-
-	/*
-	 * XXX(hch): it would be a good idea to keep this around as a
-	 *	     static variable.
-	 */
-	gfs2_str2qstr(&dotdot, "..");
-
-	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
-	if (!IS_ERR(dentry))
-		dentry->d_op = &gfs2_dops;
-	return dentry;
-}
-
-static struct dentry *gfs2_get_dentry(struct super_block *sb,
-		struct gfs2_inum_host *inum)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_holder i_gh, ri_gh, rgd_gh;
-	struct gfs2_rgrpd *rgd;
-	struct inode *inode;
-	struct dentry *dentry;
-	int error;
-
-	/* System files? */
-
-	inode = gfs2_ilookup(sb, inum->no_addr);
-	if (inode) {
-		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-			iput(inode);
-			return ERR_PTR(-ESTALE);
-		}
-		goto out_inode;
-	}
-
-	error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
-				  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return ERR_PTR(error);
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto fail;
-
-	error = -EINVAL;
-	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
-	if (!rgd)
-		goto fail_rindex;
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
-	if (error)
-		goto fail_rindex;
-
-	error = -ESTALE;
-	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
-		goto fail_rgd;
-
-	gfs2_glock_dq_uninit(&rgd_gh);
-	gfs2_glock_dq_uninit(&ri_gh);
-
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
-					inum->no_addr,
-					0, 0);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		goto fail;
-	}
-
-	error = gfs2_inode_refresh(GFS2_I(inode));
-	if (error) {
-		iput(inode);
-		goto fail;
-	}
-
-	/* Pick up the works we bypass in gfs2_inode_lookup */
-	if (inode->i_state & I_NEW) 
-		gfs2_set_iop(inode);
-
-	if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-		iput(inode);
-		goto fail;
-	}
-
-	error = -EIO;
-	if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
-		iput(inode);
-		goto fail;
-	}
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-out_inode:
-	dentry = d_obtain_alias(inode);
-	if (!IS_ERR(dentry))
-		dentry->d_op = &gfs2_dops;
-	return dentry;
-
-fail_rgd:
-	gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
-	gfs2_glock_dq_uninit(&ri_gh);
-
-fail:
-	gfs2_glock_dq_uninit(&i_gh);
-	return ERR_PTR(error);
-}
-
-static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	struct gfs2_inum_host this;
-	__be32 *fh = (__force __be32 *)fid->raw;
-
-	switch (fh_type) {
-	case GFS2_SMALL_FH_SIZE:
-	case GFS2_LARGE_FH_SIZE:
-	case GFS2_OLD_FH_SIZE:
-		this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
-		this.no_formal_ino |= be32_to_cpu(fh[1]);
-		this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
-		this.no_addr |= be32_to_cpu(fh[3]);
-		return gfs2_get_dentry(sb, &this);
-	default:
-		return NULL;
-	}
-}
-
-static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	struct gfs2_inum_host parent;
-	__be32 *fh = (__force __be32 *)fid->raw;
-
-	switch (fh_type) {
-	case GFS2_LARGE_FH_SIZE:
-	case GFS2_OLD_FH_SIZE:
-		parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
-		parent.no_formal_ino |= be32_to_cpu(fh[5]);
-		parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
-		parent.no_addr |= be32_to_cpu(fh[7]);
-		return gfs2_get_dentry(sb, &parent);
-	default:
-		return NULL;
-	}
-}
-
-const struct export_operations gfs2_export_ops = {
-	.encode_fh = gfs2_encode_fh,
-	.fh_to_dentry = gfs2_fh_to_dentry,
-	.fh_to_parent = gfs2_fh_to_parent,
-	.get_name = gfs2_get_name,
-	.get_parent = gfs2_get_parent,
-};
-
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
deleted file mode 100644
index 0ee7bd2..0000000
--- a/fs/gfs2/ops_file.c
+++ /dev/null
@@ -1,766 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/pagemap.h>
-#include <linux/uio.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/ext2_fs.h>
-#include <linux/crc32.h>
-#include <linux/writeback.h>
-#include <asm/uaccess.h>
-#include <linux/dlm.h>
-#include <linux/dlm_plock.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "bmap.h"
-#include "dir.h"
-#include "glock.h"
-#include "glops.h"
-#include "inode.h"
-#include "log.h"
-#include "meta_io.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-#include "eaops.h"
-#include "ops_address.h"
-
-/**
- * gfs2_llseek - seek to a location in a file
- * @file: the file
- * @offset: the offset
- * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
- *
- * SEEK_END requires the glock for the file because it references the
- * file's size.
- *
- * Returns: The new offset, or errno
- */
-
-static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
-{
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_holder i_gh;
-	loff_t error;
-
-	if (origin == 2) {
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
-					   &i_gh);
-		if (!error) {
-			error = generic_file_llseek_unlocked(file, offset, origin);
-			gfs2_glock_dq_uninit(&i_gh);
-		}
-	} else
-		error = generic_file_llseek_unlocked(file, offset, origin);
-
-	return error;
-}
-
-/**
- * gfs2_readdir - Read directory entries from a directory
- * @file: The directory to read from
- * @dirent: Buffer for dirents
- * @filldir: Function used to do the copying
- *
- * Returns: errno
- */
-
-static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
-{
-	struct inode *dir = file->f_mapping->host;
-	struct gfs2_inode *dip = GFS2_I(dir);
-	struct gfs2_holder d_gh;
-	u64 offset = file->f_pos;
-	int error;
-
-	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-	error = gfs2_glock_nq(&d_gh);
-	if (error) {
-		gfs2_holder_uninit(&d_gh);
-		return error;
-	}
-
-	error = gfs2_dir_read(dir, &offset, dirent, filldir);
-
-	gfs2_glock_dq_uninit(&d_gh);
-
-	file->f_pos = offset;
-
-	return error;
-}
-
-/**
- * fsflags_cvt
- * @table: A table of 32 u32 flags
- * @val: a 32 bit value to convert
- *
- * This function can be used to convert between fsflags values and
- * GFS2's own flags values.
- *
- * Returns: the converted flags
- */
-static u32 fsflags_cvt(const u32 *table, u32 val)
-{
-	u32 res = 0;
-	while(val) {
-		if (val & 1)
-			res |= *table;
-		table++;
-		val >>= 1;
-	}
-	return res;
-}
-
-static const u32 fsflags_to_gfs2[32] = {
-	[3] = GFS2_DIF_SYNC,
-	[4] = GFS2_DIF_IMMUTABLE,
-	[5] = GFS2_DIF_APPENDONLY,
-	[7] = GFS2_DIF_NOATIME,
-	[12] = GFS2_DIF_EXHASH,
-	[14] = GFS2_DIF_INHERIT_JDATA,
-};
-
-static const u32 gfs2_to_fsflags[32] = {
-	[gfs2fl_Sync] = FS_SYNC_FL,
-	[gfs2fl_Immutable] = FS_IMMUTABLE_FL,
-	[gfs2fl_AppendOnly] = FS_APPEND_FL,
-	[gfs2fl_NoAtime] = FS_NOATIME_FL,
-	[gfs2fl_ExHash] = FS_INDEX_FL,
-	[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
-};
-
-static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int error;
-	u32 fsflags;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-	error = gfs2_glock_nq(&gh);
-	if (error)
-		return error;
-
-	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
-	if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
-		fsflags |= FS_JOURNAL_DATA_FL;
-	if (put_user(fsflags, ptr))
-		error = -EFAULT;
-
-	gfs2_glock_dq(&gh);
-	gfs2_holder_uninit(&gh);
-	return error;
-}
-
-void gfs2_set_inode_flags(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned int flags = inode->i_flags;
-
-	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
-	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
-		flags |= S_IMMUTABLE;
-	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
-		flags |= S_APPEND;
-	if (ip->i_diskflags & GFS2_DIF_NOATIME)
-		flags |= S_NOATIME;
-	if (ip->i_diskflags & GFS2_DIF_SYNC)
-		flags |= S_SYNC;
-	inode->i_flags = flags;
-}
-
-/* Flags that can be set by user space */
-#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|			\
-			     GFS2_DIF_IMMUTABLE|		\
-			     GFS2_DIF_APPENDONLY|		\
-			     GFS2_DIF_NOATIME|			\
-			     GFS2_DIF_SYNC|			\
-			     GFS2_DIF_SYSTEM|			\
-			     GFS2_DIF_INHERIT_JDATA)
-
-/**
- * gfs2_set_flags - set flags on an inode
- * @inode: The inode
- * @flags: The flags to set
- * @mask: Indicates which flags are valid
- *
- */
-static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct buffer_head *bh;
-	struct gfs2_holder gh;
-	int error;
-	u32 new_flags, flags;
-
-	error = mnt_want_write(filp->f_path.mnt);
-	if (error)
-		return error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (error)
-		goto out_drop_write;
-
-	flags = ip->i_diskflags;
-	new_flags = (flags & ~mask) | (reqflags & mask);
-	if ((new_flags ^ flags) == 0)
-		goto out;
-
-	error = -EINVAL;
-	if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
-		goto out;
-
-	error = -EPERM;
-	if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
-		goto out;
-	if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
-		goto out;
-	if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		goto out;
-	if (!IS_IMMUTABLE(inode)) {
-		error = gfs2_permission(inode, MAY_WRITE);
-		if (error)
-			goto out;
-	}
-	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
-		if (flags & GFS2_DIF_JDATA)
-			gfs2_log_flush(sdp, ip->i_gl);
-		error = filemap_fdatawrite(inode->i_mapping);
-		if (error)
-			goto out;
-		error = filemap_fdatawait(inode->i_mapping);
-		if (error)
-			goto out;
-	}
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (error)
-		goto out;
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
-		goto out_trans_end;
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	ip->i_diskflags = new_flags;
-	gfs2_dinode_out(ip, bh->b_data);
-	brelse(bh);
-	gfs2_set_inode_flags(inode);
-	gfs2_set_aops(inode);
-out_trans_end:
-	gfs2_trans_end(sdp);
-out:
-	gfs2_glock_dq_uninit(&gh);
-out_drop_write:
-	mnt_drop_write(filp->f_path.mnt);
-	return error;
-}
-
-static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	u32 fsflags, gfsflags;
-	if (get_user(fsflags, ptr))
-		return -EFAULT;
-	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
-	if (!S_ISDIR(inode->i_mode)) {
-		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
-			gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
-		return do_gfs2_set_flags(filp, gfsflags, ~0);
-	}
-	return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
-}
-
-static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	switch(cmd) {
-	case FS_IOC_GETFLAGS:
-		return gfs2_get_flags(filp, (u32 __user *)arg);
-	case FS_IOC_SETFLAGS:
-		return gfs2_set_flags(filp, (u32 __user *)arg);
-	}
-	return -ENOTTY;
-}
-
-/**
- * gfs2_allocate_page_backing - Use bmap to allocate blocks
- * @page: The (locked) page to allocate backing for
- *
- * We try to allocate all the blocks required for the page in
- * one go. This might fail for various reasons, so we keep
- * trying until all the blocks to back this page are allocated.
- * If some of the blocks are already allocated, thats ok too.
- */
-
-static int gfs2_allocate_page_backing(struct page *page)
-{
-	struct inode *inode = page->mapping->host;
-	struct buffer_head bh;
-	unsigned long size = PAGE_CACHE_SIZE;
-	u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	do {
-		bh.b_state = 0;
-		bh.b_size = size;
-		gfs2_block_map(inode, lblock, &bh, 1);
-		if (!buffer_mapped(&bh))
-			return -EIO;
-		size -= bh.b_size;
-		lblock += (bh.b_size >> inode->i_blkbits);
-	} while(size > 0);
-	return 0;
-}
-
-/**
- * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
- * @vma: The virtual memory area
- * @page: The page which is about to become writable
- *
- * When the page becomes writable, we need to ensure that we have
- * blocks allocated on disk to back that page.
- */
-
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	unsigned long last_index;
-	u64 pos = page->index << PAGE_CACHE_SHIFT;
-	unsigned int data_blocks, ind_blocks, rblocks;
-	int alloc_required = 0;
-	struct gfs2_holder gh;
-	struct gfs2_alloc *al;
-	int ret;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	ret = gfs2_glock_nq(&gh);
-	if (ret)
-		goto out;
-
-	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
-	set_bit(GIF_SW_PAGED, &ip->i_flags);
-
-	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
-	if (ret || !alloc_required)
-		goto out_unlock;
-	ret = -ENOMEM;
-	al = gfs2_alloc_get(ip);
-	if (al == NULL)
-		goto out_unlock;
-
-	ret = gfs2_quota_lock_check(ip);
-	if (ret)
-		goto out_alloc_put;
-	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-	al->al_requested = data_blocks + ind_blocks;
-	ret = gfs2_inplace_reserve(ip);
-	if (ret)
-		goto out_quota_unlock;
-
-	rblocks = RES_DINODE + ind_blocks;
-	if (gfs2_is_jdata(ip))
-		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
-		rblocks += RES_STATFS + RES_QUOTA;
-	ret = gfs2_trans_begin(sdp, rblocks, 0);
-	if (ret)
-		goto out_trans_fail;
-
-	lock_page(page);
-	ret = -EINVAL;
-	last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
-	if (page->index > last_index)
-		goto out_unlock_page;
-	ret = 0;
-	if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
-		goto out_unlock_page;
-	if (gfs2_is_stuffed(ip)) {
-		ret = gfs2_unstuff_dinode(ip, page);
-		if (ret)
-			goto out_unlock_page;
-	}
-	ret = gfs2_allocate_page_backing(page);
-
-out_unlock_page:
-	unlock_page(page);
-	gfs2_trans_end(sdp);
-out_trans_fail:
-	gfs2_inplace_release(ip);
-out_quota_unlock:
-	gfs2_quota_unlock(ip);
-out_alloc_put:
-	gfs2_alloc_put(ip);
-out_unlock:
-	gfs2_glock_dq(&gh);
-out:
-	gfs2_holder_uninit(&gh);
-	if (ret == -ENOMEM)
-		ret = VM_FAULT_OOM;
-	else if (ret)
-		ret = VM_FAULT_SIGBUS;
-	return ret;
-}
-
-static struct vm_operations_struct gfs2_vm_ops = {
-	.fault = filemap_fault,
-	.page_mkwrite = gfs2_page_mkwrite,
-};
-
-/**
- * gfs2_mmap -
- * @file: The file to map
- * @vma: The VMA which described the mapping
- *
- * There is no need to get a lock here unless we should be updating
- * atime. We ignore any locking errors since the only consequence is
- * a missed atime update (which will just be deferred until later).
- *
- * Returns: 0
- */
-
-static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-
-	if (!(file->f_flags & O_NOATIME)) {
-		struct gfs2_holder i_gh;
-		int error;
-
-		gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-		error = gfs2_glock_nq(&i_gh);
-		file_accessed(file);
-		if (error == 0)
-			gfs2_glock_dq_uninit(&i_gh);
-	}
-	vma->vm_ops = &gfs2_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
-
-	return 0;
-}
-
-/**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
- *
- * Returns: errno
- */
-
-static int gfs2_open(struct inode *inode, struct file *file)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder i_gh;
-	struct gfs2_file *fp;
-	int error;
-
-	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
-	if (!fp)
-		return -ENOMEM;
-
-	mutex_init(&fp->f_fl_mutex);
-
-	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
-	file->private_data = fp;
-
-	if (S_ISREG(ip->i_inode.i_mode)) {
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
-					   &i_gh);
-		if (error)
-			goto fail;
-
-		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_disksize > MAX_NON_LFS) {
-			error = -EOVERFLOW;
-			goto fail_gunlock;
-		}
-
-		gfs2_glock_dq_uninit(&i_gh);
-	}
-
-	return 0;
-
-fail_gunlock:
-	gfs2_glock_dq_uninit(&i_gh);
-fail:
-	file->private_data = NULL;
-	kfree(fp);
-	return error;
-}
-
-/**
- * gfs2_close - called to close a struct file
- * @inode: the inode the struct file belongs to
- * @file: the struct file being closed
- *
- * Returns: errno
- */
-
-static int gfs2_close(struct inode *inode, struct file *file)
-{
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_file *fp;
-
-	fp = file->private_data;
-	file->private_data = NULL;
-
-	if (gfs2_assert_warn(sdp, fp))
-		return -EIO;
-
-	kfree(fp);
-
-	return 0;
-}
-
-/**
- * gfs2_fsync - sync the dirty data for a file (across the cluster)
- * @file: the file that points to the dentry (we ignore this)
- * @dentry: the dentry that points to the inode to sync
- *
- * The VFS will flush "normal" data for us. We only need to worry
- * about metadata here. For journaled data, we just do a log flush
- * as we can't avoid it. Otherwise we can just bale out if datasync
- * is set. For stuffed inodes we must flush the log in order to
- * ensure that all data is on disk.
- *
- * The call to write_inode_now() is there to write back metadata and
- * the inode itself. It does also try and write the data, but thats
- * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
- * for us.
- *
- * Returns: errno
- */
-
-static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
-	int ret = 0;
-
-	if (gfs2_is_jdata(GFS2_I(inode))) {
-		gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
-		return 0;
-	}
-
-	if (sync_state != 0) {
-		if (!datasync)
-			ret = write_inode_now(inode, 0);
-
-		if (gfs2_is_stuffed(GFS2_I(inode)))
-			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
-	}
-
-	return ret;
-}
-
-#ifdef CONFIG_GFS2_FS_LOCKING_DLM
-
-/**
- * gfs2_setlease - acquire/release a file lease
- * @file: the file pointer
- * @arg: lease type
- * @fl: file lock
- *
- * We don't currently have a way to enforce a lease across the whole
- * cluster; until we do, disable leases (by just returning -EINVAL),
- * unless the administrator has requested purely local locking.
- *
- * Returns: errno
- */
-
-static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
-{
-	return -EINVAL;
-}
-
-/**
- * gfs2_lock - acquire/release a posix lock on a file
- * @file: the file pointer
- * @cmd: either modify or retrieve lock state, possibly wait
- * @fl: type and range of lock
- *
- * Returns: errno
- */
-
-static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-
-	if (!(fl->fl_flags & FL_POSIX))
-		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
-		return -ENOLCK;
-
-	if (cmd == F_CANCELLK) {
-		/* Hack: */
-		cmd = F_SETLK;
-		fl->fl_type = F_UNLCK;
-	}
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		return -EIO;
-	if (IS_GETLK(cmd))
-		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
-	else if (fl->fl_type == F_UNLCK)
-		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
-	else
-		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
-}
-
-static int do_flock(struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gfs2_file *fp = file->private_data;
-	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
-	struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
-	struct gfs2_glock *gl;
-	unsigned int state;
-	int flags;
-	int error = 0;
-
-	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
-
-	mutex_lock(&fp->f_fl_mutex);
-
-	gl = fl_gh->gh_gl;
-	if (gl) {
-		if (fl_gh->gh_state == state)
-			goto out;
-		flock_lock_file_wait(file,
-				     &(struct file_lock){.fl_type = F_UNLCK});
-		gfs2_glock_dq_wait(fl_gh);
-		gfs2_holder_reinit(state, flags, fl_gh);
-	} else {
-		error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
-				       &gfs2_flock_glops, CREATE, &gl);
-		if (error)
-			goto out;
-		gfs2_holder_init(gl, state, flags, fl_gh);
-		gfs2_glock_put(gl);
-	}
-	error = gfs2_glock_nq(fl_gh);
-	if (error) {
-		gfs2_holder_uninit(fl_gh);
-		if (error == GLR_TRYFAILED)
-			error = -EAGAIN;
-	} else {
-		error = flock_lock_file_wait(file, fl);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-	}
-
-out:
-	mutex_unlock(&fp->f_fl_mutex);
-	return error;
-}
-
-static void do_unflock(struct file *file, struct file_lock *fl)
-{
-	struct gfs2_file *fp = file->private_data;
-	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
-
-	mutex_lock(&fp->f_fl_mutex);
-	flock_lock_file_wait(file, fl);
-	if (fl_gh->gh_gl)
-		gfs2_glock_dq_uninit(fl_gh);
-	mutex_unlock(&fp->f_fl_mutex);
-}
-
-/**
- * gfs2_flock - acquire/release a flock lock on a file
- * @file: the file pointer
- * @cmd: either modify or retrieve lock state, possibly wait
- * @fl: type and range of lock
- *
- * Returns: errno
- */
-
-static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-
-	if (!(fl->fl_flags & FL_FLOCK))
-		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
-		return -ENOLCK;
-
-	if (fl->fl_type == F_UNLCK) {
-		do_unflock(file, fl);
-		return 0;
-	} else {
-		return do_flock(file, cmd, fl);
-	}
-}
-
-const struct file_operations gfs2_file_fops = {
-	.llseek		= gfs2_llseek,
-	.read		= do_sync_read,
-	.aio_read	= generic_file_aio_read,
-	.write		= do_sync_write,
-	.aio_write	= generic_file_aio_write,
-	.unlocked_ioctl	= gfs2_ioctl,
-	.mmap		= gfs2_mmap,
-	.open		= gfs2_open,
-	.release	= gfs2_close,
-	.fsync		= gfs2_fsync,
-	.lock		= gfs2_lock,
-	.flock		= gfs2_flock,
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
-	.setlease	= gfs2_setlease,
-};
-
-const struct file_operations gfs2_dir_fops = {
-	.readdir	= gfs2_readdir,
-	.unlocked_ioctl	= gfs2_ioctl,
-	.open		= gfs2_open,
-	.release	= gfs2_close,
-	.fsync		= gfs2_fsync,
-	.lock		= gfs2_lock,
-	.flock		= gfs2_flock,
-};
-
-#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
-
-const struct file_operations gfs2_file_fops_nolock = {
-	.llseek		= gfs2_llseek,
-	.read		= do_sync_read,
-	.aio_read	= generic_file_aio_read,
-	.write		= do_sync_write,
-	.aio_write	= generic_file_aio_write,
-	.unlocked_ioctl	= gfs2_ioctl,
-	.mmap		= gfs2_mmap,
-	.open		= gfs2_open,
-	.release	= gfs2_close,
-	.fsync		= gfs2_fsync,
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
-	.setlease	= generic_setlease,
-};
-
-const struct file_operations gfs2_dir_fops_nolock = {
-	.readdir	= gfs2_readdir,
-	.unlocked_ioctl	= gfs2_ioctl,
-	.open		= gfs2_open,
-	.release	= gfs2_close,
-	.fsync		= gfs2_fsync,
-};
-
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4..2e9b932 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
 #include "super.h"
 #include "trans.h"
 #include "inode.h"
-#include "ops_address.h"
 #include "util.h"
 
 #define QUOTA_USER 1
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ee3d5c1..6122c7e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,6 @@
 #include "util.h"
 #include "log.h"
 #include "inode.h"
-#include "ops_address.h"
 
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
-- 
cgit v0.10.2


From 9e6e0a128bca0a151d8d3fbd9459b22fc21cfebb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:36:01 +0100
Subject: GFS2: Merge mount.c and ops_super.c into super.c

mount.c only contained a single function, so is not really
worth retaining on its own. All of the super related code
is now either in super.c or ops_fstype.c

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 4f7332c..d53a9be 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,8 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
-	mount.o aops.o dentry.o export.o file.o \
-	ops_fstype.o ops_inode.o ops_super.o quota.o \
+	aops.o dentry.o export.o file.o \
+	ops_fstype.o ops_inode.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
 gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index 947af15..0000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/parser.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-
-enum {
-	Opt_lockproto,
-	Opt_locktable,
-	Opt_hostdata,
-	Opt_spectator,
-	Opt_ignore_local_fs,
-	Opt_localflocks,
-	Opt_localcaching,
-	Opt_debug,
-	Opt_nodebug,
-	Opt_upgrade,
-	Opt_acl,
-	Opt_noacl,
-	Opt_quota_off,
-	Opt_quota_account,
-	Opt_quota_on,
-	Opt_quota,
-	Opt_noquota,
-	Opt_suiddir,
-	Opt_nosuiddir,
-	Opt_data_writeback,
-	Opt_data_ordered,
-	Opt_meta,
-	Opt_discard,
-	Opt_nodiscard,
-	Opt_commit,
-	Opt_err,
-};
-
-static const match_table_t tokens = {
-	{Opt_lockproto, "lockproto=%s"},
-	{Opt_locktable, "locktable=%s"},
-	{Opt_hostdata, "hostdata=%s"},
-	{Opt_spectator, "spectator"},
-	{Opt_ignore_local_fs, "ignore_local_fs"},
-	{Opt_localflocks, "localflocks"},
-	{Opt_localcaching, "localcaching"},
-	{Opt_debug, "debug"},
-	{Opt_nodebug, "nodebug"},
-	{Opt_upgrade, "upgrade"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_quota_off, "quota=off"},
-	{Opt_quota_account, "quota=account"},
-	{Opt_quota_on, "quota=on"},
-	{Opt_quota, "quota"},
-	{Opt_noquota, "noquota"},
-	{Opt_suiddir, "suiddir"},
-	{Opt_nosuiddir, "nosuiddir"},
-	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_ordered, "data=ordered"},
-	{Opt_meta, "meta"},
-	{Opt_discard, "discard"},
-	{Opt_nodiscard, "nodiscard"},
-	{Opt_commit, "commit=%d"},
-	{Opt_err, NULL}
-};
-
-/**
- * gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
- *
- * Return: errno
- */
-
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
-{
-	char *o;
-	int token;
-	substring_t tmp[MAX_OPT_ARGS];
-	int rv;
-
-	/* Split the options into tokens with the "," character and
-	   process them */
-
-	while (1) {
-		o = strsep(&options, ",");
-		if (o == NULL)
-			break;
-		if (*o == '\0')
-			continue;
-
-		token = match_token(o, tokens, tmp);
-		switch (token) {
-		case Opt_lockproto:
-			match_strlcpy(args->ar_lockproto, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_locktable:
-			match_strlcpy(args->ar_locktable, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_hostdata:
-			match_strlcpy(args->ar_hostdata, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_spectator:
-			args->ar_spectator = 1;
-			break;
-		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
-			break;
-		case Opt_localflocks:
-			args->ar_localflocks = 1;
-			break;
-		case Opt_localcaching:
-			args->ar_localcaching = 1;
-			break;
-		case Opt_debug:
-			args->ar_debug = 1;
-			break;
-		case Opt_nodebug:
-			args->ar_debug = 0;
-			break;
-		case Opt_upgrade:
-			args->ar_upgrade = 1;
-			break;
-		case Opt_acl:
-			args->ar_posix_acl = 1;
-			break;
-		case Opt_noacl:
-			args->ar_posix_acl = 0;
-			break;
-		case Opt_quota_off:
-		case Opt_noquota:
-			args->ar_quota = GFS2_QUOTA_OFF;
-			break;
-		case Opt_quota_account:
-			args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			break;
-		case Opt_quota_on:
-		case Opt_quota:
-			args->ar_quota = GFS2_QUOTA_ON;
-			break;
-		case Opt_suiddir:
-			args->ar_suiddir = 1;
-			break;
-		case Opt_nosuiddir:
-			args->ar_suiddir = 0;
-			break;
-		case Opt_data_writeback:
-			args->ar_data = GFS2_DATA_WRITEBACK;
-			break;
-		case Opt_data_ordered:
-			args->ar_data = GFS2_DATA_ORDERED;
-			break;
-		case Opt_meta:
-			args->ar_meta = 1;
-			break;
-		case Opt_discard:
-			args->ar_discard = 1;
-			break;
-		case Opt_nodiscard:
-			args->ar_discard = 0;
-			break;
-		case Opt_commit:
-			rv = match_int(&tmp[0], &args->ar_commit);
-			if (rv || args->ar_commit <= 0) {
-				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_err:
-		default:
-			fs_info(sdp, "invalid mount option: %s\n", o);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 2fd1dcb..0000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/statfs.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-#include <linux/time.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "inode.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "rgrp.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-#include "trans.h"
-#include "dir.h"
-#include "eattr.h"
-#include "bmap.h"
-#include "meta_io.h"
-
-#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
-
-/**
- * gfs2_write_inode - Make sure the inode is stable on the disk
- * @inode: The inode
- * @sync: synchronous write flag
- *
- * Returns: errno
- */
-
-static int gfs2_write_inode(struct inode *inode, int sync)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_holder gh;
-	struct buffer_head *bh;
-	struct timespec atime;
-	struct gfs2_dinode *di;
-	int ret = 0;
-
-	/* Check this is a "normal" inode, etc */
-	if (!test_bit(GIF_USER, &ip->i_flags) ||
-	    (current->flags & PF_MEMALLOC))
-		return 0;
-	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (ret)
-		goto do_flush;
-	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (ret)
-		goto do_unlock;
-	ret = gfs2_meta_inode_buffer(ip, &bh);
-	if (ret == 0) {
-		di = (struct gfs2_dinode *)bh->b_data;
-		atime.tv_sec = be64_to_cpu(di->di_atime);
-		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
-		if (timespec_compare(&inode->i_atime, &atime) > 0) {
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			gfs2_dinode_out(ip, bh->b_data);
-		}
-		brelse(bh);
-	}
-	gfs2_trans_end(sdp);
-do_unlock:
-	gfs2_glock_dq_uninit(&gh);
-do_flush:
-	if (sync != 0)
-		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
-	return ret;
-}
-
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-	struct gfs2_holder t_gh;
-	int error;
-
-	gfs2_quota_sync(sdp);
-	gfs2_statfs_sync(sdp);
-
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-				   &t_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return error;
-
-	gfs2_meta_syncfs(sdp);
-	gfs2_log_shutdown(sdp);
-
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
-	if (t_gh.gh_gl)
-		gfs2_glock_dq_uninit(&t_gh);
-
-	gfs2_quota_cleanup(sdp);
-
-	return error;
-}
-
-static int gfs2_umount_recovery_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
-/**
- * gfs2_put_super - Unmount the filesystem
- * @sb: The VFS superblock
- *
- */
-
-static void gfs2_put_super(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error;
-	struct gfs2_jdesc *jd;
-
-	/*  Unfreeze the filesystem, if we need to  */
-
-	mutex_lock(&sdp->sd_freeze_lock);
-	if (sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-	mutex_unlock(&sdp->sd_freeze_lock);
-
-	/* No more recovery requests */
-	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
-	smp_mb();
-
-	/* Wait on outstanding recovery */
-restart:
-	spin_lock(&sdp->sd_jindex_spin);
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
-			continue;
-		spin_unlock(&sdp->sd_jindex_spin);
-		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
-			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
-		goto restart;
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		error = gfs2_make_fs_ro(sdp);
-		if (error)
-			gfs2_io_error(sdp);
-	}
-	/*  At this point, we're through modifying the disk  */
-
-	/*  Release stuff  */
-
-	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
-	iput(sdp->sd_statfs_inode);
-	iput(sdp->sd_rindex);
-	iput(sdp->sd_quota_inode);
-
-	gfs2_glock_put(sdp->sd_rename_gl);
-	gfs2_glock_put(sdp->sd_trans_gl);
-
-	if (!sdp->sd_args.ar_spectator) {
-		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
-		iput(sdp->sd_sc_inode);
-		iput(sdp->sd_qc_inode);
-	}
-
-	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
-	gfs2_clear_rgrpd(sdp);
-	gfs2_jindex_free(sdp);
-	/*  Take apart glock structures and buffer lists  */
-	gfs2_gl_hash_clear(sdp);
-	/*  Unmount the locking protocol  */
-	gfs2_lm_unmount(sdp);
-
-	/*  At this point, we're through participating in the lockspace  */
-	gfs2_sys_fs_del(sdp);
-}
-
-/**
- * gfs2_write_super
- * @sb: the superblock
- *
- */
-
-static void gfs2_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
-/**
- * gfs2_sync_fs - sync the filesystem
- * @sb: the superblock
- *
- * Flushes the log to disk.
- */
-
-static int gfs2_sync_fs(struct super_block *sb, int wait)
-{
-	sb->s_dirt = 0;
-	if (wait && sb->s_fs_info)
-		gfs2_log_flush(sb->s_fs_info, NULL);
-	return 0;
-}
-
-/**
- * gfs2_freeze - prevent further writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-
-static int gfs2_freeze(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error;
-
-	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return -EINVAL;
-
-	for (;;) {
-		error = gfs2_freeze_fs(sdp);
-		if (!error)
-			break;
-
-		switch (error) {
-		case -EBUSY:
-			fs_err(sdp, "waiting for recovery before freeze\n");
-			break;
-
-		default:
-			fs_err(sdp, "error freezing FS: %d\n", error);
-			break;
-		}
-
-		fs_err(sdp, "retrying...\n");
-		msleep(1000);
-	}
-	return 0;
-}
-
-/**
- * gfs2_unfreeze - reallow writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-
-static int gfs2_unfreeze(struct super_block *sb)
-{
-	gfs2_unfreeze_fs(sb->s_fs_info);
-	return 0;
-}
-
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-			    struct gfs2_statfs_change_host *sc)
-{
-	gfs2_rgrp_verify(rgd);
-	sc->sc_total += rgd->rd_data;
-	sc->sc_free += rgd->rd_free;
-	sc->sc_dinodes += rgd->rd_dinodes;
-	return 0;
-}
-
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-
-static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_holder ri_gh;
-	struct gfs2_rgrpd *rgd_next;
-	struct gfs2_holder *gha, *gh;
-	unsigned int slots = 64;
-	unsigned int x;
-	int done;
-	int error = 0, err;
-
-	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-	if (!gha)
-		return -ENOMEM;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto out;
-
-	rgd_next = gfs2_rgrpd_get_first(sdp);
-
-	for (;;) {
-		done = 1;
-
-		for (x = 0; x < slots; x++) {
-			gh = gha + x;
-
-			if (gh->gh_gl && gfs2_glock_poll(gh)) {
-				err = gfs2_glock_wait(gh);
-				if (err) {
-					gfs2_holder_uninit(gh);
-					error = err;
-				} else {
-					if (!error)
-						error = statfs_slow_fill(
-							gh->gh_gl->gl_object, sc);
-					gfs2_glock_dq_uninit(gh);
-				}
-			}
-
-			if (gh->gh_gl)
-				done = 0;
-			else if (rgd_next && !error) {
-				error = gfs2_glock_nq_init(rgd_next->rd_gl,
-							   LM_ST_SHARED,
-							   GL_ASYNC,
-							   gh);
-				rgd_next = gfs2_rgrpd_get_next(rgd_next);
-				done = 0;
-			}
-
-			if (signal_pending(current))
-				error = -ERESTARTSYS;
-		}
-
-		if (done)
-			break;
-
-		yield();
-	}
-
-	gfs2_glock_dq_uninit(&ri_gh);
-
-out:
-	kfree(gha);
-	return error;
-}
-
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-
-static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-
-	spin_lock(&sdp->sd_statfs_spin);
-
-	*sc = *m_sc;
-	sc->sc_total += l_sc->sc_total;
-	sc->sc_free += l_sc->sc_free;
-	sc->sc_dinodes += l_sc->sc_dinodes;
-
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	if (sc->sc_free < 0)
-		sc->sc_free = 0;
-	if (sc->sc_free > sc->sc_total)
-		sc->sc_free = sc->sc_total;
-	if (sc->sc_dinodes < 0)
-		sc->sc_dinodes = 0;
-
-	return 0;
-}
-
-/**
- * gfs2_statfs - Gather and return stats about the filesystem
- * @sb: The superblock
- * @statfsbuf: The buffer
- *
- * Returns: 0 on success or error code
- */
-
-static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_statfs_change_host sc;
-	int error;
-
-	if (gfs2_tune_get(sdp, gt_statfs_slow))
-		error = gfs2_statfs_slow(sdp, &sc);
-	else
-		error = gfs2_statfs_i(sdp, &sc);
-
-	if (error)
-		return error;
-
-	buf->f_type = GFS2_MAGIC;
-	buf->f_bsize = sdp->sd_sb.sb_bsize;
-	buf->f_blocks = sc.sc_total;
-	buf->f_bfree = sc.sc_free;
-	buf->f_bavail = sc.sc_free;
-	buf->f_files = sc.sc_dinodes + sc.sc_free;
-	buf->f_ffree = sc.sc_free;
-	buf->f_namelen = GFS2_FNAMESIZE;
-
-	return 0;
-}
-
-/**
- * gfs2_remount_fs - called when the FS is remounted
- * @sb:  the filesystem
- * @flags:  the remount flags
- * @data:  extra data passed in (not used right now)
- *
- * Returns: errno
- */
-
-static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
-	struct gfs2_tune *gt = &sdp->sd_tune;
-	int error;
-
-	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_log_flush_secs;
-	spin_unlock(&gt->gt_spin);
-	error = gfs2_mount_args(sdp, &args, data);
-	if (error)
-		return error;
-
-	/* Not allowed to change locking details */
-	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
-	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
-	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
-		return -EINVAL;
-
-	/* Some flags must not be changed */
-	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
-	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
-	    args_neq(&args, &sdp->sd_args, meta))
-		return -EINVAL;
-
-	if (sdp->sd_args.ar_spectator)
-		*flags |= MS_RDONLY;
-
-	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
-		if (*flags & MS_RDONLY)
-			error = gfs2_make_fs_ro(sdp);
-		else
-			error = gfs2_make_fs_rw(sdp);
-		if (error)
-			return error;
-	}
-
-	sdp->sd_args = args;
-	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= MS_POSIXACL;
-	else
-		sb->s_flags &= ~MS_POSIXACL;
-	spin_lock(&gt->gt_spin);
-	gt->gt_log_flush_secs = args.ar_commit;
-	spin_unlock(&gt->gt_spin);
-
-	return 0;
-}
-
-/**
- * gfs2_drop_inode - Drop an inode (test for remote unlink)
- * @inode: The inode to drop
- *
- * If we've received a callback on an iopen lock then its because a
- * remote node tried to deallocate the inode but failed due to this node
- * still having the inode open. Here we mark the link count zero
- * since we know that it must have reached zero if the GLF_DEMOTE flag
- * is set on the iopen glock. If we didn't do a disk read since the
- * remote node removed the final link then we might otherwise miss
- * this event. This check ensures that this node will deallocate the
- * inode's blocks, or alternatively pass the baton on to another
- * node for later deallocation.
- */
-
-static void gfs2_drop_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
-		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
-		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
-			clear_nlink(inode);
-	}
-	generic_drop_inode(inode);
-}
-
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-
-static void gfs2_clear_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	/* This tells us its a "real" inode and not one which only
-	 * serves to contain an address space (see rgrp.c, meta_io.c)
-	 * which therefore doesn't have its own glocks.
-	 */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		ip->i_gl->gl_object = NULL;
-		gfs2_glock_put(ip->i_gl);
-		ip->i_gl = NULL;
-		if (ip->i_iopen_gh.gh_gl) {
-			ip->i_iopen_gh.gh_gl->gl_object = NULL;
-			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		}
-	}
-}
-
-static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
-{
-	do {
-		if (d1 == d2)
-			return 1;
-		d1 = d1->d_parent;
-	} while (!IS_ROOT(d1));
-	return 0;
-}
-
-/**
- * gfs2_show_options - Show mount options for /proc/mounts
- * @s: seq_file structure
- * @mnt: vfsmount
- *
- * Returns: 0 on success or error code
- */
-
-static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
-{
-	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
-	struct gfs2_args *args = &sdp->sd_args;
-	int lfsecs;
-
-	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
-		seq_printf(s, ",meta");
-	if (args->ar_lockproto[0])
-		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
-	if (args->ar_locktable[0])
-		seq_printf(s, ",locktable=%s", args->ar_locktable);
-	if (args->ar_hostdata[0])
-		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
-	if (args->ar_spectator)
-		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
-	if (args->ar_localflocks)
-		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
-	if (args->ar_debug)
-		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
-	if (args->ar_posix_acl)
-		seq_printf(s, ",acl");
-	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
-		char *state;
-		switch (args->ar_quota) {
-		case GFS2_QUOTA_OFF:
-			state = "off";
-			break;
-		case GFS2_QUOTA_ACCOUNT:
-			state = "account";
-			break;
-		case GFS2_QUOTA_ON:
-			state = "on";
-			break;
-		default:
-			state = "unknown";
-			break;
-		}
-		seq_printf(s, ",quota=%s", state);
-	}
-	if (args->ar_suiddir)
-		seq_printf(s, ",suiddir");
-	if (args->ar_data != GFS2_DATA_DEFAULT) {
-		char *state;
-		switch (args->ar_data) {
-		case GFS2_DATA_WRITEBACK:
-			state = "writeback";
-			break;
-		case GFS2_DATA_ORDERED:
-			state = "ordered";
-			break;
-		default:
-			state = "unknown";
-			break;
-		}
-		seq_printf(s, ",data=%s", state);
-	}
-	if (args->ar_discard)
-		seq_printf(s, ",discard");
-	lfsecs = sdp->sd_tune.gt_log_flush_secs;
-	if (lfsecs != 60)
-		seq_printf(s, ",commit=%d", lfsecs);
-	return 0;
-}
-
-/*
- * We have to (at the moment) hold the inodes main lock to cover
- * the gap between unlocking the shared lock on the iopen lock and
- * taking the exclusive lock. I'd rather do a shared -> exclusive
- * conversion on the iopen lock, but we can change that later. This
- * is safe, just less efficient.
- */
-
-static void gfs2_delete_inode(struct inode *inode)
-{
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int error;
-
-	if (!test_bit(GIF_USER, &ip->i_flags))
-		goto out;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (unlikely(error)) {
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		goto out;
-	}
-
-	gfs2_glock_dq_wait(&ip->i_iopen_gh);
-	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
-	error = gfs2_glock_nq(&ip->i_iopen_gh);
-	if (error)
-		goto out_truncate;
-
-	if (S_ISDIR(inode->i_mode) &&
-	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
-		error = gfs2_dir_exhash_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	if (ip->i_eattr) {
-		error = gfs2_ea_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	if (!gfs2_is_stuffed(ip)) {
-		error = gfs2_file_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	error = gfs2_dinode_dealloc(ip);
-	if (error)
-		goto out_unlock;
-
-out_truncate:
-	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-	if (error)
-		goto out_unlock;
-	/* Needs to be done before glock release & also in a transaction */
-	truncate_inode_pages(&inode->i_data, 0);
-	gfs2_trans_end(sdp);
-
-out_unlock:
-	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
-		gfs2_glock_dq(&ip->i_iopen_gh);
-	gfs2_holder_uninit(&ip->i_iopen_gh);
-	gfs2_glock_dq_uninit(&gh);
-	if (error && error != GLR_TRYFAILED && error != -EROFS)
-		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
-out:
-	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
-}
-
-static struct inode *gfs2_alloc_inode(struct super_block *sb)
-{
-	struct gfs2_inode *ip;
-
-	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
-	if (ip) {
-		ip->i_flags = 0;
-		ip->i_gl = NULL;
-	}
-	return &ip->i_inode;
-}
-
-static void gfs2_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(gfs2_inode_cachep, inode);
-}
-
-const struct super_operations gfs2_super_ops = {
-	.alloc_inode		= gfs2_alloc_inode,
-	.destroy_inode		= gfs2_destroy_inode,
-	.write_inode		= gfs2_write_inode,
-	.delete_inode		= gfs2_delete_inode,
-	.put_super		= gfs2_put_super,
-	.write_super		= gfs2_write_super,
-	.sync_fs		= gfs2_sync_fs,
-	.freeze_fs 		= gfs2_freeze,
-	.unfreeze_fs		= gfs2_unfreeze,
-	.statfs			= gfs2_statfs,
-	.remount_fs		= gfs2_remount_fs,
-	.clear_inode		= gfs2_clear_inode,
-	.drop_inode		= gfs2_drop_inode,
-	.show_options		= gfs2_show_options,
-};
-
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e..40bcc37 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
  * of the GNU General Public License version 2.
  */
 
+#include <linux/bio.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/crc32.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/bio.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -31,6 +37,183 @@
 #include "super.h"
 #include "trans.h"
 #include "util.h"
+#include "sys.h"
+#include "eattr.h"
+
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
+
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_quota,
+	Opt_noquota,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+	Opt_meta,
+	Opt_discard,
+	Opt_nodiscard,
+	Opt_commit,
+	Opt_error,
+};
+
+static const match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_quota, "quota"},
+	{Opt_noquota, "noquota"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_meta, "meta"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_commit, "commit=%d"},
+	{Opt_error, NULL}
+};
+
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp:
+ * @data:
+ *
+ * Return: errno
+ */
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+{
+	char *o;
+	int token;
+	substring_t tmp[MAX_OPT_ARGS];
+	int rv;
+
+	/* Split the options into tokens with the "," character and
+	   process them */
+
+	while (1) {
+		o = strsep(&options, ",");
+		if (o == NULL)
+			break;
+		if (*o == '\0')
+			continue;
+
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			match_strlcpy(args->ar_lockproto, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_locktable:
+			match_strlcpy(args->ar_locktable, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_hostdata:
+			match_strlcpy(args->ar_hostdata, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_spectator:
+			args->ar_spectator = 1;
+			break;
+		case Opt_ignore_local_fs:
+			args->ar_ignore_local_fs = 1;
+			break;
+		case Opt_localflocks:
+			args->ar_localflocks = 1;
+			break;
+		case Opt_localcaching:
+			args->ar_localcaching = 1;
+			break;
+		case Opt_debug:
+			args->ar_debug = 1;
+			break;
+		case Opt_nodebug:
+			args->ar_debug = 0;
+			break;
+		case Opt_upgrade:
+			args->ar_upgrade = 1;
+			break;
+		case Opt_acl:
+			args->ar_posix_acl = 1;
+			break;
+		case Opt_noacl:
+			args->ar_posix_acl = 0;
+			break;
+		case Opt_quota_off:
+		case Opt_noquota:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+		case Opt_quota:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
+			args->ar_suiddir = 1;
+			break;
+		case Opt_nosuiddir:
+			args->ar_suiddir = 0;
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		case Opt_meta:
+			args->ar_meta = 1;
+			break;
+		case Opt_discard:
+			args->ar_discard = 1;
+			break;
+		case Opt_nodiscard:
+			args->ar_discard = 0;
+			break;
+		case Opt_commit:
+			rv = match_int(&tmp[0], &args->ar_commit);
+			if (rv || args->ar_commit <= 0) {
+				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_error:
+		default:
+			fs_info(sdp, "invalid mount option: %s\n", o);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
 
 /**
  * gfs2_jindex_free - Clear all the journal index information
@@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
 	mutex_unlock(&sdp->sd_freeze_lock);
 }
 
+
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	struct buffer_head *bh;
+	struct timespec atime;
+	struct gfs2_dinode *di;
+	int ret = 0;
+
+	/* Check this is a "normal" inode, etc */
+	if (!test_bit(GIF_USER, &ip->i_flags) ||
+	    (current->flags & PF_MEMALLOC))
+		return 0;
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (ret)
+		goto do_flush;
+	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (ret)
+		goto do_unlock;
+	ret = gfs2_meta_inode_buffer(ip, &bh);
+	if (ret == 0) {
+		di = (struct gfs2_dinode *)bh->b_data;
+		atime.tv_sec = be64_to_cpu(di->di_atime);
+		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+		if (timespec_compare(&inode->i_atime, &atime) > 0) {
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_dinode_out(ip, bh->b_data);
+		}
+		brelse(bh);
+	}
+	gfs2_trans_end(sdp);
+do_unlock:
+	gfs2_glock_dq_uninit(&gh);
+do_flush:
+	if (sync != 0)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	return ret;
+}
+
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder t_gh;
+	int error;
+
+	gfs2_quota_sync(sdp);
+	gfs2_statfs_sync(sdp);
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &t_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	gfs2_meta_syncfs(sdp);
+	gfs2_log_shutdown(sdp);
+
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	if (t_gh.gh_gl)
+		gfs2_glock_dq_uninit(&t_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
+}
+
+static int gfs2_umount_recovery_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+	struct gfs2_jdesc *jd;
+
+	/*  Unfreeze the filesystem, if we need to  */
+
+	mutex_lock(&sdp->sd_freeze_lock);
+	if (sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	/* No more recovery requests */
+	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	smp_mb();
+
+	/* Wait on outstanding recovery */
+restart:
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+			continue;
+		spin_unlock(&sdp->sd_jindex_spin);
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+	/*  At this point, we're through modifying the disk  */
+
+	/*  Release stuff  */
+
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_inum_inode);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_trans_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_ir_inode);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	gfs2_clear_rgrpd(sdp);
+	gfs2_jindex_free(sdp);
+	/*  Take apart glock structures and buffer lists  */
+	gfs2_gl_hash_clear(sdp);
+	/*  Unmount the locking protocol  */
+	gfs2_lm_unmount(sdp);
+
+	/*  At this point, we're through participating in the lockspace  */
+	gfs2_sys_fs_del(sdp);
+}
+
+/**
+ * gfs2_write_super
+ * @sb: the superblock
+ *
+ */
+
+static void gfs2_write_super(struct super_block *sb)
+{
+	sb->s_dirt = 0;
+}
+
+/**
+ * gfs2_sync_fs - sync the filesystem
+ * @sb: the superblock
+ *
+ * Flushes the log to disk.
+ */
+
+static int gfs2_sync_fs(struct super_block *sb, int wait)
+{
+	sb->s_dirt = 0;
+	if (wait && sb->s_fs_info)
+		gfs2_log_flush(sb->s_fs_info, NULL);
+	return 0;
+}
+
+/**
+ * gfs2_freeze - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_freeze(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return -EINVAL;
+
+	for (;;) {
+		error = gfs2_freeze_fs(sdp);
+		if (!error)
+			break;
+
+		switch (error) {
+		case -EBUSY:
+			fs_err(sdp, "waiting for recovery before freeze\n");
+			break;
+
+		default:
+			fs_err(sdp, "error freezing FS: %d\n", error);
+			break;
+		}
+
+		fs_err(sdp, "retrying...\n");
+		msleep(1000);
+	}
+	return 0;
+}
+
+/**
+ * gfs2_unfreeze - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_unfreeze(struct super_block *sb)
+{
+	gfs2_unfreeze_fs(sb->s_fs_info);
+	return 0;
+}
+
+/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change_host *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_data;
+	sc->sc_free += rgd->rd_free;
+	sc->sc_dinodes += rgd->rd_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_holder ri_gh;
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto out;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+	kfree(gha);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @sb: The superblock
+ * @statfsbuf: The buffer
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_statfs_change_host sc;
+	int error;
+
+	if (gfs2_tune_get(sdp, gt_statfs_slow))
+		error = gfs2_statfs_slow(sdp, &sc);
+	else
+		error = gfs2_statfs_i(sdp, &sc);
+
+	if (error)
+		return error;
+
+	buf->f_type = GFS2_MAGIC;
+	buf->f_bsize = sdp->sd_sb.sb_bsize;
+	buf->f_blocks = sc.sc_total;
+	buf->f_bfree = sc.sc_free;
+	buf->f_bavail = sc.sc_free;
+	buf->f_files = sc.sc_dinodes + sc.sc_free;
+	buf->f_ffree = sc.sc_free;
+	buf->f_namelen = GFS2_FNAMESIZE;
+
+	return 0;
+}
+
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags
+ * @data:  extra data passed in (not used right now)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	int error;
+
+	spin_lock(&gt->gt_spin);
+	args.ar_commit = gt->gt_log_flush_secs;
+	spin_unlock(&gt->gt_spin);
+	error = gfs2_mount_args(sdp, &args, data);
+	if (error)
+		return error;
+
+	/* Not allowed to change locking details */
+	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+		return -EINVAL;
+
+	/* Some flags must not be changed */
+	if (args_neq(&args, &sdp->sd_args, spectator) ||
+	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
+	    args_neq(&args, &sdp->sd_args, localflocks) ||
+	    args_neq(&args, &sdp->sd_args, localcaching) ||
+	    args_neq(&args, &sdp->sd_args, meta))
+		return -EINVAL;
+
+	if (sdp->sd_args.ar_spectator)
+		*flags |= MS_RDONLY;
+
+	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+		if (*flags & MS_RDONLY)
+			error = gfs2_make_fs_ro(sdp);
+		else
+			error = gfs2_make_fs_rw(sdp);
+		if (error)
+			return error;
+	}
+
+	sdp->sd_args = args;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	else
+		sb->s_flags &= ~MS_POSIXACL;
+	spin_lock(&gt->gt_spin);
+	gt->gt_log_flush_secs = args.ar_commit;
+	spin_unlock(&gt->gt_spin);
+
+	return 0;
+}
+
+/**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+
+static void gfs2_drop_inode(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	generic_drop_inode(inode);
+}
+
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ */
+
+static void gfs2_clear_inode(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	/* This tells us its a "real" inode and not one which only
+	 * serves to contain an address space (see rgrp.c, meta_io.c)
+	 * which therefore doesn't have its own glocks.
+	 */
+	if (test_bit(GIF_USER, &ip->i_flags)) {
+		ip->i_gl->gl_object = NULL;
+		gfs2_glock_put(ip->i_gl);
+		ip->i_gl = NULL;
+		if (ip->i_iopen_gh.gh_gl) {
+			ip->i_iopen_gh.gh_gl->gl_object = NULL;
+			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		}
+	}
+}
+
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+	do {
+		if (d1 == d2)
+			return 1;
+		d1 = d1->d_parent;
+	} while (!IS_ROOT(d1));
+	return 0;
+}
+
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+	struct gfs2_args *args = &sdp->sd_args;
+	int lfsecs;
+
+	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+		seq_printf(s, ",meta");
+	if (args->ar_lockproto[0])
+		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+	if (args->ar_locktable[0])
+		seq_printf(s, ",locktable=%s", args->ar_locktable);
+	if (args->ar_hostdata[0])
+		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+	if (args->ar_spectator)
+		seq_printf(s, ",spectator");
+	if (args->ar_ignore_local_fs)
+		seq_printf(s, ",ignore_local_fs");
+	if (args->ar_localflocks)
+		seq_printf(s, ",localflocks");
+	if (args->ar_localcaching)
+		seq_printf(s, ",localcaching");
+	if (args->ar_debug)
+		seq_printf(s, ",debug");
+	if (args->ar_upgrade)
+		seq_printf(s, ",upgrade");
+	if (args->ar_posix_acl)
+		seq_printf(s, ",acl");
+	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+		char *state;
+		switch (args->ar_quota) {
+		case GFS2_QUOTA_OFF:
+			state = "off";
+			break;
+		case GFS2_QUOTA_ACCOUNT:
+			state = "account";
+			break;
+		case GFS2_QUOTA_ON:
+			state = "on";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",quota=%s", state);
+	}
+	if (args->ar_suiddir)
+		seq_printf(s, ",suiddir");
+	if (args->ar_data != GFS2_DATA_DEFAULT) {
+		char *state;
+		switch (args->ar_data) {
+		case GFS2_DATA_WRITEBACK:
+			state = "writeback";
+			break;
+		case GFS2_DATA_ORDERED:
+			state = "ordered";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",data=%s", state);
+	}
+	if (args->ar_discard)
+		seq_printf(s, ",discard");
+	lfsecs = sdp->sd_tune.gt_log_flush_secs;
+	if (lfsecs != 60)
+		seq_printf(s, ",commit=%d", lfsecs);
+	return 0;
+}
+
+/*
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+
+static void gfs2_delete_inode(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+
+	if (!test_bit(GIF_USER, &ip->i_flags))
+		goto out;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (unlikely(error)) {
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		goto out;
+	}
+
+	gfs2_glock_dq_wait(&ip->i_iopen_gh);
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+	error = gfs2_glock_nq(&ip->i_iopen_gh);
+	if (error)
+		goto out_truncate;
+
+	if (S_ISDIR(inode->i_mode) &&
+	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
+		error = gfs2_dir_exhash_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (ip->i_eattr) {
+		error = gfs2_ea_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (!gfs2_is_stuffed(ip)) {
+		error = gfs2_file_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = gfs2_dinode_dealloc(ip);
+	if (error)
+		goto out_unlock;
+
+out_truncate:
+	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_unlock;
+	/* Needs to be done before glock release & also in a transaction */
+	truncate_inode_pages(&inode->i_data, 0);
+	gfs2_trans_end(sdp);
+
+out_unlock:
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+		gfs2_glock_dq(&ip->i_iopen_gh);
+	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&gh);
+	if (error && error != GLR_TRYFAILED && error != -EROFS)
+		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+	struct gfs2_inode *ip;
+
+	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+	if (ip) {
+		ip->i_flags = 0;
+		ip->i_gl = NULL;
+	}
+	return &ip->i_inode;
+}
+
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(gfs2_inode_cachep, inode);
+}
+
+const struct super_operations gfs2_super_ops = {
+	.alloc_inode		= gfs2_alloc_inode,
+	.destroy_inode		= gfs2_destroy_inode,
+	.write_inode		= gfs2_write_inode,
+	.delete_inode		= gfs2_delete_inode,
+	.put_super		= gfs2_put_super,
+	.write_super		= gfs2_write_super,
+	.sync_fs		= gfs2_sync_fs,
+	.freeze_fs 		= gfs2_freeze,
+	.unfreeze_fs		= gfs2_unfreeze,
+	.statfs			= gfs2_statfs,
+	.remount_fs		= gfs2_remount_fs,
+	.clear_inode		= gfs2_clear_inode,
+	.drop_inode		= gfs2_drop_inode,
+	.show_options		= gfs2_show_options,
+};
+
-- 
cgit v0.10.2


From 2286dbfad1fb622ee2691537e5caaedee4618860 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:45:09 +0100
Subject: GFS2: Move gfs2_rmdiri into ops_inode.c

Move gfs2_rmdiri() into ops_inode.c and make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c03a1a3..9b17447 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1046,58 +1046,6 @@ fail:
 	return ERR_PTR(error);
 }
 
-/**
- * gfs2_rmdiri - Remove a directory
- * @dip: The parent directory of the directory to be removed
- * @name: The name of the directory to be removed
- * @ip: The GFS2 inode of the directory to be removed
- *
- * Assumes Glocks on dip and ip are held
- *
- * Returns: errno
- */
-
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		struct gfs2_inode *ip)
-{
-	struct qstr dotname;
-	int error;
-
-	if (ip->i_entries != 2) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-
-	error = gfs2_dir_del(dip, name);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(dip, -1);
-	if (error)
-		return error;
-
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
-	if (error)
-		return error;
-
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
-	if (error)
-		return error;
-
-	/* It looks odd, but it really should be done twice */
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	return error;
-}
 
 /*
  * gfs2_unlink_ok - check to see that a inode is still in a directory
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2c3ec07..6cd3928 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 				  const struct qstr *name,
 				  unsigned int mode, dev_t dev);
-extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		       struct gfs2_inode *ip);
 extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5..5dacd64 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -473,6 +473,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory of the directory to be removed
+ * @name: The name of the directory to be removed
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * Assumes Glocks on dip and ip are held
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip)
+{
+	struct qstr dotname;
+	int error;
+
+	if (ip->i_entries != 2) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(ip);
+		return -EIO;
+	}
+
+	error = gfs2_dir_del(dip, name);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(dip, -1);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, ".");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, "..");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	/* It looks odd, but it really should be done twice */
+	error = gfs2_change_nlink(ip, -1);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(ip, -1);
+	if (error)
+		return error;
+
+	return error;
+}
+
+/**
  * gfs2_rmdir - Remove a directory
  * @dir: The parent directory of the directory to be removed
  * @dentry: The dentry of the directory to remove
-- 
cgit v0.10.2


From 536baf02f650f4547f105386878b4736fbc181e8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:48:59 +0100
Subject: GFS2: Move gfs2_readlinki into ops_inode.c

Move gfs2_readlinki into ops_inode.c and make it static

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9b17447..676e750 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1085,63 +1085,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	return 0;
 }
 
-/**
- * gfs2_readlinki - return the contents of a symlink
- * @ip: the symlink's inode
- * @buf: a pointer to the buffer to be filled
- * @len: a pointer to the length of @buf
- *
- * If @buf is too small, a piece of memory is kmalloc()ed and needs
- * to be freed by the caller.
- *
- * Returns: errno
- */
-
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
-{
-	struct gfs2_holder i_gh;
-	struct buffer_head *dibh;
-	unsigned int x;
-	int error;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-	error = gfs2_glock_nq(&i_gh);
-	if (error) {
-		gfs2_holder_uninit(&i_gh);
-		return error;
-	}
-
-	if (!ip->i_disksize) {
-		gfs2_consist_inode(ip);
-		error = -EIO;
-		goto out;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out;
-
-	x = ip->i_disksize + 1;
-	if (x > *len) {
-		*buf = kmalloc(x, GFP_NOFS);
-		if (!*buf) {
-			error = -ENOMEM;
-			goto out_brelse;
-		}
-	}
-
-	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-	*len = x;
-
-out_brelse:
-	brelse(dibh);
-out:
-	gfs2_glock_dq_uninit(&i_gh);
-	return error;
-}
-
-static int
-__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
 	struct buffer_head *dibh;
 	int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6cd3928..fc9a08f 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,6 @@ extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
-extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 5dacd64..f607f09 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -938,6 +938,61 @@ out:
 }
 
 /**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled
+ * @len: a pointer to the length of @buf
+ *
+ * If @buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int x;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		return error;
+	}
+
+	if (!ip->i_disksize) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	x = ip->i_disksize + 1;
+	if (x > *len) {
+		*buf = kmalloc(x, GFP_NOFS);
+		if (!*buf) {
+			error = -ENOMEM;
+			goto out_brelse;
+		}
+	}
+
+	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
+	*len = x;
+
+out_brelse:
+	brelse(dibh);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	return error;
+}
+
+/**
  * gfs2_readlink - Read the value of a symlink
  * @dentry: the symlink
  * @buf: the buffer to read the symlink data into
-- 
cgit v0.10.2


From 87ec21741138bb42e7f943bb142b1d8567c10925 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:54:50 +0100
Subject: GFS2: Move gfs2_unlink_ok into ops_inode.c

Another function which is only called from one ops_inode.c so
we move it and make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 676e750..2f94bd7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1046,45 +1046,6 @@ fail:
 	return ERR_PTR(error);
 }
 
-
-/*
- * gfs2_unlink_ok - check to see that a inode is still in a directory
- * @dip: the directory
- * @name: the name of the file
- * @ip: the inode
- *
- * Assumes that the lock on (at least) @dip is held.
- *
- * Returns: 0 if the parent/child relationship is correct, errno if it isn't
- */
-
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-		   const struct gfs2_inode *ip)
-{
-	int error;
-
-	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-		return -EPERM;
-
-	if ((dip->i_inode.i_mode & S_ISVTX) &&
-	    dip->i_inode.i_uid != current_fsuid() &&
-	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
-		return -EPERM;
-
-	if (IS_APPEND(&dip->i_inode))
-		return -EPERM;
-
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
-	if (error)
-		return error;
-
-	error = gfs2_dir_check(&dip->i_inode, name, ip);
-	if (error)
-		return error;
-
-	return 0;
-}
-
 static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
 	struct buffer_head *dibh;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index fc9a08f..c341aaf 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 				  const struct qstr *name,
 				  unsigned int mode, dev_t dev);
-extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f607f09..f8bd20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ out_parent:
 	return error;
 }
 
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip)
+{
+	int error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		return -EPERM;
+
+	if ((dip->i_inode.i_mode & S_ISVTX) &&
+	    dip->i_inode.i_uid != current_fsuid() &&
+	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (IS_APPEND(&dip->i_inode))
+		return -EPERM;
+
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+
+	error = gfs2_dir_check(&dip->i_inode, name, ip);
+	if (error)
+		return error;
+
+	return 0;
+}
+
 /**
  * gfs2_unlink - Unlink a file
  * @dir: The inode of the directory containing the file to unlink
-- 
cgit v0.10.2


From 0e1b74df992c1ef92213ab26f952befda2087f59 Mon Sep 17 00:00:00 2001
From: Mingwei Wang <mwwang@marvell.com>
Date: Wed, 20 May 2009 16:49:57 +0800
Subject: [ARM] pxa: fix the incorrectly defined drive strength macros for
 pxa{168,910}

Signed-off-by: Mingwei Wang <mwwang@marvell.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-mmp/include/mach/mfp-pxa168.h b/arch/arm/mach-mmp/include/mach/mfp-pxa168.h
index d0bdb6e..2e91464 100644
--- a/arch/arm/mach-mmp/include/mach/mfp-pxa168.h
+++ b/arch/arm/mach-mmp/include/mach/mfp-pxa168.h
@@ -3,6 +3,11 @@
 
 #include <mach/mfp.h>
 
+#define MFP_DRIVE_VERY_SLOW	(0x0 << 13)
+#define MFP_DRIVE_SLOW		(0x1 << 13)
+#define MFP_DRIVE_MEDIUM	(0x2 << 13)
+#define MFP_DRIVE_FAST		(0x3 << 13)
+
 /* GPIO */
 #define GPIO0_GPIO		MFP_CFG(GPIO0, AF5)
 #define GPIO1_GPIO		MFP_CFG(GPIO1, AF5)
diff --git a/arch/arm/mach-mmp/include/mach/mfp-pxa910.h b/arch/arm/mach-mmp/include/mach/mfp-pxa910.h
index 48a1cbc..d97de36 100644
--- a/arch/arm/mach-mmp/include/mach/mfp-pxa910.h
+++ b/arch/arm/mach-mmp/include/mach/mfp-pxa910.h
@@ -3,6 +3,11 @@
 
 #include <mach/mfp.h>
 
+#define MFP_DRIVE_VERY_SLOW	(0x0 << 13)
+#define MFP_DRIVE_SLOW		(0x2 << 13)
+#define MFP_DRIVE_MEDIUM	(0x4 << 13)
+#define MFP_DRIVE_FAST		(0x8 << 13)
+
 /* UART2 */
 #define GPIO47_UART2_RXD	MFP_CFG(GPIO47, AF6)
 #define GPIO48_UART2_TXD	MFP_CFG(GPIO48, AF6)
diff --git a/arch/arm/mach-mmp/include/mach/mfp.h b/arch/arm/mach-mmp/include/mach/mfp.h
index 277ea4c..62e510e 100644
--- a/arch/arm/mach-mmp/include/mach/mfp.h
+++ b/arch/arm/mach-mmp/include/mach/mfp.h
@@ -12,16 +12,13 @@
  * possible, we make the following compromise:
  *
  * 1. SLEEP_OE_N will always be programmed to '1' (by MFP_LPM_FLOAT)
- * 2. DRIVE strength definitions redefined to include the reserved bit10
+ * 2. DRIVE strength definitions redefined to include the reserved bit
+ *    - the reserved bit differs between pxa168 and pxa910, and the
+ *      MFP_DRIVE_* macros are individually defined in mfp-pxa{168,910}.h
  * 3. Override MFP_CFG() and MFP_CFG_DRV()
  * 4. Drop the use of MFP_CFG_LPM() and MFP_CFG_X()
  */
 
-#define MFP_DRIVE_VERY_SLOW	(0x0 << 13)
-#define MFP_DRIVE_SLOW		(0x2 << 13)
-#define MFP_DRIVE_MEDIUM	(0x4 << 13)
-#define MFP_DRIVE_FAST		(0x8 << 13)
-
 #undef MFP_CFG
 #undef MFP_CFG_DRV
 #undef MFP_CFG_LPM
-- 
cgit v0.10.2


From f5c81a327015844eb91087dd102648b5d984f33c Mon Sep 17 00:00:00 2001
From: Coly Li <coly.li@suse.de>
Date: Thu, 23 Apr 2009 03:04:45 +0800
Subject: [ARM] pxa: add parameter to clksrc_read() for pxa168/910

This patch modifies parameter of clksrc_read() from 'void' to
'struct clocksource *cs', which fixes compile warning for
incompatible parameter type.

Signed-off-by: Coly Li <coly.li@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-mmp/time.c b/arch/arm/mach-mmp/time.c
index b03a6ed..a8400bb 100644
--- a/arch/arm/mach-mmp/time.c
+++ b/arch/arm/mach-mmp/time.c
@@ -136,7 +136,7 @@ static struct clock_event_device ckevt = {
 	.set_mode	= timer_set_mode,
 };
 
-static cycle_t clksrc_read(void)
+static cycle_t clksrc_read(struct clocksource *cs)
 {
 	return timer_read();
 }
-- 
cgit v0.10.2


From 6ec04f434d29aed33608e0ca4d8b100190e71e96 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Fri, 22 May 2009 01:39:10 +0200
Subject: [ARM] pxa/palm: fix PalmLD/T5/TX AC97 MFP

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/palmld.c b/arch/arm/mach-pxa/palmld.c
index 1cec180..471a853 100644
--- a/arch/arm/mach-pxa/palmld.c
+++ b/arch/arm/mach-pxa/palmld.c
@@ -62,6 +62,8 @@ static unsigned long palmld_pin_config[] __initdata = {
 	GPIO29_AC97_SDATA_IN_0,
 	GPIO30_AC97_SDATA_OUT,
 	GPIO31_AC97_SYNC,
+	GPIO89_AC97_SYSCLK,
+	GPIO95_AC97_nRESET,
 
 	/* IrDA */
 	GPIO108_GPIO,	/* ir disable */
diff --git a/arch/arm/mach-pxa/palmt5.c b/arch/arm/mach-pxa/palmt5.c
index 3066236..05bf979 100644
--- a/arch/arm/mach-pxa/palmt5.c
+++ b/arch/arm/mach-pxa/palmt5.c
@@ -64,6 +64,7 @@ static unsigned long palmt5_pin_config[] __initdata = {
 	GPIO29_AC97_SDATA_IN_0,
 	GPIO30_AC97_SDATA_OUT,
 	GPIO31_AC97_SYNC,
+	GPIO89_AC97_SYSCLK,
 	GPIO95_AC97_nRESET,
 
 	/* IrDA */
diff --git a/arch/arm/mach-pxa/palmtx.c b/arch/arm/mach-pxa/palmtx.c
index e2d44b1..e99a893 100644
--- a/arch/arm/mach-pxa/palmtx.c
+++ b/arch/arm/mach-pxa/palmtx.c
@@ -65,6 +65,7 @@ static unsigned long palmtx_pin_config[] __initdata = {
 	GPIO29_AC97_SDATA_IN_0,
 	GPIO30_AC97_SDATA_OUT,
 	GPIO31_AC97_SYNC,
+	GPIO89_AC97_SYSCLK,
 	GPIO95_AC97_nRESET,
 
 	/* IrDA */
-- 
cgit v0.10.2


From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:17:31 +1000
Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct

This replaces the struct perf_counter_context in the task_struct with
a pointer to a dynamically allocated perf_counter_context struct.  The
main reason for doing is this is to allow us to transfer a
perf_counter_context from one task to another when we do lazy PMU
switching in a later patch.

This has a few side-benefits: the task_struct becomes a little smaller,
we save some memory because only tasks that have perf_counters attached
get a perf_counter_context allocated for them, and we can remove the
inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't
end up recompiling nearly everything whenever perf_counter.h changes.

The perf_counter_context structures are reference-counted and freed
when the last reference is dropped.  A context can have references
from its task and the counters on its task.  Counters can outlive the
task so it is possible that a context will be freed well after its
task has exited.

Contexts are allocated on fork if the parent had a context, or
otherwise the first time that a per-task counter is created on a task.
In the latter case, we set the context pointer in the task struct
locklessly using an atomic compare-and-exchange operation in case we
raced with some other task in creating a context for the subject task.

This also removes the task pointer from the perf_counter struct.  The
task pointer was not used anywhere and would make it harder to move a
context from one task to another.  Anything that needed to know which
task a counter was attached to was already using counter->ctx->task.

The __perf_counter_init_context function moves up in perf_counter.c
so that it can be called from find_get_context, and now initializes
the refcount, but is otherwise unchanged.

We were potentially calling list_del_counter twice: once from
__perf_counter_exit_task when the task exits and once from
__perf_counter_remove_from_context when the counter's fd gets closed.
This adds a check in list_del_counter so it doesn't do anything if
the counter has already been removed from the lists.

Since perf_counter_task_sched_in doesn't do anything if the task doesn't
have a context, and leaves cpuctx->task_ctx = NULL, this adds code to
__perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in
the case where the current task adds the first counter to itself and
thus creates a context for itself.

This also adds similar code to __perf_counter_enable to handle a
similar situation which can arise when the counters have been disabled
using prctl; that also leaves cpuctx->task_ctx = NULL.

[ Impact: refactor counter context management to prepare for new feature ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e9021a9..b4f6440 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
+#include <linux/perf_counter.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 503afaa..d87247d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,18 +108,6 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
-#ifdef CONFIG_PERF_COUNTERS
-# define INIT_PERF_COUNTERS(tsk)					\
-	.perf_counter_ctx.counter_list =				\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
-	.perf_counter_ctx.event_list =					\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list),	\
-	.perf_counter_ctx.lock =					\
-		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
-#else
-# define INIT_PERF_COUNTERS(tsk)
-#endif
-
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -183,7 +171,6 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
-	INIT_PERF_COUNTERS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f612941..0713090 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -449,7 +449,6 @@ struct perf_counter {
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
-	struct task_struct		*task;
 	struct file			*filp;
 
 	struct perf_counter		*parent;
@@ -498,7 +497,6 @@ struct perf_counter {
  * Used as a container for task counters and CPU counters as well:
  */
 struct perf_counter_context {
-#ifdef CONFIG_PERF_COUNTERS
 	/*
 	 * Protect the states of the counters in the list,
 	 * nr_active, and the list:
@@ -516,6 +514,7 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
+	atomic_t		refcount;
 	struct task_struct	*task;
 
 	/*
@@ -523,7 +522,6 @@ struct perf_counter_context {
 	 */
 	u64			time;
 	u64			timestamp;
-#endif
 };
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff59d123..9714d45 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,7 +71,6 @@ struct sched_param {
 #include <linux/path.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
-#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -99,6 +98,7 @@ struct robust_list_head;
 struct bio;
 struct bts_tracer;
 struct fs_struct;
+struct perf_counter_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1387,7 +1387,9 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
-	struct perf_counter_context perf_counter_ctx;
+#ifdef CONFIG_PERF_COUNTERS
+	struct perf_counter_context *perf_counter_ctxp;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9dfedd..99ad406 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
+#include <linux/perf_counter.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
@@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
 #ifdef CONFIG_PERF_COUNTERS
-	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+	WARN_ON_ONCE(tsk->perf_counter_ctxp);
 #endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index d32fef4..e72a09f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -63,6 +63,7 @@
 #include <linux/fs_struct.h>
 #include <trace/sched.h>
 #include <linux/magic.h>
+#include <linux/perf_counter.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 08584c1..06ea3ea 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -97,6 +97,17 @@ void perf_enable(void)
 		hw_perf_enable();
 }
 
+static void get_ctx(struct perf_counter_context *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->refcount))
+		kfree(ctx);
+}
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	ctx->nr_counters++;
 }
 
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with counter->mutex and ctx->mutex held.
+ */
 static void
 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
 	struct perf_counter *sibling, *tmp;
 
+	if (list_empty(&counter->list_entry))
+		return;
 	ctx->nr_counters--;
 
 	list_del_init(&counter->list_entry);
@@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info)
 
 	counter_sched_out(counter, cpuctx, ctx);
 
-	counter->task = NULL;
-
 	list_del_counter(counter, ctx);
 
 	if (!ctx->task) {
@@ -279,7 +294,6 @@ retry:
 	 */
 	if (!list_empty(&counter->list_entry)) {
 		list_del_counter(counter, ctx);
-		counter->task = NULL;
 	}
 	spin_unlock_irq(&ctx->lock);
 }
@@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info)
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
+	 * Or possibly this is the right context but it isn't
+	 * on this cpu because it had no counters.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	/*
@@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 		return;
 	}
 
-	counter->task = task;
 retry:
 	task_oncpu_function_call(task, __perf_install_in_context,
 				 counter);
@@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info)
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
@@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 	struct pt_regs *regs;
 
-	if (likely(!cpuctx->task_ctx))
+	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
 	update_context_time(ctx);
@@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
+	if (!cpuctx->task_ctx)
+		return;
 	__perf_counter_sched_out(ctx, cpuctx);
 	cpuctx->task_ctx = NULL;
 }
@@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 
+	if (likely(!ctx))
+		return;
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
@@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 int perf_counter_task_disable(void)
 {
 	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
 	unsigned long flags;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx || !ctx->nr_counters)
 		return 0;
 
 	local_irq_save(flags);
@@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void)
 int perf_counter_task_enable(void)
 {
 	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
 	unsigned long flags;
 	int cpu;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx || !ctx->nr_counters)
 		return 0;
 
 	local_irq_save(flags);
@@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 		return;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	ctx = &curr->perf_counter_ctx;
+	ctx = curr->perf_counter_ctxp;
 
 	perf_adjust_freq(&cpuctx->ctx);
-	perf_adjust_freq(ctx);
+	if (ctx)
+		perf_adjust_freq(ctx);
 
 	perf_counter_cpu_sched_out(cpuctx);
-	__perf_counter_task_sched_out(ctx);
+	if (ctx)
+		__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	rotate_ctx(ctx);
+	if (ctx)
+		rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
-	perf_counter_task_sched_in(curr, cpu);
+	if (ctx)
+		perf_counter_task_sched_in(curr, cpu);
 }
 
 /*
@@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->event_list);
+	atomic_set(&ctx->refcount, 1);
+	ctx->task = task;
+}
+
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
+	struct perf_counter_context *tctx;
 	struct task_struct *task;
 
 	/*
@@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	ctx = &task->perf_counter_ctx;
-	ctx->task = task;
-
 	/* Reuse ptrace permission checks for now. */
 	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		put_context(ctx);
+		put_task_struct(task);
 		return ERR_PTR(-EACCES);
 	}
 
+	ctx = task->perf_counter_ctxp;
+	if (!ctx) {
+		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+		if (!ctx) {
+			put_task_struct(task);
+			return ERR_PTR(-ENOMEM);
+		}
+		__perf_counter_init_context(ctx, task);
+		/*
+		 * Make sure other cpus see correct values for *ctx
+		 * once task->perf_counter_ctxp is visible to them.
+		 */
+		smp_wmb();
+		tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
+		if (tctx) {
+			/*
+			 * We raced with some other task; use
+			 * the context they set.
+			 */
+			kfree(ctx);
+			ctx = tctx;
+		}
+	}
+
 	return ctx;
 }
 
@@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
+	put_ctx(counter->ctx);
 	kfree(counter);
 }
 
@@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
+	perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event);
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task)
 
 	if (!atomic_read(&nr_comm_tracking))
 		return;
-       
+	if (!current->perf_counter_ctxp)
+		return;
+
 	comm_event = (struct perf_comm_event){
 		.task	= task,
 		.event  = {
@@ -2372,7 +2444,7 @@ got_name:
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
+	perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event);
 
 	kfree(buf);
 }
@@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 
 	if (!atomic_read(&nr_mmap_tracking))
 		return;
+	if (!current->perf_counter_ctxp)
+		return;
 
 	mmap_event = (struct perf_mmap_event){
 		.file   = file,
@@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
+	get_ctx(ctx);
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -3150,21 +3225,6 @@ err_put_context:
 }
 
 /*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	spin_lock_init(&ctx->lock);
-	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->counter_list);
-	INIT_LIST_HEAD(&ctx->event_list);
-	ctx->task = task;
-}
-
-/*
  * inherit a counter from parent task to child task:
  */
 static struct perf_counter *
@@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->task = child;
 	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
@@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	/*
-	 * If we do not self-reap then we have to wait for the
-	 * child task to unschedule (it will happen for sure),
-	 * so that its counter is at its final count. (This
-	 * condition triggers rarely - child tasks usually get
-	 * off their CPU before the parent has a chance to
-	 * get this far into the reaping action)
+	 * Protect against concurrent operations on child_counter
+	 * due its fd getting closed, etc.
 	 */
-	if (child != current) {
-		wait_task_inactive(child, 0);
-		update_counter_times(child_counter);
-		list_del_counter(child_counter, child_ctx);
-	} else {
-		struct perf_cpu_context *cpuctx;
-		unsigned long flags;
-
-		/*
-		 * Disable and unlink this counter.
-		 *
-		 * Be careful about zapping the list - IRQ/NMI context
-		 * could still be processing it:
-		 */
-		local_irq_save(flags);
-		perf_disable();
-
-		cpuctx = &__get_cpu_var(perf_cpu_context);
+	mutex_lock(&child_counter->mutex);
 
-		group_sched_out(child_counter, cpuctx, child_ctx);
-		update_counter_times(child_counter);
+	update_counter_times(child_counter);
+	list_del_counter(child_counter, child_ctx);
 
-		list_del_counter(child_counter, child_ctx);
-
-		perf_enable();
-		local_irq_restore(flags);
-	}
+	mutex_unlock(&child_counter->mutex);
 
 	parent_counter = child_counter->parent;
 	/*
@@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child,
  *
  * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
+ * (XXX not sure that is true when we get called from flush_old_exec.
+ *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
 	struct perf_counter *child_counter, *tmp;
 	struct perf_counter_context *child_ctx;
+	unsigned long flags;
 
 	WARN_ON_ONCE(child != current);
 
-	child_ctx = &child->perf_counter_ctx;
+	child_ctx = child->perf_counter_ctxp;
 
-	if (likely(!child_ctx->nr_counters))
+	if (likely(!child_ctx))
 		return;
 
+	local_irq_save(flags);
+	__perf_counter_task_sched_out(child_ctx);
+	child->perf_counter_ctxp = NULL;
+	local_irq_restore(flags);
+
+	mutex_lock(&child_ctx->mutex);
+
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
@@ -3371,6 +3415,10 @@ again:
 	 */
 	if (!list_empty(&child_ctx->counter_list))
 		goto again;
+
+	mutex_unlock(&child_ctx->mutex);
+
+	put_ctx(child_ctx);
 }
 
 /*
@@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child)
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 
-	child_ctx  =  &child->perf_counter_ctx;
-	parent_ctx = &parent->perf_counter_ctx;
-
-	__perf_counter_init_context(child_ctx, child);
+	child->perf_counter_ctxp = NULL;
 
 	/*
 	 * This is executed from the parent task context, so inherit
-	 * counters that have been marked for cloning:
+	 * counters that have been marked for cloning.
+	 * First allocate and initialize a context for the child.
 	 */
 
-	if (likely(!parent_ctx->nr_counters))
+	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+	if (!child_ctx)
+		return;
+
+	parent_ctx = parent->perf_counter_ctxp;
+	if (likely(!parent_ctx || !parent_ctx->nr_counters))
 		return;
 
+	__perf_counter_init_context(child_ctx, child);
+	child->perf_counter_ctxp = child_ctx;
+
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
-- 
cgit v0.10.2


From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:27:22 +1000
Subject: perf_counter: Optimize context switch between identical inherited
 contexts

When monitoring a process and its descendants with a set of inherited
counters, we can often get the situation in a context switch where
both the old (outgoing) and new (incoming) process have the same set
of counters, and their values are ultimately going to be added together.
In that situation it doesn't matter which set of counters are used to
count the activity for the new process, so there is really no need to
go through the process of reading the hardware counters and updating
the old task's counters and then setting up the PMU for the new task.

This optimizes the context switch in this situation.  Instead of
scheduling out the perf_counter_context for the old task and
scheduling in the new context, we simply transfer the old context
to the new task and keep using it without interruption.  The new
context gets transferred to the old task.  This means that both
tasks still have a valid perf_counter_context, so no special case
is introduced when the old task gets scheduled in again, either on
this CPU or another CPU.

The equivalence of contexts is detected by keeping a pointer in
each cloned context pointing to the context it was cloned from.
To cope with the situation where a context is changed by adding
or removing counters after it has been cloned, we also keep a
generation number on each context which is incremented every time
a context is changed.  When a context is cloned we take a copy
of the parent's generation number, and two cloned contexts are
equivalent only if they have the same parent and the same
generation number.  In order that the parent context pointer
remains valid (and is not reused), we increment the parent
context's reference count for each context cloned from it.

Since we don't have individual fds for the counters in a cloned
context, the only thing that can make two clones of a given parent
different after they have been cloned is enabling or disabling all
counters with prctl.  To account for this, we keep a count of the
number of enabled counters in each context.  Two contexts must have
the same number of enabled counters to be considered equivalent.

Here are some measurements of the context switch time as measured with
the lat_ctx benchmark from lmbench, comparing the times obtained with
and without this patch series:

		-----Unmodified-----		With this patch series
Counters:	none	2 HW	4H+4S	none	2 HW	4H+4S

2 processes:
Average		3.44	6.45	11.24	3.12	3.39	3.60
St dev		0.04	0.04	0.13	0.05	0.17	0.19

8 processes:
Average		6.45	8.79	14.00	5.57	6.23	7.57
St dev		1.27	1.04	0.88	1.42	1.46	1.42

32 processes:
Average		5.56	8.43	13.78	5.28	5.55	7.15
St dev		0.41	0.47	0.53	0.54	0.57	0.81

The numbers are the mean and standard deviation of 20 runs of
lat_ctx.  The "none" columns are lat_ctx run directly without any
counters.  The "2 HW" columns are with lat_ctx run under perfstat,
counting cycles and instructions.  The "4H+4S" columns are lat_ctx run
under perfstat with 4 hardware counters and 4 software counters
(cycles, instructions, cache references, cache misses, task
clock, context switch, cpu migrations, and page faults).

[ Impact: performance optimization of counter context-switches ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0713090..4cae01a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,6 +513,7 @@ struct perf_counter_context {
 	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
+	int			nr_enabled;
 	int			is_active;
 	atomic_t		refcount;
 	struct task_struct	*task;
@@ -522,6 +523,14 @@ struct perf_counter_context {
 	 */
 	u64			time;
 	u64			timestamp;
+
+	/*
+	 * These fields let us detect when two contexts have both
+	 * been cloned (inherited) from a common ancestor.
+	 */
+	struct perf_counter_context *parent_ctx;
+	u32			parent_gen;
+	u32			generation;
 };
 
 /**
@@ -552,7 +561,8 @@ extern int perf_max_counters;
 extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
-extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task,
+					struct task_struct *next, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06ea3ea..c100554 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx)
 
 static void put_ctx(struct perf_counter_context *ctx)
 {
-	if (atomic_dec_and_test(&ctx->refcount))
+	if (atomic_dec_and_test(&ctx->refcount)) {
+		if (ctx->parent_ctx)
+			put_ctx(ctx->parent_ctx);
 		kfree(ctx);
+	}
 }
 
 static void
@@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		ctx->nr_enabled++;
 }
 
 /*
@@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		ctx->nr_enabled--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -204,6 +211,22 @@ group_sched_out(struct perf_counter *group_counter,
 }
 
 /*
+ * Mark this context as not being a clone of another.
+ * Called when counters are added to or removed from this context.
+ * We also increment our generation number so that anything that
+ * was cloned from this context before this will not match anything
+ * cloned from this context after this.
+ */
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+	++ctx->generation;
+	if (!ctx->parent_ctx)
+		return;
+	put_ctx(ctx->parent_ctx);
+	ctx->parent_ctx = NULL;
+}
+
+/*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
@@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
 
+	unclone_ctx(ctx);
 	if (!task) {
 		/*
 		 * Per cpu counters are removed via an smp call and
@@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info)
 		else
 			counter_sched_out(counter, cpuctx, ctx);
 		counter->state = PERF_COUNTER_STATE_OFF;
+		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irq(&ctx->lock);
@@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	ctx->nr_enabled++;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter)
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->tstamp_enabled =
 			ctx->time - counter->total_time_enabled;
+		ctx->nr_enabled++;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -862,6 +890,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 }
 
 /*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+			 struct perf_counter_context *ctx2)
+{
+	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->nr_enabled == ctx2->nr_enabled;
+}
+
+/*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
  *
@@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
  * accessing the counter control register. If a NMI hits, then it will
  * not restart the counter.
  */
-void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+void perf_counter_task_sched_out(struct task_struct *task,
+				 struct task_struct *next, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter_context *next_ctx;
 	struct pt_regs *regs;
 
 	if (likely(!ctx || !cpuctx->task_ctx))
@@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+	next_ctx = next->perf_counter_ctxp;
+	if (next_ctx && context_equiv(ctx, next_ctx)) {
+		task->perf_counter_ctxp = next_ctx;
+		next->perf_counter_ctxp = ctx;
+		ctx->task = next;
+		next_ctx->task = task;
+		return;
+	}
+
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 
 	if (likely(!ctx))
 		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
@@ -3253,6 +3314,16 @@ inherit_counter(struct perf_counter *parent_counter,
 		return child_counter;
 
 	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en,dis}able_family.
+	 */
+	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+	else
+		child_counter->state = PERF_COUNTER_STATE_OFF;
+
+	/*
 	 * Link it up in the child's context:
 	 */
 	add_counter_to_ctx(child_counter, child_ctx);
@@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	mutex_lock(&parent_counter->mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 
-	/*
-	 * Make the child state follow the state of the parent counter,
-	 * not its hw_event.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_counter_{en,dis}able_family.
-	 */
-	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-	else
-		child_counter->state = PERF_COUNTER_STATE_OFF;
-
 	mutex_unlock(&parent_counter->mutex);
 
 	return child_counter;
@@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx, *parent_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
+	int inherited_all = 1;
 
 	child->perf_counter_ctxp = NULL;
 
@@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child)
 		if (counter != counter->group_leader)
 			continue;
 
-		if (!counter->hw_event.inherit)
+		if (!counter->hw_event.inherit) {
+			inherited_all = 0;
 			continue;
+		}
 
 		if (inherit_group(counter, parent,
-				  parent_ctx, child, child_ctx))
+				  parent_ctx, child, child_ctx)) {
+			inherited_all = 0;
 			break;
+		}
+	}
+
+	if (inherited_all) {
+		/*
+		 * Mark the child context as a clone of the parent
+		 * context, or of whatever the parent is a clone of.
+		 */
+		if (parent_ctx->parent_ctx) {
+			child_ctx->parent_ctx = parent_ctx->parent_ctx;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
+		} else {
+			child_ctx->parent_ctx = parent_ctx;
+			child_ctx->parent_gen = parent_ctx->generation;
+		}
+		get_ctx(child_ctx->parent_ctx);
 	}
 
 	mutex_unlock(&parent_ctx->mutex);
diff --git a/kernel/sched.c b/kernel/sched.c
index 419a39d0..4c0d58b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5091,7 +5091,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_counter_task_sched_out(prev, cpu);
+		perf_counter_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
-- 
cgit v0.10.2


From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 May 2009 12:32:15 +0200
Subject: perf_counter: fix !PERF_COUNTERS build failure

Update the !CONFIG_PERF_COUNTERS prototype too, for
perf_counter_task_sched_out().

[ Impact: build fix ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4cae01a..2eedae8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -625,7 +625,8 @@ extern void perf_counter_init(void);
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
-perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
+perf_counter_task_sched_out(struct task_struct *task,
+			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *child)	{ }
-- 
cgit v0.10.2


From d5046853634a8d73f28bad3cf68d182c4a99035d Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 22 May 2009 20:36:21 +0900
Subject: nilfs2: fix memory leak in nilfs_ioctl_clean_segments

This fixes a new memory leak problem in garbage collection.  The
problem was brought by the bugfix patch ("nilfs2: fix lock order
reversal in nilfs_clean_segments ioctl").

Thanks to Kentaro Suzuki for finding this problem.

Reported-by: Kentaro Suzuki <k_suzuki@ms.sylc.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 50ff3f2..d6759b9 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -576,7 +576,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
 
  out_free:
-	while (--n > 0)
+	while (--n >= 0)
 		vfree(kbufs[n]);
 	kfree(kbufs[4]);
 	return ret;
-- 
cgit v0.10.2


From 63d3892379f93b73ef905fb3449f4e4438a53b40 Mon Sep 17 00:00:00 2001
From: Wu Zhangjin <wuzhangjin@gmail.com>
Date: Thu, 21 May 2009 05:50:01 +0800
Subject: MIPS: Fix sparse warning in incompatiable argument type of
 clear_user.

The type of the second argument of access_ok should be (void __user *).
The unnecessary conversion of the clear_user address argument was causing
sparse to emit warnings on the __chk_user_ptr check.

Signed-off-by: Wu Zhangjin <wuzhangjin@gmail.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 8de858f..c2d53c1 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -956,7 +956,7 @@ __clear_user(void __user *addr, __kernel_size_t size)
 	void __user * __cl_addr = (addr);				\
 	unsigned long __cl_size = (n);					\
 	if (__cl_size && access_ok(VERIFY_WRITE,			\
-		((unsigned long)(__cl_addr)), __cl_size))		\
+					__cl_addr, __cl_size))		\
 		__cl_size = __clear_user(__cl_addr, __cl_size);		\
 	__cl_size;							\
 })
-- 
cgit v0.10.2


From 63c901c7e6fb878805cd2f8f14fa3eee8c03ee84 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Fri, 22 May 2009 10:48:17 +0100
Subject: MIPS: IP32: Fix build error due to uninitialized variable.

  CC      arch/mips/sgi-ip32/ip32-reset.o
cc1: warnings being treated as errors
arch/mips/sgi-ip32/ip32-reset.c: In function 'debounce':
arch/mips/sgi-ip32/ip32-reset.c:97: error: 'reg_a' is used uninitialized in this function

The issues is old but due to the volatile keyword gcc older than 4.4 did
not warn about this obvious bug.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index 667da93..cc549a9 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -94,6 +94,7 @@ static void debounce(unsigned long data)
 	volatile unsigned char reg_a, reg_c, xctrl_a;
 
 	reg_c = CMOS_READ(RTC_INTR_FLAGS);
+	reg_a = CMOS_READ(RTC_REG_A);
 	CMOS_WRITE(reg_a | DS_REGA_DV0, RTC_REG_A);
 	wbflush();
 	xctrl_a = CMOS_READ(DS_B1_XCTRL4A);
-- 
cgit v0.10.2


From d2f82c2f70d56ba4623de25edb383fec01f43b89 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Fri, 22 May 2009 10:58:43 +0100
Subject: MIPS: IP32: Remove unnecessary if not even harmful volatile keywords.

They are unneeded and as the issue fixed in lmo commit
63f7ec59053e3f850ab67a9938e631bcba64c6ce shows even harmful.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index cc549a9..9b95d80 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -53,7 +53,7 @@ static inline void ip32_machine_halt(void)
 
 static void ip32_machine_power_off(void)
 {
-	volatile unsigned char reg_a, xctrl_a, xctrl_b;
+	unsigned char reg_a, xctrl_a, xctrl_b;
 
 	disable_irq(MACEISA_RTC_IRQ);
 	reg_a = CMOS_READ(RTC_REG_A);
@@ -91,7 +91,7 @@ static void blink_timeout(unsigned long data)
 
 static void debounce(unsigned long data)
 {
-	volatile unsigned char reg_a, reg_c, xctrl_a;
+	unsigned char reg_a, reg_c, xctrl_a;
 
 	reg_c = CMOS_READ(RTC_INTR_FLAGS);
 	reg_a = CMOS_READ(RTC_REG_A);
@@ -138,7 +138,7 @@ static inline void ip32_power_button(void)
 
 static irqreturn_t ip32_rtc_int(int irq, void *dev_id)
 {
-	volatile unsigned char reg_c;
+	unsigned char reg_c;
 
 	reg_c = CMOS_READ(RTC_INTR_FLAGS);
 	if (!(reg_c & RTC_IRQF)) {
-- 
cgit v0.10.2


From 8369d5fa63260cc54464b4687aa6a0f78402d98e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 22 May 2009 16:23:36 +0200
Subject: ide: fix 40-wire cable detection for TSST SH-S202* ATAPI devices (v2)

Since 2.6.26 we support UDMA66 on ATAPI devices requiring IVB quirk:

  commit 8588a2b732928b343233af9b1855705b8286bed4
  ("ide: add SH-S202J to ivb_list[]")

We also later added support for more such devices in:

  commit e97564f362a93f8c248246c19828895950341252
  ("ide: More TSST drives with broken cable detection")

and in:

  commit 3ced5c49bd2d1f2c7f769e3a54385883de63a652
  ("ide: add TSSTcorp CDDVDW SH-S202H to ivb_list[]")

It turns out that such devices lack cable detection altogether
(which in turn results in incorrect detection of 40-wire cables
by our current cable detection strategy) so always handle them
by trusting host-side cable detection only.

v2:
Model detection fixup from Martin.

Reported-and-tested-by: Martin Lottermoser <Martin.Lottermoser@t-online.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index c19a221..06fe002 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -206,8 +206,6 @@ EXPORT_SYMBOL_GPL(ide_in_drive_list);
 
 /*
  * Early UDMA66 devices don't set bit14 to 1, only bit13 is valid.
- * We list them here and depend on the device side cable detection for them.
- *
  * Some optical devices with the buggy firmwares have the same problem.
  */
 static const struct drive_list_entry ivb_list[] = {
@@ -251,10 +249,25 @@ u8 eighty_ninty_three(ide_drive_t *drive)
 	 * - force bit13 (80c cable present) check also for !ivb devices
 	 *   (unless the slave device is pre-ATA3)
 	 */
-	if ((id[ATA_ID_HW_CONFIG] & 0x4000) ||
-	    (ivb && (id[ATA_ID_HW_CONFIG] & 0x2000)))
+	if (id[ATA_ID_HW_CONFIG] & 0x4000)
 		return 1;
 
+	if (ivb) {
+		const char *model = (char *)&id[ATA_ID_PROD];
+
+		if (strstr(model, "TSSTcorp CDDVDW SH-S202")) {
+			/*
+			 * These ATAPI devices always report 80c cable
+			 * so we have to depend on the host in this case.
+			 */
+			if (hwif->cbl == ATA_CBL_PATA80)
+				return 1;
+		} else {
+			/* Depend on the device side cable detection. */
+			if (id[ATA_ID_HW_CONFIG] & 0x2000)
+				return 1;
+		}
+	}
 no_80w:
 	if (drive->dev_flags & IDE_DFLAG_UDMA33_WARNED)
 		return 0;
-- 
cgit v0.10.2


From e3b29f05124b07303088795396ff858811d2acb8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 22 May 2009 16:23:37 +0200
Subject: ide: fix OOPS during ide-cd error recovery

On Tuesday 19 May 2009 20:29:28 Martin Lottermoser wrote:

>   hdc: cdrom_decode_status: error=0x40 <3>{ LastFailedSense=0x04 }
>   ide: failed opcode was: unknown
>   hdc: DMA disabled
>   ------------[ cut here ]------------
>   kernel BUG at drivers/ide/ide-io.c:872!

It is possible for ide-cd to ignore ide_error()'s return value under
some circumstances.  Workaround it in ide_intr() and ide_timer_expiry()
by checking if there is a device/port reset pending currently.

Fixes bug #13345:

	http://bugzilla.kernel.org/show_bug.cgi?id=13345

Reported-by: Martin Lottermoser <Martin.Lottermoser@t-online.de>
Reported-and-tested-by: Modestas Vainius <modestas@vainius.eu>
Cc: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 35dc38d..6415a2e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -696,7 +696,7 @@ void ide_timer_expiry (unsigned long data)
 		}
 		spin_lock_irq(&hwif->lock);
 		enable_irq(hwif->irq);
-		if (startstop == ide_stopped) {
+		if (startstop == ide_stopped && hwif->polling == 0) {
 			ide_unlock_port(hwif);
 			plug_device = 1;
 		}
@@ -868,7 +868,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	 * same irq as is currently being serviced here, and Linux
 	 * won't allow another of the same (on any CPU) until we return.
 	 */
-	if (startstop == ide_stopped) {
+	if (startstop == ide_stopped && hwif->polling == 0) {
 		BUG_ON(hwif->handler);
 		ide_unlock_port(hwif);
 		plug_device = 1;
-- 
cgit v0.10.2


From 26bfcf21e25fa090f099fa0ccf201424989cbd7b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 22 May 2009 16:23:37 +0200
Subject: ide: fix printk() levels in ide_dump_ata[pi]_error()

Fixes "<3>" in error messages like this one:

hdc: cdrom_decode_status: error=0x40 <3>{ LastFailedSense=0x04 }

Reported-by: Martin Lottermoser <Martin.Lottermoser@t-online.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 56ff8c4..85b9bae 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -91,7 +91,7 @@ static void ide_dump_sector(ide_drive_t *drive)
 
 static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
 {
-	printk(KERN_ERR "{ ");
+	printk(KERN_CONT "{ ");
 	if (err & ATA_ABORTED)
 		printk(KERN_CONT "DriveStatusError ");
 	if (err & ATA_ICRC)
@@ -121,7 +121,7 @@ static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
 
 static void ide_dump_atapi_error(ide_drive_t *drive, u8 err)
 {
-	printk(KERN_ERR "{ ");
+	printk(KERN_CONT "{ ");
 	if (err & ATAPI_ILI)
 		printk(KERN_CONT "IllegalLengthIndication ");
 	if (err & ATAPI_EOM)
-- 
cgit v0.10.2


From cc30137a221372c67a943ad9ea68121a2bd57a6e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 22 May 2009 16:23:38 +0200
Subject: ide: improve failed opcode reporting

Nowadays we (almost) always store the currently executing command
in hwif->cmd so we can use it for the failed opcode reporting.

Cc: Martin Lottermoser <Martin.Lottermoser@t-online.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 85b9bae..2148df8 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -31,24 +31,6 @@ void ide_toggle_bounce(ide_drive_t *drive, int on)
 		blk_queue_bounce_limit(drive->queue, addr);
 }
 
-static void ide_dump_opcode(ide_drive_t *drive)
-{
-	struct request *rq = drive->hwif->rq;
-	struct ide_cmd *cmd = NULL;
-
-	if (!rq)
-		return;
-
-	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
-		cmd = rq->special;
-
-	printk(KERN_ERR "ide: failed opcode was: ");
-	if (cmd == NULL)
-		printk(KERN_CONT "unknown\n");
-	else
-		printk(KERN_CONT "0x%02x\n", cmd->tf.command);
-}
-
 u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48)
 {
 	struct ide_taskfile *tf = &cmd->tf;
@@ -179,7 +161,10 @@ u8 ide_dump_status(ide_drive_t *drive, const char *msg, u8 stat)
 		else
 			ide_dump_atapi_error(drive, err);
 	}
-	ide_dump_opcode(drive);
+
+	printk(KERN_ERR "%s: possibly failed opcode: 0x%02x\n",
+		drive->name, drive->hwif->cmd.tf.command);
+
 	return err;
 }
 EXPORT_SYMBOL(ide_dump_status);
-- 
cgit v0.10.2


From 28ee9bc5cc42776e0364399b401a64906ac1ac8e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 22 May 2009 16:23:38 +0200
Subject: ide: report timeouts in ide_busy_sleep()

* change 'hwif' argument to 'drive'
* report an error on timeout

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 7f264ed..c895ed5 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -295,7 +295,7 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
 
 	timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2;
 
-	if (ide_busy_sleep(hwif, timeout, use_altstatus))
+	if (ide_busy_sleep(drive, timeout, use_altstatus))
 		return 1;
 
 	/* wait for IRQ and ATA_DRQ */
@@ -316,8 +316,9 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id)
 	return rc;
 }
 
-int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus)
+int ide_busy_sleep(ide_drive_t *drive, unsigned long timeout, int altstatus)
 {
+	ide_hwif_t *hwif = drive->hwif;
 	u8 stat;
 
 	timeout += jiffies;
@@ -330,6 +331,8 @@ int ide_busy_sleep(ide_hwif_t *hwif, unsigned long timeout, int altstatus)
 			return 0;
 	} while (time_before(jiffies, timeout));
 
+	printk(KERN_ERR "%s: timeout in %s\n", drive->name, __func__);
+
 	return 1;	/* drive timed-out */
 }
 
@@ -420,7 +423,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 			tp_ops->dev_select(drive);
 			msleep(50);
 			tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
-			(void)ide_busy_sleep(hwif, WAIT_WORSTCASE, 0);
+			(void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0);
 			rc = ide_dev_read_id(drive, cmd, id);
 		}
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ff65fff..9fed365 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1109,7 +1109,7 @@ void ide_fix_driveid(u16 *);
 
 extern void ide_fixstring(u8 *, const int, const int);
 
-int ide_busy_sleep(ide_hwif_t *, unsigned long, int);
+int ide_busy_sleep(ide_drive_t *, unsigned long, int);
 
 int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long);
 
-- 
cgit v0.10.2


From 5993856e53fbc4b4f28e2d481deaebeb715b1267 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Fri, 22 May 2009 16:23:39 +0200
Subject: via82cxxx: Add VIA VX855 PCI Device ID

This patch adds the PCI Device ID 0xc409 to the PCI ID table of via82cxxx.c,
as well as the 0x8409 south bridge ID.

This is required to make the IDE driver work on the VX855/VX875 integrated
chipset.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Cc: Joseph Chan <JosephChan@via.com.tw>
Cc: Bruce Chang <BruceChang@via.com.tw>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c
index 3ff7231..028de26 100644
--- a/drivers/ide/via82cxxx.c
+++ b/drivers/ide/via82cxxx.c
@@ -67,6 +67,7 @@ static struct via_isa_bridge {
 	u8 udma_mask;
 	u8 flags;
 } via_isa_bridges[] = {
+	{ "vx855",	PCI_DEVICE_ID_VIA_VX855,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
 	{ "vx800",	PCI_DEVICE_ID_VIA_VX800,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
 	{ "cx700",	PCI_DEVICE_ID_VIA_CX700,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
 	{ "vt8237s",	PCI_DEVICE_ID_VIA_8237S,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
@@ -474,6 +475,7 @@ static const struct pci_device_id via_pci_tbl[] = {
 	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C576_1),  0 },
 	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C586_1),  0 },
 	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_CX700_IDE), 0 },
+	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_VX855_IDE), 0 },
 	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_6410),      1 },
 	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_SATA_EIDE), 1 },
 	{ 0, },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 06ba90c..0f71812 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1406,7 +1406,7 @@
 #define PCI_DEVICE_ID_VIA_82C598_1	0x8598
 #define PCI_DEVICE_ID_VIA_838X_1	0xB188
 #define PCI_DEVICE_ID_VIA_83_87XX_1	0xB198
-#define PCI_DEVICE_ID_VIA_C409_IDE	0XC409
+#define PCI_DEVICE_ID_VIA_VX855_IDE	0xC409
 #define PCI_DEVICE_ID_VIA_ANON		0xFFFF
 
 #define PCI_VENDOR_ID_SIEMENS           0x110A
-- 
cgit v0.10.2


From 9a2845c453d170e4e9b1437fa671dbf39b0e7bd8 Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Wed, 20 May 2009 13:36:17 -0500
Subject: ipmi: fix ipmi_si modprobe hang

Instead of queuing IPMB messages before channel initialization, just
throw them away.  Nobody will be listening for them at this point,
anyway, and they will clog up the queue and nothing will be delivered
if we queue them.

Also set the current channel to the number of channels, as this value
is used to tell if the channel information has been initialized.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
Cc: Ferenc Wagner <wferi@niif.hu>
Cc: Dan Frazier <dannf@hp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index aa83a08..0905079 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2856,6 +2856,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		/* Assume a single IPMB channel at zero. */
 		intf->channels[0].medium = IPMI_CHANNEL_MEDIUM_IPMB;
 		intf->channels[0].protocol = IPMI_CHANNEL_PROTOCOL_IPMB;
+		intf->curr_channel = IPMI_MAX_CHANNELS;
 	}
 
 	if (rv == 0)
@@ -3648,13 +3649,13 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
 		}
 
 		/*
-		** We need to make sure the channels have been initialized.
-		** The channel_handler routine will set the "curr_channel"
-		** equal to or greater than IPMI_MAX_CHANNELS when all the
-		** channels for this interface have been initialized.
-		*/
+		 * We need to make sure the channels have been initialized.
+		 * The channel_handler routine will set the "curr_channel"
+		 * equal to or greater than IPMI_MAX_CHANNELS when all the
+		 * channels for this interface have been initialized.
+		 */
 		if (intf->curr_channel < IPMI_MAX_CHANNELS) {
-			requeue = 1;     /* Just put the message back for now */
+			requeue = 0; /* Throw the message away */
 			goto out;
 		}
 
-- 
cgit v0.10.2


From c6eb13847ba081552d2af644219bddeff7110caf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 May 2009 18:18:28 +0200
Subject: perf_counter tools: increase limits

I tried to run with 300 active counters and the tools bailed out
because our limit was at 64. So increase the counter limit to 1024
and the CPU limit to 4096.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 6fa3656..81a7374 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -54,8 +54,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
 		       group_fd, flags);
 }
 
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
+#define MAX_COUNTERS			1024
+#define MAX_NR_CPUS			4096
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-- 
cgit v0.10.2


From afe6d7e3c4a9aba020637f4ae15527a89ba31f21 Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@lisas.de>
Date: Fri, 22 May 2009 17:48:58 +0200
Subject: ALSA: Kill truncate warning by shortening Sigmatel-specific AC97
 control name

ALSA sound/core/control.c:232: Control name 'Sigmatel Surround Phase
Inversion Playback Switch' truncated to 'Sigmatel Surround Phase
Inversion Playback ' bootup message by omitting weird Sigmatel prefix
in this case; also fix up the related ca0106 mixer control removal
part by using identical naming there.

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/ac97/ac97_patch.c b/sound/pci/ac97/ac97_patch.c
index 81bc93e..7337abd 100644
--- a/sound/pci/ac97/ac97_patch.c
+++ b/sound/pci/ac97/ac97_patch.c
@@ -958,10 +958,13 @@ static int patch_sigmatel_stac9708_3d(struct snd_ac97 * ac97)
 }
 
 static const struct snd_kcontrol_new snd_ac97_sigmatel_4speaker =
-AC97_SINGLE("Sigmatel 4-Speaker Stereo Playback Switch", AC97_SIGMATEL_DAC2INVERT, 2, 1, 0);
+AC97_SINGLE("Sigmatel 4-Speaker Stereo Playback Switch",
+		AC97_SIGMATEL_DAC2INVERT, 2, 1, 0);
 
+/* "Sigmatel " removed due to excessive name length: */
 static const struct snd_kcontrol_new snd_ac97_sigmatel_phaseinvert =
-AC97_SINGLE("Sigmatel Surround Phase Inversion Playback Switch", AC97_SIGMATEL_DAC2INVERT, 3, 1, 0);
+AC97_SINGLE("Surround Phase Inversion Playback Switch",
+		AC97_SIGMATEL_DAC2INVERT, 3, 1, 0);
 
 static const struct snd_kcontrol_new snd_ac97_sigmatel_controls[] = {
 AC97_SINGLE("Sigmatel DAC 6dB Attenuate", AC97_SIGMATEL_ANALOG, 1, 1, 0),
diff --git a/sound/pci/ca0106/ca0106_mixer.c b/sound/pci/ca0106/ca0106_mixer.c
index ad28887..c111efe 100644
--- a/sound/pci/ca0106/ca0106_mixer.c
+++ b/sound/pci/ca0106/ca0106_mixer.c
@@ -800,7 +800,7 @@ int __devinit snd_ca0106_mixer(struct snd_ca0106 *emu)
 		"Capture Volume",
 		"External Amplifier",
 		"Sigmatel 4-Speaker Stereo Playback Switch",
-		"Sigmatel Surround Phase Inversion Playback ",
+		"Surround Phase Inversion Playback Switch",
 		NULL
 	};
 	static char *ca0106_rename_ctls[] = {
-- 
cgit v0.10.2


From bca23dba760d6705c013f89113c46570378fb626 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 21 May 2009 11:46:16 -0700
Subject: x86, setup: revert ACPI 3 E820 extended attributes support

Remove ACPI 3 E820 extended memory attributes support.  At least one
vendor actively set all the flags to zero, but left ECX on return at
24.  This bug may be present in other BIOSes.

The breakage functionally means the ACPI 3 flags are probably
completely useless, and that no OS any time soon is going to rely on
their existence.  Therefore, drop support completely.  We may want to
revisit this question in the future, if we find ourselves actually
needing the flags.

This reverts all or part of the following checkins:

     cd670599b7b00d9263f6f11a05c0edeb9cbedaf3
     c549e71d073a6e9a4847497344db28a784061455

However, retain the part from the latter commit that copies e820 into
a temporary buffer; that is an unrelated BIOS workaround.  Put in a
comment to explain that part.

See https://bugzilla.redhat.com/show_bug.cgi?id=499396 for some
additional information.

[ Impact: detect all memory on affected machines ]

Reported-by: Thomas J. Baker <tjb@unh.edu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Len Brown <len.brown@intel.com>
Cc: Chuck Ebbert <cebbert@redhat.com>
Cc: Kyle McMartin <kmcmartin@redhat.com>
Cc: Matt Domsch <matt_domsch@dell.com>

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 5054c2d..74b3d2b 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -17,11 +17,6 @@
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
-struct e820_ext_entry {
-	struct e820entry std;
-	u32 ext_flags;
-} __attribute__((packed));
-
 static int detect_memory_e820(void)
 {
 	int count = 0;
@@ -29,13 +24,21 @@ static int detect_memory_e820(void)
 	u32 size, id, edi;
 	u8 err;
 	struct e820entry *desc = boot_params.e820_map;
-	static struct e820_ext_entry buf; /* static so it is zeroed */
+	static struct e820entry buf; /* static so it is zeroed */
 
 	/*
-	 * Set this here so that if the BIOS doesn't change this field
-	 * but still doesn't change %ecx, we're still okay...
+	 * Note: at least one BIOS is known which assumes that the
+	 * buffer pointed to by one e820 call is the same one as
+	 * the previous call, and only changes modified fields.  Therefore,
+	 * we use a temporary buffer and copy the results entry by entry.
+	 *
+	 * This routine deliberately does not try to account for
+	 * ACPI 3+ extended attributes.  This is because there are
+	 * BIOSes in the field which report zero for the valid bit for
+	 * all ranges, and we don't currently make any use of the
+	 * other attribute bits.  Revisit this if we see the extended
+	 * attribute bits deployed in a meaningful way in the future.
 	 */
-	buf.ext_flags = 1;
 
 	do {
 		size = sizeof buf;
@@ -66,13 +69,7 @@ static int detect_memory_e820(void)
 			break;
 		}
 
-		/* ACPI 3.0 added the extended flags support.  If bit 0
-		   in the extended flags is zero, we're supposed to simply
-		   ignore the entry -- a backwards incompatible change! */
-		if (size > 20 && !(buf.ext_flags & 1))
-			continue;
-
-		*desc++ = buf.std;
+		*desc++ = buf;
 		count++;
 	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
-- 
cgit v0.10.2


From 14b60391587ab9b2207c4fb6281763a93ae85e0f Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Wed, 20 May 2009 16:47:08 -0400
Subject: i915: support 8xx desktop cursors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason we never added 8xx desktop cursor support to the
kernel.  This patch fixes that.

[krh: Also set the size on pre-i915 hw.]
Tested-by: Kristian Høgsberg <krh@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4a24c90..717b6a8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1145,7 +1145,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 			mutex_unlock(&dev->struct_mutex);
 			return VM_FAULT_SIGBUS;
 		}
-		list_add(&obj_priv->list, &dev_priv->mm.inactive_list);
+		list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list);
 	}
 
 	/* Need a new fence register? */
@@ -1375,7 +1375,7 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
 			mutex_unlock(&dev->struct_mutex);
 			return ret;
 		}
-		list_add(&obj_priv->list, &dev_priv->mm.inactive_list);
+		list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list);
 	}
 
 	drm_gem_object_unreference(obj);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 15da44c..9668cc0 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -1410,9 +1410,25 @@
 
 /* Cursor A & B regs */
 #define CURACNTR		0x70080
+/* Old style CUR*CNTR flags (desktop 8xx) */
+#define   CURSOR_ENABLE		0x80000000
+#define   CURSOR_GAMMA_ENABLE	0x40000000
+#define   CURSOR_STRIDE_MASK	0x30000000
+#define   CURSOR_FORMAT_SHIFT	24
+#define   CURSOR_FORMAT_MASK	(0x07 << CURSOR_FORMAT_SHIFT)
+#define   CURSOR_FORMAT_2C	(0x00 << CURSOR_FORMAT_SHIFT)
+#define   CURSOR_FORMAT_3C	(0x01 << CURSOR_FORMAT_SHIFT)
+#define   CURSOR_FORMAT_4C	(0x02 << CURSOR_FORMAT_SHIFT)
+#define   CURSOR_FORMAT_ARGB	(0x04 << CURSOR_FORMAT_SHIFT)
+#define   CURSOR_FORMAT_XRGB	(0x05 << CURSOR_FORMAT_SHIFT)
+/* New style CUR*CNTR flags */
+#define   CURSOR_MODE		0x27
 #define   CURSOR_MODE_DISABLE   0x00
 #define   CURSOR_MODE_64_32B_AX 0x07
 #define   CURSOR_MODE_64_ARGB_AX ((1 << 5) | CURSOR_MODE_64_32B_AX)
+#define   MCURSOR_PIPE_SELECT	(1 << 28)
+#define   MCURSOR_PIPE_A	0x00
+#define   MCURSOR_PIPE_B	(1 << 28)
 #define   MCURSOR_GAMMA_ENABLE  (1 << 26)
 #define CURABASE		0x70084
 #define CURAPOS			0x70088
@@ -1420,6 +1436,7 @@
 #define   CURSOR_POS_SIGN       0x8000
 #define   CURSOR_X_SHIFT        0
 #define   CURSOR_Y_SHIFT        16
+#define CURSIZE			0x700a0
 #define CURBCNTR		0x700c0
 #define CURBBASE		0x700c4
 #define CURBPOS			0x700c8
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 3387cf3..c9d6f10 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -1357,7 +1357,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 	int pipe = intel_crtc->pipe;
 	uint32_t control = (pipe == 0) ? CURACNTR : CURBCNTR;
 	uint32_t base = (pipe == 0) ? CURABASE : CURBBASE;
-	uint32_t temp;
+	uint32_t temp = I915_READ(control);
 	size_t addr;
 	int ret;
 
@@ -1366,7 +1366,12 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 	/* if we want to turn off the cursor ignore width and height */
 	if (!handle) {
 		DRM_DEBUG("cursor off\n");
-		temp = CURSOR_MODE_DISABLE;
+		if (IS_MOBILE(dev) || IS_I9XX(dev)) {
+			temp &= ~(CURSOR_MODE | MCURSOR_GAMMA_ENABLE);
+			temp |= CURSOR_MODE_DISABLE;
+		} else {
+			temp &= ~(CURSOR_ENABLE | CURSOR_GAMMA_ENABLE);
+		}
 		addr = 0;
 		bo = NULL;
 		mutex_lock(&dev->struct_mutex);
@@ -1409,10 +1414,19 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 		addr = obj_priv->phys_obj->handle->busaddr;
 	}
 
-	temp = 0;
-	/* set the pipe for the cursor */
-	temp |= (pipe << 28);
-	temp |= CURSOR_MODE_64_ARGB_AX | MCURSOR_GAMMA_ENABLE;
+	if (!IS_I9XX(dev))
+		I915_WRITE(CURSIZE, (height << 12) | width);
+
+	/* Hooray for CUR*CNTR differences */
+	if (IS_MOBILE(dev) || IS_I9XX(dev)) {
+		temp &= ~(CURSOR_MODE | MCURSOR_PIPE_SELECT);
+		temp |= CURSOR_MODE_64_ARGB_AX | MCURSOR_GAMMA_ENABLE;
+		temp |= (pipe << 28); /* Connect to correct pipe */
+	} else {
+		temp &= ~(CURSOR_FORMAT_MASK);
+		temp |= CURSOR_ENABLE;
+		temp |= CURSOR_FORMAT_ARGB | CURSOR_GAMMA_ENABLE;
+	}
 
  finish:
 	I915_WRITE(control, temp);
-- 
cgit v0.10.2


From 8863170628da4b0b461eb96bf797df1dca0bd03e Mon Sep 17 00:00:00 2001
From: Ma Ling <ling.ma@intel.com>
Date: Wed, 13 May 2009 11:19:55 +0800
Subject: drm/i915: Fetch SDVO LVDS mode lines from VBT, then reserve them

Signed-off-by: Ma Ling <ling.ma@intel.com>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9b149fe..c431fa5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -180,7 +180,8 @@ typedef struct drm_i915_private {
 	int backlight_duty_cycle;  /* restore backlight to this value */
 	bool panel_wants_dither;
 	struct drm_display_mode *panel_fixed_mode;
-	struct drm_display_mode *vbt_mode; /* if any */
+	struct drm_display_mode *lfp_lvds_vbt_mode; /* if any */
+	struct drm_display_mode *sdvo_lvds_vbt_mode; /* if any */
 
 	/* Feature bits from the VBIOS */
 	unsigned int int_tv_support:1;
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index fc28e2b..9d78cff 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -57,9 +57,43 @@ find_section(struct bdb_header *bdb, int section_id)
 	return NULL;
 }
 
-/* Try to find panel data */
 static void
-parse_panel_data(struct drm_i915_private *dev_priv, struct bdb_header *bdb)
+fill_detail_timing_data(struct drm_display_mode *panel_fixed_mode,
+			struct lvds_dvo_timing *dvo_timing)
+{
+	panel_fixed_mode->hdisplay = (dvo_timing->hactive_hi << 8) |
+		dvo_timing->hactive_lo;
+	panel_fixed_mode->hsync_start = panel_fixed_mode->hdisplay +
+		((dvo_timing->hsync_off_hi << 8) | dvo_timing->hsync_off_lo);
+	panel_fixed_mode->hsync_end = panel_fixed_mode->hsync_start +
+		dvo_timing->hsync_pulse_width;
+	panel_fixed_mode->htotal = panel_fixed_mode->hdisplay +
+		((dvo_timing->hblank_hi << 8) | dvo_timing->hblank_lo);
+
+	panel_fixed_mode->vdisplay = (dvo_timing->vactive_hi << 8) |
+		dvo_timing->vactive_lo;
+	panel_fixed_mode->vsync_start = panel_fixed_mode->vdisplay +
+		dvo_timing->vsync_off;
+	panel_fixed_mode->vsync_end = panel_fixed_mode->vsync_start +
+		dvo_timing->vsync_pulse_width;
+	panel_fixed_mode->vtotal = panel_fixed_mode->vdisplay +
+		((dvo_timing->vblank_hi << 8) | dvo_timing->vblank_lo);
+	panel_fixed_mode->clock = dvo_timing->clock * 10;
+	panel_fixed_mode->type = DRM_MODE_TYPE_PREFERRED;
+
+	/* Some VBTs have bogus h/vtotal values */
+	if (panel_fixed_mode->hsync_end > panel_fixed_mode->htotal)
+		panel_fixed_mode->htotal = panel_fixed_mode->hsync_end + 1;
+	if (panel_fixed_mode->vsync_end > panel_fixed_mode->vtotal)
+		panel_fixed_mode->vtotal = panel_fixed_mode->vsync_end + 1;
+
+	drm_mode_set_name(panel_fixed_mode);
+}
+
+/* Try to find integrated panel data */
+static void
+parse_lfp_panel_data(struct drm_i915_private *dev_priv,
+			    struct bdb_header *bdb)
 {
 	struct bdb_lvds_options *lvds_options;
 	struct bdb_lvds_lfp_data *lvds_lfp_data;
@@ -91,38 +125,45 @@ parse_panel_data(struct drm_i915_private *dev_priv, struct bdb_header *bdb)
 	panel_fixed_mode = drm_calloc(1, sizeof(*panel_fixed_mode),
 				      DRM_MEM_DRIVER);
 
-	panel_fixed_mode->hdisplay = (dvo_timing->hactive_hi << 8) |
-		dvo_timing->hactive_lo;
-	panel_fixed_mode->hsync_start = panel_fixed_mode->hdisplay +
-		((dvo_timing->hsync_off_hi << 8) | dvo_timing->hsync_off_lo);
-	panel_fixed_mode->hsync_end = panel_fixed_mode->hsync_start +
-		dvo_timing->hsync_pulse_width;
-	panel_fixed_mode->htotal = panel_fixed_mode->hdisplay +
-		((dvo_timing->hblank_hi << 8) | dvo_timing->hblank_lo);
+	fill_detail_timing_data(panel_fixed_mode, dvo_timing);
 
-	panel_fixed_mode->vdisplay = (dvo_timing->vactive_hi << 8) |
-		dvo_timing->vactive_lo;
-	panel_fixed_mode->vsync_start = panel_fixed_mode->vdisplay +
-		dvo_timing->vsync_off;
-	panel_fixed_mode->vsync_end = panel_fixed_mode->vsync_start +
-		dvo_timing->vsync_pulse_width;
-	panel_fixed_mode->vtotal = panel_fixed_mode->vdisplay +
-		((dvo_timing->vblank_hi << 8) | dvo_timing->vblank_lo);
-	panel_fixed_mode->clock = dvo_timing->clock * 10;
-	panel_fixed_mode->type = DRM_MODE_TYPE_PREFERRED;
+	dev_priv->lfp_lvds_vbt_mode = panel_fixed_mode;
 
-	/* Some VBTs have bogus h/vtotal values */
-	if (panel_fixed_mode->hsync_end > panel_fixed_mode->htotal)
-		panel_fixed_mode->htotal = panel_fixed_mode->hsync_end + 1;
-	if (panel_fixed_mode->vsync_end > panel_fixed_mode->vtotal)
-		panel_fixed_mode->vtotal = panel_fixed_mode->vsync_end + 1;
+	DRM_DEBUG("Found panel mode in BIOS VBT tables:\n");
+	drm_mode_debug_printmodeline(panel_fixed_mode);
 
-	drm_mode_set_name(panel_fixed_mode);
+	return;
+}
+
+/* Try to find sdvo panel data */
+static void
+parse_sdvo_panel_data(struct drm_i915_private *dev_priv,
+		      struct bdb_header *bdb)
+{
+	struct bdb_sdvo_lvds_options *sdvo_lvds_options;
+	struct lvds_dvo_timing *dvo_timing;
+	struct drm_display_mode *panel_fixed_mode;
 
-	dev_priv->vbt_mode = panel_fixed_mode;
+	dev_priv->sdvo_lvds_vbt_mode = NULL;
 
-	DRM_DEBUG("Found panel mode in BIOS VBT tables:\n");
-	drm_mode_debug_printmodeline(panel_fixed_mode);
+	sdvo_lvds_options = find_section(bdb, BDB_SDVO_LVDS_OPTIONS);
+	if (!sdvo_lvds_options)
+		return;
+
+	dvo_timing = find_section(bdb, BDB_SDVO_PANEL_DTDS);
+	if (!dvo_timing)
+		return;
+
+	panel_fixed_mode = drm_calloc(1, sizeof(*panel_fixed_mode),
+				      DRM_MEM_DRIVER);
+
+	if (!panel_fixed_mode)
+		return;
+
+	fill_detail_timing_data(panel_fixed_mode,
+			dvo_timing + sdvo_lvds_options->panel_type);
+
+	dev_priv->sdvo_lvds_vbt_mode = panel_fixed_mode;
 
 	return;
 }
@@ -199,7 +240,8 @@ intel_init_bios(struct drm_device *dev)
 
 	/* Grab useful general definitions */
 	parse_general_features(dev_priv, bdb);
-	parse_panel_data(dev_priv, bdb);
+	parse_lfp_panel_data(dev_priv, bdb);
+	parse_sdvo_panel_data(dev_priv, bdb);
 
 	pci_unmap_rom(pdev, bios);
 
diff --git a/drivers/gpu/drm/i915/intel_bios.h b/drivers/gpu/drm/i915/intel_bios.h
index de621aa..8ca2cde 100644
--- a/drivers/gpu/drm/i915/intel_bios.h
+++ b/drivers/gpu/drm/i915/intel_bios.h
@@ -279,6 +279,23 @@ struct vch_bdb_22 {
 	struct vch_panel_data panels[16];
 } __attribute__((packed));
 
+struct bdb_sdvo_lvds_options {
+	u8 panel_backlight;
+	u8 h40_set_panel_type;
+	u8 panel_type;
+	u8 ssc_clk_freq;
+	u16 als_low_trip;
+	u16 als_high_trip;
+	u8 sclalarcoeff_tab_row_num;
+	u8 sclalarcoeff_tab_row_size;
+	u8 coefficient[8];
+	u8 panel_misc_bits_1;
+	u8 panel_misc_bits_2;
+	u8 panel_misc_bits_3;
+	u8 panel_misc_bits_4;
+} __attribute__((packed));
+
+
 bool intel_init_bios(struct drm_device *dev);
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 439a865..53731f0 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -511,10 +511,10 @@ void intel_lvds_init(struct drm_device *dev)
 	}
 
 	/* Failed to get EDID, what about VBT? */
-	if (dev_priv->vbt_mode) {
+	if (dev_priv->lfp_lvds_vbt_mode) {
 		mutex_lock(&dev->mode_config.mutex);
 		dev_priv->panel_fixed_mode =
-			drm_mode_duplicate(dev, dev_priv->vbt_mode);
+			drm_mode_duplicate(dev, dev_priv->lfp_lvds_vbt_mode);
 		mutex_unlock(&dev->mode_config.mutex);
 		if (dev_priv->panel_fixed_mode) {
 			dev_priv->panel_fixed_mode->type |=
-- 
cgit v0.10.2


From 7086c87fb1446ceb37918ffa0941359a7c2ec6cf Mon Sep 17 00:00:00 2001
From: Ma Ling <ling.ma@intel.com>
Date: Wed, 13 May 2009 11:20:06 +0800
Subject: drm/i915: Return SDVO LVDS VBT mode if no EDID modes are detected.

Some new SDVO LVDS hardware doesn't have DDC available, and this should
fix the display on it.

Signed-off-by: Ma Ling <ling.ma@intel.com>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index 9913651..f79ebf4 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -69,6 +69,10 @@ struct intel_sdvo_priv {
 	 * This is set if we treat the device as HDMI, instead of DVI.
 	 */
 	bool is_hdmi;
+	/**
+	 * This is set if we detect output of sdvo device as LVDS.
+	 */
+	bool is_lvds;
 
 	/**
 	 * Returned SDTV resolutions allowed for the current format, if the
@@ -1543,6 +1547,37 @@ static void intel_sdvo_get_tv_modes(struct drm_connector *connector)
 		}
 }
 
+static void intel_sdvo_get_lvds_modes(struct drm_connector *connector)
+{
+	struct intel_output *intel_output = to_intel_output(connector);
+	struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv;
+	struct drm_i915_private *dev_priv = connector->dev->dev_private;
+
+	/*
+	 * Attempt to get the mode list from DDC.
+	 * Assume that the preferred modes are
+	 * arranged in priority order.
+	 */
+	/* set the bus switch and get the modes */
+	intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus);
+	intel_ddc_get_modes(intel_output);
+	if (list_empty(&connector->probed_modes) == false)
+		return;
+
+	/* Fetch modes from VBT */
+	if (dev_priv->sdvo_lvds_vbt_mode != NULL) {
+		struct drm_display_mode *newmode;
+		newmode = drm_mode_duplicate(connector->dev,
+					     dev_priv->sdvo_lvds_vbt_mode);
+		if (newmode != NULL) {
+			/* Guarantee the mode is preferred */
+			newmode->type = (DRM_MODE_TYPE_PREFERRED |
+					 DRM_MODE_TYPE_DRIVER);
+			drm_mode_probed_add(connector, newmode);
+		}
+	}
+}
+
 static int intel_sdvo_get_modes(struct drm_connector *connector)
 {
 	struct intel_output *output = to_intel_output(connector);
@@ -1550,6 +1585,8 @@ static int intel_sdvo_get_modes(struct drm_connector *connector)
 
 	if (sdvo_priv->is_tv)
 		intel_sdvo_get_tv_modes(connector);
+	else if (sdvo_priv->is_lvds == true)
+		intel_sdvo_get_lvds_modes(connector);
 	else
 		intel_sdvo_get_ddc_modes(connector);
 
@@ -1720,6 +1757,8 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		}
 	}
 
+	/* In defaut case sdvo lvds is false */
+	sdvo_priv->is_lvds = false;
 	intel_sdvo_get_capabilities(intel_output, &sdvo_priv->caps);
 
 	if (sdvo_priv->caps.output_flags &
@@ -1773,6 +1812,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_LVDS;
 		connector_type = DRM_MODE_CONNECTOR_LVDS;
+		sdvo_priv->is_lvds = true;
 	}
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS1)
 	{
@@ -1780,6 +1820,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_LVDS;
 		connector_type = DRM_MODE_CONNECTOR_LVDS;
+		sdvo_priv->is_lvds = true;
 	}
 	else
 	{
-- 
cgit v0.10.2


From ad5b2a6db3eddc41358d8a73f5cfe1c38e7e3a19 Mon Sep 17 00:00:00 2001
From: Jonas Bonn <jonas@southpole.se>
Date: Fri, 15 May 2009 09:10:41 +0200
Subject: drm/i915: Determine type before initialising connector

drm_connector_init sets both the connector type and the connector type_id
on the newly initialised connector.  As the connector type_id is coupled to
the connector type, the connector type cannot simply be modified on an
initialised connector.

This patch changes the order of operations on intel_sdvo_init so that the
type is determined before the connector is intialised.

This fixes a bug whereby the name card0-VGA-1 would be allocted to both a
CRT and an SDVO connector since the SDVO connector would be initialised
with type 'unknown' and hence have its type_id assigned from the wrong pool.

Signed-off-by: Jonas Bonn <jonas@southpole.se>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index f79ebf4..ded122c 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -1713,17 +1713,9 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		return false;
 	}
 
-	connector = &intel_output->base;
-
-	drm_connector_init(dev, connector, &intel_sdvo_connector_funcs,
-			   DRM_MODE_CONNECTOR_Unknown);
-	drm_connector_helper_add(connector, &intel_sdvo_connector_helper_funcs);
 	sdvo_priv = (struct intel_sdvo_priv *)(intel_output + 1);
 	intel_output->type = INTEL_OUTPUT_SDVO;
 
-	connector->interlace_allowed = 0;
-	connector->doublescan_allowed = 0;
-
 	/* setup the DDC bus. */
 	if (output_device == SDVOB)
 		i2cbus = intel_i2c_create(dev, GPIOE, "SDVOCTRL_E for SDVOB");
@@ -1731,7 +1723,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		i2cbus = intel_i2c_create(dev, GPIOE, "SDVOCTRL_E for SDVOC");
 
 	if (!i2cbus)
-		goto err_connector;
+		goto err_inteloutput;
 
 	sdvo_priv->i2c_bus = i2cbus;
 
@@ -1747,7 +1739,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	intel_output->i2c_bus = i2cbus;
 	intel_output->dev_priv = sdvo_priv;
 
-
 	/* Read the regs to test if we can talk to the device */
 	for (i = 0; i < 0x40; i++) {
 		if (!intel_sdvo_read_byte(intel_output, i, &ch[i])) {
@@ -1768,7 +1759,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		else
 			sdvo_priv->controlled_output = SDVO_OUTPUT_TMDS1;
 
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_TMDS;
 		connector_type = DRM_MODE_CONNECTOR_DVID;
 
@@ -1786,7 +1776,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_SVID0)
 	{
 		sdvo_priv->controlled_output = SDVO_OUTPUT_SVID0;
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_TVDAC;
 		connector_type = DRM_MODE_CONNECTOR_SVIDEO;
 		sdvo_priv->is_tv = true;
@@ -1795,21 +1784,18 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_RGB0)
 	{
 		sdvo_priv->controlled_output = SDVO_OUTPUT_RGB0;
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_DAC;
 		connector_type = DRM_MODE_CONNECTOR_VGA;
 	}
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_RGB1)
 	{
 		sdvo_priv->controlled_output = SDVO_OUTPUT_RGB1;
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_DAC;
 		connector_type = DRM_MODE_CONNECTOR_VGA;
 	}
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS0)
 	{
 		sdvo_priv->controlled_output = SDVO_OUTPUT_LVDS0;
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_LVDS;
 		connector_type = DRM_MODE_CONNECTOR_LVDS;
 		sdvo_priv->is_lvds = true;
@@ -1817,7 +1803,6 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 	else if (sdvo_priv->caps.output_flags & SDVO_OUTPUT_LVDS1)
 	{
 		sdvo_priv->controlled_output = SDVO_OUTPUT_LVDS1;
-		connector->display_info.subpixel_order = SubPixelHorizontalRGB;
 		encoder_type = DRM_MODE_ENCODER_LVDS;
 		connector_type = DRM_MODE_CONNECTOR_LVDS;
 		sdvo_priv->is_lvds = true;
@@ -1836,9 +1821,16 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		goto err_i2c;
 	}
 
+	connector = &intel_output->base;
+	drm_connector_init(dev, connector, &intel_sdvo_connector_funcs,
+			   connector_type);
+	drm_connector_helper_add(connector, &intel_sdvo_connector_helper_funcs);
+	connector->interlace_allowed = 0;
+	connector->doublescan_allowed = 0;
+	connector->display_info.subpixel_order = SubPixelHorizontalRGB;
+
 	drm_encoder_init(dev, &intel_output->enc, &intel_sdvo_enc_funcs, encoder_type);
 	drm_encoder_helper_add(&intel_output->enc, &intel_sdvo_helper_funcs);
-	connector->connector_type = connector_type;
 
 	drm_mode_connector_attach_encoder(&intel_output->base, &intel_output->enc);
 	drm_sysfs_connector_add(connector);
@@ -1876,8 +1868,7 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 
 err_i2c:
 	intel_i2c_destroy(intel_output->i2c_bus);
-err_connector:
-	drm_connector_cleanup(connector);
+err_inteloutput:
 	kfree(intel_output);
 
 	return false;
-- 
cgit v0.10.2


From 0c752a93353d9b17dbe148312d732fbe06d235e1 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 22 May 2009 12:17:45 -0700
Subject: x86: introduce noxsave boot parameter

Introduce "noxsave" boot parameter which will disable the cpu's xsave/xrstor
capabilities. Useful for debugging and working around xsave related issues.

[ Impact: make it possible to debug problems in the field ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbf..fd5cac0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1535,6 +1535,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			register save and restore. The kernel will only save
 			legacy floating-point registers on task switch.
 
+	noxsave		[BUGS=X86] Disables x86 extended register state save
+			and restore using xsave. The kernel will fallback to
+			enabling legacy floating-point and sse state.
+
 	nohlt		[BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
 			wfi(ARM) instruction doesn't work correctly and not to
 			use it. This is also useful when using JTAG debugger.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc..77848d9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -114,6 +114,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
+static int __init x86_xsave_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-- 
cgit v0.10.2


From 619ac3b75a1e9b2df66857f6a0fb466f1da5fa9e Mon Sep 17 00:00:00 2001
From: Ma Ling <ling.ma@intel.com>
Date: Mon, 18 May 2009 16:12:46 +0800
Subject: drm/i915: Use an I2C algo to do the flip to SDVO DDC bus.

Previously, we would set the control bus switch before calls were made
to request EDID information over DDC.  But recently the DDC code started
doing multiple I2C transfers to get the EDID extensions as well.  This
tripped up SDVO, because the control bus switch is only in effect until
the next STOP after a START.  By doing our own algo, we can wrap each i2c
transaction on the DDC I2C bus with the control bus switch it requires.

freedesktop.org bug #21042

Signed-off-by: Ma Ling <ling.ma@intel.com>
[anholt: Hand application for conflict, fixed error path]
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index ded122c..f3ef6bf 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -1402,10 +1402,8 @@ static enum drm_connector_status intel_sdvo_detect(struct drm_connector *connect
 static void intel_sdvo_get_ddc_modes(struct drm_connector *connector)
 {
 	struct intel_output *intel_output = to_intel_output(connector);
-	struct intel_sdvo_priv *sdvo_priv = intel_output->dev_priv;
 
 	/* set the bus switch and get the modes */
-	intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus);
 	intel_ddc_get_modes(intel_output);
 
 #if 0
@@ -1601,6 +1599,9 @@ static void intel_sdvo_destroy(struct drm_connector *connector)
 
 	if (intel_output->i2c_bus)
 		intel_i2c_destroy(intel_output->i2c_bus);
+	if (intel_output->ddc_bus)
+		intel_i2c_destroy(intel_output->ddc_bus);
+
 	drm_sysfs_connector_remove(connector);
 	drm_connector_cleanup(connector);
 	kfree(intel_output);
@@ -1697,12 +1698,56 @@ intel_sdvo_get_digital_encoding_mode(struct intel_output *output)
 	return true;
 }
 
+static struct intel_output *
+intel_sdvo_chan_to_intel_output(struct intel_i2c_chan *chan)
+{
+	struct drm_device *dev = chan->drm_dev;
+	struct drm_connector *connector;
+	struct intel_output *intel_output = NULL;
+
+	list_for_each_entry(connector,
+			&dev->mode_config.connector_list, head) {
+		if (to_intel_output(connector)->ddc_bus == chan) {
+			intel_output = to_intel_output(connector);
+			break;
+		}
+	}
+	return intel_output;
+}
+
+static int intel_sdvo_master_xfer(struct i2c_adapter *i2c_adap,
+				  struct i2c_msg msgs[], int num)
+{
+	struct intel_output *intel_output;
+	struct intel_sdvo_priv *sdvo_priv;
+	struct i2c_algo_bit_data *algo_data;
+	struct i2c_algorithm *algo;
+
+	algo_data = (struct i2c_algo_bit_data *)i2c_adap->algo_data;
+	intel_output =
+		intel_sdvo_chan_to_intel_output(
+				(struct intel_i2c_chan *)(algo_data->data));
+	if (intel_output == NULL)
+		return -EINVAL;
+
+	sdvo_priv = intel_output->dev_priv;
+	algo = (struct i2c_algorithm *)intel_output->i2c_bus->adapter.algo;
+
+	intel_sdvo_set_control_bus_switch(intel_output, sdvo_priv->ddc_bus);
+	return algo->master_xfer(i2c_adap, msgs, num);
+}
+
+static struct i2c_algorithm intel_sdvo_i2c_bit_algo = {
+	.master_xfer	= intel_sdvo_master_xfer,
+};
+
 bool intel_sdvo_init(struct drm_device *dev, int output_device)
 {
 	struct drm_connector *connector;
 	struct intel_output *intel_output;
 	struct intel_sdvo_priv *sdvo_priv;
 	struct intel_i2c_chan *i2cbus = NULL;
+	struct intel_i2c_chan *ddcbus = NULL;
 	int connector_type;
 	u8 ch[0x40];
 	int i;
@@ -1748,6 +1793,20 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		}
 	}
 
+	/* setup the DDC bus. */
+	if (output_device == SDVOB)
+		ddcbus = intel_i2c_create(dev, GPIOE, "SDVOB DDC BUS");
+	else
+		ddcbus = intel_i2c_create(dev, GPIOE, "SDVOC DDC BUS");
+
+	if (ddcbus == NULL)
+		goto err_i2c;
+
+	intel_sdvo_i2c_bit_algo.functionality =
+		intel_output->i2c_bus->adapter.algo->functionality;
+	ddcbus->adapter.algo = &intel_sdvo_i2c_bit_algo;
+	intel_output->ddc_bus = ddcbus;
+
 	/* In defaut case sdvo lvds is false */
 	sdvo_priv->is_lvds = false;
 	intel_sdvo_get_capabilities(intel_output, &sdvo_priv->caps);
@@ -1862,11 +1921,11 @@ bool intel_sdvo_init(struct drm_device *dev, int output_device)
 		  sdvo_priv->caps.output_flags &
 			(SDVO_OUTPUT_TMDS1 | SDVO_OUTPUT_RGB1) ? 'Y' : 'N');
 
-	intel_output->ddc_bus = i2cbus;
-
 	return true;
 
 err_i2c:
+	if (ddcbus != NULL)
+		intel_i2c_destroy(intel_output->ddc_bus);
 	intel_i2c_destroy(intel_output->i2c_bus);
 err_inteloutput:
 	kfree(intel_output);
-- 
cgit v0.10.2


From 0b827537e339c084ac9384df588969d400be9e0d Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 22 May 2009 13:23:37 -0700
Subject: x86: bugfix wbinvd() model check instead of family check

wbinvd is supported on all CPUs 486 or later. But,
pageattr.c is checking x86_model >= 4 before wbinvd(), which looks like
an oversight bug. It was first introduced at one place by changeset
d7c8f21a8cad0228c7c5ce2bb6dbd95d1ee49d13 and got copied over to second
place in the same file later.

[ Impact: fix missing cache flush on early-model CPUs, potential data corruption ]

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 797f9f1..2cc019a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -153,7 +153,7 @@ static void __cpa_flush_all(void *arg)
 	 */
 	__flush_tlb_all();
 
-	if (cache && boot_cpu_data.x86_model >= 4)
+	if (cache && boot_cpu_data.x86 >= 4)
 		wbinvd();
 }
 
@@ -218,7 +218,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 
 	/* 4M threshold */
 	if (numpages >= 1024) {
-		if (boot_cpu_data.x86_model >= 4)
+		if (boot_cpu_data.x86 >= 4)
 			wbinvd();
 		return;
 	}
-- 
cgit v0.10.2


From 0af48f42df15b97080b450d24219dd95db7b929a Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 22 May 2009 13:23:38 -0700
Subject: x86: cpa_flush_array wbinvd should be done on all CPUs

cpa_flush_array seems to prefer wbinvd() over clflush at 4M threshold.
clflush needs to be done on only one CPU as per instruction definition.
wbinvd() however, should be done on all CPUs.

[ Impact: fix missing flush which could cause data corruption ]

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2cc019a..0f9052b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -204,6 +204,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	}
 }
 
+static void wbinvd_local(void *unused)
+{
+	wbinvd();
+}
+
 static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 			    int in_flags, struct page **pages)
 {
@@ -219,7 +224,8 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 	/* 4M threshold */
 	if (numpages >= 1024) {
 		if (boot_cpu_data.x86 >= 4)
-			wbinvd();
+			on_each_cpu(wbinvd_local, NULL, 1);
+
 		return;
 	}
 	/*
-- 
cgit v0.10.2


From a49a018a6ea6d73742a81d673fe5ec4a7d2137b3 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Fri, 22 May 2009 16:53:40 -0400
Subject: [ARM] add coherent DMA mask for mv643xx_eth

Since commit eb0519b5a1cf, mv643xx_eth is non functional on ARM because
the platform device declaration does not include any coherent DMA mask
and coherent memory allocations fail.

Signed-off-by: Nicolas Pitre <nico@marvell.com>

diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c
index 3fab82a..be1ca28 100644
--- a/arch/arm/mach-kirkwood/common.c
+++ b/arch/arm/mach-kirkwood/common.c
@@ -144,6 +144,9 @@ static struct platform_device kirkwood_ge00 = {
 	.id		= 0,
 	.num_resources	= 1,
 	.resource	= kirkwood_ge00_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init kirkwood_ge00_init(struct mv643xx_eth_platform_data *eth_data)
@@ -202,6 +205,9 @@ static struct platform_device kirkwood_ge01 = {
 	.id		= 1,
 	.num_resources	= 1,
 	.resource	= kirkwood_ge01_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init kirkwood_ge01_init(struct mv643xx_eth_platform_data *eth_data)
diff --git a/arch/arm/mach-loki/common.c b/arch/arm/mach-loki/common.c
index c0d2d9d..818f19d 100644
--- a/arch/arm/mach-loki/common.c
+++ b/arch/arm/mach-loki/common.c
@@ -82,6 +82,9 @@ static struct platform_device loki_ge0 = {
 	.id		= 0,
 	.num_resources	= 1,
 	.resource	= loki_ge0_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init loki_ge0_init(struct mv643xx_eth_platform_data *eth_data)
@@ -136,6 +139,9 @@ static struct platform_device loki_ge1 = {
 	.id		= 1,
 	.num_resources	= 1,
 	.resource	= loki_ge1_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init loki_ge1_init(struct mv643xx_eth_platform_data *eth_data)
diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c
index 0d88eea..1b22e4a 100644
--- a/arch/arm/mach-mv78xx0/common.c
+++ b/arch/arm/mach-mv78xx0/common.c
@@ -321,6 +321,9 @@ static struct platform_device mv78xx0_ge00 = {
 	.id		= 0,
 	.num_resources	= 1,
 	.resource	= mv78xx0_ge00_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init mv78xx0_ge00_init(struct mv643xx_eth_platform_data *eth_data)
@@ -375,6 +378,9 @@ static struct platform_device mv78xx0_ge01 = {
 	.id		= 1,
 	.num_resources	= 1,
 	.resource	= mv78xx0_ge01_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init mv78xx0_ge01_init(struct mv643xx_eth_platform_data *eth_data)
@@ -429,6 +435,9 @@ static struct platform_device mv78xx0_ge10 = {
 	.id		= 2,
 	.num_resources	= 1,
 	.resource	= mv78xx0_ge10_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init mv78xx0_ge10_init(struct mv643xx_eth_platform_data *eth_data)
@@ -496,6 +505,9 @@ static struct platform_device mv78xx0_ge11 = {
 	.id		= 3,
 	.num_resources	= 1,
 	.resource	= mv78xx0_ge11_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init mv78xx0_ge11_init(struct mv643xx_eth_platform_data *eth_data)
diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c
index a51fb9d..b1c7778 100644
--- a/arch/arm/mach-orion5x/common.c
+++ b/arch/arm/mach-orion5x/common.c
@@ -188,6 +188,9 @@ static struct platform_device orion5x_eth = {
 	.id		= 0,
 	.num_resources	= 1,
 	.resource	= orion5x_eth_resources,
+	.dev		= {
+		.coherent_dma_mask	= 0xffffffff,
+	},
 };
 
 void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data)
-- 
cgit v0.10.2


From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:49 -0400
Subject: block: Do away with the notion of hardsect_size

Until now we have had a 1:1 mapping between storage device physical
block size and the logical block sized used when addressing the device.
With SATA 4KB drives coming out that will no longer be the case.  The
sector size will be 4KB but the logical block size will remain
512-bytes.  Hence we need to distinguish between the physical block size
and the logical ditto.

This patch renames hardsect_size to logical_block_size.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 9e105cb..a477991 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -250,7 +250,7 @@ axon_ram_probe(struct of_device *device, const struct of_device_id *device_id)
 
 	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
 	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
-	blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
+	blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
 	add_disk(bank->disk);
 
 	bank->irq_id = irq_of_parse_and_map(device->node, 0);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 91fa8e0..73e28d3 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -340,7 +340,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 		kobject_uevent(&bi->kobj, KOBJ_ADD);
 
 		bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE;
-		bi->sector_size = disk->queue->hardsect_size;
+		bi->sector_size = queue_logical_block_size(disk->queue);
 		disk->integrity = bi;
 	} else
 		bi = disk->integrity;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 57af728..15c3164 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -134,7 +134,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
-	blk_queue_hardsect_size(q, 512);
+	blk_queue_logical_block_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
 	blk_queue_congestion_threshold(q);
 	q->nr_batching = BLK_BATCH_REQ;
@@ -288,21 +288,20 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 EXPORT_SYMBOL(blk_queue_max_segment_size);
 
 /**
- * blk_queue_hardsect_size - set hardware sector size for the queue
+ * blk_queue_logical_block_size - set logical block size for the queue
  * @q:  the request queue for the device
- * @size:  the hardware sector size, in bytes
+ * @size:  the logical block size, in bytes
  *
  * Description:
- *   This should typically be set to the lowest possible sector size
- *   that the hardware can operate on (possible without reverting to
- *   even internal read-modify-write operations). Usually the default
- *   of 512 covers most hardware.
+ *   This should be set to the lowest possible block size that the
+ *   storage device can address.  The default of 512 covers most
+ *   hardware.
  **/
-void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
+void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
-	q->hardsect_size = size;
+	q->logical_block_size = size;
 }
-EXPORT_SYMBOL(blk_queue_hardsect_size);
+EXPORT_SYMBOL(blk_queue_logical_block_size);
 
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
@@ -324,7 +323,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
 	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
 	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
-	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
+	t->logical_block_size = max(t->logical_block_size, b->logical_block_size);
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
 	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ff9bba..13d38b7 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -100,9 +100,9 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 	return queue_var_show(max_sectors_kb, (page));
 }
 
-static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
+static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->hardsect_size, page);
+	return queue_var_show(queue_logical_block_size(q), page);
 }
 
 static ssize_t
@@ -249,7 +249,12 @@ static struct queue_sysfs_entry queue_iosched_entry = {
 
 static struct queue_sysfs_entry queue_hw_sector_size_entry = {
 	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
-	.show = queue_hw_sector_size_show,
+	.show = queue_logical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_logical_block_size_entry = {
+	.attr = {.name = "logical_block_size", .mode = S_IRUGO },
+	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_nonrot_entry = {
@@ -283,6 +288,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
+	&queue_logical_block_size_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f87615d..9eaa194 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -763,7 +763,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
 		return compat_put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return compat_put_int(arg, bdev_hardsect_size(bdev));
+		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
 					 bdev_get_queue(bdev)->max_sectors);
diff --git a/block/ioctl.c b/block/ioctl.c
index ad474d4..7aa97f6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -311,7 +311,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
 		return put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return put_int(arg, bdev_hardsect_size(bdev));
+		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
 	case BLKRASET:
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e714e7c..94474f5 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1389,8 +1389,8 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 
 	disk->queue->queuedata = h;
 
-	blk_queue_hardsect_size(disk->queue,
-				h->drv[drv_index].block_size);
+	blk_queue_logical_block_size(disk->queue,
+				     h->drv[drv_index].block_size);
 
 	/* Make sure all queue data is written out before */
 	/* setting h->drv[drv_index].queue, as setting this */
@@ -2298,7 +2298,7 @@ static int cciss_revalidate(struct gendisk *disk)
 	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
 			       inq_buff, drv);
 
-	blk_queue_hardsect_size(drv->queue, drv->block_size);
+	blk_queue_logical_block_size(drv->queue, drv->block_size);
 	set_capacity(disk, drv->nr_blocks);
 
 	kfree(inq_buff);
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index a02dcfc..44fa201 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -474,7 +474,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		disk->fops = &ida_fops;
 		if (j && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(hba[i]->queue, drv->blk_size);
+		blk_queue_logical_block_size(hba[i]->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = hba[i]->queue;
 		disk->private_data = drv;
@@ -1546,7 +1546,7 @@ static int revalidate_allvol(ctlr_info_t *host)
 		drv_info_t *drv = &host->drv[i];
 		if (i && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(host->queue, drv->blk_size);
+		blk_queue_logical_block_size(host->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = host->queue;
 		disk->private_data = drv;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 961de56..f65b3f3 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -724,7 +724,7 @@ static int __init hd_init(void)
 	blk_queue_max_sectors(hd_queue, 255);
 	init_timer(&device_timer);
 	device_timer.function = hd_times_out;
-	blk_queue_hardsect_size(hd_queue, 512);
+	blk_queue_logical_block_size(hd_queue, 512);
 
 	if (!NR_HD) {
 		/*
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index c0cd0a0..60de5a0 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -996,7 +996,7 @@ static int mg_probe(struct platform_device *plat_dev)
 		goto probe_err_6;
 	}
 	blk_queue_max_sectors(host->breq, MG_MAX_SECTS);
-	blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE);
+	blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
 
 	init_timer(&host->timer);
 	host->timer.function = mg_times_out;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index dc7a8c3..293f585 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2657,7 +2657,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
 	struct request_queue *q = pd->disk->queue;
 
 	blk_queue_make_request(q, pkt_make_request);
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 	blk_queue_max_sectors(q, PACKET_MAX_SECTORS);
 	blk_queue_merge_bvec(q, pkt_merge_bvec);
 	q->queuedata = pd;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 338cee4..aaeeb54 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -477,7 +477,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_max_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
-	blk_queue_hardsect_size(queue, dev->blk_size);
+	blk_queue_logical_block_size(queue, dev->blk_size);
 
 	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
 			  ps3disk_prepare_flush);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index e67bbae..cc54473 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -722,7 +722,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	/*
 	 * build the command
 	 *
-	 * The call to blk_queue_hardsect_size() guarantees that request
+	 * The call to blk_queue_logical_block_size() guarantees that request
 	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
 	block = blk_rq_pos(rq) >> lun->capacity.bshift;
@@ -1749,7 +1749,7 @@ static int ub_bd_revalidate(struct gendisk *disk)
 	ub_revalidate(lun->udev, lun);
 
 	/* XXX Support sector size switching like in sr.c */
-	blk_queue_hardsect_size(disk->queue, lun->capacity.bsize);
+	blk_queue_logical_block_size(disk->queue, lun->capacity.bsize);
 	set_capacity(disk, lun->capacity.nsec);
 	// set_disk_ro(sdkp->disk, lun->readonly);
 
@@ -2324,7 +2324,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
 	blk_queue_segment_boundary(q, 0xffffffff);	/* Dubious. */
 	blk_queue_max_sectors(q, UB_MAX_SECTORS);
-	blk_queue_hardsect_size(q, lun->capacity.bsize);
+	blk_queue_logical_block_size(q, lun->capacity.bsize);
 
 	lun->disk = disk;
 	q->queuedata = lun;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 511d4ae..c4845b1 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -347,7 +347,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 				offsetof(struct virtio_blk_config, blk_size),
 				&blk_size);
 	if (!err)
-		blk_queue_hardsect_size(vblk->disk->queue, blk_size);
+		blk_queue_logical_block_size(vblk->disk->queue, blk_size);
 
 	add_disk(vblk->disk);
 	return 0;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 132120a..c199682 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -344,7 +344,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
-	blk_queue_hardsect_size(rq, sector_size);
+	blk_queue_logical_block_size(rq, sector_size);
 	blk_queue_max_sectors(rq, 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 3a4397e..f08491a 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -984,7 +984,7 @@ static int __devinit ace_setup(struct ace_device *ace)
 	ace->queue = blk_init_queue(ace_request, &ace->lock);
 	if (ace->queue == NULL)
 		goto err_blk_initq;
-	blk_queue_hardsect_size(ace->queue, 512);
+	blk_queue_logical_block_size(ace->queue, 512);
 
 	/*
 	 * Allocate and initialize GD structure
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 1e366ad..b5621f2 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -739,7 +739,7 @@ static void __devinit probe_gdrom_setupdisk(void)
 
 static int __devinit probe_gdrom_setupqueue(void)
 {
-	blk_queue_hardsect_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
+	blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
 	/* using DMA so memory will need to be contiguous */
 	blk_queue_max_hw_segments(gd.gdrom_rq, 1);
 	/* set a large max size to get most from DMA */
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index f177c2d..0fff646 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -469,8 +469,8 @@ static void vio_handle_cd_event(struct HvLpEvent *event)
 	case viocdopen:
 		if (event->xRc == 0) {
 			di = &viocd_diskinfo[bevent->disk];
-			blk_queue_hardsect_size(di->viocd_disk->queue,
-					bevent->block_size);
+			blk_queue_logical_block_size(di->viocd_disk->queue,
+						     bevent->block_size);
 			set_capacity(di->viocd_disk,
 					bevent->media_size *
 					bevent->block_size / 512);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 20d90e6..db32f0e 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp)
 	err = bd_claim(bdev, raw_open);
 	if (err)
 		goto out1;
-	err = set_blocksize(bdev, bdev_hardsect_size(bdev));
+	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
 	if (err)
 		goto out2;
 	filp->f_flags |= O_DIRECT;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 1799328..424140c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -182,7 +182,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 				 (sense->information[2] <<  8) |
 				 (sense->information[3]);
 
-			if (drive->queue->hardsect_size == 2048)
+			if (queue_logical_block_size(drive->queue) == 2048)
 				/* device sector size is 2K */
 				sector <<= 2;
 
@@ -737,7 +737,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	struct request_queue *q = drive->queue;
 	int write = rq_data_dir(rq) == WRITE;
 	unsigned short sectors_per_frame =
-		queue_hardsect_size(q) >> SECTOR_BITS;
+		queue_logical_block_size(q) >> SECTOR_BITS;
 
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 				  "secs_per_frame: %u",
@@ -1021,8 +1021,8 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
 	/* save a private copy of the TOC capacity for error handling */
 	drive->probed_capacity = toc->capacity * sectors_per_frame;
 
-	blk_queue_hardsect_size(drive->queue,
-				sectors_per_frame << SECTOR_BITS);
+	blk_queue_logical_block_size(drive->queue,
+				     sectors_per_frame << SECTOR_BITS);
 
 	/* first read just the header, so we know how long the TOC is */
 	stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
@@ -1338,7 +1338,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 /* standard prep_rq_fn that builds 10 byte cmds */
 static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
-	int hard_sect = queue_hardsect_size(q);
+	int hard_sect = queue_logical_block_size(q);
 	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
 	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
 
@@ -1543,7 +1543,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	nslots = ide_cdrom_probe_capabilities(drive);
 
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 
 	if (ide_cdrom_register(drive, nslots)) {
 		printk(KERN_ERR PFX "%s: %s failed to register device with the"
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 47c68bc..06b0ded 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target,
-				 roundup(size, bdev_hardsect_size(rdev->bdev)),
+				 roundup(size, bdev_logical_block_size(rdev->bdev)),
 				 page, READ)) {
 			page->index = index;
 			attach_page_buffers(page, NULL); /* so that free_buffer will
@@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			int size = PAGE_SIZE;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
-					       bdev_hardsect_size(rdev->bdev));
+					       bdev_logical_block_size(rdev->bdev));
 			/* Just make sure we aren't corrupting data or
 			 * metadata
 			 */
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index a2e26c2..75d8081 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store,
 	}
 
 	/* Validate the chunk size against the device block size */
-	if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+	if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
 		*error = "Chunk size is not a multiple of device blocksize";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index be233bc..6fa8ccf 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		 * Buffer holds both header and bitset.
 		 */
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
-				       bitset_size, ti->limits.hardsect_size);
+				       bitset_size,
+				       ti->limits.logical_block_size);
 
 		if (buf_size > dev->bdev->bd_inode->i_size) {
 			DMWARN("log device %s too small: need %llu bytes",
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index e75c6dd..2662a41 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	 */
 	if (!ps->store->chunk_size) {
 		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+		    bdev_logical_block_size(ps->store->cow->bdev) >> 9);
 		ps->store->chunk_mask = ps->store->chunk_size - 1;
 		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 		chunk_size_supplied = 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 429b50b..65e2d97 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
 	lhs->max_hw_segments =
 		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
 
-	lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size);
+	lhs->logical_block_size = max(lhs->logical_block_size,
+				      rhs->logical_block_size);
 
 	lhs->max_segment_size =
 		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
@@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	rs->max_hw_segments =
 		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
 
-	rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
+	rs->logical_block_size = max(rs->logical_block_size,
+				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
 		min_not_zero(rs->max_segment_size, q->max_segment_size);
@@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
 	if (!rs->max_hw_segments)
 		rs->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!rs->hardsect_size)
-		rs->hardsect_size = 1 << SECTOR_SHIFT;
+	if (!rs->logical_block_size)
+		rs->logical_block_size = 1 << SECTOR_SHIFT;
 	if (!rs->max_segment_size)
 		rs->max_segment_size = MAX_SEGMENT_SIZE;
 	if (!rs->seg_boundary_mask)
@@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_sectors(q, t->limits.max_sectors);
 	q->max_phys_segments = t->limits.max_phys_segments;
 	q->max_hw_segments = t->limits.max_hw_segments;
-	q->hardsect_size = t->limits.hardsect_size;
+	q->logical_block_size = t->limits.logical_block_size;
 	q->max_segment_size = t->limits.max_segment_size;
 	q->max_hw_sectors = t->limits.max_hw_sectors;
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fccc834..4cbc19f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
-	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
 	if (rdev->sb_size & bmask)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c0bebc6..7847bbc 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1242,7 +1242,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 
 	sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
 
-	blk_queue_hardsect_size(msb->queue, msb->page_size);
+	blk_queue_logical_block_size(msb->queue, msb->page_size);
 
 	capacity = be16_to_cpu(sys_info->user_block_count);
 	capacity *= be16_to_cpu(sys_info->block_size);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 6573ef4..335d4c7 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -794,8 +794,9 @@ static int i2o_block_transfer(struct request *req)
 	if (c->adaptec) {
 		u8 cmd[10];
 		u32 scsi_flags;
-		u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT;
+		u16 hwsec;
 
+		hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT;
 		memset(cmd, 0, 10);
 
 		sgl_offset = SGL_OFFSET_12;
@@ -1078,7 +1079,7 @@ static int i2o_block_probe(struct device *dev)
 	 */
 	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
 	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-		blk_queue_hardsect_size(queue, le32_to_cpu(blocksize));
+		blk_queue_logical_block_size(queue, le32_to_cpu(blocksize));
 	} else
 		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
 
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index c5df865..98ffc41e 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -521,7 +521,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 
 	sprintf(md->disk->disk_name, "mmcblk%d", devidx);
 
-	blk_queue_hardsect_size(md->queue.queue, 512);
+	blk_queue_logical_block_size(md->queue.queue, 512);
 
 	if (!mmc_card_sd(card) && mmc_card_blockaddr(card)) {
 		/*
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 502622f..aaac3b6 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -378,7 +378,7 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	}
 
 	tr->blkcore_priv->rq->queuedata = tr;
-	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize);
 	if (tr->discard)
 		blk_queue_set_discard(tr->blkcore_priv->rq,
 				      blktrans_discard_request);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index e64f62d..27a1be0 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1990,7 +1990,7 @@ static void dasd_setup_queue(struct dasd_block *block)
 {
 	int max;
 
-	blk_queue_hardsect_size(block->request_queue, block->bp_block);
+	blk_queue_logical_block_size(block->request_queue, block->bp_block);
 	max = block->base->discipline->max_blocks << block->s2b_shift;
 	blk_queue_max_sectors(block->request_queue, max);
 	blk_queue_max_phys_segments(block->request_queue, -1L);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index cfdcf1a..a4c7ffc 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -602,7 +602,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->gd->private_data = dev_info;
 	dev_info->gd->driverfs_dev = &dev_info->dev;
 	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
-	blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096);
+	blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
 
 	seg_byte_size = (dev_info->end - dev_info->start + 1);
 	set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 76814f3..0ae0c83 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -343,7 +343,7 @@ static int __init xpram_setup_blkdev(void)
 			goto out;
 		}
 		blk_queue_make_request(xpram_queues[i], xpram_make_request);
-		blk_queue_hardsect_size(xpram_queues[i], 4096);
+		blk_queue_logical_block_size(xpram_queues[i], 4096);
 	}
 
 	/*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1e79676..47ff695 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -222,7 +222,7 @@ tapeblock_setup_device(struct tape_device * device)
 	if (rc)
 		goto cleanup_queue;
 
-	blk_queue_hardsect_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
+	blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
 	blk_queue_max_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC);
 	blk_queue_max_phys_segments(blkdat->request_queue, -1L);
 	blk_queue_max_hw_segments(blkdat->request_queue, -1L);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 40d2860..bcf3bd4 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1510,7 +1510,7 @@ got_data:
 		 */
 		sector_size = 512;
 	}
-	blk_queue_hardsect_size(sdp->request_queue, sector_size);
+	blk_queue_logical_block_size(sdp->request_queue, sector_size);
 
 	{
 		char cap_str_2[10], cap_str_10[10];
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index fddba53..cd350df 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -727,7 +727,7 @@ static void get_sectorsize(struct scsi_cd *cd)
 	}
 
 	queue = cd->device->request_queue;
-	blk_queue_hardsect_size(queue, sector_size);
+	blk_queue_logical_block_size(queue, sector_size);
 
 	return;
 }
diff --git a/fs/bio.c b/fs/bio.c
index 81dc93e..4445c38 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 sector_t bio_sector_offset(struct bio *bio, unsigned short index,
 			   unsigned int offset)
 {
-	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	unsigned int sector_sz;
 	struct bio_vec *bv;
 	sector_t sectors;
 	int i;
 
+	sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
 	sectors = 0;
 
 	if (index >= bio->bi_idx)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a85fe31..a29b4dc 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Size cannot be smaller than the size supported by the device */
-	if (size < bdev_hardsect_size(bdev))
+	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
 	/* Don't change the size if it is same as current */
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
 
 int sb_min_blocksize(struct super_block *sb, int size)
 {
-	int minsize = bdev_hardsect_size(sb->s_bdev);
+	int minsize = bdev_logical_block_size(sb->s_bdev);
 	if (size < minsize)
 		size = minsize;
 	return sb_set_blocksize(sb, size);
@@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change);
 
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-	unsigned bsize = bdev_hardsect_size(bdev);
+	unsigned bsize = bdev_logical_block_size(bdev);
 
 	bdev->bd_inode->i_size = size;
 	while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/buffer.c b/fs/buffer.c
index aed2977..36e2bbc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
 	/* Size must be multiple of hard sectorsize */
-	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
+	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
 					size);
-		printk(KERN_ERR "hardsect size: %d\n",
-					bdev_hardsect_size(bdev));
+		printk(KERN_ERR "logical block size: %d\n",
+					bdev_logical_block_size(bdev));
 
 		dump_stack();
 		return NULL;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bb..8b10b87 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		rw = WRITE_ODIRECT;
 
 	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
+		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
 
 	if (offset & blocksize_mask) {
 		if (bdev)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe..acbb94f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	hblock = bdev_hardsect_size(sb->s_bdev);
+	hblock = bdev_logical_block_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
 		/*
 		 * Make sure the blocksize for the filesystem is larger
@@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT3-fs: blocksize too small for journal device.\n");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e..a30549f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT4-fs: blocksize too small for journal device.\n");
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473..a3b2ac9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 	}
 
 	/* Set up the buffer cache and SB for real */
-	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+	if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
 		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too small for device "
 		       "block size (%u)\n",
-		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+		       sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
 		goto out;
 	}
 	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5650382..a971d24 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	struct super_block *sb = sdp->sd_vfs;
 	struct block_device *bdev = sb->s_bdev;
 	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
-					   bdev_hardsect_size(sb->s_bdev);
+					   bdev_logical_block_size(sb->s_bdev);
 	u64 blk;
 	sector_t start = 0;
 	sector_t nr_sects = 0;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3b..a91f15b 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
 	if (sb->s_blocksize != blocksize) {
-		int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
+		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
 		if (blocksize < hw_blocksize) {
 			printk(KERN_ERR
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951d..6aa7c47 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_hardsect_size(). */
+#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
@@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
-					bdev_hardsect_size(sb->s_bdev),
+					bdev_logical_block_size(sb->s_bdev),
 					PAGE_CACHE_SIZE);
 		goto err_out_now;
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85ece..09cc25d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
 
-	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9..5c6163f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 	*bh = NULL;
 
 	/* may be > 512 */
-	*sector_size = bdev_hardsect_size(sb->s_bdev);
+	*sector_size = bdev_logical_block_size(sb->s_bdev);
 	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
 		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
 		     *sector_size, OCFS2_MAX_BLOCKSIZE);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 4629768..fc71aab 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	Sector sect;
 
 	res = 0;
-	blocksize = bdev_hardsect_size(bdev);
+	blocksize = bdev_logical_block_size(bdev);
 	if (blocksize <= 0)
 		goto out_exit;
 	i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 7965118..0028d2e 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	Sector sect;
 	unsigned char *data;
 	u32 this_sector, this_size;
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -415,7 +415,7 @@ static struct {
  
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc..0ba4410 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 	} else {
-		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
 			if (!silent)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a..1418b91 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
 	struct block_device	*bdev)
 {
 	return xfs_setsize_buftarg_flags(btp,
-			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
+			PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
 }
 
 int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56ce53f..872b78b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -391,7 +391,7 @@ struct request_queue
 	unsigned int		max_hw_sectors;
 	unsigned short		max_phys_segments;
 	unsigned short		max_hw_segments;
-	unsigned short		hardsect_size;
+	unsigned short		logical_block_size;
 	unsigned int		max_segment_size;
 
 	unsigned long		seg_boundary_mask;
@@ -901,7 +901,7 @@ extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
-extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
+extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -988,19 +988,19 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
-static inline int queue_hardsect_size(struct request_queue *q)
+static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
 
-	if (q && q->hardsect_size)
-		retval = q->hardsect_size;
+	if (q && q->logical_block_size)
+		retval = q->logical_block_size;
 
 	return retval;
 }
 
-static inline int bdev_hardsect_size(struct block_device *bdev)
+static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
-	return queue_hardsect_size(bdev_get_queue(bdev));
+	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 
 static inline int queue_dma_alignment(struct request_queue *q)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ded2d7c..49c2362 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -149,7 +149,7 @@ struct io_restrictions {
 	unsigned max_hw_sectors;
 	unsigned max_sectors;
 	unsigned max_segment_size;
-	unsigned short hardsect_size;
+	unsigned short logical_block_size;
 	unsigned short max_hw_segments;
 	unsigned short max_phys_segments;
 	unsigned char no_cluster; /* inverted so that 0 is default */
-- 
cgit v0.10.2


From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:50 -0400
Subject: block: Use accessor functions for queue limits

Convert all external users of queue limits to using wrapper functions
instead of poking the request queue variables directly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0d98054..30022b4 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -388,10 +388,10 @@ int blkdev_issue_discard(struct block_device *bdev,
 
 		bio->bi_sector = sector;
 
-		if (nr_sects > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			nr_sects -= q->max_hw_sectors;
-			sector += q->max_hw_sectors;
+		if (nr_sects > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			nr_sects -= queue_max_hw_sectors(q);
+			sector += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 59c4af5..7a4c401 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1437,11 +1437,11 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(nr_sectors > q->max_hw_sectors)) {
+		if (unlikely(nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-				bdevname(bio->bi_bdev, b),
-				bio_sectors(bio),
-				q->max_hw_sectors);
+			       bdevname(bio->bi_bdev, b),
+			       bio_sectors(bio),
+			       queue_max_hw_sectors(q));
 			goto end_io;
 		}
 
@@ -1608,8 +1608,8 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-	if (blk_rq_sectors(rq) > q->max_sectors ||
-	    blk_rq_bytes(rq) > q->max_hw_sectors << 9) {
+	if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
+	    blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
 	}
@@ -1621,8 +1621,8 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 	 * limitation.
 	 */
 	blk_recalc_rq_segments(rq);
-	if (rq->nr_phys_segments > q->max_phys_segments ||
-	    rq->nr_phys_segments > q->max_hw_segments) {
+	if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
+	    rq->nr_phys_segments > queue_max_hw_segments(q)) {
 		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
 		return -EIO;
 	}
diff --git a/block/blk-map.c b/block/blk-map.c
index ef2492a..9083cf0 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -115,7 +115,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	struct bio *bio = NULL;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
@@ -292,7 +292,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	struct bio *bio;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len || !kbuf)
 		return -EINVAL;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4974dd5..39ce644 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -32,11 +32,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			 * never considered part of another segment, since that
 			 * might change with the bounce page.
 			 */
-			high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+			high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q);
 			if (high || highprv)
 				goto new_segment;
 			if (cluster) {
-				if (seg_size + bv->bv_len > q->max_segment_size)
+				if (seg_size + bv->bv_len
+				    > queue_max_segment_size(q))
 					goto new_segment;
 				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
 					goto new_segment;
@@ -91,7 +92,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 		return 0;
 
 	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
-	    q->max_segment_size)
+	    queue_max_segment_size(q))
 		return 0;
 
 	if (!bio_has_data(bio))
@@ -134,7 +135,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 		int nbytes = bvec->bv_len;
 
 		if (bvprv && cluster) {
-			if (sg->length + nbytes > q->max_segment_size)
+			if (sg->length + nbytes > queue_max_segment_size(q))
 				goto new_segment;
 
 			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
@@ -205,8 +206,8 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
-	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
+	    req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -227,9 +228,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -251,9 +252,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
@@ -287,7 +288,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	/*
 	 * Will it become too large?
 	 */
-	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors)
+	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
 		return 0;
 
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -299,10 +300,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		total_phys_segments--;
 	}
 
-	if (total_phys_segments > q->max_phys_segments)
+	if (total_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
-	if (total_phys_segments > q->max_hw_segments)
+	if (total_phys_segments > queue_max_hw_segments(q))
 		return 0;
 
 	/* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 15c3164..0b32f98 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -219,6 +219,15 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
 }
 EXPORT_SYMBOL(blk_queue_max_sectors);
 
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
+{
+	if (BLK_DEF_MAX_SECTORS > max_sectors)
+		q->max_hw_sectors = BLK_DEF_MAX_SECTORS;
+	else
+		q->max_hw_sectors = max_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_hw_sectors);
+
 /**
  * blk_queue_max_phys_segments - set max phys segments for a request for this queue
  * @q:  the request queue for the device
@@ -395,11 +404,11 @@ int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size)
 {
-	if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
+	if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
 		return -EINVAL;
 	/* make room for appending the drain */
-	--q->max_hw_segments;
-	--q->max_phys_segments;
+	blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
+	blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
 	q->dma_drain_needed = dma_drain_needed;
 	q->dma_drain_buffer = buf;
 	q->dma_drain_size = size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 13d38b7..142a4ac 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -95,7 +95,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 {
-	int max_sectors_kb = q->max_sectors >> 1;
+	int max_sectors_kb = queue_max_sectors(q) >> 1;
 
 	return queue_var_show(max_sectors_kb, (page));
 }
@@ -109,7 +109,7 @@ static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
 	unsigned long max_sectors_kb,
-			max_hw_sectors_kb = q->max_hw_sectors >> 1,
+		max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
 			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
 	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
 
@@ -117,7 +117,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 		return -EINVAL;
 
 	spin_lock_irq(q->queue_lock);
-	q->max_sectors = max_sectors_kb << 1;
+	blk_queue_max_sectors(q, max_sectors_kb << 1);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
@@ -125,7 +125,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
 {
-	int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+	int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
 
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9eaa194..df18a15 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -766,7 +766,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
-					 bdev_get_queue(bdev)->max_sectors);
+					 queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET: /* compatible, but no compat_ptr (!) */
 	case BLKFRASET:
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/block/ioctl.c b/block/ioctl.c
index 7aa97f6..500e4c7 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -152,10 +152,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 		bio->bi_private = &wait;
 		bio->bi_sector = start;
 
-		if (len > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			len -= q->max_hw_sectors;
-			start += q->max_hw_sectors;
+		if (len > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			len -= queue_max_hw_sectors(q);
+			start += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = len << 9;
 			len = 0;
@@ -313,7 +313,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKSSZGET: /* get block device hardware sector size */
 		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
-		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
+		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a9670dd..5f8e798 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -75,7 +75,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-	unsigned val = min(q->sg_reserved_size, q->max_sectors << 9);
+	unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
 
 	return put_user(val, p);
 }
@@ -89,8 +89,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
 	if (size < 0)
 		return -EINVAL;
-	if (size > (q->max_sectors << 9))
-		size = q->max_sectors << 9;
+	if (size > (queue_max_sectors(q) << 9))
+		size = queue_max_sectors(q) << 9;
 
 	q->sg_reserved_size = size;
 	return 0;
@@ -264,7 +264,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (hdr->cmd_len > BLK_MAX_CDB)
 		return -EINVAL;
 
-	if (hdr->dxfer_len > (q->max_hw_sectors << 9))
+	if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
 		return -EIO;
 
 	if (hdr->dxfer_len)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 293f585..d57f117 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -991,13 +991,15 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
  */
 static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
 {
-	if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) {
+	if ((pd->settings.size << 9) / CD_FRAMESIZE
+	    <= queue_max_phys_segments(q)) {
 		/*
 		 * The cdrom device can handle one segment/frame
 		 */
 		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
-	} else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) {
+	} else if ((pd->settings.size << 9) / PAGE_SIZE
+		   <= queue_max_phys_segments(q)) {
 		/*
 		 * We can handle this case at the expense of some extra memory
 		 * copies during write operations
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index cceace6..71d1b9b 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2101,8 +2101,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		nr = nframes;
 		if (cdi->cdda_method == CDDA_BPC_SINGLE)
 			nr = 1;
-		if (nr * CD_FRAMESIZE_RAW > (q->max_sectors << 9))
-			nr = (q->max_sectors << 9) / CD_FRAMESIZE_RAW;
+		if (nr * CD_FRAMESIZE_RAW > (queue_max_sectors(q) << 9))
+			nr = (queue_max_sectors(q) << 9) / CD_FRAMESIZE_RAW;
 
 		len = nr * CD_FRAMESIZE_RAW;
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 65e2d97..e9a73bb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 *        combine_restrictions_low()
 	 */
 	rs->max_sectors =
-		min_not_zero(rs->max_sectors, q->max_sectors);
+		min_not_zero(rs->max_sectors, queue_max_sectors(q));
 
 	/*
 	 * Check if merge fn is supported.
@@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 
 	rs->max_phys_segments =
 		min_not_zero(rs->max_phys_segments,
-			     q->max_phys_segments);
+			     queue_max_phys_segments(q));
 
 	rs->max_hw_segments =
-		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
+		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
 
 	rs->logical_block_size = max(rs->logical_block_size,
 				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
-		min_not_zero(rs->max_segment_size, q->max_segment_size);
+		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
 
 	rs->max_hw_sectors =
-		min_not_zero(rs->max_hw_sectors, q->max_hw_sectors);
+		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
 
 	rs->seg_boundary_mask =
 		min_not_zero(rs->seg_boundary_mask,
-			     q->seg_boundary_mask);
+			     queue_segment_boundary(q));
 
-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn);
+	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
 
 	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
@@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	 * restrictions.
 	 */
 	blk_queue_max_sectors(q, t->limits.max_sectors);
-	q->max_phys_segments = t->limits.max_phys_segments;
-	q->max_hw_segments = t->limits.max_hw_segments;
-	q->logical_block_size = t->limits.logical_block_size;
-	q->max_segment_size = t->limits.max_segment_size;
-	q->max_hw_sectors = t->limits.max_hw_sectors;
-	q->seg_boundary_mask = t->limits.seg_boundary_mask;
-	q->bounce_pfn = t->limits.bounce_pfn;
+	blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
+	blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
+	blk_queue_logical_block_size(q, t->limits.logical_block_size);
+	blk_queue_max_segment_size(q, t->limits.max_segment_size);
+	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
+	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7a36e38..64f1f3e0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->num_sectors = rdev->sectors;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 41ced0c..4ee31aa 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		 * merge_bvec_fn will be involved in multipath.)
 		 */
 			if (q->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(q) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			conf->working_disks++;
@@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev)
 		 * violating it, not that we ever expect a device with
 		 * a merge_bvec_fn to be involved in multipath */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!test_bit(Faulty, &rdev->flags))
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d755..925507e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev)
 		 */
 
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!smallest || (rdev1->sectors < smallest->sectors))
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36df910..e23758b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
@@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 499620a..750550c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				mddev->queue->max_sectors = (PAGE_SIZE>>9);
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			mddev->queue->max_sectors = (PAGE_SIZE>>9);
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4616bc3..7970dc8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi)
 {
 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-	if ((bi->bi_size>>9) > q->max_sectors)
+	if ((bi->bi_size>>9) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > q->max_phys_segments)
+	if (bi->bi_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
 	if (q->merge_bvec_fn)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 0fc2c0a..9bd407f 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -289,8 +289,8 @@ sg_open(struct inode *inode, struct file *filp)
 	if (list_empty(&sdp->sfds)) {	/* no existing opens on this device */
 		sdp->sgdebug = 0;
 		q = sdp->device->request_queue;
-		sdp->sg_tablesize = min(q->max_hw_segments,
-					q->max_phys_segments);
+		sdp->sg_tablesize = min(queue_max_hw_segments(q),
+					queue_max_phys_segments(q));
 	}
 	if ((sfp = sg_add_sfp(sdp, dev)))
 		filp->private_data = sfp;
@@ -909,7 +909,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
                 if (val < 0)
                         return -EINVAL;
 		val = min_t(int, val,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		if (val != sfp->reserve.bufflen) {
 			if (sg_res_in_use(sfp) || sfp->mmap_called)
 				return -EBUSY;
@@ -919,7 +919,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return 0;
 	case SG_GET_RESERVED_SIZE:
 		val = min_t(int, sfp->reserve.bufflen,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		return put_user(val, ip);
 	case SG_SET_COMMAND_Q:
 		result = get_user(val, ip);
@@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 			return -ENODEV;
 		return scsi_ioctl(sdp->device, cmd_in, p);
 	case BLKSECTGET:
-		return put_user(sdp->device->request_queue->max_sectors * 512,
+		return put_user(queue_max_sectors(sdp->device->request_queue) * 512,
 				ip);
 	case BLKTRACESETUP:
 		return blk_trace_setup(sdp->device->request_queue,
@@ -1377,7 +1377,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
 	sdp->device = scsidp;
 	INIT_LIST_HEAD(&sdp->sfds);
 	init_waitqueue_head(&sdp->o_excl_wait);
-	sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments);
+	sdp->sg_tablesize = min(queue_max_hw_segments(q),
+				queue_max_phys_segments(q));
 	sdp->index = k;
 	kref_init(&sdp->d_ref);
 
@@ -2055,7 +2056,7 @@ sg_add_sfp(Sg_device * sdp, int dev)
 		sg_big_buff = def_reserved_size;
 
 	bufflen = min_t(int, sg_big_buff,
-			sdp->device->request_queue->max_sectors * 512);
+			queue_max_sectors(sdp->device->request_queue) * 512);
 	sg_build_reserve(sfp, bufflen);
 	SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp:   bufflen=%d, k_use_sg=%d\n",
 			   sfp->reserve.bufflen, sfp->reserve.k_use_sg));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 8681b7083..89bd438 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3983,8 +3983,8 @@ static int st_probe(struct device *dev)
 		return -ENODEV;
 	}
 
-	i = min(SDp->request_queue->max_hw_segments,
-		SDp->request_queue->max_phys_segments);
+	i = min(queue_max_hw_segments(SDp->request_queue),
+		queue_max_phys_segments(SDp->request_queue));
 	if (st_max_sg_segs < i)
 		i = st_max_sg_segs;
 	buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i);
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 4ca3b58..cfa26d5 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -132,7 +132,7 @@ static int slave_configure(struct scsi_device *sdev)
 
 		if (us->fflags & US_FL_MAX_SECTORS_MIN)
 			max_sectors = PAGE_CACHE_SIZE >> 9;
-		if (sdev->request_queue->max_sectors > max_sectors)
+		if (queue_max_sectors(sdev->request_queue) > max_sectors)
 			blk_queue_max_sectors(sdev->request_queue,
 					      max_sectors);
 	} else if (sdev->type == TYPE_TAPE) {
@@ -483,7 +483,7 @@ static ssize_t show_max_sectors(struct device *dev, struct device_attribute *att
 {
 	struct scsi_device *sdev = to_scsi_device(dev);
 
-	return sprintf(buf, "%u\n", sdev->request_queue->max_sectors);
+	return sprintf(buf, "%u\n", queue_max_sectors(sdev->request_queue));
 }
 
 /* Input routine for the sysfs max_sectors file */
diff --git a/fs/bio.c b/fs/bio.c
index 4445c38..ab423a1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	struct request_queue *q = bdev_get_queue(bdev);
 	int nr_pages;
 
-	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > q->max_phys_segments)
-		nr_pages = q->max_phys_segments;
-	if (nr_pages > q->max_hw_segments)
-		nr_pages = q->max_hw_segments;
+	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (nr_pages > queue_max_phys_segments(q))
+		nr_pages = queue_max_phys_segments(q);
+	if (nr_pages > queue_max_hw_segments(q))
+		nr_pages = queue_max_hw_segments(q);
 
 	return nr_pages;
 }
@@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_phys_segments >= q->max_hw_segments) {
+	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
+	       || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
 
 		if (retried_segments)
 			return 0;
@@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
 {
-	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
+	return __bio_add_page(q, bio, page, len, offset,
+			      queue_max_hw_sectors(q));
 }
 
 /**
@@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
+	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
 
 struct bio_map_data {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d30ec6f..12737be 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -279,7 +279,7 @@ static inline int bio_has_allocated_vec(struct bio *bio)
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
-	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
+	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
 #define BIO_SEG_BOUNDARY(q, b1, b2) \
 	BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 872b78b..29b48f7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -898,6 +898,7 @@ extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
+extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
@@ -988,6 +989,41 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
+static inline unsigned long queue_bounce_pfn(struct request_queue *q)
+{
+	return q->bounce_pfn;
+}
+
+static inline unsigned long queue_segment_boundary(struct request_queue *q)
+{
+	return q->seg_boundary_mask;
+}
+
+static inline unsigned int queue_max_sectors(struct request_queue *q)
+{
+	return q->max_sectors;
+}
+
+static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
+{
+	return q->max_hw_sectors;
+}
+
+static inline unsigned short queue_max_hw_segments(struct request_queue *q)
+{
+	return q->max_hw_segments;
+}
+
+static inline unsigned short queue_max_phys_segments(struct request_queue *q)
+{
+	return q->max_phys_segments;
+}
+
+static inline unsigned int queue_max_segment_size(struct request_queue *q)
+{
+	return q->max_segment_size;
+}
+
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272..8dcd431 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -192,7 +192,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		/*
 		 * is destination page below bounce pfn?
 		 */
-		if (page_to_pfn(page) <= q->bounce_pfn)
+		if (page_to_pfn(page) <= queue_bounce_pfn(q))
 			continue;
 
 		/*
@@ -284,7 +284,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 * don't waste time iterating over bio segments
 	 */
 	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (q->bounce_pfn >= blk_max_pfn)
+		if (queue_bounce_pfn(q) >= blk_max_pfn)
 			return;
 		pool = page_pool;
 	} else {
-- 
cgit v0.10.2


From 025146e13b63483add912706c101fb0fb6f015cc Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:51 -0400
Subject: block: Move queue limits to an embedded struct

To accommodate stacking drivers that do not have an associated request
queue we're moving the limits to a separate, embedded structure.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0b32f98..b0f547c 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -179,16 +179,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 	 */
 	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
 		dma = 1;
-	q->bounce_pfn = max_low_pfn;
+	q->limits.bounce_pfn = max_low_pfn;
 #else
 	if (b_pfn < blk_max_low_pfn)
 		dma = 1;
-	q->bounce_pfn = b_pfn;
+	q->limits.bounce_pfn = b_pfn;
 #endif
 	if (dma) {
 		init_emergency_isa_pool();
 		q->bounce_gfp = GFP_NOIO | GFP_DMA;
-		q->bounce_pfn = b_pfn;
+		q->limits.bounce_pfn = b_pfn;
 	}
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
@@ -211,10 +211,10 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
 	}
 
 	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->max_hw_sectors = q->max_sectors = max_sectors;
+		q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors;
 	else {
-		q->max_sectors = BLK_DEF_MAX_SECTORS;
-		q->max_hw_sectors = max_sectors;
+		q->limits.max_sectors = BLK_DEF_MAX_SECTORS;
+		q->limits.max_hw_sectors = max_sectors;
 	}
 }
 EXPORT_SYMBOL(blk_queue_max_sectors);
@@ -222,9 +222,9 @@ EXPORT_SYMBOL(blk_queue_max_sectors);
 void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
 {
 	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->max_hw_sectors = BLK_DEF_MAX_SECTORS;
+		q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS;
 	else
-		q->max_hw_sectors = max_sectors;
+		q->limits.max_hw_sectors = max_sectors;
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
@@ -247,7 +247,7 @@ void blk_queue_max_phys_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->max_phys_segments = max_segments;
+	q->limits.max_phys_segments = max_segments;
 }
 EXPORT_SYMBOL(blk_queue_max_phys_segments);
 
@@ -271,7 +271,7 @@ void blk_queue_max_hw_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->max_hw_segments = max_segments;
+	q->limits.max_hw_segments = max_segments;
 }
 EXPORT_SYMBOL(blk_queue_max_hw_segments);
 
@@ -292,7 +292,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 		       __func__, max_size);
 	}
 
-	q->max_segment_size = max_size;
+	q->limits.max_segment_size = max_size;
 }
 EXPORT_SYMBOL(blk_queue_max_segment_size);
 
@@ -308,7 +308,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
  **/
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
-	q->logical_block_size = size;
+	q->limits.logical_block_size = size;
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
@@ -325,14 +325,27 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 {
 	/* zero is "infinity" */
-	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
-	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
-	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
-
-	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
-	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
-	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
-	t->logical_block_size = max(t->logical_block_size, b->logical_block_size);
+	t->limits.max_sectors = min_not_zero(queue_max_sectors(t),
+					     queue_max_sectors(b));
+
+	t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t),
+						queue_max_hw_sectors(b));
+
+	t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t),
+						   queue_segment_boundary(b));
+
+	t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t),
+						   queue_max_phys_segments(b));
+
+	t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t),
+						 queue_max_hw_segments(b));
+
+	t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t),
+						  queue_max_segment_size(b));
+
+	t->limits.logical_block_size = max(queue_logical_block_size(t),
+					   queue_logical_block_size(b));
+
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
 	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
@@ -430,7 +443,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
 		       __func__, mask);
 	}
 
-	q->seg_boundary_mask = mask;
+	q->limits.seg_boundary_mask = mask;
 }
 EXPORT_SYMBOL(blk_queue_segment_boundary);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 29b48f7..b7bb6fd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -307,6 +307,21 @@ struct blk_cmd_filter {
 	struct kobject kobj;
 };
 
+struct queue_limits {
+	unsigned long		bounce_pfn;
+	unsigned long		seg_boundary_mask;
+
+	unsigned int		max_hw_sectors;
+	unsigned int		max_sectors;
+	unsigned int		max_segment_size;
+
+	unsigned short		logical_block_size;
+	unsigned short		max_hw_segments;
+	unsigned short		max_phys_segments;
+
+	unsigned char		no_cluster;
+};
+
 struct request_queue
 {
 	/*
@@ -358,7 +373,6 @@ struct request_queue
 	/*
 	 * queue needs bounce pages for pages above this limit
 	 */
-	unsigned long		bounce_pfn;
 	gfp_t			bounce_gfp;
 
 	/*
@@ -387,14 +401,6 @@ struct request_queue
 	unsigned int		nr_congestion_off;
 	unsigned int		nr_batching;
 
-	unsigned int		max_sectors;
-	unsigned int		max_hw_sectors;
-	unsigned short		max_phys_segments;
-	unsigned short		max_hw_segments;
-	unsigned short		logical_block_size;
-	unsigned int		max_segment_size;
-
-	unsigned long		seg_boundary_mask;
 	void			*dma_drain_buffer;
 	unsigned int		dma_drain_size;
 	unsigned int		dma_pad_mask;
@@ -410,6 +416,8 @@ struct request_queue
 	struct timer_list	timeout;
 	struct list_head	timeout_list;
 
+	struct queue_limits	limits;
+
 	/*
 	 * sg stuff
 	 */
@@ -991,45 +999,45 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 static inline unsigned long queue_bounce_pfn(struct request_queue *q)
 {
-	return q->bounce_pfn;
+	return q->limits.bounce_pfn;
 }
 
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
-	return q->seg_boundary_mask;
+	return q->limits.seg_boundary_mask;
 }
 
 static inline unsigned int queue_max_sectors(struct request_queue *q)
 {
-	return q->max_sectors;
+	return q->limits.max_sectors;
 }
 
 static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
 {
-	return q->max_hw_sectors;
+	return q->limits.max_hw_sectors;
 }
 
 static inline unsigned short queue_max_hw_segments(struct request_queue *q)
 {
-	return q->max_hw_segments;
+	return q->limits.max_hw_segments;
 }
 
 static inline unsigned short queue_max_phys_segments(struct request_queue *q)
 {
-	return q->max_phys_segments;
+	return q->limits.max_phys_segments;
 }
 
 static inline unsigned int queue_max_segment_size(struct request_queue *q)
 {
-	return q->max_segment_size;
+	return q->limits.max_segment_size;
 }
 
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
 
-	if (q && q->logical_block_size)
-		retval = q->logical_block_size;
+	if (q && q->limits.logical_block_size)
+		retval = q->limits.logical_block_size;
 
 	return retval;
 }
-- 
cgit v0.10.2


From cd43e26f071524647e660706b784ebcbefbd2e44 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:52 -0400
Subject: block: Expose stacked device queues in sysfs

Currently stacking devices do not have a queue directory in sysfs.
However, many of the I/O characteristics like sector size, maximum
request size, etc. are queue properties.

This patch enables the queue directory for MD/DM devices.  The elevator
code has been modified to deal with queues that do not have an I/O
scheduler.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 142a4ac..3ccdadb 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -395,9 +395,6 @@ int blk_register_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return -ENXIO;
 
-	if (!q->request_fn)
-		return 0;
-
 	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
 			  "%s", "queue");
 	if (ret < 0)
@@ -405,6 +402,9 @@ int blk_register_queue(struct gendisk *disk)
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
+	if (!q->request_fn)
+		return 0;
+
 	ret = elv_register_queue(q);
 	if (ret) {
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
diff --git a/block/elevator.c b/block/elevator.c
index ebee948..bd78f69 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -575,6 +575,9 @@ void elv_drain_elevator(struct request_queue *q)
  */
 void elv_quiesce_start(struct request_queue *q)
 {
+	if (!q->elevator)
+		return;
+
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
 
 	/*
@@ -1050,6 +1053,9 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
+	if (!q->elevator)
+		return count;
+
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	strstrip(elevator_name);
 
@@ -1073,10 +1079,15 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
 	struct elevator_queue *e = q->elevator;
-	struct elevator_type *elv = e->elevator_type;
+	struct elevator_type *elv;
 	struct elevator_type *__e;
 	int len = 0;
 
+	if (!q->elevator)
+		return sprintf(name, "none\n");
+
+	elv = e->elevator_type;
+
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
 		if (!strcmp(elv->elevator_name, __e->elevator_name))
-- 
cgit v0.10.2


From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:53 -0400
Subject: block: Export I/O topology for block devices and partitions

To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment.  This patch adds support for exposing
I/O topology characteristics as devices are stacked.

  logical_block_size is the smallest unit the device can address.

  physical_block_size indicates the smallest I/O the device can write
  without incurring a read-modify-write penalty.

  The io_min parameter is the smallest preferred I/O size reported by
  the device.  In many cases this is the same as the physical block
  size.  However, the io_min parameter can be scaled up when stacking
  (RAID5 chunk size > physical block size).

  The io_opt characteristic indicates the optimal I/O size reported by
  the device.  This is usually the stripe width for arrays.

  The alignment_offset parameter indicates the number of bytes the start
  of the device/partition is offset from the device's natural alignment.
  Partition tools and MD/DM utilities can use this to pad their offsets
  so filesystems start on proper boundaries.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 44f52a4..cbbd3e0 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -60,3 +60,62 @@ Description:
 		Indicates whether the block layer should automatically
 		generate checksums for write requests bound for
 		devices that support receiving integrity metadata.
+
+What:		/sys/block/<disk>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the device is
+		offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/<partition>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the partition
+		is offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/queue/logical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can
+		address.  It is typically 512 bytes.
+
+What:		/sys/block/<disk>/queue/physical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can write
+		without resorting to read-modify-write operation.  It is
+		usually the same as the logical block size but may be
+		bigger.  One example is SATA drives with 4KB sectors
+		that expose a 512-byte logical block size to the
+		operating system.
+
+What:		/sys/block/<disk>/queue/minimum_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a preferred minimum I/O size,
+		which is the smallest request the device can perform
+		without incurring a read-modify-write penalty.  For disk
+		drives this is often the physical block size.  For RAID
+		arrays it is often the stripe chunk size.
+
+What:		/sys/block/<disk>/queue/optimal_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report an optimal I/O size, which is
+		the device's preferred unit of receiving I/O.  This is
+		rarely reported for disk drives.  For RAID devices it is
+		usually the stripe width or the internal block size.
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b0f547c..5649f34 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
 	q->limits.logical_block_size = size;
+
+	if (q->limits.physical_block_size < size)
+		q->limits.physical_block_size = size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
+/**
+ * blk_queue_physical_block_size - set physical block size for the queue
+ * @q:  the request queue for the device
+ * @size:  the physical block size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible sector size that the
+ *   hardware can operate on without reverting to read-modify-write
+ *   operations.
+ */
+void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+{
+	q->limits.physical_block_size = size;
+
+	if (q->limits.physical_block_size < q->limits.logical_block_size)
+		q->limits.physical_block_size = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_physical_block_size);
+
+/**
+ * blk_queue_alignment_offset - set physical block alignment offset
+ * @q:	the request queue for the device
+ * @alignment:	alignment offset in bytes
+ *
+ * Description:
+ *   Some devices are naturally misaligned to compensate for things like
+ *   the legacy DOS partition table 63-sector offset.  Low-level drivers
+ *   should call this function for devices whose first sector is not
+ *   naturally aligned.
+ */
+void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
+{
+	q->limits.alignment_offset =
+		offset & (q->limits.physical_block_size - 1);
+	q->limits.misaligned = 0;
+}
+EXPORT_SYMBOL(blk_queue_alignment_offset);
+
+/**
+ * blk_queue_io_min - set minimum request size for the queue
+ * @q:	the request queue for the device
+ * @io_min:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Some devices have an internal block size bigger than the reported
+ *   hardware sector size.  This function can be used to signal the
+ *   smallest I/O the device can perform without incurring a performance
+ *   penalty.
+ */
+void blk_queue_io_min(struct request_queue *q, unsigned int min)
+{
+	q->limits.io_min = min;
+
+	if (q->limits.io_min < q->limits.logical_block_size)
+		q->limits.io_min = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_io_min);
+
+/**
+ * blk_queue_io_opt - set optimal request size for the queue
+ * @q:	the request queue for the device
+ * @io_opt:  optimal request size in bytes
+ *
+ * Description:
+ *   Drivers can call this function to set the preferred I/O request
+ *   size for devices that report such a value.
+ */
+void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
+{
+	q->limits.io_opt = opt;
+}
+EXPORT_SYMBOL(blk_queue_io_opt);
+
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
  */
@@ -358,6 +443,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
 /**
+ * blk_stack_limits - adjust queue_limits for stacked devices
+ * @t:	the stacking driver limits (top)
+ * @bdev:  the underlying queue limits (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges two queue_limit structs.  Returns 0 if alignment didn't
+ *    change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+		     sector_t offset)
+{
+	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+					    b->seg_boundary_mask);
+
+	t->max_phys_segments = min_not_zero(t->max_phys_segments,
+					    b->max_phys_segments);
+
+	t->max_hw_segments = min_not_zero(t->max_hw_segments,
+					  b->max_hw_segments);
+
+	t->max_segment_size = min_not_zero(t->max_segment_size,
+					   b->max_segment_size);
+
+	t->logical_block_size = max(t->logical_block_size,
+				    b->logical_block_size);
+
+	t->physical_block_size = max(t->physical_block_size,
+				     b->physical_block_size);
+
+	t->io_min = max(t->io_min, b->io_min);
+	t->no_cluster |= b->no_cluster;
+
+	/* Bottom device offset aligned? */
+	if (offset &&
+	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	/* If top has no alignment offset, inherit from bottom */
+	if (!t->alignment_offset)
+		t->alignment_offset =
+			b->alignment_offset & (b->physical_block_size - 1);
+
+	/* Top device aligned on logical block boundary? */
+	if (t->alignment_offset & (t->logical_block_size - 1)) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * disk_stack_limits - adjust queue limits for stacked drivers
+ * @t:	MD/DM gendisk (top)
+ * @bdev:  the underlying block device (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges the limits for two queues.  Returns 0 if alignment
+ *    didn't change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+		       sector_t offset)
+{
+	struct request_queue *t = disk->queue;
+	struct request_queue *b = bdev_get_queue(bdev);
+
+	offset += get_start_sect(bdev) << 9;
+
+	if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, top);
+		bdevname(bdev, bottom);
+
+		printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
+		       top, bottom);
+	}
+
+	if (!t->queue_lock)
+		WARN_ON_ONCE(1);
+	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(t->queue_lock, flags);
+		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
+		spin_unlock_irqrestore(t->queue_lock, flags);
+	}
+}
+EXPORT_SYMBOL(disk_stack_limits);
+
+/**
  * blk_queue_dma_pad - set pad mask
  * @q:     the request queue for the device
  * @mask:  pad mask
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ccdadb..9337e17 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page
 	return queue_var_show(queue_logical_block_size(q), page);
 }
 
+static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_physical_block_size(q), page);
+}
+
+static ssize_t queue_io_min_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_min(q), page);
+}
+
+static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_opt(q), page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = {
 	.show = queue_logical_block_size_show,
 };
 
+static struct queue_sysfs_entry queue_physical_block_size_entry = {
+	.attr = {.name = "physical_block_size", .mode = S_IRUGO },
+	.show = queue_physical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_io_min_entry = {
+	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.show = queue_io_min_show,
+};
+
+static struct queue_sysfs_entry queue_io_opt_entry = {
+	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.show = queue_io_opt_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = {
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
 	&queue_logical_block_size_entry.attr,
+	&queue_physical_block_size_entry.attr,
+	&queue_io_min_entry.attr,
+	&queue_io_opt_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 1a4916e..fe7ccc0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev,
 	return sprintf(buf, "%x\n", disk->flags);
 }
 
+static ssize_t disk_alignment_offset_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_removable.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef..0af3608 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
+ssize_t part_alignment_offset_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev = part_to_dev(p);
 
 	p->start_sect = start;
+	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7bb6fd..5e740a1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -314,11 +314,16 @@ struct queue_limits {
 	unsigned int		max_hw_sectors;
 	unsigned int		max_sectors;
 	unsigned int		max_segment_size;
+	unsigned int		physical_block_size;
+	unsigned int		alignment_offset;
+	unsigned int		io_min;
+	unsigned int		io_opt;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 
+	unsigned char		misaligned;
 	unsigned char		no_cluster;
 };
 
@@ -911,6 +916,15 @@ extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_alignment_offset(struct request_queue *q,
+				       unsigned int alignment);
+extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+			    sector_t offset);
+extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+			      sector_t offset);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -1047,6 +1061,39 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 
+static inline unsigned int queue_physical_block_size(struct request_queue *q)
+{
+	return q->limits.physical_block_size;
+}
+
+static inline unsigned int queue_io_min(struct request_queue *q)
+{
+	return q->limits.io_min;
+}
+
+static inline unsigned int queue_io_opt(struct request_queue *q)
+{
+	return q->limits.io_opt;
+}
+
+static inline int queue_alignment_offset(struct request_queue *q)
+{
+	if (q && q->limits.misaligned)
+		return -1;
+
+	if (q && q->limits.alignment_offset)
+		return q->limits.alignment_offset;
+
+	return 0;
+}
+
+static inline int queue_sector_alignment_offset(struct request_queue *q,
+						sector_t sector)
+{
+	return ((sector << 9) - q->limits.alignment_offset)
+		& (q->limits.io_min - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a1a28ca..149fda2 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -90,6 +90,7 @@ struct disk_stats {
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
+	sector_t alignment_offset;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
-- 
cgit v0.10.2


From 95caa0a9bdaf93607bd0cc8932f53112496f2f22 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@linux.vnet.ibm.com>
Date: Fri, 22 May 2009 21:30:39 -0300
Subject: icom: fix rmmod crash

Actually the icom driver is crashing when is being removed because
the driver is kfreeing the adapter structure before calling
pci_release_regions(), which result in the following error:

  Unable to handle kernel paging request for data at address 0x6b6b6b6b6b6b6d33
  Faulting instruction address: 0xc000000000246b80
  Oops: Kernel access of bad area, sig: 11 [#1]
  ....
  [c000000012d436a0] [c0000000001002d0] .kfree+0x120/0x34c (unreliable)
  [c000000012d43730] [c000000000246d60] .pci_release_selected_regions+0x3c/0x68
  [c000000012d437c0] [d000000002d54700] .icom_kref_release+0xf4/0x118 [icom]
  [c000000012d43850] [c000000000232e50] .kref_put+0x74/0x94
  [c000000012d438d0] [d000000002d56c58] .icom_remove+0x40/0xa4 [icom]
  [c000000012d43960] [c000000000249e48] .pci_device_remove+0x50/0x90
  [c000000012d439e0] [c0000000002d68d8] .__device_release_driver+0x94/0xd4
  [c000000012d43a70] [c0000000002d7104] .driver_detach+0xf8/0x12c
  [c000000012d43b00] [c0000000002d549c] .bus_remove_driver+0xbc/0x11c
  [c000000012d43b90] [c0000000002d71dc] .driver_unregister+0x60/0x80
  [c000000012d43c20] [c00000000024a07c] .pci_unregister_driver+0x44/0xe8
  [c000000012d43cb0] [d000000002d56bf4] .icom_exit+0x1c/0x40 [icom]
  [c000000012d43d30] [c000000000095fa8] .SyS_delete_module+0x214/0x2a8
  [c000000012d43e30] [c00000000000852c] syscall_exit+0x0/0x40

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
Cc: stable@kernel.org
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index 6579e2b..a461b3b 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -1472,8 +1472,8 @@ static void icom_remove_adapter(struct icom_adapter *icom_adapter)
 
 	free_irq(icom_adapter->pci_dev->irq, (void *) icom_adapter);
 	iounmap(icom_adapter->base_addr);
-	icom_free_adapter(icom_adapter);
 	pci_release_regions(icom_adapter->pci_dev);
+	icom_free_adapter(icom_adapter);
 }
 
 static void icom_kref_release(struct kref *kref)
-- 
cgit v0.10.2


From 3b77f777b8f1c001b63e317c4ce317292ff0ff94 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sat, 23 May 2009 08:23:16 +0200
Subject: ide-disk: fix missing max_sectors accessor function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recent move to accessor functions for querying queue limits
missed an entry in ide-disk.c:

drivers/ide/ide-disk.c: In function ‘ide_disk_setup’:
drivers/ide/ide-disk.c:642: error: ‘struct request_queue’ has no member named ‘max_sectors’

Fix it.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index ad18e14..c6f7fcf 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -639,7 +639,7 @@ static void ide_disk_setup(ide_drive_t *drive)
 	}
 
 	printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
-		q->max_sectors / 2);
+	       queue_max_sectors(q) / 2);
 
 	if (ata_id_is_ssd(id))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-- 
cgit v0.10.2


From 14f0aa359365e8a93a77b71e3b840274b9b4dcb1 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sat, 23 May 2009 11:36:20 +0100
Subject: [ARM] disable NX support for OABI-supporting kernels

Our signal syscall restart handling for these kernels still uses
the userspace stack to build code for restarting the syscall.
Unfortunately, fixing this is non-trivial, and so for the time
being, we resolve the problem by disabling NX support.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/kernel/elf.c b/arch/arm/kernel/elf.c
index d4a0da1..950391f 100644
--- a/arch/arm/kernel/elf.c
+++ b/arch/arm/kernel/elf.c
@@ -78,6 +78,15 @@ int arm_elf_read_implies_exec(const struct elf32_hdr *x, int executable_stack)
 		return 1;
 	if (cpu_architecture() < CPU_ARCH_ARMv6)
 		return 1;
+#if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT)
+	/*
+	 * If we have support for OABI programs, we can never allow NX
+	 * support - our signal syscall restart mechanism relies upon
+	 * being able to execute code placed on the user stack.
+	 */
+	return 1;
+#else
 	return 0;
+#endif
 }
 EXPORT_SYMBOL(arm_elf_read_implies_exec);
-- 
cgit v0.10.2


From 948cd52906baf1f92aeea2f9b5c515db1b2e592a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 10:40:09 +0900
Subject: sparseirq: Allow early irq_desc allocation

Presently non-legacy IRQs have their irq_desc allocated with
kzalloc_node(). This assumes that all callers of irq_to_desc_node_alloc()
will be sufficiently late in the boot process that kmalloc is available.

While porting sparseirq support to sh this blew up immediately, as at the
time that we register the CPU's interrupt vector map only bootmem is
available. Check slab_is_available() to work out which path to use.

[ Impact: fix SH early boot crash with sparseirq enabled ]

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
LKML-Reference: <20090522014008.GA2806@linux-sh.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a3c671e..18041a2 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
@@ -81,11 +82,16 @@ static struct irq_desc irq_desc_init = {
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 
-void init_kstat_irqs(struct irq_desc *desc, int node, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
 	void *ptr;
 
-	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+	if (slab_is_available())
+		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+				   GFP_ATOMIC, node);
+	else
+		ptr = alloc_bootmem_node(NODE_DATA(node),
+				nr * sizeof(*desc->kstat_irqs));
 
 	/*
 	 * don't overwite if can not get new one
@@ -186,7 +192,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
@@ -208,7 +214,11 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 	if (desc)
 		goto out_unlock;
 
-	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	if (slab_is_available())
+		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	else
+		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
 	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
 		printk(KERN_ERR "can not alloc irq_desc\n");
-- 
cgit v0.10.2


From df391e0eda1e678add56a8e34226edf05d89af6a Mon Sep 17 00:00:00 2001
From: Henrik Rydberg <rydberg@euromail.se>
Date: Sat, 23 May 2009 09:51:20 -0700
Subject: Input: multitouch - add tracking ID to the protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are a few multi-touch devices that support finger tracking
well in hardware, Stantum being the prime example. By exposing the
tracking ID in the MT protocol, evdev bandwidth and cpu usage in
user space can be reduced.

This patch adds the ABS_MT_TRACKING_ID to the MT protocol.

Signed-off-by: Henrik Rydberg <rydberg@euromail.se>
Tested-by: Stéphane Chatty <chatty@enac.fr>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>

diff --git a/drivers/input/input.c b/drivers/input/input.c
index e54e002..5d445f4 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -42,6 +42,7 @@ static unsigned int input_abs_bypass_init_data[] __initdata = {
 	ABS_MT_POSITION_Y,
 	ABS_MT_TOOL_TYPE,
 	ABS_MT_BLOB_ID,
+	ABS_MT_TRACKING_ID,
 	0
 };
 static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)];
diff --git a/include/linux/input.h b/include/linux/input.h
index 0e6ff5d..6fed4f6 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -656,6 +656,7 @@ struct input_absinfo {
 #define ABS_MT_POSITION_Y	0x36	/* Center Y ellipse position */
 #define ABS_MT_TOOL_TYPE	0x37	/* Type of touching device */
 #define ABS_MT_BLOB_ID		0x38	/* Group a set of packets as a blob */
+#define ABS_MT_TRACKING_ID	0x39	/* Unique ID of initiated contact */
 
 #define ABS_MAX			0x3f
 #define ABS_CNT			(ABS_MAX+1)
-- 
cgit v0.10.2


From f9fcfc3b4627a1ec9b50411060f1b384926d6610 Mon Sep 17 00:00:00 2001
From: Henrik Rydberg <rydberg@euromail.se>
Date: Sat, 23 May 2009 09:51:21 -0700
Subject: Input: multitouch - augment event semantics documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through the collaboration to adapt the N-trig and Stantum HID
drivers to the MT protocol, some semantic clarifications to the
protocol have been made. This patch adds them to the MT documentation.

Signed-off-by: Henrik Rydberg <rydberg@euromail.se>
Tested-by: Stéphane Chatty <chatty@enac.fr>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>

diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
index 9f09557..a12ea3b 100644
--- a/Documentation/input/multi-touch-protocol.txt
+++ b/Documentation/input/multi-touch-protocol.txt
@@ -18,8 +18,12 @@ Usage
 Anonymous finger details are sent sequentially as separate packets of ABS
 events. Only the ABS_MT events are recognized as part of a finger
 packet. The end of a packet is marked by calling the input_mt_sync()
-function, which generates a SYN_MT_REPORT event. The end of multi-touch
-transfer is marked by calling the usual input_sync() function.
+function, which generates a SYN_MT_REPORT event. This instructs the
+receiver to accept the data for the current finger and prepare to receive
+another. The end of a multi-touch transfer is marked by calling the usual
+input_sync() function. This instructs the receiver to act upon events
+accumulated since last EV_SYN/SYN_REPORT and prepare to receive a new
+set of events/packets.
 
 A set of ABS_MT events with the desired properties is defined. The events
 are divided into categories, to allow for partial implementation.  The
@@ -27,11 +31,26 @@ minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and
 ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked.  If the
 device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size
 of the approaching finger. Anisotropy and direction may be specified with
-ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. Devices with
-more granular information may specify general shapes as blobs, i.e., as a
-sequence of rectangular shapes grouped together by an
-ABS_MT_BLOB_ID. Finally, the ABS_MT_TOOL_TYPE may be used to specify
-whether the touching tool is a finger or a pen or something else.
+ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION.  The
+ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a
+finger or a pen or something else.  Devices with more granular information
+may specify general shapes as blobs, i.e., as a sequence of rectangular
+shapes grouped together by an ABS_MT_BLOB_ID. Finally, for the few devices
+that currently support it, the ABS_MT_TRACKING_ID event may be used to
+report finger tracking from hardware [5].
+
+Here is what a minimal event sequence for a two-finger touch would look
+like:
+
+   ABS_MT_TOUCH_MAJOR
+   ABS_MT_POSITION_X
+   ABS_MT_POSITION_Y
+   SYN_MT_REPORT
+   ABS_MT_TOUCH_MAJOR
+   ABS_MT_POSITION_X
+   ABS_MT_POSITION_Y
+   SYN_MT_REPORT
+   SYN_REPORT
 
 
 Event Semantics
@@ -44,24 +63,24 @@ ABS_MT_TOUCH_MAJOR
 
 The length of the major axis of the contact. The length should be given in
 surface units. If the surface has an X times Y resolution, the largest
-possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal.
+possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal [4].
 
 ABS_MT_TOUCH_MINOR
 
 The length, in surface units, of the minor axis of the contact. If the
-contact is circular, this event can be omitted.
+contact is circular, this event can be omitted [4].
 
 ABS_MT_WIDTH_MAJOR
 
 The length, in surface units, of the major axis of the approaching
 tool. This should be understood as the size of the tool itself. The
 orientation of the contact and the approaching tool are assumed to be the
-same.
+same [4].
 
 ABS_MT_WIDTH_MINOR
 
 The length, in surface units, of the minor axis of the approaching
-tool. Omit if circular.
+tool. Omit if circular [4].
 
 The above four values can be used to derive additional information about
 the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates
@@ -70,14 +89,17 @@ different characteristic widths [1].
 
 ABS_MT_ORIENTATION
 
-The orientation of the ellipse. The value should describe half a revolution
-clockwise around the touch center. The scale of the value is arbitrary, but
-zero should be returned for an ellipse aligned along the Y axis of the
-surface. As an example, an index finger placed straight onto the axis could
-return zero orientation, something negative when twisted to the left, and
-something positive when twisted to the right. This value can be omitted if
-the touching object is circular, or if the information is not available in
-the kernel driver.
+The orientation of the ellipse. The value should describe a signed quarter
+of a revolution clockwise around the touch center. The signed value range
+is arbitrary, but zero should be returned for a finger aligned along the Y
+axis of the surface, a negative value when finger is turned to the left, and
+a positive value when finger turned to the right. When completely aligned with
+the X axis, the range max should be returned.  Orientation can be omitted
+if the touching object is circular, or if the information is not available
+in the kernel driver. Partial orientation support is possible if the device
+can distinguish between the two axis, but not (uniquely) any values in
+between. In such cases, the range of ABS_MT_ORIENTATION should be [0, 1]
+[4].
 
 ABS_MT_POSITION_X
 
@@ -98,8 +120,35 @@ ABS_MT_BLOB_ID
 
 The BLOB_ID groups several packets together into one arbitrarily shaped
 contact. This is a low-level anonymous grouping, and should not be confused
-with the high-level contactID, explained below. Most kernel drivers will
-not have this capability, and can safely omit the event.
+with the high-level trackingID [5]. Most kernel drivers will not have blob
+capability, and can safely omit the event.
+
+ABS_MT_TRACKING_ID
+
+The TRACKING_ID identifies an initiated contact throughout its life cycle
+[5]. There are currently only a few devices that support it, so this event
+should normally be omitted.
+
+
+Event Computation
+-----------------
+
+The flora of different hardware unavoidably leads to some devices fitting
+better to the MT protocol than others. To simplify and unify the mapping,
+this section gives recipes for how to compute certain events.
+
+For devices reporting contacts as rectangular shapes, signed orientation
+cannot be obtained. Assuming X and Y are the lengths of the sides of the
+touching rectangle, here is a simple formula that retains the most
+information possible:
+
+   ABS_MT_TOUCH_MAJOR := max(X, Y)
+   ABS_MT_TOUCH_MINOR := min(X, Y)
+   ABS_MT_ORIENTATION := bool(X > Y)
+
+The range of ABS_MT_ORIENTATION should be set to [0, 1], to indicate that
+the device can distinguish between a finger along the Y axis (0) and a
+finger along the X axis (1).
 
 
 Finger Tracking
@@ -109,14 +158,18 @@ The kernel driver should generate an arbitrary enumeration of the set of
 anonymous contacts currently on the surface. The order in which the packets
 appear in the event stream is not important.
 
-The process of finger tracking, i.e., to assign a unique contactID to each
+The process of finger tracking, i.e., to assign a unique trackingID to each
 initiated contact on the surface, is left to user space; preferably the
-multi-touch X driver [3]. In that driver, the contactID stays the same and
+multi-touch X driver [3]. In that driver, the trackingID stays the same and
 unique until the contact vanishes (when the finger leaves the surface). The
 problem of assigning a set of anonymous fingers to a set of identified
 fingers is a euclidian bipartite matching problem at each event update, and
 relies on a sufficiently rapid update rate.
 
+There are a few devices that support trackingID in hardware. User space can
+make use of these native identifiers to reduce bandwidth and cpu usage.
+
+
 Notes
 -----
 
@@ -136,5 +189,7 @@ could be used to derive tilt.
 time of writing (April 2009), the MT protocol is not yet merged, and the
 prototype implements finger matching, basic mouse support and two-finger
 scrolling. The project aims at improving the quality of current multi-touch
-functionality available in the synaptics X driver, and in addition
+functionality available in the Synaptics X driver, and in addition
 implement more advanced gestures.
+[4] See the section on event computation.
+[5] See the section on finger tracking.
-- 
cgit v0.10.2


From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:55 +0200
Subject: perf_counter: Fix dynamic irq_period logging

We call perf_adjust_freq() from perf_counter_task_tick() which
is is called under the rq->lock causing lock recursion.
However, it's no longer required to be called under the
rq->lock, so remove it from under it.

Also, fix up some related comments.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.476197912@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2eedae8..23ddd29 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -260,6 +260,7 @@ enum perf_event_type {
 	/*
 	 * struct {
 	 * 	struct perf_event_header	header;
+	 * 	u64				time;
 	 * 	u64				irq_period;
 	 * };
 	 */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c100554..2f410ea 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
- *
+ * Log irq_period changes so that analyzing tools can re-normalize the
+ * event flow.
  */
 
 static void perf_log_period(struct perf_counter *counter, u64 period)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4c0d58b..ad079f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4875,9 +4875,10 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
+	perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
-- 
cgit v0.10.2


From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:56 +0200
Subject: perf_counter: Sanitize counter->mutex

s/counter->mutex/counter->child_mutex/ and make sure its only
used to protect child_list.

The usage in __perf_counter_exit_task() doesn't appear to be
problematic since ctx->mutex also covers anything related to fd
tear-down.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.533186528@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 23ddd29..4ab8050 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -452,9 +452,6 @@ struct perf_counter {
 	struct perf_counter_context	*ctx;
 	struct file			*filp;
 
-	struct perf_counter		*parent;
-	struct list_head		child_list;
-
 	/*
 	 * These accumulate total time (in nanoseconds) that children
 	 * counters have been enabled and running, respectively.
@@ -465,7 +462,9 @@ struct perf_counter {
 	/*
 	 * Protect attach/detach and child_list:
 	 */
-	struct mutex			mutex;
+	struct mutex			child_mutex;
+	struct list_head		child_list;
+	struct perf_counter		*parent;
 
 	int				oncpu;
 	int				cpu;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2f410ea..679c3b5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+/*
+ * Add a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 /*
  * Remove a counter from the lists for its context.
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex and ctx->lock held.
  */
 static void
 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	file->private_data = NULL;
 
 	mutex_lock(&ctx->mutex);
-	mutex_lock(&counter->mutex);
-
 	perf_counter_remove_from_context(counter);
-
-	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
 	free_counter(counter);
@@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
 		return -EINVAL;
@@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
 		func(child);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 }
 
 static void perf_counter_for_each(struct perf_counter *counter,
@@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
 		perf_counter_for_each_sibling(child, func);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	if (!group_leader)
 		group_leader = counter;
 
-	mutex_init(&counter->mutex);
+	mutex_init(&counter->child_mutex);
+	INIT_LIST_HEAD(&counter->child_list);
+
 	INIT_LIST_HEAD(&counter->list_entry);
 	INIT_LIST_HEAD(&counter->event_entry);
 	INIT_LIST_HEAD(&counter->sibling_list);
@@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	mutex_init(&counter->mmap_mutex);
 
-	INIT_LIST_HEAD(&counter->child_list);
-
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->group_leader		= group_leader;
@@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
-	mutex_lock(&parent_counter->mutex);
+	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-
-	mutex_unlock(&parent_counter->mutex);
+	mutex_unlock(&parent_counter->child_mutex);
 
 	return child_counter;
 }
@@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
-	mutex_lock(&parent_counter->mutex);
+	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
-	mutex_unlock(&parent_counter->mutex);
+	mutex_unlock(&parent_counter->child_mutex);
 
 	/*
 	 * Release the parent counter, if this was the last
@@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 
-	/*
-	 * Protect against concurrent operations on child_counter
-	 * due its fd getting closed, etc.
-	 */
-	mutex_lock(&child_counter->mutex);
-
 	update_counter_times(child_counter);
 	list_del_counter(child_counter, child_ctx);
 
-	mutex_unlock(&child_counter->mutex);
-
 	parent_counter = child_counter->parent;
 	/*
 	 * It can happen that parent exits first, and has counters
-- 
cgit v0.10.2


From 682076ae1de0aba9c2da509f7b19dc03e30a6e1f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:57 +0200
Subject: perf_counter: Sanitize context locking

Ensure we're consistent with the context locks.

 context->mutex
   context->lock
     list_{add,del}_counter();

so that either lock is sufficient to stabilize the context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.618790733@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 679c3b5..d162d2f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -597,6 +597,8 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 
 /*
  * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
  */
 static void __perf_install_in_context(void *info)
 {
@@ -1496,13 +1498,13 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
-	spin_lock_irq(&ctx->lock);
+	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
 	func(counter);
 	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
 		func(sibling);
-	spin_unlock_irq(&ctx->lock);
+	mutex_unlock(&ctx->mutex);
 }
 
 static void perf_counter_for_each_child(struct perf_counter *counter,
@@ -3414,7 +3416,10 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	update_counter_times(child_counter);
+
+	spin_lock_irq(&child_ctx->lock);
 	list_del_counter(child_counter, child_ctx);
+	spin_unlock_irq(&child_ctx->lock);
 
 	parent_counter = child_counter->parent;
 	/*
-- 
cgit v0.10.2


From 1a482f38c5aafeb3576079a38a5b21b46619f3d2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:58 +0200
Subject: perf_counter: Fix userspace build

recent userspace (F11) seems to already include the
linux/unistd.h bits which means we cannot include the version
in the kernel sources due to the header guards being the same.

Ensure we include the kernel version first.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.739756497@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index efb8759..1b19f18 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,5 +1,6 @@
 
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <sys/types.h>
@@ -30,9 +31,7 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
 
-#include "perf.h"
 
 #define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
 #define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 03518d7..8ae01d5 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -61,6 +61,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
@@ -83,10 +84,6 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
-
-#include "perf.h"
-
 #define EVENT_MASK_KERNEL		1
 #define EVENT_MASK_USER			2
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 814b2e4..a3216a6 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,6 +42,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
@@ -64,10 +65,6 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
-
-#include "perf.h"
-
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 81a7374..a517683 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -1,6 +1,25 @@
 #ifndef _PERF_PERF_H
 #define _PERF_PERF_H
 
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
 /*
  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
  * counters in the current task.
@@ -26,18 +45,6 @@ static inline unsigned long long rdclock(void)
 #define __user
 #define asmlinkage
 
-#if defined(__x86_64__) || defined(__i386__)
-#include "../../arch/x86/include/asm/unistd.h"
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#include "../../arch/powerpc/include/asm/unistd.h"
-#define rmb()		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
 #define unlikely(x)	__builtin_expect(!!(x), 0)
 #define min(x, y) ({				\
 	typeof(x) _min1 = (x);			\
-- 
cgit v0.10.2


From aa9c67f53d1969cf1db4c9c2db3a78c4ceb96469 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:59 +0200
Subject: perf_counter: Simplify context cleanup

Use perf_counter_remove_from_context() to remove counters from
the context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.796275849@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d162d2f..0e97f89 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3416,10 +3416,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	update_counter_times(child_counter);
-
-	spin_lock_irq(&child_ctx->lock);
-	list_del_counter(child_counter, child_ctx);
-	spin_unlock_irq(&child_ctx->lock);
+	perf_counter_remove_from_context(child_counter);
 
 	parent_counter = child_counter->parent;
 	/*
-- 
cgit v0.10.2


From 8db14ca12569fe885694bd3d5ff84c2d973d3cb0 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 23 May 2009 18:57:25 +0000
Subject: [CIFS] Avoid open on possible directories since Samba now rejects
 them

Small change (mostly formatting) to limit lookup based open calls to
file create only.

After discussion yesteday on samba-technical about the posix lookup
regression,  and looking at a problem with cifs posix open to one
particular Samba version, Jeff and JRA realized that Samba server's
behavior changed in this area (posix open behavior on files vs.
directories).   To make this behavior consistent, JRA just made a
fix to Samba server to alter how it handles open of directories (now
returning the equivalent of EISDIR instead of success). Since we don't
know at lookup time whether the inode is a directory or file (and
thus whether posix open will succeed with most current Samba server),
this change avoids the posix open code on lookup open (just issues
posix open on creates).    This gets the semantic benefits we want
(atomicity, posix byte range locks, improved write semantics on newly
created files) and file create still is fast, and we avoid the problem
that Jeff noticed yesterday with "openat" (and some open directory
calls) of non-cached directories to one version of Samba server, and
will work with future Samba versions (which include the fix jra just
pushed into Samba server).  I confirmed this approach with jra
yesterday and with Shirish today.

Posix open is only called (at lookup time) for file create now.
For opens (rather than creates), because we do not know if it
is a file or directory yet, and current Samba no longer allows
us to do posix open on dirs, we could end up wasting an open call
on what turns out to be a dir. For file opens, we wait to call posix
open till cifs_open.  It could be added here (lookup) in the future
but the performance tradeoff of the extra network request when EISDIR
or EACCES is returned would have to be weighed against the 50%
reduction in network traffic in the other paths.

Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Tested-by: Jeff Layton <jlayton@redhat.com>
CC: Jeremy Allison <jra@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f49d684..3758965d 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -657,31 +657,36 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 	cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
 
+	/* Posix open is only called (at lookup time) for file create now.
+	 * For opens (rather than creates), because we do not know if it
+	 * is a file or directory yet, and current Samba no longer allows
+	 * us to do posix open on dirs, we could end up wasting an open call
+	 * on what turns out to be a dir. For file opens, we wait to call posix
+	 * open till cifs_open.  It could be added here (lookup) in the future
+	 * but the performance tradeoff of the extra network request when EISDIR
+	 * or EACCES is returned would have to be weighed against the 50%
+	 * reduction in network traffic in the other paths.
+	 */
 	if (pTcon->unix_ext) {
 		if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
-		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) {
-			if (!((nd->intent.open.flags & O_CREAT) &&
-					(nd->intent.open.flags & O_EXCL))) {
-				rc = cifs_posix_open(full_path, &newInode,
+		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
+		     (nd->intent.open.flags & O_CREAT)) {
+			rc = cifs_posix_open(full_path, &newInode,
 					parent_dir_inode->i_sb,
 					nd->intent.open.create_mode,
 					nd->intent.open.flags, &oplock,
 					&fileHandle, xid);
-				/*
-				 * This code works around a bug in
-				 * samba posix open in samba versions 3.3.1
-				 * and earlier where create works
-				 * but open fails with invalid parameter.
-				 * If either of these error codes are
-				 * returned, follow the normal lookup.
-				 * Otherwise, the error during posix open
-				 * is handled.
-				 */
-				if ((rc != -EINVAL) && (rc != -EOPNOTSUPP))
-					posix_open = true;
-				else
-					pTcon->broken_posix_open = true;
-			}
+			/*
+			 * The check below works around a bug in POSIX
+			 * open in samba versions 3.3.1 and earlier where
+			 * open could incorrectly fail with invalid parameter.
+			 * If either that or op not supported returned, follow
+			 * the normal lookup.
+			 */
+			if ((rc == 0) || (rc == -ENOENT))
+				posix_open = true;
+			else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
+				pTcon->broken_posix_open = true;
 		}
 		if (!posix_open)
 			rc = cifs_get_inode_info_unix(&newInode, full_path,
-- 
cgit v0.10.2


From 59a3759d0fe8d969888c741bb33f4946e4d3750d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 23 May 2009 14:47:00 -0700
Subject: Linux 2.6.30-rc7


diff --git a/Makefile b/Makefile
index b57e1f5..739fd34 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 30
-EXTRAVERSION = -rc6
-NAME = Vindictive Armadillo
+EXTRAVERSION = -rc7
+NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
-- 
cgit v0.10.2


From 79f52b77b89e8b7aa9fbe62135eea198a2ecbd5b Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Sat, 23 May 2009 20:28:41 -0500
Subject: jfs: Add missing mutex_unlock call to error path

Jan Kucera found an missing call to mutex_unlock() with his static code
checker.  It's an unlikely error path to hit in the real world, but it
should be fixed.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Reported-by: Jan Kucera <kucera.jan.cz@gmail.com>

diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3460572..0fc3040 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 
 			txAbort(tid, 0);
 			txEnd(tid);
+			mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
 
 			/* release the inode map lock */
 			IWRITE_UNLOCK(ipimap);
-- 
cgit v0.10.2


From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:29:00 +0200
Subject: perf_counter: Change pctrl() behaviour

Instead of en/dis-abling all counters acting on a particular
task, en/dis- able all counters we created.

[ v2: fix crash on first counter enable ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.916937244@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d..353c0ac 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,15 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)					\
+	.perf_counter_mutex = 						\
+		 __MUTEX_INITIALIZER(tsk.perf_counter_mutex),		\
+	.perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +180,7 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
+	INIT_PERF_COUNTERS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4ab8050..4159ee5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -469,6 +469,9 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
+	struct list_head		owner_entry;
+	struct task_struct		*owner;
+
 	/* mmap bits */
 	struct mutex			mmap_mutex;
 	atomic_t			mmap_count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9714d45..bc9326d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,8 @@ struct task_struct {
 #endif
 #ifdef CONFIG_PERF_COUNTERS
 	struct perf_counter_context *perf_counter_ctxp;
+	struct mutex perf_counter_mutex;
+	struct list_head perf_counter_list;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0e97f89..4c86a63 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
-int perf_counter_task_disable(void)
+int perf_counter_task_enable(void)
 {
-	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
-	unsigned long flags;
-
-	if (!ctx || !ctx->nr_counters)
-		return 0;
-
-	local_irq_save(flags);
 
-	__perf_counter_task_sched_out(ctx);
-
-	spin_lock(&ctx->lock);
-
-	/*
-	 * Disable all the counters:
-	 */
-	perf_disable();
-
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR) {
-			update_group_times(counter);
-			counter->state = PERF_COUNTER_STATE_OFF;
-		}
-	}
-
-	perf_enable();
-
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_enable(counter);
+	mutex_unlock(&current->perf_counter_mutex);
 
 	return 0;
 }
 
-int perf_counter_task_enable(void)
+int perf_counter_task_disable(void)
 {
-	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
-	unsigned long flags;
-	int cpu;
-
-	if (!ctx || !ctx->nr_counters)
-		return 0;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-
-	__perf_counter_task_sched_out(ctx);
-
-	spin_lock(&ctx->lock);
 
-	/*
-	 * Disable all the counters:
-	 */
-	perf_disable();
-
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state > PERF_COUNTER_STATE_OFF)
-			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-		counter->hw_event.disabled = 0;
-	}
-	perf_enable();
-
-	spin_unlock(&ctx->lock);
-
-	perf_counter_task_sched_in(curr, cpu);
-
-	local_irq_restore(flags);
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_disable(counter);
+	mutex_unlock(&current->perf_counter_mutex);
 
 	return 0;
 }
@@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file)
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
 
+	mutex_lock(&counter->owner->perf_counter_mutex);
+	list_del_init(&counter->owner_entry);
+	mutex_unlock(&counter->owner->perf_counter_mutex);
+	put_task_struct(counter->owner);
+
 	free_counter(counter);
 	put_context(ctx);
 
@@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open,
 	perf_install_in_context(ctx, counter, cpu);
 	mutex_unlock(&ctx->mutex);
 
+	counter->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_counter_mutex);
+	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+	mutex_unlock(&current->perf_counter_mutex);
+
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
@@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child)
 
 	child->perf_counter_ctxp = NULL;
 
+	mutex_init(&child->perf_counter_mutex);
+	INIT_LIST_HEAD(&child->perf_counter_list);
+
 	/*
 	 * This is executed from the parent task context, so inherit
 	 * counters that have been marked for cloning.
-- 
cgit v0.10.2


From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:29:01 +0200
Subject: perf_counter: Remove perf_counter_context::nr_enabled

now that pctrl() no longer disables other people's counters,
remove the PMU cache code that deals with that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163013.032998331@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4159ee5..2ddf5e3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -516,7 +516,6 @@ struct perf_counter_context {
 	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
-	int			nr_enabled;
 	int			is_active;
 	atomic_t		refcount;
 	struct task_struct	*task;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c86a63..cb40625 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		ctx->nr_enabled++;
 }
 
 /*
@@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		ctx->nr_enabled--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info)
 		else
 			counter_sched_out(counter, cpuctx, ctx);
 		counter->state = PERF_COUNTER_STATE_OFF;
-		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
-		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irq(&ctx->lock);
@@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-	ctx->nr_enabled++;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter)
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->tstamp_enabled =
 			ctx->time - counter->total_time_enabled;
-		ctx->nr_enabled++;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
 			 struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& ctx1->nr_enabled == ctx2->nr_enabled;
+		&& ctx1->parent_gen == ctx2->parent_gen;
 }
 
 /*
-- 
cgit v0.10.2


From c2990a2a582d73562d4dcf2502c39892a19a691d Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 24 May 2009 08:35:49 +0200
Subject: perf top: fix segfault

c6eb13 increased stack usage such that perf-top now croaks on startup.

Take event_array and mmap_array off the stack to prevent segfault on boxen
with smallish ulimit -s setting.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a3216a6..74021ac 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -1035,10 +1035,11 @@ static void mmap_read(struct mmap_data *md)
 	md->prev = old;
 }
 
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
 int cmd_top(int argc, char **argv, const char *prefix)
 {
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
-- 
cgit v0.10.2


From a3862d3f814ce7dfca9eed56ac23d29db3aee8d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 24 May 2009 09:02:37 +0200
Subject: perf_counter: Increase mmap limit

In a default 'perf top' run the tool will create a counter for
each online CPU. With enough CPUs this will eventually exhaust
the default limit.

So scale it up with the number of online CPUs.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cb40625..6cdf824 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1704,6 +1704,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	user_extra = nr_pages + 1;
 	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	user_lock_limit *= num_online_cpus();
+
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
 	extra = 0;
-- 
cgit v0.10.2


From 93574844bc3906941b89d6b6f72e01e87413f3c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ozan=20=C3=87a=C4=9Flayan?= <ozan@pardus.org.tr>
Date: Sat, 23 May 2009 15:00:04 +0300
Subject: ALSA: hda - Add forced codec-slots for ASUS W5Fm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASUS W5Fm needs the fixed codec-slots to probe to override the BIOS
problem like W5F.

Tested-by: Alp Kılıç <kilic.alp@gmail.com>
Signed-off-by: Ozan Çağlayan <ozan@pardus.org.tr>
Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 21e99cf..3128e1a 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2141,6 +2141,7 @@ static struct snd_pci_quirk probe_mask_list[] __devinitdata = {
 	/* including bogus ALC268 in slot#2 that conflicts with ALC888 */
 	SND_PCI_QUIRK(0x17c0, 0x4085, "Medion MD96630", 0x01),
 	/* forced codec slots */
+	SND_PCI_QUIRK(0x1043, 0x1262, "ASUS W5Fm", 0x103),
 	SND_PCI_QUIRK(0x1046, 0x1262, "ASUS W5F", 0x103),
 	{}
 };
-- 
cgit v0.10.2


From 679d92ed1403b6cf9a19aa42ec62b81cae1aa017 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Sun, 24 May 2009 19:00:08 +0200
Subject: ALSA: hda - Add 5stack-no-fp model for STAC927x

The recent fix for the headphone volume control on IDT/STAC codecs
resulted in the removal of invalid "Side" volume eventually.  But,
if the front panel doesn't exist, this setup could be regarded as a
sort of regression, as reported in kernel bug #13250.

Now as a workaround, a new model 5stack-no-fp is added so that the user
without the front panel can choose this one explicitly.

Reference: bko#13250
	http://bugzilla.kernel.org/show_bug.cgi?id=13250

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 8eec05b..322869f 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -334,6 +334,7 @@ STAC9227/9228/9229/927x
   ref-no-jd	Reference board without HP/Mic jack detection
   3stack	D965 3stack
   5stack	D965 5stack + SPDIF
+  5stack-no-fp	D965 5stack without front panel
   dell-3stack	Dell Dimension E520
   dell-bios	Fixes with Dell BIOS setup
   auto		BIOS setup (default)
diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 03b3646..d2fd8ef 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -150,6 +150,7 @@ enum {
 	STAC_D965_REF,
 	STAC_D965_3ST,
 	STAC_D965_5ST,
+	STAC_D965_5ST_NO_FP,
 	STAC_DELL_3ST,
 	STAC_DELL_BIOS,
 	STAC_927X_MODELS
@@ -2154,6 +2155,13 @@ static unsigned int d965_5st_pin_configs[14] = {
 	0x40000100, 0x40000100
 };
 
+static unsigned int d965_5st_no_fp_pin_configs[14] = {
+	0x40000100, 0x40000100, 0x0181304e, 0x01014010,
+	0x01a19040, 0x01011012, 0x01016011, 0x40000100,
+	0x40000100, 0x40000100, 0x40000100, 0x01442070,
+	0x40000100, 0x40000100
+};
+
 static unsigned int dell_3st_pin_configs[14] = {
 	0x02211230, 0x02a11220, 0x01a19040, 0x01114210,
 	0x01111212, 0x01116211, 0x01813050, 0x01112214,
@@ -2166,6 +2174,7 @@ static unsigned int *stac927x_brd_tbl[STAC_927X_MODELS] = {
 	[STAC_D965_REF]  = ref927x_pin_configs,
 	[STAC_D965_3ST]  = d965_3st_pin_configs,
 	[STAC_D965_5ST]  = d965_5st_pin_configs,
+	[STAC_D965_5ST_NO_FP]  = d965_5st_no_fp_pin_configs,
 	[STAC_DELL_3ST]  = dell_3st_pin_configs,
 	[STAC_DELL_BIOS] = NULL,
 };
@@ -2176,6 +2185,7 @@ static const char *stac927x_models[STAC_927X_MODELS] = {
 	[STAC_D965_REF]		= "ref",
 	[STAC_D965_3ST]		= "3stack",
 	[STAC_D965_5ST]		= "5stack",
+	[STAC_D965_5ST_NO_FP]	= "5stack-no-fp",
 	[STAC_DELL_3ST]		= "dell-3stack",
 	[STAC_DELL_BIOS]	= "dell-bios",
 };
-- 
cgit v0.10.2


From 657cafa6b0f5296424d6f43f6f6eeb4a3222117e Mon Sep 17 00:00:00 2001
From: Alex Riesen <raa.lkml@gmail.com>
Date: Sun, 24 May 2009 15:30:48 +0200
Subject: Use a format for linux_banner

There is no format specifiers left in the linux_banner, and gcc-4.3
complains seeing the printk.

Signed-off-by: Alex Riesen <raa.lkml@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/init/main.c b/init/main.c
index 3bbf93b..d721dad 100644
--- a/init/main.c
+++ b/init/main.c
@@ -566,8 +566,7 @@ asmlinkage void __init start_kernel(void)
 	tick_init();
 	boot_cpu_init();
 	page_address_init();
-	printk(KERN_NOTICE);
-	printk(linux_banner);
+	printk(KERN_NOTICE "%s", linux_banner);
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
-- 
cgit v0.10.2


From 32bdfac5462d777f35b00838893c4f87baf23efe Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 24 May 2009 21:15:07 +0200
Subject: PM: Do not hold dpm_list_mtx while disabling/enabling nonboot CPUs

We shouldn't hold dpm_list_mtx while executing
[disable|enable]_nonboot_cpus(), because theoretically this may lead
to a deadlock as shown by the following example (provided by Johannes
Berg):

CPU 3       CPU 2                     CPU 1
                                      suspend/hibernate
            something:
            rtnl_lock()               device_pm_lock()
                                       -> mutex_lock(&dpm_list_mtx)

            mutex_lock(&dpm_list_mtx)

linkwatch_work
 -> rtnl_lock()
                                      disable_nonboot_cpus()
                                       -> flush CPU 3 workqueue

Fortunately, device drivers are supposed to stop any activities that
might lead to the registration of new device objects way before
disable_nonboot_cpus() is called, so it shouldn't be necessary to
hold dpm_list_mtx over the entire late part of device suspend and
early part of device resume.

Thus, during the late suspend and the early resume of devices acquire
dpm_list_mtx only when dpm_list is going to be traversed and release
it right after that.

This patch is reported to fix the regressions tracked as
http://bugzilla.kernel.org/show_bug.cgi?id=13245.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Reported-by: Miles Lane <miles.lane@gmail.com>
Tested-by: Ming Lei <tom.leiming@gmail.com>

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 69b4ddb..3e4bc69 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -357,6 +357,7 @@ static void dpm_power_up(pm_message_t state)
 {
 	struct device *dev;
 
+	mutex_lock(&dpm_list_mtx);
 	list_for_each_entry(dev, &dpm_list, power.entry)
 		if (dev->power.status > DPM_OFF) {
 			int error;
@@ -366,6 +367,7 @@ static void dpm_power_up(pm_message_t state)
 			if (error)
 				pm_dev_err(dev, state, " early", error);
 		}
+	mutex_unlock(&dpm_list_mtx);
 }
 
 /**
@@ -614,6 +616,7 @@ int device_power_down(pm_message_t state)
 	int error = 0;
 
 	suspend_device_irqs();
+	mutex_lock(&dpm_list_mtx);
 	list_for_each_entry_reverse(dev, &dpm_list, power.entry) {
 		error = suspend_device_noirq(dev, state);
 		if (error) {
@@ -622,6 +625,7 @@ int device_power_down(pm_message_t state)
 		}
 		dev->power.status = DPM_OFF_IRQ;
 	}
+	mutex_unlock(&dpm_list_mtx);
 	if (error)
 		device_power_up(resume_event(state));
 	return error;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5a758c6..e498377 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1451,7 +1451,6 @@ int kernel_kexec(void)
 		error = device_suspend(PMSG_FREEZE);
 		if (error)
 			goto Resume_console;
-		device_pm_lock();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
 		 * device_power_down() now.  Otherwise, drivers for
@@ -1489,7 +1488,6 @@ int kernel_kexec(void)
 		enable_nonboot_cpus();
 		device_power_up(PMSG_RESTORE);
  Resume_devices:
-		device_pm_unlock();
 		device_resume(PMSG_RESTORE);
  Resume_console:
 		resume_console();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b0dc9e7..5cb080e 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -215,8 +215,6 @@ static int create_image(int platform_mode)
 	if (error)
 		return error;
 
-	device_pm_lock();
-
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
 	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -227,7 +225,7 @@ static int create_image(int platform_mode)
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
-		goto Unlock;
+		return error;
 	}
 
 	error = platform_pre_snapshot(platform_mode);
@@ -280,9 +278,6 @@ static int create_image(int platform_mode)
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
- Unlock:
-	device_pm_unlock();
-
 	return error;
 }
 
@@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
-	device_pm_lock();
-
 	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
-		goto Unlock;
+		return error;
 	}
 
 	error = platform_pre_restore(platform_mode);
@@ -403,9 +396,6 @@ static int resume_target_kernel(bool platform_mode)
 
 	device_power_up(PMSG_RECOVER);
 
- Unlock:
-	device_pm_unlock();
-
 	return error;
 }
 
@@ -464,11 +454,9 @@ int hibernation_platform_enter(void)
 		goto Resume_devices;
 	}
 
-	device_pm_lock();
-
 	error = device_power_down(PMSG_HIBERNATE);
 	if (error)
-		goto Unlock;
+		goto Resume_devices;
 
 	error = hibernation_ops->prepare();
 	if (error)
@@ -493,9 +481,6 @@ int hibernation_platform_enter(void)
 
 	device_power_up(PMSG_RESTORE);
 
- Unlock:
-	device_pm_unlock();
-
  Resume_devices:
 	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f99ed6a..8680282 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -289,12 +289,10 @@ static int suspend_enter(suspend_state_t state)
 {
 	int error;
 
-	device_pm_lock();
-
 	if (suspend_ops->prepare) {
 		error = suspend_ops->prepare();
 		if (error)
-			goto Done;
+			return error;
 	}
 
 	error = device_power_down(PMSG_SUSPEND);
@@ -343,9 +341,6 @@ static int suspend_enter(suspend_state_t state)
 	if (suspend_ops->finish)
 		suspend_ops->finish();
 
- Done:
-	device_pm_unlock();
-
 	return error;
 }
 
-- 
cgit v0.10.2


From d5a877e8dd409d8c702986d06485c374b705d340 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sun, 24 May 2009 13:03:43 -0700
Subject: async: make sure independent async domains can't accidentally
 entangle

The problem occurs when async_synchronize_full_domain() is called when
the async_pending list is not empty.  This will cause lowest_running()
to return the cookie of the first entry on the async_pending list, which
might be nothing at all to do with the domain being asked for and thus
cause the domain synchronization to wait for an unrelated domain.   This
can cause a deadlock if domain synchronization is used from one domain
to wait for another.

Fix by running over the async_pending list to see if any pending items
actually belong to our domain (and return their cookies if they do).

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/async.c b/kernel/async.c
index 968ef94..5054030 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -92,19 +92,23 @@ extern int initcall_debug;
 static async_cookie_t  __lowest_in_progress(struct list_head *running)
 {
 	struct async_entry *entry;
+	async_cookie_t ret = next_cookie; /* begin with "infinity" value */
+
 	if (!list_empty(running)) {
 		entry = list_first_entry(running,
 			struct async_entry, list);
-		return entry->cookie;
-	} else if (!list_empty(&async_pending)) {
-		entry = list_first_entry(&async_pending,
-			struct async_entry, list);
-		return entry->cookie;
-	} else {
-		/* nothing in progress... next_cookie is "infinity" */
-		return next_cookie;
+		ret = entry->cookie;
 	}
 
+	if (!list_empty(&async_pending)) {
+		list_for_each_entry(entry, &async_pending, list)
+			if (entry->running == running) {
+				ret = entry->cookie;
+				break;
+			}
+	}
+
+	return ret;
 }
 
 static async_cookie_t  lowest_in_progress(struct list_head *running)
-- 
cgit v0.10.2


From 71c9d8b68b299bef614afc7907393564a9f1476f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 May 2009 12:01:59 +0900
Subject: x86: Remove remap percpu allocator for the time being

Remap percpu allocator has subtle bug when combined with page
attribute changing.  Remap percpu allocator aliases PMD pages for the
first chunk and as pageattr doesn't know about the alias it ends up
updating page attributes of the original mapping thus leaving the
alises in inconsistent state which might lead to subtle data
corruption.  Please read the following threads for more information:

  http://thread.gmane.org/gmane.linux.kernel/835783

The following is the proposed fix which teaches pageattr about percpu
aliases.

  http://thread.gmane.org/gmane.linux.kernel/837157

However, the above changes are deemed too pervasive for upstream
inclusion for 2.6.30 release, so this patch essentially disables
the remap allocator for the time being.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <4A1A0A27.4050301@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4c..8f0e13b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 	/*
 	 * If large page isn't supported, there's no benefit in doing
 	 * this.  Also, on non-NUMA, embedding is better.
+	 *
+	 * NOTE: disabled for now.
 	 */
-	if (!cpu_has_pse || !pcpu_need_numa())
+	if (true || !cpu_has_pse || !pcpu_need_numa())
 		return -EINVAL;
 
 	/*
-- 
cgit v0.10.2


From 461c6c3a0a23a8fac1a4b636e93ff5dfe599a241 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 25 May 2009 08:06:02 +0200
Subject: ALSA: hda - Add missing check of pin vref 50 and others in Realtek
 codecs

Some Realtek codecs like ALC861 seem to support only VREF50 while the
current driver assumes it's only VREF80.  Check other VREF bits to set
the correct value.

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index bcbb736..0fd258e 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -776,6 +776,12 @@ static void alc_set_input_pin(struct hda_codec *codec, hda_nid_t nid,
 		pincap = (pincap & AC_PINCAP_VREF) >> AC_PINCAP_VREF_SHIFT;
 		if (pincap & AC_PINCAP_VREF_80)
 			val = PIN_VREF80;
+		else if (pincap & AC_PINCAP_VREF_50)
+			val = PIN_VREF50;
+		else if (pincap & AC_PINCAP_VREF_100)
+			val = PIN_VREF100;
+		else if (pincap & AC_PINCAP_VREF_GRD)
+			val = PIN_VREFGRD;
 	}
 	snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, val);
 }
-- 
cgit v0.10.2


From 4e2fd555199977c5994d1a4d2d3b8761b20ca4c7 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Mon, 25 May 2009 00:42:34 -0700
Subject: gianfar: fix BUG under load after introduction of skb recycling

Since commit 0fd56bb5be6455d0d42241e65aed057244665e5e ("gianfar:
Add support for skb recycling"), gianfar puts skbuffs that are in
the rx ring back onto the recycle list as-is in case there was a
receive error, but this breaks the following invariant: that all
skbuffs on the recycle list have skb->data = skb->head + NET_SKB_PAD.

The RXBUF_ALIGNMENT realignment done in gfar_new_skb() will be done
twice on skbuffs recycled in this way, causing there not to be enough
room in the skb anymore to receive a full packet, eventually leading
to an skb_over_panic from gfar_clean_rx_ring() -> skb_put().

Resetting the skb->data pointer to skb->head + NET_SKB_PAD before
putting the skb back onto the recycle list restores the mentioned
invariant, and should fix this issue.

Reported-by: Michael Guntsche <mike@it-loops.com>
Tested-by: Michael Guntsche <mike@it-loops.com>
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index b2c4967..a051918 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -1885,8 +1885,17 @@ int gfar_clean_rx_ring(struct net_device *dev, int rx_work_limit)
 
 			if (unlikely(!newskb))
 				newskb = skb;
-			else if (skb)
+			else if (skb) {
+				/*
+				 * We need to reset ->data to what it
+				 * was before gfar_new_skb() re-aligned
+				 * it to an RXBUF_ALIGNMENT boundary
+				 * before we put the skb back on the
+				 * recycle list.
+				 */
+				skb->data = skb->head + NET_SKB_PAD;
 				__skb_queue_head(&priv->rx_recycle, skb);
+			}
 		} else {
 			/* Increment the number of packets */
 			dev->stats.rx_packets++;
-- 
cgit v0.10.2


From 85a9f9200226ddffc2ea50dae6a8df04c033ecd4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 09:59:50 +0200
Subject: perf_counter tools: increase limits, fix

NR_CPUS and NR_COUNTERS goes up quadratic ... 1024x4096 was far
too ambitious upper limit - go for 256x256 which is still plenty.

[ Impact: reduce perf tool memory consumption ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index a517683..5a2520b 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -61,8 +61,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
 		       group_fd, flags);
 }
 
-#define MAX_COUNTERS			1024
-#define MAX_NR_CPUS			4096
+#define MAX_COUNTERS			256
+#define MAX_NR_CPUS			256
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-- 
cgit v0.10.2


From d94b943054721c346b0881865d645f000cd19880 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 25 May 2009 09:57:56 +0200
Subject: perf top: Reduce display overhead

Iterate over the symbol table once per display interval, and
copy/sort/tally/decay only those symbols which are active.

Before:

 top - 10:14:53 up  4:08, 17 users,  load average: 1.17, 1.53, 1.49
 Tasks: 273 total,   5 running, 268 sleeping,   0 stopped,   0 zombie
 Cpu(s):  6.9%us, 38.2%sy,  0.0%ni, 19.9%id,  0.0%wa,  0.0%hi, 35.0%si,  0.0%st

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 28504 root      20   0  1044  260  164 S   58  0.0   0:04.19 2 netserver
 28499 root      20   0  1040  412  316 R   51  0.0   0:04.15 0 netperf
 28500 root      20   0  1040  408  316 R   50  0.0   0:04.14 1 netperf
 28503 root      20   0  1044  260  164 S   50  0.0   0:04.01 1 netserver
 28501 root      20   0  1044  260  164 S   49  0.0   0:03.99 0 netserver
 28502 root      20   0  1040  412  316 S   43  0.0   0:03.96 2 netperf
 28468 root      20   0 1892m 325m  972 S   16 10.8   0:10.50 3 perf
 28467 root      20   0 1892m 325m  972 R    2 10.8   0:00.72 3 perf

After:

 top - 10:16:30 up  4:10, 17 users,  load average: 2.27, 1.88, 1.62
 Tasks: 273 total,   6 running, 267 sleeping,   0 stopped,   0 zombie
 Cpu(s):  2.5%us, 39.7%sy,  0.0%ni, 24.6%id,  0.0%wa,  0.0%hi, 33.3%si,  0.0%st

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 28590 root      20   0  1040  412  316 S   54  0.0   0:07.85 2 netperf
 28589 root      20   0  1044  260  164 R   54  0.0   0:07.84 0 netserver
 28588 root      20   0  1040  412  316 R   50  0.0   0:07.89 1 netperf
 28591 root      20   0  1044  256  164 S   50  0.0   0:07.82 1 netserver
 28587 root      20   0  1040  408  316 R   47  0.0   0:07.61 0 netperf
 28592 root      20   0  1044  260  164 R   47  0.0   0:07.85 2 netserver
 28378 root      20   0  8732 1300  860 R    2  0.0   0:01.81 3 top
 28577 root      20   0 1892m 165m  972 R    2  5.5   0:00.48 3 perf
 28578 root      20   0 1892m 165m  972 S    2  5.5   0:00.04 3 perf

[ Impact: optimization ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 74021ac..4bed265 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -374,18 +374,26 @@ static struct sym_entry		tmp[MAX_SYMS];
 
 static void print_sym_table(void)
 {
-	int i, printed;
+	int i, j, active_count, printed;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 	float sum_kevents = 0.0;
 
 	events = userspace_events = 0;
-	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
-	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
-	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
-		sum_kevents += tmp[i].count[0];
+	/* Iterate over symbol table and copy/tally/decay active symbols. */
+	for (i = 0, active_count = 0; i < sym_table_count; i++) {
+		if (sym_table[i].count[0]) {
+			tmp[active_count++] = sym_table[i];
+			sum_kevents += sym_table[i].count[0];
+
+			for (j = 0; j < nr_counters; j++)
+				sym_table[i].count[j] = zero ? 0 : sym_table[i].count[j] * 7 / 8;
+		}
+	}
+
+	qsort(tmp, active_count + 1, sizeof(tmp[0]), compare);
 
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
@@ -433,29 +441,23 @@ static void print_sym_table(void)
 	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	for (i = 0, printed = 0; i < sym_table_count; i++) {
+	for (i = 0, printed = 0; i < active_count; i++) {
 		float pcnt;
-		int count;
 
-		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
-			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
-
-			if (nr_counters == 1)
-				printf("%19.2f - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					pcnt, tmp[i].addr, tmp[i].sym);
-			else
-				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					tmp[i].count[0],
-					pcnt, tmp[i].addr, tmp[i].sym);
-			printed++;
-		}
-		/*
-		 * Add decay to the counts:
-		 */
-		for (count = 0; count < nr_counters; count++)
-			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+		if (++printed > 18 || tmp[i].count[0] < count_filter)
+			break;
+
+		pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+		if (nr_counters == 1)
+			printf("%19.2f - %4.1f%% - %016llx : %s\n",
+				sym_weight(tmp + i),
+				pcnt, tmp[i].addr, tmp[i].sym);
+		else
+			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+				sym_weight(tmp + i),
+				tmp[i].count[0],
+				pcnt, tmp[i].addr, tmp[i].sym);
 	}
 
 	if (sym_filter_entry)
-- 
cgit v0.10.2


From e4cbb4e3ac8b09fdb11e39e5a5611bfab0a7cd1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 May 2009 15:50:30 +0200
Subject: perf_counter: Move child perfcounter init to after scheduler init

Initialize a task's perfcounters (inherit from parent, etc.) after
the child task's scheduler fields have been initialized already.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/fork.c b/kernel/fork.c
index e72a09f..675e01e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -984,7 +984,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	rt_mutex_init_task(p);
-	perf_counter_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
@@ -1096,6 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
+	perf_counter_init_task(p);
 
 	if ((retval = audit_alloc(p)))
 		goto bad_fork_cleanup_policy;
-- 
cgit v0.10.2


From d3f4b3855ba87caff8f35e738c7e7e3bad0a6ab1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 14:40:01 +0200
Subject: perf stat: flip around ':k' and ':u' flags

This output:

 $ perf stat -e 0:1:k -e 0:1:u ./hello
  Performance counter stats for './hello':
          140131  instructions         (events)
         1906968  instructions         (events)

Is quite confusing - as :k means "user instructions", :u means
"kernel instructions".

Flip them around - as the 'exclude' property is not intuitive in
the flag naming.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 8ae01d5..88c70be9 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -266,9 +266,9 @@ static __u64 match_event_symbols(char *str)
 
 	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
 		case 3:
-			if (strchr(mask_str, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
 			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'u'))
 				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
 		case 2:
 			return EID(type, id);
-- 
cgit v0.10.2


From 266dfb0b58bc4181b6158ee63a0069abaa9f3a98 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:24 +0200
Subject: perf_counter: Fix perf-$cmd invokation

Fix:

  $ perf-top
  fatal: cannot handle -top internally

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124559.995591577@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 594d270..1d6d7aa 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -357,7 +357,7 @@ int main(int argc, const char **argv)
 	 * die if that one cannot handle it.
 	 */
 	if (!prefixcmp(cmd, "perf-")) {
-		cmd += 4;
+		cmd += 5;
 		argv[0] = cmd;
 		handle_internal_command(argc, argv);
 		die("cannot handle %s internally", cmd);
-- 
cgit v0.10.2


From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:25 +0200
Subject: perf_counter: Remove unused ABI bits

extra_config_len isn't used for anything, remove it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.116035832@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2ddf5e3..b1f2bac 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -154,11 +154,11 @@ struct perf_counter_hw_event {
 
 				__reserved_1   : 51;
 
-	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
+	__u32			__reserved_2;
 
-	__u64			__reserved_2;
 	__u64			__reserved_3;
+	__u64			__reserved_4;
 };
 
 /*
-- 
cgit v0.10.2


From 771d7cde144d87f2d1fbee4da3c6234d61f7e42a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:26 +0200
Subject: perf_counter: Make pctrl() affect inherited counters too

Paul noted that the new ptcrl() didn't work on child counters.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.203151469@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6cdf824..217dbcc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1067,30 +1067,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
-int perf_counter_task_enable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_enable(counter);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
-int perf_counter_task_disable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_disable(counter);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_freq(struct perf_counter_context *ctx)
@@ -1505,6 +1481,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return 0;
 }
 
+int perf_counter_task_enable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_enable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_disable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
-- 
cgit v0.10.2


From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:27 +0200
Subject: perf_counter: Propagate inheritance failures down the fork() path

Fail fork() when we fail inheritance for some reason (-ENOMEM most likely).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.324656474@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b1f2bac..d3e85de 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task,
 					struct task_struct *next, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *child);
+extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
@@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task,
 			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline void perf_counter_init_task(struct task_struct *child)	{ }
+static inline int perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index 675e01e..c07c333 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
-	perf_counter_init_task(p);
+
+	retval = perf_counter_init_task(p);
+	if (retval)
+		goto bad_fork_cleanup_policy;
 
 	if ((retval = audit_alloc(p)))
 		goto bad_fork_cleanup_policy;
@@ -1295,6 +1298,7 @@ bad_fork_cleanup_semundo:
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
+	perf_counter_exit_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 217dbcc..7a7a144 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3434,18 +3434,23 @@ again:
 /*
  * Initialize the perf_counter context in task_struct
  */
-void perf_counter_init_task(struct task_struct *child)
+int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
 
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
+	parent_ctx = parent->perf_counter_ctxp;
+	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+		return 0;
+
 	/*
 	 * This is executed from the parent task context, so inherit
 	 * counters that have been marked for cloning.
@@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child)
 
 	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
 	if (!child_ctx)
-		return;
-
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
-		return;
+		return -ENOMEM;
 
 	__perf_counter_init_context(child_ctx, child);
 	child->perf_counter_ctxp = child_ctx;
@@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child)
 			continue;
 		}
 
-		if (inherit_group(counter, parent,
-				  parent_ctx, child, child_ctx)) {
+		ret = inherit_group(counter, parent, parent_ctx,
+					     child, child_ctx);
+		if (ret) {
 			inherited_all = 0;
 			break;
 		}
@@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child)
 	}
 
 	mutex_unlock(&parent_ctx->mutex);
+
+	return ret;
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
-- 
cgit v0.10.2


From 10989fb2451763fae6f42d85fa6106c8fd010cf5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:28 +0200
Subject: perf_counter: Fix PERF_COUNTER_CONTEXT_SWITCHES for cpu counters

Ingo noticed that cpu counters had 0 context switches, even though
there was plenty scheduling on the cpu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.419025548@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7a7a144..14b1fe9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -924,14 +924,13 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	struct perf_counter_context *next_ctx;
 	struct pt_regs *regs;
 
+	regs = task_pt_regs(task);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
 	update_context_time(ctx);
-
-	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
 	next_ctx = next->perf_counter_ctxp;
 	if (next_ctx && context_equiv(ctx, next_ctx)) {
 		task->perf_counter_ctxp = next_ctx;
-- 
cgit v0.10.2


From bfcaa50270e18f35220a11d46e98fc6232c24606 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 25 May 2009 17:23:15 +0200
Subject: netfilter: nf_ct_tcp: fix accepting invalid RST segments

Robert L Mathews discovered that some clients send evil TCP RST segments,
which are accepted by netfilter conntrack but discarded by the
destination. Thus the conntrack entry is destroyed but the destination
retransmits data until timeout.

The same technique, i.e. sending properly crafted RST segments, can easily
be used to bypass connlimit/connbytes based restrictions (the sample
script written by Robert can be found in the netfilter mailing list
archives).

The patch below adds a new flag and new field to struct ip_ct_tcp_state so
that checking RST segments can be made more strict and thus TCP conntrack
can catch the invalid ones: the RST segment is accepted only if its
sequence number higher than or equal to the highest ack we seen from the
other direction. (The last_ack field cannot be reused because it is used
to catch resent packets.)

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h
index 3066789..b2f384d 100644
--- a/include/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/linux/netfilter/nf_conntrack_tcp.h
@@ -35,6 +35,9 @@ enum tcp_conntrack {
 /* Has unacknowledged data */
 #define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED	0x10
 
+/* The field td_maxack has been set */
+#define IP_CT_TCP_FLAG_MAXACK_SET		0x20
+
 struct nf_ct_tcp_flags {
 	__u8 flags;
 	__u8 mask;
@@ -46,6 +49,7 @@ struct ip_ct_tcp_state {
 	u_int32_t	td_end;		/* max of seq + len */
 	u_int32_t	td_maxend;	/* max of ack + max(win, 1) */
 	u_int32_t	td_maxwin;	/* max(win) */
+	u_int32_t	td_maxack;	/* max of ack */
 	u_int8_t	td_scale;	/* window scale factor */
 	u_int8_t	flags;		/* per direction options */
 };
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b5ccf2b..97a6e93 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -634,6 +634,14 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			sender->td_end = end;
 			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
 		}
+		if (tcph->ack) {
+			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+				sender->td_maxack = ack;
+				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+			} else if (after(ack, sender->td_maxack))
+				sender->td_maxack = ack;
+		}
+
 		/*
 		 * Update receiver data.
 		 */
@@ -919,6 +927,16 @@ static int tcp_packet(struct nf_conn *ct,
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_CLOSE:
 		if (index == TCP_RST_SET
+		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+		    && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+			/* Invalid RST  */
+			write_unlock_bh(&tcp_lock);
+			if (LOG_INVALID(net, IPPROTO_TCP))
+				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+					  "nf_ct_tcp: invalid RST ");
+			return -NF_ACCEPT;
+		}
+		if (index == TCP_RST_SET
 		    && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
 			 && ct->proto.tcp.last_index == TCP_SYN_SET)
 			|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
-- 
cgit v0.10.2


From b38b1f616867c832301f24eaf259889494d495b3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <Pablo Neira Ayuso>
Date: Mon, 25 May 2009 17:29:43 +0200
Subject: netfilter: nf_ct_dccp: add missing DCCP protocol changes in event
 cache

This patch adds the missing protocol state-change event reporting
for DCCP.

$ sudo conntrack -E
    [NEW] dccp     33 240 src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040

With this patch:

$ sudo conntrack -E
    [NEW] dccp     33 240 REQUEST src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 8e757dd..aee0d6b 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -22,6 +22,7 @@
 #include <linux/netfilter/nfnetlink_conntrack.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_log.h>
 
 static DEFINE_RWLOCK(dccp_lock);
@@ -553,6 +554,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	ct->proto.dccp.state = new_state;
 	write_unlock_bh(&dccp_lock);
 
+	if (new_state != old_state)
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
 	dn = dccp_pernet(net);
 	nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]);
 
-- 
cgit v0.10.2


From 759d427aa5a9d88a81afd11817cdeb40aea85234 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 25 May 2009 11:51:00 -0400
Subject: ext4: remove unused function __ext4_write_dirty_metadata

The __ext4_write_dirty_metadata() function was introduced by commit
0390131b, "ext4: Allow ext4 to run without a journal", but nothing
ever used the function, either then or since.  So let's remove it and
save a bit of space.

Cc: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dadd3f9..14c00ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4790,25 +4790,6 @@ int ext4_write_inode(struct inode *inode, int wait)
 	return ext4_force_commit(inode->i_sb);
 }
 
-int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
-{
-	int err = 0;
-
-	mark_buffer_dirty(bh);
-	if (inode && inode_needs_sync(inode)) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, __func__,
-				   "IO error syncing inode, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long)bh->b_blocknr);
-			err = -EIO;
-		}
-	}
-	return err;
-}
-
 /*
  * ext4_setattr()
  *
-- 
cgit v0.10.2


From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 25 May 2009 11:50:39 -0400
Subject: ext4: Clean up calls to ext4_get_group_desc()

If the caller isn't planning on modifying the block group descriptors,
there's no need to pass in a pointer to a struct buffer_head.  Nuking
this saves a tiny amount of CPU time and stack space usage.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 82f7d1d..3743bd8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh;
 	struct flex_groups *flex_group = sbi->s_flex_groups;
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
@@ -402,7 +401,7 @@ find_close_to_parent:
 found_flexbg:
 	for (i = best_flex * flex_size; i < ngroups &&
 		     i < (best_flex + 1) * flex_size; i++) {
-		desc = ext4_get_group_desc(sb, i, &bh);
+		desc = ext4_get_group_desc(sb, i, NULL);
 		if (ext4_free_inodes_count(sb, desc)) {
 			*best_group = i;
 			goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eca6c05..91b98b5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
-	struct buffer_head *bh;
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
@@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		gdp = ext4_get_group_desc(sb, i, &bh);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-- 
cgit v0.10.2


From a8cd0244e9cebcf9b358d24c7e7410062f3665cb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 24 May 2009 22:15:25 +0300
Subject: KVM: Make paravirt tlb flush also reload the PAE PDPTRs

The paravirt tlb flush may be used not only to flush TLBs, but also
to reload the four page-directory-pointer-table entries, as it is used
as a replacement for reloading CR3.  Change the code to do the entire
CR3 reloading dance instead of simply flushing the TLB.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b6caf13..32cf11e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2897,8 +2897,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	kvm_x86_ops->tlb_flush(vcpu);
-	set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+	kvm_set_cr3(vcpu, vcpu->arch.cr3);
 	return 1;
 }
 
-- 
cgit v0.10.2


From a2edf57f510cce6a389cc14e58c6ad0a4296d6f9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 24 May 2009 22:19:00 +0300
Subject: KVM: Fix PDPTR reloading on CR4 writes

The processor is documented to reload the PDPTRs while in PAE mode if any
of the CR4 bits PSE, PGE, or PAE change.  Linux relies on this
behaviour when zapping the low mappings of PAE kernels during boot.

The code already handled changes to CR4.PAE; augment it to also notice changes
to PSE and PGE.

This triggered while booting an F11 PAE kernel; the futex initialization code
runs before any CR3 reloads and writes to a NULL pointer; the futex subsystem
ended up uninitialized, killing PI futexes and pulseaudio which uses them.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 49079a4..3944e91 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -338,6 +338,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
 
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+	unsigned long old_cr4 = vcpu->arch.cr4;
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
 	if (cr4 & CR4_RESERVED_BITS) {
 		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 		kvm_inject_gp(vcpu, 0);
@@ -351,7 +354,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			kvm_inject_gp(vcpu, 0);
 			return;
 		}
-	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+		   && ((cr4 ^ old_cr4) & pdptr_bits)
 		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 		kvm_inject_gp(vcpu, 0);
-- 
cgit v0.10.2


From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:03 +0200
Subject: perf_counter: x86: Expose INV and EDGE bits

Expose the INV and EDGE bits of the PMU to raw configs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.494709027@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cc1660..c14437f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_EDGE_MASK  |	\
+	 CORE_EVNTSEL_INV_MASK  |	\
 	 CORE_EVNTSEL_COUNTER_MASK)
 
 	return event & CORE_EVNTSEL_MASK;
@@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
 #define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
 	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
 	 K7_EVNTSEL_COUNTER_MASK)
 
 	return event & K7_EVNTSEL_MASK;
-- 
cgit v0.10.2


From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:04 +0200
Subject: perf_counter: x86: Remove interrupt throttle

remove the x86 specific interrupt throttle

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.616671838@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b4f6440..89b63b5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
-
-	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c14437f..8c8177f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -719,11 +719,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 }
 
 /*
- * Maximum interrupt frequency of 100KHz per CPU
- */
-#define PERFMON_MAX_INTERRUPTS (100000/HZ)
-
-/*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
@@ -775,15 +770,14 @@ again:
 	if (status)
 		goto again;
 
-	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
-		perf_enable();
+	perf_enable();
 
 	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu, idx, throttle = 0, handled = 0;
+	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
@@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
-		throttle = 1;
-		__perf_disable();
-		cpuc->enabled = 0;
-		barrier();
-	}
-
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		int disable = 0;
-
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		hwc = &counter->hw;
 
 		if (counter->hw_event.nmi != nmi)
-			goto next;
+			continue;
 
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			goto next;
+			continue;
 
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		disable = perf_counter_overflow(counter, nmi, regs, 0);
-
-next:
-		if (disable || throttle)
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void perf_counter_unthrottle(void)
-{
-	struct cpu_hw_counters *cpuc;
-
-	if (!x86_pmu_initialized())
-		return;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		/*
-		 * Clear them before re-enabling irqs/NMIs again:
-		 */
-		cpuc->interrupts = 0;
-		perf_enable();
-	} else {
-		cpuc->interrupts = 0;
-	}
-}
-
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d3e85de..0c160be 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
-extern void perf_counter_unthrottle(void);
 extern void __perf_disable(void);
 extern bool __perf_enable(void);
 extern void perf_disable(void);
@@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
-static inline void perf_counter_unthrottle(void)			{ }
 static inline void perf_disable(void)					{ }
 static inline void perf_enable(void)					{ }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
-- 
cgit v0.10.2


From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:05 +0200
Subject: perf_counter: Generic per counter interrupt throttle

Introduce a generic per counter interrupt throttle.

This uses the perf_counter_overflow() quick disable to throttle a specific
counter when its going too fast when a pmu->unthrottle() method is provided
which can undo the quick disable.

Power needs to implement both the quick disable and the unthrottle method.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8c8177f..c4b543d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -623,6 +623,18 @@ try_generic:
 	return 0;
 }
 
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+				cpuc->counters[hwc->idx] != counter))
+		return;
+
+	x86_pmu.enable(hwc, hwc->idx);
+}
+
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
@@ -1038,6 +1050,7 @@ static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
 	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
 };
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0c160be..e3a7585 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -267,6 +267,15 @@ enum perf_event_type {
 	PERF_EVENT_PERIOD		= 4,
 
 	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				time;
+	 * };
+	 */
+	PERF_EVENT_THROTTLE		= 5,
+	PERF_EVENT_UNTHROTTLE		= 6,
+
+	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
 	 *
@@ -367,6 +376,7 @@ struct pmu {
 	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
+	void (*unthrottle)		(struct perf_counter *counter);
 };
 
 /**
@@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
 extern int sysctl_perf_counter_mlock;
+extern int sysctl_perf_counter_limit;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 14b1fe9..ec9c4007 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 irq_period;
+	u64 interrupts, irq_period;
 	u64 events, period;
 	s64 delta;
 
@@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 			continue;
 
+		interrupts = counter->hw.interrupts;
+		counter->hw.interrupts = 0;
+
+		if (interrupts == MAX_INTERRUPTS) {
+			perf_log_throttle(counter, 1);
+			counter->pmu->unthrottle(counter);
+			interrupts = 2*sysctl_perf_counter_limit/HZ;
+		}
+
 		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
 			continue;
 
-		events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+		events = HZ * interrupts * counter->hw.irq_period;
 		period = div64_u64(events, counter->hw_event.irq_freq);
 
 		delta = (s64)(1 + period - counter->hw.irq_period);
@@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		perf_log_period(counter, irq_period);
 
 		counter->hw.irq_period = irq_period;
-		counter->hw.interrupts = 0;
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -2544,6 +2556,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 }
 
 /*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+	} throttle_event = {
+		.header = {
+			.type = PERF_EVENT_THROTTLE + 1,
+			.misc = 0,
+			.size = sizeof(throttle_event),
+		},
+		.time = sched_clock(),
+	};
+
+	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, throttle_event);
+	perf_output_end(&handle);
+}
+
+/*
  * Generic counter overflow handling.
  */
 
@@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter,
 			  int nmi, struct pt_regs *regs, u64 addr)
 {
 	int events = atomic_read(&counter->event_limit);
+	int throttle = counter->pmu->unthrottle != NULL;
 	int ret = 0;
 
-	counter->hw.interrupts++;
+	if (!throttle) {
+		counter->hw.interrupts++;
+	} else if (counter->hw.interrupts != MAX_INTERRUPTS) {
+		counter->hw.interrupts++;
+		if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+			counter->hw.interrupts = MAX_INTERRUPTS;
+			perf_log_throttle(counter, 0);
+			ret = 1;
+		}
+	}
 
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3cb1849..0c4bf86 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_int_limit",
+		.data		= &sysctl_perf_counter_limit,
+		.maxlen		= sizeof(sysctl_perf_counter_limit),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 /*
  * NOTE: do not add new entries to this table unless you have read
-- 
cgit v0.10.2


From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 21:41:28 +0200
Subject: Revert "perf_counter, x86: speed up the scheduling fast-path"

This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740.

It is causing problems (stuck/stuttering profiling) - when mixed
NMI and non-NMI counters are used.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c4b543d..189bf9d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
-	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -612,6 +611,8 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
+	perf_counters_lapic_init(hwc->nmi);
+
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(1);
+	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v0.10.2


From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 22:03:26 +0200
Subject: perf_counter: fix warning & lockup

 - remove bogus warning
 - fix wakeup from NMI path lockup
 - also fix up whitespace noise in perf_counter.h

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e3a7585..2b16ed3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,7 +73,7 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-#define __PERF_COUNTER_MASK(name) 			\
+#define __PERF_COUNTER_MASK(name)			\
 	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
 	 PERF_COUNTER_##name##_SHIFT)
 
@@ -98,14 +98,14 @@ enum sw_event_ids {
  * in the overflow packets.
  */
 enum perf_counter_record_format {
-	PERF_RECORD_IP		= 1U << 0,
-	PERF_RECORD_TID		= 1U << 1,
-	PERF_RECORD_TIME	= 1U << 2,
-	PERF_RECORD_ADDR	= 1U << 3,
-	PERF_RECORD_GROUP	= 1U << 4,
-	PERF_RECORD_CALLCHAIN	= 1U << 5,
-	PERF_RECORD_CONFIG	= 1U << 6,
-	PERF_RECORD_CPU		= 1U << 7,
+	PERF_RECORD_IP			= 1U << 0,
+	PERF_RECORD_TID			= 1U << 1,
+	PERF_RECORD_TIME		= 1U << 2,
+	PERF_RECORD_ADDR		= 1U << 3,
+	PERF_RECORD_GROUP		= 1U << 4,
+	PERF_RECORD_CALLCHAIN		= 1U << 5,
+	PERF_RECORD_CONFIG		= 1U << 6,
+	PERF_RECORD_CPU			= 1U << 7,
 };
 
 /*
@@ -235,13 +235,13 @@ enum perf_event_type {
 	 * correlate userspace IPs to code. They have the following structure:
 	 *
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	u32				pid, tid;
-	 * 	u64				addr;
-	 * 	u64				len;
-	 * 	u64				pgoff;
-	 * 	char				filename[];
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
 	 * };
 	 */
 	PERF_EVENT_MMAP			= 1,
@@ -249,27 +249,27 @@ enum perf_event_type {
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	u32				pid, tid;
-	 * 	char				comm[];
+	 *	u32				pid, tid;
+	 *	char				comm[];
 	 * };
 	 */
 	PERF_EVENT_COMM			= 3,
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				time;
-	 * 	u64				irq_period;
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				irq_period;
 	 * };
 	 */
 	PERF_EVENT_PERIOD		= 4,
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				time;
+	 *	struct perf_event_header	header;
+	 *	u64				time;
 	 * };
 	 */
 	PERF_EVENT_THROTTLE		= 5,
@@ -280,23 +280,23 @@ enum perf_event_type {
 	 * will be PERF_RECORD_*
 	 *
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	{ u64			ip;	  } && PERF_RECORD_IP
-	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
-	 * 	{ u64			time;     } && PERF_RECORD_TIME
-	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
-	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
-	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
+	 *	{ u64			ip;	  } && PERF_RECORD_IP
+	 *	{ u32			pid, tid; } && PERF_RECORD_TID
+	 *	{ u64			time;     } && PERF_RECORD_TIME
+	 *	{ u64			addr;     } && PERF_RECORD_ADDR
+	 *	{ u64			config;   } && PERF_RECORD_CONFIG
+	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
-	 * 	{ u64			nr;
-	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	{ u64			nr;
+	 *	  { u64 event, val; }	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
-	 * 	{ u16			nr,
-	 * 				hv,
-	 * 				kernel,
-	 * 				user;
-	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 *	{ u16			nr,
+	 *				hv,
+	 *				kernel,
+	 *				user;
+	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
 	 * };
 	 */
 };
@@ -406,7 +406,7 @@ struct perf_mmap_data {
 	atomic_t			wakeup;		/* needs a wakeup    */
 
 	struct perf_counter_mmap_page   *user_page;
-	void 				*data_pages[0];
+	void				*data_pages[0];
 };
 
 struct perf_pending_entry {
@@ -422,7 +422,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
-	int 				nr_siblings;
+	int				nr_siblings;
 	struct perf_counter		*group_leader;
 	const struct pmu		*pmu;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ec9c4007..070f92d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		.time = sched_clock(),
 	};
 
-	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0);
+	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
 	if (ret)
 		return;
 
@@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	WARN_ON_ONCE(child != current);
-
 	child_ctx = child->perf_counter_ctxp;
 
 	if (likely(!child_ctx))
-- 
cgit v0.10.2


From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 18 May 2009 19:35:34 +0800
Subject: tracing: add trace_event_read_lock()

I found that there is nothing to protect event_hash in
ftrace_find_event(). Rcu protects the event hashlist
but not the event itself while we use it after its extraction
through ftrace_find_event().

This lack of a proper locking in this spot opens a race
window between any event dereferencing and module removal.

Eg:

--Task A--

print_trace_line(trace) {
  event = find_ftrace_event(trace)

--Task B--

trace_module_remove_events(mod) {
  list_trace_events_module(ev, mod) {
    unregister_ftrace_event(ev->event) {
      hlist_del(ev->event->node)
        list_del(....)
    }
  }
}
|--> module removed, the event has been dropped

--Task A--

  event->print(trace); // Dereferencing freed memory

If the event retrieved belongs to a module and this module
is concurrently removed, we may end up dereferencing a data
from a freed module.

RCU could solve this, but it would add latency to the kernel and
forbid tracers output callbacks to call any sleepable code.
So this fix converts 'trace_event_mutex' to a read/write semaphore,
and adds trace_event_read_lock() to protect ftrace_find_event().

[ Impact: fix possible freed memory dereference in ftrace ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A114806.7090302@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd40d23..02d32ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 	}
 
+	trace_event_read_lock();
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
@@ -3008,6 +3011,7 @@ waitagain:
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3024,6 +3028,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 	}
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 489c0e8..7136420 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DEFINE_MUTEX(trace_event_mutex);
+static DECLARE_RWSEM(trace_event_mutex);
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
 
 	key = type & (EVENT_HASHSIZE - 1);
 
-	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+	hlist_for_each_entry(event, n, &event_hash[key], node) {
 		if (event->type == type)
 			return event;
 	}
@@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
 	return last + 1;
 }
 
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+	up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	key = event->type & (EVENT_HASHSIZE - 1);
 
-	hlist_add_head_rcu(&event->node, &event_hash[key]);
+	hlist_add_head(&event->node, &event_hash[key]);
 
 	ret = event->type;
  out:
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return ret;
 }
@@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 	hlist_del(&event->node);
 	list_del(&event->list);
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 6e220a8..ac240e7 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
-- 
cgit v0.10.2


From b0aae68cc5508f3c2fbf728988c954db4c8b8a53 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 21 May 2009 13:59:18 +0800
Subject: tracing/events: change the type of __str_loc_item to unsigned short

When defining a dynamic size string, we add __str_loc_##item to the
trace entry, and it stores the location of the actual string in
entry->_str_data[]

'unsigned short' should be sufficient to store this information, thus
we save 2 bytes per dyn-size string in the ring buffer.

[ Impact: reduce memory occupied by dyn-size strings in ring buffer ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A14EDB6.2050507@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index edb02bc..b5ff2e8 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -25,7 +25,7 @@
 #define __field(type, item)		type	item;
 
 #undef __string
-#define __string(item, src)		int	__str_loc_##item;
+#define __string(item, src)		unsigned short	__str_loc_##item;
 
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
-- 
cgit v0.10.2


From 62e1e389f87a8839ad83b08c44691d1df8320846 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 09:40:59 +1000
Subject: md: always update level / chunk_size / layout when writing v1.x
 metadata.

We previously didn't update these fields when writing the metadata
because they could never change.  They can now, so we better write
them.
v0.90 metadata always updated these fields.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index fccc834..aa79d55 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1375,6 +1375,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
 	sb->size = cpu_to_le64(mddev->dev_sectors);
+	sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9);
+	sb->level = cpu_to_le32(mddev->level);
+	sb->layout = cpu_to_le32(mddev->layout);
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
-- 
cgit v0.10.2


From 2b69c83924396ad1eda36fdd267c9d2f360f5555 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 09:41:17 +1000
Subject: md: improve errno return when setting array_size

Instead of always returns EINVAL if anything goes wrong
when setting the array size, add the option of
  E2BIG
if the size requested is too large.  This makes it easier
for user-space to be sure what went wrong.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa79d55..58e0b02 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3683,7 +3683,7 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
 			return -EINVAL;
 		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
-			return -EINVAL;
+			return -E2BIG;
 
 		mddev->external_size = 1;
 	}
-- 
cgit v0.10.2


From be512691036cc989c11d0f418187efbbf14468e6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 09:41:17 +1000
Subject: md: bitmap: improve bitmap maintenance code.

The code for checking which bits in the bitmap can be cleared
has 2 problems:
 1/ it repeatedly takes and drops a spinlock, where it would make
    more sense to just hold on to it most of the time.
 2/ it doesn't make use of some opportunities to skip large sections
    of the bitmap

This patch fixes those.  It will only affect CPU consumption, not
correctness.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 47c68bc..56df1ce 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1097,14 +1097,12 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 	}
 	bitmap->allclean = 1;
 
+	spin_lock_irqsave(&bitmap->lock, flags);
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
-		spin_lock_irqsave(&bitmap->lock, flags);
-		if (!bitmap->filemap) {
+		if (!bitmap->filemap)
 			/* error or shutdown */
-			spin_unlock_irqrestore(&bitmap->lock, flags);
 			break;
-		}
 
 		page = filemap_get_page(bitmap, j);
 
@@ -1121,6 +1119,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 					write_page(bitmap, page, 0);
 					bitmap->allclean = 0;
 				}
+				spin_lock_irqsave(&bitmap->lock, flags);
+				j |= (PAGE_BITS - 1);
 				continue;
 			}
 
@@ -1181,9 +1181,10 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 					ext2_clear_bit(file_page_offset(j), paddr);
 				kunmap_atomic(paddr, KM_USER0);
 			}
-		}
-		spin_unlock_irqrestore(&bitmap->lock, flags);
+		} else
+			j |= PAGE_COUNTER_MASK;
 	}
+	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	/* now sync the final page */
 	if (lastpage != NULL) {
-- 
cgit v0.10.2


From b6a9ce688f613e2ee5f15e6720e0bb8520efc36e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 09:41:17 +1000
Subject: md: export 'frozen' resync state through sysfs

The md resync engine has a 'frozen' state which ensures that
no resync/recovery.  This is used to avoid races.

Export this state through the 'sync_action' sysfs attribute
so that user-space can benefit and also avoid some races.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 58e0b02..384e4f0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3306,7 +3306,9 @@ static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
 	char *type = "idle";
-	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+		type = "frozen";
+	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 			type = "reshape";
@@ -3329,7 +3331,12 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-	if (cmd_match(page, "idle")) {
+	if (cmd_match(page, "frozen"))
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	else
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_unregister_thread(mddev->sync_thread);
-- 
cgit v0.10.2


From 29fcefba8a2f0fea11e2b721fe174a1832801284 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 24 May 2009 11:13:17 +0300
Subject: kmemtrace: fix kernel parameter documentation

The kmemtrace.enable kernel parameter no longer works. To enable
kmemtrace at boot-time, you must pass "ftrace=kmemtrace" instead.

[ Impact: remove obsolete kernel parameter documentation ]

Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <alpine.DEB.2.00.0905241112190.10296@rocky>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbf..9243dd8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -56,7 +56,6 @@ parameter is applicable:
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
 	JOY	Appropriate joystick support is enabled.
-	KMEMTRACE kmemtrace is enabled.
 	LIBATA  Libata driver is enabled
 	LP	Printer support is enabled.
 	LOOP	Loopback device support is enabled.
@@ -1054,15 +1053,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			use the HighMem zone if it exists, and the Normal
 			zone if it does not.
 
-	kmemtrace.enable=	[KNL,KMEMTRACE] Format: { yes | no }
-				Controls whether kmemtrace is enabled
-				at boot-time.
-
-	kmemtrace.subbufs=n	[KNL,KMEMTRACE] Overrides the number of
-			subbufs kmemtrace's relay channel has. Set this
-			higher than default (KMEMTRACE_N_SUBBUFS in code) if
-			you experience buffer overruns.
-
 	kgdboc=		[HW] kgdb over consoles.
 			Requires a tty driver that supports console polling.
 			(only serial suported for now)
-- 
cgit v0.10.2


From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:11:59 +0800
Subject: ftrace: Add task_comm support for trace_event

If we enable a trace event alone without any tracer running (such as
function tracer, sched switch tracer, etc...) it can't output enough
task command information.

We need to use the tracing_{start/stop}_cmdline_record() helpers
which are designed to keep track of cmdlines for any tasks that
were scheduled during the tracing.

Before this patch:
 # echo 1 > debugfs/tracing/events/sched/sched_switch/enable
 # cat debugfs/tracing/trace
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-2289  [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120]
            <...>-2289  [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140]
           <idle>-0     [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
            <...>-5     [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

After this patch:
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
             bash-2269  [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120]
             bash-2269  [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140]
           <idle>-0     [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
         events/0-5     [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

Changelog:
v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in
        ENABLE_EVENT_TRACING
v2->v3: v2 can solve problem that was caused by config EVENT_TRACING
        alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is
        selected by other config, compile fail happened again.
        This version solves it.

[ Impact: fix incomplete output of event tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f61be30..a508b9d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER
        default y
 
 config EVENT_TRACING
+	select CONTEXT_SWITCH_TRACER
+	bool
+
+config CONTEXT_SWITCH_TRACER
+	select MARKERS
 	bool
 
 config TRACING
@@ -176,10 +181,10 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config CONTEXT_SWITCH_TRACER
+config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
 	select TRACING
-	select MARKERS
+	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9e91c4a..9b246eb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -85,6 +85,7 @@ static void ftrace_clear_events(void)
 
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 	}
@@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	case 0:
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 		break;
 	case 1:
 		if (!call->enabled) {
 			call->enabled = 1;
+			tracing_start_cmdline_record();
 			call->regfunc();
 		}
 		break;
@@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod)
 			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
+				tracing_stop_cmdline_record();
 				call->unregfunc();
 			}
 			if (call->event)
@@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void)
 		}
 
 		call->enabled = 1;
+		tracing_start_cmdline_record();
 		call->regfunc();
 
 		event_test_stuff();
 
 		call->unregfunc();
+		tracing_stop_cmdline_record();
 		call->enabled = 0;
 
 		pr_cont("OK\n");
-- 
cgit v0.10.2


From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:13:59 +0800
Subject: ftrace: clean up of using ftrace_event_enable_disable()

Always use ftrace_event_enable_disable() to enable/disable an event
so that we can factorize out the event toggling code.

[ Impact: factorize and cleanup event tracing code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b246eb..6c81f9c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 
 #endif /* CONFIG_MODULES */
 
-static void ftrace_clear_events(void)
-{
-	struct ftrace_event_call *call;
-
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (call->enabled) {
-			call->enabled = 0;
-			tracing_stop_cmdline_record();
-			call->unregfunc();
-		}
-	}
-	mutex_unlock(&event_mutex);
-}
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
-
 	switch (enable) {
 	case 0:
 		if (call->enabled) {
@@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		ftrace_event_enable_disable(call, 0);
+	}
+	mutex_unlock(&event_mutex);
+}
+
 /*
  * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
  */
@@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
-			if (call->enabled) {
-				call->enabled = 0;
-				tracing_stop_cmdline_record();
-				call->unregfunc();
-			}
+			ftrace_event_enable_disable(call, 0);
 			if (call->event)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
@@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void)
 			continue;
 		}
 
-		call->enabled = 1;
-		tracing_start_cmdline_record();
-		call->regfunc();
-
+		ftrace_event_enable_disable(call, 1);
 		event_test_stuff();
-
-		call->unregfunc();
-		tracing_stop_cmdline_record();
-		call->enabled = 0;
+		ftrace_event_enable_disable(call, 0);
 
 		pr_cont("OK\n");
 	}
-- 
cgit v0.10.2


From 848b3182365fdf5a05bcd5ed949071cac2c894b3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 12:41:08 +1000
Subject: md: raid5: avoid sector values going negative when testing reshape
 progress.

As sector_t in unsigned, we cannot afford to let 'safepos' etc go
negative.
So replace
   a -= b;
by
   a -= min(b,a);

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4616bc3..3c3626d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
 	if (mddev->delta_disks < 0) {
-		writepos -= reshape_sectors;
+		writepos -= min(reshape_sectors, writepos);
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
 	} else {
 		writepos += reshape_sectors;
-		readpos -= reshape_sectors;
-		safepos -= reshape_sectors;
+		readpos -= min(reshape_sectors, readpos);
+		safepos -= min(reshape_sectors, safepos);
 	}
 
 	/* 'writepos' is the most advanced device address we might write.
-- 
cgit v0.10.2


From 7a91ee1f628ef6bfe3f13067c0ddf9db520cb86b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 12:57:21 +1000
Subject: md: don't update curr_resync_completed without also updating
 reshape_position.

In order for the metadata to always be consistent, we mustn't updated
curr_resync_completed without also updating reshape_position.

The reshape code updates both at the same time.  However since
commit 97e4f42d62badb0f9fbc27c013e89bc1336a03bc
the common md_do_sync will sometimes update curr_resync_completed
but is not in a position to update reshape_position.
So if MD_RECOVERY_RESHAPE is set (indicating that a reshape is
happening, so reshape_position might change), don't update
curr_resync_completed in md_do_sync, leave it to the per-personality
reshape code.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 384e4f0..9544565 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6362,12 +6362,13 @@ void md_do_sync(mddev_t *mddev)
 
 		skipped = 0;
 
-		if ((mddev->curr_resync > mddev->curr_resync_completed &&
-		     (mddev->curr_resync - mddev->curr_resync_completed)
-		    > (max_sectors >> 4)) ||
-		    (j - mddev->curr_resync_completed)*2
-		    >= mddev->resync_max - mddev->curr_resync_completed
-			) {
+		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+		    ((mddev->curr_resync > mddev->curr_resync_completed &&
+		      (mddev->curr_resync - mddev->curr_resync_completed)
+		      > (max_sectors >> 4)) ||
+		     (j - mddev->curr_resync_completed)*2
+		     >= mddev->resync_max - mddev->curr_resync_completed
+			    )) {
 			/* time to update curr_resync_completed */
 			blk_unplug(mddev->queue);
 			wait_event(mddev->recovery_wait,
-- 
cgit v0.10.2


From b492b852cd8c99505708152c29a5e09a787af9de Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 26 May 2009 12:57:36 +1000
Subject: md: don't use locked_ioctl.

md has no need for the BKL - it does its own locking.
So md_ioctl doesn't need to be a locked_ioctl.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9544565..641b211 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5567,7 +5567,7 @@ static struct block_device_operations md_fops =
 	.owner		= THIS_MODULE,
 	.open		= md_open,
 	.release	= md_release,
-	.locked_ioctl	= md_ioctl,
+	.ioctl		= md_ioctl,
 	.getgeo		= md_getgeo,
 	.media_changed	= md_media_changed,
 	.revalidate_disk= md_revalidate,
-- 
cgit v0.10.2


From 8e35961b57da14cb64cb0e4e1b7e3aabda6396fe Mon Sep 17 00:00:00 2001
From: Hideo Saito <hsaito.ppc@gmail.com>
Date: Sun, 24 May 2009 15:33:34 +0000
Subject: powerpc/mm: Fix broken MMU PID stealing on !SMP

The recent rework of the MMU PID handling for non-hash CPUs has a
subtle bug in the !SMP "optimized" variant of the PID stealing
function.  It clears the PID in the mm context before it calls
local_flush_tlb_mm(). However, the later will not flush anything
if the PID in the context is clear...

Signed-off-by: Hideo Saito <hsaito.ppc@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index a70e311..030d000 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -127,12 +127,12 @@ static unsigned int steal_context_up(unsigned int id)
 
 	pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm);
 
-	/* Mark this mm has having no context anymore */
-	mm->context.id = MMU_NO_CONTEXT;
-
 	/* Flush the TLB for that context */
 	local_flush_tlb_mm(mm);
 
+	/* Mark this mm has having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
+
 	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 	__clear_bit(id, stale_map[cpu]);
 
-- 
cgit v0.10.2


From 3709ab8dfa23cab553ea9ffea3372c8e0a28f332 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 13:52:28 +0900
Subject: sh: irq: Fix up imask build warnings.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index 3bef3c2..6b5d191 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -53,7 +53,7 @@ static inline void set_interrupt_registers(int ip)
 
 static void mask_imask_irq(unsigned int irq)
 {
-	clear_bit(irq, &imask_mask);
+	clear_bit(irq, imask_mask);
 	if (interrupt_priority < IMASK_PRIORITY - irq)
 		interrupt_priority = IMASK_PRIORITY - irq;
 	set_interrupt_registers(interrupt_priority);
@@ -61,7 +61,7 @@ static void mask_imask_irq(unsigned int irq)
 
 static void unmask_imask_irq(unsigned int irq)
 {
-	set_bit(irq, &imask_mask);
+	set_bit(irq, imask_mask);
 	interrupt_priority = IMASK_PRIORITY -
 		find_first_zero_bit(imask_mask, IMASK_PRIORITY);
 	set_interrupt_registers(interrupt_priority);
-- 
cgit v0.10.2


From 217cbfa856dc1cbc2890781626c4032d9e3ec59f Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 25 May 2009 22:43:49 -0700
Subject: mac8390: fix regression caused during net_device_ops conversion

Changeset ca17584bf2ad1b1e37a5c0e4386728cc5fc9dabc ("mac8390: update
to net_device_ops") broke mac8390 by adding 8390.o to the link. That
meant that lib8390.c was included twice, once in mac8390.c and once in
8390.c, subject to different macros. This patch reverts that by
avoiding the wrappers in 8390.c. They seem to be of no value since
COMPAT_NET_DEV_OPS is going away soon.

Tested with a Kinetics EtherPort card.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 1fc4602..a1c25cb 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -102,7 +102,7 @@ obj-$(CONFIG_HAMACHI) += hamachi.o
 obj-$(CONFIG_NET) += Space.o loopback.o
 obj-$(CONFIG_SEEQ8005) += seeq8005.o
 obj-$(CONFIG_NET_SB1000) += sb1000.o
-obj-$(CONFIG_MAC8390) += mac8390.o 8390.o
+obj-$(CONFIG_MAC8390) += mac8390.o
 obj-$(CONFIG_APNE) += apne.o 8390.o
 obj-$(CONFIG_PCMCIA_PCNET) += 8390.o
 obj-$(CONFIG_HP100) += hp100.o
diff --git a/drivers/net/mac8390.c b/drivers/net/mac8390.c
index 8e88486..f26667d 100644
--- a/drivers/net/mac8390.c
+++ b/drivers/net/mac8390.c
@@ -304,7 +304,7 @@ struct net_device * __init mac8390_probe(int unit)
 	if (!MACH_IS_MAC)
 		return ERR_PTR(-ENODEV);
 
-	dev = alloc_ei_netdev();
+	dev = ____alloc_ei_netdev(0);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
@@ -481,10 +481,10 @@ void cleanup_module(void)
 static const struct net_device_ops mac8390_netdev_ops = {
 	.ndo_open 		= mac8390_open,
 	.ndo_stop		= mac8390_close,
-	.ndo_start_xmit		= ei_start_xmit,
-	.ndo_tx_timeout		= ei_tx_timeout,
-	.ndo_get_stats		= ei_get_stats,
-	.ndo_set_multicast_list = ei_set_multicast_list,
+	.ndo_start_xmit		= __ei_start_xmit,
+	.ndo_tx_timeout		= __ei_tx_timeout,
+	.ndo_get_stats		= __ei_get_stats,
+	.ndo_set_multicast_list = __ei_set_multicast_list,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_change_mtu		= eth_change_mtu,
-- 
cgit v0.10.2


From c80a5cdfc5ca6533cb893154f546370da1fdb8f0 Mon Sep 17 00:00:00 2001
From: Doug Leith <doug.leith@nuim.ie>
Date: Mon, 25 May 2009 22:44:59 -0700
Subject: tcp: tcp_vegas ssthresh bugfix

This patch fixes ssthresh accounting issues in tcp_vegas when cwnd decreases

Signed-off-by: Doug Leith <doug.leith@nuim.ie>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a453aac..c6743ee 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 }
 EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
 
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
 static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 			 */
 			diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
 
-			if (diff > gamma && tp->snd_ssthresh > 2 ) {
+			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Going too fast. Time to slow down
 				 * and switch to congestion avoidance.
 				 */
-				tp->snd_ssthresh = 2;
 
 				/* Set cwnd to match the actual rate
 				 * exactly:
@@ -235,6 +239,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 				 * utilization.
 				 */
 				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
 
 			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Slow start.  */
@@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 					 * we slow down.
 					 */
 					tp->snd_cwnd--;
+					tp->snd_ssthresh
+						= tcp_vegas_ssthresh(tp);
 				} else if (diff < alpha) {
 					/* We don't have enough extra packets
 					 * in the network, so speed up.
-- 
cgit v0.10.2


From 46176b4f6bac19454b7b5c35f68594b85850a600 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 26 May 2009 14:42:40 +0900
Subject: x86, relocs: ignore R_386_NONE in kernel relocation entries

For relocatable 32bit kernels, boot/compressed/relocs.c processes
relocation entries in the kernel image and appends it to the kernel
image such that boot/compressed/head_32.S can relocate the kernel.
The kernel image is one statically linked object and only uses two
relocation types - R_386_PC32 and R_386_32, of the two only the latter
needs massaging during kernel relocation and thus handled by relocs.
R_386_PC32 is ignored and all other relocation types are considered
error.

When the target of a relocation resides in a discarded section,
binutils doesn't throw away the relocation record but nullifies it by
changing it to R_386_NONE, which unfortunately makes relocs fail.

The problem was triggered by yet out-of-tree x86 stack unwind patches
but given the binutils behavior, ignoring R_386_NONE is the right
thing to do.

The problem has been tracked down to binutils behavior by Jan Beulich.

[ Impact: fix build with certain binutils by ignoring R_386_NONE ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <4A1B8150.40702@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 857e492..bbeb0c3 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -504,8 +504,11 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
 			if (sym->st_shndx == SHN_ABS) {
 				continue;
 			}
-			if (r_type == R_386_PC32) {
-				/* PC relative relocations don't need to be adjusted */
+			if (r_type == R_386_NONE || r_type == R_386_PC32) {
+				/*
+				 * NONE can be ignored and and PC relative
+				 * relocations don't need to be adjusted.
+				 */
 			}
 			else if (r_type == R_386_32) {
 				/* Visit relocations that need to be adjusted */
-- 
cgit v0.10.2


From f11a377b3f4e897d11f0e8d1fc688667e2f19708 Mon Sep 17 00:00:00 2001
From: David Dillow <dave@thedillows.org>
Date: Fri, 22 May 2009 15:29:34 +0000
Subject: r8169: avoid losing MSI interrupts

The 8169 chip only generates MSI interrupts when all enabled event
sources are quiescent and one or more sources transition to active. If
not all of the active events are acknowledged, or a new event becomes
active while the existing ones are cleared in the handler, we will not
see a new interrupt.

The current interrupt handler masks off the Rx and Tx events once the
NAPI handler has been scheduled, which opens a race window in which we
can get another Rx or Tx event and never ACK'ing it, stopping all
activity until the link is reset (ifconfig down/up). Fix this by always
ACK'ing all event sources, and loop in the handler until we have all
sources quiescent.

Signed-off-by: David Dillow <dave@thedillows.org>
Tested-by: Michael Buesch <mb@bu3sch.de>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 0b6e8c8..8247a94 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3554,54 +3554,64 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 	int handled = 0;
 	int status;
 
+	/* loop handling interrupts until we have no new ones or
+	 * we hit a invalid/hotplug case.
+	 */
 	status = RTL_R16(IntrStatus);
+	while (status && status != 0xffff) {
+		handled = 1;
 
-	/* hotplug/major error/no more work/shared irq */
-	if ((status == 0xffff) || !status)
-		goto out;
-
-	handled = 1;
+		/* Handle all of the error cases first. These will reset
+		 * the chip, so just exit the loop.
+		 */
+		if (unlikely(!netif_running(dev))) {
+			rtl8169_asic_down(ioaddr);
+			break;
+		}
 
-	if (unlikely(!netif_running(dev))) {
-		rtl8169_asic_down(ioaddr);
-		goto out;
-	}
+		/* Work around for rx fifo overflow */
+		if (unlikely(status & RxFIFOOver) &&
+		(tp->mac_version == RTL_GIGA_MAC_VER_11)) {
+			netif_stop_queue(dev);
+			rtl8169_tx_timeout(dev);
+			break;
+		}
 
-	status &= tp->intr_mask;
-	RTL_W16(IntrStatus,
-		(status & RxFIFOOver) ? (status | RxOverflow) : status);
+		if (unlikely(status & SYSErr)) {
+			rtl8169_pcierr_interrupt(dev);
+			break;
+		}
 
-	if (!(status & tp->intr_event))
-		goto out;
+		if (status & LinkChg)
+			rtl8169_check_link_status(dev, tp, ioaddr);
 
-	/* Work around for rx fifo overflow */
-	if (unlikely(status & RxFIFOOver) &&
-	    (tp->mac_version == RTL_GIGA_MAC_VER_11)) {
-		netif_stop_queue(dev);
-		rtl8169_tx_timeout(dev);
-		goto out;
-	}
+		/* We need to see the lastest version of tp->intr_mask to
+		 * avoid ignoring an MSI interrupt and having to wait for
+		 * another event which may never come.
+		 */
+		smp_rmb();
+		if (status & tp->intr_mask & tp->napi_event) {
+			RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event);
+			tp->intr_mask = ~tp->napi_event;
+
+			if (likely(napi_schedule_prep(&tp->napi)))
+				__napi_schedule(&tp->napi);
+			else if (netif_msg_intr(tp)) {
+				printk(KERN_INFO "%s: interrupt %04x in poll\n",
+				dev->name, status);
+			}
+		}
 
-	if (unlikely(status & SYSErr)) {
-		rtl8169_pcierr_interrupt(dev);
-		goto out;
+		/* We only get a new MSI interrupt when all active irq
+		 * sources on the chip have been acknowledged. So, ack
+		 * everything we've seen and check if new sources have become
+		 * active to avoid blocking all interrupts from the chip.
+		 */
+		RTL_W16(IntrStatus,
+			(status & RxFIFOOver) ? (status | RxOverflow) : status);
+		status = RTL_R16(IntrStatus);
 	}
 
-	if (status & LinkChg)
-		rtl8169_check_link_status(dev, tp, ioaddr);
-
-	if (status & tp->napi_event) {
-		RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event);
-		tp->intr_mask = ~tp->napi_event;
-
-		if (likely(napi_schedule_prep(&tp->napi)))
-			__napi_schedule(&tp->napi);
-		else if (netif_msg_intr(tp)) {
-			printk(KERN_INFO "%s: interrupt %04x in poll\n",
-			       dev->name, status);
-		}
-	}
-out:
 	return IRQ_RETVAL(handled);
 }
 
@@ -3617,13 +3627,15 @@ static int rtl8169_poll(struct napi_struct *napi, int budget)
 
 	if (work_done < budget) {
 		napi_complete(napi);
-		tp->intr_mask = 0xffff;
-		/*
-		 * 20040426: the barrier is not strictly required but the
-		 * behavior of the irq handler could be less predictable
-		 * without it. Btw, the lack of flush for the posted pci
-		 * write is safe - FR
+
+		/* We need for force the visibility of tp->intr_mask
+		 * for other CPUs, as we can loose an MSI interrupt
+		 * and potentially wait for a retransmit timeout if we don't.
+		 * The posted write to IntrMask is safe, as it will
+		 * eventually make it to the chip and we won't loose anything
+		 * until it does.
 		 */
+		tp->intr_mask = 0xffff;
 		smp_wmb();
 		RTL_W16(IntrMask, tp->intr_event);
 	}
-- 
cgit v0.10.2


From 8a7b8cb91f26a671f22cedc7fd54508667f2d9b9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 26 May 2009 16:27:59 +1000
Subject: perf_counter: powerpc: Implement interrupt throttling

This implements interrupt throttling on powerpc.  Since we don't have
individual count enable/disable or interrupt enable/disable controls
per counter, this simply sets the hardware counter to 0, meaning that
it will not interrupt again until it has counted 2^31 counts, which
will take at least 2^30 cycles assuming a maximum of 2 counts per
cycle.  Also, we set counter->hw.period_left to the maximum possible
value (2^63 - 1), so we won't report overflows for this counter for
the forseeable future.

The unthrottle operation restores counter->hw.period_left and the
hardware counter so that we will once again report a counter overflow
after counter->hw.irq_period counts.

[ Impact: new perfcounters robustness feature on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18971.35823.643362.446774@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index fe21b24..f96d55f 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -740,10 +740,37 @@ static void power_pmu_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+	s64 val, left;
+	unsigned long flags;
+
+	if (!counter->hw.idx || !counter->hw.irq_period)
+		return;
+	local_irq_save(flags);
+	perf_disable();
+	power_pmu_read(counter);
+	left = counter->hw.irq_period;
+	val = 0;
+	if (left < 0x80000000L)
+		val = 0x80000000L - left;
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
+	perf_enable();
+	local_irq_restore(flags);
+}
+
 struct pmu power_pmu = {
 	.enable		= power_pmu_enable,
 	.disable	= power_pmu_disable,
 	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
 };
 
 /*
@@ -957,10 +984,6 @@ static void record_and_restart(struct perf_counter *counter, long val,
 		if (left < 0x80000000L)
 			val = 0x80000000L - left;
 	}
-	write_pmc(counter->hw.idx, val);
-	atomic64_set(&counter->hw.prev_count, val);
-	atomic64_set(&counter->hw.period_left, left);
-	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
@@ -983,8 +1006,23 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
 				addr = mfspr(SPRN_SDAR);
 		}
-		perf_counter_overflow(counter, nmi, regs, addr);
+		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+			/*
+			 * Interrupts are coming too fast - throttle them
+			 * by setting the counter to 0, so it will be
+			 * at least 2^30 cycles until the next interrupt
+			 * (assuming each counter counts at most 2 counts
+			 * per cycle).
+			 */
+			val = 0;
+			left = ~0ULL >> 1;
+		}
 	}
+
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
 }
 
 /*
-- 
cgit v0.10.2


From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: perf_counter, x86: Fix APIC NMI programming

My Nehalem box locks up in certain situations (with an
always-asserted NMI causing a lockup) if the PMU LVT
entry is programmed between NMI and IRQ mode with a
high frequency.

Standardize exlusively on NMIs instead.

[ Impact: fix lockup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 189bf9d..ece3813 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	/*
-	 * If privileged enough, allow NMI events:
+	 * Use NMI events all the time:
 	 */
-	hwc->nmi = 0;
-	if (hw_event->nmi) {
-		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		hwc->nmi = 1;
-	}
+	hwc->nmi	= 1;
+	hw_event->nmi	= 1;
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
-	if (unlikely(hwc->nmi))
-		return -1;
-
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
 	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
@@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 
-		if (counter->hw_event.nmi != nmi)
-			continue;
-
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
-- 
cgit v0.10.2


From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: perf_counter, x86: Make NMI lockups more robust

We have a debug check that detects stuck NMIs and returns with
the PMU disabled in the global ctrl MSR - but i managed to trigger
a situation where this was not enough to deassert the NMI.

So clear/reset the full PMU and keep the disable count balanced when
exiting from here. This way the box produces a debug warning but
stays up and is more debuggable.

[ Impact: in case of PMU related bugs, recover more gracefully ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ece3813..2eeaa99 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 		intel_pmu_enable_counter(hwc, idx);
 }
 
+static void intel_pmu_reset(void)
+{
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+
+	local_irq_restore(flags);
+}
+
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -750,6 +774,8 @@ again:
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
 		perf_counter_print_debug();
+		intel_pmu_reset();
+		perf_enable();
 		return 1;
 	}
 
-- 
cgit v0.10.2


From 329d876d6fd326109f191ae0fb2798b8834fb70b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: perf_counter: Initialize ->oncpu properly

This shouldnt matter normally (and i have not seen any
misbehavior), because active counters always have a
proper ->oncpu value - but nevertheless initialize the
field properly to -1.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 070f92d..367299f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3122,6 +3122,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
+	counter->oncpu			= -1;
+
 	get_ctx(ctx);
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
cgit v0.10.2


From 287c129716c2d4b75f3d8dd6e68732a3cd326ee6 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 26 May 2009 07:04:52 +0000
Subject: sh: Add ms7724se (SH7724) board support

This adds preliminary support for the ms7724se solution engine board.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig
index 99e2d47..1c91b1f 100644
--- a/arch/sh/boards/Kconfig
+++ b/arch/sh/boards/Kconfig
@@ -46,6 +46,15 @@ config SH_7722_SOLUTION_ENGINE
 	  Select 7722 SolutionEngine if configuring for a Hitachi SH772
 	  evaluation board.
 
+config SH_7724_SOLUTION_ENGINE
+	bool "SolutionEngine7724"
+	select SOLUTION_ENGINE
+	depends on CPU_SUBTYPE_SH7724
+	select ARCH_REQUIRE_GPIOLIB
+	help
+	  Select 7724 SolutionEngine if configuring for a Hitachi SH7724
+	  evaluation board.
+
 config SH_7751_SOLUTION_ENGINE
 	bool "SolutionEngine7751"
 	select SOLUTION_ENGINE
diff --git a/arch/sh/boards/mach-se/7724/Makefile b/arch/sh/boards/mach-se/7724/Makefile
new file mode 100644
index 0000000..349cbd6
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the HITACHI UL SolutionEngine 7724 specific parts of the kernel
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+obj-y	 := setup.o irq.o
\ No newline at end of file
diff --git a/arch/sh/boards/mach-se/7724/irq.c b/arch/sh/boards/mach-se/7724/irq.c
new file mode 100644
index 0000000..f76cf3b
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/irq.c
@@ -0,0 +1,139 @@
+/*
+ * linux/arch/sh/boards/se/7724/irq.c
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on  linux/arch/sh/boards/se/7722/irq.c
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
+ *
+ * Hitachi UL SolutionEngine 7724 Support.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <mach-se/mach/se7724.h>
+
+struct fpga_irq {
+	unsigned long  sraddr;
+	unsigned long  mraddr;
+	unsigned short mask;
+	unsigned int   base;
+};
+
+static unsigned int fpga2irq(unsigned int irq)
+{
+	if (irq >= IRQ0_BASE &&
+	    irq <= IRQ0_END)
+		return IRQ0_IRQ;
+	else if (irq >= IRQ1_BASE &&
+		 irq <= IRQ1_END)
+		return IRQ1_IRQ;
+	else
+		return IRQ2_IRQ;
+}
+
+static struct fpga_irq get_fpga_irq(unsigned int irq)
+{
+	struct fpga_irq set;
+
+	switch (irq) {
+	case IRQ0_IRQ:
+		set.sraddr = IRQ0_SR;
+		set.mraddr = IRQ0_MR;
+		set.mask   = IRQ0_MASK;
+		set.base   = IRQ0_BASE;
+		break;
+	case IRQ1_IRQ:
+		set.sraddr = IRQ1_SR;
+		set.mraddr = IRQ1_MR;
+		set.mask   = IRQ1_MASK;
+		set.base   = IRQ1_BASE;
+		break;
+	default:
+		set.sraddr = IRQ2_SR;
+		set.mraddr = IRQ2_MR;
+		set.mask   = IRQ2_MASK;
+		set.base   = IRQ2_BASE;
+		break;
+	}
+
+	return set;
+}
+
+static void disable_se7724_irq(unsigned int irq)
+{
+	struct fpga_irq set = get_fpga_irq(fpga2irq(irq));
+	unsigned int bit = irq - set.base;
+	ctrl_outw(ctrl_inw(set.mraddr) | 0x0001 << bit, set.mraddr);
+}
+
+static void enable_se7724_irq(unsigned int irq)
+{
+	struct fpga_irq set = get_fpga_irq(fpga2irq(irq));
+	unsigned int bit = irq - set.base;
+	ctrl_outw(ctrl_inw(set.mraddr) & ~(0x0001 << bit), set.mraddr);
+}
+
+static struct irq_chip se7724_irq_chip __read_mostly = {
+	.name           = "SE7724-FPGA",
+	.mask           = disable_se7724_irq,
+	.unmask         = enable_se7724_irq,
+	.mask_ack       = disable_se7724_irq,
+};
+
+static void se7724_irq_demux(unsigned int irq, struct irq_desc *desc)
+{
+	struct fpga_irq set = get_fpga_irq(irq);
+	unsigned short intv = ctrl_inw(set.sraddr);
+	struct irq_desc *ext_desc;
+	unsigned int ext_irq = set.base;
+
+	intv &= set.mask;
+
+	while (intv) {
+		if (intv & 0x0001) {
+			ext_desc = irq_desc + ext_irq;
+			handle_level_irq(ext_irq, ext_desc);
+		}
+		intv >>= 1;
+		ext_irq++;
+	}
+}
+
+/*
+ * Initialize IRQ setting
+ */
+void __init init_se7724_IRQ(void)
+{
+	int i;
+
+	ctrl_outw(0xffff, IRQ0_MR);  /* mask all */
+	ctrl_outw(0xffff, IRQ1_MR);  /* mask all */
+	ctrl_outw(0xffff, IRQ2_MR);  /* mask all */
+	ctrl_outw(0x0000, IRQ0_SR);  /* clear irq */
+	ctrl_outw(0x0000, IRQ1_SR);  /* clear irq */
+	ctrl_outw(0x0000, IRQ2_SR);  /* clear irq */
+	ctrl_outw(0x002a, IRQ_MODE); /* set irq type */
+
+	for (i = 0; i < SE7724_FPGA_IRQ_NR; i++)
+		set_irq_chip_and_handler_name(SE7724_FPGA_IRQ_BASE + i,
+					      &se7724_irq_chip,
+					      handle_level_irq, "level");
+
+	set_irq_chained_handler(IRQ0_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ0_IRQ, IRQ_TYPE_LEVEL_LOW);
+
+	set_irq_chained_handler(IRQ1_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ1_IRQ, IRQ_TYPE_LEVEL_LOW);
+
+	set_irq_chained_handler(IRQ2_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ2_IRQ, IRQ_TYPE_LEVEL_LOW);
+}
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
new file mode 100644
index 0000000..9cd04bd
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -0,0 +1,448 @@
+/*
+ * linux/arch/sh/boards/se/7724/setup.c
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/mtd/physmap.h>
+#include <linux/delay.h>
+#include <linux/smc91x.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <video/sh_mobile_lcdc.h>
+#include <media/sh_mobile_ceu.h>
+#include <asm/io.h>
+#include <asm/heartbeat.h>
+#include <asm/sh_keysc.h>
+#include <cpu/sh7724.h>
+#include <mach-se/mach/se7724.h>
+
+/*
+ * SWx    1234 5678
+ * ------------------------------------
+ * SW31 : 1001 1100    : default
+ * SW32 : 0111 1111    : use on board flash
+ *
+ * SW41 : abxx xxxx  -> a = 0 : Analog  monitor
+ *                          1 : Digital monitor
+ *                      b = 0 : VGA
+ *                          1 : SVGA
+ */
+
+/* Heartbeat */
+static struct heartbeat_data heartbeat_data = {
+	.regsize = 16,
+};
+
+static struct resource heartbeat_resources[] = {
+	[0] = {
+		.start  = PA_LED,
+		.end    = PA_LED,
+		.flags  = IORESOURCE_MEM,
+	},
+};
+
+static struct platform_device heartbeat_device = {
+	.name           = "heartbeat",
+	.id             = -1,
+	.dev = {
+		.platform_data = &heartbeat_data,
+	},
+	.num_resources  = ARRAY_SIZE(heartbeat_resources),
+	.resource       = heartbeat_resources,
+};
+
+/* LAN91C111 */
+static struct smc91x_platdata smc91x_info = {
+	.flags = SMC91X_USE_16BIT | SMC91X_NOWAIT,
+};
+
+static struct resource smc91x_eth_resources[] = {
+	[0] = {
+		.name   = "SMC91C111" ,
+		.start  = 0x1a300300,
+		.end    = 0x1a30030f,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = IRQ0_SMC,
+		.flags  = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
+	},
+};
+
+static struct platform_device smc91x_eth_device = {
+	.name	= "smc91x",
+	.num_resources  = ARRAY_SIZE(smc91x_eth_resources),
+	.resource       = smc91x_eth_resources,
+	.dev	= {
+		.platform_data	= &smc91x_info,
+	},
+};
+
+/* MTD */
+static struct mtd_partition nor_flash_partitions[] = {
+	{
+		.name = "uboot",
+		.offset = 0,
+		.size = (1 * 1024 * 1024),
+		.mask_flags = MTD_WRITEABLE,	/* Read-only */
+	}, {
+		.name = "kernel",
+		.offset = MTDPART_OFS_APPEND,
+		.size = (2 * 1024 * 1024),
+	}, {
+		.name = "free-area",
+		.offset = MTDPART_OFS_APPEND,
+		.size = MTDPART_SIZ_FULL,
+	},
+};
+
+static struct physmap_flash_data nor_flash_data = {
+	.width		= 2,
+	.parts		= nor_flash_partitions,
+	.nr_parts	= ARRAY_SIZE(nor_flash_partitions),
+};
+
+static struct resource nor_flash_resources[] = {
+	[0] = {
+		.name	= "NOR Flash",
+		.start	= 0x00000000,
+		.end	= 0x01ffffff,
+		.flags	= IORESOURCE_MEM,
+	}
+};
+
+static struct platform_device nor_flash_device = {
+	.name		= "physmap-flash",
+	.resource	= nor_flash_resources,
+	.num_resources	= ARRAY_SIZE(nor_flash_resources),
+	.dev		= {
+		.platform_data = &nor_flash_data,
+	},
+};
+
+/* LCDC */
+static struct sh_mobile_lcdc_info lcdc_info = {
+	.clock_source = LCDC_CLK_EXTERNAL,
+	.ch[0] = {
+		.chan = LCDC_CHAN_MAINLCD,
+		.bpp = 16,
+		.clock_divider = 1,
+		.lcd_cfg = {
+			.name = "LB070WV1",
+			.sync = 0, /* hsync and vsync are active low */
+		},
+		.lcd_size_cfg = { /* 7.0 inch */
+			.width = 152,
+			.height = 91,
+		},
+		.board_cfg = {
+		},
+	}
+};
+
+static struct resource lcdc_resources[] = {
+	[0] = {
+		.name	= "LCDC",
+		.start	= 0xfe940000,
+		.end	= 0xfe941fff,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 106,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device lcdc_device = {
+	.name		= "sh_mobile_lcdc_fb",
+	.num_resources	= ARRAY_SIZE(lcdc_resources),
+	.resource	= lcdc_resources,
+	.dev		= {
+		.platform_data	= &lcdc_info,
+	},
+};
+
+/* CEU0 */
+static struct sh_mobile_ceu_info sh_mobile_ceu0_info = {
+	.flags = SH_CEU_FLAG_USE_8BIT_BUS,
+};
+
+static struct resource ceu0_resources[] = {
+	[0] = {
+		.name	= "CEU0",
+		.start	= 0xfe910000,
+		.end	= 0xfe91009f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 52,
+		.flags  = IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device ceu0_device = {
+	.name		= "sh_mobile_ceu",
+	.id             = 0, /* "ceu0" clock */
+	.num_resources	= ARRAY_SIZE(ceu0_resources),
+	.resource	= ceu0_resources,
+	.dev	= {
+		.platform_data	= &sh_mobile_ceu0_info,
+	},
+};
+
+/* CEU1 */
+static struct sh_mobile_ceu_info sh_mobile_ceu1_info = {
+	.flags = SH_CEU_FLAG_USE_8BIT_BUS,
+};
+
+static struct resource ceu1_resources[] = {
+	[0] = {
+		.name	= "CEU1",
+		.start	= 0xfe914000,
+		.end	= 0xfe91409f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 63,
+		.flags  = IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device ceu1_device = {
+	.name		= "sh_mobile_ceu",
+	.id             = 1, /* "ceu1" clock */
+	.num_resources	= ARRAY_SIZE(ceu1_resources),
+	.resource	= ceu1_resources,
+	.dev	= {
+		.platform_data	= &sh_mobile_ceu1_info,
+	},
+};
+
+/* KEYSC */
+static struct sh_keysc_info keysc_info = {
+	.mode = SH_KEYSC_MODE_1,
+	.scan_timing = 10,
+	.delay = 50,
+	.keycodes = {
+		KEY_1, KEY_2, KEY_3, KEY_4, KEY_5,
+		KEY_6, KEY_7, KEY_8, KEY_9, KEY_A,
+		KEY_B, KEY_C, KEY_D, KEY_E, KEY_F,
+		KEY_G, KEY_H, KEY_I, KEY_K, KEY_L,
+		KEY_M, KEY_N, KEY_O, KEY_P, KEY_Q,
+		KEY_R, KEY_S, KEY_T, KEY_U, KEY_V,
+	},
+};
+
+static struct resource keysc_resources[] = {
+	[0] = {
+		.start  = 0x1a204000,
+		.end    = 0x1a20400f,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = IRQ0_KEY,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device keysc_device = {
+	.name           = "sh_keysc",
+	.id             = 0, /* "keysc0" clock */
+	.num_resources  = ARRAY_SIZE(keysc_resources),
+	.resource       = keysc_resources,
+	.dev	= {
+		.platform_data	= &keysc_info,
+	},
+};
+
+static struct platform_device *ms7724se_devices[] __initdata = {
+	&heartbeat_device,
+	&smc91x_eth_device,
+	&lcdc_device,
+	&nor_flash_device,
+	&ceu0_device,
+	&ceu1_device,
+	&keysc_device,
+};
+
+#define SW4140    0xBA201000
+#define FPGA_OUT  0xBA200400
+#define PORT_HIZA 0xA4050158
+
+#define SW41_A    0x0100
+#define SW41_B    0x0200
+#define SW41_C    0x0400
+#define SW41_D    0x0800
+#define SW41_E    0x1000
+#define SW41_F    0x2000
+#define SW41_G    0x4000
+#define SW41_H    0x8000
+static int __init devices_setup(void)
+{
+	u16 sw = ctrl_inw(SW4140); /* select camera, monitor */
+
+	/* Reset Release */
+	ctrl_outw(ctrl_inw(FPGA_OUT) &
+		  ~((1 << 1)  | /* LAN */
+		    (1 << 6)  | /* VIDEO DAC */
+		    (1 << 12)), /* USB0 */
+		  FPGA_OUT);
+
+	/* enable IRQ 0,1,2 */
+	gpio_request(GPIO_FN_INTC_IRQ0, NULL);
+	gpio_request(GPIO_FN_INTC_IRQ1, NULL);
+	gpio_request(GPIO_FN_INTC_IRQ2, NULL);
+
+	/* enable SCIFA3 */
+	gpio_request(GPIO_FN_SCIF3_I_SCK, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_RXD, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_TXD, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_CTS, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_RTS, NULL);
+
+	/* enable LCDC */
+	gpio_request(GPIO_FN_LCDD23,   NULL);
+	gpio_request(GPIO_FN_LCDD22,   NULL);
+	gpio_request(GPIO_FN_LCDD21,   NULL);
+	gpio_request(GPIO_FN_LCDD20,   NULL);
+	gpio_request(GPIO_FN_LCDD19,   NULL);
+	gpio_request(GPIO_FN_LCDD18,   NULL);
+	gpio_request(GPIO_FN_LCDD17,   NULL);
+	gpio_request(GPIO_FN_LCDD16,   NULL);
+	gpio_request(GPIO_FN_LCDD15,   NULL);
+	gpio_request(GPIO_FN_LCDD14,   NULL);
+	gpio_request(GPIO_FN_LCDD13,   NULL);
+	gpio_request(GPIO_FN_LCDD12,   NULL);
+	gpio_request(GPIO_FN_LCDD11,   NULL);
+	gpio_request(GPIO_FN_LCDD10,   NULL);
+	gpio_request(GPIO_FN_LCDD9,    NULL);
+	gpio_request(GPIO_FN_LCDD8,    NULL);
+	gpio_request(GPIO_FN_LCDD7,    NULL);
+	gpio_request(GPIO_FN_LCDD6,    NULL);
+	gpio_request(GPIO_FN_LCDD5,    NULL);
+	gpio_request(GPIO_FN_LCDD4,    NULL);
+	gpio_request(GPIO_FN_LCDD3,    NULL);
+	gpio_request(GPIO_FN_LCDD2,    NULL);
+	gpio_request(GPIO_FN_LCDD1,    NULL);
+	gpio_request(GPIO_FN_LCDD0,    NULL);
+	gpio_request(GPIO_FN_LCDDISP,  NULL);
+	gpio_request(GPIO_FN_LCDHSYN,  NULL);
+	gpio_request(GPIO_FN_LCDDCK,   NULL);
+	gpio_request(GPIO_FN_LCDVSYN,  NULL);
+	gpio_request(GPIO_FN_LCDDON,   NULL);
+	gpio_request(GPIO_FN_LCDVEPWC, NULL);
+	gpio_request(GPIO_FN_LCDVCPWC, NULL);
+	gpio_request(GPIO_FN_LCDRD,    NULL);
+	gpio_request(GPIO_FN_LCDLCLK,  NULL);
+	ctrl_outw((ctrl_inw(PORT_HIZA) & ~0x0001), PORT_HIZA);
+
+	/* enable CEU0 */
+	gpio_request(GPIO_FN_VIO0_D15, NULL);
+	gpio_request(GPIO_FN_VIO0_D14, NULL);
+	gpio_request(GPIO_FN_VIO0_D13, NULL);
+	gpio_request(GPIO_FN_VIO0_D12, NULL);
+	gpio_request(GPIO_FN_VIO0_D11, NULL);
+	gpio_request(GPIO_FN_VIO0_D10, NULL);
+	gpio_request(GPIO_FN_VIO0_D9,  NULL);
+	gpio_request(GPIO_FN_VIO0_D8,  NULL);
+	gpio_request(GPIO_FN_VIO0_D7,  NULL);
+	gpio_request(GPIO_FN_VIO0_D6,  NULL);
+	gpio_request(GPIO_FN_VIO0_D5,  NULL);
+	gpio_request(GPIO_FN_VIO0_D4,  NULL);
+	gpio_request(GPIO_FN_VIO0_D3,  NULL);
+	gpio_request(GPIO_FN_VIO0_D2,  NULL);
+	gpio_request(GPIO_FN_VIO0_D1,  NULL);
+	gpio_request(GPIO_FN_VIO0_D0,  NULL);
+	gpio_request(GPIO_FN_VIO0_VD,  NULL);
+	gpio_request(GPIO_FN_VIO0_CLK, NULL);
+	gpio_request(GPIO_FN_VIO0_FLD, NULL);
+	gpio_request(GPIO_FN_VIO0_HD,  NULL);
+	platform_resource_setup_memory(&ceu0_device, "ceu", 4 << 20);
+
+	/* enable CEU1 */
+	gpio_request(GPIO_FN_VIO1_D7,  NULL);
+	gpio_request(GPIO_FN_VIO1_D6,  NULL);
+	gpio_request(GPIO_FN_VIO1_D5,  NULL);
+	gpio_request(GPIO_FN_VIO1_D4,  NULL);
+	gpio_request(GPIO_FN_VIO1_D3,  NULL);
+	gpio_request(GPIO_FN_VIO1_D2,  NULL);
+	gpio_request(GPIO_FN_VIO1_D1,  NULL);
+	gpio_request(GPIO_FN_VIO1_D0,  NULL);
+	gpio_request(GPIO_FN_VIO1_FLD, NULL);
+	gpio_request(GPIO_FN_VIO1_HD,  NULL);
+	gpio_request(GPIO_FN_VIO1_VD,  NULL);
+	gpio_request(GPIO_FN_VIO1_CLK, NULL);
+	platform_resource_setup_memory(&ceu1_device, "ceu", 4 << 20);
+
+	/* KEYSC */
+	gpio_request(GPIO_FN_KEYOUT5_IN5, NULL);
+	gpio_request(GPIO_FN_KEYOUT4_IN6, NULL);
+	gpio_request(GPIO_FN_KEYIN4,      NULL);
+	gpio_request(GPIO_FN_KEYIN3,      NULL);
+	gpio_request(GPIO_FN_KEYIN2,      NULL);
+	gpio_request(GPIO_FN_KEYIN1,      NULL);
+	gpio_request(GPIO_FN_KEYIN0,      NULL);
+	gpio_request(GPIO_FN_KEYOUT3,     NULL);
+	gpio_request(GPIO_FN_KEYOUT2,     NULL);
+	gpio_request(GPIO_FN_KEYOUT1,     NULL);
+	gpio_request(GPIO_FN_KEYOUT0,     NULL);
+
+	if (sw & SW41_B) {
+		/* SVGA */
+		lcdc_info.ch[0].lcd_cfg.xres         = 800;
+		lcdc_info.ch[0].lcd_cfg.yres         = 600;
+		lcdc_info.ch[0].lcd_cfg.left_margin  = 142;
+		lcdc_info.ch[0].lcd_cfg.right_margin = 52;
+		lcdc_info.ch[0].lcd_cfg.hsync_len    = 96;
+		lcdc_info.ch[0].lcd_cfg.upper_margin = 24;
+		lcdc_info.ch[0].lcd_cfg.lower_margin = 2;
+		lcdc_info.ch[0].lcd_cfg.vsync_len    = 2;
+	} else {
+		/* VGA */
+		lcdc_info.ch[0].lcd_cfg.xres         = 640;
+		lcdc_info.ch[0].lcd_cfg.yres         = 480;
+		lcdc_info.ch[0].lcd_cfg.left_margin  = 105;
+		lcdc_info.ch[0].lcd_cfg.right_margin = 50;
+		lcdc_info.ch[0].lcd_cfg.hsync_len    = 96;
+		lcdc_info.ch[0].lcd_cfg.upper_margin = 33;
+		lcdc_info.ch[0].lcd_cfg.lower_margin = 10;
+		lcdc_info.ch[0].lcd_cfg.vsync_len    = 2;
+	}
+
+	if (sw & SW41_A) {
+		/* Digital monitor */
+		lcdc_info.ch[0].interface_type = RGB18;
+		lcdc_info.ch[0].flags          = 0;
+	} else {
+		/* Analog monitor */
+		lcdc_info.ch[0].interface_type = RGB24;
+		lcdc_info.ch[0].flags          = LCDC_FLAGS_DWPOL;
+	}
+
+	return platform_add_devices(ms7724se_devices,
+				ARRAY_SIZE(ms7724se_devices));
+}
+device_initcall(devices_setup);
+
+static struct sh_machine_vector mv_ms7724se __initmv = {
+	.mv_name	= "ms7724se",
+	.mv_init_irq	= init_se7724_IRQ,
+	.mv_nr_irqs	= SE7724_FPGA_IRQ_BASE + SE7724_FPGA_IRQ_NR,
+};
diff --git a/arch/sh/boards/mach-se/Makefile b/arch/sh/boards/mach-se/Makefile
index 2de42ba..b537e23 100644
--- a/arch/sh/boards/mach-se/Makefile
+++ b/arch/sh/boards/mach-se/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_SH_7751_SOLUTION_ENGINE)	+= 7751/
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= 7780/
 obj-$(CONFIG_SH_7343_SOLUTION_ENGINE)	+= 7343/
 obj-$(CONFIG_SH_7721_SOLUTION_ENGINE)	+= 7721/
+obj-$(CONFIG_SH_7724_SOLUTION_ENGINE)	+= 7724/
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
new file mode 100644
index 0000000..a029d0fb
--- /dev/null
+++ b/arch/sh/configs/se7724_defconfig
@@ -0,0 +1,1552 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc6
+# Tue May 26 13:18:09 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_GPIO=y
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_CMT=y
+CONFIG_SYS_SUPPORTS_TMU=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_POSIX_MQUEUE is not set
+CONFIG_BSD_PROCESS_ACCT=y
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
+# CONFIG_TASKSTATS is not set
+# CONFIG_AUDIT is not set
+
+#
+# RCU Subsystem
+#
+CONFIG_CLASSIC_RCU=y
+# CONFIG_TREE_RCU is not set
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+# CONFIG_RT_GROUP_SCHED is not set
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+# CONFIG_CGROUPS is not set
+CONFIG_SYSFS_DEPRECATED=y
+CONFIG_SYSFS_DEPRECATED_V2=y
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+CONFIG_UID16=y
+CONFIG_SYSCTL_SYSCALL=y
+# CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_COMPAT_BRK=y
+CONFIG_SLAB=y
+# CONFIG_SLUB is not set
+# CONFIG_SLOB is not set
+# CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_SLABINFO=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+# CONFIG_DEFAULT_DEADLINE is not set
+CONFIG_DEFAULT_CFQ=y
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
+# CONFIG_FREEZER is not set
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+CONFIG_CPU_SHX2=y
+CONFIG_ARCH_SHMOBILE=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+CONFIG_CPU_SUBTYPE_SH7724=y
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+# CONFIG_CPU_SUBTYPE_SH7770 is not set
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_FORCE_MAX_ZONEORDER=11
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x08000000
+CONFIG_29BIT=y
+# CONFIG_X2TLB is not set
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_SPARSEMEM_STATIC=y
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_PTEA=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+CONFIG_SOLUTION_ENGINE=y
+CONFIG_SH_7724_SOLUTION_ENGINE=y
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TIMER_TMU=y
+# CONFIG_SH_TIMER_CMT is not set
+CONFIG_SH_PCLK_FREQ=41666666
+# CONFIG_NO_HZ is not set
+# CONFIG_HIGH_RES_TIMERS is not set
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+# CONFIG_CPU_FREQ is not set
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+CONFIG_HEARTBEAT=y
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+# CONFIG_SCHED_HRTICK is not set
+# CONFIG_KEXEC is not set
+# CONFIG_CRASH_DUMP is not set
+CONFIG_SECCOMP=y
+# CONFIG_PREEMPT_NONE is not set
+# CONFIG_PREEMPT_VOLUNTARY is not set
+CONFIG_PREEMPT=y
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+CONFIG_ENTRY_OFFSET=0x00001000
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=tty1 console=ttySC3,115200 root=/dev/nfs ip=dhcp memchunk.vpu=4m"
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+# CONFIG_PM is not set
+# CONFIG_CPU_IDLE is not set
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+# CONFIG_PACKET_MMAP is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+# CONFIG_IP_MULTIPLE_TABLES is not set
+# CONFIG_IP_ROUTE_MULTIPATH is not set
+# CONFIG_IP_ROUTE_VERBOSE is not set
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
+# CONFIG_INET_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
+# CONFIG_IPV6 is not set
+# CONFIG_NETWORK_SECMARK is not set
+# CONFIG_NETFILTER is not set
+# CONFIG_IP_DCCP is not set
+# CONFIG_IP_SCTP is not set
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_NET_DSA is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
+# CONFIG_NET_SCHED is not set
+# CONFIG_DCB is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_CAN is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+# CONFIG_AF_RXRPC is not set
+CONFIG_WIRELESS=y
+# CONFIG_CFG80211 is not set
+# CONFIG_WIRELESS_OLD_REGULATORY is not set
+# CONFIG_WIRELESS_EXT is not set
+# CONFIG_LIB80211 is not set
+# CONFIG_MAC80211 is not set
+# CONFIG_WIMAX is not set
+# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_CONNECTOR is not set
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=y
+CONFIG_MTD_PARTITIONS=y
+# CONFIG_MTD_TESTS is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_AR7_PARTS is not set
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLKDEVS=y
+CONFIG_MTD_BLOCK=y
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+# CONFIG_RFD_FTL is not set
+# CONFIG_SSFDC is not set
+# CONFIG_MTD_OOPS is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=y
+# CONFIG_MTD_JEDECPROBE is not set
+CONFIG_MTD_GEN_PROBE=y
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+# CONFIG_MTD_CFI_INTELEXT is not set
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_CFI_STAA is not set
+CONFIG_MTD_CFI_UTIL=y
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+# CONFIG_MTD_ABSENT is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_COMPLEX_MAPPINGS is not set
+CONFIG_MTD_PHYSMAP=y
+# CONFIG_MTD_PHYSMAP_COMPAT is not set
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+# CONFIG_MTD_DATAFLASH is not set
+# CONFIG_MTD_M25P80 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLOCK2MTD is not set
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+CONFIG_MTD_NAND=y
+# CONFIG_MTD_NAND_VERIFY_WRITE is not set
+# CONFIG_MTD_NAND_ECC_SMC is not set
+# CONFIG_MTD_NAND_MUSEUM_IDS is not set
+CONFIG_MTD_NAND_IDS=y
+# CONFIG_MTD_NAND_DISKONCHIP is not set
+# CONFIG_MTD_NAND_NANDSIM is not set
+# CONFIG_MTD_NAND_PLATFORM is not set
+# CONFIG_MTD_ALAUDA is not set
+# CONFIG_MTD_ONENAND is not set
+
+#
+# LPDDR flash memory drivers
+#
+# CONFIG_MTD_LPDDR is not set
+
+#
+# UBI - Unsorted block images
+#
+CONFIG_MTD_UBI=y
+CONFIG_MTD_UBI_WL_THRESHOLD=4096
+CONFIG_MTD_UBI_BEB_RESERVE=1
+# CONFIG_MTD_UBI_GLUEBI is not set
+
+#
+# UBI debugging options
+#
+# CONFIG_MTD_UBI_DEBUG is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=4
+CONFIG_BLK_DEV_RAM_SIZE=4096
+# CONFIG_BLK_DEV_XIP is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_MISC_DEVICES=y
+# CONFIG_ICS932S401 is not set
+# CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_ISL29003 is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_AT25 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+CONFIG_SCSI=y
+CONFIG_SCSI_DMA=y
+# CONFIG_SCSI_TGT is not set
+# CONFIG_SCSI_NETLINK is not set
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=y
+# CONFIG_CHR_DEV_ST is not set
+# CONFIG_CHR_DEV_OSST is not set
+# CONFIG_BLK_DEV_SR is not set
+# CONFIG_CHR_DEV_SG is not set
+# CONFIG_CHR_DEV_SCH is not set
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+# CONFIG_SCSI_MULTI_LUN is not set
+# CONFIG_SCSI_CONSTANTS is not set
+# CONFIG_SCSI_LOGGING is not set
+# CONFIG_SCSI_SCAN_ASYNC is not set
+CONFIG_SCSI_WAIT_SCAN=m
+
+#
+# SCSI Transports
+#
+# CONFIG_SCSI_SPI_ATTRS is not set
+# CONFIG_SCSI_FC_ATTRS is not set
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+# CONFIG_SCSI_SAS_LIBSAS is not set
+# CONFIG_SCSI_SRP_ATTRS is not set
+CONFIG_SCSI_LOWLEVEL=y
+# CONFIG_ISCSI_TCP is not set
+# CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
+# CONFIG_SCSI_DEBUG is not set
+# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
+# CONFIG_DUMMY is not set
+# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+# CONFIG_VETH is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+CONFIG_MDIO_BITBANG=y
+# CONFIG_MDIO_GPIO is not set
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_AX88796 is not set
+# CONFIG_STNIC is not set
+CONFIG_SMC91X=y
+# CONFIG_ENC28J60 is not set
+# CONFIG_ETHOC is not set
+# CONFIG_SMC911X is not set
+# CONFIG_SMSC911X is not set
+# CONFIG_DNET is not set
+# CONFIG_IBM_NEW_EMAC_ZMII is not set
+# CONFIG_IBM_NEW_EMAC_RGMII is not set
+# CONFIG_IBM_NEW_EMAC_TAH is not set
+# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
+# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
+# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
+# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
+# CONFIG_B44 is not set
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+
+#
+# Wireless LAN
+#
+# CONFIG_WLAN_PRE80211 is not set
+# CONFIG_WLAN_80211 is not set
+
+#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
+
+#
+# USB Network Adapters
+#
+# CONFIG_USB_CATC is not set
+# CONFIG_USB_KAWETH is not set
+# CONFIG_USB_PEGASUS is not set
+# CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET is not set
+# CONFIG_WAN is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+# CONFIG_ISDN is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
+
+#
+# Userland interfaces
+#
+# CONFIG_INPUT_MOUSEDEV is not set
+# CONFIG_INPUT_JOYDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+# CONFIG_KEYBOARD_ATKBD is not set
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_STOWAWAY is not set
+# CONFIG_KEYBOARD_GPIO is not set
+CONFIG_KEYBOARD_SH_KEYSC=y
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TABLET is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_DEVKMEM=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+# CONFIG_SERIAL_MAX3100 is not set
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_IPMI_HANDLER is not set
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_GPIO is not set
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+# CONFIG_I2C_TINY_USB is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+CONFIG_SPI=y
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BITBANG=y
+# CONFIG_SPI_GPIO is not set
+# CONFIG_SPI_SH_SCI is not set
+
+#
+# SPI Protocol Masters
+#
+# CONFIG_SPI_SPIDEV is not set
+# CONFIG_SPI_TLE62X0 is not set
+CONFIG_ARCH_REQUIRE_GPIOLIB=y
+CONFIG_GPIOLIB=y
+# CONFIG_GPIO_SYSFS is not set
+
+#
+# Memory mapped GPIO expanders:
+#
+
+#
+# I2C GPIO expanders:
+#
+# CONFIG_GPIO_MAX732X is not set
+# CONFIG_GPIO_PCA953X is not set
+# CONFIG_GPIO_PCF857X is not set
+
+#
+# PCI GPIO expanders:
+#
+
+#
+# SPI GPIO expanders:
+#
+# CONFIG_GPIO_MAX7301 is not set
+# CONFIG_GPIO_MCP23S08 is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TPS65010 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+CONFIG_VIDEO_DEV=y
+CONFIG_VIDEO_V4L2_COMMON=y
+# CONFIG_VIDEO_ALLOW_V4L1 is not set
+CONFIG_VIDEO_V4L1_COMPAT=y
+# CONFIG_DVB_CORE is not set
+CONFIG_VIDEO_MEDIA=y
+
+#
+# Multimedia drivers
+#
+# CONFIG_MEDIA_ATTACH is not set
+CONFIG_MEDIA_TUNER=y
+# CONFIG_MEDIA_TUNER_CUSTOMISE is not set
+CONFIG_MEDIA_TUNER_SIMPLE=y
+CONFIG_MEDIA_TUNER_TDA8290=y
+CONFIG_MEDIA_TUNER_TDA9887=y
+CONFIG_MEDIA_TUNER_TEA5761=y
+CONFIG_MEDIA_TUNER_TEA5767=y
+CONFIG_MEDIA_TUNER_MT20XX=y
+CONFIG_MEDIA_TUNER_XC2028=y
+CONFIG_MEDIA_TUNER_XC5000=y
+CONFIG_MEDIA_TUNER_MC44S803=y
+CONFIG_VIDEO_V4L2=y
+CONFIG_VIDEOBUF_GEN=y
+CONFIG_VIDEOBUF_DMA_CONTIG=y
+CONFIG_VIDEO_CAPTURE_DRIVERS=y
+# CONFIG_VIDEO_ADV_DEBUG is not set
+# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
+CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
+# CONFIG_VIDEO_VIVI is not set
+# CONFIG_VIDEO_SAA5246A is not set
+# CONFIG_VIDEO_SAA5249 is not set
+CONFIG_SOC_CAMERA=y
+# CONFIG_SOC_CAMERA_MT9M001 is not set
+# CONFIG_SOC_CAMERA_MT9M111 is not set
+# CONFIG_SOC_CAMERA_MT9T031 is not set
+# CONFIG_SOC_CAMERA_MT9V022 is not set
+# CONFIG_SOC_CAMERA_TW9910 is not set
+# CONFIG_SOC_CAMERA_PLATFORM is not set
+CONFIG_SOC_CAMERA_OV772X=y
+CONFIG_VIDEO_SH_MOBILE_CEU=y
+CONFIG_V4L_USB_DRIVERS=y
+# CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
+CONFIG_USB_GSPCA=m
+# CONFIG_USB_M5602 is not set
+# CONFIG_USB_STV06XX is not set
+# CONFIG_USB_GSPCA_CONEX is not set
+# CONFIG_USB_GSPCA_ETOMS is not set
+# CONFIG_USB_GSPCA_FINEPIX is not set
+# CONFIG_USB_GSPCA_MARS is not set
+# CONFIG_USB_GSPCA_MR97310A is not set
+# CONFIG_USB_GSPCA_OV519 is not set
+# CONFIG_USB_GSPCA_OV534 is not set
+# CONFIG_USB_GSPCA_PAC207 is not set
+# CONFIG_USB_GSPCA_PAC7311 is not set
+# CONFIG_USB_GSPCA_SONIXB is not set
+# CONFIG_USB_GSPCA_SONIXJ is not set
+# CONFIG_USB_GSPCA_SPCA500 is not set
+# CONFIG_USB_GSPCA_SPCA501 is not set
+# CONFIG_USB_GSPCA_SPCA505 is not set
+# CONFIG_USB_GSPCA_SPCA506 is not set
+# CONFIG_USB_GSPCA_SPCA508 is not set
+# CONFIG_USB_GSPCA_SPCA561 is not set
+# CONFIG_USB_GSPCA_SQ905 is not set
+# CONFIG_USB_GSPCA_SQ905C is not set
+# CONFIG_USB_GSPCA_STK014 is not set
+# CONFIG_USB_GSPCA_SUNPLUS is not set
+# CONFIG_USB_GSPCA_T613 is not set
+# CONFIG_USB_GSPCA_TV8532 is not set
+# CONFIG_USB_GSPCA_VC032X is not set
+# CONFIG_USB_GSPCA_ZC3XX is not set
+# CONFIG_VIDEO_PVRUSB2 is not set
+# CONFIG_VIDEO_HDPVR is not set
+# CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_CX231XX is not set
+# CONFIG_VIDEO_USBVISION is not set
+# CONFIG_USB_ET61X251 is not set
+# CONFIG_USB_SN9C102 is not set
+# CONFIG_USB_ZC0301 is not set
+CONFIG_USB_PWC_INPUT_EVDEV=y
+# CONFIG_USB_ZR364XX is not set
+# CONFIG_USB_STKWEBCAM is not set
+# CONFIG_USB_S2255 is not set
+# CONFIG_RADIO_ADAPTERS is not set
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+CONFIG_FB=y
+# CONFIG_FIRMWARE_EDID is not set
+# CONFIG_FB_DDC is not set
+# CONFIG_FB_BOOT_VESA_SUPPORT is not set
+# CONFIG_FB_CFB_FILLRECT is not set
+# CONFIG_FB_CFB_COPYAREA is not set
+# CONFIG_FB_CFB_IMAGEBLIT is not set
+# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
+CONFIG_FB_SYS_FILLRECT=y
+CONFIG_FB_SYS_COPYAREA=y
+CONFIG_FB_SYS_IMAGEBLIT=y
+# CONFIG_FB_FOREIGN_ENDIAN is not set
+CONFIG_FB_SYS_FOPS=y
+CONFIG_FB_DEFERRED_IO=y
+# CONFIG_FB_SVGALIB is not set
+# CONFIG_FB_MACMODES is not set
+# CONFIG_FB_BACKLIGHT is not set
+# CONFIG_FB_MODE_HELPERS is not set
+# CONFIG_FB_TILEBLITTING is not set
+
+#
+# Frame buffer hardware drivers
+#
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_SH_MOBILE_LCDC=y
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+
+#
+# Console display driver support
+#
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+# CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY is not set
+# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+# CONFIG_LOGO_LINUX_CLUT224 is not set
+# CONFIG_LOGO_SUPERH_MONO is not set
+# CONFIG_LOGO_SUPERH_VGA16 is not set
+CONFIG_LOGO_SUPERH_CLUT224=y
+# CONFIG_SOUND is not set
+CONFIG_HID_SUPPORT=y
+CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
+# CONFIG_HIDRAW is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+# CONFIG_HID_PID is not set
+# CONFIG_USB_HIDDEV is not set
+
+#
+# Special HID drivers
+#
+# CONFIG_HID_A4TECH is not set
+# CONFIG_HID_APPLE is not set
+# CONFIG_HID_BELKIN is not set
+# CONFIG_HID_CHERRY is not set
+# CONFIG_HID_CHICONY is not set
+# CONFIG_HID_CYPRESS is not set
+# CONFIG_DRAGONRISE_FF is not set
+# CONFIG_HID_EZKEY is not set
+# CONFIG_HID_KYE is not set
+# CONFIG_HID_GYRATION is not set
+# CONFIG_HID_KENSINGTON is not set
+# CONFIG_HID_LOGITECH is not set
+# CONFIG_HID_MICROSOFT is not set
+# CONFIG_HID_MONTEREY is not set
+# CONFIG_HID_NTRIG is not set
+# CONFIG_HID_PANTHERLORD is not set
+# CONFIG_HID_PETALYNX is not set
+# CONFIG_HID_SAMSUNG is not set
+# CONFIG_HID_SONY is not set
+# CONFIG_HID_SUNPLUS is not set
+# CONFIG_GREENASIA_FF is not set
+# CONFIG_HID_TOPSEED is not set
+# CONFIG_THRUSTMASTER_FF is not set
+# CONFIG_ZEROPLUS_FF is not set
+CONFIG_USB_SUPPORT=y
+CONFIG_USB_ARCH_HAS_HCD=y
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+# CONFIG_USB_ARCH_HAS_EHCI is not set
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set
+
+#
+# Miscellaneous USB options
+#
+# CONFIG_USB_DEVICEFS is not set
+CONFIG_USB_DEVICE_CLASS=y
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_OTG is not set
+# CONFIG_USB_OTG_WHITELIST is not set
+# CONFIG_USB_OTG_BLACKLIST_HUB is not set
+CONFIG_USB_MON=y
+# CONFIG_USB_WUSB is not set
+# CONFIG_USB_WUSB_CBAF is not set
+
+#
+# USB Host Controller Drivers
+#
+# CONFIG_USB_C67X00_HCD is not set
+# CONFIG_USB_OXU210HP_HCD is not set
+# CONFIG_USB_ISP116X_HCD is not set
+# CONFIG_USB_ISP1760_HCD is not set
+# CONFIG_USB_SL811_HCD is not set
+CONFIG_USB_R8A66597_HCD=y
+# CONFIG_USB_HWA_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_ACM is not set
+# CONFIG_USB_PRINTER is not set
+# CONFIG_USB_WDM is not set
+# CONFIG_USB_TMC is not set
+
+#
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
+#
+
+#
+# also be needed; see USB_STORAGE Help for more info
+#
+CONFIG_USB_STORAGE=y
+# CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_DATAFAB is not set
+# CONFIG_USB_STORAGE_FREECOM is not set
+# CONFIG_USB_STORAGE_ISD200 is not set
+# CONFIG_USB_STORAGE_USBAT is not set
+# CONFIG_USB_STORAGE_SDDR09 is not set
+# CONFIG_USB_STORAGE_SDDR55 is not set
+# CONFIG_USB_STORAGE_JUMPSHOT is not set
+# CONFIG_USB_STORAGE_ALAUDA is not set
+# CONFIG_USB_STORAGE_ONETOUCH is not set
+# CONFIG_USB_STORAGE_KARMA is not set
+# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Imaging devices
+#
+# CONFIG_USB_MDC800 is not set
+# CONFIG_USB_MICROTEK is not set
+
+#
+# USB port drivers
+#
+# CONFIG_USB_SERIAL is not set
+
+#
+# USB Miscellaneous drivers
+#
+# CONFIG_USB_EMI62 is not set
+# CONFIG_USB_EMI26 is not set
+# CONFIG_USB_ADUTUX is not set
+# CONFIG_USB_SEVSEG is not set
+# CONFIG_USB_RIO500 is not set
+# CONFIG_USB_LEGOTOWER is not set
+# CONFIG_USB_LCD is not set
+# CONFIG_USB_BERRY_CHARGE is not set
+# CONFIG_USB_LED is not set
+# CONFIG_USB_CYPRESS_CY7C63 is not set
+# CONFIG_USB_CYTHERM is not set
+# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_FTDI_ELAN is not set
+# CONFIG_USB_APPLEDISPLAY is not set
+# CONFIG_USB_LD is not set
+# CONFIG_USB_TRANCEVIBRATOR is not set
+# CONFIG_USB_IOWARRIOR is not set
+# CONFIG_USB_ISIGHTFW is not set
+# CONFIG_USB_VST is not set
+# CONFIG_USB_GADGET is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_USB_GPIO_VBUS is not set
+# CONFIG_NOP_USB_XCEIV is not set
+CONFIG_MMC=y
+# CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD/SDIO Card Drivers
+#
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_BOUNCE=y
+# CONFIG_SDIO_UART is not set
+# CONFIG_MMC_TEST is not set
+
+#
+# MMC/SD/SDIO Host Controller Drivers
+#
+# CONFIG_MMC_SDHCI is not set
+CONFIG_MMC_SPI=y
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=y
+CONFIG_RTC_INTF_PROC=y
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+CONFIG_RTC_DRV_PCF8563=y
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_DS3234 is not set
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+# CONFIG_RTC_DRV_SH is not set
+# CONFIG_RTC_DRV_GENERIC is not set
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+# CONFIG_EXT4_FS is not set
+CONFIG_JBD=y
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
+CONFIG_DNOTIFY=y
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+# CONFIG_MSDOS_FS is not set
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_TMPFS_POSIX_ACL is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_CONFIGFS_FS is not set
+CONFIG_MISC_FILESYSTEMS=y
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+# CONFIG_UBIFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_OMFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+CONFIG_ROOT_NFS=y
+CONFIG_NFSD=y
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_V3_ACL is not set
+# CONFIG_NFSD_V4 is not set
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+CONFIG_NLS_CODEPAGE_932=y
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+# CONFIG_NLS_UTF8 is not set
+# CONFIG_DLM is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+CONFIG_ENABLE_WARN_DEPRECATED=y
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+# CONFIG_DEBUG_FS is not set
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+CONFIG_SYSCTL_SYSCALL_CHECK=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+CONFIG_CRYPTO=y
+
+#
+# Crypto core or helper
+#
+# CONFIG_CRYPTO_FIPS is not set
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
+# CONFIG_CRYPTO_GF128MUL is not set
+# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
+# CONFIG_CRYPTO_CRYPTD is not set
+# CONFIG_CRYPTO_AUTHENC is not set
+# CONFIG_CRYPTO_TEST is not set
+
+#
+# Authenticated Encryption with Associated Data
+#
+# CONFIG_CRYPTO_CCM is not set
+# CONFIG_CRYPTO_GCM is not set
+# CONFIG_CRYPTO_SEQIV is not set
+
+#
+# Block modes
+#
+CONFIG_CRYPTO_CBC=y
+# CONFIG_CRYPTO_CTR is not set
+# CONFIG_CRYPTO_CTS is not set
+# CONFIG_CRYPTO_ECB is not set
+# CONFIG_CRYPTO_LRW is not set
+# CONFIG_CRYPTO_PCBC is not set
+# CONFIG_CRYPTO_XTS is not set
+
+#
+# Hash modes
+#
+# CONFIG_CRYPTO_HMAC is not set
+# CONFIG_CRYPTO_XCBC is not set
+
+#
+# Digest
+#
+# CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_MD4 is not set
+# CONFIG_CRYPTO_MD5 is not set
+# CONFIG_CRYPTO_MICHAEL_MIC is not set
+# CONFIG_CRYPTO_RMD128 is not set
+# CONFIG_CRYPTO_RMD160 is not set
+# CONFIG_CRYPTO_RMD256 is not set
+# CONFIG_CRYPTO_RMD320 is not set
+# CONFIG_CRYPTO_SHA1 is not set
+# CONFIG_CRYPTO_SHA256 is not set
+# CONFIG_CRYPTO_SHA512 is not set
+# CONFIG_CRYPTO_TGR192 is not set
+# CONFIG_CRYPTO_WP512 is not set
+
+#
+# Ciphers
+#
+# CONFIG_CRYPTO_AES is not set
+# CONFIG_CRYPTO_ANUBIS is not set
+# CONFIG_CRYPTO_ARC4 is not set
+# CONFIG_CRYPTO_BLOWFISH is not set
+# CONFIG_CRYPTO_CAMELLIA is not set
+# CONFIG_CRYPTO_CAST5 is not set
+# CONFIG_CRYPTO_CAST6 is not set
+# CONFIG_CRYPTO_DES is not set
+# CONFIG_CRYPTO_FCRYPT is not set
+# CONFIG_CRYPTO_KHAZAD is not set
+# CONFIG_CRYPTO_SALSA20 is not set
+# CONFIG_CRYPTO_SEED is not set
+# CONFIG_CRYPTO_SERPENT is not set
+# CONFIG_CRYPTO_TEA is not set
+# CONFIG_CRYPTO_TWOFISH is not set
+
+#
+# Compression
+#
+# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
+# CONFIG_CRYPTO_LZO is not set
+
+#
+# Random Number Generation
+#
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
+
+#
+# Library routines
+#
+CONFIG_BITREVERSE=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+CONFIG_CRC_T10DIF=y
+CONFIG_CRC_ITU_T=y
+CONFIG_CRC32=y
+CONFIG_CRC7=y
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/sh/include/mach-se/mach/se7724.h b/arch/sh/include/mach-se/mach/se7724.h
new file mode 100644
index 0000000..74164b6
--- /dev/null
+++ b/arch/sh/include/mach-se/mach/se7724.h
@@ -0,0 +1,67 @@
+#ifndef __ASM_SH_SE7724_H
+#define __ASM_SH_SE7724_H
+
+/*
+ * linux/include/asm-sh/se7724.h
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Hitachi UL SolutionEngine 7724 Support.
+ *
+ * Based on se7722.h
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ */
+#include <asm/addrspace.h>
+
+#define PA_LED		(0xba203000)	/* 8bit LED */
+#define IRQ_MODE	(0xba200010)
+#define IRQ0_SR		(0xba200014)
+#define IRQ1_SR		(0xba200018)
+#define IRQ2_SR		(0xba20001c)
+#define IRQ0_MR		(0xba200020)
+#define IRQ1_MR		(0xba200024)
+#define IRQ2_MR		(0xba200028)
+
+/* IRQ */
+#define IRQ0_IRQ        32
+#define IRQ1_IRQ        33
+#define IRQ2_IRQ        34
+
+/* Bits in IRQ012 registers */
+#define SE7724_FPGA_IRQ_BASE	220
+
+/* IRQ0 */
+#define IRQ0_BASE	SE7724_FPGA_IRQ_BASE
+#define IRQ0_KEY	(IRQ0_BASE + 12)
+#define IRQ0_RMII	(IRQ0_BASE + 13)
+#define IRQ0_SMC	(IRQ0_BASE + 14)
+#define IRQ0_MASK	0x7fff
+#define IRQ0_END	IRQ0_SMC
+/* IRQ1 */
+#define IRQ1_BASE	(IRQ0_END + 1)
+#define IRQ1_TS		(IRQ1_BASE + 0)
+#define IRQ1_MASK	0x0001
+#define IRQ1_END	IRQ1_TS
+/* IRQ2 */
+#define IRQ2_BASE	(IRQ1_END + 1)
+#define IRQ2_USB0	(IRQ1_BASE + 0)
+#define IRQ2_USB1	(IRQ1_BASE + 1)
+#define IRQ2_MASK	0x0003
+#define IRQ2_END	IRQ2_USB1
+
+#define SE7724_FPGA_IRQ_NR	(IRQ2_END - IRQ0_BASE)
+
+/* arch/sh/boards/se/7724/irq.c */
+void init_se7724_IRQ(void);
+
+#define __IO_PREFIX		se7724
+#include <asm/io_generic.h>
+
+#endif  /* __ASM_SH_SE7724_H */
-- 
cgit v0.10.2


From 5b25ab29bad3114f798b136b4147f255a5d5742f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 17:02:18 +0900
Subject: sh: Record ms7724se in mach-types.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/tools/mach-types b/arch/sh/tools/mach-types
index 8477b5d..fec3a53 100644
--- a/arch/sh/tools/mach-types
+++ b/arch/sh/tools/mach-types
@@ -23,6 +23,7 @@ HD64461			HD64461
 7619SE			SH_7619_SOLUTION_ENGINE
 7721SE			SH_7721_SOLUTION_ENGINE
 7722SE			SH_7722_SOLUTION_ENGINE
+7724SE			SH_7724_SOLUTION_ENGINE
 7751SE			SH_7751_SOLUTION_ENGINE
 7780SE			SH_7780_SOLUTION_ENGINE
 7751SYSTEMH		SH_7751_SYSTEMH
-- 
cgit v0.10.2


From 69aa48ab82e17299efe2be6c21795945731a6c17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:02:27 +0200
Subject: perf record: Straighten out argv types

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 1b19f18..f225efa 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -191,7 +191,7 @@ static void display_help(void)
 	exit(0);
 }
 
-static void process_options(int argc, const char *argv[])
+static void process_options(int argc, char * const argv[])
 {
 	int error = 0, counter;
 
@@ -538,7 +538,7 @@ static void open_counters(int cpu, pid_t pid)
 	nr_cpu++;
 }
 
-int cmd_record(int argc, const char **argv)
+int cmd_record(int argc, char * const argv[])
 {
 	int i, counter;
 	pid_t pid;
-- 
cgit v0.10.2


From 4e97ddf09ee3ce715fc334399bae4cc0c0a13057 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 10:07:44 +0200
Subject: perf stat: Remove unused variable

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 88c70be9..c1053d8 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -541,8 +541,6 @@ static void skip_signal(int signo)
 
 int cmd_stat(int argc, char **argv, const char *prefix)
 {
-	sigset_t blocked;
-
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	process_options(argc, argv);
-- 
cgit v0.10.2


From c9904dd15922f349b5f06839e34b1723d4a75940 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:19 +0000
Subject: sh: add pll_clk to sh7785

This patch converts the sh7785 pll implementation from the
all-in-one code in frqmr_recalc() and frqmr_build_rate_table()
to a separate struct clk. This allows us to remove the processor
specific multiplier and use generic rate table functions.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index cf042b5..7021ab0 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -56,12 +56,7 @@ static unsigned long frqmr_recalc(struct clk *clk)
 
 	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-	/*
-	 * XXX: PLL1 multiplier is locked for the default clock mode,
-	 * when mode pin detection and configuration support is added,
-	 * select the multiplier dynamically.
-	 */
-	return clk->parent->rate * 36 / div2[idx];
+	return clk->parent->rate / div2[idx];
 }
 
 static void frqmr_build_rate_table(struct clk *clk)
@@ -75,7 +70,7 @@ static void frqmr_build_rate_table(struct clk *clk)
 
 		data->freq_table[entry].index = entry;
 		data->freq_table[entry].frequency =
-			clk->parent->rate * 36 / div2[i];
+			clk->parent->rate / div2[i];
 
 		entry++;
 	}
@@ -136,6 +131,20 @@ static struct clk_ops frqmr_clk_ops = {
 	.round_rate		= frqmr_round_rate,
 };
 
+static unsigned long pll_recalc(struct clk *clk)
+{
+	/*
+	 * XXX: PLL1 multiplier is locked for the default clock mode,
+	 * when mode pin detection and configuration support is added,
+	 * select the multiplier dynamically.
+	 */
+	return clk->parent->rate * 36;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
 /*
  * Default rate for the root input clock, reset this with clk_set_rate()
  * from the platform code.
@@ -146,11 +155,19 @@ static struct clk extal_clk = {
 	.rate		= 33333333,
 };
 
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
 static struct clk cpu_clk = {
 	.name		= "cpu_clk",		/* Ick */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &ifc_data,
 };
@@ -159,7 +176,7 @@ static struct clk shyway_clk = {
 	.name		= "shyway_clk",		/* SHck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &sfc_data,
 };
@@ -168,7 +185,7 @@ static struct clk peripheral_clk = {
 	.name		= "peripheral_clk",	/* Pck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &pfc_data,
 };
@@ -177,7 +194,7 @@ static struct clk ddr_clk = {
 	.name		= "ddr_clk",		/* DDRck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &mfc_data,
 };
@@ -186,7 +203,7 @@ static struct clk bus_clk = {
 	.name		= "bus_clk",		/* Bck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &bfc_data,
 };
@@ -195,7 +212,7 @@ static struct clk ga_clk = {
 	.name		= "ga_clk",		/* GAck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.priv		= &s2fc_data,
 };
 
@@ -203,7 +220,7 @@ static struct clk du_clk = {
 	.name		= "du_clk",		/* DUck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.priv		= &s3fc_data,
 };
 
@@ -211,13 +228,14 @@ static struct clk umem_clk = {
 	.name		= "umem_clk",		/* uck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &ufc_data,
 };
 
 static struct clk *clks[] = {
 	&extal_clk,
+	&pll_clk,
 	&cpu_clk,
 	&shyway_clk,
 	&peripheral_clk,
-- 
cgit v0.10.2


From c94a85746f7bdc13035acdf88c130d7b6fa41bde Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:28 +0000
Subject: sh: add shared clock framework frequency table code

Add SuperH-specific clock framework helper functions:
- clk_rate_table_build() - build cpufreq table from divisors/multipliers
- clk_rate_table_round() - use cpufreq table to find matching frequency

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 64c93cb..60d5d2b 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -100,4 +100,22 @@ enum clk_sh_algo_id {
 	IP_N1,
 };
 
+struct clk_div_mult_table {
+	unsigned int *divisors;
+	unsigned int nr_divisors;
+	unsigned int *multipliers;
+	unsigned int nr_multipliers;
+};
+
+struct cpufreq_frequency_table;
+void clk_rate_table_build(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  int nr_freqs,
+			  struct clk_div_mult_table *src_table,
+			  unsigned long *bitmap);
+
+long clk_rate_table_round(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  unsigned long rate);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 012d234..59764d6 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -29,6 +29,7 @@
 #include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/debugfs.h>
+#include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/machvec.h>
 
@@ -36,6 +37,80 @@ static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
 static DEFINE_MUTEX(clock_list_sem);
 
+void clk_rate_table_build(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  int nr_freqs,
+			  struct clk_div_mult_table *src_table,
+			  unsigned long *bitmap)
+{
+	unsigned long mult, div;
+	unsigned long freq;
+	int i;
+
+	for (i = 0; i < nr_freqs; i++) {
+		div = 1;
+		mult = 1;
+
+		if (src_table->divisors && i < src_table->nr_divisors)
+			div = src_table->divisors[i];
+
+		if (src_table->multipliers && i < src_table->nr_multipliers)
+			mult = src_table->multipliers[i];
+
+		if (!div || !mult || (bitmap && !test_bit(i, bitmap)))
+			freq = CPUFREQ_ENTRY_INVALID;
+		else
+			freq = clk->parent->rate * mult / div;
+
+		freq_table[i].index = i;
+		freq_table[i].frequency = freq;
+	}
+
+	/* Termination entry */
+	freq_table[i].index = i;
+	freq_table[i].frequency = CPUFREQ_TABLE_END;
+}
+
+long clk_rate_table_round(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  unsigned long rate)
+{
+	unsigned long rate_error, rate_error_prev = ~0UL;
+	unsigned long rate_best_fit = rate;
+	unsigned long highest, lowest;
+	int i;
+
+	highest = lowest = 0;
+
+	for (i = 0; freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq > highest)
+			highest = freq;
+		if (freq < lowest)
+			lowest = freq;
+
+		rate_error = abs(freq - rate);
+		if (rate_error < rate_error_prev) {
+			rate_best_fit = freq;
+			rate_error_prev = rate_error;
+		}
+
+		if (rate_error == 0)
+			break;
+	}
+
+	if (rate >= highest)
+		rate_best_fit = highest;
+	if (rate <= lowest)
+		rate_best_fit = lowest;
+
+	return rate_best_fit;
+}
+
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-- 
cgit v0.10.2


From df109e630f82de63ec82eebbfec2a57852517f28 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:36 +0000
Subject: sh: use shared frequency tables on sh7785

This patch converts the sh7785 clock code to make use
of clk_rate_table_build() and clk_rate_table_round().
The ->build_rate_table() callback is removed, the
table building is instead handled in ->recalc().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 7021ab0..a4a9bcb 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -19,6 +19,12 @@
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
+
+static struct clk_div_mult_table cpg_div = {
+	.divisors = div2,
+	.nr_divisors = ARRAY_SIZE(div2),
+};
+
 struct clk_priv {
 	unsigned int			shift;
 
@@ -52,82 +58,23 @@ FRQMR_CLK_DATA(ifc, 28, 0x000e);
 static unsigned long frqmr_recalc(struct clk *clk)
 {
 	struct clk_priv *data = clk->priv;
-	unsigned int idx;
-
-	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
-
-	return clk->parent->rate / div2[idx];
-}
-
-static void frqmr_build_rate_table(struct clk *clk)
-{
-	struct clk_priv *data = clk->priv;
-	int i, entry;
-
-	for (i = entry = 0; i < ARRAY_SIZE(div2); i++) {
-		if ((data->div_bitmap & (1 << i)) == 0)
-			continue;
+	unsigned int idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-		data->freq_table[entry].index = entry;
-		data->freq_table[entry].frequency =
-			clk->parent->rate / div2[i];
-
-		entry++;
-	}
-
-	if (entry == 0) {
-		pr_warning("clkfwk: failed to build frequency table "
-			   "for \"%s\" clk!\n", clk->name);
-		return;
-	}
-
-	/* Termination entry */
-	data->freq_table[entry].index = entry;
-	data->freq_table[entry].frequency = CPUFREQ_TABLE_END;
+	clk_rate_table_build(clk, data->freq_table, ARRAY_SIZE(div2),
+			     &cpg_div, &data->div_bitmap);
+	
+	return data->freq_table[idx].frequency;
 }
 
 static long frqmr_round_rate(struct clk *clk, unsigned long rate)
 {
 	struct clk_priv *data = clk->priv;
-	unsigned long rate_error, rate_error_prev = ~0UL;
-	unsigned long rate_best_fit = rate;
-	unsigned long highest, lowest;
-	int i;
-
-	highest = lowest = 0;
-
-	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
-		unsigned long freq = data->freq_table[i].frequency;
-
-		if (freq == CPUFREQ_ENTRY_INVALID)
-			continue;
-
-		if (freq > highest)
-			highest = freq;
-		if (freq < lowest)
-			lowest = freq;
-
-		rate_error = abs(freq - rate);
-		if (rate_error < rate_error_prev) {
-			rate_best_fit = freq;
-			rate_error_prev = rate_error;
-		}
-
-		if (rate_error == 0)
-			break;
-	}
-
-	if (rate >= highest)
-		rate_best_fit = highest;
-	if (rate <= lowest)
-		rate_best_fit = lowest;
 
-	return rate_best_fit;
+	return clk_rate_table_round(clk, data->freq_table, rate);
 }
 
 static struct clk_ops frqmr_clk_ops = {
 	.recalc			= frqmr_recalc,
-	.build_rate_table	= frqmr_build_rate_table,
 	.round_rate		= frqmr_round_rate,
 };
 
-- 
cgit v0.10.2


From 61ce5393e4c8914c46ec99cbda76823515109709 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:45 +0000
Subject: sh: remove clk_ops->build_rate_table()

This patch removes the ->build_rate_table() callback,
->recalc() may instead be used for this purpose.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 60d5d2b..aa9480d 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -16,7 +16,6 @@ struct clk_ops {
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
-	void (*build_rate_table)(struct clk *clk);
 };
 
 struct clk {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 59764d6..aa0fd08 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -138,8 +138,6 @@ void propagate_rate(struct clk *tclk)
 	list_for_each_entry(clkp, &tclk->children, sibling) {
 		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
-		if (clkp->ops && clkp->ops->build_rate_table)
-			clkp->ops->build_rate_table(clkp);
 
 		propagate_rate(clkp);
 	}
-- 
cgit v0.10.2


From f9e2b97dc2cf832ccceea2a2e6eccc4bc9af72cf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 17:12:20 +0900
Subject: sh: Add a KBUILD_DEFCONFIG for sh64.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index a47fbd2..0fe35cf 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -82,7 +82,6 @@ defaultimage-$(CONFIG_SH_7206_SOLUTION_ENGINE)	:= vmlinux
 defaultimage-$(CONFIG_SH_7619_SOLUTION_ENGINE)	:= vmlinux
 
 # Set some sensible Kbuild defaults
-KBUILD_DEFCONFIG	:= shx3_defconfig
 KBUILD_IMAGE		:= $(defaultimage-y)
 
 #
@@ -90,15 +89,17 @@ KBUILD_IMAGE		:= $(defaultimage-y)
 # error messages during linking.
 #
 ifdef CONFIG_SUPERH32
-UTS_MACHINE	:= sh
-BITS		:= 32
-LDFLAGS_vmlinux	+= -e _stext
+UTS_MACHINE		:= sh
+BITS			:= 32
+LDFLAGS_vmlinux		+= -e _stext
+KBUILD_DEFCONFIG	:= shx3_defconfig
 else
-UTS_MACHINE	:= sh64
-BITS		:= 64
-LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
-		   --defsym phys_stext_shmedia=phys_stext+1 \
-		   -e phys_stext_shmedia
+UTS_MACHINE		:= sh64
+BITS			:= 64
+LDFLAGS_vmlinux		+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
+			   --defsym phys_stext_shmedia=phys_stext+1 \
+			   -e phys_stext_shmedia
+KBUILD_DEFCONFIG	:= cayman_defconfig
 endif
 
 ifneq ($(SUBARCH),$(ARCH))
-- 
cgit v0.10.2


From 0e9b20b8a1cab6c6ab4f98f917a2d98783103969 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: perf record: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Git's library (util/parse-options.[ch]) constructs help texts and
error messages automatically, and has a number of other convenience
features as well.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index f225efa..f12a782 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -2,6 +2,8 @@
 
 #include "perf.h"
 #include "util/util.h"
+#include "util/parse-options.h"
+#include "util/exec_cmd.h"
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -11,7 +13,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
-#include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
 #include <stdio.h>
@@ -33,8 +34,8 @@
 
 
-#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
-#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
@@ -45,7 +46,7 @@ static int			nr_cpus				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			output;
-static char 			*output_name			= "output.perf";
+static const char		*output_name			= "output.perf";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
@@ -62,192 +63,6 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-record [<options>] <cmd>\n"
-	"perf-record Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -c CNT    --count=CNT          # event period to sample\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -o file   --output=<file>      # output file\n"
-	" -p pid    --pid=<pid>		 # record events on existing pid\n"
-	" -r prio   --realtime=<prio>    # use RT prio\n"
-	" -s        --system             # system wide profiling\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char * const argv[])
-{
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"output",	required_argument,	NULL, 'o'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"system",	no_argument,		NULL, 's'},
-			{"inherit",	no_argument,		NULL, 'i'},
-			{"nmi",		no_argument,		NULL, 'n'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'e': error				= parse_events(optarg); break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'o': output_name			= strdup(optarg); break;
-		case 'p': target_pid			=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': system_wide                   ^=             1; break;
-		case 'i': inherit			^=	       1; break;
-		case 'n': nmi				^=	       1; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (argc - optind == 0 && target_pid == -1)
-		error = 1;
-
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
 struct mmap_data {
 	int counter;
 	void *base;
@@ -538,16 +353,13 @@ static void open_counters(int cpu, pid_t pid)
 	nr_cpu++;
 }
 
-int cmd_record(int argc, char * const argv[])
+static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
 	pid_t pid;
 	int ret;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
@@ -558,9 +370,6 @@ int cmd_record(int argc, char * const argv[])
 		exit(-1);
 	}
 
-	argc -= optind;
-	argv += optind;
-
 	if (!system_wide) {
 		open_counters(-1, target_pid != -1 ? target_pid : 0);
 	} else for (i = 0; i < nr_cpus; i++)
@@ -575,7 +384,7 @@ int cmd_record(int argc, char * const argv[])
 			perror("failed to fork");
 
 		if (!pid) {
-			if (execvp(argv[0], argv)) {
+			if (execvp(argv[0], (char **)argv)) {
 				perror(argv[0]);
 				exit(-1);
 			}
@@ -610,3 +419,170 @@ int cmd_record(int argc, char * const argv[])
 
 	return 0;
 }
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(const char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(const struct option *opt, const char *str, int unset)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+static char events_help[100000];
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+
+
+static void create_events_help(void)
+{
+	unsigned int i;
+	char *str;
+	__u64 e;
+
+	str = events_help;
+
+	str += sprintf(str,
+	"event name: [");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		if (i)
+			str += sprintf(str, "|");
+
+		str += sprintf(str, "%s",
+				event_symbols[i].symbol);
+	}
+
+	str += sprintf(str, "|rNNN]");
+}
+
+static const char * const record_usage[] = {
+	"perf record [<options>] <command>",
+	NULL
+};
+
+const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "record events on existing pid"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_END()
+};
+
+int cmd_record(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	create_events_help();
+
+	argc = parse_options(argc, argv, options, record_usage, 0);
+	if (!argc)
+		usage_with_options(record_usage, options);
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+
+	return __cmd_record(argc, argv);
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 4bed265..626b320 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,13 +42,16 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+
 #include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
+
 #include <stdio.h>
+
 #include <errno.h>
 #include <time.h>
 #include <sched.h>
-- 
cgit v0.10.2


From 8ad8db3788fd9a449941fb2392ca85af4ee1cde1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 11:10:09 +0200
Subject: perf_counter tools: Librarize event string parsing

Extract the event string parser from builtin-record.c, and
librarize it - to be reused in other commands.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 481e4c2..45daa72 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -290,6 +290,7 @@ LIB_H += ../../include/linux/perf_counter.h
 LIB_H += perf.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
 LIB_H += util/quote.h
 LIB_H += util/util.h
 LIB_H += util/help.h
@@ -304,6 +305,7 @@ LIB_OBJS += util/exec_cmd.o
 LIB_OBJS += util/help.o
 LIB_OBJS += util/levenshtein.o
 LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/parse-events.o
 LIB_OBJS += util/path.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index f12a782..6fa6ed6 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -3,44 +3,17 @@
 #include "perf.h"
 #include "util/util.h"
 #include "util/parse-options.h"
+#include "util/parse-events.h"
 #include "util/exec_cmd.h"
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <time.h>
 #include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-
 
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= { };
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
+
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				=  0;
 static unsigned int		page_size;
@@ -420,131 +393,16 @@ static int __cmd_record(int argc, const char **argv)
 	return 0;
 }
 
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(const char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(const struct option *opt, const char *str, int unset)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-static char events_help[100000];
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-
-
-static void create_events_help(void)
-{
-	unsigned int i;
-	char *str;
-	__u64 e;
-
-	str = events_help;
-
-	str += sprintf(str,
-	"event name: [");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		if (i)
-			str += sprintf(str, "|");
-
-		str += sprintf(str, "%s",
-				event_symbols[i].symbol);
-	}
-
-	str += sprintf(str, "|rNNN]");
-}
-
 static const char * const record_usage[] = {
 	"perf record [<options>] <command>",
 	NULL
 };
 
+static char events_help_msg[EVENTS_HELP_MAX];
+
 const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help, parse_events),
+		     events_help_msg, parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
@@ -566,7 +424,7 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 {
 	int counter;
 
-	create_events_help();
+	create_events_help(events_help_msg);
 
 	argc = parse_options(argc, argv, options, record_usage, 0);
 	if (!argc)
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
new file mode 100644
index 0000000..77d0917
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -0,0 +1,127 @@
+
+#include "../perf.h"
+#include "util.h"
+#include "parse-options.h"
+#include "parse-events.h"
+#include "exec_cmd.h"
+
+int nr_counters;
+
+__u64			event_id[MAX_COUNTERS]		= { };
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(const char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+int parse_events(const struct option *opt, const char *str, int unset)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+/*
+ * Create the help text for the event symbols:
+ */
+void create_events_help(char *events_help_msg)
+{
+	unsigned int i;
+	char *str;
+	__u64 e;
+
+	str = events_help_msg;
+
+	str += sprintf(str,
+	"event name: [");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		if (i)
+			str += sprintf(str, "|");
+
+		str += sprintf(str, "%s",
+				event_symbols[i].symbol);
+	}
+
+	str += sprintf(str, "|rNNN]");
+}
+
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
new file mode 100644
index 0000000..6e2ebe5
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -0,0 +1,10 @@
+
+extern int nr_counters;
+extern __u64			event_id[MAX_COUNTERS];
+
+extern int parse_events(const struct option *opt, const char *str, int unset);
+
+#define EVENTS_HELP_MAX (128*1024)
+
+extern void create_events_help(char *help_msg);
+
-- 
cgit v0.10.2


From 5242519b0296d128425368fc6ab17f541d5fa775 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: perf stat: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Extend the event parser library with the extensions that were in
perf-stat before.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 6fa6ed6..ec2b787 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -4,7 +4,6 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
-#include "util/exec_cmd.h"
 
 #include <sched.h>
 
@@ -400,7 +399,7 @@ static const char * const record_usage[] = {
 
 static char events_help_msg[EVENTS_HELP_MAX];
 
-const struct option options[] = {
+static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index c1053d8..e7cb941 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -1,35 +1,5 @@
 /*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
- */
-
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
+ * perf stat:  /usr/bin/time -alike performance counter statistics utility
 
           It summarizes the counter events of all tasks (and child tasks),
           covering all CPUs that the command (or workload) executes on.
@@ -38,59 +8,38 @@
 
    Sample output:
 
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+   $ perf stat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
 
    Performance counter stats for 'ls':
 
            163516953 instructions
                 2295 cache-misses
              2855182 branch-misses
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
  */
 
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
-
 #include "perf.h"
 #include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
 
-#include <getopt.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
 #include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#define EVENT_MASK_KERNEL		1
-#define EVENT_MASK_USER			2
 
 static int			system_wide			=  0;
+static int			inherit				=  1;
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
+static __u64			default_event_id[MAX_COUNTERS]	= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
@@ -101,20 +50,15 @@ static __u64			event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
 	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
 };
+
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int			event_mask[MAX_COUNTERS];
 
-static int			tid				= -1;
-static int			profile_cpu			= -1;
+static int			target_pid			= -1;
 static int			nr_cpus				=  0;
-static int			nmi				=  1;
-static int			group				=  0;
 static unsigned int		page_size;
 
-static int			zero;
-
 static int			scale				=  1;
 
 static const unsigned int default_count[] = {
@@ -126,197 +70,6 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-	char mask_str[4];
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
-		case 3:
-			if (strchr(mask_str, 'k'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
-			if (strchr(mask_str, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
-		case 2:
-			return EID(type, id);
-
-		default:
-			break;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -324,7 +77,7 @@ static void create_perfstat_counter(int counter)
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
+	hw_event.nmi		= 1;
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
@@ -343,7 +96,7 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		hw_event.inherit	= 1;
+		hw_event.inherit	= inherit;
 		hw_event.disabled	= 1;
 
 		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
@@ -355,7 +108,7 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
-int do_perfstat(int argc, char *argv[])
+int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
@@ -369,12 +122,6 @@ int do_perfstat(int argc, char *argv[])
 	for (counter = 0; counter < nr_counters; counter++)
 		create_perfstat_counter(counter);
 
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
 	/*
 	 * Enable counters and exec the command:
 	 */
@@ -384,7 +131,7 @@ int do_perfstat(int argc, char *argv[])
 	if ((pid = fork()) < 0)
 		perror("failed to fork");
 	if (!pid) {
-		if (execvp(argv[0], argv)) {
+		if (execvp(argv[0], (char **)argv)) {
 			perror(argv[0]);
 			exit(-1);
 		}
@@ -458,70 +205,45 @@ int do_perfstat(int argc, char *argv[])
 	return 0;
 }
 
-static void process_options(int argc, char **argv)
+static void skip_signal(int signo)
 {
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'z': zero				=              1; break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
+}
+
+static const char * const stat_usage[] = {
+	"perf stat [<options>] <command>",
+	NULL
+};
+
+static char events_help_msg[EVENTS_HELP_MAX];
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help_msg, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "stat events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('l', "scale", &scale,
+			    "scale/normalize counters"),
+	OPT_END()
+};
+
+int cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	create_events_help(events_help_msg);
+	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	argc = parse_options(argc, argv, options, stat_usage, 0);
+	if (!argc)
+		usage_with_options(stat_usage, options);
 
 	if (!nr_counters) {
 		nr_counters = 8;
@@ -533,18 +255,6 @@ static void process_options(int argc, char **argv)
 
 		event_count[counter] = default_interval;
 	}
-}
-
-static void skip_signal(int signo)
-{
-}
-
-int cmd_stat(int argc, char **argv, const char *prefix)
-{
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 77d0917..88c903e 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -8,6 +8,7 @@
 int nr_counters;
 
 __u64			event_id[MAX_COUNTERS]		= { };
+int			event_mask[MAX_COUNTERS];
 
 struct event_symbol {
 	__u64 event;
@@ -37,6 +38,64 @@ static struct event_symbol event_symbols[] = {
 	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
 };
 
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
@@ -46,12 +105,23 @@ static __u64 match_event_symbols(const char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
+	char mask_str[4];
 
 	if (sscanf(str, "r%llx", &config) == 1)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
+	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
+		case 3:
+			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'u'))
+				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+		case 2:
+			return EID(type, id);
+
+		default:
+			break;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -86,14 +156,6 @@ again:
 	return 0;
 }
 
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
 /*
  * Create the help text for the event symbols:
  */
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 6e2ebe5..0da306b 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -1,6 +1,16 @@
 
+/*
+ * Parse symbolic events/counts passed in as options:
+ */
+
 extern int nr_counters;
 extern __u64			event_id[MAX_COUNTERS];
+extern int			event_mask[MAX_COUNTERS];
+
+#define EVENT_MASK_KERNEL	1
+#define EVENT_MASK_USER		2
+
+extern char *event_name(int ctr);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
 
-- 
cgit v0.10.2


From b456bae0ff4f3cf91639dd32b2bfc49b1c30b4b0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: perf top: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 626b320..87b925c 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -45,8 +45,10 @@
 
 #include "perf.h"
 #include "util/util.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
 
-#include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
 
@@ -70,8 +72,7 @@
 
 static int			system_wide			=  0;
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
+static __u64			default_event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
@@ -88,7 +89,7 @@ static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static __u64			count_filter		       = 100;
 
-static int			tid				= -1;
+static int			target_pid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
@@ -100,8 +101,6 @@ static int			use_mmap			= 0;
 static int			use_munmap			= 0;
 static int			freq				= 0;
 
-static char			*vmlinux;
-
 static char			*sym_filter;
 static unsigned long		filter_start;
 static unsigned long		filter_end;
@@ -110,18 +109,6 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static int			scale;
-
-struct source_line {
-	uint64_t		EIP;
-	unsigned long		count;
-	char			*line;
-	struct source_line	*next;
-};
-
-static struct source_line	*lines;
-static struct source_line	**lines_tail;
-
 static const unsigned int default_count[] = {
 	1000000,
 	1000000,
@@ -131,194 +118,6 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: kerneltop [<options>]\n"
-	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
-	"KernelTop Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -c CNT    --count=CNT        # event period to sample\n\n"
-	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
-	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
-	" -l                           # show scale factor for RR events\n"
-	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
-	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
-	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
-	" -z        --zero             # zero counts after display\n"
-	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -M        --mmap_info        # print mmap info stream\n"
-	" -U        --munmap_info      # print munmap info stream\n"
-	);
-
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
 /*
  * Symbols
  */
@@ -331,7 +130,6 @@ struct sym_entry {
 	char			*sym;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
-	struct source_line	*source;
 };
 
 #define MAX_SYMS		100000
@@ -342,8 +140,6 @@ struct sym_entry		*sym_filter_entry;
 
 static struct sym_entry		sym_table[MAX_SYMS];
 
-static void show_details(struct sym_entry *sym);
-
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
  */
@@ -419,15 +215,15 @@ static void print_sym_table(void)
 
 	printf( "], ");
 
-	if (tid != -1)
-		printf(" (tid: %d", tid);
+	if (target_pid != -1)
+		printf(" (target_pid: %d", target_pid);
 	else
 		printf(" (all");
 
 	if (profile_cpu != -1)
 		printf(", cpu: %d)\n", profile_cpu);
 	else {
-		if (tid != -1)
+		if (target_pid != -1)
 			printf(")\n");
 		else
 			printf(", %d CPUs)\n", nr_cpus);
@@ -463,9 +259,6 @@ static void print_sym_table(void)
 				pcnt, tmp[i].addr, tmp[i].sym);
 	}
 
-	if (sym_filter_entry)
-		show_details(sym_filter_entry);
-
 	{
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 
@@ -628,134 +421,8 @@ static void parse_symbols(void)
 	}
 }
 
-/*
- * Source lines
- */
-
-static void parse_vmlinux(char *filename)
-{
-	FILE *file;
-	char command[PATH_MAX*2];
-	if (!filename)
-		return;
-
-	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	lines_tail = &lines;
-	while (!feof(file)) {
-		struct source_line *src;
-		size_t dummy = 0;
-		char *c;
-
-		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);
-		memset(src, 0, sizeof(struct source_line));
-
-		if (getline(&src->line, &dummy, file) < 0)
-			break;
-		if (!src->line)
-			break;
-
-		c = strchr(src->line, '\n');
-		if (c)
-			*c = 0;
-
-		src->next = NULL;
-		*lines_tail = src;
-		lines_tail = &src->next;
-
-		if (strlen(src->line)>8 && src->line[8] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-		if (strlen(src->line)>8 && src->line[16] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-	}
-	pclose(file);
-}
-
-static void record_precise_ip(uint64_t ip)
-{
-	struct source_line *line;
-
-	for (line = lines; line; line = line->next) {
-		if (line->EIP == ip)
-			line->count++;
-		if (line->EIP > ip)
-			break;
-	}
-}
-
-static void lookup_sym_in_vmlinux(struct sym_entry *sym)
-{
-	struct source_line *line;
-	char pattern[PATH_MAX];
-	sprintf(pattern, "<%s>:", sym->sym);
-
-	for (line = lines; line; line = line->next) {
-		if (strstr(line->line, pattern)) {
-			sym->source = line;
-			break;
-		}
-	}
-}
-
-static void show_lines(struct source_line *line_queue, int line_queue_count)
-{
-	int i;
-	struct source_line *line;
-
-	line = line_queue;
-	for (i = 0; i < line_queue_count; i++) {
-		printf("%8li\t%s\n", line->count, line->line);
-		line = line->next;
-	}
-}
-
 #define TRACE_COUNT     3
 
-static void show_details(struct sym_entry *sym)
-{
-	struct source_line *line;
-	struct source_line *line_queue = NULL;
-	int displayed = 0;
-	int line_queue_count = 0;
-
-	if (!sym->source)
-		lookup_sym_in_vmlinux(sym);
-	if (!sym->source)
-		return;
-
-	printf("Showing details for %s\n", sym->sym);
-
-	line = sym->source;
-	while (line) {
-		if (displayed && strstr(line->line, ">:"))
-			break;
-
-		if (!line_queue_count)
-			line_queue = line;
-		line_queue_count ++;
-
-		if (line->count >= count_filter) {
-			show_lines(line_queue, line_queue_count);
-			line_queue_count = 0;
-			line_queue = NULL;
-		} else if (line_queue_count > TRACE_COUNT) {
-			line_queue = line_queue->next;
-			line_queue_count --;
-		}
-
-		line->count = 0;
-		displayed++;
-		if (displayed > 300)
-			break;
-		line = line->next;
-	}
-}
-
 /*
  * Binary search in the histogram table and record the hit:
  */
@@ -764,8 +431,6 @@ static void record_ip(uint64_t ip, int counter)
 	int left_idx, middle_idx, right_idx, idx;
 	unsigned long left, middle, right;
 
-	record_precise_ip(ip);
-
 	left_idx = 0;
 	right_idx = sym_table_count-1;
 	assert(ip <= max_ip && ip >= min_ip);
@@ -822,97 +487,6 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
-static void process_options(int argc, char **argv)
-{
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{"freq",	required_argument,	NULL, 'F'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'M': use_mmap			=              1; break;
-		case 'U': use_munmap			=              1; break;
-		case 'F': freq = 1; default_interval	=   atoi(optarg); break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
 struct mmap_data {
 	int counter;
 	void *base;
@@ -973,11 +547,11 @@ static void mmap_read(struct mmap_data *md)
 		struct ip_event {
 			struct perf_event_header header;
 			__u64 ip;
-			__u32 pid, tid;
+			__u32 pid, target_pid;
 		};
 		struct mmap_event {
 			struct perf_event_header header;
-			__u32 pid, tid;
+			__u32 pid, target_pid;
 			__u64 start;
 			__u64 len;
 			__u64 pgoff;
@@ -1043,7 +617,7 @@ static void mmap_read(struct mmap_data *md)
 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
-int cmd_top(int argc, char **argv, const char *prefix)
+static int __cmd_top(void)
 {
 	struct perf_counter_hw_event hw_event;
 	pthread_t thread;
@@ -1051,27 +625,12 @@ int cmd_top(int argc, char **argv, const char *prefix)
 	unsigned int cpu;
 	int ret;
 
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (tid != -1 || profile_cpu != -1)
-		nr_cpus = 1;
-
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
 
 			cpu	= profile_cpu;
-			if (tid == -1 && profile_cpu == -1)
+			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
 			memset(&hw_event, 0, sizeof(hw_event));
@@ -1083,7 +642,7 @@ int cmd_top(int argc, char **argv, const char *prefix)
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
 
-			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			fd[i][counter] = sys_perf_counter_open(&hw_event, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
@@ -1147,3 +706,95 @@ int cmd_top(int argc, char **argv, const char *prefix)
 
 	return 0;
 }
+
+static const char * const top_usage[] = {
+	"perf top [<options>]",
+	NULL
+};
+
+static char events_help_msg[EVENTS_HELP_MAX];
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help_msg, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "profile events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_INTEGER('C', "CPU", &profile_cpu,
+		    "CPU to profile on"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_INTEGER('d', "delay", &realtime_prio,
+		    "number of seconds to delay between refreshes"),
+	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+			    "dump the symbol table used for profiling"),
+	OPT_INTEGER('f', "--count-filter", &count_filter,
+		    "only display functions with more events than this"),
+	OPT_BOOLEAN('g', "group", &group,
+			    "put the counters into a counter group"),
+	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+		    "only display symbols matchig this pattern"),
+	OPT_BOOLEAN('z', "zero", &group,
+		    "zero history across updates"),
+	OPT_BOOLEAN('M', "use-mmap", &use_mmap,
+		    "track mmap events"),
+	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
+		    "track munmap events"),
+	OPT_INTEGER('F', "--freq", &freq,
+		    "profile at this frequency"),
+	OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	create_events_help(events_help_msg);
+	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	argc = parse_options(argc, argv, options, top_usage, 0);
+	if (argc)
+		usage_with_options(top_usage, options);
+
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
+
+	/* CPU and PID are mutually exclusive */
+	if (target_pid != -1 && profile_cpu != -1) {
+		printf("WARNING: PID switch overriding CPU\n");
+		sleep(1);
+		profile_cpu = -1;
+	}
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (target_pid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	parse_symbols();
+
+	return __cmd_top();
+}
-- 
cgit v0.10.2


From 8fa66bdcc81dd4fc2c91228074d0a4698120c5a2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 12:45:42 -0300
Subject: perf_counter: First part of 'perf report' conversion to C + elfutils

Integrate perf-report into 'perf', as builtin-report.c.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 45daa72..49c601e 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -228,7 +228,6 @@ COMPAT_CFLAGS =
 COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
-PROGRAMS = perf-report
 SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
@@ -315,6 +314,7 @@ LIB_OBJS += util/wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
+BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
@@ -811,10 +811,6 @@ clean:
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
-# temporary hack:
-perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
-	g++ -g -O2 -Wall -lrt -o $@ $<
-
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
new file mode 100644
index 0000000..864f68f
--- /dev/null
+++ b/Documentation/perf_counter/builtin-report.c
@@ -0,0 +1,751 @@
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <getopt.h>
+#include <assert.h>
+#include <search.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+#include "list.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char 		const *input_name = "output.perf";
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static const char *perf_event_names[] = {
+	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
+	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
+	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
+};
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+typedef union event_union {
+	struct perf_event_header header;
+	struct ip_event ip;
+	struct mmap_event mmap;
+	struct comm_event comm;
+} event_t;
+
+struct section {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 offset;
+	char		 name[0];
+};
+
+static struct section *section__new(uint64_t start, uint64_t size,
+				    uint64_t offset, char *name)
+{
+	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start  = start;
+		self->end    = start + size;
+		self->offset = offset;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void section__delete(struct section *self)
+{
+	free(self);
+}
+
+struct symbol {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	char		 name[0];
+};
+
+static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
+{
+	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start = start;
+		self->end   = start + len;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void symbol__delete(struct symbol *self)
+{
+	free(self);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %s\n",
+		       self->start, self->end, self->name);
+}
+
+struct dso {
+	struct list_head node;
+	struct list_head sections;
+	struct list_head syms;
+	char		 name[0];
+};
+
+static struct dso *dso__new(const char *name)
+{
+	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		strcpy(self->name, name);
+		INIT_LIST_HEAD(&self->sections);
+		INIT_LIST_HEAD(&self->syms);
+	}
+
+	return self;
+}
+
+static void dso__delete_sections(struct dso *self)
+{
+	struct section *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &self->sections, node)
+		section__delete(pos);
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+	struct symbol *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &self->syms, node)
+		symbol__delete(pos);
+}
+
+static void dso__delete(struct dso *self)
+{
+	dso__delete_sections(self);
+	dso__delete_symbols(self);
+	free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+	list_add_tail(&sym->node, &self->syms);
+}
+
+static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+{
+	if (self == NULL)
+		return NULL;
+
+	struct symbol *pos;
+
+	list_for_each_entry(pos, &self->syms, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static int dso__load(struct dso *self)
+{
+	/* FIXME */
+	return 0;
+}
+
+static size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+	struct symbol *pos;
+	size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+	list_for_each_entry(pos, &self->syms, node)
+		ret += symbol__fprintf(pos, fp);
+
+	return ret;
+}
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+
+	if (dso == NULL) {
+		dso = dso__new(name);
+		if (dso != NULL && dso__load(dso) < 0)
+			goto out_delete_dso;
+
+		dsos__add(dso);
+	}
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static int load_kallsyms(void)
+{
+	kernel_dso = dso__new("[kernel]");
+	if (kernel_dso == NULL)
+		return -1;
+
+	FILE *file = fopen("/proc/kallsyms", "r");
+
+	if (file == NULL)
+		goto out_delete_dso;
+
+	char *line = NULL;
+	size_t n;
+
+	while (!feof(file)) {
+		unsigned long long start;
+		char c, symbf[4096];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+
+		if (!line)
+			goto out_delete_dso;
+
+		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
+			struct symbol *sym = symbol__new(start, 0x1000000, symbf);
+
+			if (sym == NULL)
+				goto out_delete_dso;
+
+			dso__insert_symbol(kernel_dso, sym);
+		}
+	}
+
+	dsos__add(kernel_dso);
+	free(line);
+	fclose(file);
+	return 0;
+
+out_delete_dso:
+	dso__delete(kernel_dso);
+	return -1;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	struct dso	 *dso;
+};
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(event->filename);
+		if (self->dso == NULL)
+			goto out_delete;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+struct symhist {
+	struct list_head node;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint32_t	 count;
+	char		 level;
+};
+
+static struct symhist *symhist__new(struct symbol *sym, struct dso *dso,
+				    char level)
+{
+	struct symhist *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->sym   = sym;
+		self->dso   = dso;
+		self->level = level;
+		self->count = 0;
+	}
+
+	return self;
+}
+
+static void symhist__delete(struct symhist *self)
+{
+	free(self);
+}
+
+static bool symhist__equal(struct symhist *self, struct symbol *sym,
+			   struct dso *dso, char level)
+{
+	return self->level == level && self->sym == sym && self->dso == dso;
+}
+
+static void symhist__inc(struct symhist *self)
+{
+	++self->count;
+}
+
+static size_t symhist__fprintf(struct symhist *self, FILE *fp)
+{
+	size_t ret = fprintf(fp, "[%c] ", self->level);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s", self->sym->name);
+	else
+		ret += fprintf(fp, "%s: %s",
+			       self->dso ? self->dso->name : "<unknown",
+			       self->sym ? self->sym->name : "<unknown>");
+	return ret + fprintf(fp, ": %u\n", self->count);
+}
+
+struct thread {
+	struct list_head node;
+	struct list_head maps;
+	struct list_head symhists;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = NULL;
+		INIT_LIST_HEAD(&self->maps);
+		INIT_LIST_HEAD(&self->symhists);
+	}
+
+	return self;
+}
+
+static void thread__insert_symhist(struct thread *self,
+				   struct symhist *symhist)
+{
+	list_add_tail(&symhist->node, &self->symhists);
+}
+
+static struct symhist *thread__symhists_find(struct thread *self,
+					     struct symbol *sym,
+					     struct dso *dso, char level)
+{
+	struct symhist *pos;
+
+	list_for_each_entry(pos, &self->symhists, node)
+		if (symhist__equal(pos, sym, dso, level))
+			return pos;
+
+	return NULL;
+}
+
+static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
+				 struct dso *dso, char level)
+{
+	struct symhist *symhist = thread__symhists_find(self, sym, dso, level);
+
+	if (symhist == NULL) {
+		symhist = symhist__new(sym, dso, level);
+		if (symhist == NULL)
+			goto out_error;
+		thread__insert_symhist(self, symhist);
+	}
+
+	symhist__inc(symhist);
+	return 0;
+out_error:
+	return -ENOMEM;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = 0;
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct symhist *pos;
+	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->symhists, node)
+		ret += symhist__fprintf(pos, fp);
+
+	return ret;
+}
+
+static LIST_HEAD(threads);
+
+static void threads__add(struct thread *thread)
+{
+	list_add_tail(&thread->node, &threads);
+}
+
+static struct thread *threads__find(pid_t pid)
+{
+	struct thread *pos;
+
+	list_for_each_entry(pos, &threads, node)
+		if (pos->pid == pid)
+			return pos;
+	return NULL;
+}
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct thread *thread = threads__find(pid);
+
+	if (thread == NULL) {
+		thread = thread__new(pid);
+		if (thread != NULL)
+			threads__add(thread);
+	}
+
+	return thread;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	list_add_tail(&map->node, &self->maps);
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	if (self == NULL)
+		return NULL;
+
+	struct map *pos;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static void threads__fprintf(FILE *fp)
+{
+	struct thread *pos;
+
+	list_for_each_entry(pos, &threads, node)
+		thread__fprintf(pos, fp);
+}
+
+#if 0
+static std::string resolve_user_symbol(int pid, uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	maps_t &m = maps[pid];
+	maps_t::const_iterator mi = m.upper_bound(map(ip));
+	if (mi == m.end())
+		return sym;
+
+	ip -= mi->start + mi->pgoff;
+
+	symbols_t &s = dsos[mi->dso].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	sym = mi->dso + ": <unknown>";
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = mi->dso + ": " + si->name;
+#if 0
+	else if (si->start <= ip)
+		sym = mi->dso + ": ?" + si->name;
+#endif
+
+	return sym;
+}
+#endif
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-report [<options>]\n"
+	" -i file   --input=<file>      # input file\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"input",	required_argument,	NULL, 'i'},
+			{"no-user",	no_argument,		NULL, 'u'},
+			{"no-kernel",	no_argument,		NULL, 'k'},
+			{"no-hv",	no_argument,		NULL, 'h'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:i:kuh",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'i': input_name			= strdup(optarg); break;
+		case 'k': show_mask &= ~SHOW_KERNEL; break;
+		case 'u': show_mask &= ~SHOW_USER; break;
+		case 'h': show_mask &= ~SHOW_HV; break;
+		default: error = 1; break;
+		}
+	}
+
+	if (error)
+		display_help();
+}
+
+int cmd_report(int argc, char **argv)
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret, rc = EXIT_FAILURE;
+	unsigned long total = 0;
+
+	page_size = getpagesize();
+
+	process_options(argc, argv);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kallsyms() < 0) {
+		perror("failed to open kallsyms");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+
+	if (!event->header.size) {
+		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
+		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
+		goto done;
+	}
+
+	head += event->header.size;
+
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+		char level;
+		int show = 0;
+		struct dso *dso = NULL;
+		struct thread *thread = threads__findnew(event->ip.pid);
+
+		if (thread == NULL)
+			goto done;
+
+		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			show = SHOW_KERNEL;
+			level = 'k';
+			dso = kernel_dso;
+		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			show = SHOW_USER;
+			level = '.';
+			struct map *map = thread__find_map(thread, event->ip.ip);
+			if (map != NULL)
+				dso = map->dso;
+		} else {
+			show = SHOW_HV;
+			level = 'H';
+		}
+
+		if (show & show_mask) {
+			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
+
+			if (thread__symbol_incnew(thread, sym, dso, level))
+				goto done;
+		}
+		total++;
+	} else switch (event->header.type) {
+	case PERF_EVENT_MMAP: {
+		struct thread *thread = threads__findnew(event->mmap.pid);
+		struct map *map = map__new(&event->mmap);
+
+		if (thread == NULL || map == NULL )
+			goto done;
+		thread__insert_map(thread, map);
+		break;
+	}
+	case PERF_EVENT_COMM: {
+		struct thread *thread = threads__findnew(event->comm.pid);
+
+		if (thread == NULL ||
+		    thread__set_comm(thread, event->comm.comm))
+			goto done;
+		break;
+	}
+	}
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+done:
+	close(input);
+	//dsos__fprintf(stdout);
+	threads__fprintf(stdout);
+#if 0
+	std::map<std::string, int>::iterator hi = hist.begin();
+
+	while (hi != hist.end()) {
+		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
+		hist.erase(hi++);
+	}
+
+	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+
+	while (ri != rev_hist.end()) {
+		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
+		ri++;
+	}
+#endif
+	return rc;
+}
+
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index d32318a..5bfea57 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -16,6 +16,7 @@ extern int check_pager_config(const char *cmd);
 
 extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index d15210a..4390292 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,6 +1,7 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
 perf-record                             mainporcelain common
+perf-report                             mainporcelain common
 perf-stat                               mainporcelain common
 perf-top                                mainporcelain common
 
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 1d6d7aa..e8a8584 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -250,6 +250,7 @@ static void handle_internal_command(int argc, const char **argv)
 	static struct cmd_struct commands[] = {
 		{ "help", cmd_help, 0 },
 		{ "record", cmd_record, 0 },
+		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
 		{ "version", cmd_version, 0 },
-- 
cgit v0.10.2


From fd4242bb35b70557eee8d0c79f82dacc3f3b89e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 May 2009 12:45:34 +0200
Subject: perf_counter tools: remove the standalone perf-report utility

With a built-in 'perf report' command now available, remove the
standalone implementation for good.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
deleted file mode 100644
index 8855107..0000000
--- a/Documentation/perf_counter/perf-report.cc
+++ /dev/null
@@ -1,515 +0,0 @@
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <getopt.h>
-#include <assert.h>
-
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
-
-#include <set>
-#include <map>
-#include <string>
-
-
-#define SHOW_KERNEL	1
-#define SHOW_USER	2
-#define SHOW_HV		4
-
-static char 		const *input_name = "output.perf";
-static int		input;
-static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
-
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
-
-struct ip_event {
-	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
-};
-struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
-};
-struct comm_event {
-	struct perf_event_header header;
-	__u32 pid,tid;
-	char comm[16];
-};
-
-typedef union event_union {
-	struct perf_event_header header;
-	struct ip_event ip;
-	struct mmap_event mmap;
-	struct comm_event comm;
-} event_t;
-
-struct section {
-	uint64_t start;
-	uint64_t end;
-
-	uint64_t offset;
-
-	std::string name;
-
-	section() { };
-
-	section(uint64_t stab) : end(stab) { };
-
-	section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
-		start(start), end(start + size), offset(offset), name(name)
-	{ };
-
-	bool operator < (const struct section &s) const {
-		return end < s.end;
-	};
-};
-
-typedef std::set<struct section> sections_t;
-
-struct symbol {
-	uint64_t start;
-	uint64_t end;
-
-	std::string name;
-
-	symbol() { };
-
-	symbol(uint64_t ip) : start(ip) { }
-
-	symbol(uint64_t start, uint64_t len, std::string name) :
-		start(start), end(start + len), name(name)
-	{ };
-
-	bool operator < (const struct symbol &s) const {
-		return start < s.start;
-	};
-};
-
-typedef std::set<struct symbol> symbols_t;
-
-struct dso {
-	sections_t sections;
-	symbols_t syms;
-};
-
-static std::map<std::string, struct dso> dsos;
-
-static void load_dso_sections(std::string dso_name)
-{
-	struct dso &dso = dsos[dso_name];
-
-	std::string cmd = "readelf -DSW " + dso_name;
-
-	FILE *file = popen(cmd.c_str(), "r");
-	if (!file) {
-		perror("failed to open pipe");
-		exit(-1);
-	}
-
-	char *line = NULL;
-	size_t n = 0;
-
-	while (!feof(file)) {
-		uint64_t addr, off, size;
-		char name[32];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-		if (sscanf(line, "  [%*2d] %16s %*14s %Lx %Lx %Lx",
-					name, &addr, &off, &size) == 4) {
-
-			dso.sections.insert(section(addr, size, addr - off, name));
-		}
-#if 0
-		/*
-		 * for reading readelf symbols (-s), however these don't seem
-		 * to include nearly everything, so use nm for that.
-		 */
-		if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
-			   &start, &size, &section, sym) == 4) {
-
-			start -= dso.section_offsets[section];
-
-			dso.syms.insert(symbol(start, size, std::string(sym)));
-		}
-#endif
-	}
-	pclose(file);
-}
-
-static void load_dso_symbols(std::string dso_name, std::string args)
-{
-	struct dso &dso = dsos[dso_name];
-
-	std::string cmd = "nm -nSC " + args + " " + dso_name;
-
-	FILE *file = popen(cmd.c_str(), "r");
-	if (!file) {
-		perror("failed to open pipe");
-		exit(-1);
-	}
-
-	char *line = NULL;
-	size_t n = 0;
-
-	while (!feof(file)) {
-		uint64_t start, size;
-		char c;
-		char sym[1024];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-
-		if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
-			sections_t::const_iterator si =
-				dso.sections.upper_bound(section(start));
-			if (si == dso.sections.end()) {
-				printf("symbol in unknown section: %s\n", sym);
-				continue;
-			}
-
-			start -= si->offset;
-
-			dso.syms.insert(symbol(start, size, sym));
-		}
-	}
-	pclose(file);
-}
-
-static void load_dso(std::string dso_name)
-{
-	load_dso_sections(dso_name);
-	load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
-	load_dso_symbols(dso_name, "");   /* regular ones */
-}
-
-void load_kallsyms(void)
-{
-	struct dso &dso = dsos["[kernel]"];
-
-	FILE *file = fopen("/proc/kallsyms", "r");
-	if (!file) {
-		perror("failed to open kallsyms");
-		exit(-1);
-	}
-
-	char *line;
-	size_t n;
-
-	while (!feof(file)) {
-		uint64_t start;
-		char c;
-		char sym[1024000];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-		if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
-			dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
-	}
-	fclose(file);
-}
-
-struct map {
-	uint64_t start;
-	uint64_t end;
-	uint64_t pgoff;
-
-	std::string dso;
-
-	map() { };
-
-	map(uint64_t ip) : end(ip) { }
-
-	map(mmap_event *mmap) {
-		start = mmap->start;
-		end = mmap->start + mmap->len;
-		pgoff = mmap->pgoff;
-
-		dso = std::string(mmap->filename);
-
-		if (dsos.find(dso) == dsos.end())
-			load_dso(dso);
-	};
-
-	bool operator < (const struct map &m) const {
-		return end < m.end;
-	};
-};
-
-typedef std::set<struct map> maps_t;
-
-static std::map<int, maps_t> maps;
-
-static std::map<int, std::string> comms;
-
-static std::map<std::string, int> hist;
-static std::multimap<int, std::string> rev_hist;
-
-static std::string resolve_comm(int pid)
-{
-	std::string comm;
-
-	std::map<int, std::string>::const_iterator ci = comms.find(pid);
-	if (ci != comms.end()) {
-		comm = ci->second;
-	} else {
-		char pid_str[30];
-
-		sprintf(pid_str, ":%d", pid);
-		comm = pid_str;
-	}
-
-	return comm;
-}
-
-static std::string resolve_user_symbol(int pid, uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	maps_t &m = maps[pid];
-	maps_t::const_iterator mi = m.upper_bound(map(ip));
-	if (mi == m.end())
-		return sym;
-
-	ip -= mi->start + mi->pgoff;
-
-	symbols_t &s = dsos[mi->dso].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	sym = mi->dso + ": <unknown>";
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = mi->dso + ": " + si->name;
-#if 0
-	else if (si->start <= ip)
-		sym = mi->dso + ": ?" + si->name;
-#endif
-
-	return sym;
-}
-
-static std::string resolve_kernel_symbol(uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	symbols_t &s = dsos["[kernel]"].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = si->name;
-
-	return sym;
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-report [<options>]\n"
-	" -i file   --input=<file>      # input file\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"input",	required_argument,	NULL, 'i'},
-			{"no-user",	no_argument,		NULL, 'u'},
-			{"no-kernel",	no_argument,		NULL, 'k'},
-			{"no-hv",	no_argument,		NULL, 'h'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:i:kuh",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'i': input_name			= strdup(optarg); break;
-		case 'k': show_mask &= ~SHOW_KERNEL; break;
-		case 'u': show_mask &= ~SHOW_USER; break;
-		case 'h': show_mask &= ~SHOW_HV; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (error)
-		display_help();
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	char *buf;
-	event_t *event;
-	int ret;
-	unsigned long total = 0;
-
-	page_size = getpagesize();
-
-	process_options(argc, argv);
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	load_kallsyms();
-
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-
-	if (!event->header.size) {
-		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
-		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
-		goto done;
-	}
-
-	head += event->header.size;
-
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-		std::string comm, sym, level;
-		int show = 0;
-		char output[1024];
-
-		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			show |= SHOW_KERNEL;
-			level = " [k] ";
-			sym = resolve_kernel_symbol(event->ip.ip);
-		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			show |= SHOW_USER;
-			level = " [.] ";
-			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
-		} else {
-			show |= SHOW_HV;
-			level = " [H] ";
-		}
-
-		if (show & show_mask) {
-			comm = resolve_comm(event->ip.pid);
-			snprintf(output, sizeof(output), "%16s %s %s",
-					comm.c_str(), level.c_str(), sym.c_str());
-			hist[output]++;
-		}
-
-		total++;
-
-	} else switch (event->header.type) {
-	case PERF_EVENT_MMAP:
-		maps[event->mmap.pid].insert(map(&event->mmap));
-		break;
-
-	case PERF_EVENT_COMM:
-		comms[event->comm.pid] = std::string(event->comm.comm);
-		break;
-	}
-
-	if (offset + head < stat.st_size)
-		goto more;
-
-done:
-
-	close(input);
-
-	std::map<std::string, int>::iterator hi = hist.begin();
-
-	while (hi != hist.end()) {
-		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
-		hist.erase(hi++);
-	}
-
-	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
-
-	while (ri != rev_hist.end()) {
-		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
-		ri++;
-	}
-
-	return 0;
-}
-
-- 
cgit v0.10.2


From 62eb93905b3b43cea407cfbc061cc7b40ae1c6e9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 14:28:47 -0300
Subject: perf_counter: Implement dso__load using libelf

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 49c601e..6bffa86 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -160,7 +160,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -g -O2 -Wall
-LDFLAGS = -lpthread -lrt
+LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 864f68f..ad2f327 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -8,6 +8,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
+#include <gelf.h>
+#include <elf.h>
+#include <libelf.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
@@ -195,10 +198,123 @@ static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 	return NULL;
 }
 
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+	for (index = 0, gelf_getsym(syms, index, &sym);\
+	     index < nr_syms; \
+	     index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+	return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline bool elf_sym__is_function(const GElf_Sym *sym)
+{
+	return elf_sym__type(sym) == STT_FUNC &&
+	       sym->st_name != 0 &&
+	       sym->st_shndx != SHN_UNDEF;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+					const Elf_Data *symstrs)
+{
+	return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+				    GElf_Shdr *shp, const char *name,
+				    size_t *index)
+{
+	Elf_Scn *sec = NULL;
+	size_t cnt = 1;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *str;
+
+		gelf_getshdr(sec, shp);
+		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+		if (!strcmp(name, str)) {
+			if (index)
+				*index = cnt;
+			break;
+		}
+		++cnt;
+	}
+
+	return sec;
+}
+
 static int dso__load(struct dso *self)
 {
-	/* FIXME */
-	return 0;
+	int fd = open(self->name, O_RDONLY), err = -1;
+
+	if (fd == -1)
+		return -1;
+
+	Elf *elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		fprintf(stderr, "%s: cannot read %s ELF file.\n",
+			__func__, self->name);
+		goto out_close;
+	}
+
+	GElf_Ehdr ehdr;
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	GElf_Shdr shdr;
+	Elf_Scn *sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL)
+		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+
+	if (sec == NULL)
+		goto out_elf_end;
+
+	if (gelf_getshdr(sec, &shdr) == NULL)
+		goto out_elf_end;
+
+	Elf_Data *syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	Elf_Data *symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	const uint32_t nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	GElf_Sym sym;
+	uint32_t index;
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		if (!elf_sym__is_function(&sym))
+			continue;
+		struct symbol *f = symbol__new(sym.st_value, sym.st_size,
+					       elf_sym__name(&sym, symstrs));
+		if (f == NULL)
+			goto out_elf_end;
+
+		dso__insert_symbol(self, f);
+	}
+
+	err = 0;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	close(fd);
+	return err;
 }
 
 static size_t dso__fprintf(struct dso *self, FILE *fp)
@@ -614,6 +730,8 @@ int cmd_report(int argc, char **argv)
 	int ret, rc = EXIT_FAILURE;
 	unsigned long total = 0;
 
+	elf_version(EV_CURRENT);
+
 	page_size = getpagesize();
 
 	process_options(argc, argv);
-- 
cgit v0.10.2


From 35a50c8a20eea22c141e05c5667ac21c48b8b65d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 16:24:49 -0300
Subject: perf_counter: Use rb_trees in perf report

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 6bffa86..412dea1 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -287,6 +287,8 @@ LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
 LIB_H += perf.h
+LIB_H += util/list.h
+LIB_H += util/rbtree.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
 LIB_H += util/parse-events.h
@@ -306,6 +308,7 @@ LIB_OBJS += util/levenshtein.o
 LIB_OBJS += util/parse-options.o
 LIB_OBJS += util/parse-events.o
 LIB_OBJS += util/path.o
+LIB_OBJS += util/rbtree.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ad2f327..f63057f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -32,7 +32,8 @@
 #include <linux/types.h>
 
 #include "../../include/linux/perf_counter.h"
-#include "list.h"
+#include "util/list.h"
+#include "util/rbtree.h"
 
 #define SHOW_KERNEL	1
 #define SHOW_USER	2
@@ -106,10 +107,10 @@ static void section__delete(struct section *self)
 }
 
 struct symbol {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	char		 name[0];
+	struct rb_node rb_node;
+	uint64_t       start;
+	uint64_t       end;
+	char	       name[0];
 };
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
@@ -139,7 +140,7 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 struct dso {
 	struct list_head node;
 	struct list_head sections;
-	struct list_head syms;
+	struct rb_root	 syms;
 	char		 name[0];
 };
 
@@ -150,7 +151,7 @@ static struct dso *dso__new(const char *name)
 	if (self != NULL) {
 		strcpy(self->name, name);
 		INIT_LIST_HEAD(&self->sections);
-		INIT_LIST_HEAD(&self->syms);
+		self->syms = RB_ROOT;
 	}
 
 	return self;
@@ -166,10 +167,14 @@ static void dso__delete_sections(struct dso *self)
 
 static void dso__delete_symbols(struct dso *self)
 {
-	struct symbol *pos, *n;
+	struct symbol *pos;
+	struct rb_node *next = rb_first(&self->syms);
 
-	list_for_each_entry_safe(pos, n, &self->syms, node)
+	while (next) {
+		pos = rb_entry(next, struct symbol, rb_node);
+		next = rb_next(&pos->rb_node);
 		symbol__delete(pos);
+	}
 }
 
 static void dso__delete(struct dso *self)
@@ -181,7 +186,21 @@ static void dso__delete(struct dso *self)
 
 static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 {
-	list_add_tail(&sym->node, &self->syms);
+	struct rb_node **p = &self->syms.rb_node;
+	struct rb_node *parent = NULL;
+	const uint64_t ip = sym->start;
+	struct symbol *s;
+
+	while (*p != NULL) {
+		parent = *p;
+		s = rb_entry(parent, struct symbol, rb_node);
+		if (ip < s->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&sym->rb_node, parent, p);
+	rb_insert_color(&sym->rb_node, &self->syms);
 }
 
 static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
@@ -189,11 +208,18 @@ static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 	if (self == NULL)
 		return NULL;
 
-	struct symbol *pos;
+	struct rb_node *n = self->syms.rb_node;
 
-	list_for_each_entry(pos, &self->syms, node)
-		if (ip >= pos->start && ip <= pos->end)
-			return pos;
+	while (n) {
+		struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+		if (ip < s->start)
+			n = n->rb_left;
+		else if (ip > s->end)
+			n = n->rb_right;
+		else
+			return s;
+	}
 
 	return NULL;
 }
@@ -319,11 +345,13 @@ out_close:
 
 static size_t dso__fprintf(struct dso *self, FILE *fp)
 {
-	struct symbol *pos;
 	size_t ret = fprintf(fp, "dso: %s\n", self->name);
 
-	list_for_each_entry(pos, &self->syms, node)
+	struct rb_node *nd;
+	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
 		ret += symbol__fprintf(pos, fp);
+	}
 
 	return ret;
 }
diff --git a/Documentation/perf_counter/util/rbtree.c b/Documentation/perf_counter/util/rbtree.c
new file mode 100644
index 0000000..b15ba9c
--- /dev/null
+++ b/Documentation/perf_counter/util/rbtree.c
@@ -0,0 +1,383 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include "rbtree.h"
+
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *right = node->rb_right;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_right = right->rb_left))
+		rb_set_parent(right->rb_left, node);
+	right->rb_left = node;
+
+	rb_set_parent(right, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_left)
+			parent->rb_left = right;
+		else
+			parent->rb_right = right;
+	}
+	else
+		root->rb_node = right;
+	rb_set_parent(node, right);
+}
+
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *left = node->rb_left;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_left = left->rb_right))
+		rb_set_parent(left->rb_right, node);
+	left->rb_right = node;
+
+	rb_set_parent(left, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_right)
+			parent->rb_right = left;
+		else
+			parent->rb_left = left;
+	}
+	else
+		root->rb_node = left;
+	rb_set_parent(node, left);
+}
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *parent, *gparent;
+
+	while ((parent = rb_parent(node)) && rb_is_red(parent))
+	{
+		gparent = rb_parent(parent);
+
+		if (parent == gparent->rb_left)
+		{
+			{
+				register struct rb_node *uncle = gparent->rb_right;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_right == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_left(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_right(gparent, root);
+		} else {
+			{
+				register struct rb_node *uncle = gparent->rb_left;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_left == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_right(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_left(gparent, root);
+		}
+	}
+
+	rb_set_black(root->rb_node);
+}
+
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+			     struct rb_root *root)
+{
+	struct rb_node *other;
+
+	while ((!node || rb_is_black(node)) && node != root->rb_node)
+	{
+		if (parent->rb_left == node)
+		{
+			other = parent->rb_right;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_left(parent, root);
+				other = parent->rb_right;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_right || rb_is_black(other->rb_right))
+				{
+					rb_set_black(other->rb_left);
+					rb_set_red(other);
+					__rb_rotate_right(other, root);
+					other = parent->rb_right;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_right);
+				__rb_rotate_left(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+		else
+		{
+			other = parent->rb_left;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_right(parent, root);
+				other = parent->rb_left;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_left || rb_is_black(other->rb_left))
+				{
+					rb_set_black(other->rb_right);
+					rb_set_red(other);
+					__rb_rotate_left(other, root);
+					other = parent->rb_left;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_left);
+				__rb_rotate_right(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+	}
+	if (node)
+		rb_set_black(node);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *child, *parent;
+	int color;
+
+	if (!node->rb_left)
+		child = node->rb_right;
+	else if (!node->rb_right)
+		child = node->rb_left;
+	else
+	{
+		struct rb_node *old = node, *left;
+
+		node = node->rb_right;
+		while ((left = node->rb_left) != NULL)
+			node = left;
+		child = node->rb_right;
+		parent = rb_parent(node);
+		color = rb_color(node);
+
+		if (child)
+			rb_set_parent(child, parent);
+		if (parent == old) {
+			parent->rb_right = child;
+			parent = node;
+		} else
+			parent->rb_left = child;
+
+		node->rb_parent_color = old->rb_parent_color;
+		node->rb_right = old->rb_right;
+		node->rb_left = old->rb_left;
+
+		if (rb_parent(old))
+		{
+			if (rb_parent(old)->rb_left == old)
+				rb_parent(old)->rb_left = node;
+			else
+				rb_parent(old)->rb_right = node;
+		} else
+			root->rb_node = node;
+
+		rb_set_parent(old->rb_left, node);
+		if (old->rb_right)
+			rb_set_parent(old->rb_right, node);
+		goto color;
+	}
+
+	parent = rb_parent(node);
+	color = rb_color(node);
+
+	if (child)
+		rb_set_parent(child, parent);
+	if (parent)
+	{
+		if (parent->rb_left == node)
+			parent->rb_left = child;
+		else
+			parent->rb_right = child;
+	}
+	else
+		root->rb_node = child;
+
+ color:
+	if (color == RB_BLACK)
+		__rb_erase_color(child, parent, root);
+}
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a right-hand child, go down and then left as far
+	   as we can. */
+	if (node->rb_right) {
+		node = node->rb_right; 
+		while (node->rb_left)
+			node=node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/* No right-hand children.  Everything down and left is
+	   smaller than us, so any 'next' node must be in the general
+	   direction of our parent. Go up the tree; any time the
+	   ancestor is a right-hand child of its parent, keep going
+	   up. First time it's a left-hand child of its parent, said
+	   parent is our 'next' node. */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left; 
+		while (node->rb_right)
+			node=node->rb_right;
+		return (struct rb_node *)node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (parent) {
+		if (victim == parent->rb_left)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else {
+		root->rb_node = new;
+	}
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
diff --git a/Documentation/perf_counter/util/rbtree.h b/Documentation/perf_counter/util/rbtree.h
new file mode 100644
index 0000000..6bdc488
--- /dev/null
+++ b/Documentation/perf_counter/util/rbtree.h
@@ -0,0 +1,171 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  Some example of insert and search follows here. The search is a plain
+  normal search over an ordered tree. The insert instead must be implemented
+  int two steps: as first thing the code must insert the element in
+  order as a red leaf in the tree, then the support library function
+  rb_insert_color() must be called. Such function will do the
+  not trivial work to rebalance the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+						 unsigned long offset)
+{
+	struct rb_node * n = inode->i_rb_page_cache.rb_node;
+	struct page * page;
+
+	while (n)
+	{
+		page = rb_entry(n, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			n = n->rb_left;
+		else if (offset > page->offset)
+			n = n->rb_right;
+		else
+			return page;
+	}
+	return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+						   unsigned long offset,
+						   struct rb_node * node)
+{
+	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
+	struct rb_node * parent = NULL;
+	struct page * page;
+
+	while (*p)
+	{
+		parent = *p;
+		page = rb_entry(parent, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			p = &(*p)->rb_left;
+		else if (offset > page->offset)
+			p = &(*p)->rb_right;
+		else
+			return page;
+	}
+
+	rb_link_node(node, parent, p);
+
+	return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+						 unsigned long offset,
+						 struct rb_node * node)
+{
+	struct page * ret;
+	if ((ret = __rb_insert_page_cache(inode, offset, node)))
+		goto out;
+	rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+	return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef	_LINUX_RBTREE_H
+#define	_LINUX_RBTREE_H
+
+#include <stddef.h>
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+struct rb_node
+{
+	unsigned long  rb_parent_color;
+#define	RB_RED		0
+#define	RB_BLACK	1
+	struct rb_node *rb_right;
+	struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root
+{
+	struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
+}
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
+#define RB_ROOT	(struct rb_root) { NULL, }
+#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
+#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+			    struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
+{
+	node->rb_parent_color = (unsigned long )parent;
+	node->rb_left = node->rb_right = NULL;
+
+	*rb_link = node;
+}
+
+#endif	/* _LINUX_RBTREE_H */
-- 
cgit v0.10.2


From 040e6034124c504d536736ce08e4643e640cd7c2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 16:25:31 -0300
Subject: perf_counter: Add our private copy of list.h

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/list.h b/Documentation/perf_counter/util/list.h
new file mode 100644
index 0000000..e2548e8
--- /dev/null
+++ b/Documentation/perf_counter/util/list.h
@@ -0,0 +1,603 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/*
+  Copyright (C) Cast of dozens, comes from the Linux kernel
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+*/
+
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *)0x00100100)
+#define LIST_POISON2 ((void *)0x00200200)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @beging: first element in the range to delete from the list.
+ * @beging: first element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries is in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+				  struct list_head *end)
+{
+	begin->prev->next = end->next;
+	end->next->prev = begin->prev;
+}
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *next = head->next;
+	return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+				 struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	first->prev = head;
+	head->next = first;
+
+	last->next = at;
+	at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:       the list head to take the element from.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_first_entry(ptr, type, member) \
+	list_entry((ptr)->next, type, member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); \
+        	pos = pos->next)
+
+/**
+ * __list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; pos != (head); \
+        	pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_entry((head)->next, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos:	the type * to use as a start point
+ * @head:	the head of the list
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+	((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->next, typeof(*pos), member),	\
+		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
+		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+	h->next = NULL;
+	h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+	return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+	struct hlist_node *next = n->next;
+	struct hlist_node **pprev = n->pprev;
+	*pprev = next;
+	if (next)
+		next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->next = LIST_POISON1;
+	n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+	if (!hlist_unhashed(n)) {
+		__hlist_del(n);
+		INIT_HLIST_NODE(n);
+	}
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+	struct hlist_node *first = h->first;
+	n->next = first;
+	if (first)
+		first->pprev = &n->next;
+	h->first = n;
+	n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next;
+	next->pprev = &n->next;
+	*(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	next->next = n->next;
+	n->next = next;
+	next->pprev = &n->next;
+
+	if(next->next)
+		next->next->pprev  = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+	for (pos = (head)->first; pos; \
+	     pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+	     pos = n)
+
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)			 \
+	for (pos = (head)->first;					 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)		 \
+	for (pos = (pos)->next;						 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)			 \
+	for (; pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
+	for (pos = (head)->first;					 \
+	     pos && ({ n = pos->next; 1; }) && 				 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = n)
+
+#endif
-- 
cgit v0.10.2


From ce7e43653b08db094326f378958bc293a68e8e5b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 19 May 2009 09:30:23 -0300
Subject: perf_counter: Use rb_tree for symhists and threads in report

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index f63057f..e857201 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -479,23 +479,25 @@ static size_t map__fprintf(struct map *self, FILE *fp)
 }
 
 struct symhist {
-	struct list_head node;
+	struct rb_node	 rb_node;
 	struct dso	 *dso;
 	struct symbol	 *sym;
+	uint64_t	 ip;
 	uint32_t	 count;
 	char		 level;
 };
 
-static struct symhist *symhist__new(struct symbol *sym, struct dso *dso,
-				    char level)
+static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
+				    struct dso *dso, char level)
 {
 	struct symhist *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
 		self->sym   = sym;
+		self->ip    = ip;
 		self->dso   = dso;
 		self->level = level;
-		self->count = 0;
+		self->count = 1;
 	}
 
 	return self;
@@ -506,12 +508,6 @@ static void symhist__delete(struct symhist *self)
 	free(self);
 }
 
-static bool symhist__equal(struct symhist *self, struct symbol *sym,
-			   struct dso *dso, char level)
-{
-	return self->level == level && self->sym == sym && self->dso == dso;
-}
-
 static void symhist__inc(struct symhist *self)
 {
 	++self->count;
@@ -519,7 +515,7 @@ static void symhist__inc(struct symhist *self)
 
 static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 {
-	size_t ret = fprintf(fp, "[%c] ", self->level);
+	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
 
 	if (self->level != '.')
 		ret += fprintf(fp, "%s", self->sym->name);
@@ -531,9 +527,9 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 }
 
 struct thread {
-	struct list_head node;
+	struct rb_node	 rb_node;
 	struct list_head maps;
-	struct list_head symhists;
+	struct rb_root	 symhists;
 	pid_t		 pid;
 	char		 *comm;
 };
@@ -546,47 +542,43 @@ static struct thread *thread__new(pid_t pid)
 		self->pid = pid;
 		self->comm = NULL;
 		INIT_LIST_HEAD(&self->maps);
-		INIT_LIST_HEAD(&self->symhists);
+		self->symhists = RB_ROOT;
 	}
 
 	return self;
 }
 
-static void thread__insert_symhist(struct thread *self,
-				   struct symhist *symhist)
-{
-	list_add_tail(&symhist->node, &self->symhists);
-}
-
-static struct symhist *thread__symhists_find(struct thread *self,
-					     struct symbol *sym,
-					     struct dso *dso, char level)
+static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
+				 uint64_t ip, struct dso *dso, char level)
 {
-	struct symhist *pos;
+	struct rb_node **p = &self->symhists.rb_node;
+	struct rb_node *parent = NULL;
+	struct symhist *sh;
 
-	list_for_each_entry(pos, &self->symhists, node)
-		if (symhist__equal(pos, sym, dso, level))
-			return pos;
+	while (*p != NULL) {
+		parent = *p;
+		sh = rb_entry(parent, struct symhist, rb_node);
 
-	return NULL;
-}
+		if (sh->sym == sym || ip == sh->ip) {
+			symhist__inc(sh);
+			return 0;
+		}
 
-static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
-				 struct dso *dso, char level)
-{
-	struct symhist *symhist = thread__symhists_find(self, sym, dso, level);
+		/* Handle unresolved symbols too */
+		const uint64_t start = !sh->sym ? sh->ip : sh->sym->start;
 
-	if (symhist == NULL) {
-		symhist = symhist__new(sym, dso, level);
-		if (symhist == NULL)
-			goto out_error;
-		thread__insert_symhist(self, symhist);
+		if (ip < start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
 
-	symhist__inc(symhist);
+	sh = symhist__new(sym, ip, dso, level);
+	if (sh == NULL)
+		return -ENOMEM;
+	rb_link_node(&sh->rb_node, parent, p);
+	rb_insert_color(&sh->rb_node, &self->symhists);
 	return 0;
-out_error:
-	return -ENOMEM;
 }
 
 static int thread__set_comm(struct thread *self, const char *comm)
@@ -608,43 +600,44 @@ static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
 
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
-	struct symhist *pos;
 	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
+	struct rb_node *nd;
 
-	list_for_each_entry(pos, &self->symhists, node)
+	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
+		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
 		ret += symhist__fprintf(pos, fp);
+	}
 
 	return ret;
 }
 
-static LIST_HEAD(threads);
+static struct rb_root threads = RB_ROOT;
 
-static void threads__add(struct thread *thread)
-{
-	list_add_tail(&thread->node, &threads);
-}
-
-static struct thread *threads__find(pid_t pid)
+static struct thread *threads__findnew(pid_t pid)
 {
-	struct thread *pos;
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
 
-	list_for_each_entry(pos, &threads, node)
-		if (pos->pid == pid)
-			return pos;
-	return NULL;
-}
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
 
-static struct thread *threads__findnew(pid_t pid)
-{
-	struct thread *thread = threads__find(pid);
+		if (th->pid == pid)
+			return th;
 
-	if (thread == NULL) {
-		thread = thread__new(pid);
-		if (thread != NULL)
-			threads__add(thread);
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
 
-	return thread;
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+	}
+	return th;
 }
 
 static void thread__insert_map(struct thread *self, struct map *map)
@@ -668,44 +661,13 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 
 static void threads__fprintf(FILE *fp)
 {
-	struct thread *pos;
-
-	list_for_each_entry(pos, &threads, node)
+	struct rb_node *nd;
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
 		thread__fprintf(pos, fp);
+	}
 }
 
-#if 0
-static std::string resolve_user_symbol(int pid, uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	maps_t &m = maps[pid];
-	maps_t::const_iterator mi = m.upper_bound(map(ip));
-	if (mi == m.end())
-		return sym;
-
-	ip -= mi->start + mi->pgoff;
-
-	symbols_t &s = dsos[mi->dso].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	sym = mi->dso + ": <unknown>";
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = mi->dso + ": " + si->name;
-#if 0
-	else if (si->start <= ip)
-		sym = mi->dso + ": ?" + si->name;
-#endif
-
-	return sym;
-}
-#endif
-
 static void display_help(void)
 {
 	printf(
@@ -824,8 +786,11 @@ more:
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
 
-		if (thread == NULL)
+		if (thread == NULL) {
+			fprintf(stderr, "problem processing %d event, bailing out\n",
+				event->header.type);
 			goto done;
+		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
 			show = SHOW_KERNEL;
@@ -845,8 +810,11 @@ more:
 		if (show & show_mask) {
 			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
 
-			if (thread__symbol_incnew(thread, sym, dso, level))
+			if (thread__symbol_incnew(thread, sym, event->ip.ip,
+						  dso, level)) {
+				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
 				goto done;
+			}
 		}
 		total++;
 	} else switch (event->header.type) {
@@ -854,8 +822,10 @@ more:
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap);
 
-		if (thread == NULL || map == NULL )
+		if (thread == NULL || map == NULL) {
+			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
 			goto done;
+		}
 		thread__insert_map(thread, map);
 		break;
 	}
@@ -863,8 +833,10 @@ more:
 		struct thread *thread = threads__findnew(event->comm.pid);
 
 		if (thread == NULL ||
-		    thread__set_comm(thread, event->comm.comm))
+		    thread__set_comm(thread, event->comm.comm)) {
+			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
 			goto done;
+		}
 		break;
 	}
 	}
-- 
cgit v0.10.2


From f3e08c5341c528284460530b546608f27232f737 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 22 May 2009 15:34:54 +0200
Subject: perf report: Fix segfault on unknown symbols

Ingo reported:

> Program received signal SIGSEGV, Segmentation fault.
> 0x0000003e25080f80 in strlen () from /lib64/libc.so.6
> Missing separate debuginfos, use: debuginfo-install elfutils.x86_64
> glibc.x86_64 zlib.x86_64
> (gdb) bt
> #0  0x0000003e25080f80 in strlen () from /lib64/libc.so.6
> #1  0x0000003e2506954e in fputs () from /lib64/libc.so.6
> #2  0x00000000004059e8 in cmd_report (argc=<value optimized out>,
>     argv=<value optimized out>) at builtin-report.c:521
> #3  0x0000000000402dad in handle_internal_command (argc=1, argv=0x7fffe1218e30)
>     at perf.c:226
> #4  0x0000000000402f6d in main (argc=1, argv=0x7fffe1218e30) at perf.c:324
> (gdb)

Signed-off-by Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e857201..21386a8 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -518,7 +518,7 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s", self->sym->name);
+		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
 	else
 		ret += fprintf(fp, "%s: %s",
 			       self->dso ? self->dso->name : "<unknown",
-- 
cgit v0.10.2


From 53cb8bc2a3d976efd1a800c3de4640a7220afbb3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: perf record: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 21386a8..9e59d60 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,52 +1,29 @@
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
+#include "util/util.h"
+
+#include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
-#include <libelf.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <getopt.h>
-#include <assert.h>
-#include <search.h>
-
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
+
 #include "util/list.h"
 #include "util/rbtree.h"
 
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
 #define SHOW_KERNEL	1
 #define SHOW_USER	2
 #define SHOW_HV		4
 
-static char 		const *input_name = "output.perf";
+static char		const *input_name = "output.perf";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-static const char *perf_event_names[] = {
+const char *perf_event_names[] = {
 	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
 	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
 	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
@@ -86,7 +63,7 @@ struct section {
 	char		 name[0];
 };
 
-static struct section *section__new(uint64_t start, uint64_t size,
+struct section *section__new(uint64_t start, uint64_t size,
 				    uint64_t offset, char *name)
 {
 	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
@@ -241,7 +218,7 @@ static inline uint8_t elf_sym__type(const GElf_Sym *sym)
 	return GELF_ST_TYPE(sym->st_info);
 }
 
-static inline bool elf_sym__is_function(const GElf_Sym *sym)
+static inline int elf_sym__is_function(const GElf_Sym *sym)
 {
 	return elf_sym__type(sym) == STT_FUNC &&
 	       sym->st_name != 0 &&
@@ -393,7 +370,7 @@ out_delete_dso:
 	return NULL;
 }
 
-static void dsos__fprintf(FILE *fp)
+void dsos__fprintf(FILE *fp)
 {
 	struct dso *pos;
 
@@ -503,7 +480,7 @@ static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
 	return self;
 }
 
-static void symhist__delete(struct symhist *self)
+void symhist__delete(struct symhist *self)
 {
 	free(self);
 }
@@ -587,7 +564,7 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
-static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
+size_t thread__maps_fprintf(struct thread *self, FILE *fp)
 {
 	struct map *pos;
 	size_t ret = 0;
@@ -668,49 +645,7 @@ static void threads__fprintf(FILE *fp)
 	}
 }
 
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-report [<options>]\n"
-	" -i file   --input=<file>      # input file\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"input",	required_argument,	NULL, 'i'},
-			{"no-user",	no_argument,		NULL, 'u'},
-			{"no-kernel",	no_argument,		NULL, 'k'},
-			{"no-hv",	no_argument,		NULL, 'h'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:i:kuh",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'i': input_name			= strdup(optarg); break;
-		case 'k': show_mask &= ~SHOW_KERNEL; break;
-		case 'u': show_mask &= ~SHOW_USER; break;
-		case 'h': show_mask &= ~SHOW_HV; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (error)
-		display_help();
-}
-
-int cmd_report(int argc, char **argv)
+static int __cmd_report(void)
 {
 	unsigned long offset = 0;
 	unsigned long head = 0;
@@ -720,12 +655,6 @@ int cmd_report(int argc, char **argv)
 	int ret, rc = EXIT_FAILURE;
 	unsigned long total = 0;
 
-	elf_version(EV_CURRENT);
-
-	page_size = getpagesize();
-
-	process_options(argc, argv);
-
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
 		perror("failed to open file");
@@ -867,3 +796,24 @@ done:
 	return rc;
 }
 
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_END()
+};
+
+int cmd_report(int argc, const char **argv, const char *prefix)
+{
+	elf_version(EV_CURRENT);
+
+	page_size = getpagesize();
+
+	parse_options(argc, argv, options, report_usage, 0);
+
+	return __cmd_report();
+}
-- 
cgit v0.10.2


From 0bec253c813fbb067db4dfd9f5b6cec1bd2ef026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: perf report: Add help/manpage

Add a (minimal) manpage for perf report.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
new file mode 100644
index 0000000..64696a2
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -0,0 +1,32 @@
+perf-report(1)
+==========
+
+NAME
+----
+perf-report - Read output.perf (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf report.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: output.perf)
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
-- 
cgit v0.10.2


From f91183fe3780d44849110a1653dfe8af7bc67aa4 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 26 May 2009 15:25:34 +0200
Subject: perf top: Remove leftover NMI/IRQ bits

79202b removed IRQ/NMI mode selection, so remove it from
perf top as well.

[ Impact: cleanup ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 87b925c..cacaa3c 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -8,7 +8,7 @@
    Sample output:
 
 ------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+ KernelTop:    2669 irqs/sec  [cache-misses/cache-refs],  (all, cpu: 2)
 ------------------------------------------------------------------------------
 
              weight         RIP          kernel function
@@ -92,7 +92,6 @@ static __u64			count_filter		       = 100;
 static int			target_pid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
-static int			nmi				=  1;
 static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
@@ -198,10 +197,9 @@ static void print_sym_table(void)
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [",
 		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
-		nmi ? "NMI" : "IRQ");
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
 	if (nr_counters == 1)
 		printf("%d ", event_count[0]);
@@ -637,7 +635,7 @@ static int __cmd_top(void)
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= nmi;
+			hw_event.nmi		= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
-- 
cgit v0.10.2


From db20c0031288ff524d82b1f240f35f85d4a052eb Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 26 May 2009 15:25:34 +0200
Subject: perf top: fix typo in -d option

Clean up copy/paste options parsing conversion error.

[ Impact: reactivate -d option ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index cacaa3c..6b1c66f 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -727,7 +727,7 @@ static const struct option options[] = {
 		    "number of mmap data pages"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
-	OPT_INTEGER('d', "delay", &realtime_prio,
+	OPT_INTEGER('d', "delay", &delay_secs,
 		    "number of seconds to delay between refreshes"),
 	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
 			    "dump the symbol table used for profiling"),
-- 
cgit v0.10.2


From d076d2bd0d9379314df5f2ab8b9c83f617c70923 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 23:10:15 +0900
Subject: sh: pci-sh7751: Initialize io_map_base in controller definition.

As there is only a single controller and remapping has no impact for the
address range in question, just initialize it directly in the controller
definition. This fixes up boot time warnings about not having the field
initialized.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index c4fa0bb1..70c1999 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -52,7 +52,7 @@ static struct resource sh7751_io_resource = {
 };
 
 static struct resource sh7751_mem_resource = {
-	.name	= "SH7785_mem",
+	.name	= "SH7751_mem",
 	.start	= SH7751_PCI_MEMORY_BASE,
 	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
 	.flags	= IORESOURCE_MEM
@@ -64,6 +64,7 @@ static struct pci_channel sh7751_pci_controller = {
 	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7751_io_resource,
 	.io_offset	= 0x00000000,
+	.io_map_base	= SH7751_PCI_IO_BASE,
 };
 
 static struct sh4_pci_address_map sh7751_pci_map = {
@@ -179,8 +180,6 @@ static int __init sh7751_pci_init(void)
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	__set_io_port_base(SH7751_PCI_IO_BASE);
-
 	register_pci_controller(chan);
 
 	return 0;
-- 
cgit v0.10.2


From b7e2ac619465f1774b827d9997109ceef4a61851 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 23:13:13 +0900
Subject: sh: pci: Disable MWI and make pci_dma_burst_advice() a bit more
 accurate.

None of the SH PCI controllers support MWI, it is always treated as a
direct memory write, so simply disable it outright. In the case of the
PCI cache line size, consult that for the pci_dma_burst_advice()
strategy, and switch over to PCI_DMA_BURST_MULTIPLE, as PPC64.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5b2e0fc..ae0da6f 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -89,12 +89,28 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #endif
 
 #ifdef CONFIG_PCI
+/*
+ * None of the SH PCI controllers support MWI, it is always treated as a
+ * direct memory write.
+ */
+#define PCI_DISABLE_MWI
+
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
 {
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+
+	if (byte == 0)
+		cacheline_size = L1_CACHE_BYTES;
+	else
+		cacheline_size = byte << 2;
+
+	*strat = PCI_DMA_BURST_MULTIPLE;
+	*strategy_parameter = cacheline_size;
 }
 #endif
 
-- 
cgit v0.10.2


From f17e04afaff84b5cfd317da29ac4d764908ff833 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 May 2009 15:30:22 +0200
Subject: perf report: Fix ELF symbol parsing

[ Impact: fix DSO symbol output in perf report ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 412dea1..10c13a6 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -g -O2 -Wall
+CFLAGS = -ggdb3 -Wall
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9e59d60..697f960 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -55,34 +55,6 @@ typedef union event_union {
 	struct comm_event comm;
 } event_t;
 
-struct section {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 offset;
-	char		 name[0];
-};
-
-struct section *section__new(uint64_t start, uint64_t size,
-				    uint64_t offset, char *name)
-{
-	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		self->start  = start;
-		self->end    = start + size;
-		self->offset = offset;
-		strcpy(self->name, name);
-	}
-
-	return self;
-}
-
-static void section__delete(struct section *self)
-{
-	free(self);
-}
-
 struct symbol {
 	struct rb_node rb_node;
 	uint64_t       start;
@@ -116,7 +88,6 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 
 struct dso {
 	struct list_head node;
-	struct list_head sections;
 	struct rb_root	 syms;
 	char		 name[0];
 };
@@ -127,21 +98,12 @@ static struct dso *dso__new(const char *name)
 
 	if (self != NULL) {
 		strcpy(self->name, name);
-		INIT_LIST_HEAD(&self->sections);
 		self->syms = RB_ROOT;
 	}
 
 	return self;
 }
 
-static void dso__delete_sections(struct dso *self)
-{
-	struct section *pos, *n;
-
-	list_for_each_entry_safe(pos, n, &self->sections, node)
-		section__delete(pos);
-}
-
 static void dso__delete_symbols(struct dso *self)
 {
 	struct symbol *pos;
@@ -156,7 +118,6 @@ static void dso__delete_symbols(struct dso *self)
 
 static void dso__delete(struct dso *self)
 {
-	dso__delete_sections(self);
 	dso__delete_symbols(self);
 	free(self);
 }
@@ -282,9 +243,6 @@ static int dso__load(struct dso *self)
 	if (sec == NULL)
 		goto out_elf_end;
 
-	if (gelf_getshdr(sec, &shdr) == NULL)
-		goto out_elf_end;
-
 	Elf_Data *syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
 		goto out_elf_end;
@@ -302,11 +260,21 @@ static int dso__load(struct dso *self)
 	GElf_Sym sym;
 	uint32_t index;
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
+
 		if (!elf_sym__is_function(&sym))
 			continue;
-		struct symbol *f = symbol__new(sym.st_value, sym.st_size,
-					       elf_sym__name(&sym, symstrs));
-		if (f == NULL)
+
+		sec = elf_getscn(elf, sym.st_shndx);
+		if (!sec)
+			goto out_elf_end;
+
+		gelf_getshdr(sec, &shdr);
+		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+		f = symbol__new(sym.st_value, sym.st_size,
+				elf_sym__name(&sym, symstrs));
+		if (!f)
 			goto out_elf_end;
 
 		dso__insert_symbol(self, f);
@@ -498,7 +466,7 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
 	else
 		ret += fprintf(fp, "%s: %s",
-			       self->dso ? self->dso->name : "<unknown",
+			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
 	return ret + fprintf(fp, ": %u\n", self->count);
 }
@@ -714,6 +682,7 @@ more:
 		int show = 0;
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
+		uint64_t ip = event->ip.ip;
 
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, bailing out\n",
@@ -728,19 +697,20 @@ more:
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 			show = SHOW_USER;
 			level = '.';
-			struct map *map = thread__find_map(thread, event->ip.ip);
-			if (map != NULL)
+			struct map *map = thread__find_map(thread, ip);
+			if (map != NULL) {
 				dso = map->dso;
+				ip -= map->start + map->pgoff;
+			}
 		} else {
 			show = SHOW_HV;
 			level = 'H';
 		}
 
 		if (show & show_mask) {
-			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
+			struct symbol *sym = dso__find_symbol(dso, ip);
 
-			if (thread__symbol_incnew(thread, sym, event->ip.ip,
-						  dso, level)) {
+			if (thread__symbol_incnew(thread, sym, ip, dso, level)) {
 				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
 				goto done;
 			}
-- 
cgit v0.10.2


From 59d81029b6804c3d5895d07cad77d7dfddc6b5b2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 11:14:27 -0300
Subject: perf report: Fix kernel symbol resolution

kallsyms have just the symbol start, so we need to read two lines
to get the len.

[ Impact: fix incorrect kernel symbol display in perf report ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 697f960..b19b893 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -360,9 +360,17 @@ static int load_kallsyms(void)
 	char *line = NULL;
 	size_t n;
 
+	if (getline(&line, &n, file) < 0 || !line)
+		goto out_delete_dso;
+
+	unsigned long long previous_start;
+	char c, previous_symbf[4096];
+	if (sscanf(line, "%llx %c %s", &previous_start, &c, previous_symbf) != 3)
+		goto out_delete_line;
+
 	while (!feof(file)) {
 		unsigned long long start;
-		char c, symbf[4096];
+		char symbf[4096];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -371,12 +379,18 @@ static int load_kallsyms(void)
 			goto out_delete_dso;
 
 		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			struct symbol *sym = symbol__new(start, 0x1000000, symbf);
+			if (start > previous_start) {
+				struct symbol *sym = symbol__new(previous_start,
+								 start - previous_start,
+								 previous_symbf);
 
-			if (sym == NULL)
-				goto out_delete_dso;
+				if (sym == NULL)
+					goto out_delete_dso;
 
-			dso__insert_symbol(kernel_dso, sym);
+				dso__insert_symbol(kernel_dso, sym);
+				previous_start = start;
+				strcpy(previous_symbf, symbf);
+			}
 		}
 	}
 
@@ -385,6 +399,8 @@ static int load_kallsyms(void)
 	fclose(file);
 	return 0;
 
+out_delete_line:
+	free(line);
 out_delete_dso:
 	dso__delete(kernel_dso);
 	return -1;
-- 
cgit v0.10.2


From e1b28aab5804aa477c33d19855d6747607a885fd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 26 May 2009 15:41:27 +0100
Subject: GFS2: Remove lockstruct subdir from gfs2 sysfs files

The lockstruct sub directory contained two entries, both of
which are duplicated elsewhere in the gfs2 sysfs files as
well as being available via /proc/mounts. There is no userland program
using either of them, so this patch removes them.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9f6d48b..94bd59e 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -276,25 +276,6 @@ static struct kobj_type gfs2_ktype = {
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-/*
- * display struct lm_lockstruct fields
- */
-
-#define LOCKSTRUCT_ATTR(name, fmt)                                          \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
-}                                                                           \
-static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name)
-
-LOCKSTRUCT_ATTR(jid,      "%u\n");
-LOCKSTRUCT_ATTR(first,    "%u\n");
-
-static struct attribute *lockstruct_attrs[] = {
-	&lockstruct_attr_jid.attr,
-	&lockstruct_attr_first.attr,
-	NULL,
-};
 
 /*
  * lock_module. Originally from lock_dlm
@@ -397,6 +378,11 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+}
+
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
@@ -404,6 +390,7 @@ GDLM_ATTR(proto_name,     0444, proto_name_show,	NULL);
 GDLM_ATTR(block,          0644, block_show,		block_store);
 GDLM_ATTR(withdraw,       0644, withdraw_show,		withdraw_store);
 GDLM_ATTR(id,             0444, lkid_show,		NULL);
+GDLM_ATTR(jid,		  0444, jid_show,		NULL);
 GDLM_ATTR(first,          0444, lkfirst_show,		NULL);
 GDLM_ATTR(first_done,     0444, first_done_show,	NULL);
 GDLM_ATTR(recover,        0200, NULL,			recover_store);
@@ -415,7 +402,7 @@ static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
 	&gdlm_attr_id.attr,
-	&lockstruct_attr_jid.attr,
+	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
 	&gdlm_attr_recover.attr,
@@ -558,11 +545,6 @@ static struct attribute *tune_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group lockstruct_group = {
-	.name = "lockstruct",
-	.attrs = lockstruct_attrs,
-};
-
 static struct attribute_group args_group = {
 	.name = "args",
 	.attrs = args_attrs,
@@ -588,13 +570,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
-	if (error)
-		goto fail_reg;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
 	if (error)
-		goto fail_lockstruct;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
@@ -611,8 +589,6 @@ fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_lockstruct:
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -624,7 +600,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
-- 
cgit v0.10.2


From f6eb53498ee8f725832f3a0fffca90566bb118a6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 26 May 2009 15:50:25 +0100
Subject: GFS2: Remove args subdir from gfs2 sysfs files

Since we can cat /proc/mounts there is no need to have this
subdirectory in the gfs2 sysfs files. In fact this does not
reflect the full range of possible mount argumenmts, where
as /proc/mounts does.

There was only one userland user of this set of sysfs files
and it will function perfectly well without these files
being present (in fact that subcommand of gfs2_tool is
obsolete anyway).

The tune/* subdirectory is also considered mostly obsolete,
but there are a few uses of this until mount arguments can
be added for the last few functions for which there are no
equivalents currently. However the tune/* directory is still
in my sights and new code should avoid using it. Only the gfs2_quota
and gfs2_tool programs are know to use tune/* at the moment.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 94bd59e..23419dc 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -411,44 +411,6 @@ static struct attribute *lock_module_attrs[] = {
 	NULL,
 };
 
-#define ARGS_ATTR(name, fmt)                                                \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
-}                                                                           \
-static struct gfs2_attr args_attr_##name = __ATTR_RO(name)
-
-ARGS_ATTR(lockproto,       "%s\n");
-ARGS_ATTR(locktable,       "%s\n");
-ARGS_ATTR(hostdata,        "%s\n");
-ARGS_ATTR(spectator,       "%d\n");
-ARGS_ATTR(ignore_local_fs, "%d\n");
-ARGS_ATTR(localcaching,    "%d\n");
-ARGS_ATTR(localflocks,     "%d\n");
-ARGS_ATTR(debug,           "%d\n");
-ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(posix_acl,       "%d\n");
-ARGS_ATTR(quota,           "%u\n");
-ARGS_ATTR(suiddir,         "%d\n");
-ARGS_ATTR(data,            "%d\n");
-
-static struct attribute *args_attrs[] = {
-	&args_attr_lockproto.attr,
-	&args_attr_locktable.attr,
-	&args_attr_hostdata.attr,
-	&args_attr_spectator.attr,
-	&args_attr_ignore_local_fs.attr,
-	&args_attr_localcaching.attr,
-	&args_attr_localflocks.attr,
-	&args_attr_debug.attr,
-	&args_attr_upgrade.attr,
-	&args_attr_posix_acl.attr,
-	&args_attr_quota.attr,
-	&args_attr_suiddir.attr,
-	&args_attr_data.attr,
-	NULL,
-};
-
 /*
  * get and set struct gfs2_tune fields
  */
@@ -545,11 +507,6 @@ static struct attribute *tune_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group args_group = {
-	.name = "args",
-	.attrs = args_attrs,
-};
-
 static struct attribute_group tune_group = {
 	.name = "tune",
 	.attrs = tune_attrs,
@@ -570,13 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
-	if (error)
-		goto fail_reg;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
-		goto fail_args;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
 	if (error)
@@ -587,8 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-fail_args:
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -599,7 +550,6 @@ fail:
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
-- 
cgit v0.10.2


From 5582b0648de6248c67c0b47fa170e5fb15ab4bf1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 27 May 2009 00:12:58 +0900
Subject: sh: pci-sh7780: Fix up for PCI_DISABLE_MWI changes.

This fixes a build error where references to pci_cache_line_size are
undefined, as this ceases to be exported when PCI_DISABLE_MWI is enabled,
as is now the default.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index ae13ff9..323b92d 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -15,8 +15,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-extern u8 pci_cache_line_size;
-
 static struct resource sh7785_io_resource = {
 	.name	= "SH7785_IO",
 	.start	= SH7780_PCI_IO_BASE,
@@ -37,6 +35,7 @@ static struct pci_channel sh7780_pci_controller = {
 	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7785_io_resource,
 	.io_offset	= 0x00000000,
+	.io_map_base	= SH7780_PCI_IO_BASE,
 };
 
 static struct sh4_pci_address_map sh7780_pci_map = {
@@ -99,8 +98,6 @@ static int __init sh7780_pci_init(void)
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
 		     chan->reg_base + SH7780_PCISUB);
 
-	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
-
 	/*
 	 * Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
@@ -142,8 +139,6 @@ static int __init sh7780_pci_init(void)
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	__set_io_port_base(SH7780_PCI_IO_BASE);
-
 	register_pci_controller(chan);
 
 	return 0;
-- 
cgit v0.10.2


From abd54f68629fa73ed4fa040d433196211a9bbed2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 12:21:34 -0300
Subject: perf: Don't assume /proc/kallsyms is ordered

perf: Don't assume /proc/kallsyms is ordered

Since we _are_ ordering it by the symbol start, just traverse the
freshly built rbtree setting the prev->end members to curr->start - 1.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090526152134.GF4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index b19b893..e178190 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -360,17 +360,9 @@ static int load_kallsyms(void)
 	char *line = NULL;
 	size_t n;
 
-	if (getline(&line, &n, file) < 0 || !line)
-		goto out_delete_dso;
-
-	unsigned long long previous_start;
-	char c, previous_symbf[4096];
-	if (sscanf(line, "%llx %c %s", &previous_start, &c, previous_symbf) != 3)
-		goto out_delete_line;
-
 	while (!feof(file)) {
 		unsigned long long start;
-		char symbf[4096];
+		char c, symbf[4096];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -379,21 +371,35 @@ static int load_kallsyms(void)
 			goto out_delete_dso;
 
 		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			if (start > previous_start) {
-				struct symbol *sym = symbol__new(previous_start,
-								 start - previous_start,
-								 previous_symbf);
+			/*
+			 * Well fix up the end later, when we have all sorted.
+			 */
+			struct symbol *sym = symbol__new(start, 0xdead, symbf);
 
-				if (sym == NULL)
-					goto out_delete_dso;
+			if (sym == NULL)
+				goto out_delete_dso;
 
-				dso__insert_symbol(kernel_dso, sym);
-				previous_start = start;
-				strcpy(previous_symbf, symbf);
-			}
+			dso__insert_symbol(kernel_dso, sym);
 		}
 	}
 
+	/*
+	 * Now that we have all sorted out, just set the ->end of all
+	 * symbols
+	 */
+	struct rb_node *nd, *prevnd = rb_first(&kernel_dso->syms);
+
+	if (prevnd == NULL)
+		goto out_delete_line;
+
+	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+			      *curr = rb_entry(nd, struct symbol, rb_node);
+
+		prev->end = curr->start - 1;
+		prevnd = nd;
+	}
+
 	dsos__add(kernel_dso);
 	free(line);
 	fclose(file);
-- 
cgit v0.10.2


From 4319503779060120fa5de9b8fde056603bb6e0fd Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jwilson@fedoraproject.org>
Date: Fri, 6 Mar 2009 20:24:57 +0000
Subject: [CPUFREQ] add atom family to p4-clockmod

Some atom procs don't do freq scaling (such as the atom 330 on my own
littlefalls2 board). By adding the atom family here, we at least get
the benefit of passive cooling in a thermal emergency. Not sure how
to see that its actually helping any, but the driver does bind and
claim its functioning on my atom 330.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 6ac55bd..8696151 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 		case 0x0E: /* Core */
 		case 0x0F: /* Core Duo */
 		case 0x16: /* Celeron Core */
+		case 0x1C: /* Atom */
 			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
 			return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
 		case 0x0D: /* Pentium M (Dothan) */
-- 
cgit v0.10.2


From d38e73e8dad454a5916f446b0d3523c1161ae95a Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 23 Apr 2009 13:36:12 -0400
Subject: [CPUFREQ] powernow-k7 build fix when ACPI=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/x86/kernel/cpu/cpufreq/powernow-k7.c:172: warning: 'invalidate_entry' defined but not used

Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 3c28ccd..a8363e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -168,10 +168,12 @@ static int check_powernow(void)
 	return 1;
 }
 
+#ifdef CONFIG_X86_POWERNOW_K7_ACPI
 static void invalidate_entry(unsigned int entry)
 {
 	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
 }
+#endif
 
 static int get_ranges(unsigned char *pst)
 {
-- 
cgit v0.10.2


From 42a06f2166f2f6f7bf04f32b4e823eacdceafdc9 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Sun, 17 May 2009 10:23:52 -0400
Subject: [CPUFREQ] remove rwsem lock from CPUFREQ_GOV_STOP call

* Rafael J. Wysocki (rjw@sisk.pl) wrote:
> This message has been generated automatically as a part of a report
> of regressions introduced between 2.6.28 and 2.6.29.
>
> The following bug entry is on the current list of known regressions
> introduced between 2.6.28 and 2.6.29.  Please verify if it still should
> be listed and let me know (either way).
>
>
> Bug-Entry	: http://bugzilla.kernel.org/show_bug.cgi?id=13186
> Subject		: cpufreq timer teardown problem
> Submitter	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date		: 2009-04-23 14:00 (24 days old)
> References	: http://marc.info/?l=linux-kernel&m=124049523515036&w=4
> Handled-By	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Patch		: http://patchwork.kernel.org/patch/19754/
> 		  http://patchwork.kernel.org/patch/19753/

The patches linked above depend on the following patch to remove
circular locking dependency :

cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call

(the following issue was faced when using cancel_delayed_work_sync() in the
timer teardown (which fixes a race).

* KOSAKI Motohiro (kosaki.motohiro@jp.fujitsu.com) wrote:
> Hi
>
> my box output following warnings.
> it seems regression by commit 7ccc7608b836e58fbacf65ee4f8eefa288e86fac.
>
> A: work -> do_dbs_timer()  -> cpu_policy_rwsem
> B: store() -> cpu_policy_rwsem -> cpufreq_governor_dbs() -> work
>
>

Hrm, I think it must be due to my attempt to fix the timer teardown race
in ondemand governor mixed with new locking behavior in 2.6.30-rc.

The rwlock seems to be taken around the whole call to
cpufreq_governor_dbs(), when it should be only taken around accesses to
the locked data, and especially *not* around the call to
dbs_timer_exit().

Reverting my fix attempt would put the teardown race back in place
(replacing the cancel_delayed_work_sync by cancel_delayed_work).
Instead, a proper fix would imply modifying this critical section :

cpufreq.c: __cpufreq_remove_dev()
...
        if (cpufreq_driver->target)
                __cpufreq_governor(data, CPUFREQ_GOV_STOP);

        unlock_policy_rwsem_write(cpu);

To make sure the __cpufreq_governor() callback is not called with rwsem
held. This would allow execution of cancel_delayed_work_sync() without
being nested within the rwsem.

Applies on top of the 2.6.30-rc5 tree.

Required to remove circular dep in teardown of both conservative and
ondemande governors so they can use cancel_delayed_work_sync().
CPUFREQ_GOV_STOP does not modify the policy, therefore this locking seemed
unneeded.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Greg KH <greg@kroah.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: "Rafael J. Wysocki" <rjw@sisk.pl>
CC: Ben Slusky <sluskyb@paranoiacs.org>
CC: Chris Wright <chrisw@sous-sol.org>
CC: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d270e8e..47d2ad0 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1070,11 +1070,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 #endif
 
+	unlock_policy_rwsem_write(cpu);
+
 	if (cpufreq_driver->target)
 		__cpufreq_governor(data, CPUFREQ_GOV_STOP);
 
-	unlock_policy_rwsem_write(cpu);
-
 	kobject_put(&data->kobj);
 
 	/* we need to make sure that the underlying kobj is actually
-- 
cgit v0.10.2


From b253d2b2d28ead6fed012feb54694b3d0562839a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Sun, 17 May 2009 10:29:33 -0400
Subject: [CPUFREQ] fix timer teardown in conservative governor

* Rafael J. Wysocki (rjw@sisk.pl) wrote:
> This message has been generated automatically as a part of a report
> of regressions introduced between 2.6.28 and 2.6.29.
>
> The following bug entry is on the current list of known regressions
> introduced between 2.6.28 and 2.6.29.  Please verify if it still should
> be listed and let me know (either way).
>
>
> Bug-Entry	: http://bugzilla.kernel.org/show_bug.cgi?id=13186
> Subject		: cpufreq timer teardown problem
> Submitter	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date		: 2009-04-23 14:00 (24 days old)
> References	: http://marc.info/?l=linux-kernel&m=124049523515036&w=4
> Handled-By	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Patch		: http://patchwork.kernel.org/patch/19754/
> 		  http://patchwork.kernel.org/patch/19753/
>

(re-send with updated changelog)

cpufreq fix timer teardown in conservative governor

The problem is that dbs_timer_exit() uses cancel_delayed_work() when it should
use cancel_delayed_work_sync(). cancel_delayed_work() does not wait for the
workqueue handler to exit.

The ondemand governor does not seem to be affected because the
"if (!dbs_info->enable)" check at the beginning of the workqueue handler returns
immediately without rescheduling the work. The conservative governor in
2.6.30-rc has the same check as the ondemand governor, which makes things
usually run smoothly. However, if the governor is quickly stopped and then
started, this could lead to the following race :

dbs_enable could be reenabled and multiple do_dbs_timer handlers would run.
This is why a synchronized teardown is required.

Depends on patch
cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call

The following patch applies to 2.6.30-rc2. Stable kernels have a similar
issue which should also be fixed, but the code changed between 2.6.29
and 2.6.30, so this patch only applies to 2.6.30-rc.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: gregkh@suse.de
CC: stable@kernel.org
CC: cpufreq@vger.kernel.org
CC: Ingo Molnar <mingo@elte.hu>
CC: rjw@sisk.pl
CC: Ben Slusky <sluskyb@paranoiacs.org>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 2ecd95e..7a74d17 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -91,6 +91,9 @@ static unsigned int dbs_enable;	/* number of CPUs using this policy */
  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
  * is recursive for the same process. -Venki
+ * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
+ * would deadlock with cancel_delayed_work_sync(), which is needed for proper
+ * raceless workqueue teardown.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -542,7 +545,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
 	dbs_info->enable = 0;
-	cancel_delayed_work(&dbs_info->work);
+	cancel_delayed_work_sync(&dbs_info->work);
 }
 
 static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
-- 
cgit v0.10.2


From b14893a62c73af0eca414cfed505b8c09efc613c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Sun, 17 May 2009 10:30:45 -0400
Subject: [CPUFREQ] fix timer teardown in ondemand governor

* Rafael J. Wysocki (rjw@sisk.pl) wrote:
> This message has been generated automatically as a part of a report
> of regressions introduced between 2.6.28 and 2.6.29.
>
> The following bug entry is on the current list of known regressions
> introduced between 2.6.28 and 2.6.29.  Please verify if it still should
> be listed and let me know (either way).
>
>
> Bug-Entry	: http://bugzilla.kernel.org/show_bug.cgi?id=13186
> Subject		: cpufreq timer teardown problem
> Submitter	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date		: 2009-04-23 14:00 (24 days old)
> References	: http://marc.info/?l=linux-kernel&m=124049523515036&w=4
> Handled-By	: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Patch		: http://patchwork.kernel.org/patch/19754/
> 		  http://patchwork.kernel.org/patch/19753/
>

(updated changelog)

cpufreq fix timer teardown in ondemand governor

The problem is that dbs_timer_exit() uses cancel_delayed_work() when it should
use cancel_delayed_work_sync(). cancel_delayed_work() does not wait for the
workqueue handler to exit.

The ondemand governor does not seem to be affected because the
"if (!dbs_info->enable)" check at the beginning of the workqueue handler returns
immediately without rescheduling the work. The conservative governor in
2.6.30-rc has the same check as the ondemand governor, which makes things
usually run smoothly. However, if the governor is quickly stopped and then
started, this could lead to the following race :

dbs_enable could be reenabled and multiple do_dbs_timer handlers would run.
This is why a synchronized teardown is required.

The following patch applies to, at least, 2.6.28.x, 2.6.29.1, 2.6.30-rc2.

Depends on patch
cpufreq: remove rwsem lock from CPUFREQ_GOV_STOP call

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: gregkh@suse.de
CC: stable@kernel.org
CC: cpufreq@vger.kernel.org
CC: Ingo Molnar <mingo@elte.hu>
CC: rjw@sisk.pl
CC: Ben Slusky <sluskyb@paranoiacs.org>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 338f428..e741c33 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -98,6 +98,9 @@ static unsigned int dbs_enable;	/* number of CPUs using this policy */
  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
  * is recursive for the same process. -Venki
+ * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
+ * would deadlock with cancel_delayed_work_sync(), which is needed for proper
+ * raceless workqueue teardown.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -562,7 +565,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
 	dbs_info->enable = 0;
-	cancel_delayed_work(&dbs_info->work);
+	cancel_delayed_work_sync(&dbs_info->work);
 }
 
 static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
-- 
cgit v0.10.2


From df1829770db415dc5a5ed5ada3bd70176c6f0a01 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Wed, 22 Apr 2009 13:48:32 +0200
Subject: [CPUFREQ] powernow-k8 cleanup msg if BIOS does not export ACPI _PSS
 cpufreq data

- Make the message shorter and easier to grep for
- Use printk_once instead of WARN_ONCE (functionality of these was mixed)

Signed-off-by: Thomas Renninger <trenn@suse.de>
Cc: Langsdorf, Mark <mark.langsdorf@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4709ead..feef10c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1215,13 +1215,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
 	return cpufreq_frequency_table_verify(pol, data->powernow_table);
 }
 
+static const char ACPI_PSS_BIOS_BUG_MSG[] =
+	KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+	KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
+
 /* per CPU init entry point to the driver */
 static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 {
 	struct powernow_k8_data *data;
 	cpumask_t oldmask;
 	int rc;
-	static int print_once;
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1244,19 +1247,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		 * an UP version, and is deprecated by AMD.
 		 */
 		if (num_online_cpus() != 1) {
-			/*
-			 * Replace this one with print_once as soon as such a
-			 * thing gets introduced
-			 */
-			if (!print_once) {
-				WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
-					"does not provide ACPI _PSS objects "
-					"in a way that Linux understands. "
-					"Please report this to the Linux ACPI"
-					" maintainers and complain to your "
-					"BIOS vendor.\n");
-				print_once++;
-			}
+			printk_once(ACPI_PSS_BIOS_BUG_MSG);
 			goto err_out;
 		}
 		if (pol->cpu != 0) {
-- 
cgit v0.10.2


From ca446d06351992e4f1a7c1e5e99870ab4ec5188f Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Wed, 22 Apr 2009 13:48:33 +0200
Subject: [CPUFREQ] powernow-k8: determine exact CPU frequency for HW Pstates

Slightly modified by trenn@suse.de -> only do this on fam 10h and fam 11h.

Currently powernow-k8 determines CPU frequency from ACPI PSS objects, but
according to AMD family 11h BKDG this frequency is just a rounded value:

  "CoreFreq (MHz) = The CPU COF specified by MSRC001_00[6B:64][CpuFid]
  rounded to the nearest 100 Mhz."

As a consequnce powernow-k8 reports wrong CPU frequency on some systems,
e.g. on Turion X2 Ultra:

  powernow-k8: Found 1 AMD Turion(tm)X2 Ultra DualCore Mobile ZM-82
               processors (2 cpu cores) (version 2.20.00)
  powernow-k8:    0 : pstate 0 (2200 MHz)
  powernow-k8:    1 : pstate 1 (1100 MHz)
  powernow-k8:    2 : pstate 2 (600 MHz)

But this is wrong as frequency for Pstate2 is 550 MHz. x86info reports it
correctly:

  #x86info -a |grep Pstate
  ...
  Pstate-0: fid=e, did=0, vid=24 (2200MHz)
  Pstate-1: fid=e, did=1, vid=30 (1100MHz)
  Pstate-2: fid=e, did=2, vid=3c (550MHz) (current)

Solution is to determine the frequency directly from Pstate MSRs instead
of using rounded values from ACPI table.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index feef10c..f6b32d1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data)
 				data->batps);
 }
 
+static u32 freq_from_fid_did(u32 fid, u32 did)
+{
+	u32 mhz = 0;
+
+	if (boot_cpu_data.x86 == 0x10)
+		mhz = (100 * (fid + 0x10)) >> did;
+	else if (boot_cpu_data.x86 == 0x11)
+		mhz = (100 * (fid + 8)) >> did;
+	else
+		BUG();
+
+	return mhz * 1000;
+}
+
 static int fill_powernow_table(struct powernow_k8_data *data,
 		struct pst_s *pst, u8 maxvid)
 {
@@ -923,8 +937,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 
 		powernow_table[i].index = index;
 
-		powernow_table[i].frequency =
-			data->acpi_data.states[i].core_frequency * 1000;
+		/* Frequency may be rounded for these */
+		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+			powernow_table[i].frequency =
+				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
+		} else
+			powernow_table[i].frequency =
+				data->acpi_data.states[i].core_frequency * 1000;
 	}
 	return 0;
 }
-- 
cgit v0.10.2


From 97b07b699b11d4bd1218a841e5dfed16bd53de06 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 18:48:58 +0200
Subject: perf report: add --dump-raw-trace option

To help the inspection of various data files, implement an ASCII dump
method that just dumps the records as they are read in - then we exit.

[ Impact: new feature ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e178190..8ea8aaa 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -20,6 +20,8 @@ static char		const *input_name = "output.perf";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
+static int		dump_trace = 0;
+
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
@@ -643,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -706,6 +708,13 @@ more:
 		struct thread *thread = threads__findnew(event->ip.pid);
 		uint64_t ip = event->ip.ip;
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT (IP, %d): %d: %p\n",
+				event->header.misc,
+				event->ip.pid,
+				(void *)event->ip.ip);
+		}
+
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, bailing out\n",
 				event->header.type);
@@ -743,23 +752,40 @@ more:
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap);
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+				(void *)event->mmap.start,
+				(void *)event->mmap.len,
+				(void *)event->mmap.pgoff,
+				event->mmap.filename);
+		}
 		if (thread == NULL || map == NULL) {
 			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
 			goto done;
 		}
 		thread__insert_map(thread, map);
+		total_mmap++;
 		break;
 	}
 	case PERF_EVENT_COMM: {
 		struct thread *thread = threads__findnew(event->comm.pid);
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT_COMM: %s:%d\n",
+				event->comm.comm, event->comm.pid);
+		}
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
 			goto done;
 		}
+		total_comm++;
 		break;
 	}
+	default: {
+		fprintf(stderr, "skipping unknown header type: %d\n",
+			event->header.type);
+	}
 	}
 
 	if (offset + head < stat.st_size)
@@ -768,6 +794,15 @@ more:
 	rc = EXIT_SUCCESS;
 done:
 	close(input);
+
+	if (dump_trace) {
+		fprintf(stderr, "   IP events: %10ld\n", total);
+		fprintf(stderr, " mmap events: %10ld\n", total_mmap);
+		fprintf(stderr, " comm events: %10ld\n", total_comm);
+
+		return 0;
+	}
+
 	//dsos__fprintf(stdout);
 	threads__fprintf(stdout);
 #if 0
@@ -796,6 +831,8 @@ static const char * const report_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 3e70611460fe74ad32534fa9791774f6bbdd4159 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 18:53:17 +0200
Subject: perf report: add counter for unknown events

Add a counter for unknown event records.

[ Impact: improve debugging ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 8ea8aaa..4b5ccc5 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -785,6 +785,7 @@ more:
 	default: {
 		fprintf(stderr, "skipping unknown header type: %d\n",
 			event->header.type);
+		total_unknown++;
 	}
 	}
 
@@ -796,9 +797,10 @@ done:
 	close(input);
 
 	if (dump_trace) {
-		fprintf(stderr, "   IP events: %10ld\n", total);
-		fprintf(stderr, " mmap events: %10ld\n", total_mmap);
-		fprintf(stderr, " comm events: %10ld\n", total_comm);
+		fprintf(stderr, "      IP events: %10ld\n", total);
+		fprintf(stderr, "    mmap events: %10ld\n", total_mmap);
+		fprintf(stderr, "    comm events: %10ld\n", total_comm);
+		fprintf(stderr, " unknown events: %10ld\n", total_unknown);
 
 		return 0;
 	}
-- 
cgit v0.10.2


From e4a5d54f924ea5ce2913d9d0687d034004816465 Mon Sep 17 00:00:00 2001
From: Ma Ling <ling.ma@intel.com>
Date: Tue, 26 May 2009 11:31:00 +0800
Subject: drm/i915: Add support for VGA load detection (pre-945).

Two approaches for VGA detections: hot plug detection for 945G onwards
and load pipe detection for Pre-945G.  Load pipe detection will get one free
pipe, set border color as red and blue, then check CRT status by
swf register.  This is a sync-up with the 2D driver.

Signed-off-by: Ma Ling <ling.ma@intel.com>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index 19148c3..640f515 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -198,9 +198,142 @@ static bool intel_crt_detect_ddc(struct drm_connector *connector)
 	return intel_ddc_probe(intel_output);
 }
 
+static enum drm_connector_status
+intel_crt_load_detect(struct drm_crtc *crtc, struct intel_output *intel_output)
+{
+	struct drm_encoder *encoder = &intel_output->enc;
+	struct drm_device *dev = encoder->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+	uint32_t pipe = intel_crtc->pipe;
+	uint32_t save_bclrpat;
+	uint32_t save_vtotal;
+	uint32_t vtotal, vactive;
+	uint32_t vsample;
+	uint32_t vblank, vblank_start, vblank_end;
+	uint32_t dsl;
+	uint32_t bclrpat_reg;
+	uint32_t vtotal_reg;
+	uint32_t vblank_reg;
+	uint32_t vsync_reg;
+	uint32_t pipeconf_reg;
+	uint32_t pipe_dsl_reg;
+	uint8_t	st00;
+	enum drm_connector_status status;
+
+	if (pipe == 0) {
+		bclrpat_reg = BCLRPAT_A;
+		vtotal_reg = VTOTAL_A;
+		vblank_reg = VBLANK_A;
+		vsync_reg = VSYNC_A;
+		pipeconf_reg = PIPEACONF;
+		pipe_dsl_reg = PIPEADSL;
+	} else {
+		bclrpat_reg = BCLRPAT_B;
+		vtotal_reg = VTOTAL_B;
+		vblank_reg = VBLANK_B;
+		vsync_reg = VSYNC_B;
+		pipeconf_reg = PIPEBCONF;
+		pipe_dsl_reg = PIPEBDSL;
+	}
+
+	save_bclrpat = I915_READ(bclrpat_reg);
+	save_vtotal = I915_READ(vtotal_reg);
+	vblank = I915_READ(vblank_reg);
+
+	vtotal = ((save_vtotal >> 16) & 0xfff) + 1;
+	vactive = (save_vtotal & 0x7ff) + 1;
+
+	vblank_start = (vblank & 0xfff) + 1;
+	vblank_end = ((vblank >> 16) & 0xfff) + 1;
+
+	/* Set the border color to purple. */
+	I915_WRITE(bclrpat_reg, 0x500050);
+
+	if (IS_I9XX(dev)) {
+		uint32_t pipeconf = I915_READ(pipeconf_reg);
+		I915_WRITE(pipeconf_reg, pipeconf | PIPECONF_FORCE_BORDER);
+		/* Wait for next Vblank to substitue
+		 * border color for Color info */
+		intel_wait_for_vblank(dev);
+		st00 = I915_READ8(VGA_MSR_WRITE);
+		status = ((st00 & (1 << 4)) != 0) ?
+			connector_status_connected :
+			connector_status_disconnected;
+
+		I915_WRITE(pipeconf_reg, pipeconf);
+	} else {
+		bool restore_vblank = false;
+		int count, detect;
+
+		/*
+		* If there isn't any border, add some.
+		* Yes, this will flicker
+		*/
+		if (vblank_start <= vactive && vblank_end >= vtotal) {
+			uint32_t vsync = I915_READ(vsync_reg);
+			uint32_t vsync_start = (vsync & 0xffff) + 1;
+
+			vblank_start = vsync_start;
+			I915_WRITE(vblank_reg,
+				   (vblank_start - 1) |
+				   ((vblank_end - 1) << 16));
+			restore_vblank = true;
+		}
+		/* sample in the vertical border, selecting the larger one */
+		if (vblank_start - vactive >= vtotal - vblank_end)
+			vsample = (vblank_start + vactive) >> 1;
+		else
+			vsample = (vtotal + vblank_end) >> 1;
+
+		/*
+		 * Wait for the border to be displayed
+		 */
+		while (I915_READ(pipe_dsl_reg) >= vactive)
+			;
+		while ((dsl = I915_READ(pipe_dsl_reg)) <= vsample)
+			;
+		/*
+		 * Watch ST00 for an entire scanline
+		 */
+		detect = 0;
+		count = 0;
+		do {
+			count++;
+			/* Read the ST00 VGA status register */
+			st00 = I915_READ8(VGA_MSR_WRITE);
+			if (st00 & (1 << 4))
+				detect++;
+		} while ((I915_READ(pipe_dsl_reg) == dsl));
+
+		/* restore vblank if necessary */
+		if (restore_vblank)
+			I915_WRITE(vblank_reg, vblank);
+		/*
+		 * If more than 3/4 of the scanline detected a monitor,
+		 * then it is assumed to be present. This works even on i830,
+		 * where there isn't any way to force the border color across
+		 * the screen
+		 */
+		status = detect * 4 > count * 3 ?
+			 connector_status_connected :
+			 connector_status_disconnected;
+	}
+
+	/* Restore previous settings */
+	I915_WRITE(bclrpat_reg, save_bclrpat);
+
+	return status;
+}
+
 static enum drm_connector_status intel_crt_detect(struct drm_connector *connector)
 {
 	struct drm_device *dev = connector->dev;
+	struct intel_output *intel_output = to_intel_output(connector);
+	struct drm_encoder *encoder = &intel_output->enc;
+	struct drm_crtc *crtc;
+	int dpms_mode;
+	enum drm_connector_status status;
 
 	if (IS_I9XX(dev) && !IS_I915G(dev) && !IS_I915GM(dev)) {
 		if (intel_crt_detect_hotplug(connector))
@@ -212,8 +345,20 @@ static enum drm_connector_status intel_crt_detect(struct drm_connector *connecto
 	if (intel_crt_detect_ddc(connector))
 		return connector_status_connected;
 
-	/* TODO use load detect */
-	return connector_status_unknown;
+	/* for pre-945g platforms use load detect */
+	if (encoder->crtc && encoder->crtc->enabled) {
+		status = intel_crt_load_detect(encoder->crtc, intel_output);
+	} else {
+		crtc = intel_get_load_detect_pipe(intel_output,
+						  NULL, &dpms_mode);
+		if (crtc) {
+			status = intel_crt_load_detect(crtc, intel_output);
+			intel_release_load_detect_pipe(intel_output, dpms_mode);
+		} else
+			status = connector_status_unknown;
+	}
+
+	return status;
 }
 
 static void intel_crt_destroy(struct drm_connector *connector)
-- 
cgit v0.10.2


From f49515b157e2d3ca3633eb0664fc46c42f6cb37e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 19:03:36 +0200
Subject: perf report: add more debugging

Add the offset of the file we are analyzing, and the size of the record.

In case of problems it's easier to see where the parser lost track.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4b5ccc5..2d4e4cc 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -699,8 +699,6 @@ more:
 		goto done;
 	}
 
-	head += event->header.size;
-
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
 		int show = 0;
@@ -709,7 +707,9 @@ more:
 		uint64_t ip = event->ip.ip;
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT (IP, %d): %d: %p\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				event->header.misc,
 				event->ip.pid,
 				(void *)event->ip.ip);
@@ -753,7 +753,9 @@ more:
 		struct map *map = map__new(&event->mmap);
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				(void *)event->mmap.start,
 				(void *)event->mmap.len,
 				(void *)event->mmap.pgoff,
@@ -771,7 +773,9 @@ more:
 		struct thread *thread = threads__findnew(event->comm.pid);
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT_COMM: %s:%d\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				event->comm.comm, event->comm.pid);
 		}
 		if (thread == NULL ||
@@ -783,12 +787,16 @@ more:
 		break;
 	}
 	default: {
-		fprintf(stderr, "skipping unknown header type: %d\n",
+		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
 			event->header.type);
 		total_unknown++;
 	}
 	}
 
+	head += event->header.size;
+
 	if (offset + head < stat.st_size)
 		goto more;
 
-- 
cgit v0.10.2


From 6142f9ec108a4ddbf0d5904c3daa5fdcaa618792 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 May 2009 20:51:47 +0200
Subject: perf report: More robust error handling

Don't let funny events confuse us, stick to what we know and
try to find sensible data again.

If we find an unknown event, check we're still u64 aligned, and
increment by one u64. This ensures we're bound to happen upon a
valid event soon.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 2d4e4cc..a58be7f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,6 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
+	uint32_t size;
 	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 	input = open(input_name, O_RDONLY);
@@ -680,6 +681,10 @@ remap:
 more:
 	event = (event_t *)(buf + head);
 
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
 	if (head + event->header.size >= page_size * mmap_window) {
 		unsigned long shift = page_size * (head / page_size);
 		int ret;
@@ -692,12 +697,9 @@ more:
 		goto remap;
 	}
 
-
-	if (!event->header.size) {
-		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
-		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
-		goto done;
-	}
+	size = event->header.size;
+	if (!size)
+		goto broken_event;
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
@@ -787,15 +789,26 @@ more:
 		break;
 	}
 	default: {
+broken_event:
 		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
 			event->header.type);
 		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
 	}
 	}
 
-	head += event->header.size;
+	head += size;
 
 	if (offset + head < stat.st_size)
 		goto more;
-- 
cgit v0.10.2


From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 26 May 2009 20:25:22 +0200
Subject: tracing: add __print_flags for events

Developers have been asking for the ability in the ftrace event tracer
to display names of bits in a flags variable.

Instead of printing out c2, it would be easier to read FOO|BAR|GOO,
assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7.

Some examples where this would be useful are the state flags in a context
switch, kmalloc flags, and even permision flags in accessing files.

[
  v2 changes include:

  Frederic Weisbecker's idea of using a mask instead of bits,
  thus we can output GFP_KERNEL instead of GPF_WAIT|GFP_IO|GFP_FS.

  Li Zefan's idea of allowing the caller of __print_flags to add their
  own delimiter (or no delimiter) where we can get for file permissions
  rwx instead of r|w|x.
]

[
  v3 changes:

   Christoph Hellwig's idea of using an array instead of va_args.
]

[ Impact: better displaying of flags in trace output ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bae51dd..4b58cf1 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -3,12 +3,23 @@
 
 #include <linux/trace_seq.h>
 #include <linux/ring_buffer.h>
-
+#include <linux/percpu.h>
 
 struct trace_array;
 struct tracer;
 struct dentry;
 
+DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
+struct trace_print_flags {
+	unsigned long		mask;
+	const char		*name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+				   unsigned long flags,
+				   const struct trace_print_flags *flag_array);
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b5ff2e8..22c9471 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -87,6 +87,7 @@
  *	struct trace_seq *s = &iter->seq;
  *	struct ftrace_raw_<call> *field; <-- defined in stage 1
  *	struct trace_entry *entry;
+ *	struct trace_seq *p;
  *	int ret;
  *
  *	entry = iter->ent;
@@ -98,7 +99,9 @@
  *
  *	field = (typeof(field))entry;
  *
+ *	p = get_cpu_var(ftrace_event_seq);
  *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -119,6 +122,14 @@
 #undef __get_str
 #define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
 
+#undef __print_flags
+#define __print_flags(flag, delim, flag_array...)			\
+	({								\
+		static const struct trace_print_flags flags[] =		\
+			{ flag_array, { -1, NULL }};			\
+		ftrace_print_flags_seq(p, delim, flag, flags);		\
+	})
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
@@ -127,6 +138,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	struct trace_seq *s = &iter->seq;				\
 	struct ftrace_raw_##call *field;				\
 	struct trace_entry *entry;					\
+	struct trace_seq *p;						\
 	int ret;							\
 									\
 	entry = iter->ent;						\
@@ -138,7 +150,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
+	p = &get_cpu_var(ftrace_event_seq);				\
 	ret = trace_seq_printf(s, #call ": " print);			\
+	put_cpu();							\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7136420..a4840c2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -15,6 +15,9 @@
 #define EVENT_HASHSIZE	128
 
 static DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 	return 0;
 }
 
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+		       unsigned long flags,
+		       const struct trace_print_flags *flag_array)
+{
+	unsigned long mask;
+	const char *str;
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  flag_array[i].name && flags; i++) {
+
+		mask = flag_array[i].mask;
+		if ((flags & mask) != mask)
+			continue;
+
+		str = flag_array[i].name;
+		flags &= ~mask;
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_puts(p, str);
+	}
+
+	/* check for left over flags */
+	if (flags) {
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_printf(p, "0x%lx", flags);
+	}
+
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
cgit v0.10.2


From 937cdb9db7f59278d0cb1582e6e64e3dfd73b4fc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 15 May 2009 10:51:13 -0400
Subject: tracing: add previous task state info to sched switch event

It is useful to see the state of a task that is being switched out.
This patch adds the output of the state of the previous task in
the context switch event.

[ Impact: see state of switched out task in context switch ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dd4033c..24ab5bc 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -156,6 +156,7 @@ TRACE_EVENT(sched_switch,
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	prev_pid			)
 		__field(	int,	prev_prio			)
+		__field(	long,	prev_state			)
 		__array(	char,	next_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	next_pid			)
 		__field(	int,	next_prio			)
@@ -165,13 +166,19 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
+		__entry->prev_state	= prev->state;
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+	TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->prev_state ?
+		  __print_flags(__entry->prev_state, "|",
+				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
+				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
+				{ 128, "W" }) : "R",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
-- 
cgit v0.10.2


From 62ba180e80f4194a498585ac0e4c07daa8ca08d1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 15 May 2009 16:16:30 -0400
Subject: tracing: add flag output for kmem events

This patch changes the output for gfp_flags from being a simple hex value
to the actual names.

  gfp_flags=GFP_ATOMIC  instead of gfp_flags=00000020

And even

  gfp_flags=GFP_KERNEL instead of gfp_flags=000000d0

(Thanks to Frederic Weisbecker for pointing out that the first version
 had a bad order of GFP masks)

[ Impact: more human readable output from tracer ]

Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index c22c42f..9baba50 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -7,6 +7,43 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kmem
 
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we wil get:
+ *
+ *  GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags)						\
+	(flags) ? __print_flags(flags, "|",				\
+	{(unsigned long)GFP_HIGHUSER_MOVABLE,	"GFP_HIGHUSER_MOVABLE"}, \
+	{(unsigned long)GFP_HIGHUSER,		"GFP_HIGHUSER"},	\
+	{(unsigned long)GFP_USER,		"GFP_USER"},		\
+	{(unsigned long)GFP_TEMPORARY,		"GFP_TEMPORARY"},	\
+	{(unsigned long)GFP_KERNEL,		"GFP_KERNEL"},		\
+	{(unsigned long)GFP_NOFS,		"GFP_NOFS"},		\
+	{(unsigned long)GFP_ATOMIC,		"GFP_ATOMIC"},		\
+	{(unsigned long)GFP_NOIO,		"GFP_NOIO"},		\
+	{(unsigned long)__GFP_HIGH,		"GFP_HIGH"},		\
+	{(unsigned long)__GFP_WAIT,		"GFP_WAIT"},		\
+	{(unsigned long)__GFP_IO,		"GFP_IO"},		\
+	{(unsigned long)__GFP_COLD,		"GFP_COLD"},		\
+	{(unsigned long)__GFP_NOWARN,		"GFP_NOWARN"},		\
+	{(unsigned long)__GFP_REPEAT,		"GFP_REPEAT"},		\
+	{(unsigned long)__GFP_NOFAIL,		"GFP_NOFAIL"},		\
+	{(unsigned long)__GFP_NORETRY,		"GFP_NORETRY"},		\
+	{(unsigned long)__GFP_COMP,		"GFP_COMP"},		\
+	{(unsigned long)__GFP_ZERO,		"GFP_ZERO"},		\
+	{(unsigned long)__GFP_NOMEMALLOC,	"GFP_NOMEMALLOC"},	\
+	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
+	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
+	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
+	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"}		\
+	) : "GFP_NOWAIT"
+
 TRACE_EVENT(kmalloc,
 
 	TP_PROTO(unsigned long call_site,
@@ -33,12 +70,12 @@ TRACE_EVENT(kmalloc,
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags)
+		show_gfp_flags(__entry->gfp_flags))
 );
 
 TRACE_EVENT(kmem_cache_alloc,
@@ -67,12 +104,12 @@ TRACE_EVENT(kmem_cache_alloc,
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags)
+		show_gfp_flags(__entry->gfp_flags))
 );
 
 TRACE_EVENT(kmalloc_node,
@@ -104,12 +141,12 @@ TRACE_EVENT(kmalloc_node,
 		__entry->node		= node;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags,
+		show_gfp_flags(__entry->gfp_flags),
 		__entry->node)
 );
 
@@ -142,12 +179,12 @@ TRACE_EVENT(kmem_cache_alloc_node,
 		__entry->node		= node;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags,
+		show_gfp_flags(__entry->gfp_flags),
 		__entry->node)
 );
 
-- 
cgit v0.10.2


From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 May 2009 19:21:47 -0400
Subject: tracing: add __print_symbolic to trace events

This patch adds __print_symbolic which is similar to __print_flags but
works for an enumeration type instead. That is, there is only a one to one
mapping between the values and the symbols. When a match is made, then
it is printed, otherwise the hex value is outputed.

[ Impact: add interface for showing symbol names in events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4b58cf1..bbf40f6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -20,6 +20,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 				   unsigned long flags,
 				   const struct trace_print_flags *flag_array);
 
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+				     const struct trace_print_flags *symbol_array);
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 22c9471..87fc227 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -130,6 +130,14 @@
 		ftrace_print_flags_seq(p, delim, flag, flags);		\
 	})
 
+#undef __print_symbolic
+#define __print_symbolic(value, symbol_array...)			\
+	({								\
+		static const struct trace_print_flags symbols[] =	\
+			{ symbol_array, { -1, NULL }};			\
+		ftrace_print_symbols_seq(p, value, symbols);		\
+	})
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a4840c2..c12d95d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 	return p->buffer;
 }
 
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+			 const struct trace_print_flags *symbol_array)
+{
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  symbol_array[i].name; i++) {
+
+		if (val != symbol_array[i].mask)
+			continue;
+
+		trace_seq_puts(p, symbol_array[i].name);
+		break;
+	}
+
+	if (!p->len)
+		trace_seq_printf(p, "0x%lx", val);
+		
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
cgit v0.10.2


From c2adae0970ca1db8adb92fb56ae3bcabd916e8bd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 May 2009 19:56:19 -0400
Subject: tracing: convert irq events to use __print_symbolic

The recording of the names at trace time is inefficient. This patch
implements the softirq event recording to only record the vector
and then use the __print_symbolic interface to print out the names.

[ Impact: faster recording of softirq events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 32a9f7e..683fb36 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,6 +7,19 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
+#define softirq_name(sirq) { sirq, #sirq }
+#define show_softirq_name(val)						\
+	__print_symbolic(val,						\
+			 softirq_name(HI_SOFTIRQ),			\
+			 softirq_name(TIMER_SOFTIRQ),			\
+			 softirq_name(NET_TX_SOFTIRQ),			\
+			 softirq_name(NET_RX_SOFTIRQ),			\
+			 softirq_name(BLOCK_SOFTIRQ),			\
+			 softirq_name(TASKLET_SOFTIRQ),			\
+			 softirq_name(SCHED_SOFTIRQ),			\
+			 softirq_name(HRTIMER_SOFTIRQ),			\
+			 softirq_name(RCU_SOFTIRQ))
+
 /**
  * irq_handler_entry - called immediately before the irq action handler
  * @irq: irq number
@@ -87,15 +100,14 @@ TRACE_EVENT(softirq_entry,
 
 	TP_STRUCT__entry(
 		__field(	int,	vec			)
-		__string(	name,	softirq_to_name[h-vec]	)
 	),
 
 	TP_fast_assign(
 		__entry->vec = (int)(h - vec);
-		__assign_str(name, softirq_to_name[h-vec]);
 	),
 
-	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
 );
 
 /**
@@ -117,15 +129,14 @@ TRACE_EVENT(softirq_exit,
 
 	TP_STRUCT__entry(
 		__field(	int,	vec			)
-		__string(	name,	softirq_to_name[h-vec]	)
 	),
 
 	TP_fast_assign(
 		__entry->vec = (int)(h - vec);
-		__assign_str(name, softirq_to_name[h-vec]);
 	),
 
-	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
 );
 
 #endif /*  _TRACE_IRQ_H */
-- 
cgit v0.10.2


From 68743082b560067e3e93eab8b2568f238e486865 Mon Sep 17 00:00:00 2001
From: Vu Pham <vu@mellanox.com>
Date: Tue, 26 May 2009 14:51:00 -0400
Subject: XPRTRDMA: fix client rpcrdma FRMR registration on mlx4 devices

mlx4/connectX FRMR requires local write enable together with remote
rdma write enable. This fixes NFS/RDMA operation over the ConnectX
Infiniband HCA in the default memreg mode.

Signed-off-by: Vu Pham <vu@mellanox.com>
Signed-off-by: Tom Talpey <tmtalpey@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b21e0c..465aafc 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1495,7 +1495,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.access_flags = (writing ?
-				IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ);
 	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
-- 
cgit v0.10.2


From d0367a508af9cf97beb202935bb9ad8883d30cd1 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Tue, 26 May 2009 14:51:00 -0400
Subject: nfs: fix build error in nfsroot with initconst

fix build error with latest kbuild adjustments to initconst.

The commit a447c0932445f92ce6f4c1bd020f62c5097a7842 ("vfs: Use
const for kernel parser table") changed:

    static match_table_t __initdata tokens = {
to
    static match_table_t __initconst tokens = {

But the missing const causes popwerpc to fail with latest
updates to __initconst like this:

fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict
fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict

The bug is only present with kbuild-next.
Following patch has been build tested.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d9ef602..e3ed590 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -129,7 +129,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t __initconst tokens = {
+static const match_table_t tokens __initconst = {
 	{Opt_port, "port=%u"},
 	{Opt_rsize, "rsize=%u"},
 	{Opt_wsize, "wsize=%u"},
-- 
cgit v0.10.2


From 95baa25c7321eb8613246acbf61b97911cc748d3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 26 May 2009 14:51:00 -0400
Subject: NFSv4: Fix the case where NFSv4 renewal fails

If the asynchronous lease renewal fails (usually due to a soft timeout),
then we _must_ schedule state recovery in order to ensure that we don't
lose the lease unnecessarily or, if the lease is already lost, that we
recover the locking state promptly...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a4d2426..4674f80 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2594,12 +2594,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 	unsigned long timestamp = (unsigned long)data;
 
 	if (task->tk_status < 0) {
-		switch (task->tk_status) {
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_EXPIRED:
-			case -NFS4ERR_CB_PATH_DOWN:
-				nfs4_schedule_state_recovery(clp);
-		}
+		/* Unless we're shutting down, schedule state recovery! */
+		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
+			nfs4_schedule_state_recovery(clp);
 		return;
 	}
 	spin_lock(&clp->cl_lock);
-- 
cgit v0.10.2


From ab2b7ebaad16226c9a5e85c5f384d19fa58a7459 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 26 May 2009 09:11:03 +0100
Subject: kmod: Release sub_info on cred allocation failure.

call_usermodehelper_setup() forgot to kfree(sub_info)
when prepare_usermodehelper_creds() failed.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/kmod.c b/kernel/kmod.c
index b750675..7e95bed 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	sub_info->argv = argv;
 	sub_info->envp = envp;
 	sub_info->cred = prepare_usermodehelper_creds();
-	if (!sub_info->cred)
+	if (!sub_info->cred) {
+		kfree(sub_info);
 		return NULL;
+	}
 
   out:
 	return sub_info;
-- 
cgit v0.10.2


From 564346224daaa8f7222d7a92cdbb7bafde59ae6e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 26 May 2009 20:54:41 +0930
Subject: lguest: fix on Intel when KVM loaded (unhandled trap 13)

When KVM is loaded, and hence VT set up, the vmcall instruction in an
lguest guest causes a #GP, not #UD.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 1a83910..eaf722f 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -358,6 +358,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 			if (emulate_insn(cpu))
 				return;
 		}
+		/* If KVM is active, the vmcall instruction triggers a
+		 * General Protection Fault.  Normally it triggers an
+		 * invalid opcode fault (6): */
+	case 6:
+		/* We need to check if ring == GUEST_PL and
+		 * faulting instruction == vmcall. */
+		if (is_hypercall(cpu)) {
+			rewrite_hypercall(cpu);
+			return;
+		}
 		break;
 	case 14: /* We've intercepted a Page Fault. */
 		/* The Guest accessed a virtual address that wasn't mapped.
@@ -403,15 +413,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		 * up the pointer now to indicate a hypercall is pending. */
 		cpu->hcall = (struct hcall_args *)cpu->regs;
 		return;
-	case 6:
-		/* kvm hypercalls trigger an invalid opcode fault (6).
-		 * We need to check if ring == GUEST_PL and
-		 * faulting instruction == vmcall. */
-		if (is_hypercall(cpu)) {
-			rewrite_hypercall(cpu);
-			return;
-		}
-		break;
 	}
 
 	/* We didn't handle the trap, so it needs to go to the Guest. */
-- 
cgit v0.10.2


From 3a4b8cc70b7473a0b9f26f5b4ddc6579b5e214be Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 16:19:04 -0300
Subject: perf report: Sort output by symbol usage

[acme@emilia ~]$ perf record find / > /dev/null 2>&1
[acme@emilia ~]$ perf stat perf report | head -20
 4.95          find [k] 0xffffffff81393d65 _spin_lock
 3.89          find [.] 0x000000000000af89 /usr/bin/find: <unknown>
 2.19          find [k] 0xffffffffa00518e0 ext3fs_dirhash
 1.87          find [k] 0xffffffff810a6cea __rcu_read_lock
 1.86          find [k] 0xffffffff811c7312 _atomic_dec_and_lock
 1.86          find [.] 0x00000000000782ab /lib64/libc-2.5.so: __GI_strlen
 1.85          find [k] 0xffffffff810fedfb __kmalloc
 1.62          find [.] 0x00000000000430ff /lib64/libc-2.5.so: vfprintf
 1.59          find [k] 0xffffffff810a6d6d __rcu_read_unlock
 1.55          find [k] 0xffffffff81119395 __d_lookup
 1.39          find [.] 0x0000000000071b40 /lib64/libc-2.5.so: _int_malloc
 1.30          find [k] 0xffffffffa031c4fc nfs_do_filldir
 1.21          find [k] 0xffffffff811876a5 avc_has_perm_noaudit
 1.15          find [k] 0xffffffff810fef62 kmem_cache_alloc
 1.07          find [k] 0xffffffff811d03fb copy_user_generic_string
 1.03          find [k] 0xffffffffa0043882 ext3_htree_store_dirent
 0.99          find [k] 0xffffffff81393ebb _spin_lock_bh
 0.98          find [k] 0xffffffffa03319a2 nfs3_decode_dirent
 0.97          find [k] 0xffffffff8100bf20 system_call
 0.92          find [k] 0xffffffff8139437e _spin_unlock

 Performance counter stats for 'perf':

     244.278972  task clock ticks     (msecs)
              8  context switches     (events)
              9  CPU migrations       (events)
           2104  pagefaults           (events)
       35329669  CPU cycles           (events)  (scaled from 75.40%)
       13740366  instructions         (events)  (scaled from 75.49%)
          59073  cache references     (events)  (scaled from 24.60%)
            196  cache misses         (events)  (scaled from 24.51%)

 Wall-clock time elapsed:   246.060717 msecs

[acme@emilia ~]$
[acme@emilia ~]$ grep "model name" /proc/cpuinfo | head -1
model name	: Intel(R) Xeon(R) CPU           E5405  @ 2.00GHz
[acme@emilia ~]$ grep "model name" /proc/cpuinfo | wc -l
8
[acme@emilia ~]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090526191904.GH4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a58be7f..a4c6ffa 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -447,26 +447,33 @@ static size_t map__fprintf(struct map *self, FILE *fp)
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
+struct thread;
+
+static const char *thread__name(struct thread *self, char *bf, size_t size);
+
 struct symhist {
 	struct rb_node	 rb_node;
 	struct dso	 *dso;
 	struct symbol	 *sym;
+	struct thread	 *thread;
 	uint64_t	 ip;
 	uint32_t	 count;
 	char		 level;
 };
 
 static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
-				    struct dso *dso, char level)
+				    struct thread *thread, struct dso *dso,
+				    char level)
 {
 	struct symhist *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
-		self->sym   = sym;
-		self->ip    = ip;
-		self->dso   = dso;
-		self->level = level;
-		self->count = 1;
+		self->sym    = sym;
+		self->thread = thread;
+		self->ip     = ip;
+		self->dso    = dso;
+		self->level  = level;
+		self->count  = 1;
 	}
 
 	return self;
@@ -482,17 +489,29 @@ static void symhist__inc(struct symhist *self)
 	++self->count;
 }
 
-static size_t symhist__fprintf(struct symhist *self, FILE *fp)
+static size_t
+symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 {
-	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
+	char bf[32];
+	size_t ret;
+
+	if (total_samples)
+		ret = fprintf(fp, "%5.2f", (self->count * 100.0) / total_samples);
+	else
+		ret = fprintf(fp, "%12d", self->count);
+
+	ret += fprintf(fp, "%14s [%c] %#018llx ",
+		       thread__name(self->thread, bf, sizeof(bf)),
+		       self->level, (unsigned long long)self->ip);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
+		ret += fprintf(fp, "%s\n",
+			       self->sym ? self->sym->name : "<unknown>");
 	else
-		ret += fprintf(fp, "%s: %s",
+		ret += fprintf(fp, "%s: %s\n",
 			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
-	return ret + fprintf(fp, ": %u\n", self->count);
+	return ret;
 }
 
 struct thread {
@@ -503,6 +522,15 @@ struct thread {
 	char		 *comm;
 };
 
+static const char *thread__name(struct thread *self, char *bf, size_t size)
+{
+	if (self->comm)
+		return self->comm;
+
+	snprintf(bf, sizeof(bf), ":%u", self->pid);
+	return bf;
+}
+
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = malloc(sizeof(*self));
@@ -542,7 +570,7 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 			p = &(*p)->rb_right;
 	}
 
-	sh = symhist__new(sym, ip, dso, level);
+	sh = symhist__new(sym, ip, self, dso, level);
 	if (sh == NULL)
 		return -ENOMEM;
 	rb_link_node(&sh->rb_node, parent, p);
@@ -574,7 +602,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
 
 	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
 		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-		ret += symhist__fprintf(pos, fp);
+		ret += symhist__fprintf(pos, 0, fp);
 	}
 
 	return ret;
@@ -628,7 +656,7 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-static void threads__fprintf(FILE *fp)
+void threads__fprintf(FILE *fp)
 {
 	struct rb_node *nd;
 	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
@@ -637,6 +665,61 @@ static void threads__fprintf(FILE *fp)
 	}
 }
 
+static struct rb_root global_symhists = RB_ROOT;
+
+static void threads__insert_symhist(struct symhist *sh)
+{
+	struct rb_node **p = &global_symhists.rb_node;
+	struct rb_node *parent = NULL;
+	struct symhist *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct symhist, rb_node);
+
+		/* Reverse order */
+		if (sh->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&sh->rb_node, parent, p);
+	rb_insert_color(&sh->rb_node, &global_symhists);
+}
+
+static void threads__sort_symhists(void)
+{
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *thread = rb_entry(nd, struct thread, rb_node);
+		struct rb_node *next = rb_first(&thread->symhists);
+
+		while (next) {
+			struct symhist *n = rb_entry(next, struct symhist,
+						     rb_node);
+			next = rb_next(&n->rb_node);
+			rb_erase(&n->rb_node, &thread->symhists);
+			threads__insert_symhist(n);
+		}
+
+	}
+}
+
+static size_t threads__symhists_fprintf(uint64_t total_samples, FILE *fp)
+{
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	for (nd = rb_first(&global_symhists); nd; nd = rb_next(nd)) {
+		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
+		ret += symhist__fprintf(pos, total_samples, fp);
+	}
+
+	return ret;
+}
+
 static int __cmd_report(void)
 {
 	unsigned long offset = 0;
@@ -826,23 +909,9 @@ done:
 		return 0;
 	}
 
-	//dsos__fprintf(stdout);
-	threads__fprintf(stdout);
-#if 0
-	std::map<std::string, int>::iterator hi = hist.begin();
-
-	while (hi != hist.end()) {
-		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
-		hist.erase(hi++);
-	}
-
-	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+	threads__sort_symhists();
+	threads__symhists_fprintf(total, stdout);
 
-	while (ri != rev_hist.end()) {
-		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
-		ri++;
-	}
-#endif
 	return rc;
 }
 
-- 
cgit v0.10.2


From 2171787be2e71ff71159857bfeb21398b61eb615 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Tue, 26 May 2009 10:33:35 -0700
Subject: x86: avoid back to back on_each_cpu in cpa_flush_array

Cleanup cpa_flush_array() to avoid back to back on_each_cpu() calls.

[ Impact: optimizes fix 0af48f42df15b97080b450d24219dd95db7b929a ]

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0f9052b..e17efed 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -204,30 +204,19 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	}
 }
 
-static void wbinvd_local(void *unused)
-{
-	wbinvd();
-}
-
 static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 			    int in_flags, struct page **pages)
 {
 	unsigned int i, level;
+	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 
 	BUG_ON(irqs_disabled());
 
-	on_each_cpu(__cpa_flush_range, NULL, 1);
+	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 
-	if (!cache)
+	if (!cache || do_wbinvd)
 		return;
 
-	/* 4M threshold */
-	if (numpages >= 1024) {
-		if (boot_cpu_data.x86 >= 4)
-			on_each_cpu(wbinvd_local, NULL, 1);
-
-		return;
-	}
 	/*
 	 * We only need to flush on one CPU,
 	 * clflush is a MESI-coherent instruction that
-- 
cgit v0.10.2


From 46a7574caf5bc533c24b315800ed323c187614f5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 24 May 2009 18:45:17 -0400
Subject: cifs: fix artificial limit on reading symlinks

There's no reason to limit the size of a symlink that we can read to
4000 bytes. That may be nowhere near PATH_MAX if the server is sending
UCS2 strings. CIFS should be able to read in a symlink up to the size of
the buffer. The size of the header has already been accounted for when
creating the slabcache, so CIFSMaxBufSize should be the correct size to
pass in.

Fixes samba bug #6384.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d062602..aece2a8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2427,8 +2427,7 @@ querySymLinkRetry:
 	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max data count below from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
-- 
cgit v0.10.2


From f55ed1a83d099f275c9560ad7d4c4700d1e54bdd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 26 May 2009 16:28:11 -0400
Subject: cifs: tighten up default file_mode/dir_mode

The current default file mode is 02767 and dir mode is 0777. This is
extremely "loose". Given that CIFS is a single-user protocol, these
permissions allow anyone to use the mount -- in effect, giving anyone on
the machine access to the credentials used to mount the share.

Change this by making the default permissions restrict write access to
the default owner of the mount. Give read and execute permissions to
everyone else. These are the same permissions that VFAT mounts get by
default so there is some precedent here.

Note that this patch also removes the mandatory locking flags from the
default file_mode. After having looked at how these flags are used by
the kernel, I don't think that keeping them as the default offers any
real benefit. That flag combination makes it so that the kernel enforces
mandatory locking.

Since the server is going to do that for us anyway, I don't think we
want the client to enforce this by default on applications that just
want advisory locks. Anyone that does want this behavior can always
enable it by setting the file_mode appropriately.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a5..f32c903 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -827,9 +827,9 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->target_rfc1001_name[0] = 0;
 	vol->linux_uid = current_uid();  /* use current_euid() instead? */
 	vol->linux_gid = current_gid();
-	vol->dir_mode = S_IRWXUGO;
-	/* 2767 perms indicate mandatory locking support */
-	vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+
+	/* default to only allowing write access to owner of the mount */
+	vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
 	vol->rw = true;
-- 
cgit v0.10.2


From b1338d199dda6681d9af0297928af0a7eb9cba7b Mon Sep 17 00:00:00 2001
From: Herton Ronaldo Krzesinski <herton@mandriva.com.br>
Date: Tue, 26 May 2009 12:15:53 +0900
Subject: tomoyo: add missing call to cap_bprm_set_creds

cap_bprm_set_creds() has to be called from security_bprm_set_creds().
TOMOYO forgot to call cap_bprm_set_creds() from tomoyo_bprm_set_creds()
and suid executables were not being working.

Make sure we call cap_bprm_set_creds() with TOMOYO, to set credentials
properly inside tomoyo_bprm_set_creds().

Signed-off-by: Herton Ronaldo Krzesinski <herton@mandriva.com.br>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 5b48191..e42be5c 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -27,6 +27,12 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
 
 static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
 {
+	int rc;
+
+	rc = cap_bprm_set_creds(bprm);
+	if (rc)
+		return rc;
+
 	/*
 	 * Do only if this function is called for the first time of an execve
 	 * operation.
-- 
cgit v0.10.2


From e76a16deb8785317a23cca7204331af053e0fb4e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 26 May 2009 17:44:56 -0700
Subject: drm/i915: Fix tiling pitch handling on 8xx.

The pitch field is an exponent on pre-965, so we were rejecting buffers
on 8xx that we shouldn't have.  915 got lucky in that the largest legal
value happened to match (8KB / 512 = 0x10), but 8xx has a smaller tile width.
Additionally, we programmed that bad value into the register on 8xx, so the
only pitch that would work correctly was 4096 (512-1023 pixels), while others
would probably give bad rendering or hangs.

Signed-off-by: Eric Anholt <eric@anholt.net>

fd.o bug #20473.

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 717b6a8..e4408da 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2128,8 +2128,10 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg)
 		return;
 	}
 
-	pitch_val = (obj_priv->stride / 128) - 1;
-	WARN_ON(pitch_val & ~0x0000000f);
+	pitch_val = obj_priv->stride / 128;
+	pitch_val = ffs(pitch_val) - 1;
+	WARN_ON(pitch_val > I830_FENCE_MAX_PITCH_VAL);
+
 	val = obj_priv->gtt_offset;
 	if (obj_priv->tiling_mode == I915_TILING_Y)
 		val |= 1 << I830_FENCE_TILING_Y_SHIFT;
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index 52a0593..540dd33 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -213,7 +213,8 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode)
 	if (tiling_mode == I915_TILING_NONE)
 		return true;
 
-	if (tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
+	if (!IS_I9XX(dev) ||
+	    (tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)))
 		tile_width = 128;
 	else
 		tile_width = 512;
@@ -225,11 +226,18 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode)
 		if (stride / 128 > I965_FENCE_MAX_PITCH_VAL)
 			return false;
 	} else if (IS_I9XX(dev)) {
-		if (stride / tile_width > I830_FENCE_MAX_PITCH_VAL ||
+		uint32_t pitch_val = ffs(stride / tile_width) - 1;
+
+		/* XXX: For Y tiling, FENCE_MAX_PITCH_VAL is actually 6 (8KB)
+		 * instead of 4 (2KB) on 945s.
+		 */
+		if (pitch_val > I915_FENCE_MAX_PITCH_VAL ||
 		    size > (I830_FENCE_MAX_SIZE_VAL << 20))
 			return false;
 	} else {
-		if (stride / 128 > I830_FENCE_MAX_PITCH_VAL ||
+		uint32_t pitch_val = ffs(stride / tile_width) - 1;
+
+		if (pitch_val > I830_FENCE_MAX_PITCH_VAL ||
 		    size > (I830_FENCE_MAX_SIZE_VAL << 19))
 			return false;
 	}
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 9668cc0..375569d 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -190,7 +190,8 @@
 #define   I830_FENCE_SIZE_BITS(size)	((ffs((size) >> 19) - 1) << 8)
 #define   I830_FENCE_PITCH_SHIFT	4
 #define   I830_FENCE_REG_VALID		(1<<0)
-#define   I830_FENCE_MAX_PITCH_VAL	0x10
+#define   I915_FENCE_MAX_PITCH_VAL	0x10
+#define   I830_FENCE_MAX_PITCH_VAL	6
 #define   I830_FENCE_MAX_SIZE_VAL	(1<<8)
 
 #define   I915_FENCE_START_MASK		0x0ff00000
-- 
cgit v0.10.2


From cfa16a0de5392c54db553ec2233a7110e4b4da7a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 26 May 2009 18:46:16 -0700
Subject: drm/i915: Apply a big hammer to 865 GEM object CPU cache flushing.

On the 865, but not the 855, the clflush we do appears to not actually make
it out to the hardware all the time.  An easy way to safely reproduce was
X -retro, which would show that some of the blits involved in drawing the
lovely root weave didn't make it out to the hardware.  Those blits are 32
bytes each, and 1-2 would be missing at various points around the screen.
Other experimentation (doing more clflush, doing more AGP chipset flush,
poking at some more device registers to maybe trigger more flushing) didn't
help.  krh came up with the wbinvd as a way to successfully get all those
blits to appear.

Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e4408da..e242186 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2423,6 +2423,16 @@ i915_gem_clflush_object(struct drm_gem_object *obj)
 	if (obj_priv->pages == NULL)
 		return;
 
+	/* XXX: The 865 in particular appears to be weird in how it handles
+	 * cache flushing.  We haven't figured it out, but the
+	 * clflush+agp_chipset_flush doesn't appear to successfully get the
+	 * data visible to the PGU, while wbinvd + agp_chipset_flush does.
+	 */
+	if (IS_I865G(obj->dev)) {
+		wbinvd();
+		return;
+	}
+
 	drm_clflush_pages(obj_priv->pages, obj->size / PAGE_SIZE);
 }
 
-- 
cgit v0.10.2


From 76b0187525f024cb391c8043adf2e359b2adb988 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 26 May 2009 14:16:31 +0900
Subject: rootplug: Remove redundant initialization.

We don't need to explicitly initialize to cap_* because
it will be filled by security_fixup_ops().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/root_plug.c b/security/root_plug.c
index 40fb4f1..2f7ffa6 100644
--- a/security/root_plug.c
+++ b/security/root_plug.c
@@ -71,18 +71,6 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm)
 }
 
 static struct security_operations rootplug_security_ops = {
-	/* Use the capability functions for some of the hooks */
-	.ptrace_may_access =		cap_ptrace_may_access,
-	.ptrace_traceme =		cap_ptrace_traceme,
-	.capget =			cap_capget,
-	.capset =			cap_capset,
-	.capable =			cap_capable,
-
-	.bprm_set_creds =		cap_bprm_set_creds,
-
-	.task_fix_setuid =		cap_task_fix_setuid,
-	.task_prctl =			cap_task_prctl,
-
 	.bprm_check_security =		rootplug_bprm_check_security,
 };
 
-- 
cgit v0.10.2


From 84532a0fc3d5811dca8e3726fe4d372ea87bd7c6 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 27 May 2009 13:33:14 +1000
Subject: Revert "powerpc: Rework dma-noncoherent to use generic vmalloc layer"

This reverts commit 33f00dcedb0e22cdb156a23632814fc580fcfcf8.

    While it was a good idea to try to use the mm/vmalloc.c allocator instead
    of our own (in fact, ours is itself a dup on an old variant of the vmalloc
    one), unfortunately, the approach is terminally busted since
    dma_alloc_coherent() can be called at interrupt time or in atomic contexts
    and there's little chances we'll make the code in mm/vmalloc.c cope with\       that :-(

    Until we can get the generic code to forbid that idiocy and fix all
    drivers abusing it, we pretty much have no choice but revert to
    our custom virtual space allocator.

    There's also a problem with SMP safety since freeing such mapping
    would require an IPI which cannot be done at interrupt time.

    However, right now, I don't think we support any platform that is
    both SMP and has non-coherent DMA (don't laugh, I know such things
    do exist !) so we can sort that out later.

    Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a0d1146..3bb43ad 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -868,6 +868,31 @@ config TASK_SIZE
 	default "0x80000000" if PPC_PREP || PPC_8xx
 	default "0xc0000000"
 
+config CONSISTENT_START_BOOL
+	bool "Set custom consistent memory pool address"
+	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
+	help
+	  This option allows you to set the base virtual address
+	  of the consistent memory pool.  This pool of virtual
+	  memory is used to make consistent memory allocations.
+
+config CONSISTENT_START
+	hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL
+	default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx)
+	default "0xff100000" if NOT_COHERENT_CACHE
+
+config CONSISTENT_SIZE_BOOL
+	bool "Set custom consistent memory pool size"
+	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
+	help
+	  This option allows you to set the size of the
+	  consistent memory pool.  This pool of virtual memory
+	  is used to make consistent memory allocations.
+
+config CONSISTENT_SIZE
+	hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL
+	default "0x00200000" if NOT_COHERENT_CACHE
+
 config PIN_TLB
 	bool "Pinned Kernel TLBs (860 ONLY)"
 	depends on ADVANCED_OPTIONS && 8xx
diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/lib/dma-noncoherent.c
index 005a28d..b7dc4c1 100644
--- a/arch/powerpc/lib/dma-noncoherent.c
+++ b/arch/powerpc/lib/dma-noncoherent.c
@@ -29,11 +29,121 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 #include <linux/dma-mapping.h>
-#include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
 
 /*
+ * This address range defaults to a value that is safe for all
+ * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
+ * can be further configured for specific applications under
+ * the "Advanced Setup" menu. -Matt
+ */
+#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
+#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
+#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
+
+/*
+ * This is the page table (2MB) covering uncached, DMA consistent allocations
+ */
+static pte_t *consistent_pte;
+static DEFINE_SPINLOCK(consistent_lock);
+
+/*
+ * VM region handling support.
+ *
+ * This should become something generic, handling VM region allocations for
+ * vmalloc and similar (ioremap, module space, etc).
+ *
+ * I envisage vmalloc()'s supporting vm_struct becoming:
+ *
+ *  struct vm_struct {
+ *    struct vm_region	region;
+ *    unsigned long	flags;
+ *    struct page	**pages;
+ *    unsigned int	nr_pages;
+ *    unsigned long	phys_addr;
+ *  };
+ *
+ * get_vm_area() would then call vm_region_alloc with an appropriate
+ * struct vm_region head (eg):
+ *
+ *  struct vm_region vmalloc_head = {
+ *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
+ *	.vm_start	= VMALLOC_START,
+ *	.vm_end		= VMALLOC_END,
+ *  };
+ *
+ * However, vmalloc_head.vm_start is variable (typically, it is dependent on
+ * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
+ * would have to initialise this each time prior to calling vm_region_alloc().
+ */
+struct ppc_vm_region {
+	struct list_head	vm_list;
+	unsigned long		vm_start;
+	unsigned long		vm_end;
+};
+
+static struct ppc_vm_region consistent_head = {
+	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
+	.vm_start	= CONSISTENT_BASE,
+	.vm_end		= CONSISTENT_END,
+};
+
+static struct ppc_vm_region *
+ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
+{
+	unsigned long addr = head->vm_start, end = head->vm_end - size;
+	unsigned long flags;
+	struct ppc_vm_region *c, *new;
+
+	new = kmalloc(sizeof(struct ppc_vm_region), gfp);
+	if (!new)
+		goto out;
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if ((addr + size) < addr)
+			goto nospc;
+		if ((addr + size) <= c->vm_start)
+			goto found;
+		addr = c->vm_end;
+		if (addr > end)
+			goto nospc;
+	}
+
+ found:
+	/*
+	 * Insert this entry _before_ the one we found.
+	 */
+	list_add_tail(&new->vm_list, &c->vm_list);
+	new->vm_start = addr;
+	new->vm_end = addr + size;
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	return new;
+
+ nospc:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	kfree(new);
+ out:
+	return NULL;
+}
+
+static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
+{
+	struct ppc_vm_region *c;
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if (c->vm_start == addr)
+			goto out;
+	}
+	c = NULL;
+ out:
+	return c;
+}
+
+/*
  * Allocate DMA-coherent memory space and return both the kernel remapped
  * virtual and bus address for that space.
  */
@@ -41,21 +151,21 @@ void *
 __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 {
 	struct page *page;
+	struct ppc_vm_region *c;
 	unsigned long order;
-	int i;
-	unsigned int nr_pages = PAGE_ALIGN(size)>>PAGE_SHIFT;
-	unsigned int array_size = nr_pages * sizeof(struct page *);
-	struct page **pages;
-	struct page *end;
 	u64 mask = 0x00ffffff, limit; /* ISA default */
-	struct vm_struct *area;
 
-	BUG_ON(!mem_init_done);
+	if (!consistent_pte) {
+		printk(KERN_ERR "%s: not initialised\n", __func__);
+		dump_stack();
+		return NULL;
+	}
+
 	size = PAGE_ALIGN(size);
 	limit = (mask + 1) & ~mask;
-	if (limit && size >= limit) {
-		printk(KERN_WARNING "coherent allocation too big (requested "
-				"%#x mask %#Lx)\n", size, mask);
+	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
+		       size, mask);
 		return NULL;
 	}
 
@@ -68,8 +178,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 	if (!page)
 		goto no_page;
 
-	end = page + (1 << order);
-
 	/*
 	 * Invalidate any data that might be lurking in the
 	 * kernel direct-mapped region for device DMA.
@@ -80,59 +188,48 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		flush_dcache_range(kaddr, kaddr + size);
 	}
 
-	split_page(page, order);
-
 	/*
-	 * Set the "dma handle"
+	 * Allocate a virtual address in the consistent mapping region.
 	 */
-	*handle = page_to_phys(page);
-
-	area = get_vm_area_caller(size, VM_IOREMAP,
-			__builtin_return_address(1));
-	if (!area)
-		goto out_free_pages;
-
-	if (array_size > PAGE_SIZE) {
-		pages = vmalloc(array_size);
-		area->flags |= VM_VPAGES;
-	} else {
-		pages = kmalloc(array_size, GFP_KERNEL);
-	}
-	if (!pages)
-		goto out_free_area;
+	c = ppc_vm_region_alloc(&consistent_head, size,
+			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+	if (c) {
+		unsigned long vaddr = c->vm_start;
+		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
+		struct page *end = page + (1 << order);
 
-	area->pages = pages;
-	area->nr_pages = nr_pages;
+		split_page(page, order);
 
-	for (i = 0; i < nr_pages; i++)
-		pages[i] = page + i;
+		/*
+		 * Set the "dma handle"
+		 */
+		*handle = page_to_phys(page);
 
-	if (map_vm_area(area, pgprot_noncached(PAGE_KERNEL), &pages))
-		goto out_unmap;
+		do {
+			BUG_ON(!pte_none(*pte));
 
-	/*
-	 * Free the otherwise unused pages.
-	 */
-	page += nr_pages;
-	while (page < end) {
-		__free_page(page);
-		page++;
+			SetPageReserved(page);
+			set_pte_at(&init_mm, vaddr,
+				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
+			page++;
+			pte++;
+			vaddr += PAGE_SIZE;
+		} while (size -= PAGE_SIZE);
+
+		/*
+		 * Free the otherwise unused pages.
+		 */
+		while (page < end) {
+			__free_page(page);
+			page++;
+		}
+
+		return (void *)c->vm_start;
 	}
 
-	return area->addr;
-out_unmap:
-	vunmap(area->addr);
-	if (array_size > PAGE_SIZE)
-		vfree(pages);
-	else
-		kfree(pages);
-	goto out_free_pages;
-out_free_area:
-	free_vm_area(area);
-out_free_pages:
 	if (page)
 		__free_pages(page, order);
-no_page:
+ no_page:
 	return NULL;
 }
 EXPORT_SYMBOL(__dma_alloc_coherent);
@@ -142,12 +239,104 @@ EXPORT_SYMBOL(__dma_alloc_coherent);
  */
 void __dma_free_coherent(size_t size, void *vaddr)
 {
-	vfree(vaddr);
+	struct ppc_vm_region *c;
+	unsigned long flags, addr;
+	pte_t *ptep;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
+	if (!c)
+		goto no_area;
+
+	if ((c->vm_end - c->vm_start) != size) {
+		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+		       __func__, c->vm_end - c->vm_start, size);
+		dump_stack();
+		size = c->vm_end - c->vm_start;
+	}
+
+	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
+	addr = c->vm_start;
+	do {
+		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
+		unsigned long pfn;
+
+		ptep++;
+		addr += PAGE_SIZE;
 
+		if (!pte_none(pte) && pte_present(pte)) {
+			pfn = pte_pfn(pte);
+
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+				ClearPageReserved(page);
+
+				__free_page(page);
+				continue;
+			}
+		}
+
+		printk(KERN_CRIT "%s: bad page in kernel page table\n",
+		       __func__);
+	} while (size -= PAGE_SIZE);
+
+	flush_tlb_kernel_range(c->vm_start, c->vm_end);
+
+	list_del(&c->vm_list);
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+
+	kfree(c);
+	return;
+
+ no_area:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+	       __func__, vaddr);
+	dump_stack();
 }
 EXPORT_SYMBOL(__dma_free_coherent);
 
 /*
+ * Initialise the consistent memory allocation.
+ */
+static int __init dma_alloc_init(void)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret = 0;
+
+	do {
+		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
+		pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE);
+		pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE);
+		if (!pmd) {
+			printk(KERN_ERR "%s: no pmd tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
+		if (!pte) {
+			printk(KERN_ERR "%s: no pte tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+
+		consistent_pte = pte;
+	} while (0);
+
+	return ret;
+}
+
+core_initcall(dma_alloc_init);
+
+/*
  * make an area consistent.
  */
 void __dma_sync(void *vaddr, size_t size, int direction)
-- 
cgit v0.10.2


From 87ad57bacb25c3f24c54f142ef445f68277705f0 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 19 May 2009 16:09:42 +0800
Subject: cpuidle: makes AMD C1E work in acpi_idle

When AMD C1E is enabled, local APIC timer will stop even in C1.
This patch uses broadcast IPI to replace local APIC timer in C1.

http://bugzilla.kernel.org/show_bug.cgi?id=13233

[ impact: avoid boot hang in AMD CPU with C1E enabled ]

Tested-by: Dmitry Lyzhyn <thisistempbox@yahoo.com>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 72069ba..6b7bcc7 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -148,6 +148,9 @@ static void acpi_timer_check_state(int state, struct acpi_processor *pr,
 	if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT))
 		return;
 
+	if (boot_cpu_has(X86_FEATURE_AMDC1E))
+		type = ACPI_STATE_C1;
+
 	/*
 	 * Check, if one of the previous states already marked the lapic
 	 * unstable
@@ -611,6 +614,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
 		switch (cx->type) {
 		case ACPI_STATE_C1:
 			cx->valid = 1;
+			acpi_timer_check_state(i, pr, cx);
 			break;
 
 		case ACPI_STATE_C2:
@@ -835,6 +839,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 		return 0;
 	}
 
+	acpi_state_timer_broadcast(pr, cx, 1);
 	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
 	kt2 = ktime_get_real();
@@ -842,6 +847,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 
 	local_irq_enable();
 	cx->usage++;
+	acpi_state_timer_broadcast(pr, cx, 0);
 
 	return idle_time;
 }
-- 
cgit v0.10.2


From 7d60e8ab0d5507229dfbdf456501cc378610fa01 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 19 May 2009 16:09:54 +0800
Subject: cpuidle: fix AMD C1E suspend hang

When AMD C1E is enabled, local APIC timer will stop even in C1. To avoid
suspend/resume hang, this patch removes C1 and replace it with a cpu_relax() in
suspend/resume path. This hasn't any impact in runtime path.

http://bugzilla.kernel.org/show_bug.cgi?id=13233

[ impact: avoid suspend/resume hang in AMD CPU with C1E enabled ]

Tested-by: Dmitry Lyzhyn <thisistempbox@yahoo.com>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 6b7bcc7..10a2d91 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -834,8 +834,8 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 
 	/* Do not access any ACPI IO ports in suspend path */
 	if (acpi_idle_suspend) {
-		acpi_safe_halt();
 		local_irq_enable();
+		cpu_relax();
 		return 0;
 	}
 
-- 
cgit v0.10.2


From e65fcfd63a9a62baa5708484ff8edbe56eb3e7ec Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 26 May 2009 20:47:02 -0700
Subject: cls_cgroup: read classid atomically in classifier

Avoid reading the unsynchronized value cs->classid multiple times,
since it could change concurrently from non-zero to zero; this would
result in the classifier returning a positive result with a bogus
(zero) classid.

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 91a3db4..cc29b44 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -104,8 +104,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 			       struct tcf_result *res)
 {
 	struct cls_cgroup_head *head = tp->root;
-	struct cgroup_cls_state *cs;
-	int ret = 0;
+	u32 classid;
 
 	/*
 	 * Due to the nature of the classifier it is required to ignore all
@@ -121,17 +120,18 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 		return -1;
 
 	rcu_read_lock();
-	cs = task_cls_state(current);
-	if (cs->classid && tcf_em_tree_match(skb, &head->ematches, NULL)) {
-		res->classid = cs->classid;
-		res->class = 0;
-		ret = tcf_exts_exec(skb, &head->exts, res);
-	} else
-		ret = -1;
-
+	classid = task_cls_state(current)->classid;
 	rcu_read_unlock();
 
-	return ret;
+	if (!classid)
+		return -1;
+
+	if (!tcf_em_tree_match(skb, &head->ematches, NULL))
+		return -1;
+
+	res->classid = classid;
+	res->class = 0;
+	return tcf_exts_exec(skb, &head->exts, res);
 }
 
 static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
-- 
cgit v0.10.2


From 18a36c1a398055e87a3646738abf3c1b0e98e7e2 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <xiaotian.feng@windriver.com>
Date: Tue, 26 May 2009 20:48:04 -0700
Subject: gianfar: fix babbling rx error event bug

Gianfar interrupt handler uses IEVENT_ERR_MASK to check and handle errors.
Babbling RX error (IEVENT_BABR) should be included in IEVENT_ERROR_MASK.
Otherwise if BABR is raised, it never gets handled nor cleared, and an
interrupt storm results. This has been observed to happen on sending a
burst of ethernet frames to a gianfar based board.

Signed-off-by: Xiaotian Feng <xiaotian.feng@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 0642d52..cf35296 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -259,7 +259,7 @@ extern const char gfar_driver_version[];
 (IEVENT_RXC | IEVENT_BSY | IEVENT_EBERR | IEVENT_MSRO | \
  IEVENT_BABT | IEVENT_TXC | IEVENT_TXE | IEVENT_LC \
  | IEVENT_CRL | IEVENT_XFUN | IEVENT_DPE | IEVENT_PERR \
- | IEVENT_MAG)
+ | IEVENT_MAG | IEVENT_BABR)
 
 #define IMASK_INIT_CLEAR	0x00000000
 #define IMASK_BABR              0x80000000
-- 
cgit v0.10.2


From 5ad18900355743757e7f415bcb850ea8406a13e2 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Tue, 26 May 2009 20:50:12 -0700
Subject: atlx: move modinfo data from atlx.h to atl1.c

Both atl1.c and atl2.c include atlx.h, which defines some modinfo
stuff. But atl2.c seems like it doesn't want the modinfo data
from atlx.h, as it defines its own.

Running modinfo on atl2.ko, we get conflicting information:

$ /sbin/modinfo drivers/net/atlx/atl2.ko | egrep "version|description|author"
version:        2.2.3
description:    Atheros Fast Ethernet Network Driver
author:         Atheros Corporation <xiong.huang@atheros.com>, Chris Snook <csnook@redhat.com>
version:        2.1.3
author:         Xiong Huang <xiong.huang@atheros.com>, 	Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>

Move the modinfo data out of atlx.h and into atl1.c to eliminate
the confusion:

$ /sbin/modinfo drivers/net/atlx/atl1.ko | egrep "version|description|author"
version:        2.1.3
author:         Xiong Huang <xiong.huang@atheros.com>, 	Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>
description:    Atheros L1 Gigabit Ethernet Driver

$ /sbin/modinfo drivers/net/atlx/atl2.ko | egrep "version|description|author"
version:        2.2.3
description:    Atheros Fast Ethernet Network Driver
author:         Atheros Corporation <xiong.huang@atheros.com>, Chris Snook <csnook@redhat.com>

Reported-by: Scott Scriven <scott.scriven@hp.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Acked-by: Jay Cliburn <jcliburn@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c
index 0ab2254..4e81712 100644
--- a/drivers/net/atlx/atl1.c
+++ b/drivers/net/atlx/atl1.c
@@ -82,6 +82,12 @@
 
 #include "atl1.h"
 
+#define ATLX_DRIVER_VERSION "2.1.3"
+MODULE_AUTHOR("Xiong Huang <xiong.huang@atheros.com>, \
+	Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(ATLX_DRIVER_VERSION);
+
 /* Temporary hack for merging atl1 and atl2 */
 #include "atlx.c"
 
diff --git a/drivers/net/atlx/atlx.h b/drivers/net/atlx/atlx.h
index 297a03d..14054b7 100644
--- a/drivers/net/atlx/atlx.h
+++ b/drivers/net/atlx/atlx.h
@@ -29,12 +29,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-#define ATLX_DRIVER_VERSION "2.1.3"
-MODULE_AUTHOR("Xiong Huang <xiong.huang@atheros.com>, \
-	Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(ATLX_DRIVER_VERSION);
-
 #define ATLX_ERR_PHY			2
 #define ATLX_ERR_PHY_SPEED		7
 #define ATLX_ERR_PHY_RES		8
-- 
cgit v0.10.2


From b63dc8fef7ca5c51d163295d824e78c770d48ccf Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 26 May 2009 20:55:33 -0700
Subject: bfin_mac: fix build error due to net_device_ops convert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commit "convert to net_device_ops" broke the Blackfin MAC
driver as it declared the new structure before the function it used:
  CC      drivers/net/bfin_mac.o
drivers/net/bfin_mac.c:984: error: ‘bfin_mac_close’ undeclared here (not in a function)
make[1]: *** [drivers/net/bfin_mac.o] Error 1

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 9f971ed..b4da182 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -979,22 +979,7 @@ static int bfin_mac_open(struct net_device *dev)
 	return 0;
 }
 
-static const struct net_device_ops bfin_mac_netdev_ops = {
-	.ndo_open		= bfin_mac_open,
-	.ndo_stop		= bfin_mac_close,
-	.ndo_start_xmit		= bfin_mac_hard_start_xmit,
-	.ndo_set_mac_address	= bfin_mac_set_mac_address,
-	.ndo_tx_timeout		= bfin_mac_timeout,
-	.ndo_set_multicast_list	= bfin_mac_set_multicast_list,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_change_mtu		= eth_change_mtu,
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_poll_controller	= bfin_mac_poll,
-#endif
-};
-
 /*
- *
  * this makes the board clean up everything that it can
  * and not talk to the outside world.   Caused by
  * an 'ifconfig ethX down'
@@ -1019,6 +1004,20 @@ static int bfin_mac_close(struct net_device *dev)
 	return 0;
 }
 
+static const struct net_device_ops bfin_mac_netdev_ops = {
+	.ndo_open		= bfin_mac_open,
+	.ndo_stop		= bfin_mac_close,
+	.ndo_start_xmit		= bfin_mac_hard_start_xmit,
+	.ndo_set_mac_address	= bfin_mac_set_mac_address,
+	.ndo_tx_timeout		= bfin_mac_timeout,
+	.ndo_set_multicast_list	= bfin_mac_set_multicast_list,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= bfin_mac_poll,
+#endif
+};
+
 static int __devinit bfin_mac_probe(struct platform_device *pdev)
 {
 	struct net_device *ndev;
-- 
cgit v0.10.2


From 7a1450fdf4c69961f3926352fd8bc4ea19676756 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 26 May 2009 04:55:38 -0400
Subject: Blackfin: hook up preadv/pwritev syscalls

Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index 1e57b63..cf5066d 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -378,8 +378,10 @@
 #define __NR_dup3		363
 #define __NR_pipe2		364
 #define __NR_inotify_init1	365
+#define __NR_preadv		366
+#define __NR_pwritev		367
 
-#define __NR_syscall		366
+#define __NR_syscall		368
 #define NR_syscalls		__NR_syscall
 
 /* Old optional stuff no one actually uses */
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 21e65a3..a063a43 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1581,6 +1581,8 @@ ENTRY(_sys_call_table)
 	.long _sys_dup3
 	.long _sys_pipe2
 	.long _sys_inotify_init1	/* 365 */
+	.long _sys_preadv
+	.long _sys_pwritev
 
 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
-- 
cgit v0.10.2


From 6c83429a1c32c914dfb81939cc2ddece97e48294 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Sun, 24 May 2009 02:13:15 -0400
Subject: MAINTAINERS: update Blackfin items

With Bryan Wu having moved on to another job, push the slack onto some
other ADI lackeys.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 77cbfb1..7dd3475 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1132,16 +1132,16 @@ F:	fs/bfs/
 F:	include/linux/bfs_fs.h
 
 BLACKFIN ARCHITECTURE
-P:	Bryan Wu
-M:	cooloney@kernel.org
+P:	Mike Frysinger
+M:	vapier@gentoo.org
 L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org
 S:	Supported
 F:	arch/blackfin/
 
 BLACKFIN EMAC DRIVER
-P:	Bryan Wu
-M:	cooloney@kernel.org
+P:	Michael Hennerich
+M:	michael.hennerich@analog.com
 L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
 W:	http://blackfin.uclinux.org
 S:	Supported
-- 
cgit v0.10.2


From 49afa60948f859e71d68a74c1af6ccd7b5b94d82 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 18 May 2009 04:33:07 -0400
Subject: MAINTAINERS: drop (subscribers-only) markings on Blackfin lists

All of the Blackfin lists are transparently moderated for non-subscribers.
i.e. there are no annoying notices and people get whitelisted after first
their posting.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 7dd3475..5ee166e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1142,7 +1142,7 @@ F:	arch/blackfin/
 BLACKFIN EMAC DRIVER
 P:	Michael Hennerich
 M:	michael.hennerich@analog.com
-L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
+L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org
 S:	Supported
 F:	drivers/net/bfin_mac.*
@@ -1150,7 +1150,7 @@ F:	drivers/net/bfin_mac.*
 BLACKFIN RTC DRIVER
 P:	Mike Frysinger
 M:	vapier.adi@gmail.com
-L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
+L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org
 S:	Supported
 F:	drivers/rtc/rtc-bfin.c
@@ -1158,7 +1158,7 @@ F:	drivers/rtc/rtc-bfin.c
 BLACKFIN SERIAL DRIVER
 P:	Sonic Zhang
 M:	sonic.zhang@analog.com
-L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
+L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org
 S:	Supported
 F:	drivers/serial/bfin_5xx.c
@@ -1166,7 +1166,7 @@ F:	drivers/serial/bfin_5xx.c
 BLACKFIN WATCHDOG DRIVER
 P:	Mike Frysinger
 M:	vapier.adi@gmail.com
-L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
+L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org
 S:	Supported
 F:	drivers/watchdog/bfin_wdt.c
@@ -1174,7 +1174,7 @@ F:	drivers/watchdog/bfin_wdt.c
 BLACKFIN I2C TWI DRIVER
 P:	Sonic Zhang
 M:	sonic.zhang@analog.com
-L:	uclinux-dist-devel@blackfin.uclinux.org (subscribers-only)
+L:	uclinux-dist-devel@blackfin.uclinux.org
 W:	http://blackfin.uclinux.org/
 S:	Supported
 F:	drivers/i2c/busses/i2c-bfin-twi.c
-- 
cgit v0.10.2


From 6b50520b2fd9bf521f9c947b5f6999bad273a51d Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 19 May 2009 10:03:22 -0400
Subject: Blackfin: ignore generated vmlinux.lds

Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/arch/blackfin/kernel/.gitignore b/arch/blackfin/kernel/.gitignore
new file mode 100644
index 0000000..c5f676c
--- /dev/null
+++ b/arch/blackfin/kernel/.gitignore
@@ -0,0 +1 @@
+vmlinux.lds
-- 
cgit v0.10.2


From 2ec10ea91bf3688013b00638f29df4f8f6b5c18b Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 20 May 2009 19:45:39 -0400
Subject: Blackfin: drop unneeded asm/.gitignore

We don't create a include/asm/mach/ symlink anymore, so we don't need the
.gitignore for it.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>

diff --git a/arch/blackfin/include/asm/.gitignore b/arch/blackfin/include/asm/.gitignore
deleted file mode 100644
index 7858564..0000000
--- a/arch/blackfin/include/asm/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-+mach
-- 
cgit v0.10.2


From add8a5050a52f1bd1be6b97be86fdd1cfbea2d1d Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 26 May 2009 05:03:52 -0400
Subject: Blackfin: fix strncmp.o build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix some more fallout of the string changes:

  CC      arch/blackfin/lib/strncmp.o
In file included from include/linux/bitmap.h:9,
                 from include/linux/nodemask.h:90,
                 from include/linux/mmzone.h:17,
                 from include/linux/gfp.h:5,
                 from include/linux/kmod.h:23,
                 from include/linux/module.h:14,
                 from arch/blackfin/lib/strncmp.c:14:
include/linux/string.h: In function ‘strstarts’:
include/linux/string.h:132: error: implicit declaration of function ‘strncmp’
make[1]: *** [arch/blackfin/lib/strncmp.o] Error 1

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
CC: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/arch/blackfin/lib/strncmp.c b/arch/blackfin/lib/strncmp.c
index 2aaae78..46518b1 100644
--- a/arch/blackfin/lib/strncmp.c
+++ b/arch/blackfin/lib/strncmp.c
@@ -8,9 +8,8 @@
 
 #define strncmp __inline_strncmp
 #include <asm/string.h>
-#undef strncmp
-
 #include <linux/module.h>
+#undef strncmp
 
 int strncmp(const char *cs, const char *ct, size_t count)
 {
-- 
cgit v0.10.2


From d8d1656ee15d3085e0085a87e70f9093a0a102c5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:20:57 -0300
Subject: perf report: Use hex2long instead of sscanf

Before:

[acme@emilia ~]$ perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     245.414985  task clock ticks     (msecs)
              6  context switches     (events)
              6  CPU migrations       (events)
           2108  pagefaults           (events)
       37493013  CPU cycles           (events)  (scaled from 67.04%)
       13576789  instructions         (events)  (scaled from 66.76%)
          57931  cache references     (events)  (scaled from 21.96%)
          12263  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   246.575587 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
12.15          perf [.] 0x000000000005432a /lib64/libc-2.5.so: _IO_vfscanf_internal
 9.38          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 8.53          perf [.] 0x00000000000056b8 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 6.61          perf [.] 0x00000000000057cb /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 5.33          perf [k] 0xffffffff811ce082 number
 4.69          perf [.] 0x0000000000034829 /lib64/libc-2.5.so: ____strtoull_l_internal
 4.48          perf [.] 0x0000000000006505 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 3.41          perf [.] 0x000000000000fce6 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
 3.20          perf [k] 0xffffffff811cfc01 vsnprintf
 2.99          perf [k] 0xffffffff811ce5e8 format_decode

After:

[acme@emilia ~]$ perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     218.186805  task clock ticks     (msecs)
              4  context switches     (events)
              7  CPU migrations       (events)
           2133  pagefaults           (events)
       32735365  CPU cycles           (events)  (scaled from 67.04%)
       11952309  instructions         (events)  (scaled from 66.26%)
          50314  cache references     (events)  (scaled from 21.96%)
          13228  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   218.810451 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
10.68          perf [.] 0x000000000000578d /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 9.62          perf [.] 0x00000000000065f7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 9.40          perf [.] 0x00000000000056b4 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 9.19          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 5.13          perf [.] 0x0000000000005ec7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 4.49          perf [k] 0xffffffff81083808 kallsyms_expand_symbol
 3.85          perf [k] 0xffffffff811ce2c1 number
 3.63          perf [.] 0x0000000000005e81 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 2.99          perf [.] 0x000000000000fd5b /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
 2.99          perf [k] 0xffffffff811cf251 string
[acme@emilia ~]$

[ Impact: optimization ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222057.GI4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a4c6ffa..c517483 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -348,6 +348,39 @@ void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2long(char *ptr, unsigned long *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
+
 static int load_kallsyms(void)
 {
 	kernel_dso = dso__new("[kernel]");
@@ -363,26 +396,30 @@ static int load_kallsyms(void)
 	size_t n;
 
 	while (!feof(file)) {
-		unsigned long long start;
-		char c, symbf[4096];
-
-		if (getline(&line, &n, file) < 0)
+		unsigned long start;
+		int line_len = getline(&line, &n, file);
+		if (line_len < 0)
 			break;
 
 		if (!line)
 			goto out_delete_dso;
 
-		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			/*
-			 * Well fix up the end later, when we have all sorted.
-			 */
-			struct symbol *sym = symbol__new(start, 0xdead, symbf);
+		line[--line_len] = '\0'; /* \n */
+
+		int len = hex2long(line, &start);
+		
+		len += 3; /* ' t ' */
+		if (len >= line_len)
+			continue;
+		/*
+		 * Well fix up the end later, when we have all sorted.
+		 */
+		struct symbol *sym = symbol__new(start, 0xdead, line + len);
 
-			if (sym == NULL)
-				goto out_delete_dso;
+		if (sym == NULL)
+			goto out_delete_dso;
 
-			dso__insert_symbol(kernel_dso, sym);
-		}
+		dso__insert_symbol(kernel_dso, sym);
 	}
 
 	/*
-- 
cgit v0.10.2


From 03f6316d32738ec5eda2e6f628c12d1c01e61a87 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:21:55 -0300
Subject: perf report: Only load text symbols from kallsyms

Just like we do for userspace when reading the symtab, reducing the
number of entries we insert on the symbols rbtree.

Before:

[acme@emilia ~]$ rm -f perf_report.perf ; perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     218.138382  task clock ticks     (msecs)
              4  context switches     (events)
              8  CPU migrations       (events)
           2136  pagefaults           (events)
       32746212  CPU cycles           (events)  (scaled from 67.04%)
       11961102  instructions         (events)  (scaled from 66.19%)
          49841  cache references     (events)  (scaled from 21.96%)
          13777  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   218.702477 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
11.06          perf [.] 0x00000000000057cb /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 9.15          perf [.] 0x00000000000056a0 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 8.72          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 8.51          perf [.] 0x0000000000006672 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 3.83          perf [k] 0xffffffff811cfc5a vsnprintf
 3.40          perf [.] 0x0000000000005e33 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 3.40          perf [.] 0x0000000000005ec7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 3.19          perf [k] 0xffffffff811ce1c1 number
 2.77          perf [.] 0x0000000000006869 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: threads__findnew
 2.77          perf [.] 0x000000000000fde3 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
[acme@emilia ~]$

After:

acme@emilia ~]$ rm -f perf_report.perf ; perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     190.228511  task clock ticks     (msecs)
              4  context switches     (events)
              7  CPU migrations       (events)
           1625  pagefaults           (events)
       29578745  CPU cycles           (events)  (scaled from 66.92%)
       10516914  instructions         (events)  (scaled from 66.47%)
          44015  cache references     (events)  (scaled from 22.04%)
           8248  cache misses         (events)  (scaled from 22.07%)

 Wall-clock time elapsed:   190.816096 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
15.99          perf [.] 0x00000000000057a9 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
10.87          perf [.] 0x000000000000674d /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 8.74          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 5.54          perf [.] 0x0000000000005e42 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 4.48          perf [.] 0x0000000000005ebe /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 4.48          perf [k] 0xffffffff811cfba0 vsnprintf
 3.84          perf [.] 0x00000000000056b4 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 3.62          perf [.] 0x00000000000068d0 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: threads__findnew
 3.20          perf [k] 0xffffffff811ce0b3 number
 2.56          perf [.] 0x0000000000006d78 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: __cmd_report
[acme@emilia ~]$

[ Impact: optimization ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index c517483..a55f15d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -3,6 +3,7 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
+#include <ctype.h>
 
 #include "util/list.h"
 #include "util/rbtree.h"
@@ -408,13 +409,20 @@ static int load_kallsyms(void)
 
 		int len = hex2long(line, &start);
 		
-		len += 3; /* ' t ' */
-		if (len >= line_len)
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		char symbol_type = line[len];
+		/*
+		 * We're interested only in code ('T'ext)
+		 */
+		if (toupper(symbol_type) != 'T')
 			continue;
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		struct symbol *sym = symbol__new(start, 0xdead, line + len);
+		struct symbol *sym = symbol__new(start, 0xdead, line + len + 2);
 
 		if (sym == NULL)
 			goto out_delete_dso;
-- 
cgit v0.10.2


From af83632f98aefd1ae4d8ca3c7c285ccf6a7d3956 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 08:38:48 +0200
Subject: perf report: Only load text symbols from kallsyms, fix

- allow 'W' symbols too
 - Convert initializations to C99 style
 - whitespace cleanups

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a55f15d..ed3da9d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -384,21 +384,26 @@ int hex2long(char *ptr, unsigned long *long_val)
 
 static int load_kallsyms(void)
 {
+	struct rb_node *nd, *prevnd;
+	char *line = NULL;
+	FILE *file;
+	size_t n;
+
 	kernel_dso = dso__new("[kernel]");
 	if (kernel_dso == NULL)
 		return -1;
 
-	FILE *file = fopen("/proc/kallsyms", "r");
-
+	file = fopen("/proc/kallsyms", "r");
 	if (file == NULL)
 		goto out_delete_dso;
 
-	char *line = NULL;
-	size_t n;
-
 	while (!feof(file)) {
 		unsigned long start;
-		int line_len = getline(&line, &n, file);
+		struct symbol *sym;
+		int line_len, len;
+		char symbol_type;
+
+		line_len = getline(&line, &n, file);
 		if (line_len < 0)
 			break;
 
@@ -407,22 +412,22 @@ static int load_kallsyms(void)
 
 		line[--line_len] = '\0'; /* \n */
 
-		int len = hex2long(line, &start);
-		
+		len = hex2long(line, &start);
+
 		len++;
 		if (len + 2 >= line_len)
 			continue;
 
-		char symbol_type = line[len];
+		symbol_type = toupper(line[len]);
 		/*
 		 * We're interested only in code ('T'ext)
 		 */
-		if (toupper(symbol_type) != 'T')
+		if (symbol_type != 'T' && symbol_type != 'W')
 			continue;
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		struct symbol *sym = symbol__new(start, 0xdead, line + len + 2);
+		sym = symbol__new(start, 0xdead, line + len + 2);
 
 		if (sym == NULL)
 			goto out_delete_dso;
@@ -434,7 +439,7 @@ static int load_kallsyms(void)
 	 * Now that we have all sorted out, just set the ->end of all
 	 * symbols
 	 */
-	struct rb_node *nd, *prevnd = rb_first(&kernel_dso->syms);
+	prevnd = rb_first(&kernel_dso->syms);
 
 	if (prevnd == NULL)
 		goto out_delete_line;
@@ -450,6 +455,7 @@ static int load_kallsyms(void)
 	dsos__add(kernel_dso);
 	free(line);
 	fclose(file);
+
 	return 0;
 
 out_delete_line:
-- 
cgit v0.10.2


From 815e777f913ed54ddb449d2854015c65b4ecbfe3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:46:14 -0300
Subject: perf report: Show the IP only in --verbose mode

perf: report should show the IP only in --verbose mode

[acme@emilia ~]$ perf report | head
 4.95          find [k] _spin_lock
 2.19          find [k] ext3fs_dirhash	[ext3]
 1.87          find [k] __rcu_read_lock
 1.86          find [k] _atomic_dec_and_lock
 1.86          find [.] /lib64/libc-2.5.so: __GI_strlen
 1.85          find [k] __kmalloc
 1.62          find [.] /lib64/libc-2.5.so: vfprintf
 1.59          find [k] __rcu_read_unlock
 1.55          find [k] __d_lookup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526224614.GK4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ed3da9d..2d65d9c 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -22,6 +22,7 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
+static int 		verbose;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -551,9 +552,12 @@ symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 	else
 		ret = fprintf(fp, "%12d", self->count);
 
-	ret += fprintf(fp, "%14s [%c] %#018llx ",
+	ret += fprintf(fp, "%14s [%c] ",
 		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level, (unsigned long long)self->ip);
+		       self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
 
 	if (self->level != '.')
 		ret += fprintf(fp, "%s\n",
@@ -974,6 +978,8 @@ static const char * const report_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_END()
-- 
cgit v0.10.2


From 16f762a2ac5ecf8a11f6f0332e46cc3459220da5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:10:38 +0200
Subject: perf_counter tools: Introduce stricter C code checking

Tighten up our C code requirements:

 - disallow warnings
 - disallow declarations-mixed-with-statements
 - require proper prototypes
 - require C99 (with gcc extensions)

Fix up a ton of problems these measures unearth:

 - unused functions
 - needlessly global functions
 - missing prototypes
 - code mixed with declarations

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 10c13a6..efb0589 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index 6616de0..d2bd317 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -399,7 +399,7 @@ static void get_html_page_path(struct strbuf *page_path, const char *page)
  * HTML.
  */
 #ifndef open_html
-void open_html(const char *path)
+static void open_html(const char *path)
 {
 	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
 }
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ec2b787..68abfdf 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,6 +1,7 @@
 
 
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -144,26 +145,32 @@ static int nr_poll;
 static int nr_cpu;
 
 struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
 };
+
 struct comm_event {
-	struct perf_event_header header;
-	__u32 pid,tid;
-	char comm[16];
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
 };
 
 static pid_t pid_synthesize_comm_event(pid_t pid)
 {
+	struct comm_event comm_ev;
 	char filename[PATH_MAX];
+	pid_t spid, ppid;
 	char bf[BUFSIZ];
-	struct comm_event comm_ev;
+	int fd, nr, ret;
+	char comm[18];
 	size_t size;
-	int fd;
+	char state;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -178,12 +185,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	}
 	close(fd);
 
-	pid_t spid, ppid;
-	char state;
-	char comm[18];
-
 	memset(&comm_ev, 0, sizeof(comm_ev));
-        int nr = sscanf(bf, "%d %s %c %d %d ",
+        nr = sscanf(bf, "%d %s %c %d %d ",
 			&spid, comm, &state, &ppid, &comm_ev.pid);
 	if (nr != 5) {
 		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
@@ -198,7 +201,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	memcpy(comm_ev.comm, comm + 1, size);
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
-	int ret = write(output, &comm_ev, comm_ev.header.size);
+
+	ret = write(output, &comm_ev, comm_ev.header.size);
 	if (ret < 0) {
 		perror("failed to write");
 		exit(-1);
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 2d65d9c..7f1255d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,4 +1,5 @@
 #include "util/util.h"
+#include "builtin.h"
 
 #include <libelf.h>
 #include <gelf.h>
@@ -22,7 +23,7 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
-static int 		verbose;
+static int		verbose;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -60,10 +61,10 @@ typedef union event_union {
 } event_t;
 
 struct symbol {
-	struct rb_node rb_node;
-	uint64_t       start;
-	uint64_t       end;
-	char	       name[0];
+	struct rb_node		rb_node;
+	__u64			start;
+	__u64			end;
+	char			name[0];
 };
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
@@ -86,7 +87,7 @@ static void symbol__delete(struct symbol *self)
 
 static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 {
-	return fprintf(fp, " %lx-%lx %s\n",
+	return fprintf(fp, " %llx-%llx %s\n",
 		       self->start, self->end, self->name);
 }
 
@@ -147,10 +148,12 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 
 static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 {
+	struct rb_node *n;
+
 	if (self == NULL)
 		return NULL;
 
-	struct rb_node *n = self->syms.rb_node;
+	n = self->syms.rb_node;
 
 	while (n) {
 		struct symbol *s = rb_entry(n, struct symbol, rb_node);
@@ -221,33 +224,42 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 
 static int dso__load(struct dso *self)
 {
-	int fd = open(self->name, O_RDONLY), err = -1;
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int fd, err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec;
+	Elf *elf;
 
+
+	fd = open(self->name, O_RDONLY);
 	if (fd == -1)
 		return -1;
 
-	Elf *elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
 		fprintf(stderr, "%s: cannot read %s ELF file.\n",
 			__func__, self->name);
 		goto out_close;
 	}
 
-	GElf_Ehdr ehdr;
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
 		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
 	}
 
-	GElf_Shdr shdr;
-	Elf_Scn *sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
 	if (sec == NULL)
 		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
 
 	if (sec == NULL)
 		goto out_elf_end;
 
-	Elf_Data *syms = elf_getdata(sec, NULL);
+	syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
 		goto out_elf_end;
 
@@ -255,14 +267,12 @@ static int dso__load(struct dso *self)
 	if (sec == NULL)
 		goto out_elf_end;
 
-	Elf_Data *symstrs = elf_getdata(sec, NULL);
+	symstrs = elf_getdata(sec, NULL);
 	if (symstrs == NULL)
 		goto out_elf_end;
 
-	const uint32_t nr_syms = shdr.sh_size / shdr.sh_entsize;
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
-	GElf_Sym sym;
-	uint32_t index;
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
 
@@ -342,7 +352,7 @@ out_delete_dso:
 	return NULL;
 }
 
-void dsos__fprintf(FILE *fp)
+static void dsos__fprintf(FILE *fp)
 {
 	struct dso *pos;
 
@@ -365,7 +375,7 @@ static int hex(char ch)
  * While we find nice hex chars, build a long_val.
  * Return number of chars processed.
  */
-int hex2long(char *ptr, unsigned long *long_val)
+static int hex2long(char *ptr, unsigned long *long_val)
 {
 	const char *p = ptr;
 	*long_val = 0;
@@ -493,12 +503,6 @@ out_delete:
 	return NULL;
 }
 
-static size_t map__fprintf(struct map *self, FILE *fp)
-{
-	return fprintf(fp, " %lx-%lx %lx %s\n",
-		       self->start, self->end, self->pgoff, self->dso->name);
-}
-
 struct thread;
 
 static const char *thread__name(struct thread *self, char *bf, size_t size);
@@ -531,11 +535,6 @@ static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
 	return self;
 }
 
-void symhist__delete(struct symhist *self)
-{
-	free(self);
-}
-
 static void symhist__inc(struct symhist *self)
 {
 	++self->count;
@@ -608,6 +607,8 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 	struct symhist *sh;
 
 	while (*p != NULL) {
+		uint64_t start;
+
 		parent = *p;
 		sh = rb_entry(parent, struct symhist, rb_node);
 
@@ -617,7 +618,7 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 		}
 
 		/* Handle unresolved symbols too */
-		const uint64_t start = !sh->sym ? sh->ip : sh->sym->start;
+		start = !sh->sym ? sh->ip : sh->sym->start;
 
 		if (ip < start)
 			p = &(*p)->rb_left;
@@ -639,17 +640,6 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
-size_t thread__maps_fprintf(struct thread *self, FILE *fp)
-{
-	struct map *pos;
-	size_t ret = 0;
-
-	list_for_each_entry(pos, &self->maps, node)
-		ret += map__fprintf(pos, fp);
-
-	return ret;
-}
-
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
 	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
@@ -657,13 +647,14 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
 
 	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
 		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
+
 		ret += symhist__fprintf(pos, 0, fp);
 	}
 
 	return ret;
 }
 
-static struct rb_root threads = RB_ROOT;
+static struct rb_root threads;
 
 static struct thread *threads__findnew(pid_t pid)
 {
@@ -699,11 +690,11 @@ static void thread__insert_map(struct thread *self, struct map *map)
 
 static struct map *thread__find_map(struct thread *self, uint64_t ip)
 {
+	struct map *pos;
+
 	if (self == NULL)
 		return NULL;
 
-	struct map *pos;
-
 	list_for_each_entry(pos, &self->maps, node)
 		if (ip >= pos->start && ip <= pos->end)
 			return pos;
@@ -711,7 +702,7 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-void threads__fprintf(FILE *fp)
+static void threads__fprintf(FILE *fp)
 {
 	struct rb_node *nd;
 	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
@@ -720,7 +711,7 @@ void threads__fprintf(FILE *fp)
 	}
 }
 
-static struct rb_root global_symhists = RB_ROOT;
+static struct rb_root global_symhists;
 
 static void threads__insert_symhist(struct symhist *sh)
 {
@@ -852,7 +843,7 @@ more:
 				(void *)(long)(event->header.size),
 				event->header.misc,
 				event->ip.pid,
-				(void *)event->ip.ip);
+				(void *)(long)ip);
 		}
 
 		if (thread == NULL) {
@@ -866,9 +857,12 @@ more:
 			level = 'k';
 			dso = kernel_dso;
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			struct map *map;
+
 			show = SHOW_USER;
 			level = '.';
-			struct map *map = thread__find_map(thread, ip);
+
+			map = thread__find_map(thread, ip);
 			if (map != NULL) {
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
@@ -896,9 +890,9 @@ more:
 			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
 				(void *)(offset + head),
 				(void *)(long)(event->header.size),
-				(void *)event->mmap.start,
-				(void *)event->mmap.len,
-				(void *)event->mmap.pgoff,
+				(void *)(long)event->mmap.start,
+				(void *)(long)event->mmap.len,
+				(void *)(long)event->mmap.pgoff,
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
@@ -964,6 +958,11 @@ done:
 		return 0;
 	}
 
+	if (verbose >= 2) {
+		dsos__fprintf(stdout);
+		threads__fprintf(stdout);
+	}
+
 	threads__sort_symhists();
 	threads__symhists_fprintf(total, stdout);
 
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index e7cb941..ce661e2 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -30,6 +30,7 @@
  */
 
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -108,7 +109,7 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
-int do_perfstat(int argc, const char **argv)
+static int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 6b1c66f..a890872 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,8 +42,8 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/util.h"
 #include "util/parse-options.h"
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
index 649f34f..61d33b8 100644
--- a/Documentation/perf_counter/util/abspath.c
+++ b/Documentation/perf_counter/util/abspath.c
@@ -5,7 +5,7 @@
  * symlink to a directory, we do not want to say it is a directory when
  * dealing with tracked content in the working tree.
  */
-int is_directory(const char *path)
+static int is_directory(const char *path)
 {
 	struct stat st;
 	return (!stat(path, &st) && S_ISDIR(st.st_mode));
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
index 7108051..393d614 100644
--- a/Documentation/perf_counter/util/cache.h
+++ b/Documentation/perf_counter/util/cache.h
@@ -104,6 +104,8 @@ char *strip_path_suffix(const char *path, const char *suffix);
 
 extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
 extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+extern int perf_mkstemp(char *path, size_t len, const char *template);
 
 extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
 	__attribute__((format (printf, 3, 4)));
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
index 36e40c3..76590a1 100644
--- a/Documentation/perf_counter/util/util.h
+++ b/Documentation/perf_counter/util/util.h
@@ -309,6 +309,8 @@ extern ssize_t xread(int fd, void *buf, size_t len);
 extern ssize_t xwrite(int fd, const void *buf, size_t len);
 extern int xdup(int fd);
 extern FILE *xfdopen(int fd, const char *mode);
+extern int xmkstemp(char *template);
+
 static inline size_t xsize_t(off_t len)
 {
 	return (size_t)len;
-- 
cgit v0.10.2


From b16e7766d6436835f473ba823ad04fbdfe5e9cbd Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 27 May 2009 13:36:10 +1000
Subject: powerpc: Move dma-noncoherent.c from arch/powerpc/lib to
 arch/powerpc/mm

(pre-requisite to make the next patches more palatable)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 8db3527..29b742b 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o
 obj-$(CONFIG_XMON)	+= sstep.o
 obj-$(CONFIG_KPROBES)	+= sstep.o
-obj-$(CONFIG_NOT_COHERENT_CACHE)	+= dma-noncoherent.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/lib/dma-noncoherent.c
deleted file mode 100644
index b7dc4c1..0000000
--- a/arch/powerpc/lib/dma-noncoherent.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- *  PowerPC version derived from arch/arm/mm/consistent.c
- *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
- *
- *  Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators.  Used for DMA devices that want to
- * share uncached memory with the processor core.  The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- *						-- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/highmem.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/tlbflush.h>
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
-#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static pte_t *consistent_pte;
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct ppc_vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-};
-
-static struct ppc_vm_region consistent_head = {
-	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
-	.vm_start	= CONSISTENT_BASE,
-	.vm_end		= CONSISTENT_END,
-};
-
-static struct ppc_vm_region *
-ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct ppc_vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct ppc_vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
-{
-	struct ppc_vm_region *c;
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
-{
-	struct page *page;
-	struct ppc_vm_region *c;
-	unsigned long order;
-	u64 mask = 0x00ffffff, limit; /* ISA default */
-
-	if (!consistent_pte) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
-	}
-
-	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
-		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
-		       size, mask);
-		return NULL;
-	}
-
-	order = get_order(size);
-
-	if (mask != 0xffffffff)
-		gfp |= GFP_DMA;
-
-	page = alloc_pages(gfp, order);
-	if (!page)
-		goto no_page;
-
-	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		unsigned long kaddr = (unsigned long)page_address(page);
-		memset(page_address(page), 0, size);
-		flush_dcache_range(kaddr, kaddr + size);
-	}
-
-	/*
-	 * Allocate a virtual address in the consistent mapping region.
-	 */
-	c = ppc_vm_region_alloc(&consistent_head, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (c) {
-		unsigned long vaddr = c->vm_start;
-		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
-		struct page *end = page + (1 << order);
-
-		split_page(page, order);
-
-		/*
-		 * Set the "dma handle"
-		 */
-		*handle = page_to_phys(page);
-
-		do {
-			BUG_ON(!pte_none(*pte));
-
-			SetPageReserved(page);
-			set_pte_at(&init_mm, vaddr,
-				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
-			page++;
-			pte++;
-			vaddr += PAGE_SIZE;
-		} while (size -= PAGE_SIZE);
-
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
-		return (void *)c->vm_start;
-	}
-
-	if (page)
-		__free_pages(page, order);
- no_page:
-	return NULL;
-}
-EXPORT_SYMBOL(__dma_alloc_coherent);
-
-/*
- * free a page as defined by the above mapping.
- */
-void __dma_free_coherent(size_t size, void *vaddr)
-{
-	struct ppc_vm_region *c;
-	unsigned long flags, addr;
-	pte_t *ptep;
-
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
-	if (!c)
-		goto no_area;
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
-	addr = c->vm_start;
-	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
-		unsigned long pfn;
-
-		ptep++;
-		addr += PAGE_SIZE;
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-				ClearPageReserved(page);
-
-				__free_page(page);
-				continue;
-			}
-		}
-
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	list_del(&c->vm_list);
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
-	return;
-
- no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, vaddr);
-	dump_stack();
-}
-EXPORT_SYMBOL(__dma_free_coherent);
-
-/*
- * Initialise the consistent memory allocation.
- */
-static int __init dma_alloc_init(void)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret = 0;
-
-	do {
-		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
-		pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE);
-		pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte = pte;
-	} while (0);
-
-	return ret;
-}
-
-core_initcall(dma_alloc_init);
-
-/*
- * make an area consistent.
- */
-void __dma_sync(void *vaddr, size_t size, int direction)
-{
-	unsigned long start = (unsigned long)vaddr;
-	unsigned long end   = start + size;
-
-	switch (direction) {
-	case DMA_NONE:
-		BUG();
-	case DMA_FROM_DEVICE:
-		/*
-		 * invalidate only when cache-line aligned otherwise there is
-		 * the potential for discarding uncommitted data from the cache
-		 */
-		if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
-			flush_dcache_range(start, end);
-		else
-			invalidate_dcache_range(start, end);
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		clean_dcache_range(start, end);
-		break;
-	case DMA_BIDIRECTIONAL:	/* writeback and invalidate */
-		flush_dcache_range(start, end);
-		break;
-	}
-}
-EXPORT_SYMBOL(__dma_sync);
-
-#ifdef CONFIG_HIGHMEM
-/*
- * __dma_sync_page() implementation for systems using highmem.
- * In this case, each page of a buffer must be kmapped/kunmapped
- * in order to have a virtual address for __dma_sync(). This must
- * not sleep so kmap_atomic()/kunmap_atomic() are used.
- *
- * Note: yes, it is possible and correct to have a buffer extend
- * beyond the first page.
- */
-static inline void __dma_sync_page_highmem(struct page *page,
-		unsigned long offset, size_t size, int direction)
-{
-	size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
-	size_t cur_size = seg_size;
-	unsigned long flags, start, seg_offset = offset;
-	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
-	int seg_nr = 0;
-
-	local_irq_save(flags);
-
-	do {
-		start = (unsigned long)kmap_atomic(page + seg_nr,
-				KM_PPC_SYNC_PAGE) + seg_offset;
-
-		/* Sync this buffer segment */
-		__dma_sync((void *)start, seg_size, direction);
-		kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE);
-		seg_nr++;
-
-		/* Calculate next buffer segment size */
-		seg_size = min((size_t)PAGE_SIZE, size - cur_size);
-
-		/* Add the segment size to our running total */
-		cur_size += seg_size;
-		seg_offset = 0;
-	} while (seg_nr < nr_segs);
-
-	local_irq_restore(flags);
-}
-#endif /* CONFIG_HIGHMEM */
-
-/*
- * __dma_sync_page makes memory consistent. identical to __dma_sync, but
- * takes a struct page instead of a virtual address
- */
-void __dma_sync_page(struct page *page, unsigned long offset,
-	size_t size, int direction)
-{
-#ifdef CONFIG_HIGHMEM
-	__dma_sync_page_highmem(page, offset, size, direction);
-#else
-	unsigned long start = (unsigned long)page_address(page) + offset;
-	__dma_sync((void *)start, size, direction);
-#endif
-}
-EXPORT_SYMBOL(__dma_sync_page);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 17290bc..b746f4c 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
new file mode 100644
index 0000000..b7dc4c1
--- /dev/null
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -0,0 +1,426 @@
+/*
+ *  PowerPC version derived from arch/arm/mm/consistent.c
+ *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
+ *
+ *  Copyright (C) 2000 Russell King
+ *
+ * Consistent memory allocators.  Used for DMA devices that want to
+ * share uncached memory with the processor core.  The function return
+ * is the virtual address and 'dma_handle' is the physical address.
+ * Mostly stolen from the ARM port, with some changes for PowerPC.
+ *						-- Dan
+ *
+ * Reorganized to get rid of the arch-specific consistent_* functions
+ * and provide non-coherent implementations for the DMA API. -Matt
+ *
+ * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
+ * implementation. This is pulled straight from ARM and barely
+ * modified. -Matt
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/tlbflush.h>
+
+/*
+ * This address range defaults to a value that is safe for all
+ * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
+ * can be further configured for specific applications under
+ * the "Advanced Setup" menu. -Matt
+ */
+#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
+#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
+#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
+
+/*
+ * This is the page table (2MB) covering uncached, DMA consistent allocations
+ */
+static pte_t *consistent_pte;
+static DEFINE_SPINLOCK(consistent_lock);
+
+/*
+ * VM region handling support.
+ *
+ * This should become something generic, handling VM region allocations for
+ * vmalloc and similar (ioremap, module space, etc).
+ *
+ * I envisage vmalloc()'s supporting vm_struct becoming:
+ *
+ *  struct vm_struct {
+ *    struct vm_region	region;
+ *    unsigned long	flags;
+ *    struct page	**pages;
+ *    unsigned int	nr_pages;
+ *    unsigned long	phys_addr;
+ *  };
+ *
+ * get_vm_area() would then call vm_region_alloc with an appropriate
+ * struct vm_region head (eg):
+ *
+ *  struct vm_region vmalloc_head = {
+ *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
+ *	.vm_start	= VMALLOC_START,
+ *	.vm_end		= VMALLOC_END,
+ *  };
+ *
+ * However, vmalloc_head.vm_start is variable (typically, it is dependent on
+ * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
+ * would have to initialise this each time prior to calling vm_region_alloc().
+ */
+struct ppc_vm_region {
+	struct list_head	vm_list;
+	unsigned long		vm_start;
+	unsigned long		vm_end;
+};
+
+static struct ppc_vm_region consistent_head = {
+	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
+	.vm_start	= CONSISTENT_BASE,
+	.vm_end		= CONSISTENT_END,
+};
+
+static struct ppc_vm_region *
+ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
+{
+	unsigned long addr = head->vm_start, end = head->vm_end - size;
+	unsigned long flags;
+	struct ppc_vm_region *c, *new;
+
+	new = kmalloc(sizeof(struct ppc_vm_region), gfp);
+	if (!new)
+		goto out;
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if ((addr + size) < addr)
+			goto nospc;
+		if ((addr + size) <= c->vm_start)
+			goto found;
+		addr = c->vm_end;
+		if (addr > end)
+			goto nospc;
+	}
+
+ found:
+	/*
+	 * Insert this entry _before_ the one we found.
+	 */
+	list_add_tail(&new->vm_list, &c->vm_list);
+	new->vm_start = addr;
+	new->vm_end = addr + size;
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	return new;
+
+ nospc:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	kfree(new);
+ out:
+	return NULL;
+}
+
+static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
+{
+	struct ppc_vm_region *c;
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if (c->vm_start == addr)
+			goto out;
+	}
+	c = NULL;
+ out:
+	return c;
+}
+
+/*
+ * Allocate DMA-coherent memory space and return both the kernel remapped
+ * virtual and bus address for that space.
+ */
+void *
+__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
+{
+	struct page *page;
+	struct ppc_vm_region *c;
+	unsigned long order;
+	u64 mask = 0x00ffffff, limit; /* ISA default */
+
+	if (!consistent_pte) {
+		printk(KERN_ERR "%s: not initialised\n", __func__);
+		dump_stack();
+		return NULL;
+	}
+
+	size = PAGE_ALIGN(size);
+	limit = (mask + 1) & ~mask;
+	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
+		       size, mask);
+		return NULL;
+	}
+
+	order = get_order(size);
+
+	if (mask != 0xffffffff)
+		gfp |= GFP_DMA;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		goto no_page;
+
+	/*
+	 * Invalidate any data that might be lurking in the
+	 * kernel direct-mapped region for device DMA.
+	 */
+	{
+		unsigned long kaddr = (unsigned long)page_address(page);
+		memset(page_address(page), 0, size);
+		flush_dcache_range(kaddr, kaddr + size);
+	}
+
+	/*
+	 * Allocate a virtual address in the consistent mapping region.
+	 */
+	c = ppc_vm_region_alloc(&consistent_head, size,
+			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+	if (c) {
+		unsigned long vaddr = c->vm_start;
+		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
+		struct page *end = page + (1 << order);
+
+		split_page(page, order);
+
+		/*
+		 * Set the "dma handle"
+		 */
+		*handle = page_to_phys(page);
+
+		do {
+			BUG_ON(!pte_none(*pte));
+
+			SetPageReserved(page);
+			set_pte_at(&init_mm, vaddr,
+				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
+			page++;
+			pte++;
+			vaddr += PAGE_SIZE;
+		} while (size -= PAGE_SIZE);
+
+		/*
+		 * Free the otherwise unused pages.
+		 */
+		while (page < end) {
+			__free_page(page);
+			page++;
+		}
+
+		return (void *)c->vm_start;
+	}
+
+	if (page)
+		__free_pages(page, order);
+ no_page:
+	return NULL;
+}
+EXPORT_SYMBOL(__dma_alloc_coherent);
+
+/*
+ * free a page as defined by the above mapping.
+ */
+void __dma_free_coherent(size_t size, void *vaddr)
+{
+	struct ppc_vm_region *c;
+	unsigned long flags, addr;
+	pte_t *ptep;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irqsave(&consistent_lock, flags);
+
+	c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
+	if (!c)
+		goto no_area;
+
+	if ((c->vm_end - c->vm_start) != size) {
+		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+		       __func__, c->vm_end - c->vm_start, size);
+		dump_stack();
+		size = c->vm_end - c->vm_start;
+	}
+
+	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
+	addr = c->vm_start;
+	do {
+		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
+		unsigned long pfn;
+
+		ptep++;
+		addr += PAGE_SIZE;
+
+		if (!pte_none(pte) && pte_present(pte)) {
+			pfn = pte_pfn(pte);
+
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+				ClearPageReserved(page);
+
+				__free_page(page);
+				continue;
+			}
+		}
+
+		printk(KERN_CRIT "%s: bad page in kernel page table\n",
+		       __func__);
+	} while (size -= PAGE_SIZE);
+
+	flush_tlb_kernel_range(c->vm_start, c->vm_end);
+
+	list_del(&c->vm_list);
+
+	spin_unlock_irqrestore(&consistent_lock, flags);
+
+	kfree(c);
+	return;
+
+ no_area:
+	spin_unlock_irqrestore(&consistent_lock, flags);
+	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+	       __func__, vaddr);
+	dump_stack();
+}
+EXPORT_SYMBOL(__dma_free_coherent);
+
+/*
+ * Initialise the consistent memory allocation.
+ */
+static int __init dma_alloc_init(void)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret = 0;
+
+	do {
+		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
+		pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE);
+		pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE);
+		if (!pmd) {
+			printk(KERN_ERR "%s: no pmd tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
+		if (!pte) {
+			printk(KERN_ERR "%s: no pte tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+
+		consistent_pte = pte;
+	} while (0);
+
+	return ret;
+}
+
+core_initcall(dma_alloc_init);
+
+/*
+ * make an area consistent.
+ */
+void __dma_sync(void *vaddr, size_t size, int direction)
+{
+	unsigned long start = (unsigned long)vaddr;
+	unsigned long end   = start + size;
+
+	switch (direction) {
+	case DMA_NONE:
+		BUG();
+	case DMA_FROM_DEVICE:
+		/*
+		 * invalidate only when cache-line aligned otherwise there is
+		 * the potential for discarding uncommitted data from the cache
+		 */
+		if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
+			flush_dcache_range(start, end);
+		else
+			invalidate_dcache_range(start, end);
+		break;
+	case DMA_TO_DEVICE:		/* writeback only */
+		clean_dcache_range(start, end);
+		break;
+	case DMA_BIDIRECTIONAL:	/* writeback and invalidate */
+		flush_dcache_range(start, end);
+		break;
+	}
+}
+EXPORT_SYMBOL(__dma_sync);
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * __dma_sync_page() implementation for systems using highmem.
+ * In this case, each page of a buffer must be kmapped/kunmapped
+ * in order to have a virtual address for __dma_sync(). This must
+ * not sleep so kmap_atomic()/kunmap_atomic() are used.
+ *
+ * Note: yes, it is possible and correct to have a buffer extend
+ * beyond the first page.
+ */
+static inline void __dma_sync_page_highmem(struct page *page,
+		unsigned long offset, size_t size, int direction)
+{
+	size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
+	size_t cur_size = seg_size;
+	unsigned long flags, start, seg_offset = offset;
+	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
+	int seg_nr = 0;
+
+	local_irq_save(flags);
+
+	do {
+		start = (unsigned long)kmap_atomic(page + seg_nr,
+				KM_PPC_SYNC_PAGE) + seg_offset;
+
+		/* Sync this buffer segment */
+		__dma_sync((void *)start, seg_size, direction);
+		kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE);
+		seg_nr++;
+
+		/* Calculate next buffer segment size */
+		seg_size = min((size_t)PAGE_SIZE, size - cur_size);
+
+		/* Add the segment size to our running total */
+		cur_size += seg_size;
+		seg_offset = 0;
+	} while (seg_nr < nr_segs);
+
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * __dma_sync_page makes memory consistent. identical to __dma_sync, but
+ * takes a struct page instead of a virtual address
+ */
+void __dma_sync_page(struct page *page, unsigned long offset,
+	size_t size, int direction)
+{
+#ifdef CONFIG_HIGHMEM
+	__dma_sync_page_highmem(page, offset, size, direction);
+#else
+	unsigned long start = (unsigned long)page_address(page) + offset;
+	__dma_sync((void *)start, size, direction);
+#endif
+}
+EXPORT_SYMBOL(__dma_sync_page);
-- 
cgit v0.10.2


From f637a49e507c88354ab32b5d914e06acfb7ee00d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 27 May 2009 13:44:50 +1000
Subject: powerpc: Minor cleanups of kernel virt address space definitions

Make FIXADDR_TOP a compile time constant and cleanup a
couple of definitions relative to the layout of the kernel
address space on ppc32. We also print out that layout at
boot time for debugging purposes.

This is a pre-requisite for properly fixing non-coherent
DMA allocactions.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index d60fd18..f1f4e23 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -14,8 +14,6 @@
 #ifndef _ASM_FIXMAP_H
 #define _ASM_FIXMAP_H
 
-extern unsigned long FIXADDR_TOP;
-
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
 #include <asm/page.h>
@@ -24,6 +22,8 @@ extern unsigned long FIXADDR_TOP;
 #include <asm/kmap_types.h>
 #endif
 
+#define FIXADDR_TOP	((unsigned long)(-PAGE_SIZE))
+
 /*
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index ba45c99..28fe9d4 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -10,7 +10,7 @@
 
 extern unsigned long va_to_phys(unsigned long address);
 extern pte_t *va_to_pte(unsigned long address);
-extern unsigned long ioremap_bot, ioremap_base;
+extern unsigned long ioremap_bot;
 
 #ifdef CONFIG_44x
 extern int icache_44x_need_flush;
@@ -56,8 +56,26 @@ extern int icache_44x_need_flush;
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
 /*
+ * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
+ * value (for now) on others, from where we can start layout kernel
+ * virtual space that goes below PKMAP and FIXMAP
+ */
+#ifdef CONFIG_HIGHMEM
+#define KVIRT_TOP	PKMAP_BASE
+#else
+#define KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ? */
+#endif
+
+/*
+ * ioremap_bot starts at that address. Early ioremaps move down from there,
+ * until mem_init() at which point this becomes the top of the vmalloc
+ * and ioremap space
+ */
+#define IOREMAP_TOP	KVIRT_TOP
+
+/*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 64MB value just means that there will be a 64MB "hole" after the
+ * current 16MB value just means that there will be a 64MB "hole" after the
  * physical memory until the kernel virtual memory starts.  That means that
  * any out-of-bounds memory accesses will hopefully be caught.
  * The vmalloc() routines leaves a hole of 4kB between each vmalloced
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 666a5e8..3de6a0d9 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -168,12 +168,8 @@ void __init MMU_init(void)
 		ppc_md.progress("MMU:mapin", 0x301);
 	mapin_ram();
 
-#ifdef CONFIG_HIGHMEM
-	ioremap_base = PKMAP_BASE;
-#else
-	ioremap_base = 0xfe000000UL;	/* for now, could be 0xfffff000 */
-#endif /* CONFIG_HIGHMEM */
-	ioremap_bot = ioremap_base;
+	/* Initialize early top-down ioremap allocator */
+	ioremap_bot = IOREMAP_TOP;
 
 	/* Map in I/O resources */
 	if (ppc_md.progress)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d0602a7..d3a4e67 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -380,6 +380,19 @@ void __init mem_init(void)
 		bsssize >> 10,
 		initsize >> 10);
 
+#ifdef CONFIG_PPC32
+	pr_info("Kernel virtual memory layout:\n");
+	pr_info("  * 0x%08lx..0x%08lx  : fixmap\n", FIXADDR_START, FIXADDR_TOP);
+#ifdef CONFIG_HIGHMEM
+	pr_info("  * 0x%08lx..0x%08lx  : highmem PTEs\n",
+		PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
+#endif /* CONFIG_HIGHMEM */
+	pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
+		ioremap_bot, IOREMAP_TOP);
+	pr_info("  * 0x%08lx..0x%08lx  : vmalloc & ioremap\n",
+		VMALLOC_START, VMALLOC_END);
+#endif /* CONFIG_PPC32 */
+
 	mem_init_done = 1;
 }
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 430d090..5422169 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -399,8 +399,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
 static int fixmaps;
-unsigned long FIXADDR_TOP = (-PAGE_SIZE);
-EXPORT_SYMBOL(FIXADDR_TOP);
 
 void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
 {
-- 
cgit v0.10.2


From 8b31e49d1d75729c1da9009664ba52abd1adc628 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 27 May 2009 13:50:33 +1000
Subject: powerpc: Fix up dma_alloc_coherent() on platforms without cache
 coherency.

The implementation we just revived has issues, such as using a
Kconfig-defined virtual address area in kernel space that nothing
actually carves out (and thus will overlap whatever is there),
or having some dependencies on being self contained in a single
PTE page which adds unnecessary constraints on the kernel virtual
address space.

This fixes it by using more classic PTE accessors and automatically
locating the area for consistent memory, carving an appropriate hole
in the kernel virtual address space, leaving only the size of that
area as a Kconfig option. It also brings some dma-mask related fixes
from the ARM implementation which was almost identical initially but
grew its own fixes.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3bb43ad..cdc9a6f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -868,19 +868,6 @@ config TASK_SIZE
 	default "0x80000000" if PPC_PREP || PPC_8xx
 	default "0xc0000000"
 
-config CONSISTENT_START_BOOL
-	bool "Set custom consistent memory pool address"
-	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
-	help
-	  This option allows you to set the base virtual address
-	  of the consistent memory pool.  This pool of virtual
-	  memory is used to make consistent memory allocations.
-
-config CONSISTENT_START
-	hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL
-	default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx)
-	default "0xff100000" if NOT_COHERENT_CACHE
-
 config CONSISTENT_SIZE_BOOL
 	bool "Set custom consistent memory pool size"
 	depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index c69f2b5..cb448d68 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -26,7 +26,9 @@
  * allocate the space "normally" and use the cache management functions
  * to ensure it is consistent.
  */
-extern void *__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp);
+struct device;
+extern void *__dma_alloc_coherent(struct device *dev, size_t size,
+				  dma_addr_t *handle, gfp_t gfp);
 extern void __dma_free_coherent(size_t size, void *vaddr);
 extern void __dma_sync(void *vaddr, size_t size, int direction);
 extern void __dma_sync_page(struct page *page, unsigned long offset,
@@ -37,7 +39,7 @@ extern void __dma_sync_page(struct page *page, unsigned long offset,
  * Cache coherent cores.
  */
 
-#define __dma_alloc_coherent(gfp, size, handle)	NULL
+#define __dma_alloc_coherent(dev, gfp, size, handle)	NULL
 #define __dma_free_coherent(size, addr)		((void)0)
 #define __dma_sync(addr, size, rw)		((void)0)
 #define __dma_sync_page(pg, off, sz, rw)	((void)0)
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 28fe9d4..c9ff9d7 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -71,7 +71,11 @@ extern int icache_44x_need_flush;
  * until mem_init() at which point this becomes the top of the vmalloc
  * and ioremap space
  */
+#ifdef CONFIG_NOT_COHERENT_CACHE
+#define IOREMAP_TOP	((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK)
+#else
 #define IOREMAP_TOP	KVIRT_TOP
+#endif
 
 /*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 53c7788..6b02793 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -32,7 +32,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 {
 	void *ret;
 #ifdef CONFIG_NOT_COHERENT_CACHE
-	ret = __dma_alloc_coherent(size, dma_handle, flag);
+	ret = __dma_alloc_coherent(dev, size, dma_handle, flag);
 	if (ret == NULL)
 		return NULL;
 	*dma_handle += get_dma_direct_offset(dev);
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index b7dc4c1..36692f5 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -32,20 +32,21 @@
 
 #include <asm/tlbflush.h>
 
+#include "mmu_decl.h"
+
 /*
  * This address range defaults to a value that is safe for all
  * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
  * can be further configured for specific applications under
  * the "Advanced Setup" menu. -Matt
  */
-#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
-#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
+#define CONSISTENT_BASE		(IOREMAP_TOP)
+#define CONSISTENT_END 		(CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
 #define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
 
 /*
  * This is the page table (2MB) covering uncached, DMA consistent allocations
  */
-static pte_t *consistent_pte;
 static DEFINE_SPINLOCK(consistent_lock);
 
 /*
@@ -148,22 +149,38 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi
  * virtual and bus address for that space.
  */
 void *
-__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
+__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
 {
 	struct page *page;
 	struct ppc_vm_region *c;
 	unsigned long order;
-	u64 mask = 0x00ffffff, limit; /* ISA default */
+	u64 mask = ISA_DMA_THRESHOLD, limit;
 
-	if (!consistent_pte) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
+	if (dev) {
+		mask = dev->coherent_dma_mask;
+
+		/*
+		 * Sanity check the DMA mask - it must be non-zero, and
+		 * must be able to be satisfied by a DMA allocation.
+		 */
+		if (mask == 0) {
+			dev_warn(dev, "coherent DMA mask is unset\n");
+			goto no_page;
+		}
+
+		if ((~mask) & ISA_DMA_THRESHOLD) {
+			dev_warn(dev, "coherent DMA mask %#llx is smaller "
+				 "than system GFP_DMA mask %#llx\n",
+				 mask, (unsigned long long)ISA_DMA_THRESHOLD);
+			goto no_page;
+		}
 	}
 
+
 	size = PAGE_ALIGN(size);
 	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+	if ((limit && size >= limit) ||
+	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
 		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
 		       size, mask);
 		return NULL;
@@ -171,6 +188,7 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 
 	order = get_order(size);
 
+	/* Might be useful if we ever have a real legacy DMA zone... */
 	if (mask != 0xffffffff)
 		gfp |= GFP_DMA;
 
@@ -195,7 +213,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
 	if (c) {
 		unsigned long vaddr = c->vm_start;
-		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
 		struct page *end = page + (1 << order);
 
 		split_page(page, order);
@@ -206,13 +223,10 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		*handle = page_to_phys(page);
 
 		do {
-			BUG_ON(!pte_none(*pte));
-
 			SetPageReserved(page);
-			set_pte_at(&init_mm, vaddr,
-				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
+			map_page(vaddr, page_to_phys(page),
+				 pgprot_noncached(PAGE_KERNEL));
 			page++;
-			pte++;
 			vaddr += PAGE_SIZE;
 		} while (size -= PAGE_SIZE);
 
@@ -241,8 +255,7 @@ void __dma_free_coherent(size_t size, void *vaddr)
 {
 	struct ppc_vm_region *c;
 	unsigned long flags, addr;
-	pte_t *ptep;
-
+	
 	size = PAGE_ALIGN(size);
 
 	spin_lock_irqsave(&consistent_lock, flags);
@@ -258,29 +271,26 @@ void __dma_free_coherent(size_t size, void *vaddr)
 		size = c->vm_end - c->vm_start;
 	}
 
-	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
 	addr = c->vm_start;
 	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
+		pte_t *ptep;
 		unsigned long pfn;
 
-		ptep++;
-		addr += PAGE_SIZE;
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
+		ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
+							       addr),
+						    addr),
+					 addr);
+		if (!pte_none(*ptep) && pte_present(*ptep)) {
+			pfn = pte_pfn(*ptep);
+			pte_clear(&init_mm, addr, ptep);
 			if (pfn_valid(pfn)) {
 				struct page *page = pfn_to_page(pfn);
-				ClearPageReserved(page);
 
+				ClearPageReserved(page);
 				__free_page(page);
-				continue;
 			}
 		}
-
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
+		addr += PAGE_SIZE;
 	} while (size -= PAGE_SIZE);
 
 	flush_tlb_kernel_range(c->vm_start, c->vm_end);
@@ -301,42 +311,6 @@ void __dma_free_coherent(size_t size, void *vaddr)
 EXPORT_SYMBOL(__dma_free_coherent);
 
 /*
- * Initialise the consistent memory allocation.
- */
-static int __init dma_alloc_init(void)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret = 0;
-
-	do {
-		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
-		pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE);
-		pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte = pte;
-	} while (0);
-
-	return ret;
-}
-
-core_initcall(dma_alloc_init);
-
-/*
  * make an area consistent.
  */
 void __dma_sync(void *vaddr, size_t size, int direction)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d3a4e67..579382c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -387,6 +387,10 @@ void __init mem_init(void)
 	pr_info("  * 0x%08lx..0x%08lx  : highmem PTEs\n",
 		PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
 #endif /* CONFIG_HIGHMEM */
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	pr_info("  * 0x%08lx..0x%08lx  : consistent mem\n",
+		IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
+#endif /* CONFIG_NOT_COHERENT_CACHE */
 	pr_info("  * 0x%08lx..0x%08lx  : early ioremap\n",
 		ioremap_bot, IOREMAP_TOP);
 	pr_info("  * 0x%08lx..0x%08lx  : vmalloc & ioremap\n",
-- 
cgit v0.10.2


From 4c713189485dbea875aecd1990daed74908e181d Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Wed, 27 May 2009 09:10:28 +0200
Subject: Bluetooth: Remove useless flush_work() causing lockdep warnings

The calls to flush_work() are pointless in a single thread workqueue
and they are actually causing a lockdep warning.

=============================================
[ INFO: possible recursive locking detected ]
2.6.30-rc6-02911-gbb803cf #16
---------------------------------------------
bluetooth/2518 is trying to acquire lock:
 (bluetooth){+.+.+.}, at: [<c0130c14>] flush_work+0x28/0xb0

but task is already holding lock:
 (bluetooth){+.+.+.}, at: [<c0130424>] worker_thread+0x149/0x25e

other info that might help us debug this:
2 locks held by bluetooth/2518:
 #0:  (bluetooth){+.+.+.}, at: [<c0130424>] worker_thread+0x149/0x25e
 #1:  (&conn->work_del){+.+...}, at: [<c0130424>] worker_thread+0x149/0x25e

stack backtrace:
Pid: 2518, comm: bluetooth Not tainted 2.6.30-rc6-02911-gbb803cf #16
Call Trace:
 [<c03d64d9>] ? printk+0xf/0x11
 [<c0140d96>] __lock_acquire+0x7ce/0xb1b
 [<c0141173>] lock_acquire+0x90/0xad
 [<c0130c14>] ? flush_work+0x28/0xb0
 [<c0130c2e>] flush_work+0x42/0xb0
 [<c0130c14>] ? flush_work+0x28/0xb0
 [<f8b84966>] del_conn+0x1c/0x84 [bluetooth]
 [<c0130469>] worker_thread+0x18e/0x25e
 [<c0130424>] ? worker_thread+0x149/0x25e
 [<f8b8494a>] ? del_conn+0x0/0x84 [bluetooth]
 [<c0133843>] ? autoremove_wake_function+0x0/0x33
 [<c01302db>] ? worker_thread+0x0/0x25e
 [<c013355a>] kthread+0x45/0x6b
 [<c0133515>] ? kthread+0x0/0x6b
 [<c01034a7>] kernel_thread_helper+0x7/0x10

Based on a report by Oliver Hartkopp <oliver@hartkopp.net>

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Tested-by: Oliver Hartkopp <oliver@hartkopp.net>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>

diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 4cc3624..95f7a7a 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -90,9 +90,6 @@ static void add_conn(struct work_struct *work)
 	struct hci_conn *conn = container_of(work, struct hci_conn, work_add);
 	struct hci_dev *hdev = conn->hdev;
 
-	/* ensure previous del is complete */
-	flush_work(&conn->work_del);
-
 	dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
 
 	if (device_add(&conn->dev) < 0) {
@@ -118,9 +115,6 @@ static void del_conn(struct work_struct *work)
 	struct hci_conn *conn = container_of(work, struct hci_conn, work_del);
 	struct hci_dev *hdev = conn->hdev;
 
-	/* ensure previous add is complete */
-	flush_work(&conn->work_add);
-
 	if (!device_is_registered(&conn->dev))
 		return;
 
-- 
cgit v0.10.2


From 23ac9cbed82b00ca3520bb81dbe9ea3b7a936a1b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:33:18 +0200
Subject: perf_counter tools: Rename output.perf to perf.data

output.perf is only output to perf-record - it's input to
perf-report. So change it to a more direction-neutral name.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index d07700e..353db1b 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -3,7 +3,7 @@ perf-record(1)
 
 NAME
 ----
-perf-record - Run a command and record its profile into output.perf
+perf-record - Run a command and record its profile into perf.data
 
 SYNOPSIS
 --------
@@ -13,7 +13,7 @@ SYNOPSIS
 DESCRIPTION
 -----------
 This command runs a command and gathers a performance counter profile
-from it, into output.perf - without displaying anything.
+from it, into perf.data - without displaying anything.
 
 This file can then be inspected later on, using 'perf report'.
 
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
index 64696a2..49efe16 100644
--- a/Documentation/perf_counter/Documentation/perf-report.txt
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -3,7 +3,7 @@ perf-report(1)
 
 NAME
 ----
-perf-report - Read output.perf (created by perf record) and display the profile
+perf-report - Read perf.data (created by perf record) and display the profile
 
 SYNOPSIS
 --------
@@ -19,7 +19,7 @@ OPTIONS
 -------
 -i::
 --input=::
-        Input file name. (default: output.perf)
+        Input file name. (default: perf.data)
 
 Configuration
 -------------
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 68abfdf..431077a 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -19,7 +19,7 @@ static int			nr_cpus				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			output;
-static const char		*output_name			= "output.perf";
+static const char		*output_name			= "perf.data";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7f1255d..e2712cd 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -18,7 +18,7 @@
 #define SHOW_USER	2
 #define SHOW_HV		4
 
-static char		const *input_name = "output.perf";
+static char		const *input_name = "perf.data";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
-- 
cgit v0.10.2


From a930d2c0d0a685ab955472b08baad041cc5edb4a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:50:13 +0200
Subject: perf_counter tools: Add built-in pager support

Add Git's pager.c (and sigchain) code. A command only
has to call setup_pager() to get paged interactive
output.

Non-interactive (redirected, command-piped, etc.) uses
are not affected.

Update perf-report to make use of this.

[ Impact: new feature ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index efb0589..51b13f9 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -297,11 +297,13 @@ LIB_H += util/util.h
 LIB_H += util/help.h
 LIB_H += util/strbuf.h
 LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
 LIB_OBJS += util/config.o
 LIB_OBJS += util/ctype.o
+LIB_OBJS += util/environment.o
 LIB_OBJS += util/exec_cmd.o
 LIB_OBJS += util/help.o
 LIB_OBJS += util/levenshtein.o
@@ -314,6 +316,8 @@ LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
+LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e2712cd..9aef7c5 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -7,6 +7,7 @@
 #include <ctype.h>
 
 #include "util/list.h"
+#include "util/cache.h"
 #include "util/rbtree.h"
 
 #include "perf.h"
@@ -992,5 +993,7 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	parse_options(argc, argv, options, report_usage, 0);
 
+	setup_pager();
+
 	return __cmd_report();
 }
diff --git a/Documentation/perf_counter/util/environment.c b/Documentation/perf_counter/util/environment.c
new file mode 100644
index 0000000..9b1c819
--- /dev/null
+++ b/Documentation/perf_counter/util/environment.c
@@ -0,0 +1,8 @@
+/*
+ * We put all the perf config variables in this same object
+ * file, so that programs can link against the config parser
+ * without having to link against all the rest of perf.
+ */
+#include "cache.h"
+
+const char *pager_program;
diff --git a/Documentation/perf_counter/util/pager.c b/Documentation/perf_counter/util/pager.c
new file mode 100644
index 0000000..a28bcca
--- /dev/null
+++ b/Documentation/perf_counter/util/pager.c
@@ -0,0 +1,99 @@
+#include "cache.h"
+#include "run-command.h"
+#include "sigchain.h"
+
+/*
+ * This is split up from the rest of git so that we can do
+ * something different on Windows.
+ */
+
+static int spawned_pager;
+
+#ifndef __MINGW32__
+static void pager_preexec(void)
+{
+	/*
+	 * Work around bug in "less" by not starting it until we
+	 * have real input
+	 */
+	fd_set in;
+
+	FD_ZERO(&in);
+	FD_SET(0, &in);
+	select(1, &in, NULL, &in, NULL);
+
+	setenv("LESS", "FRSX", 0);
+}
+#endif
+
+static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
+static struct child_process pager_process;
+
+static void wait_for_pager(void)
+{
+	fflush(stdout);
+	fflush(stderr);
+	/* signal EOF to pager */
+	close(1);
+	close(2);
+	finish_command(&pager_process);
+}
+
+static void wait_for_pager_signal(int signo)
+{
+	wait_for_pager();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
+void setup_pager(void)
+{
+	const char *pager = getenv("PERF_PAGER");
+
+	if (!isatty(1))
+		return;
+	if (!pager) {
+		if (!pager_program)
+			perf_config(perf_default_config, NULL);
+		pager = pager_program;
+	}
+	if (!pager)
+		pager = getenv("PAGER");
+	if (!pager)
+		pager = "less";
+	else if (!*pager || !strcmp(pager, "cat"))
+		return;
+
+	spawned_pager = 1; /* means we are emitting to terminal */
+
+	/* spawn the pager */
+	pager_argv[2] = pager;
+	pager_process.argv = pager_argv;
+	pager_process.in = -1;
+#ifndef __MINGW32__
+	pager_process.preexec_cb = pager_preexec;
+#endif
+	if (start_command(&pager_process))
+		return;
+
+	/* original process continues, but writes to the pipe */
+	dup2(pager_process.in, 1);
+	if (isatty(2))
+		dup2(pager_process.in, 2);
+	close(pager_process.in);
+
+	/* this makes sure that the parent terminates after the pager */
+	sigchain_push_common(wait_for_pager_signal);
+	atexit(wait_for_pager);
+}
+
+int pager_in_use(void)
+{
+	const char *env;
+
+	if (spawned_pager)
+		return 1;
+
+	env = getenv("PERF_PAGER_IN_USE");
+	return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
+}
diff --git a/Documentation/perf_counter/util/sigchain.c b/Documentation/perf_counter/util/sigchain.c
new file mode 100644
index 0000000..1118b99
--- /dev/null
+++ b/Documentation/perf_counter/util/sigchain.c
@@ -0,0 +1,52 @@
+#include "sigchain.h"
+#include "cache.h"
+
+#define SIGCHAIN_MAX_SIGNALS 32
+
+struct sigchain_signal {
+	sigchain_fun *old;
+	int n;
+	int alloc;
+};
+static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
+
+static void check_signum(int sig)
+{
+	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
+		die("BUG: signal out of range: %d", sig);
+}
+
+int sigchain_push(int sig, sigchain_fun f)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+
+	ALLOC_GROW(s->old, s->n + 1, s->alloc);
+	s->old[s->n] = signal(sig, f);
+	if (s->old[s->n] == SIG_ERR)
+		return -1;
+	s->n++;
+	return 0;
+}
+
+int sigchain_pop(int sig)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+	if (s->n < 1)
+		return 0;
+
+	if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
+		return -1;
+	s->n--;
+	return 0;
+}
+
+void sigchain_push_common(sigchain_fun f)
+{
+	sigchain_push(SIGINT, f);
+	sigchain_push(SIGHUP, f);
+	sigchain_push(SIGTERM, f);
+	sigchain_push(SIGQUIT, f);
+	sigchain_push(SIGPIPE, f);
+}
diff --git a/Documentation/perf_counter/util/sigchain.h b/Documentation/perf_counter/util/sigchain.h
new file mode 100644
index 0000000..618083bc
--- /dev/null
+++ b/Documentation/perf_counter/util/sigchain.h
@@ -0,0 +1,11 @@
+#ifndef SIGCHAIN_H
+#define SIGCHAIN_H
+
+typedef void (*sigchain_fun)(int);
+
+int sigchain_push(int sig, sigchain_fun f);
+int sigchain_pop(int sig);
+
+void sigchain_push_common(sigchain_fun f);
+
+#endif /* SIGCHAIN_H */
-- 
cgit v0.10.2


From 6af3fb72d2437239e5eb13a59e95dc43ccab3e8f Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 27 May 2009 10:49:26 +0200
Subject: ALSA: Fix invalid jiffies check after pause

The hw_ptr_jiffies has to be reset properly to avoid the invalid
check of jiffies delta in snd_pcm_update_hw_ptr*() functions.
Especailly this patch fixes the bogus jiffies check after the puase
and resume.

This patch is a modified version of the original patch by Jaroslav.

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index a2a792c..3eea98a 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1478,7 +1478,6 @@ static int snd_pcm_lib_ioctl_reset(struct snd_pcm_substream *substream,
 		runtime->status->hw_ptr %= runtime->buffer_size;
 	else
 		runtime->status->hw_ptr = 0;
-	runtime->hw_ptr_jiffies = jiffies;
 	snd_pcm_stream_unlock_irqrestore(substream, flags);
 	return 0;
 }
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index fc6f98e..b5da656 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -848,6 +848,7 @@ static void snd_pcm_post_start(struct snd_pcm_substream *substream, int state)
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	snd_pcm_trigger_tstamp(substream);
+	runtime->hw_ptr_jiffies = jiffies;
 	runtime->status->state = state;
 	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
 	    runtime->silence_size > 0)
@@ -961,6 +962,11 @@ static int snd_pcm_do_pause(struct snd_pcm_substream *substream, int push)
 {
 	if (substream->runtime->trigger_master != substream)
 		return 0;
+	/* The jiffies check in snd_pcm_update_hw_ptr*() is done by
+	 * a delta betwen the current jiffies, this gives a large enough
+	 * delta, effectively to skip the check once.
+	 */
+	substream->runtime->hw_ptr_jiffies = jiffies - HZ * 1000;
 	return substream->ops->trigger(substream,
 				       push ? SNDRV_PCM_TRIGGER_PAUSE_PUSH :
 					      SNDRV_PCM_TRIGGER_PAUSE_RELEASE);
-- 
cgit v0.10.2


From 9d911d7903926a65ef49ec671bacd86bcee5eb51 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Thu, 21 May 2009 16:21:15 -0600
Subject: PCI Hotplug: acpiphp: don't store a pci_dev in acpiphp_func

An oops can occur if a user attempts to use both PCI logical
hotplug and the ACPI physical hotplug driver (acpiphp) in this
sequence, where $slot/address == $device.

In other words, if acpiphp has claimed a PCI device, and that
device is logically removed, then acpiphp may oops when it
attempts to access it again.

	# echo 1 > /sys/bus/pci/devices/$device/remove
	# echo 0 > /sys/bus/pci/slots/$slot/power

Unable to handle kernel NULL pointer dereference (address 0000000000000000)
Call Trace:
 [<a000000100016390>] show_stack+0x50/0xa0
 [<a000000100016c60>] show_regs+0x820/0x860
 [<a00000010003b390>] die+0x190/0x2a0
 [<a000000100066a40>] ia64_do_page_fault+0x8e0/0xa40
 [<a00000010000c7a0>] ia64_native_leave_kernel+0x0/0x270
 [<a0000001003b2660>] pci_remove_bus_device+0x120/0x260
 [<a0000002060549f0>] acpiphp_disable_slot+0x410/0x540 [acpiphp]
 [<a0000002060505c0>] disable_slot+0xc0/0x120 [acpiphp]
 [<a0000002040d21c0>] power_write_file+0x1e0/0x2a0 [pci_hotplug]
 [<a0000001003bb820>] pci_slot_attr_store+0x60/0xa0
 [<a000000100240f70>] sysfs_write_file+0x230/0x2c0
 [<a000000100195750>] vfs_write+0x190/0x2e0
 [<a0000001001961a0>] sys_write+0x80/0x100
 [<a00000010000c600>] ia64_ret_from_syscall+0x0/0x20
 [<a000000000010720>] __kernel_syscall_via_break+0x0/0x20

The root cause of this oops is that the logical remove ("echo 1 >
/sys/bus/pci/devices/$device/remove") destroyed the pci_dev. The
pci_dev struct itself wasn't deallocated because acpiphp kept a
reference, but some of its fields became invalid.

acpiphp doesn't have any real reason to keep a pointer to a
pci_dev around. It can always derive it using pci_get_slot().

If a logical remove destroys the pci_dev, acpiphp won't find it
and is thus prevented from causing mischief.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Tested-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h
index 4fc168b..e68d5f2 100644
--- a/drivers/pci/hotplug/acpiphp.h
+++ b/drivers/pci/hotplug/acpiphp.h
@@ -129,7 +129,6 @@ struct acpiphp_func {
 	struct acpiphp_bridge *bridge;	/* Ejectable PCI-to-PCI bridge */
 
 	struct list_head sibling;
-	struct pci_dev *pci_dev;
 	struct notifier_block nb;
 	acpi_handle	handle;
 
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index a33794d..3a6064bc 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -32,9 +32,6 @@
 
 /*
  * Lifetime rules for pci_dev:
- *  - The one in acpiphp_func has its refcount elevated by pci_get_slot()
- *    when the driver is loaded or when an insertion event occurs.  It loses
- *    a refcount when its ejected or the driver unloads.
  *  - The one in acpiphp_bridge has its refcount elevated by pci_get_slot()
  *    when the bridge is scanned and it loses a refcount when the bridge
  *    is removed.
@@ -130,6 +127,7 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv)
 	unsigned long long adr, sun;
 	int device, function, retval;
 	struct pci_bus *pbus = bridge->pci_bus;
+	struct pci_dev *pdev;
 
 	if (!acpi_pci_check_ejectable(pbus, handle) && !is_dock_device(handle))
 		return AE_OK;
@@ -213,10 +211,10 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv)
 	newfunc->slot = slot;
 	list_add_tail(&newfunc->sibling, &slot->funcs);
 
-	/* associate corresponding pci_dev */
-	newfunc->pci_dev = pci_get_slot(pbus, PCI_DEVFN(device, function));
-	if (newfunc->pci_dev) {
+	pdev = pci_get_slot(pbus, PCI_DEVFN(device, function));
+	if (pdev) {
 		slot->flags |= (SLOT_ENABLED | SLOT_POWEREDON);
+		pci_dev_put(pdev);
 	}
 
 	if (is_dock_device(handle)) {
@@ -617,7 +615,6 @@ static void cleanup_bridge(struct acpiphp_bridge *bridge)
 				if (ACPI_FAILURE(status))
 					err("failed to remove notify handler\n");
 			}
-			pci_dev_put(func->pci_dev);
 			list_del(list);
 			kfree(func);
 		}
@@ -1101,22 +1098,24 @@ static int __ref enable_device(struct acpiphp_slot *slot)
 	pci_enable_bridges(bus);
 	pci_bus_add_devices(bus);
 
-	/* associate pci_dev to our representation */
 	list_for_each (l, &slot->funcs) {
 		func = list_entry(l, struct acpiphp_func, sibling);
-		func->pci_dev = pci_get_slot(bus, PCI_DEVFN(slot->device,
-							func->function));
-		if (!func->pci_dev)
+		dev = pci_get_slot(bus, PCI_DEVFN(slot->device,
+						  func->function));
+		if (!dev)
 			continue;
 
-		if (func->pci_dev->hdr_type != PCI_HEADER_TYPE_BRIDGE &&
-		    func->pci_dev->hdr_type != PCI_HEADER_TYPE_CARDBUS)
+		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE &&
+		    dev->hdr_type != PCI_HEADER_TYPE_CARDBUS) {
+			pci_dev_put(dev);
 			continue;
+		}
 
 		status = find_p2p_bridge(func->handle, (u32)1, bus, NULL);
 		if (ACPI_FAILURE(status))
 			warn("find_p2p_bridge failed (error code = 0x%x)\n",
 				status);
+		pci_dev_put(dev);
 	}
 
 	slot->flags |= SLOT_ENABLED;
@@ -1142,17 +1141,14 @@ static void disable_bridges(struct pci_bus *bus)
  */
 static int disable_device(struct acpiphp_slot *slot)
 {
-	int retval = 0;
 	struct acpiphp_func *func;
-	struct list_head *l;
+	struct pci_dev *pdev;
 
 	/* is this slot already disabled? */
 	if (!(slot->flags & SLOT_ENABLED))
 		goto err_exit;
 
-	list_for_each (l, &slot->funcs) {
-		func = list_entry(l, struct acpiphp_func, sibling);
-
+	list_for_each_entry(func, &slot->funcs, sibling) {
 		if (func->bridge) {
 			/* cleanup p2p bridges under this P2P bridge */
 			cleanup_p2p_bridge(func->bridge->handle,
@@ -1160,35 +1156,28 @@ static int disable_device(struct acpiphp_slot *slot)
 			func->bridge = NULL;
 		}
 
-		if (func->pci_dev) {
-			pci_stop_bus_device(func->pci_dev);
-			if (func->pci_dev->subordinate) {
-				disable_bridges(func->pci_dev->subordinate);
-				pci_disable_device(func->pci_dev);
+		pdev = pci_get_slot(slot->bridge->pci_bus,
+				    PCI_DEVFN(slot->device, func->function));
+		if (pdev) {
+			pci_stop_bus_device(pdev);
+			if (pdev->subordinate) {
+				disable_bridges(pdev->subordinate);
+				pci_disable_device(pdev);
 			}
+			pci_remove_bus_device(pdev);
+			pci_dev_put(pdev);
 		}
 	}
 
-	list_for_each (l, &slot->funcs) {
-		func = list_entry(l, struct acpiphp_func, sibling);
-
+	list_for_each_entry(func, &slot->funcs, sibling) {
 		acpiphp_unconfigure_ioapics(func->handle);
 		acpiphp_bus_trim(func->handle);
-		/* try to remove anyway.
-		 * acpiphp_bus_add might have been failed */
-
-		if (!func->pci_dev)
-			continue;
-
-		pci_remove_bus_device(func->pci_dev);
-		pci_dev_put(func->pci_dev);
-		func->pci_dev = NULL;
 	}
 
 	slot->flags &= (~SLOT_ENABLED);
 
- err_exit:
-	return retval;
+err_exit:
+	return 0;
 }
 
 
-- 
cgit v0.10.2


From c87d9732004b3f8fd82d729f12ccfb96c0df279e Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 27 May 2009 10:53:33 +0200
Subject: ALSA: Enable PCM hw_ptr_jiffies check only in xrun_debug mode

The PCM hw_ptr jiffies check results sometimes in problems when a
hardware doesn't give smooth hw_ptr updates.  So far, au88x0 and some
other drivers appear not working due to this strict check.
However, this check is a nice debug tool, and the capability should be
still kept.

Hence, we disable this check now as default unless the user enables it
by setting the xrun_debug mode to the specific stream via a proc file.

Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index bba2dbb..cfac20c 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -104,6 +104,11 @@ card*/pcm*/xrun_debug
 	When this value is greater than 1, the driver will show the
 	stack trace additionally.  This may help the debugging.
 
+	Since 2.6.30, this option also enables the hwptr check using
+	jiffies.  This detects spontaneous invalid pointer callback
+	values, but can be lead to too much corrections for a (mostly
+	buggy) hardware that doesn't give smooth pointer updates.
+
 card*/pcm*/sub*/info
 	The general information of this PCM sub-stream.
 
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 3eea98a..d659995 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -249,6 +249,11 @@ static int snd_pcm_update_hw_ptr_interrupt(struct snd_pcm_substream *substream)
 			new_hw_ptr = hw_base + pos;
 		}
 	}
+
+	/* Do jiffies check only in xrun_debug mode */
+	if (!xrun_debug(substream))
+		goto no_jiffies_check;
+
 	/* Skip the jiffies check for hardwares with BATCH flag.
 	 * Such hardware usually just increases the position at each IRQ,
 	 * thus it can't give any strange position.
@@ -336,7 +341,9 @@ int snd_pcm_update_hw_ptr(struct snd_pcm_substream *substream)
 			hw_base = 0;
 		new_hw_ptr = hw_base + pos;
 	}
-	if (((delta * HZ) / runtime->rate) > jdelta + HZ/100) {
+	/* Do jiffies check only in xrun_debug mode */
+	if (xrun_debug(substream) &&
+	    ((delta * HZ) / runtime->rate) > jdelta + HZ/100) {
 		hw_ptr_error(substream,
 			     "hw_ptr skipping! "
 			     "(pos=%ld, delta=%ld, period=%ld, jdelta=%lu/%lu)\n",
-- 
cgit v0.10.2


From 55de5ef970c680d8d75f2a9aa7e4f172140dbd9c Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Wed, 27 May 2009 10:49:30 +0200
Subject: sound: usb-audio: make the MotU Fastlane work again

Kernel 2.6.18 broke the MotU Fastlane, which uses duplicate endpoint
numbers in a manner that is not only illegal but also confuses the
kernel's endpoint descriptor caching mechanism.  To work around this, we
have to add a separate usb_set_interface() call to guide the USB core to
the correct descriptors.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Reported-and-tested-by: David Fries <david@fries.net>
Cc: <stable@kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c
index 823296d..a6b8848 100644
--- a/sound/usb/usbaudio.c
+++ b/sound/usb/usbaudio.c
@@ -3347,7 +3347,7 @@ static int snd_usb_create_quirk(struct snd_usb_audio *chip,
 		[QUIRK_MIDI_YAMAHA] = snd_usb_create_midi_interface,
 		[QUIRK_MIDI_MIDIMAN] = snd_usb_create_midi_interface,
 		[QUIRK_MIDI_NOVATION] = snd_usb_create_midi_interface,
-		[QUIRK_MIDI_RAW] = snd_usb_create_midi_interface,
+		[QUIRK_MIDI_FASTLANE] = snd_usb_create_midi_interface,
 		[QUIRK_MIDI_EMAGIC] = snd_usb_create_midi_interface,
 		[QUIRK_MIDI_CME] = snd_usb_create_midi_interface,
 		[QUIRK_AUDIO_STANDARD_INTERFACE] = create_standard_audio_quirk,
diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h
index 36e4f7a2..8e7f789 100644
--- a/sound/usb/usbaudio.h
+++ b/sound/usb/usbaudio.h
@@ -153,7 +153,7 @@ enum quirk_type {
 	QUIRK_MIDI_YAMAHA,
 	QUIRK_MIDI_MIDIMAN,
 	QUIRK_MIDI_NOVATION,
-	QUIRK_MIDI_RAW,
+	QUIRK_MIDI_FASTLANE,
 	QUIRK_MIDI_EMAGIC,
 	QUIRK_MIDI_CME,
 	QUIRK_MIDI_US122L,
diff --git a/sound/usb/usbmidi.c b/sound/usb/usbmidi.c
index 26bad37..2fb35cc 100644
--- a/sound/usb/usbmidi.c
+++ b/sound/usb/usbmidi.c
@@ -1778,8 +1778,18 @@ int snd_usb_create_midi_interface(struct snd_usb_audio* chip,
 		umidi->usb_protocol_ops = &snd_usbmidi_novation_ops;
 		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
 		break;
-	case QUIRK_MIDI_RAW:
+	case QUIRK_MIDI_FASTLANE:
 		umidi->usb_protocol_ops = &snd_usbmidi_raw_ops;
+		/*
+		 * Interface 1 contains isochronous endpoints, but with the same
+		 * numbers as in interface 0.  Since it is interface 1 that the
+		 * USB core has most recently seen, these descriptors are now
+		 * associated with the endpoint numbers.  This will foul up our
+		 * attempts to submit bulk/interrupt URBs to the endpoints in
+		 * interface 0, so we have to make sure that the USB core looks
+		 * again at interface 0 by calling usb_set_interface() on it.
+		 */
+		usb_set_interface(umidi->chip->dev, 0, 0);
 		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
 		break;
 	case QUIRK_MIDI_EMAGIC:
diff --git a/sound/usb/usbquirks.h b/sound/usb/usbquirks.h
index 647ef50..5d955aa 100644
--- a/sound/usb/usbquirks.h
+++ b/sound/usb/usbquirks.h
@@ -1868,7 +1868,7 @@ YAMAHA_DEVICE(0x7010, "UB99"),
 		.data = & (const struct snd_usb_audio_quirk[]) {
 			{
 				.ifnum = 0,
-				.type = QUIRK_MIDI_RAW
+				.type = QUIRK_MIDI_FASTLANE
 			},
 			{
 				.ifnum = 1,
-- 
cgit v0.10.2


From ef65b2a0b3a2f82850144df6e6a7796f6d66da6b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 27 May 2009 10:10:51 +0200
Subject: perf record: Fix the profiling of existing pid or whole box

Perf record bails if no command argument is provided, so you can't use
naked -a or -p to profile a running task or the whole box.

Allow foreground profiling of an existing pid or the entire system.

[ Impact: fix command option handling bug ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 431077a..4a06866 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -354,7 +354,7 @@ static int __cmd_record(int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
-	if (target_pid == -1) {
+	if (target_pid == -1 && argc) {
 		pid = fork();
 		if (pid < 0)
 			perror("failed to fork");
@@ -430,7 +430,7 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 	create_events_help(events_help_msg);
 
 	argc = parse_options(argc, argv, options, record_usage, 0);
-	if (!argc)
+	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
 
 	if (!nr_counters) {
-- 
cgit v0.10.2


From d716fba49c7445ec87c3f045c59624fac03ee3f2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 13:19:59 +0200
Subject: perf report: Remove <ctype.h> include

Pekka reported build failure in builtin-report.c:

    CC builtin-report.o
    In file included from builtin-report.c:7:
    /usr/include/ctype.h:102: error: expected expression before token

And observed:

| Removing #include <ctype.h> from builtin-report.c makes the problem
| go away. I am running Ubuntu 9.04 that has gcc 4.3.3 and libc 2.9.

Reported-by: Pekka J Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9aef7c5..6265bed 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -4,7 +4,6 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
-#include <ctype.h>
 
 #include "util/list.h"
 #include "util/cache.h"
-- 
cgit v0.10.2


From ed37d83e6aa218192fb28bb6b82498d2a8c74070 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 May 2009 21:39:05 +1000
Subject: md: raid5: change incorrect usage of 'min' macro to 'min_t'

A recent patch to raid5.c use min on an int and a sector_t.
This isn't allowed.
So change it to min_t(sector_t,x,y).

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3c3626d..5d400ae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
 	if (mddev->delta_disks < 0) {
-		writepos -= min(reshape_sectors, writepos);
+		writepos -= min_t(sector_t, reshape_sectors, writepos);
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
 	} else {
 		writepos += reshape_sectors;
-		readpos -= min(reshape_sectors, readpos);
-		safepos -= min(reshape_sectors, safepos);
+		readpos -= min_t(sector_t, reshape_sectors, readpos);
+		safepos -= min_t(sector_t, reshape_sectors, safepos);
 	}
 
 	/* 'writepos' is the most advanced device address we might write.
-- 
cgit v0.10.2


From ba396a6c104682dfe5c8b4fbbf5974d5ac9f3687 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 27 May 2009 14:17:08 +0200
Subject: block: fix oops with block tag queueing

commit e8939a50466fd963eb1ba9118c34b9ffb7ff6aa6
Author: Tejun Heo <tj@kernel.org>
Date:   Fri May 8 11:54:16 2009 +0900

    block: implement and enforce request peek/start/fetch

Added a BUG_ON(blk_queued_rq(req)) to the top of blk_finish_req().
Unfortunately, this checks whether req->queuelist is empty.  This list
is doing double duty both as the queue list and the tag list, so tagged
requests come in here with this not empty and boom (the tag list is
emptied by blk_queue_end_tag() lower down).

Fix this by moving the BUG_ON to below the end tag we also seem
vulnerable to this in blk_requeue_request() as well.  I think all uses
of blk_queued_rq() need auditing because the check is clearly wrong in
the tagged case.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 7a4c401..8b3b74e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -956,8 +956,6 @@ EXPORT_SYMBOL(blk_make_request);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	BUG_ON(blk_queued_rq(rq));
-
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
@@ -965,6 +963,8 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
+	BUG_ON(blk_queued_rq(rq));
+
 	elv_requeue_request(q, rq);
 }
 EXPORT_SYMBOL(blk_requeue_request);
@@ -2042,11 +2042,11 @@ static bool blk_update_bidi_request(struct request *rq, int error,
  */
 static void blk_finish_request(struct request *req, int error)
 {
-	BUG_ON(blk_queued_rq(req));
-
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
+	BUG_ON(blk_queued_rq(req));
+
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
 
-- 
cgit v0.10.2


From 3c4198e874cde694f5ea1463706873e7907bdb18 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 27 May 2009 14:50:02 +0200
Subject: block: fix no diskstat problem

The commit below in 2.6-block/for-2.6.31 causes no diskstat problem
because the blk_discard_rq() check was added with '&&'.
It should be 'blk_fs_request() || blk_discard_rq()'.
This patch does it and fixes the no diskstat problem.
Please review and apply.

------ /proc/diskstat without this patch -------------------------------------
   8       0 sda 0 0 0 0 0 0 0 0 0 0 0
------------------------------------------------------------------------------

----- /proc/diskstat with this patch applied ---------------------------------
   8       0 sda 4186 303 373621 61600 9578 3859 107468 169479 2 89755 231059
------------------------------------------------------------------------------

--------------------------------------------------------------------------
commit c69d48540c201394d08cb4d48b905e001313d9b8
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 24 08:12:19 2009 +0200

    block: include discard requests in IO accounting

    We currently don't do merging on discard requests, but we potentially
    could. If we do, then we need to include discard requests in the IO
    accounting, or merging would end up decrementing in_flight IO counters
    for an IO which never incremented them.

    So enable accounting for discard requests.

<snip>

 static inline int blk_do_io_stat(struct request *rq)
 {
-       return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
+       return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
+               blk_discard_rq(rq);
 }
--------------------------------------------------------------------------

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk.h b/block/blk.h
index c863ec2..3fae6ad 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -156,12 +156,12 @@ static inline int blk_cpu_to_group(int cpu)
  *
  *	a) it's attached to a gendisk, and
  *	b) the queue had IO stats enabled when this request was started, and
- *	c) it's a file system request
+ *	c) it's a file system request or a discard request
  */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
-		blk_discard_rq(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) &&
+	       (blk_fs_request(rq) || blk_discard_rq(rq));
 }
 
 #endif
-- 
cgit v0.10.2


From b7a16eac5e679fb5f531b9eeff7db7952303e77d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 13:35:35 +0200
Subject: perf_counter: tools: /usr/lib/debug%s.debug support

Some distros seem to store debuginfo in weird places.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6265bed..a9ff49a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -190,7 +190,8 @@ static inline int elf_sym__is_function(const GElf_Sym *sym)
 {
 	return elf_sym__type(sym) == STT_FUNC &&
 	       sym->st_name != 0 &&
-	       sym->st_shndx != SHN_UNDEF;
+	       sym->st_shndx != SHN_UNDEF &&
+	       sym->st_size != 0;
 }
 
 static inline const char *elf_sym__name(const GElf_Sym *sym,
@@ -222,11 +223,11 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
-static int dso__load(struct dso *self)
+static int dso__load_sym(struct dso *self, int fd, char *name)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
-	int fd, err = -1;
+	int err = -1;
 	uint32_t index;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr;
@@ -234,16 +235,12 @@ static int dso__load(struct dso *self)
 	GElf_Sym sym;
 	Elf_Scn *sec;
 	Elf *elf;
-
-
-	fd = open(self->name, O_RDONLY);
-	if (fd == -1)
-		return -1;
+	int nr = 0;
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
 		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, self->name);
+			__func__, name);
 		goto out_close;
 	}
 
@@ -292,16 +289,63 @@ static int dso__load(struct dso *self)
 			goto out_elf_end;
 
 		dso__insert_symbol(self, f);
+
+		nr++;
 	}
 
-	err = 0;
+	err = nr;
 out_elf_end:
 	elf_end(elf);
 out_close:
-	close(fd);
 	return err;
 }
 
+static int dso__load(struct dso *self)
+{
+	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+	char *name = malloc(size);
+	int variant = 0;
+	int ret = -1;
+	int fd;
+
+	if (!name)
+		return -1;
+
+more:
+	do {
+		switch (variant) {
+		case 0: /* Fedora */
+			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+			break;
+		case 1: /* Ubuntu */
+			snprintf(name, size, "/usr/lib/debug%s", self->name);
+			break;
+		case 2: /* Sane people */
+			snprintf(name, size, "%s", self->name);
+			break;
+
+		default:
+			goto out;
+		}
+		variant++;
+
+		fd = open(name, O_RDONLY);
+	} while (fd < 0);
+
+	ret = dso__load_sym(self, fd, name);
+	close(fd);
+
+	/*
+	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+	 */
+	if (!ret)
+		goto more;
+
+out:
+	free(name);
+	return ret;
+}
+
 static size_t dso__fprintf(struct dso *self, FILE *fp)
 {
 	size_t ret = fprintf(fp, "dso: %s\n", self->name);
@@ -336,11 +380,23 @@ static struct dso *dsos__find(const char *name)
 static struct dso *dsos__findnew(const char *name)
 {
 	struct dso *dso = dsos__find(name);
+	int nr;
 
 	if (dso == NULL) {
 		dso = dso__new(name);
-		if (dso != NULL && dso__load(dso) < 0)
+		if (!dso)
+			goto out_delete_dso;
+
+		nr = dso__load(dso);
+		if (nr < 0) {
+			fprintf(stderr, "Failed to open: %s\n", name);
 			goto out_delete_dso;
+		}
+		if (!nr) {
+			fprintf(stderr,
+		"Failed to find debug symbols for: %s, maybe install a debug package?\n",
+					name);
+		}
 
 		dsos__add(dso);
 	}
@@ -547,9 +603,9 @@ symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 	size_t ret;
 
 	if (total_samples)
-		ret = fprintf(fp, "%5.2f", (self->count * 100.0) / total_samples);
+		ret = fprintf(fp, "%5.2f%% ", (self->count * 100.0) / total_samples);
 	else
-		ret = fprintf(fp, "%12d", self->count);
+		ret = fprintf(fp, "%12d ", self->count);
 
 	ret += fprintf(fp, "%14s [%c] ",
 		       thread__name(self->thread, bf, sizeof(bf)),
@@ -922,10 +978,12 @@ more:
 	}
 	default: {
 broken_event:
-		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.type);
+		if (dump_trace)
+			fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
+					(void *)(offset + head),
+					(void *)(long)(event->header.size),
+					event->header.type);
+
 		total_unknown++;
 
 		/*
-- 
cgit v0.10.2


From 346a850e3c3a20159cef2b79235e6d34aa497c65 Mon Sep 17 00:00:00 2001
From: Manuel Traut <manut@linutronix.de>
Date: Wed, 27 May 2009 06:20:05 -0700
Subject: Input: usb1400_ts - fix access to "device data" in resume function

platform_data != driver_data

driver data is actually the "correct" place of the struct however it is
not placed there due to the need of the ac97 struct. This is broken since
d9105c2b01 aka "[ARM] 5184/1: Split ucb1400_ts into core and touchscreen"

Signed-off-by: Manuel Traut <manut@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>

diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c
index f100c7f..6954f55 100644
--- a/drivers/input/touchscreen/ucb1400_ts.c
+++ b/drivers/input/touchscreen/ucb1400_ts.c
@@ -419,7 +419,7 @@ static int ucb1400_ts_remove(struct platform_device *dev)
 #ifdef CONFIG_PM
 static int ucb1400_ts_resume(struct platform_device *dev)
 {
-	struct ucb1400_ts *ucb = platform_get_drvdata(dev);
+	struct ucb1400_ts *ucb = dev->dev.platform_data;
 
 	if (ucb->ts_task) {
 		/*
-- 
cgit v0.10.2


From 683a04cebc63819a36b1db19843bd17771f05b55 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@comx.dk>
Date: Wed, 27 May 2009 15:45:34 +0200
Subject: netfilter: xt_hashlimit does a wrong SEQ_SKIP

The function dl_seq_show() returns 1 (equal to SEQ_SKIP) in case
a seq_printf() call return -1.  It should return -1.

This SEQ_SKIP behavior brakes processing the proc file e.g. via a
pipe or just through less.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index a5b5369..219dcdb 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -926,7 +926,7 @@ static int dl_seq_show(struct seq_file *s, void *v)
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
 			if (dl_seq_real_show(ent, htable->family, s))
-				return 1;
+				return -1;
 	}
 	return 0;
 }
-- 
cgit v0.10.2


From eeff9beec3d2563c42cca41e66d4169592bb5475 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 27 May 2009 15:49:11 +0200
Subject: netfilter: nfnetlink_log: fix wrong skbuff size	calculation

This problem was introduced in 72961ecf84d67d6359a1b30f9b2a8427f13e1e71
since no space was reserved for the new attributes NFULA_HWTYPE,
NFULA_HWLEN and NFULA_HWHEADER.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fd326ac..66a6dd5 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -581,6 +581,12 @@ nfulnl_log_packet(u_int8_t pf,
 		+ nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
 		+ nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp));
 
+	if (in && skb_mac_header_was_set(skb)) {
+		size +=   nla_total_size(skb->dev->hard_header_len)
+			+ nla_total_size(sizeof(u_int16_t))	/* hwtype */
+			+ nla_total_size(sizeof(u_int16_t));	/* hwlen */
+	}
+
 	spin_lock_bh(&inst->lock);
 
 	if (inst->flags & NFULNL_CFG_F_SEQ)
-- 
cgit v0.10.2


From 348ca1029e8bae6e0c49097ad25439b17c5326f4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 May 2009 15:46:50 +0100
Subject: FS-Cache: Fixup renamed filenames in comments in internal.h

Fix up renamed filenames in comments in fs/fscache/internal.h.

Originally, the files were all called fsc-xxx.c, but they got renamed to
just xxx.c.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index e0cbd16..1c34130 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -28,7 +28,7 @@
 #define FSCACHE_MAX_THREADS	32
 
 /*
- * fsc-cache.c
+ * cache.c
  */
 extern struct list_head fscache_cache_list;
 extern struct rw_semaphore fscache_addremove_sem;
@@ -37,7 +37,7 @@ extern struct fscache_cache *fscache_select_cache_for_object(
 	struct fscache_cookie *);
 
 /*
- * fsc-cookie.c
+ * cookie.c
  */
 extern struct kmem_cache *fscache_cookie_jar;
 
@@ -45,13 +45,13 @@ extern void fscache_cookie_init_once(void *);
 extern void __fscache_cookie_put(struct fscache_cookie *);
 
 /*
- * fsc-fsdef.c
+ * fsdef.c
  */
 extern struct fscache_cookie fscache_fsdef_index;
 extern struct fscache_cookie_def fscache_fsdef_netfs_def;
 
 /*
- * fsc-histogram.c
+ * histogram.c
  */
 #ifdef CONFIG_FSCACHE_HISTOGRAM
 extern atomic_t fscache_obj_instantiate_histogram[HZ];
@@ -75,7 +75,7 @@ extern const struct file_operations fscache_histogram_fops;
 #endif
 
 /*
- * fsc-main.c
+ * main.c
  */
 extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
@@ -86,14 +86,14 @@ extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
 
 /*
- * fsc-object.c
+ * object.c
  */
 extern void fscache_withdrawing_object(struct fscache_cache *,
 				       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 
 /*
- * fsc-operation.c
+ * operation.c
  */
 extern int fscache_submit_exclusive_op(struct fscache_object *,
 				       struct fscache_operation *);
@@ -104,7 +104,7 @@ extern void fscache_start_operations(struct fscache_object *);
 extern void fscache_operation_gc(struct work_struct *);
 
 /*
- * fsc-proc.c
+ * proc.c
  */
 #ifdef CONFIG_PROC_FS
 extern int __init fscache_proc_init(void);
@@ -115,7 +115,7 @@ extern void fscache_proc_cleanup(void);
 #endif
 
 /*
- * fsc-stats.c
+ * stats.c
  */
 #ifdef CONFIG_FSCACHE_STATS
 extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
-- 
cgit v0.10.2


From 911e690e70540f009125bacd16c017eb1a7b1916 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 May 2009 15:46:55 +0100
Subject: CacheFiles: Fixup renamed filenames in comments in internal.h

Fix up renamed filenames in comments in fs/cachefiles/internal.h.

Originally, the files were all called cf-xxx.c, but they got renamed to
just xxx.c.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 19218e1..f7c255f 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -122,13 +122,13 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
 }
 
 /*
- * cf-bind.c
+ * bind.c
  */
 extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
 extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
 
 /*
- * cf-daemon.c
+ * daemon.c
  */
 extern const struct file_operations cachefiles_daemon_fops;
 
@@ -136,17 +136,17 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
 				unsigned fnr, unsigned bnr);
 
 /*
- * cf-interface.c
+ * interface.c
  */
 extern const struct fscache_cache_ops cachefiles_cache_ops;
 
 /*
- * cf-key.c
+ * key.c
  */
 extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
 
 /*
- * cf-namei.c
+ * namei.c
  */
 extern int cachefiles_delete_object(struct cachefiles_cache *cache,
 				    struct cachefiles_object *object);
@@ -165,7 +165,7 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
 				   struct dentry *dir, char *filename);
 
 /*
- * cf-proc.c
+ * proc.c
  */
 #ifdef CONFIG_CACHEFILES_HISTOGRAM
 extern atomic_t cachefiles_lookup_histogram[HZ];
@@ -190,7 +190,7 @@ void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
 #endif
 
 /*
- * cf-rdwr.c
+ * rdwr.c
  */
 extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
 					 struct page *, gfp_t);
@@ -205,7 +205,7 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *);
 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
 
 /*
- * cf-security.c
+ * security.c
  */
 extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
 extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
@@ -225,7 +225,7 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
 }
 
 /*
- * cf-xattr.c
+ * xattr.c
  */
 extern int cachefiles_check_object_type(struct cachefiles_object *object);
 extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
-- 
cgit v0.10.2


From 450aaa2b2a1b006870ba68251fbb40b2387caade Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:23 +0200
Subject: perf_counter: tools: report: Add vmlinux support

Allow to use vmlinux instead of kallsyms.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.740018486@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a9ff49a..3e87cbd 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -19,6 +19,7 @@
 #define SHOW_HV		4
 
 static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -532,6 +533,39 @@ out_delete_dso:
 	return -1;
 }
 
+static int load_kernel(void)
+{
+	int fd, nr;
+
+	if (!vmlinux)
+		goto kallsyms;
+
+	fd = open(vmlinux, O_RDONLY);
+	if (fd < 0)
+		goto kallsyms;
+
+	kernel_dso = dso__new("[kernel]");
+	if (!kernel_dso)
+		goto fail_open;
+
+	nr = dso__load_sym(kernel_dso, fd, vmlinux);
+
+	if (nr <= 0)
+		goto fail_load;
+
+	dsos__add(kernel_dso);
+	close(fd);
+
+	return 0;
+
+fail_load:
+	dso__delete(kernel_dso);
+fail_open:
+	close(fd);
+kallsyms:
+	return load_kallsyms();
+}
+
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -850,7 +884,7 @@ static int __cmd_report(void)
 		exit(0);
 	}
 
-	if (load_kallsyms() < 0) {
+	if (load_kernel() < 0) {
 		perror("failed to open kallsyms");
 		return EXIT_FAILURE;
 	}
@@ -1039,6 +1073,7 @@ static const struct option options[] = {
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From e7fb08b1d06a6b37263c765205de5614a2273aeb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:24 +0200
Subject: perf_counter: tools: report: Rework histogram code

In preparation for configurable sorting, rework the histgram code a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.796410098@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 3e87cbd..2762564 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -597,71 +597,9 @@ struct thread;
 
 static const char *thread__name(struct thread *self, char *bf, size_t size);
 
-struct symhist {
-	struct rb_node	 rb_node;
-	struct dso	 *dso;
-	struct symbol	 *sym;
-	struct thread	 *thread;
-	uint64_t	 ip;
-	uint32_t	 count;
-	char		 level;
-};
-
-static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
-				    struct thread *thread, struct dso *dso,
-				    char level)
-{
-	struct symhist *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->sym    = sym;
-		self->thread = thread;
-		self->ip     = ip;
-		self->dso    = dso;
-		self->level  = level;
-		self->count  = 1;
-	}
-
-	return self;
-}
-
-static void symhist__inc(struct symhist *self)
-{
-	++self->count;
-}
-
-static size_t
-symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
-{
-	char bf[32];
-	size_t ret;
-
-	if (total_samples)
-		ret = fprintf(fp, "%5.2f%% ", (self->count * 100.0) / total_samples);
-	else
-		ret = fprintf(fp, "%12d ", self->count);
-
-	ret += fprintf(fp, "%14s [%c] ",
-		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level);
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
-
-	if (self->level != '.')
-		ret += fprintf(fp, "%s\n",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, "%s: %s\n",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
-	return ret;
-}
-
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
-	struct rb_root	 symhists;
 	pid_t		 pid;
 	char		 *comm;
 };
@@ -683,67 +621,17 @@ static struct thread *thread__new(pid_t pid)
 		self->pid = pid;
 		self->comm = NULL;
 		INIT_LIST_HEAD(&self->maps);
-		self->symhists = RB_ROOT;
 	}
 
 	return self;
 }
 
-static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
-				 uint64_t ip, struct dso *dso, char level)
-{
-	struct rb_node **p = &self->symhists.rb_node;
-	struct rb_node *parent = NULL;
-	struct symhist *sh;
-
-	while (*p != NULL) {
-		uint64_t start;
-
-		parent = *p;
-		sh = rb_entry(parent, struct symhist, rb_node);
-
-		if (sh->sym == sym || ip == sh->ip) {
-			symhist__inc(sh);
-			return 0;
-		}
-
-		/* Handle unresolved symbols too */
-		start = !sh->sym ? sh->ip : sh->sym->start;
-
-		if (ip < start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	sh = symhist__new(sym, ip, self, dso, level);
-	if (sh == NULL)
-		return -ENOMEM;
-	rb_link_node(&sh->rb_node, parent, p);
-	rb_insert_color(&sh->rb_node, &self->symhists);
-	return 0;
-}
-
 static int thread__set_comm(struct thread *self, const char *comm)
 {
 	self->comm = strdup(comm);
 	return self->comm ? 0 : -ENOMEM;
 }
 
-static size_t thread__fprintf(struct thread *self, FILE *fp)
-{
-	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
-	struct rb_node *nd;
-
-	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
-		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-
-		ret += symhist__fprintf(pos, 0, fp);
-	}
-
-	return ret;
-}
-
 static struct rb_root threads;
 
 static struct thread *threads__findnew(pid_t pid)
@@ -792,70 +680,172 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-static void threads__fprintf(FILE *fp)
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+	int cmp = right->thread->pid - left->thread->pid;
+
+	if (cmp)
+		return cmp;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
 {
-	struct rb_node *nd;
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *pos = rb_entry(nd, struct thread, rb_node);
-		thread__fprintf(pos, fp);
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count++;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
 }
 
-static struct rb_root global_symhists;
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	char bf[32];
+	size_t ret;
+
+	if (total_samples) {
+		ret = fprintf(fp, "%5.2f%% ",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
 
-static void threads__insert_symhist(struct symhist *sh)
+	ret += fprintf(fp, "%14s [%c] ",
+		       thread__name(self->thread, bf, sizeof(bf)),
+		       self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s\n",
+			       self->sym ? self->sym->name : "<unknown>");
+	else
+		ret += fprintf(fp, "%s: %s\n",
+			       self->dso ? self->dso->name : "<unknown>",
+			       self->sym ? self->sym->name : "<unknown>");
+	return ret;
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
 {
-	struct rb_node **p = &global_symhists.rb_node;
+	struct rb_node **p = &output_hists.rb_node;
 	struct rb_node *parent = NULL;
-	struct symhist *iter;
+	struct hist_entry *iter;
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct symhist, rb_node);
+		iter = rb_entry(parent, struct hist_entry, rb_node);
 
-		/* Reverse order */
-		if (sh->count > iter->count)
+		if (he->count > iter->count)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
 	}
 
-	rb_link_node(&sh->rb_node, parent, p);
-	rb_insert_color(&sh->rb_node, &global_symhists);
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
 }
 
-static void threads__sort_symhists(void)
+static void output__resort(void)
 {
-	struct rb_node *nd;
-
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *thread = rb_entry(nd, struct thread, rb_node);
-		struct rb_node *next = rb_first(&thread->symhists);
+	struct rb_node *next = rb_first(&hist);
+	struct hist_entry *n;
 
-		while (next) {
-			struct symhist *n = rb_entry(next, struct symhist,
-						     rb_node);
-			next = rb_next(&n->rb_node);
-			rb_erase(&n->rb_node, &thread->symhists);
-			threads__insert_symhist(n);
-		}
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
 
+		rb_erase(&n->rb_node, &hist);
+		output__insert_entry(n);
 	}
 }
 
-static size_t threads__symhists_fprintf(uint64_t total_samples, FILE *fp)
+static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 {
+	struct hist_entry *pos;
 	struct rb_node *nd;
 	size_t ret = 0;
 
-	for (nd = rb_first(&global_symhists); nd; nd = rb_next(nd)) {
-		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-		ret += symhist__fprintf(pos, total_samples, fp);
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
 	}
 
 	return ret;
 }
 
+
 static int __cmd_report(void)
 {
 	unsigned long offset = 0;
@@ -926,6 +916,7 @@ more:
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
 		uint64_t ip = event->ip.ip;
+		struct map *map = NULL;
 
 		if (dump_trace) {
 			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
@@ -945,9 +936,10 @@ more:
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
 			show = SHOW_KERNEL;
 			level = 'k';
+
 			dso = kernel_dso;
+
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			struct map *map;
 
 			show = SHOW_USER;
 			level = '.';
@@ -957,6 +949,7 @@ more:
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
 			}
+
 		} else {
 			show = SHOW_HV;
 			level = 'H';
@@ -965,8 +958,9 @@ more:
 		if (show & show_mask) {
 			struct symbol *sym = dso__find_symbol(dso, ip);
 
-			if (thread__symbol_incnew(thread, sym, ip, dso, level)) {
-				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
+			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+				fprintf(stderr,
+		"problem incrementing symbol count, bailing out\n");
 				goto done;
 			}
 		}
@@ -1050,13 +1044,11 @@ done:
 		return 0;
 	}
 
-	if (verbose >= 2) {
+	if (verbose >= 2)
 		dsos__fprintf(stdout);
-		threads__fprintf(stdout);
-	}
 
-	threads__sort_symhists();
-	threads__symhists_fprintf(total, stdout);
+	output__resort();
+	output__fprintf(stdout, total);
 
 	return rc;
 }
-- 
cgit v0.10.2


From 1aa167382323eeeeb38368cab85cf17979793cbe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:25 +0200
Subject: perf_counter: tools: report: Dynamic sort/print bits

Make the sorting and printing dynamic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.921953817@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 2762564..856186f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -699,14 +699,41 @@ struct hist_entry {
 	uint32_t	 count;
 };
 
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
 static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
-	int cmp = right->thread->pid - left->thread->pid;
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	char bf[32];
+
+	return fprintf(fp, "%14s ",
+			thread__name(self->thread, bf, sizeof(bf)));
+}
 
-	if (cmp)
-		return cmp;
+static struct sort_entry sort_thread = {
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -717,6 +744,79 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	ret += fprintf(fp, "[%c] ", self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s ",
+			       self->sym ? self->sym->name : "<unknown>");
+	else
+		ret += fprintf(fp, "%s: %s ",
+			       self->dso ? self->dso->name : "<unknown>",
+			       self->sym ? self->sym->name : "<unknown>");
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static void setup_sorting(void)
+{
+	list_add_tail(&sort_thread.list, &hist_entry__sort_list);
+	list_add_tail(&sort_sym.list, &hist_entry__sort_list);
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		ret = fprintf(fp, "%5.2f%% ",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		ret += se->print(fp, self);
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		struct symbol *sym, uint64_t ip, char level)
@@ -762,35 +862,6 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	return 0;
 }
 
-static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
-{
-	char bf[32];
-	size_t ret;
-
-	if (total_samples) {
-		ret = fprintf(fp, "%5.2f%% ",
-				(self->count * 100.0) / total_samples);
-	} else
-		ret = fprintf(fp, "%12d ", self->count);
-
-	ret += fprintf(fp, "%14s [%c] ",
-		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level);
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
-
-	if (self->level != '.')
-		ret += fprintf(fp, "%s\n",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, "%s: %s\n",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
-	return ret;
-}
-
 /*
  * reverse the map, sort on count.
  */
@@ -1077,6 +1148,8 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	parse_options(argc, argv, options, report_usage, 0);
 
+	setup_sorting();
+
 	setup_pager();
 
 	return __cmd_report();
-- 
cgit v0.10.2


From 37f440cba299bb479cf45d12eef923f0979dbcaf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:26 +0200
Subject: pref_counter: tools: report: Add --sort option

option parsing for dynamic sorting.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.041817692@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 856186f..982abce 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -20,6 +20,7 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
+static char		*sort_order = "pid,symbol";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -770,12 +771,49 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
+struct sort_dimension {
+	char *name;
+	struct sort_entry *entry;
+	int taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
 static LIST_HEAD(hist_entry__sort_list);
 
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strcmp(tok, sd->name))
+			continue;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
 static void setup_sorting(void)
 {
-	list_add_tail(&sort_thread.list, &hist_entry__sort_list);
-	list_add_tail(&sort_sym.list, &hist_entry__sort_list);
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp))
+		sort_dimension__add(tok);
+
+	free(str);
 }
 
 static int64_t
@@ -1137,6 +1175,7 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "foo", "bar"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 992444b173f35997f96f5cbb214f0de81d1b97ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:27 +0200
Subject: perf_counter: tools: report: Add comm sorting

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.129302022@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 982abce..a634022 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -732,6 +732,35 @@ static struct sort_entry sort_thread = {
 };
 
 static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%20s ", self->thread->comm ?: "<unknown>");
+}
+
+static struct sort_entry sort_comm = {
+	.cmp	= sort__comm_cmp,
+	.print	= sort__comm_print,
+};
+
+static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	uint64_t ip_l, ip_r;
@@ -779,6 +808,7 @@ struct sort_dimension {
 
 static struct sort_dimension sort_dimensions[] = {
 	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
 };
 
-- 
cgit v0.10.2


From 55e5ec41a9de46b6ca06031f4fbdfdfc76dc24dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:28 +0200
Subject: pref_counter: tools: report: Add dso sorting

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.229504802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a634022..30e12c7 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -761,6 +761,35 @@ static struct sort_entry sort_comm = {
 };
 
 static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%64s ", self->dso ? self->dso->name : "<unknown>");
+}
+
+static struct sort_entry sort_dso = {
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	uint64_t ip_l, ip_r;
@@ -809,6 +838,7 @@ struct sort_dimension {
 static struct sort_dimension sort_dimensions[] = {
 	{ .name = "pid",	.entry = &sort_thread,	},
 	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
 };
 
-- 
cgit v0.10.2


From 2d65537ee7cd4a0818ea80a97ab7932368fff5cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 21:36:22 +0200
Subject: pref_counter: tools: report: Add header printout & prettify

Old default output:

 3.12%    perf-report [.] ./perf-report:       dsos__find
 2.44%    perf-report [k] kernel:              kallsyms_expand_symbol
 2.28%          :4483 [.] <unknown>:           <unknown>
 2.05%          :4174 [k] kernel:              _spin_lock_irqsave
 2.01%    perf-report [k] kernel:              vsnprintf
 1.92%    perf-report [k] kernel:              format_decode
 1.92%          :4438 [k] kernel:              _spin_lock

New default output:

 #
 # Overhead          Command       File: Symbol
 # ........          .......       ............
 #
      6.54%             perf  [k]  kernel: kallsyms_expand_symbol
      6.26%             perf  [.]  /home/mingo/tip/Documentation/perf_counter/perf: dso__insert_symbol
      4.76%             perf  [.]  /home/mingo/tip/Documentation/perf_counter/perf: hex2long
      4.55%             perf  [k]  kernel: number
      4.48%             perf  [k]  kernel: format_decode
      4.09%             perf  [k]  kernel: vsnprintf

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.229504802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 30e12c7..6df95c2 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -708,6 +708,7 @@ struct sort_entry {
 	struct list_head list;
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	size_t (*print_header)(FILE *fp);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
@@ -722,7 +723,7 @@ sort__thread_print(FILE *fp, struct hist_entry *self)
 {
 	char bf[32];
 
-	return fprintf(fp, "%14s ",
+	return fprintf(fp, " %16s",
 			thread__name(self->thread, bf, sizeof(bf)));
 }
 
@@ -752,7 +753,7 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "%20s ", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, " %16s", self->thread->comm ?: "<unknown>");
 }
 
 static struct sort_entry sort_comm = {
@@ -781,7 +782,7 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "%64s ", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, " %64s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
@@ -803,21 +804,33 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
+static size_t sort__sym_print_header(FILE *fp)
+{
+	size_t ret = 0;
+
+	ret += fprintf(fp, "#\n");
+	ret += fprintf(fp, "# Overhead          Command       File: Symbol\n");
+	ret += fprintf(fp, "# ........          .......       ............\n");
+	ret += fprintf(fp, "#\n");
+
+	return ret;
+}
+
 static size_t
 sort__sym_print(FILE *fp, struct hist_entry *self)
 {
 	size_t ret = 0;
 
-	ret += fprintf(fp, "[%c] ", self->level);
+	ret += fprintf(fp, "  [%c] ", self->level);
 
 	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s ",
+		ret += fprintf(fp, " kernel: %s",
 			       self->sym ? self->sym->name : "<unknown>");
 	else
-		ret += fprintf(fp, "%s: %s ",
+		ret += fprintf(fp, " %s: %s",
 			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
 
@@ -825,8 +838,9 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_sym = {
-	.cmp	= sort__sym_cmp,
-	.print	= sort__sym_print,
+	.cmp		= sort__sym_cmp,
+	.print_header	= sort__sym_print_header,
+	.print		= sort__sym_print,
 };
 
 struct sort_dimension {
@@ -898,7 +912,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "%5.2f%% ",
+		ret = fprintf(fp, "    %5.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
@@ -1003,9 +1017,15 @@ static void output__resort(void)
 static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 {
 	struct hist_entry *pos;
+	struct sort_entry *se;
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		if (se->print_header)
+			ret += se->print_header(fp);
+	}
+
 	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct hist_entry, rb_node);
 		ret += hist_entry__fprintf(fp, pos, total_samples);
-- 
cgit v0.10.2


From 07f4f3e8a24138ca2f3650723d670df25687cd05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@redhat.com>
Date: Wed, 27 May 2009 14:37:28 -0400
Subject: i915: Set object to gtt domain when faulting it back in
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a GEM object is evicted from the GTT we set it to the CPU domain,
as it might get swapped in and out or ever mmapped regularly.  If the
object is mmapped through the GTT it can still get evicted in this way
by other objects requiring GTT space.  When the GTT mapping is touched
again we fault it back into the GTT, but fail to set it back to the
GTT domain.  This means we fail to flush any cached CPU writes to the
pages backing the object which will then happen "eventually", typically
after we write to the page through the uncached GTT mapping.

[anholt: Note that userland does do a set_domain(GTT, GTT) when starting
to access the GTT mapping.  That covers getting the existing mapping of the
object synchronized if it's bound to the GTT.  But set_domain(GTT, GTT)
doesn't do anything if the object is currently unbound.  This fix covers the
transition to being bound for GTT mapping.]

Fixes glyph and other pixmap corruption during swapping.  fd.o bug #21790

Signed-off-by: Kristian Høgsberg <krh@redhat.com>
Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e242186..670d128 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1145,6 +1145,13 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 			mutex_unlock(&dev->struct_mutex);
 			return VM_FAULT_SIGBUS;
 		}
+
+		ret = i915_gem_object_set_to_gtt_domain(obj, write);
+		if (ret) {
+			mutex_unlock(&dev->struct_mutex);
+			return VM_FAULT_SIGBUS;
+		}
+
 		list_add_tail(&obj_priv->list, &dev_priv->mm.inactive_list);
 	}
 
-- 
cgit v0.10.2


From 55717314c4e3a5180a54228a2f97e50f3496de4c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 22:13:17 +0200
Subject: pref_counter: tools: report: Robustify in case of weird events

This error condition:

  aldebaran:~/linux/linux/Documentation/perf_counter> perf report
  dso__load_sym: cannot get elf header.
  failed to open: /etc/ld.so.cache
  problem processing PERF_EVENT_MMAP, bailing out

caused the profile to be very short - as the error was at the beginning
of the file and we bailed out completely.

Be more permissive and consider the event broken instead.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6df95c2..5993c12 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1117,9 +1117,9 @@ more:
 		}
 
 		if (thread == NULL) {
-			fprintf(stderr, "problem processing %d event, bailing out\n",
+			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
-			goto done;
+			goto broken_event;
 		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
@@ -1149,8 +1149,8 @@ more:
 
 			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 				fprintf(stderr,
-		"problem incrementing symbol count, bailing out\n");
-				goto done;
+		"problem incrementing symbol count, skipping event\n");
+				goto broken_event;
 			}
 		}
 		total++;
@@ -1169,8 +1169,8 @@ more:
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
-			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
-			goto done;
+			fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+			goto broken_event;
 		}
 		thread__insert_map(thread, map);
 		total_mmap++;
@@ -1187,8 +1187,8 @@ more:
 		}
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
-			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
-			goto done;
+			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
+			goto broken_event;
 		}
 		total_comm++;
 		break;
@@ -1221,7 +1221,6 @@ broken_event:
 		goto more;
 
 	rc = EXIT_SUCCESS;
-done:
 	close(input);
 
 	if (dump_trace) {
-- 
cgit v0.10.2


From b787f2e2a37a373a045f4d9b9bed941ccff01663 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Wed, 13 May 2009 16:25:57 -0500
Subject: fsldma: Fix compile warnings

We we build with dma_addr_t as a 64-bit quantity we get:

drivers/dma/fsldma.c: In function 'fsl_chan_xfer_ld_queue':
drivers/dma/fsldma.c:625: warning: cast to pointer from integer of different size
drivers/dma/fsldma.c: In function 'fsl_dma_chan_do_interrupt':
drivers/dma/fsldma.c:737: warning: cast to pointer from integer of different size
drivers/dma/fsldma.c:737: warning: cast to pointer from integer of different size
drivers/dma/fsldma.c: In function 'of_fsl_dma_probe':
drivers/dma/fsldma.c:927: warning: cast to pointer from integer of different

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 1578310..f18d1bd 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -641,8 +641,8 @@ static void fsl_chan_xfer_ld_queue(struct fsl_dma_chan *fsl_chan)
 	if (ld_node != &fsl_chan->ld_queue) {
 		/* Get the ld start address from ld_queue */
 		next_dest_addr = to_fsl_desc(ld_node)->async_tx.phys;
-		dev_dbg(fsl_chan->dev, "xfer LDs staring from %p\n",
-				(void *)next_dest_addr);
+		dev_dbg(fsl_chan->dev, "xfer LDs staring from 0x%llx\n",
+				(unsigned long long)next_dest_addr);
 		set_cdar(fsl_chan, next_dest_addr);
 		dma_start(fsl_chan);
 	} else {
@@ -756,8 +756,9 @@ static irqreturn_t fsl_dma_chan_do_interrupt(int irq, void *data)
 	 */
 	if (stat & FSL_DMA_SR_EOSI) {
 		dev_dbg(fsl_chan->dev, "event: End-of-segments INT\n");
-		dev_dbg(fsl_chan->dev, "event: clndar %p, nlndar %p\n",
-			(void *)get_cdar(fsl_chan), (void *)get_ndar(fsl_chan));
+		dev_dbg(fsl_chan->dev, "event: clndar 0x%llx, nlndar 0x%llx\n",
+			(unsigned long long)get_cdar(fsl_chan),
+			(unsigned long long)get_ndar(fsl_chan));
 		stat &= ~FSL_DMA_SR_EOSI;
 		update_cookie = 1;
 	}
@@ -947,8 +948,8 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev,
 	}
 
 	dev_info(&dev->dev, "Probe the Freescale DMA driver for %s "
-			"controller at %p...\n",
-			match->compatible, (void *)fdev->reg.start);
+			"controller at 0x%llx...\n",
+			match->compatible, (unsigned long long)fdev->reg.start);
 	fdev->reg_base = ioremap(fdev->reg.start, fdev->reg.end
 						- fdev->reg.start + 1);
 
-- 
cgit v0.10.2


From a0d24b295aed7a9daf4ca36bd4784e4d40f82303 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 19 May 2009 12:03:15 +0800
Subject: nfsd: fix hung up of nfs client while sync write data to nfs server

Commit 'Short write in nfsd becomes a full write to the client'
(31dec2538e45e9fff2007ea1f4c6bae9f78db724) broken the sync write.
With the following commands to reproduce:

  $ mount -t nfs -o sync 192.168.0.21:/nfsroot /mnt
  $ cd /mnt
  $ echo aaaa > temp.txt

Then nfs client is hung up.

In SYNC mode the server alaways return the write count 0 to the
client. This is because the value of host_err in nfsd_vfs_write()
will be overwrite in SYNC mode by 'host_err=nfsd_sync(file);',
and then we return host_err(which is now 0) as write count.

This patch fixed the problem.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6c68ffd..b660435 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1015,6 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
 	if (host_err >= 0) {
+		*cnt = host_err;
 		nfsdstats.io_write += host_err;
 		fsnotify_modify(file->f_path.dentry);
 	}
@@ -1060,10 +1061,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	dprintk("nfsd: write complete host_err=%d\n", host_err);
-	if (host_err >= 0) {
+	if (host_err >= 0)
 		err = 0;
-		*cnt = host_err;
-	} else
+	else
 		err = nfserrno(host_err);
 out:
 	return err;
-- 
cgit v0.10.2


From dacd2549ca61ddbdd1ed62a76ca95dea3f0e02c6 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Tue, 26 May 2009 09:08:03 +0900
Subject: PCI/ACPI: fix wrong ref count handling in acpi_pci_bind()

The 'dev' field of struct acpi_pci_data is having a pointer to struct
pci_dev without incrementing the reference counter. Because of this, I
got the following kernel oops when I was doing some pci hotplug
operations. This patch fixes this bug by replacing wrong hand-made
pci_find_slot() with pci_get_slot() in acpi_pci_bind().

 BUG: unable to handle kernel NULL pointer dereference at 00000000000000e8
 IP: [<ffffffff803f0e9b>] acpi_pci_unbind+0xb1/0xdd

 Call Trace:
  [<ffffffff803ecee4>] acpi_bus_remove+0x54/0x68
  [<ffffffff803ecf6d>] acpi_bus_trim+0x75/0xe3
  [<ffffffffa0345ddd>] acpiphp_disable_slot+0x16d/0x1e0 [acpiphp]
  [<ffffffffa03441f0>] disable_slot+0x20/0x60 [acpiphp]
  [<ffffffff803cfc18>] power_write_file+0xc8/0x110
  [<ffffffff803c6a54>] pci_slot_attr_store+0x24/0x30
  [<ffffffff803469ce>] sysfs_write_file+0xce/0x140
  [<ffffffff802e94e7>] vfs_write+0xc7/0x170
  [<ffffffff802e9aa0>] sys_write+0x50/0x90
  [<ffffffff8020bd6b>] system_call_fastpath+0x16/0x1b

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reviewed-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Alex Chiang <achiang@hp.com>
Tested-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/pci_bind.c b/drivers/acpi/pci_bind.c
index 95650f8..bc46de3 100644
--- a/drivers/acpi/pci_bind.c
+++ b/drivers/acpi/pci_bind.c
@@ -116,9 +116,6 @@ int acpi_pci_bind(struct acpi_device *device)
 	struct acpi_pci_data *pdata;
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	acpi_handle handle;
-	struct pci_dev *dev;
-	struct pci_bus *bus;
-
 
 	if (!device || !device->parent)
 		return -EINVAL;
@@ -176,20 +173,9 @@ int acpi_pci_bind(struct acpi_device *device)
 	 * Locate matching device in PCI namespace.  If it doesn't exist
 	 * this typically means that the device isn't currently inserted
 	 * (e.g. docking station, port replicator, etc.).
-	 * We cannot simply search the global pci device list, since
-	 * PCI devices are added to the global pci list when the root
-	 * bridge start ops are run, which may not have happened yet.
 	 */
-	bus = pci_find_bus(data->id.segment, data->id.bus);
-	if (bus) {
-		list_for_each_entry(dev, &bus->devices, bus_list) {
-			if (dev->devfn == PCI_DEVFN(data->id.device,
-						    data->id.function)) {
-				data->dev = dev;
-				break;
-			}
-		}
-	}
+	data->dev = pci_get_slot(pdata->bus,
+				PCI_DEVFN(data->id.device, data->id.function));
 	if (!data->dev) {
 		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
 				  "Device %04x:%02x:%02x.%d not present in PCI namespace\n",
@@ -259,9 +245,10 @@ int acpi_pci_bind(struct acpi_device *device)
 
       end:
 	kfree(buffer.pointer);
-	if (result)
+	if (result) {
+		pci_dev_put(data->dev);
 		kfree(data);
-
+	}
 	return result;
 }
 
@@ -303,6 +290,7 @@ static int acpi_pci_unbind(struct acpi_device *device)
 	if (data->dev->subordinate) {
 		acpi_pci_irq_del_prt(data->id.segment, data->bus->number);
 	}
+	pci_dev_put(data->dev);
 	kfree(data);
 
       end:
-- 
cgit v0.10.2


From 7f4218354fe312b327af06c3d8c95ed5f214c8ca Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 27 May 2009 18:51:06 -0400
Subject: nfsd: Revert "svcrpc: take advantage of tcp autotuning"

This reverts commit 47a14ef1af48c696b214ac168f056ddc79793d0e "svcrpc:
take advantage of tcp autotuning", which uncovered some further problems
in the server rpc code, causing significant performance regressions in
common cases.

We will likely reinstate this patch after releasing 2.6.30 and applying
some work on the underlying fixes to the problem (developed by Trond).

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Olga Kornievskaia <aglo@citi.umich.edu>
Cc: Jim Rees <rees@umich.edu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index af31988..9d50423 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -345,6 +345,7 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 	lock_sock(sock->sk);
 	sock->sk->sk_sndbuf = snd * 2;
 	sock->sk->sk_rcvbuf = rcv * 2;
+	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
 	release_sock(sock->sk);
 #endif
 }
@@ -796,6 +797,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 		test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
 		test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
 
+	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
+		/* sndbuf needs to have room for one request
+		 * per thread, otherwise we can stall even when the
+		 * network isn't a bottleneck.
+		 *
+		 * We count all threads rather than threads in a
+		 * particular pool, which provides an upper bound
+		 * on the number of threads which will access the socket.
+		 *
+		 * rcvbuf just needs to be able to hold a few requests.
+		 * Normally they will be removed from the queue
+		 * as soon a a complete request arrives.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock,
+				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
+				    3 * serv->sv_max_mesg);
+
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	/* Receive data. If we haven't got the record length yet, get
@@ -1043,6 +1061,15 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
+		/* initialise setting must have enough space to
+		 * receive and respond to one request.
+		 * svc_tcp_recvfrom will re-adjust if necessary
+		 */
+		svc_sock_setbufsize(svsk->sk_sock,
+				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
+
+		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		if (sk->sk_state != TCP_ESTABLISHED)
 			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
@@ -1112,14 +1139,8 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	/* Initialize the socket */
 	if (sock->type == SOCK_DGRAM)
 		svc_udp_init(svsk, serv);
-	else {
-		/* initialise setting must have enough space to
-		 * receive and respond to one request.
-		 */
-		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
-					4 * serv->sv_max_mesg);
+	else
 		svc_tcp_init(svsk, serv);
-	}
 
 	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
 				svsk, svsk->sk_sk);
-- 
cgit v0.10.2


From 98779be861a05c4cb75bed916df72ec0cba8b53d Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Thu, 14 May 2009 16:34:28 -0500
Subject: svcrdma: dma unmap the correct length for the RPCRDMA header page.

The svcrdma module was incorrectly unmapping the RPCRDMA header page.
On IBM pserver systems this causes a resource leak that results in
running out of bus address space (10 cthon iterations will reproduce it).
The code was mapping the full page but only unmapping the actual header
length.  The fix is to only map the header length.

I also cleaned up the use of ib_dma_map_page() calls since the unmap
logic always uses ib_dma_unmap_single().  I made these symmetrical.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 8b510c5..f11be72 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -128,7 +128,8 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
 		page_bytes -= sge_bytes;
 
 		frmr->page_list->page_list[page_no] =
-			ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  page_address(page),
 					  PAGE_SIZE, DMA_TO_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 frmr->page_list->page_list[page_no]))
@@ -532,18 +533,17 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
 
 	/* Prepare the SGE for the RPCRDMA Header */
+	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
 	ctxt->sge[0].addr =
-		ib_dma_map_page(rdma->sc_cm_id->device,
-				page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+		ib_dma_map_single(rdma->sc_cm_id->device, page_address(page),
+				  ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
 	atomic_inc(&rdma->sc_dma_used);
 
 	ctxt->direction = DMA_TO_DEVICE;
 
-	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
-	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
-
 	/* Determine how many of our SGE are to be transmitted */
 	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
 		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4b0c2fa..5151f9f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -500,8 +500,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		BUG_ON(sge_no >= xprt->sc_max_sge);
 		page = svc_rdma_get_page();
 		ctxt->pages[sge_no] = page;
-		pa = ib_dma_map_page(xprt->sc_cm_id->device,
-				     page, 0, PAGE_SIZE,
+		pa = ib_dma_map_single(xprt->sc_cm_id->device,
+				     page_address(page), PAGE_SIZE,
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
@@ -1315,8 +1315,8 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
 
 	/* Prepare SGE for local address */
-	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
-				   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	sge.addr = ib_dma_map_single(xprt->sc_cm_id->device,
+				   page_address(p), PAGE_SIZE, DMA_FROM_DEVICE);
 	if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
 		put_page(p);
 		return;
@@ -1343,7 +1343,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	if (ret) {
 		dprintk("svcrdma: Error %d posting send for protocol error\n",
 			ret);
-		ib_dma_unmap_page(xprt->sc_cm_id->device,
+		ib_dma_unmap_single(xprt->sc_cm_id->device,
 				  sge.addr, PAGE_SIZE,
 				  DMA_FROM_DEVICE);
 		svc_rdma_put_context(ctxt, 1);
-- 
cgit v0.10.2


From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 26 May 2009 17:28:02 +0200
Subject: trace: disable preemption before taking raw spinlocks

s390 code uses smp_processor_id() in __raw_spin_lock() code which
reveals that a (raw) spinlock is taken without preemption disabled.
This can potentially deadlock.

To fix this explicitly disable and enable preemption.

BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278
caller is trace_find_cmdline+0x40/0xfc
CPU: 0 Not tainted 2.6.30-rc7-dirty #39
Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988)
000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000
       000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6
       0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b
       000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000
       0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28
Call Trace:
([<00000000000174b2>] show_trace+0x112/0x170)
 [<0000000000017582>] show_stack+0x72/0x100
 [<0000000000441538>] dump_stack+0xc8/0xd8
 [<000000000025c350>] debug_smp_processor_id+0x114/0x130
 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc
 [<00000000000c35d4>] trace_print_context+0x58/0xac
 [<00000000000bb676>] print_trace_line+0x416/0x470
 [<00000000000bc8fe>] s_show+0x4e/0x428
 [<000000000013834e>] seq_read+0x36a/0x5d4
 [<0000000000112a78>] vfs_read+0xc8/0x174
 [<0000000000112c58>] SyS_read+0x74/0xc4
 [<000000000002c7ae>] sysc_noemu+0x10/0x16
 [<000002000012436c>] 0x2000012436c
1 lock held by cat/2278:
 #0:  (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4

[ Impact: fix preempt-unsafe raw spinlock ]

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02d32ba..a3a8a87 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
 		return;
 	}
 
+	preempt_disable();
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	if (map != NO_CMDLINE_MAP)
@@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
 		strcpy(comm, "<...>");
 
 	__raw_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
-- 
cgit v0.10.2


From f2aebaee653a35b01c3665de2cbb1e31456b8ea8 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 27 May 2009 21:36:02 +0800
Subject: ftrace: don't convert function's local variable name in macro

"call" is an argument of macro, but it is also used as a local
variable name of function in macro.
We should keep this local variable name distinct from any
CPP macro parameter name if both are in the same macro scope,
although it hasn't caused any problem yet.

[ Impact: robustify macro ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 87fc227..b4ec83a 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -397,19 +397,19 @@ static void ftrace_profile_##call(proto)				\
 	perf_tpcounter_event(event_##call.id);				\
 }									\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {									\
 	int ret = 0;							\
 									\
-	if (!atomic_inc_return(&call->profile_count))			\
+	if (!atomic_inc_return(&event_call->profile_count))		\
 		ret = register_trace_##call(ftrace_profile_##call);	\
 									\
 	return ret;							\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 {									\
-	if (atomic_add_negative(-1, &call->profile_count))		\
+	if (atomic_add_negative(-1, &event_call->profile_count))	\
 		unregister_trace_##call(ftrace_profile_##call);		\
 }
 
@@ -433,9 +433,9 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
 #define __array(type, item, len)
 
 #undef __string
-#define __string(item, src)						       \
-	__str_offsets.item = __str_size +				       \
-			     offsetof(typeof(*entry), __str_data);	       \
+#define __string(item, src)						\
+	__str_offsets.item = __str_size +				\
+			     offsetof(typeof(*entry), __str_data);	\
 	__str_size += strlen(src) + 1;
 
 #undef __assign_str
@@ -451,8 +451,8 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
-	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;  \
-	struct ftrace_event_call *call = &event_##call;			\
+	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;	\
+	struct ftrace_event_call *event_call = &event_##call;		\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
@@ -473,7 +473,7 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	if (!filter_current_check_discard(call, entry, event))		\
+	if (!filter_current_check_discard(event_call, entry, event))	\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 }									\
 									\
-- 
cgit v0.10.2


From 14dba5331b90c20588ae6504fea8049c7283028d Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Wed, 27 May 2009 09:31:52 -0400
Subject: integrity: nfsd imbalance bug fix

An nfsd exported file is opened/closed by the kernel causing the
integrity imbalance message.

Before a file is opened, there normally is permission checking, which
is done in inode_permission().  However, as integrity checking requires
a dentry and mount point, which is not available in inode_permission(),
the integrity (permission) checking must be called separately.

In order to detect any missing integrity checking calls, we keep track
of file open/closes.  ima_path_check() increments these counts and
does the integrity (permission) checking. As a result, the number of
calls to ima_path_check()/ima_file_free() should be balanced.  An extra
call to fput(), indicates the file could have been accessed without first
calling ima_path_check().

In nfsv3 permission checking is done once, followed by multiple reads,
which do an open/close for each read.  The integrity (permission) checking
call should be in nfsd_permission() after the inode_permission() call, but
as there is no correlation between the number of permission checking and
open calls, the integrity checking call should not increment the counters,
but defer it to when the file is actually opened.

This patch adds:
- integrity (permission) checking for nfsd exported files in nfsd_permission().
- a call to increment counts for files opened by nfsd.

This patch has been updated to return the nfs error types.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6c68ffd..81ff0f4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			    flags, cred);
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
+	else
+		ima_counts_get(*filp);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
@@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
+	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
+	if (err)
+		goto nfsd_out;
 
+	/* Do integrity (permission) checking now, but defer incrementing
+	 * IMA counts to the actual file open.
+	 */
+	path.mnt = exp->ex_path.mnt;
+	path.dentry = dentry;
+	err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+			     IMA_COUNT_LEAVE);
+nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
-- 
cgit v0.10.2


From abfe0af9813153bae8c85d9bac966bafcb8ddab1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 20 May 2009 00:37:40 -0700
Subject: x86: enable_update_mptable should be a macro

instead of declaring one variant as an inline function...
because other case is a variable

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A13B344.7030307@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3dcbaaa..e2a1bb6 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
 #ifdef CONFIG_X86_MPPARSE
 extern void find_smp_config(void);
 extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
 #else
 static inline void find_smp_config(void) { }
 static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
 #endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
@@ -87,15 +89,6 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
-#ifdef CONFIG_X86_MPPARSE
-extern int enable_update_mptable;
-#else
-static inline int enable_update_mptable(void)
-{
-	return 0;
-}
-#endif
-
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {
-- 
cgit v0.10.2


From 13b297d943828c4594527a2bd9c30ecd04e37886 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 26 May 2009 14:18:07 +0900
Subject: smack: Remove redundant initialization.

We don't need to explicitly initialize to cap_* because
it will be filled by security_fixup_ops().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0d030b4..0023182 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3034,15 +3034,7 @@ struct security_operations smack_ops = {
 
 	.ptrace_may_access =		smack_ptrace_may_access,
 	.ptrace_traceme =		smack_ptrace_traceme,
-	.capget = 			cap_capget,
-	.capset = 			cap_capset,
-	.capable = 			cap_capable,
 	.syslog = 			smack_syslog,
-	.settime = 			cap_settime,
-	.vm_enough_memory = 		cap_vm_enough_memory,
-
-	.bprm_set_creds = 		cap_bprm_set_creds,
-	.bprm_secureexec = 		cap_bprm_secureexec,
 
 	.sb_alloc_security = 		smack_sb_alloc_security,
 	.sb_free_security = 		smack_sb_free_security,
@@ -3066,8 +3058,6 @@ struct security_operations smack_ops = {
 	.inode_post_setxattr = 		smack_inode_post_setxattr,
 	.inode_getxattr = 		smack_inode_getxattr,
 	.inode_removexattr = 		smack_inode_removexattr,
-	.inode_need_killpriv =		cap_inode_need_killpriv,
-	.inode_killpriv =		cap_inode_killpriv,
 	.inode_getsecurity = 		smack_inode_getsecurity,
 	.inode_setsecurity = 		smack_inode_setsecurity,
 	.inode_listsecurity = 		smack_inode_listsecurity,
@@ -3088,7 +3078,6 @@ struct security_operations smack_ops = {
 	.cred_commit =			smack_cred_commit,
 	.kernel_act_as =		smack_kernel_act_as,
 	.kernel_create_files_as =	smack_kernel_create_files_as,
-	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
 	.task_getsid = 			smack_task_getsid,
@@ -3102,7 +3091,6 @@ struct security_operations smack_ops = {
 	.task_kill = 			smack_task_kill,
 	.task_wait = 			smack_task_wait,
 	.task_to_inode = 		smack_task_to_inode,
-	.task_prctl =			cap_task_prctl,
 
 	.ipc_permission = 		smack_ipc_permission,
 	.ipc_getsecid =			smack_ipc_getsecid,
@@ -3129,9 +3117,6 @@ struct security_operations smack_ops = {
 	.sem_semctl = 			smack_sem_semctl,
 	.sem_semop = 			smack_sem_semop,
 
-	.netlink_send =			cap_netlink_send,
-	.netlink_recv = 		cap_netlink_recv,
-
 	.d_instantiate = 		smack_d_instantiate,
 
 	.getprocattr = 			smack_getprocattr,
-- 
cgit v0.10.2


From 1812e67c7410c8d0d57f14a3dc81a99bd5b30e3e Mon Sep 17 00:00:00 2001
From: Tony Vroon <tony@linx.net>
Date: Wed, 27 May 2009 21:00:41 +0100
Subject: ALSA: hda - Compaq Presario CQ60 patching for Conexant

A docking mic control is shown by default. The Compaq Presario
CQ60 laptop has no docking connector, so designate it as a
CXT5051_HP model.
This makes the phantom mixer slider disappear.

Signed-off-by: Tony Vroon <tony@linx.net>
Signed-off-by: Takashi Iwai <tiwai@suse.de>

diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 56ce19e..4fcbe21 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -1848,6 +1848,7 @@ static const char *cxt5051_models[CXT5051_MODELS] = {
 
 static struct snd_pci_quirk cxt5051_cfg_tbl[] = {
 	SND_PCI_QUIRK(0x103c, 0x30cf, "HP DV6736", CXT5051_HP_DV6736),
+	SND_PCI_QUIRK(0x103c, 0x360b, "Compaq Presario CQ60", CXT5051_HP),
 	SND_PCI_QUIRK(0x14f1, 0x0101, "Conexant Reference board",
 		      CXT5051_LAPTOP),
 	SND_PCI_QUIRK(0x14f1, 0x5051, "HP Spartan 1.1", CXT5051_HP),
-- 
cgit v0.10.2


From d3e78ee3d015dac1794433abb6403b6fc8e70e10 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 May 2009 11:41:50 +0200
Subject: perf_counter: Fix perf_counter_init_task() on !CONFIG_PERF_COUNTERS

Pointed out by compiler warnings:

   tip/include/linux/perf_counter.h:644: warning: no return statement in function returning non-void

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b16ed3..a65ddc5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -641,7 +641,7 @@ perf_counter_task_sched_out(struct task_struct *task,
 			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline int perf_counter_init_task(struct task_struct *child)	{ }
+static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
-- 
cgit v0.10.2


From ca8cdeef9ca2ff89ee8a21d6f6ff3dfb60286041 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 28 May 2009 11:08:33 +0200
Subject: perf_counter tools: report: Implement header output for --sort
 variants

Implement this style of header:

 #
 # Overhead          Command       File: Symbol
 # ........          .......       ............
 #

for the various --sort variants as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5993c12..506cde4 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -596,8 +596,6 @@ out_delete:
 
 struct thread;
 
-static const char *thread__name(struct thread *self, char *bf, size_t size);
-
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
@@ -605,15 +603,6 @@ struct thread {
 	char		 *comm;
 };
 
-static const char *thread__name(struct thread *self, char *bf, size_t size)
-{
-	if (self->comm)
-		return self->comm;
-
-	snprintf(bf, sizeof(bf), ":%u", self->pid);
-	return bf;
-}
-
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = malloc(sizeof(*self));
@@ -707,8 +696,9 @@ struct hist_entry {
 struct sort_entry {
 	struct list_head list;
 
+	char *header;
+
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-	size_t (*print_header)(FILE *fp);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
@@ -721,13 +711,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	char bf[32];
-
-	return fprintf(fp, " %16s",
-			thread__name(self->thread, bf, sizeof(bf)));
+	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
+	.header = "         Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -757,6 +745,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
+	.header = "         Command",
 	.cmp	= sort__comm_cmp,
 	.print	= sort__comm_print,
 };
@@ -786,6 +775,7 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_dso = {
+	.header = "                                                    Shared Object",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -804,43 +794,25 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
-static size_t sort__sym_print_header(FILE *fp)
-{
-	size_t ret = 0;
-
-	ret += fprintf(fp, "#\n");
-	ret += fprintf(fp, "# Overhead          Command       File: Symbol\n");
-	ret += fprintf(fp, "# ........          .......       ............\n");
-	ret += fprintf(fp, "#\n");
-
-	return ret;
-}
-
 static size_t
 sort__sym_print(FILE *fp, struct hist_entry *self)
 {
 	size_t ret = 0;
 
-	ret += fprintf(fp, "  [%c] ", self->level);
-
 	if (verbose)
 		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
 
-	if (self->level != '.')
-		ret += fprintf(fp, " kernel: %s",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, " %s: %s",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
+	ret += fprintf(fp, " %s: %s",
+			self->dso ? self->dso->name : "<unknown>",
+			self->sym ? self->sym->name : "<unknown>");
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.cmp		= sort__sym_cmp,
-	.print_header	= sort__sym_print_header,
-	.print		= sort__sym_print,
+	.header = "Shared Object: Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
 };
 
 struct sort_dimension {
@@ -1021,10 +993,24 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, " %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		if (se->print_header)
-			ret += se->print_header(fp);
+		int i;
+
+		fprintf(fp, " ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
 	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
 
 	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct hist_entry, rb_node);
-- 
cgit v0.10.2


From 63299f057fbce47da895e8865cba7e9c3eb01a20 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 May 2009 10:52:00 +0200
Subject: perf_counter tools: report: Add help text for --sort

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 506cde4..9fdf822 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1240,7 +1240,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_STRING('s', "sort", &sort_order, "foo", "bar"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 5d85d3247cc3555215d7d11c78576a396c98e4d9 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 28 May 2009 11:04:53 +0200
Subject: block: export blk_stack_limits()

DM needs to use blk_stack_limits(), so it needs to be exported.

Acked-by: Martin K. Petersen <martin.petersen@oracle.com>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5649f34..8d33934 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -500,6 +500,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	return 0;
 }
+EXPORT_SYMBOL(blk_stack_limits);
 
 /**
  * disk_stack_limits - adjust queue limits for stacked drivers
-- 
cgit v0.10.2


From 7d96fd41cadc55f4e00231c8c71b8e25c779f122 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.cz>
Date: Mon, 25 May 2009 11:02:02 +0200
Subject: x86: move rdtsc_barrier() into the TSC vread method

The *fence instructions were moved to vsyscall_64.c by commit
cb9e35dce94a1b9c59d46224e8a94377d673e204.  But this breaks the
vDSO, because vread methods are also called from there.

Besides, the synchronization might be unnecessary for other
time sources than TSC.

[ Impact: fix potential time warp in VDSO ]

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
LKML-Reference: <9d0ea9ea0f866bdc1f4d76831221ae117f11ea67.1243241859.git.ptesarik@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@kernel.org>

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05..cf8611d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
 #ifdef CONFIG_X86_64
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
-	cycle_t ret = (cycle_t)vget_cycles();
+	cycle_t ret;
+
+	/*
+	 * Surround the RDTSC by barriers, to make sure it's not
+	 * speculated to outside the seqlock critical section and
+	 * does not cause time warps:
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+	rdtsc_barrier();
 
 	return ret >= __vsyscall_gtod_data.clock.cycle_last ?
 		ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153af..25ee06a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 			return;
 		}
 
-		/*
-		 * Surround the RDTSC by barriers, to make sure it's not
-		 * speculated to outside the seqlock critical section and
-		 * does not cause time warps:
-		 */
-		rdtsc_barrier();
 		now = vread();
-		rdtsc_barrier();
-
 		base = __vsyscall_gtod_data.clock.cycle_last;
 		mask = __vsyscall_gtod_data.clock.mask;
 		mult = __vsyscall_gtod_data.clock.mult;
-- 
cgit v0.10.2


From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 28 May 2009 22:18:17 +1000
Subject: perf_counter: Fix race in attaching counters to tasks and exiting

Commit 564c2b21 ("perf_counter: Optimize context switch between
identical inherited contexts") introduced a race where it is possible
that a counter being attached to a task could get attached to the
wrong task, if the task is one that has inherited its context from
another task via fork.  This happens because the optimized context
switch could switch the context to another task after find_get_context
has read task->perf_counter_ctxp.  In fact, it's possible that the
context could then get freed, if the other task then exits.

This fixes the problem by protecting both the context switch and the
critical code in find_get_context with spinlocks.  The context switch
locks the cxt->lock of both the outgoing and incoming contexts before
swapping them.  That means that once code such as find_get_context
has obtained the spinlock for the context associated with a task,
the context can't get swapped to another task.  However, the context
may have been swapped in the interval between reading
task->perf_counter_ctxp and getting the lock, so it is necessary to
check and retry.

To make sure that none of the contexts being looked at in
find_get_context can get freed, this changes the context freeing code
to use RCU.  Thus an rcu_read_lock() is sufficient to ensure that no
contexts can get freed.  This part of the patch is lifted from a patch
posted by Peter Zijlstra.

This also adds a check to make sure that we can't add a counter to a
task that is exiting.

There is also a race between perf_counter_exit_task and
find_get_context; this solves the race by moving the get_ctx that
was in perf_counter_alloc into the locked region in find_get_context,
so that once find_get_context has got the context for a task, it
won't get freed even if the task calls perf_counter_exit_task.  It
doesn't matter if new top-level (non-inherited) counters get attached
to the context after perf_counter_exit_task has detached the context
from the task.  They will just stay there and never get scheduled in
until the counters' fds get closed, and then perf_release will remove
them from the context and eventually free the context.

With this, we are now doing the unclone in find_get_context rather
than when a counter was added to or removed from a context (actually,
we were missing the unclone_ctx() call when adding a counter to a
context).  We don't need to unclone when removing a counter from a
context because we have no way to remove a counter from a cloned
context.

This also takes out the smp_wmb() in find_get_context, which Peter
Zijlstra pointed out was unnecessary because the cmpxchg implies a
full barrier anyway.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a65ddc5..717bf3b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -541,8 +541,9 @@ struct perf_counter_context {
 	 * been cloned (inherited) from a common ancestor.
 	 */
 	struct perf_counter_context *parent_ctx;
-	u32			parent_gen;
-	u32			generation;
+	u64			parent_gen;
+	u64			generation;
+	struct rcu_head		rcu_head;
 };
 
 /**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f..52e5a15 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
 	atomic_inc(&ctx->refcount);
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_counter_context *ctx;
+
+	ctx = container_of(head, struct perf_counter_context, rcu_head);
+	kfree(ctx);
+}
+
 static void put_ctx(struct perf_counter_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		kfree(ctx);
+		if (ctx->task)
+			put_task_struct(ctx->task);
+		call_rcu(&ctx->rcu_head, free_ctx);
 	}
 }
 
@@ -212,22 +222,6 @@ group_sched_out(struct perf_counter *group_counter,
 }
 
 /*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-	++ctx->generation;
-	if (!ctx->parent_ctx)
-		return;
-	put_ctx(ctx->parent_ctx);
-	ctx->parent_ctx = NULL;
-}
-
-/*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
  */
 static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
 
-	unclone_ctx(ctx);
 	if (!task) {
 		/*
 		 * Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
 
 /*
  * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisifed when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
  */
 static void perf_counter_disable(struct perf_counter *counter)
 {
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
 
 /*
  * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
  */
 static void perf_counter_enable(struct perf_counter *counter)
 {
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 	struct perf_counter_context *next_ctx;
+	struct perf_counter_context *parent;
 	struct pt_regs *regs;
+	int do_switch = 1;
 
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
 		return;
 
 	update_context_time(ctx);
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_counter_ctxp;
-	if (next_ctx && context_equiv(ctx, next_ctx)) {
-		task->perf_counter_ctxp = next_ctx;
-		next->perf_counter_ctxp = ctx;
-		ctx->task = next;
-		next_ctx->task = task;
-		return;
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		spin_lock(&ctx->lock);
+		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			task->perf_counter_ctxp = next_ctx;
+			next->perf_counter_ctxp = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;
+		}
+		spin_unlock(&next_ctx->lock);
+		spin_unlock(&ctx->lock);
 	}
+	rcu_read_unlock();
 
-	__perf_counter_sched_out(ctx, cpuctx);
-
-	cpuctx->task_ctx = NULL;
+	if (do_switch) {
+		__perf_counter_sched_out(ctx, cpuctx);
+		cpuctx->task_ctx = NULL;
+	}
 }
 
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	ctx->task = task;
 }
 
-static void put_context(struct perf_counter_context *ctx)
-{
-	if (ctx->task)
-		put_task_struct(ctx->task);
-}
-
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
-	struct perf_counter_context *tctx;
+	struct perf_counter_context *parent_ctx;
 	struct task_struct *task;
+	int err;
 
 	/*
 	 * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
 
 		return ctx;
 	}
@@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
+	/*
+	 * Can't attach counters to a dying task.
+	 */
+	err = -ESRCH;
+	if (task->flags & PF_EXITING)
+		goto errout;
+
 	/* Reuse ptrace permission checks for now. */
-	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		put_task_struct(task);
-		return ERR_PTR(-EACCES);
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+ retry_lock:
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_counter_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_counter_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more and we can
+		 * unclone it if necessary.
+		 * Once it's not a clone things will be stable.
+		 */
+		spin_lock_irq(&ctx->lock);
+		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+			spin_unlock_irq(&ctx->lock);
+			goto retry;
+		}
+		parent_ctx = ctx->parent_ctx;
+		if (parent_ctx) {
+			put_ctx(parent_ctx);
+			ctx->parent_ctx = NULL;		/* no longer a clone */
+		}
+		++ctx->generation;
+		/*
+		 * Get an extra reference before dropping the lock so that
+		 * this context won't get freed if the task exits.
+		 */
+		get_ctx(ctx);
+		spin_unlock_irq(&ctx->lock);
 	}
+	rcu_read_unlock();
 
-	ctx = task->perf_counter_ctxp;
 	if (!ctx) {
 		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-		if (!ctx) {
-			put_task_struct(task);
-			return ERR_PTR(-ENOMEM);
-		}
+		err = -ENOMEM;
+		if (!ctx)
+			goto errout;
 		__perf_counter_init_context(ctx, task);
-		/*
-		 * Make sure other cpus see correct values for *ctx
-		 * once task->perf_counter_ctxp is visible to them.
-		 */
-		smp_wmb();
-		tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
-		if (tctx) {
+		get_ctx(ctx);
+		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
 			 */
 			kfree(ctx);
-			ctx = tctx;
+			goto retry_lock;
 		}
+		get_task_struct(task);
 	}
 
+	put_task_struct(task);
 	return ctx;
+
+ errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
 }
 
 static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
-	put_ctx(counter->ctx);
 	kfree(counter);
 }
 
@@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter)
 	if (counter->destroy)
 		counter->destroy(counter);
 
+	put_ctx(counter->ctx);
 	call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	put_task_struct(counter->owner);
 
 	free_counter(counter);
-	put_context(ctx);
 
 	return 0;
 }
@@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	mutex_unlock(&ctx->mutex);
 }
 
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
 static void perf_counter_for_each_child(struct perf_counter *counter,
 					void (*func)(struct perf_counter *))
 {
@@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->ctx			= ctx;
 	counter->oncpu			= -1;
 
-	get_ctx(ctx);
-
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3370,7 @@ err_free_put_context:
 	kfree(counter);
 
 err_put_context:
-	put_context(ctx);
+	put_ctx(ctx);
 
 	goto out_fput;
 }
@@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter,
 					   group_leader, GFP_KERNEL);
 	if (IS_ERR(child_counter))
 		return child_counter;
+	get_ctx(child_ctx);
 
 	/*
 	 * Make the child state follow the state of the parent counter,
@@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child,
 
 /*
  * When a child task exits, feed back counter values to parent counters.
- *
- * Note: we may be running in child context, but the PID is not hashed
- * anymore so new counters will not be added.
- * (XXX not sure that is true when we get called from flush_old_exec.
- *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
@@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child)
 
 	local_irq_save(flags);
 	__perf_counter_task_sched_out(child_ctx);
+
+	/*
+	 * Take the context lock here so that if find_get_context is
+	 * reading child->perf_counter_ctxp, we wait until it has
+	 * incremented the context's refcount before we do put_ctx below.
+	 */
+	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
 	mutex_lock(&child_ctx->mutex);
@@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child)
 
 	__perf_counter_init_context(child_ctx, child);
 	child->perf_counter_ctxp = child_ctx;
+	get_task_struct(child);
 
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
-- 
cgit v0.10.2


From f49afbb572d5e08ae12f1a979dc2e41745040339 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
Date: Thu, 28 May 2009 16:41:36 +0300
Subject: MAINTAINER: Add F: entries for Gemini and FA526

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>

diff --git a/MAINTAINERS b/MAINTAINERS
index 5ee166e..42c53ab6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -624,6 +624,7 @@ M:	paulius.zaleckas@teltonika.lt
 L:	linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only)
 T:	git git://gitorious.org/linux-gemini/mainline.git
 S:	Maintained
+F:	arch/arm/mach-gemini/
 
 ARM/EBSA110 MACHINE SUPPORT
 P:	Russell King
@@ -650,6 +651,7 @@ P:	Paulius Zaleckas
 M:	paulius.zaleckas@teltonika.lt
 L:	linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only)
 S:	Maintained
+F:	arch/arm/mm/*-fa*
 
 ARM/FOOTBRIDGE ARCHITECTURE
 P:	Russell King
-- 
cgit v0.10.2


From 67a433ce278b98f47272726a22537fab7fd99de9 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
Date: Thu, 28 May 2009 16:42:25 +0300
Subject: Gemini: Fix SRAM/ROM location after memory swap

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>

diff --git a/arch/arm/mach-gemini/include/mach/hardware.h b/arch/arm/mach-gemini/include/mach/hardware.h
index de67526..213a4fc 100644
--- a/arch/arm/mach-gemini/include/mach/hardware.h
+++ b/arch/arm/mach-gemini/include/mach/hardware.h
@@ -15,10 +15,9 @@
 /*
  * Memory Map definitions
  */
-/* FIXME: Does it really swap SRAM like this? */
 #ifdef CONFIG_GEMINI_MEM_SWAP
 # define GEMINI_DRAM_BASE	0x00000000
-# define GEMINI_SRAM_BASE	0x20000000
+# define GEMINI_SRAM_BASE	0x70000000
 #else
 # define GEMINI_SRAM_BASE	0x00000000
 # define GEMINI_DRAM_BASE	0x10000000
-- 
cgit v0.10.2


From 07119a4df8c8c77d888f2f46964ea9512ea84ff8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: cifs: have cifs_NTtimeToUnix take a little-endian arg

...and just have the function call le64_to_cpu.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae0839..8831f64 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -90,7 +90,7 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
 						 struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15..0686684 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	/* BB need same check in cifs_create too? */
 	/* if not oplocked, invalidate inode pages if mtime or file
 	   size changed */
-	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 			   (file->f_path.dentry->d_inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6..42d6e0f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode,
 	__u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
 	__u64 end_of_file = le64_to_cpu(info->EndOfFile);
 
-	inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
+	inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
 	inode->i_mtime =
-		cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
-	inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
+		cifs_NTtimeToUnix(info->LastModificationTime);
+	inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
 	inode->i_mode = le64_to_cpu(info->Permissions);
 
 	/*
@@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode,
 
 	/* Linux can not store file creation time so ignore it */
 	if (pfindData->LastAccessTime)
-		inode->i_atime = cifs_NTtimeToUnix
-			(le64_to_cpu(pfindData->LastAccessTime));
+		inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
 	else /* do not need to use current_fs_time - time not stored */
 		inode->i_atime = CURRENT_TIME;
-	inode->i_mtime =
-		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
-	inode->i_ctime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+	inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
+	inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
 	cFYI(DBG2, ("Attributes came in as 0x%x", attr));
 	if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 		inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998..d3ba75e 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
 
 #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
 
-    /*
-     * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
-     * into Unix UTC (based 1970-01-01, in seconds).
-     */
+/*
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * into Unix UTC (based 1970-01-01, in seconds).
+ */
 struct timespec
-cifs_NTtimeToUnix(u64 ntutc)
+cifs_NTtimeToUnix(__le64 ntutc)
 {
 	struct timespec ts;
 	/* BB what about the timezone? BB */
@@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc)
 	/* Subtract the NTFS time offset, then convert to 1s intervals. */
 	u64 t;
 
-	t = ntutc - NTFS_TIME_OFFSET;
+	t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
 	ts.tv_nsec = do_div(t, 10000000) * 100;
 	ts.tv_sec = t;
 	return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097..79c46c2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -150,11 +150,11 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		allocation_size = le64_to_cpu(pfindData->AllocationSize);
 		end_of_file = le64_to_cpu(pfindData->EndOfFile);
 		tmp_inode->i_atime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+			cifs_NTtimeToUnix(pfindData->LastAccessTime);
 		tmp_inode->i_mtime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+			cifs_NTtimeToUnix(pfindData->LastWriteTime);
 		tmp_inode->i_ctime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+			cifs_NTtimeToUnix(pfindData->ChangeTime);
 	} else { /* legacy, OS2 and DOS style */
 /*		struct timespec ts;*/
 		FIND_FILE_STANDARD_INFO *pfindData =
@@ -331,11 +331,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 	local_size  = tmp_inode->i_size;
 
 	tmp_inode->i_atime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+	    cifs_NTtimeToUnix(pfindData->LastAccessTime);
 	tmp_inode->i_mtime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
+	    cifs_NTtimeToUnix(pfindData->LastModificationTime);
 	tmp_inode->i_ctime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
+	    cifs_NTtimeToUnix(pfindData->LastStatusChange);
 
 	tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
 	/* since we set the inode type below we need to mask off type
-- 
cgit v0.10.2


From c4a2c08db7d976c2e23a97da5d69ec7c9701034d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: cifs: make cnvrtDosUnixTm take a little-endian args and an offset

The callers primarily end up converting the args from le anyway. Also,
most of the callers end up needing to add an offset to the result. The
exception to these rules is cnvrtDosCifsTm, but there are no callers of
that function, so we might as well remove it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8831f64..d542cf1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,8 +92,8 @@ extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+				      int offset);
 
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 			   struct super_block *sb, int mode, int oflags,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aece2a8..b84c61d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			int val, seconds, remain, result;
 			struct timespec ts, utc;
 			utc = CURRENT_TIME;
-			ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
-						le16_to_cpu(rsp->SrvTime.Time));
+			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+					    rsp->SrvTime.Time, 0);
 			cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
 				(int)ts.tv_sec, (int)utc.tv_sec,
 				(int)(utc.tv_sec - ts.tv_sec)));
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d3ba75e..32d6baa 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t)
 static int total_days_of_prev_months[] =
 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
 
-
-__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
-{
-	return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
-}
-
-struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 {
 	struct timespec ts;
 	int sec, min, days, month, year;
+	u16 date = le16_to_cpu(le_date);
+	u16 time = le16_to_cpu(le_time);
 	SMB_TIME *st = (SMB_TIME *)&time;
 	SMB_DATE *sd = (SMB_DATE *)&date;
 
@@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
 		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
 	sec += 24 * 60 * 60 * days;
 
-	ts.tv_sec = sec;
+	ts.tv_sec = sec + offset;
 
 	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 79c46c2..86d0055 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
-static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode)
-{
-	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
-		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
-		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
-		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
-	}
-	return;
-}
-
-
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 			  char *buf, unsigned int *pobject_type, int isNewInode)
 {
@@ -156,20 +145,19 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		tmp_inode->i_ctime =
 			cifs_NTtimeToUnix(pfindData->ChangeTime);
 	} else { /* legacy, OS2 and DOS style */
-/*		struct timespec ts;*/
+		int offset = cifs_sb->tcon->ses->server->timeAdj;
 		FIND_FILE_STANDARD_INFO *pfindData =
 			(FIND_FILE_STANDARD_INFO *)buf;
 
-		tmp_inode->i_mtime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastWriteDate),
-				le16_to_cpu(pfindData->LastWriteTime));
-		tmp_inode->i_atime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastAccessDate),
-				le16_to_cpu(pfindData->LastAccessTime));
-		tmp_inode->i_ctime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastWriteDate),
-				le16_to_cpu(pfindData->LastWriteTime));
-		AdjustForTZ(cifs_sb->tcon, tmp_inode);
+		tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+						    pfindData->LastWriteTime,
+						    offset);
+		tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
+						    pfindData->LastAccessTime,
+						    offset);
+		tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+						    pfindData->LastWriteTime,
+						    offset);
 		attr = le16_to_cpu(pfindData->Attributes);
 		allocation_size = le32_to_cpu(pfindData->AllocationSize);
 		end_of_file = le32_to_cpu(pfindData->DataSize);
-- 
cgit v0.10.2


From bd433d4cf4d8593a5f1764776b91f1794fce5a77 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:34 -0400
Subject: cifs: rename cifs_iget to cifs_root_iget

The current cifs_iget isn't suitable for anything but the root inode.
Rename it with a more appropriate name.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d358..0a10a59 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
 #endif
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
-	inode = cifs_iget(sb, ROOT_I);
+	inode = cifs_root_iget(sb, ROOT_I);
 
 	if (IS_ERR(inode)) {
 		rc = PTR_ERR(inode);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71c..3b6a85c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
 extern int cifs_create(struct inode *, struct dentry *, int,
 		       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 42d6e0f..84b7bea 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -696,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 }
 
 /* gets root inode */
-struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
 	struct cifs_sb_info *cifs_sb;
-- 
cgit v0.10.2


From a0c9217f64ee3cd1e534966da8c5f05768e1ab09 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 15:40:47 -0400
Subject: cifs: make serverino the default when mounting

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f32c903..8ae563f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -835,6 +835,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->rw = true;
 	/* default is always to request posix paths. */
 	vol->posix_paths = 1;
+	/* default to using server inode numbers where available */
+	vol->server_ino = 1;
 
 	if (!options)
 		return 1;
-- 
cgit v0.10.2


From c5077ec42303e07c2c685b0f6cb8eee0f2c7751c Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 28 May 2009 15:09:04 +0000
Subject: [CIFS] Update readme to indicate change to default mount (serverino)

Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c406..227c681 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,9 @@
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client generated ones by default (mount option "serverino" turned
+on by default if server supports it).
+
 Version 1.58
 ------------
 Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +16,8 @@ we converted from).  Fix endianness of the vcnum field used during
 session setup to distinguish multiple mounts to same server from different
 userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
 flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
  
 Version 1.57
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb..6d1608f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -388,8 +388,13 @@ A partial list of the supported mount options follows:
 		or the CIFS Unix Extensions equivalent and for those
 		this mount option will have no effect.  Exporting cifs mounts
 		under nfsd requires this mount option on the cifs mount.
+		This is now the default if server supports the 
+		required network operation.
   noserverino   Client generates inode numbers (rather than using the actual one
-		from the server) by default.
+		from the server). These inode numbers will vary after
+		unmount or reboot which can confuse some applications,
+		but not all server filesystems support unique inode
+		numbers.
   setuids       If the CIFS Unix extensions are negotiated with the server
 		the client will attempt to set the effective uid and gid of
 		the local process on newly created files, directories, and
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 3b6a85c..9570a0e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.58"
+#define CIFS_VERSION   "1.59"
 #endif				/* _CIFSFS_H */
-- 
cgit v0.10.2


From ed888aef427365d19f887c271a3a906d16422d24 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 17:16:04 +0200
Subject: dma-debug: re-add dma memory leak detection

This is basically a revert of commit 314eeac9 but now in a
fixed version.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d..e47e1a0 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -105,6 +105,11 @@ static const char *type2name[4] = { "single", "page",
 static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
 				   "DMA_FROM_DEVICE", "DMA_NONE" };
 
+/* little merge helper - remove it after the merge window */
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
 /*
  * The access to some variables in this macro is racy. We can't use atomic_t
  * here because all these variables are exported to debugfs. Some of them even
@@ -458,9 +463,60 @@ out_err:
 	return -ENOMEM;
 }
 
+static int device_dma_allocations(struct device *dev)
+{
+	struct dma_debug_entry *entry;
+	unsigned long flags;
+	int count = 0, i;
+
+	for (i = 0; i < HASH_SIZE; ++i) {
+		spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+		list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+			if (entry->dev == dev)
+				count += 1;
+		}
+		spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+	}
+
+	return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb,
+				    unsigned long action, void *data)
+{
+	struct device *dev = data;
+	int count;
+
+
+	switch (action) {
+	case BUS_NOTIFY_UNBOUND_DRIVER:
+		count = device_dma_allocations(dev);
+		if (count == 0)
+			break;
+		err_printk(dev, NULL, "DMA-API: device driver has pending "
+				"DMA allocations while released from device "
+				"[count=%d]\n", count);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 void dma_debug_add_bus(struct bus_type *bus)
 {
-	/* FIXME: register notifier */
+	struct notifier_block *nb;
+
+	nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+	if (nb == NULL) {
+		printk(KERN_ERR "dma_debug_add_bus: out of memory\n");
+		return;
+	}
+
+	nb->notifier_call = dma_debug_device_change;
+
+	bus_register_notifier(bus, nb);
 }
 
 /*
-- 
cgit v0.10.2


From fefda117ddb324b872312f1f061230e627c9f5ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 12:21:42 +0200
Subject: amd-iommu: add amd_iommu_dump parameter

This kernel parameter will be useful to get some AMD IOMMU related
information in dmesg that is not necessary for the default user but may
be helpful in debug situations.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9..89dfb37 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,12 @@
 #define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...)					\
+	do {								\
+		if (amd_iommu_dump)						\
+			printk(KERN_INFO "AMD IOMMU: " format, ## arg);	\
+	} while(0);
 
 /*
  * This structure contains generic data for  IOMMU protection domains
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be09..57fb7a7 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,6 +115,8 @@ struct ivmd_header {
 	u64 range_length;
 } __attribute__((packed));
 
+bool amd_iommu_dump;
+
 static int __initdata amd_iommu_detected;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
@@ -1211,6 +1213,13 @@ void __init amd_iommu_detect(void)
  *
  ****************************************************************************/
 
+static int __init parse_amd_iommu_dump(char *str)
+{
+	amd_iommu_dump = true;
+
+	return 1;
+}
+
 static int __init parse_amd_iommu_options(char *str)
 {
 	for (; *str; ++str) {
@@ -1235,5 +1244,6 @@ static int __init parse_amd_iommu_size_options(char *str)
 	return 1;
 }
 
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
 __setup("amd_iommu=", parse_amd_iommu_options);
 __setup("amd_iommu_size=", parse_amd_iommu_size_options);
-- 
cgit v0.10.2


From 9c72041f719e2864d4208a89341c36b316dbf893 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 13:53:57 +0200
Subject: amd-iommu: add dump for iommus described in ivrs table

Add information about IOMMU devices described in the IVRS ACPI table to
the kernel log if amd_iommu_dump was specified on the kernel command
line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 57fb7a7..2816590 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -748,6 +748,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 		h = (struct ivhd_header *)p;
 		switch (*p) {
 		case ACPI_IVHD_TYPE:
+
+			DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+				    "seg: %d flags: %01x info %04x\n",
+				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
+				    PCI_FUNC(h->devid), h->cap_ptr,
+				    h->pci_seg, h->flags, h->info);
+			DUMP_printk("       mmio-addr: %016llx\n",
+				    h->mmio_phys);
+
 			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
 			if (iommu == NULL)
 				return -ENOMEM;
-- 
cgit v0.10.2


From 42a698f40a0946f5517308411b9e003ae031414d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 15:41:28 +0200
Subject: amd-iommu: print ivhd information to dmesg when requested

Add information about devices belonging to an IOMMU as described in the
IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the
kernel command line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2816590..fe3e645 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -598,32 +598,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	p += sizeof(struct ivhd_header);
 	end += h->length;
 
+
 	while (p < end) {
 		e = (struct ivhd_entry *)p;
 		switch (e->type) {
 		case IVHD_DEV_ALL:
+
+			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+				    " last device %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(iommu->first_device),
+				    PCI_SLOT(iommu->first_device),
+				    PCI_FUNC(iommu->first_device),
+				    PCI_BUS(iommu->last_device),
+				    PCI_SLOT(iommu->last_device),
+				    PCI_FUNC(iommu->last_device),
+				    e->flags);
+
 			for (dev_i = iommu->first_device;
 					dev_i <= iommu->last_device; ++dev_i)
 				set_dev_entry_from_acpi(iommu, dev_i,
 							e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT:
+
+			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
 			devid = e->devid;
 			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT_RANGE_START:
+
+			DUMP_printk("  DEV_SELECT_RANGE_START\t "
+				    "devid: %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = 0;
 			alias = false;
 			break;
 		case IVHD_DEV_ALIAS:
+
+			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
 			devid = e->devid;
 			devid_to = e->ext >> 8;
 			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
+
+			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+				    "devid: %02x:%02x.%x flags: %02x "
+				    "devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
 			devid_start = e->devid;
 			flags = e->flags;
 			devid_to = e->ext >> 8;
@@ -631,17 +682,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			alias = true;
 			break;
 		case IVHD_DEV_EXT_SELECT:
+
+			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+				    "flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
 			devid = e->devid;
 			set_dev_entry_from_acpi(iommu, devid, e->flags,
 						e->ext);
 			break;
 		case IVHD_DEV_EXT_SELECT_RANGE:
+
+			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+				    "%02x:%02x.%x flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = e->ext;
 			alias = false;
 			break;
 		case IVHD_DEV_RANGE_END:
+
+			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid));
+
 			devid = e->devid;
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
 				if (alias)
-- 
cgit v0.10.2


From 02acc43a294098c2a4cd22cf24e9c988644f9f7f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 16:24:21 +0200
Subject: amd-iommu: print ivmd information to dmesg when requested

Add information about device memory mapping requirements for the IOMMU
as described in the IVRS ACPI table to the kernel log if amd_iommu_dump
was specified on the kernel command line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index fe3e645..b90a78c 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -983,6 +983,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
 	struct unity_map_entry *e = 0;
+	char *s;
 
 	e = kzalloc(sizeof(*e), GFP_KERNEL);
 	if (e == NULL)
@@ -991,13 +992,16 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	switch (m->type) {
 	default:
 	case ACPI_IVMD_TYPE:
+		s = "IVMD_TYPEi\t\t\t";
 		e->devid_start = e->devid_end = m->devid;
 		break;
 	case ACPI_IVMD_TYPE_ALL:
+		s = "IVMD_TYPE_ALL\t\t";
 		e->devid_start = 0;
 		e->devid_end = amd_iommu_last_bdf;
 		break;
 	case ACPI_IVMD_TYPE_RANGE:
+		s = "IVMD_TYPE_RANGE\t\t";
 		e->devid_start = m->devid;
 		e->devid_end = m->aux;
 		break;
@@ -1006,6 +1010,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
 	e->prot = m->flags >> 1;
 
+	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
+		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+		    e->address_start, e->address_end, m->flags);
+
 	list_add_tail(&e->list, &amd_iommu_unity_map);
 
 	return 0;
-- 
cgit v0.10.2


From b3b99ef8b4f80f3f093a72110e7697c2281ae45d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:02:48 +0200
Subject: amd-iommu: move protection domain printk to dump code

This information is only helpful for debugging. Don't print it anymore
unless explicitly requested.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99..3356599 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1009,8 +1009,9 @@ static int device_change_notifier(struct notifier_block *nb,
 		if (!dma_domain)
 			dma_domain = iommu->default_dom;
 		attach_device(iommu, &dma_domain->domain, devid);
-		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-		       "device %s\n", dma_domain->domain.id, dev_name(dev));
+		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
+			    "%d for device %s\n",
+			    dma_domain->domain.id, dev_name(dev));
 		break;
 	case BUS_NOTIFY_UNBIND_DRIVER:
 		if (!domain)
@@ -1133,8 +1134,9 @@ static int get_device_resources(struct device *dev,
 			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		attach_device(*iommu, *domain, *bdf);
-		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-				"device %s\n", (*domain)->id, dev_name(dev));
+		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
+				"%d for device %s\n",
+				(*domain)->id, dev_name(dev));
 	}
 
 	if (domain_for_device(_bdf) == NULL)
-- 
cgit v0.10.2


From 2be69c79e9a46a554fc3ff57d886e65e7a73eb72 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:15:49 +0200
Subject: x86/iommu: add IOMMU_STRESS Kconfig entry

This Kconfig option is intended to enable various code paths or
parameters in IOMMU implementations to stress test the code and/or the
hardware. This can also be done by disabling optimizations in the code
when this option is switched on.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 5865712..33fac6b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,6 +159,14 @@ config IOMMU_DEBUG
 	  options. See Documentation/x86_64/boot-options.txt for more
 	  details.
 
+config IOMMU_STRESS
+	bool "Enable IOMMU stress-test mode"
+	---help---
+	  This option disables various optimizations in IOMMU related
+	  code to do real stress testing of the IOMMU code. This option
+	  will cause a performance drop and should only be enabled for
+	  testing.
+
 config IOMMU_LEAK
 	bool "IOMMU leak tracing"
 	depends on IOMMU_DEBUG && DMA_API_DEBUG
-- 
cgit v0.10.2


From 2e8b569614b89c9b1b85cba37db36daeeeff744e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:44:03 +0200
Subject: amd-iommu: disable device isolation with CONFIG_IOMMU_STRESS

With device isolation disabled we can test better for race conditions in
dma_ops related code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b90a78c..6694112 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -124,8 +124,14 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
 unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
 bool amd_iommu_isolate = true;		/* if true, device isolation is
 					   enabled */
+#endif
+
 bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
 
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
-- 
cgit v0.10.2


From 421f909c803d1c397f6c66b75653f238696c39ee Mon Sep 17 00:00:00 2001
From: Neil Turton <nturton@solarflare.com>
Date: Thu, 14 May 2009 14:00:35 +0100
Subject: amd-iommu: fix an off-by-one error in the AMD IOMMU driver.

The variable amd_iommu_last_bdf holds the maximum bdf of any device
controlled by an IOMMU, so the number of device entries needed is
amd_iommu_last_bdf+1.  The function tbl_size used amd_iommu_last_bdf
instead.  This would be a problem if the last device were a large
enough power of 2.

[ Impact: fix amd_iommu_last_bdf off-by-one error ]

Signed-off-by: Neil Turton <nturton@solarflare.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be09..35fc965 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -175,7 +175,7 @@ static inline void update_last_devid(u16 devid)
 static inline unsigned long tbl_size(int entry_size)
 {
 	unsigned shift = PAGE_SHIFT +
-			 get_order(amd_iommu_last_bdf * entry_size);
+			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
 
 	return 1UL << shift;
 }
-- 
cgit v0.10.2


From 7455aab1f95f6464c5af3fbdee28744e73f38564 Mon Sep 17 00:00:00 2001
From: Neil Turton <nturton@solarflare.com>
Date: Thu, 14 May 2009 14:08:11 +0100
Subject: amd-iommu: fix the handling of device aliases in the AMD IOMMU
 driver.

The devid parameter to set_dev_entry_from_acpi is the requester ID
rather than the device ID since it is used to index the IOMMU device
table.  The handling of IVHD_DEV_ALIAS used to pass the device ID.
This patch fixes it to pass the requester ID.

[ Impact: fix setting the wrong req-id in acpi-table parsing ]

Signed-off-by: Neil Turton <nturton@solarflare.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 35fc965..53f93db 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -618,7 +618,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 		case IVHD_DEV_ALIAS:
 			devid = e->devid;
 			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
-- 
cgit v0.10.2


From 0bc252f430d6a3ac7836d40f00d0ae020593b11b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:48:05 +0200
Subject: amd-iommu: make sure only ivmd entries are parsed

The bug never triggered. But it should be fixed to protect against
broken ACPI tables in the future.

[ Impact: protect against broken ivrs acpi table ]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 53f93db..a3a2b98 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -906,6 +906,8 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 
 	switch (m->type) {
 	default:
+		kfree(e);
+		return 0;
 	case ACPI_IVMD_TYPE:
 		e->devid_start = e->devid_end = m->devid;
 		break;
-- 
cgit v0.10.2


From c1eee67b2d8464781f5868a34168df61e40e85a6 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Thu, 21 May 2009 00:56:58 -0700
Subject: amd iommu: properly detach from protection domain on ->remove

Some drivers may use the dma api during ->remove which will
cause a protection domain to get reattached to a device.  Delay the
detach until after the driver is completely unbound.

[ joro: added a little merge helper ]

[ Impact: fix too early device<->domain removal ]

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99..d689883 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -57,6 +57,10 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 static struct dma_ops_domain *find_protection_domain(u16 devid);
 
 
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
 #ifdef CONFIG_AMD_IOMMU_STATS
 
 /*
@@ -1012,7 +1016,7 @@ static int device_change_notifier(struct notifier_block *nb,
 		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
 		       "device %s\n", dma_domain->domain.id, dev_name(dev));
 		break;
-	case BUS_NOTIFY_UNBIND_DRIVER:
+	case BUS_NOTIFY_UNBOUND_DRIVER:
 		if (!domain)
 			goto out;
 		detach_device(domain, devid);
-- 
cgit v0.10.2


From 3bd221724adb9d642270df0e78b0105fb61e4a1c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 15:06:20 +0200
Subject: amd-iommu: introduce for_each_iommu* macros

This patch introduces the for_each_iommu and for_each_iommu_safe macros
to simplify the developers life when having to iterate over all AMD
IOMMUs in the system.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9..cf5ef17 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -196,6 +196,14 @@
 					      domain for an IOMMU */
 
 /*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+	list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+/*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
  */
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99..d9e9dc1 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -213,7 +213,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
 {
 	struct amd_iommu *iommu;
 
-	list_for_each_entry(iommu, &amd_iommu_list, list)
+	for_each_iommu(iommu)
 		iommu_poll_events(iommu);
 
 	return IRQ_HANDLED;
@@ -440,7 +440,7 @@ static void iommu_flush_domain(u16 domid)
 	__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
 				      domid, 1, 1);
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		spin_lock_irqsave(&iommu->lock, flags);
 		__iommu_queue_command(iommu, &cmd);
 		__iommu_completion_wait(iommu);
@@ -1672,7 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
 	 * found in the system. Devices not assigned to any other
 	 * protection domain will be assigned to the default one.
 	 */
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
 			return -ENOMEM;
@@ -1710,7 +1710,7 @@ int __init amd_iommu_init_dma_ops(void)
 
 free_domains:
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		if (iommu->default_dom)
 			dma_ops_domain_free(iommu->default_dom);
 	}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be09..675a4b6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -679,7 +679,7 @@ static void __init free_iommu_all(void)
 {
 	struct amd_iommu *iommu, *next;
 
-	list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+	for_each_iommu_safe(iommu, next) {
 		list_del(&iommu->list);
 		free_iommu_one(iommu);
 		kfree(iommu);
@@ -779,7 +779,7 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu)
 	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
 	int nvec = 0, i;
 
-	list_for_each_entry(curr, &amd_iommu_list, list) {
+	for_each_iommu(curr) {
 		if (curr->dev == iommu->dev) {
 			entries[nvec].entry = curr->evt_msi_num;
 			entries[nvec].vector = 0;
@@ -818,7 +818,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 	int r;
 	struct amd_iommu *curr;
 
-	list_for_each_entry(curr, &amd_iommu_list, list) {
+	for_each_iommu(curr) {
 		if (curr->dev == iommu->dev)
 			curr->int_enabled = true;
 	}
@@ -971,7 +971,7 @@ static void __init enable_iommus(void)
 {
 	struct amd_iommu *iommu;
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		iommu_set_exclusion_range(iommu);
 		iommu_init_msi(iommu);
 		iommu_enable_event_logging(iommu);
-- 
cgit v0.10.2


From 58492e128892e3b55f1a6ef0cf3c3ab4ce7cc214 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:41:16 +0200
Subject: amd-iommu: consolidate hardware initialization to one function

This patch restructures the AMD IOMMU initialization code to initialize
all hardware registers with one single function call.
This is helpful for suspend/resume support.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 675a4b6..74f4f1f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,13 +252,6 @@ static void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
-{
-	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-}
-
 /*
  * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
  * the system has one.
@@ -413,25 +406,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
 	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 			get_order(CMD_BUFFER_SIZE));
-	u64 entry;
 
 	if (cmd_buf == NULL)
 		return NULL;
 
 	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
-	entry = (u64)virt_to_phys(cmd_buf);
+	return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->cmd_buf == NULL);
+
+	entry = (u64)virt_to_phys(iommu->cmd_buf);
 	entry |= MMIO_CMD_SIZE_512;
+
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-			&entry, sizeof(entry));
+		    &entry, sizeof(entry));
 
 	/* set head and tail to zero manually */
 	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
 	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 
 	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
-	return cmd_buf;
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +447,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
 /* allocates the memory where the IOMMU will log its events to */
 static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
 {
-	u64 entry;
 	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 						get_order(EVT_BUFFER_SIZE));
 
 	if (iommu->evt_buf == NULL)
 		return NULL;
 
+	return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->evt_buf == NULL);
+
 	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
 	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
 		    &entry, sizeof(entry));
 
-	iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-	return iommu->evt_buf;
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
 }
 
 static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -710,7 +721,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	if (!iommu->mmio_base)
 		return -ENOMEM;
 
-	iommu_set_device_table(iommu);
 	iommu->cmd_buf = alloc_command_buffer(iommu);
 	if (!iommu->cmd_buf)
 		return -ENOMEM;
@@ -837,6 +847,8 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 		return 1;
 	}
 
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
 	return 0;
 }
 
@@ -972,9 +984,11 @@ static void __init enable_iommus(void)
 	struct amd_iommu *iommu;
 
 	for_each_iommu(iommu) {
+		iommu_set_device_table(iommu);
+		iommu_enable_command_buffer(iommu);
+		iommu_enable_event_buffer(iommu);
 		iommu_set_exclusion_range(iommu);
 		iommu_init_msi(iommu);
-		iommu_enable_event_logging(iommu);
 		iommu_enable(iommu);
 	}
 }
-- 
cgit v0.10.2


From fab6afa30954a0684ef8ac1d9a606e74a6215ab6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:46:34 +0200
Subject: amd-iommu: drop pointless iommu-loop in msi setup code

It is not necessary to loop again over all IOMMUs in this code. So drop
the loop.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 74f4f1f..cc99f60 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -826,13 +826,6 @@ out_free:
 static int __init iommu_setup_msi(struct amd_iommu *iommu)
 {
 	int r;
-	struct amd_iommu *curr;
-
-	for_each_iommu(curr) {
-		if (curr->dev == iommu->dev)
-			curr->int_enabled = true;
-	}
-
 
 	if (pci_enable_msi(iommu->dev))
 		return 1;
@@ -847,6 +840,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 		return 1;
 	}
 
+	iommu->int_enabled = true;
 	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
 
 	return 0;
-- 
cgit v0.10.2


From d91cecdd796c27df46339e80ed436a980c56fcad Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:51:00 +0200
Subject: amd-iommu: remove support for msi-x

Current hardware uses msi instead of msi-x so this code it not necessary
and can not be tested. The best thing is to drop this code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index cc99f60..feee475 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -783,46 +783,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
  *
  ****************************************************************************/
 
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
-	struct amd_iommu *curr;
-	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
-	int nvec = 0, i;
-
-	for_each_iommu(curr) {
-		if (curr->dev == iommu->dev) {
-			entries[nvec].entry = curr->evt_msi_num;
-			entries[nvec].vector = 0;
-			curr->int_enabled = true;
-			nvec++;
-		}
-	}
-
-	if (pci_enable_msix(iommu->dev, entries, nvec)) {
-		pci_disable_msix(iommu->dev);
-		return 1;
-	}
-
-	for (i = 0; i < nvec; ++i) {
-		int r = request_irq(entries->vector, amd_iommu_int_handler,
-				    IRQF_SAMPLE_RANDOM,
-				    "AMD IOMMU",
-				    NULL);
-		if (r)
-			goto out_free;
-	}
-
-	return 0;
-
-out_free:
-	for (i -= 1; i >= 0; --i)
-		free_irq(entries->vector, NULL);
-
-	pci_disable_msix(iommu->dev);
-
-	return 1;
-}
-
 static int __init iommu_setup_msi(struct amd_iommu *iommu)
 {
 	int r;
@@ -851,9 +811,7 @@ static int __init iommu_init_msi(struct amd_iommu *iommu)
 	if (iommu->int_enabled)
 		return 0;
 
-	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
-		return iommu_setup_msix(iommu);
-	else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
 		return iommu_setup_msi(iommu);
 
 	return 1;
-- 
cgit v0.10.2


From 92ac4320af6ed4294c2c221dd4ccbfd9026a3aa7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 19:06:27 +0200
Subject: amd-iommu: add function to disable all iommus

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index feee475..ed10c0f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,6 +252,11 @@ static void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
+static void iommu_disable(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+}
+
 /*
  * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
  * the system has one.
@@ -945,6 +950,14 @@ static void __init enable_iommus(void)
 	}
 }
 
+static void disable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_disable(iommu);
+}
+
 /*
  * Suspend/Resume support
  * disable suspend until real resume implemented
-- 
cgit v0.10.2


From bfd1be1857e5a3385bf146e02e6dc3dd4241bec1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 5 May 2009 15:33:57 +0200
Subject: amd-iommu: add function to flush tlb for all domains

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344..1750e1f 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,7 @@ extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d9e9dc1..826ad079 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -449,6 +449,17 @@ static void iommu_flush_domain(u16 domid)
 	}
 }
 
+void amd_iommu_flush_all_domains(void)
+{
+	int i;
+
+	for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+		if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+			continue;
+		iommu_flush_domain(i);
+	}
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
cgit v0.10.2


From 7d7a110c6127b7fc683dc6d764555f2dbd22b054 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 5 May 2009 15:48:10 +0200
Subject: amd-iommu: add function to flush tlb for all devices

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 1750e1f..262e028 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -28,6 +28,7 @@ extern int amd_iommu_init_dma_ops(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
 extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 826ad079..92b0e18 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -460,6 +460,24 @@ void amd_iommu_flush_all_domains(void)
 	}
 }
 
+void amd_iommu_flush_all_devices(void)
+{
+	struct amd_iommu *iommu;
+	int i;
+
+	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+		if (amd_iommu_pd_table[i] == NULL)
+			continue;
+
+		iommu = amd_iommu_rlookup_table[i];
+		if (!iommu)
+			continue;
+
+		iommu_queue_inv_dev_entry(iommu, i);
+		iommu_completion_wait(iommu);
+	}
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
cgit v0.10.2


From 05f92db9f47f852ff48bbed1b063b8ab8ad00285 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 09:52:46 +0200
Subject: amd_iommu: un __init functions required for suspend/resume

This patch makes sure that no function required for suspend/resume of
AMD IOMMU driver is thrown away after boot.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ed10c0f..330896b 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -193,7 +193,7 @@ static inline unsigned long tbl_size(int entry_size)
  * This function set the exclusion range in the IOMMU. DMA accesses to the
  * exclusion range are passed through untranslated
  */
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
 	u64 start = iommu->exclusion_start & PAGE_MASK;
 	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +225,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
 }
 
 /* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
 	u32 ctrl;
 
@@ -244,7 +244,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 }
 
 /* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
 {
 	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
 	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -811,7 +811,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 	return 0;
 }
 
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
 {
 	if (iommu->int_enabled)
 		return 0;
@@ -936,7 +936,7 @@ static void init_device_table(void)
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
  */
-static void __init enable_iommus(void)
+static void enable_iommus(void)
 {
 	struct amd_iommu *iommu;
 
-- 
cgit v0.10.2


From 736501ee000757082a4f0832826ae1eda7ea106e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 09:56:12 +0200
Subject: amd-iommu: implement suspend/resume

This patch puts everything together and enables suspend/resume support
in the AMD IOMMU driver.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 330896b..4ca8fbf 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -965,12 +965,31 @@ static void disable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
+	/*
+	 * Disable IOMMUs before reprogramming the hardware registers.
+	 * IOMMU is still enabled from the resume kernel.
+	 */
+	disable_iommus();
+
+	/* re-load the hardware */
+	enable_iommus();
+
+	/*
+	 * we have to flush after the IOMMUs are enabled because a
+	 * disabled IOMMU will never execute the commands we send
+	 */
+	amd_iommu_flush_all_domains();
+	amd_iommu_flush_all_devices();
+
 	return 0;
 }
 
 static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
 {
-	return -EINVAL;
+	/* disable IOMMUs to go out of the way for BIOS */
+	disable_iommus();
+
+	return 0;
 }
 
 static struct sysdev_class amd_iommu_sysdev_class = {
-- 
cgit v0.10.2


From c3239567a20e90e3026ac5453d5267506ef7b030 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 10:56:44 +0200
Subject: amd-iommu: introduce aperture_range structure

This is a preperation for extended address allocator.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9..4c64c9b 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -195,6 +195,8 @@
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
 
+#define APERTURE_RANGE_SIZE	(128 * 1024 * 1024)
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
@@ -210,6 +212,24 @@ struct protection_domain {
 };
 
 /*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+	/* address allocation bitmap */
+	unsigned long *bitmap;
+
+	/*
+	 * Array of PTE pages for the aperture. In this array we save all the
+	 * leaf pages of the domain page table used for the aperture. This way
+	 * we don't need to walk the page table to find a specific PTE. We can
+	 * just calculate its address in constant time.
+	 */
+	u64 *pte_pages[64];
+};
+
+/*
  * Data container for a dma_ops specific protection domain
  */
 struct dma_ops_domain {
@@ -224,16 +244,8 @@ struct dma_ops_domain {
 	/* address we start to search for free addresses */
 	unsigned long next_bit;
 
-	/* address allocation bitmap */
-	unsigned long *bitmap;
-
-	/*
-	 * Array of PTE pages for the aperture. In this array we save all the
-	 * leaf pages of the domain page table used for the aperture. This way
-	 * we don't need to walk the page table to find a specific PTE. We can
-	 * just calculate its address in constant time.
-	 */
-	u64 **pte_pages;
+	/* address space relevant data */
+	struct aperture_range aperture;
 
 	/* This will be set to true when TLB needs to be flushed */
 	bool need_flush;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99..62acd09 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -595,7 +595,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 		 * as allocated in the aperture
 		 */
 		if (addr < dma_dom->aperture_size)
-			__set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+			__set_bit(addr >> PAGE_SHIFT,
+				  dma_dom->aperture.bitmap);
 	}
 
 	return 0;
@@ -656,11 +657,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 		dom->need_flush = true;
 	}
 
-	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-				   0 , boundary_size, align_mask);
+	address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit,
+				   pages, 0 , boundary_size, align_mask);
 	if (address == -1) {
-		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-				0, boundary_size, align_mask);
+		address = iommu_area_alloc(dom->aperture.bitmap, limit, 0,
+					   pages, 0, boundary_size,
+					   align_mask);
 		dom->need_flush = true;
 	}
 
@@ -685,7 +687,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned int pages)
 {
 	address >>= PAGE_SHIFT;
-	iommu_area_free(dom->bitmap, address, pages);
+	iommu_area_free(dom->aperture.bitmap, address, pages);
 
 	if (address >= dom->next_bit)
 		dom->need_flush = true;
@@ -741,7 +743,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	iommu_area_reserve(dom->bitmap, start_page, pages);
+	iommu_area_reserve(dom->aperture.bitmap, start_page, pages);
 }
 
 static void free_pagetable(struct protection_domain *domain)
@@ -785,9 +787,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 
 	free_pagetable(&dom->domain);
 
-	kfree(dom->pte_pages);
-
-	kfree(dom->bitmap);
+	free_page((unsigned long)dom->aperture.bitmap);
 
 	kfree(dom);
 }
@@ -826,16 +826,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.priv = dma_dom;
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
-	dma_dom->aperture_size = (1ULL << order);
-	dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
-				  GFP_KERNEL);
-	if (!dma_dom->bitmap)
+	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
+	dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!dma_dom->aperture.bitmap)
 		goto free_dma_dom;
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
 	 */
-	dma_dom->bitmap[0] = 1;
+	dma_dom->aperture.bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
 	dma_dom->need_flush = false;
@@ -854,13 +853,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	/*
 	 * At the last step, build the page tables so we don't need to
 	 * allocate page table pages in the dma_ops mapping/unmapping
-	 * path.
+	 * path for the first 128MB of dma address space.
 	 */
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
-			GFP_KERNEL);
-	if (!dma_dom->pte_pages)
-		goto free_dma_dom;
 
 	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
 	if (l2_pde == NULL)
@@ -869,10 +864,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
 
 	for (i = 0; i < num_pte_pages; ++i) {
-		dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!dma_dom->pte_pages[i])
+		u64 **pte_page = &dma_dom->aperture.pte_pages[i];
+		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!*pte_page)
 			goto free_dma_dom;
-		address = virt_to_phys(dma_dom->pte_pages[i]);
+		address = virt_to_phys(*pte_page);
 		l2_pde[i] = IOMMU_L1_PDE(address);
 	}
 
@@ -1159,7 +1155,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 
 	paddr &= PAGE_MASK;
 
-	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1192,7 +1188,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 
 	WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
 
-	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	WARN_ON(!*pte);
-- 
cgit v0.10.2


From 8bda3092bcfa68f786d94549ae026e8db1eff041 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 12:02:46 +0200
Subject: amd-iommu: move page table allocation code to seperate function

This patch makes page table allocation usable for dma_ops code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 62acd09..ded79f7 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,9 @@ struct iommu_cmd {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
-
+static u64* alloc_pte(struct protection_domain *dom,
+		      unsigned long address, u64
+		      **pte_page, gfp_t gfp);
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -468,7 +470,7 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long phys_addr,
 			  int prot)
 {
-	u64 __pte, *pte, *page;
+	u64 __pte, *pte;
 
 	bus_addr  = PAGE_ALIGN(bus_addr);
 	phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +479,7 @@ static int iommu_map_page(struct protection_domain *dom,
 	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-		*pte = IOMMU_L2_PDE(virt_to_phys(page));
-	}
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-		*pte = IOMMU_L1_PDE(virt_to_phys(page));
-	}
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+	pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
 
 	if (IOMMU_PTE_PRESENT(*pte))
 		return -EBUSY;
@@ -1140,6 +1122,61 @@ static int get_device_resources(struct device *dev,
 }
 
 /*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+		      unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+	u64 *pte, *page;
+
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(gfp);
+		if (!page)
+			return NULL;
+		*pte = IOMMU_L2_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(gfp);
+		if (!page)
+			return NULL;
+		*pte = IOMMU_L1_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+
+	if (pte_page)
+		*pte_page = pte;
+
+	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+	return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+			    unsigned long address)
+{
+	struct aperture_range *aperture = &dom->aperture;
+	u64 *pte, *pte_page;
+
+	pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	if (!pte) {
+		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+		aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page;
+	}
+
+	return pte;
+}
+
+/*
  * This is the generic map function. It maps one 4kb page at paddr to
  * the given address in the DMA address space for the domain.
  */
@@ -1155,8 +1192,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 
 	paddr &= PAGE_MASK;
 
-	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
-	pte += IOMMU_PTE_L0_INDEX(address);
+	pte  = dma_ops_get_pte(dom, address);
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-- 
cgit v0.10.2


From 53812c115cda1f660b286c939669154a56976f6b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 12:17:38 +0200
Subject: amd-iommu: handle page table allocation failures in dma_ops code

The code will be required when the aperture size increases dynamically
in the extended address allocator.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index ded79f7..a467add 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1193,6 +1193,8 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 	paddr &= PAGE_MASK;
 
 	pte  = dma_ops_get_pte(dom, address);
+	if (!pte)
+		return bad_dma_address;
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
@@ -1248,7 +1250,7 @@ static dma_addr_t __map_single(struct device *dev,
 			       u64 dma_mask)
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
-	dma_addr_t address, start;
+	dma_addr_t address, start, ret;
 	unsigned int pages;
 	unsigned long align_mask = 0;
 	int i;
@@ -1271,7 +1273,10 @@ static dma_addr_t __map_single(struct device *dev,
 
 	start = address;
 	for (i = 0; i < pages; ++i) {
-		dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		if (ret == bad_dma_address)
+			goto out_unmap;
+
 		paddr += PAGE_SIZE;
 		start += PAGE_SIZE;
 	}
@@ -1287,6 +1292,17 @@ static dma_addr_t __map_single(struct device *dev,
 
 out:
 	return address;
+
+out_unmap:
+
+	for (--i; i >= 0; --i) {
+		start -= PAGE_SIZE;
+		dma_ops_domain_unmap(iommu, dma_dom, start);
+	}
+
+	dma_ops_free_addresses(dma_dom, address, pages);
+
+	return bad_dma_address;
 }
 
 /*
-- 
cgit v0.10.2


From 384de72910a7bf96a02a6d8023fe9e16d872beb2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 May 2009 12:30:05 +0200
Subject: amd-iommu: make address allocator aware of multiple aperture ranges

This patch changes the AMD IOMMU address allocator to allow up to 32
aperture ranges per dma_ops domain.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 4c64c9b..eca9129 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -195,7 +195,12 @@
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
 
-#define APERTURE_RANGE_SIZE	(128 * 1024 * 1024)
+#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
+#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -227,6 +232,8 @@ struct aperture_range {
 	 * just calculate its address in constant time.
 	 */
 	u64 *pte_pages[64];
+
+	unsigned long offset;
 };
 
 /*
@@ -245,7 +252,7 @@ struct dma_ops_domain {
 	unsigned long next_bit;
 
 	/* address space relevant data */
-	struct aperture_range aperture;
+	struct aperture_range *aperture[APERTURE_MAX_RANGES];
 
 	/* This will be set to true when TLB needs to be flushed */
 	bool need_flush;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a467add..794163a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -578,7 +578,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 		 */
 		if (addr < dma_dom->aperture_size)
 			__set_bit(addr >> PAGE_SHIFT,
-				  dma_dom->aperture.bitmap);
+				  dma_dom->aperture[0]->bitmap);
 	}
 
 	return 0;
@@ -615,43 +615,74 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  ****************************************************************************/
 
 /*
- * The address allocator core function.
+ * The address allocator core functions.
  *
  * called with domain->lock held
  */
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+					struct dma_ops_domain *dom,
+					unsigned int pages,
+					unsigned long align_mask,
+					u64 dma_mask,
+					unsigned long start)
+{
+	unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES;
+	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+	int i = start >> APERTURE_RANGE_SHIFT;
+	unsigned long boundary_size;
+	unsigned long address = -1;
+	unsigned long limit;
+
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			PAGE_SIZE) >> PAGE_SHIFT;
+
+	for (;i < max_index; ++i) {
+		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+		if (dom->aperture[i]->offset >= dma_mask)
+			break;
+
+		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+					       dma_mask >> PAGE_SHIFT);
+
+		address = iommu_area_alloc(dom->aperture[i]->bitmap,
+					   limit, next_bit, pages, 0,
+					    boundary_size, align_mask);
+		if (address != -1) {
+			address = dom->aperture[i]->offset +
+				  (address << PAGE_SHIFT);
+			dom->next_bit = (address >> PAGE_SHIFT) + pages;
+			break;
+		}
+
+		next_bit = 0;
+	}
+
+	return address;
+}
+
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
 					     unsigned int pages,
 					     unsigned long align_mask,
 					     u64 dma_mask)
 {
-	unsigned long limit;
 	unsigned long address;
-	unsigned long boundary_size;
+	unsigned long start = dom->next_bit << PAGE_SHIFT;
 
-	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-			PAGE_SIZE) >> PAGE_SHIFT;
-	limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
-				       dma_mask >> PAGE_SHIFT);
 
-	if (dom->next_bit >= limit) {
-		dom->next_bit = 0;
-		dom->need_flush = true;
-	}
+	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+				     dma_mask, start);
 
-	address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit,
-				   pages, 0 , boundary_size, align_mask);
 	if (address == -1) {
-		address = iommu_area_alloc(dom->aperture.bitmap, limit, 0,
-					   pages, 0, boundary_size,
-					   align_mask);
+		dom->next_bit = 0;
+		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+					     dma_mask, 0);
 		dom->need_flush = true;
 	}
 
-	if (likely(address != -1)) {
-		dom->next_bit = address + pages;
-		address <<= PAGE_SHIFT;
-	} else
+	if (unlikely(address == -1))
 		address = bad_dma_address;
 
 	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -668,11 +699,17 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned long address,
 				   unsigned int pages)
 {
-	address >>= PAGE_SHIFT;
-	iommu_area_free(dom->aperture.bitmap, address, pages);
+	unsigned i = address >> APERTURE_RANGE_SHIFT;
+	struct aperture_range *range = dom->aperture[i];
+
+	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
-	if (address >= dom->next_bit)
+	if ((address >> PAGE_SHIFT) >= dom->next_bit)
 		dom->need_flush = true;
+
+	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+	iommu_area_free(range->bitmap, address, pages);
+
 }
 
 /****************************************************************************
@@ -720,12 +757,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages)
 {
-	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
 
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	iommu_area_reserve(dom->aperture.bitmap, start_page, pages);
+	for (i = start_page; i < start_page + pages; ++i) {
+		int index = i / APERTURE_RANGE_PAGES;
+		int page  = i % APERTURE_RANGE_PAGES;
+		__set_bit(page, dom->aperture[index]->bitmap);
+	}
 }
 
 static void free_pagetable(struct protection_domain *domain)
@@ -764,12 +805,19 @@ static void free_pagetable(struct protection_domain *domain)
  */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
+	int i;
+
 	if (!dom)
 		return;
 
 	free_pagetable(&dom->domain);
 
-	free_page((unsigned long)dom->aperture.bitmap);
+	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+		if (!dom->aperture[i])
+			continue;
+		free_page((unsigned long)dom->aperture[i]->bitmap);
+		kfree(dom->aperture[i]);
+	}
 
 	kfree(dom);
 }
@@ -797,6 +845,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom)
 		return NULL;
 
+	dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range),
+				       GFP_KERNEL);
+	if (!dma_dom->aperture[0])
+		goto free_dma_dom;
+
 	spin_lock_init(&dma_dom->domain.lock);
 
 	dma_dom->domain.id = domain_id_alloc();
@@ -809,14 +862,14 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
 	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
-	dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!dma_dom->aperture.bitmap)
+	dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!dma_dom->aperture[0]->bitmap)
 		goto free_dma_dom;
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
 	 */
-	dma_dom->aperture.bitmap[0] = 1;
+	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
 	dma_dom->need_flush = false;
@@ -846,7 +899,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
 
 	for (i = 0; i < num_pte_pages; ++i) {
-		u64 **pte_page = &dma_dom->aperture.pte_pages[i];
+		u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i];
 		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
 		if (!*pte_page)
 			goto free_dma_dom;
@@ -1164,14 +1217,19 @@ static u64* alloc_pte(struct protection_domain *dom,
 static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 			    unsigned long address)
 {
-	struct aperture_range *aperture = &dom->aperture;
+	struct aperture_range *aperture;
 	u64 *pte, *pte_page;
 
-	pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+	if (!aperture)
+		return NULL;
+
+	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
 		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
-		aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page;
-	}
+		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+	} else
+		pte += IOMMU_PTE_L0_INDEX(address);
 
 	return pte;
 }
@@ -1219,14 +1277,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 				 struct dma_ops_domain *dom,
 				 unsigned long address)
 {
+	struct aperture_range *aperture;
 	u64 *pte;
 
 	if (address >= dom->aperture_size)
 		return;
 
-	WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+	if (!aperture)
+		return;
+
+	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+	if (!pte)
+		return;
 
-	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	WARN_ON(!*pte);
-- 
cgit v0.10.2


From 803b8cb4d9a93b90c67aba2aab7f2c54d595b5b9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 18 May 2009 15:32:48 +0200
Subject: amd-iommu: change dma_dom->next_bit to dma_dom->next_address

Simplify the code a little bit by using the same unit for all address
space related state in the dma_ops domain structure.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index eca9129..4ff4cf1 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -249,7 +249,7 @@ struct dma_ops_domain {
 	unsigned long aperture_size;
 
 	/* address we start to search for free addresses */
-	unsigned long next_bit;
+	unsigned long next_address;
 
 	/* address space relevant data */
 	struct aperture_range *aperture[APERTURE_MAX_RANGES];
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 794163a..c1a08b9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -627,13 +627,15 @@ static unsigned long dma_ops_area_alloc(struct device *dev,
 					u64 dma_mask,
 					unsigned long start)
 {
-	unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES;
+	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
 	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
 	int i = start >> APERTURE_RANGE_SHIFT;
 	unsigned long boundary_size;
 	unsigned long address = -1;
 	unsigned long limit;
 
+	next_bit >>= PAGE_SHIFT;
+
 	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 			PAGE_SIZE) >> PAGE_SHIFT;
 
@@ -652,7 +654,7 @@ static unsigned long dma_ops_area_alloc(struct device *dev,
 		if (address != -1) {
 			address = dom->aperture[i]->offset +
 				  (address << PAGE_SHIFT);
-			dom->next_bit = (address >> PAGE_SHIFT) + pages;
+			dom->next_address = address + (pages << PAGE_SHIFT);
 			break;
 		}
 
@@ -669,14 +671,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     u64 dma_mask)
 {
 	unsigned long address;
-	unsigned long start = dom->next_bit << PAGE_SHIFT;
-
 
 	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				     dma_mask, start);
+				     dma_mask, dom->next_address);
 
 	if (address == -1) {
-		dom->next_bit = 0;
+		dom->next_address = 0;
 		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
 					     dma_mask, 0);
 		dom->need_flush = true;
@@ -704,10 +704,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 
 	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
-	if ((address >> PAGE_SHIFT) >= dom->next_bit)
+	if (address >= dom->next_address)
 		dom->need_flush = true;
 
 	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
 	iommu_area_free(range->bitmap, address, pages);
 
 }
@@ -870,7 +871,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	 * a valid dma-address. So we can use 0 as error value
 	 */
 	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_bit = 0;
+	dma_dom->next_address = 0;
 
 	dma_dom->need_flush = false;
 	dma_dom->target_dev = 0xffff;
-- 
cgit v0.10.2


From 9cabe89b99773e682538a8809abc7d4000c77083 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 18 May 2009 16:38:55 +0200
Subject: amd-iommu: move aperture_range allocation code to seperate function

This patch prepares the dynamic increasement of dma_ops domain
apertures.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c1a08b9..8ff02ee 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -620,6 +620,59 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * called with domain->lock held
  */
 
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct dma_ops_domain *dma_dom,
+			   bool populate, gfp_t gfp)
+{
+	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+
+	if (index >= APERTURE_MAX_RANGES)
+		return -ENOMEM;
+
+	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+	if (!dma_dom->aperture[index])
+		return -ENOMEM;
+
+	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+	if (!dma_dom->aperture[index]->bitmap)
+		goto out_free;
+
+	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+	if (populate) {
+		unsigned long address = dma_dom->aperture_size;
+		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+		u64 *pte, *pte_page;
+
+		for (i = 0; i < num_ptes; ++i) {
+			pte = alloc_pte(&dma_dom->domain, address,
+					&pte_page, gfp);
+			if (!pte)
+				goto out_free;
+
+			dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+			address += APERTURE_RANGE_SIZE / 64;
+		}
+	}
+
+	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+	return 0;
+
+out_free:
+	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+	kfree(dma_dom->aperture[index]);
+	dma_dom->aperture[index] = NULL;
+
+	return -ENOMEM;
+}
+
 static unsigned long dma_ops_area_alloc(struct device *dev,
 					struct dma_ops_domain *dom,
 					unsigned int pages,
@@ -832,9 +885,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 						   unsigned order)
 {
 	struct dma_ops_domain *dma_dom;
-	unsigned i, num_pte_pages;
-	u64 *l2_pde;
-	u64 address;
 
 	/*
 	 * Currently the DMA aperture must be between 32 MB and 1GB in size
@@ -846,11 +896,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom)
 		return NULL;
 
-	dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range),
-				       GFP_KERNEL);
-	if (!dma_dom->aperture[0])
-		goto free_dma_dom;
-
 	spin_lock_init(&dma_dom->domain.lock);
 
 	dma_dom->domain.id = domain_id_alloc();
@@ -862,10 +907,13 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.priv = dma_dom;
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
-	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
-	dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!dma_dom->aperture[0]->bitmap)
+
+	dma_dom->need_flush = false;
+	dma_dom->target_dev = 0xffff;
+
+	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
 		goto free_dma_dom;
+
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
@@ -873,9 +921,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_address = 0;
 
-	dma_dom->need_flush = false;
-	dma_dom->target_dev = 0xffff;
-
 	/* Intialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
@@ -886,28 +931,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 
-	/*
-	 * At the last step, build the page tables so we don't need to
-	 * allocate page table pages in the dma_ops mapping/unmapping
-	 * path for the first 128MB of dma address space.
-	 */
-	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-
-	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
-	if (l2_pde == NULL)
-		goto free_dma_dom;
-
-	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
-	for (i = 0; i < num_pte_pages; ++i) {
-		u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i];
-		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!*pte_page)
-			goto free_dma_dom;
-		address = virt_to_phys(*pte_page);
-		l2_pde[i] = IOMMU_L1_PDE(address);
-	}
-
 	return dma_dom;
 
 free_dma_dom:
-- 
cgit v0.10.2


From 00cd122ae5e5e7c60cce2af3c35b190d4c3f2d0d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 09:52:40 +0200
Subject: amd-iommu: handle exlusion ranges and unity mappings in
 alloc_new_range

This patch makes sure no reserved addresses are allocated in an dma_ops
domain when the aperture is increased dynamically.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8ff02ee..59ee1b9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -58,6 +58,9 @@ static struct dma_ops_domain *find_protection_domain(u16 devid);
 static u64* alloc_pte(struct protection_domain *dom,
 		      unsigned long address, u64
 		      **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+				      unsigned long start_page,
+				      unsigned int pages);
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -621,14 +624,42 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  */
 
 /*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+		      unsigned long address)
+{
+	u64 *pte;
+
+	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return NULL;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return NULL;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+	return pte;
+}
+
+/*
  * This function is used to add a new aperture range to an existing
  * aperture in case of dma_ops domain allocation or address allocation
  * failure.
  */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
+static int alloc_new_range(struct amd_iommu *iommu,
+			   struct dma_ops_domain *dma_dom,
 			   bool populate, gfp_t gfp)
 {
 	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+	int i;
 
 	if (index >= APERTURE_MAX_RANGES)
 		return -ENOMEM;
@@ -662,6 +693,33 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 
 	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
 
+	/* Intialize the exclusion range if necessary */
+	if (iommu->exclusion_start &&
+	    iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+	    iommu->exclusion_start < dma_dom->aperture_size) {
+		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+		int pages = iommu_num_pages(iommu->exclusion_start,
+					    iommu->exclusion_length,
+					    PAGE_SIZE);
+		dma_ops_reserve_addresses(dma_dom, startpage, pages);
+	}
+
+	/*
+	 * Check for areas already mapped as present in the new aperture
+	 * range and mark those pages as reserved in the allocator. Such
+	 * mappings may already exist as a result of requested unity
+	 * mappings for devices.
+	 */
+	for (i = dma_dom->aperture[index]->offset;
+	     i < dma_dom->aperture_size;
+	     i += PAGE_SIZE) {
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
+		if (!pte || !IOMMU_PTE_PRESENT(*pte))
+			continue;
+
+		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+	}
+
 	return 0;
 
 out_free:
@@ -911,7 +969,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->need_flush = false;
 	dma_dom->target_dev = 0xffff;
 
-	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
+	if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
 		goto free_dma_dom;
 
 	/*
@@ -921,15 +979,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_address = 0;
 
-	/* Intialize the exclusion range if necessary */
-	if (iommu->exclusion_start &&
-	    iommu->exclusion_start < dma_dom->aperture_size) {
-		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-		int pages = iommu_num_pages(iommu->exclusion_start,
-					    iommu->exclusion_length,
-					    PAGE_SIZE);
-		dma_ops_reserve_addresses(dma_dom, startpage, pages);
-	}
 
 	return dma_dom;
 
-- 
cgit v0.10.2


From 11b83888ae729457b5cfb936dbd498481f6408df Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 10:23:15 +0200
Subject: amd-iommu: enlarge the aperture dynamically

By dynamically increasing the aperture the extended allocator is now
ready for use.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59ee1b9..d129d8f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1403,10 +1403,26 @@ static dma_addr_t __map_single(struct device *dev,
 	if (align)
 		align_mask = (1UL << get_order(size)) - 1;
 
+retry:
 	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
 					  dma_mask);
-	if (unlikely(address == bad_dma_address))
-		goto out;
+	if (unlikely(address == bad_dma_address)) {
+		/*
+		 * setting next_address here will let the address
+		 * allocator only scan the new allocated range in the
+		 * first run. This is a small optimization.
+		 */
+		dma_dom->next_address = dma_dom->aperture_size;
+
+		if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+			goto out;
+
+		/*
+		 * aperture was sucessfully enlarged by 128 MB, try
+		 * allocation again
+		 */
+		goto retry;
+	}
 
 	start = address;
 	for (i = 0; i < pages; ++i) {
-- 
cgit v0.10.2


From d9cfed925448f097ec7faab80d903eb7e5f99712 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 12:16:29 +0200
Subject: amd-iommu: remove amd_iommu_size kernel parameter

This parameter is not longer necessary when aperture increases
dynamically.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbf..5b776c6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -329,11 +329,6 @@ and is between 256 and 4096 characters. It is defined in the file
 				    flushed before they will be reused, which
 				    is a lot of faster
 
-	amd_iommu_size= [HW,X86-64]
-			Define the size of the aperture for the AMD IOMMU
-			driver. Possible values are:
-			'32M', '64M' (default), '128M', '256M', '512M', '1G'
-
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d129d8f..31d56c3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -939,17 +939,10 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
  * It also intializes the page table and the address allocator data
  * structures required for the dma_ops interface
  */
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
-						   unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
 {
 	struct dma_ops_domain *dma_dom;
 
-	/*
-	 * Currently the DMA aperture must be between 32 MB and 1GB in size
-	 */
-	if ((order < 25) || (order > 30))
-		return NULL;
-
 	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
 	if (!dma_dom)
 		return NULL;
@@ -1087,7 +1080,6 @@ static int device_change_notifier(struct notifier_block *nb,
 	struct protection_domain *domain;
 	struct dma_ops_domain *dma_domain;
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	unsigned long flags;
 
 	if (devid > amd_iommu_last_bdf)
@@ -1126,7 +1118,7 @@ static int device_change_notifier(struct notifier_block *nb,
 		dma_domain = find_protection_domain(devid);
 		if (dma_domain)
 			goto out;
-		dma_domain = dma_ops_domain_alloc(iommu, order);
+		dma_domain = dma_ops_domain_alloc(iommu);
 		if (!dma_domain)
 			goto out;
 		dma_domain->target_dev = devid;
@@ -1826,7 +1818,6 @@ static void prealloc_protection_domains(void)
 	struct pci_dev *dev = NULL;
 	struct dma_ops_domain *dma_dom;
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	u16 devid;
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1839,7 +1830,7 @@ static void prealloc_protection_domains(void)
 		iommu = amd_iommu_rlookup_table[devid];
 		if (!iommu)
 			continue;
-		dma_dom = dma_ops_domain_alloc(iommu, order);
+		dma_dom = dma_ops_domain_alloc(iommu);
 		if (!dma_dom)
 			continue;
 		init_unity_mappings_for_device(dma_dom, devid);
@@ -1865,7 +1856,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 int __init amd_iommu_init_dma_ops(void)
 {
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	int ret;
 
 	/*
@@ -1874,7 +1864,7 @@ int __init amd_iommu_init_dma_ops(void)
 	 * protection domain will be assigned to the default one.
 	 */
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
-		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+		iommu->default_dom = dma_ops_domain_alloc(iommu);
 		if (iommu->default_dom == NULL)
 			return -ENOMEM;
 		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be09..762a4ee 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -121,7 +121,6 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
 bool amd_iommu_isolate = true;		/* if true, device isolation is
 					   enabled */
 bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
@@ -1137,9 +1136,6 @@ int __init amd_iommu_init(void)
 
 	enable_iommus();
 
-	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
-			(1 << (amd_iommu_aperture_order-20)));
-
 	printk(KERN_INFO "AMD IOMMU: device isolation ");
 	if (amd_iommu_isolate)
 		printk("enabled\n");
@@ -1225,15 +1221,4 @@ static int __init parse_amd_iommu_options(char *str)
 	return 1;
 }
 
-static int __init parse_amd_iommu_size_options(char *str)
-{
-	unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
-	if ((order > 24) && (order < 31))
-		amd_iommu_aperture_order = order;
-
-	return 1;
-}
-
 __setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
-- 
cgit v0.10.2


From fe16f088a88fb73161bba8784375c829f7e87b54 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:27:53 +0200
Subject: amd-iommu: disable round-robin allocator for CONFIG_IOMMU_STRESS

Disabling the round-robin allocator results in reusing the same
dma-addresses again very fast. This is a good test if the iotlb flushing
is working correctly.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 31d56c3..543822b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -783,6 +783,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 {
 	unsigned long address;
 
+#ifdef CONFIG_IOMMU_STRESS
+	dom->next_address = 0;
+	dom->need_flush = true;
+#endif
+
 	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
 				     dma_mask, dom->next_address);
 
-- 
cgit v0.10.2


From f5e9705c6429d24dee832b2edd7f4848d432ea03 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:31:53 +0200
Subject: amd-iommu: don't preallocate page tables with CONFIG_IOMMU_STRESS

This forces testing of on-demand page table allocation code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 543822b..33434c4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -661,6 +661,10 @@ static int alloc_new_range(struct amd_iommu *iommu,
 	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
 	int i;
 
+#ifdef CONFIG_IOMMU_STRESS
+	populate = false;
+#endif
+
 	if (index >= APERTURE_MAX_RANGES)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 47bccd6bb2b866449d3ecf2ba350ac1c7473b2b8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:40:54 +0200
Subject: amd-iommu: don't free dma adresses below 512MB with
 CONFIG_IOMMU_STRESS

This will test the automatic aperture enlargement code. This is
important because only very few devices will ever trigger this code
path. So force it under CONFIG_IOMMU_STRESS.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 33434c4..04ff5ec 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -824,6 +824,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 
 	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
+#ifdef CONFIG_IOMMU_STRESS
+	if (i < 4)
+		return;
+#endif
+
 	if (address >= dom->next_address)
 		dom->need_flush = true;
 
-- 
cgit v0.10.2


From 1bf4072da67c14d6b02cfeef02212aa5a6211df2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: cifs: reorganize get_cifs_acl

Thus spake Christoph:

"But this whole set_cifs_acl function is a real mess anyway and needs
some splitting up."

With this change too, it's possible to call acl_to_uid_mode() with a
NULL inode pointer. That (or something close to it) will eventually be
necessary when cifs_get_inode_info is reorganized.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc8..7f8e6c4 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -552,67 +552,66 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	return rc;
 }
 
-
-/* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
-				       const char *path, const __u16 *pfid)
+static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+		__u16 fid, u32 *pacllen)
 {
-	struct cifsFileInfo *open_file = NULL;
-	bool unlock_file = false;
-	int xid;
-	int rc = -EIO;
-	__u16 fid;
-	struct super_block *sb;
-	struct cifs_sb_info *cifs_sb;
 	struct cifs_ntsd *pntsd = NULL;
+	int xid, rc;
 
-	cFYI(1, ("get mode from ACL for %s", path));
+	xid = GetXid();
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	FreeXid(xid);
 
-	if (inode == NULL)
-		return NULL;
 
-	xid = GetXid();
-	if (pfid == NULL)
-		open_file = find_readable_file(CIFS_I(inode));
-	else
-		fid = *pfid;
+	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	return pntsd;
+}
 
-	sb = inode->i_sb;
-	if (sb == NULL) {
-		FreeXid(xid);
-		return NULL;
-	}
-	cifs_sb = CIFS_SB(sb);
+static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+		const char *path, u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	int oplock = 0;
+	int xid, rc;
+	__u16 fid;
 
-	if (open_file) {
-		unlock_file = true;
-		fid = open_file->netfid;
-	} else if (pfid == NULL) {
-		int oplock = 0;
-		/* open file */
-		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				READ_CONTROL, 0, &fid, &oplock, NULL,
-				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0) {
-			cERROR(1, ("Unable to open file to get ACL"));
-			FreeXid(xid);
-			return NULL;
-		}
+	xid = GetXid();
+
+	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+			 &fid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc) {
+		cERROR(1, ("Unable to open file to get ACL"));
+		goto out;
 	}
 
 	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
 	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-	if (unlock_file == true) /* find_readable_file increments ref count */
-		atomic_dec(&open_file->wrtPending);
-	else if (pfid == NULL) /* if opened above we have to close the handle */
-		CIFSSMBClose(xid, cifs_sb->tcon, fid);
-	/* else handle was passed in by caller */
 
+	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
 	FreeXid(xid);
 	return pntsd;
 }
 
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+				      struct inode *inode, const char *path,
+				      u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	struct cifsFileInfo *open_file = NULL;
+
+	if (inode)
+		open_file = find_readable_file(CIFS_I(inode));
+	if (!open_file)
+		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
+
+	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
+	atomic_dec(&open_file->wrtPending);
+	return pntsd;
+}
+
 /* Set an ACL on the server */
 static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 				struct inode *inode, const char *path)
@@ -668,14 +667,19 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid)
+void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+		     const char *path, const __u16 *pfid)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	u32 acllen = 0;
 	int rc = 0;
 
 	cFYI(DBG2, ("converting ACL to mode for %s", path));
-	pntsd = get_cifs_acl(&acllen, inode, path, pfid);
+
+	if (pfid)
+		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
+	else
+		pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
 
 	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
 	if (pntsd)
@@ -698,7 +702,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 	cFYI(DBG2, ("set ACL from mode for %s", path));
 
 	/* Get the security descriptor */
-	pntsd = get_cifs_acl(&secdesclen, inode, path, NULL);
+	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
 
 	/* Add three ACEs for owner, group, everyone getting rid of
 	   other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d542cf1..f945232 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
-			    const __u16 *pfid);
+extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+			    const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 84b7bea..fad882b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -626,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 		cFYI(1, ("Getting mode bits from ACL"));
-		acl_to_uid_mode(inode, full_path, pfid);
+		acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
 	}
 #endif
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-- 
cgit v0.10.2


From a3ce6ea46cc0d6397d1b92b1a5983bb2935306ed Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 28 May 2009 09:51:31 -0700
Subject: Input: libps2 - better handle bad scheduler decisions

Sometimes devices send us their responses in time but due to
unfortunate scheduling decisions the receiving thread does not
get scheduled till much later and we erroneously decide that
device timed out. Work around this problem by checking whether we
received the data we needed instead of checking timeout
condition.

Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>

diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index 67248c3..be5bbbb 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -210,7 +210,7 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 	timeout = wait_event_timeout(ps2dev->wait,
 				     !(ps2dev->flags & PS2_FLAG_CMD1), timeout);
 
-	if (ps2dev->cmdcnt && timeout > 0) {
+	if (ps2dev->cmdcnt && !(ps2dev->flags & PS2_FLAG_CMD1)) {
 
 		timeout = ps2_adjust_timeout(ps2dev, command, timeout);
 		wait_event_timeout(ps2dev->wait,
-- 
cgit v0.10.2


From bac4e960b5ce2453d862beaf20e59aa68af3b43a Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Mon, 25 May 2009 20:58:00 +0100
Subject: [ARM] barriers: improve xchg, bitops and atomic SMP barriers

Mathieu Desnoyers pointed out that the ARM barriers were lacking:

- cmpxchg, xchg and atomic add return need memory barriers on
  architectures which can reorder the relative order in which memory
  read/writes can be seen between CPUs, which seems to include recent
  ARM architectures. Those barriers are currently missing on ARM.

- test_and_xxx_bit were missing SMP barriers.

So put these barriers in.  Provide separate atomic_add/atomic_sub
operations which do not require barriers.

Reported-Reviewed-and-Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 6116e48..15f8a09 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -114,3 +114,16 @@
 	.align	3;				\
 	.long	9999b,9001f;			\
 	.previous
+
+/*
+ * SMP data memory barrier
+ */
+	.macro	smp_dmb
+#ifdef CONFIG_SMP
+#if __LINUX_ARM_ARCH__ >= 7
+	dmb
+#elif __LINUX_ARM_ARCH__ == 6
+	mcr	p15, 0, r0, c7, c10, 5	@ dmb
+#endif
+#endif
+	.endm
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index ee99723..16b52f3 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -44,11 +44,29 @@ static inline void atomic_set(atomic_t *v, int i)
 	: "cc");
 }
 
+static inline void atomic_add(int i, atomic_t *v)
+{
+	unsigned long tmp;
+	int result;
+
+	__asm__ __volatile__("@ atomic_add\n"
+"1:	ldrex	%0, [%2]\n"
+"	add	%0, %0, %3\n"
+"	strex	%1, %0, [%2]\n"
+"	teq	%1, #0\n"
+"	bne	1b"
+	: "=&r" (result), "=&r" (tmp)
+	: "r" (&v->counter), "Ir" (i)
+	: "cc");
+}
+
 static inline int atomic_add_return(int i, atomic_t *v)
 {
 	unsigned long tmp;
 	int result;
 
+	smp_mb();
+
 	__asm__ __volatile__("@ atomic_add_return\n"
 "1:	ldrex	%0, [%2]\n"
 "	add	%0, %0, %3\n"
@@ -59,14 +77,34 @@ static inline int atomic_add_return(int i, atomic_t *v)
 	: "r" (&v->counter), "Ir" (i)
 	: "cc");
 
+	smp_mb();
+
 	return result;
 }
 
+static inline void atomic_sub(int i, atomic_t *v)
+{
+	unsigned long tmp;
+	int result;
+
+	__asm__ __volatile__("@ atomic_sub\n"
+"1:	ldrex	%0, [%2]\n"
+"	sub	%0, %0, %3\n"
+"	strex	%1, %0, [%2]\n"
+"	teq	%1, #0\n"
+"	bne	1b"
+	: "=&r" (result), "=&r" (tmp)
+	: "r" (&v->counter), "Ir" (i)
+	: "cc");
+}
+
 static inline int atomic_sub_return(int i, atomic_t *v)
 {
 	unsigned long tmp;
 	int result;
 
+	smp_mb();
+
 	__asm__ __volatile__("@ atomic_sub_return\n"
 "1:	ldrex	%0, [%2]\n"
 "	sub	%0, %0, %3\n"
@@ -77,6 +115,8 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 	: "r" (&v->counter), "Ir" (i)
 	: "cc");
 
+	smp_mb();
+
 	return result;
 }
 
@@ -84,6 +124,8 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 {
 	unsigned long oldval, res;
 
+	smp_mb();
+
 	do {
 		__asm__ __volatile__("@ atomic_cmpxchg\n"
 		"ldrex	%1, [%2]\n"
@@ -95,6 +137,8 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 		    : "cc");
 	} while (res);
 
+	smp_mb();
+
 	return oldval;
 }
 
@@ -135,6 +179,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
 
 	return val;
 }
+#define atomic_add(i, v)	(void) atomic_add_return(i, v)
 
 static inline int atomic_sub_return(int i, atomic_t *v)
 {
@@ -148,6 +193,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 
 	return val;
 }
+#define atomic_sub(i, v)	(void) atomic_sub_return(i, v)
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
@@ -187,10 +233,8 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 }
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
-#define atomic_add(i, v)	(void) atomic_add_return(i, v)
-#define atomic_inc(v)		(void) atomic_add_return(1, v)
-#define atomic_sub(i, v)	(void) atomic_sub_return(i, v)
-#define atomic_dec(v)		(void) atomic_sub_return(1, v)
+#define atomic_inc(v)		atomic_add(1, v)
+#define atomic_dec(v)		atomic_sub(1, v)
 
 #define atomic_inc_and_test(v)	(atomic_add_return(1, v) == 0)
 #define atomic_dec_and_test(v)	(atomic_sub_return(1, v) == 0)
@@ -200,11 +244,10 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 
 #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0)
 
-/* Atomic operations are already serializing on ARM */
-#define smp_mb__before_atomic_dec()	barrier()
-#define smp_mb__after_atomic_dec()	barrier()
-#define smp_mb__before_atomic_inc()	barrier()
-#define smp_mb__after_atomic_inc()	barrier()
+#define smp_mb__before_atomic_dec()	smp_mb()
+#define smp_mb__after_atomic_dec()	smp_mb()
+#define smp_mb__before_atomic_inc()	smp_mb()
+#define smp_mb__after_atomic_inc()	smp_mb()
 
 #include <asm-generic/atomic.h>
 #endif
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index bd4dc8e..7fce8f3 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -248,6 +248,8 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 	unsigned int tmp;
 #endif
 
+	smp_mb();
+
 	switch (size) {
 #if __LINUX_ARM_ARCH__ >= 6
 	case 1:
@@ -307,6 +309,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 		__bad_xchg(ptr, size), ret = 0;
 		break;
 	}
+	smp_mb();
 
 	return ret;
 }
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index d662a2f..83b1da6 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -815,10 +815,7 @@ __kuser_helper_start:
  */
 
 __kuser_memory_barrier:				@ 0xffff0fa0
-
-#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_SMP)
-	mcr	p15, 0, r0, c7, c10, 5	@ dmb
-#endif
+	smp_dmb
 	usr_ret	lr
 
 	.align	5
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index 2e787d4..c7f2627 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -18,12 +18,14 @@
 	mov	r2, #1
 	add	r1, r1, r0, lsr #3	@ Get byte offset
 	mov	r3, r2, lsl r3		@ create mask
+	smp_dmb
 1:	ldrexb	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
 	\instr	r2, r2, r3			@ toggle bit
 	strexb	ip, r2, [r1]
 	cmp	ip, #0
 	bne	1b
+	smp_dmb
 	cmp	r0, #0
 	movne	r0, #1
 2:	mov	pc, lr
-- 
cgit v0.10.2


From b96d31a62f714566fa6420851b3bb3615c796322 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: cifs: clean up set_cifs_acl interfaces

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7f8e6c4..1403b5d 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -612,57 +612,61 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	return pntsd;
 }
 
-/* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
-				struct inode *inode, const char *path)
+static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
+		struct cifs_ntsd *pnntsd, u32 acllen)
 {
-	struct cifsFileInfo *open_file;
-	bool unlock_file = false;
-	int xid;
-	int rc = -EIO;
-	__u16 fid;
-	struct super_block *sb;
-	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
 
-	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+	xid = GetXid();
+	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	FreeXid(xid);
 
-	if (!inode)
-		return rc;
+	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	return rc;
+}
 
-	sb = inode->i_sb;
-	if (sb == NULL)
-		return rc;
+static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
+		struct cifs_ntsd *pnntsd, u32 acllen)
+{
+	int oplock = 0;
+	int xid, rc;
+	__u16 fid;
 
-	cifs_sb = CIFS_SB(sb);
 	xid = GetXid();
 
-	open_file = find_readable_file(CIFS_I(inode));
-	if (open_file) {
-		unlock_file = true;
-		fid = open_file->netfid;
-	} else {
-		int oplock = 0;
-		/* open file */
-		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				WRITE_DAC, 0, &fid, &oplock, NULL,
-				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0) {
-			cERROR(1, ("Unable to open file to set ACL"));
-			FreeXid(xid);
-			return rc;
-		}
+	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+			 &fid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc) {
+		cERROR(1, ("Unable to open file to set ACL"));
+		goto out;
 	}
 
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
 	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-	if (unlock_file)
-		atomic_dec(&open_file->wrtPending);
-	else
-		CIFSSMBClose(xid, cifs_sb->tcon, fid);
 
+	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
 	FreeXid(xid);
+	return rc;
+}
 
+/* Set an ACL on the server */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+				struct inode *inode, const char *path)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *open_file;
+	int rc;
+
+	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+
+	open_file = find_readable_file(CIFS_I(inode));
+	if (!open_file)
+		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
+
+	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
+	atomic_dec(&open_file->wrtPending);
 	return rc;
 }
 
-- 
cgit v0.10.2


From ecd322c9b3e4ac70f9f108badde3eb6b99c7993d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Thu, 28 May 2009 16:07:39 -0400
Subject: [ARM] Add cmpxchg support for ARMv6+ systems (v5)

Add cmpxchg/cmpxchg64 support for ARMv6K and ARMv7 systems
(original patch from Catalin Marinas <catalin.marinas@arm.com>)

The cmpxchg and cmpxchg64 functions can be implemented using the
LDREX*/STREX* instructions. Since operand lengths other than 32bit are
required, the full implementations are only available if the ARMv6K
extensions are present (for the LDREXB, LDREXH and LDREXD instructions).

For ARMv6, only 32-bits cmpxchg is available.

Mathieu :

Make cmpxchg_local always available with best implementation for all type sizes (1, 2, 4 bytes).
Make cmpxchg64_local always available.

Use "Ir" constraint for "old" operand, like atomic.h atomic_cmpxchg does.

Change since v3 :
- Add "memory" clobbers (thanks to Nicolas Pitre)
- removed __asmeq(), only needed for old compilers, very unlikely on ARMv6+.

Note : ARMv7-M should eventually be ifdefed-out of cmpxchg64. But it's not
supported by the Linux kernel currently.

Put back arm < v6 cmpxchg support.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Nicolas Pitre <nico@cam.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 7fce8f3..d65b2f5 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -319,6 +319,12 @@ extern void enable_hlt(void);
 
 #include <asm-generic/cmpxchg-local.h>
 
+#if __LINUX_ARM_ARCH__ < 6
+
+#ifdef CONFIG_SMP
+#error "SMP is not supported on this platform"
+#endif
+
 /*
  * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make
  * them available.
@@ -332,6 +338,173 @@ extern void enable_hlt(void);
 #include <asm-generic/cmpxchg.h>
 #endif
 
+#else	/* __LINUX_ARM_ARCH__ >= 6 */
+
+extern void __bad_cmpxchg(volatile void *ptr, int size);
+
+/*
+ * cmpxchg only support 32-bits operands on ARMv6.
+ */
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+				      unsigned long new, int size)
+{
+	unsigned long oldval, res;
+
+	switch (size) {
+#ifdef CONFIG_CPU_32v6K
+	case 1:
+		do {
+			asm volatile("@ __cmpxchg1\n"
+			"	ldrexb	%1, [%2]\n"
+			"	mov	%0, #0\n"
+			"	teq	%1, %3\n"
+			"	strexbeq %0, %4, [%2]\n"
+				: "=&r" (res), "=&r" (oldval)
+				: "r" (ptr), "Ir" (old), "r" (new)
+				: "memory", "cc");
+		} while (res);
+		break;
+	case 2:
+		do {
+			asm volatile("@ __cmpxchg1\n"
+			"	ldrexh	%1, [%2]\n"
+			"	mov	%0, #0\n"
+			"	teq	%1, %3\n"
+			"	strexheq %0, %4, [%2]\n"
+				: "=&r" (res), "=&r" (oldval)
+				: "r" (ptr), "Ir" (old), "r" (new)
+				: "memory", "cc");
+		} while (res);
+		break;
+#endif /* CONFIG_CPU_32v6K */
+	case 4:
+		do {
+			asm volatile("@ __cmpxchg4\n"
+			"	ldrex	%1, [%2]\n"
+			"	mov	%0, #0\n"
+			"	teq	%1, %3\n"
+			"	strexeq %0, %4, [%2]\n"
+				: "=&r" (res), "=&r" (oldval)
+				: "r" (ptr), "Ir" (old), "r" (new)
+				: "memory", "cc");
+		} while (res);
+		break;
+	default:
+		__bad_cmpxchg(ptr, size);
+		oldval = 0;
+	}
+
+	return oldval;
+}
+
+static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old,
+					 unsigned long new, int size)
+{
+	unsigned long ret;
+
+	smp_mb();
+	ret = __cmpxchg(ptr, old, new, size);
+	smp_mb();
+
+	return ret;
+}
+
+#define cmpxchg(ptr,o,n)						\
+	((__typeof__(*(ptr)))__cmpxchg_mb((ptr),			\
+					  (unsigned long)(o),		\
+					  (unsigned long)(n),		\
+					  sizeof(*(ptr))))
+
+static inline unsigned long __cmpxchg_local(volatile void *ptr,
+					    unsigned long old,
+					    unsigned long new, int size)
+{
+	unsigned long ret;
+
+	switch (size) {
+#ifndef CONFIG_CPU_32v6K
+	case 1:
+	case 2:
+		ret = __cmpxchg_local_generic(ptr, old, new, size);
+		break;
+#endif	/* !CONFIG_CPU_32v6K */
+	default:
+		ret = __cmpxchg(ptr, old, new, size);
+	}
+
+	return ret;
+}
+
+#define cmpxchg_local(ptr,o,n)						\
+	((__typeof__(*(ptr)))__cmpxchg_local((ptr),			\
+				       (unsigned long)(o),		\
+				       (unsigned long)(n),		\
+				       sizeof(*(ptr))))
+
+#ifdef CONFIG_CPU_32v6K
+
+/*
+ * Note : ARMv7-M (currently unsupported by Linux) does not support
+ * ldrexd/strexd. If ARMv7-M is ever supported by the Linux kernel, it should
+ * not be allowed to use __cmpxchg64.
+ */
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+					     unsigned long long old,
+					     unsigned long long new)
+{
+	register unsigned long long oldval asm("r0");
+	register unsigned long long __old asm("r2") = old;
+	register unsigned long long __new asm("r4") = new;
+	unsigned long res;
+
+	do {
+		asm volatile(
+		"	@ __cmpxchg8\n"
+		"	ldrexd	%1, %H1, [%2]\n"
+		"	mov	%0, #0\n"
+		"	teq	%1, %3\n"
+		"	teqeq	%H1, %H3\n"
+		"	strexdeq %0, %4, %H4, [%2]\n"
+			: "=&r" (res), "=&r" (oldval)
+			: "r" (ptr), "Ir" (__old), "r" (__new)
+			: "memory", "cc");
+	} while (res);
+
+	return oldval;
+}
+
+static inline unsigned long long __cmpxchg64_mb(volatile void *ptr,
+						unsigned long long old,
+						unsigned long long new)
+{
+	unsigned long long ret;
+
+	smp_mb();
+	ret = __cmpxchg64(ptr, old, new);
+	smp_mb();
+
+	return ret;
+}
+
+#define cmpxchg64(ptr,o,n)						\
+	((__typeof__(*(ptr)))__cmpxchg64_mb((ptr),			\
+					    (unsigned long long)(o),	\
+					    (unsigned long long)(n)))
+
+#define cmpxchg64_local(ptr,o,n)					\
+	((__typeof__(*(ptr)))__cmpxchg64((ptr),				\
+					 (unsigned long long)(o),	\
+					 (unsigned long long)(n)))
+
+#else	/* !CONFIG_CPU_32v6K */
+
+#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
+
+#endif	/* CONFIG_CPU_32v6K */
+
+#endif	/* __LINUX_ARM_ARCH__ >= 6 */
+
 #endif /* __ASSEMBLY__ */
 
 #define arch_align_stack(x) (x)
-- 
cgit v0.10.2


From f42706c90470851fd2e97eda7a4109e8949bde8a Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <mfuzzey@gmail.com>
Date: Fri, 1 May 2009 17:21:11 +0200
Subject: USB: atmel-usba-udc : fix control out requests.

usbtest #14 was failing with "udc: ep0: TXCOMP: Invalid endpoint state 2, halting endpoint..."
This occured since ep0 is bidirectional and ep->is_in is not valid (must always use ep->state)

Signed-off-by: Martin Fuzzey <mfuzzey@gmail.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c
index 563d572..5644897 100644
--- a/drivers/usb/gadget/atmel_usba_udc.c
+++ b/drivers/usb/gadget/atmel_usba_udc.c
@@ -794,7 +794,8 @@ usba_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags)
 	if (ep->desc) {
 		list_add_tail(&req->queue, &ep->queue);
 
-		if (ep->is_in || (ep_is_control(ep)
+		if ((!ep_is_control(ep) && ep->is_in) ||
+			(ep_is_control(ep)
 				&& (ep->state == DATA_STAGE_IN
 					|| ep->state == STATUS_STAGE_IN)))
 			usba_ep_writel(ep, CTL_ENB, USBA_TX_PK_RDY);
-- 
cgit v0.10.2


From fe92c9e481a147a9e1e14f55870f32903b967777 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Date: Mon, 4 May 2009 17:22:43 -0700
Subject: USB: atmel_usb_udc: Use kzalloc() to allocate ep structures

This ensures that all fields are properly initialized.

Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c
index 5644897..05c913c 100644
--- a/drivers/usb/gadget/atmel_usba_udc.c
+++ b/drivers/usb/gadget/atmel_usba_udc.c
@@ -1941,7 +1941,7 @@ static int __init usba_udc_probe(struct platform_device *pdev)
 	usba_writel(udc, CTRL, USBA_DISABLE_MASK);
 	clk_disable(pclk);
 
-	usba_ep = kmalloc(sizeof(struct usba_ep) * pdata->num_ep,
+	usba_ep = kzalloc(sizeof(struct usba_ep) * pdata->num_ep,
 			  GFP_KERNEL);
 	if (!usba_ep)
 		goto err_alloc_ep;
-- 
cgit v0.10.2


From cab98a0a349829b145d924c0649a2d30cd6a9e3d Mon Sep 17 00:00:00 2001
From: Xiao Kaijian <xiaokj@gmail.com>
Date: Fri, 8 May 2009 00:48:23 +0800
Subject: USB: Yet another Conexant Clone to add to cdc-acm.c

This patch adds another quirky Conexant USB Modem Clone to usb cdc-acm.c

Signed-off-by: Xiao Kaijian <xiaokj@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 0a69c09..7a1164d 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1375,6 +1375,9 @@ static struct usb_device_id acm_ids[] = {
 	{ USB_DEVICE(0x0572, 0x1324), /* Conexant USB MODEM RD02-D400 */
 	.driver_info = NO_UNION_NORMAL, /* has no union descriptor */
 	},
+	{ USB_DEVICE(0x0572, 0x1328), /* Shiro / Aztech USB MODEM UM-3100 */
+	.driver_info = NO_UNION_NORMAL, /* has no union descriptor */
+	},
 	{ USB_DEVICE(0x22b8, 0x6425), /* Motorola MOTOMAGX phones */
 	},
 	{ USB_DEVICE(0x0572, 0x1329), /* Hummingbird huc56s (Conexant) */
-- 
cgit v0.10.2


From 0afb20e00b5053170c85298fed842b32d20b4ea9 Mon Sep 17 00:00:00 2001
From: Warren Free <wfree@ipmn.com>
Date: Fri, 8 May 2009 10:27:08 +0200
Subject: USB: isp1760: urb_dequeue doesn't always find the urbs

The option driver (and presumably others) allocates several URBs when it
opens and tries to free them when it closes. The isp1760_urb_dequeue
function gets called, but the packet being dequeued is not necessarily at
the
front of one of the 32 queues. If not, the isp1760_urb_done function doesn't
get called for the URB and the process trying to free it hangs forever on a
wait_queue. This patch does two things. If the URB being dequeued has others
queued behind it, it re-queues them. And it searches the queues looking for
the URB being dequeued rather than just looking at the one at the front of
the queue.

[bigeasy@linutronix] whitespace fixes, reformating

Cc: stable <stable@kernel.org>
Signed-off-by: Warren Free <wfree@ipmn.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c
index cd07ea3..1543846 100644
--- a/drivers/usb/host/isp1760-hcd.c
+++ b/drivers/usb/host/isp1760-hcd.c
@@ -1658,6 +1658,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 	u32 reg_base, or_reg, skip_reg;
 	unsigned long flags;
 	struct ptd ptd;
+	packet_enqueue *pe;
 
 	switch (usb_pipetype(urb->pipe)) {
 	case PIPE_ISOCHRONOUS:
@@ -1669,6 +1670,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 		reg_base = INT_REGS_OFFSET;
 		or_reg = HC_INT_IRQ_MASK_OR_REG;
 		skip_reg = HC_INT_PTD_SKIPMAP_REG;
+		pe = enqueue_an_INT_packet;
 		break;
 
 	default:
@@ -1676,6 +1678,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 		reg_base = ATL_REGS_OFFSET;
 		or_reg = HC_ATL_IRQ_MASK_OR_REG;
 		skip_reg = HC_ATL_PTD_SKIPMAP_REG;
+		pe =  enqueue_an_ATL_packet;
 		break;
 	}
 
@@ -1687,6 +1690,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 			u32 skip_map;
 			u32 or_map;
 			struct isp1760_qtd *qtd;
+			struct isp1760_qh *qh = ints->qh;
 
 			skip_map = isp1760_readl(hcd->regs + skip_reg);
 			skip_map |= 1 << i;
@@ -1699,8 +1703,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 			priv_write_copy(priv, (u32 *)&ptd, hcd->regs + reg_base
 					+ i * sizeof(ptd), sizeof(ptd));
 			qtd = ints->qtd;
-
-			clean_up_qtdlist(qtd);
+			qtd = clean_up_qtdlist(qtd);
 
 			free_mem(priv, ints->payload);
 
@@ -1711,7 +1714,24 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb,
 			ints->payload = 0;
 
 			isp1760_urb_done(priv, urb, status);
+			if (qtd)
+				pe(hcd, qh, qtd);
 			break;
+
+		} else if (ints->qtd) {
+			struct isp1760_qtd *qtd, *prev_qtd = ints->qtd;
+
+			for (qtd = ints->qtd->hw_next; qtd; qtd = qtd->hw_next) {
+				if (qtd->urb == urb) {
+					prev_qtd->hw_next = clean_up_qtdlist(qtd);
+					isp1760_urb_done(priv, urb, status);
+					break;
+				}
+				prev_qtd = qtd;
+			}
+			/* we found the urb before the end of the list */
+			if (qtd)
+				break;
 		}
 		ints++;
 	}
-- 
cgit v0.10.2


From 0a3c8549ea7e94d74a41096d42bc6cdf43d183bf Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 27 May 2009 11:25:52 -0400
Subject: usb-serial: fix crash when sub-driver updates firmware

This patch (as1244) fixes a crash in usb-serial that occurs when a
sub-driver returns a positive value from its attach method, indicating
that new firmware was loaded and the device will disconnect and
reconnect.  The usb-serial core then skips the step of registering the
port devices; when the disconnect occurs, the attempt to unregister
the ports fails dramatically.

This problem shows up with Keyspan devices and it might affect others
as well.

When the attach method returns a positive value, the patch sets
num_ports to 0.  This tells usb_serial_disconnect() not to try
unregistering any of the ports; instead they are cleaned up by
destroy_serial().

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 0a566ee..f331e2b 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -974,6 +974,7 @@ int usb_serial_probe(struct usb_interface *interface,
 		if (retval > 0) {
 			/* quietly accept this device, but don't bind to a
 			   serial port as it's about to disappear */
+			serial->num_ports = 0;
 			goto exit;
 		}
 	}
-- 
cgit v0.10.2


From 29868b281f6d057b4cbe348f4483f1717c021c5c Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 20 May 2009 08:49:48 -0400
Subject: Revert "USB: Correct Makefile to make isp1760 buildable"

This reverts commit 26e1287594864169577327fef233befc9739be3b.

A larger patch (f7e7aa585) a few days after this one added the same line
to the Makefile, but in a different place.  While it'd be more correct to
revert that one, it's easier to revert this one because this is a
one-liner.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
CC: Greg Kroah-Hartman <gregkh@suse.de>
CC: linux-usb@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index 0716cdb..0a3dc5e 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -11,7 +11,6 @@ obj-$(CONFIG_USB_MON)		+= mon/
 obj-$(CONFIG_PCI)		+= host/
 obj-$(CONFIG_USB_EHCI_HCD)	+= host/
 obj-$(CONFIG_USB_ISP116X_HCD)	+= host/
-obj-$(CONFIG_USB_ISP1760_HCD)	+= host/
 obj-$(CONFIG_USB_OHCI_HCD)	+= host/
 obj-$(CONFIG_USB_UHCI_HCD)	+= host/
 obj-$(CONFIG_USB_FHCI_HCD)	+= host/
-- 
cgit v0.10.2


From 086a377edc969aea6c761176a7e4ff68f264d6fe Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 7 May 2009 12:36:53 -0700
Subject: sysfs: file.c: use create_singlethread_workqueue()

We don't need a kernel thread per CPU for this application.

Acked-by: Alex Chiang <achiang@hp.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b1606e0..561a9c0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -723,7 +723,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_unlock(&sysfs_workq_mutex);
 
 	if (sysfs_workqueue == NULL) {
-		sysfs_workqueue = create_workqueue("sysfsd");
+		sysfs_workqueue = create_singlethread_workqueue("sysfsd");
 		if (sysfs_workqueue == NULL) {
 			module_put(owner);
 			return -ENOMEM;
-- 
cgit v0.10.2


From 5c8563d773c0e9f0ac2a552e84806decd98ce732 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 28 May 2009 14:24:07 -0700
Subject: Driver Core: do not oops when driver_unregister() is called for
 unregistered drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We also fix a problem with cleaning up properly when initializing
drivers and devices, so checks like this will work successfully.

Portions of the patch by Linus and Greg and Ingo.

Reported-by: Ozan Çağlayan <ozan@pardus.org.tr>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index dc030f1..c659961 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -700,8 +700,10 @@ int bus_add_driver(struct device_driver *drv)
 	}
 
 	kobject_uevent(&priv->kobj, KOBJ_ADD);
-	return error;
+	return 0;
 out_unregister:
+	kfree(drv->p);
+	drv->p = NULL;
 	kobject_put(&priv->kobj);
 out_put_bus:
 	bus_put(bus);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4aa527b..1977d4b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -879,7 +879,7 @@ int device_add(struct device *dev)
 	}
 
 	if (!dev_name(dev))
-		goto done;
+		goto name_error;
 
 	pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
 
@@ -978,6 +978,9 @@ done:
 	cleanup_device_parent(dev);
 	if (parent)
 		put_device(parent);
+name_error:
+	kfree(dev->p);
+	dev->p = NULL;
 	goto done;
 }
 
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index c51f11b..8ae0f63 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -257,6 +257,10 @@ EXPORT_SYMBOL_GPL(driver_register);
  */
 void driver_unregister(struct device_driver *drv)
 {
+	if (!drv || !drv->p) {
+		WARN(1, "Unexpected driver unregister!\n");
+		return;
+	}
 	driver_remove_groups(drv, drv->groups);
 	bus_remove_driver(drv);
 }
-- 
cgit v0.10.2


From a2928c42a5d69328c3578b41bd4d72f6658cf7dc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:04 -0300
Subject: perf_counter tools: Move symbol resolution classes from report to
 libperf

Will be used by perf top as well.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175504.GC4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 51b13f9..bd29a5c 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -298,6 +298,7 @@ LIB_H += util/help.h
 LIB_H += util/strbuf.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
+LIB_H += util/symbol.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
@@ -317,6 +318,7 @@ LIB_OBJS += util/strbuf.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/symbol.o
 LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9fdf822..04fc7ec 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,13 +1,10 @@
 #include "util/util.h"
 #include "builtin.h"
 
-#include <libelf.h>
-#include <gelf.h>
-#include <elf.h>
-
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
+#include "util/symbol.h"
 
 #include "perf.h"
 
@@ -62,305 +59,6 @@ typedef union event_union {
 	struct comm_event comm;
 } event_t;
 
-struct symbol {
-	struct rb_node		rb_node;
-	__u64			start;
-	__u64			end;
-	char			name[0];
-};
-
-static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
-{
-	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		self->start = start;
-		self->end   = start + len;
-		strcpy(self->name, name);
-	}
-
-	return self;
-}
-
-static void symbol__delete(struct symbol *self)
-{
-	free(self);
-}
-
-static size_t symbol__fprintf(struct symbol *self, FILE *fp)
-{
-	return fprintf(fp, " %llx-%llx %s\n",
-		       self->start, self->end, self->name);
-}
-
-struct dso {
-	struct list_head node;
-	struct rb_root	 syms;
-	char		 name[0];
-};
-
-static struct dso *dso__new(const char *name)
-{
-	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		strcpy(self->name, name);
-		self->syms = RB_ROOT;
-	}
-
-	return self;
-}
-
-static void dso__delete_symbols(struct dso *self)
-{
-	struct symbol *pos;
-	struct rb_node *next = rb_first(&self->syms);
-
-	while (next) {
-		pos = rb_entry(next, struct symbol, rb_node);
-		next = rb_next(&pos->rb_node);
-		symbol__delete(pos);
-	}
-}
-
-static void dso__delete(struct dso *self)
-{
-	dso__delete_symbols(self);
-	free(self);
-}
-
-static void dso__insert_symbol(struct dso *self, struct symbol *sym)
-{
-	struct rb_node **p = &self->syms.rb_node;
-	struct rb_node *parent = NULL;
-	const uint64_t ip = sym->start;
-	struct symbol *s;
-
-	while (*p != NULL) {
-		parent = *p;
-		s = rb_entry(parent, struct symbol, rb_node);
-		if (ip < s->start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&sym->rb_node, parent, p);
-	rb_insert_color(&sym->rb_node, &self->syms);
-}
-
-static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
-{
-	struct rb_node *n;
-
-	if (self == NULL)
-		return NULL;
-
-	n = self->syms.rb_node;
-
-	while (n) {
-		struct symbol *s = rb_entry(n, struct symbol, rb_node);
-
-		if (ip < s->start)
-			n = n->rb_left;
-		else if (ip > s->end)
-			n = n->rb_right;
-		else
-			return s;
-	}
-
-	return NULL;
-}
-
-/**
- * elf_symtab__for_each_symbol - iterate thru all the symbols
- *
- * @self: struct elf_symtab instance to iterate
- * @index: uint32_t index
- * @sym: GElf_Sym iterator
- */
-#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
-	for (index = 0, gelf_getsym(syms, index, &sym);\
-	     index < nr_syms; \
-	     index++, gelf_getsym(syms, index, &sym))
-
-static inline uint8_t elf_sym__type(const GElf_Sym *sym)
-{
-	return GELF_ST_TYPE(sym->st_info);
-}
-
-static inline int elf_sym__is_function(const GElf_Sym *sym)
-{
-	return elf_sym__type(sym) == STT_FUNC &&
-	       sym->st_name != 0 &&
-	       sym->st_shndx != SHN_UNDEF &&
-	       sym->st_size != 0;
-}
-
-static inline const char *elf_sym__name(const GElf_Sym *sym,
-					const Elf_Data *symstrs)
-{
-	return symstrs->d_buf + sym->st_name;
-}
-
-static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
-				    GElf_Shdr *shp, const char *name,
-				    size_t *index)
-{
-	Elf_Scn *sec = NULL;
-	size_t cnt = 1;
-
-	while ((sec = elf_nextscn(elf, sec)) != NULL) {
-		char *str;
-
-		gelf_getshdr(sec, shp);
-		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
-		if (!strcmp(name, str)) {
-			if (index)
-				*index = cnt;
-			break;
-		}
-		++cnt;
-	}
-
-	return sec;
-}
-
-static int dso__load_sym(struct dso *self, int fd, char *name)
-{
-	Elf_Data *symstrs;
-	uint32_t nr_syms;
-	int err = -1;
-	uint32_t index;
-	GElf_Ehdr ehdr;
-	GElf_Shdr shdr;
-	Elf_Data *syms;
-	GElf_Sym sym;
-	Elf_Scn *sec;
-	Elf *elf;
-	int nr = 0;
-
-	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
-	if (elf == NULL) {
-		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, name);
-		goto out_close;
-	}
-
-	if (gelf_getehdr(elf, &ehdr) == NULL) {
-		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
-		goto out_elf_end;
-	}
-
-	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-	if (sec == NULL)
-		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
-
-	if (sec == NULL)
-		goto out_elf_end;
-
-	syms = elf_getdata(sec, NULL);
-	if (syms == NULL)
-		goto out_elf_end;
-
-	sec = elf_getscn(elf, shdr.sh_link);
-	if (sec == NULL)
-		goto out_elf_end;
-
-	symstrs = elf_getdata(sec, NULL);
-	if (symstrs == NULL)
-		goto out_elf_end;
-
-	nr_syms = shdr.sh_size / shdr.sh_entsize;
-
-	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
-		struct symbol *f;
-
-		if (!elf_sym__is_function(&sym))
-			continue;
-
-		sec = elf_getscn(elf, sym.st_shndx);
-		if (!sec)
-			goto out_elf_end;
-
-		gelf_getshdr(sec, &shdr);
-		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-
-		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs));
-		if (!f)
-			goto out_elf_end;
-
-		dso__insert_symbol(self, f);
-
-		nr++;
-	}
-
-	err = nr;
-out_elf_end:
-	elf_end(elf);
-out_close:
-	return err;
-}
-
-static int dso__load(struct dso *self)
-{
-	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
-	char *name = malloc(size);
-	int variant = 0;
-	int ret = -1;
-	int fd;
-
-	if (!name)
-		return -1;
-
-more:
-	do {
-		switch (variant) {
-		case 0: /* Fedora */
-			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
-			break;
-		case 1: /* Ubuntu */
-			snprintf(name, size, "/usr/lib/debug%s", self->name);
-			break;
-		case 2: /* Sane people */
-			snprintf(name, size, "%s", self->name);
-			break;
-
-		default:
-			goto out;
-		}
-		variant++;
-
-		fd = open(name, O_RDONLY);
-	} while (fd < 0);
-
-	ret = dso__load_sym(self, fd, name);
-	close(fd);
-
-	/*
-	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
-	 */
-	if (!ret)
-		goto more;
-
-out:
-	free(name);
-	return ret;
-}
-
-static size_t dso__fprintf(struct dso *self, FILE *fp)
-{
-	size_t ret = fprintf(fp, "dso: %s\n", self->name);
-
-	struct rb_node *nd;
-	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
-		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
-		ret += symbol__fprintf(pos, fp);
-	}
-
-	return ret;
-}
-
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 
@@ -418,153 +116,27 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-static int hex2long(char *ptr, unsigned long *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
-
-static int load_kallsyms(void)
-{
-	struct rb_node *nd, *prevnd;
-	char *line = NULL;
-	FILE *file;
-	size_t n;
-
-	kernel_dso = dso__new("[kernel]");
-	if (kernel_dso == NULL)
-		return -1;
-
-	file = fopen("/proc/kallsyms", "r");
-	if (file == NULL)
-		goto out_delete_dso;
-
-	while (!feof(file)) {
-		unsigned long start;
-		struct symbol *sym;
-		int line_len, len;
-		char symbol_type;
-
-		line_len = getline(&line, &n, file);
-		if (line_len < 0)
-			break;
-
-		if (!line)
-			goto out_delete_dso;
-
-		line[--line_len] = '\0'; /* \n */
-
-		len = hex2long(line, &start);
-
-		len++;
-		if (len + 2 >= line_len)
-			continue;
-
-		symbol_type = toupper(line[len]);
-		/*
-		 * We're interested only in code ('T'ext)
-		 */
-		if (symbol_type != 'T' && symbol_type != 'W')
-			continue;
-		/*
-		 * Well fix up the end later, when we have all sorted.
-		 */
-		sym = symbol__new(start, 0xdead, line + len + 2);
-
-		if (sym == NULL)
-			goto out_delete_dso;
-
-		dso__insert_symbol(kernel_dso, sym);
-	}
-
-	/*
-	 * Now that we have all sorted out, just set the ->end of all
-	 * symbols
-	 */
-	prevnd = rb_first(&kernel_dso->syms);
-
-	if (prevnd == NULL)
-		goto out_delete_line;
-
-	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
-			      *curr = rb_entry(nd, struct symbol, rb_node);
-
-		prev->end = curr->start - 1;
-		prevnd = nd;
-	}
-
-	dsos__add(kernel_dso);
-	free(line);
-	fclose(file);
-
-	return 0;
-
-out_delete_line:
-	free(line);
-out_delete_dso:
-	dso__delete(kernel_dso);
-	return -1;
-}
-
 static int load_kernel(void)
 {
-	int fd, nr;
-
-	if (!vmlinux)
-		goto kallsyms;
-
-	fd = open(vmlinux, O_RDONLY);
-	if (fd < 0)
-		goto kallsyms;
+	int err = -1;
 
 	kernel_dso = dso__new("[kernel]");
 	if (!kernel_dso)
-		goto fail_open;
-
-	nr = dso__load_sym(kernel_dso, fd, vmlinux);
+		return -1;
 
-	if (nr <= 0)
-		goto fail_load;
+	if (vmlinux)
+		err = dso__load_vmlinux(kernel_dso, vmlinux);
 
-	dsos__add(kernel_dso);
-	close(fd);
+	if (err)
+		err = dso__load_kallsyms(kernel_dso);
 
-	return 0;
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
 
-fail_load:
-	dso__delete(kernel_dso);
-fail_open:
-	close(fd);
-kallsyms:
-	return load_kallsyms();
+	return err;
 }
 
 struct map {
@@ -1050,7 +622,7 @@ static int __cmd_report(void)
 	}
 
 	if (load_kernel() < 0) {
-		perror("failed to open kallsyms");
+		perror("failed to load kernel symbols");
 		return EXIT_FAILURE;
 	}
 
@@ -1247,7 +819,7 @@ static const struct option options[] = {
 
 int cmd_report(int argc, const char **argv, const char *prefix)
 {
-	elf_version(EV_CURRENT);
+	symbol__init();
 
 	page_size = getpagesize();
 
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
new file mode 100644
index 0000000..d06de2c
--- /dev/null
+++ b/Documentation/perf_counter/util/symbol.c
@@ -0,0 +1,421 @@
+#include "util.h"
+#include "../perf.h"
+#include "symbol.h"
+
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+
+static struct symbol *symbol__new(uint64_t start, uint64_t len,
+				  const char *name)
+{
+	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start = start;
+		self->end   = start + len;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void symbol__delete(struct symbol *self)
+{
+	free(self);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+	return fprintf(fp, " %llx-%llx %s\n",
+		       self->start, self->end, self->name);
+}
+
+struct dso *dso__new(const char *name)
+{
+	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		strcpy(self->name, name);
+		self->syms = RB_ROOT;
+	}
+
+	return self;
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+	struct symbol *pos;
+	struct rb_node *next = rb_first(&self->syms);
+
+	while (next) {
+		pos = rb_entry(next, struct symbol, rb_node);
+		next = rb_next(&pos->rb_node);
+		symbol__delete(pos);
+	}
+}
+
+void dso__delete(struct dso *self)
+{
+	dso__delete_symbols(self);
+	free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+	struct rb_node **p = &self->syms.rb_node;
+	struct rb_node *parent = NULL;
+	const uint64_t ip = sym->start;
+	struct symbol *s;
+
+	while (*p != NULL) {
+		parent = *p;
+		s = rb_entry(parent, struct symbol, rb_node);
+		if (ip < s->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&sym->rb_node, parent, p);
+	rb_insert_color(&sym->rb_node, &self->syms);
+}
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+{
+	struct rb_node *n;
+
+	if (self == NULL)
+		return NULL;
+
+	n = self->syms.rb_node;
+
+	while (n) {
+		struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+		if (ip < s->start)
+			n = n->rb_left;
+		else if (ip > s->end)
+			n = n->rb_right;
+		else
+			return s;
+	}
+
+	return NULL;
+}
+
+size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+	size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+	struct rb_node *nd;
+	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
+		ret += symbol__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+static int hex2long(char *ptr, unsigned long *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
+
+int dso__load_kallsyms(struct dso *self)
+{
+	struct rb_node *nd, *prevnd;
+	char *line = NULL;
+	size_t n;
+	FILE *file = fopen("/proc/kallsyms", "r");
+
+	if (file == NULL)
+		goto out_failure;
+
+	while (!feof(file)) {
+		unsigned long start;
+		struct symbol *sym;
+		int line_len, len;
+		char symbol_type;
+
+		line_len = getline(&line, &n, file);
+		if (line_len < 0)
+			break;
+
+		if (!line)
+			goto out_failure;
+
+		line[--line_len] = '\0'; /* \n */
+
+		len = hex2long(line, &start);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		symbol_type = toupper(line[len]);
+		/*
+		 * We're interested only in code ('T'ext)
+		 */
+		if (symbol_type != 'T' && symbol_type != 'W')
+			continue;
+		/*
+		 * Well fix up the end later, when we have all sorted.
+		 */
+		sym = symbol__new(start, 0xdead, line + len + 2);
+
+		if (sym == NULL)
+			goto out_delete_line;
+
+		dso__insert_symbol(self, sym);
+	}
+
+	/*
+	 * Now that we have all sorted out, just set the ->end of all
+	 * symbols
+	 */
+	prevnd = rb_first(&self->syms);
+
+	if (prevnd == NULL)
+		goto out_delete_line;
+
+	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+			      *curr = rb_entry(nd, struct symbol, rb_node);
+
+		prev->end = curr->start - 1;
+		prevnd = nd;
+	}
+
+	free(line);
+	fclose(file);
+
+	return 0;
+
+out_delete_line:
+	free(line);
+out_failure:
+	return -1;
+}
+
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+	for (index = 0, gelf_getsym(syms, index, &sym);\
+	     index < nr_syms; \
+	     index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+	return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline int elf_sym__is_function(const GElf_Sym *sym)
+{
+	return elf_sym__type(sym) == STT_FUNC &&
+	       sym->st_name != 0 &&
+	       sym->st_shndx != SHN_UNDEF &&
+	       sym->st_size != 0;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+					const Elf_Data *symstrs)
+{
+	return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+				    GElf_Shdr *shp, const char *name,
+				    size_t *index)
+{
+	Elf_Scn *sec = NULL;
+	size_t cnt = 1;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *str;
+
+		gelf_getshdr(sec, shp);
+		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+		if (!strcmp(name, str)) {
+			if (index)
+				*index = cnt;
+			break;
+		}
+		++cnt;
+	}
+
+	return sec;
+}
+
+static int dso__load_sym(struct dso *self, int fd, const char *name)
+{
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec;
+	Elf *elf;
+	int nr = 0;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		fprintf(stderr, "%s: cannot read %s ELF file.\n",
+			__func__, name);
+		goto out_close;
+	}
+
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL)
+		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+
+	if (sec == NULL)
+		goto out_elf_end;
+
+	syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
+
+		if (!elf_sym__is_function(&sym))
+			continue;
+
+		sec = elf_getscn(elf, sym.st_shndx);
+		if (!sec)
+			goto out_elf_end;
+
+		gelf_getshdr(sec, &shdr);
+		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+		f = symbol__new(sym.st_value, sym.st_size,
+				elf_sym__name(&sym, symstrs));
+		if (!f)
+			goto out_elf_end;
+
+		dso__insert_symbol(self, f);
+
+		nr++;
+	}
+
+	err = nr;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	return err;
+}
+
+int dso__load(struct dso *self)
+{
+	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+	char *name = malloc(size);
+	int variant = 0;
+	int ret = -1;
+	int fd;
+
+	if (!name)
+		return -1;
+
+more:
+	do {
+		switch (variant) {
+		case 0: /* Fedora */
+			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+			break;
+		case 1: /* Ubuntu */
+			snprintf(name, size, "/usr/lib/debug%s", self->name);
+			break;
+		case 2: /* Sane people */
+			snprintf(name, size, "%s", self->name);
+			break;
+
+		default:
+			goto out;
+		}
+		variant++;
+
+		fd = open(name, O_RDONLY);
+	} while (fd < 0);
+
+	ret = dso__load_sym(self, fd, name);
+	close(fd);
+
+	/*
+	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+	 */
+	if (!ret)
+		goto more;
+
+out:
+	free(name);
+	return ret;
+}
+
+int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+{
+	int err, fd = open(vmlinux, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	err = dso__load_sym(self, fd, vmlinux);
+	close(fd);
+
+	return err;
+}
+
+void symbol__init(void)
+{
+	elf_version(EV_CURRENT);
+}
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
new file mode 100644
index 0000000..6dffe76
--- /dev/null
+++ b/Documentation/perf_counter/util/symbol.h
@@ -0,0 +1,33 @@
+#ifndef _PERF_SYMBOL_
+#define _PERF_SYMBOL_ 1
+
+#include <linux/types.h>
+#include "list.h"
+#include "rbtree.h"
+
+struct symbol {
+	struct rb_node	rb_node;
+	__u64		start;
+	__u64		end;
+	char		name[0];
+};
+
+struct dso {
+	struct list_head node;
+	struct rb_root	 syms;
+	char		 name[0];
+};
+
+struct dso *dso__new(const char *name);
+void dso__delete(struct dso *self);
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
+
+int dso__load_kallsyms(struct dso *self);
+int dso__load_vmlinux(struct dso *self, const char *vmlinux);
+int dso__load(struct dso *self);
+
+size_t dso__fprintf(struct dso *self, FILE *fp);
+
+void symbol__init(void);
+#endif /* _PERF_SYMBOL_ */
-- 
cgit v0.10.2


From 0085c954140d27937ada29d139c16341075d45e4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:13 -0300
Subject: perf_counter tools: struct symbol priv area

When creating a dso instance allow asking that all symbols in this dso
have a private area just before the symbol.

perf top will use this for its counters, etc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175513.GD4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 04fc7ec..889067e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -83,7 +83,7 @@ static struct dso *dsos__findnew(const char *name)
 	int nr;
 
 	if (dso == NULL) {
-		dso = dso__new(name);
+		dso = dso__new(name, 0);
 		if (!dso)
 			goto out_delete_dso;
 
@@ -120,7 +120,7 @@ static int load_kernel(void)
 {
 	int err = -1;
 
-	kernel_dso = dso__new("[kernel]");
+	kernel_dso = dso__new("[kernel]", 0);
 	if (!kernel_dso)
 		return -1;
 
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index d06de2c..7088206 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -7,22 +7,27 @@
 #include <elf.h>
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len,
-				  const char *name)
+				  const char *name, unsigned int priv_size)
 {
-	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+	size_t namelen = strlen(name) + 1;
+	struct symbol *self = malloc(priv_size + sizeof(*self) + namelen);
 
 	if (self != NULL) {
+		if (priv_size) {
+			memset(self, 0, priv_size);
+			self = ((void *)self) + priv_size;
+		}
 		self->start = start;
 		self->end   = start + len;
-		strcpy(self->name, name);
+		memcpy(self->name, name, namelen);
 	}
 
 	return self;
 }
 
-static void symbol__delete(struct symbol *self)
+static void symbol__delete(struct symbol *self, unsigned int priv_size)
 {
-	free(self);
+	free(((void *)self) - priv_size);
 }
 
 static size_t symbol__fprintf(struct symbol *self, FILE *fp)
@@ -31,13 +36,14 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 		       self->start, self->end, self->name);
 }
 
-struct dso *dso__new(const char *name)
+struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 {
 	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
 
 	if (self != NULL) {
 		strcpy(self->name, name);
 		self->syms = RB_ROOT;
+		self->sym_priv_size = sym_priv_size;
 	}
 
 	return self;
@@ -51,7 +57,7 @@ static void dso__delete_symbols(struct dso *self)
 	while (next) {
 		pos = rb_entry(next, struct symbol, rb_node);
 		next = rb_next(&pos->rb_node);
-		symbol__delete(pos);
+		symbol__delete(pos, self->sym_priv_size);
 	}
 }
 
@@ -189,7 +195,8 @@ int dso__load_kallsyms(struct dso *self)
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		sym = symbol__new(start, 0xdead, line + len + 2);
+		sym = symbol__new(start, 0xdead, line + len + 2,
+				  self->sym_priv_size);
 
 		if (sym == NULL)
 			goto out_delete_line;
@@ -340,7 +347,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name)
 		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
 
 		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs));
+				elf_sym__name(&sym, symstrs),
+				self->sym_priv_size);
 		if (!f)
 			goto out_elf_end;
 
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 6dffe76..9e120af 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -15,12 +15,18 @@ struct symbol {
 struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
+	unsigned int	 sym_priv_size;
 	char		 name[0];
 };
 
-struct dso *dso__new(const char *name);
+struct dso *dso__new(const char *name, unsigned int sym_priv_size);
 void dso__delete(struct dso *self);
 
+static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
+{
+	return ((void *)sym) - self->sym_priv_size;
+}
+
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
 int dso__load_kallsyms(struct dso *self);
-- 
cgit v0.10.2


From a827c875f2ebe69c6e6db5e7f112d4beb4d80f01 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:19 -0300
Subject: perf_counter tools: Consolidate dso methods to load kernel symbols

Now one has just to use dso__load_kernel() optionally passing a vmlinux
filename.

Will make things easier for perf top that will want to pass a callback
to filter some symbols.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 889067e..4e9f2bc 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -118,18 +118,13 @@ static void dsos__fprintf(FILE *fp)
 
 static int load_kernel(void)
 {
-	int err = -1;
+	int err;
 
 	kernel_dso = dso__new("[kernel]", 0);
 	if (!kernel_dso)
 		return -1;
 
-	if (vmlinux)
-		err = dso__load_vmlinux(kernel_dso, vmlinux);
-
-	if (err)
-		err = dso__load_kallsyms(kernel_dso);
-
+	err = dso__load_kernel(kernel_dso, vmlinux);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 7088206..504ac31 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -155,7 +155,7 @@ static int hex2long(char *ptr, unsigned long *long_val)
 	return p - ptr;
 }
 
-int dso__load_kallsyms(struct dso *self)
+static int dso__load_kallsyms(struct dso *self)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -410,7 +410,7 @@ out:
 	return ret;
 }
 
-int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
@@ -423,6 +423,19 @@ int dso__load_vmlinux(struct dso *self, const char *vmlinux)
 	return err;
 }
 
+int dso__load_kernel(struct dso *self, const char *vmlinux)
+{
+	int err = -1;
+
+	if (vmlinux)
+		err = dso__load_vmlinux(self, vmlinux);
+
+	if (err)
+		err = dso__load_kallsyms(self);
+
+	return err;
+}
+
 void symbol__init(void)
 {
 	elf_version(EV_CURRENT);
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 9e120af..db2fdf9 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -29,8 +29,7 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
-int dso__load_kallsyms(struct dso *self);
-int dso__load_vmlinux(struct dso *self, const char *vmlinux);
+int dso__load_kernel(struct dso *self, const char *vmlinux);
 int dso__load(struct dso *self);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
-- 
cgit v0.10.2


From 69ee69f63c82e63d9c6c6081d12673af4933c51e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:26 -0300
Subject: perf_counter tools: Optionally pass a symbol filter to the dso load
 routines

Will be used by perf top.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175526.GF4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4e9f2bc..412d524 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -87,7 +87,7 @@ static struct dso *dsos__findnew(const char *name)
 		if (!dso)
 			goto out_delete_dso;
 
-		nr = dso__load(dso);
+		nr = dso__load(dso, NULL);
 		if (nr < 0) {
 			fprintf(stderr, "Failed to open: %s\n", name);
 			goto out_delete_dso;
@@ -124,7 +124,7 @@ static int load_kernel(void)
 	if (!kernel_dso)
 		return -1;
 
-	err = dso__load_kernel(kernel_dso, vmlinux);
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 504ac31..4728121 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -155,7 +155,7 @@ static int hex2long(char *ptr, unsigned long *long_val)
 	return p - ptr;
 }
 
-static int dso__load_kallsyms(struct dso *self)
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -201,7 +201,10 @@ static int dso__load_kallsyms(struct dso *self)
 		if (sym == NULL)
 			goto out_delete_line;
 
-		dso__insert_symbol(self, sym);
+		if (filter && filter(self, sym))
+			symbol__delete(sym, self->sym_priv_size);
+		else
+			dso__insert_symbol(self, sym);
 	}
 
 	/*
@@ -286,7 +289,8 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
-static int dso__load_sym(struct dso *self, int fd, const char *name)
+static int dso__load_sym(struct dso *self, int fd, const char *name,
+			 symbol_filter_t filter)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
@@ -352,9 +356,12 @@ static int dso__load_sym(struct dso *self, int fd, const char *name)
 		if (!f)
 			goto out_elf_end;
 
-		dso__insert_symbol(self, f);
-
-		nr++;
+		if (filter && filter(self, f))
+			symbol__delete(f, self->sym_priv_size);
+		else {
+			dso__insert_symbol(self, f);
+			nr++;
+		}
 	}
 
 	err = nr;
@@ -364,7 +371,7 @@ out_close:
 	return err;
 }
 
-int dso__load(struct dso *self)
+int dso__load(struct dso *self, symbol_filter_t filter)
 {
 	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
 	char *name = malloc(size);
@@ -396,7 +403,7 @@ more:
 		fd = open(name, O_RDONLY);
 	} while (fd < 0);
 
-	ret = dso__load_sym(self, fd, name);
+	ret = dso__load_sym(self, fd, name, filter);
 	close(fd);
 
 	/*
@@ -410,28 +417,29 @@ out:
 	return ret;
 }
 
-static int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
+			     symbol_filter_t filter)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
 	if (fd < 0)
 		return -1;
 
-	err = dso__load_sym(self, fd, vmlinux);
+	err = dso__load_sym(self, fd, vmlinux, filter);
 	close(fd);
 
 	return err;
 }
 
-int dso__load_kernel(struct dso *self, const char *vmlinux)
+int dso__load_kernel(struct dso *self, const char *vmlinux, symbol_filter_t filter)
 {
 	int err = -1;
 
 	if (vmlinux)
-		err = dso__load_vmlinux(self, vmlinux);
+		err = dso__load_vmlinux(self, vmlinux, filter);
 
 	if (err)
-		err = dso__load_kallsyms(self);
+		err = dso__load_kallsyms(self, filter);
 
 	return err;
 }
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index db2fdf9..b0299bc 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -19,6 +19,8 @@ struct dso {
 	char		 name[0];
 };
 
+typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
+
 struct dso *dso__new(const char *name, unsigned int sym_priv_size);
 void dso__delete(struct dso *self);
 
@@ -29,8 +31,9 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
-int dso__load_kernel(struct dso *self, const char *vmlinux);
-int dso__load(struct dso *self);
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter);
+int dso__load(struct dso *self, symbol_filter_t filter);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
 
-- 
cgit v0.10.2


From de04687f868bf98e4ef644af91ed85a3bc212ce8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:41 -0300
Subject: perf_counter tools: Convert builtin-top to use libperf symbol
 routines

Now both perf top and report use the same routines.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175541.GG4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a890872..52ba9f4 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -44,8 +44,9 @@
 
 #include "perf.h"
 #include "builtin.h"
+#include "util/symbol.h"
 #include "util/util.h"
-#include "util/util.h"
+#include "util/rbtree.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 
@@ -125,19 +126,21 @@ static uint64_t			min_ip;
 static uint64_t			max_ip = -1ll;
 
 struct sym_entry {
-	unsigned long long	addr;
-	char			*sym;
+	struct rb_node		rb_node;
+	struct list_head	node;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
 };
 
-#define MAX_SYMS		100000
-
-static int sym_table_count;
-
 struct sym_entry		*sym_filter_entry;
 
-static struct sym_entry		sym_table[MAX_SYMS];
+struct dso *kernel_dso;
+
+/*
+ * Symbols will be added here in record_ip and will get out
+ * after decayed.
+ */
+static LIST_HEAD(active_symbols);
 
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
@@ -157,42 +160,60 @@ static double sym_weight(const struct sym_entry *sym)
 	return weight;
 }
 
-static int compare(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym_weight(sym1) < sym_weight(sym2);
-}
-
 static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
-static struct sym_entry		tmp[MAX_SYMS];
+static void list_insert_active_sym(struct sym_entry *syme)
+{
+	list_add(&syme->node, &active_symbols);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (sym_weight(se) > sym_weight(iter))
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
 
 static void print_sym_table(void)
 {
-	int i, j, active_count, printed;
+	int printed, j;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 	float sum_kevents = 0.0;
+	struct sym_entry *syme, *n;
+	struct rb_root tmp = RB_ROOT;
+	struct rb_node *nd;
 
 	events = userspace_events = 0;
 
-	/* Iterate over symbol table and copy/tally/decay active symbols. */
-	for (i = 0, active_count = 0; i < sym_table_count; i++) {
-		if (sym_table[i].count[0]) {
-			tmp[active_count++] = sym_table[i];
-			sum_kevents += sym_table[i].count[0];
+	/* Sort the active symbols */
+	list_for_each_entry_safe(syme, n, &active_symbols, node) {
+		if (syme->count[0] != 0) {
+			rb_insert_active_sym(&tmp, syme);
+			sum_kevents += syme->count[0];
 
 			for (j = 0; j < nr_counters; j++)
-				sym_table[i].count[j] = zero ? 0 : sym_table[i].count[j] * 7 / 8;
-		}
+				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
+		} else
+			list_del_init(&syme->node);
 	}
 
-	qsort(tmp, active_count + 1, sizeof(tmp[0]), compare);
-
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
 	printf(
@@ -238,23 +259,25 @@ static void print_sym_table(void)
 	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	for (i = 0, printed = 0; i < active_count; i++) {
+	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || tmp[i].count[0] < count_filter)
+		if (++printed > 18 || syme->count[0] < count_filter)
 			break;
 
-		pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->count[0]) /
+					 sum_kevents));
 
 		if (nr_counters == 1)
 			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				sym_weight(tmp + i),
-				pcnt, tmp[i].addr, tmp[i].sym);
+				sym_weight(syme),
+				pcnt, sym->start, sym->name);
 		else
 			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				sym_weight(tmp + i),
-				tmp[i].count[0],
-				pcnt, tmp[i].addr, tmp[i].sym);
+				sym_weight(syme), syme->count[0],
+				pcnt, sym->start, sym->name);
 	}
 
 	{
@@ -277,146 +300,85 @@ static void *display_thread(void *arg)
 	return NULL;
 }
 
-static int read_symbol(FILE *in, struct sym_entry *s)
+static int symbol_filter(struct dso *self, struct symbol *sym)
 {
-	static int filter_match = 0;
-	char *sym, stype;
-	char str[500];
-	int rc, pos;
-
-	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
-	if (rc == EOF)
-		return -1;
-
-	assert(rc == 3);
-
-	/* skip until end of line: */
-	pos = strlen(str);
-	do {
-		rc = fgetc(in);
-		if (rc == '\n' || rc == EOF || pos >= 499)
-			break;
-		str[pos] = rc;
-		pos++;
-	} while (1);
-	str[pos] = 0;
-
-	sym = str;
-
-	/* Filter out known duplicates and non-text symbols. */
-	if (!strcmp(sym, "_text"))
-		return 1;
-	if (!min_ip && !strcmp(sym, "_stext"))
-		return 1;
-	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
-		return 1;
-	if (stype != 'T' && stype != 't')
-		return 1;
-	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
-		return 1;
-	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+	static int filter_match;
+	struct sym_entry *syme;
+	const char *name = sym->name;
+
+	if (!strcmp(name, "_text") ||
+	    !strcmp(name, "_etext") ||
+	    !strcmp(name, "_sinittext") ||
+	    !strncmp("init_module", name, 11) ||
+	    !strncmp("cleanup_module", name, 14) ||
+	    strstr(name, "_text_start") ||
+	    strstr(name, "_text_end"))
 		return 1;
 
-	s->sym = malloc(strlen(str)+1);
-	assert(s->sym);
-
-	strcpy((char *)s->sym, str);
-	s->skip = 0;
-
+	syme = dso__sym_priv(self, sym);
 	/* Tag events to be skipped. */
-	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("mwait_idle", s->sym))
-		s->skip = 1;
+	if (!strcmp("default_idle", name) ||
+	    !strcmp("cpu_idle", name) ||
+	    !strcmp("enter_idle", name) ||
+	    !strcmp("exit_idle", name) ||
+	    !strcmp("mwait_idle", name))
+		syme->skip = 1;
 
 	if (filter_match == 1) {
-		filter_end = s->addr;
+		filter_end = sym->start;
 		filter_match = -1;
 		if (filter_end - filter_start > 10000) {
-			printf("hm, too large filter symbol <%s> - skipping.\n",
+			fprintf(stderr,
+				"hm, too large filter symbol <%s> - skipping.\n",
 				sym_filter);
-			printf("symbol filter start: %016lx\n", filter_start);
-			printf("                end: %016lx\n", filter_end);
+			fprintf(stderr, "symbol filter start: %016lx\n",
+				filter_start);
+			fprintf(stderr, "                end: %016lx\n",
+				filter_end);
 			filter_end = filter_start = 0;
 			sym_filter = NULL;
 			sleep(1);
 		}
 	}
-	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+
+	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
 		filter_match = 1;
-		filter_start = s->addr;
+		filter_start = sym->start;
 	}
 
+
 	return 0;
 }
 
-static int compare_addr(const void *__sym1, const void *__sym2)
+static int parse_symbols(void)
 {
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+	struct rb_node *node;
+	struct symbol  *sym;
 
-	return sym1->addr > sym2->addr;
-}
-
-static void sort_symbol_table(void)
-{
-	int i, dups;
-
-	do {
-		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
-		for (i = 0, dups = 0; i < sym_table_count; i++) {
-			if (sym_table[i].addr == sym_table[i+1].addr) {
-				sym_table[i+1].addr = -1ll;
-				dups++;
-			}
-		}
-		sym_table_count -= dups;
-	} while(dups);
-}
-
-static void parse_symbols(void)
-{
-	struct sym_entry *last;
+	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
+	if (kernel_dso == NULL)
+		return -1;
 
-	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter) != 0)
+		goto out_delete_dso;
 
-	if (!kallsyms) {
-		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
-		exit(-1);
-	}
+	node = rb_first(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	min_ip = sym->start;
 
-	while (!feof(kallsyms)) {
-		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
-			sym_table_count++;
-			assert(sym_table_count <= MAX_SYMS);
-		}
-	}
+	node = rb_last(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	max_ip = sym->start;
 
-	sort_symbol_table();
-	min_ip = sym_table[0].addr;
-	max_ip = sym_table[sym_table_count-1].addr;
-	last = sym_table + sym_table_count++;
+	if (dump_symtab)
+		dso__fprintf(kernel_dso, stdout);
 
-	last->addr = -1ll;
-	last->sym = "<end>";
-
-	if (filter_end) {
-		int count;
-		for (count=0; count < sym_table_count; count ++) {
-			if (!strcmp(sym_table[count].sym, sym_filter)) {
-				sym_filter_entry = &sym_table[count];
-				break;
-			}
-		}
-	}
-	if (dump_symtab) {
-		int i;
+	return 0;
 
-		for (i = 0; i < sym_table_count; i++)
-			fprintf(stderr, "%llx %s\n",
-				sym_table[i].addr, sym_table[i].sym);
-	}
+out_delete_dso:
+	dso__delete(kernel_dso);
+	kernel_dso = NULL;
+	return -1;
 }
 
 #define TRACE_COUNT     3
@@ -426,51 +388,20 @@ static void parse_symbols(void)
  */
 static void record_ip(uint64_t ip, int counter)
 {
-	int left_idx, middle_idx, right_idx, idx;
-	unsigned long left, middle, right;
-
-	left_idx = 0;
-	right_idx = sym_table_count-1;
-	assert(ip <= max_ip && ip >= min_ip);
-
-	while (left_idx + 1 < right_idx) {
-		middle_idx = (left_idx + right_idx) / 2;
+	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
 
-		left   = sym_table[  left_idx].addr;
-		middle = sym_table[middle_idx].addr;
-		right  = sym_table[ right_idx].addr;
+	if (sym != NULL) {
+		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
 
-		if (!(left <= middle && middle <= right)) {
-			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
-			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		if (!syme->skip) {
+			syme->count[counter]++;
+			if (list_empty(&syme->node) || !syme->node.next)
+				list_insert_active_sym(syme);
+			return;
 		}
-		assert(left <= middle && middle <= right);
-		if (!(left <= ip && ip <= right)) {
-			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", (unsigned long)ip);
-			printf("right: %016lx\n", right);
-		}
-		assert(left <= ip && ip <= right);
-		/*
-		 * [ left .... target .... middle .... right ]
-		 *   => right := middle
-		 */
-		if (ip < middle) {
-			right_idx = middle_idx;
-			continue;
-		}
-		/*
-		 * [ left .... middle ... target ... right ]
-		 *   => left := middle
-		 */
-		left_idx = middle_idx;
 	}
 
-	idx = left_idx;
-
-	if (!sym_table[idx].skip)
-		sym_table[idx].count[counter]++;
-	else events--;
+	events--;
 }
 
 static void process_event(uint64_t ip, int counter)
-- 
cgit v0.10.2


From 9e09675366695405412b709e91709c1ce2925c90 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 28 May 2009 16:25:34 +0200
Subject: perf_counter tools: Document '--' option parsing terminator

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index 353db1b..a93d2ec 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -9,6 +9,7 @@ SYNOPSIS
 --------
 [verse]
 'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 7fcab27..828c59ff 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -9,6 +9,7 @@ SYNOPSIS
 --------
 [verse]
 'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 4a06866..23d1224 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -397,7 +397,8 @@ static int __cmd_record(int argc, const char **argv)
 }
 
 static const char * const record_usage[] = {
-	"perf record [<options>] <command>",
+	"perf record [<options>] [<command>]",
+	"perf record [<options>] -- <command> [<options>]",
 	NULL
 };
 
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ce661e2..ac14086 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -212,6 +212,7 @@ static void skip_signal(int signo)
 
 static const char * const stat_usage[] = {
 	"perf stat [<options>] <command>",
+	"perf stat [<options>] -- <command> [<options>]",
 	NULL
 };
 
-- 
cgit v0.10.2


From 2f102607ac77354b02a76cf2748598ce9f270f08 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 27 May 2009 23:59:58 -0400
Subject: i7300_idle: allow testing on i5000-series hardware w/o re-compile

Testing the i7300_idle driver on i5000-series hardware required
an edit to i7300_idle.h to "#define SUPPORT_I5000 1" and a re-build
of both i7300_idle and ioat_dma.

Replace that build-time scheme with a load-time module parameter:
"7300_idle.forceload=1" to make it easier to test the driver
on hardware that while not officially validated, works fine
and is much more commonly available.

By default (no modparam) the driver will continue to load
only on the i7300.

Note that ioat_dma runs a copy of i7300_idle's probe routine
to know to reserve an IOAT channel for i7300_idle.
This change makes ioat_dma do that always on the i5000,
just like it does on the i7300.

Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Andrew Henroid <andrew.d.henroid@intel.com>

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index 1955ee8..a600fc0 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -173,7 +173,7 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
 	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
 
 #ifdef  CONFIG_I7300_IDLE_IOAT_CHANNEL
-	if (i7300_idle_platform_probe(NULL, NULL) == 0) {
+	if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) {
 		device->common.chancnt--;
 	}
 #endif
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index bf74039..949c97f 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -41,6 +41,10 @@ static int debug;
 module_param_named(debug, debug, uint, 0644);
 MODULE_PARM_DESC(debug, "Enable debug printks in this driver");
 
+static int forceload;
+module_param_named(forceload, forceload, uint, 0644);
+MODULE_PARM_DESC(debug, "Enable driver testing on unvalidated i5000");
+
 #define dprintk(fmt, arg...) \
 	do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0)
 
@@ -552,7 +556,7 @@ static int __init i7300_idle_init(void)
 	cpus_clear(idle_cpumask);
 	total_us = 0;
 
-	if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev))
+	if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload))
 		return -ENODEV;
 
 	if (i7300_idle_thrt_save())
diff --git a/include/linux/i7300_idle.h b/include/linux/i7300_idle.h
index 05a80c4..1587b7d 100644
--- a/include/linux/i7300_idle.h
+++ b/include/linux/i7300_idle.h
@@ -16,35 +16,33 @@
 struct fbd_ioat {
 	unsigned int vendor;
 	unsigned int ioat_dev;
+	unsigned int enabled;
 };
 
 /*
  * The i5000 chip-set has the same hooks as the i7300
- * but support is disabled by default because this driver
- * has not been validated on that platform.
+ * but it is not enabled by default and must be manually
+ * manually enabled with "forceload=1" because it is
+ * only lightly validated.
  */
-#define SUPPORT_I5000 0
 
 static const struct fbd_ioat fbd_ioat_list[] = {
-	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB},
-#if SUPPORT_I5000
-	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT},
-#endif
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB, 1},
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT, 0},
 	{0, 0}
 };
 
 /* table of devices that work with this driver */
 static const struct pci_device_id pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) },
-#if SUPPORT_I5000
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) },
-#endif
 	{ } /* Terminating entry */
 };
 
 /* Check for known platforms with I/O-AT */
 static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev,
-						struct pci_dev **ioat_dev)
+						struct pci_dev **ioat_dev,
+						int enable_all)
 {
 	int i;
 	struct pci_dev *memdev, *dmadev;
@@ -69,6 +67,8 @@ static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev,
 	for (i = 0; fbd_ioat_list[i].vendor != 0; i++) {
 		if (dmadev->vendor == fbd_ioat_list[i].vendor &&
 		    dmadev->device == fbd_ioat_list[i].ioat_dev) {
+			if (!(fbd_ioat_list[i].enabled || enable_all))
+				continue;
 			if (fbd_dev)
 				*fbd_dev = memdev;
 			if (ioat_dev)
-- 
cgit v0.10.2


From a3ec8d70f1a55acccc4874fe9b4dadbbb9454a0f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 29 May 2009 06:46:46 +0200
Subject: perf_counter tools: Fix top symbol table dump typo

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 52ba9f4..0d100f5 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -371,7 +371,7 @@ static int parse_symbols(void)
 	max_ip = sym->start;
 
 	if (dump_symtab)
-		dso__fprintf(kernel_dso, stdout);
+		dso__fprintf(kernel_dso, stderr);
 
 	return 0;
 
-- 
cgit v0.10.2


From da417a7537cbf4beb28a08a49adf915f2358040c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 29 May 2009 08:23:16 +0200
Subject: perf_counter tools: Fix top symbol table max_ip typo

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 0d100f5..ebe8bec 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -368,7 +368,7 @@ static int parse_symbols(void)
 
 	node = rb_last(&kernel_dso->syms);
 	sym = rb_entry(node, struct symbol, rb_node);
-	max_ip = sym->start;
+	max_ip = sym->end;
 
 	if (dump_symtab)
 		dso__fprintf(kernel_dso, stderr);
-- 
cgit v0.10.2


From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 29 May 2009 13:28:35 +0800
Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt

Always use NMI for performance-monitoring interrupt as there could be
racy situations if we switch between irq and nmi mode frequently.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index d08dd52..876ed97 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void);
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(int nmi);
+extern void perf_counters_lapic_init(void);
 #else
 static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(int nmi)	{ }
+static inline void perf_counters_lapic_init(void)	{ }
 #endif
 
 #endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 89b63b5..60df2ef 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2eeaa99..316b0c9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -604,7 +604,7 @@ try_generic:
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
+	perf_counters_lapic_init();
 
 	x86_pmu.disable(hwc, idx);
 
@@ -863,24 +863,15 @@ void set_perf_counter_pending(void)
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
 }
 
-void perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(void)
 {
-	u32 apic_val;
-
 	if (!x86_pmu_initialized())
 		return;
 
 	/*
-	 * Enable the performance counter vector in the APIC LVT:
+	 * Always use NMI for PMU
 	 */
-	apic_val = apic_read(APIC_LVTERR);
-
-	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
-	if (nmi)
-		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	else
-		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	apic_write(APIC_LVTERR, apic_val);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
 static int __kprobes
@@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
cgit v0.10.2


From c04f5e5d7b523f90ee3cdd70a68c4002aaecd3fa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: perf_counter tools: Clean up builtin-stat.c's do_perfstat()

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ac14086..6a29361 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -109,11 +109,75 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
+/*
+ * Does the counter have nsecs as a unit?
+ */
+static inline int nsec_counter(int counter)
+{
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK))
+		return 1;
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 count[3], single_count[3];
+	ssize_t res;
+	int cpu, nv;
+	int scaled;
+
+	count[0] = count[1] = count[2] = 0;
+	nv = scale ? 3 : 1;
+	for (cpu = 0; cpu < nr_cpus; cpu ++) {
+		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
+		assert(res == nv * sizeof(__u64));
+
+		count[0] += single_count[0];
+		if (scale) {
+			count[1] += single_count[1];
+			count[2] += single_count[2];
+		}
+	}
+
+	scaled = 0;
+	if (scale) {
+		if (count[2] == 0) {
+			fprintf(stderr, " %14s  %-20s\n",
+				"<not counted>", event_name(counter));
+			return;
+		}
+		if (count[2] < count[1]) {
+			scaled = 1;
+			count[0] = (unsigned long long)
+				((double)count[0] * count[1] / count[2] + 0.5);
+		}
+	}
+
+	if (nsec_counter(counter)) {
+		double msecs = (double)count[0] / 1000000;
+
+		fprintf(stderr, " %14.6f  %-20s (msecs)",
+			msecs, event_name(counter));
+	} else {
+		fprintf(stderr, " %14Ld  %-20s (events)",
+			count[0], event_name(counter));
+	}
+	if (scaled)
+		fprintf(stderr, "  (scaled from %.2f%%)",
+			(double) count[2] / count[1] * 100);
+	fprintf(stderr, "\n");
+}
+
 static int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
-	ssize_t res;
 	int status;
 	int pid;
 
@@ -149,55 +213,10 @@ static int do_perfstat(int argc, const char **argv)
 		argv[0]);
 	fprintf(stderr, "\n");
 
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+	for (counter = 0; counter < nr_counters; counter++)
+		print_counter(counter);
 
-			double msecs = (double)count[0] / 1000000;
 
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
 			(double)(t1-t0)/1e6);
@@ -212,7 +231,6 @@ static void skip_signal(int signo)
 
 static const char * const stat_usage[] = {
 	"perf stat [<options>] <command>",
-	"perf stat [<options>] -- <command> [<options>]",
 	NULL
 };
 
-- 
cgit v0.10.2


From 2996f5ddb7ba8889caeeac65edafe48845106eaa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: perf_counter tools: Split display into reading and printing

We introduce the extra pass to allow the print-out to possibly
rely on already read counters.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 6a29361..0c92eb7 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -71,6 +71,9 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
+static __u64			event_res[MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_COUNTERS];
+
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -123,16 +126,19 @@ static inline int nsec_counter(int counter)
 }
 
 /*
- * Print out the results of a single counter:
+ * Read out the results of a single counter:
  */
-static void print_counter(int counter)
+static void read_counter(int counter)
 {
-	__u64 count[3], single_count[3];
+	__u64 *count, single_count[3];
 	ssize_t res;
 	int cpu, nv;
 	int scaled;
 
+	count = event_res[counter];
+
 	count[0] = count[1] = count[2] = 0;
+
 	nv = scale ? 3 : 1;
 	for (cpu = 0; cpu < nr_cpus; cpu ++) {
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
@@ -148,16 +154,35 @@ static void print_counter(int counter)
 	scaled = 0;
 	if (scale) {
 		if (count[2] == 0) {
-			fprintf(stderr, " %14s  %-20s\n",
-				"<not counted>", event_name(counter));
+			event_scaled[counter] = -1;
+			count[0] = 0;
 			return;
 		}
+
 		if (count[2] < count[1]) {
-			scaled = 1;
+			event_scaled[counter] = 1;
 			count[0] = (unsigned long long)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
 	}
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 *count;
+	int scaled;
+
+	count = event_res[counter];
+	scaled = event_scaled[counter];
+
+	if (scaled == -1) {
+		fprintf(stderr, " %14s  %-20s\n",
+			"<not counted>", event_name(counter));
+		return;
+	}
 
 	if (nsec_counter(counter)) {
 		double msecs = (double)count[0] / 1000000;
@@ -214,6 +239,9 @@ static int do_perfstat(int argc, const char **argv)
 	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
+	for (counter = 0; counter < nr_counters; counter++)
 		print_counter(counter);
 
 
-- 
cgit v0.10.2


From be1ac0d81d0e3ab655f8c8ade31fb860ef6aa186 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: perf_counter tools: Also display time-normalized stat results

Add new column that normalizes counter results by
'nanoseconds spent running' unit.

Before:

 Performance counter stats for '/home/mingo/hackbench':

   10469.403605  task clock ticks     (msecs)
          75502  context switches     (events)
           9501  CPU migrations       (events)
          36158  pagefaults           (events)
    31975676185  CPU cycles           (events)
    26257738659  instructions         (events)
      108740581  cache references     (events)
       54606088  cache misses         (events)

 Wall-clock time elapsed:   810.514504 msecs

After:

 Performance counter stats for '/home/mingo/hackbench':

   10469.403605  task clock ticks     (msecs)
          75502  context switches     #        0.007 M/sec
           9501  CPU migrations       #        0.001 M/sec
          36158  pagefaults           #        0.003 M/sec
    31975676185  CPU cycles           #     3054.202 M/sec
    26257738659  instructions         #     2508.045 M/sec
      108740581  cache references     #       10.387 M/sec
       54606088  cache misses         #        5.216 M/sec

 Wall-clock time elapsed:   810.514504 msecs

The advantage of that column is that it is characteristic of the
execution workflow, regardless of runtime. Hence 'hackbench 10'
will look similar to 'hackbench 15' - while the absolute counter
values are very different.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 0c92eb7..ef7e0e1 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -74,6 +74,8 @@ static const unsigned int default_count[] = {
 static __u64			event_res[MAX_COUNTERS][3];
 static __u64			event_scaled[MAX_COUNTERS];
 
+static __u64			runtime_nsecs;
+
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -165,6 +167,11 @@ static void read_counter(int counter)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
 	}
+	/*
+	 * Save the full runtime - to allow normalization during printout:
+	 */
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+		runtime_nsecs = count[0];
 }
 
 /*
@@ -190,8 +197,11 @@ static void print_counter(int counter)
 		fprintf(stderr, " %14.6f  %-20s (msecs)",
 			msecs, event_name(counter));
 	} else {
-		fprintf(stderr, " %14Ld  %-20s (events)",
+		fprintf(stderr, " %14Ld  %-20s",
 			count[0], event_name(counter));
+		if (runtime_nsecs)
+			fprintf(stderr, " # %12.3f M/sec",
+				(double)count[0]/runtime_nsecs*1000.0);
 	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
-- 
cgit v0.10.2


From 10b6d95612672f89deb39b5a60fb677c78ba4844 Mon Sep 17 00:00:00 2001
From: Divy Le Ray <divy@chelsio.com>
Date: Thu, 28 May 2009 11:23:02 +0000
Subject: cxgb3: fix dma mapping regression

Commit 5e68b772e6efd189d6aca76f6872fb75d51ace60
  cxgb3: map entire Rx page, feed map+offset to Rx ring.

introduced a regression on platforms defining DECLARE_PCI_UNMAP_ADDR()
and related macros as no-ops.

Rx descriptors are fed with the a page buffer bus address + page chunk offset.
The page buffer bus address is set and retrieved through
pci_unamp_addr_set(), pci_unmap_addr().
These functions being meaningless on x86 (if CONFIG_DMA_API_DEBUG is not set).
The HW ends up with a bogus bus address.

This patch saves the page buffer bus address for all plaftorms.

Signed-off-by: Divy Le Ray <divy@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 714df2b..c888e97 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -85,8 +85,8 @@ struct fl_pg_chunk {
 	struct page *page;
 	void *va;
 	unsigned int offset;
-	u64 *p_cnt;
-	DECLARE_PCI_UNMAP_ADDR(mapping);
+	unsigned long *p_cnt;
+	dma_addr_t mapping;
 };
 
 struct rx_desc;
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 26d3587..b3ee2bc 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -355,7 +355,7 @@ static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
 		(*d->pg_chunk.p_cnt)--;
 		if (!*d->pg_chunk.p_cnt)
 			pci_unmap_page(pdev,
-				       pci_unmap_addr(&d->pg_chunk, mapping),
+				       d->pg_chunk.mapping,
 				       q->alloc_size, PCI_DMA_FROMDEVICE);
 
 		put_page(d->pg_chunk.page);
@@ -454,7 +454,7 @@ static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
 		q->pg_chunk.offset = 0;
 		mapping = pci_map_page(adapter->pdev, q->pg_chunk.page,
 				       0, q->alloc_size, PCI_DMA_FROMDEVICE);
-		pci_unmap_addr_set(&q->pg_chunk, mapping, mapping);
+		q->pg_chunk.mapping = mapping;
 	}
 	sd->pg_chunk = q->pg_chunk;
 
@@ -511,8 +511,7 @@ static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
 nomem:				q->alloc_failed++;
 				break;
 			}
-			mapping = pci_unmap_addr(&sd->pg_chunk, mapping) +
-						 sd->pg_chunk.offset;
+			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
 			pci_unmap_addr_set(sd, dma_addr, mapping);
 
 			add_one_rx_chunk(mapping, d, q->gen);
@@ -881,7 +880,7 @@ recycle:
 	(*sd->pg_chunk.p_cnt)--;
 	if (!*sd->pg_chunk.p_cnt)
 		pci_unmap_page(adap->pdev,
-			       pci_unmap_addr(&sd->pg_chunk, mapping),
+			       sd->pg_chunk.mapping,
 			       fl->alloc_size,
 			       PCI_DMA_FROMDEVICE);
 	if (!skb) {
@@ -2096,7 +2095,7 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
 	(*sd->pg_chunk.p_cnt)--;
 	if (!*sd->pg_chunk.p_cnt)
 		pci_unmap_page(adap->pdev,
-			       pci_unmap_addr(&sd->pg_chunk, mapping),
+			       sd->pg_chunk.mapping,
 			       fl->alloc_size,
 			       PCI_DMA_FROMDEVICE);
 
-- 
cgit v0.10.2


From c22c8149313ee85c912e7b77a7afd04be8b8cba8 Mon Sep 17 00:00:00 2001
From: Divy Le Ray <divy@chelsio.com>
Date: Thu, 28 May 2009 11:23:08 +0000
Subject: cxgb3: link fault fixes

Do not call t3_link_fault() under spinlock, as it calls msleep().
Besides, only the access to pi->link_fault needs to be serialized.

Also initialize local variables before checking the link status,
link state fields might otherwise end up containing garbage.

Signed-off-by: Divy Le Ray <divy@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 7ea4841..17858b9 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -2496,14 +2496,16 @@ static void check_link_status(struct adapter *adapter)
 	for_each_port(adapter, i) {
 		struct net_device *dev = adapter->port[i];
 		struct port_info *p = netdev_priv(dev);
+		int link_fault;
 
 		spin_lock_irq(&adapter->work_lock);
-		if (p->link_fault) {
+		link_fault = p->link_fault;
+		spin_unlock_irq(&adapter->work_lock);
+
+		if (link_fault) {
 			t3_link_fault(adapter, i);
-			spin_unlock_irq(&adapter->work_lock);
 			continue;
 		}
-		spin_unlock_irq(&adapter->work_lock);
 
 		if (!(p->phy.caps & SUPPORTED_IRQ) && netif_running(dev)) {
 			t3_xgm_intr_disable(adapter, i);
diff --git a/drivers/net/cxgb3/t3_hw.c b/drivers/net/cxgb3/t3_hw.c
index 4f68aeb..4950d5d 100644
--- a/drivers/net/cxgb3/t3_hw.c
+++ b/drivers/net/cxgb3/t3_hw.c
@@ -1274,6 +1274,11 @@ void t3_link_fault(struct adapter *adapter, int port_id)
 				 A_XGM_INT_STATUS + mac->offset);
 	link_fault &= F_LINKFAULTCHANGE;
 
+	link_ok = lc->link_ok;
+	speed = lc->speed;
+	duplex = lc->duplex;
+	fc = lc->fc;
+
 	phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc);
 
 	if (link_fault) {
-- 
cgit v0.10.2


From 4e0168fa4842e27795a75b205a510f25b62181d9 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Thu, 28 May 2009 02:05:53 +0000
Subject: mac8390: fix build with NET_POLL_CONTROLLER

Fix the build for CONFIG_NET_POLL_CONTROLLER that I broke with
217cbfa856dc1cbc2890781626c4032d9e3ec59f ("mac8390: fix regression
caused during net_device_ops conversion").

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/mac8390.c b/drivers/net/mac8390.c
index f26667d..22e74a0 100644
--- a/drivers/net/mac8390.c
+++ b/drivers/net/mac8390.c
@@ -489,7 +489,7 @@ static const struct net_device_ops mac8390_netdev_ops = {
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_change_mtu		= eth_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_poll_controller	= ei_poll,
+	.ndo_poll_controller	= __ei_poll,
 #endif
 };
 
-- 
cgit v0.10.2


From ad3a37de81c45f6c20d410ece86004b98f7b6d84 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 29 May 2009 16:06:20 +1000
Subject: perf_counter: Don't swap contexts containing locked mutex

Peter Zijlstra pointed out that under some circumstances, we can take
the mutex in a context or a counter and then swap that context or
counter to another task, potentially leading to lock order inversions
or the mutexes not protecting what they are supposed to protect.

This fixes the problem by making sure that we never take a mutex in a
context or counter which could get swapped to another task.  Most of
the cases where we take a mutex is on a top-level counter or context,
i.e. a counter which has an fd associated with it or a context that
contains such a counter.  This adds WARN_ON_ONCE statements to verify
that.

The two cases where we need to take the mutex on a context that is a
clone of another are in perf_counter_exit_task and
perf_counter_init_task.  The perf_counter_exit_task case is solved by
uncloning the context before starting to remove the counters from it.
The perf_counter_init_task is a little trickier; we temporarily
disable context swapping for the parent (forking) task by setting its
ctx->parent_gen to the all-1s value after locking the context, if it
is a cloned context, and restore the ctx->parent_gen value at the end
if the context didn't get uncloned in the meantime.

This also moves the increment of the context generation count to be
within the same critical section, protected by the context mutex, that
adds the new counter to the context.  That way, taking the mutex is
sufficient to ensure that both the counter list and the generation
count are stable.

[ Impact: fix hangs, races with inherited and PID counters ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18975.31580.520676.619896@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52e5a15..db843f8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -919,7 +919,8 @@ static int context_equiv(struct perf_counter_context *ctx1,
 			 struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen;
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->parent_gen != ~0ull;
 }
 
 /*
@@ -1339,7 +1340,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			put_ctx(parent_ctx);
 			ctx->parent_ctx = NULL;		/* no longer a clone */
 		}
-		++ctx->generation;
 		/*
 		 * Get an extra reference before dropping the lock so that
 		 * this context won't get freed if the task exits.
@@ -1414,6 +1414,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
@@ -1445,6 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
@@ -1504,6 +1506,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
@@ -1524,6 +1527,7 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1536,6 +1540,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1741,6 +1746,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
@@ -1788,6 +1794,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
@@ -3349,8 +3356,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	counter->owner = current;
@@ -3436,6 +3445,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3485,6 +3495,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3527,12 +3538,17 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	child_ctx = child->perf_counter_ctxp;
-
-	if (likely(!child_ctx))
+	if (likely(!child->perf_counter_ctxp))
 		return;
 
 	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_counter_ctxp;
 	__perf_counter_task_sched_out(child_ctx);
 
 	/*
@@ -3542,6 +3558,15 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	if (child_ctx->parent_ctx) {
+		/*
+		 * This context is a clone; unclone it so it can't get
+		 * swapped to another process while we're removing all
+		 * the counters from it.
+		 */
+		put_ctx(child_ctx->parent_ctx);
+		child_ctx->parent_ctx = NULL;
+	}
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
@@ -3571,9 +3596,11 @@ again:
 int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter_context *cloned_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3581,8 +3608,7 @@ int perf_counter_init_task(struct task_struct *child)
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+	if (likely(!parent->perf_counter_ctxp))
 		return 0;
 
 	/*
@@ -3600,6 +3626,34 @@ int perf_counter_init_task(struct task_struct *child)
 	get_task_struct(child);
 
 	/*
+	 * If the parent's context is a clone, temporarily set its
+	 * parent_gen to an impossible value (all 1s) so it won't get
+	 * swapped under us.  The rcu_read_lock makes sure that
+	 * parent_ctx continues to exist even if it gets swapped to
+	 * another process and then freed while we are trying to get
+	 * its lock.
+	 */
+	rcu_read_lock();
+ retry:
+	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+	spin_lock_irq(&parent_ctx->lock);
+	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
+		spin_unlock_irq(&parent_ctx->lock);
+		goto retry;
+	}
+	cloned_gen = parent_ctx->parent_gen;
+	if (parent_ctx->parent_ctx)
+		parent_ctx->parent_gen = ~0ull;
+	spin_unlock_irq(&parent_ctx->lock);
+	rcu_read_unlock();
+
+	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
 	 */
@@ -3630,10 +3684,15 @@ int perf_counter_init_task(struct task_struct *child)
 		/*
 		 * Mark the child context as a clone of the parent
 		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of counters and the generation
+		 * count can't have changed since we took the mutex.
 		 */
-		if (parent_ctx->parent_ctx) {
-			child_ctx->parent_ctx = parent_ctx->parent_ctx;
-			child_ctx->parent_gen = parent_ctx->parent_gen;
+		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		if (cloned_ctx) {
+			child_ctx->parent_ctx = cloned_ctx;
+			child_ctx->parent_gen = cloned_gen;
 		} else {
 			child_ctx->parent_ctx = parent_ctx;
 			child_ctx->parent_gen = parent_ctx->generation;
@@ -3643,6 +3702,16 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
+	/*
+	 * Restore the clone status of the parent.
+	 */
+	if (parent_ctx->parent_ctx) {
+		spin_lock_irq(&parent_ctx->lock);
+		if (parent_ctx->parent_ctx)
+			parent_ctx->parent_gen = cloned_gen;
+		spin_unlock_irq(&parent_ctx->lock);
+	}
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From 6daad5c6c586bf07528ae5b39e801b204468f907 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Fri, 29 May 2009 10:15:08 +0100
Subject: [ARM] update mach-types

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types
index 945e0d2..fec6467 100644
--- a/arch/arm/tools/mach-types
+++ b/arch/arm/tools/mach-types
@@ -12,7 +12,7 @@
 #
 #   http://www.arm.linux.org.uk/developer/machines/?action=new
 #
-# Last update: Mon Mar 23 20:09:01 2009
+# Last update: Fri May 29 10:14:20 2009
 #
 # machine_is_xxx	CONFIG_xxxx		MACH_TYPE_xxx		number
 #
@@ -916,7 +916,7 @@ nxdb500			MACH_NXDB500		NXDB500			905
 apf9328			MACH_APF9328		APF9328			906
 omap_wipoq		MACH_OMAP_WIPOQ		OMAP_WIPOQ		907
 omap_twip		MACH_OMAP_TWIP		OMAP_TWIP		908
-palmt650		MACH_PALMT650		PALMT650		909
+treo650			MACH_TREO650		TREO650			909
 acumen			MACH_ACUMEN		ACUMEN			910
 xp100			MACH_XP100		XP100			911
 fs2410			MACH_FS2410		FS2410			912
@@ -1232,7 +1232,7 @@ ql202b			MACH_QL202B		QL202B			1226
 vpac270			MACH_VPAC270		VPAC270			1227
 rd129			MACH_RD129		RD129			1228
 htcwizard		MACH_HTCWIZARD		HTCWIZARD		1229
-xscale_treo680		MACH_XSCALE_TREO680	XSCALE_TREO680		1230
+treo680			MACH_TREO680		TREO680			1230
 tecon_tmezon		MACH_TECON_TMEZON	TECON_TMEZON		1231
 zylonite		MACH_ZYLONITE		ZYLONITE		1233
 gene1270		MACH_GENE1270		GENE1270		1234
@@ -1418,10 +1418,10 @@ looxc550		MACH_LOOXC550		LOOXC550		1417
 cnty_titan		MACH_CNTY_TITAN		CNTY_TITAN		1418
 app3xx			MACH_APP3XX		APP3XX			1419
 sideoatsgrama		MACH_SIDEOATSGRAMA	SIDEOATSGRAMA		1420
-palmtreo700p		MACH_PALMTREO700P	PALMTREO700P		1421
-palmtreo700w		MACH_PALMTREO700W	PALMTREO700W		1422
-palmtreo750		MACH_PALMTREO750	PALMTREO750		1423
-palmtreo755p		MACH_PALMTREO755P	PALMTREO755P		1424
+treo700p		MACH_TREO700P		TREO700P		1421
+treo700w		MACH_TREO700W		TREO700W		1422
+treo750			MACH_TREO750		TREO750			1423
+treo755p		MACH_TREO755P		TREO755P		1424
 ezreganut9200		MACH_EZREGANUT9200	EZREGANUT9200		1425
 sarge			MACH_SARGE		SARGE			1426
 a696			MACH_A696		A696			1427
@@ -1721,7 +1721,7 @@ sapphire		MACH_SAPPHIRE		SAPPHIRE		1729
 csb637xo		MACH_CSB637XO		CSB637XO		1730
 evisiong		MACH_EVISIONG		EVISIONG		1731
 stmp37xx		MACH_STMP37XX		STMP37XX		1732
-stmp378x		MACH_STMP38XX		STMP38XX		1733
+stmp378x		MACH_STMP378X		STMP378X		1733
 tnt			MACH_TNT		TNT			1734
 tbxt			MACH_TBXT		TBXT			1735
 playmate		MACH_PLAYMATE		PLAYMATE		1736
@@ -1817,7 +1817,7 @@ smdkc100		MACH_SMDKC100		SMDKC100		1826
 tavorevb		MACH_TAVOREVB		TAVOREVB		1827
 saar			MACH_SAAR		SAAR			1828
 deister_eyecam		MACH_DEISTER_EYECAM	DEISTER_EYECAM		1829
-at91sam9m10ek		MACH_AT91SAM9M10EK	AT91SAM9M10EK		1830
+at91sam9m10g45ek	MACH_AT91SAM9M10G45EK	AT91SAM9M10G45EK	1830
 linkstation_produo	MACH_LINKSTATION_PRODUO	LINKSTATION_PRODUO	1831
 hit_b0			MACH_HIT_B0		HIT_B0			1832
 adx_rmu			MACH_ADX_RMU		ADX_RMU			1833
@@ -2132,3 +2132,116 @@ apollo			MACH_APOLLO		APOLLO			2141
 at91cap9stk		MACH_AT91CAP9STK	AT91CAP9STK		2142
 spc300			MACH_SPC300		SPC300			2143
 eko			MACH_EKO		EKO			2144
+ccw9m2443		MACH_CCW9M2443		CCW9M2443		2145
+ccw9m2443js		MACH_CCW9M2443JS	CCW9M2443JS		2146
+m2m_router_device	MACH_M2M_ROUTER_DEVICE	M2M_ROUTER_DEVICE	2147
+str9104nas		MACH_STAR9104NAS	STAR9104NAS		2148
+pca100			MACH_PCA100		PCA100			2149
+z3_dm365_mod_01		MACH_Z3_DM365_MOD_01	Z3_DM365_MOD_01		2150
+hipox			MACH_HIPOX		HIPOX			2151
+omap3_piteds		MACH_OMAP3_PITEDS	OMAP3_PITEDS		2152
+bm150r			MACH_BM150R		BM150R			2153
+tbone			MACH_TBONE		TBONE			2154
+merlin			MACH_MERLIN		MERLIN			2155
+falcon			MACH_FALCON		FALCON			2156
+davinci_da850_evm	MACH_DAVINCI_DA850_EVM	DAVINCI_DA850_EVM	2157
+s5p6440			MACH_S5P6440		S5P6440			2158
+at91sam9g10ek		MACH_AT91SAM9G10EK	AT91SAM9G10EK		2159
+omap_4430sdp		MACH_OMAP_4430SDP	OMAP_4430SDP		2160
+lpc313x			MACH_LPC313X		LPC313X			2161
+magx_zn5		MACH_MAGX_ZN5		MAGX_ZN5		2162
+magx_em30		MACH_MAGX_EM30		MAGX_EM30		2163
+magx_ve66		MACH_MAGX_VE66		MAGX_VE66		2164
+meesc			MACH_MEESC		MEESC			2165
+otc570			MACH_OTC570		OTC570			2166
+bcu2412			MACH_BCU2412		BCU2412			2167
+beacon			MACH_BEACON		BEACON			2168
+actia_tgw		MACH_ACTIA_TGW		ACTIA_TGW		2169
+e4430			MACH_E4430		E4430			2170
+ql300			MACH_QL300		QL300			2171
+btmavb101		MACH_BTMAVB101		BTMAVB101		2172
+btmawb101		MACH_BTMAWB101		BTMAWB101		2173
+sq201			MACH_SQ201		SQ201			2174
+quatro45xx		MACH_QUATRO45XX		QUATRO45XX		2175
+openpad			MACH_OPENPAD		OPENPAD			2176
+tx25			MACH_TX25		TX25			2177
+omap3_torpedo		MACH_OMAP3_TORPEDO	OMAP3_TORPEDO		2178
+htcraphael_k		MACH_HTCRAPHAEL_K	HTCRAPHAEL_K		2179
+lal43			MACH_LAL43		LAL43			2181
+htcraphael_cdma500	MACH_HTCRAPHAEL_CDMA500	HTCRAPHAEL_CDMA500	2182
+anw6410			MACH_ANW6410		ANW6410			2183
+htcprophet		MACH_HTCPROPHET		HTCPROPHET		2185
+cfa_10022		MACH_CFA_10022		CFA_10022		2186
+imx27_visstrim_m10	MACH_IMX27_VISSTRIM_M10	IMX27_VISSTRIM_M10	2187
+px2imx27		MACH_PX2IMX27		PX2IMX27		2188
+stm3210e_eval		MACH_STM3210E_EVAL	STM3210E_EVAL		2189
+dvs10			MACH_DVS10		DVS10			2190
+portuxg20		MACH_PORTUXG20		PORTUXG20		2191
+arm_spv			MACH_ARM_SPV		ARM_SPV			2192
+smdkc110		MACH_SMDKC110		SMDKC110		2193
+cabespresso		MACH_CABESPRESSO	CABESPRESSO		2194
+hmc800			MACH_HMC800		HMC800			2195
+sholes			MACH_SHOLES		SHOLES			2196
+btmxc31			MACH_BTMXC31		BTMXC31			2197
+dt501			MACH_DT501		DT501			2198
+ktx			MACH_KTX		KTX			2199
+omap3517evm		MACH_OMAP3517EVM	OMAP3517EVM		2200
+netspace_v2		MACH_NETSPACE_V2	NETSPACE_V2		2201
+netspace_max_v2		MACH_NETSPACE_MAX_V2	NETSPACE_MAX_V2		2202
+d2net_v2		MACH_D2NET_V2		D2NET_V2		2203
+net2big_v2		MACH_NET2BIG_V2		NET2BIG_V2		2204
+net4big_v2		MACH_NET4BIG_V2		NET4BIG_V2		2205
+net5big_v2		MACH_NET5BIG_V2		NET5BIG_V2		2206
+endb2443		MACH_ENDB2443		ENDB2443		2207
+inetspace_v2		MACH_INETSPACE_V2	INETSPACE_V2		2208
+tros			MACH_TROS		TROS			2209
+pelco_homer		MACH_PELCO_HOMER	PELCO_HOMER		2210
+ofsp8			MACH_OFSP8		OFSP8			2211
+at91sam9g45ekes		MACH_AT91SAM9G45EKES	AT91SAM9G45EKES		2212
+guf_cupid		MACH_GUF_CUPID		GUF_CUPID		2213
+eab1r			MACH_EAB1R		EAB1R			2214
+desirec			MACH_DESIREC		DESIREC			2215
+cordoba			MACH_CORDOBA		CORDOBA			2216
+irvine			MACH_IRVINE		IRVINE			2217
+sff772			MACH_SFF772		SFF772			2218
+pelco_milano		MACH_PELCO_MILANO	PELCO_MILANO		2219
+pc7302			MACH_PC7302		PC7302			2220
+bip6000			MACH_BIP6000		BIP6000			2221
+silvermoon		MACH_SILVERMOON		SILVERMOON		2222
+vc0830			MACH_VC0830		VC0830			2223
+dt430			MACH_DT430		DT430			2224
+ji42pf			MACH_JI42PF		JI42PF			2225
+gnet_ksm		MACH_GNET_KSM		GNET_KSM		2226
+gnet_sgm		MACH_GNET_SGM		GNET_SGM		2227
+gnet_sgr		MACH_GNET_SGR		GNET_SGR		2228
+omap3_icetekevm		MACH_OMAP3_ICETEKEVM	OMAP3_ICETEKEVM		2229
+pnp			MACH_PNP		PNP			2230
+ctera_2bay_k		MACH_CTERA_2BAY_K	CTERA_2BAY_K		2231
+ctera_2bay_u		MACH_CTERA_2BAY_U	CTERA_2BAY_U		2232
+sas_c			MACH_SAS_C		SAS_C			2233
+vma2315			MACH_VMA2315		VMA2315			2234
+vcs			MACH_VCS		VCS			2235
+spear600		MACH_SPEAR600		SPEAR600		2236
+spear300		MACH_SPEAR300		SPEAR300		2237
+spear1300		MACH_SPEAR1300		SPEAR1300		2238
+lilly1131		MACH_LILLY1131		LILLY1131		2239
+arvoo_ax301		MACH_ARVOO_AX301	ARVOO_AX301		2240
+mapphone		MACH_MAPPHONE		MAPPHONE		2241
+legend			MACH_LEGEND		LEGEND			2242
+salsa			MACH_SALSA		SALSA			2243
+lounge			MACH_LOUNGE		LOUNGE			2244
+vision			MACH_VISION		VISION			2245
+vmb20			MACH_VMB20		VMB20			2246
+hy2410			MACH_HY2410		HY2410			2247
+hy9315			MACH_HY9315		HY9315			2248
+bullwinkle		MACH_BULLWINKLE		BULLWINKLE		2249
+arm_ultimator2		MACH_ARM_ULTIMATOR2	ARM_ULTIMATOR2		2250
+vs_v210			MACH_VS_V210		VS_V210			2252
+vs_v212			MACH_VS_V212		VS_V212			2253
+hmt			MACH_HMT		HMT			2254
+suen3			MACH_SUEN3		SUEN3			2255
+vesper			MACH_VESPER		VESPER			2256
+str9			MACH_STR9		STR9			2257
+omap3_wl_ff		MACH_OMAP3_WL_FF	OMAP3_WL_FF		2258
+simcom			MACH_SIMCOM		SIMCOM			2259
+mcwebio			MACH_MCWEBIO		MCWEBIO			2260
-- 
cgit v0.10.2


From 8541c1180a355c4da283fc6b03a92c0233823c1b Mon Sep 17 00:00:00 2001
From: Vladimir Barinov <vova.barinov@gmail.com>
Date: Thu, 23 Apr 2009 15:47:22 +0400
Subject: mtd: MXC NAND driver fixes (v5)

The following patch fixes:
 - re-initialization of host->col_addr which is used as byte index
   between the successive READID flash commands.
 - compile error when CONFIG_PM is enabled
 - pass on the error code from clk_get()
 - return -ENOMEM in case of failed ioremap()
 - pass on the return value of platform_driver_probe() directly
 - remove excessive printk
 - let command line partition table parsing with mxc_nand name.
   The cmd_line parsing is done via <mtd-id> name that differs
   from mxc_nand by default and looks like "NAND 256MiB 1,8V 8-bit"

Signed-off-by: Vladimir Barinov <vbarinov@embeddedalley.com>
Signed-off-by: Lothar Wassmann <LW@KARO-electronics.de>
Acked-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
index f3548d0..40c2608 100644
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -831,6 +831,7 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command,
 		break;
 
 	case NAND_CMD_READID:
+		host->col_addr = 0;
 		send_read_id(host);
 		break;
 
@@ -867,6 +868,7 @@ static int __init mxcnd_probe(struct platform_device *pdev)
 	mtd->priv = this;
 	mtd->owner = THIS_MODULE;
 	mtd->dev.parent = &pdev->dev;
+	mtd->name = "mxc_nand";
 
 	/* 50 us command delay time */
 	this->chip_delay = 5;
@@ -882,8 +884,10 @@ static int __init mxcnd_probe(struct platform_device *pdev)
 	this->verify_buf = mxc_nand_verify_buf;
 
 	host->clk = clk_get(&pdev->dev, "nfc");
-	if (IS_ERR(host->clk))
+	if (IS_ERR(host->clk)) {
+		err = PTR_ERR(host->clk);
 		goto eclk;
+	}
 
 	clk_enable(host->clk);
 	host->clk_act = 1;
@@ -896,7 +900,7 @@ static int __init mxcnd_probe(struct platform_device *pdev)
 
 	host->regs = ioremap(res->start, res->end - res->start + 1);
 	if (!host->regs) {
-		err = -EIO;
+		err = -ENOMEM;
 		goto eres;
 	}
 
@@ -1011,30 +1015,35 @@ static int __devexit mxcnd_remove(struct platform_device *pdev)
 #ifdef CONFIG_PM
 static int mxcnd_suspend(struct platform_device *pdev, pm_message_t state)
 {
-	struct mtd_info *info = platform_get_drvdata(pdev);
+	struct mtd_info *mtd = platform_get_drvdata(pdev);
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
 	int ret = 0;
 
 	DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND suspend\n");
-	if (info)
-		ret = info->suspend(info);
-
-	/* Disable the NFC clock */
-	clk_disable(nfc_clk);	/* FIXME */
+	if (mtd) {
+		ret = mtd->suspend(mtd);
+		/* Disable the NFC clock */
+		clk_disable(host->clk);
+	}
 
 	return ret;
 }
 
 static int mxcnd_resume(struct platform_device *pdev)
 {
-	struct mtd_info *info = platform_get_drvdata(pdev);
+	struct mtd_info *mtd = platform_get_drvdata(pdev);
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
 	int ret = 0;
 
 	DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND resume\n");
-	/* Enable the NFC clock */
-	clk_enable(nfc_clk);	/* FIXME */
 
-	if (info)
-		info->resume(info);
+	if (mtd) {
+		/* Enable the NFC clock */
+		clk_enable(host->clk);
+		mtd->resume(mtd);
+	}
 
 	return ret;
 }
@@ -1055,13 +1064,7 @@ static struct platform_driver mxcnd_driver = {
 
 static int __init mxc_nd_init(void)
 {
-	/* Register the device driver structure. */
-	pr_info("MXC MTD nand Driver\n");
-	if (platform_driver_probe(&mxcnd_driver, mxcnd_probe) != 0) {
-		printk(KERN_ERR "Driver register failed for mxcnd_driver\n");
-		return -ENODEV;
-	}
-	return 0;
+	return platform_driver_probe(&mxcnd_driver, mxcnd_probe);
 }
 
 static void __exit mxc_nd_cleanup(void)
-- 
cgit v0.10.2


From 81e2962801bbb4e740c501ca687d5cb857929c04 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Thu, 28 May 2009 17:43:59 +0200
Subject: jffs2: Fix corruption when flash erase/write failure

Erase errors such as:
"Newly-erased block contained word 0xa4ef223e at offset 0x0296a014"
and failure to write the clean marker,
moves the offending erase block to erasing list before calling
jffs2_erase_failed(). This is bad as jffs2_erase_failed() will
also move the block to the bad_list, but is now moving the
wrong block, causing FS corruption.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index c32b4a1..a024474 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -480,13 +480,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 	return;
 
 filebad:
-	mutex_lock(&c->erase_free_sem);
-	spin_lock(&c->erase_completion_lock);
-	/* Stick it on a list (any list) so erase_failed can take it
-	   right off again.  Silly, but shouldn't happen often. */
-	list_move(&jeb->list, &c->erasing_list);
-	spin_unlock(&c->erase_completion_lock);
-	mutex_unlock(&c->erase_free_sem);
 	jffs2_erase_failed(c, jeb, bad_offset);
 	return;
 
-- 
cgit v0.10.2


From 21a4cc00e8e67edcfc1bdb9af6d370ed1226eb86 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Thu, 28 May 2009 11:39:02 +0200
Subject: at76c50x-usb: avoid mutex deadlock in at76_dwork_hw_scan

http://bugzilla.kernel.org/show_bug.cgi?id=13312

at76_dwork_hw_scan holds a mutex while calling ieee80211_scan_completed,
which then calls at76_config which needs the same mutex.  This reworks
the ordering to not hold the lock while calling ieee80211_scan_completed.

Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/at76c50x-usb.c b/drivers/net/wireless/at76c50x-usb.c
index 744f4f4..8d93ca4 100644
--- a/drivers/net/wireless/at76c50x-usb.c
+++ b/drivers/net/wireless/at76c50x-usb.c
@@ -1873,18 +1873,18 @@ static void at76_dwork_hw_scan(struct work_struct *work)
 	if (ret != CMD_STATUS_COMPLETE) {
 		queue_delayed_work(priv->hw->workqueue, &priv->dwork_hw_scan,
 				   SCAN_POLL_INTERVAL);
-		goto exit;
+		mutex_unlock(&priv->mtx);
+		return;
 	}
 
-	ieee80211_scan_completed(priv->hw, false);
-
 	if (is_valid_ether_addr(priv->bssid))
 		at76_join(priv);
 
-	ieee80211_wake_queues(priv->hw);
-
-exit:
 	mutex_unlock(&priv->mtx);
+
+	ieee80211_scan_completed(priv->hw, false);
+
+	ieee80211_wake_queues(priv->hw);
 }
 
 static int at76_hw_scan(struct ieee80211_hw *hw,
-- 
cgit v0.10.2


From aeeab4ff06b8e29cfe2fe730ba626f7e2487ba03 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Wed, 27 May 2009 09:21:57 +0200
Subject: rtl8187: add USB ID for Linksys WUSB54GC-EU v2 USB wifi dongle

http://bugzilla.kernel.org/show_bug.cgi?id=13383

Reported-by: Przemyslaw Kulczycki <azrael@autocom.pl>
Signed-off-by: John W. Linville <linville@tuxdriver.com>

diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig
index 8a08235..3d94e7d 100644
--- a/drivers/net/wireless/Kconfig
+++ b/drivers/net/wireless/Kconfig
@@ -430,6 +430,7 @@ config RTL8187
 	  ASUS P5B Deluxe
 	  Toshiba Satellite Pro series of laptops
 	  Asus Wireless Link
+	  Linksys WUSB54GC-EU
 
 	  Thanks to Realtek for their support!
 
diff --git a/drivers/net/wireless/rtl818x/rtl8187_dev.c b/drivers/net/wireless/rtl818x/rtl8187_dev.c
index bac6cfb..d51ba0a 100644
--- a/drivers/net/wireless/rtl818x/rtl8187_dev.c
+++ b/drivers/net/wireless/rtl818x/rtl8187_dev.c
@@ -71,6 +71,8 @@ static struct usb_device_id rtl8187_table[] __devinitdata = {
 	{USB_DEVICE(0x18E8, 0x6232), .driver_info = DEVICE_RTL8187},
 	/* AirLive */
 	{USB_DEVICE(0x1b75, 0x8187), .driver_info = DEVICE_RTL8187},
+	/* Linksys */
+	{USB_DEVICE(0x1737, 0x0073), .driver_info = DEVICE_RTL8187B},
 	{}
 };
 
-- 
cgit v0.10.2


From 15aedea439c4d7dbec17c99b5e1594c01b979833 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:01 +0900
Subject: dma-debug: use sg_dma_address accessor instead of using dma_address
 directly

Architectures might not have dma_address in struct scatterlist (PARISC
doesn't). Directly accessing to dma_address in struct scatterlist is
wrong; we need to use sg_dma_address() accesssor instead.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index e47e1a0..1b5bb82 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -840,7 +840,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		entry->dev            = dev;
 		entry->paddr          = sg_phys(s);
 		entry->size           = s->length;
-		entry->dev_addr       = s->dma_address;
+		entry->dev_addr       = sg_dma_address(s);
 		entry->direction      = direction;
 		entry->sg_call_ents   = nents;
 		entry->sg_mapped_ents = mapped_ents;
@@ -872,7 +872,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			.type           = dma_debug_sg,
 			.dev            = dev,
 			.paddr          = sg_phys(s),
-			.dev_addr       = s->dma_address,
+			.dev_addr       = sg_dma_address(s),
 			.size           = s->length,
 			.direction      = dir,
 			.sg_call_ents   = 0,
@@ -996,8 +996,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, s->dma_address, s->dma_length, 0,
-				direction, true);
+		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+			   direction, true);
 	}
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
@@ -1012,8 +1012,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, s->dma_address, s->dma_length, 0,
-				direction, false);
+		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+			   direction, false);
 	}
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
-- 
cgit v0.10.2


From 884d05970bfbc3db368f23460dc4ce63257f240d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:02 +0900
Subject: dma-debug: use sg_dma_len accessor

debug_dma_map_sg() and debug_dma_unmap_sg() use length in struct
scatterlist while debug_dma_sync_sg_for_cpu() and
debug_dma_sync_sg_for_device() use dma_length. This causes bugs
warnings on some IOMMU implementations since these values are not
same; the length doesn't represent the dma length.

We always need to use sg_dma_len() accessor to get the dma length of a
scatterlist entry.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 1b5bb82..51f95e5 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -839,7 +839,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		entry->type           = dma_debug_sg;
 		entry->dev            = dev;
 		entry->paddr          = sg_phys(s);
-		entry->size           = s->length;
+		entry->size           = sg_dma_len(s);
 		entry->dev_addr       = sg_dma_address(s);
 		entry->direction      = direction;
 		entry->sg_call_ents   = nents;
@@ -847,7 +847,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 
 		if (!PageHighMem(sg_page(s))) {
 			check_for_stack(dev, sg_virt(s));
-			check_for_illegal_area(dev, sg_virt(s), s->length);
+			check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
 		}
 
 		add_dma_entry(entry);
@@ -873,7 +873,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			.dev            = dev,
 			.paddr          = sg_phys(s),
 			.dev_addr       = sg_dma_address(s),
-			.size           = s->length,
+			.size           = sg_dma_len(s),
 			.direction      = dir,
 			.sg_call_ents   = 0,
 		};
@@ -996,7 +996,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, true);
 	}
 }
@@ -1012,7 +1012,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, false);
 	}
 }
-- 
cgit v0.10.2


From 88f3907f6f447899544beadf491dccb32015dacb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:03 +0900
Subject: dma-debug: fix debug_dma_sync_sg_for_cpu and
 debug_dma_sync_sg_for_device

DMA-mapping.txt says that debug_dma_sync_sg family must be called with
the _same_ one you passed into the dma_map_sg call, it should _NOT_ be
the 'count' value _returned_ from the dma_map_sg call.

debug_dma_sync_sg_for_cpu and debug_dma_sync_sg_for_device can't
handle this properly; they need to use the sg_mapped_ents in struct
dma_debug_entry as debug_dma_unmap_sg() does.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 51f95e5..1abed17 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -855,13 +855,32 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(debug_dma_map_sg);
 
+static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
+{
+	struct dma_debug_entry *entry;
+	struct hash_bucket *bucket;
+	unsigned long flags;
+	int mapped_ents = 0;
+	struct dma_debug_entry ref;
+
+	ref.dev = dev;
+	ref.dev_addr = sg_dma_address(s);
+	ref.size = sg_dma_len(s),
+
+	bucket = get_hash_bucket(&ref, &flags);
+	entry = hash_bucket_find(bucket, &ref);
+	if (entry)
+		mapped_ents = entry->sg_mapped_ents;
+	put_hash_bucket(bucket, &flags);
+
+	return mapped_ents;
+}
+
 void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			int nelems, int dir)
 {
-	struct dma_debug_entry *entry;
 	struct scatterlist *s;
 	int mapped_ents = 0, i;
-	unsigned long flags;
 
 	if (unlikely(global_disable))
 		return;
@@ -881,14 +900,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		if (mapped_ents && i >= mapped_ents)
 			break;
 
-		if (mapped_ents == 0) {
-			struct hash_bucket *bucket;
+		if (!i) {
 			ref.sg_call_ents = nelems;
-			bucket = get_hash_bucket(&ref, &flags);
-			entry = hash_bucket_find(bucket, &ref);
-			if (entry)
-				mapped_ents = entry->sg_mapped_ents;
-			put_hash_bucket(bucket, &flags);
+			mapped_ents = get_nr_mapped_entries(dev, s);
 		}
 
 		check_unmap(&ref);
@@ -990,12 +1004,18 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 			       int nelems, int direction)
 {
 	struct scatterlist *s;
-	int i;
+	int mapped_ents = 0, i;
 
 	if (unlikely(global_disable))
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
+		if (!i)
+			mapped_ents = get_nr_mapped_entries(dev, s);
+
+		if (i >= mapped_ents)
+			break;
+
 		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, true);
 	}
@@ -1006,12 +1026,18 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 				  int nelems, int direction)
 {
 	struct scatterlist *s;
-	int i;
+	int mapped_ents = 0, i;
 
 	if (unlikely(global_disable))
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
+		if (!i)
+			mapped_ents = get_nr_mapped_entries(dev, s);
+
+		if (i >= mapped_ents)
+			break;
+
 		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, false);
 	}
-- 
cgit v0.10.2


From 3f4dee227348daac32f36daad9a91059efd0723e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: perf_counter: Fix cpuctx->task_ctx races

Peter noticed that we are sometimes reading cpuctx->task_ctx with
interrupts enabled.

Noticed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index db843f8..eb34604 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -234,15 +234,18 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		local_irq_restore(flags);
 		return;
+	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	/*
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level.
@@ -382,14 +385,17 @@ static void __perf_counter_disable(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		local_irq_restore(flags);
 		return;
+	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 
 	/*
 	 * If the counter is on, turn it off.
@@ -615,6 +621,7 @@ static void __perf_install_in_context(void *info)
 	unsigned long flags;
 	int err;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
@@ -623,12 +630,14 @@ static void __perf_install_in_context(void *info)
 	 * on this cpu because it had no counters.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
+		if (cpuctx->task_ctx || ctx->task != current) {
+			local_irq_restore(flags);
 			return;
+		}
 		cpuctx->task_ctx = ctx;
 	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
@@ -745,17 +754,20 @@ static void __perf_counter_enable(void *info)
 	unsigned long flags;
 	int err;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
+		if (cpuctx->task_ctx || ctx->task != current) {
+			local_irq_restore(flags);
 			return;
+		}
 		cpuctx->task_ctx = ctx;
 	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-- 
cgit v0.10.2


From 012b84dae17126d8b5d159173091eb3db5a2bc43 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:08:41 +0200
Subject: perf_counter: Robustify counter-free logic

This fixes a nasty crash and highlights a bug that we were
freeing failed-fork() counters incorrectly.

(the fix for that will come separately)

[ Impact: fix crashes/lockups with inherited counters ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index eb34604..616c524 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1004,6 +1004,10 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 
 	if (!cpuctx->task_ctx)
 		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
 	__perf_counter_sched_out(ctx, cpuctx);
 	cpuctx->task_ctx = NULL;
 }
-- 
cgit v0.10.2


From 294ae4011530d008c59c4fb9847738e39228821e Mon Sep 17 00:00:00 2001
From: GeunSik Lim <leemgs1@gmail.com>
Date: Thu, 28 May 2009 10:36:11 +0900
Subject: ftrace: fix typo about map of kernel priority in ftrace.txt file.

Fix typo about chart to map the kernel priority to user land priorities.

   * About sched_setscheduler(2)
      Processes scheduled under SCHED_FIFO or SCHED_RR
      can have a (user-space) static priority in the range 1 to 99.
      (reference: http://www.kernel.org/doc/man-pages/online/pages/
                  man2/sched_setscheduler.2.html)

   * From: Steven Rostedt
      0 to 98 - maps to RT tasks 99 to 1 (SCHED_RR or SCHED_FIFO)

      99 - maps to internal kernel threads that want to be lower than RT tasks
      but higher than SCHED_OTHER tasks. Although I'm not sure if any
      kernel thread actually uses this. I'm not even sure how this can be
      set, because the internal sched_setscheduler function does not allow
      for it.

      100 to 139 - maps nice levels -20 to 19. These are not set via
      sched_setscheduler, but are set via the nice system call.

      140 - reserved for idle tasks.

Signed-off-by: GeunSik Lim <geunsik.lim@samsung.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e6..e362f50 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
 values starting at 100 (nice -20). Below is a quick chart to map
 the kernel priority to user land priorities.
 
-  Kernel priority: 0 to 99    ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140        ==> idle task priority
+   Kernel Space                     User Space
+ ===============================================================
+   0(high) to  98(low)     user RT priority 99(high) to 1(low)
+                           with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                       sched_priority is not used in scheduling
+                           decisions(it must be specified as 0)
+ ---------------------------------------------------------------
+ 100(high) to 139(low)     user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+ 140                       idle task priority
+ ---------------------------------------------------------------
 
 The task states are:
 
-- 
cgit v0.10.2


From f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 Mon Sep 17 00:00:00 2001
From: GeunSik Lim <leemgs1@gmail.com>
Date: Thu, 28 May 2009 10:36:14 +0900
Subject: sched: fix typo in sched-rt-group.txt file

Fix typo about static priority's range.

    Kernel Space                     User Space
    ===============================================================
      0(high) to  98(low)     user RT priority 99(high) to 1(low)
                              with SCHED_RR or SCHED_FIFO
    ---------------------------------------------------------------
     99                       sched_priority is not used in scheduling
                              decisions(it must be specified as 0)
    ---------------------------------------------------------------
    100(high) to 139(low)     user nice -20(high) to 19(low)
    ---------------------------------------------------------------
    140                       idle task priority
    ---------------------------------------------------------------
    * ref) http://www.kernel.org/doc/man-pages/online/pages/man2/sched_setscheduler.2.html

Signed-off-by: GeunSik Lim <geunsik.lim@samsung.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index eb74b01..1df7f9c 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -187,7 +187,7 @@ get their allocated time.
 
 Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
 the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
 do deadline inheritance (since priority is inversely proportional to the
 deadline delta (deadline - now).
 
-- 
cgit v0.10.2


From efb3d17240d80e27508d238809168120fe4b93a4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:25:58 +0200
Subject: perf_counter: Fix COMM and MMAP events for cpu wide counters

Commit a63eaf34ae6 ("perf_counter: Dynamically allocate tasks'
perf_counter_context struct") broke COMM and MMAP notification for
cpu wide counters by dropping out early if there was no task context,
thereby also not iterating the cpu context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 616c524..58d6d19 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2443,9 +2443,9 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+	if (cpuctx->task_ctx)
+		perf_counter_comm_ctx(cpuctx->task_ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
-
-	perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event);
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2454,8 +2454,6 @@ void perf_counter_comm(struct task_struct *task)
 
 	if (!atomic_read(&nr_comm_tracking))
 		return;
-	if (!current->perf_counter_ctxp)
-		return;
 
 	comm_event = (struct perf_comm_event){
 		.task	= task,
@@ -2570,10 +2568,10 @@ got_name:
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+	if (cpuctx->task_ctx)
+		perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event);
-
 	kfree(buf);
 }
 
@@ -2584,8 +2582,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 
 	if (!atomic_read(&nr_mmap_tracking))
 		return;
-	if (!current->perf_counter_ctxp)
-		return;
 
 	mmap_event = (struct perf_mmap_event){
 		.file   = file,
-- 
cgit v0.10.2


From 665c2142a94202881a3c11cbaee6506cb10ada2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:51:57 +0200
Subject: perf_counter: Clean up task_ctx vs interrupts

Remove the local_irq_save() etc.. in routines that are smp function
calls, or have IRQs disabled by other means.

Then change the COMM, MMAP, and swcounter context iteration to
current->perf_counter_ctxp and RCU, since it really doesn't matter
which context they iterate, they're all folded.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 58d6d19..0c000d3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -232,18 +232,14 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		local_irq_restore(flags);
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
-	}
 
 	spin_lock(&ctx->lock);
 	/*
@@ -267,7 +263,7 @@ static void __perf_counter_remove_from_context(void *info)
 	}
 
 	perf_enable();
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 
@@ -383,17 +379,13 @@ static void __perf_counter_disable(void *info)
 	struct perf_counter *counter = info;
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		local_irq_restore(flags);
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
-	}
 
 	spin_lock(&ctx->lock);
 
@@ -411,7 +403,7 @@ static void __perf_counter_disable(void *info)
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -618,10 +610,8 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
-	unsigned long flags;
 	int err;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
@@ -630,10 +620,8 @@ static void __perf_install_in_context(void *info)
 	 * on this cpu because it had no counters.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current) {
-			local_irq_restore(flags);
+		if (cpuctx->task_ctx || ctx->task != current)
 			return;
-		}
 		cpuctx->task_ctx = ctx;
 	}
 
@@ -687,7 +675,7 @@ static void __perf_install_in_context(void *info)
  unlock:
 	perf_enable();
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -751,19 +739,15 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
-	unsigned long flags;
 	int err;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current) {
-			local_irq_restore(flags);
+		if (cpuctx->task_ctx || ctx->task != current)
 			return;
-		}
 		cpuctx->task_ctx = ctx;
 	}
 
@@ -811,7 +795,7 @@ static void __perf_counter_enable(void *info)
 	}
 
  unlock:
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -981,6 +965,10 @@ void perf_counter_task_sched_out(struct task_struct *task,
 		spin_lock(&ctx->lock);
 		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt to rcu_dereference() of perf_counter_ctxp
+			 */
 			task->perf_counter_ctxp = next_ctx;
 			next->perf_counter_ctxp = ctx;
 			ctx->task = next;
@@ -998,6 +986,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	}
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -1012,6 +1003,9 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 	cpuctx->task_ctx = NULL;
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
@@ -2431,6 +2425,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
 static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 {
 	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
 	unsigned int size;
 	char *comm = comm_event->task->comm;
 
@@ -2443,9 +2438,17 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
-	if (cpuctx->task_ctx)
-		perf_counter_comm_ctx(cpuctx->task_ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_comm_ctx(ctx, comm_event);
+	rcu_read_unlock();
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2536,6 +2539,7 @@ static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 {
 	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
 	struct file *file = mmap_event->file;
 	unsigned int size;
 	char tmp[16];
@@ -2568,10 +2572,18 @@ got_name:
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
-	if (cpuctx->task_ctx)
-		perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_mmap_ctx(ctx, mmap_event);
+	rcu_read_unlock();
+
 	kfree(buf);
 }
 
@@ -2882,6 +2894,7 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
+	struct perf_counter_context *ctx;
 
 	if (*recursion)
 		goto out;
@@ -2891,10 +2904,15 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
 				 nr, nmi, regs, addr);
-	if (cpuctx->task_ctx) {
-		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
-					 nr, nmi, regs, addr);
-	}
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+	rcu_read_unlock();
 
 	barrier();
 	(*recursion)--;
-- 
cgit v0.10.2


From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:25:58 +0200
Subject: perf_counter: Ammend cleanup in fork() fail

When fork() fails we cannot use perf_counter_exit_task() since that
assumes to operate on current. Write a new helper that cleans up
unused/clean contexts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 717bf3b..519a41b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -579,6 +579,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task,
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
+extern void perf_counter_free_task(struct task_struct *task);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void __perf_disable(void);
@@ -644,6 +645,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
+static inline void perf_counter_free_task(struct task_struct *task)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_disable(void)					{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index c07c333..23bf757 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1298,7 +1298,7 @@ bad_fork_cleanup_semundo:
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
-	perf_counter_exit_task(p);
+	perf_counter_free_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0c000d3..79c3f26 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3538,8 +3538,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 }
 
 static void
-__perf_counter_exit_task(struct task_struct *child,
-			 struct perf_counter *child_counter,
+__perf_counter_exit_task(struct perf_counter *child_counter,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
@@ -3605,7 +3604,7 @@ void perf_counter_exit_task(struct task_struct *child)
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
-		__perf_counter_exit_task(child, child_counter, child_ctx);
+		__perf_counter_exit_task(child_counter, child_ctx);
 
 	/*
 	 * If the last counter was a group counter, it will have appended all
@@ -3621,6 +3620,44 @@ again:
 }
 
 /*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter *counter, *tmp;
+
+	if (!ctx)
+		return;
+
+	mutex_lock(&ctx->mutex);
+again:
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+		struct perf_counter *parent = counter->parent;
+
+		if (WARN_ON_ONCE(!parent))
+			continue;
+
+		mutex_lock(&parent->child_mutex);
+		list_del_init(&counter->child_list);
+		mutex_unlock(&parent->child_mutex);
+
+		fput(parent->filp);
+
+		list_del_counter(counter, ctx);
+		free_counter(counter);
+	}
+
+	if (!list_empty(&ctx->counter_list))
+		goto again;
+
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+}
+
+/*
  * Initialize the perf_counter context in task_struct
  */
 int perf_counter_init_task(struct task_struct *child)
-- 
cgit v0.10.2


From 681a1b4032d72f4ad6d4beed751bc65574572746 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 28 May 2009 14:34:18 -0700
Subject: MAINTAINERS: pair EDAC-E752X P: and M: entries

Entries should be P: name then M: email address.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 5ee166e..ea4eced 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1971,8 +1971,8 @@ F:	include/linux/edac.h
 
 EDAC-E752X
 P:	Mark Gross
-P:	Doug Thompson
 M:	mark.gross@intel.com
+P:	Doug Thompson
 M:	dougthompson@xmission.com
 L:	bluesmoke-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	bluesmoke.sourceforge.net
-- 
cgit v0.10.2


From 6d2661ede5f20f968422e790af3334908c3bc857 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 28 May 2009 14:34:19 -0700
Subject: oom: fix possible oom_dump_tasks NULL pointer

When /proc/sys/vm/oom_dump_tasks is enabled, it is possible to get a NULL
pointer for tasks that have detached mm's since task_lock() is not held
during the tasklist scan.  Add the task_lock().

Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 92bcf1d..a7b2460 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -284,22 +284,28 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
 	       "name\n");
 	do_each_thread(g, p) {
-		/*
-		 * total_vm and rss sizes do not exist for tasks with a
-		 * detached mm so there's no need to report them.
-		 */
-		if (!p->mm)
-			continue;
+		struct mm_struct *mm;
+
 		if (mem && !task_in_mem_cgroup(p, mem))
 			continue;
 		if (!thread_group_leader(p))
 			continue;
 
 		task_lock(p);
+		mm = p->mm;
+		if (!mm) {
+			/*
+			 * total_vm and rss sizes do not exist for tasks with no
+			 * mm so there's no need to report them; they can't be
+			 * oom killed anyway.
+			 */
+			task_unlock(p);
+			continue;
+		}
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
-		       p->pid, __task_cred(p)->uid, p->tgid,
-		       p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p),
-		       p->oomkilladj, p->comm);
+		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
+		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
 }
-- 
cgit v0.10.2


From b2e1feaf0af6b8a826b86748a19ddc2013ab7dbd Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 28 May 2009 14:34:20 -0700
Subject: cred: #include init.h in cred.h

linux/cred.h can't be included as first header (alphabetical order)
because it uses __init which is enough to break compilation on some archs.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 3282ee4..4fa9996 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -13,6 +13,7 @@
 #define _LINUX_CRED_H
 
 #include <linux/capability.h>
+#include <linux/init.h>
 #include <linux/key.h>
 #include <asm/atomic.h>
 
-- 
cgit v0.10.2


From bd6daba909d8484bd2ccf6017db4028d7a420927 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 28 May 2009 14:34:21 -0700
Subject: procfs: make errno values consistent when open pident vs exit(2) race
 occurs

proc_pident_instantiate() has following call flow.

proc_pident_lookup()
  proc_pident_instantiate()
    proc_pid_make_inode()

And, proc_pident_lookup() has following error handling.

	const struct pid_entry *p, *last;
	error = ERR_PTR(-ENOENT);
	if (!task)
		goto out_no_task;

Then, proc_pident_instantiate should return ENOENT too when racing against
exit(2) occur.

EINAL has two bad reason.
  - it implies caller is wrong. bad the race isn't caller's mistake.
  - man 2 open don't explain EINVAL. user often don't handle it.

Note: Other proc_pid_make_inode() caller already use ENOENT properly.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fb45615..3326bbf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1956,7 +1956,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error = ERR_PTR(-ENOENT);
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
-- 
cgit v0.10.2


From e767e0561d7fd2333df1921f1ab4176211f9036b Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Date: Thu, 28 May 2009 14:34:28 -0700
Subject: memcg: fix deadlock between lock_page_cgroup and mapping tree_lock

mapping->tree_lock can be acquired from interrupt context.  Then,
following dead lock can occur.

Assume "A" as a page.

 CPU0:
       lock_page_cgroup(A)
		interrupted
			-> take mapping->tree_lock.
 CPU1:
       take mapping->tree_lock
		-> lock_page_cgroup(A)

This patch tries to fix above deadlock by moving memcg's hook to out of
mapping->tree_lock.  charge/uncharge of pagecache/swapcache is protected
by page lock, not tree_lock.

After this patch, lock_page_cgroup() is not called under mapping->tree_lock.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62d8143..d476aad 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -437,6 +437,11 @@ static inline int mem_cgroup_cache_charge_swapin(struct page *page,
 	return 0;
 }
 
+static inline void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
+{
+}
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0b..1b60f30 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page)
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
-	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page)
 	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
+	mem_cgroup_uncharge_cache_page(page);
 }
 
 static int sync_page(void *word)
@@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
+			spin_unlock_irq(&mapping->tree_lock);
 		} else {
 			page->mapping = NULL;
+			spin_unlock_irq(&mapping->tree_lock);
 			mem_cgroup_uncharge_cache_page(page);
 			page_cache_release(page);
 		}
-
-		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
 		mem_cgroup_uncharge_cache_page(page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 01c2d8f1..4a747a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1488,8 +1488,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+#ifdef CONFIG_SWAP
 /*
- * called from __delete_from_swap_cache() and drop "page" account.
+ * called after __delete_from_swap_cache() and drop "page" account.
  * memcg information is recorded to swap_cgroup of "ent"
  */
 void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
@@ -1506,6 +1507,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 	if (memcg)
 		css_put(&memcg->css);
 }
+#endif
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3ecea98..1416e7e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
-	swp_entry_t ent = {.val = page_private(page)};
-
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON(PageWriteback(page));
@@ -121,7 +119,6 @@ void __delete_from_swap_cache(struct page *page)
 	total_swapcache_pages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
-	mem_cgroup_uncharge_swapcache(page, ent);
 }
 
 /**
@@ -191,6 +188,7 @@ void delete_from_swap_cache(struct page *page)
 	__delete_from_swap_cache(page);
 	spin_unlock_irq(&swapper_space.tree_lock);
 
+	mem_cgroup_uncharge_swapcache(page, entry);
 	swap_free(entry);
 	page_cache_release(page);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 55206fa..12e1579 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -359,6 +359,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	BUG_ON(page_has_private(page));
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
+	mem_cgroup_uncharge_cache_page(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5fa3eda..d254306 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,10 +470,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
+		mem_cgroup_uncharge_swapcache(page, swap);
 		swap_free(swap);
 	} else {
 		__remove_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
+		mem_cgroup_uncharge_cache_page(page);
 	}
 
 	return 1;
-- 
cgit v0.10.2


From b898f4f869da5b9d41f297fff87aca4cd42d80b3 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 28 May 2009 14:34:29 -0700
Subject: drivers/serial/mpc52xx_uart.c: fix array overindexing check

The check for an overindexing of mpc52xx_uart_{ports,nodes} has an
off-by-one.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index 7f72f8c..b3feb61 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -988,7 +988,7 @@ mpc52xx_console_setup(struct console *co, char *options)
 	pr_debug("mpc52xx_console_setup co=%p, co->index=%i, options=%s\n",
 		 co, co->index, options);
 
-	if ((co->index < 0) || (co->index > MPC52xx_PSC_MAXNUM)) {
+	if ((co->index < 0) || (co->index >= MPC52xx_PSC_MAXNUM)) {
 		pr_debug("PSC%x out of range\n", co->index);
 		return -EINVAL;
 	}
-- 
cgit v0.10.2


From ba9447198bdd945666a9bac5e556632a7acb235d Mon Sep 17 00:00:00 2001
From: Thomas Dahlmann <dahlmann.thomas@arcor.de>
Date: Thu, 28 May 2009 14:34:30 -0700
Subject: MAINTAINERS: change email address for Thomas Dahlmann

Signed-off-by: Thomas <dahlmann.thomas@arcor.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index ea4eced..41c6605 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -434,7 +434,7 @@ F:	arch/alpha/
 
 AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER
 P:	Thomas Dahlmann
-M:	thomas.dahlmann@amd.com
+M:	dahlmann.thomas@arcor.de
 L:	linux-geode@lists.infradead.org (moderated for non-subscribers)
 S:	Supported
 F:	drivers/usb/gadget/amd5536udc.*
-- 
cgit v0.10.2


From c3dc5bec05a2ae03a72ef82e321d77fb549d951c Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 28 May 2009 14:34:31 -0700
Subject: flat: fix data sections alignment

The flat loader uses an architecture's flat_stack_align() to align the
stack but assumes word-alignment is enough for the data sections.

However, on the Xtensa S6000 we have registers up to 128bit width
which can be used from userspace and therefor need userspace stack and
data-section alignment of at least this size.

This patch drops flat_stack_align() and uses the same alignment that
is required for slab caches, ARCH_SLAB_MINALIGN, or wordsize if it's
not defined by the architecture.

It also fixes m32r which was obviously kaput, aligning an
uninitialized stack entry instead of the stack pointer.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Oskar Schirmer <os@emlix.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Bryan Wu <cooloney@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Johannes Weiner <jw@emlix.com>
Acked-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/include/asm/flat.h b/arch/arm/include/asm/flat.h
index 1d77e51..59426a4 100644
--- a/arch/arm/include/asm/flat.h
+++ b/arch/arm/include/asm/flat.h
@@ -5,9 +5,6 @@
 #ifndef __ARM_FLAT_H__
 #define __ARM_FLAT_H__
 
-/* An odd number of words will be pushed after this alignment, so
-   deliberately misalign the value.  */
-#define	flat_stack_align(sp)	sp = (void *)(((unsigned long)(sp) - 4) | 4)
 #define	flat_argvp_envp_on_stack()		1
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
diff --git a/arch/blackfin/include/asm/flat.h b/arch/blackfin/include/asm/flat.h
index e70074e..733a178 100644
--- a/arch/blackfin/include/asm/flat.h
+++ b/arch/blackfin/include/asm/flat.h
@@ -10,7 +10,6 @@
 
 #include <asm/unaligned.h>
 
-#define	flat_stack_align(sp)	/* nothing needed */
 #define	flat_argvp_envp_on_stack()		0
 #define	flat_old_ram_flag(flags)		(flags)
 
diff --git a/arch/h8300/include/asm/flat.h b/arch/h8300/include/asm/flat.h
index 2a87350..bd12b31 100644
--- a/arch/h8300/include/asm/flat.h
+++ b/arch/h8300/include/asm/flat.h
@@ -5,7 +5,6 @@
 #ifndef __H8300_FLAT_H__
 #define __H8300_FLAT_H__
 
-#define	flat_stack_align(sp)			/* nothing needed */
 #define	flat_argvp_envp_on_stack()		1
 #define	flat_old_ram_flag(flags)		1
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h
index d851cf0..5d711c4 100644
--- a/arch/m32r/include/asm/flat.h
+++ b/arch/m32r/include/asm/flat.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_M32R_FLAT_H
 #define __ASM_M32R_FLAT_H
 
-#define	flat_stack_align(sp)		(*sp += (*sp & 3 ? (4 - (*sp & 3)): 0))
 #define	flat_argvp_envp_on_stack()		0
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_set_persistent(relval, p)		0
diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h
index 814b517..a0e2907 100644
--- a/arch/m68k/include/asm/flat.h
+++ b/arch/m68k/include/asm/flat.h
@@ -5,7 +5,6 @@
 #ifndef __M68KNOMMU_FLAT_H__
 #define __M68KNOMMU_FLAT_H__
 
-#define	flat_stack_align(sp)			/* nothing needed */
 #define	flat_argvp_envp_on_stack()		1
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
diff --git a/arch/sh/include/asm/flat.h b/arch/sh/include/asm/flat.h
index d3b2b4f..5d84df5 100644
--- a/arch/sh/include/asm/flat.h
+++ b/arch/sh/include/asm/flat.h
@@ -12,7 +12,6 @@
 #ifndef __ASM_SH_FLAT_H
 #define __ASM_SH_FLAT_H
 
-#define	flat_stack_align(sp)			/* nothing needed */
 #define	flat_argvp_envp_on_stack()		0
 #define	flat_old_ram_flag(flags)		(flags)
 #define	flat_reloc_valid(reloc, size)		((reloc) <= (size))
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5cebf0b..697f6b5 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -41,6 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <asm/cacheflush.h>
+#include <asm/page.h>
 
 /****************************************************************************/
 
@@ -54,6 +55,18 @@
 #define	DBG_FLT(a...)
 #endif
 
+/*
+ * User data (stack, data section and bss) needs to be aligned
+ * for the same reasons as SLAB memory is, and to the same amount.
+ * Avoid duplicating architecture specific code by using the same
+ * macro as with SLAB allocation:
+ */
+#ifdef ARCH_SLAB_MINALIGN
+#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#else
+#define FLAT_DATA_ALIGN	(sizeof(void *))
+#endif
+
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
 
@@ -114,20 +127,18 @@ static unsigned long create_flat_tables(
 	int envc = bprm->envc;
 	char uninitialized_var(dummy);
 
-	sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p);
+	sp = (unsigned long *)p;
+	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	envp = argv + (argc + 1);
 
-	sp -= envc+1;
-	envp = sp;
-	sp -= argc+1;
-	argv = sp;
-
-	flat_stack_align(sp);
 	if (flat_argvp_envp_on_stack()) {
-		--sp; put_user((unsigned long) envp, sp);
-		--sp; put_user((unsigned long) argv, sp);
+		put_user((unsigned long) envp, sp + 2);
+		put_user((unsigned long) argv, sp + 1);
 	}
 
-	put_user(argc,--sp);
+	put_user(argc, sp);
 	current->mm->arg_start = (unsigned long) p;
 	while (argc-->0) {
 		put_user((unsigned long) p, argv++);
@@ -558,7 +569,9 @@ static int load_flat_file(struct linux_binprm * bprm,
 			ret = realdatastart;
 			goto err;
 		}
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
 
 		DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
 				(int)(data_len + bss_len + stack_len), (int)datapos);
@@ -604,9 +617,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		realdatastart = textpos + ntohl(hdr->data_start);
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
-		reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
-				MAX_SHARED_LIBS * sizeof(unsigned long));
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
+
+		reloc = (unsigned long *)
+			(datapos + (ntohl(hdr->reloc_start) - text_len));
 		memp = textpos;
 		memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
@@ -854,7 +870,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-
+	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
 	
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (res > (unsigned long)-4096)
-- 
cgit v0.10.2


From b5d598b41aebee67bf95802b68b888e98a449687 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Thu, 28 May 2009 14:34:33 -0700
Subject: parport_gsc: fix printk format error

drivers/parport/parport_gsc.c:356: warning: format '%lx' expects type
'long unsigned int', but argument 2 has type 'resource_size_t'

[akpm@linux-foundation.org: fix it to handle u64's]
Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c
index e6a7e84..ea31a45 100644
--- a/drivers/parport/parport_gsc.c
+++ b/drivers/parport/parport_gsc.c
@@ -352,8 +352,8 @@ static int __devinit parport_init_chip(struct parisc_device *dev)
 	unsigned long port;
 
 	if (!dev->irq) {
-		printk(KERN_WARNING "IRQ not found for parallel device at 0x%lx\n",
-			dev->hpa.start);
+		printk(KERN_WARNING "IRQ not found for parallel device at 0x%llx\n",
+			(unsigned long long)dev->hpa.start);
 		return -ENODEV;
 	}
 
-- 
cgit v0.10.2


From 8e8e8267f0a08c2415d5f51bc9a9fde6d5400619 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Thu, 28 May 2009 14:34:34 -0700
Subject: serial: 8250_gsc: fix printk format error

drivers/serial/8250_gsc.c:44: warning: format '%lx' expects type
'long unsigned int', but argument 2 has type 'resource_size_t'

[akpm@linux-foundation.org: fix it to handle u64's]
Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/8250_gsc.c b/drivers/serial/8250_gsc.c
index 418b4fe..33149d9 100644
--- a/drivers/serial/8250_gsc.c
+++ b/drivers/serial/8250_gsc.c
@@ -39,9 +39,9 @@ static int __init serial_init_chip(struct parisc_device *dev)
 		 */
 		if (parisc_parent(dev)->id.hw_type != HPHW_IOA)
 			printk(KERN_INFO
-				"Serial: device 0x%lx not configured.\n"
+				"Serial: device 0x%llx not configured.\n"
 				"Enable support for Wax, Lasi, Asp or Dino.\n",
-				dev->hpa.start);
+				(unsigned long long)dev->hpa.start);
 		return -ENODEV;
 	}
 
-- 
cgit v0.10.2


From 17663e59704bea838a9236f299104e30909a43b1 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 28 May 2009 14:34:35 -0700
Subject: S3C-fb: PM fix

Correctly restore the FrameBuffer register state in the resume function.

Reviewed-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/video/s3c-fb.c b/drivers/video/s3c-fb.c
index 5e9c630..d3a568e 100644
--- a/drivers/video/s3c-fb.c
+++ b/drivers/video/s3c-fb.c
@@ -947,7 +947,8 @@ static int __devexit s3c_fb_remove(struct platform_device *pdev)
 	int win;
 
 	for (win = 0; win <= S3C_FB_MAX_WIN; win++)
-		s3c_fb_release_win(sfb, sfb->windows[win]);
+		if (sfb->windows[win])
+			s3c_fb_release_win(sfb, sfb->windows[win]);
 
 	iounmap(sfb->regs);
 
@@ -985,11 +986,20 @@ static int s3c_fb_suspend(struct platform_device *pdev, pm_message_t state)
 static int s3c_fb_resume(struct platform_device *pdev)
 {
 	struct s3c_fb *sfb = platform_get_drvdata(pdev);
+	struct s3c_fb_platdata *pd = sfb->pdata;
 	struct s3c_fb_win *win;
 	int win_no;
 
 	clk_enable(sfb->bus_clk);
 
+	/* setup registers */
+	writel(pd->vidcon1, sfb->regs + VIDCON1);
+
+	/* zero all windows before we do anything */
+	for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++)
+		s3c_fb_clear_win(sfb, win_no);
+
+	/* restore framebuffers */
 	for (win_no = 0; win_no < S3C_FB_MAX_WIN; win_no++) {
 		win = sfb->windows[win_no];
 		if (!win)
-- 
cgit v0.10.2


From 53b7479bbdaedcc7846c66fd608fe66f1b5aa35b Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Thu, 28 May 2009 14:34:36 -0700
Subject: atmel_lcdfb: correct fifo size for some products

Remove wrong fifo size definition for some AT91 products.

Due to a misunderstanding of some AT91 datasheets, a fifo size of 2048
(words) has been introduced by mistake.  In fact, all products (AT91/AT32)
are sharing the same fifo size of 512 words.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Andrew Victor <avictor.za@gmail.com>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c
index 9a577a8..2fb63f6 100644
--- a/drivers/video/atmel_lcdfb.c
+++ b/drivers/video/atmel_lcdfb.c
@@ -29,14 +29,8 @@
 
 /* configurable parameters */
 #define ATMEL_LCDC_CVAL_DEFAULT		0xc8
-#define ATMEL_LCDC_DMA_BURST_LEN	8
-
-#if defined(CONFIG_ARCH_AT91SAM9263) || defined(CONFIG_ARCH_AT91CAP9) || \
-	defined(CONFIG_ARCH_AT91SAM9RL)
-#define ATMEL_LCDC_FIFO_SIZE		2048
-#else
-#define ATMEL_LCDC_FIFO_SIZE		512
-#endif
+#define ATMEL_LCDC_DMA_BURST_LEN	8	/* words */
+#define ATMEL_LCDC_FIFO_SIZE		512	/* words */
 
 #if defined(CONFIG_ARCH_AT91)
 #define	ATMEL_LCDFB_FBINFO_DEFAULT	(FBINFO_DEFAULT \
-- 
cgit v0.10.2


From 32b154c0b0bae2879bf4e549d861caf1759a3546 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 28 May 2009 14:34:37 -0700
Subject: x86: ignore VM_LOCKED when determining if hugetlb-backed page tables
 can be shared or not

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302

On x86 and x86-64, it is possible that page tables are shared beween
shared mappings backed by hugetlbfs.  As part of this,
page_table_shareable() checks a pair of vma->vm_flags and they must match
if they are to be shared.  All VMA flags are taken into account, including
VM_LOCKED.

The problem is that VM_LOCKED is cleared on fork().  When a process with a
shared memory segment forks() to exec() a helper, there will be shared
VMAs with different flags.  The impact is that the shared segment is
sometimes considered shareable and other times not, depending on what
process is checking.

What happens is that the segment page tables are being shared but the
count is inaccurate depending on the ordering of events.  As the page
tables are freed with put_page(), bad pmd's are found when some of the
children exit.  The hugepage counters also get corrupted and the Total and
Free count will no longer match even when all the hugepage-backed regions
are freed.  This requires a reboot of the machine to "fix".

This patch addresses the problem by comparing all flags except VM_LOCKED
when deciding if pagetables should be shared or not for hugetlbfs-backed
mapping.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: <starlight@binnacle.cx>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8f307d9..f46c340 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -26,12 +26,16 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	unsigned long sbase = saddr & PUD_MASK;
 	unsigned long s_end = sbase + PUD_SIZE;
 
+	/* Allow segments to share if only one is marked locked */
+	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
 	/*
 	 * match the virtual addresses, permission and the alignment of the
 	 * page table page.
 	 */
 	if (pmd_index(addr) != pmd_index(saddr) ||
-	    vma->vm_flags != svma->vm_flags ||
+	    vm_flags != svm_flags ||
 	    sbase < svma->vm_start || svma->vm_end < s_end)
 		return 0;
 
-- 
cgit v0.10.2


From f83a275dbc5ca1721143698e844243fcadfabf6a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 28 May 2009 14:34:40 -0700
Subject: mm: account for MAP_SHARED mappings using VM_MAYSHARE and not
 VM_SHARED in hugetlbfs

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302

hugetlbfs reserves huge pages but does not fault them at mmap() time to
ensure that future faults succeed.  The reservation behaviour differs
depending on whether the mapping was mapped MAP_SHARED or MAP_PRIVATE.
For MAP_SHARED mappings, hugepages are reserved when mmap() is first
called and are tracked based on information associated with the inode.
Other processes mapping MAP_SHARED use the same reservation.  MAP_PRIVATE
track the reservations based on the VMA created as part of the mmap()
operation.  Each process mapping MAP_PRIVATE must make its own
reservation.

hugetlbfs currently checks if a VMA is MAP_SHARED with the VM_SHARED flag
and not VM_MAYSHARE.  For file-backed mappings, such as hugetlbfs,
VM_SHARED is set only if the mapping is MAP_SHARED and the file was opened
read-write.  If a shared memory mapping was mapped shared-read-write for
populating of data and mapped shared-read-only by other processes, then
hugetlbfs would account for the mapping as if it was MAP_PRIVATE.  This
causes processes to fail to map the file MAP_SHARED even though it should
succeed as the reservation is there.

This patch alters mm/hugetlb.c and replaces VM_SHARED with VM_MAYSHARE
when the intent of the code was to check whether the VMA was mapped
MAP_SHARED or MAP_PRIVATE.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: <starlight@binnacle.cx>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655b..e83ad2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref)
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
-	if (!(vma->vm_flags & VM_SHARED))
+	if (!(vma->vm_flags & VM_MAYSHARE))
 		return (struct resv_map *)(get_vma_private_data(vma) &
 							~HPAGE_RESV_MASK);
 	return NULL;
@@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
-	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
 
 	set_vma_private_data(vma, (get_vma_private_data(vma) &
 				HPAGE_RESV_MASK) | (unsigned long)map);
@@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
-	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+	VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
 
 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
@@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
 	if (vma->vm_flags & VM_NORESERVE)
 		return;
 
-	if (vma->vm_flags & VM_SHARED) {
+	if (vma->vm_flags & VM_MAYSHARE) {
 		/* Shared mappings always use reserves */
 		h->resv_huge_pages--;
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
@@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
-	if (!(vma->vm_flags & VM_SHARED))
+	if (!(vma->vm_flags & VM_MAYSHARE))
 		vma->vm_private_data = (void *)0;
 }
 
 /* Returns true if the VMA has associated reserve pages */
 static int vma_has_reserves(struct vm_area_struct *vma)
 {
-	if (vma->vm_flags & VM_SHARED)
+	if (vma->vm_flags & VM_MAYSHARE)
 		return 1;
 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return 1;
@@ -924,7 +924,7 @@ static long vma_needs_reservation(struct hstate *h,
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
-	if (vma->vm_flags & VM_SHARED) {
+	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		return region_chg(&inode->i_mapping->private_list,
 							idx, idx + 1);
@@ -949,7 +949,7 @@ static void vma_commit_reservation(struct hstate *h,
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
-	if (vma->vm_flags & VM_SHARED) {
+	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
 
@@ -1893,7 +1893,7 @@ retry_avoidcopy:
 	 * at the time of fork() could consume its reserves on COW instead
 	 * of the full address range.
 	 */
-	if (!(vma->vm_flags & VM_SHARED) &&
+	if (!(vma->vm_flags & VM_MAYSHARE) &&
 			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
 			old_page != pagecache_page)
 		outside_reserve = 1;
@@ -2000,7 +2000,7 @@ retry:
 		clear_huge_page(page, address, huge_page_size(h));
 		__SetPageUptodate(page);
 
-		if (vma->vm_flags & VM_SHARED) {
+		if (vma->vm_flags & VM_MAYSHARE) {
 			int err;
 			struct inode *inode = mapping->host;
 
@@ -2104,7 +2104,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out_mutex;
 		}
 
-		if (!(vma->vm_flags & VM_SHARED))
+		if (!(vma->vm_flags & VM_MAYSHARE))
 			pagecache_page = hugetlbfs_pagecache_page(h,
 								vma, address);
 	}
@@ -2289,7 +2289,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * to reserve the full area even if read-only as mprotect() may be
 	 * called to make the mapping read-write. Assume !vma is a shm mapping
 	 */
-	if (!vma || vma->vm_flags & VM_SHARED)
+	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		chg = region_chg(&inode->i_mapping->private_list, from, to);
 	else {
 		struct resv_map *resv_map = resv_map_alloc();
@@ -2330,7 +2330,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * consumed reservations are stored in the map. Hence, nothing
 	 * else has to be done for private mappings here
 	 */
-	if (!vma || vma->vm_flags & VM_SHARED)
+	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
 }
-- 
cgit v0.10.2


From 46f7e602fb32e02145ef14f8c0ca6d399f0a96b9 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Thu, 28 May 2009 14:34:41 -0700
Subject: memcg: fix build warning and avoid checking for mem != null again and
 again

Fix build warning, "mem_cgroup_is_obsolete defined but not used" when
CONFIG_DEBUG_VM is not set.  Also avoid checking for !mem again and again.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4a747a2..78eb855 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return mem;
 }
 
-static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
-{
-	if (!mem)
-		return true;
-	return css_is_removed(&mem->css);
-}
-
-
 /*
  * Call callback function against all cgroup under hierarchy tree.
  */
@@ -932,7 +924,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	if (unlikely(!mem))
 		return 0;
 
-	VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem));
+	VM_BUG_ON(css_is_removed(&mem->css));
 
 	while (1) {
 		int ret;
-- 
cgit v0.10.2


From 56ec0c7b88c6eb17733e5015f31302f6312511ed Mon Sep 17 00:00:00 2001
From: Harry Ciao <qingtao.cao@windriver.com>
Date: Thu, 28 May 2009 14:34:42 -0700
Subject: edac: AMD8111 & AMD8131 use dev_name()

The "bus_id" member in the device structure has been obsolete, use
dev_name() instead.

Signed-off-by: Harry Ciao <qingtao.cao@windriver.com>
Cc: Doug Thompson <norsk5@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
index 6146921..2cb58ef 100644
--- a/drivers/edac/amd8111_edac.c
+++ b/drivers/edac/amd8111_edac.c
@@ -389,7 +389,7 @@ static int amd8111_dev_probe(struct pci_dev *dev,
 	dev_info->edac_dev->dev = &dev_info->dev->dev;
 	dev_info->edac_dev->mod_name = AMD8111_EDAC_MOD_STR;
 	dev_info->edac_dev->ctl_name = dev_info->ctl_name;
-	dev_info->edac_dev->dev_name = dev_info->dev->dev.bus_id;
+	dev_info->edac_dev->dev_name = dev_name(&dev_info->dev->dev);
 
 	if (edac_op_state == EDAC_OPSTATE_POLL)
 		dev_info->edac_dev->edac_check = dev_info->check;
@@ -473,7 +473,7 @@ static int amd8111_pci_probe(struct pci_dev *dev,
 	pci_info->edac_dev->dev = &pci_info->dev->dev;
 	pci_info->edac_dev->mod_name = AMD8111_EDAC_MOD_STR;
 	pci_info->edac_dev->ctl_name = pci_info->ctl_name;
-	pci_info->edac_dev->dev_name = pci_info->dev->dev.bus_id;
+	pci_info->edac_dev->dev_name = dev_name(&pci_info->dev->dev);
 
 	if (edac_op_state == EDAC_OPSTATE_POLL)
 		pci_info->edac_dev->edac_check = pci_info->check;
diff --git a/drivers/edac/amd8131_edac.c b/drivers/edac/amd8131_edac.c
index c083b31..b432d60 100644
--- a/drivers/edac/amd8131_edac.c
+++ b/drivers/edac/amd8131_edac.c
@@ -287,7 +287,7 @@ static int amd8131_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	dev_info->edac_dev->dev = &dev_info->dev->dev;
 	dev_info->edac_dev->mod_name = AMD8131_EDAC_MOD_STR;
 	dev_info->edac_dev->ctl_name = dev_info->ctl_name;
-	dev_info->edac_dev->dev_name = dev_info->dev->dev.bus_id;
+	dev_info->edac_dev->dev_name = dev_name(&dev_info->dev->dev);
 
 	if (edac_op_state == EDAC_OPSTATE_POLL)
 		dev_info->edac_dev->edac_check = amd8131_chipset.check;
-- 
cgit v0.10.2


From 715fe7af9fd7328af661742bfadc195e665a837f Mon Sep 17 00:00:00 2001
From: Harry Ciao <qingtao.cao@windriver.com>
Date: Thu, 28 May 2009 14:34:43 -0700
Subject: edac: AMD8111 & AMD8131 Kconfig fixup

The amd8111_edac.c driver will fail allmodconfig on architectures other
than PPC, introduce Kconfig dependency to avoid this, since both AMD8111
and AMD8131 chips are only adopted on Maple so far.

Signed-off-by: Harry Ciao <qingtao.cao@windriver.com>
Cc: Doug Thompson <norsk5@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index e5f5c5a..956982f 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -192,16 +192,20 @@ config EDAC_PPC4XX
 
 config EDAC_AMD8131
 	tristate "AMD8131 HyperTransport PCI-X Tunnel"
-	depends on EDAC_MM_EDAC && PCI
+	depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
 	help
 	  Support for error detection and correction on the
 	  AMD8131 HyperTransport PCI-X Tunnel chip.
+	  Note, add more Kconfig dependency if it's adopted
+	  on some machine other than Maple.
 
 config EDAC_AMD8111
 	tristate "AMD8111 HyperTransport I/O Hub"
-	depends on EDAC_MM_EDAC && PCI
+	depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
 	help
 	  Support for error detection and correction on the
 	  AMD8111 HyperTransport I/O Hub chip.
+	  Note, add more Kconfig dependency if it's adopted
+	  on some machine other than Maple.
 
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index a5fdcf0..5907681 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -35,3 +35,5 @@ obj-$(CONFIG_EDAC_MPC85XX)		+= mpc85xx_edac.o
 obj-$(CONFIG_EDAC_MV64X60)		+= mv64x60_edac.o
 obj-$(CONFIG_EDAC_CELL)			+= cell_edac.o
 obj-$(CONFIG_EDAC_PPC4XX)		+= ppc4xx_edac.o
+obj-$(CONFIG_EDAC_AMD8111)		+= amd8111_edac.o
+obj-$(CONFIG_EDAC_AMD8131)		+= amd8131_edac.o
-- 
cgit v0.10.2


From b8e7e40abeac49644fec4a4f52ffe74c7b05eca0 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 28 May 2009 14:01:35 +0100
Subject: 8250: Fix oops from setserial

If you setserial a port which has never been initialised we change the type
but don't update the I/O method pointers. The same problem is true if you
change the io type of a port - but nobody ever does that so nobody noticed!

Remember the old type and when attaching if the type has changed reload the
port accessor pointers. We can't do it blindly as some 8250 drivers load custom
accessors and we must not stomp those.

Tested-by: Victor Seryodkin <vvscore@gmail.com>
Closes-bug: #13367
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index b4b3981..a0127e9 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -137,6 +137,7 @@ struct uart_8250_port {
 	unsigned char		mcr;
 	unsigned char		mcr_mask;	/* mask of user bits */
 	unsigned char		mcr_force;	/* mask of forced bits */
+	unsigned char		cur_iotype;	/* Running I/O type */
 
 	/*
 	 * Some bits in registers are cleared on a read, so they must
@@ -471,6 +472,7 @@ static void io_serial_out(struct uart_port *p, int offset, int value)
 
 static void set_io_from_upio(struct uart_port *p)
 {
+	struct uart_8250_port *up = (struct uart_8250_port *)p;
 	switch (p->iotype) {
 	case UPIO_HUB6:
 		p->serial_in = hub6_serial_in;
@@ -509,6 +511,8 @@ static void set_io_from_upio(struct uart_port *p)
 		p->serial_out = io_serial_out;
 		break;
 	}
+	/* Remember loaded iotype */
+	up->cur_iotype = p->iotype;
 }
 
 static void
@@ -1937,6 +1941,9 @@ static int serial8250_startup(struct uart_port *port)
 	up->capabilities = uart_config[up->port.type].flags;
 	up->mcr = 0;
 
+	if (up->port.iotype != up->cur_iotype)
+		set_io_from_upio(port);
+
 	if (up->port.type == PORT_16C950) {
 		/* Wake up and initialize UART */
 		up->acr = 0;
@@ -2563,6 +2570,9 @@ static void serial8250_config_port(struct uart_port *port, int flags)
 	if (ret < 0)
 		probeflags &= ~PROBE_RSA;
 
+	if (up->port.iotype != up->cur_iotype)
+		set_io_from_upio(port);
+
 	if (flags & UART_CONFIG_TYPE)
 		autoconfig(up, probeflags);
 	if (up->port.type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ)
@@ -2671,6 +2681,11 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
 {
 	int i;
 
+	for (i = 0; i < nr_uarts; i++) {
+		struct uart_8250_port *up = &serial8250_ports[i];
+		up->cur_iotype = 0xFF;
+	}
+
 	serial8250_isa_init_ports();
 
 	for (i = 0; i < nr_uarts; i++) {
-- 
cgit v0.10.2


From b78c07d45a7e71be7b5c5d7486f922355ccf23a8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 29 May 2009 13:48:59 -0300
Subject: perf_counter tools: Shorten the DSO names using cwd

[acme@emilia linux-2.6-tip]$ pwd
/home/acme/git/linux-2.6-tip

Before (still available using -P/--full-paths)

[acme@emilia linux-2.6-tip]$ perf report -P | head -10
    11.48%             perf: 7454 [kernel]: clear_page_c
     4.89%             perf: 7454 [kernel]: vsnprintf
     4.61%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
     4.09%             perf: 7454 [kernel]: number
     4.06%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__fprintf
     4.00%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: symbol_filter

New default:

[acme@emilia linux-2.6-tip]$ perf report | head -10
    11.48%             perf: 7454 [kernel]: clear_page_c
     4.89%             perf: 7454 [kernel]: vsnprintf
     4.61%             perf: 7454 ./Documentation/perf_counter/perf: dso__find_symbol
     4.09%             perf: 7454 [kernel]: number
     4.06%             perf: 7454 ./Documentation/perf_counter/perf: dso__fprintf
     4.00%             perf: 7454 ./Documentation/perf_counter/perf: symbol_filter

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090529164859.GN4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 412d524..4705679 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -23,6 +23,7 @@ static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
 static int		verbose;
+static int		full_paths;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -134,6 +135,16 @@ static int load_kernel(void)
 	return err;
 }
 
+static int strcommon(const char *pathname, const char *cwd, int cwdlen)
+{
+	int n = 0;
+
+	while (pathname[n] == cwd[n] && n < cwdlen)
+		++n;
+
+	return n;
+}
+
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -142,16 +153,28 @@ struct map {
 	struct dso	 *dso;
 };
 
-static struct map *map__new(struct mmap_event *event)
+static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
 {
 	struct map *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+
+		if (cwd) {
+			int n = strcommon(filename, cwd, cwdlen);
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
 		self->start = event->start;
 		self->end   = event->start + event->len;
 		self->pgoff = event->pgoff;
 
-		self->dso = dsos__findnew(event->filename);
+		self->dso = dsos__findnew(filename);
 		if (self->dso == NULL)
 			goto out_delete;
 	}
@@ -598,6 +621,8 @@ static int __cmd_report(void)
 	int ret, rc = EXIT_FAILURE;
 	uint32_t size;
 	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
+	char cwd[PATH_MAX], *cwdp = cwd;
+	int cwdlen;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -621,6 +646,14 @@ static int __cmd_report(void)
 		return EXIT_FAILURE;
 	}
 
+	if (!full_paths) {
+		if (getcwd(cwd, sizeof(cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else
+		cwdp = NULL;
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
@@ -710,7 +743,7 @@ more:
 	} else switch (event->header.type) {
 	case PERF_EVENT_MMAP: {
 		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap);
+		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
 
 		if (dump_trace) {
 			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
@@ -809,6 +842,8 @@ static const struct option options[] = {
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 6373fffc5d555caf3acf7c5796cec9820aaf7479 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 29 May 2009 16:12:02 -0700
Subject: sparc64: Fix section attribute warnings.

CSUM copy to/from user assembler was missing allocatable and
executable attributes for .fixup

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/lib/csum_copy_from_user.S b/arch/sparc/lib/csum_copy_from_user.S
index a22eddb..e0304e6 100644
--- a/arch/sparc/lib/csum_copy_from_user.S
+++ b/arch/sparc/lib/csum_copy_from_user.S
@@ -5,7 +5,7 @@
 
 #define EX_LD(x)		\
 98:	x;			\
-	.section .fixup;	\
+	.section .fixup, "ax";	\
 	.align 4;		\
 99:	retl;			\
 	 mov	-1, %o0;	\
diff --git a/arch/sparc/lib/csum_copy_to_user.S b/arch/sparc/lib/csum_copy_to_user.S
index d5b12f4..afd01ac 100644
--- a/arch/sparc/lib/csum_copy_to_user.S
+++ b/arch/sparc/lib/csum_copy_to_user.S
@@ -5,7 +5,7 @@
 
 #define EX_ST(x)		\
 98:	x;			\
-	.section .fixup;	\
+	.section .fixup,"ax";	\
 	.align 4;		\
 99:	retl;			\
 	 mov	-1, %o0;	\
-- 
cgit v0.10.2


From 34d531e640cb805973cf656b15c716b961565cea Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Tue, 26 May 2009 15:11:06 -0400
Subject: ACPI: sanity check _PSS frequency to prevent cpufreq crash

When BIOS SETUP is changed to disable EIST, some BIOS
hand the OS an un-initialized _PSS:

        Name (_PSS, Package (0x06)
        {
            Package (0x06)
            {
                0x80000000,	// frequency [MHz]
                0x80000000,	// power [mW]
                0x80000000,	// latency [us]
                0x80000000,	// BM latency [us]
                0x80000000,	// control
                0x80000000	// status
            },
	    ...

These are outrageous values for frequency,
power and latency, raising the question where to draw
the line between legal and illegal.  We tend to survive
garbage in the power and latency fields, but we can BUG_ON
when garbage is in the frequency field.

Cpufreq multiplies the frequency by 1000 and stores it in a u32 KHz.
So disregard a _PSS with a frequency so large
that it can't be represented by cpufreq.

https://bugzilla.redhat.com/show_bug.cgi?id=500311

Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index cafb410..60e543d 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -309,9 +309,15 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr)
 				  (u32) px->bus_master_latency,
 				  (u32) px->control, (u32) px->status));
 
-		if (!px->core_frequency) {
-			printk(KERN_ERR PREFIX
-				    "Invalid _PSS data: freq is zero\n");
+		/*
+ 		 * Check that ACPI's u64 MHz will be valid as u32 KHz in cpufreq
+		 */
+		if (!px->core_frequency ||
+		    ((u32)(px->core_frequency * 1000) !=
+		     (px->core_frequency * 1000))) {
+			printk(KERN_ERR FW_BUG PREFIX
+			       "Invalid BIOS _PSS frequency: 0x%llx MHz\n",
+			       px->core_frequency);
 			result = -EFAULT;
 			kfree(pr->performance->states);
 			goto end;
-- 
cgit v0.10.2


From 34ac272b3aaef11a91e19a72f3ac5772a96ffbc5 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 26 May 2009 23:35:34 -0400
Subject: ACPI: video: DMI workaround broken eMachines E510 BIOS enabling
 display brightness

http://bugzilla.kernel.org/show_bug.cgi?id=13376

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 810cca9..ee45e76 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -570,6 +570,14 @@ static struct dmi_system_id video_dmi_table[] __initdata = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5710Z"),
 		},
 	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "eMachines E510",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "EMACHINES"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"),
+		},
+	},
 	{}
 };
 
-- 
cgit v0.10.2


From 93bcece20ef87c29548ec7e66532f1018572cea0 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 19 May 2009 15:08:41 -0400
Subject: ACPI: video: DMI workaround broken Acer 5315 BIOS enabling display
 brightness

http://bugzilla.kernel.org/show_bug.cgi?id=13121

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index ee45e76..6d897bb 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -578,6 +578,14 @@ static struct dmi_system_id video_dmi_table[] __initdata = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"),
 		},
 	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "Acer Aspire 5315",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5315"),
+		},
+	},
 	{}
 };
 
-- 
cgit v0.10.2


From 1fc8d33acafe68bdcc21b327d22ef3820b819727 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 20 May 2009 11:56:08 +0530
Subject: drm/i915: acpi/video.c fix section mismatch warning

Currently acpi_video_exit() is exported as well as using __exit which causes:

  WARNING: drivers/acpi/video.o(__ksymtab+0x0): Section mismatch in reference from the variable __ksymtab_acpi_video_exit to the function .exit.text:acpi_video_exit()
  The symbol acpi_video_exit is exported and annotated __exit
  Fix this by removing the __exit annotation of acpi_video_exit or drop the export.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 810cca9..a79b885 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -2334,7 +2334,7 @@ static int __init acpi_video_init(void)
 	return acpi_video_register();
 }
 
-void __exit acpi_video_exit(void)
+void acpi_video_exit(void)
 {
 
 	acpi_bus_unregister_driver(&acpi_video_bus);
-- 
cgit v0.10.2


From 21671b88be331fb9c95891d5ee7d2e940e6b024c Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Fri, 22 May 2009 10:23:40 +0200
Subject: ACPI processor: remove spurious newline from warning message

Commit 4973b22a ("ACPI processor: reset the throttling state once it's
invalid") introduced a new warning which prints a spurious newline.

The ACPI_WARNING macro that is used already takes care of adding a
newline, after adding ACPI_CA_VERSION to the message. Remove the newline
to avoid the message getting split into two lines.

Signed-off-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index 7f16f5f..2275437 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -840,7 +840,7 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
 		state = acpi_get_throttling_state(pr, value);
 		if (state == -1) {
 			ACPI_WARNING((AE_INFO,
-				"Invalid throttling state, reset\n"));
+				"Invalid throttling state, reset"));
 			state = 0;
 			ret = acpi_processor_set_throttling(pr, state);
 			if (ret)
-- 
cgit v0.10.2


From 61c8c67e3ad67ea1d1360f2e88688bd942834756 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 26 May 2009 14:58:39 -0700
Subject: acpi-cpufreq: fix printk typo and indentation

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 208ecf6..54b6de2 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -693,8 +693,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
 	    policy->cpuinfo.transition_latency > 20 * 1000) {
 		policy->cpuinfo.transition_latency = 20 * 1000;
-			printk_once(KERN_INFO "Capping off P-state tranision"
-				    " latency at 20 uS\n");
+		printk_once(KERN_INFO
+			    "P-state transition latency capped at 20 uS\n");
 	}
 
 	/* table init */
-- 
cgit v0.10.2


From 31db5645bda24682dadbc97d5e8a7918ade2a298 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 29 May 2009 21:11:27 -0400
Subject: ACPI, i915: build fix (v2)

drivers/built-in.o: In function `intel_opregion_init':
(.text+0x9d540): undefined reference to `acpi_video_register'

v2: move under DRM_I915 from DRM_I915_KMS

Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 4cd35d8..f5d46e7 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -67,12 +67,18 @@ config DRM_I830
 	  will load the correct one.
 
 config DRM_I915
+	tristate "i915 driver"
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
 	select FB_CFB_IMAGEBLIT
 	select FB
 	select FRAMEBUFFER_CONSOLE if !EMBEDDED
-	tristate "i915 driver"
+	# i915 depends on ACPI_VIDEO when ACPI is enabled
+	# but for select to work, need to select ACPI_VIDEO's dependencies, ick
+	select VIDEO_OUTPUT_CONTROL if ACPI
+	select BACKLIGHT_CLASS_DEVICE if ACPI
+	select INPUT if ACPI
+	select ACPI_VIDEO if ACPI
 	help
 	  Choose this option if you have a system that has Intel 830M, 845G,
 	  852GM, 855GM 865G or 915G integrated graphics.  If M is selected, the
@@ -84,12 +90,6 @@ config DRM_I915
 config DRM_I915_KMS
 	bool "Enable modesetting on intel by default"
 	depends on DRM_I915
-	# i915 KMS depends on ACPI_VIDEO when ACPI is enabled
-	# but for select to work, need to select ACPI_VIDEO's dependencies, ick
-	select VIDEO_OUTPUT_CONTROL if ACPI
-	select BACKLIGHT_CLASS_DEVICE if ACPI
-	select INPUT if ACPI
-	select ACPI_VIDEO if ACPI
 	help
 	  Choose this option if you want kernel modesetting enabled by default,
 	  and you have a new enough userspace to support this. Running old
-- 
cgit v0.10.2


From c143dc903d7c0b15f5052e00b2c7de33a8b4299c Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sat, 30 May 2009 06:43:49 +0200
Subject: block: fix an oops on BLKPREP_KILL

Doing a bit of torture testing, I ran across a BUG in the block
subsystem (at blk-core.c:2048): the test for if the request is queued.

It turns out the trigger was a BLKPREP_KILL coming out of the SCSI prep
function.  Currently for BLKPREP_KILL requests, we send them straight
into __blk_end_request_all() with an error, but they've never been
dequeued, so they trip the bug.  Fix this by starting requests before
killing them.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 8b3b74e..7ae83a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1789,6 +1789,11 @@ struct request *blk_peek_request(struct request_queue *q)
 			break;
 		} else if (ret == BLKPREP_KILL) {
 			rq->cmd_flags |= REQ_QUIET;
+			/*
+			 * Mark this request as started so we don't trigger
+			 * any debug logic in the end I/O path.
+			 */
+			blk_start_request(rq);
 			__blk_end_request_all(rq, -EIO);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
-- 
cgit v0.10.2


From 465440d2720543669841db5b0691ba41892ed0ae Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Mon, 25 May 2009 20:57:21 +0000
Subject: mlx4_en: Fix a kernel panic when waking tx queue

When the transmit queue gets full we enable interrupts for TX completions
There was a race that we handled the TX queue both from the interrupt context
and from the transmit function. Using "spin_trylock_irq()" ensures this
doesn't happen.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
index ac6fc49..e5c98a9 100644
--- a/drivers/net/mlx4/en_tx.c
+++ b/drivers/net/mlx4/en_tx.c
@@ -426,7 +426,7 @@ void mlx4_en_poll_tx_cq(unsigned long data)
 
 	INC_PERF_COUNTER(priv->pstats.tx_poll);
 
-	if (!spin_trylock(&ring->comp_lock)) {
+	if (!spin_trylock_irq(&ring->comp_lock)) {
 		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
 		return;
 	}
@@ -439,7 +439,7 @@ void mlx4_en_poll_tx_cq(unsigned long data)
 	if (inflight && priv->port_up)
 		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
 
-	spin_unlock(&ring->comp_lock);
+	spin_unlock_irq(&ring->comp_lock);
 }
 
 static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
@@ -482,9 +482,9 @@ static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
 
 	/* Poll the CQ every mlx4_en_TX_MODER_POLL packets */
 	if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
-		if (spin_trylock(&ring->comp_lock)) {
+		if (spin_trylock_irq(&ring->comp_lock)) {
 			mlx4_en_process_tx_cq(priv->dev, cq);
-			spin_unlock(&ring->comp_lock);
+			spin_unlock_irq(&ring->comp_lock);
 		}
 }
 
-- 
cgit v0.10.2


From bdb0e010bf0061a73027cc84dd7ad192c663eca3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 29 May 2009 22:04:54 -0700
Subject: ath1e: add new device id for asus hardware

Gary Lin reports that a new device id needs to be added to the atl1e in
order to get some new Asus hardware to work properly.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c
index fb57b75..1342418 100644
--- a/drivers/net/atl1e/atl1e_main.c
+++ b/drivers/net/atl1e/atl1e_main.c
@@ -37,6 +37,7 @@ char atl1e_driver_version[] = DRV_VERSION;
  */
 static struct pci_device_id atl1e_pci_tbl[] = {
 	{PCI_DEVICE(PCI_VENDOR_ID_ATTANSIC, PCI_DEVICE_ID_ATTANSIC_L1E)},
+	{PCI_DEVICE(PCI_VENDOR_ID_ATTANSIC, 0x1066)},
 	/* required last entry */
 	{ 0 }
 };
-- 
cgit v0.10.2


From cf4ae4e3de83e2c7394af70b46f4f50e5f0fb90c Mon Sep 17 00:00:00 2001
From: Matt Kraai <kraai@ftbfs.org>
Date: Fri, 29 May 2009 22:06:33 -0700
Subject: net/firmare: Ignore .cis files

Signed-off-by: Matt Kraai <kraai@ftbfs.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/firmware/cis/.gitignore b/firmware/cis/.gitignore
new file mode 100644
index 0000000..1de3984
--- /dev/null
+++ b/firmware/cis/.gitignore
@@ -0,0 +1 @@
+*.cis
-- 
cgit v0.10.2


From c44613a4c1092e85841b78b7ab52a06654fcd321 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 29 May 2009 17:03:07 -0300
Subject: perf_counter tools: Add locking to perf top

perf_counter tools: Add locking to perf top

We need to protect the active_symbols list as two threads change it:
the main thread adding entries to the head and the display thread
decaying entries from any place in the list.

Also related: take a snapshot of syme->count[0] before using it to
calculate the weight and to show the same number used in this calc when
displaying the symbol usage.

Reported-by: Mike Galbraith <efault@gmx.de>
Tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090529200307.GR4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index ebe8bec..24a8879 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -129,6 +129,8 @@ struct sym_entry {
 	struct rb_node		rb_node;
 	struct list_head	node;
 	unsigned long		count[MAX_COUNTERS];
+	unsigned long		snap_count;
+	double			weight;
 	int			skip;
 };
 
@@ -141,17 +143,16 @@ struct dso *kernel_dso;
  * after decayed.
  */
 static LIST_HEAD(active_symbols);
+static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
 
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
  */
 static double sym_weight(const struct sym_entry *sym)
 {
-	double weight;
+	double weight = sym->snap_count;
 	int counter;
 
-	weight = sym->count[0];
-
 	for (counter = 1; counter < nr_counters-1; counter++)
 		weight *= sym->count[counter];
 
@@ -164,11 +165,18 @@ static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
-static void list_insert_active_sym(struct sym_entry *syme)
+static void __list_insert_active_sym(struct sym_entry *syme)
 {
 	list_add(&syme->node, &active_symbols);
 }
 
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+	pthread_mutex_lock(&active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&active_symbols_lock);
+}
+
 static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 {
 	struct rb_node **p = &tree->rb_node;
@@ -179,7 +187,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 		parent = *p;
 		iter = rb_entry(parent, struct sym_entry, rb_node);
 
-		if (sym_weight(se) > sym_weight(iter))
+		if (se->weight > iter->weight)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
@@ -203,15 +211,21 @@ static void print_sym_table(void)
 	events = userspace_events = 0;
 
 	/* Sort the active symbols */
-	list_for_each_entry_safe(syme, n, &active_symbols, node) {
-		if (syme->count[0] != 0) {
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		syme->snap_count = syme->count[0];
+		if (syme->snap_count != 0) {
+			syme->weight = sym_weight(syme);
 			rb_insert_active_sym(&tmp, syme);
-			sum_kevents += syme->count[0];
+			sum_kevents += syme->snap_count;
 
 			for (j = 0; j < nr_counters; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
 		} else
-			list_del_init(&syme->node);
+			list_remove_active_sym(syme);
 	}
 
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
@@ -264,19 +278,18 @@ static void print_sym_table(void)
 		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || syme->count[0] < count_filter)
-			break;
+		if (++printed > 18 || syme->snap_count < count_filter)
+			continue;
 
-		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->count[0]) /
+		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
 					 sum_kevents));
 
 		if (nr_counters == 1)
 			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				sym_weight(syme),
-				pcnt, sym->start, sym->name);
+				syme->weight, pcnt, sym->start, sym->name);
 		else
 			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				sym_weight(syme), syme->count[0],
+				syme->weight, syme->snap_count,
 				pcnt, sym->start, sym->name);
 	}
 
@@ -395,8 +408,10 @@ static void record_ip(uint64_t ip, int counter)
 
 		if (!syme->skip) {
 			syme->count[counter]++;
+			pthread_mutex_lock(&active_symbols_lock);
 			if (list_empty(&syme->node) || !syme->node.next)
-				list_insert_active_sym(syme);
+				__list_insert_active_sym(syme);
+			pthread_mutex_unlock(&active_symbols_lock);
 			return;
 		}
 	}
-- 
cgit v0.10.2


From d7c29318c2daa96d64b7312afd8283488c1cb29f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: perf_counter tools: Print 'CPU utilization factor' in builtin-stat

Before:

 Performance counter stats for '/home/mingo/hackbench':

    5728.862689  task clock ticks     (msecs)
          34426  context switches     #        0.006 M/sec
           3835  CPU migrations       #        0.001 M/sec
          18158  pagefaults           #        0.003 M/sec
    16218109156  CPU cycles           #     2830.947 M/sec
    13519616840  instructions         #     2359.913 M/sec
       55941661  cache references     #        9.765 M/sec
       23554938  cache misses         #        4.112 M/sec

 Wall-clock time elapsed:   528.886980 msecs

After:

 Performance counter stats for '/home/mingo/hackbench':

    5845.443541  task clock ticks     #      11.886 CPU utilization factor
          38289  context switches     #       0.007 M/sec
           4208  CPU migrations       #       0.001 M/sec
          17755  pagefaults           #       0.003 M/sec
    16664668576  CPU cycles           #    2850.882 M/sec
    13468113991  instructions         #    2304.036 M/sec
       57445468  cache references     #       9.827 M/sec
       26896502  cache misses         #       4.601 M/sec

 Wall-clock time elapsed:   491.802357 msecs

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ef7e0e1..5886791 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -75,6 +75,7 @@ static __u64			event_res[MAX_COUNTERS][3];
 static __u64			event_scaled[MAX_COUNTERS];
 
 static __u64			runtime_nsecs;
+static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
@@ -194,13 +195,19 @@ static void print_counter(int counter)
 	if (nsec_counter(counter)) {
 		double msecs = (double)count[0] / 1000000;
 
-		fprintf(stderr, " %14.6f  %-20s (msecs)",
+		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
+		if (event_id[counter] ==
+				EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			fprintf(stderr, " # %11.3f CPU utilization factor",
+				(double)count[0] / (double)walltime_nsecs);
+		}
 	} else {
 		fprintf(stderr, " %14Ld  %-20s",
 			count[0], event_name(counter));
 		if (runtime_nsecs)
-			fprintf(stderr, " # %12.3f M/sec",
+			fprintf(stderr, " # %11.3f M/sec",
 				(double)count[0]/runtime_nsecs*1000.0);
 	}
 	if (scaled)
@@ -241,6 +248,8 @@ static int do_perfstat(int argc, const char **argv)
 	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 	t1 = rdclock();
 
+	walltime_nsecs = t1 - t0;
+
 	fflush(stdout);
 
 	fprintf(stderr, "\n");
-- 
cgit v0.10.2


From 7fbd55449aafb86d3237b5d1a26fb4dab2aa2c76 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: perf_counter tools: Fix 'make install'

'make install' didnt install perf itself - which needs a special
rule to be copied to bindir.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index bd29a5c..8f72584 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -776,6 +776,7 @@ install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
-- 
cgit v0.10.2


From c1c2365acf8c044f749c0fe1ea236497e8d1718e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: perf_counter tools: Generate per command manpages (and pdf/html,
 etc.)

Import Git's nice .txt => {man/html/pdf} generation machinery.

Fix various errors in the Documentation/perf*.txt description as well.

Also fix a bug in builtin-help: we'd map 'perf help top' to 'perftop'
if only the 'perf' binary is in the default PATH - confusing the manpage
logic. I dont fully understand why Git did it this way - but i suppose
it's a migration artifact from their migration from standalone git-xyz
commands to 'git xyz' commands. The perf tools were always using the
modern form so it's not an issue there.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/Makefile b/Documentation/perf_counter/Documentation/Makefile
new file mode 100644
index 0000000..5457192
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/Makefile
@@ -0,0 +1,300 @@
+MAN1_TXT= \
+	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+		$(wildcard perf-*.txt)) \
+	perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+DOC_HTML=$(MAN_HTML)
+
+ARTICLES =
+# with their own formatting rules.
+SP_ARTICLES =
+API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
+SP_ARTICLES += $(API_DOCS)
+SP_ARTICLES += technical/api-index
+
+DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+
+DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+prefix?=$(HOME)
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+# DESTDIR=
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA =
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+#
+# For asciidoc ...
+#	-7.1.2,	no extra settings are needed.
+#	8.0-,	set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#	1.69.0,		no extra settings are needed?
+#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
+#	1.71.1,		no extra settings are needed?
+#	1.72.0,		set DOCBOOK_XSL_172.
+#	1.73.0-,	set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+	ifdef ASCIIDOC_NO_ROFF
+	# docbook-xsl after 1.72 needs the regular XSL, but will not
+	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
+	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+	endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@;
+	QUIET_XMLTO	= @echo '   ' XMLTO $@;
+	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@;
+	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@;
+	QUIET_DBLATEX	= @echo '   ' DBLATEX $@;
+	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@;
+	QUIET_GEN	= @echo '   ' GEN $@;
+	QUIET_STDERR	= 2> /dev/null
+	QUIET_SUBDIR0	= +@subdir=
+	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			  $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+endif
+endif
+
+all: html man
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: perf.info perfman.info
+
+pdf: user-manual.pdf
+
+install: install-man
+
+install-man: man
+	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-info: info
+	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
+	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+	if test -r $(DESTDIR)$(infodir)/dir; then \
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+	else \
+	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+	fi
+
+install-pdf: pdf
+	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
+	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
+
+install-html: html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+
+-include ../PERF-VERSION-FILE
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+doc.dep : $(wildcard *.txt) build-docdep.perl
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+	mv $@+ $@
+
+-include doc.dep
+
+cmds_txt = cmds-ancillaryinterrogators.txt \
+	cmds-ancillarymanipulators.txt \
+	cmds-mainporcelain.txt \
+	cmds-plumbinginterrogators.txt \
+	cmds-plumbingmanipulators.txt \
+	cmds-synchingrepositories.txt \
+	cmds-synchelpers.txt \
+	cmds-purehelpers.txt \
+	cmds-foreignscminterface.txt
+
+$(cmds_txt): cmd-list.made
+
+cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+	$(QUIET_GEN)$(RM) $@ && \
+	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
+	date >$@
+
+clean:
+	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
+	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
+	$(RM) howto-index.txt howto/*.html doc.dep
+	$(RM) technical/api-*.html technical/api-index.txt
+	$(RM) $(cmds_txt) *.made
+
+$(MAN_HTML): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+%.1 %.5 %.7 : %.xml
+	$(QUIET_XMLTO)$(RM) $@ && \
+	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+%.xml : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+user-manual.html: user-manual.xml
+	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
+
+perf.info: user-manual.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+
+user-manual.texi: user-manual.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+user-manual.pdf: user-manual.xml
+	$(QUIET_DBLATEX)$(RM) $@+ $@ && \
+	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
+	mv $@+ $@
+
+perfman.texi: $(MAN_XML) cat-texi.perl
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+		--to-stdout $(xml) &&) true) > $@++ && \
+	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+perfman.info: perfman.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+	mv $@+ $@
+
+howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
+	mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+	mv $@+ $@
+
+install-webdoc : html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+quick-install: quick-install-man
+
+quick-install-man:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+quick-install-html:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+
+.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/Documentation/perf_counter/Documentation/asciidoc.conf b/Documentation/perf_counter/Documentation/asciidoc.conf
new file mode 100644
index 0000000..356b23a
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/asciidoc.conf
@@ -0,0 +1,91 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>); if section is defined, else just show
+# the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]
diff --git a/Documentation/perf_counter/Documentation/manpage-1.72.xsl b/Documentation/perf_counter/Documentation/manpage-1.72.xsl
new file mode 100644
index 0000000..b4d315c
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-base.xsl b/Documentation/perf_counter/Documentation/manpage-base.xsl
new file mode 100644
index 0000000..a264fa61
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB(',
+			      substring-after(@id,'-'),')',
+			      $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>sp&#10;</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB',
+			      substring-after(@arearefs,'-'),
+			      '. ',$git.docbook.backslash,'fR')"/>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl b/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
new file mode 100644
index 0000000..608eb5d
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fB</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-normal.xsl b/Documentation/perf_counter/Documentation/manpage-normal.xsl
new file mode 100644
index 0000000..a48f5b1
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"	>.</xsl:param>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl b/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
new file mode 100644
index 0000000..a63c763
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index a93d2ec..4d3416f 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -1,5 +1,5 @@
 perf-record(1)
-==========
+==============
 
 NAME
 ----
@@ -53,12 +53,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
-linkperf:git-stat[1]
+linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
index 49efe16..52d3fc6 100644
--- a/Documentation/perf_counter/Documentation/perf-report.txt
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -1,5 +1,5 @@
 perf-report(1)
-==========
+==============
 
 NAME
 ----
@@ -21,12 +21,6 @@ OPTIONS
 --input=::
         Input file name. (default: perf.data)
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
 linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 828c59ff..a67d0e3 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -51,9 +51,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
 EXAMPLES
 --------
 
@@ -74,4 +71,4 @@ $ perf stat sleep 1
 
 SEE ALSO
 --------
-linkperf:git-tops[1]
+linkperf:perf-tops[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index 057333b..15251e4 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -50,12 +50,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
-linkperf:git-stat[1]
+linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/Documentation/perf_counter/Documentation/perf.txt
new file mode 100644
index 0000000..e3d8b38
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf.txt
@@ -0,0 +1,23 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are are a new kernel-based subsystem
+that provide a framework for all things performance analysis. It
+covers hardware level (CPU/PMU, Performance Monitoring Unit) features
+and software features (software counters, tracepoints) as well.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 8f72584..416ab11 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -693,6 +693,21 @@ builtin-revert.o wt-status.o: wt-status.h
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
 TAGS:
 	$(RM) TAGS
 	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
@@ -781,6 +796,31 @@ ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
 
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
 ### Maintainer's dist rules
 
 perf.spec: perf.spec.in
@@ -801,6 +841,26 @@ dist: perf.spec perf-archive$(X) configure
 	@$(RM) -r $(PERF_TARNAME)
 	gzip -f -9 $(PERF_TARNAME).tar
 
+htmldocs = perf-htmldocs-$(PERF_VERSION)
+manpages = perf-manpages-$(PERF_VERSION)
+dist-doc:
+	$(RM) -r .doc-tmp-dir
+	mkdir .doc-tmp-dir
+	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+	gzip -n -9 -f $(htmldocs).tar
+	:
+	$(RM) -r .doc-tmp-dir
+	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+	$(MAKE) -C Documentation DESTDIR=./ \
+		man1dir=../.doc-tmp-dir/man1 \
+		man5dir=../.doc-tmp-dir/man5 \
+		man7dir=../.doc-tmp-dir/man7 \
+		install
+	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+	gzip -n -9 -f $(manpages).tar
+	$(RM) -r .doc-tmp-dir
+
 rpm: dist
 	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
@@ -819,6 +879,7 @@ clean:
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index d2bd317..a3894bf 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -317,7 +317,7 @@ static const char *cmd_to_page(const char *perf_cmd)
 	else if (is_perf_command(perf_cmd))
 		return prepend("perf-", perf_cmd);
 	else
-		return prepend("perf", perf_cmd);
+		return prepend("perf-", perf_cmd);
 }
 
 static void setup_man_path(void)
-- 
cgit v0.10.2


From 62013ab5d5df297a01ae5863b5c26d758ec0af7f Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 30 May 2009 21:50:58 +0900
Subject: nilfs2: fix bh leak in nilfs_cpfile_delete_checkpoints function

The nilfs_cpfile_delete_checkpoints() wrongly skips brelse() for the
header block of checkpoint file in case of errors.  This fixes the
leak bug.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index e90b60d..300f1cd 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -311,7 +311,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 		if (ret < 0) {
 			if (ret != -ENOENT)
-				goto out_sem;
+				goto out_header;
 			/* skip hole */
 			ret = 0;
 			continue;
@@ -344,7 +344,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 					continue;
 				printk(KERN_ERR "%s: cannot delete block\n",
 				       __func__);
-				goto out_sem;
+				goto out_header;
 			}
 		}
 
@@ -361,6 +361,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		nilfs_mdt_mark_dirty(cpfile);
 		kunmap_atomic(kaddr, KM_USER0);
 	}
+
+ out_header:
 	brelse(header_bh);
 
  out_sem:
-- 
cgit v0.10.2


From c339dfdd65b52bfd947ab29d1210314a2f6d622d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Sat, 30 May 2009 20:06:54 +0200
Subject: ide_pci_generic: add quirk for Netcell ATA RAID

We need to explicitly mark words 85-87 as valid ones since
firmware doesn't do it.

This should fix support for LBA48 and FLUSH CACHE [EXT] command
which stopped working after we applied more strict checking of
identify words in:

	commit 942dcd85bf8edf38cdc3745306ca250684d99a61
	("ide: idedisk_supports_lba48() -> ata_id_lba48_enabled()")

and

	commit 4b58f17d7c45a8e5f4acda641bec388398b9c0fa
	("ide: ide_id_has_flush_cache() -> ata_id_flush_enabled()")

Reported-and-tested-by: "Trevor Hemsley" <trevor.hemsley@ntlworld.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c
index 61111fd..39d4e01 100644
--- a/drivers/ide/ide-pci-generic.c
+++ b/drivers/ide/ide-pci-generic.c
@@ -33,6 +33,16 @@ static int ide_generic_all;		/* Set to claim all devices */
 module_param_named(all_generic_ide, ide_generic_all, bool, 0444);
 MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers.");
 
+static void netcell_quirkproc(ide_drive_t *drive)
+{
+	/* mark words 85-87 as valid */
+	drive->id[ATA_ID_CSF_DEFAULT] |= 0x4000;
+}
+
+static const struct ide_port_ops netcell_port_ops = {
+	.quirkproc		= netcell_quirkproc,
+};
+
 #define DECLARE_GENERIC_PCI_DEV(extra_flags) \
 	{ \
 		.name		= DRV_NAME, \
@@ -74,6 +84,7 @@ static const struct ide_port_info generic_chipsets[] __devinitdata = {
 
 	{	/* 6: Revolution */
 		.name		= DRV_NAME,
+		.port_ops	= &netcell_port_ops,
 		.host_flags	= IDE_HFLAG_CLEAR_SIMPLEX |
 				  IDE_HFLAG_TRUST_BIOS_FOR_DMA |
 				  IDE_HFLAG_OFF_BOARD,
-- 
cgit v0.10.2


From d315a0e09f1c8b833cacd5e72f3edea419978138 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 31 May 2009 23:09:22 +1000
Subject: crypto: hash - Fix handling of sg entry that crosses page boundary

A quirk that we've always supported is having an sg entry that's
bigger than a page, or more generally an sg entry that crosses
page boundaries.  Even though it would be better to explicitly have
to sg entries for this, we need to support it for the existing users,
in particular, IPsec.

The new ahash sg walking code did try to handle this, but there was
a bug where we didn't increment the page so kept on walking on the
first page over an dover again.

This patch fixes it.

Tested-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/crypto/ahash.c b/crypto/ahash.c
index b2d1ee3..f347637 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -82,10 +82,11 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
 	if (err)
 		return err;
 
-	walk->offset = 0;
-
-	if (nbytes)
+	if (nbytes) {
+		walk->offset = 0;
+		walk->pg++;
 		return hash_walk_next(walk);
+	}
 
 	if (!walk->total)
 		return 0;
-- 
cgit v0.10.2


From 27b9613b7be39412775d0ab80f57229aa73bb07d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 31 May 2009 22:09:49 +0200
Subject: perf_counter tools: Fix unknown command help text

Arjan reported this error when entering an unknown command to perf:

  $ perf start
  fatal: Uh oh. Your system reports no Git commands at all.

The Git code expects there to be perf-* commands - but since Perf
is a 'pure' utility with no dash commands anymore, this old assumption
of Git does not hold anymore. Remove that error check.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
index edde541..397487f 100644
--- a/Documentation/perf_counter/util/help.c
+++ b/Documentation/perf_counter/util/help.c
@@ -323,9 +323,6 @@ const char *help_unknown_cmd(const char *cmd)
 	qsort(main_cmds.names, main_cmds.cnt,
 	      sizeof(*main_cmds.names), levenshtein_compare);
 
-	if (!main_cmds.cnt)
-		die ("Uh oh. Your system reports no Git commands at all.");
-
 	best_similarity = main_cmds.names[0]->len;
 	n = 1;
 	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-- 
cgit v0.10.2


From 52bb25a620e1925bb53d41d0ed28571b3de98a31 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 1 Jun 2009 06:21:13 +0000
Subject: headers_check fix: linux/auto_fs.h

fix the following 'make headers_check' warnings:

  usr/include/linux/auto_fs.h:17: include of <linux/types.h> is preferred over <asm/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>

diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index 6326585..7b09c83 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -14,13 +14,12 @@
 #ifndef _LINUX_AUTO_FS_H
 #define _LINUX_AUTO_FS_H
 
+#include <linux/types.h>
 #ifdef __KERNEL__
 #include <linux/fs.h>
 #include <linux/limits.h>
-#include <linux/types.h>
 #include <linux/ioctl.h>
 #else
-#include <asm/types.h>
 #include <sys/ioctl.h>
 #endif /* __KERNEL__ */
 
-- 
cgit v0.10.2


From d280cc989ad591607e812cd5c5dfde702b5f191a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 1 Jun 2009 06:23:25 +0000
Subject: headers_check fix: linux/net_dropmon.h

fix the following 'make headers_check' warnings:

  usr/include/linux/net_dropmon.h:7: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>

diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..0e2e100 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -1,6 +1,7 @@
 #ifndef __NET_DROPMON_H
 #define __NET_DROPMON_H
 
+#include <linux/types.h>
 #include <linux/netlink.h>
 
 struct net_dm_drop_point {
-- 
cgit v0.10.2


From eb9b9b56eed280e65a9e194aaeb50a5a75111859 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 11:51:51 +0000
Subject: sh: boot word / mode pin support V2

Add mode pin support for the SuperH architecture V2.

With this patch applied the board code can add their
own function to export the cpu mode pin configuration.
In most cases this will be a constant bitmap, but
boards that allow reading this from a register can
instead read out the pin state from hardware.

The code warns if a pin is tested but no board specific
mode pin function has been provided.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h
index 73d6d16..84dd377 100644
--- a/arch/sh/include/asm/machvec.h
+++ b/arch/sh/include/asm/machvec.h
@@ -48,6 +48,7 @@ struct sh_machine_vector {
 	void (*mv_ioport_unmap)(void __iomem *);
 
 	int (*mv_clk_init)(void);
+	int (*mv_mode_pins)(void);
 };
 
 extern struct sh_machine_vector sh_mv;
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index 005c962..fb67482 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -94,6 +94,10 @@ extern struct pt_regs fake_swapper_regs;
 const char *get_cpu_subtype(struct sh_cpuinfo *c);
 extern const struct seq_operations cpuinfo_op;
 
+/* processor boot mode configuration */
+int generic_mode_pins(void);
+int test_mode_pin(int pin);
+
 #ifdef CONFIG_VSYSCALL
 int vsyscall_init(void);
 #else
diff --git a/arch/sh/kernel/machvec.c b/arch/sh/kernel/machvec.c
index c1ea41e..548f660 100644
--- a/arch/sh/kernel/machvec.c
+++ b/arch/sh/kernel/machvec.c
@@ -129,6 +129,7 @@ void __init sh_mv_setup(void)
 	mv_set(ioport_map);
 	mv_set(ioport_unmap);
 	mv_set(irq_demux);
+	mv_set(mode_pins);
 
 	if (!sh_mv.mv_nr_irqs)
 		sh_mv.mv_nr_irqs = NR_IRQS;
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 00086b2..050131e 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -420,6 +420,18 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
+/* processor boot mode configuration */
+int generic_mode_pins(void)
+{
+	pr_warning("generic_mode_pins(): missing mode pin configuration\n");
+	return 0;
+}
+
+int test_mode_pin(int pin)
+{
+	return sh_mv.mv_mode_pins() & (1 << pin);
+}
+
 static const char *cpu_name[] = {
 	[CPU_SH7201]	= "SH7201",
 	[CPU_SH7203]	= "SH7203",	[CPU_SH7263]	= "SH7263",
-- 
cgit v0.10.2


From 4a44b32969bfc29140b9f21a1ec924c796d6ac43 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 1 Jun 2009 15:56:00 +0900
Subject: sh: sh7785 mode pin definitions

This patch adds sh7785 mode pin definitions. Mode pins and
pin function controller comments are added as well.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7785.h b/arch/sh/include/cpu-sh4/cpu/sh7785.h
index e4006af..89afaa6 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7785.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7785.h
@@ -1,6 +1,30 @@
 #ifndef __ASM_SH7785_H__
 #define __ASM_SH7785_H__
 
+/* Boot Mode Pins, more information in sh7785 manual Rev.1.00, page 1628 */
+enum {
+	MODE_PIN_MODE0,  /* CPG - Initial Pck/Bck Frequency [FRQMR1] */
+	MODE_PIN_MODE1,  /* CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1] */
+	MODE_PIN_MODE2,  /* CPG - Reserved (L: Normal operation) */
+	MODE_PIN_MODE3,  /* CPG - Reserved (L: Normal operation) */
+	MODE_PIN_MODE4,  /* CPG - Initial PLL setting (72x/36x) */
+	MODE_PIN_MODE5,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.8] */
+	MODE_PIN_MODE6,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.9] */
+	MODE_PIN_MODE7,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.3] */
+	MODE_PIN_MODE8,  /* LBSC - Endian Mode (L: Big, H: Little) [BCR.31] */
+	MODE_PIN_MODE9,  /* LBSC - Master/Slave Mode (L: Slave) [BCR.30] */
+	MODE_PIN_MODE10, /* CPG - Clock Input (L: Ext Clk, H: Crystal) */
+	MODE_PIN_MODE11, /* PCI - Pin Mode (LL: PCI host, LH: PCI slave) */
+	MODE_PIN_MODE12, /* PCI - Pin Mode (HL: Local bus, HH: DU) */
+	MODE_PIN_MODE13, /* Boot Address Mode (L: 29-bit, H: 32-bit) */
+	MODE_PIN_MODE14, /* Reserved (H: Normal operation) */
+	MODE_PIN_MPMD,   /* Emulation Mode (L: Emulation mode, H: LSI mode) */
+};
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PA */
 	GPIO_PA7, GPIO_PA6, GPIO_PA5, GPIO_PA4,
-- 
cgit v0.10.2


From 63d12e23235d982d8f55696e09b2ff91e3ba0042 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:00:25 +0000
Subject: sh: sh7785lcr mode pin configuration

This patch adds mode pin support to the sh7785lcr board.

The harware allows the user to control the mode pins using
dip switches S1 and S2, but from the software the pins are
fixed to the factory default since we have no way to reading
out this configuration from software.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index 33b194b..c2894c5 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -24,6 +24,7 @@
 #include <mach/sh7785lcr.h>
 #include <asm/heartbeat.h>
 #include <asm/clock.h>
+#include <cpu/sh7785.h>
 
 /*
  * NOTE: This board has 2 physical memory maps.
@@ -320,6 +321,26 @@ static void __init sh7785lcr_setup(char **cmdline_p)
 	writel(0x000307c2, sm501_reg);
 }
 
+/* Return the board specific boot mode pin configuration */
+static int sh7785lcr_mode_pins(void)
+{
+	int value = 0;
+
+	/* These are the factory default settings of S1 and S2.
+	 * If you change these dip switches then you will need to
+	 * adjust the values below as well.
+	 */
+	value |= 1 << MODE_PIN_MODE4; /* Clock Mode 16 */
+	value |= 1 << MODE_PIN_MODE5; /* 32-bit Area0 bus width */
+	value |= 1 << MODE_PIN_MODE6; /* 32-bit Area0 bus width */
+	value |= 1 << MODE_PIN_MODE7; /* Area 0 SRAM interface [fixed] */
+	value |= 1 << MODE_PIN_MODE8; /* Little Endian */
+	value |= 1 << MODE_PIN_MODE9; /* Master Mode */
+	value |= 1 << MODE_PIN_MODE14; /* No PLL step-up */
+
+	return value;
+}
+
 /*
  * The Machine Vector
  */
@@ -328,5 +349,6 @@ static struct sh_machine_vector mv_sh7785lcr __initmv = {
 	.mv_setup		= sh7785lcr_setup,
 	.mv_clk_init		= sh7785lcr_clk_init,
 	.mv_init_irq		= init_sh7785lcr_IRQ,
+	.mv_mode_pins		= sh7785lcr_mode_pins,
 };
 
-- 
cgit v0.10.2


From 1823f6d5e6b81cca6542ed2e5f30d2556aad0f67 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:06:17 +0000
Subject: sh: sh7785 pll configuration from mode pin

This patch modifies the sh7785 clock code to use the MODE4
value to switch between 72x and 36x PLL multiplication.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index a4a9bcb..705b023 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -16,6 +16,7 @@
 #include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
+#include <cpu/sh7785.h>
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
@@ -80,12 +81,11 @@ static struct clk_ops frqmr_clk_ops = {
 
 static unsigned long pll_recalc(struct clk *clk)
 {
-	/*
-	 * XXX: PLL1 multiplier is locked for the default clock mode,
-	 * when mode pin detection and configuration support is added,
-	 * select the multiplier dynamically.
-	 */
-	return clk->parent->rate * 36;
+	int multiplier;
+
+	multiplier = test_mode_pin(MODE_PIN_MODE4) ? 36 : 72;
+
+	return clk->parent->rate * multiplier;
 }
 
 static struct clk_ops pll_clk_ops = {
-- 
cgit v0.10.2


From 98fbe45bea77c1804eae0e71f27673db1824a2a8 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Fri, 29 May 2009 07:41:23 +0000
Subject: sh: SH7724 has an L2 cache.

Add the CPU_HAS_L2_CACHE flag to SH7724.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 973ff83..7d821ce 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -160,7 +160,7 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.type = CPU_SH7724;
 		boot_cpu_data.icache.ways = 4;
 		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU | CPU_HAS_L2_CACHE;
 		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
-- 
cgit v0.10.2


From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:48:12 +1000
Subject: perf_counter: Provide functions for locking and pinning the context
 for a task

This abstracts out the code for locking the context associated
with a task.  Because the context might get transferred from
one task to another concurrently, we have to check after
locking the context that it is still the right context for the
task and retry if not.  This was open-coded in
find_get_context() and perf_counter_init_task().

This adds a further function for pinning the context for a
task, i.e. marking it so it can't be transferred to another
task.  This adds a 'pin_count' field to struct
perf_counter_context to indicate that a context is pinned,
instead of the previous method of setting the parent_gen count
to all 1s.  Pinning the context with a pin_count is easier to
undo and doesn't require saving the parent_gen value.  This
also adds a perf_unpin_context() to undo the effect of
perf_pin_task_context() and changes perf_counter_init_task to
use it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 519a41b..81ec79c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -543,6 +543,7 @@ struct perf_counter_context {
 	struct perf_counter_context *parent_ctx;
 	u64			parent_gen;
 	u64			generation;
+	int			pin_count;
 	struct rcu_head		rcu_head;
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 79c3f26..da8dfef 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -123,6 +123,69 @@ static void put_ctx(struct perf_counter_context *ctx)
 }
 
 /*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *perf_lock_task_context(
+				struct task_struct *task, unsigned long *flags)
+{
+	struct perf_counter_context *ctx;
+
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_counter_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_counter_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more.
+		 */
+		spin_lock_irqsave(&ctx->lock, *flags);
+		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			goto retry;
+		}
+	}
+	rcu_read_unlock();
+	return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		++ctx->pin_count;
+		get_ctx(ctx);
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+	return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	--ctx->pin_count;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	put_ctx(ctx);
+}
+
+/*
  * Add a counter from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
@@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
 		&& ctx1->parent_gen == ctx2->parent_gen
-		&& ctx1->parent_gen != ~0ull;
+		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
 /*
@@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	struct perf_counter_context *ctx;
 	struct perf_counter_context *parent_ctx;
 	struct task_struct *task;
+	unsigned long flags;
 	int err;
 
 	/*
@@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
- retry_lock:
-	rcu_read_lock();
  retry:
-	ctx = rcu_dereference(task->perf_counter_ctxp);
+	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
-		/*
-		 * If this context is a clone of another, it might
-		 * get swapped for another underneath us by
-		 * perf_counter_task_sched_out, though the
-		 * rcu_read_lock() protects us from any context
-		 * getting freed.  Lock the context and check if it
-		 * got swapped before we could get the lock, and retry
-		 * if so.  If we locked the right context, then it
-		 * can't get swapped on us any more and we can
-		 * unclone it if necessary.
-		 * Once it's not a clone things will be stable.
-		 */
-		spin_lock_irq(&ctx->lock);
-		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
-			spin_unlock_irq(&ctx->lock);
-			goto retry;
-		}
 		parent_ctx = ctx->parent_ctx;
 		if (parent_ctx) {
 			put_ctx(parent_ctx);
@@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		 * this context won't get freed if the task exits.
 		 */
 		get_ctx(ctx);
-		spin_unlock_irq(&ctx->lock);
+		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
-	rcu_read_unlock();
 
 	if (!ctx) {
 		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
@@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			 * the context they set.
 			 */
 			kfree(ctx);
-			goto retry_lock;
+			goto retry;
 		}
 		get_task_struct(task);
 	}
@@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child)
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
-	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child)
 	get_task_struct(child);
 
 	/*
-	 * If the parent's context is a clone, temporarily set its
-	 * parent_gen to an impossible value (all 1s) so it won't get
-	 * swapped under us.  The rcu_read_lock makes sure that
-	 * parent_ctx continues to exist even if it gets swapped to
-	 * another process and then freed while we are trying to get
-	 * its lock.
+	 * If the parent's context is a clone, pin it so it won't get
+	 * swapped under us.
 	 */
-	rcu_read_lock();
- retry:
-	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	parent_ctx = perf_pin_task_context(parent);
+
 	/*
 	 * No need to check if parent_ctx != NULL here; since we saw
 	 * it non-NULL earlier, the only reason for it to become NULL
 	 * is if we exit, and since we're currently in the middle of
 	 * a fork we can't be exiting at the same time.
 	 */
-	spin_lock_irq(&parent_ctx->lock);
-	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
-		spin_unlock_irq(&parent_ctx->lock);
-		goto retry;
-	}
-	cloned_gen = parent_ctx->parent_gen;
-	if (parent_ctx->parent_ctx)
-		parent_ctx->parent_gen = ~0ull;
-	spin_unlock_irq(&parent_ctx->lock);
-	rcu_read_unlock();
 
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
@@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child)
 		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
 		if (cloned_ctx) {
 			child_ctx->parent_ctx = cloned_ctx;
-			child_ctx->parent_gen = cloned_gen;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
 		} else {
 			child_ctx->parent_ctx = parent_ctx;
 			child_ctx->parent_gen = parent_ctx->generation;
@@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
-	/*
-	 * Restore the clone status of the parent.
-	 */
-	if (parent_ctx->parent_ctx) {
-		spin_lock_irq(&parent_ctx->lock);
-		if (parent_ctx->parent_ctx)
-			parent_ctx->parent_gen = cloned_gen;
-		spin_unlock_irq(&parent_ctx->lock);
-	}
+	perf_unpin_context(parent_ctx);
 
 	return ret;
 }
-- 
cgit v0.10.2


From 880ca15adf2392770a68047e7a98e076ff4d21da Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:49:14 +1000
Subject: perf_counter: Allow software counters to count while task is not
 running

This changes perf_swcounter_match() so that per-task software
counters can count events that occur while their associated
task is not running.  This will allow us to use the generic
software counter code for counting task migrations, which can
occur while the task is not scheduled in.

To do this, we have to distinguish between the situations where
the counter is inactive because its task has been scheduled
out, and those where the counter is inactive because it is part
of a group that was not able to go on the PMU.  In the former
case we want the counter to count, but not in the latter case.
If the context is active, we have the latter case.  If the
context is inactive then we need to know whether the counter
was counting when the context was last active, which we can
determine by comparing its ->tstamp_stopped timestamp with the
context's timestamp.

This also folds three checks in perf_swcounter_match, checking
perf_event_raw(), perf_event_type() and perf_event_id()
individually, into a single 64-bit comparison on
counter->hw_event.config, as an optimization.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.34810.259718.955621@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index da8dfef..ff8b463 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2867,20 +2867,56 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 
 }
 
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+	int count;
+
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		return 1;
+
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+		return 0;
+
+	/*
+	 * If the counter is inactive, it could be just because
+	 * its task is scheduled out, or because it's in a group
+	 * which could not go on the PMU.  We want to count in
+	 * the first case but not the second.  If the context is
+	 * currently active then an inactive software counter must
+	 * be the second case.  If it's not currently active then
+	 * we need to know whether the counter was active when the
+	 * context was last active, which we can determine by
+	 * comparing counter->tstamp_stopped with ctx->time.
+	 *
+	 * We are within an RCU read-side critical section,
+	 * which protects the existence of *ctx.
+	 */
+	ctx = counter->ctx;
+	spin_lock_irqsave(&ctx->lock, flags);
+	count = 1;
+	/* Re-check state now we have the lock */
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->ctx->is_active ||
+	    counter->tstamp_stopped < ctx->time)
+		count = 0;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return count;
+}
+
 static int perf_swcounter_match(struct perf_counter *counter,
 				enum perf_event_types type,
 				u32 event, struct pt_regs *regs)
 {
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return 0;
+	u64 event_config;
 
-	if (perf_event_raw(&counter->hw_event))
-		return 0;
+	event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;
 
-	if (perf_event_type(&counter->hw_event) != type)
+	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (perf_event_id(&counter->hw_event) != event)
+	if (counter->hw_event.config != event_config)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
-- 
cgit v0.10.2


From 6881e8bf3d86b23dd124134fae113ebd05fae08a Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:52:29 +0000
Subject: sh: shared mstp32 clock code

Add shared 32-bit module stop bit clock support.

Processor specific code can use SH_CLK_MSTP32()
to initialize module stop bit clocks, and then
use sh_clk_mstp32() for registration.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index aa9480d..f43d3e7 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -117,4 +117,17 @@ long clk_rate_table_round(struct clk *clk,
 			  struct cpufreq_frequency_table *freq_table,
 			  unsigned long rate);
 
+#define SH_CLK_MSTP32(_name, _id, _parent, _enable_reg,	\
+	    _enable_bit, _flags)			\
+{							\
+	.name		= _name,			\
+	.id		= _id,				\
+	.parent		= _parent,			\
+	.enable_reg	= (void __iomem *)_enable_reg,	\
+	.enable_bit	= _enable_bit,			\
+	.flags		= _flags,			\
+}
+
+int sh_clk_mstp32_register(struct clk *clks, int nr);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index b78c237a..72228d2 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -1,7 +1,42 @@
 #include <linux/clk.h>
 #include <linux/compiler.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 
+static int sh_clk_mstp32_enable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
+		     clk->enable_reg);
+	return 0;
+}
+
+static void sh_clk_mstp32_disable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
+		     clk->enable_reg);
+}
+
+static struct clk_ops sh_clk_mstp32_clk_ops = {
+	.enable		= sh_clk_mstp32_enable,
+	.disable	= sh_clk_mstp32_disable,
+	.recalc		= followparent_recalc,
+};
+
+int __init sh_clk_mstp32_register(struct clk *clks, int nr)
+{
+	struct clk *clkp;
+	int ret = 0;
+	int k;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+		clkp->ops = &sh_clk_mstp32_clk_ops;
+		ret |= clk_register(clkp);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
-- 
cgit v0.10.2


From e89d53e60593ee7066e1d36ab5c1ccf2648f5f53 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:01:53 +0000
Subject: sh: hook up shared mstp32 clock code to sh7785

Hook up the shared 32-bit module stop bit code to sh7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 705b023..7d55706 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -193,65 +193,34 @@ static struct clk *clks[] = {
 	&umem_clk,
 };
 
-static int mstpcr_clk_enable(struct clk *clk)
-{
-	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
-		     clk->enable_reg);
-	return 0;
-}
-
-static void mstpcr_clk_disable(struct clk *clk)
-{
-	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
-		     clk->enable_reg);
-}
-
-static struct clk_ops mstpcr_clk_ops = {
-	.enable		= mstpcr_clk_enable,
-	.disable	= mstpcr_clk_disable,
-	.recalc		= followparent_recalc,
-};
-
 #define MSTPCR0		0xffc80030
 #define MSTPCR1		0xffc80034
 
-#define CLK(_name, _id, _parent, _enable_reg,		\
-	    _enable_bit, _flags)			\
-{							\
-	.name		= _name,			\
-	.id		= _id,				\
-	.parent		= _parent,			\
-	.enable_reg	= (void __iomem *)_enable_reg,	\
-	.enable_bit	= _enable_bit,			\
-	.flags		= _flags,			\
-	.ops		= &mstpcr_clk_ops,		\
-}
-
-static struct clk mstpcr_clks[] = {
+static struct clk mstp_clks[] = {
 	/* MSTPCR0 */
-	CLK("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
-	CLK("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
-	CLK("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
-	CLK("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
-	CLK("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
-	CLK("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
-	CLK("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
-	CLK("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
-	CLK("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
-	CLK("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
-	CLK("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
-	CLK("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
-	CLK("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
-	CLK("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
-	CLK("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
-	CLK("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+	SH_CLK_MSTP32("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
+	SH_CLK_MSTP32("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
+	SH_CLK_MSTP32("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
+	SH_CLK_MSTP32("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
+	SH_CLK_MSTP32("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
+	SH_CLK_MSTP32("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
+	SH_CLK_MSTP32("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
+	SH_CLK_MSTP32("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
+	SH_CLK_MSTP32("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
+	SH_CLK_MSTP32("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
+	SH_CLK_MSTP32("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
+	SH_CLK_MSTP32("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
+	SH_CLK_MSTP32("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
+	SH_CLK_MSTP32("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
+	SH_CLK_MSTP32("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
+	SH_CLK_MSTP32("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
 
 	/* MSTPCR1 */
-	CLK("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
-	CLK("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
-	CLK("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
-	CLK("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
-	CLK("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
+	SH_CLK_MSTP32("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
+	SH_CLK_MSTP32("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
+	SH_CLK_MSTP32("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
+	SH_CLK_MSTP32("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
+	SH_CLK_MSTP32("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
 };
 
 int __init arch_clk_init(void)
@@ -260,8 +229,9 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(mstpcr_clks); i++)
-		ret |= clk_register(&mstpcr_clks[i]);
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
 	return ret;
 }
-- 
cgit v0.10.2


From a1153e27eec25e9963f5843ba8932952bd9847ac Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:11:31 +0000
Subject: sh: shared div4 clock code

Add shared code for 4-bit divisor clocks.

Processor specific code can use SH_CLK_DIV4()
to initialize div4 clocks, and then use
sh_clk_div4_register() for registration.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index f43d3e7..7435e40 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/seq_file.h>
+#include <linux/cpufreq.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 
@@ -41,6 +42,7 @@ struct clk {
 	unsigned long		arch_flags;
 	void			*priv;
 	struct dentry		*dentry;
+	struct cpufreq_frequency_table *freq_table;
 };
 
 struct clk_lookup {
@@ -130,4 +132,17 @@ long clk_rate_table_round(struct clk *clk,
 
 int sh_clk_mstp32_register(struct clk *clks, int nr);
 
+#define SH_CLK_DIV4(_name, _parent, _reg, _shift, _div_bitmap, _flags)	\
+{									\
+	.name = _name,							\
+	.parent = _parent,						\
+	.enable_reg = (void __iomem *)_reg,				\
+	.enable_bit = _shift,						\
+	.arch_flags = _div_bitmap,					\
+	.flags = _flags,						\
+}
+
+int sh_clk_div4_register(struct clk *clks, int nr,
+			 struct clk_div_mult_table *table);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index 72228d2..88fc30d 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -1,5 +1,6 @@
 #include <linux/clk.h>
 #include <linux/compiler.h>
+#include <linux/bootmem.h>
 #include <linux/io.h>
 #include <asm/clock.h>
 
@@ -37,6 +38,60 @@ int __init sh_clk_mstp32_register(struct clk *clks, int nr)
 	return ret;
 }
 
+static unsigned long sh_clk_div4_recalc(struct clk *clk)
+{
+	struct clk_div_mult_table *table = clk->priv;
+	unsigned int idx;
+
+	clk_rate_table_build(clk, clk->freq_table, table->nr_divisors,
+			     table, &clk->arch_flags);
+
+	idx = (__raw_readl(clk->enable_reg) >> clk->enable_bit) & 0x000f;
+
+	return clk->freq_table[idx].frequency;
+}
+
+static long sh_clk_div4_round_rate(struct clk *clk, unsigned long rate)
+{
+	return clk_rate_table_round(clk, clk->freq_table, rate);
+}
+
+static struct clk_ops sh_clk_div4_clk_ops = {
+	.recalc		= sh_clk_div4_recalc,
+	.round_rate	= sh_clk_div4_round_rate,
+};
+
+int __init sh_clk_div4_register(struct clk *clks, int nr,
+				struct clk_div_mult_table *table)
+{
+	struct clk *clkp;
+	void *freq_table;
+	int nr_divs = table->nr_divisors;
+	int freq_table_size = sizeof(struct cpufreq_frequency_table);
+	int ret = 0;
+	int k;
+
+	k = nr_divs + 1;
+	freq_table = alloc_bootmem(freq_table_size * nr * (nr_divs + 1));
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+
+		clkp->ops = &sh_clk_div4_clk_ops;
+		clkp->id = -1;
+		clkp->priv = table;
+
+		clkp->freq_table = freq_table + (k * freq_table_size);
+		clkp->freq_table[nr_divs].frequency = CPUFREQ_TABLE_END;
+
+		ret = clk_register(clkp);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
-- 
cgit v0.10.2


From 43909a938063f9b6f98c05a2e28b072dd972ece7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:13:56 +0000
Subject: sh: hook up shared div4 clock code to sh7785

Hook up the shared 4-bit divisor clock code to sh7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 7d55706..dae20ac 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -18,65 +18,14 @@
 #include <asm/freq.h>
 #include <cpu/sh7785.h>
 
-static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
-			       24, 32, 36, 48 };
-
-static struct clk_div_mult_table cpg_div = {
-	.divisors = div2,
-	.nr_divisors = ARRAY_SIZE(div2),
-};
-
-struct clk_priv {
-	unsigned int			shift;
-
-	/* allowable divisor bitmap */
-	unsigned long			div_bitmap;
-
-	/* Supportable frequencies + termination entry */
-	struct cpufreq_frequency_table	freq_table[ARRAY_SIZE(div2)+1];
-};
-
-#define FRQMR_CLK_DATA(_name, _shift, _div_bitmap)	\
-static struct clk_priv _name##_data = {			\
-	.shift		= _shift,			\
-	.div_bitmap	= _div_bitmap,			\
-							\
-	.freq_table[0]	= {				\
-		.index = 0,				\
-		.frequency = CPUFREQ_TABLE_END,		\
-	},						\
-}
-
-FRQMR_CLK_DATA(pfc,  0, 0x0f80);
-FRQMR_CLK_DATA(s3fc, 4, 0x0ff0);
-FRQMR_CLK_DATA(s2fc, 8, 0x0030);
-FRQMR_CLK_DATA(mfc, 12, 0x000c);
-FRQMR_CLK_DATA(bfc, 16, 0x0fe0);
-FRQMR_CLK_DATA(sfc, 20, 0x000c);
-FRQMR_CLK_DATA(ufc, 24, 0x000c);
-FRQMR_CLK_DATA(ifc, 28, 0x000e);
-
-static unsigned long frqmr_recalc(struct clk *clk)
-{
-	struct clk_priv *data = clk->priv;
-	unsigned int idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
-
-	clk_rate_table_build(clk, data->freq_table, ARRAY_SIZE(div2),
-			     &cpg_div, &data->div_bitmap);
-	
-	return data->freq_table[idx].frequency;
-}
-
-static long frqmr_round_rate(struct clk *clk, unsigned long rate)
-{
-	struct clk_priv *data = clk->priv;
-
-	return clk_rate_table_round(clk, data->freq_table, rate);
-}
-
-static struct clk_ops frqmr_clk_ops = {
-	.recalc			= frqmr_recalc,
-	.round_rate		= frqmr_round_rate,
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+static struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
 static unsigned long pll_recalc(struct clk *clk)
@@ -92,16 +41,6 @@ static struct clk_ops pll_clk_ops = {
 	.recalc		= pll_recalc,
 };
 
-/*
- * Default rate for the root input clock, reset this with clk_set_rate()
- * from the platform code.
- */
-static struct clk extal_clk = {
-	.name		= "extal",
-	.id		= -1,
-	.rate		= 33333333,
-};
-
 static struct clk pll_clk = {
 	.name		= "pll_clk",
 	.id		= -1,
@@ -110,87 +49,34 @@ static struct clk pll_clk = {
 	.flags		= CLK_ENABLE_ON_INIT,
 };
 
-static struct clk cpu_clk = {
-	.name		= "cpu_clk",		/* Ick */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &ifc_data,
-};
-
-static struct clk shyway_clk = {
-	.name		= "shyway_clk",		/* SHck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &sfc_data,
-};
-
-static struct clk peripheral_clk = {
-	.name		= "peripheral_clk",	/* Pck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &pfc_data,
-};
-
-static struct clk ddr_clk = {
-	.name		= "ddr_clk",		/* DDRck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &mfc_data,
+static struct clk *clks[] = {
+	&extal_clk,
+	&pll_clk,
 };
 
-static struct clk bus_clk = {
-	.name		= "bus_clk",		/* Bck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &bfc_data,
-};
+static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
+			       24, 32, 36, 48 };
 
-static struct clk ga_clk = {
-	.name		= "ga_clk",		/* GAck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.priv		= &s2fc_data,
+static struct clk_div_mult_table div4_table = {
+	.divisors = div2,
+	.nr_divisors = ARRAY_SIZE(div2),
 };
 
-static struct clk du_clk = {
-	.name		= "du_clk",		/* DUck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.priv		= &s3fc_data,
-};
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_DDR, DIV4_GA,
+	DIV4_DU, DIV4_P, DIV4_NR };
 
-static struct clk umem_clk = {
-	.name		= "umem_clk",		/* uck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &ufc_data,
-};
+#define DIV4(_str, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, FRQMR1, _bit, _mask, _flags)
 
-static struct clk *clks[] = {
-	&extal_clk,
-	&pll_clk,
-	&cpu_clk,
-	&shyway_clk,
-	&peripheral_clk,
-	&ddr_clk,
-	&bus_clk,
-	&ga_clk,
-	&du_clk,
-	&umem_clk,
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_P] = DIV4("peripheral_clk", 0, 0x0f80, 0),
+	[DIV4_DU] = DIV4("du_clk", 4, 0x0ff0, 0),
+	[DIV4_GA] = DIV4("ga_clk", 8, 0x0030, 0),
+	[DIV4_DDR] = DIV4("ddr_clk", 12, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", 16, 0x0fe0, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", 20, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", 24, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_I] = DIV4("cpu_clk", 28, 0x000e, CLK_ENABLE_ON_INIT),
 };
 
 #define MSTPCR0		0xffc80030
@@ -198,22 +84,22 @@ static struct clk *clks[] = {
 
 static struct clk mstp_clks[] = {
 	/* MSTPCR0 */
-	SH_CLK_MSTP32("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
-	SH_CLK_MSTP32("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
-	SH_CLK_MSTP32("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
-	SH_CLK_MSTP32("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
-	SH_CLK_MSTP32("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
-	SH_CLK_MSTP32("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
-	SH_CLK_MSTP32("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
-	SH_CLK_MSTP32("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
-	SH_CLK_MSTP32("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
-	SH_CLK_MSTP32("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
-	SH_CLK_MSTP32("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
-	SH_CLK_MSTP32("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
-	SH_CLK_MSTP32("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
-	SH_CLK_MSTP32("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
-	SH_CLK_MSTP32("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
-	SH_CLK_MSTP32("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+	SH_CLK_MSTP32("scif_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0),
+	SH_CLK_MSTP32("scif_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0),
+	SH_CLK_MSTP32("scif_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0),
+	SH_CLK_MSTP32("scif_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0),
+	SH_CLK_MSTP32("scif_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0),
+	SH_CLK_MSTP32("scif_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0),
+	SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	SH_CLK_MSTP32("ssi_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	SH_CLK_MSTP32("hac_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	SH_CLK_MSTP32("hac_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 16, 0),
+	SH_CLK_MSTP32("mmcif_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 13, 0),
+	SH_CLK_MSTP32("flctl_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 12, 0),
+	SH_CLK_MSTP32("tmu345_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 9, 0),
+	SH_CLK_MSTP32("tmu012_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 8, 0),
+	SH_CLK_MSTP32("siof_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 3, 0),
+	SH_CLK_MSTP32("hspi_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 2, 0),
 
 	/* MSTPCR1 */
 	SH_CLK_MSTP32("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
@@ -231,6 +117,9 @@ int __init arch_clk_init(void)
 		ret |= clk_register(clks[i]);
 
 	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
+					   &div4_table);
+	if (!ret)
 		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
 	return ret;
-- 
cgit v0.10.2


From 4371ee353c3fc41aad9458b8e8e627eb508bc9a3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Mon, 1 Jun 2009 02:43:17 -0700
Subject: MAINTAINERS: take maintainership of the cpmac Ethernet driver

This patch adds me as the maintainer of the CPMAC (AR7)
Ethernet driver.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/MAINTAINERS b/MAINTAINERS
index 2b349ba..111fc16 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1532,6 +1532,13 @@ W:	http://www.fi.muni.cz/~kas/cosa/
 S:	Maintained
 F:	drivers/net/wan/cosa*
 
+CPMAC ETHERNET DRIVER
+P:	Florian Fainelli
+M:	florian@openwrt.org
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/cpmac.c
+
 CPU FREQUENCY DRIVERS
 P:	Dave Jones
 M:	davej@redhat.com
-- 
cgit v0.10.2


From cf9f6e21c155d5add733b969c695837ead79eeab Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Mon, 1 Jun 2009 03:12:04 -0700
Subject: 3c509: Add missing EISA IDs

Several EISA device IDs for 3c509 family network cards are missing from
the driver, making the cards unusable in their EISA mode.  Here's a fix to
add them based on the EISA configuration files distributed by 3Com and our
eisa.ids database.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c
index fbb3719..682aad8 100644
--- a/drivers/net/3c509.c
+++ b/drivers/net/3c509.c
@@ -480,9 +480,13 @@ static int pnp_registered;
 
 #ifdef CONFIG_EISA
 static struct eisa_device_id el3_eisa_ids[] = {
+		{ "TCM5090" },
+		{ "TCM5091" },
 		{ "TCM5092" },
 		{ "TCM5093" },
+		{ "TCM5094" },
 		{ "TCM5095" },
+		{ "TCM5098" },
 		{ "" }
 };
 MODULE_DEVICE_TABLE(eisa, el3_eisa_ids);
-- 
cgit v0.10.2


From 7863d3f7aeae05099a38693a0a7eb7bdc7b2ab05 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 1 Jun 2009 19:38:41 +0900
Subject: sh: Tidy up the optional L2 probing, wire it up for SH7786.

This tidies up the L2 probing, as it may or may not be implemented on a
CPU, regardless of whether it is supported. This converts the cvr
validity checks from BUG_ON()'s to simply clearing the CPU_HAS_L2_CACHE
flag and moving on with life.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 7d821ce..28a2f0d 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -134,7 +134,7 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.icache.ways = 4;
 		boot_cpu_data.dcache.ways = 4;
 		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-			CPU_HAS_LLSC | CPU_HAS_PTEAEX;
+			CPU_HAS_LLSC | CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
 		break;
 	case 0x3008:
 		boot_cpu_data.icache.ways = 4;
@@ -228,43 +228,48 @@ int __init detect_cpu_and_cache_system(void)
 	}
 
 	/*
-	 * Setup the L2 cache desc
-	 *
 	 * SH-4A's have an optional PIPT L2.
 	 */
 	if (boot_cpu_data.flags & CPU_HAS_L2_CACHE) {
-		/* Bug if we can't decode the L2 info */
-		BUG_ON(!(cvr & 0xf));
-
-		/* Silicon and specifications have clearly never met.. */
-		cvr ^= 0xf;
-
 		/*
-		 * Size calculation is much more sensible
-		 * than it is for the L1.
-		 *
-		 * Sizes are 128KB, 258KB, 512KB, and 1MB.
+		 * Verify that it really has something hooked up, this
+		 * is the safety net for CPUs that have optional L2
+		 * support yet do not implement it.
 		 */
-		size = (cvr & 0xf) << 17;
+		if ((cvr & 0xf) == 0)
+			boot_cpu_data.flags &= ~CPU_HAS_L2_CACHE;
+		else {
+			/*
+			 * Silicon and specifications have clearly never
+			 * met..
+			 */
+			cvr ^= 0xf;
 
-		BUG_ON(!size);
+			/*
+			 * Size calculation is much more sensible
+			 * than it is for the L1.
+			 *
+			 * Sizes are 128KB, 258KB, 512KB, and 1MB.
+			 */
+			size = (cvr & 0xf) << 17;
 
-		boot_cpu_data.scache.way_incr		= (1 << 16);
-		boot_cpu_data.scache.entry_shift	= 5;
-		boot_cpu_data.scache.ways		= 4;
-		boot_cpu_data.scache.linesz		= L1_CACHE_BYTES;
+			boot_cpu_data.scache.way_incr		= (1 << 16);
+			boot_cpu_data.scache.entry_shift	= 5;
+			boot_cpu_data.scache.ways		= 4;
+			boot_cpu_data.scache.linesz		= L1_CACHE_BYTES;
 
-		boot_cpu_data.scache.entry_mask	=
-			(boot_cpu_data.scache.way_incr -
-			 boot_cpu_data.scache.linesz);
+			boot_cpu_data.scache.entry_mask	=
+				(boot_cpu_data.scache.way_incr -
+				 boot_cpu_data.scache.linesz);
 
-		boot_cpu_data.scache.sets	= size /
-			(boot_cpu_data.scache.linesz *
-			 boot_cpu_data.scache.ways);
+			boot_cpu_data.scache.sets	= size /
+				(boot_cpu_data.scache.linesz *
+				 boot_cpu_data.scache.ways);
 
-		boot_cpu_data.scache.way_size	=
-			(boot_cpu_data.scache.sets *
-			 boot_cpu_data.scache.linesz);
+			boot_cpu_data.scache.way_size	=
+				(boot_cpu_data.scache.sets *
+				 boot_cpu_data.scache.linesz);
+		}
 	}
 
 	return 0;
-- 
cgit v0.10.2


From 0bf8513ed0df64b38edce63411d4b7b368464f47 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 1 Jun 2009 19:50:08 +0900
Subject: sh: Tidy up SH-4A boot_cpu_data.flags probing.

This tidies up the boot_cpu_data.flags probing on SH-4A. All of them have
a few things in common, which we can blindly set, rather than having each
subtype have to set the same flags. We can also make assumptions about
cache ways and the validity of PTEA, so this also kills off CPU_HAS_PTEA
as a config option. There was also a bug in the FPU probing, which is now
tidied up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c815975..e9392aa 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -174,7 +174,6 @@ config CPU_SH4
 	bool
 	select CPU_HAS_INTEVT
 	select CPU_HAS_SR_RB
-	select CPU_HAS_PTEA if !CPU_SH4A || CPU_SHX2
 	select CPU_HAS_FPU if !CPU_SH4AL_DSP
 	select SYS_SUPPORTS_TMU
 
diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu
index 9eb1712..cd6e3ea 100644
--- a/arch/sh/Kconfig.cpu
+++ b/arch/sh/Kconfig.cpu
@@ -96,9 +96,6 @@ config CPU_HAS_SR_RB
 	  See <file:Documentation/sh/register-banks.txt> for further
 	  information on SR.RB and register banking in the kernel in general.
 
-config CPU_HAS_PTEA
-	bool
-
 config CPU_HAS_PTEAEX
 	bool
 
diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 28a2f0d..6c78d0a 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -60,12 +60,18 @@ int __init detect_cpu_and_cache_system(void)
 		if ((cvr & 0x10000000) == 0)
 			boot_cpu_data.flags |= CPU_HAS_DSP;
 
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_PERF_COUNTER;
 		boot_cpu_data.cut_major = pvr & 0x7f;
+
+		boot_cpu_data.icache.ways = 4;
+		boot_cpu_data.dcache.ways = 4;
+	} else {
+		/* And some SH-4 defaults.. */
+		boot_cpu_data.flags |= CPU_HAS_PTEA;
 	}
 
 	/* FPU detection works for everyone */
-	if ((cvr & 0x20000000) == 1)
+	if ((cvr & 0x20000000))
 		boot_cpu_data.flags |= CPU_HAS_FPU;
 
 	/* Mask off the upper chip ID */
@@ -78,25 +84,20 @@ int __init detect_cpu_and_cache_system(void)
 	switch (pvr) {
 	case 0x205:
 		boot_cpu_data.type = CPU_SH7750;
-		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER;
+		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG |
+				       CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x206:
 		boot_cpu_data.type = CPU_SH7750S;
-		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER;
+		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG |
+				       CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x1100:
 		boot_cpu_data.type = CPU_SH7751;
-		boot_cpu_data.flags |= CPU_HAS_FPU;
 		break;
 	case 0x2001:
 	case 0x2004:
 		boot_cpu_data.type = CPU_SH7770;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_LLSC;
 		break;
 	case 0x2006:
 	case 0x200A:
@@ -107,45 +108,26 @@ int __init detect_cpu_and_cache_system(void)
 		else
 			boot_cpu_data.type = CPU_SH7780;
 
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-				   CPU_HAS_LLSC;
 		break;
 	case 0x3000:
 	case 0x3003:
 	case 0x3009:
 		boot_cpu_data.type = CPU_SH7343;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
 		break;
 	case 0x3004:
 	case 0x3007:
 		boot_cpu_data.type = CPU_SH7785;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-					  CPU_HAS_LLSC;
 		break;
 	case 0x4004:
 		boot_cpu_data.type = CPU_SH7786;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-			CPU_HAS_LLSC | CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
+		boot_cpu_data.flags |= CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
 		break;
 	case 0x3008:
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
-
 		switch (prr) {
 		case 0x50:
 		case 0x51:
 			boot_cpu_data.type = CPU_SH7723;
-			boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_L2_CACHE;
+			boot_cpu_data.flags |= CPU_HAS_L2_CACHE;
 			break;
 		case 0x70:
 			boot_cpu_data.type = CPU_SH7366;
@@ -158,17 +140,11 @@ int __init detect_cpu_and_cache_system(void)
 		break;
 	case 0x300b:
 		boot_cpu_data.type = CPU_SH7724;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU | CPU_HAS_L2_CACHE;
+		boot_cpu_data.flags |= CPU_HAS_L2_CACHE;
 		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
 		boot_cpu_data.type = CPU_SHX3;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-					  CPU_HAS_LLSC;
 		break;
 	case 0x700:
 		boot_cpu_data.type = CPU_SH4_501;
@@ -179,7 +155,6 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.type = CPU_SH4_202;
 		boot_cpu_data.icache.ways = 2;
 		boot_cpu_data.dcache.ways = 2;
-		boot_cpu_data.flags |= CPU_HAS_FPU;
 		break;
 	case 0x500 ... 0x501:
 		switch (prr) {
@@ -197,18 +172,12 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.icache.ways = 2;
 		boot_cpu_data.dcache.ways = 2;
 
-		boot_cpu_data.flags |= CPU_HAS_FPU;
-
 		break;
 	default:
 		boot_cpu_data.type = CPU_SH_NONE;
 		break;
 	}
 
-#ifdef CONFIG_CPU_HAS_PTEA
-	boot_cpu_data.flags |= CPU_HAS_PTEA;
-#endif
-
 	/*
 	 * On anything that's not a direct-mapped cache, look to the CVR
 	 * for I/D-cache specifics.
-- 
cgit v0.10.2


From 39d8bbedb9571a89d638f5b05358f26ab503d7a6 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 1 Jun 2009 13:46:49 +0200
Subject: hwmon: (lm78) Add missing __devexit_p()

The remove function uses __devexit, so the .remove assignment needs
__devexit_p() to fix a build error with hotplug disabled.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Jean Delvare <khali@linux-fr.org>

diff --git a/drivers/hwmon/lm78.c b/drivers/hwmon/lm78.c
index b5e3b28..a1787fd 100644
--- a/drivers/hwmon/lm78.c
+++ b/drivers/hwmon/lm78.c
@@ -182,7 +182,7 @@ static struct platform_driver lm78_isa_driver = {
 		.name	= "lm78",
 	},
 	.probe		= lm78_isa_probe,
-	.remove		= lm78_isa_remove,
+	.remove		= __devexit_p(lm78_isa_remove),
 };
 
 
-- 
cgit v0.10.2


From d54d462472a16fc07adb53a2fcd6c0c2a9a8dd1d Mon Sep 17 00:00:00 2001
From: Christian Engelmayer <Christian.Engelmayer@frequentis.com>
Date: Mon, 1 Jun 2009 13:46:50 +0200
Subject: hwmon: Update documentation on fan_max

Add fan_max description.

Add fan limit alarm 'max_alarm' to the alarm section.

Signed-off-by: Christian Engelmayer <christian.engelmayer@frequentis.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>

diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 2f10ce6..004ee16 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -150,6 +150,11 @@ fan[1-*]_min	Fan minimum value
 		Unit: revolution/min (RPM)
 		RW
 
+fan[1-*]_max	Fan maximum value
+		Unit: revolution/min (RPM)
+		Only rarely supported by the hardware.
+		RW
+
 fan[1-*]_input	Fan input value.
 		Unit: revolution/min (RPM)
 		RO
@@ -390,6 +395,7 @@ OR
 in[0-*]_min_alarm
 in[0-*]_max_alarm
 fan[1-*]_min_alarm
+fan[1-*]_max_alarm
 temp[1-*]_min_alarm
 temp[1-*]_max_alarm
 temp[1-*]_crit_alarm
-- 
cgit v0.10.2


From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 1 Jun 2009 10:13:37 +0200
Subject: perf_counter: Tidy up style details

 - whitespace fixlets
 - make local variable definitions more consistent

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81ec79c..0e57d8c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -562,7 +562,7 @@ struct perf_cpu_context {
 	 *
 	 * task, softirq, irq, nmi context
 	 */
-	int			recursion[4];
+	int				recursion[4];
 };
 
 #ifdef CONFIG_PERF_COUNTERS
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff8b463..df319c4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -16,8 +16,9 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sysfs.h>
-#include <linux/ptrace.h>
+#include <linux/dcache.h>
 #include <linux/percpu.h>
+#include <linux/ptrace.h>
 #include <linux/vmstat.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
@@ -26,7 +27,6 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
-#include <linux/dcache.h>
 
 #include <asm/irq_regs.h>
 
@@ -65,7 +65,9 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
-int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu)
 {
@@ -127,8 +129,8 @@ static void put_ctx(struct perf_counter_context *ctx)
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
-static struct perf_counter_context *perf_lock_task_context(
-				struct task_struct *task, unsigned long *flags)
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 {
 	struct perf_counter_context *ctx;
 
@@ -1330,9 +1332,9 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
 	struct perf_counter_context *parent_ctx;
+	struct perf_counter_context *ctx;
+	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
 	unsigned long flags;
 	int err;
@@ -1664,8 +1666,8 @@ int perf_counter_task_disable(void)
  */
 void perf_counter_update_userpage(struct perf_counter *counter)
 {
-	struct perf_mmap_data *data;
 	struct perf_counter_mmap_page *userpg;
+	struct perf_mmap_data *data;
 
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
@@ -1769,10 +1771,11 @@ fail:
 
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
-	struct perf_mmap_data *data = container_of(rcu_head,
-			struct perf_mmap_data, rcu_head);
+	struct perf_mmap_data *data;
 	int i;
 
+	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
 	free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
 		free_page((unsigned long)data->data_pages[i]);
@@ -1801,8 +1804,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_counter *counter = vma->vm_file->private_data;
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
-				      &counter->mmap_mutex)) {
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
 
 		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
@@ -1821,11 +1823,11 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
+	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
+	unsigned long locked, lock_limit;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	unsigned long user_locked, user_lock_limit;
-	unsigned long locked, lock_limit;
 	long user_extra, extra;
 	int ret = 0;
 
@@ -1900,8 +1902,8 @@ unlock:
 
 static int perf_fasync(int fd, struct file *filp, int on)
 {
-	struct perf_counter *counter = filp->private_data;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct perf_counter *counter = filp->private_data;
 	int retval;
 
 	mutex_lock(&inode->i_mutex);
@@ -2412,8 +2414,8 @@ static void perf_counter_output(struct perf_counter *counter,
  */
 
 struct perf_comm_event {
-	struct task_struct 	*task;
-	char 			*comm;
+	struct task_struct	*task;
+	char			*comm;
 	int			comm_size;
 
 	struct {
@@ -2932,6 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 			       int nmi, struct pt_regs *regs, u64 addr)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
+
 	if (counter->hw.irq_period && !neg)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
@@ -3526,7 +3529,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Make the child state follow the state of the parent counter,
 	 * not its hw_event.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_counter_{en,dis}able_family.
+	 * so we won't race with perf_counter_{en, dis}able_family.
 	 */
 	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
cgit v0.10.2


From 229c4eedcedcdadf70411120ba34bc37554a74bd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 28 May 2009 16:28:53 +0200
Subject: perf_counter tools: Guard against record damaging existing files

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 23d1224..96bfb7c 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -340,7 +340,7 @@ static int __cmd_record(int argc, const char **argv)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
+	output = open(output_name, O_CREAT|O_EXCL|O_RDWR, S_IRWXU);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
-- 
cgit v0.10.2


From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 15:15:51 +0800
Subject: ftrace, workqueuetrace: make workqueue tracepoints use TRACE_EVENT
 macro

v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format
    introduced by Steven Rostedt: consolidate trace and trace_event headers
v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap
    the work addr
v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro

TRACE_EVENT is a more generic way to define tracepoints.
Doing so adds these new capabilities to the tracepoints:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related
tracepoints.

[ Impact: expand workqueue tracer to events tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
new file mode 100644
index 0000000..035f1bf
--- /dev/null
+++ b/include/trace/events/workqueue.h
@@ -0,0 +1,100 @@
+#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WORKQUEUE_H
+
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM workqueue
+
+TRACE_EVENT(workqueue_insertion,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+TRACE_EVENT(workqueue_execution,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+/* Trace the creation of one workqueue thread on a cpu */
+TRACE_EVENT(workqueue_creation,
+
+	TP_PROTO(struct task_struct *wq_thread, int cpu),
+
+	TP_ARGS(wq_thread, cpu),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+		__field(int,	cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->cpu		= cpu;
+	),
+
+	TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm,
+		__entry->thread_pid, __entry->cpu)
+);
+
+TRACE_EVENT(workqueue_destruction,
+
+	TP_PROTO(struct task_struct *wq_thread),
+
+	TP_ARGS(wq_thread),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+	),
+
+	TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid)
+);
+
+#endif /* _TRACE_WORKQUEUE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
deleted file mode 100644
index 7626523..0000000
--- a/include/trace/workqueue.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __TRACE_WORKQUEUE_H
-#define __TRACE_WORKQUEUE_H
-
-#include <linux/tracepoint.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-DECLARE_TRACE(workqueue_insertion,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-DECLARE_TRACE(workqueue_execution,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-/* Trace the creation of one workqueue thread on a cpu */
-DECLARE_TRACE(workqueue_creation,
-	   TP_PROTO(struct task_struct *wq_thread, int cpu),
-	   TP_ARGS(wq_thread, cpu));
-
-DECLARE_TRACE(workqueue_destruction,
-	   TP_PROTO(struct task_struct *wq_thread),
-	   TP_ARGS(wq_thread));
-
-#endif /* __TRACE_WORKQUEUE_H */
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 984b917..cfe56d3 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
  */
 
 
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
 #include "trace_stat.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a..0668795 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
-#include <trace/workqueue.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
-DEFINE_TRACE(workqueue_insertion);
-
 static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head)
 {
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
-DEFINE_TRACE(workqueue_execution);
-
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 	return cwq;
 }
 
-DEFINE_TRACE(workqueue_creation);
-
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-DEFINE_TRACE(workqueue_destruction);
-
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
-- 
cgit v0.10.2


From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:58:26 +0800
Subject: trace_workqueue: use list_for_each_entry() instead of
 list_for_each_entry_safe()

No need to use list_for_each_entry_safe() in iteration without deleting
any node, we can use list_for_each_entry() instead.

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cfe56d3..128b64b 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			atomic_inc(&node->inserted);
 			goto found;
@@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			node->executed++;
 			goto found;
-- 
cgit v0.10.2


From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:59:36 +0800
Subject: trace_workqueue: remove cpu_workqueue_stats->first_entry

cpu_workqueue_stats->first_entry is useless because we can retrieve the
header of a cpu workqueue using:
if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next)

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 128b64b..890974a 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -16,8 +16,6 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
 	struct list_head            list;
-/* Useful to know if we print the cpu headers */
-	bool		            first_entry;
 	int		            cpu;
 	pid_t			    pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 	cws->pid = wq_thread->pid;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (list_empty(&workqueue_cpu_stat(cpu)->list))
-		cws->first_entry = true;
 	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
-- 
cgit v0.10.2


From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 15:02:17 +0800
Subject: trace_workqueue: remove blank line between each cpu

The blankline between each cpu's workqueue stat is not necessary, because
the cpu number is enough to part them by eye.
Old style also caused a blankline below headline, and made code complex
by using lock, disableirq and get cpu var.

Old style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush

   1  35365      35365       events/1
   ...

New style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush
   1  35365      35365       events/1
   ...

[ Impact: provide more readable code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 890974a..97fcea4 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 static int workqueue_stat_show(struct seq_file *s, void *p)
 {
 	struct cpu_workqueue_stats *cws = p;
-	unsigned long flags;
-	int cpu = cws->cpu;
 	struct pid *pid;
 	struct task_struct *tsk;
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
-- 
cgit v0.10.2


From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 05:58:49 +0200
Subject: tracing/stat: replace trace_stat_session by stat_session

The "trace" prefix in struct trace_stat_session type is annoying while
reading the trace_stat.c file. It makes the lines longer, and
is not that much useful to explain the sense of this type.

Just keep "struct stat_session" for this type.

[ Impact: make the code a bit more readable ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index fdde3a4..3b6816b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -22,7 +22,7 @@ struct trace_stat_list {
 };
 
 /* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
 	struct list_head	stat_list;
@@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 static struct dentry		*stat_dir;
 
 
-static void reset_stat_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
 {
 	struct trace_stat_list *node, *next;
 
@@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session)
 	INIT_LIST_HEAD(&session->stat_list);
 }
 
-static void destroy_session(struct tracer_stat_session *session)
+static void destroy_session(struct stat_session *session)
 {
 	debugfs_remove(session->file);
 	reset_stat_session(session);
@@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2)
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
@@ -154,7 +154,7 @@ exit_free_list:
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
@@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	if (p == SEQ_START_TOKEN)
 		return seq_list_start(&session->stat_list, *pos);
@@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 
 static void stat_seq_stop(struct seq_file *s, void *p)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
 	if (v == SEQ_START_TOKEN)
@@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	struct tracer_stat_session *session = inode->i_private;
+	struct stat_session *session = inode->i_private;
 
 	ret = seq_open(file, &trace_stat_seq_ops);
 	if (!ret) {
@@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-	struct tracer_stat_session *session = i->i_private;
+	struct stat_session *session = i->i_private;
 
 	mutex_lock(&session->stat_mutex);
 	reset_stat_session(session);
@@ -251,7 +251,7 @@ static int tracing_stat_init(void)
 	return 0;
 }
 
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
 {
 	if (!stat_dir && tracing_stat_init())
 		return -ENODEV;
@@ -266,7 +266,7 @@ static int init_stat_file(struct tracer_stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *session, *node, *tmp;
+	struct stat_session *session, *node, *tmp;
 	int ret;
 
 	if (!trace)
@@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
@@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 void unregister_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *node, *tmp;
+	struct stat_session *node, *tmp;
 
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-- 
cgit v0.10.2


From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 06:24:36 +0200
Subject: tracing/stat: replace linked list by an rbtree for sorting

When the stat tracing framework prepares the entries from a tracer
to output them to the user, it starts by computing a linear sort
through a linked list to give the entries ordered by relevance
to the user.

This is quite ugly and causes a small latency when we begin to
read the file.

This patch changes that by turning the linked list into a red-black
tree. Athough the whole iteration using the start and next tracer
callbacks while opening the file remain the same, it is now much
more fast and scalable.

The rbtree guarantees O(log(n)) insertions whereas a linked
list with linear sorting brought us a O(n) despair. Now the
(visible) latency has disapeared.

[ Impact: kill the latency while starting to read a stat tracer file ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 3b6816b..0bd0fc8 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
 /*
  * Infrastructure for statistic tracing (histogram output).
  *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Based on the code from trace_branch.c which is
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,14 +10,19 @@
 
 
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
 #include "trace.h"
 
 
-/* List of stat entries from a tracer */
-struct trace_stat_list {
-	struct list_head	list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+	struct rb_node		node;
 	void			*stat;
 };
 
@@ -25,7 +30,7 @@ struct trace_stat_list {
 struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
-	struct list_head	stat_list;
+	struct rb_root		stat_root;
 	struct mutex		stat_mutex;
 	struct dentry		*file;
 };
@@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 /* The root directory for all stat files */
 static struct dentry		*stat_dir;
 
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+	struct stat_node *snode;
+	struct rb_node *parent = rb_parent(node);
+
+	if (node->rb_left)
+		return node->rb_left;
+	else if (node->rb_right)
+		return node->rb_right;
+	else {
+		if (!parent)
+			return NULL;
+		if (parent->rb_left == node)
+			parent->rb_left = NULL;
+		else
+			parent->rb_right = NULL;
+
+		snode = container_of(node, struct stat_node, node);
+		kfree(snode);
+
+		return parent;
+	}
+}
 
 static void reset_stat_session(struct stat_session *session)
 {
-	struct trace_stat_list *node, *next;
+	struct rb_node *node = session->stat_root.rb_node;
 
-	list_for_each_entry_safe(node, next, &session->stat_list, list)
-		kfree(node);
+	while (node)
+		node = release_next(node);
 
-	INIT_LIST_HEAD(&session->stat_list);
+	session->stat_root = RB_ROOT;
 }
 
 static void destroy_session(struct stat_session *session)
@@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session)
 	kfree(session);
 }
 
+typedef int (*cmp_stat_t)(void *, void *);
+
+static void
+insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Figure out where to put new node
+	 * This is a descendent sorting
+	 */
+	while (*new) {
+		struct stat_node *this;
+		int result;
+
+		this = container_of(*new, struct stat_node, node);
+		result = cmp(data->stat, this->stat);
+
+		parent = *new;
+		if (result >= 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2)
  */
 static int stat_seq_init(struct stat_session *session)
 {
-	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
+	struct stat_node *new_entry;
+	struct rb_root *root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session)
 	 * The first entry. Actually this is the second, but the first
 	 * one (the stat_list head) is pointless.
 	 */
-	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 	if (!new_entry) {
 		ret = -ENOMEM;
 		goto exit;
 	}
-
-	INIT_LIST_HEAD(&new_entry->list);
-
-	list_add(&new_entry->list, &session->stat_list);
+	root = &session->stat_root;
+	insert_stat(root, new_entry, dummy_cmp);
 
 	new_entry->stat = stat;
 
@@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 		if (!new_entry) {
 			ret = -ENOMEM;
 			goto exit_free_list;
 		}
 
-		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry_reverse(iter_entry, &session->stat_list,
-				list) {
-
-			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(iter_entry->stat,
-					new_entry->stat) >= 0) {
-
-				list_add(&new_entry->list, &iter_entry->list);
-				break;
-			}
-		}
-
-		/* The current larger value */
-		if (list_empty(&new_entry->list))
-			list_add(&new_entry->list, &session->stat_list);
+		insert_stat(root, new_entry, ts->stat_cmp);
 	}
+
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -155,25 +204,38 @@ exit_free_list:
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node;
+	int i;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && session->ts->stat_headers)
+	if (!*pos && session->ts->stat_headers) {
+		(*pos)++;
 		return SEQ_START_TOKEN;
+	}
 
-	return seq_list_start(&session->stat_list, *pos);
+	node = rb_first(&session->stat_root);
+	for (i = 0; node && i < *pos; i++)
+		node = rb_next(node);
+
+	(*pos)++;
+
+	return node;
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node = p;
+
+	(*pos)++;
 
 	if (p == SEQ_START_TOKEN)
-		return seq_list_start(&session->stat_list, *pos);
+		return rb_first(&session->stat_root);
 
-	return seq_list_next(p, &session->stat_list, pos);
+	return rb_next(node);
 }
 
 static void stat_seq_stop(struct seq_file *s, void *p)
@@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p)
 static int stat_seq_show(struct seq_file *s, void *v)
 {
 	struct stat_session *session = s->private;
-	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+	struct stat_node *l = container_of(v, struct stat_node, node);
 
 	if (v == SEQ_START_TOKEN)
 		return session->ts->stat_headers(s);
@@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
+	session = kzalloc(sizeof(*session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
 	session->ts = trace;
 	INIT_LIST_HEAD(&session->session_list);
-	INIT_LIST_HEAD(&session->stat_list);
 	mutex_init(&session->stat_mutex);
-	session->file = NULL;
 
 	ret = init_stat_file(session);
 	if (ret) {
-- 
cgit v0.10.2


From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:26 +0800
Subject: tracing/stat: change dummpy_cmp() to return -1

Currently the output of trace_stat/workqueues is totally reversed:

 # cat /debug/tracing/trace_stat/workqueues
    ...
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1     3796     3796                     kblockd/1:120
    ...

The correct output should be:

    1     3796     3796                     kblockd/1:120
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57

It's caused by "tracing/stat: replace linked list by an rbtree for
sorting"
(53059c9b67a62a3dc8c80204d3da42b9267ea5a0).

dummpy_cmp() should return -1, so rb_node will always be inserted as
right-most node in the rbtree, thus we sort the output in ascending
order.

[ Impact: fix the output of trace_stat/workqueues ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 0bd0fc8..5816d1a 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
  */
 static int dummy_cmp(void *p1, void *p2)
 {
-	return 1;
+	return -1;
 }
 
 /*
-- 
cgit v0.10.2


From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:48 +0800
Subject: tracing/stat: remember to free root node

When closing a trace_stat file, we destroy the rbtree constructed during
file open, but there is memory leak that the root node is not freed.

[ Impact: fix memory leak when closing a trace_stat file ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 5816d1a..8030ec9 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node)
 		return node->rb_right;
 	else {
 		if (!parent)
-			return NULL;
-		if (parent->rb_left == node)
+			;
+		else if (parent->rb_left == node)
 			parent->rb_left = NULL;
 		else
 			parent->rb_right = NULL;
-- 
cgit v0.10.2


From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:42:46 +0800
Subject: tracing/stat: do some cleanups

- remove duplicate code in stat_seq_init()
- update comments to reflect the change from stat list to stat rbtree

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8030ec9..17f20eb 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session)
 
 typedef int (*cmp_stat_t)(void *, void *);
 
-static void
-insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct stat_node *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->stat = stat;
 
 	/*
 	 * Figure out where to put new node
@@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
 
 	rb_link_node(&data->node, parent, new);
 	rb_insert_color(&data->node, root);
+	return 0;
 }
 
 /*
  * For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
  */
 static int dummy_cmp(void *p1, void *p2)
 {
@@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2)
 }
 
 /*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
 static int stat_seq_init(struct stat_session *session)
 {
 	struct tracer_stat *ts = session->ts;
-	struct stat_node *new_entry;
-	struct rb_root *root;
+	struct rb_root *root = &session->stat_root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session)
 	if (!stat)
 		goto exit;
 
-	/*
-	 * The first entry. Actually this is the second, but the first
-	 * one (the stat_list head) is pointless.
-	 */
-	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-	if (!new_entry) {
-		ret = -ENOMEM;
+	ret = insert_stat(root, stat, ts->stat_cmp);
+	if (ret)
 		goto exit;
-	}
-	root = &session->stat_root;
-	insert_stat(root, new_entry, dummy_cmp);
-
-	new_entry->stat = stat;
 
 	/*
-	 * Iterate over the tracer stat entries and store them in a sorted
-	 * list.
+	 * Iterate over the tracer stat entries and store them in an rbtree.
 	 */
 	for (i = 1; ; i++) {
 		stat = ts->stat_next(stat, i);
@@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-		if (!new_entry) {
-			ret = -ENOMEM;
-			goto exit_free_list;
-		}
-
-		new_entry->stat = stat;
-
-		insert_stat(root, new_entry, ts->stat_cmp);
+		ret = insert_stat(root, stat, ts->stat_cmp);
+		if (ret)
+			goto exit_free_rbtree;
 	}
 
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
 
-exit_free_list:
+exit_free_rbtree:
 	reset_stat_session(session);
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	struct rb_node *node;
 	int i;
 
-	/* Prevent from tracer switch or stat_list modification */
+	/* Prevent from tracer switch or rbtree modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
@@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 }
 
 /*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-- 
cgit v0.10.2


From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 30 May 2009 04:25:30 +0200
Subject: tracing/stat: remove unappropriate safe walk on list

register_stat_tracer() uses list_for_each_entry_safe
to check whether a tracer is already present in the list.
But we don't delete anything from the list here, so
we don't need the safe version

[ Impact: cleanup list use is stat tracing ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 17f20eb..c006437 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct stat_session *session, *node, *tmp;
+	struct stat_session *session, *node;
 	int ret;
 
 	if (!trace)
@@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+	list_for_each_entry(node, &all_stat_sessions, session_list) {
 		if (node->ts == trace) {
 			mutex_unlock(&all_stat_sessions_mutex);
 			return -EINVAL;
-- 
cgit v0.10.2


From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001
From: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Date: Tue, 26 May 2009 21:48:07 +0000
Subject: x86: Print real IOAPIC version for x86-64

Fix the fact that the IOAPIC version number in the x86_64 code path always
gets assigned to 0, instead of the correct value.

Before the patch: (from "dmesg" output):

 ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
 IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23     <---

 After the patch:
 ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
 IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23    <---

History:

io_apic_get_version() was compiled out of the x86_64 code path in the commit
f2c2cca3acef8b253a36381d9b469ad4fb08563a:

Author: Andi Kleen <ak@suse.de>
Date:   Tue Sep 26 10:52:37 2006 +0200

    [PATCH] Remove APIC version/cpu capability mpparse checking/printing

    ACPI went to great trouble to get the APIC version and CPU capabilities
    of different CPUs before passing them to the mpparser. But all
    that data was used was to print it out.  Actually it even faked some data
    based on the boot cpu, not on the actual CPU being booted.

    Remove all this code because it's not needed.

    Cc: len.brown@intel.com

At the time, the IOAPIC version number was deliberately not printed
in the x86_64 code path. However, after the x86 and x86_64 files were
merged, the net result is that the IOAPIC version is printed incorrectly
in the x86_64 code path.

The patch below provides a fix. I have tested it with acpi, and with
acpi=off, and did not see any problems.

Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Acked-by: Yinghai Lu <yhlu.kernel@gmail.com>
LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
*************************

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 844e5e2..6310861 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 	mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
 	mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
-	mp_ioapics[idx].apicver = 0;
-#endif
+
 	/*
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac7f3b6..f712f8f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
 	return apic_id;
 }
+#endif
 
 int __init io_apic_get_version(int ioapic)
 {
@@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic)
 
 	return reg_01.bits.version;
 }
-#endif
 
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
-- 
cgit v0.10.2


From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 28 May 2009 09:54:47 +0200
Subject: x86, apic: Restore irqs on fail paths

lapic_resume forgets to restore interrupts on fail paths.
Fix that.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b0fd264..e82488d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev)
 	unsigned int l, h;
 	unsigned long flags;
 	int maxlvt;
-	int ret;
+	int ret = 0;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
 	if (!apic_pm_state.active)
@@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev)
 		ioapic_entries = alloc_ioapic_entries();
 		if (!ioapic_entries) {
 			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto restore;
 		}
 
 		ret = save_IO_APIC_setup(ioapic_entries);
 		if (ret) {
 			WARN(1, "Saving IO-APIC state failed: %d\n", ret);
 			free_ioapic_entries(ioapic_entries);
-			return ret;
+			goto restore;
 		}
 
 		mask_IO_APIC_setup(ioapic_entries);
@@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev)
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
 	}
-
+restore:
 	local_irq_restore(flags);
 
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v0.10.2


From 60e59f68824102c87e64c5f51c4e57c0b8a61e46 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 24 May 2009 20:34:10 +0000
Subject: powerpc/pmac: Update PowerMac 32-bit defconfig

This mostly adds back AppleTouch support and adds CONFIG_HIGHMEM
by default.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 5339bb4..ea8870a 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.28-rc3
-# Tue Nov 11 19:36:51 2008
+# Linux kernel version: 2.6.30-rc7
+# Mon May 25 14:53:25 2009
 #
 # CONFIG_PPC64 is not set
 
@@ -14,6 +14,7 @@ CONFIG_6xx=y
 # CONFIG_40x is not set
 # CONFIG_44x is not set
 # CONFIG_E200 is not set
+CONFIG_PPC_BOOK3S=y
 CONFIG_PPC_FPU=y
 CONFIG_ALTIVEC=y
 CONFIG_PPC_STD_MMU=y
@@ -43,7 +44,7 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_PPC=y
 CONFIG_EARLY_PRINTK=y
 CONFIG_GENERIC_NVRAM=y
-CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_SCHED_OMIT_FRAME_POINTER=y
 CONFIG_ARCH_MAY_HAVE_PC_FDC=y
 CONFIG_PPC_OF=y
 CONFIG_OF=y
@@ -52,12 +53,14 @@ CONFIG_OF=y
 CONFIG_AUDIT_ARCH=y
 CONFIG_GENERIC_BUG=y
 CONFIG_SYS_SUPPORTS_APM_EMULATION=y
+CONFIG_DTC=y
 # CONFIG_DEFAULT_UIMAGE is not set
 CONFIG_HIBERNATE_32=y
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
 # CONFIG_PPC_DCR_NATIVE is not set
 # CONFIG_PPC_DCR_MMIO is not set
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
 #
@@ -72,14 +75,24 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
+
+#
+# RCU Subsystem
+#
+CONFIG_CLASSIC_RCU=y
+# CONFIG_TREE_RCU is not set
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_CGROUPS is not set
 # CONFIG_GROUP_SCHED is not set
+# CONFIG_CGROUPS is not set
 CONFIG_SYSFS_DEPRECATED=y
 CONFIG_SYSFS_DEPRECATED_V2=y
 # CONFIG_RELAY is not set
@@ -88,23 +101,27 @@ CONFIG_NAMESPACES=y
 # CONFIG_IPC_NS is not set
 # CONFIG_USER_NS is not set
 # CONFIG_PID_NS is not set
+# CONFIG_NET_NS is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -114,10 +131,12 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
 # CONFIG_MARKERS is not set
 CONFIG_OPROFILE=y
 CONFIG_HAVE_OPROFILE=y
@@ -127,10 +146,10 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+# CONFIG_SLOW_WORK is not set
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
-# CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
 CONFIG_MODULES=y
 # CONFIG_MODULE_FORCE_LOAD is not set
@@ -138,11 +157,8 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODVERSIONS is not set
 # CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_KMOD=y
 CONFIG_BLOCK=y
 CONFIG_LBD=y
-# CONFIG_BLK_DEV_IO_TRACE is not set
-CONFIG_LSF=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -158,14 +174,11 @@ CONFIG_DEFAULT_AS=y
 # CONFIG_DEFAULT_CFQ is not set
 # CONFIG_DEFAULT_NOOP is not set
 CONFIG_DEFAULT_IOSCHED="anticipatory"
-CONFIG_CLASSIC_RCU=y
 CONFIG_FREEZER=y
 
 #
 # Platform support
 #
-CONFIG_PPC_MULTIPLATFORM=y
-CONFIG_CLASSIC32=y
 # CONFIG_PPC_CHRP is not set
 # CONFIG_MPC5121_ADS is not set
 # CONFIG_MPC5121_GENERIC is not set
@@ -178,7 +191,9 @@ CONFIG_PPC_PMAC=y
 # CONFIG_PPC_83xx is not set
 # CONFIG_PPC_86xx is not set
 # CONFIG_EMBEDDED6xx is not set
+# CONFIG_AMIGAONE is not set
 CONFIG_PPC_NATIVE=y
+CONFIG_PPC_OF_BOOT_TRAMPOLINE=y
 # CONFIG_IPIC is not set
 CONFIG_MPIC=y
 # CONFIG_MPIC_WEIRD is not set
@@ -212,11 +227,12 @@ CONFIG_CPU_FREQ_PMAC=y
 CONFIG_PPC601_SYNC_FIX=y
 # CONFIG_TAU is not set
 # CONFIG_FSL_ULI1575 is not set
+# CONFIG_SIMPLE_GPIO is not set
 
 #
 # Kernel options
 #
-# CONFIG_HIGHMEM is not set
+CONFIG_HIGHMEM=y
 CONFIG_TICK_ONESHOT=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
@@ -239,6 +255,7 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
 CONFIG_ARCH_HAS_WALK_MEMORY=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
 # CONFIG_KEXEC is not set
+# CONFIG_CRASH_DUMP is not set
 CONFIG_ARCH_FLATMEM_ENABLE=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_SELECT_MEMORY_MODEL=y
@@ -250,12 +267,17 @@ CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_PAGEFLAGS_EXTENDED=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 # CONFIG_MIGRATION is not set
-# CONFIG_RESOURCES_64BIT is not set
 # CONFIG_PHYS_ADDR_T_64BIT is not set
 CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+CONFIG_PPC_4K_PAGES=y
+# CONFIG_PPC_16K_PAGES is not set
+# CONFIG_PPC_64K_PAGES is not set
+# CONFIG_PPC_256K_PAGES is not set
 CONFIG_FORCE_MAX_ZONEORDER=11
 CONFIG_PROC_DEVICETREE=y
 # CONFIG_CMDLINE_BOOL is not set
@@ -288,6 +310,8 @@ CONFIG_ARCH_SUPPORTS_MSI=y
 # CONFIG_PCI_MSI is not set
 # CONFIG_PCI_LEGACY is not set
 # CONFIG_PCI_DEBUG is not set
+# CONFIG_PCI_STUB is not set
+# CONFIG_PCI_IOV is not set
 CONFIG_PCCARD=m
 # CONFIG_PCMCIA_DEBUG is not set
 CONFIG_PCMCIA=m
@@ -397,6 +421,8 @@ CONFIG_NETFILTER_XTABLES=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 # CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set
 # CONFIG_NETFILTER_XT_TARGET_DSCP is not set
+CONFIG_NETFILTER_XT_TARGET_HL=m
+# CONFIG_NETFILTER_XT_TARGET_LED is not set
 CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFLOG=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
@@ -405,6 +431,7 @@ CONFIG_NETFILTER_XT_TARGET_RATEEST=m
 CONFIG_NETFILTER_XT_TARGET_TRACE=m
 CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
 CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
+# CONFIG_NETFILTER_XT_MATCH_CLUSTER is not set
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 # CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
@@ -415,6 +442,7 @@ CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 # CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set
 CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_HL=m
 CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
@@ -478,17 +506,15 @@ CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_IP_DCCP=m
 CONFIG_INET_DCCP_DIAG=m
-CONFIG_IP_DCCP_ACKVEC=y
 
 #
 # DCCP CCIDs Configuration (EXPERIMENTAL)
 #
-CONFIG_IP_DCCP_CCID2=m
 # CONFIG_IP_DCCP_CCID2_DEBUG is not set
-CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_CCID3=y
 # CONFIG_IP_DCCP_CCID3_DEBUG is not set
 CONFIG_IP_DCCP_CCID3_RTO=100
-CONFIG_IP_DCCP_TFRC_LIB=m
+CONFIG_IP_DCCP_TFRC_LIB=y
 
 #
 # DCCP Kernel Hacking
@@ -508,13 +534,16 @@ CONFIG_IP_DCCP_TFRC_LIB=m
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 # CONFIG_NET_SCHED is not set
 CONFIG_NET_CLS_ROUTE=y
+# CONFIG_DCB is not set
 
 #
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
+# CONFIG_NET_DROP_MONITOR is not set
 # CONFIG_HAMRADIO is not set
 # CONFIG_CAN is not set
 CONFIG_IRDA=m
@@ -577,8 +606,6 @@ CONFIG_BT_HIDP=m
 #
 # Bluetooth device drivers
 #
-CONFIG_BT_HCIUSB=m
-# CONFIG_BT_HCIUSB_SCO is not set
 # CONFIG_BT_HCIBTUSB is not set
 # CONFIG_BT_HCIUART is not set
 CONFIG_BT_HCIBCM203X=m
@@ -590,31 +617,27 @@ CONFIG_BT_HCIBFUSB=m
 # CONFIG_BT_HCIBTUART is not set
 # CONFIG_BT_HCIVHCI is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=m
-CONFIG_NL80211=y
+# CONFIG_CFG80211_REG_DEBUG is not set
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
+# CONFIG_LIB80211 is not set
 CONFIG_MAC80211=m
 
 #
 # Rate control algorithm selection
 #
-CONFIG_MAC80211_RC_PID=y
-# CONFIG_MAC80211_RC_MINSTREL is not set
-CONFIG_MAC80211_RC_DEFAULT_PID=y
-# CONFIG_MAC80211_RC_DEFAULT_MINSTREL is not set
-CONFIG_MAC80211_RC_DEFAULT="pid"
+CONFIG_MAC80211_RC_MINSTREL=y
+# CONFIG_MAC80211_RC_DEFAULT_PID is not set
+CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
+CONFIG_MAC80211_RC_DEFAULT="minstrel"
 # CONFIG_MAC80211_MESH is not set
 CONFIG_MAC80211_LEDS=y
+# CONFIG_MAC80211_DEBUGFS is not set
 # CONFIG_MAC80211_DEBUG_MENU is not set
-CONFIG_IEEE80211=m
-# CONFIG_IEEE80211_DEBUG is not set
-CONFIG_IEEE80211_CRYPT_WEP=m
-CONFIG_IEEE80211_CRYPT_CCMP=m
-CONFIG_IEEE80211_CRYPT_TKIP=m
+# CONFIG_WIMAX is not set
 # CONFIG_RFKILL is not set
 # CONFIG_NET_9P is not set
 
@@ -662,17 +685,27 @@ CONFIG_BLK_DEV_RAM_SIZE=4096
 # CONFIG_BLK_DEV_HD is not set
 CONFIG_MISC_DEVICES=y
 # CONFIG_PHANTOM is not set
-# CONFIG_EEPROM_93CX6 is not set
 # CONFIG_SGI_IOC4 is not set
 # CONFIG_TIFM_CORE is not set
+# CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
 # CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
 CONFIG_HAVE_IDE=y
 CONFIG_IDE=y
 
 #
 # Please see Documentation/ide/ide.txt for help/info on IDE drives
 #
+CONFIG_IDE_XFER_MODE=y
 CONFIG_IDE_TIMINGS=y
 CONFIG_IDE_ATAPI=y
 # CONFIG_BLK_DEV_IDE_SATA is not set
@@ -684,7 +717,6 @@ CONFIG_BLK_DEV_IDECS=m
 CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y
 # CONFIG_BLK_DEV_IDETAPE is not set
-CONFIG_BLK_DEV_IDESCSI=y
 # CONFIG_IDE_TASK_IOCTL is not set
 CONFIG_IDE_PROC_FS=y
 
@@ -714,6 +746,7 @@ CONFIG_BLK_DEV_IDEDMA_PCI=y
 # CONFIG_BLK_DEV_JMICRON is not set
 # CONFIG_BLK_DEV_SC1200 is not set
 # CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_IT8172 is not set
 # CONFIG_BLK_DEV_IT8213 is not set
 # CONFIG_BLK_DEV_IT821X is not set
 # CONFIG_BLK_DEV_NS87415 is not set
@@ -728,7 +761,6 @@ CONFIG_BLK_DEV_SL82C105=y
 # CONFIG_BLK_DEV_TC86C001 is not set
 CONFIG_BLK_DEV_IDE_PMAC=y
 CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
-CONFIG_BLK_DEV_IDEDMA_PMAC=y
 CONFIG_BLK_DEV_IDEDMA=y
 
 #
@@ -772,6 +804,7 @@ CONFIG_SCSI_FC_ATTRS=y
 # CONFIG_SCSI_SRP_ATTRS is not set
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
+# CONFIG_SCSI_CXGB3_ISCSI is not set
 # CONFIG_BLK_DEV_3W_XXXX_RAID is not set
 # CONFIG_SCSI_3W_9XXX is not set
 # CONFIG_SCSI_ACARD is not set
@@ -791,8 +824,12 @@ CONFIG_SCSI_AIC7XXX_OLD=m
 # CONFIG_MEGARAID_NEWGEN is not set
 # CONFIG_MEGARAID_LEGACY is not set
 # CONFIG_MEGARAID_SAS is not set
+# CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
+# CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_EATA is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -822,6 +859,7 @@ CONFIG_SCSI_MAC53C94=y
 # CONFIG_SCSI_SRP is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 # CONFIG_ATA is not set
 CONFIG_MD=y
 CONFIG_BLK_DEV_MD=m
@@ -881,6 +919,7 @@ CONFIG_THERM_ADT746X=m
 # CONFIG_ANSLCD is not set
 CONFIG_PMAC_RACKMETER=m
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_DUMMY=m
 # CONFIG_BONDING is not set
 # CONFIG_MACVLAN is not set
@@ -898,6 +937,8 @@ CONFIG_BMAC=y
 CONFIG_SUNGEM=y
 # CONFIG_CASSINI is not set
 # CONFIG_NET_VENDOR_3COM is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 # CONFIG_NET_TULIP is not set
 # CONFIG_HP100 is not set
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
@@ -913,7 +954,6 @@ CONFIG_PCNET32=y
 # CONFIG_ADAPTEC_STARFIRE is not set
 # CONFIG_B44 is not set
 # CONFIG_FORCEDETH is not set
-# CONFIG_EEPRO100 is not set
 # CONFIG_E100 is not set
 # CONFIG_FEALNX is not set
 # CONFIG_NATSEMI is not set
@@ -923,6 +963,7 @@ CONFIG_PCNET32=y
 # CONFIG_R6040 is not set
 # CONFIG_SIS900 is not set
 # CONFIG_EPIC100 is not set
+# CONFIG_SMSC9420 is not set
 # CONFIG_SUNDANCE is not set
 # CONFIG_TLAN is not set
 # CONFIG_VIA_RHINE is not set
@@ -935,6 +976,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -945,18 +987,20 @@ CONFIG_NETDEV_1000=y
 # CONFIG_VIA_VELOCITY is not set
 # CONFIG_TIGON3 is not set
 # CONFIG_BNX2 is not set
-# CONFIG_MV643XX_ETH is not set
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
+CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_CHELSIO_T3 is not set
 # CONFIG_ENIC is not set
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -966,6 +1010,7 @@ CONFIG_NETDEV_10000=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 # CONFIG_TR is not set
 
 #
@@ -974,20 +1019,11 @@ CONFIG_NETDEV_10000=y
 # CONFIG_WLAN_PRE80211 is not set
 CONFIG_WLAN_80211=y
 # CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-CONFIG_HERMES=m
-CONFIG_APPLE_AIRPORT=m
-# CONFIG_PLX_HERMES is not set
-# CONFIG_TMD_HERMES is not set
-# CONFIG_NORTEL_HERMES is not set
-CONFIG_PCI_HERMES=m
-CONFIG_PCMCIA_HERMES=m
-# CONFIG_PCMCIA_SPECTRUM is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 CONFIG_PRISM54=m
@@ -997,15 +1033,17 @@ CONFIG_PRISM54=m
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 CONFIG_P54_COMMON=m
 # CONFIG_P54_USB is not set
 # CONFIG_P54_PCI is not set
+CONFIG_P54_LEDS=y
 # CONFIG_ATH5K is not set
 # CONFIG_ATH9K is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_AR9170_USB is not set
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 CONFIG_B43=m
 CONFIG_B43_PCI_AUTOSELECT=y
@@ -1025,6 +1063,19 @@ CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y
 # CONFIG_B43LEGACY_PIO_MODE is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+CONFIG_HERMES=m
+CONFIG_HERMES_CACHE_FW_ON_INIT=y
+CONFIG_APPLE_AIRPORT=m
+# CONFIG_PLX_HERMES is not set
+# CONFIG_TMD_HERMES is not set
+# CONFIG_NORTEL_HERMES is not set
+CONFIG_PCI_HERMES=m
+CONFIG_PCMCIA_HERMES=m
+# CONFIG_PCMCIA_SPECTRUM is not set
+
+#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
 
 #
 # USB Network Adapters
@@ -1036,6 +1087,7 @@ CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y
 CONFIG_USB_USBNET=m
 CONFIG_USB_NET_AX8817X=m
 CONFIG_USB_NET_CDCETHER=m
+# CONFIG_USB_NET_CDC_EEM is not set
 # CONFIG_USB_NET_DM9601 is not set
 # CONFIG_USB_NET_SMSC95XX is not set
 # CONFIG_USB_NET_GL620A is not set
@@ -1099,7 +1151,7 @@ CONFIG_INPUT_KEYBOARD=y
 CONFIG_INPUT_MOUSE=y
 # CONFIG_MOUSE_PS2 is not set
 # CONFIG_MOUSE_SERIAL is not set
-# CONFIG_MOUSE_APPLETOUCH is not set
+CONFIG_MOUSE_APPLETOUCH=y
 # CONFIG_MOUSE_BCM5974 is not set
 # CONFIG_MOUSE_VSXXXAA is not set
 # CONFIG_INPUT_JOYSTICK is not set
@@ -1150,10 +1202,13 @@ CONFIG_SERIAL_PMACZILOG_TTYS=y
 # CONFIG_SERIAL_JSM is not set
 # CONFIG_SERIAL_OF_PLATFORM is not set
 CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_HVC_UDBG is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=m
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 CONFIG_NVRAM=y
 CONFIG_GEN_RTC=y
 # CONFIG_GEN_RTC_X is not set
@@ -1232,12 +1287,9 @@ CONFIG_I2C_POWERMAC=y
 # Miscellaneous I2C Chip support
 #
 # CONFIG_DS1682 is not set
-# CONFIG_EEPROM_AT24 is not set
-# CONFIG_EEPROM_LEGACY is not set
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1259,11 +1311,11 @@ CONFIG_BATTERY_PMU=y
 # CONFIG_THERMAL is not set
 # CONFIG_THERMAL_HWMON is not set
 # CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
 
 #
 # Sonics Silicon Backplane
 #
-CONFIG_SSB_POSSIBLE=y
 CONFIG_SSB=m
 CONFIG_SSB_SPROM=y
 CONFIG_SSB_PCIHOST_POSSIBLE=y
@@ -1281,18 +1333,13 @@ CONFIG_SSB_DRIVER_PCICORE=y
 # CONFIG_MFD_CORE is not set
 # CONFIG_MFD_SM501 is not set
 # CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
 # CONFIG_MFD_TMIO is not set
 # CONFIG_PMIC_DA903X is not set
 # CONFIG_MFD_WM8400 is not set
 # CONFIG_MFD_WM8350_I2C is not set
-
-#
-# Voltage and Current regulators
-#
+# CONFIG_MFD_PCF50633 is not set
 # CONFIG_REGULATOR is not set
-# CONFIG_REGULATOR_FIXED_VOLTAGE is not set
-# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
-# CONFIG_REGULATOR_BQ24022 is not set
 
 #
 # Multimedia devices
@@ -1390,6 +1437,7 @@ CONFIG_FB_ATY_BACKLIGHT=y
 # CONFIG_FB_KYRO is not set
 CONFIG_FB_3DFX=y
 # CONFIG_FB_3DFX_ACCEL is not set
+CONFIG_FB_3DFX_I2C=y
 # CONFIG_FB_VOODOO1 is not set
 # CONFIG_FB_VT8623 is not set
 # CONFIG_FB_TRIDENT is not set
@@ -1399,12 +1447,14 @@ CONFIG_FB_3DFX=y
 # CONFIG_FB_IBM_GXT4500 is not set
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_LCD_CLASS_DEVICE=m
 # CONFIG_LCD_ILI9320 is not set
 # CONFIG_LCD_PLATFORM is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
-# CONFIG_BACKLIGHT_CORGI is not set
+CONFIG_BACKLIGHT_GENERIC=y
 
 #
 # Display device support
@@ -1444,11 +1494,13 @@ CONFIG_SND_MIXER_OSS=m
 CONFIG_SND_PCM_OSS=m
 CONFIG_SND_PCM_OSS_PLUGINS=y
 CONFIG_SND_SEQUENCER_OSS=y
+# CONFIG_SND_HRTIMER is not set
 # CONFIG_SND_DYNAMIC_MINORS is not set
 CONFIG_SND_SUPPORT_OLD_API=y
 CONFIG_SND_VERBOSE_PROCFS=y
 # CONFIG_SND_VERBOSE_PRINTK is not set
 # CONFIG_SND_DEBUG is not set
+CONFIG_SND_VMASTER=y
 CONFIG_SND_DRIVERS=y
 CONFIG_SND_DUMMY=m
 # CONFIG_SND_VIRMIDI is not set
@@ -1486,6 +1538,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1551,28 +1605,31 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
-CONFIG_HID_BRIGHT=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
-CONFIG_HID_DELL=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
 CONFIG_HID_MICROSOFT=y
 CONFIG_HID_MONTEREY=y
+CONFIG_HID_NTRIG=y
 CONFIG_HID_PANTHERLORD=y
 # CONFIG_PANTHERLORD_FF is not set
 CONFIG_HID_PETALYNX=y
 CONFIG_HID_SAMSUNG=y
 CONFIG_HID_SONY=y
 CONFIG_HID_SUNPLUS=y
+# CONFIG_GREENASIA_FF is not set
+CONFIG_HID_TOPSEED=y
 # CONFIG_THRUSTMASTER_FF is not set
 # CONFIG_ZEROPLUS_FF is not set
 CONFIG_USB_SUPPORT=y
@@ -1603,6 +1660,7 @@ CONFIG_USB_EHCI_HCD=m
 CONFIG_USB_EHCI_ROOT_HUB_TT=y
 # CONFIG_USB_EHCI_TT_NEWSCHED is not set
 # CONFIG_USB_EHCI_HCD_PPC_OF is not set
+# CONFIG_USB_OXU210HP_HCD is not set
 # CONFIG_USB_ISP116X_HCD is not set
 # CONFIG_USB_ISP1760_HCD is not set
 CONFIG_USB_OHCI_HCD=y
@@ -1625,24 +1683,23 @@ CONFIG_USB_PRINTER=m
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# may also be needed; see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=m
 # CONFIG_USB_STORAGE_DEBUG is not set
 # CONFIG_USB_STORAGE_DATAFAB is not set
 # CONFIG_USB_STORAGE_FREECOM is not set
 # CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_DPCM is not set
 # CONFIG_USB_STORAGE_USBAT is not set
 # CONFIG_USB_STORAGE_SDDR09 is not set
 # CONFIG_USB_STORAGE_SDDR55 is not set
 # CONFIG_USB_STORAGE_JUMPSHOT is not set
 # CONFIG_USB_STORAGE_ALAUDA is not set
-CONFIG_USB_STORAGE_ONETOUCH=y
+CONFIG_USB_STORAGE_ONETOUCH=m
 # CONFIG_USB_STORAGE_KARMA is not set
 # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
 # CONFIG_USB_LIBUSUAL is not set
@@ -1665,7 +1722,7 @@ CONFIG_USB_EZUSB=y
 # CONFIG_USB_SERIAL_CH341 is not set
 # CONFIG_USB_SERIAL_WHITEHEAT is not set
 # CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set
-# CONFIG_USB_SERIAL_CP2101 is not set
+# CONFIG_USB_SERIAL_CP210X is not set
 # CONFIG_USB_SERIAL_CYPRESS_M8 is not set
 # CONFIG_USB_SERIAL_EMPEG is not set
 # CONFIG_USB_SERIAL_FTDI_SIO is not set
@@ -1701,15 +1758,19 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
 # CONFIG_USB_SERIAL_NAVMAN is not set
 # CONFIG_USB_SERIAL_PL2303 is not set
 # CONFIG_USB_SERIAL_OTI6858 is not set
+# CONFIG_USB_SERIAL_QUALCOMM is not set
 # CONFIG_USB_SERIAL_SPCP8X5 is not set
 # CONFIG_USB_SERIAL_HP4X is not set
 # CONFIG_USB_SERIAL_SAFE is not set
+# CONFIG_USB_SERIAL_SIEMENS_MPI is not set
 # CONFIG_USB_SERIAL_SIERRAWIRELESS is not set
+# CONFIG_USB_SERIAL_SYMBOL is not set
 # CONFIG_USB_SERIAL_TI is not set
 # CONFIG_USB_SERIAL_CYBERJACK is not set
 # CONFIG_USB_SERIAL_XIRCOM is not set
 # CONFIG_USB_SERIAL_OPTION is not set
 # CONFIG_USB_SERIAL_OMNINET is not set
+# CONFIG_USB_SERIAL_OPTICON is not set
 # CONFIG_USB_SERIAL_DEBUG is not set
 
 #
@@ -1726,7 +1787,6 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 CONFIG_USB_APPLEDISPLAY=m
@@ -1738,6 +1798,11 @@ CONFIG_USB_APPLEDISPLAY=m
 # CONFIG_USB_ISIGHTFW is not set
 # CONFIG_USB_VST is not set
 # CONFIG_USB_GADGET is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1748,7 +1813,9 @@ CONFIG_LEDS_CLASS=y
 # LED drivers
 #
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1759,11 +1826,16 @@ CONFIG_LEDS_TRIGGER_IDE_DISK=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 # CONFIG_EDAC is not set
 # CONFIG_RTC_CLASS is not set
 # CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 
@@ -1774,6 +1846,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1783,7 +1856,9 @@ CONFIG_EXT4_FS_XATTR=y
 # CONFIG_EXT4_FS_POSIX_ACL is not set
 # CONFIG_EXT4_FS_SECURITY is not set
 CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
 CONFIG_JBD2=y
+# CONFIG_JBD2_DEBUG is not set
 CONFIG_FS_MBCACHE=y
 # CONFIG_REISERFS_FS is not set
 # CONFIG_JFS_FS is not set
@@ -1792,6 +1867,7 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_XFS_FS is not set
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
 CONFIG_DNOTIFY=y
 CONFIG_INOTIFY=y
 CONFIG_INOTIFY_USER=y
@@ -1801,6 +1877,11 @@ CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 
 #
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=y
@@ -1831,10 +1912,7 @@ CONFIG_TMPFS=y
 # CONFIG_TMPFS_POSIX_ACL is not set
 # CONFIG_HUGETLB_PAGE is not set
 # CONFIG_CONFIGFS_FS is not set
-
-#
-# Miscellaneous filesystems
-#
+CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ADFS_FS is not set
 # CONFIG_AFFS_FS is not set
 CONFIG_HFS_FS=m
@@ -1843,6 +1921,7 @@ CONFIG_HFSPLUS_FS=m
 # CONFIG_BFS_FS is not set
 # CONFIG_EFS_FS is not set
 # CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
 # CONFIG_VXFS_FS is not set
 # CONFIG_MINIX_FS is not set
 # CONFIG_OMFS_FS is not set
@@ -1851,6 +1930,7 @@ CONFIG_HFSPLUS_FS=m
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1868,7 +1948,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 CONFIG_SMB_FS=m
@@ -1940,11 +2019,13 @@ CONFIG_NLS_ISO8859_1=m
 # CONFIG_NLS_KOI8_U is not set
 CONFIG_NLS_UTF8=m
 # CONFIG_DLM is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
 #
 CONFIG_BITREVERSE=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
 CONFIG_CRC_CCITT=y
 CONFIG_CRC16=y
 CONFIG_CRC_T10DIF=y
@@ -1954,15 +2035,18 @@ CONFIG_CRC32=y
 CONFIG_LIBCRC32C=m
 CONFIG_ZLIB_INFLATE=y
 CONFIG_ZLIB_DEFLATE=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_TEXTSEARCH=y
 CONFIG_TEXTSEARCH_KMP=m
 CONFIG_TEXTSEARCH_BM=m
 CONFIG_TEXTSEARCH_FSM=m
-CONFIG_PLIST=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
 CONFIG_HAVE_LMB=y
+CONFIG_NLATTR=y
 
 #
 # Kernel hacking
@@ -1973,13 +2057,16 @@ CONFIG_ENABLE_MUST_CHECK=y
 CONFIG_FRAME_WARN=1024
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_UNUSED_SYMBOLS is not set
-# CONFIG_DEBUG_FS is not set
+CONFIG_DEBUG_FS=y
 # CONFIG_HEADERS_CHECK is not set
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 CONFIG_SCHEDSTATS=y
 # CONFIG_TIMER_STATS is not set
@@ -1994,6 +2081,7 @@ CONFIG_SCHEDSTATS=y
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
 CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_HIGHMEM is not set
 CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
@@ -2001,6 +2089,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
 CONFIG_DEBUG_MEMORY_INIT=y
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
+# CONFIG_DEBUG_NOTIFIERS is not set
 # CONFIG_BOOT_PRINTK_DELAY is not set
 # CONFIG_RCU_TORTURE_TEST is not set
 # CONFIG_RCU_CPU_STALL_DETECTOR is not set
@@ -2009,7 +2098,14 @@ CONFIG_DEBUG_MEMORY_INIT=y
 # CONFIG_FAULT_INJECTION is not set
 CONFIG_LATENCYTOP=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2017,12 +2113,19 @@ CONFIG_HAVE_FUNCTION_TRACER=y
 # CONFIG_FUNCTION_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_DYNAMIC_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
+CONFIG_PRINT_STACK_DEPTH=64
 # CONFIG_DEBUG_STACKOVERFLOW is not set
 # CONFIG_DEBUG_STACK_USAGE is not set
 # CONFIG_CODE_PATCHING_SELFTEST is not set
@@ -2033,6 +2136,7 @@ CONFIG_XMON_DEFAULT=y
 CONFIG_XMON_DISASSEMBLY=y
 CONFIG_DEBUGGER=y
 CONFIG_IRQSTACKS=y
+# CONFIG_VIRQ_DEBUG is not set
 # CONFIG_BDI_SWITCH is not set
 CONFIG_BOOTX_TEXT=y
 # CONFIG_PPC_EARLY_DEBUG is not set
@@ -2051,13 +2155,20 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_FIPS is not set
 CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
 CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
 CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
-CONFIG_CRYPTO_RNG=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2127,6 +2238,7 @@ CONFIG_CRYPTO_TWOFISH_COMMON=m
 # Compression
 #
 CONFIG_CRYPTO_DEFLATE=m
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
-- 
cgit v0.10.2


From ea5cc87c63b49c133d15ec2911bb2e49e8124516 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 22:31:03 -0300
Subject: perf_counter tools: Add string.[ch]

Add hex conversion libraries. We are going to replace sscanf()
uses with them.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/string.c b/Documentation/perf_counter/util/string.c
new file mode 100644
index 0000000..ec33c0c
--- /dev/null
+++ b/Documentation/perf_counter/util/string.c
@@ -0,0 +1,34 @@
+#include "string.h"
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2u64(const char *ptr, __u64 *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
diff --git a/Documentation/perf_counter/util/string.h b/Documentation/perf_counter/util/string.h
new file mode 100644
index 0000000..72812c1
--- /dev/null
+++ b/Documentation/perf_counter/util/string.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_STRING_H_
+#define _PERF_STRING_H_
+
+#include <linux/types.h>
+
+int hex2u64(const char *ptr, __u64 *val);
+
+#endif
-- 
cgit v0.10.2


From a0055ae2a4e13db9534c438cf8f3896181da6afc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 17:50:19 -0300
Subject: perf_counter tools: Use hex2u64 in more places

This has also a nice side effect, tools built on newer systems such as
fedora 10 again work on systems with older versions of glibc:

My workstation:

[acme@doppio ~]$ rpm -q glibc.x86_64
glibc-2.9-3.x86_64

Test machine:

[acme@emilia ~]$ rpm -q glibc.x86_64
glibc-2.5-24

Before:

[acme@emilia ~]$ perf
perf: /lib64/libc.so.6: version `GLIBC_2.7' not found (required by perf)
[acme@emilia ~]$ nm `which perf` | grep GLIBC_2\.7
                 U __isoc99_sscanf@@GLIBC_2.7
[acme@emilia ~]$

After:
[acme@emilia ~]$ perf
usage: perf [--version] [--help] COMMAND [ARGS]

The most commonly used perf commands are:
   record   Run a command and record its profile into perf.data
   report   Read perf.data (created by perf record) and display the
profile
   stat     Run a command and gather performance counter statistics
   top      Run a command and profile it

See 'perf help COMMAND' for more information on a specific command.
[acme@emilia ~]$ nm `which perf` | grep GLIBC_2\.7
[acme@emilia ~]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090601205019.GA7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 416ab11..3b8275f 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -296,6 +296,7 @@ LIB_H += util/quote.h
 LIB_H += util/util.h
 LIB_H += util/help.h
 LIB_H += util/strbuf.h
+LIB_H += util/string.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
@@ -315,6 +316,7 @@ LIB_OBJS += util/rbtree.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/string.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 96bfb7c..9c151de 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -5,6 +5,7 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/string.h"
 
 #include <sched.h>
 
@@ -165,12 +166,10 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
-	pid_t spid, ppid;
 	char bf[BUFSIZ];
-	int fd, nr, ret;
-	char comm[18];
+	int fd, ret;
 	size_t size;
-	char state;
+	char *field, *sep;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -185,20 +184,22 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	}
 	close(fd);
 
+	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
 	memset(&comm_ev, 0, sizeof(comm_ev));
-        nr = sscanf(bf, "%d %s %c %d %d ",
-			&spid, comm, &state, &ppid, &comm_ev.pid);
-	if (nr != 5) {
-		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
-			filename);
-		exit(EXIT_FAILURE);
-	}
+	field = strchr(bf, '(');
+	if (field == NULL)
+		goto out_failure;
+	sep = strchr(++field, ')');
+	if (sep == NULL)
+		goto out_failure;
+	size = sep - field;
+	memcpy(comm_ev.comm, field, size++);
+	field = strchr(sep + 4, ' ');
+	if (field == NULL)
+		goto out_failure;
+	comm_ev.pid = atoi(++field);
 	comm_ev.header.type = PERF_EVENT_COMM;
 	comm_ev.tid = pid;
-	size = strlen(comm);
-	comm[--size] = '\0'; /* Remove the ')' at the end */
-	--size; /* Remove the '(' at the begin */
-	memcpy(comm_ev.comm, comm + 1, size);
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
@@ -208,6 +209,11 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 		exit(-1);
 	}
 	return comm_ev.pid;
+out_failure:
+	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+		filename);
+	exit(EXIT_FAILURE);
+	return -1;
 }
 
 static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
@@ -223,23 +229,25 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 		exit(EXIT_FAILURE);
 	}
 	while (1) {
-		char bf[BUFSIZ];
-		unsigned char vm_read, vm_write, vm_exec, vm_mayshare;
+		char bf[BUFSIZ], *pbf = bf;
 		struct mmap_event mmap_ev = {
 			.header.type = PERF_EVENT_MMAP,
 		};
-		unsigned long ino;
-		int major, minor;
+		int n;
 		size_t size;
 		if (fgets(bf, sizeof(bf), fp) == NULL)
 			break;
 
 		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-		sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu",
-			&mmap_ev.start, &mmap_ev.len,
-                        &vm_read, &vm_write, &vm_exec, &vm_mayshare,
-                        &mmap_ev.pgoff, &major, &minor, &ino);
-		if (vm_exec == 'x') {
+		n = hex2u64(pbf, &mmap_ev.start);
+		if (n < 0)
+			continue;
+		pbf += n + 1;
+		n = hex2u64(pbf, &mmap_ev.len);
+		if (n < 0)
+			continue;
+		pbf += n + 3;
+		if (*pbf == 'x') { /* vm_exec */
 			char *execname = strrchr(bf, ' ');
 
 			if (execname == NULL || execname[1] != '/')
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4705679..7973092 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -5,6 +5,7 @@
 #include "util/cache.h"
 #include "util/rbtree.h"
 #include "util/symbol.h"
+#include "util/string.h"
 
 #include "perf.h"
 
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 88c903e..2fdfd1d 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -4,6 +4,7 @@
 #include "parse-options.h"
 #include "parse-events.h"
 #include "exec_cmd.h"
+#include "string.h"
 
 int nr_counters;
 
@@ -105,22 +106,26 @@ static __u64 match_event_symbols(const char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
-	char mask_str[4];
+	const char *sep, *pstr;
 
-	if (sscanf(str, "r%llx", &config) == 1)
+	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
-		case 3:
-			if (strchr(mask_str, 'k'))
+	pstr = str;
+	sep = strchr(pstr, ':');
+	if (sep) {
+		type = atoi(pstr);
+		pstr = sep + 1;
+		id = atoi(pstr);
+		sep = strchr(pstr, ':');
+		if (sep) {
+			pstr = sep + 1;
+			if (strchr(pstr, 'k'))
 				event_mask[nr_counters] |= EVENT_MASK_USER;
-			if (strchr(mask_str, 'u'))
+			if (strchr(pstr, 'u'))
 				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
-		case 2:
-			return EID(type, id);
-
-		default:
-			break;
+		}
+		return EID(type, id);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 4728121..31e8fae 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -1,5 +1,6 @@
 #include "util.h"
 #include "../perf.h"
+#include "string.h"
 #include "symbol.h"
 
 #include <libelf.h>
@@ -122,39 +123,6 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
 	return ret;
 }
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-static int hex2long(char *ptr, unsigned long *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
-
 static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 {
 	struct rb_node *nd, *prevnd;
@@ -166,7 +134,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 		goto out_failure;
 
 	while (!feof(file)) {
-		unsigned long start;
+		__u64 start;
 		struct symbol *sym;
 		int line_len, len;
 		char symbol_type;
@@ -180,7 +148,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 
 		line[--line_len] = '\0'; /* \n */
 
-		len = hex2long(line, &start);
+		len = hex2u64(line, &start);
 
 		len++;
 		if (len + 2 >= line_len)
-- 
cgit v0.10.2


From c8c96525f3c25f43a4fa230e293c4976c0c36cc1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 17:50:57 -0300
Subject: perf_counter tools: Add missing rb_erase in dso__delete_symbols

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090601205057.GB7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 31e8fae..039931f 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -58,6 +58,7 @@ static void dso__delete_symbols(struct dso *self)
 	while (next) {
 		pos = rb_entry(next, struct symbol, rb_node);
 		next = rb_next(&pos->rb_node);
+		rb_erase(&pos->rb_node, &self->syms);
 		symbol__delete(pos, self->sym_priv_size);
 	}
 }
-- 
cgit v0.10.2


From 4778541470cf7d074acd998fd40c06b94711e4ad Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Fri, 29 May 2009 07:41:26 +0000
Subject: sh: fix CONFIG_SH_PCLK_FREQ bug for sh7724

CONFIG_SH_PCLK_FREQ=33333333 is correct for sh7724.
sh7724 master clock is 33333333, but peripheral is 41666666.
This bug came to light because sh-sci driver had changed clk
from "module_clk" to "peripheral_clk"

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e9392aa..dec8b7b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -502,8 +502,7 @@ config SH_PCLK_FREQ
 			      CPU_SUBTYPE_SH7760 || CPU_SUBTYPE_SH7705 || \
 			      CPU_SUBTYPE_SH7203 || CPU_SUBTYPE_SH7206 || \
 			      CPU_SUBTYPE_SH7263 || CPU_SUBTYPE_MXG    || \
-			      CPU_SUBTYPE_SH7786
-	default "41666666" if CPU_SUBTYPE_SH7724
+			      CPU_SUBTYPE_SH7786 || CPU_SUBTYPE_SH7724
 	default "60000000" if CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 	default "66000000" if CPU_SUBTYPE_SH4_202
 	default "50000000"
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
index a029d0fb..96d2587 100644
--- a/arch/sh/configs/se7724_defconfig
+++ b/arch/sh/configs/se7724_defconfig
@@ -243,7 +243,7 @@ CONFIG_SH_7724_SOLUTION_ENGINE=y
 #
 CONFIG_SH_TIMER_TMU=y
 # CONFIG_SH_TIMER_CMT is not set
-CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_SH_PCLK_FREQ=33333333
 # CONFIG_NO_HZ is not set
 # CONFIG_HIGH_RES_TIMERS is not set
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
-- 
cgit v0.10.2


From dd0a3e77c825c9f5c6d2a97deb047f8d52026581 Mon Sep 17 00:00:00 2001
From: SUGIOKA Toshinobu <sugioka@itonet.co.jp>
Date: Mon, 1 Jun 2009 03:53:41 +0000
Subject: serial: sh-sci: Fix up PORT_SCI console output ordering.

Fix SCI transmission sequence in console output function.

This reorders the write sequence to match the SH-3 manual, and corrects
a console corruption bug observed on SH-3 SCI.

Signed-off-by: Toshinobu Sugioka <sugioka@itonet.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index fa4d52a..a4cf107 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -151,9 +151,8 @@ static void sci_poll_put_char(struct uart_port *port, unsigned char c)
 		status = sci_in(port, SCxSR);
 	} while (!(status & SCxSR_TDxE(port)));
 
-	sci_in(port, SCxSR);            /* Dummy read */
-	sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port) & ~SCxSR_TEND(port));
 	sci_out(port, SCxTDR, c);
+	sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port) & ~SCxSR_TEND(port));
 }
 #endif /* CONFIG_CONSOLE_POLL || CONFIG_SERIAL_SH_SCI_CONSOLE */
 
-- 
cgit v0.10.2


From d974ac24b762296eeaebb23e5eff8ed15ce44529 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:13 +0000
Subject: sh: add RAMCR definition for sh4

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/cpu-sh4/cpu/cache.h b/arch/sh/include/cpu-sh4/cpu/cache.h
index 1c61ebf..7bfb9e8 100644
--- a/arch/sh/include/cpu-sh4/cpu/cache.h
+++ b/arch/sh/include/cpu-sh4/cpu/cache.h
@@ -38,5 +38,7 @@
 #define CACHE_IC_ADDRESS_ARRAY	0xf0000000
 #define CACHE_OC_ADDRESS_ARRAY	0xf4000000
 
+#define RAMCR			0xFF000074
+
 #endif /* __ASM_CPU_SH4_CACHE_H */
 
-- 
cgit v0.10.2


From fab88d9fe98e9091aafeb9789fbf2e04fdace8ed Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:20 +0000
Subject: sh: add weak l2_cache_init function.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c
index d29e69c..ad85421 100644
--- a/arch/sh/kernel/cpu/init.c
+++ b/arch/sh/kernel/cpu/init.c
@@ -62,6 +62,11 @@ static void __init speculative_execution_init(void)
 #define speculative_execution_init()	do { } while (0)
 #endif
 
+/* 2nd-level cache init */
+void __uses_jump_to_uncached __attribute__ ((weak)) l2_cache_init(void)
+{
+}
+
 /*
  * Generic first-level cache init
  */
@@ -146,6 +151,8 @@ static void __uses_jump_to_uncached cache_init(void)
 	flags &= ~CCR_CACHE_ENABLE;
 #endif
 
+	l2_cache_init();
+
 	ctrl_outl(flags, CCR);
 	back_to_cached();
 }
-- 
cgit v0.10.2


From b4bd9eb0d8f5c20f942ff037c8a7939f69cb188a Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:25 +0000
Subject: sh: sh7724: L2 cache initialization.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 191f0e2..000f3b8 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -495,6 +495,15 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7724_early_devices));
 }
 
+#define RAMCR_CACHE_L2FC	0x0002
+#define RAMCR_CACHE_L2E		0x0001
+#define L2_CACHE_ENABLE		(RAMCR_CACHE_L2E|RAMCR_CACHE_L2FC)
+void __uses_jump_to_uncached l2_cache_init(void)
+{
+	/* Enable L2 cache */
+	ctrl_outl(L2_CACHE_ENABLE, RAMCR);
+}
+
 enum {
 	UNUSED = 0,
 
-- 
cgit v0.10.2


From 138f025267dcc07d5e7d0bb1f20e9a6b5f2fdcf7 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:28 +0000
Subject: sh: sh7723: L2 cache initialization.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 04cb4aa..d8f4a13 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -14,6 +14,7 @@
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -485,6 +486,15 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7723_early_devices));
 }
 
+#define RAMCR_CACHE_L2FC	0x0002
+#define RAMCR_CACHE_L2E		0x0001
+#define L2_CACHE_ENABLE		(RAMCR_CACHE_L2E|RAMCR_CACHE_L2FC)
+void __uses_jump_to_uncached l2_cache_init(void)
+{
+	/* Enable L2 cache */
+	ctrl_outl(L2_CACHE_ENABLE, RAMCR);
+}
+
 enum {
 	UNUSED=0,
 
-- 
cgit v0.10.2


From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 13:37:24 -0400
Subject: ftrace: add kernel command line function filtering

When using ftrace=function on the command line to trace functions
on boot up, one can not filter out functions that are commonly called.

This patch adds two new ftrace command line commands.

  ftrace_notrace=function-list
  ftrace_filter=function-list

Where function-list is a comma separated list of functions to filter.
The ftrace_notrace will make the functions listed not be included
in the function tracing, and ftrace_filter will only trace the functions
listed.

These two act the same as the debugfs/tracing/set_ftrace_notrace and
debugfs/tracing/set_ftrace_filter respectively.

The simple glob expressions that are allowed by the filter files can also
be used by the command line interface.

	ftrace_notrace=rcu*,*lock,*spin*

Will not trace any function that starts with rcu, ends with lock, or has
the word spin in it.

Note, if the self tests are enabled, they may interfere with the filtering
set by the command lines.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9243dd8..fcd3bfb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -751,12 +751,25 @@ and is between 256 and 4096 characters. It is defined in the file
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
 	ftrace=[tracer]
-			[ftrace] will set and start the specified tracer
+			[FTRACE] will set and start the specified tracer
 			as early as possible in order to facilitate early
 			boot debugging.
 
 	ftrace_dump_on_oops
-			[ftrace] will dump the trace buffers on oops.
+			[FTRACE] will dump the trace buffers on oops.
+
+	ftrace_filter=[function-list]
+			[FTRACE] Limit the functions traced by the function
+			tracer at boot up. function-list is a comma separated
+			list of functions. This list can be changed at run
+			time by the set_ftrace_filter file in the debugfs
+			tracing directory. 
+
+	ftrace_notrace=[function-list]
+			[FTRACE] Do not trace the functions specified in
+			function-list. This list can be changed at run time
+			by the set_ftrace_notrace file in the debugfs
+			tracing directory.
 
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 140699a..2074e5b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,6 +32,7 @@
 #include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
 #include "trace_output.h"
 #include "trace_stat.h"
@@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 	ftrace_set_regex(buf, len, reset, 0);
 }
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_set_regex(func, strlen(func), 0, enable);
+	}
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+	if (ftrace_filter_buf[0])
+		set_ftrace_early_filter(ftrace_filter_buf, 1);
+	if (ftrace_notrace_buf[0])
+		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
@@ -2829,6 +2869,8 @@ void __init ftrace_init(void)
 	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
+	set_ftrace_early_filters();
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
cgit v0.10.2


From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 15:50:13 -0400
Subject: tracing: fix config options to not show when automatically selected

There are two options that are selected by all tracers, but we want
to have those options available when no tracer is selected. These are

 The event tracer and sched switch tracer.

The are enabled by all tracers, but if a tracer is not selected we want
the options to appear. All tracers including them select TRACING.
Thus what we would like to do is:

  config EVENT_TRACER
	bool "prompt"
	depends on TRACING
	select TRACING

But that gives us a bug in the kbuild system since we just created a
circular dependency. We only want the prompt to show when TRACING is off.

This patch adds GENERIC_TRACER that all tracers will select instead of
TRACING. The two options (sched switch and event tracer) will select
TRACING directly and depend on !GENERIC_TRACER. This solves the cicular
dependency.

[ Impact: hide options that are selected by default ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a508b9d..6e55cc3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER
 	select MARKERS
 	bool
 
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -66,6 +73,10 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 
+config GENERIC_TRACER
+	bool
+	select TRACING
+
 #
 # Minimum requirements an architecture has to meet for us to
 # be able to offer generic tracing facilities:
@@ -95,7 +106,7 @@ config FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	select FRAME_POINTER
 	select KALLSYMS
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
@@ -126,7 +137,7 @@ config IRQSOFF_TRACER
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
 	select TRACE_IRQFLAGS
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in irqs-off critical
@@ -147,7 +158,7 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in preemption off critical
@@ -166,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
@@ -174,7 +185,7 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
 	help
@@ -183,6 +194,7 @@ config SCHED_TRACER
 
 config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
+	depends on !GENERIC_TRACER
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER
 
 config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
+	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
@@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-	select TRACING
+	select GENERIC_TRACER
 	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer helps developers to optimize boot times: it records
@@ -228,7 +241,7 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool
-	select TRACING
+	select GENERIC_TRACER
 
 choice
 	prompt "Branch Profiling"
@@ -308,7 +321,7 @@ config BRANCH_TRACER
 config POWER_TRACER
 	bool "Trace power consumption behavior"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer helps developers to analyze and optimize the kernels
 	  power management decisions, specifically the C-state and P-state
@@ -342,14 +355,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	bool "Trace hw branches"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
 config KMEMTRACE
 	bool "Trace SLAB allocations"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -369,7 +382,7 @@ config KMEMTRACE
 
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  The workqueue tracer provides some statistical informations
           about each cpu workqueue thread such as the number of the
@@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-	select TRACING
+	select GENERIC_TRACER
 	select STACKTRACE
 	help
 	  Say Y here if you want to be able to trace the block layer actions
@@ -446,7 +459,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING
+	depends on GENERIC_TRACER
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
-- 
cgit v0.10.2


From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 16:31:21 -0400
Subject: tracing: combine the default tracers into one config

Both event tracer and sched switch plugin are selected by default
by all generic tracers. But if no generic tracer is enabled, their options
appear. But ether one of them will select the other, thus it only
makes sense to have the default tracers be selected by one option.

[ Impact: clean up kconfig menu ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6e55cc3..4a13e5a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -192,27 +192,14 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config ENABLE_CONTEXT_SWITCH_TRACER
-	bool "Trace process context switches"
-	depends on !GENERIC_TRACER
-	select TRACING
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer gets called from the context switch and records
-	  all switching of tasks.
-
-config ENABLE_EVENT_TRACING
-	bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+	bool "Trace process context switches and events"
 	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
 	  allowing the user to pick and choose which trace point they
-	  want to trace.
-
-	  Note, all tracers enable event tracing. This option is
-	  only a convenience to enable event tracing when no other
-	  tracers are selected.
+	  want to trace. It also includes the sched_switch tracer plugin.
 
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
-- 
cgit v0.10.2


From 6e25db44a7ad7eb380f4ec774ec00a8fcddea112 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 29 May 2009 11:24:59 +0800
Subject: tracing/events: fix a typo in __string() format output

"tsize" should be "\tsize". Also remove the space before "__str_loc".

Before:
 # cat tracing/events/irq/irq_handler_entry/format
        ...
        field:int irq;  offset:12;      size:4;
        field: __str_loc name;  offset:16;tsize:2;
        ...

After:
 # cat tracing/events/irq/irq_handler_entry/format
	...
        field:int irq;  offset:12;      size:4;
        field:__str_loc name;   offset:16;      size:2;
	...

[ Impact: standardize __string field description in events format file ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b4ec83a..9276ec4 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -209,8 +209,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 
 #undef __string
 #define __string(item, src)						       \
-	ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t"	       \
-			       "offset:%u;tsize:%u;\n",			       \
+	ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t"	       \
+			       "offset:%u;\tsize:%u;\n",		       \
 			       (unsigned int)offsetof(typeof(field),	       \
 					__str_loc_##item),		       \
 			       (unsigned int)sizeof(field.__str_loc_##item));  \
-- 
cgit v0.10.2


From a9c1c3abe1160a5632e48c929b02b740556bf423 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 1 Jun 2009 15:35:13 +0800
Subject: tracing/events: put TP_fast_assign into braces

Currently TP_fast_assign has a limitation that we can't define local
variables in it.

Here's one use case when we introduce __dynamic_array():

TP_fast_assign(
	type *p = __get_dynamic_array(item);

	foo(p);
	bar(p);
),

[ Impact: allow defining local variables in TP_fast_assign ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2384B1.90100@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 9276ec4..ee92682 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -471,7 +471,7 @@ static void ftrace_raw_event_##call(proto)				\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-	assign;								\
+	{ assign; }							\
 									\
 	if (!filter_current_check_discard(event_call, entry, event))	\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-- 
cgit v0.10.2


From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 1 Jun 2009 15:35:46 +0800
Subject: tracing/events: introduce __dynamic_array()

__string() is limited:

  - it's a char array, but we may want to define array with other types
  - a source string should be available, but we may just know the string size

We introduce __dynamic_array() to break those limitations, and __string()
becomes a wrapper of it. As a side effect, now __get_str() can be used
in TP_fast_assign but not only TP_print.

Take XFS for example, we have the string length in the dirent, but the
string itself is not NULL-terminated, so __dynamic_array() can be used:

TRACE_EVENT(xfs_dir2,
	TP_PROTO(struct xfs_da_args *args),
	TP_ARGS(args),

	TP_STRUCT__entry(
		__field(int, namelen)
		__dynamic_array(char, name, args->namelen + 1)
		...
	),

	TP_fast_assign(
		char *name = __get_str(name);

		if (args->namelen)
			memcpy(name, args->name, args->namelen);
		name[args->namelen] = '\0';

		__entry->namelen = args->namelen;
	),

	TP_printk("name %.*s namelen %d",
		  __entry->namelen ? __get_str(name) : NULL
		  __entry->namelen)
);

[ Impact: allow defining dynamic size arrays ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ee92682..b5478da 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,14 +18,17 @@
 
 #include <linux/ftrace_event.h>
 
+#undef __field
+#define __field(type, item)		type	item;
+
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
-#undef __field
-#define __field(type, item)		type	item;
+#undef __dynamic_array
+#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
 
 #undef __string
-#define __string(item, src)		unsigned short	__str_loc_##item;
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
@@ -35,7 +38,7 @@
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
-		char			__str_data[0];		\
+		char			__data[0];		\
 	};							\
 	static struct ftrace_event_call event_##name
 
@@ -47,30 +50,31 @@
  *
  * Include the following:
  *
- * struct ftrace_str_offsets_<call> {
- *	int				<str1>;
- *	int				<str2>;
+ * struct ftrace_data_offsets_<call> {
+ *	int				<item1>;
+ *	int				<item2>;
  *	[...]
  * };
  *
- * The __string() macro will create each int <str>, this is to
- * keep the offset of each string from the beggining of the event
- * once we perform the strlen() of the src strings.
- *
+ * The __dynamic_array() macro will create each int <item>, this is
+ * to keep the offset of each array from the beginning of the event.
  */
 
+#undef __field
+#define __field(type, item);
+
 #undef __array
 #define __array(type, item, len)
 
-#undef __field
-#define __field(type, item);
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)	int item;
 
 #undef __string
-#define __string(item, src)	int item;
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-	struct ftrace_str_offsets_##call {				\
+	struct ftrace_data_offsets_##call {				\
 		tstruct;						\
 	};
 
@@ -119,8 +123,12 @@
 #undef TP_printk
 #define TP_printk(fmt, args...) fmt "\n", args
 
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)	\
+		((void *)__entry + __entry->__data_loc_##field)
+
 #undef __get_str
-#define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
+#define __get_str(field) (char *)__get_dynamic_array(field)
 
 #undef __print_flags
 #define __print_flags(flag, delim, flag_array...)			\
@@ -207,16 +215,19 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	if (!ret)							\
 		return 0;
 
-#undef __string
-#define __string(item, src)						       \
-	ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t"	       \
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
 			       "offset:%u;\tsize:%u;\n",		       \
 			       (unsigned int)offsetof(typeof(field),	       \
-					__str_loc_##item),		       \
-			       (unsigned int)sizeof(field.__str_loc_##item));  \
+					__data_loc_##item),		       \
+			       (unsigned int)sizeof(field.__data_loc_##item)); \
 	if (!ret)							       \
 		return 0;
 
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
 #undef __entry
 #define __entry REC
 
@@ -260,11 +271,14 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+				offsetof(typeof(field), __data_loc_##item),    \
+				 sizeof(field.__data_loc_##item), 0);
+
 #undef __string
-#define __string(item, src)						       \
-	ret = trace_define_field(event_call, "__str_loc", #item,	       \
-				offsetof(typeof(field), __str_loc_##item),     \
-				 sizeof(field.__str_loc_##item), 0);
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
@@ -289,6 +303,43 @@ ftrace_define_fields_##call(void)					\
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
+ * remember the offset of each array from the beginning of the event.
+ */
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__data_offsets->item = __data_size +				\
+			       offsetof(typeof(*entry), __data);	\
+	__data_size += (len) * sizeof(type);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static inline int ftrace_get_offsets_##call(				\
+	struct ftrace_data_offsets_##call *__data_offsets, proto)       \
+{									\
+	int __data_size = 0;						\
+	struct ftrace_raw_##call __maybe_unused *entry;			\
+									\
+	tstruct;							\
+									\
+	return __data_size;						\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
  * Stage 4 of the trace events.
  *
  * Override the macros in <trace/trace_events.h> to include the following:
@@ -432,15 +483,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 #undef __array
 #define __array(type, item, len)
 
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__entry->__data_loc_##item = __data_offsets.item;
+
 #undef __string
-#define __string(item, src)						\
-	__str_offsets.item = __str_size +				\
-			     offsetof(typeof(*entry), __str_data);	\
-	__str_size += strlen(src) + 1;
+#define __string(item, src) __dynamic_array(char, item, -1)       	\
 
 #undef __assign_str
 #define __assign_str(dst, src)						\
-	__entry->__str_loc_##dst = __str_offsets.dst;			\
 	strcpy(__get_str(dst), src);
 
 #undef TRACE_EVENT
@@ -451,26 +502,29 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
-	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;	\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
-	int __str_size = 0;						\
+	int __data_size;						\
 	int pc;								\
 									\
 	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
-	tstruct;							\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				 sizeof(struct ftrace_raw_##call) + __str_size,\
+				 sizeof(*entry) + __data_size,		\
 				 irq_flags, pc);			\
 	if (!event)							\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
+									\
+	tstruct								\
+									\
 	{ assign; }							\
 									\
 	if (!filter_current_check_discard(event_call, entry, event))	\
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a7430b1..db6e54b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -478,12 +478,12 @@ enum {
 
 static int is_string_field(const char *type)
 {
+	if (strstr(type, "__data_loc") && strstr(type, "char"))
+		return FILTER_DYN_STRING;
+
 	if (strchr(type, '[') && strstr(type, "char"))
 		return FILTER_STATIC_STRING;
 
-	if (!strcmp(type, "__str_loc"))
-		return FILTER_DYN_STRING;
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 1 Jun 2009 15:53:35 +0100
Subject: tracing: add exports to use __print_symbolic and __print_flags from a
 module

A patch to allow the use of __print_symbolic and __print_flags
from a module. This allows the current GFS2 tracing patch to
build.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c12d95d..0fe3b22 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -17,6 +17,7 @@
 static DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
@@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_flags_seq);
 
 const char *
 ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
@@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
-- 
cgit v0.10.2


From 1d080d6c3141623c92caaebe20e847cb99ccbb60 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 12:20:40 -0400
Subject: tracing: remove redundant SOFTIRQ from softirq event traces

After converting the softirq tracer to use te flags options, this
caused a regression with the name. Since the flag was used directly
it was printed out (i.e. HRTIMER_SOFTIRQ).

This patch only shows the softirq name without the SOFTIRQ part.

[ Impact: fix regression of output from softirq events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 683fb36..b0c7ede 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,18 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
-#define softirq_name(sirq) { sirq, #sirq }
-#define show_softirq_name(val)						\
-	__print_symbolic(val,						\
-			 softirq_name(HI_SOFTIRQ),			\
-			 softirq_name(TIMER_SOFTIRQ),			\
-			 softirq_name(NET_TX_SOFTIRQ),			\
-			 softirq_name(NET_RX_SOFTIRQ),			\
-			 softirq_name(BLOCK_SOFTIRQ),			\
-			 softirq_name(TASKLET_SOFTIRQ),			\
-			 softirq_name(SCHED_SOFTIRQ),			\
-			 softirq_name(HRTIMER_SOFTIRQ),			\
-			 softirq_name(RCU_SOFTIRQ))
+#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
+#define show_softirq_name(val)			\
+	__print_symbolic(val,			\
+			 softirq_name(HI),	\
+			 softirq_name(TIMER),	\
+			 softirq_name(NET_TX),	\
+			 softirq_name(NET_RX),	\
+			 softirq_name(BLOCK),	\
+			 softirq_name(TASKLET),	\
+			 softirq_name(SCHED),	\
+			 softirq_name(HRTIMER),	\
+			 softirq_name(RCU))
 
 /**
  * irq_handler_entry - called immediately before the irq action handler
-- 
cgit v0.10.2


From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 15:16:05 -0400
Subject: tracing: make trace pipe recognize latency format flag

The trace_pipe did not recognize the latency format flag and would produce
different output than the trace file. The problem was partly due that
the trace flags in the iterator was not set as well as the trace_pipe
zeros out part of the iterator (including the flags) to be able to use
the same routines as the trace file. trace_flags of the iterator should
not cause any problems when not zeroed out by for trace_pipe.

Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bbf40f6..5c093ff 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -51,6 +51,7 @@ struct trace_iterator {
 	int			cpu_file;
 	struct mutex		mutex;
 	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+	unsigned long		iter_flags;
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
@@ -58,7 +59,6 @@ struct trace_iterator {
 	int			cpu;
 	u64			ts;
 
-	unsigned long		iter_flags;
 	loff_t			pos;
 	long			idx;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3a8a87..cae34c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	mutex_init(&iter->mutex);
-- 
cgit v0.10.2


From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 21:51:28 -0400
Subject: ftrace: do not profile functions when disabled

A race was found that if one were to enable and disable the function
profiler repeatedly, then the system can panic. This was because a profiled
function may be preempted just before disabling interrupts. While
the profiler is disabled and then reenabled, the preempted function
could start again, and access the hash as it is being initialized.

This just adds a check in the irq disabled part to check if the profiler
is enabled, and if it is not then it will just exit.

When the system is disabled, the profile_enabled variable is cleared
before calling the unregistering of the function profiler. This
unregistering calls stop machine which also acts as a synchronize schedule.

[ Impact: fix panic in enabling/disabling function profiler ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2074e5b..d6973df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	rec = ftrace_find_profiled_func(stat, ip);
@@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	calltime = trace->rettime - trace->calltime;
@@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
+			/*
+			 * unregister_ftrace_profiler calls stop_machine
+			 * so this acts like an synchronize_sched.
+			 */
 			unregister_ftrace_profiler();
 		}
 	}
-- 
cgit v0.10.2


From 1f23920dbf1377fa9e4aef4f3d20c34a06a71a35 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Thu, 7 May 2009 19:49:45 -0500
Subject: xfs: fix double unlock in xfs_swap_extents()

Regreesion from commit ef8f7fc, which rearranged the code in
xfs_swap_extents() leading to double unlock of xfs inode ilock.
That resulted in xfs_fsr deadlocking itself on platforms, which
don't handle double unlock of rw_semaphore nicely. It caused the
count go negative, which represents the write holder, without
really having one. ia64 is one of the platforms where deadlock
was easily reproduced and the fix was tested.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e6d839b..7465f9e 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -347,13 +347,15 @@ xfs_swap_extents(
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 
-out_unlock:
-	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 out:
 	kmem_free(tempifp);
 	return error;
 
+out_unlock:
+	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	goto out;
+
 out_trans_cancel:
 	xfs_trans_cancel(tp, 0);
 	goto out_unlock;
-- 
cgit v0.10.2


From e6da7c9fed111ba1243297ee6eda8e24ae11c384 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sat, 23 May 2009 14:30:12 -0500
Subject: xfs: fix overflow in xfs_growfs_data_private

In the case where growing a filesystem would leave the last AG
too small, the fixup code has an overflow in the calculation
of the new size with one fewer ag, because "nagcount" is a 32
bit number.  If the new filesystem has > 2^32 blocks in it
this causes a problem resulting in an EINVAL return from growfs:

 # xfs_io -f -c "truncate 19998630180864" fsfile
 # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile
 # mount -o loop fsfile /mnt
 # xfs_growfs /mnt

meta-data=/dev/loop0             isize=256    agcount=52,
agsize=76288719 blks
         =                       sectsz=512   attr=2
data     =                       bsize=4096   blocks=3905982455, imaxpct=5
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0
log      =internal               bsize=4096   blocks=32768, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=0
realtime =none                   extsz=4096   blocks=0, rtextents=0
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument

Reported-by: richard.ems@cape-horn-eng.com
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 8379e3b..cbd451b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -160,7 +160,7 @@ xfs_growfs_data_private(
 	nagcount = new + (nb_mod != 0);
 	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
 		nagcount--;
-		nb = nagcount * mp->m_sb.sb_agblocks;
+		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
 		if (nb < mp->m_sb.sb_dblocks)
 			return XFS_ERROR(EINVAL);
 	}
-- 
cgit v0.10.2


From 1b17d766463d51904cb242f194a780737e5f73ef Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Mon, 1 Jun 2009 13:13:24 -0500
Subject: xfs: prevent deadlock in xfs_qm_shake()

It's possible to recurse into filesystem from the memory
allocation, which deadlocks in xfs_qm_shake(). Add check
for __GFP_FS, and bail out if it is not set.

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Hedi Berriche <hedi@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index af6843c..179cbd6 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
 static inline int
 kmem_shake_allow(gfp_t gfp_mask)
 {
-	return (gfp_mask & __GFP_WAIT) != 0;
+	return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
 }
 
 #endif /* __XFS_SUPPORT_KMEM_H__ */
-- 
cgit v0.10.2


From fbeb4a9c20d00e2550156f9e5a34473fbde59de2 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 1 Jun 2009 22:47:19 -0500
Subject: tomoyo: avoid get+put of task_struct

Use task_cred_xxx(task, security) in tomoyo_real_domain() to
avoid a get+put of the target cred.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/tomoyo.h b/security/tomoyo/tomoyo.h
index 41c6eba..f12d5ad 100644
--- a/security/tomoyo/tomoyo.h
+++ b/security/tomoyo/tomoyo.h
@@ -90,17 +90,10 @@ static inline struct tomoyo_domain_info *tomoyo_domain(void)
 	return current_cred()->security;
 }
 
-/* Caller holds tasklist_lock spinlock. */
 static inline struct tomoyo_domain_info *tomoyo_real_domain(struct task_struct
 							    *task)
 {
-	/***** CRITICAL SECTION START *****/
-	const struct cred *cred = get_task_cred(task);
-	struct tomoyo_domain_info *domain = cred->security;
-
-	put_cred(cred);
-	return domain;
-	/***** CRITICAL SECTION END *****/
+	return task_cred_xxx(task, security);
 }
 
 #endif /* !defined(_SECURITY_TOMOYO_TOMOYO_H) */
-- 
cgit v0.10.2


From 53c663ce0f39ba8e8ef652e400b317bc60ac7f19 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Tue, 2 Jun 2009 08:44:01 +0200
Subject: block: fix a possible oops on elv_abort_queue()

I found one more mis-conversion to the 'request is always dequeued
when completing' model in elv_abort_queue() during code inspection.
Although I haven't hit any problem caused by this mis-conversion yet
and just done compile/boot test, please apply if you have no problem.

Request must be dequeued when it completes.
However, elv_abort_queue() completes requests without dequeueing.
This will cause oops in the __blk_end_request_all().
This patch fixes the oops.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/elevator.c b/block/elevator.c
index bd78f69..a029cfe 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -813,6 +813,11 @@ void elv_abort_queue(struct request_queue *q)
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
 		trace_block_rq_abort(q, rq);
+		/*
+		 * Mark this request as started so we don't trigger
+		 * any debug logic in the end I/O path.
+		 */
+		blk_start_request(rq);
 		__blk_end_request_all(rq, -EIO);
 	}
 }
-- 
cgit v0.10.2


From a12af1ebe675e85831fde3c4d0908fc3b0908b7a Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Mon, 1 Jun 2009 12:30:03 -0500
Subject: GFS2: smbd proccess hangs with flock() call.

GFS2 currently does not support mandatory flocks. An flock() call with
LOCK_MAND triggers unexpected behavior because gfs2 is not checking for
this lock type. This patch corrects that.

Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73b6f55..841ddc9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -698,8 +698,8 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
-		return -ENOLCK;
+	if (fl->fl_type & LOCK_MAND)
+		return -EOPNOTSUPP;
 
 	if (fl->fl_type == F_UNLCK) {
 		do_unflock(file, fl);
-- 
cgit v0.10.2


From 5a9a8e32ebe269c71d8d3e78f9435fe7729f38e9 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@aristanetworks.com>
Date: Tue, 2 Jun 2009 00:19:52 -0700
Subject: forcedeth: add phy_power_down parameter, leave phy powered up by
 default (v2)

Add a phy_power_down parameter to forcedeth: set to 1 to power down the
phy and disable the link when an interface goes down; set to 0 to always
leave the phy powered up.

The phy power state persists across reboots; Windows, some BIOSes, and
older versions of Linux don't bother to power up the phy again, forcing
users to remove all power to get the interface working (see
http://bugzilla.kernel.org/show_bug.cgi?id=13072).  Leaving the phy
powered on is the safest default behavior.  Users accustomed to seeing
the link state reflect the interface state and/or wanting to minimize
power consumption can set phy_power_down=1 if compatibility with other
OSes is not an issue.

Signed-off-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index f9a846b..9f6a68fb 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -897,6 +897,12 @@ enum {
 };
 static int phy_cross = NV_CROSSOVER_DETECTION_DISABLED;
 
+/*
+ * Power down phy when interface is down (persists through reboot;
+ * older Linux and other OSes may not power it up again)
+ */
+static int phy_power_down = 0;
+
 static inline struct fe_priv *get_nvpriv(struct net_device *dev)
 {
 	return netdev_priv(dev);
@@ -1485,7 +1491,10 @@ static int phy_init(struct net_device *dev)
 
 	/* restart auto negotiation, power down phy */
 	mii_control = mii_rw(dev, np->phyaddr, MII_BMCR, MII_READ);
-	mii_control |= (BMCR_ANRESTART | BMCR_ANENABLE | BMCR_PDOWN);
+	mii_control |= (BMCR_ANRESTART | BMCR_ANENABLE);
+	if (phy_power_down) {
+		mii_control |= BMCR_PDOWN;
+	}
 	if (mii_rw(dev, np->phyaddr, MII_BMCR, mii_control)) {
 		return PHY_ERROR;
 	}
@@ -5513,7 +5522,7 @@ static int nv_close(struct net_device *dev)
 
 	nv_drain_rxtx(dev);
 
-	if (np->wolenabled) {
+	if (np->wolenabled || !phy_power_down) {
 		writel(NVREG_PFF_ALWAYS|NVREG_PFF_MYADDR, base + NvRegPacketFilterFlags);
 		nv_start_rx(dev);
 	} else {
@@ -6367,6 +6376,8 @@ module_param(dma_64bit, int, 0);
 MODULE_PARM_DESC(dma_64bit, "High DMA is enabled by setting to 1 and disabled by setting to 0.");
 module_param(phy_cross, int, 0);
 MODULE_PARM_DESC(phy_cross, "Phy crossover detection for Realtek 8201 phy is enabled by setting to 1 and disabled by setting to 0.");
+module_param(phy_power_down, int, 0);
+MODULE_PARM_DESC(phy_power_down, "Power down phy and disable link when interface is down (1), or leave phy powered up (0).");
 
 MODULE_AUTHOR("Manfred Spraul <manfred@colorfullife.com>");
 MODULE_DESCRIPTION("Reverse Engineered nForce ethernet driver");
-- 
cgit v0.10.2


From c25486c5ea3ea5586f71285a3aa5cfafb1db59f9 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 08:09:48 +0200
Subject: perf_counter tools: Make .gitignore reflect perf_counter tools files

Make .gitignore reflect perf_counter tools files so
git status doesn't gripe about untracked files.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
index 41c0b20..d69a759 100644
--- a/Documentation/perf_counter/.gitignore
+++ b/Documentation/perf_counter/.gitignore
@@ -1,179 +1,16 @@
-GIT-BUILD-OPTIONS
-GIT-CFLAGS
-GIT-GUI-VARS
-GIT-VERSION-FILE
-git
-git-add
-git-add--interactive
-git-am
-git-annotate
-git-apply
-git-archimport
-git-archive
-git-bisect
-git-bisect--helper
-git-blame
-git-branch
-git-bundle
-git-cat-file
-git-check-attr
-git-check-ref-format
-git-checkout
-git-checkout-index
-git-cherry
-git-cherry-pick
-git-clean
-git-clone
-git-commit
-git-commit-tree
-git-config
-git-count-objects
-git-cvsexportcommit
-git-cvsimport
-git-cvsserver
-git-daemon
-git-diff
-git-diff-files
-git-diff-index
-git-diff-tree
-git-difftool
-git-difftool--helper
-git-describe
-git-fast-export
-git-fast-import
-git-fetch
-git-fetch--tool
-git-fetch-pack
-git-filter-branch
-git-fmt-merge-msg
-git-for-each-ref
-git-format-patch
-git-fsck
-git-fsck-objects
-git-gc
-git-get-tar-commit-id
-git-grep
-git-hash-object
-git-help
-git-http-fetch
-git-http-push
-git-imap-send
-git-index-pack
-git-init
-git-init-db
-git-instaweb
-git-log
-git-lost-found
-git-ls-files
-git-ls-remote
-git-ls-tree
-git-mailinfo
-git-mailsplit
-git-merge
-git-merge-base
-git-merge-index
-git-merge-file
-git-merge-tree
-git-merge-octopus
-git-merge-one-file
-git-merge-ours
-git-merge-recursive
-git-merge-resolve
-git-merge-subtree
-git-mergetool
-git-mergetool--lib
-git-mktag
-git-mktree
-git-name-rev
-git-mv
-git-pack-redundant
-git-pack-objects
-git-pack-refs
-git-parse-remote
-git-patch-id
-git-peek-remote
-git-prune
-git-prune-packed
-git-pull
-git-push
-git-quiltimport
-git-read-tree
-git-rebase
-git-rebase--interactive
-git-receive-pack
-git-reflog
-git-relink
-git-remote
-git-repack
-git-repo-config
-git-request-pull
-git-rerere
-git-reset
-git-rev-list
-git-rev-parse
-git-revert
-git-rm
-git-send-email
-git-send-pack
-git-sh-setup
-git-shell
-git-shortlog
-git-show
-git-show-branch
-git-show-index
-git-show-ref
-git-stage
-git-stash
-git-status
-git-stripspace
-git-submodule
-git-svn
-git-symbolic-ref
-git-tag
-git-tar-tree
-git-unpack-file
-git-unpack-objects
-git-update-index
-git-update-ref
-git-update-server-info
-git-upload-archive
-git-upload-pack
-git-var
-git-verify-pack
-git-verify-tag
-git-web--browse
-git-whatchanged
-git-write-tree
-git-core-*/?*
-gitk-wish
-gitweb/gitweb.cgi
-test-chmtime
-test-ctype
-test-date
-test-delta
-test-dump-cache-tree
-test-genrandom
-test-match-trees
-test-parse-options
-test-path-utils
-test-sha1
-test-sigchain
+PERF-BUILD-OPTIONS
+PERF-CFLAGS
+PERF-GUI-VARS
+PERF-VERSION-FILE
+perf
+perf-help
+perf-record
+perf-report
+perf-stat
+perf-top
+perf*.1
+perf*.xml
 common-cmds.h
-*.tar.gz
-*.dsc
-*.deb
-git.spec
-*.exe
-*.[aos]
-*.py[co]
-config.mak
-autom4te.cache
-config.cache
-config.log
-config.status
-config.mak.autogen
-config.mak.append
-configure
 tags
 TAGS
 cscope*
-- 
cgit v0.10.2


From ea30e11970a96cfe5e32c03a29332554573b4a10 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 2 Jun 2009 01:29:58 -0700
Subject: e1000: add missing length check to e1000 receive routine

	Patch to fix bad length checking in e1000.  E1000 by default does two
things:

1) Spans rx descriptors for packets that don't fit into 1 skb on recieve
2) Strips the crc from a frame by subtracting 4 bytes from the length prior to
doing an skb_put

Since the e1000 driver isn't written to support receiving packets that span
multiple rx buffers, it checks the End of Packet bit of every frame, and
discards it if its not set.  This places us in a situation where, if we have a
spanning packet, the first part is discarded, but the second part is not (since
it is the end of packet, and it passes the EOP bit test).  If the second part of
the frame is small (4 bytes or less), we subtract 4 from it to remove its crc,
underflow the length, and wind up in skb_over_panic, when we try to skb_put a
huge number of bytes into the skb.  This amounts to a remote DOS attack through
careful selection of frame size in relation to interface MTU.  The fix for this
is already in the e1000e driver, as well as the e1000 sourceforge driver, but no
one ever pushed it to e1000.  This is lifted straight from e1000e, and prevents
small frames from causing the underflow described above

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Tested-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index b1419e2..fffb006 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -4027,8 +4027,9 @@ static bool e1000_clean_rx_irq(struct e1000_adapter *adapter,
 		                 PCI_DMA_FROMDEVICE);
 
 		length = le16_to_cpu(rx_desc->length);
-
-		if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
+		/* !EOP means multiple descriptors were used to store a single
+		 * packet, also make sure the frame isn't just CRC only */
+		if (unlikely(!(status & E1000_RXD_STAT_EOP) || (length <= 4))) {
 			/* All receives must fit into a single buffer */
 			E1000_DBG("%s: Receive packet consumed multiple"
 				  " buffers\n", netdev->name);
-- 
cgit v0.10.2


From c1079abd1d5e66edabeb04d75e48e604cd8c167f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 10:17:34 +0200
Subject: perf_counter tools: Cleanup Makefile

We currently build perf-stat/record etc, only to do nothing
with them.  We also install the perf binary in two places,
$prefix/bin and $perfexec_instdir, which appears to be for
binaries which perf would exec were a command not linked in.
Correct this, and comment out broken/incomplete targets dist
and coverage.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 3b8275f..eae8856 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -260,8 +260,6 @@ PROGRAMS += perf
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
-BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
-
 #
 # None right now:
 #
@@ -791,12 +789,14 @@ export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+ifdef BUILT_INS
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
+endif
 
 install-doc:
 	$(MAKE) -C Documentation install
@@ -824,52 +824,55 @@ quick-install-html:
 
 
 ### Maintainer's dist rules
-
-perf.spec: perf.spec.in
-	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
-	mv $@+ $@
-
-PERF_TARNAME=perf-$(PERF_VERSION)
-dist: perf.spec perf-archive$(X) configure
-	./perf-archive --format=tar \
-		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
-	@mkdir -p $(PERF_TARNAME)
-	@cp perf.spec configure $(PERF_TARNAME)
-	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-	$(TAR) rf $(PERF_TARNAME).tar \
-		$(PERF_TARNAME)/perf.spec \
-		$(PERF_TARNAME)/configure \
-		$(PERF_TARNAME)/version
-	@$(RM) -r $(PERF_TARNAME)
-	gzip -f -9 $(PERF_TARNAME).tar
-
-htmldocs = perf-htmldocs-$(PERF_VERSION)
-manpages = perf-manpages-$(PERF_VERSION)
-dist-doc:
-	$(RM) -r .doc-tmp-dir
-	mkdir .doc-tmp-dir
-	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-	gzip -n -9 -f $(htmldocs).tar
-	:
-	$(RM) -r .doc-tmp-dir
-	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-	$(MAKE) -C Documentation DESTDIR=./ \
-		man1dir=../.doc-tmp-dir/man1 \
-		man5dir=../.doc-tmp-dir/man5 \
-		man7dir=../.doc-tmp-dir/man7 \
-		install
-	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-	gzip -n -9 -f $(manpages).tar
-	$(RM) -r .doc-tmp-dir
-
-rpm: dist
-	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
+#
+# None right now
+#
+#
+# perf.spec: perf.spec.in
+#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
+#	mv $@+ $@
+#
+# PERF_TARNAME=perf-$(PERF_VERSION)
+# dist: perf.spec perf-archive$(X) configure
+#	./perf-archive --format=tar \
+#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+#	@mkdir -p $(PERF_TARNAME)
+#	@cp perf.spec configure $(PERF_TARNAME)
+#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+#	$(TAR) rf $(PERF_TARNAME).tar \
+#		$(PERF_TARNAME)/perf.spec \
+#		$(PERF_TARNAME)/configure \
+#		$(PERF_TARNAME)/version
+#	@$(RM) -r $(PERF_TARNAME)
+#	gzip -f -9 $(PERF_TARNAME).tar
+#
+# htmldocs = perf-htmldocs-$(PERF_VERSION)
+# manpages = perf-manpages-$(PERF_VERSION)
+# dist-doc:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir .doc-tmp-dir
+#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+#	gzip -n -9 -f $(htmldocs).tar
+#	:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+#	$(MAKE) -C Documentation DESTDIR=./ \
+#		man1dir=../.doc-tmp-dir/man1 \
+#		man5dir=../.doc-tmp-dir/man5 \
+#		man7dir=../.doc-tmp-dir/man7 \
+#		install
+#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+#	gzip -n -9 -f $(manpages).tar
+#	$(RM) -r .doc-tmp-dir
+#
+# rpm: dist
+#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
 ### Cleaning rules
 
 distclean: clean
-	$(RM) configure
+#	$(RM) configure
 
 clean:
 	$(RM) *.o */*.o $(LIB_FILE)
@@ -896,25 +899,27 @@ check-builtins::
 
 ### Test suite coverage testing
 #
-.PHONY: coverage coverage-clean coverage-build coverage-report
-
-coverage:
-	$(MAKE) coverage-build
-	$(MAKE) coverage-report
-
-coverage-clean:
-	rm -f *.gcda *.gcno
-
-COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
-
-coverage-build: coverage-clean
-	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-		-j1 test
-
-coverage-report:
-	gcov -b *.c */*.c
-	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-		| tee coverage-untested-functions
+# None right now
+#
+# .PHONY: coverage coverage-clean coverage-build coverage-report
+#
+# coverage:
+#	$(MAKE) coverage-build
+#	$(MAKE) coverage-report
+#
+# coverage-clean:
+#	rm -f *.gcda *.gcno
+#
+# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+#
+# coverage-build: coverage-clean
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+#		-j1 test
+#
+# coverage-report:
+#	gcov -b *.c */*.c
+#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
+#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+#		| tee coverage-untested-functions
-- 
cgit v0.10.2


From 10a2825514a988225ac2e336c7a9502c4ca57c39 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 11:04:44 +0200
Subject: perf_counter tools: Fix uninitialized variable in perf-report.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# make prefix=/usr/local V=1
gcc -o builtin-report.o -c -O2 -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement   -DSHA1_HEADER='<openssl/sha.h>'  builtin-report.c
cc1: warnings being treated as errors
builtin-report.c: In function ‘__cmd_report’:
builtin-report.c:626: error: ‘cwdlen’ may be used uninitialized in this function

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7973092..20a4e51 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -653,8 +653,10 @@ static int __cmd_report(void)
 			return EXIT_FAILURE;
 		}
 		cwdlen = strlen(cwd);
-	} else
+	} else {
 		cwdp = NULL;
+		cwdlen = 0;
+	}
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
-- 
cgit v0.10.2


From 12186be7d2e1106cede1cc728526e3d7998cbe94 Mon Sep 17 00:00:00 2001
From: Minoru Usui <usui@mxm.nes.nec.co.jp>
Date: Tue, 2 Jun 2009 02:17:34 -0700
Subject: net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid
 kernel panic when we use cls_cgroup

This patch fixes a bug which unconfigured struct tcf_proto keeps
chaining in tc_ctl_tfilter(), and avoids kernel panic in
cls_cgroup_classify() when we use cls_cgroup.

When we execute 'tc filter add', tcf_proto is allocated, initialized
by classifier's init(), and chained.  After it's chained,
tc_ctl_tfilter() calls classifier's change().  When classifier's
change() fails, tc_ctl_tfilter() does not free and keeps tcf_proto.

In addition, cls_cgroup is initialized in change() not in init().  It
accesses unconfigured struct tcf_proto which is chained before
change(), then hits Oops.

Signed-off-by: Minoru Usui <usui@mxm.nes.nec.co.jp>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Tested-by: Minoru Usui <usui@mxm.nes.nec.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0759f32..09cdcdf 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -135,6 +135,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	unsigned long cl;
 	unsigned long fh;
 	int err;
+	int tp_created = 0;
 
 	if (net != &init_net)
 		return -EINVAL;
@@ -266,10 +267,7 @@ replay:
 			goto errout;
 		}
 
-		spin_lock_bh(root_lock);
-		tp->next = *back;
-		*back = tp;
-		spin_unlock_bh(root_lock);
+		tp_created = 1;
 
 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
 		goto errout;
@@ -296,8 +294,11 @@ replay:
 		switch (n->nlmsg_type) {
 		case RTM_NEWTFILTER:
 			err = -EEXIST;
-			if (n->nlmsg_flags & NLM_F_EXCL)
+			if (n->nlmsg_flags & NLM_F_EXCL) {
+				if (tp_created)
+					tcf_destroy(tp);
 				goto errout;
+			}
 			break;
 		case RTM_DELTFILTER:
 			err = tp->ops->delete(tp, fh);
@@ -314,8 +315,18 @@ replay:
 	}
 
 	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
-	if (err == 0)
+	if (err == 0) {
+		if (tp_created) {
+			spin_lock_bh(root_lock);
+			tp->next = *back;
+			*back = tp;
+			spin_unlock_bh(root_lock);
+		}
 		tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
+	} else {
+		if (tp_created)
+			tcf_destroy(tp);
+	}
 
 errout:
 	if (cl)
-- 
cgit v0.10.2


From f38b082081bf69a06fffb8b32a175999e2320c5b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 2 Jun 2009 21:05:16 +1000
Subject: perf_counter: Initialize per-cpu context earlier on cpu up

This arranges for perf_counter's notifier for cpu hotplug
operations to be called earlier than the migration notifier in
sched.c by increasing its priority to 20, compared to the 10
for the migration notifier.  The reason for doing this is that
a subsequent commit to convert the cpu migration counter to use
the generic swcounter infrastructure will add a call into the
perf_counter subsystem when tasks get migrated.  Therefore the
perf_counter subsystem needs a chance to initialize its per-cpu
data for the new cpu before it can get called from the
migration code.

This also adds a comment to the migration notifier noting that
its priority needs to be lower than that of the perf_counter
notifier.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18981.1900.792795.836858@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index df319c4..8d2653f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3902,8 +3902,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
 static struct notifier_block __cpuinitdata perf_cpu_nb = {
 	.notifier_call		= perf_cpu_notify,
+	.priority		= 20,
 };
 
 void __init perf_counter_init(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index ad079f0..3226cc1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7319,8 +7319,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
-- 
cgit v0.10.2


From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:52:30 +1000
Subject: perf_counter: Fix cpu migration counter

This fixes the cpu migration software counter to count
correctly even when contexts get swapped from one task to
another.  Previously the cpu migration counts reported by perf
stat were bogus, ranging from negative to several thousand for
a single "lat_ctx 2 8 32" run.  With this patch the cpu
migration count reported for "lat_ctx 2 8 32" is almost always
between 35 and 44.

This fixes the problem by adding a call into the perf_counter
code from set_task_cpu when tasks are migrated.  This enables
us to use the generic swcounter code (with some modifications)
for the cpu migration counter.

This modifies the swcounter code to allow a NULL regs pointer
to be passed in to perf_swcounter_ctx_event() etc.  The cpu
migration counter does this because there isn't necessarily a
pt_regs struct for the task available.  In this case, the
counter will not have interrupt capability - but the migration
counter didn't have interrupt capability before, so this is no
loss.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e57d8c..deb9acf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 
 extern void perf_counter_comm(struct task_struct *tsk);
 
+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
 #define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
@@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len,
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
+static inline void perf_counter_task_migration(struct task_struct *task,
+					       int cpu)			{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d2653f..cd94cf3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (counter->hw_event.config != event_config)
 		return 0;
 
-	if (counter->hw_event.exclude_user && user_mode(regs))
-		return 0;
+	if (regs) {
+		if (counter->hw_event.exclude_user && user_mode(regs))
+			return 0;
 
-	if (counter->hw_event.exclude_kernel && !user_mode(regs))
-		return 0;
+		if (counter->hw_event.exclude_kernel && !user_mode(regs))
+			return 0;
+	}
 
 	return 1;
 }
@@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.irq_period && !neg)
+	if (counter->hw.irq_period && !neg && regs)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
@@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = {
 /*
  * Software counter: cpu migrations
  */
-
-static inline u64 get_cpu_migrations(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->se.nr_migrations;
-	return cpu_nr_migrations(smp_processor_id());
-}
-
-static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+void perf_counter_task_migration(struct task_struct *task, int cpu)
 {
-	cpu_migrations_perf_counter_update(counter);
-}
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx;
 
-static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     get_cpu_migrations(counter));
-	return 0;
-}
+	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+				 PERF_COUNT_CPU_MIGRATIONS,
+				 1, 1, NULL, 0);
 
-static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-{
-	cpu_migrations_perf_counter_update(counter);
+	ctx = perf_pin_task_context(task);
+	if (ctx) {
+		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+					 PERF_COUNT_CPU_MIGRATIONS,
+					 1, 1, NULL, 0);
+		perf_unpin_context(ctx);
+	}
 }
 
-static const struct pmu perf_ops_cpu_migrations = {
-	.enable		= cpu_migrations_perf_counter_enable,
-	.disable	= cpu_migrations_perf_counter_disable,
-	.read		= cpu_migrations_perf_counter_read,
-};
-
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
@@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		pmu = &perf_ops_generic;
-		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
-		if (!counter->hw_event.exclude_kernel)
-			pmu = &perf_ops_cpu_migrations;
+		pmu = &perf_ops_generic;
 		break;
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3226cc1..8d43347 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
+		perf_counter_task_migration(p, new_cpu);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
-- 
cgit v0.10.2


From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:53:16 +1000
Subject: perf_counter: Remove unused prev_state field

This removes the prev_state field of struct perf_counter since
it is now unused.  It was only used by the cpu migration
counter, which doesn't use it any more.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index deb9acf..d970fbc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -427,7 +427,6 @@ struct perf_counter {
 	const struct pmu		*pmu;
 
 	enum perf_counter_active_state	state;
-	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cd94cf3..fbed4d2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter,
 	if (ret)
 		return ret < 0 ? ret : 0;
 
-	group_counter->prev_state = group_counter->state;
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
 
@@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter->prev_state = counter->state;
 		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 			partial_group = counter;
 			goto group_error;
@@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 			       struct perf_counter_context *ctx)
 {
 	list_add_counter(counter, ctx);
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
 	counter->tstamp_enabled = ctx->time;
 	counter->tstamp_running = ctx->time;
 	counter->tstamp_stopped = ctx->time;
@@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
cgit v0.10.2


From fe67e6f2d6df371b58ba721954d45a196df5e8b8 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 2 Jun 2009 17:00:45 +0900
Subject: TOMOYO: Remove unused mutex.

I forgot to remove on TOMOYO's 15th posting.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 2d67487..ee43631 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -296,7 +296,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	struct tomoyo_domain_keeper_entry *ptr;
 	const struct tomoyo_path_info *saved_domainname;
 	const struct tomoyo_path_info *saved_program = NULL;
-	static DEFINE_MUTEX(lock);
 	int error = -ENOMEM;
 	bool is_last_name = false;
 
-- 
cgit v0.10.2


From 4a4b2d7684c66dbd8ed04eb284bc94a78e061d29 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cca.cpqcorp.net>
Date: Tue, 2 Jun 2009 14:47:50 +0200
Subject: cciss: factor out core of sendcmd() for a more sane interface

Factor out the core of sendcmd() to provide a simpler interface which
exposes all the error information to the caller and make the original
sendcmd use this new function.  Rationale: The SCSI error handling
routines need to send commands with interrupts turned off, but they also
need access to the full error information.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 94474f5..8d0f893 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2373,41 +2373,21 @@ static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
 	return 0;
 }
 
-/*
- * Send a command to the controller, and wait for it to complete.
- * Only used at init time.
+/* Send command c to controller h and poll for it to complete.
+ * Turns interrupts off on the board.  Used at driver init time
+ * and during SCSI error recovery.
  */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use_unit_num,	/* 0: address the controller,
-												   1: address logical volume log_unit,
-												   2: periph device address is scsi3addr */
-		   unsigned int log_unit,
-		   __u8 page_code, unsigned char *scsi3addr, int cmd_type)
+static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
 {
-	CommandList_struct *c;
 	int i;
 	unsigned long complete;
-	ctlr_info_t *info_p = hba[ctlr];
+	int status = IO_ERROR;
 	u64bit buff_dma_handle;
-	int status, done = 0;
 
-	if ((c = cmd_alloc(info_p, 1)) == NULL) {
-		printk(KERN_WARNING "cciss: unable to get memory");
-		return IO_ERROR;
-	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-			  log_unit, page_code, scsi3addr, cmd_type);
-	if (status != IO_OK) {
-		cmd_free(info_p, c, 1);
-		return status;
-	}
-      resend_cmd1:
-	/*
-	 * Disable interrupt
-	 */
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: turning intr off\n");
-#endif				/* CCISS_DEBUG */
-	info_p->access.set_intr_mask(info_p, CCISS_INTR_OFF);
+resend_cmd1:
+
+	/* Disable interrupt on the board. */
+	h->access.set_intr_mask(h, CCISS_INTR_OFF);
 
 	/* Make sure there is room in the command FIFO */
 	/* Actually it should be completely empty at this time */
@@ -2415,21 +2395,15 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use
 	/* tape side of the driver. */
 	for (i = 200000; i > 0; i--) {
 		/* if fifo isn't full go */
-		if (!(info_p->access.fifo_full(info_p))) {
-
+		if (!(h->access.fifo_full(h)))
 			break;
-		}
 		udelay(10);
 		printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
-		       " waiting!\n", ctlr);
+		       " waiting!\n", h->ctlr);
 	}
-	/*
-	 * Send the cmd
-	 */
-	info_p->access.submit_command(info_p, c);
-	done = 0;
+	h->access.submit_command(h, c); /* Send the cmd */
 	do {
-		complete = pollcomplete(ctlr);
+		complete = pollcomplete(h->ctlr);
 
 #ifdef CCISS_DEBUG
 		printk(KERN_DEBUG "cciss: command completed\n");
@@ -2438,97 +2412,116 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use
 		if (complete == 1) {
 			printk(KERN_WARNING
 			       "cciss cciss%d: SendCmd Timeout out, "
-			       "No command list address returned!\n", ctlr);
+			       "No command list address returned!\n", h->ctlr);
 			status = IO_ERROR;
-			done = 1;
 			break;
 		}
 
-		/* This will need to change for direct lookup completions */
-		if ((complete & CISS_ERROR_BIT)
-		    && (complete & ~CISS_ERROR_BIT) == c->busaddr) {
-			/* if data overrun or underun on Report command
-			   ignore it
-			 */
-			if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
-			     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
-			     (c->Request.CDB[0] == CISS_INQUIRY)) &&
-			    ((c->err_info->CommandStatus ==
-			      CMD_DATA_OVERRUN) ||
-			     (c->err_info->CommandStatus == CMD_DATA_UNDERRUN)
-			    )) {
-				complete = c->busaddr;
-			} else {
-				if (c->err_info->CommandStatus ==
-				    CMD_UNSOLICITED_ABORT) {
-					printk(KERN_WARNING "cciss%d: "
-					       "unsolicited abort %p\n",
-					       ctlr, c);
-					if (c->retry_count < MAX_CMD_RETRIES) {
-						printk(KERN_WARNING
-						       "cciss%d: retrying %p\n",
-						       ctlr, c);
-						c->retry_count++;
-						/* erase the old error */
-						/* information */
-						memset(c->err_info, 0,
-						       sizeof
-						       (ErrorInfo_struct));
-						goto resend_cmd1;
-					} else {
-						printk(KERN_WARNING
-						       "cciss%d: retried %p too "
-						       "many times\n", ctlr, c);
-						status = IO_ERROR;
-						goto cleanup1;
-					}
-				} else if (c->err_info->CommandStatus ==
-					   CMD_UNABORTABLE) {
-					printk(KERN_WARNING
-					       "cciss%d: command could not be aborted.\n",
-					       ctlr);
-					status = IO_ERROR;
-					goto cleanup1;
-				}
-				printk(KERN_WARNING "ciss ciss%d: sendcmd"
-				       " Error %x \n", ctlr,
-				       c->err_info->CommandStatus);
-				printk(KERN_WARNING "ciss ciss%d: sendcmd"
-				       " offensive info\n"
-				       "  size %x\n   num %x   value %x\n",
-				       ctlr,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_size,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_num,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_value);
-				status = IO_ERROR;
-				goto cleanup1;
-			}
+		/* If it's not the cmd we're looking for, save it for later */
+		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
+			if (add_sendcmd_reject(c->Request.CDB[0],
+				h->ctlr, complete) != 0)
+				BUG(); /* we are hosed if we get here. */
+			continue;
+		}
+
+		/* It is our command.  If no error, we're done. */
+		if (!(complete & CISS_ERROR_BIT)) {
+			status = IO_OK;
+			break;
+		}
+
+		/* There is an error... */
+
+		/* if data overrun or underun on Report command ignore it */
+		if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
+		     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
+		     (c->Request.CDB[0] == CISS_INQUIRY)) &&
+			((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
+			 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
+			complete = c->busaddr;
+			status = IO_OK;
+			break;
 		}
-		/* This will need changing for direct lookup completions */
-		if (complete != c->busaddr) {
-			if (add_sendcmd_reject(cmd, ctlr, complete) != 0) {
-				BUG();	/* we are pretty much hosed if we get here. */
+
+		if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
+			printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
+				h->ctlr, c);
+			if (c->retry_count < MAX_CMD_RETRIES) {
+				printk(KERN_WARNING "cciss%d: retrying %p\n",
+				   h->ctlr, c);
+				c->retry_count++;
+				/* erase the old error information */
+				memset(c->err_info, 0, sizeof(c->err_info));
+				goto resend_cmd1;
 			}
-			continue;
-		} else
-			done = 1;
-	} while (!done);
+			printk(KERN_WARNING "cciss%d: retried %p too many "
+				"times\n", h->ctlr, c);
+			status = IO_ERROR;
+			goto cleanup1;
+		}
+
+		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
+			printk(KERN_WARNING "cciss%d: command could not be "
+				"aborted.\n", h->ctlr);
+			status = IO_ERROR;
+			goto cleanup1;
+		}
+
+		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
+		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
+			c->Request.CDB[0], c->err_info->CommandStatus);
+		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
+			printk(KERN_WARNING "Target status = 0x%02x\n",
+			c->err_info->ScsiStatus);
+			if (c->err_info->ScsiStatus == 2) /* chk cond */
+				printk(KERN_WARNING "Sense key = 0x%02x\n",
+					0xf & c->err_info->SenseInfo[2]);
+		}
+
+		status = IO_ERROR;
+		goto cleanup1;
+
+	} while (1);
 
-      cleanup1:
+cleanup1:
 	/* unlock the data buffer from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-	pci_unmap_single(info_p->pdev, (dma_addr_t) buff_dma_handle.val,
+	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
 #ifdef CONFIG_CISS_SCSI_TAPE
 	/* if we saved some commands for later, process them now. */
-	if (info_p->scsi_rejects.ncompletions > 0)
-		do_cciss_intr(0, info_p);
+	if (h->scsi_rejects.ncompletions > 0)
+		do_cciss_intr(0, h);
 #endif
-	cmd_free(info_p, c, 1);
+	return status;
+}
+
+/*
+ * Send a command to the controller, and wait for it to complete.
+ * Used at init time, and during SCSI error recovery.
+ */
+static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
+	unsigned int use_unit_num,/* 0: address the controller,
+				     1: address logical volume log_unit,
+				     2: periph device address is scsi3addr */
+	unsigned int log_unit,
+	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
+{
+	CommandList_struct *c;
+	int status;
+
+	c = cmd_alloc(hba[ctlr], 1);
+	if (!c) {
+		printk(KERN_WARNING "cciss: unable to get memory");
+		return IO_ERROR;
+	}
+	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
+			  log_unit, page_code, scsi3addr, cmd_type);
+	if (status == IO_OK)
+		status = sendcmd_core(hba[ctlr], c);
+	cmd_free(hba[ctlr], c, 1);
 	return status;
 }
 
-- 
cgit v0.10.2


From 88f627ae394eadd75ada669904269f1a4a77b3bd Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cca.cpqcorp.net>
Date: Tue, 2 Jun 2009 14:48:11 +0200
Subject: cciss: fix SCSI device reset handler

Fix the SCSI reset error handler to send a working, properly addressed
reset message to the target device and add code to wait for the target
device to become ready by polling it with Test Unit Ready.

The existing reset code was broken in that it didn't bother to set the
8-byte LUN address to anything besides zero, so the command was addressed
to the controller, which pretended to the driver that the command
succeeded, while doing nothing.  Ages ago I tested this code, but
unbeknownst to me, my test was flawed, and what I thought was a tape drive
getting reset was actually nothing of the sort.  Unfortunately, there is
still lots of Smartarray firmware that doesn't handle doing target resets
right, and this code won't help in those cases, but it also shouldn't make
things worse in those cases than they already are.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8d0f893..cb43fb3 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1974,6 +1974,13 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[0] = BMIC_WRITE;
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
 			break;
+		case TEST_UNIT_READY:
+			memcpy(c->Header. LUN.LunAddrBytes, scsi3addr, 8);
+			c->Request.CDBLen = 6;
+			c->Request.Type.Attribute = ATTR_SIMPLE;
+			c->Request.Type.Direction = XFER_NONE;
+			c->Request.Timeout = 0;
+			break;
 		default:
 			printk(KERN_WARNING
 			       "cciss%d:  Unknown Command 0x%c\n", ctlr, cmd);
@@ -1992,13 +1999,14 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			memcpy(&c->Request.CDB[4], buff, 8);
 			break;
 		case 1:	/* RESET message */
-			c->Request.CDBLen = 12;
+			memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
+			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
-			c->Request.Type.Direction = XFER_WRITE;
+			c->Request.Type.Direction = XFER_NONE;
 			c->Request.Timeout = 0;
 			memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
 			c->Request.CDB[0] = cmd;	/* reset */
-			c->Request.CDB[1] = 0x04;	/* reset a LUN */
+			c->Request.CDB[1] = 0x03;	/* reset a target */
 			break;
 		case 3:	/* No-Op message */
 			c->Request.CDBLen = 1;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index a3fd87b..8575c48 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -58,6 +58,18 @@ static int sendcmd(
 	unsigned char *scsi3addr,
 	int cmd_type);
 
+static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+	size_t size,
+	unsigned int use_unit_num, /* 0: address the controller,
+				      1: address logical volume log_unit,
+				      2: periph device address is scsi3addr */
+	unsigned int log_unit, __u8 page_code, unsigned char *scsi3addr,
+	int cmd_type);
+
+static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
+
+static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
 
 static int cciss_scsi_proc_info(
 		struct Scsi_Host *sh,
@@ -1575,6 +1587,68 @@ cciss_seq_tape_report(struct seq_file *seq, int ctlr)
 	CPQ_TAPE_UNLOCK(ctlr, flags);
 }
 
+static int wait_for_device_to_become_ready(ctlr_info_t *h,
+	unsigned char lunaddr[])
+{
+	int rc;
+	int count = 0;
+	int waittime = HZ;
+	CommandList_struct *c;
+
+	c = cmd_alloc(h, 1);
+	if (!c) {
+		printk(KERN_WARNING "cciss%d: out of memory in "
+			"wait_for_device_to_become_ready.\n", h->ctlr);
+		return IO_ERROR;
+	}
+
+	/* Send test unit ready until device ready, or give up. */
+	while (count < 20) {
+
+		/* Wait for a bit.  do this first, because if we send
+		 * the TUR right away, the reset will just abort it.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(waittime);
+		count++;
+
+		/* Increase wait time with each try, up to a point. */
+		if (waittime < (HZ * 30))
+			waittime = waittime * 2;
+
+		/* Send the Test Unit Ready */
+		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0, 0, 0,
+			lunaddr, TYPE_CMD);
+		if (rc == 0) {
+			rc = sendcmd_core(h, c);
+			/* sendcmd turned off interrupts, turn 'em back on. */
+			h->access.set_intr_mask(h, CCISS_INTR_ON);
+		}
+
+		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
+			break;
+
+		if (rc == 0 &&
+			c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION &&
+			(c->err_info->SenseInfo[2] == NO_SENSE ||
+			c->err_info->SenseInfo[2] == UNIT_ATTENTION))
+			break;
+
+		printk(KERN_WARNING "cciss%d: Waiting %d secs "
+			"for device to become ready.\n",
+			h->ctlr, waittime / HZ);
+		rc = 1; /* device not ready. */
+	}
+
+	if (rc)
+		printk("cciss%d: giving up on device.\n", h->ctlr);
+	else
+		printk(KERN_WARNING "cciss%d: device is ready.\n", h->ctlr);
+
+	cmd_free(h, c, 1);
+	return rc;
+}
 
 /* Need at least one of these error handlers to keep ../scsi/hosts.c from 
  * complaining.  Doing a host- or bus-reset can't do anything good here. 
@@ -1591,6 +1665,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 {
 	int rc;
 	CommandList_struct *cmd_in_trouble;
+	unsigned char lunaddr[8];
 	ctlr_info_t **c;
 	int ctlr;
 
@@ -1600,19 +1675,17 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	ctlr = (*c)->ctlr;
 	printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
-
 	/* find the command that's giving us trouble */
 	cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
-	if (cmd_in_trouble == NULL) { /* paranoia */
+	if (cmd_in_trouble == NULL) /* paranoia */
 		return FAILED;
-	}
+	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, 
-		(unsigned char *) &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 
+	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, lunaddr,
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
 	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
-	if (rc == 0)
+	if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
 		return SUCCESS;
 	printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
 	return FAILED;
-- 
cgit v0.10.2


From 7fe063268e73681cdca1a6496a25f93d3332f517 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Tue, 2 Jun 2009 14:48:39 +0200
Subject: cciss: add cciss driver sysfs entries

Add sysfs entries to the cciss driver needed for the dm/multipath tools.

A file for vendor, model, rev, and unique_id is added for each logical
drive under directory /sys/bus/pci/devices/<dev>/ccissX/cXdY.  Where X =
the controller (or host) number and Y is the logical drive number.

A link from /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY to
/sys/block/cciss!cXdY/device is also created.  A bus is created in
/sys/bus/cciss.  A link is created from the pci ccissX entry to
/sys/bus/cciss/devices/ccissX.  Please consider this for inclusion.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
new file mode 100644
index 0000000..0a92a7c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -0,0 +1,33 @@
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/model
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 model for logical drive
+		Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 revision for logical
+		drive Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 83 serial number for logical
+		drive Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 vendor for logical drive
+		Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	A symbolic link to /sys/block/cciss!cXdY
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index cb43fb3..e7d0095 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -437,6 +437,194 @@ static void __devinit cciss_procinit(int i)
 }
 #endif				/* CONFIG_PROC_FS */
 
+#define MAX_PRODUCT_NAME_LEN 19
+
+#define to_hba(n) container_of(n, struct ctlr_info, dev)
+#define to_drv(n) container_of(n, drive_info_struct, dev)
+
+static struct device_type cciss_host_type = {
+	.name		= "cciss_host",
+};
+
+static ssize_t dev_show_unique_id(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	__u8 sn[16];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(sn, drv->serial_no, sizeof(sn));
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, 16 * 2 + 2,
+				"%02X%02X%02X%02X%02X%02X%02X%02X"
+				"%02X%02X%02X%02X%02X%02X%02X%02X\n",
+				sn[0], sn[1], sn[2], sn[3],
+				sn[4], sn[5], sn[6], sn[7],
+				sn[8], sn[9], sn[10], sn[11],
+				sn[12], sn[13], sn[14], sn[15]);
+}
+DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL);
+
+static ssize_t dev_show_vendor(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char vendor[VENDOR_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(vendor, drv->vendor, VENDOR_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor);
+}
+DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL);
+
+static ssize_t dev_show_model(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char model[MODEL_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(model, drv->model, MODEL_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model);
+}
+DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL);
+
+static ssize_t dev_show_rev(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char rev[REV_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(rev, drv->rev, REV_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev);
+}
+DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL);
+
+static struct attribute *cciss_dev_attrs[] = {
+	&dev_attr_unique_id.attr,
+	&dev_attr_model.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_rev.attr,
+	NULL
+};
+
+static struct attribute_group cciss_dev_attr_group = {
+	.attrs = cciss_dev_attrs,
+};
+
+static struct attribute_group *cciss_dev_attr_groups[] = {
+	&cciss_dev_attr_group,
+	NULL
+};
+
+static struct device_type cciss_dev_type = {
+	.name		= "cciss_device",
+	.groups		= cciss_dev_attr_groups,
+};
+
+static struct bus_type cciss_bus_type = {
+	.name		= "cciss",
+};
+
+
+/*
+ * Initialize sysfs entry for each controller.  This sets up and registers
+ * the 'cciss#' directory for each individual controller under
+ * /sys/bus/pci/devices/<dev>/.
+ */
+static int cciss_create_hba_sysfs_entry(struct ctlr_info *h)
+{
+	device_initialize(&h->dev);
+	h->dev.type = &cciss_host_type;
+	h->dev.bus = &cciss_bus_type;
+	dev_set_name(&h->dev, "%s", h->devname);
+	h->dev.parent = &h->pdev->dev;
+
+	return device_add(&h->dev);
+}
+
+/*
+ * Remove sysfs entries for an hba.
+ */
+static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h)
+{
+	device_del(&h->dev);
+}
+
+/*
+ * Initialize sysfs for each logical drive.  This sets up and registers
+ * the 'c#d#' directory for each individual logical drive under
+ * /sys/bus/pci/devices/<dev/ccis#/. We also create a link from
+ * /sys/block/cciss!c#d# to this entry.
+ */
+static int cciss_create_ld_sysfs_entry(struct ctlr_info *h,
+				       drive_info_struct *drv,
+				       int drv_index)
+{
+	device_initialize(&drv->dev);
+	drv->dev.type = &cciss_dev_type;
+	drv->dev.bus = &cciss_bus_type;
+	dev_set_name(&drv->dev, "c%dd%d", h->ctlr, drv_index);
+	drv->dev.parent = &h->dev;
+	return device_add(&drv->dev);
+}
+
+/*
+ * Remove sysfs entries for a logical drive.
+ */
+static void cciss_destroy_ld_sysfs_entry(drive_info_struct *drv)
+{
+	device_del(&drv->dev);
+}
+
 /*
  * For operations that cannot sleep, a command block is allocated at init,
  * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
@@ -1332,6 +1520,45 @@ static void cciss_softirq_done(struct request *rq)
 	spin_unlock_irqrestore(&h->lock, flags);
 }
 
+/* This function gets the SCSI vendor, model, and revision of a logical drive
+ * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
+ * they cannot be read.
+ */
+static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+				   char *vendor, char *model, char *rev)
+{
+	int rc;
+	InquiryData_struct *inq_buf;
+
+	*vendor = '\0';
+	*model = '\0';
+	*rev = '\0';
+
+	inq_buf = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+	if (!inq_buf)
+		return;
+
+	if (withirq)
+		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
+				     sizeof(InquiryData_struct), 1, logvol,
+				     0, TYPE_CMD);
+	else
+		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
+			     sizeof(InquiryData_struct), 1, logvol, 0, NULL,
+			     TYPE_CMD);
+	if (rc == IO_OK) {
+		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
+		vendor[VENDOR_LEN] = '\0';
+		memcpy(model, &inq_buf->data_byte[16], MODEL_LEN);
+		model[MODEL_LEN] = '\0';
+		memcpy(rev, &inq_buf->data_byte[32], REV_LEN);
+		rev[REV_LEN] = '\0';
+	}
+
+	kfree(inq_buf);
+	return;
+}
+
 /* This function gets the serial number of a logical drive via
  * inquiry page 0x83.  Serial no. is 16 bytes.  If the serial
  * number cannot be had, for whatever reason, 16 bytes of 0xff
@@ -1372,7 +1599,7 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	disk->first_minor = drv_index << NWD_SHIFT;
 	disk->fops = &cciss_fops;
 	disk->private_data = &h->drv[drv_index];
-	disk->driverfs_dev = &h->pdev->dev;
+	disk->driverfs_dev = &h->drv[drv_index].dev;
 
 	/* Set up queue information */
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@@ -1463,6 +1690,8 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	drvinfo->block_size = block_size;
 	drvinfo->nr_blocks = total_size + 1;
 
+	cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+				drvinfo->model, drvinfo->rev);
 	cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
 			sizeof(drvinfo->serial_no));
 
@@ -1512,6 +1741,9 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	h->drv[drv_index].cylinders = drvinfo->cylinders;
 	h->drv[drv_index].raid_level = drvinfo->raid_level;
 	memcpy(h->drv[drv_index].serial_no, drvinfo->serial_no, 16);
+	memcpy(h->drv[drv_index].vendor, drvinfo->vendor, VENDOR_LEN + 1);
+	memcpy(h->drv[drv_index].model, drvinfo->model, MODEL_LEN + 1);
+	memcpy(h->drv[drv_index].rev, drvinfo->rev, REV_LEN + 1);
 
 	++h->num_luns;
 	disk = h->gendisk[drv_index];
@@ -1586,6 +1818,8 @@ static int cciss_add_gendisk(ctlr_info_t *h, __u32 lunid, int controller_node)
 		}
 	}
 	h->drv[drv_index].LunID = lunid;
+	if (cciss_create_ld_sysfs_entry(h, &h->drv[drv_index], drv_index))
+		goto err_free_disk;
 
 	/* Don't need to mark this busy because nobody */
 	/* else knows about this disk yet to contend */
@@ -1593,6 +1827,11 @@ static int cciss_add_gendisk(ctlr_info_t *h, __u32 lunid, int controller_node)
 	h->drv[drv_index].busy_configuring = 0;
 	wmb();
 	return drv_index;
+
+err_free_disk:
+	put_disk(h->gendisk[drv_index]);
+	h->gendisk[drv_index] = NULL;
+	return -1;
 }
 
 /* This is for the special case of a controller which
@@ -1713,6 +1952,7 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 			h->drv[i].busy_configuring = 1;
 			spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 			return_code = deregister_disk(h, i, 1);
+			cciss_destroy_ld_sysfs_entry(&h->drv[i]);
 			h->drv[i].busy_configuring = 0;
 		}
 	}
@@ -3719,12 +3959,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	INIT_HLIST_HEAD(&hba[i]->reqQ);
 
 	if (cciss_pci_init(hba[i], pdev) != 0)
-		goto clean1;
+		goto clean0;
 
 	sprintf(hba[i]->devname, "cciss%d", i);
 	hba[i]->ctlr = i;
 	hba[i]->pdev = pdev;
 
+	if (cciss_create_hba_sysfs_entry(hba[i]))
+		goto clean0;
+
 	/* configure PCI DMA stuff */
 	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
 		dac = 1;
@@ -3868,6 +4111,8 @@ clean4:
 clean2:
 	unregister_blkdev(hba[i]->major, hba[i]->devname);
 clean1:
+	cciss_destroy_hba_sysfs_entry(hba[i]);
+clean0:
 	hba[i]->busy_initializing = 0;
 	/* cleanup any queues that may have been initialized */
 	for (j=0; j <= hba[i]->highest_lun; j++){
@@ -3978,6 +4223,7 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	 */
 	pci_release_regions(pdev);
 	pci_set_drvdata(pdev, NULL);
+	cciss_destroy_hba_sysfs_entry(hba[i]);
 	free_hba(i);
 }
 
@@ -3995,6 +4241,8 @@ static struct pci_driver cciss_pci_driver = {
  */
 static int __init cciss_init(void)
 {
+	int err;
+
 	/*
 	 * The hardware requires that commands are aligned on a 64-bit
 	 * boundary. Given that we use pci_alloc_consistent() to allocate an
@@ -4004,8 +4252,20 @@ static int __init cciss_init(void)
 
 	printk(KERN_INFO DRIVER_NAME "\n");
 
+	err = bus_register(&cciss_bus_type);
+	if (err)
+		return err;
+
 	/* Register for our PCI devices */
-	return pci_register_driver(&cciss_pci_driver);
+	err = pci_register_driver(&cciss_pci_driver);
+	if (err)
+		goto err_bus_register;
+
+	return 0;
+
+err_bus_register:
+	bus_unregister(&cciss_bus_type);
+	return err;
 }
 
 static void __exit cciss_cleanup(void)
@@ -4022,6 +4282,7 @@ static void __exit cciss_cleanup(void)
 		}
 	}
 	remove_proc_entry("driver/cciss", NULL);
+	bus_unregister(&cciss_bus_type);
 }
 
 static void fail_all_cmds(unsigned long ctlr)
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 703e080..dd1926d 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -12,6 +12,10 @@
 #define IO_OK		0
 #define IO_ERROR	1
 
+#define VENDOR_LEN	8
+#define MODEL_LEN	16
+#define REV_LEN		4
+
 struct ctlr_info;
 typedef struct ctlr_info ctlr_info_t;
 
@@ -34,13 +38,18 @@ typedef struct _drive_info_struct
 	int 	cylinders;
 	int	raid_level; /* set to -1 to indicate that
 			     * the drive is not in use/configured
-			    */
-	int	busy_configuring; /*This is set when the drive is being removed
-				   *to prevent it from being opened or it's queue
-				   *from being started.
-				  */
-	__u8 serial_no[16]; /* from inquiry page 0x83, */
-			    /* not necc. null terminated. */
+			     */
+	int	busy_configuring; /* This is set when a drive is being removed
+				   * to prevent it from being opened or it's
+				   * queue from being started.
+				   */
+	struct	device dev;
+	__u8 serial_no[16]; /* from inquiry page 0x83,
+			     * not necc. null terminated.
+			     */
+	char vendor[VENDOR_LEN + 1]; /* SCSI vendor string */
+	char model[MODEL_LEN + 1];   /* SCSI model string */
+	char rev[REV_LEN + 1];       /* SCSI revision string */
 } drive_info_struct;
 
 #ifdef CONFIG_CISS_SCSI_TAPE
@@ -123,6 +132,7 @@ struct ctlr_info
 	unsigned char alive;
 	struct completion *rescan_wait;
 	struct task_struct *cciss_scan_thread;
+	struct device dev;
 };
 
 /*  Defining the diffent access_menthods */
-- 
cgit v0.10.2


From 77b0308a0778861111184e097533000f7a458c37 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 2 Jun 2009 14:51:30 +0200
Subject: cciss: use schedule_timeout_interruptible()

Use schedule_timeout_interruptible() instead of open-coding the set and
schedule parts.

Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Cc: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 8575c48..2edfc9b 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1608,8 +1608,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Wait for a bit.  do this first, because if we send
 		 * the TUR right away, the reset will just abort it.
 		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(waittime);
+		schedule_timeout_interruptible(waittime);
 		count++;
 
 		/* Increase wait time with each try, up to a point. */
-- 
cgit v0.10.2


From dbdc9dd342f0a7e32f40f0d4ade662bdfe057484 Mon Sep 17 00:00:00 2001
From: vibi sreenivasan <vibi_sreenivasan@cms.com>
Date: Tue, 2 Jun 2009 14:52:32 +0200
Subject: Removed reference to non-existing file
 Documentation/PCI/PCI-DMA-mapping.txt

File Documentation/PCI/PCI-DMA-mapping.txt does not exist.
 Documentation/DMA-mapping.txt contains DMA Mapping details

Signed-off-by: vibi sreenivasan <vibi_sreenivasan@cms.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6fab97e..8d2158a 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
 do not have a corresponding kernel virtual address space mapping) and
 low-memory pages.
 
-Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion
+Note: Please refer to Documentation/DMA-mapping.txt for a discussion
 on PCI high mem DMA aspects and mapping of scatter gather lists, and support
 for 64 bit PCI.
 
-- 
cgit v0.10.2


From 2e507d849f1834d3fe9aae71b7aabf4c2a3827b8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 18:24:20 +0200
Subject: dma-debug: add variables and checks for driver filter

This patch adds the state variables for the driver filter and a function
to check if the filter is enabled and matches to the current device. The
check is built into the err_printk function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d..c01f647 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -99,6 +99,15 @@ static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
 
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN	64
+
+static char                  current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver                    __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
+
 static const char *type2name[4] = { "single", "page",
 				    "scather-gather", "coherent" };
 
@@ -128,9 +137,47 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 #endif
 }
 
+static bool driver_filter(struct device *dev)
+{
+	/* driver filter off */
+	if (likely(!current_driver_name[0]))
+		return true;
+
+	/* driver filter on and initialized */
+	if (current_driver && dev->driver == current_driver)
+		return true;
+
+	/* driver filter on but not yet initialized */
+	if (!current_driver && current_driver_name[0]) {
+		struct device_driver *drv = get_driver(dev->driver);
+		unsigned long flags;
+		bool ret = false;
+
+		if (!drv)
+			return false;
+
+		/* lock to protect against change of current_driver_name */
+		read_lock_irqsave(&driver_name_lock, flags);
+
+		if (drv->name &&
+		    strncmp(current_driver_name, drv->name, 63) == 0) {
+			current_driver = drv;
+			ret = true;
+		}
+
+		read_unlock_irqrestore(&driver_name_lock, flags);
+		put_driver(drv);
+
+		return ret;
+	}
+
+	return false;
+}
+
 #define err_printk(dev, entry, format, arg...) do {		\
 		error_count += 1;				\
-		if (show_all_errors || show_num_errors > 0) {	\
+		if (driver_filter(dev) &&			\
+		    (show_all_errors || show_num_errors > 0)) {	\
 			WARN(1, "%s %s: " format,		\
 			     dev_driver_string(dev),		\
 			     dev_name(dev) , ## arg);		\
-- 
cgit v0.10.2


From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 14:13:15 +0200
Subject: perf_counter: Use PID namespaces properly

Stop using task_struct::pid and start using PID namespaces.

PIDs will be reported in the PID namespace of the monitoring
task at the moment of counter creation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d970fbc..9ec20fc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -317,6 +317,7 @@ enum perf_event_type {
 #include <linux/spinlock.h>
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
+#include <linux/pid_namespace.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -500,6 +501,8 @@ struct perf_counter {
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
+
+	struct pid_namespace		*ns;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fbed4d2..caa012c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1432,6 +1432,8 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
+	if (counter->ns)
+		put_pid_ns(counter->ns);
 	kfree(counter);
 }
 
@@ -2267,6 +2269,28 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_pid_nr_ns(p, counter->ns);
+}
+
 static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
@@ -2303,8 +2327,8 @@ static void perf_counter_output(struct perf_counter *counter,
 
 	if (record_type & PERF_RECORD_TID) {
 		/* namespace issues */
-		tid_entry.pid = current->group_leader->pid;
-		tid_entry.tid = current->pid;
+		tid_entry.pid = perf_counter_pid(counter, current);
+		tid_entry.tid = perf_counter_tid(counter, current);
 
 		header.type |= PERF_RECORD_TID;
 		header.size += sizeof(tid_entry);
@@ -2432,6 +2456,9 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 	if (ret)
 		return;
 
+	comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+	comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
 	perf_output_put(&handle, comm_event->event);
 	perf_output_copy(&handle, comm_event->comm,
 				   comm_event->comm_size);
@@ -2504,8 +2531,6 @@ void perf_counter_comm(struct task_struct *task)
 		.task	= task,
 		.event  = {
 			.header = { .type = PERF_EVENT_COMM, },
-			.pid	= task->group_leader->pid,
-			.tid	= task->pid,
 		},
 	};
 
@@ -2542,6 +2567,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 	if (ret)
 		return;
 
+	mmap_event->event.pid = perf_counter_pid(counter, current);
+	mmap_event->event.tid = perf_counter_tid(counter, current);
+
 	perf_output_put(&handle, mmap_event->event);
 	perf_output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
@@ -2641,8 +2669,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
-			.pid	= current->group_leader->pid,
-			.tid	= current->pid,
 			.start  = addr,
 			.len    = len,
 			.pgoff  = pgoff,
@@ -2664,8 +2690,6 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MUNMAP, },
-			.pid	= current->group_leader->pid,
-			.tid	= current->pid,
 			.start  = addr,
 			.len    = len,
 			.pgoff  = pgoff,
@@ -3445,6 +3469,8 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
-- 
cgit v0.10.2


From f70e87d7a6d9c5a23d5f43ed0a0c224c157ef597 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 14:13:24 +0200
Subject: perf_counter: tools: Expand the COMM,MMAP event synthesizer

Include code to pre-construct mappings based on /proc,
on system wide recording.

Fix the existing code to properly fill out ->pid and ->tid.

The PID should be the Thread Group ID (PIDTYPE_PID of task->group_leader)
The TID should be the Thread ID (PIDTYPE_PID of task)

Furthermore, change the default sorting of report to comm,dso for a
better quick overview.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 9c151de..810fc27 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -162,7 +162,7 @@ struct comm_event {
 	char				comm[16];
 };
 
-static pid_t pid_synthesize_comm_event(pid_t pid)
+static void pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
@@ -170,6 +170,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	int fd, ret;
 	size_t size;
 	char *field, *sep;
+	DIR *tasks;
+	struct dirent dirent, *next;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -194,29 +196,50 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 		goto out_failure;
 	size = sep - field;
 	memcpy(comm_ev.comm, field, size++);
-	field = strchr(sep + 4, ' ');
-	if (field == NULL)
-		goto out_failure;
-	comm_ev.pid = atoi(++field);
+
+	comm_ev.pid = pid;
 	comm_ev.header.type = PERF_EVENT_COMM;
-	comm_ev.tid = pid;
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
-	ret = write(output, &comm_ev, comm_ev.header.size);
-	if (ret < 0) {
-		perror("failed to write");
-		exit(-1);
+	if (!full) {
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		return;
+	}
+
+	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+	tasks = opendir(filename);
+	while (!readdir_r(tasks, &dirent, &next) && next) {
+		char *end;
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end)
+			continue;
+
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
 	}
-	return comm_ev.pid;
+	closedir(tasks);
+	return;
+
 out_failure:
 	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
 		filename);
 	exit(EXIT_FAILURE);
-	return -1;
 }
 
-static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
+static void pid_synthesize_mmap_events(pid_t pid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -261,7 +284,7 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
-			mmap_ev.pid = pgid;
+			mmap_ev.pid = pid;
 			mmap_ev.tid = pid;
 
 			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
@@ -274,6 +297,28 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 	fclose(fp);
 }
 
+static void synthesize_events(void)
+{
+	DIR *proc;
+	struct dirent dirent, *next;
+
+	proc = opendir("/proc");
+
+	while (!readdir_r(proc, &dirent, &next) && next) {
+		char *end;
+		pid_t pid;
+
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end) /* only interested in proper numerical dirents */
+			continue;
+
+		pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_events(pid);
+	}
+
+	closedir(proc);
+}
+
 static void open_counters(int cpu, pid_t pid)
 {
 	struct perf_counter_hw_event hw_event;
@@ -281,8 +326,8 @@ static void open_counters(int cpu, pid_t pid)
 	int track = 1;
 
 	if (pid > 0) {
-		pid_t pgid = pid_synthesize_comm_event(pid);
-		pid_synthesize_mmap_events(pid, pgid);
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_events(pid);
 	}
 
 	group_fd = -1;
@@ -348,7 +393,7 @@ static int __cmd_record(int argc, const char **argv)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_EXCL|O_RDWR, S_IRWXU);
+	output = open(output_name, O_CREAT|O_EXCL|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -385,9 +430,8 @@ static int __cmd_record(int argc, const char **argv)
 		}
 	}
 
-	/*
-	 * TODO: store the current /proc/$/maps information somewhere
-	 */
+	if (system_wide)
+		synthesize_events();
 
 	while (!done) {
 		int hits = events;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 20a4e51..0558c1e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -18,7 +18,7 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
-static char		*sort_order = "pid,symbol";
+static char		*sort_order = "comm,dso";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
-- 
cgit v0.10.2


From 97124d5e2df5b9eaa5bb684bb1e8ebc7e29d0f5d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:52:24 +0200
Subject: perf_counter: tools: Better handle existing data files

Provide an argument (-f) to overwrite existing data files.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 810fc27..bace7a8 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -7,6 +7,7 @@
 #include "util/parse-events.h"
 #include "util/string.h"
 
+#include <unistd.h>
 #include <sched.h>
 
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
@@ -26,7 +27,7 @@ static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
-static int			nmi				= 1;
+static int			force				= 0;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -337,7 +338,6 @@ static void open_counters(int cpu, pid_t pid)
 		hw_event.config		= event_id[counter];
 		hw_event.irq_period	= event_count[counter];
 		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-		hw_event.nmi		= nmi;
 		hw_event.mmap		= track;
 		hw_event.comm		= track;
 		hw_event.inherit	= (cpu < 0) && inherit;
@@ -387,13 +387,20 @@ static int __cmd_record(int argc, const char **argv)
 	int i, counter;
 	pid_t pid;
 	int ret;
+	struct stat st;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_EXCL|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
+	if (!stat(output_name, &st) && !force) {
+		fprintf(stderr, "Error, output file: %s exists, use -f to overwrite.\n",
+				output_name);
+		exit(-1);
+	}
+
+	output = open(output_name, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -473,6 +480,8 @@ static const struct option options[] = {
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('f', "force", &force,
+			"overwrite existing data file"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 8a6fc708b9bb48a79a385bdc2be0959ee2ab788d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:23:13 +0200
Subject: dma-debug: add debugfs file for driver filter

This patch adds the dma-api/driver_filter file to debugfs. The root user
can write a driver name into this file to see only dma-api errors for
that particular driver in the kernel log. Writing an empty string to
that file disables the driver filter.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index c01f647..c6330b1 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -23,9 +23,11 @@
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
+#include <linux/uaccess.h>
 #include <linux/device.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 
@@ -98,6 +100,7 @@ static struct dentry *show_all_errors_dent  __read_mostly;
 static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *filter_dent           __read_mostly;
 
 /* per-driver filter related state */
 
@@ -160,7 +163,8 @@ static bool driver_filter(struct device *dev)
 		read_lock_irqsave(&driver_name_lock, flags);
 
 		if (drv->name &&
-		    strncmp(current_driver_name, drv->name, 63) == 0) {
+		    strncmp(current_driver_name, drv->name,
+			    NAME_MAX_LEN-1) == 0) {
 			current_driver = drv;
 			ret = true;
 		}
@@ -454,6 +458,97 @@ out_err:
 	return -ENOMEM;
 }
 
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	unsigned long flags;
+	char buf[NAME_MAX_LEN + 1];
+	int len;
+
+	if (!current_driver_name[0])
+		return 0;
+
+	/*
+	 * We can't copy to userspace directly because current_driver_name can
+	 * only be read under the driver_name_lock with irqs disabled. So
+	 * create a temporary copy first.
+	 */
+	read_lock_irqsave(&driver_name_lock, flags);
+	len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+	read_unlock_irqrestore(&driver_name_lock, flags);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	unsigned long flags;
+	char buf[NAME_MAX_LEN];
+	size_t len = NAME_MAX_LEN - 1;
+	int i;
+
+	/*
+	 * We can't copy from userspace directly. Access to
+	 * current_driver_name is protected with a write_lock with irqs
+	 * disabled. Since copy_from_user can fault and may sleep we
+	 * need to copy to temporary buffer first
+	 */
+	len = min(count, len);
+	if (copy_from_user(buf, userbuf, len))
+		return -EFAULT;
+
+	buf[len] = 0;
+
+	write_lock_irqsave(&driver_name_lock, flags);
+
+	/* Now handle the string we got from userspace very carefully.
+	 * The rules are:
+	 *         - only use the first token we got
+	 *         - token delimiter is everything looking like a space
+	 *           character (' ', '\n', '\t' ...)
+	 *
+	 */
+	if (!isalnum(buf[0])) {
+		/*
+		   If the first character userspace gave us is not
+		 * alphanumerical then assume the filter should be
+		 * switched off.
+		 */
+		if (current_driver_name[0])
+			printk(KERN_INFO "DMA-API: switching off dma-debug "
+					 "driver filter\n");
+		current_driver_name[0] = 0;
+		current_driver = NULL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Now parse out the first token and use it as the name for the
+	 * driver to filter for.
+	 */
+	for (i = 0; i < NAME_MAX_LEN; ++i) {
+		current_driver_name[i] = buf[i];
+		if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+			break;
+	}
+	current_driver_name[i] = 0;
+	current_driver = NULL;
+
+	printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n",
+	       current_driver_name);
+
+out_unlock:
+	write_unlock_irqrestore(&driver_name_lock, flags);
+
+	return count;
+}
+
+const struct file_operations filter_fops = {
+	.read  = filter_read,
+	.write = filter_write,
+};
+
 static int dma_debug_fs_init(void)
 {
 	dma_debug_dent = debugfs_create_dir("dma-api", NULL);
@@ -497,6 +592,11 @@ static int dma_debug_fs_init(void)
 	if (!min_free_entries_dent)
 		goto out_err;
 
+	filter_dent = debugfs_create_file("driver_filter", 0644,
+					  dma_debug_dent, NULL, &filter_fops);
+	if (!filter_dent)
+		goto out_err;
+
 	return 0;
 
 out_err:
-- 
cgit v0.10.2


From 1745de5e5639457513fe43440f2800e23c3cbc7d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:49:51 +0200
Subject: dma-debug: add dma_debug_driver kernel command line

This patch add the dma_debug_driver= boot parameter to enable the driver
filter for early boot.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbf..b3f1314 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -646,6 +646,13 @@ and is between 256 and 4096 characters. It is defined in the file
 			DMA-API debugging code disables itself because the
 			architectural default is too low.
 
+	dma_debug_driver=<driver_name>
+			With this option the DMA-API debugging driver
+			filter feature can be enabled at boot time. Just
+			pass the driver to filter for as the parameter.
+			The filter can be disabled or changed to another
+			driver later using sysfs.
+
 	dscc4.setup=	[NET]
 
 	dtc3181e=	[HW,SCSI]
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index c6330b1..d0618aa 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -1109,3 +1109,21 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
 
+static int __init dma_debug_driver_setup(char *str)
+{
+	int i;
+
+	for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+		current_driver_name[i] = *str;
+		if (*str == 0)
+			break;
+	}
+
+	if (current_driver_name[0])
+		printk(KERN_INFO "DMA-API: enable driver filter for "
+				 "driver [%s]\n", current_driver_name);
+
+
+	return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
-- 
cgit v0.10.2


From 016ea6874a6d58df85b54f56997d26df13c307b2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:57:23 +0200
Subject: dma-debug: add documentation for the driver filter

This patch adds the driver filter feature to the dma-debug
documentation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d9aa43d..25fb8bc 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
 				The current number of free dma_debug_entries
 				in the allocator.
 
+	dma-api/driver-filter
+				You can write a name of a driver into this file
+				to limit the debug output to requests from that
+				particular driver. Write an empty string to
+				that file to disable the filter and see
+				all errors again.
+
 If you have this code compiled into your kernel it will be enabled by default.
 If you want to boot without the bookkeeping anyway you can provide
 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
 Notice that you can not enable it again at runtime. You have to reboot to do
 so.
 
+If you want to see debug messages only for a special device driver you can
+specify the dma_debug_driver=<drivername> parameter. This will enable the
+driver filter at boot time. The debug code will only print errors for that
+driver afterwards. This filter can be disabled or changed later using debugfs.
+
 When the code disables itself at runtime this is most likely because it ran
 out of dma_debug_entries. These entries are preallocated at boot. The number
 of preallocated entries is defined per architecture. If it is too low for you
-- 
cgit v0.10.2


From 4593bba8679b925a056f84edac061676e7eda71c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 15:34:25 +0200
Subject: perf report: Clean up the default output

 - extra space between columns
 - left-aligned the symbol column
 - moved the no-symbols printout to -v

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 0558c1e..19c1e05 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -84,24 +84,25 @@ static struct dso *dsos__findnew(const char *name)
 	struct dso *dso = dsos__find(name);
 	int nr;
 
-	if (dso == NULL) {
-		dso = dso__new(name, 0);
-		if (!dso)
-			goto out_delete_dso;
-
-		nr = dso__load(dso, NULL);
-		if (nr < 0) {
-			fprintf(stderr, "Failed to open: %s\n", name);
-			goto out_delete_dso;
-		}
-		if (!nr) {
-			fprintf(stderr,
-		"Failed to find debug symbols for: %s, maybe install a debug package?\n",
-					name);
-		}
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
 
-		dsos__add(dso);
+	nr = dso__load(dso, NULL);
+	if (nr < 0) {
+		fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
 	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
 
 	return dso;
 
@@ -302,11 +303,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, "  %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "         Command: Pid ",
+	.header = "          Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -332,11 +333,11 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, "  %16s", self->thread->comm ?: "<unknown>");
 }
 
 static struct sort_entry sort_comm = {
-	.header = "         Command",
+	.header = "          Command",
 	.cmp	= sort__comm_cmp,
 	.print	= sort__comm_print,
 };
@@ -362,11 +363,11 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %64s", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, "  %s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
-	.header = "                                                    Shared Object",
+	.header = " Shared Object",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -391,9 +392,9 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
+		ret += fprintf(fp, "  %#018llx", (unsigned long long)self->ip);
 
-	ret += fprintf(fp, " %s: %s",
+	ret += fprintf(fp, "  %s: %s",
 			self->dso ? self->dso->name : "<unknown>",
 			self->sym ? self->sym->name : "<unknown>");
 
@@ -401,7 +402,7 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_sym = {
-	.header = "Shared Object: Symbol",
+	.header = " Shared Object: Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
@@ -595,8 +596,8 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		int i;
 
-		fprintf(fp, " ");
-		for (i = 0; i < strlen(se->header); i++)
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header)-1; i++)
 			fprintf(fp, ".");
 	}
 	fprintf(fp, "\n");
-- 
cgit v0.10.2


From 50b64e3b77d569c217a48e078cd565dbd6462ad0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 2 Jun 2009 06:55:20 -0400
Subject: cifs: fix IPv6 address length check

For IPv6 the userspace mount helper sends an address in the "ip="
option.  This check fails if the length is > 35 characters. I have no
idea where the magic 35 character limit came from, but it's clearly not
enough for IPv6. Fix it by making it use the INET6_ADDRSTRLEN #define.

While we're at it, use the same #define for the address length in SPNEGO
upcalls.

Reported-by: Charles R. Anderson <cra@wpi.edu>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a..4a4581c 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
+#include <linux/inet.h>
 #include "cifsglob.h"
 #include "cifs_spnego.h"
 #include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN	13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
-#define MAX_IPV6_ADDR_LEN	43
-
 /* strlen of "host=" */
 #define HOST_KEY_LEN		5
 
@@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	   host=hostname sec=mechanism uid=0xFF user=username */
 	desc_len = MAX_VER_STR_LEN +
 		   HOST_KEY_LEN + strlen(hostname) +
-		   IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+		   IP_KEY_LEN + INET6_ADDRSTRLEN +
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8ae563f..74b5a87 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -960,7 +961,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "ip", 2) == 0) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
-			} else if (strnlen(value, 35) < 35) {
+			} else if (strnlen(value, INET6_ADDRSTRLEN) <
+							INET6_ADDRSTRLEN) {
 				vol->UNCip = value;
 			} else {
 				printk(KERN_WARNING "CIFS: ip address "
-- 
cgit v0.10.2


From 179c498ae2998461fe436437a74dc29036fc7dcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:03:19 -0400
Subject: function-graph: only allocate init tasks if it was not already done

When the function graph tracer is enabled, it calls the initialization
needed for the init tasks that would be called on all created tasks.

The problem is that this is called every time the function graph tracer
is enabled, and the ret_stack is allocated for the idle tasks each time.
Thus, the old ret_stack is lost and a memory leak is created.

This is also dangerous because if an interrupt happened on another CPU
with the init task and the ret_stack is replaced, we then lose all the
return pointers for the interrupt, and a crash would take place.

[ Impact: fix memory leak and possible crash due to race ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080..ebff62e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2643,8 +2643,10 @@ static int start_graph_tracing(void)
 		return -ENOMEM;
 
 	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu)
-		ftrace_graph_init_task(idle_task(cpu));
+	for_each_online_cpu(cpu) {
+		if (!idle_task(cpu)->ret_stack)
+			ftrace_graph_init_task(idle_task(cpu));
+	}
 
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
-- 
cgit v0.10.2


From d3ae33efb8e2f277f9007eb060c9d0b91ab38ae1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 2 Jun 2009 12:34:31 +0100
Subject: pata_netcell: LBA48 force identify bits correct

This matches Bartlomiej's patch for ide_pci_generic:
c339dfdd65b52bfd947ab29d1210314a2f6d622d

In the libata case netcell has its own mini driver. I suspect this fix is
actually only needed for some firmware revs but it does no harm either way.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/ata/pata_netcell.c b/drivers/ata/pata_netcell.c
index bdb2369..9a69809 100644
--- a/drivers/ata/pata_netcell.c
+++ b/drivers/ata/pata_netcell.c
@@ -20,13 +20,24 @@
 
 /* No PIO or DMA methods needed for this device */
 
+static unsigned int netcell_read_id(struct ata_device *adev,
+					struct ata_taskfile *tf, u16 *id)
+{
+	unsigned int err_mask = ata_do_dev_read_id(adev, tf, id);
+	/* Firmware forgets to mark words 85-87 valid */
+	if (err_mask == 0)
+		id[ATA_ID_CSF_DEFAULT] |= 0x0400;
+	return err_mask;
+}
+
 static struct scsi_host_template netcell_sht = {
 	ATA_BMDMA_SHT(DRV_NAME),
 };
 
 static struct ata_port_operations netcell_ops = {
 	.inherits	= &ata_bmdma_port_ops,
-	.cable_detect		= ata_cable_80wire,
+	.cable_detect	= ata_cable_80wire,
+	.read_id	= netcell_read_id,
 };
 
 
-- 
cgit v0.10.2


From 05ad709d04799125ed85dd816fdb558258102172 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@etchedpixels.co.uk>
Date: Tue, 2 Jun 2009 16:58:10 +0100
Subject: parport: quickfix the proc registration bug

Ideally we should have a directory of drivers and a link to the 'active'
driver. For now just show the first device which is effectively the existing
semantics without a warning.

This is an update on the original buggy patch that I then forgot to
resubmit. Confusingly it was proposed by Red Hat, written by Etched Pixels
fixed and submitted by Intel ...

Resolves-Bug: http://bugzilla.kernel.org/show_bug.cgi?id=9749
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 0ebca45..dffa5d4 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -614,7 +614,10 @@ parport_register_device(struct parport *port, const char *name,
 	 * pardevice fields. -arca
 	 */
 	port->ops->init_state(tmp, tmp->state);
-	parport_device_proc_register(tmp);
+	if (!test_and_set_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags)) {
+		port->proc_device = tmp;
+		parport_device_proc_register(tmp);
+	}
 	return tmp;
 
  out_free_all:
@@ -646,10 +649,14 @@ void parport_unregister_device(struct pardevice *dev)
 	}
 #endif
 
-	parport_device_proc_unregister(dev);
-
 	port = dev->port->physport;
 
+	if (port->proc_device == dev) {
+		port->proc_device = NULL;
+		clear_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags);
+		parport_device_proc_unregister(dev);
+	}
+
 	if (port->cad == dev) {
 		printk(KERN_DEBUG "%s: %s forgot to release port\n",
 		       port->name, dev->name);
diff --git a/include/linux/parport.h b/include/linux/parport.h
index e1f83c5..38a423e 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -324,6 +324,10 @@ struct parport {
 	int spintime;
 	atomic_t ref_count;
 
+	unsigned long devflags;
+#define PARPORT_DEVPROC_REGISTERED	0
+	struct pardevice *proc_device;	/* Currently register proc device */
+
 	struct list_head full_list;
 	struct parport *slaves[3];
 };
-- 
cgit v0.10.2


From 82310a3272d5a2a7652f5649ad8a55f58c8f74d9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:26:07 -0400
Subject: function-graph: enable the stack after initialization of other
 variables

The function graph tracer checks if the task_struct has ret_stack defined
to know if it is OK or not to use it. The initialization is done for
all tasks by one process, but the idle tasks use the same initialization
used by new tasks.

If an interrupt happens on an idle task that just had the ret_stack
created, but before the rest of the initialization took place, then
we can corrupt the return address of the functions.

This patch moves the setting of the task_struct's ret_stack to after
the other variables have been initialized.

[ Impact: prevent kernel panic on idle task when starting function graph ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebff62e..20e0660 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2739,15 +2739,20 @@ void unregister_ftrace_graph(void)
 void ftrace_graph_init_task(struct task_struct *t)
 {
 	if (atomic_read(&ftrace_graph_active)) {
-		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-		if (!t->ret_stack)
+		if (!ret_stack)
 			return;
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 		t->ftrace_timestamp = 0;
+		/* make curr_ret_stack visable before we add the ret_stack */
+		smp_wmb();
+		t->ret_stack = ret_stack;
 	} else
 		t->ret_stack = NULL;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e..baeb5fe 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	if (!current->ret_stack)
 		return -EBUSY;
 
+	/*
+	 * We must make sure the ret_stack is tested before we read
+	 * anything else.
+	 */
+	smp_rmb();
+
 	/* The return trace stack is full */
 	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
 		atomic_inc(&current->trace_overrun);
-- 
cgit v0.10.2


From 26c01624a2a40f8a4ddf6449b65c9b1c418d0e72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 14:01:19 -0400
Subject: function-graph: add memory barriers for accessing task's ret_stack

The code that handles the tasks ret_stack allocation for every task
assumes that only an interrupt can cause issues (even though interrupts
are disabled).

In reality, the code is allocating the ret_stack for tasks that may be
running on other CPUs and there are not efficient memory barriers to
handle this case.

[ Impact: prevent crash due to using of uninitialized ret_stack variables ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 20e0660..1664d3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2580,12 +2580,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 
 		if (t->ret_stack == NULL) {
-			t->curr_ret_stack = -1;
-			/* Make sure IRQs see the -1 first: */
-			barrier();
-			t->ret_stack = ret_stack_list[start++];
 			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
+			t->curr_ret_stack = -1;
+			/* Make sure the tasks see the -1 first: */
+			smp_wmb();
+			t->ret_stack = ret_stack_list[start++];
 		}
 	} while_each_thread(g, t);
 
-- 
cgit v0.10.2


From 29c2810276fbf8419c9b4d942b99c873e9b7640a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:56:26 +0200
Subject: perf_counter tools: Remove the last nmi bits

Everything is nmi these days, remove the userspace bits so that
the kernel can drop the interface.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 5886791..644f850 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -84,7 +84,6 @@ static void create_perfstat_counter(int counter)
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
-	hw_event.nmi		= 1;
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 24a8879..a2cff7b 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -581,7 +581,6 @@ static int __cmd_top(void)
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
-- 
cgit v0.10.2


From 53e111a730ea8b002d57dd226098c12789993329 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 17:01:58 +0200
Subject: x86: Fix atomic_long_xchg() on 64bit

Apparently I'm the first to use it :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 3673a13..81d3be4 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
 #define atomic_long_cmpxchg(l, old, new) \
 	(atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))
 #define atomic_long_xchg(v, new) \
-	(atomic64_xchg((atomic64_t *)(l), (new)))
+	(atomic64_xchg((atomic64_t *)(v), (new)))
 
 #else  /*  BITS_PER_LONG == 64  */
 
-- 
cgit v0.10.2


From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:08:15 +0200
Subject: perf_counter: Add unique counter id

Stephan raised the issue that we currently cannot distinguish between
similar counters within a group (PERF_RECORD_GROUP uses the config
value as identifier).

Therefore, generate a new ID for each counter using a global u64
sequence counter.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9ec20fc..4845a21 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -114,8 +114,9 @@ enum perf_counter_record_format {
  * in increasing order of bit value, after the counter value.
  */
 enum perf_counter_read_format {
-	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1,
-	PERF_FORMAT_TOTAL_TIME_RUNNING	=  2,
+	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING	=  1U << 1,
+	PERF_FORMAT_ID			=  1U << 2,
 };
 
 /*
@@ -290,7 +291,7 @@ enum perf_event_type {
 	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 *	{ u64			nr;
-	 *	  { u64 event, val; }	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
 	 *	{ u16			nr,
 	 *				hv,
@@ -503,6 +504,7 @@ struct perf_counter {
 	struct rcu_head			rcu_head;
 
 	struct pid_namespace		*ns;
+	u64				id;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index caa012c..978ecfc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1510,6 +1510,8 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
+	if (counter->hw_event.read_format & PERF_FORMAT_ID)
+		values[n++] = counter->id;
 	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
@@ -2303,7 +2305,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		u32 pid, tid;
 	} tid_entry;
 	struct {
-		u64 event;
+		u64 id;
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
@@ -2416,7 +2418,7 @@ static void perf_counter_output(struct perf_counter *counter,
 			if (sub != counter)
 				sub->pmu->read(sub);
 
-			group_entry.event = sub->hw_event.config;
+			group_entry.id = sub->id;
 			group_entry.counter = atomic64_read(&sub->count);
 
 			perf_output_put(&handle, group_entry);
@@ -3375,6 +3377,8 @@ done:
 	return counter;
 }
 
+static atomic64_t perf_counter_id;
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -3470,6 +3474,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	mutex_unlock(&current->perf_counter_mutex);
 
 	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+	counter->id = atomic64_inc_return(&perf_counter_id);
 
 	fput_light(counter_file, fput_needed2);
 
-- 
cgit v0.10.2


From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:13:03 +0200
Subject: perf_counter: Rename various fields

A few renames:

  s/irq_period/sample_period/
  s/irq_freq/sample_freq/
  s/PERF_RECORD_/PERF_SAMPLE_/
  s/record_type/sample_type/

And change both the new sample_type and read_format to u64.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f96d55f..c963332 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -535,7 +535,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw.irq_period) {
+		if (counter->hw.sample_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	s64 val, left;
 	unsigned long flags;
 
-	if (!counter->hw.idx || !counter->hw.irq_period)
+	if (!counter->hw.idx || !counter->hw.sample_period)
 		return;
 	local_irq_save(flags);
 	perf_disable();
 	power_pmu_read(counter);
-	left = counter->hw.irq_period;
+	left = counter->hw.sample_period;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
 	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.irq_period)
+	    || counter->hw_event.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
-	u64 period = counter->hw.irq_period;
+	u64 period = counter->hw.sample_period;
 	s64 prev, delta, left;
 	int record = 0;
 	u64 addr, mmcra, sdsync;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 316b0c9..ec06aa5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi	= 1;
 	hw_event->nmi	= 1;
 
-	if (!hwc->irq_period)
-		hwc->irq_period = x86_pmu.max_period;
+	if (!hwc->sample_period)
+		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->irq_period));
+			min(x86_pmu.max_period, hwc->sample_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->irq_period);
+	s64 period = min(x86_pmu.max_period, hwc->sample_period);
 	int err;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4845a21..1fcd3cc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -94,18 +94,18 @@ enum sw_event_ids {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
- * Bits that can be set in hw_event.record_type to request information
+ * Bits that can be set in hw_event.sample_type to request information
  * in the overflow packets.
  */
-enum perf_counter_record_format {
-	PERF_RECORD_IP			= 1U << 0,
-	PERF_RECORD_TID			= 1U << 1,
-	PERF_RECORD_TIME		= 1U << 2,
-	PERF_RECORD_ADDR		= 1U << 3,
-	PERF_RECORD_GROUP		= 1U << 4,
-	PERF_RECORD_CALLCHAIN		= 1U << 5,
-	PERF_RECORD_CONFIG		= 1U << 6,
-	PERF_RECORD_CPU			= 1U << 7,
+enum perf_counter_sample_format {
+	PERF_SAMPLE_IP			= 1U << 0,
+	PERF_SAMPLE_TID			= 1U << 1,
+	PERF_SAMPLE_TIME		= 1U << 2,
+	PERF_SAMPLE_ADDR		= 1U << 3,
+	PERF_SAMPLE_GROUP		= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
+	PERF_SAMPLE_CONFIG		= 1U << 6,
+	PERF_SAMPLE_CPU			= 1U << 7,
 };
 
 /*
@@ -132,12 +132,12 @@ struct perf_counter_hw_event {
 	__u64			config;
 
 	union {
-		__u64		irq_period;
-		__u64		irq_freq;
+		__u64		sample_period;
+		__u64		sample_freq;
 	};
 
-	__u32			record_type;
-	__u32			read_format;
+	__u64			sample_type;
+	__u64			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
@@ -262,7 +262,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
-	 *	u64				irq_period;
+	 *	u64				sample_period;
 	 * };
 	 */
 	PERF_EVENT_PERIOD		= 4,
@@ -363,7 +363,7 @@ struct hw_perf_counter {
 		};
 	};
 	atomic64_t			prev_count;
-	u64				irq_period;
+	u64				sample_period;
 	atomic64_t			period_left;
 	u64				interrupts;
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 978ecfc..5ecd9981 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period);
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 interrupts, irq_period;
+	u64 interrupts, sample_period;
 	u64 events, period;
 	s64 delta;
 
@@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 			interrupts = 2*sysctl_perf_counter_limit/HZ;
 		}
 
-		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+		if (!counter->hw_event.freq || !counter->hw_event.sample_freq)
 			continue;
 
-		events = HZ * interrupts * counter->hw.irq_period;
-		period = div64_u64(events, counter->hw_event.irq_freq);
+		events = HZ * interrupts * counter->hw.sample_period;
+		period = div64_u64(events, counter->hw_event.sample_freq);
 
-		delta = (s64)(1 + period - counter->hw.irq_period);
+		delta = (s64)(1 + period - counter->hw.sample_period);
 		delta >>= 1;
 
-		irq_period = counter->hw.irq_period + delta;
+		sample_period = counter->hw.sample_period + delta;
 
-		if (!irq_period)
-			irq_period = 1;
+		if (!sample_period)
+			sample_period = 1;
 
-		perf_log_period(counter, irq_period);
+		perf_log_period(counter, sample_period);
 
-		counter->hw.irq_period = irq_period;
+		counter->hw.sample_period = sample_period;
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
-	u64 record_type = counter->hw_event.record_type;
+	u64 sample_type = counter->hw_event.sample_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
@@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
 	header.misc |= perf_misc_flags(regs);
 
-	if (record_type & PERF_RECORD_IP) {
+	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(regs);
-		header.type |= PERF_RECORD_IP;
+		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
 
-	if (record_type & PERF_RECORD_TID) {
+	if (sample_type & PERF_SAMPLE_TID) {
 		/* namespace issues */
 		tid_entry.pid = perf_counter_pid(counter, current);
 		tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.type |= PERF_RECORD_TID;
+		header.type |= PERF_SAMPLE_TID;
 		header.size += sizeof(tid_entry);
 	}
 
-	if (record_type & PERF_RECORD_TIME) {
+	if (sample_type & PERF_SAMPLE_TIME) {
 		/*
 		 * Maybe do better on x86 and provide cpu_clock_nmi()
 		 */
 		time = sched_clock();
 
-		header.type |= PERF_RECORD_TIME;
+		header.type |= PERF_SAMPLE_TIME;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_ADDR) {
-		header.type |= PERF_RECORD_ADDR;
+	if (sample_type & PERF_SAMPLE_ADDR) {
+		header.type |= PERF_SAMPLE_ADDR;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_CONFIG) {
-		header.type |= PERF_RECORD_CONFIG;
+	if (sample_type & PERF_SAMPLE_CONFIG) {
+		header.type |= PERF_SAMPLE_CONFIG;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_CPU) {
-		header.type |= PERF_RECORD_CPU;
+	if (sample_type & PERF_SAMPLE_CPU) {
+		header.type |= PERF_SAMPLE_CPU;
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
 	}
 
-	if (record_type & PERF_RECORD_GROUP) {
-		header.type |= PERF_RECORD_GROUP;
+	if (sample_type & PERF_SAMPLE_GROUP) {
+		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
 
-	if (record_type & PERF_RECORD_CALLCHAIN) {
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
-			header.type |= PERF_RECORD_CALLCHAIN;
+			header.type |= PERF_SAMPLE_CALLCHAIN;
 			header.size += callchain_size;
 		}
 	}
@@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter,
 
 	perf_output_put(&handle, header);
 
-	if (record_type & PERF_RECORD_IP)
+	if (sample_type & PERF_SAMPLE_IP)
 		perf_output_put(&handle, ip);
 
-	if (record_type & PERF_RECORD_TID)
+	if (sample_type & PERF_SAMPLE_TID)
 		perf_output_put(&handle, tid_entry);
 
-	if (record_type & PERF_RECORD_TIME)
+	if (sample_type & PERF_SAMPLE_TIME)
 		perf_output_put(&handle, time);
 
-	if (record_type & PERF_RECORD_ADDR)
+	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, addr);
 
-	if (record_type & PERF_RECORD_CONFIG)
+	if (sample_type & PERF_SAMPLE_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
-	if (record_type & PERF_RECORD_CPU)
+	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
 
 	/*
-	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
+	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
 	 */
-	if (record_type & PERF_RECORD_GROUP) {
+	if (sample_type & PERF_SAMPLE_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
 
@@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
- * Log irq_period changes so that analyzing tools can re-normalize the
+ * Log sample_period changes so that analyzing tools can re-normalize the
  * event flow.
  */
 
@@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 		.period = period,
 	};
 
-	if (counter->hw.irq_period == period)
+	if (counter->hw.sample_period == period)
 		return;
 
 	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
@@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->irq_period;
+	s64 period = hwc->sample_period;
 
 	if (unlikely(left <= -period)) {
 		left = period;
@@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			ret = HRTIMER_NORESTART;
 	}
 
-	period = max_t(u64, 10000, counter->hw.irq_period);
+	period = max_t(u64, 10000, counter->hw.sample_period);
 	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
 	return ret;
@@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.irq_period && !neg && regs)
+	if (counter->hw.sample_period && !neg && regs)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
@@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->irq_period) {
-		u64 period = max_t(u64, 10000, hwc->irq_period);
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	if (counter->hw.irq_period)
+	if (counter->hw.sample_period)
 		hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
@@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	atomic64_set(&hwc->prev_count, now);
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->irq_period) {
-		u64 period = max_t(u64, 10000, hwc->irq_period);
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	if (counter->hw.irq_period)
+	if (counter->hw.sample_period)
 		hrtimer_cancel(&counter->hw.hrtimer);
 	task_clock_perf_counter_update(counter, counter->ctx->time);
 
@@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.irq_period = counter->hw_event.irq_period;
+	counter->hw.sample_period = counter->hw_event.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	pmu = NULL;
 
 	hwc = &counter->hw;
-	if (hw_event->freq && hw_event->irq_freq)
-		hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
+	if (hw_event->freq && hw_event->sample_freq)
+		hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq);
 	else
-		hwc->irq_period = hw_event->irq_period;
+		hwc->sample_period = hw_event->sample_period;
 
 	/*
-	 * we currently do not support PERF_RECORD_GROUP on inherited counters
+	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
 	 */
-	if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
+	if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
 	if (perf_event_raw(hw_event)) {
-- 
cgit v0.10.2


From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:27:45 +0200
Subject: perf_counter: Remove the last nmi/irq bits

IRQ (non-NMI) sampling is not used anymore - remove the last few bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ec06aa5..9e144fb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	/*
-	 * Use NMI events all the time:
-	 */
-	hwc->nmi	= 1;
-	hw_event->nmi	= 1;
-
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1fcd3cc..cef9931 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -140,7 +140,6 @@ struct perf_counter_hw_event {
 	__u64			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
 				inherit	       :  1, /* children inherit it   */
 				pinned	       :  1, /* must always be on PMU */
 				exclusive      :  1, /* only group on PMU     */
@@ -153,7 +152,7 @@ struct perf_counter_hw_event {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 51;
+				__reserved_1   : 52;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
@@ -354,7 +353,6 @@ struct hw_perf_counter {
 			u64				config;
 			unsigned long			config_base;
 			unsigned long			counter_base;
-			int				nmi;
 			int				idx;
 		};
 		union { /* software */
-- 
cgit v0.10.2


From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:08:20 +0200
Subject: perf_counter: x86: Emulate longer sample periods

Do as Power already does, emulate sample periods up to 2^63-1 by
composing them of smaller values limited by hardware capabilities.
Only once we wrap the software period do we generate an overflow
event.

Just 10 lines of new code.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9e144fb..904571b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
-	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->sample_period));
+	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * Set the next IRQ period, based on the hwc->period_left value.
  * To be called with the counter disabled in hw:
  */
-static void
+static int
 x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->sample_period);
-	int err;
+	s64 period = hwc->sample_period;
+	int err, ret = 0;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 	/*
 	 * Quirk: certain CPUs dont like it if just 1 event is left:
@@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left < 2))
 		left = 2;
 
+	if (left > x86_pmu.max_period)
+		left = x86_pmu.max_period;
+
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
@@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
 			     (u64)(-left) & x86_pmu.counter_mask);
+
+	return ret;
 }
 
 static inline void
@@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void intel_pmu_save_and_restart(struct perf_counter *counter)
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	int ret;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	x86_perf_counter_set_period(counter, hwc, idx);
+	ret = x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		intel_pmu_enable_counter(hwc, idx);
+
+	return ret;
 }
 
 static void intel_pmu_reset(void)
@@ -782,7 +791,9 @@ again:
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
-		intel_pmu_save_and_restart(counter);
+		if (!intel_pmu_save_and_restart(counter))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
@@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 
 		/* counter overflow */
-		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
-- 
cgit v0.10.2


From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:16:02 +0200
Subject: perf_counter: Change data head from u32 to u64

Since some people worried that 4G might not be a large enough
as an mmap data window, extend it to 64 bit for capable
platforms.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index cef9931..c046f7d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -212,7 +212,7 @@ struct perf_counter_mmap_page {
 	 * User-space reading this value should issue an rmb(), on SMP capable
 	 * platforms, after reading this value -- see perf_counter_wakeup().
 	 */
-	__u32   data_head;		/* head in the data section */
+	__u64   data_head;		/* head in the data section */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
@@ -397,10 +397,11 @@ struct perf_mmap_data {
 	int				nr_locked;	/* nr pages mlocked  */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
-	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
-	atomic_t			done_head;	/* completed head    */
+	atomic_long_t			head;		/* write position    */
+	atomic_long_t			done_head;	/* completed head    */
+
 	atomic_t			lock;		/* concurrent writes */
 
 	atomic_t			wakeup;		/* needs a wakeup    */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5ecd9981..3f11a2b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2067,8 +2067,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 struct perf_output_handle {
 	struct perf_counter	*counter;
 	struct perf_mmap_data	*data;
-	unsigned int		offset;
-	unsigned int		head;
+	unsigned long		head;
+	unsigned long		offset;
 	int			nmi;
 	int			overflow;
 	int			locked;
@@ -2122,7 +2122,8 @@ static void perf_output_lock(struct perf_output_handle *handle)
 static void perf_output_unlock(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int head, cpu;
+	unsigned long head;
+	int cpu;
 
 	data->done_head = data->head;
 
@@ -2135,7 +2136,7 @@ again:
 	 * before we publish the new head, matched by a rmb() in userspace when
 	 * reading this position.
 	 */
-	while ((head = atomic_xchg(&data->done_head, 0)))
+	while ((head = atomic_long_xchg(&data->done_head, 0)))
 		data->user_page->data_head = head;
 
 	/*
@@ -2148,7 +2149,7 @@ again:
 	/*
 	 * Therefore we have to validate we did not indeed do so.
 	 */
-	if (unlikely(atomic_read(&data->done_head))) {
+	if (unlikely(atomic_long_read(&data->done_head))) {
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
@@ -2195,7 +2196,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
-	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
 	handle->head	= head;
@@ -2246,7 +2247,7 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	 * Check we didn't copy past our reservation window, taking the
 	 * possible unsigned int wrap into account.
 	 */
-	WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 #define perf_output_put(handle, x) \
-- 
cgit v0.10.2


From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:46:57 +0200
Subject: perf_counter: Add ioctl for changing the sample period/frequency

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c046f7d..45bdd3b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -164,10 +164,11 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
-#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
-#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3f11a2b..abe2f3b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1604,6 +1604,43 @@ static void perf_counter_for_each(struct perf_counter *counter,
 	mutex_unlock(&counter->child_mutex);
 }
 
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long size;
+	int ret = 0;
+	u64 value;
+
+	if (!counter->hw_event.sample_period)
+		return -EINVAL;
+
+	size = copy_from_user(&value, arg, sizeof(value));
+	if (size != sizeof(value))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->hw_event.freq) {
+		if (value > sysctl_perf_counter_limit) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		counter->hw_event.sample_freq = value;
+	} else {
+		counter->hw_event.sample_period = value;
+		counter->hw.sample_period = value;
+
+		perf_log_period(counter, value);
+	}
+unlock:
+	spin_unlock_irq(&ctx->lock);
+
+	return ret;
+}
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1623,6 +1660,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case PERF_COUNTER_IOC_REFRESH:
 		return perf_counter_refresh(counter, arg);
+
+	case PERF_COUNTER_IOC_PERIOD:
+		return perf_counter_period(counter, (u64 __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
-- 
cgit v0.10.2


From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 19:22:16 +0200
Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr

The structure isn't hw only and when I read event, I think about those
things that fall out the other end. Rename the thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c963332..ea54686 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
 		}
 		counter = ctrs[i];
 		if (first) {
-			eu = counter->hw_event.exclude_user;
-			ek = counter->hw_event.exclude_kernel;
-			eh = counter->hw_event.exclude_hv;
+			eu = counter->attr.exclude_user;
+			ek = counter->attr.exclude_kernel;
+			eh = counter->attr.exclude_hv;
 			first = 0;
-		} else if (counter->hw_event.exclude_user != eu ||
-			   counter->hw_event.exclude_kernel != ek ||
-			   counter->hw_event.exclude_hv != eh) {
+		} else if (counter->attr.exclude_user != eu ||
+			   counter->attr.exclude_kernel != ek ||
+			   counter->attr.exclude_hv != eh) {
 			return -EAGAIN;
 		}
 	}
@@ -483,16 +483,16 @@ void hw_perf_enable(void)
 
 	/*
 	 * Add in MMCR0 freeze bits corresponding to the
-	 * hw_event.exclude_* bits for the first counter.
+	 * attr.exclude_* bits for the first counter.
 	 * We have already checked that all counters have the
 	 * same values for these bits as the first counter.
 	 */
 	counter = cpuhw->counter[0];
-	if (counter->hw_event.exclude_user)
+	if (counter->attr.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
-	if (counter->hw_event.exclude_kernel)
+	if (counter->attr.exclude_kernel)
 		cpuhw->mmcr[0] |= freeze_counters_kernel;
-	if (counter->hw_event.exclude_hv)
+	if (counter->attr.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
 	/*
@@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	int n;
 	u64 alt[MAX_EVENT_ALTERNATIVES];
 
-	if (counter->hw_event.exclude_user
-	    || counter->hw_event.exclude_kernel
-	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.sample_period)
+	if (counter->attr.exclude_user
+	    || counter->attr.exclude_kernel
+	    || counter->attr.exclude_hv
+	    || counter->attr.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->hw_event)) {
-		ev = perf_event_id(&counter->hw_event);
+	if (!perf_event_raw(&counter->attr)) {
+		ev = perf_event_id(&counter->attr);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->hw_event);
+		ev = perf_event_config(&counter->attr);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
@@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		counter->hw_event.exclude_hv = 0;
+		counter->attr.exclude_hv = 0;
 
 	/*
 	 * If this is a per-task counter, then we can use
@@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.record_type & PERF_RECORD_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 904571b..e16e8c1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void)
 }
 
 /*
- * Setup the hardware configuration for a given hw_event_type
+ * Setup the hardware configuration for a given attr_type
  */
 static int __hw_perf_counter_init(struct perf_counter *counter)
 {
-	struct perf_counter_hw_event *hw_event = &counter->hw_event;
+	struct perf_counter_attr *attr = &counter->attr;
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
@@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Count user and OS events unless requested not to.
 	 */
-	if (!hw_event->exclude_user)
+	if (!attr->exclude_user)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!hw_event->exclude_kernel)
+	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	if (!hwc->sample_period)
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
+	if (perf_event_raw(attr)) {
+		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu.max_events)
+		if (perf_event_id(attr) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 45bdd3b..37d5541 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -22,7 +22,7 @@
  */
 
 /*
- * hw_event.type
+ * attr.type
  */
 enum perf_event_types {
 	PERF_TYPE_HARDWARE		= 0,
@@ -37,10 +37,10 @@ enum perf_event_types {
 };
 
 /*
- * Generalized performance counter event types, used by the hw_event.event_id
+ * Generalized performance counter event types, used by the attr.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum hw_event_ids {
+enum attr_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
@@ -94,7 +94,7 @@ enum sw_event_ids {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
- * Bits that can be set in hw_event.sample_type to request information
+ * Bits that can be set in attr.sample_type to request information
  * in the overflow packets.
  */
 enum perf_counter_sample_format {
@@ -109,7 +109,7 @@ enum perf_counter_sample_format {
 };
 
 /*
- * Bits that can be set in hw_event.read_format to request that
+ * Bits that can be set in attr.read_format to request that
  * reads on the counter should return the indicated quantities,
  * in increasing order of bit value, after the counter value.
  */
@@ -122,7 +122,7 @@ enum perf_counter_read_format {
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
-struct perf_counter_hw_event {
+struct perf_counter_attr {
 	/*
 	 * The MSB of the config word signifies if the rest contains cpu
 	 * specific (raw) counter configuration data, if unset, the next
@@ -323,25 +323,25 @@ enum perf_event_type {
 
 struct task_struct;
 
-static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_raw(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_RAW_MASK;
+	return attr->config & PERF_COUNTER_RAW_MASK;
 }
 
-static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_config(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+	return attr->config & PERF_COUNTER_CONFIG_MASK;
 }
 
-static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_type(struct perf_counter_attr *attr)
 {
-	return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+	return (attr->config & PERF_COUNTER_TYPE_MASK) >>
 		PERF_COUNTER_TYPE_SHIFT;
 }
 
-static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_id(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_EVENT_MASK;
+	return attr->config & PERF_COUNTER_EVENT_MASK;
 }
 
 /**
@@ -457,7 +457,7 @@ struct perf_counter {
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
-	struct perf_counter_hw_event	hw_event;
+	struct perf_counter_attr	attr;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !perf_event_raw(&counter->hw_event) &&
-		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
+	return !perf_event_raw(&counter->attr) &&
+		perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE;
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 79faae9..c6c84ad 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,7 +55,7 @@ struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
-struct perf_counter_hw_event;
+struct perf_counter_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
 asmlinkage long sys_perf_counter_open(
-		const struct perf_counter_hw_event __user *hw_event_uptr,
+		const struct perf_counter_attr __user *attr_uptr,
 		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index abe2f3b..317cef7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu--;
 	ctx->nr_active--;
-	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+	if (counter->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 }
 
@@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter,
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
 
-	if (group_counter->hw_event.exclusive)
+	if (group_counter->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
@@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter,
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
-	if (counter->hw_event.exclusive)
+	if (counter->attr.exclusive)
 		cpuctx->exclusive = 1;
 
 	return 0;
@@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter,
 	 * If this group is exclusive and there are already
 	 * counters on the CPU, it can't go on.
 	 */
-	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+	if (counter->attr.exclusive && cpuctx->active_oncpu)
 		return 0;
 	/*
 	 * Otherwise, try to add it if all previous groups were able
@@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned) {
+		if (leader->attr.pinned) {
 			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
 		}
@@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned) {
+		if (leader->attr.pinned) {
 			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
 		}
@@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 	/*
 	 * not supported on inherited counters
 	 */
-	if (counter->hw_event.inherit)
+	if (counter->attr.inherit)
 		return -EINVAL;
 
 	atomic_add(refresh, &counter->event_limit);
@@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	 */
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    !counter->hw_event.pinned)
+		    !counter->attr.pinned)
 			continue;
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
@@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * ignore pinned counters since we did them already.
 		 */
 		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    counter->hw_event.pinned)
+		    counter->attr.pinned)
 			continue;
 
 		/*
@@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 			interrupts = 2*sysctl_perf_counter_limit/HZ;
 		}
 
-		if (!counter->hw_event.freq || !counter->hw_event.sample_freq)
+		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
 		events = HZ * interrupts * counter->hw.sample_period;
-		period = div64_u64(events, counter->hw_event.sample_freq);
+		period = div64_u64(events, counter->attr.sample_freq);
 
 		delta = (s64)(1 + period - counter->hw.sample_period);
 		delta >>= 1;
@@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter)
 	perf_pending_sync(counter);
 
 	atomic_dec(&nr_counters);
-	if (counter->hw_event.mmap)
+	if (counter->attr.mmap)
 		atomic_dec(&nr_mmap_tracking);
-	if (counter->hw_event.munmap)
+	if (counter->attr.munmap)
 		atomic_dec(&nr_munmap_tracking);
-	if (counter->hw_event.comm)
+	if (counter->attr.comm)
 		atomic_dec(&nr_comm_tracking);
 
 	if (counter->destroy)
@@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
-	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = counter->total_time_enabled +
 			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
-	if (counter->hw_event.read_format & PERF_FORMAT_ID)
+	if (counter->attr.read_format & PERF_FORMAT_ID)
 		values[n++] = counter->id;
 	mutex_unlock(&counter->child_mutex);
 
@@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 	int ret = 0;
 	u64 value;
 
-	if (!counter->hw_event.sample_period)
+	if (!counter->attr.sample_period)
 		return -EINVAL;
 
 	size = copy_from_user(&value, arg, sizeof(value));
@@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->lock);
-	if (counter->hw_event.freq) {
+	if (counter->attr.freq) {
 		if (value > sysctl_perf_counter_limit) {
 			ret = -EINVAL;
 			goto unlock;
 		}
 
-		counter->hw_event.sample_freq = value;
+		counter->attr.sample_freq = value;
 	} else {
-		counter->hw_event.sample_period = value;
+		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
 
 		perf_log_period(counter, value);
@@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 	struct perf_counter *counter = handle->counter;
 	struct perf_mmap_data *data = handle->data;
 
-	int wakeup_events = counter->hw_event.wakeup_events;
+	int wakeup_events = counter->attr.wakeup_events;
 
 	if (handle->overflow && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
@@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
-	u64 sample_type = counter->hw_event.sample_type;
+	u64 sample_type = counter->attr.sample_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
@@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		perf_output_put(&handle, addr);
 
 	if (sample_type & PERF_SAMPLE_CONFIG)
-		perf_output_put(&handle, counter->hw_event.config);
+		perf_output_put(&handle, counter->attr.config);
 
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
@@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 static int perf_counter_comm_match(struct perf_counter *counter,
 				   struct perf_comm_event *comm_event)
 {
-	if (counter->hw_event.comm &&
+	if (counter->attr.comm &&
 	    comm_event->event.header.type == PERF_EVENT_COMM)
 		return 1;
 
@@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 static int perf_counter_mmap_match(struct perf_counter *counter,
 				   struct perf_mmap_event *mmap_event)
 {
-	if (counter->hw_event.mmap &&
+	if (counter->attr.mmap &&
 	    mmap_event->event.header.type == PERF_EVENT_MMAP)
 		return 1;
 
-	if (counter->hw_event.munmap &&
+	if (counter->attr.munmap &&
 	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
 		return 1;
 
@@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->hw_event.exclude_kernel || !regs) &&
-			!counter->hw_event.exclude_user)
+	if ((counter->attr.exclude_kernel || !regs) &&
+			!counter->attr.exclude_user)
 		regs = task_pt_regs(current);
 
 	if (regs) {
@@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (counter->hw_event.config != event_config)
+	if (counter->attr.config != event_config)
 		return 0;
 
 	if (regs) {
-		if (counter->hw_event.exclude_user && user_mode(regs))
+		if (counter->attr.exclude_user && user_mode(regs))
 			return 0;
 
-		if (counter->hw_event.exclude_kernel && !user_mode(regs))
+		if (counter->attr.exclude_kernel && !user_mode(regs))
 			return 0;
 	}
 
@@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(perf_event_id(&counter->hw_event));
+	ftrace_profile_disable(perf_event_id(&counter->attr));
 }
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = perf_event_id(&counter->hw_event);
+	int event_id = perf_event_id(&counter->attr);
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.sample_period = counter->hw_event.sample_period;
+	counter->hw.sample_period = counter->attr.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (perf_event_id(&counter->hw_event)) {
+	switch (perf_event_id(&counter->attr)) {
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
@@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_hw_event *hw_event,
+perf_counter_alloc(struct perf_counter_attr *attr,
 		   int cpu,
 		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
@@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	mutex_init(&counter->mmap_mutex);
 
 	counter->cpu			= cpu;
-	counter->hw_event		= *hw_event;
+	counter->attr		= *attr;
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
 	counter->oncpu			= -1;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	if (hw_event->disabled)
+	if (attr->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
 	pmu = NULL;
 
 	hwc = &counter->hw;
-	if (hw_event->freq && hw_event->sample_freq)
-		hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq);
+	if (attr->freq && attr->sample_freq)
+		hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq);
 	else
-		hwc->sample_period = hw_event->sample_period;
+		hwc->sample_period = attr->sample_period;
 
 	/*
 	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
 	 */
-	if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
-	if (perf_event_raw(hw_event)) {
+	if (perf_event_raw(attr)) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
-	switch (perf_event_type(hw_event)) {
+	switch (perf_event_type(attr)) {
 	case PERF_TYPE_HARDWARE:
 		pmu = hw_perf_counter_init(counter);
 		break;
@@ -3409,11 +3409,11 @@ done:
 	counter->pmu = pmu;
 
 	atomic_inc(&nr_counters);
-	if (counter->hw_event.mmap)
+	if (counter->attr.mmap)
 		atomic_inc(&nr_mmap_tracking);
-	if (counter->hw_event.munmap)
+	if (counter->attr.munmap)
 		atomic_inc(&nr_munmap_tracking);
-	if (counter->hw_event.comm)
+	if (counter->attr.comm)
 		atomic_inc(&nr_comm_tracking);
 
 	return counter;
@@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id;
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
- * @hw_event_uptr:	event type attributes for monitoring/sampling
+ * @attr_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
 SYSCALL_DEFINE5(perf_counter_open,
-		const struct perf_counter_hw_event __user *, hw_event_uptr,
+		const struct perf_counter_attr __user *, attr_uptr,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_counter *counter, *group_leader;
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	struct perf_counter_context *ctx;
 	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
@@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (flags)
 		return -EINVAL;
 
-	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
+	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
 		return -EFAULT;
 
 	/*
@@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 		/*
 		 * Only a group leader can be exclusive or pinned
 		 */
-		if (hw_event.exclusive || hw_event.pinned)
+		if (attr.exclusive || attr.pinned)
 			goto err_put_context;
 	}
 
-	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     GFP_KERNEL);
 	ret = PTR_ERR(counter);
 	if (IS_ERR(counter))
@@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	if (parent_counter->parent)
 		parent_counter = parent_counter->parent;
 
-	child_counter = perf_counter_alloc(&parent_counter->hw_event,
+	child_counter = perf_counter_alloc(&parent_counter->attr,
 					   parent_counter->cpu, child_ctx,
 					   group_leader, GFP_KERNEL);
 	if (IS_ERR(child_counter))
@@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter,
 
 	/*
 	 * Make the child state follow the state of the parent counter,
-	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
 	 * so we won't race with perf_counter_{en, dis}able_family.
 	 */
 	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
@@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * inherit into child's child as well:
 	 */
-	child_counter->hw_event.inherit = 1;
+	child_counter->attr.inherit = 1;
 
 	/*
 	 * Get a reference to the parent filp - we will fput it
@@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child)
 		if (counter != counter->group_leader)
 			continue;
 
-		if (!counter->hw_event.inherit) {
+		if (!counter->attr.inherit) {
 			inherited_all = 0;
 			continue;
 		}
-- 
cgit v0.10.2


From c70975bc8d5bac487616785f5d5bc7b090dfa2d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 17:38:21 +0200
Subject: perf_counter tools: Fix up the ABI shakeup

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index bace7a8..c2fd042 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -322,7 +322,7 @@ static void synthesize_events(void)
 
 static void open_counters(int cpu, pid_t pid)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	int counter, group_fd;
 	int track = 1;
 
@@ -334,18 +334,18 @@ static void open_counters(int cpu, pid_t pid)
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++) {
 
-		memset(&hw_event, 0, sizeof(hw_event));
-		hw_event.config		= event_id[counter];
-		hw_event.irq_period	= event_count[counter];
-		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-		hw_event.mmap		= track;
-		hw_event.comm		= track;
-		hw_event.inherit	= (cpu < 0) && inherit;
+		memset(&attr, 0, sizeof(attr));
+		attr.config		= event_id[counter];
+		attr.sample_period	= event_count[counter];
+		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+		attr.mmap		= track;
+		attr.comm		= track;
+		attr.inherit	= (cpu < 0) && inherit;
 
 		track = 0; // only the first counter needs these
 
 		fd[nr_cpu][counter] =
-			sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
+			sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
 
 		if (fd[nr_cpu][counter] < 0) {
 			int err = errno;
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 644f850..27abe6a 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -79,22 +79,22 @@ static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
-	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+	memset(&attr, 0, sizeof(attr));
+	attr.config		= event_id[counter];
+	attr.sample_type	= 0;
+	attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
+	attr.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
 	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+		attr.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			fd[cpu][counter] = sys_perf_counter_open(&attr, -1, cpu, -1, 0);
 			if (fd[cpu][counter] < 0) {
 				printf("perfstat error: syscall returned with %d (%s)\n",
 						fd[cpu][counter], strerror(errno));
@@ -102,10 +102,10 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		hw_event.inherit	= inherit;
-		hw_event.disabled	= 1;
+		attr.inherit	= inherit;
+		attr.disabled	= 1;
 
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0);
 		if (fd[0][counter] < 0) {
 			printf("perfstat error: syscall returned with %d (%s)\n",
 					fd[0][counter], strerror(errno));
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a2cff7b..5029d8e 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -537,7 +537,7 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_RECORD_IP)
+			if (event->header.type & PERF_SAMPLE_IP)
 				process_event(event->ip.ip, md->counter);
 		} else {
 			switch (event->header.type) {
@@ -563,7 +563,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int __cmd_top(void)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
@@ -577,15 +577,15 @@ static int __cmd_top(void)
 			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.mmap		= use_mmap;
-			hw_event.munmap		= use_munmap;
-			hw_event.freq		= freq;
+			memset(&attr, 0, sizeof(attr));
+			attr.config		= event_id[counter];
+			attr.sample_period	= event_count[counter];
+			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+			attr.mmap		= use_mmap;
+			attr.munmap		= use_munmap;
+			attr.freq		= freq;
 
-			fd[i][counter] = sys_perf_counter_open(&hw_event, target_pid, cpu, group_fd, 0);
+			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 5a2520b..10622a4 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -53,11 +53,11 @@ static inline unsigned long long rdclock(void)
 	_min1 < _min2 ? _min1 : _min2; })
 
 static inline int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
 		      pid_t pid, int cpu, int group_fd,
 		      unsigned long flags)
 {
-	return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
+	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
 		       group_fd, flags);
 }
 
-- 
cgit v0.10.2


From 436224a6d8bb3e29fe0cc18122f8d1f593da67b8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 21:02:36 +0200
Subject: perf report: Separate out idle threads

Introduce the special comm name [idle] for idle theads.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 19c1e05..6d68f3a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -612,6 +612,17 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	return ret;
 }
 
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
 
 static int __cmd_report(void)
 {
@@ -626,6 +637,8 @@ static int __cmd_report(void)
 	char cwd[PATH_MAX], *cwdp = cwd;
 	int cwdlen;
 
+	register_idle_thread();
+
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
 		perror("failed to open file");
-- 
cgit v0.10.2


From cf25c63c609e99bfb9303b68a7a90a56a3a32cea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 22:12:14 +0200
Subject: perf report: Fix column width/alignment of dsos

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6d68f3a..b84aaf1 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -303,11 +303,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "          Command: Pid ",
+	.header = "         Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -363,11 +363,11 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %s", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, "  %-25s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object",
+	.header = " Shared Object          ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
-- 
cgit v0.10.2


From f7e8b616ed1cc6f790b82324bce8a2a60295e5c2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 16:39:48 -0400
Subject: function-graph: move initialization of new tasks up in fork

When the function graph tracer is enabled, all new tasks must allocate
a ret_stack to place the return address of functions. This is because
the function graph tracer will replace the real return address with a
call to the tracing of the exit function.

This initialization happens in fork, but it happens too late. If fork
fails, then it will call free_task and that calls the freeing of this
ret_stack. But before initialization happens, the new (failed) task
points to its parents ret_stack. If a fork failure happens during
the function trace, it would be catastrophic for the parent.

Also, there's no need to call ftrace_graph_exit_task from fork, since
it is called by free_task which fork calls on failure.

[ Impact: prevent crash during failed fork running function graph tracer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd..c4b1e35 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!p)
 		goto fork_out;
 
+	ftrace_graph_init_task(p);
+
 	rt_mutex_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -1131,8 +1133,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	ftrace_graph_init_task(p);
-
 	p->pid = pid_nr(pid);
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1141,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (current->nsproxy != p->nsproxy) {
 		retval = ns_cgroup_clone(p, pid);
 		if (retval)
-			goto bad_fork_free_graph;
+			goto bad_fork_free_pid;
 	}
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_graph;
+		goto bad_fork_free_pid;
 	}
 
 	if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1268,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	cgroup_post_fork(p);
 	return p;
 
-bad_fork_free_graph:
-	ftrace_graph_exit_task(p);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
-- 
cgit v0.10.2


From 84047e360af0394ac5861d433f26bbcf30f77dd1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 16:51:55 -0400
Subject: function-graph: always initialize task ret_stack

On creating a new task while running the function graph tracer, if
we fail to allocate the ret_stack, and then fail the fork, the
code will free the parent ret_stack. This is because the child
duplicated the parent and currently points to the parent's ret_stack.

This patch always initializes the task's ret_stack to NULL.

[ Impact: prevent crash of parent on low memory during fork ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1664d3f..bb081f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2738,6 +2738,9 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
+	/* Make sure we do not use the parent ret_stack */
+	t->ret_stack = NULL;
+
 	if (atomic_read(&ftrace_graph_active)) {
 		struct ftrace_ret_stack *ret_stack;
 
@@ -2753,8 +2756,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		/* make curr_ret_stack visable before we add the ret_stack */
 		smp_wmb();
 		t->ret_stack = ret_stack;
-	} else
-		t->ret_stack = NULL;
+	}
 }
 
 void ftrace_graph_exit_task(struct task_struct *t)
-- 
cgit v0.10.2


From abaff32a03e26e5d6674cb2a26ad882efe7493a3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 22:59:57 +0200
Subject: perf record: Add --append option

Allow incremental profiling via 'perf record -A' - this will append
to an existing perf.data.

Also reorder perf record options by utility / likelyhood of usage.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index c2fd042..19cba6b 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,5 +1,7 @@
-
-
+/*
+ * perf record: Record the profile of a workload (or a CPU, or a PID) into
+ * the perf.data output file - for later analysis via perf report.
+ */
 #include "perf.h"
 #include "builtin.h"
 #include "util/util.h"
@@ -28,6 +30,7 @@ static int			system_wide			= 0;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
+static int			append_file			= 0;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -385,22 +388,29 @@ static void open_counters(int cpu, pid_t pid)
 static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
+	struct stat st;
 	pid_t pid;
+	int flags;
 	int ret;
-	struct stat st;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	if (!stat(output_name, &st) && !force) {
-		fprintf(stderr, "Error, output file: %s exists, use -f to overwrite.\n",
+	if (!stat(output_name, &st) && !force && !append_file) {
+		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
 				output_name);
 		exit(-1);
 	}
 
-	output = open(output_name, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
+	flags = O_CREAT|O_RDWR;
+	if (append_file)
+		flags |= O_APPEND;
+	else
+		flags |= O_TRUNC;
+
+	output = open(output_name, flags, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -466,22 +476,24 @@ static char events_help_msg[EVENTS_HELP_MAX];
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
-	OPT_INTEGER('c', "count", &default_interval,
-		    "event period to sample"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
-	OPT_STRING('o', "output", &output_name, "file",
-		    "output file name"),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('A', "append", &append_file,
+			    "append to the output file to do incremental profiling"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file"),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 3cf165fc2e7f221a7a95098b47eb990779e29f5f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:02:59 +0200
Subject: perf record: Increase mmap buffering default

I've run into mmap overruns with the current 16 pages default,
increase it to 128 pages.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 19cba6b..8feb119 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -19,9 +19,9 @@ static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int			nr_cpus				=  0;
+static int			nr_cpus				= 0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			= 16;
+static unsigned int		mmap_pages			= 128;
 static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
-- 
cgit v0.10.2


From 0a520c63e1625b92ef775da40192e1542910e7f6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:24:45 +0200
Subject: perf report: Print more info instead of <unknown> entries

Sometimes we still fail to find a DSO or look up a symbol,
print out the raw information in this case (which an help
debug the problem), instead of a not very helpful <unknown>
string.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index b84aaf1..270e986 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -201,7 +201,9 @@ static struct thread *thread__new(pid_t pid)
 
 	if (self != NULL) {
 		self->pid = pid;
-		self->comm = NULL;
+		self->comm = malloc(30);
+		if (self->comm)
+			sprintf(self->comm, ":%d", pid);
 		INIT_LIST_HEAD(&self->maps);
 	}
 
@@ -333,7 +335,7 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, "  %16s", self->thread->comm);
 }
 
 static struct sort_entry sort_comm = {
@@ -363,7 +365,10 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %-25s", self->dso ? self->dso->name : "<unknown>");
+	if (self->dso)
+		return fprintf(fp, "  %-25s", self->dso->name);
+
+	return fprintf(fp, "  %016llx", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
@@ -392,11 +397,17 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "  %#018llx", (unsigned long long)self->ip);
+		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
+
+	if (self->dso)
+		ret += fprintf(fp, "  %s: ", self->dso->name);
+	else
+		ret += fprintf(fp, "  %#016llx: ", (__u64)self->ip);
 
-	ret += fprintf(fp, "  %s: %s",
-			self->dso ? self->dso->name : "<unknown>",
-			self->sym ? self->sym->name : "<unknown>");
+	if (self->sym)
+		ret += fprintf(fp, "%s", self->sym->name);
+	else
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
 
 	return ret;
 }
@@ -772,7 +783,8 @@ more:
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
-			fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+			if (verbose)
+				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
 			goto broken_event;
 		}
 		thread__insert_map(thread, map);
-- 
cgit v0.10.2


From eb5f4ca9536ba297c98721ecbbdf41ec5b987bd5 Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <mfuzzey@gmail.com>
Date: Mon, 1 Jun 2009 09:19:37 +0100
Subject: [ARM] 5534/1: kmalloc must return a cache line aligned buffer

Define ARCH_KMALLOC_MINALIGN in asm/cache.h
At the request of Russell also move ARCH_SLAB_MINALIGN to this file.

Signed-off-by: Martin Fuzzey <mfuzzey@gmail.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index cb7a9e9..feaa75f 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -7,4 +7,20 @@
 #define L1_CACHE_SHIFT		5
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
+/*
+ * Memory returned by kmalloc() may be used for DMA, so we must make
+ * sure that all such allocations are cache aligned. Otherwise,
+ * unrelated code may cause parts of the buffer to be read into the
+ * cache before the transfer is done, causing old data to be seen by
+ * the CPU.
+ */
+#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+
+/*
+ * With EABI on ARMv5 and above we must have 64-bit aligned slab pointers.
+ */
+#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
+#define ARCH_SLAB_MINALIGN 8
+#endif
+
 #endif
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index e6eb8a6..7b52277 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -202,13 +202,6 @@ typedef struct page *pgtable_t;
 	(((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
 	 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
-/*
- * With EABI on ARMv5 and above we must have 64-bit aligned slab pointers.
- */
-#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
-#define ARCH_SLAB_MINALIGN 8
-#endif
-
 #include <asm-generic/page.h>
 
 #endif
-- 
cgit v0.10.2


From 1946d6ef9d7bd4ba97094fe6eb68a9b877bde6b7 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Mon, 1 Jun 2009 12:50:33 +0100
Subject: [ARM] ARMv7 errata: only apply fixes when running on applicable CPU

Currently, whenever an erratum workaround is enabled, it will be
applied whether or not the erratum is relevent for the CPU.  This
patch changes this - we check the variant and revision fields in the
main ID register to determine which errata to apply.

We also avoid re-applying erratum 460075 if it has already been applied.
Applying this fix in non-secure mode results in the kernel failing to
boot (or even do anything.)

This fixes booting on some ARMv7 based platforms which otherwise
silently fail.

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 3397f1e..a08d9d2 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -184,23 +184,37 @@ __v7_setup:
 	stmia	r12, {r0-r5, r7, r9, r11, lr}
 	bl	v7_flush_dcache_all
 	ldmia	r12, {r0-r5, r7, r9, r11, lr}
+
+	mrc	p15, 0, r0, c0, c0, 0		@ read main ID register
+	and	r10, r0, #0xff000000		@ ARM?
+	teq	r10, #0x41000000
+	bne	2f
+	and	r5, r0, #0x00f00000		@ variant
+	and	r6, r0, #0x0000000f		@ revision
+	orr	r0, r6, r5, lsr #20-4		@ combine variant and revision
+
 #ifdef CONFIG_ARM_ERRATA_430973
-	mrc	p15, 0, r10, c1, c0, 1		@ read aux control register
-	orr	r10, r10, #(1 << 6)		@ set IBE to 1
-	mcr	p15, 0, r10, c1, c0, 1		@ write aux control register
+	teq	r5, #0x00100000			@ only present in r1p*
+	mrceq	p15, 0, r10, c1, c0, 1		@ read aux control register
+	orreq	r10, r10, #(1 << 6)		@ set IBE to 1
+	mcreq	p15, 0, r10, c1, c0, 1		@ write aux control register
 #endif
 #ifdef CONFIG_ARM_ERRATA_458693
-	mrc	p15, 0, r10, c1, c0, 1		@ read aux control register
-	orr	r10, r10, #(1 << 5)		@ set L1NEON to 1
-	orr	r10, r10, #(1 << 9)		@ set PLDNOP to 1
-	mcr	p15, 0, r10, c1, c0, 1		@ write aux control register
+	teq	r0, #0x20			@ only present in r2p0
+	mrceq	p15, 0, r10, c1, c0, 1		@ read aux control register
+	orreq	r10, r10, #(1 << 5)		@ set L1NEON to 1
+	orreq	r10, r10, #(1 << 9)		@ set PLDNOP to 1
+	mcreq	p15, 0, r10, c1, c0, 1		@ write aux control register
 #endif
 #ifdef CONFIG_ARM_ERRATA_460075
-	mrc	p15, 1, r10, c9, c0, 2		@ read L2 cache aux ctrl register
-	orr	r10, r10, #(1 << 22)		@ set the Write Allocate disable bit
-	mcr	p15, 1, r10, c9, c0, 2		@ write the L2 cache aux ctrl register
+	teq	r0, #0x20			@ only present in r2p0
+	mrceq	p15, 1, r10, c9, c0, 2		@ read L2 cache aux ctrl register
+	tsteq	r10, #1 << 22
+	orreq	r10, r10, #(1 << 22)		@ set the Write Allocate disable bit
+	mcreq	p15, 1, r10, c9, c0, 2		@ write the L2 cache aux ctrl register
 #endif
-	mov	r10, #0
+
+2:	mov	r10, #0
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r10, c7, c5, 0		@ I+BTB cache invalidate
 #endif
-- 
cgit v0.10.2


From bf9e187637ca3d85cee7407e3af93995868cc87c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:37:05 +0200
Subject: perf_counter tools: Make source code headers more coherent

The perf commands had different ways of describing themselves,
introduce a coherent command-file-header format taken from the
Git project.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 8feb119..2741b35 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,9 +1,14 @@
 /*
- * perf record: Record the profile of a workload (or a CPU, or a PID) into
- * the perf.data output file - for later analysis via perf report.
+ * builtin-record.c
+ *
+ * Builtin record command: Record the profile of a workload
+ * (or a CPU, or a PID) into the perf.data output file - for
+ * later analysis via perf report.
  */
-#include "perf.h"
 #include "builtin.h"
+
+#include "perf.h"
+
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 270e986..9da990f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,6 +1,14 @@
-#include "util/util.h"
+/*
+ * builtin-report.c
+ *
+ * Builtin report command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
 #include "builtin.h"
 
+#include "util/util.h"
+
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 27abe6a..2357a66 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -1,20 +1,27 @@
 /*
- * perf stat:  /usr/bin/time -alike performance counter statistics utility
+ * builtin-stat.c
+ *
+ * Builtin stat command: Give a precise performance counters summary
+ * overview about any workload, CPU or specific PID.
+ *
+ * Sample output:
 
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
+   $ perf stat ~/hackbench 10
+   Time: 0.104
 
-   Sample output:
+    Performance counter stats for '/home/mingo/hackbench':
 
-   $ perf stat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+       1255.538611  task clock ticks     #      10.143 CPU utilization factor
+             54011  context switches     #       0.043 M/sec
+               385  CPU migrations       #       0.000 M/sec
+             17755  pagefaults           #       0.014 M/sec
+        3808323185  CPU cycles           #    3033.219 M/sec
+        1575111190  instructions         #    1254.530 M/sec
+          17367895  cache references     #      13.833 M/sec
+           7674421  cache misses         #       6.112 M/sec
 
-   Performance counter stats for 'ls':
+    Wall-clock time elapsed:   123.786620 msecs
 
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
  *
  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
  *
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 5029d8e..a639352 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -1,49 +1,25 @@
 /*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     make -C Documentation/perf_counter/
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
+ * builtin-top.c
+ *
+ * Builtin top command: Display a continuously updated profile of
+ * any workload, CPU or specific PID.
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
  */
-
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
+#include "builtin.h"
 
 #include "perf.h"
-#include "builtin.h"
+
 #include "util/symbol.h"
 #include "util/util.h"
 #include "util/rbtree.h"
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index e8a8584..ec7edb7 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,4 +1,13 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
 #include "builtin.h"
+
 #include "util/exec_cmd.h"
 #include "util/cache.h"
 #include "util/quote.h"
-- 
cgit v0.10.2


From addc2785ce92ff05da8edf18317b6b4719e10d9f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:43:11 +0200
Subject: perf record: Print out the number of events captured

It makes sense to inform the user about how many events
perf record has written - so that the sufficiency of
profiling coverage and intensity can be determined at
a glance.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 2741b35..ec3b73a 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -467,6 +467,9 @@ static int __cmd_record(int argc, const char **argv)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
+
+	fprintf(stderr, "[ perf record: Captured and wrote %ld events. ]\n", events);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 850b0cee165576f969363a8c52021b5cf9ecbe67 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 2 Jun 2009 17:01:16 -0400
Subject: SELinux: define audit permissions for audit tree netlink messages

Audit trees defined 2 new netlink messages but the netlink mapping tables for
selinux permissions were not set up.  This patch maps these 2 new operations
to AUDIT_WRITE.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index c6875fd..dd7cc6d 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -112,6 +112,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_DEL_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+	{ AUDIT_TRIM,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_MAKE_EQUIV,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_TTY_GET,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
 	{ AUDIT_TTY_SET,	NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT	},
 };
-- 
cgit v0.10.2


From ab588ccadc80f6ef5495e83e176e88c5c0fc2d0e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 2 Jun 2009 14:23:39 +0900
Subject: TOMOYO: Remove redundant markers.

Remove '/***** START/STOP *****/' markers.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index ddfb9cc..a42fe02 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -866,7 +866,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
 
 	if (profile >= TOMOYO_MAX_PROFILES)
 		return NULL;
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	ptr = tomoyo_profile_ptr[profile];
 	if (ptr)
@@ -880,7 +879,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
 	tomoyo_profile_ptr[profile] = ptr;
  ok:
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr;
 }
 
@@ -1050,7 +1048,6 @@ static int tomoyo_update_manager_entry(const char *manager,
 	saved_manager = tomoyo_save_name(manager);
 	if (!saved_manager)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_policy_manager_list_lock);
 	list_for_each_entry(ptr, &tomoyo_policy_manager_list, list) {
 		if (ptr->manager != saved_manager)
@@ -1072,7 +1069,6 @@ static int tomoyo_update_manager_entry(const char *manager,
 	error = 0;
  out:
 	up_write(&tomoyo_policy_manager_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -1197,13 +1193,11 @@ static bool tomoyo_is_select_one(struct tomoyo_io_buffer *head,
 
 	if (sscanf(data, "pid=%u", &pid) == 1) {
 		struct task_struct *p;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		p = find_task_by_vpid(pid);
 		if (p)
 			domain = tomoyo_real_domain(p);
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 	} else if (!strncmp(data, "domain=", 7)) {
 		if (tomoyo_is_domain_def(data + 7)) {
 			down_read(&tomoyo_domain_list_lock);
@@ -1594,13 +1588,11 @@ static int tomoyo_read_pid(struct tomoyo_io_buffer *head)
 		const int pid = head->read_step;
 		struct task_struct *p;
 		struct tomoyo_domain_info *domain = NULL;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		p = find_task_by_vpid(pid);
 		if (p)
 			domain = tomoyo_real_domain(p);
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 		if (domain)
 			tomoyo_io_printf(head, "%d %u %s", pid, domain->profile,
 					 domain->domainname->name);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index ee43631..aa119ca 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -67,14 +67,12 @@ void tomoyo_set_domain_flag(struct tomoyo_domain_info *domain,
 {
 	/* We need to serialize because this is bitfield operation. */
 	static DEFINE_SPINLOCK(lock);
-	/***** CRITICAL SECTION START *****/
 	spin_lock(&lock);
 	if (!is_delete)
 		domain->flags |= flags;
 	else
 		domain->flags &= ~flags;
 	spin_unlock(&lock);
-	/***** CRITICAL SECTION END *****/
 }
 
 /**
@@ -135,7 +133,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
 	saved_program = tomoyo_save_name(program);
 	if (!saved_program)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_initializer_list_lock);
 	list_for_each_entry(ptr, &tomoyo_domain_initializer_list, list) {
 		if (ptr->is_not != is_not ||
@@ -161,7 +158,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
 	error = 0;
  out:
 	up_write(&tomoyo_domain_initializer_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -314,7 +310,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	saved_domainname = tomoyo_save_name(domainname);
 	if (!saved_domainname)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_keeper_list_lock);
 	list_for_each_entry(ptr, &tomoyo_domain_keeper_list, list) {
 		if (ptr->is_not != is_not ||
@@ -340,7 +335,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	error = 0;
  out:
 	up_write(&tomoyo_domain_keeper_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -475,7 +469,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
 	saved_aliased_name = tomoyo_save_name(aliased_name);
 	if (!saved_original_name || !saved_aliased_name)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_alias_list_lock);
 	list_for_each_entry(ptr, &tomoyo_alias_list, list) {
 		if (ptr->original_name != saved_original_name ||
@@ -498,7 +491,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
 	error = 0;
  out:
 	up_write(&tomoyo_alias_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -566,7 +558,6 @@ int tomoyo_delete_domain(char *domainname)
 
 	name.name = domainname;
 	tomoyo_fill_path_info(&name);
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
 	/* Is there an active domain? */
 	list_for_each_entry(domain, &tomoyo_domain_list, list) {
@@ -580,7 +571,6 @@ int tomoyo_delete_domain(char *domainname)
 		break;
 	}
 	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return 0;
 }
 
@@ -599,7 +589,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 	struct tomoyo_domain_info *domain = NULL;
 	const struct tomoyo_path_info *saved_domainname;
 
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
 	domain = tomoyo_find_domain(domainname);
 	if (domain)
@@ -618,7 +607,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		    domain->domainname != saved_domainname)
 			continue;
 		flag = false;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		for_each_process(p) {
 			if (tomoyo_real_domain(p) != domain)
@@ -627,7 +615,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 			break;
 		}
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 		if (flag)
 			continue;
 		list_for_each_entry(ptr, &domain->acl_info_list, list) {
@@ -650,7 +637,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 	}
  out:
 	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return domain;
 }
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index 2316da8..adf786d 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -166,7 +166,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
 	saved_filename = tomoyo_save_name(filename);
 	if (!saved_filename)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_globally_readable_list_lock);
 	list_for_each_entry(ptr, &tomoyo_globally_readable_list, list) {
 		if (ptr->filename != saved_filename)
@@ -187,7 +186,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
 	error = 0;
  out:
 	up_write(&tomoyo_globally_readable_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -284,7 +282,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
 	saved_pattern = tomoyo_save_name(pattern);
 	if (!saved_pattern)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_pattern_list_lock);
 	list_for_each_entry(ptr, &tomoyo_pattern_list, list) {
 		if (saved_pattern != ptr->pattern)
@@ -305,7 +302,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
 	error = 0;
  out:
 	up_write(&tomoyo_pattern_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -407,7 +403,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
 	saved_pattern = tomoyo_save_name(pattern);
 	if (!saved_pattern)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_no_rewrite_list_lock);
 	list_for_each_entry(ptr, &tomoyo_no_rewrite_list, list) {
 		if (ptr->pattern != saved_pattern)
@@ -428,7 +423,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
 	error = 0;
  out:
 	up_write(&tomoyo_no_rewrite_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -745,7 +739,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 	saved_filename = tomoyo_save_name(filename);
 	if (!saved_filename)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_acl_info_list_lock);
 	if (is_delete)
 		goto delete;
@@ -800,7 +793,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 	}
  out:
 	up_write(&tomoyo_domain_acl_info_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -836,7 +828,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
 	saved_filename2 = tomoyo_save_name(filename2);
 	if (!saved_filename1 || !saved_filename2)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_acl_info_list_lock);
 	if (is_delete)
 		goto delete;
@@ -884,7 +875,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
 	}
  out:
 	up_write(&tomoyo_domain_acl_info_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 40927a8..3948f6b 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -220,7 +220,6 @@ void *tomoyo_alloc_element(const unsigned int size)
 		= roundup(size, max(sizeof(void *), sizeof(long)));
 	if (word_aligned_size > PATH_MAX)
 		return NULL;
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	if (buf_used_len + word_aligned_size > PATH_MAX) {
 		if (!tomoyo_quota_for_elements ||
@@ -251,7 +250,6 @@ void *tomoyo_alloc_element(const unsigned int size)
 		}
 	}
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr;
 }
 
@@ -318,7 +316,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
 		return NULL;
 	}
 	hash = full_name_hash((const unsigned char *) name, len - 1);
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	list_for_each_entry(ptr, &tomoyo_name_list[hash % TOMOYO_MAX_HASH],
 			     list) {
@@ -366,7 +363,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
 	}
  out:
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr ? &ptr->entry : NULL;
 }
 
-- 
cgit v0.10.2


From 7d2948b1248109dbc7f4aaf9867c54b1912d494c Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 2 Jun 2009 20:42:24 +0900
Subject: TOMOYO: Simplify policy reader.

We can directly assign the result of tomoyo_io_printf() to done flag.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index a42fe02..6d25612 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -1113,10 +1113,9 @@ static int tomoyo_read_manager_policy(struct tomoyo_io_buffer *head)
 				 list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, "%s\n", ptr->manager->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%s\n", ptr->manager->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_policy_manager_list_lock);
 	head->read_eof = done;
@@ -1441,15 +1440,14 @@ static int tomoyo_read_domain_policy(struct tomoyo_io_buffer *head)
 		    TOMOYO_DOMAIN_FLAGS_IGNORE_GLOBAL_ALLOW_READ)
 			ignore_global_allow_read
 				= TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ "\n";
-		if (!tomoyo_io_printf(head,
-				      "%s\n" TOMOYO_KEYWORD_USE_PROFILE "%u\n"
-				      "%s%s%s\n", domain->domainname->name,
-				      domain->profile, quota_exceeded,
-				      transition_failed,
-				      ignore_global_allow_read)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%s\n" TOMOYO_KEYWORD_USE_PROFILE
+					"%u\n%s%s%s\n",
+					domain->domainname->name,
+					domain->profile, quota_exceeded,
+					transition_failed,
+					ignore_global_allow_read);
+		if (!done)
 			break;
-		}
 		head->read_step = 2;
 acl_loop:
 		if (head->read_step == 3)
@@ -1457,24 +1455,22 @@ acl_loop:
 		/* Print ACL entries in the domain. */
 		down_read(&tomoyo_domain_acl_info_list_lock);
 		list_for_each_cookie(apos, head->read_var2,
-				      &domain->acl_info_list) {
+				     &domain->acl_info_list) {
 			struct tomoyo_acl_info *ptr
 				= list_entry(apos, struct tomoyo_acl_info,
-					      list);
-			if (!tomoyo_print_entry(head, ptr)) {
-				done = false;
+					     list);
+			done = tomoyo_print_entry(head, ptr);
+			if (!done)
 				break;
-			}
 		}
 		up_read(&tomoyo_domain_acl_info_list_lock);
 		if (!done)
 			break;
 		head->read_step = 3;
 tail_mark:
-		if (!tomoyo_io_printf(head, "\n")) {
-			done = false;
+		done = tomoyo_io_printf(head, "\n");
+		if (!done)
 			break;
-		}
 		head->read_step = 1;
 		if (head->read_single_domain)
 			break;
@@ -1544,11 +1540,10 @@ static int tomoyo_read_domain_profile(struct tomoyo_io_buffer *head)
 		domain = list_entry(pos, struct tomoyo_domain_info, list);
 		if (domain->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, "%u %s\n", domain->profile,
-				      domain->domainname->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%u %s\n", domain->profile,
+					domain->domainname->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_list_lock);
 	head->read_eof = done;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index aa119ca..34bb641 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -189,13 +189,12 @@ bool tomoyo_read_domain_initializer_policy(struct tomoyo_io_buffer *head)
 			from = " from ";
 			domain = ptr->domainname->name;
 		}
-		if (!tomoyo_io_printf(head,
-				      "%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
-				      "%s%s%s\n", no, ptr->program->name, from,
-				      domain)) {
-			done = false;
+		done = tomoyo_io_printf(head,
+					"%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
+					"%s%s%s\n", no, ptr->program->name,
+					from, domain);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_initializer_list_lock);
 	return done;
@@ -387,13 +386,12 @@ bool tomoyo_read_domain_keeper_policy(struct tomoyo_io_buffer *head)
 			from = " from ";
 			program = ptr->program->name;
 		}
-		if (!tomoyo_io_printf(head,
-				      "%s" TOMOYO_KEYWORD_KEEP_DOMAIN
-				      "%s%s%s\n", no, program, from,
-				      ptr->domainname->name)) {
-			done = false;
+		done = tomoyo_io_printf(head,
+					"%s" TOMOYO_KEYWORD_KEEP_DOMAIN
+					"%s%s%s\n", no, program, from,
+					ptr->domainname->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_keeper_list_lock);
 	return done;
@@ -513,12 +511,11 @@ bool tomoyo_read_alias_policy(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_alias_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
-				      ptr->original_name->name,
-				      ptr->aliased_name->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
+					ptr->original_name->name,
+					ptr->aliased_name->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_alias_list_lock);
 	return done;
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index adf786d..a67f9e6 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -247,11 +247,10 @@ bool tomoyo_read_globally_readable_policy(struct tomoyo_io_buffer *head)
 				 list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
-				      ptr->filename->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
+					ptr->filename->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_globally_readable_list_lock);
 	return done;
@@ -369,11 +368,10 @@ bool tomoyo_read_file_pattern(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_pattern_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN "%s\n",
-				      ptr->pattern->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN
+					"%s\n", ptr->pattern->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_pattern_list_lock);
 	return done;
@@ -483,11 +481,10 @@ bool tomoyo_read_no_rewrite_policy(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_no_rewrite_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE "%s\n",
-				      ptr->pattern->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE
+					"%s\n", ptr->pattern->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_no_rewrite_list_lock);
 	return done;
-- 
cgit v0.10.2


From 9fa7eb283c5cdc2b0f4a8cfe6387ed82e5e9a3d3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 2 Jun 2009 20:07:25 -0700
Subject: Linux 2.6.30-rc8


diff --git a/Makefile b/Makefile
index 739fd34..610d1c3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 30
-EXTRAVERSION = -rc7
+EXTRAVERSION = -rc8
 NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
-- 
cgit v0.10.2


From 6799687a53a28536fd027ccb644833f66a778925 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 08:23:58 +0200
Subject: x86, boot: add new generated files to the appropriate .gitignore
 files

git status complains of untracked (generated) files in arch/x86/boot..

# Untracked files:
#   (use "git add <file>..." to include in what will be committed)
#
#       ../../arch/x86/boot/compressed/mkpiggy
#       ../../arch/x86/boot/compressed/piggy.S
#       ../../arch/x86/boot/compressed/vmlinux.lds
#       ../../arch/x86/boot/voffset.h
#       ../../arch/x86/boot/zoffset.h

..so adjust .gitignore files accordingly.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a..851fe93 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
 cpustr.h
 mkcpustr
 offsets.h
+voffset.h
+zoffset.h
 setup
 setup.bin
 setup.elf
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b..4a46fab 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
 relocs
 vmlinux.bin.all
 vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
-- 
cgit v0.10.2


From a05c0205ba031c01bba33a21bf0a35920eb64833 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 3 Jun 2009 09:33:18 +0200
Subject: block: Fix bounce limit setting in DM

blk_queue_bounce_limit() is more than a wrapper about the request queue
limits.bounce_pfn variable.  Introduce blk_queue_bounce_pfn() which can
be called by stacking drivers that wish to set the bounce limit
explicitly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d33934..9acd0b7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -194,6 +194,23 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
 /**
+ * blk_queue_bounce_pfn - set the bounce buffer limit for queue
+ * @q: the request queue for the device
+ * @pfn: max address
+ *
+ * Description:
+ *    This function is similar to blk_queue_bounce_limit except it
+ *    neither changes allocation flags, nor does it set up the ISA DMA
+ *    pool. This function should only be used by stacking drivers.
+ *    Hardware drivers should use blk_queue_bounce_limit instead.
+ */
+void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
+{
+	q->limits.bounce_pfn = pfn;
+}
+EXPORT_SYMBOL(blk_queue_bounce_pfn);
+
+/**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
  * @max_sectors:  max sectors in the usual 512b unit
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb..3ca1604 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
+	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e740a1..989aa17 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,6 +910,7 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
+extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
cgit v0.10.2


From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 3 Jun 2009 13:12:55 +0800
Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits

Remove the IRQ (non-NMI) handling bits as NMI will be used always.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index fe24d28..d750a10 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,6 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 7309c0a..4b4921d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,6 @@
 extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
-extern void perf_counter_interrupt(void);
 extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 545bb81..4492e19 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -107,11 +107,6 @@
 #define LOCAL_TIMER_VECTOR		0xef
 
 /*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR		0xee
-
-/*
  * Generic system vector for platform specific use
  */
 #define GENERIC_INTERRUPT_VECTOR	0xed
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16e8c1..12cc05e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -40,7 +40,7 @@ struct cpu_hw_counters {
 struct x86_pmu {
 	const char	*name;
 	int		version;
-	int		(*handle_irq)(struct pt_regs *, int);
+	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
 	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
@@ -755,7 +755,7 @@ static void intel_pmu_reset(void)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct cpu_hw_counters *cpuc;
 	struct cpu_hw_counters;
@@ -794,7 +794,7 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -812,7 +812,7 @@ again:
 	return 1;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
 	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
@@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void smp_perf_counter_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	ack_APIC_irq();
-	x86_pmu.handle_irq(regs, 0);
-	irq_exit();
-}
-
 void smp_perf_pending_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
@@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	 * If the first NMI handles both, the latter will be empty and daze
 	 * the CPU.
 	 */
-	x86_pmu.handle_irq(regs, 1);
+	x86_pmu.handle_irq(regs);
 
 	return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8910046..7985c01 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
 #ifdef CONFIG_PERF_COUNTERS
-apicinterrupt LOCAL_PERF_VECTOR \
-	perf_counter_interrupt smp_perf_counter_interrupt
 apicinterrupt LOCAL_PENDING_VECTOR \
 	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 3190a6b..205bdd8 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -165,7 +165,6 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 53ceb26..fa6ef69 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -155,7 +155,6 @@ static void __init apic_intr_init(void)
 
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
-- 
cgit v0.10.2


From 8ce998d6693bd02ab3b74ee1cc303ecb1fa9b514 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 00:54:33 -0300
Subject: perf_counter tools: Cover PLT symbols too

PLT, the Program Linking Table, is used with the dynamic linker to
allow PIC code in executables and shared objects to figure out
where functions are in other shared objects.

It is one of the sources of unknown/unresolved symbols - this patch
does what binutils figures out when you ask it to disassembly.
(objdump -S)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 039931f..d52a1ae 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -258,6 +258,117 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
+#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
+
+#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
+
+static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
+				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
+				       GElf_Shdr *shdr_dynsym,
+				       size_t dynsym_idx)
+{
+	uint32_t nr_rel_entries, idx;
+	GElf_Sym sym;
+	__u64 plt_offset;
+	GElf_Shdr shdr_plt;
+	struct symbol *f;
+	GElf_Shdr shdr_rel_plt;
+	Elf_Data *reldata, *syms, *symstrs;
+	Elf_Scn *scn_plt_rel, *scn_symstrs;
+	char sympltname[1024];
+	int nr = 0, symidx;
+
+	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+					  ".rela.plt", NULL);
+	if (scn_plt_rel == NULL) {
+		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+						  ".rel.plt", NULL);
+		if (scn_plt_rel == NULL)
+			return 0;
+	}
+
+	if (shdr_rel_plt.sh_link != dynsym_idx)
+		return 0;
+
+	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
+		return 0;
+
+	/*
+	 * Fetch the relocation section to find the indexes to the GOT
+	 * and the symbols in the .dynsym they refer to.
+	 */
+	reldata = elf_getdata(scn_plt_rel, NULL);
+	if (reldata == NULL)
+		return -1;
+
+	syms = elf_getdata(scn_dynsym, NULL);
+	if (syms == NULL)
+		return -1;
+
+	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+	if (scn_symstrs == NULL)
+		return -1;
+
+	symstrs = elf_getdata(scn_symstrs, NULL);
+	if (symstrs == NULL)
+		return -1;
+
+	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+	plt_offset = shdr_plt.sh_offset;
+
+	if (shdr_rel_plt.sh_type == SHT_RELA) {
+		GElf_Rela pos_mem, *pos;
+
+		elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+					   nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else if (shdr_rel_plt.sh_type == SHT_REL) {
+		GElf_Rel pos_mem, *pos;
+		elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+					  nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else {
+		/*
+		 * TODO: There are still one more shdr_rel_plt.sh_type
+		 * I have to investigate, but probably should be ignored.
+		 */
+	}
+
+	return nr;
+}
+
 static int dso__load_sym(struct dso *self, int fd, const char *name,
 			 symbol_filter_t filter)
 {
@@ -269,8 +380,9 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	GElf_Shdr shdr;
 	Elf_Data *syms;
 	GElf_Sym sym;
-	Elf_Scn *sec;
+	Elf_Scn *sec, *sec_dynsym;
 	Elf *elf;
+	size_t dynsym_idx;
 	int nr = 0;
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
@@ -285,12 +397,33 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 		goto out_elf_end;
 	}
 
+	/*
+	 * We need to check if we have a .dynsym, so that we can handle the
+	 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+	 * .dynsym or .symtab)
+	 */
+	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
+					 ".dynsym", &dynsym_idx);
+	if (sec_dynsym != NULL) {
+		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
+						 sec_dynsym, &shdr,
+						 dynsym_idx);
+		if (nr < 0)
+			goto out_elf_end;
+	}
+
+	/*
+	 * But if we have a full .symtab (that is a superset of .dynsym) we
+	 * should add the symbols not in the .dynsyn
+	 */
 	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-	if (sec == NULL)
-		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+	if (sec == NULL) {
+		if (sec_dynsym == NULL)
+			goto out_elf_end;
 
-	if (sec == NULL)
-		goto out_elf_end;
+		sec = sec_dynsym;
+		gelf_getshdr(sec, &shdr);
+	}
 
 	syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
-- 
cgit v0.10.2


From 3502973d005ed89cc2b3f39780813a341ddba97f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 09:38:58 +0200
Subject: perf report: Print -D to stdout

-D prints to stderr - which is a bit confusing - print to stdout
instead.

Also clean up the if (dump_trace) patterns via a dprintf helper.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9da990f..6207a31 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -31,6 +31,8 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
 static int		verbose;
 static int		full_paths;
 
@@ -729,14 +731,12 @@ more:
 		uint64_t ip = event->ip.ip;
 		struct map *map = NULL;
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				event->header.misc,
-				event->ip.pid,
-				(void *)(long)ip);
-		}
+		dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.misc,
+			event->ip.pid,
+			(void *)(long)ip);
 
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -781,15 +781,14 @@ more:
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				(void *)(long)event->mmap.start,
-				(void *)(long)event->mmap.len,
-				(void *)(long)event->mmap.pgoff,
-				event->mmap.filename);
-		}
+		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			(void *)(long)event->mmap.start,
+			(void *)(long)event->mmap.len,
+			(void *)(long)event->mmap.pgoff,
+			event->mmap.filename);
+
 		if (thread == NULL || map == NULL) {
 			if (verbose)
 				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
@@ -802,12 +801,11 @@ more:
 	case PERF_EVENT_COMM: {
 		struct thread *thread = threads__findnew(event->comm.pid);
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				event->comm.comm, event->comm.pid);
-		}
+		dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->comm.comm, event->comm.pid);
+
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
@@ -818,11 +816,10 @@ more:
 	}
 	default: {
 broken_event:
-		if (dump_trace)
-			fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
-					(void *)(offset + head),
-					(void *)(long)(event->header.size),
-					event->header.type);
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
 
 		total_unknown++;
 
@@ -846,14 +843,13 @@ broken_event:
 	rc = EXIT_SUCCESS;
 	close(input);
 
-	if (dump_trace) {
-		fprintf(stderr, "      IP events: %10ld\n", total);
-		fprintf(stderr, "    mmap events: %10ld\n", total_mmap);
-		fprintf(stderr, "    comm events: %10ld\n", total_comm);
-		fprintf(stderr, " unknown events: %10ld\n", total_unknown);
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf(" unknown events: %10ld\n", total_unknown);
 
+	if (dump_trace)
 		return 0;
-	}
 
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
-- 
cgit v0.10.2


From 5352f35d6ae7b8b981d77137fb268bc54d10624f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 10:07:39 +0200
Subject: perf report: Improve sort key recognition

 - allow case-insensitive tokens - such as --sort Comm,Symbol
 - allow substring shortcuts: --sort sym
 - detect invalid tokens and bail out

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6207a31..a8d3905 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -453,28 +453,18 @@ static int sort_dimension__add(char *tok)
 		if (sd->taken)
 			continue;
 
-		if (strcmp(tok, sd->name))
+		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
+
 		return 0;
 	}
 
 	return -ESRCH;
 }
 
-static void setup_sorting(void)
-{
-	char *tmp, *tok, *str = strdup(sort_order);
-
-	for (tok = strtok_r(str, ", ", &tmp);
-			tok; tok = strtok_r(NULL, ", ", &tmp))
-		sort_dimension__add(tok);
-
-	free(str);
-}
-
 static int64_t
 hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -880,6 +870,21 @@ static const struct option options[] = {
 	OPT_END()
 };
 
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(report_usage, options);
+		}
+	}
+
+	free(str);
+}
+
 int cmd_report(int argc, const char **argv, const char *prefix)
 {
 	symbol__init();
-- 
cgit v0.10.2


From 367d04c4ec02dad34d80452e32e3370db7fb6fee Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 28 May 2009 09:54:48 +0200
Subject: amd_iommu: fix lock imbalance

In alloc_coherent there is an omitted unlock on the path where mapping
fails. Add the unlock.

[ Impact: fix lock imbalance in alloc_coherent ]

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d689883..9f89bb6 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1541,8 +1541,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
 				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-	if (*dma_addr == bad_dma_address)
+	if (*dma_addr == bad_dma_address) {
+		spin_unlock_irqrestore(&domain->lock, flags);
 		goto out_free;
+	}
 
 	iommu_completion_wait(iommu);
 
-- 
cgit v0.10.2


From ed966aac335a63083d3125198479447248637d9e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 10:39:26 +0200
Subject: perf report: Handle vDSO symbols properly

We were not looking up vDSO symbols properly, because they
are in the kallsyms but are user-mode entries.

Pass negative addresses to the kernel dso object, this
way we resolve them properly:

     0.05%  [kernel]: vread_tsc

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a8d3905..0f88d9e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -728,6 +728,8 @@ more:
 			event->ip.pid,
 			(void *)(long)ip);
 
+		dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
@@ -740,6 +742,8 @@ more:
 
 			dso = kernel_dso;
 
+			dprintf(" ...... dso: %s\n", dso->name);
+
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 
 			show = SHOW_USER;
@@ -749,11 +753,22 @@ more:
 			if (map != NULL) {
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
+			} else {
+				/*
+				 * If this is outside of all known maps,
+				 * and is a negative address, try to look it
+				 * up in the kernel dso, as it might be a
+				 * vsyscall (which executes in user-mode):
+				 */
+				if ((long long)ip < 0)
+					dso = kernel_dso;
 			}
+			dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
 
 		} else {
 			show = SHOW_HV;
 			level = 'H';
+			dprintf(" ...... dso: [hypervisor]\n");
 		}
 
 		if (show & show_mask) {
-- 
cgit v0.10.2


From e09f9446b94ac64b27d37e98c1110f29d712cdad Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Jun 2009 10:07:44 +0100
Subject: GFS2: Remove unused variable

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 841ddc9..73318a3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -694,8 +694,6 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 	if (fl->fl_type & LOCK_MAND)
-- 
cgit v0.10.2


From 6984efb692e97ce5f75f26e595685c04c2061bac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:38:58 +1000
Subject: perf_counter: powerpc: Fix event alternative code generation on
 POWER5/5+

Commit ef923214 ("perf_counter: powerpc: use u64 for event
codes internally") introduced a bug where the return value from
function find_alternative_bdecode gets put into a u64 variable
and later tested to see if it is < 0.  The effect is that we
get extra, bogus event code alternatives on POWER5 and POWER5+,
leading to error messages such as "oops compute_mmcr failed"
being printed and counters not counting properly.

This fixes it by using s64 for the return type of
find_alternative_bdecode and for the local variable that the
caller puts the value in.  It also makes the event argument a
u64 on POWER5+ for consistency.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17586.666132.90983@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index c6cdfc1..8471e3c 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -242,7 +242,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * event code for those that do, or -1 otherwise.  This also handles
  * alternative PCMSEL values for add events.
  */
-static int find_alternative_bdecode(unsigned int event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -277,7 +277,7 @@ static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
 	int nlim;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d534496..1b44c5f 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static u64 find_alternative_bdecode(u64 event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -272,7 +272,7 @@ static u64 find_alternative_bdecode(u64 event)
 static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
-- 
cgit v0.10.2


From dcd945e0d8a6d654e3e1de51faea9f98f1504aa5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:40:36 +1000
Subject: perf_counter: powerpc: Fix race causing "oops trying to read PMC0"
 errors

When using interrupting counters and limited (non-interrupting)
counters at the same time, it's possible that we get an
interrupt in write_mmcr0() after writing MMCR0 but before we
have set up the counters using limited PMCs.  What happens then
is that we get into perf_counter_interrupt() with
counter->hw.idx = 0 for the limited counters, leading to the
"oops trying to read PMC0" error message being printed.

This fixes the problem by making perf_counter_interrupt()
robust against counter->hw.idx being zero (the counter is just
ignored in that case) and also by changing write_mmcr0() to
write MMCR0 initially with the counter overflow interrupt
enable bits masked (set to 0).  If the MMCR0 value requested by
the caller has either of those bits set, we write MMCR0 again
with the requested value of those bits after setting up the
limited counters properly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17684.138182.954599@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index ea54686..4cc4ac5 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -372,16 +372,28 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
 
 	/*
 	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 * To ensure we don't get a performance monitor interrupt
+	 * between writing MMCR0 and freezing/thawing the limited
+	 * counters, we first write MMCR0 with the counter overflow
+	 * interrupt enable bits turned off.
 	 */
 	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
 		     : "=&r" (pmc5), "=&r" (pmc6)
-		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+		       "i" (SPRN_MMCR0),
 		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
 
 	if (mmcr0 & MMCR0_FC)
 		freeze_limited_counters(cpuhw, pmc5, pmc6);
 	else
 		thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+	/*
+	 * Write the full MMCR0 including the counter overflow interrupt
+	 * enable bits, if necessary.
+	 */
+	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+		mtspr(SPRN_MMCR0, mmcr0);
 }
 
 /*
@@ -1108,7 +1120,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
-		if (is_limited_pmc(counter->hw.idx))
+		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
 			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
-- 
cgit v0.10.2


From e76afc4e7816a0a5300073098cdac93a994eb5ca Mon Sep 17 00:00:00 2001
From: Eric Lammerts <linux-arm@lists.lammerts.org>
Date: Tue, 19 May 2009 20:53:20 -0400
Subject: fix oops when using console=ttymxcN with N > 0

Signed-off-by: Eric Lammerts <eric@lammerts.org>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 9f460b1..5f0be40 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -1031,6 +1031,8 @@ imx_console_setup(struct console *co, char *options)
 	if (co->index == -1 || co->index >= ARRAY_SIZE(imx_ports))
 		co->index = 0;
 	sport = imx_ports[co->index];
+	if(sport == NULL)
+		return -ENODEV;
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
-- 
cgit v0.10.2


From 6b4bfb87b638a4f114dfb6f72f4ac1be88a4ebe4 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Tue, 26 May 2009 22:31:46 +0530
Subject: mx[23]: don't put clock lookups in __initdata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the __initdata annotation for the clock lookups, since they will
be needed when loading modules which use clk_get().

Tested-by: Agustín Ferrín Pozuelo <gatoguan-os@yahoo.com>
Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>

diff --git a/arch/arm/mach-mx2/clock_imx21.c b/arch/arm/mach-mx2/clock_imx21.c
index 999d013..e4b08ca 100644
--- a/arch/arm/mach-mx2/clock_imx21.c
+++ b/arch/arm/mach-mx2/clock_imx21.c
@@ -890,7 +890,7 @@ static struct clk clko_clk = {
 		.con_id = n, \
 		.clk = &c, \
 	},
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 /* It's unlikely that any driver wants one of them directly:
 	_REGISTER_CLOCK(NULL, "ckih", ckih_clk)
 	_REGISTER_CLOCK(NULL, "ckil", ckil_clk)
diff --git a/arch/arm/mach-mx2/clock_imx27.c b/arch/arm/mach-mx2/clock_imx27.c
index 3f7280c..2c97144 100644
--- a/arch/arm/mach-mx2/clock_imx27.c
+++ b/arch/arm/mach-mx2/clock_imx27.c
@@ -621,7 +621,7 @@ DEFINE_CLOCK1(csi_clk,     0, 0,      0, parent, &csi_clk1, &per4_clk);
 		.clk = &c, \
 	},
 
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 	_REGISTER_CLOCK("imx-uart.0", NULL, uart1_clk)
 	_REGISTER_CLOCK("imx-uart.1", NULL, uart2_clk)
 	_REGISTER_CLOCK("imx-uart.2", NULL, uart3_clk)
diff --git a/arch/arm/mach-mx3/clock-imx35.c b/arch/arm/mach-mx3/clock-imx35.c
index 53a112d..3c1e06f 100644
--- a/arch/arm/mach-mx3/clock-imx35.c
+++ b/arch/arm/mach-mx3/clock-imx35.c
@@ -404,7 +404,7 @@ DEFINE_CLOCK(gpu2d_clk,  0, CCM_CGR3,  4, NULL, NULL);
 		.clk = &c,		\
 	},
 
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 	_REGISTER_CLOCK(NULL, "asrc", asrc_clk)
 	_REGISTER_CLOCK(NULL, "ata", ata_clk)
 	_REGISTER_CLOCK(NULL, "audmux", audmux_clk)
diff --git a/arch/arm/mach-mx3/clock.c b/arch/arm/mach-mx3/clock.c
index 9957a11..a68fcf9 100644
--- a/arch/arm/mach-mx3/clock.c
+++ b/arch/arm/mach-mx3/clock.c
@@ -516,7 +516,7 @@ DEFINE_CLOCK(ipg_clk,     0, NULL,          0, ipg_get_rate, NULL, &ahb_clk);
 		.clk = &c, \
 	},
 
-static struct clk_lookup lookups[] __initdata = {
+static struct clk_lookup lookups[] = {
 	_REGISTER_CLOCK(NULL, "emi", emi_clk)
 	_REGISTER_CLOCK(NULL, "cspi", cspi1_clk)
 	_REGISTER_CLOCK(NULL, "cspi", cspi2_clk)
-- 
cgit v0.10.2


From 0e2595cdfd7df9f1128f7185152601ae5417483b Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 20 May 2009 08:10:57 -0500
Subject: x86: Fix UV BAU activation descriptor init

The UV tlb shootdown code has a serious initialization error.

An array of structures [32*8] is initialized as if it were [32].
The array is indexed by (cpu number on the blade)*8, so the short
initialization works for up to 4 cpus on a blade.
But above that, we provide an invalid opcode to the hub's
broadcast assist unit.

This patch changes the allocation of the array to use its symbolic
dimensions for better clarity. And initializes all 32*8 entries.

Shortened 'UV_ACTIVATION_DESCRIPTOR_SIZE' to 'UV_ADP_SIZE' per Ingo's
recommendation.

Tested on the UV simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
LKML-Reference: <E1M6lZR-0007kV-Aq@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61b..bddd44f 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE	32
+#define UV_ADP_SIZE			32
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
 #define UV_NET_ENDPOINT_INTD		0x38
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c337..16f0fd4 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
 	struct bau_desc *adp;
 	struct bau_desc *ad2;
 
-	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+	/*
+	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 */
+	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
 	pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
 				      (n << UV_DESC_BASE_PNODE_SHIFT | m));
 	}
 
-	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+	/*
+	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+	 * cpu even though we only use the first one; one descriptor can
+	 * describe a broadcast to 256 nodes.
+	 */
+	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
 		/*
-- 
cgit v0.10.2


From 226f62fdd53d5b2c74e242aa11f6ad43d0285d3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 11:23:56 +0200
Subject: perf_counter: Add a comm hook for pure fork()s

I noticed missing COMM events and found that we missed
reporting them for pure forks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/fork.c b/kernel/fork.c
index 23bf757..b7d7a9f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1412,6 +1412,12 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
+		} else {
+			/*
+			 * vfork will do an exec which will call
+			 * set_task_comm()
+			 */
+			perf_counter_comm(p);
 		}
 
 		audit_finish_fork(p);
-- 
cgit v0.10.2


From e61078a0c88773d3465b0b9d665c5ed6b952b1cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 11:24:33 +0200
Subject: perf record: Use long arg for counter period

I wrote this to test the extended period emulation, we might as
well merge it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ec3b73a..cea5b8d 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -20,8 +20,8 @@
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
+static long			default_interval = 100000;
+static long			event_count[MAX_COUNTERS];
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				= 0;
@@ -494,7 +494,7 @@ static const struct option options[] = {
 			    "append to the output file to do incremental profiling"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file"),
-	OPT_INTEGER('c', "count", &default_interval,
+	OPT_LONG('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
 		    "output file name"),
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index 28b34c1..b80abd9 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -113,6 +113,22 @@ static int get_value(struct parse_opt_ctx_t *p,
 			return opterror(opt, "expects a numerical value", flags);
 		return 0;
 
+	case OPTION_LONG:
+		if (unset) {
+			*(long *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(long *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(long *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
 	default:
 		die("should not happen, someone must be hit on the forehead");
 	}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
index a81c7fa..a1039a6 100644
--- a/Documentation/perf_counter/util/parse-options.h
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -14,6 +14,7 @@ enum parse_opt_type {
 	/* options with arguments (usually) */
 	OPTION_STRING,
 	OPTION_INTEGER,
+	OPTION_LONG,
 	OPTION_CALLBACK,
 };
 
@@ -97,6 +98,7 @@ struct option {
 #define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
 #define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
 #define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
 #define OPT_DATE(s, l, v, h) \
 	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
-- 
cgit v0.10.2


From 8229289b607682f90b946ad2c319526303c17700 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 12:37:36 +0200
Subject: perf report: Fix comm sorting

Since we can (and do) change comm strings during the collection
phase, we cannot actually sort on them to build the histogram.
Therefore add an (optional) third sorting phase to collapse the
histrogram.

Comm sorting now builds the histrogram on threads and then in
the collapse phase collects all threads with the same comm.

This collapsed histogram is then reversed and sorted on events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 0f88d9e..6d359c9 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -211,9 +211,9 @@ static struct thread *thread__new(pid_t pid)
 
 	if (self != NULL) {
 		self->pid = pid;
-		self->comm = malloc(30);
+		self->comm = malloc(32);
 		if (self->comm)
-			sprintf(self->comm, ":%d", pid);
+			snprintf(self->comm, 32, ":%d", self->pid);
 		INIT_LIST_HEAD(&self->maps);
 	}
 
@@ -222,6 +222,8 @@ static struct thread *thread__new(pid_t pid)
 
 static int thread__set_comm(struct thread *self, const char *comm)
 {
+	if (self->comm)
+		free(self->comm);
 	self->comm = strdup(comm);
 	return self->comm ? 0 : -ENOMEM;
 }
@@ -303,9 +305,12 @@ struct sort_entry {
 	char *header;
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
+/* --sort pid */
+
 static int64_t
 sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -324,9 +329,17 @@ static struct sort_entry sort_thread = {
 	.print	= sort__thread_print,
 };
 
+/* --sort comm */
+
 static int64_t
 sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 {
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
 	char *comm_l = left->thread->comm;
 	char *comm_r = right->thread->comm;
 
@@ -349,11 +362,14 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header = "          Command",
-	.cmp	= sort__comm_cmp,
-	.print	= sort__comm_print,
+	.header 	= "          Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
 };
 
+/* --sort dso */
+
 static int64_t
 sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -387,6 +403,8 @@ static struct sort_entry sort_dso = {
 	.print	= sort__dso_print,
 };
 
+/* --sort symbol */
+
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -428,6 +446,8 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
+static int sort__need_collapse = 0;
+
 struct sort_dimension {
 	char *name;
 	struct sort_entry *entry;
@@ -456,6 +476,9 @@ static int sort_dimension__add(char *tok)
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
 
@@ -480,6 +503,25 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 	return cmp;
 }
 
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
 static size_t
 hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 {
@@ -549,6 +591,64 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	return 0;
 }
 
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
 /*
  * reverse the map, sort on count.
  */
@@ -577,9 +677,14 @@ static void output__insert_entry(struct hist_entry *he)
 
 static void output__resort(void)
 {
-	struct rb_node *next = rb_first(&hist);
+	struct rb_node *next;
 	struct hist_entry *n;
 
+	if (sort__need_collapse)
+		next = rb_first(&collapse_hists);
+	else
+		next = rb_first(&hist);
+
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
@@ -859,6 +964,7 @@ broken_event:
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
 
+	collapse__resort();
 	output__resort();
 	output__fprintf(stdout, total);
 
-- 
cgit v0.10.2


From a96bbc16418bc691317f265d6bf98ba941ca9c1a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 14:01:36 +0200
Subject: perf_counter: Fix race in counter initialization

We need the PID namespace and counter ID available when the
counter overflows and we need to generate a sample event.

[ Impact: fix kernel crash with high-frequency sampling ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
[ fixed a further crash and cleaned up the initialization a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 317cef7..ab44554 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -48,6 +48,8 @@ int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
+static atomic64_t perf_counter_id;
+
 /*
  * Lock for (sysadmin-configurable) counter reservations:
  */
@@ -3351,14 +3353,18 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 
 	mutex_init(&counter->mmap_mutex);
 
-	counter->cpu			= cpu;
+	counter->cpu		= cpu;
 	counter->attr		= *attr;
-	counter->group_leader		= group_leader;
-	counter->pmu			= NULL;
-	counter->ctx			= ctx;
-	counter->oncpu			= -1;
+	counter->group_leader	= group_leader;
+	counter->pmu		= NULL;
+	counter->ctx		= ctx;
+	counter->oncpu		= -1;
+
+	counter->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	counter->id		= atomic64_inc_return(&perf_counter_id);
+
+	counter->state		= PERF_COUNTER_STATE_INACTIVE;
 
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (attr->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
@@ -3402,6 +3408,8 @@ done:
 		err = PTR_ERR(pmu);
 
 	if (err) {
+		if (counter->ns)
+			put_pid_ns(counter->ns);
 		kfree(counter);
 		return ERR_PTR(err);
 	}
@@ -3419,8 +3427,6 @@ done:
 	return counter;
 }
 
-static atomic64_t perf_counter_id;
-
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -3515,9 +3521,6 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
-	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
-	counter->id = atomic64_inc_return(&perf_counter_id);
-
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
-- 
cgit v0.10.2


From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Tue, 2 Jun 2009 17:06:54 -0700
Subject: ring-buffer: fix bug in ring_buffer_discard_commit

There's a bug in ring_buffer_discard_commit.  The wrong
pointer is being compared in order to check if the event
can be freed from the buffer rather than discarded
(i.e. marked as PAD).

I noticed this when I was working on duration filtering.
The bug is not deadly - it just results in lots of wasted
space in the buffer.  All filtered events are left in
the buffer and marked as discarded, rather than being
removed from the buffer to make space for other events.

Unfortunately, when I fixed this bug, I got errors doing a
filtered function trace.  Multiple TIME_EXTEND
events pile up in the buffer, and trigger the
following loop overage warning in rb_iter_peek():

again:
	...
	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
		return NULL;

I'm not sure what the best way is to fix this. I don't
know if I should extend the loop threshhold, or if I should
make the test more complex (ignore TIME_EXTEND
events), or just get rid of this loop check completely.

Note that if I implement a workaround for this, then I
see another problem from rb_advance_iter().  I haven't
tracked that one down yet.

In general, it seems like the case of removing filtered
events has not been working properly, and so some assumptions
about buffer invariant conditions need to be revisited.

Here's the patch for the simple fix:

Compare correct pointer for checking if an event can be
freed rather than left as discarded in the buffer.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
LKML-Reference: <4A25BE9E.5090909@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b24d4..9453023 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 
 	bpage = cpu_buffer->tail_page;
 
-	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
-- 
cgit v0.10.2


From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 23:00:53 -0400
Subject: ring-buffer: try to discard unneeded timestamps

There are times that a race may happen that we add a timestamp in a
nested write. This timestamp would just contain a zero delta and serves
no purpose.

Now that we have a way to discard events, this patch will try to discard
the timestamp instead of just wasting the space in the ring buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9453023..5092660 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			return 1;
+	}
+
+	/* could not discard */
+	return 0;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
@@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		/* let the caller know this was the commit */
 		ret = 1;
 	} else {
-		/* Darn, this is just wasted space */
-		event->time_delta = 0;
-		event->array[0] = 0;
-		ret = 0;
+		/* Try to discard the event */
+		if (!rb_try_to_discard(cpu_buffer, event)) {
+			/* Darn, this is just wasted space */
+			event->time_delta = 0;
+			event->array[0] = 0;
+			ret = 0;
+		}
 	}
 
 	*delta = 0;
@@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 				struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long new_index, old_index;
-	struct buffer_page *bpage;
-	unsigned long index;
-	unsigned long addr;
 	int cpu;
 
 	/* The event is discarded regardless */
@@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
 
-	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
-	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
-
-	bpage = cpu_buffer->tail_page;
-
-	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
-		/*
-		 * This is on the tail page. It is possible that
-		 * a write could come in and move the tail page
-		 * and write to the next page. That is fine
-		 * because we just shorten what is on this page.
-		 */
-		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
-			goto out;
-	}
+	if (!rb_try_to_discard(cpu_buffer, event))
+		goto out;
 
 	/*
 	 * The commit is still visible by the reader, so we
-- 
cgit v0.10.2


From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 09:30:10 -0400
Subject: ring-buffer: discard timestamps that are at the start of the buffer

Every buffer page in the ring buffer includes its own time stamp.
When an event is recorded to the ring buffer with a delta time greater
than what can be held in the event header, a time stamp event is created.

If the the create timestamp falls over to the next buffer page, it is
redundant because the buffer page holds a full time stamp. This patch
will try to discard the time stamp when it falls to the start of the
next page.

This change also fixes a issues with disarding events. If most events are
discarded, timestamps will start to creep into the ring buffer. If we
do not discard the timestamps then they can fill up the ring buffer over
time and waste space.

This change will keep time stamps from filling up over another page. If
something is recorded in the buffer page, and the rest is filtered, then
the time stamps can only fill up to the end of the page.

[ Impact: prevent time stamps from filling ring buffer ]

Reported-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5092660..7102d7a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
 			cpu_buffer->commit_page->page->time_stamp = *ts;
-			event->time_delta = 0;
-			event->array[0] = 0;
+			/* try to discard, since we do not need this */
+			if (!rb_try_to_discard(cpu_buffer, event)) {
+				/* nope, just zero it */
+				event->time_delta = 0;
+				event->array[0] = 0;
+			}
 		}
 		cpu_buffer->write_stamp = *ts;
 		/* let the caller know this was the commit */
@@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		if (RB_WARN_ON(buffer,
-			       iter->head_page == cpu_buffer->commit_page))
+		/* discarded commits can make the page empty */
+		if (iter->head_page == cpu_buffer->commit_page)
 			return;
 		rb_inc_iter(iter);
 		return;
@@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	/*
 	 * We repeat when a timestamp is encountered. It is possible
 	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have.  Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * as one timestamp is about to be written, or from discarded
+	 * commits. The most that we can have is the number on a single page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
  again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have. Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * We repeat when a timestamp is encountered.
+	 * We can get multiple timestamps by nested interrupts or also
+	 * if filtering is on (discarding commits). Since discarding
+	 * commits can be frequent we can get a lot of timestamps.
+	 * But we limit them by not adding timestamps if they begin
+	 * at the start of a page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
-- 
cgit v0.10.2


From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:28 +0800
Subject: tracing/trace_stack: fix the number of entries in the header

The last entry in the stack_dump_trace is ULONG_MAX, which is not
a valid entry, but max_stack_trace.nr_entries has accounted for it.
So when printing the header, we should decrease it by one.
Before fix, print as following, for example:

	Depth    Size   Location    (53 entries)	<--- should be 52
	-----    ----   --------
  0)     3264     108   update_wall_time+0x4d5/0x9a0
  ...
 51)       80      80   syscall_call+0x7/0xb
 ^^^
   it's correct.

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1796f00..2d7aebd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
 		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
 			   "        -----    ----   --------\n",
-			   max_stack_trace.nr_entries);
+			   max_stack_trace.nr_entries - 1);
 
 		if (!stack_tracer_enabled && !max_stack_size)
 			print_disabled(m);
-- 
cgit v0.10.2


From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:29 +0800
Subject: tracing/events: fix output format of kernel stack

According to "events/ftrace/kernel_stack/format", output format of
kernel stack should use "=>" instead of "<=".

The second problem is that we shouldn't skip the first entry in the stack,
although it seems to be duplicated when used in the "function" tracer,
but events also use it. If we skip the first one, we will drop the topmost
entry of the stack.

The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should
drop it, otherwise it will print a NULL name line.

before fix:

      sh-1072  [000]   26.957239: sched_process_fork: parent sh:1072 child sh:1073
      sh-1072  [000]   26.957262:
 <= syscall_call
 <=
      sh-1072  [000]   26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120]
      sh-1072  [000]   26.957752:
 <= preempt_schedule
 <= wake_up_new_task
 <= do_fork
 <= sys_clone
 <= syscall_call
 <=

After fix:

      sh-1075  [000]    39.791848: sched_process_fork: parent sh:1075  child sh:1076
      sh-1075  [000]    39.791871:
 => sys_clone
 => syscall_call
      sh-1075  [000]    39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120]
      sh-1075  [000]    39.792722:
 => schedule
 => preempt_schedule
 => wake_up_new_task
 => do_fork
 => sys_clone
 => syscall_call

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0fe3b22..64596a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		if (!field->caller[i])
+		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
 			break;
-		if (i) {
-			if (!trace_seq_puts(s, " <= "))
-				goto partial;
+		if (!trace_seq_puts(s, " => "))
+			goto partial;
 
-			if (!seq_print_ip_sym(s, field->caller[i], flags))
-				goto partial;
-		}
+		if (!seq_print_ip_sym(s, field->caller[i], flags))
+			goto partial;
 		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
-- 
cgit v0.10.2


From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:30 +0800
Subject: tracing/events: fix output format of user stack

According to "events/ftrace/user_stack/format", fix the output of
user stack.

before fix:

  sh-1073  [000]    31.137561:  <b7f274fe> <-  <0804e33c> <-  <080835c1>

after fix:

  sh-1072  [000]    37.039329:
 =>  <b7f8a4fe>
 =>  <0804e33c>
 =>  <080835c1>

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64596a5..8dadbbb 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 
 		if (ip == ULONG_MAX || !ret)
 			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
+		if (ret)
+			ret = trace_seq_puts(s, " => ");
 		if (!ip) {
 			if (ret)
 				ret = trace_seq_puts(s, "??");
+			if (ret)
+				ret = trace_seq_puts(s, "\n");
 			continue;
 		}
 		if (!ret)
 			break;
 		if (ret)
 			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+		ret = trace_seq_puts(s, "\n");
 	}
 
 	if (mm)
@@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!seq_print_userip_objs(field, s, flags))
+	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
 	return TRACE_TYPE_HANDLED;
-- 
cgit v0.10.2


From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Jun 2009 14:52:03 +0100
Subject: tracing: fix multiple use of __print_flags and __print_symbolic

Here is an updated patch to include the extra call to
trace_seq_init() as requested. This is vs. the latest
-tip tree and fixes the use of multiple __print_flags
and __print_symbolic in a single tracer. Also tested
to ensure its working now:

mount.gfs2-2534  [000]   235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR
mount.gfs2-2534  [000]   235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI
mount.gfs2-2534  [000]   235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX
glock_workqueue-2529  [000]   235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI
glock_workqueue-2529  [000]   235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b5478da..40ede4d 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -104,6 +104,7 @@
  *	field = (typeof(field))entry;
  *
  *	p = get_cpu_var(ftrace_event_seq);
+ *	trace_seq_init(p);
  *	ret = trace_seq_printf(s, <TP_printk> "\n");
  *	put_cpu();
  *	if (!ret)
@@ -167,6 +168,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	field = (typeof(field))entry;					\
 									\
 	p = &get_cpu_var(ftrace_event_seq);				\
+	trace_seq_init(p);						\
 	ret = trace_seq_printf(s, #call ": " print);			\
 	put_cpu();							\
 	if (!ret)							\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8dadbbb..8afeea4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 {
 	unsigned long mask;
 	const char *str;
+	const char *ret = p->buffer + p->len;
 	int i;
 
-	trace_seq_init(p);
-
 	for (i = 0;  flag_array[i].name && flags; i++) {
 
 		mask = flag_array[i].mask;
@@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_flags_seq);
 
@@ -258,8 +257,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 			 const struct trace_print_flags *symbol_array)
 {
 	int i;
-
-	trace_seq_init(p);
+	const char *ret = p->buffer + p->len;
 
 	for (i = 0;  symbol_array[i].name; i++) {
 
@@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 		
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
-- 
cgit v0.10.2


From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 11:10:44 -0400
Subject: tracing: add annotation to what type of stack trace is recorded

The current method of printing out a stack trace is to add a new line
and print out the trace:

    yum-updatesd-3120  [002]   573.691303:
 => do_softirq
 => irq_exit
 => smp_apic_timer_interrupt
 => apic_timer_interrupt

This looks a bit awkward, and if we have both stack and user stack traces
running, it would be nice to have a title to tell them apart, although
it is easy to tell by the output.

This patch adds an annotation to the start of the stack traces:

            init-1     [003]   929.304979: <stack trace>
 => user_path_at
 => vfs_fstatat
 => vfs_stat
 => sys_newstat
 => system_call_fastpath

             cat-3459  [002]  1016.824040: <user stack trace>
 =>  <0000003aae6c0250>
 =>  <00007ffff4b06ae4>
 =>  <69636172742f6775>

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8afeea4..425725c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_puts(s, "\n"))
+	if (!trace_seq_puts(s, "<stack trace>\n"))
 		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
@@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!trace_seq_puts(s, "<user stack trace>\n"))
 		goto partial;
 
 	if (!seq_print_userip_objs(field, s, flags))
-- 
cgit v0.10.2


From f2521b6e4c365bd0aac61b2c346e6e9f22607e31 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:17:25 +0200
Subject: perf_counter tools: Clean up old kerneltop references

kerneltop has been replaced with perf top - so fix up a few
remaining references to it in display text and error messages.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index cea5b8d..fa62525 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -357,7 +357,8 @@ static void open_counters(int cpu, pid_t pid)
 
 		if (fd[nr_cpu][counter] < 0) {
 			int err = errno;
-			printf("kerneltop error: syscall returned with %d (%s)\n",
+
+			error("syscall returned with %d (%s)\n",
 					fd[nr_cpu][counter], strerror(err));
 			if (err == EPERM)
 				printf("Are you root?\n");
@@ -382,8 +383,7 @@ static void open_counters(int cpu, pid_t pid)
 		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
 		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			printf("kerneltop error: failed to mmap with %d (%s)\n",
-					errno, strerror(errno));
+			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
 			exit(-1);
 		}
 	}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a639352..16a6184 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -208,7 +208,7 @@ static void print_sym_table(void)
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
@@ -281,7 +281,7 @@ static void print_sym_table(void)
 
 static void *display_thread(void *arg)
 {
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+	printf("PerfTop refresh period: %d seconds\n", delay_secs);
 
 	while (!sleep(delay_secs))
 		print_sym_table();
@@ -564,7 +564,8 @@ static int __cmd_top(void)
 			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
+
+				error("syscall returned with %d (%s)\n",
 					fd[i][counter], strerror(err));
 				if (err == EPERM)
 					printf("Are you root?\n");
@@ -588,11 +589,8 @@ static int __cmd_top(void)
 			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
 			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
+			if (mmap_array[i][counter].base == MAP_FAILED)
+				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 		}
 	}
 
-- 
cgit v0.10.2


From 021e9f476511ebe23d7f45854a52dfe74c09b6ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:27:19 +0200
Subject: perf record: Refine capture printout

Print out the number of bytes captured, and the (estimated) number of
events the output file contains.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index fa62525..efa2eb4 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -67,6 +67,8 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 static long events;
 static struct timeval last_read, this_read;
 
+static __u64 bytes_written;
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -114,28 +116,34 @@ static void mmap_read(struct mmap_data *md)
 		buf = &data[old & md->mask];
 		size = md->mask + 1 - (old & md->mask);
 		old += size;
+
 		while (size) {
 			int ret = write(output, buf, size);
-			if (ret < 0) {
-				perror("failed to write");
-				exit(-1);
-			}
+
+			if (ret < 0)
+				die("failed to write");
+
 			size -= ret;
 			buf += ret;
+
+			bytes_written += ret;
 		}
 	}
 
 	buf = &data[old & md->mask];
 	size = head - old;
 	old += size;
+
 	while (size) {
 		int ret = write(output, buf, size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
+
+		if (ret < 0)
+			die("failed to write");
+
 		size -= ret;
 		buf += ret;
+
+		bytes_written += ret;
 	}
 
 	md->prev = old;
@@ -467,8 +475,14 @@ static int __cmd_record(int argc, const char **argv)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
-
-	fprintf(stderr, "[ perf record: Captured and wrote %ld events. ]\n", events);
+	/*
+	 * Approximate RIP event size: 24 bytes.
+	 */
+	fprintf(stderr,
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld events) ]\n",
+		(double)bytes_written / 1024.0 / 1024.0,
+		output_name,
+		bytes_written / 24);
 
 	return 0;
 }
-- 
cgit v0.10.2


From e98e96fe43ae92fad0930f05fb2b298e49b9f3b5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:30:38 +0200
Subject: perf report: Display 100% correctly

Needs to be 6.2 not 5.2, for 100.00% to be aligned properly.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6d359c9..e837bb9 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -529,7 +529,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "    %5.2f%%",
+		ret = fprintf(fp, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
-- 
cgit v0.10.2


From 44db76c8553c328f4ae02481d77bb3a88ca17645 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:36:07 +0200
Subject: perf stat: Print out all arguments

Before:

 Performance counter stats for '/home/mingo/hackbench':

After:

 Performance counter stats for '/home/mingo/hackbench 10':

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 2357a66..4fc0d80 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -228,6 +228,7 @@ static int do_perfstat(int argc, const char **argv)
 	int counter;
 	int status;
 	int pid;
+	int i;
 
 	if (!system_wide)
 		nr_cpus = 1;
@@ -243,14 +244,17 @@ static int do_perfstat(int argc, const char **argv)
 
 	if ((pid = fork()) < 0)
 		perror("failed to fork");
+
 	if (!pid) {
 		if (execvp(argv[0], (char **)argv)) {
 			perror(argv[0]);
 			exit(-1);
 		}
 	}
+
 	while (wait(&status) >= 0)
 		;
+
 	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 	t1 = rdclock();
 
@@ -259,8 +263,12 @@ static int do_perfstat(int argc, const char **argv)
 	fflush(stdout);
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
+	fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
+
+	for (i = 1; i < argc; i++)
+		fprintf(stderr, " %s", argv[i]);
+
+	fprintf(stderr, "\':\n");
 	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++)
-- 
cgit v0.10.2


From eed4dcd443da7a46131ef37c7a389b444905960e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:59:24 +0200
Subject: perf report: Add front-entry cache for lookups

Before:

 Performance counter stats for './perf report -i perf.data.big':

     12453988058  instructions

 Performance counter stats for './perf report -i perf.data.big':

     12379566017  instructions

0.60% reduction.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e837bb9..33b3b15 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -229,6 +229,7 @@ static int thread__set_comm(struct thread *self, const char *comm)
 }
 
 static struct rb_root threads;
+static struct thread *last_match;
 
 static struct thread *threads__findnew(pid_t pid)
 {
@@ -236,12 +237,22 @@ static struct thread *threads__findnew(pid_t pid)
 	struct rb_node *parent = NULL;
 	struct thread *th;
 
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
 	while (*p != NULL) {
 		parent = *p;
 		th = rb_entry(parent, struct thread, rb_node);
 
-		if (th->pid == pid)
+		if (th->pid == pid) {
+			last_match = th;
 			return th;
+		}
 
 		if (pid < th->pid)
 			p = &(*p)->rb_left;
@@ -253,7 +264,9 @@ static struct thread *threads__findnew(pid_t pid)
 	if (th != NULL) {
 		rb_link_node(&th->rb_node, parent, p);
 		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
 	}
+
 	return th;
 }
 
-- 
cgit v0.10.2


From 051cdc3c2d0e24443ac03aff03ee89807ec8c589 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 20:09:11 +0200
Subject: perf help: Fix bug when there's no perf-* command around

main_cmds can be empty - fix util/help.c to handle this case
without segfaulting.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
index 397487f..6653f7d 100644
--- a/Documentation/perf_counter/util/help.c
+++ b/Documentation/perf_counter/util/help.c
@@ -298,7 +298,7 @@ static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
 
 const char *help_unknown_cmd(const char *cmd)
 {
-	int i, n, best_similarity = 0;
+	int i, n = 0, best_similarity = 0;
 	struct cmdnames main_cmds, other_cmds;
 
 	memset(&main_cmds, 0, sizeof(main_cmds));
@@ -315,20 +315,24 @@ const char *help_unknown_cmd(const char *cmd)
 	      sizeof(main_cmds.names), cmdname_compare);
 	uniq(&main_cmds);
 
-	/* This reuses cmdname->len for similarity index */
-	for (i = 0; i < main_cmds.cnt; ++i)
-		main_cmds.names[i]->len =
-			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+	if (main_cmds.cnt) {
+		/* This reuses cmdname->len for similarity index */
+		for (i = 0; i < main_cmds.cnt; ++i)
+			main_cmds.names[i]->len =
+				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
 
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(*main_cmds.names), levenshtein_compare);
+		qsort(main_cmds.names, main_cmds.cnt,
+		      sizeof(*main_cmds.names), levenshtein_compare);
+
+		best_similarity = main_cmds.names[0]->len;
+		n = 1;
+		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+			++n;
+	}
 
-	best_similarity = main_cmds.names[0]->len;
-	n = 1;
-	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-		++n;
 	if (autocorrect && n == 1) {
 		const char *assumed = main_cmds.names[0]->name;
+
 		main_cmds.names[0] = NULL;
 		clean_cmdnames(&main_cmds);
 		fprintf(stderr, "WARNING: You called a Git program named '%s', "
@@ -345,7 +349,7 @@ const char *help_unknown_cmd(const char *cmd)
 
 	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
 
-	if (best_similarity < 6) {
+	if (main_cmds.cnt && best_similarity < 6) {
 		fprintf(stderr, "\nDid you mean %s?\n",
 			n < 2 ? "this": "one of these");
 
-- 
cgit v0.10.2


From 095b3a6a030f7d4f24825ae93fc384b3d4b4fafa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 20:13:51 +0200
Subject: perf_counter tools: Optimize harder

Use -O6 to build the tools.

Before:

    12387507370  instructions         #    3121.653 M/sec

After:

     6244894971  instructions         #    3458.437 M/sec

Almost twice as fast!

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index eae8856..005709b 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
cgit v0.10.2


From c499b0672f8df9379764965c5ec124751699d7c4 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Fri, 3 Apr 2009 14:41:56 +0200
Subject: mxcmmc: decrease minimum frequency to make MMC cards work

This is a temporary workaround until the MMC stack can be fixed.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index b4a615c..5950102 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -724,7 +724,9 @@ static int mxcmci_probe(struct platform_device *pdev)
 		goto out_clk_put;
 	}
 
-	mmc->f_min = clk_get_rate(host->clk) >> 7;
+	mmc->f_min = clk_get_rate(host->clk) >> 16;
+	if (mmc->f_min < 400000)
+		mmc->f_min = 400000;
 	mmc->f_max = clk_get_rate(host->clk) >> 1;
 
 	/* recommended in data sheet */
-- 
cgit v0.10.2


From 18374ab76e3ec1cf1b0ca5a8d08e35cfc5d01669 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 14:49:21 -0300
Subject: perf_counter tools: Fix off-by-one bug in symbol__new

The end is really (start + len - 1). Noticed when synthesizing
the PLT symbols, that are small (16 bytes), and hot on the
start RIP.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20090603174921.GG7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index d52a1ae..35ee6de 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -19,7 +19,7 @@ static struct symbol *symbol__new(uint64_t start, uint64_t len,
 			self = ((void *)self) + priv_size;
 		}
 		self->start = start;
-		self->end   = start + len;
+		self->end   = start + len - 1;
 		memcpy(self->name, name, namelen);
 	}
 
-- 
cgit v0.10.2


From 85b843227a9b8c1a27ebd354a80c89aef067f2ca Mon Sep 17 00:00:00 2001
From: Anand Gadiyar <gadiyar@ti.com>
Date: Wed, 15 Apr 2009 17:44:58 +0530
Subject: omap_hsmmc: Trivial fix for a typo in comment

Signed-off-by: Anand Gadiyar <gadiyar@ti.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c
index e62a22a..c40cb96 100644
--- a/drivers/mmc/host/omap_hsmmc.c
+++ b/drivers/mmc/host/omap_hsmmc.c
@@ -680,7 +680,7 @@ static void mmc_omap_dma_cb(int lch, u16 ch_status, void *data)
 	host->dma_ch = -1;
 	/*
 	 * DMA Callback: run in interrupt context.
-	 * mutex_unlock will through a kernel warning if used.
+	 * mutex_unlock will throw a kernel warning if used.
 	 */
 	up(&host->sem);
 }
-- 
cgit v0.10.2


From 18489fa2ba4c170d96ffc1a41f7b9002dcb983b7 Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <mfuzzey@gmail.com>
Date: Thu, 16 Apr 2009 22:00:36 +0200
Subject: mxcmmc : Reset the SDHC hardware if software timeout occurs.

When a software timeout occurs in polling mode hardware was left in
an indeterminate state causing subsequent operations to block.

Signed-off-by: Martin Fuzzey <mfuzzey@gmail.com>

diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index 5950102..dcc9cdb 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -140,6 +140,8 @@ struct mxcmci_host {
 	struct work_struct	datawork;
 };
 
+static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios);
+
 static inline int mxcmci_use_dma(struct mxcmci_host *host)
 {
 	return host->do_dma;
@@ -345,8 +347,11 @@ static int mxcmci_poll_status(struct mxcmci_host *host, u32 mask)
 		stat = readl(host->base + MMC_REG_STATUS);
 		if (stat & STATUS_ERR_MASK)
 			return stat;
-		if (time_after(jiffies, timeout))
+		if (time_after(jiffies, timeout)) {
+			mxcmci_softreset(host);
+			mxcmci_set_clk_rate(host, host->clock);
 			return STATUS_TIME_OUT_READ;
+		}
 		if (stat & mask)
 			return 0;
 		cpu_relax();
-- 
cgit v0.10.2


From 656217d28480f63313a488f6973980f9fbb921a1 Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <mfuzzey@gmail.com>
Date: Thu, 16 Apr 2009 22:00:41 +0200
Subject: mxcmmc: Fix missing return value checking in DMA setup code.

Signed-off-by: Martin Fuzzey <mfuzzey@gmail.com>

diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index dcc9cdb..f4cbe47 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -162,7 +162,7 @@ static void mxcmci_softreset(struct mxcmci_host *host)
 	writew(0xff, host->base + MMC_REG_RES_TO);
 }
 
-static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
+static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
 {
 	unsigned int nob = data->blocks;
 	unsigned int blksz = data->blksz;
@@ -170,6 +170,7 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
 #ifdef HAS_DMA
 	struct scatterlist *sg;
 	int i;
+	int ret;
 #endif
 	if (data->flags & MMC_DATA_STREAM)
 		nob = 0xffff;
@@ -185,7 +186,7 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
 	for_each_sg(data->sg, sg, data->sg_len, i) {
 		if (sg->offset & 3 || sg->length & 3) {
 			host->do_dma = 0;
-			return;
+			return 0;
 		}
 	}
 
@@ -194,23 +195,30 @@ static void mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
 		host->dma_nents = dma_map_sg(mmc_dev(host->mmc), data->sg,
 					     data->sg_len,  host->dma_dir);
 
-		imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, datasize,
-				 host->res->start + MMC_REG_BUFFER_ACCESS,
-				 DMA_MODE_READ);
+		ret = imx_dma_setup_sg(host->dma, data->sg, host->dma_nents,
+				datasize,
+				host->res->start + MMC_REG_BUFFER_ACCESS,
+				DMA_MODE_READ);
 	} else {
 		host->dma_dir = DMA_TO_DEVICE;
 		host->dma_nents = dma_map_sg(mmc_dev(host->mmc), data->sg,
 					     data->sg_len,  host->dma_dir);
 
-		imx_dma_setup_sg(host->dma, data->sg, host->dma_nents, datasize,
-				 host->res->start + MMC_REG_BUFFER_ACCESS,
-				 DMA_MODE_WRITE);
+		ret = imx_dma_setup_sg(host->dma, data->sg, host->dma_nents,
+				datasize,
+				host->res->start + MMC_REG_BUFFER_ACCESS,
+				DMA_MODE_WRITE);
 	}
 
+	if (ret) {
+		dev_err(mmc_dev(host->mmc), "failed to setup DMA : %d\n", ret);
+		return ret;
+	}
 	wmb();
 
 	imx_dma_enable(host->dma);
 #endif /* HAS_DMA */
+	return 0;
 }
 
 static int mxcmci_start_cmd(struct mxcmci_host *host, struct mmc_command *cmd,
@@ -536,6 +544,7 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req)
 {
 	struct mxcmci_host *host = mmc_priv(mmc);
 	unsigned int cmdat = host->cmdat;
+	int error;
 
 	WARN_ON(host->req != NULL);
 
@@ -545,7 +554,12 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req)
 	host->do_dma = 1;
 #endif
 	if (req->data) {
-		mxcmci_setup_data(host, req->data);
+		error = mxcmci_setup_data(host, req->data);
+		if (error) {
+			req->cmd->error = error;
+			goto out;
+		}
+
 
 		cmdat |= CMD_DAT_CONT_DATA_ENABLE;
 
@@ -553,7 +567,9 @@ static void mxcmci_request(struct mmc_host *mmc, struct mmc_request *req)
 			cmdat |= CMD_DAT_CONT_WRITE;
 	}
 
-	if (mxcmci_start_cmd(host, req->cmd, cmdat))
+	error = mxcmci_start_cmd(host, req->cmd, cmdat);
+out:
+	if (error)
 		mxcmci_finish_request(host, req);
 }
 
-- 
cgit v0.10.2


From 703aaced2b9c9a98285f265f3444c2f89d9d4d19 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 9 May 2009 01:03:52 -0400
Subject: mvsdio: allow automatic loading when modular

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Tested-by: Martin Michlmayr <tbm@cyrius.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index c643d0f..1783043 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -882,3 +882,4 @@ module_param(nodma, int, 0);
 MODULE_AUTHOR("Maen Suleiman, Nicolas Pitre");
 MODULE_DESCRIPTION("Marvell MMC,SD,SDIO Host Controller driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:mvsdio");
-- 
cgit v0.10.2


From 992697e9b342115dcf052ffa41d418cb4fe1a841 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Fri, 8 May 2009 08:52:49 -0500
Subject: sdhci-of: Add fsl,esdhc as a valid compatible to bind against

We plan to use fsl,esdhc going forward as the base compatible so update
the driver to bind against it.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/sdhci-of.c b/drivers/mmc/host/sdhci-of.c
index 3ff4ac3..09cc597c 100644
--- a/drivers/mmc/host/sdhci-of.c
+++ b/drivers/mmc/host/sdhci-of.c
@@ -277,6 +277,7 @@ static int __devexit sdhci_of_remove(struct of_device *ofdev)
 static const struct of_device_id sdhci_of_match[] = {
 	{ .compatible = "fsl,mpc8379-esdhc", .data = &sdhci_esdhc, },
 	{ .compatible = "fsl,mpc8536-esdhc", .data = &sdhci_esdhc, },
+	{ .compatible = "fsl,esdhc", .data = &sdhci_esdhc, },
 	{ .compatible = "generic-sdhci", },
 	{},
 };
-- 
cgit v0.10.2


From e749c6f21fd7dc618f61dd178b4ee739c3cb1c31 Mon Sep 17 00:00:00 2001
From: Ben Nizette <bn@niasdigital.com>
Date: Thu, 16 Apr 2009 15:55:21 +1000
Subject: mmc/omap: Use disable_irq_nosync() from within irq handlers.

disable_irq() should wait for all running handlers to complete
before returning.  As such, if it's used to disable an interrupt
from that interrupt's handler it will deadlock.  This replaces
the dangerous instances with the _nosync() variant which doesn't
have this problem.

Signed-off-by: Ben Nizette <bn@niasdigital.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c
index bfa25c0..dceb5ee 100644
--- a/drivers/mmc/host/omap.c
+++ b/drivers/mmc/host/omap.c
@@ -822,7 +822,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id)
 		del_timer(&host->cmd_abort_timer);
 		host->abort = 1;
 		OMAP_MMC_WRITE(host, IE, 0);
-		disable_irq(host->irq);
+		disable_irq_nosync(host->irq);
 		schedule_work(&host->cmd_abort_work);
 		return IRQ_HANDLED;
 	}
-- 
cgit v0.10.2


From 9ca6944cbfad11f2368cf10292e7f3eb036386c2 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Thu, 14 May 2009 21:28:05 -0400
Subject: mvsdio: ignore high speed timing requests from the core

Empirical evidences show that this is causing far more problems than it
solves when this mode is enabled in the host hardware.  Amongst those
cards that are known to be non functional when this bit is set are:

	A-Data "Speedy" 2GB SD card
	Kodak 512MB SD card
	Ativa 1GB MicroSD card
	Marvell 8688 (WIFI/Bluetooth) SDIO card

Since those cards do work on other host controllers which do honnor the
hs timing, the issue must be with this particular host hardware.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index 1783043..9d3cfa9 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -620,9 +620,18 @@ static void mvsd_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 	if (ios->bus_width == MMC_BUS_WIDTH_4)
 		ctrl_reg |= MVSD_HOST_CTRL_DATA_WIDTH_4_BITS;
 
+	/*
+	 * The HI_SPEED_EN bit is causing trouble with many (but not all)
+	 * high speed SD, SDHC and SDIO cards.  Not enabling that bit
+	 * makes all cards work.  So let's just ignore that bit for now
+	 * and revisit this issue if problems for not enabling this bit
+	 * are ever reported.
+	 */
+#if 0
 	if (ios->timing == MMC_TIMING_MMC_HS ||
 	    ios->timing == MMC_TIMING_SD_HS)
 		ctrl_reg |= MVSD_HOST_CTRL_HI_SPEED_EN;
+#endif
 
 	host->ctrl = ctrl_reg;
 	mvsd_write(MVSD_HOST_CTRL, ctrl_reg);
-- 
cgit v0.10.2


From 233f0b95ca3a0d1dcbd70bc7e519069a8e10d23e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 21:48:40 +0200
Subject: perf_counter tools: Work around warnings in older GCCs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GCC 4.1.2 produces:

 util/parse-options.c: In function ‘get_value’:
 util/parse-options.c:36: warning: ‘arg’ may be used uninitialized in this function

 builtin-top.c: In function ‘display_thread’:
 builtin-top.c:178: warning: ‘printed’ may be used uninitialized in this function

Annotate them away by initializing these variables to 0.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 16a6184..7c907e2 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -175,7 +175,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 
 static void print_sym_table(void)
 {
-	int printed, j;
+	int printed = 0, j;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index b80abd9..551b6bc 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -33,7 +33,7 @@ static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
 static int get_value(struct parse_opt_ctx_t *p,
 		     const struct option *opt, int flags)
 {
-	const char *s, *arg;
+	const char *s, *arg = NULL;
 	const int unset = flags & OPT_UNSET;
 
 	if (unset && p->opt)
-- 
cgit v0.10.2


From a6d297f008e124d0bb4312369191b012c10a1a4e Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Tue, 26 May 2009 22:35:34 -0400
Subject: mvsdio: fix config failure with some high speed SDHC cards

Especially with Sandisk SDHC cards, the second SWITCH command was failing
with a timeout and the card was not recognized at all.  However if the
system was busy, or debugging was enabled, or a udelay(100) was inserted
before the second SWITCH command in the core code, then the timing was
so that the card started to work.

With some unusual block sizes, the data FIFO status doesn't indicate a
"empty" state right away when the data transfer is done.  Queuing
another data transfer in that condition results in a transfer timeout.

The empty FIFO bit eventually get set by itself in less than 50 usecs
when it is not set right away. So let's just poll for that bit before
configuring the controller with a new data transfer.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index 9d3cfa9..b56d72f 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -64,6 +64,31 @@ static int mvsd_setup_data(struct mvsd_host *host, struct mmc_data *data)
 	unsigned int tmout;
 	int tmout_index;
 
+	/*
+	 * Hardware weirdness.  The FIFO_EMPTY bit of the HW_STATE
+	 * register is sometimes not set before a while when some
+	 * "unusual" data block sizes are used (such as with the SWITCH
+	 * command), even despite the fact that the XFER_DONE interrupt
+	 * was raised.  And if another data transfer starts before
+	 * this bit comes to good sense (which eventually happens by
+	 * itself) then the new transfer simply fails with a timeout.
+	 */
+	if (!(mvsd_read(MVSD_HW_STATE) & (1 << 13))) {
+		unsigned long t = jiffies + HZ;
+		unsigned int hw_state,  count = 0;
+		do {
+			if (time_after(jiffies, t)) {
+				dev_warn(host->dev, "FIFO_EMPTY bit missing\n");
+				break;
+			}
+			hw_state = mvsd_read(MVSD_HW_STATE);
+			count++;
+		} while (!(hw_state & (1 << 13)));
+		dev_dbg(host->dev, "*** wait for FIFO_EMPTY bit "
+				   "(hw=0x%04x, count=%d, jiffies=%ld)\n",
+				   hw_state, count, jiffies - (t - HZ));
+	}
+
 	/* If timeout=0 then maximum timeout index is used. */
 	tmout = DIV_ROUND_UP(data->timeout_ns, host->ns_per_clk);
 	tmout += data->timeout_clks;
-- 
cgit v0.10.2


From fbf6a5fcbcc2248f1e676f7a0a7d49cd4b535d2a Mon Sep 17 00:00:00 2001
From: Dave Liu <daveliu@freescale.com>
Date: Wed, 6 May 2009 18:40:07 +0800
Subject: sdhci-of: Fix the wrong accessor to HOSTVER register

Freescale eSDHC controller has the special order for
the HOST version register. that is not same as the other's
registers. The address of HOSTVER in spec is 0xFE, and
we need use the in_be16(0xFE) to access it, not in_be16(0xFC).

Signed-off-by: Dave Liu <daveliu@freescale.com>
Acked-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Pierre Ossman <pierre@ossman.eu>

diff --git a/drivers/mmc/host/sdhci-of.c b/drivers/mmc/host/sdhci-of.c
index 09cc597c..128c614 100644
--- a/drivers/mmc/host/sdhci-of.c
+++ b/drivers/mmc/host/sdhci-of.c
@@ -55,7 +55,13 @@ static u32 esdhc_readl(struct sdhci_host *host, int reg)
 
 static u16 esdhc_readw(struct sdhci_host *host, int reg)
 {
-	return in_be16(host->ioaddr + (reg ^ 0x2));
+	u16 ret;
+
+	if (unlikely(reg == SDHCI_HOST_VERSION))
+		ret = in_be16(host->ioaddr + reg);
+	else
+		ret = in_be16(host->ioaddr + (reg ^ 0x2));
+	return ret;
 }
 
 static u8 esdhc_readb(struct sdhci_host *host, int reg)
-- 
cgit v0.10.2


From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 22:19:36 +0200
Subject: perf_counter: Fix throttling lock-up

Throttling logic is broken and we can lock up with too small
hw sampling intervals.

Make the throttling code more robust: disable counters even
if we already disabled them.

( Also clean up whitespace damage i noticed while reading
  various pieces of code related to throttling. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 12cc05e..8f53f3a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event)
 #define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
-#define CORE_EVNTSEL_MASK 		\
+#define CORE_EVNTSEL_MASK		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
 	 CORE_EVNTSEL_EDGE_MASK  |	\
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ab44554..0bb03f1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2822,11 +2822,20 @@ int perf_counter_overflow(struct perf_counter *counter,
 
 	if (!throttle) {
 		counter->hw.interrupts++;
-	} else if (counter->hw.interrupts != MAX_INTERRUPTS) {
-		counter->hw.interrupts++;
-		if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
-			counter->hw.interrupts = MAX_INTERRUPTS;
-			perf_log_throttle(counter, 0);
+	} else {
+		if (counter->hw.interrupts != MAX_INTERRUPTS) {
+			counter->hw.interrupts++;
+			if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+				counter->hw.interrupts = MAX_INTERRUPTS;
+				perf_log_throttle(counter, 0);
+				ret = 1;
+			}
+		} else {
+			/*
+			 * Keep re-disabling counters even though on the previous
+			 * pass we disabled it - just in case we raced with a
+			 * sched-in and the counter got enabled again:
+			 */
 			ret = 1;
 		}
 	}
-- 
cgit v0.10.2


From d80d338d2fb611b65830db7ea56680624776030f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:14:49 +0200
Subject: perf report: Clean up event processing

- Split out event processig into process_events() helper.

- Untangle the cwd parameters - it's constant so can be a static.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 33b3b15..333d312 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -147,7 +147,11 @@ static int load_kernel(void)
 	return err;
 }
 
-static int strcommon(const char *pathname, const char *cwd, int cwdlen)
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
 {
 	int n = 0;
 
@@ -165,7 +169,7 @@ struct map {
 	struct dso	 *dso;
 };
 
-static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
+static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
 
@@ -174,7 +178,8 @@ static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
 		char newfilename[PATH_MAX];
 
 		if (cwd) {
-			int n = strcommon(filename, cwd, cwdlen);
+			int n = strcommon(filename);
+
 			if (n == cwdlen) {
 				snprintf(newfilename, sizeof(newfilename),
 					 ".%s", filename + n);
@@ -752,85 +757,11 @@ static void register_idle_thread(void)
 	}
 }
 
+static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
-static int __cmd_report(void)
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	char *buf;
-	event_t *event;
-	int ret, rc = EXIT_FAILURE;
-	uint32_t size;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
-	char cwd[PATH_MAX], *cwdp = cwd;
-	int cwdlen;
-
-	register_idle_thread();
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	if (load_kernel() < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
-	}
-
-	if (!full_paths) {
-		if (getcwd(cwd, sizeof(cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
-		}
-		cwdlen = strlen(cwd);
-	} else {
-		cwdp = NULL;
-		cwdlen = 0;
-	}
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	size = event->header.size;
-	if (!size)
-		size = 8;
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-	size = event->header.size;
-	if (!size)
-		goto broken_event;
-
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
 		int show = 0;
@@ -851,7 +782,7 @@ more:
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
-			goto broken_event;
+			return -1;
 		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
@@ -895,14 +826,14 @@ more:
 			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 				fprintf(stderr,
 		"problem incrementing symbol count, skipping event\n");
-				goto broken_event;
+				return -1;
 			}
 		}
 		total++;
 	} else switch (event->header.type) {
 	case PERF_EVENT_MMAP: {
 		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
+		struct map *map = map__new(&event->mmap);
 
 		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
 			(void *)(offset + head),
@@ -915,7 +846,7 @@ more:
 		if (thread == NULL || map == NULL) {
 			if (verbose)
 				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
-			goto broken_event;
+			return -1;
 		}
 		thread__insert_map(thread, map);
 		total_mmap++;
@@ -932,13 +863,93 @@ more:
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
-			goto broken_event;
+			return -1;
 		}
 		total_comm++;
 		break;
 	}
-	default: {
-broken_event:
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_report(void)
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret, rc = EXIT_FAILURE;
+	uint32_t size;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	if (!size || process_event(event, offset, head) < 0) {
+
 		dprintf("%p [%p]: skipping unknown header type: %d\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
@@ -956,7 +967,6 @@ broken_event:
 
 		size = 8;
 	}
-	}
 
 	head += size;
 
-- 
cgit v0.10.2


From 75051724f78677532618dd164a515baf106990e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:14:49 +0200
Subject: perf report: Split out event processing helpers

- Introduce per event helper functions

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 333d312..82b6252 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -50,6 +50,7 @@ struct ip_event {
 	__u64 ip;
 	__u32 pid, tid;
 };
+
 struct mmap_event {
 	struct perf_event_header header;
 	__u32 pid, tid;
@@ -58,9 +59,10 @@ struct mmap_event {
 	__u64 pgoff;
 	char filename[PATH_MAX];
 };
+
 struct comm_event {
 	struct perf_event_header header;
-	__u32 pid,tid;
+	__u32 pid, tid;
 	char comm[16];
 };
 
@@ -760,114 +762,137 @@ static void register_idle_thread(void)
 static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-		char level;
-		int show = 0;
-		struct dso *dso = NULL;
-		struct thread *thread = threads__findnew(event->ip.pid);
-		uint64_t ip = event->ip.ip;
-		struct map *map = NULL;
-
-		dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.misc,
-			event->ip.pid,
-			(void *)(long)ip);
-
-		dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-		if (thread == NULL) {
-			fprintf(stderr, "problem processing %d event, skipping it.\n",
-				event->header.type);
-			return -1;
-		}
-
-		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			show = SHOW_KERNEL;
-			level = 'k';
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
 
-			dso = kernel_dso;
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
 
-			dprintf(" ...... dso: %s\n", dso->name);
+		dso = kernel_dso;
 
-		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+		dprintf(" ...... dso: %s\n", dso->name);
 
-			show = SHOW_USER;
-			level = '.';
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 
-			map = thread__find_map(thread, ip);
-			if (map != NULL) {
-				dso = map->dso;
-				ip -= map->start + map->pgoff;
-			} else {
-				/*
-				 * If this is outside of all known maps,
-				 * and is a negative address, try to look it
-				 * up in the kernel dso, as it might be a
-				 * vsyscall (which executes in user-mode):
-				 */
-				if ((long long)ip < 0)
-					dso = kernel_dso;
-			}
-			dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+		show = SHOW_USER;
+		level = '.';
 
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			dso = map->dso;
+			ip -= map->start + map->pgoff;
 		} else {
-			show = SHOW_HV;
-			level = 'H';
-			dprintf(" ...... dso: [hypervisor]\n");
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
 		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
 
-		if (show & show_mask) {
-			struct symbol *sym = dso__find_symbol(dso, ip);
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
 
-			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
-				fprintf(stderr,
-		"problem incrementing symbol count, skipping event\n");
-				return -1;
-			}
-		}
-		total++;
-	} else switch (event->header.type) {
-	case PERF_EVENT_MMAP: {
-		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap);
+	if (show & show_mask) {
+		struct symbol *sym = dso__find_symbol(dso, ip);
 
-		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			(void *)(long)event->mmap.start,
-			(void *)(long)event->mmap.len,
-			(void *)(long)event->mmap.pgoff,
-			event->mmap.filename);
-
-		if (thread == NULL || map == NULL) {
-			if (verbose)
-				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
 			return -1;
 		}
-		thread__insert_map(thread, map);
-		total_mmap++;
-		break;
 	}
-	case PERF_EVENT_COMM: {
-		struct thread *thread = threads__findnew(event->comm.pid);
+	total++;
 
-		dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->comm.comm, event->comm.pid);
+	return 0;
+}
 
-		if (thread == NULL ||
-		    thread__set_comm(thread, event->comm.comm)) {
-			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
-			return -1;
-		}
-		total_comm++;
-		break;
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return -1;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
 	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
 	default:
 		return -1;
 	}
@@ -877,13 +902,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 static int __cmd_report(void)
 {
+	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
 	unsigned long head = 0;
 	struct stat stat;
-	char *buf;
 	event_t *event;
-	int ret, rc = EXIT_FAILURE;
 	uint32_t size;
+	char *buf;
 
 	register_idle_thread();
 
-- 
cgit v0.10.2


From d11444dfa78cdd887d8dfd2fab3883132aff2c2d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:29:14 +0200
Subject: perf report: Handle all known event types

We have munmap, throttle/unthrottle and period events as well,
process them - otherwise they are considered broke events and
we mis-parse the next few events.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 82b6252..6003cc3 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -893,6 +893,15 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_COMM:
 		return process_comm_event(event, offset, head);
 
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+	case PERF_EVENT_MUNMAP:
+	case PERF_EVENT_PERIOD:
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
 	default:
 		return -1;
 	}
-- 
cgit v0.10.2


From bfcd3555af478dbf04c87adc9bb1a739d0a6ccff Mon Sep 17 00:00:00 2001
From: Alberto Bertogli <albertito@blitiri.com.ar>
Date: Tue, 9 Jun 2009 00:06:20 -0400
Subject: jbd2: Fix minor typos in comments in fs/jbd2/journal.c

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5814410..62be7d2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
  *
- * Two internal function, which provide abort to te jbd layer
+ * Two internal functions, which provide abort to the jbd layer
  * itself are here.
  */
 
@@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
  * int jbd2_journal_errno () - returns the journal's error state.
  * @journal: journal to examine.
  *
- * This is the errno numbet set with jbd2_journal_abort(), the last
+ * This is the errno number set with jbd2_journal_abort(), the last
  * time the journal was mounted - if the journal was stopped
  * without calling abort this will be 0.
  *
@@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal)
  * int jbd2_journal_clear_err () - clears the journal's error state
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal)
  * void jbd2_journal_ack_err() - Ack journal err.
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 void jbd2_journal_ack_err(journal_t *journal)
-- 
cgit v0.10.2


From 0b8e58a140cae2ba1c4a21ccae7c6c3c939c51f9 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Wed, 3 Jun 2009 17:59:28 -0400
Subject: ext4: super.c whitespace cleanup

Cleanup of whitespace and formatting.  Initially driven by confusing indents
for the ext4_{block,inode}_bitmap() et. al. helper routines, but figured I'd
cleanup some other 80-column wrapping and other indenting problems at the
same time.

Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 91b98b5..0a97b1a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -79,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -87,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -95,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_table_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -103,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -111,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -119,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -127,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_itable_unused_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 
 void ext4_block_bitmap_set(struct super_block *sb,
@@ -207,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
 		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, __func__,
-				   "Detected aborted journal");
+			ext4_abort(sb, __func__, "Detected aborted journal");
 			return ERR_PTR(-EROFS);
 		}
 		return jbd2_journal_start(journal, nblocks);
@@ -436,7 +435,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-				const char *function, const char *fmt, ...)
+			   const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -472,7 +471,6 @@ __acquires(bitlock)
 	return;
 }
 
-
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -638,7 +636,6 @@ static void ext4_put_super(struct super_block *sb)
 	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
 }
 
 static struct kmem_cache *ext4_inode_cachep;
@@ -653,6 +650,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -673,6 +671,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+
 	return &ei->vfs_inode;
 }
 
@@ -879,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noauto_da_alloc");
 
 	ext4_show_quota_options(seq, sb);
+
 	return 0;
 }
 
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
+					u64 ino, u32 generation)
 {
 	struct inode *inode;
 
@@ -913,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
 }
 
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
@@ -932,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
  * which would prevent try_to_free_buffers() from freeing them, we must use
  * jbd2 layer's try_to_free_buffers() function to release them.
  */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -1133,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data)
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
+
 	options += 3;
-	/*todo: use simple_strtoll with >32bit ext4 */
+	/* TODO: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1144,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	if (*options == ',')
 		options++;
 	*data = (void *) options;
+
 	return sb_block;
 }
 
@@ -1626,7 +1628,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		printk(KERN_WARNING
 		       "EXT4-fs warning: checktime reached, "
 		       "running e2fsck is recommended\n");
-	if (!sbi->s_journal) 
+	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1810,7 +1812,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	}
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
 	return 1;
 }
 
@@ -1926,6 +1928,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+
 /*
  * Maximal extent format file size.
  * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1976,19 +1979,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	loff_t upper_limit;
-	/* This is calculated to be the largest file size for a
-	 * dense, bitmapped file such that the total number of
-	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^48 -1
-	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-	 * total number of  512 bytes blocks of the file
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
 	 */
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD is not enabled
-		 * implies the inode i_block represent total blocks in
-		 * 512 bytes 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LBD not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
 		 */
 		upper_limit = (1LL << 32) - 1;
 
@@ -2030,7 +2033,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-				ext4_fsblk_t logical_sb_block, int nr)
+				   ext4_fsblk_t logical_sb_block, int nr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t bg, first_meta_bg;
@@ -2044,6 +2047,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext4_bg_has_super(sb, bg))
 		has_super = 1;
+
 	return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
@@ -2148,7 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-				struct ext4_sb_info *sbi, char *buf)
+			   struct ext4_sb_info *sbi, char *buf)
 {
 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
 
@@ -2253,7 +2257,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
-
 {
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
@@ -2379,7 +2382,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
-
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2442,7 +2444,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sb->s_blocksize != blocksize) {
-
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
 			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
@@ -2489,6 +2490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
+
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
@@ -2501,10 +2503,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	} else
 		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
+
 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
 		goto cantfind_ext4;
@@ -2515,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_state = le16_to_cpu(es->s_state);
 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2566,12 +2571,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-        /*
-         * It makes no sense for the first data block to be beyond the end
-         * of the filesystem.
-         */
-        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+	/*
+	 * It makes no sense for the first data block to be beyond the end
+	 * of the filesystem.
+	 */
+	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
 		       "block %u is beyond end of filesystem (%llu)\n",
 		       le32_to_cpu(es->s_first_data_block),
 		       ext4_blocks_count(es));
@@ -3082,6 +3087,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	EXT4_SB(sb)->journal_bdev = bdev;
 	ext4_init_journal_params(sb, journal);
 	return journal;
+
 out_journal:
 	jbd2_journal_destroy(journal);
 out_bdev:
@@ -3116,7 +3122,6 @@ static int ext4_load_journal(struct super_block *sb,
 	 * crash?  For recovery, we need to check in advance whether we
 	 * can get read-write access to the device.
 	 */
-
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_INFO "EXT4-fs: INFO: recovery "
@@ -3234,7 +3239,6 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	return error;
 }
 
-
 /*
  * Have we just finished recovery?  If so, and if we are mounting (or
  * remounting) the filesystem readonly, then we will end up with a
@@ -3485,8 +3489,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 			/*
 			 * Make sure the group descriptor checksums
-			 * are sane.  If they aren't, refuse to
-			 * remount r/w.
+			 * are sane.  If they aren't, refuse to remount r/w.
 			 */
 			for (g = 0; g < sbi->s_groups_count; g++) {
 				struct ext4_group_desc *gdp =
@@ -3545,6 +3548,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	return 0;
+
 restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3628,11 +3632,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
 	return 0;
 }
 
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise the are possible deadlocks:
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
  *   jbd2_journal_start()                  write_dquot()
@@ -3656,7 +3661,7 @@ static int ext4_write_dquot(struct dquot *dquot)
 
 	inode = dquot_to_inode(dquot);
 	handle = ext4_journal_start(inode,
-					EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit(dquot);
@@ -3672,7 +3677,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext4_journal_start(dquot_to_inode(dquot),
-					EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_acquire(dquot);
@@ -3688,7 +3693,7 @@ static int ext4_release_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext4_journal_start(dquot_to_inode(dquot),
-					EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle)) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
 		dquot_release(dquot);
@@ -3736,7 +3741,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
 	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-			EXT4_SB(sb)->s_jquota_fmt, type);
+				  EXT4_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
@@ -3907,10 +3912,10 @@ out:
 
 #endif
 
-static int ext4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4_fs_type = {
@@ -3922,14 +3927,14 @@ static struct file_system_type ext4_fs_type = {
 };
 
 #ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
+			  const char *dev_name, void *data,struct vfsmount *mnt)
 {
 	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
 	       "to mount using ext4\n");
 	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
 	       "will go away by 2.6.31\n");
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4dev_fs_type = {
-- 
cgit v0.10.2


From b8da7de56ca0ad34726478a50d138a29a9ff76cb Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 2 Jun 2009 16:50:35 +1000
Subject: drm: fix irq naming for kms drivers.

allocating devname in the i915 driver was a hack originally and I
forgot to figure out how to do this properly back then.

So this is the cleaner version that just picks devname or driver name
in the irq code.

It removes the devname allocs from the i915 driver.

Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
index 93e677a..fc8e5ac 100644
--- a/drivers/gpu/drm/drm_irq.c
+++ b/drivers/gpu/drm/drm_irq.c
@@ -196,6 +196,7 @@ int drm_irq_install(struct drm_device *dev)
 {
 	int ret = 0;
 	unsigned long sh_flags = 0;
+	char *irqname;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ))
 		return -EINVAL;
@@ -227,8 +228,13 @@ int drm_irq_install(struct drm_device *dev)
 	if (drm_core_check_feature(dev, DRIVER_IRQ_SHARED))
 		sh_flags = IRQF_SHARED;
 
+	if (dev->devname)
+		irqname = dev->devname;
+	else
+		irqname = dev->driver->name;
+
 	ret = request_irq(drm_dev_to_irq(dev), dev->driver->irq_handler,
-			  sh_flags, dev->devname, dev);
+			  sh_flags, irqname, dev);
 
 	if (ret < 0) {
 		mutex_lock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 53d5445..0ccb63e 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -987,12 +987,6 @@ static int i915_load_modeset_init(struct drm_device *dev)
 	int fb_bar = IS_I9XX(dev) ? 2 : 0;
 	int ret = 0;
 
-	dev->devname = kstrdup(DRIVER_NAME, GFP_KERNEL);
-	if (!dev->devname) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	dev->mode_config.fb_base = drm_get_resource_start(dev, fb_bar) &
 		0xff000000;
 
@@ -1006,7 +1000,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
 
 	ret = i915_probe_agp(dev, &agp_size, &prealloc_size);
 	if (ret)
-		goto kfree_devname;
+		goto out;
 
 	/* Basic memrange allocator for stolen space (aka vram) */
 	drm_mm_init(&dev_priv->vram, 0, prealloc_size);
@@ -1024,7 +1018,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
 
 	ret = i915_gem_init_ringbuffer(dev);
 	if (ret)
-		goto kfree_devname;
+		goto out;
 
 	/* Allow hardware batchbuffers unless told otherwise.
 	 */
@@ -1056,8 +1050,6 @@ static int i915_load_modeset_init(struct drm_device *dev)
 
 destroy_ringbuffer:
 	i915_gem_cleanup_ringbuffer(dev);
-kfree_devname:
-	kfree(dev->devname);
 out:
 	return ret;
 }
-- 
cgit v0.10.2


From 9863871bd1bbf218b921af5e0bc48ca4f6ea9f12 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Thu, 4 Jun 2009 07:08:13 +1000
Subject: drm/radeon: fix ring free alignment calculations

fd.o bz#21849

We were aligning to +16 dwords, instead of to the next 16dword
boundary in the ring. Fix the calculation to go to the next 16dword
boundary when space checking.

Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c
index 77a7a4d..aff90bb 100644
--- a/drivers/gpu/drm/radeon/radeon_cp.c
+++ b/drivers/gpu/drm/radeon/radeon_cp.c
@@ -2185,9 +2185,9 @@ void radeon_commit_ring(drm_radeon_private_t *dev_priv)
 
 	/* check if the ring is padded out to 16-dword alignment */
 
-	tail_aligned = dev_priv->ring.tail & 0xf;
+	tail_aligned = dev_priv->ring.tail & (RADEON_RING_ALIGN-1);
 	if (tail_aligned) {
-		int num_p2 = 16 - tail_aligned;
+		int num_p2 = RADEON_RING_ALIGN - tail_aligned;
 
 		ring = dev_priv->ring.start;
 		/* pad with some CP_PACKET2 */
diff --git a/drivers/gpu/drm/radeon/radeon_drv.h b/drivers/gpu/drm/radeon/radeon_drv.h
index 8071d96..0c6bfc1 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.h
+++ b/drivers/gpu/drm/radeon/radeon_drv.h
@@ -1964,11 +1964,14 @@ do {								\
 
 #define RING_LOCALS	int write, _nr, _align_nr; unsigned int mask; u32 *ring;
 
+#define RADEON_RING_ALIGN 16
+
 #define BEGIN_RING( n ) do {						\
 	if ( RADEON_VERBOSE ) {						\
 		DRM_INFO( "BEGIN_RING( %d )\n", (n));			\
 	}								\
-	_align_nr = (n + 0xf) & ~0xf;					\
+	_align_nr = RADEON_RING_ALIGN - ((dev_priv->ring.tail + n) & (RADEON_RING_ALIGN-1));	\
+	_align_nr += n;							\
 	if (dev_priv->ring.space <= (_align_nr * sizeof(u32))) {	\
                 COMMIT_RING();						\
 		radeon_wait_ring( dev_priv, _align_nr * sizeof(u32));	\
-- 
cgit v0.10.2


From 75185c929ed241f5cf1aa28999b8012181e2c7cb Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Sat, 30 May 2009 20:42:25 -0700
Subject: drm: add newlines to text sysfs files

The contents of various simple text files in sysfs should end with
a newline to make them easier to read from the console.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 8f93729..182bdf9 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -147,7 +147,7 @@ static ssize_t status_show(struct device *device,
 	enum drm_connector_status status;
 
 	status = connector->funcs->detect(connector);
-	return snprintf(buf, PAGE_SIZE, "%s",
+	return snprintf(buf, PAGE_SIZE, "%s\n",
 			drm_get_connector_status_name(status));
 }
 
@@ -166,7 +166,7 @@ static ssize_t dpms_show(struct device *device,
 	if (ret)
 		return 0;
 
-	return snprintf(buf, PAGE_SIZE, "%s",
+	return snprintf(buf, PAGE_SIZE, "%s\n",
 			drm_get_dpms_name((int)dpms_status));
 }
 
@@ -176,7 +176,7 @@ static ssize_t enabled_show(struct device *device,
 {
 	struct drm_connector *connector = to_drm_connector(device);
 
-	return snprintf(buf, PAGE_SIZE, connector->encoder ? "enabled" :
+	return snprintf(buf, PAGE_SIZE, "%s\n", connector->encoder ? "enabled" :
 			"disabled");
 }
 
-- 
cgit v0.10.2


From e36ebaf49274ffa78f17b62bcae4c92c33b5b391 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Sat, 30 May 2009 20:42:26 -0700
Subject: drm: set permissions on edid file to 0444

Without initializing the sysfs attributes for the edid file,
it was created with mode 0, making it difficult for applications to use.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 182bdf9..9987ab8 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -317,6 +317,7 @@ static struct device_attribute connector_attrs_opt1[] = {
 
 static struct bin_attribute edid_attr = {
 	.attr.name = "edid",
+	.attr.mode = 0444,
 	.size = 128,
 	.read = edid_show,
 };
-- 
cgit v0.10.2


From c9fb15f60eb517c958dec64dca9357bf62bf2201 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Sat, 30 May 2009 20:42:28 -0700
Subject: drm: Hook up DPMS property handling in drm_crtc.c. Add
 drm_helper_connector_dpms.

Making the drm_crtc.c code recognize the DPMS property and invoke the
connector->dpms function doesn't remove any capability from the driver while
reducing code duplication.

That just highlighted the problem with the existing DPMS functions which
could turn off the connector, but failed to turn off any relevant crtcs. The
new drm_helper_connector_dpms function manages all of that, using the
drm_helper-specific crtc and encoder dpms functions, automatically computing
the appropriate DPMS level for each object in the system.

This fixes the current troubles in the i915 driver which left PLLs, pipes
and planes running while in DPMS_OFF mode or even while they were unused.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 94a7688..8fab789 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -2294,7 +2294,12 @@ int drm_mode_connector_property_set_ioctl(struct drm_device *dev,
 		}
 	}
 
-	if (connector->funcs->set_property)
+	/* Do DPMS ourselves */
+	if (property == connector->dev->mode_config.dpms_property) {
+		if (connector->funcs->dpms)
+			(*connector->funcs->dpms)(connector, (int) out_resp->value);
+		ret = 0;
+	} else if (connector->funcs->set_property)
 		ret = connector->funcs->set_property(connector, property, out_resp->value);
 
 	/* store the property value if succesful */
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
index 4589044..a6f73f1 100644
--- a/drivers/gpu/drm/drm_crtc_helper.c
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -199,6 +199,29 @@ static void drm_helper_add_std_modes(struct drm_device *dev,
 }
 
 /**
+ * drm_helper_encoder_in_use - check if a given encoder is in use
+ * @encoder: encoder to check
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Walk @encoders's DRM device's mode_config and see if it's in use.
+ *
+ * RETURNS:
+ * True if @encoder is part of the mode_config, false otherwise.
+ */
+bool drm_helper_encoder_in_use(struct drm_encoder *encoder)
+{
+	struct drm_connector *connector;
+	struct drm_device *dev = encoder->dev;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
+		if (connector->encoder == encoder)
+			return true;
+	return false;
+}
+EXPORT_SYMBOL(drm_helper_encoder_in_use);
+
+/**
  * drm_helper_crtc_in_use - check if a given CRTC is in a mode_config
  * @crtc: CRTC to check
  *
@@ -216,7 +239,7 @@ bool drm_helper_crtc_in_use(struct drm_crtc *crtc)
 	struct drm_device *dev = crtc->dev;
 	/* FIXME: Locking around list access? */
 	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head)
-		if (encoder->crtc == crtc)
+		if (encoder->crtc == crtc && drm_helper_encoder_in_use(encoder))
 			return true;
 	return false;
 }
@@ -240,7 +263,7 @@ void drm_helper_disable_unused_functions(struct drm_device *dev)
 
 	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
 		encoder_funcs = encoder->helper_private;
-		if (!encoder->crtc)
+		if (!drm_helper_encoder_in_use(encoder))
 			(*encoder_funcs->dpms)(encoder, DRM_MODE_DPMS_OFF);
 	}
 
@@ -935,6 +958,88 @@ bool drm_helper_initial_config(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_helper_initial_config);
 
+static int drm_helper_choose_encoder_dpms(struct drm_encoder *encoder)
+{
+	int dpms = DRM_MODE_DPMS_OFF;
+	struct drm_connector *connector;
+	struct drm_device *dev = encoder->dev;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
+		if (connector->encoder == encoder)
+			if (connector->dpms < dpms)
+				dpms = connector->dpms;
+	return dpms;
+}
+
+static int drm_helper_choose_crtc_dpms(struct drm_crtc *crtc)
+{
+	int dpms = DRM_MODE_DPMS_OFF;
+	struct drm_connector *connector;
+	struct drm_device *dev = crtc->dev;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
+		if (connector->encoder && connector->encoder->crtc == crtc)
+			if (connector->dpms < dpms)
+				dpms = connector->dpms;
+	return dpms;
+}
+
+/**
+ * drm_helper_connector_dpms
+ * @connector affected connector
+ * @mode DPMS mode
+ *
+ * Calls the low-level connector DPMS function, then
+ * calls appropriate encoder and crtc DPMS functions as well
+ */
+void drm_helper_connector_dpms(struct drm_connector *connector, int mode)
+{
+	struct drm_encoder *encoder = connector->encoder;
+	struct drm_crtc *crtc = encoder ? encoder->crtc : NULL;
+	int old_dpms;
+
+	if (mode == connector->dpms)
+		return;
+
+	old_dpms = connector->dpms;
+	connector->dpms = mode;
+
+	/* from off to on, do crtc then encoder */
+	if (mode < old_dpms) {
+		if (crtc) {
+			struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+			if (crtc_funcs->dpms)
+				(*crtc_funcs->dpms) (crtc,
+						     drm_helper_choose_crtc_dpms(crtc));
+		}
+		if (encoder) {
+			struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private;
+			if (encoder_funcs->dpms)
+				(*encoder_funcs->dpms) (encoder,
+							drm_helper_choose_encoder_dpms(encoder));
+		}
+	}
+
+	/* from on to off, do encoder then crtc */
+	if (mode > old_dpms) {
+		if (encoder) {
+			struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private;
+			if (encoder_funcs->dpms)
+				(*encoder_funcs->dpms) (encoder,
+							drm_helper_choose_encoder_dpms(encoder));
+		}
+		if (crtc) {
+			struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+			if (crtc_funcs->dpms)
+				(*crtc_funcs->dpms) (crtc,
+						     drm_helper_choose_crtc_dpms(crtc));
+		}
+	}
+
+	return;
+}
+EXPORT_SYMBOL(drm_helper_connector_dpms);
+
 /**
  * drm_hotplug_stage_two
  * @dev DRM device
diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index 640f515..79acc4f 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -381,11 +381,6 @@ static int intel_crt_set_property(struct drm_connector *connector,
 				  struct drm_property *property,
 				  uint64_t value)
 {
-	struct drm_device *dev = connector->dev;
-
-	if (property == dev->mode_config.dpms_property && connector->encoder)
-		intel_crt_dpms(connector->encoder, (uint32_t)(value & 0xf));
-
 	return 0;
 }
 
@@ -402,6 +397,7 @@ static const struct drm_encoder_helper_funcs intel_crt_helper_funcs = {
 };
 
 static const struct drm_connector_funcs intel_crt_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.detect = intel_crt_detect,
 	.fill_modes = drm_helper_probe_single_connector_modes,
 	.destroy = intel_crt_destroy,
diff --git a/drivers/gpu/drm/i915/intel_dvo.c b/drivers/gpu/drm/i915/intel_dvo.c
index 8b8d6e6..1ee3007 100644
--- a/drivers/gpu/drm/i915/intel_dvo.c
+++ b/drivers/gpu/drm/i915/intel_dvo.c
@@ -316,6 +316,7 @@ static const struct drm_encoder_helper_funcs intel_dvo_helper_funcs = {
 };
 
 static const struct drm_connector_funcs intel_dvo_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.save = intel_dvo_save,
 	.restore = intel_dvo_restore,
 	.detect = intel_dvo_detect,
diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c
index d0983bb..7d6bdd7 100644
--- a/drivers/gpu/drm/i915/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/intel_hdmi.c
@@ -219,6 +219,7 @@ static const struct drm_encoder_helper_funcs intel_hdmi_helper_funcs = {
 };
 
 static const struct drm_connector_funcs intel_hdmi_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.save = intel_hdmi_save,
 	.restore = intel_hdmi_restore,
 	.detect = intel_hdmi_detect,
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 53731f0..c92a64a 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -343,11 +343,6 @@ static int intel_lvds_set_property(struct drm_connector *connector,
 				   struct drm_property *property,
 				   uint64_t value)
 {
-	struct drm_device *dev = connector->dev;
-
-	if (property == dev->mode_config.dpms_property && connector->encoder)
-		intel_lvds_dpms(connector->encoder, (uint32_t)(value & 0xf));
-
 	return 0;
 }
 
@@ -366,6 +361,7 @@ static const struct drm_connector_helper_funcs intel_lvds_connector_helper_funcs
 };
 
 static const struct drm_connector_funcs intel_lvds_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.save = intel_lvds_save,
 	.restore = intel_lvds_restore,
 	.detect = intel_lvds_detect,
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index f3ef6bf..3093b4d 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -1616,6 +1616,7 @@ static const struct drm_encoder_helper_funcs intel_sdvo_helper_funcs = {
 };
 
 static const struct drm_connector_funcs intel_sdvo_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.save = intel_sdvo_save,
 	.restore = intel_sdvo_restore,
 	.detect = intel_sdvo_detect,
diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c
index d2c3298..98ac054 100644
--- a/drivers/gpu/drm/i915/intel_tv.c
+++ b/drivers/gpu/drm/i915/intel_tv.c
@@ -1626,6 +1626,7 @@ static const struct drm_encoder_helper_funcs intel_tv_helper_funcs = {
 };
 
 static const struct drm_connector_funcs intel_tv_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
 	.save = intel_tv_save,
 	.restore = intel_tv_restore,
 	.detect = intel_tv_detect,
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 3c1924c..7300fb8 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -471,6 +471,9 @@ struct drm_connector {
 	u32 property_ids[DRM_CONNECTOR_MAX_PROPERTY];
 	uint64_t property_values[DRM_CONNECTOR_MAX_PROPERTY];
 
+	/* requested DPMS state */
+	int dpms;
+
 	void *helper_private;
 
 	uint32_t encoder_ids[DRM_CONNECTOR_MAX_ENCODER];
diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
index ec073d82..6769ff6 100644
--- a/include/drm/drm_crtc_helper.h
+++ b/include/drm/drm_crtc_helper.h
@@ -99,6 +99,8 @@ extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc,
 				     struct drm_framebuffer *old_fb);
 extern bool drm_helper_crtc_in_use(struct drm_crtc *crtc);
 
+extern void drm_helper_connector_dpms(struct drm_connector *connector, int mode);
+
 extern int drm_helper_mode_fill_fb_struct(struct drm_framebuffer *fb,
 					  struct drm_mode_fb_cmd *mode_cmd);
 
-- 
cgit v0.10.2


From 93c05f222413e3a16e8785f252db4726693abd71 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Thu, 4 Jun 2009 09:41:19 +1000
Subject: drm/i915: intel_lvds.c fix section mismatch

intel_no_lvds[] does not require __initdata as it is used only by

void intel_lvds_init(struct drm_device *dev).

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index c92a64a..53cccfa 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -387,7 +387,7 @@ static int __init intel_no_lvds_dmi_callback(const struct dmi_system_id *id)
 }
 
 /* These systems claim to have LVDS, but really don't */
-static const struct dmi_system_id __initdata intel_no_lvds[] = {
+static const struct dmi_system_id intel_no_lvds[] = {
 	{
 		.callback = intel_no_lvds_dmi_callback,
 		.ident = "Apple Mac Mini (Core series)",
-- 
cgit v0.10.2


From 6c51d1cfa0a370b48a157163340190cf5fd2346b Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Tue, 26 May 2009 10:35:52 +1000
Subject: drm: don't associate _DRM_DRIVER maps with a master

A driver will use the _DRM_DRIVER map flag to indicate that it wants
to be responsible for removing the map itself, bypassing the DRM's
automagic cleanup code.

Since the multi-master changes this has been broken, resulting in some
drivers having their registers unmapped before it's finished with them.

Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c
index 0411d91..80a2575 100644
--- a/drivers/gpu/drm/drm_bufs.c
+++ b/drivers/gpu/drm/drm_bufs.c
@@ -371,7 +371,8 @@ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset,
 	list->user_token = list->hash.key << PAGE_SHIFT;
 	mutex_unlock(&dev->struct_mutex);
 
-	list->master = dev->primary->master;
+	if (!(map->flags & _DRM_DRIVER))
+		list->master = dev->primary->master;
 	*maplist = list;
 	return 0;
 	}
-- 
cgit v0.10.2


From fc43896630a421321a19d7970bac27ac94e9d162 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 4 Jun 2009 10:20:34 +1000
Subject: drm: ignore EDID with really tiny modes.

Some EDIDs lie and report tiny modes that aren't possible. Ignore
these modes.

Signed-off-by: Adam Jackson <ajax@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index ca9c616..6f6b264 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -289,6 +289,11 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev,
 	struct drm_display_mode *mode;
 	struct detailed_pixel_timing *pt = &timing->data.pixel_data;
 
+	/* ignore tiny modes */
+	if (((pt->hactive_hi << 8) | pt->hactive_lo) < 64 ||
+	    ((pt->vactive_hi << 8) | pt->hactive_lo) < 64)
+		return NULL;
+
 	if (pt->stereo) {
 		printk(KERN_WARNING "stereo mode not supported\n");
 		return NULL;
-- 
cgit v0.10.2


From e0a94c2a63f2644826069044649669b5e7ca75d3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Wed, 3 Jun 2009 16:04:31 -0400
Subject: security: use mmap_min_addr indepedently of security models

This patch removes the dependency of mmap_min_addr on CONFIG_SECURITY.
It also sets a default mmap_min_addr of 4096.

mmapping of addresses below 4096 will only be possible for processes
with CAP_SYS_RAWIO.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Eric Paris <eparis@redhat.com>
Looks-ok-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..0c21af6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -580,12 +580,10 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  */
 static inline unsigned long round_hint_to_min(unsigned long hint)
 {
-#ifdef CONFIG_SECURITY
 	hint &= PAGE_MASK;
 	if (((void *)hint != NULL) &&
 	    (hint < mmap_min_addr))
 		return PAGE_ALIGN(mmap_min_addr);
-#endif
 	return hint;
 }
 
diff --git a/include/linux/security.h b/include/linux/security.h
index d5fd616..5eff459 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2197,6 +2197,8 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot,
 				     unsigned long addr,
 				     unsigned long addr_only)
 {
+	if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO))
+		return -EACCES;
 	return 0;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 149581f..45bd711 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1237,7 +1237,6 @@ static struct ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
-#ifdef CONFIG_SECURITY
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "mmap_min_addr",
@@ -1246,7 +1245,6 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
-#endif
 #ifdef CONFIG_NUMA
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/mm/Kconfig b/mm/Kconfig
index c2b57d8..71830ba 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -226,6 +226,25 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
 	bool
 
+config DEFAULT_MMAP_MIN_ADDR
+        int "Low address space to protect from user allocation"
+        default 4096
+        help
+	  This is the portion of low virtual memory which should be protected
+	  from userspace allocation.  Keeping a user from writing to low pages
+	  can help reduce the impact of kernel NULL pointer bugs.
+
+	  For most ia64, ppc64 and x86 users with lots of address space
+	  a value of 65536 is reasonable and should cause no problems.
+	  On arm and other archs it should not be higher than 32768.
+	  Programs which use vm86 functionality would either need additional
+	  permissions from either the LSM or the capabilities module or have
+	  this protection disabled.
+
+	  This value can be changed after boot using the
+	  /proc/sys/vm/mmap_min_addr tunable.
+
+
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
 	depends on !MMU
diff --git a/mm/mmap.c b/mm/mmap.c
index 6b7b1a9..2b43fa1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -87,6 +87,9 @@ int sysctl_overcommit_ratio = 50;	/* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
diff --git a/security/Kconfig b/security/Kconfig
index bb24477..d23c839 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -110,28 +110,8 @@ config SECURITY_ROOTPLUG
 
 	  See <http://www.linuxjournal.com/article.php?sid=6279> for
 	  more information about this module.
-	  
-	  If you are unsure how to answer this question, answer N.
-
-config SECURITY_DEFAULT_MMAP_MIN_ADDR
-        int "Low address space to protect from user allocation"
-        depends on SECURITY
-        default 0
-        help
-	  This is the portion of low virtual memory which should be protected
-	  from userspace allocation.  Keeping a user from writing to low pages
-	  can help reduce the impact of kernel NULL pointer bugs.
-
-	  For most ia64, ppc64 and x86 users with lots of address space
-	  a value of 65536 is reasonable and should cause no problems.
-	  On arm and other archs it should not be higher than 32768.
-	  Programs which use vm86 functionality would either need additional
-	  permissions from either the LSM or the capabilities module or have
-	  this protection disabled.
-
-	  This value can be changed after boot using the
-	  /proc/sys/vm/mmap_min_addr tunable.
 
+	  If you are unsure how to answer this question, answer N.
 
 source security/selinux/Kconfig
 source security/smack/Kconfig
diff --git a/security/security.c b/security/security.c
index 5284255..dc7674f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -26,9 +26,6 @@ extern void security_fixup_ops(struct security_operations *ops);
 
 struct security_operations *security_ops;	/* Initialized to NULL */
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR;
-
 static inline int verify(struct security_operations *ops)
 {
 	/* verify the security_operations structure exists */
-- 
cgit v0.10.2


From d81e77f041881d8d91c767b8c84f9068290b10c6 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@cam.ac.uk>
Date: Mon, 25 May 2009 16:50:10 +0000
Subject: [ARM] pxa/imote2: fix UCAM sensor board ADC model number

Signed-off-by: Jonathan Cameron <jic23@cam.ac.uk>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/imote2.c b/arch/arm/mach-pxa/imote2.c
index 2121309..2b27336 100644
--- a/arch/arm/mach-pxa/imote2.c
+++ b/arch/arm/mach-pxa/imote2.c
@@ -412,7 +412,7 @@ static struct platform_device imote2_flash_device = {
  */
 static struct i2c_board_info __initdata imote2_i2c_board_info[] = {
 	{ /* UCAM sensor board */
-		.type = "max1238",
+		.type = "max1239",
 		.addr = 0x35,
 	}, { /* ITS400 Sensor board only */
 		.type = "max1363",
-- 
cgit v0.10.2


From 1257629b0712a0a68a24c532a05a4cd23e3f7565 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <philipp.zabel@gmail.com>
Date: Tue, 26 May 2009 22:03:32 +0200
Subject: [ARM] pxa: fix pxa27x_udc default pullup GPIO

Currently, pxa27x_udc tries to use GPIO 0 as D+ pullup if not
explicitly configured. Default to an invalid GPIO (-1) instead.

Signed-off-by: Philipp Zabel <philipp.zabel@gmail.com>
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Eric Miao <eric.miao@marvell.com>

diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c
index d245e59c..29970f7 100644
--- a/arch/arm/mach-pxa/devices.c
+++ b/arch/arm/mach-pxa/devices.c
@@ -72,7 +72,10 @@ void __init pxa_set_mci_info(struct pxamci_platform_data *info)
 }
 
 
-static struct pxa2xx_udc_mach_info pxa_udc_info;
+static struct pxa2xx_udc_mach_info pxa_udc_info = {
+	.gpio_pullup = -1,
+	.gpio_vbus   = -1,
+};
 
 void __init pxa_set_udc_info(struct pxa2xx_udc_mach_info *info)
 {
-- 
cgit v0.10.2


From a4c43beaff0fe6c83aa2505dce8ffe65db8e0a33 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 23:02:33 -0300
Subject: perf report: Fix rbtree bug

Ingo Molnar reported:

> FYI, i just got this crash (segfault) in perf report after
> collecting a long profile from Xorg:
>
> Starting program: /home/mingo/tip/Documentation/perf_counter/perf report
> [Thread debugging using libthread_db enabled]
> Detaching after fork from child process 20008.
> [New Thread 0x7f92fd62a6f0 (LWP 20005)]
>
> Program received signal SIGSEGV, Segmentation fault.
> 0x000000000041031a in __rb_erase_color (node=0x142c090, parent=0x0,
> root=0x881918)
>     at util/rbtree.c:143
> 143			if (parent->rb_left == node)

It was a problem introduced in this cset:

 perf report: Fix comm sorting - 8229289b607682f90b946ad2c319526303c17700

This patch should fix it.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6003cc3..86f23f0 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -699,17 +699,18 @@ static void output__resort(void)
 {
 	struct rb_node *next;
 	struct hist_entry *n;
+	struct rb_root *tree = &hist;
 
 	if (sort__need_collapse)
-		next = rb_first(&collapse_hists);
-	else
-		next = rb_first(&hist);
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
 
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
-		rb_erase(&n->rb_node, &hist);
+		rb_erase(&n->rb_node, tree);
 		output__insert_entry(n);
 	}
 }
-- 
cgit v0.10.2


From 6e53cdf11dfc8d302ebb67e7112d1baf8d7c66d4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 08:53:05 +0200
Subject: perf top: Reduce default filter threshold

On idle systems 'perf top' comes up empty by default, because the event
count filter is set to 100.

Reduce it to 5 instead.

Also add an option to limit the number of functions displayed.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 7c907e2..3f7778b 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -64,9 +64,10 @@ static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static __u64			count_filter		       = 100;
+static __u64			count_filter			=  5;
+static int			print_entries			= 15;
 
-static int			target_pid				= -1;
+static int			target_pid			= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static unsigned int		realtime_prio			=  0;
@@ -254,7 +255,7 @@ static void print_sym_table(void)
 		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || syme->snap_count < count_filter)
+		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
 
 		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
@@ -650,7 +651,7 @@ static const struct option options[] = {
 		    "number of seconds to delay between refreshes"),
 	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
 			    "dump the symbol table used for profiling"),
-	OPT_INTEGER('f', "--count-filter", &count_filter,
+	OPT_INTEGER('f', "count-filter", &count_filter,
 		    "only display functions with more events than this"),
 	OPT_BOOLEAN('g', "group", &group,
 			    "put the counters into a counter group"),
@@ -662,8 +663,10 @@ static const struct option options[] = {
 		    "track mmap events"),
 	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
 		    "track munmap events"),
-	OPT_INTEGER('F', "--freq", &freq,
+	OPT_INTEGER('F', "freq", &freq,
 		    "profile at this frequency"),
+	OPT_INTEGER('E', "entries", &print_entries,
+		    "display this many functions"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From bcb86975dbcc24f820f1a37918d53914af29ace7 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Thu, 4 Jun 2009 15:14:34 +0900
Subject: TOMOYO: Remove unused parameter.

TOMOYO 2.2.0 does not check argv[] and envp[] upon execve().
We don't need to pass "struct tomoyo_page_buffer".

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 34bb641..eb75401 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -721,7 +721,7 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm,
 	}
 
 	/* Check execute permission. */
-	retval = tomoyo_check_exec_perm(old_domain, &r, tmp);
+	retval = tomoyo_check_exec_perm(old_domain, &r);
 	if (retval < 0)
 		goto out;
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index a67f9e6..ab0cd35 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -1012,13 +1012,11 @@ int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
  *
  * @domain:   Pointer to "struct tomoyo_domain_info".
  * @filename: Check permission for "execute".
- * @tmp:      Buffer for temporary use.
  *
  * Returns 0 on success, negativevalue otherwise.
  */
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-			   const struct tomoyo_path_info *filename,
-			   struct tomoyo_page_buffer *tmp)
+			   const struct tomoyo_path_info *filename)
 {
 	const u8 mode = tomoyo_check_flags(domain, TOMOYO_MAC_FOR_FILE);
 
diff --git a/security/tomoyo/tomoyo.h b/security/tomoyo/tomoyo.h
index f12d5ad..0fd588a 100644
--- a/security/tomoyo/tomoyo.h
+++ b/security/tomoyo/tomoyo.h
@@ -17,13 +17,11 @@ struct path;
 struct inode;
 struct linux_binprm;
 struct pt_regs;
-struct tomoyo_page_buffer;
 
 int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
 			   const char *filename, const u8 perm);
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-			   const struct tomoyo_path_info *filename,
-			   struct tomoyo_page_buffer *buf);
+			   const struct tomoyo_path_info *filename);
 int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
 				 struct path *path, const int flag);
 int tomoyo_check_1path_perm(struct tomoyo_domain_info *domain,
-- 
cgit v0.10.2


From f79b1b146b52765ee38bfb91bb14eb850fa98017 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Thu, 28 May 2009 00:25:05 +0800
Subject: PCI: use fixed-up device class when configuring device

The device class may be changed after the fixup, so re-read the class
value from pci_dev when configuring the device.  Otherwise some devices
such as JMicron SATA controller won't work.

Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Tested-by: Marc Dionne <marc.c.dionne@gmail.com>
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index e3c3e08..f1ae247 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -745,6 +745,8 @@ int pci_setup_device(struct pci_dev *dev)
 
 	/* Early fixups, before probing the BARs */
 	pci_fixup_device(pci_fixup_early, dev);
+	/* device class may be changed after fixup */
+	class = dev->class >> 8;
 
 	switch (dev->hdr_type) {		    /* header type */
 	case PCI_HEADER_TYPE_NORMAL:		    /* standard header */
-- 
cgit v0.10.2


From 75e613cdc7bb2ba3795b1bc3ddf19476c767ba68 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 3 Jun 2009 00:13:13 -0700
Subject: x86/pci: fix mmconfig detection with 32bit near 4g

Pascal reported and bisected a commit:
|	x86/PCI: don't call e820_all_mapped with -1 in the mmconfig case

which broke one system system.

ACPI: Using IOAPIC for interrupt routing
PCI: MCFG configuration 0: base f0000000 segment 0 buses 0 - 255
PCI: MCFG area at f0000000 reserved in ACPI motherboard resources
PCI: Using MMCONFIG for extended config space

it didn't have
PCI: updated MCFG configuration 0: base f0000000 segment 0 buses 0 - 63
anymore, and try to use 0xf000000 - 0xffffffff for mmconfig

For 32bit, mcfg_res->end could be 32bit only (if 64 resources aren't used)
So use end - 1 to pass the value in mcfg->end to avoid overflow.

We don't need to worry about the e820 path, they are always 64 bit.

Reported-by: Pascal Terjan <pterjan@mandriva.com>
Bisected-by: Pascal Terjan <pterjan@mandriva.com>
Tested-by: Pascal Terjan <pterjan@mandriva.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 5fa10bb..8766b0e 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
 		if (!fixmem32)
 			return AE_OK;
 		if ((mcfg_res->start >= fixmem32->address) &&
-		    (mcfg_res->end <= (fixmem32->address +
+		    (mcfg_res->end < (fixmem32->address +
 				      fixmem32->address_length))) {
 			mcfg_res->flags = 1;
 			return AE_CTRL_TERMINATE;
@@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
 		return AE_OK;
 
 	if ((mcfg_res->start >= address.minimum) &&
-	    (mcfg_res->end <= (address.minimum + address.address_length))) {
+	    (mcfg_res->end < (address.minimum + address.address_length))) {
 		mcfg_res->flags = 1;
 		return AE_CTRL_TERMINATE;
 	}
@@ -418,7 +418,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
 	struct resource mcfg_res;
 
 	mcfg_res.start = start;
-	mcfg_res.end = end;
+	mcfg_res.end = end - 1;
 	mcfg_res.flags = 0;
 
 	acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL);
-- 
cgit v0.10.2


From 1b58c2515be48d5df79d20210ac5a86e30094de2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 4 Jun 2009 09:49:59 +1000
Subject: perf_counter: powerpc: Use new identifier names in powerpc-specific
 code

Commit b23f3325 ("perf_counter: Rename various fields") fixed up
most of the uses of the renamed fields, but missed one instance
of "record_type" in powerpc-specific code which needs to be changed
to "sample_type", and a "PERF_RECORD_ADDR" in the same statement that
needs to be changed to "PERF_SAMPLE_ADDR", causing compilation
errors on powerpc.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18983.3111.770392.800486@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4cc4ac5..232b00a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1002,7 +1002,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->attr.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
-- 
cgit v0.10.2


From 3aff27ca84fa94311ae99189e54fed8d83b69fc1 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 3 Jun 2009 16:42:25 +0800
Subject: perf_counter: Documentation update

The 'nmi' bit is no longer there.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20090603084225.GA6553@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index 9930c4b..d325076 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -48,7 +48,6 @@ struct perf_counter_hw_event {
         __u32                   read_format;
 
         __u64                   disabled       :  1, /* off by default        */
-                                nmi            :  1, /* NMI sampling          */
                                 inherit        :  1, /* children inherit it   */
                                 pinned         :  1, /* must always be on PMU */
                                 exclusive      :  1, /* only group on PMU     */
@@ -195,12 +194,6 @@ The 'disabled' bit specifies whether the counter starts out disabled
 or enabled.  If it is initially disabled, it can be enabled by ioctl
 or prctl (see below).
 
-The 'nmi' bit specifies, for hardware events, whether the counter
-should be set up to request non-maskable interrupts (NMIs) or normal
-interrupts.  This bit is ignored if the user doesn't have
-CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
-generate NMIs from hardware counters.
-
 The 'inherit' bit, if set, specifies that this counter should count
 events on descendant tasks as well as the task specified.  This only
 applies to new descendents, not to any existing descendents at the
-- 
cgit v0.10.2


From 48c72fccbfb1db01b5d0b98baff4442fea50d7a4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 20:20:24 +0900
Subject: sh: 16-bit get_unaligned() sh4a fix

This patch fixes the 16-bit case of the sh4a specific
unaligned access implementation. Without this patch
the 16-bit version of sh4a get_unaligned() results in
a 32-bit read which may read more data than intended
and/or cross page boundaries.

Unbreaks mtd NOR write handling on Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/unaligned-sh4a.h b/arch/sh/include/asm/unaligned-sh4a.h
index d8f8977..9f4dd25 100644
--- a/arch/sh/include/asm/unaligned-sh4a.h
+++ b/arch/sh/include/asm/unaligned-sh4a.h
@@ -3,9 +3,9 @@
 
 /*
  * SH-4A has support for unaligned 32-bit loads, and 32-bit loads only.
- * Support for 16 and 64-bit accesses are done through shifting and
- * masking relative to the endianness. Unaligned stores are not supported
- * by the instruction encoding, so these continue to use the packed
+ * Support for 64-bit accesses are done through shifting and masking
+ * relative to the endianness. Unaligned stores are not supported by the
+ * instruction encoding, so these continue to use the packed
  * struct.
  *
  * The same note as with the movli.l/movco.l pair applies here, as long
@@ -41,9 +41,9 @@ struct __una_u64 { u64 x __attribute__((packed)); };
 static inline u16 __get_unaligned_cpu16(const u8 *p)
 {
 #ifdef __LITTLE_ENDIAN
-	return __get_unaligned_cpu32(p) & 0xffff;
+	return p[0] | p[1] << 8;
 #else
-	return __get_unaligned_cpu32(p) >> 16;
+	return p[0] << 8 | p[1];
 #endif
 }
 
-- 
cgit v0.10.2


From df97992c6e25ffc66c549c8bc59262dc627c6d17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 13:41:22 +0200
Subject: perf record/report: Fix PID/COMM handling

Fix two bugs causing lost comm mappings:

 - initial PID is not 0 but getpid()

 - when we are unable to handle an mmap event, dont assume the event
   itself is broken - try to parse the stream. This way we wont lose
   comm events.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index efa2eb4..bf59df5 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -430,7 +430,7 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!system_wide) {
-		open_counters(-1, target_pid != -1 ? target_pid : 0);
+		open_counters(-1, target_pid != -1 ? target_pid : getpid());
 	} else for (i = 0; i < nr_cpus; i++)
 		open_counters(i, target_pid);
 
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 86f23f0..ff6f657 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -852,7 +852,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 
 	if (thread == NULL || map == NULL) {
 		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
-		return -1;
+		return 0;
 	}
 
 	thread__insert_map(thread, map);
-- 
cgit v0.10.2


From 0e7ddf7eeeef5aea85412120539ab5369577faeb Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 4 Jun 2009 11:18:14 +0000
Subject: drm/i915: Remove a bad BUG_ON in the fence management code.

This could be triggered by a gtt mapping fault on 965 that decides to
remove the fence from another object that happens to be active currently.
Since the other object doesn't rely on the fence reg for its execution, we
don't wait for it to finish.  We'll soon be not waiting on 915 most of the
time as well, so just drop the BUG_ON.

Signed-off-by: Eric Anholt <eric@anholt.net>

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 670d128..39f5c65 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2260,9 +2260,6 @@ try_again:
 			goto try_again;
 		}
 
-		BUG_ON(old_obj_priv->active ||
-		       (reg->obj->write_domain & I915_GEM_GPU_DOMAINS));
-
 		/*
 		 * Zap this virtual mapping so we can set up a fence again
 		 * for this object next time we need it.
-- 
cgit v0.10.2


From af794b94ae8a16fb4a9da6ce640c122efb44e2a0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 13:58:13 +0200
Subject: perf_counter tools: Build with native optimization

Build the tools with -march=native by default.

No measurable difference in speed though, compared to the
default, on a Nehalem testbox.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 005709b..414399c 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6 -march=native
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
cgit v0.10.2


From 95ed6fd06e52bf850cd17524f0b36ed14300c10d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 15:00:45 +0200
Subject: perf report: Simplify symbol output

The DSO can be printed already - no need to repeat it in the
symbol field.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ff6f657..56c664d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -382,7 +382,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header 	= "          Command",
+	.header		= "          Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -414,11 +414,11 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 	if (self->dso)
 		return fprintf(fp, "  %-25s", self->dso->name);
 
-	return fprintf(fp, "  %016llx", (__u64)self->ip);
+	return fprintf(fp, "  %016llx         ", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object          ",
+	.header = " Shared Object            ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -447,21 +447,16 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	if (verbose)
 		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
 
-	if (self->dso)
-		ret += fprintf(fp, "  %s: ", self->dso->name);
-	else
-		ret += fprintf(fp, "  %#016llx: ", (__u64)self->ip);
-
 	if (self->sym)
-		ret += fprintf(fp, "%s", self->sym->name);
+		ret += fprintf(fp, "  %s", self->sym->name);
 	else
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "  %#016llx", (__u64)self->ip);
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.header = " Shared Object: Symbol",
+	.header = " Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
-- 
cgit v0.10.2


From bd74137ec9aaca3df3ff22b92455fddf7afaced1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 14:13:04 +0200
Subject: perf_counter tools: Print out symbol parsing errors only if --verbose

Also, add a suggestion to 'perf report', if the default sort order is
used.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 56c664d..15fe9da 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -26,7 +26,10 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
-static char		*sort_order = "comm,dso";
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -103,9 +106,10 @@ static struct dso *dsos__findnew(const char *name)
 	if (!dso)
 		goto out_delete_dso;
 
-	nr = dso__load(dso, NULL);
+	nr = dso__load(dso, NULL, verbose);
 	if (nr < 0) {
-		fprintf(stderr, "Failed to open: %s\n", name);
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
 		goto out_delete_dso;
 	}
 	if (!nr && verbose) {
@@ -139,7 +143,7 @@ static int load_kernel(void)
 	if (!kernel_dso)
 		return -1;
 
-	err = dso__load_kernel(kernel_dso, vmlinux, NULL);
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
@@ -741,6 +745,12 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 		ret += hist_entry__fprintf(fp, pos, total_samples);
 	}
 
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# ( For more details, try: perf report --sort comm,dso,symbol )\n");
+		fprintf(fp, "#\n");
+	}
+
 	return ret;
 }
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 3f7778b..548a8da 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -349,7 +349,7 @@ static int parse_symbols(void)
 	if (kernel_dso == NULL)
 		return -1;
 
-	if (dso__load_kernel(kernel_dso, NULL, symbol_filter) != 0)
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
 		goto out_delete_dso;
 
 	node = rb_first(&kernel_dso->syms);
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 35ee6de..15d5cf9 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -124,7 +124,7 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
 	return ret;
 }
 
-static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -370,7 +370,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 }
 
 static int dso__load_sym(struct dso *self, int fd, const char *name,
-			 symbol_filter_t filter)
+			 symbol_filter_t filter, int verbose)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
@@ -387,13 +387,15 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
-		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, name);
+		if (verbose)
+			fprintf(stderr, "%s: cannot read %s ELF file.\n",
+				__func__, name);
 		goto out_close;
 	}
 
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
-		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		if (verbose)
+			fprintf(stderr, "%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
 	}
 
@@ -473,7 +475,7 @@ out_close:
 	return err;
 }
 
-int dso__load(struct dso *self, symbol_filter_t filter)
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 {
 	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
 	char *name = malloc(size);
@@ -505,7 +507,7 @@ more:
 		fd = open(name, O_RDONLY);
 	} while (fd < 0);
 
-	ret = dso__load_sym(self, fd, name, filter);
+	ret = dso__load_sym(self, fd, name, filter, verbose);
 	close(fd);
 
 	/*
@@ -520,28 +522,29 @@ out:
 }
 
 static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
-			     symbol_filter_t filter)
+			     symbol_filter_t filter, int verbose)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
 	if (fd < 0)
 		return -1;
 
-	err = dso__load_sym(self, fd, vmlinux, filter);
+	err = dso__load_sym(self, fd, vmlinux, filter, verbose);
 	close(fd);
 
 	return err;
 }
 
-int dso__load_kernel(struct dso *self, const char *vmlinux, symbol_filter_t filter)
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter, int verbose)
 {
 	int err = -1;
 
 	if (vmlinux)
-		err = dso__load_vmlinux(self, vmlinux, filter);
+		err = dso__load_vmlinux(self, vmlinux, filter, verbose);
 
 	if (err)
-		err = dso__load_kallsyms(self, filter);
+		err = dso__load_kallsyms(self, filter, verbose);
 
 	return err;
 }
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index b0299bc..8dd8522 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -32,8 +32,8 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
 int dso__load_kernel(struct dso *self, const char *vmlinux,
-		     symbol_filter_t filter);
-int dso__load(struct dso *self, symbol_filter_t filter);
+		     symbol_filter_t filter, int verbose);
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
 
-- 
cgit v0.10.2


From 05ca061eb9704ad9b0739f88046276792b75f2c1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 14:21:16 +0200
Subject: perf report: Print out the total number of events

So that the statistical quality of the profile can be estimated at a glance.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 15fe9da..be392e0 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -722,6 +722,8 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	size_t ret = 0;
 
 	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
+	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
 	list_for_each_entry(se, &hist_entry__sort_list, list)
-- 
cgit v0.10.2


From 71dd8945d8d827ab101cd287f9480ef22fc7c1b6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 15:16:56 +0200
Subject: perf report: Add consistent spacing rules

Make the sort header and the print function have the same column width.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index be392e0..e930b4e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -344,11 +344,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "         Command: Pid ",
+	.header = "         Command:  Pid",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -382,11 +382,11 @@ sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s", self->thread->comm);
+	return fprintf(fp, "%16s", self->thread->comm);
 }
 
 static struct sort_entry sort_comm = {
-	.header		= "          Command",
+	.header 	= "         Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -416,13 +416,13 @@ static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
 	if (self->dso)
-		return fprintf(fp, "  %-25s", self->dso->name);
+		return fprintf(fp, "%-25s", self->dso->name);
 
-	return fprintf(fp, "  %016llx         ", (__u64)self->ip);
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object            ",
+	.header = "Shared Object            ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -449,18 +449,18 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
 
 	if (self->sym)
-		ret += fprintf(fp, "  %s", self->sym->name);
+		ret += fprintf(fp, "%s", self->sym->name);
 	else
-		ret += fprintf(fp, "  %#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.header = " Symbol",
+	.header = "Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
@@ -553,8 +553,10 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
 
-	list_for_each_entry(se, &hist_entry__sort_list, list)
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
 		ret += se->print(fp, self);
+	}
 
 	ret += fprintf(fp, "\n");
 
@@ -721,13 +723,14 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
 	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
 	list_for_each_entry(se, &hist_entry__sort_list, list)
-		fprintf(fp, " %s", se->header);
+		fprintf(fp, "  %s", se->header);
 	fprintf(fp, "\n");
 
 	fprintf(fp, "# ........");
@@ -735,7 +738,7 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 		int i;
 
 		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header)-1; i++)
+		for (i = 0; i < strlen(se->header); i++)
 			fprintf(fp, ".");
 	}
 	fprintf(fp, "\n");
@@ -749,9 +752,10 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 
 	if (!strcmp(sort_order, default_sort_order)) {
 		fprintf(fp, "#\n");
-		fprintf(fp, "# ( For more details, try: perf report --sort comm,dso,symbol )\n");
+		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
 		fprintf(fp, "#\n");
 	}
+	fprintf(fp, "\n");
 
 	return ret;
 }
-- 
cgit v0.10.2


From 2cc3c559fb2fe8cecca82a517bc56e88b0c1effd Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 4 Jun 2009 09:23:50 -0400
Subject: Btrfs: set device->total_disk_bytes when adding new device

It was not being properly initialized, and so the size saved to
disk was not correct.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5f01dad..a6d35b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1440,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->io_align = root->sectorsize;
 	device->sector_size = root->sectorsize;
 	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->disk_total_bytes = device->total_bytes;
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
-- 
cgit v0.10.2


From 8fc0321f1ad0ffef969056dda91b453bbd7a494d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 15:19:47 +0200
Subject: perf_counter tools: Add color terminal output support

Add Git's color printing library to util/color.[ch].

Add it to perf report, with a trivial example to print high-overhead
entries in red, low-overhead entries in green.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 414399c..c9ec458 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -298,6 +298,7 @@ LIB_H += util/string.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
+LIB_H += util/color.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
@@ -319,6 +320,7 @@ LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
 LIB_OBJS += util/symbol.o
+LIB_OBJS += util/color.o
 LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e930b4e..7beedc6 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -9,6 +9,7 @@
 
 #include "util/util.h"
 
+#include "util/color.h"
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
@@ -548,7 +549,19 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "   %6.2f%%",
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		if (percent < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 548a8da..20e5b12 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -21,6 +21,7 @@
 #include "perf.h"
 
 #include "util/symbol.h"
+#include "util/color.h"
 #include "util/util.h"
 #include "util/rbtree.h"
 #include "util/parse-options.h"
@@ -253,7 +254,8 @@ static void print_sym_table(void)
 	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
 		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
 		struct symbol *sym = (struct symbol *)(syme + 1);
-		float pcnt;
+		char *color = PERF_COLOR_NORMAL;
+		double pcnt;
 
 		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
@@ -261,13 +263,22 @@ static void print_sym_table(void)
 		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
 					 sum_kevents));
 
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (pcnt >= 5.0)
+			color = PERF_COLOR_RED;
+		if (pcnt < 0.5)
+			color = PERF_COLOR_GREEN;
+
 		if (nr_counters == 1)
-			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				syme->weight, pcnt, sym->start, sym->name);
+			printf("%19.2f - ", syme->weight);
 		else
-			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				syme->weight, syme->snap_count,
-				pcnt, sym->start, sym->name);
+			printf("%8.1f %10ld - ", syme->weight, syme->snap_count);
+
+		color_fprintf(stdout, color, "%4.1f%%", pcnt);
+		printf(" - %016llx : %s\n", sym->start, sym->name);
 	}
 
 	{
diff --git a/Documentation/perf_counter/util/color.c b/Documentation/perf_counter/util/color.c
new file mode 100644
index 0000000..a77975d
--- /dev/null
+++ b/Documentation/perf_counter/util/color.c
@@ -0,0 +1,230 @@
+#include "cache.h"
+#include "color.h"
+
+int perf_use_color_default = 0;
+
+static int parse_color(const char *name, int len)
+{
+	static const char * const color_names[] = {
+		"normal", "black", "red", "green", "yellow",
+		"blue", "magenta", "cyan", "white"
+	};
+	char *end;
+	int i;
+	for (i = 0; i < ARRAY_SIZE(color_names); i++) {
+		const char *str = color_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return i - 1;
+	}
+	i = strtol(name, &end, 10);
+	if (end - name == len && i >= -1 && i <= 255)
+		return i;
+	return -2;
+}
+
+static int parse_attr(const char *name, int len)
+{
+	static const int attr_values[] = { 1, 2, 4, 5, 7 };
+	static const char * const attr_names[] = {
+		"bold", "dim", "ul", "blink", "reverse"
+	};
+	int i;
+	for (i = 0; i < ARRAY_SIZE(attr_names); i++) {
+		const char *str = attr_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return attr_values[i];
+	}
+	return -1;
+}
+
+void color_parse(const char *value, const char *var, char *dst)
+{
+	color_parse_mem(value, strlen(value), var, dst);
+}
+
+void color_parse_mem(const char *value, int value_len, const char *var,
+		char *dst)
+{
+	const char *ptr = value;
+	int len = value_len;
+	int attr = -1;
+	int fg = -2;
+	int bg = -2;
+
+	if (!strncasecmp(value, "reset", len)) {
+		strcpy(dst, PERF_COLOR_RESET);
+		return;
+	}
+
+	/* [fg [bg]] [attr] */
+	while (len > 0) {
+		const char *word = ptr;
+		int val, wordlen = 0;
+
+		while (len > 0 && !isspace(word[wordlen])) {
+			wordlen++;
+			len--;
+		}
+
+		ptr = word + wordlen;
+		while (len > 0 && isspace(*ptr)) {
+			ptr++;
+			len--;
+		}
+
+		val = parse_color(word, wordlen);
+		if (val >= -1) {
+			if (fg == -2) {
+				fg = val;
+				continue;
+			}
+			if (bg == -2) {
+				bg = val;
+				continue;
+			}
+			goto bad;
+		}
+		val = parse_attr(word, wordlen);
+		if (val < 0 || attr != -1)
+			goto bad;
+		attr = val;
+	}
+
+	if (attr >= 0 || fg >= 0 || bg >= 0) {
+		int sep = 0;
+
+		*dst++ = '\033';
+		*dst++ = '[';
+		if (attr >= 0) {
+			*dst++ = '0' + attr;
+			sep++;
+		}
+		if (fg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (fg < 8) {
+				*dst++ = '3';
+				*dst++ = '0' + fg;
+			} else {
+				dst += sprintf(dst, "38;5;%d", fg);
+			}
+		}
+		if (bg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (bg < 8) {
+				*dst++ = '4';
+				*dst++ = '0' + bg;
+			} else {
+				dst += sprintf(dst, "48;5;%d", bg);
+			}
+		}
+		*dst++ = 'm';
+	}
+	*dst = 0;
+	return;
+bad:
+	die("bad color value '%.*s' for variable '%s'", value_len, value, var);
+}
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
+{
+	if (value) {
+		if (!strcasecmp(value, "never"))
+			return 0;
+		if (!strcasecmp(value, "always"))
+			return 1;
+		if (!strcasecmp(value, "auto"))
+			goto auto_color;
+	}
+
+	/* Missing or explicit false to turn off colorization */
+	if (!perf_config_bool(var, value))
+		return 0;
+
+	/* any normal truth value defaults to 'auto' */
+ auto_color:
+	if (stdout_is_tty < 0)
+		stdout_is_tty = isatty(1);
+	if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
+		char *term = getenv("TERM");
+		if (term && strcmp(term, "dumb"))
+			return 1;
+	}
+	return 0;
+}
+
+int perf_color_default_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "color.ui")) {
+		perf_use_color_default = perf_config_colorbool(var, value, -1);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
+static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
+		va_list args, const char *trail)
+{
+	int r = 0;
+
+	if (*color)
+		r += fprintf(fp, "%s", color);
+	r += vfprintf(fp, fmt, args);
+	if (*color)
+		r += fprintf(fp, "%s", PERF_COLOR_RESET);
+	if (trail)
+		r += fprintf(fp, "%s", trail);
+	return r;
+}
+
+
+
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, NULL);
+	va_end(args);
+	return r;
+}
+
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, "\n");
+	va_end(args);
+	return r;
+}
+
+/*
+ * This function splits the buffer by newlines and colors the lines individually.
+ *
+ * Returns 0 on success.
+ */
+int color_fwrite_lines(FILE *fp, const char *color,
+		size_t count, const char *buf)
+{
+	if (!*color)
+		return fwrite(buf, count, 1, fp) != 1;
+	while (count) {
+		char *p = memchr(buf, '\n', count);
+		if (p != buf && (fputs(color, fp) < 0 ||
+				fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
+				fputs(PERF_COLOR_RESET, fp) < 0))
+			return -1;
+		if (!p)
+			return 0;
+		if (fputc('\n', fp) < 0)
+			return -1;
+		count -= p + 1 - buf;
+		buf = p + 1;
+	}
+	return 0;
+}
+
+
diff --git a/Documentation/perf_counter/util/color.h b/Documentation/perf_counter/util/color.h
new file mode 100644
index 0000000..5abfd37
--- /dev/null
+++ b/Documentation/perf_counter/util/color.h
@@ -0,0 +1,36 @@
+#ifndef COLOR_H
+#define COLOR_H
+
+/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
+#define COLOR_MAXLEN 24
+
+#define PERF_COLOR_NORMAL	""
+#define PERF_COLOR_RESET	"\033[m"
+#define PERF_COLOR_BOLD		"\033[1m"
+#define PERF_COLOR_RED		"\033[31m"
+#define PERF_COLOR_GREEN	"\033[32m"
+#define PERF_COLOR_YELLOW	"\033[33m"
+#define PERF_COLOR_BLUE		"\033[34m"
+#define PERF_COLOR_MAGENTA	"\033[35m"
+#define PERF_COLOR_CYAN		"\033[36m"
+#define PERF_COLOR_BG_RED	"\033[41m"
+
+/*
+ * This variable stores the value of color.ui
+ */
+extern int perf_use_color_default;
+
+
+/*
+ * Use this instead of perf_default_config if you need the value of color.ui.
+ */
+int perf_color_default_config(const char *var, const char *value, void *cb);
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
+void color_parse(const char *value, const char *var, char *dst);
+void color_parse_mem(const char *value, int len, const char *var, char *dst);
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
+int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
+
+#endif /* COLOR_H */
diff --git a/Documentation/perf_counter/util/environment.c b/Documentation/perf_counter/util/environment.c
index 9b1c819..275b0ee 100644
--- a/Documentation/perf_counter/util/environment.c
+++ b/Documentation/perf_counter/util/environment.c
@@ -6,3 +6,4 @@
 #include "cache.h"
 
 const char *pager_program;
+int pager_use_color = 1;
-- 
cgit v0.10.2


From 13d0ab5ec29852a6925f612830fa9e822669ece6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 15:40:25 +0200
Subject: perf_counter tools: Dont output in color on !tty

Dont emit ASCII color characters if the terminal is not a tty,
such as when perf report gets redirected into a file.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/color.c b/Documentation/perf_counter/util/color.c
index a77975d..9a8c20c 100644
--- a/Documentation/perf_counter/util/color.c
+++ b/Documentation/perf_counter/util/color.c
@@ -1,7 +1,7 @@
 #include "cache.h"
 #include "color.h"
 
-int perf_use_color_default = 0;
+int perf_use_color_default = -1;
 
 static int parse_color(const char *name, int len)
 {
@@ -169,10 +169,20 @@ static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
 {
 	int r = 0;
 
-	if (*color)
+	/*
+	 * Auto-detect:
+	 */
+	if (perf_use_color_default < 0) {
+		if (isatty(1) || pager_in_use())
+			perf_use_color_default = 1;
+		else
+			perf_use_color_default = 0;
+	}
+
+	if (perf_use_color_default && *color)
 		r += fprintf(fp, "%s", color);
 	r += vfprintf(fp, fmt, args);
-	if (*color)
+	if (perf_use_color_default && *color)
 		r += fprintf(fp, "%s", PERF_COLOR_RESET);
 	if (trail)
 		r += fprintf(fp, "%s", trail);
@@ -185,6 +195,7 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
 {
 	va_list args;
 	int r;
+
 	va_start(args, fmt);
 	r = color_vfprintf(fp, color, fmt, args, NULL);
 	va_end(args);
-- 
cgit v0.10.2


From edc52deac624e4641211a325c23da2a73b01a85d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 16:24:37 +0200
Subject: perf report: Bail out if there are unrecognized options/arguments

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7beedc6..389ae25 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1094,10 +1094,16 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	page_size = getpagesize();
 
-	parse_options(argc, argv, options, report_usage, 0);
+	argc = parse_options(argc, argv, options, report_usage, 0);
 
 	setup_sorting();
 
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
 	setup_pager();
 
 	return __cmd_report();
-- 
cgit v0.10.2


From 20c84e959ec11b1803d2b2832eef703d5fbe7f7b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 16:33:00 +0200
Subject: perf stat: Update help text

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index a67d0e3..a340e7b 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -22,6 +22,7 @@ OPTIONS
 <command>...::
 	Any command you can specify in a shell.
 
+
 -e::
 --event=::
                              0:0: cpu-cycles          
@@ -45,6 +46,13 @@ OPTIONS
                              1:4: migrations          
                            rNNN: raw PMU events (eventsel+umask)
 
+-i::
+--inherit::
+        child tasks inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
 -a::
         system-wide collection
 
@@ -54,20 +62,20 @@ OPTIONS
 EXAMPLES
 --------
 
-$ perf stat sleep 1
+$ perf stat -- make -j
 
- Performance counter stats for 'sleep':
+ Performance counter stats for 'make -j':
 
-       0.678356  task clock ticks     (msecs)
-              7  context switches     (events)
-              4  CPU migrations       (events)
-            232  pagefaults           (events)
-        1810403  CPU cycles           (events)
-         946759  instructions         (events)
-          18952  cache references     (events)
-           4885  cache misses         (events)
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
 
- Wall-clock time elapsed:  1001.252894 msecs
+ Wall-clock time elapsed:   719.554352 msecs
 
 SEE ALSO
 --------
-- 
cgit v0.10.2


From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 16:53:44 +0200
Subject: perf_counter: Add fork event

Create a fork event so that we can easily clone the comm and
dso maps without having to generate all those events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 37d5541..380247b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -277,6 +277,14 @@ enum perf_event_type {
 	PERF_EVENT_UNTHROTTLE		= 6,
 
 	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u32				pid, ppid;
+	 * };
+	 */
+	PERF_EVENT_FORK			= 7,
+
+	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
 	 *
@@ -618,6 +626,7 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
 extern void perf_counter_comm(struct task_struct *tsk);
+extern void perf_counter_fork(struct task_struct *tsk);
 
 extern void perf_counter_task_migration(struct task_struct *task, int cpu);
 
@@ -673,6 +682,7 @@ perf_counter_munmap(unsigned long addr, unsigned long len,
 		    unsigned long pgoff, struct file *file)		{ }
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
+static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
 static inline void perf_counter_task_migration(struct task_struct *task,
 					       int cpu)			{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d7a9f..f4466ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1412,12 +1412,12 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
-		} else {
+		} else if (!(clone_flags & CLONE_VM)) {
 			/*
 			 * vfork will do an exec which will call
 			 * set_task_comm()
 			 */
-			perf_counter_comm(p);
+			perf_counter_fork(p);
 		}
 
 		audit_finish_fork(p);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0bb03f1..78c5862 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -40,9 +40,9 @@ static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
 static atomic_t nr_counters __read_mostly;
-static atomic_t nr_mmap_tracking __read_mostly;
-static atomic_t nr_munmap_tracking __read_mostly;
-static atomic_t nr_comm_tracking __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_munmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1447,11 +1447,11 @@ static void free_counter(struct perf_counter *counter)
 
 	atomic_dec(&nr_counters);
 	if (counter->attr.mmap)
-		atomic_dec(&nr_mmap_tracking);
+		atomic_dec(&nr_mmap_counters);
 	if (counter->attr.munmap)
-		atomic_dec(&nr_munmap_tracking);
+		atomic_dec(&nr_munmap_counters);
 	if (counter->attr.comm)
-		atomic_dec(&nr_comm_tracking);
+		atomic_dec(&nr_comm_counters);
 
 	if (counter->destroy)
 		counter->destroy(counter);
@@ -2476,6 +2476,105 @@ static void perf_counter_output(struct perf_counter *counter,
 }
 
 /*
+ * fork tracking
+ */
+
+struct perf_fork_event {
+	struct task_struct	*task;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				ppid;
+	} event;
+};
+
+static void perf_counter_fork_output(struct perf_counter *counter,
+				     struct perf_fork_event *fork_event)
+{
+	struct perf_output_handle handle;
+	int size = fork_event->event.header.size;
+	struct task_struct *task = fork_event->task;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	fork_event->event.pid = perf_counter_pid(counter, task);
+	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+
+	perf_output_put(&handle, fork_event->event);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_fork_match(struct perf_counter *counter)
+{
+	if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
+				  struct perf_fork_event *fork_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_fork_match(counter))
+			perf_counter_fork_output(counter, fork_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_fork_ctx(ctx, fork_event);
+	rcu_read_unlock();
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+	struct perf_fork_event fork_event;
+
+	if (!atomic_read(&nr_comm_counters) &&
+	    !atomic_read(&nr_mmap_counters) &&
+	    !atomic_read(&nr_munmap_counters))
+		return;
+
+	fork_event = (struct perf_fork_event){
+		.task	= task,
+		.event  = {
+			.header = {
+				.type = PERF_EVENT_FORK,
+				.size = sizeof(fork_event.event),
+			},
+		},
+	};
+
+	perf_counter_fork_event(&fork_event);
+}
+
+/*
  * comm tracking
  */
 
@@ -2511,11 +2610,9 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 	perf_output_end(&handle);
 }
 
-static int perf_counter_comm_match(struct perf_counter *counter,
-				   struct perf_comm_event *comm_event)
+static int perf_counter_comm_match(struct perf_counter *counter)
 {
-	if (counter->attr.comm &&
-	    comm_event->event.header.type == PERF_EVENT_COMM)
+	if (counter->attr.comm)
 		return 1;
 
 	return 0;
@@ -2531,7 +2628,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_comm_match(counter, comm_event))
+		if (perf_counter_comm_match(counter))
 			perf_counter_comm_output(counter, comm_event);
 	}
 	rcu_read_unlock();
@@ -2570,7 +2667,7 @@ void perf_counter_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
 
-	if (!atomic_read(&nr_comm_tracking))
+	if (!atomic_read(&nr_comm_counters))
 		return;
 
 	comm_event = (struct perf_comm_event){
@@ -2708,7 +2805,7 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 {
 	struct perf_mmap_event mmap_event;
 
-	if (!atomic_read(&nr_mmap_tracking))
+	if (!atomic_read(&nr_mmap_counters))
 		return;
 
 	mmap_event = (struct perf_mmap_event){
@@ -2729,7 +2826,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 {
 	struct perf_mmap_event mmap_event;
 
-	if (!atomic_read(&nr_munmap_tracking))
+	if (!atomic_read(&nr_munmap_counters))
 		return;
 
 	mmap_event = (struct perf_mmap_event){
@@ -3427,11 +3524,11 @@ done:
 
 	atomic_inc(&nr_counters);
 	if (counter->attr.mmap)
-		atomic_inc(&nr_mmap_tracking);
+		atomic_inc(&nr_mmap_counters);
 	if (counter->attr.munmap)
-		atomic_inc(&nr_munmap_tracking);
+		atomic_inc(&nr_munmap_counters);
 	if (counter->attr.comm)
-		atomic_inc(&nr_comm_tracking);
+		atomic_inc(&nr_comm_counters);
 
 	return counter;
 }
-- 
cgit v0.10.2


From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 17:08:58 +0200
Subject: perf_counter: Remove munmap stuff

In name of keeping it simple, only track mmap events. Userspace
will have to remove old overlapping maps when it encounters them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 380247b..6ca403a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -148,11 +148,10 @@ struct perf_counter_attr {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 				mmap           :  1, /* include mmap data     */
-				munmap         :  1, /* include munmap data   */
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 52;
+				__reserved_1   : 53;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
@@ -246,7 +245,6 @@ enum perf_event_type {
 	 * };
 	 */
 	PERF_EVENT_MMAP			= 1,
-	PERF_EVENT_MUNMAP		= 2,
 
 	/*
 	 * struct {
@@ -622,9 +620,6 @@ extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 			      unsigned long pgoff, struct file *file);
 
-extern void perf_counter_munmap(unsigned long addr, unsigned long len,
-				unsigned long pgoff, struct file *file);
-
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
@@ -677,10 +672,6 @@ static inline void
 perf_counter_mmap(unsigned long addr, unsigned long len,
 		  unsigned long pgoff, struct file *file)		{ }
 
-static inline void
-perf_counter_munmap(unsigned long addr, unsigned long len,
-		    unsigned long pgoff, struct file *file)		{ }
-
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 78c5862..195712e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -41,7 +41,6 @@ static int perf_overcommit __read_mostly = 1;
 
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
-static atomic_t nr_munmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
@@ -1448,8 +1447,6 @@ static void free_counter(struct perf_counter *counter)
 	atomic_dec(&nr_counters);
 	if (counter->attr.mmap)
 		atomic_dec(&nr_mmap_counters);
-	if (counter->attr.munmap)
-		atomic_dec(&nr_munmap_counters);
 	if (counter->attr.comm)
 		atomic_dec(&nr_comm_counters);
 
@@ -2510,7 +2507,7 @@ static void perf_counter_fork_output(struct perf_counter *counter,
 
 static int perf_counter_fork_match(struct perf_counter *counter)
 {
-	if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap)
+	if (counter->attr.comm || counter->attr.mmap)
 		return 1;
 
 	return 0;
@@ -2557,8 +2554,7 @@ void perf_counter_fork(struct task_struct *task)
 	struct perf_fork_event fork_event;
 
 	if (!atomic_read(&nr_comm_counters) &&
-	    !atomic_read(&nr_mmap_counters) &&
-	    !atomic_read(&nr_munmap_counters))
+	    !atomic_read(&nr_mmap_counters))
 		return;
 
 	fork_event = (struct perf_fork_event){
@@ -2722,12 +2718,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 static int perf_counter_mmap_match(struct perf_counter *counter,
 				   struct perf_mmap_event *mmap_event)
 {
-	if (counter->attr.mmap &&
-	    mmap_event->event.header.type == PERF_EVENT_MMAP)
-		return 1;
-
-	if (counter->attr.munmap &&
-	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
+	if (counter->attr.mmap)
 		return 1;
 
 	return 0;
@@ -2821,27 +2812,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 	perf_counter_mmap_event(&mmap_event);
 }
 
-void perf_counter_munmap(unsigned long addr, unsigned long len,
-			 unsigned long pgoff, struct file *file)
-{
-	struct perf_mmap_event mmap_event;
-
-	if (!atomic_read(&nr_munmap_counters))
-		return;
-
-	mmap_event = (struct perf_mmap_event){
-		.file   = file,
-		.event  = {
-			.header = { .type = PERF_EVENT_MUNMAP, },
-			.start  = addr,
-			.len    = len,
-			.pgoff  = pgoff,
-		},
-	};
-
-	perf_counter_mmap_event(&mmap_event);
-}
-
 /*
  * Log sample_period changes so that analyzing tools can re-normalize the
  * event flow.
@@ -3525,8 +3495,6 @@ done:
 	atomic_inc(&nr_counters);
 	if (counter->attr.mmap)
 		atomic_inc(&nr_mmap_counters);
-	if (counter->attr.munmap)
-		atomic_inc(&nr_munmap_counters);
 	if (counter->attr.comm)
 		atomic_inc(&nr_comm_counters);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c1c2cb..6451ce2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1756,12 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 	do {
 		long nrpages = vma_pages(vma);
 
-		if (vma->vm_flags & VM_EXEC) {
-			perf_counter_munmap(vma->vm_start,
-					nrpages << PAGE_SHIFT,
-					vma->vm_pgoff, vma->vm_file);
-		}
-
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
-- 
cgit v0.10.2


From 62fc44536c14b5787531bac7417580fca54c88b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 16:53:49 +0200
Subject: perf_counter tools: Use fork and remove munmap events

Use fork events to clone comm and map data and remove everything
munmap related

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 389ae25..5d19121 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -43,12 +43,6 @@ static int		full_paths;
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-const char *perf_event_names[] = {
-	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
-	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
-	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
-};
-
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
@@ -70,11 +64,17 @@ struct comm_event {
 	char comm[16];
 };
 
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
 typedef union event_union {
 	struct perf_event_header header;
 	struct ip_event ip;
 	struct mmap_event mmap;
 	struct comm_event comm;
+	struct fork_event fork;
 } event_t;
 
 static LIST_HEAD(dsos);
@@ -208,7 +208,31 @@ out_delete:
 	return NULL;
 }
 
-struct thread;
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
 
 struct thread {
 	struct rb_node	 rb_node;
@@ -284,9 +308,39 @@ static struct thread *threads__findnew(pid_t pid)
 
 static void thread__insert_map(struct thread *self, struct map *map)
 {
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
 	list_add_tail(&map->node, &self->maps);
 }
 
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
 static struct map *thread__find_map(struct thread *self, uint64_t ip)
 {
 	struct map *pos;
@@ -784,7 +838,11 @@ static void register_idle_thread(void)
 	}
 }
 
-static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
 
 static int
 process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
@@ -866,9 +924,10 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 	struct thread *thread = threads__findnew(event->mmap.pid);
 	struct map *map = map__new(&event->mmap);
 
-	dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
+		event->mmap.pid,
 		(void *)(long)event->mmap.start,
 		(void *)(long)event->mmap.len,
 		(void *)(long)event->mmap.pgoff,
@@ -906,6 +965,26 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 }
 
 static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
@@ -918,10 +997,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_COMM:
 		return process_comm_event(event, offset, head);
 
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
 	/*
 	 * We dont process them right now but they are fine:
 	 */
-	case PERF_EVENT_MUNMAP:
+
 	case PERF_EVENT_PERIOD:
 	case PERF_EVENT_THROTTLE:
 	case PERF_EVENT_UNTHROTTLE:
@@ -1038,6 +1120,7 @@ more:
 	dprintf("      IP events: %10ld\n", total);
 	dprintf("    mmap events: %10ld\n", total_mmap);
 	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
 	dprintf(" unknown events: %10ld\n", total_unknown);
 
 	if (dump_trace)
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 20e5b12..31c00ba 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -75,8 +75,6 @@ static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
-static int			use_mmap			= 0;
-static int			use_munmap			= 0;
 static int			freq				= 0;
 
 static char			*sym_filter;
@@ -527,19 +525,6 @@ static void mmap_read(struct mmap_data *md)
 		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 			if (event->header.type & PERF_SAMPLE_IP)
 				process_event(event->ip.ip, md->counter);
-		} else {
-			switch (event->header.type) {
-				case PERF_EVENT_MMAP:
-				case PERF_EVENT_MUNMAP:
-					printf("%s: %Lu %Lu %Lu %s\n",
-							event->header.type == PERF_EVENT_MMAP
-							? "mmap" : "munmap",
-							event->mmap.start,
-							event->mmap.len,
-							event->mmap.pgoff,
-							event->mmap.filename);
-					break;
-			}
 		}
 	}
 
@@ -569,8 +554,6 @@ static int __cmd_top(void)
 			attr.config		= event_id[counter];
 			attr.sample_period	= event_count[counter];
 			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr.mmap		= use_mmap;
-			attr.munmap		= use_munmap;
 			attr.freq		= freq;
 
 			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
@@ -670,10 +653,6 @@ static const struct option options[] = {
 		    "only display symbols matchig this pattern"),
 	OPT_BOOLEAN('z', "zero", &group,
 		    "zero history across updates"),
-	OPT_BOOLEAN('M', "use-mmap", &use_mmap,
-		    "track mmap events"),
-	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
-		    "track munmap events"),
 	OPT_INTEGER('F', "freq", &freq,
 		    "profile at this frequency"),
 	OPT_INTEGER('E', "entries", &print_entries,
-- 
cgit v0.10.2


From 2cb7878a3a4341d1faa208de962d66f0817d3e7a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 3 Jun 2009 14:52:24 +0930
Subject: lguest: fix 'unhandled trap 13' with CONFIG_CC_STACKPROTECTOR

We don't set up the canary; let's disable stack protector on boot.c so
we can get into lguest_init, then set it up.  As a side effect,
switch_to_new_gdt() sets up %fs for us properly too.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9e..94e0e54 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
 obj-y		:= i386_head.o boot.o
+CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44..33a93b4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/i387.h>
+#include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 
 /*G:010 Welcome to the Guest!
@@ -1088,13 +1089,21 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
+	/* The stack protector is a weird thing where gcc places a canary
+	 * value on the stack and then checks it on return.  This file is
+	 * compiled with -fno-stack-protector it, so we got this far without
+	 * problems.  The value of the canary is kept at offset 20 from the
+	 * %gs register, so we need to set that up before calling C functions
+	 * in other files. */
+	setup_stack_canary_segment(0);
+	/* We could just call load_stack_canary_segment(), but we might as
+	 * call switch_to_new_gdt() which loads the whole table and sets up
+	 * the per-cpu segment descriptor register %fs as well. */
+	switch_to_new_gdt(0);
+
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
-	/* Load the %fs segment register (the per-cpu segment register) with
-	 * the normal data segment to get through booting. */
-	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
 	/* The Host<->Guest Switcher lives at the top of our address space, and
 	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
 	 * it put the answer in lguest_data.reserve_mem  */
-- 
cgit v0.10.2


From 44fb5511638938a2c37c895abc14df648ffc07e9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 4 Jun 2009 15:34:51 -0400
Subject: Btrfs: Fix oops and use after free during space balancing

The btrfs allocator uses list_for_each to walk the available block
groups when searching for free blocks.  It starts off with a hint
to help find the best block group for a given allocation.

The hint is resolved into a block group, but we don't properly check
to make sure the block group we find isn't in the middle of being
freed due to filesystem shrinking or balancing.  If it is being
freed, the list pointers in it are bogus and can't be trusted.  But,
the code happily goes along and uses them in the list_for_each loop,
leading to all kinds of fun.

The fix used here is to check to make sure the block group we find really
is on the list before we use it.  list_del_init is used when removing
it from the list, so we can do a proper check.

The allocation clustering code has a similar bug where it will trust
the block group in the current free space cluster.  If our allocation
flags have changed (going from single spindle dup to raid1 for example)
because the drives in the FS have changed, we're not allowed to use
the old block group any more.

The fix used here is to check the current cluster against the
current allocation flags.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3e2c7c7..35af933 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2622,7 +2622,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 						       search_start);
 		if (block_group && block_group_bits(block_group, data)) {
 			down_read(&space_info->groups_sem);
-			goto have_block_group;
+			if (list_empty(&block_group->list) ||
+			    block_group->ro) {
+				/*
+				 * someone is removing this block group,
+				 * we can't jump into the have_block_group
+				 * target because our list pointers are not
+				 * valid
+				 */
+				btrfs_put_block_group(block_group);
+				up_read(&space_info->groups_sem);
+			} else
+				goto have_block_group;
 		} else if (block_group) {
 			btrfs_put_block_group(block_group);
 		}
@@ -2656,6 +2667,13 @@ have_block_group:
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
+			if (last_ptr->block_group &&
+			    (last_ptr->block_group->ro ||
+			    !block_group_bits(last_ptr->block_group, data))) {
+				offset = 0;
+				goto refill_cluster;
+			}
+
 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
 						 num_bytes, search_start);
 			if (offset) {
@@ -2681,10 +2699,17 @@ have_block_group:
 
 				last_ptr_loop = 1;
 				search_start = block_group->key.objectid;
+				/*
+				 * we know this block group is properly
+				 * in the list because
+				 * btrfs_remove_block_group, drops the
+				 * cluster before it removes the block
+				 * group from the list
+				 */
 				goto have_block_group;
 			}
 			spin_unlock(&last_ptr->lock);
-
+refill_cluster:
 			/*
 			 * this cluster didn't work out, free it and
 			 * start over
@@ -5968,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_free_cluster *cluster;
 	struct btrfs_key key;
 	int ret;
 
@@ -5979,6 +6005,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	memcpy(&key, &block_group->key, sizeof(key));
 
+	/* make sure this block group isn't part of an allocation cluster */
+	cluster = &root->fs_info->data_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
+	/*
+	 * make sure this block group isn't part of a metadata
+	 * allocation cluster
+	 */
+	cluster = &root->fs_info->meta_alloc_cluster;
+	spin_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&cluster->refill_lock);
+
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
@@ -5988,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&root->fs_info->block_group_cache_lock);
 	btrfs_remove_free_space_cache(block_group);
 	down_write(&block_group->space_info->groups_sem);
-	list_del(&block_group->list);
+	/*
+	 * we must use list_del_init so people can check to see if they
+	 * are still on the list after taking the semaphore
+	 */
+	list_del_init(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
 
 	spin_lock(&block_group->space_info->lock);
-- 
cgit v0.10.2


From 0f5486b5c71a831a713ce356d8d06822e3c7c379 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2009 20:48:04 +0200
Subject: perf_counter: Sleep before refresh using poll in perf top

perf top is refreshed every delay_secs the thread runs in such
loop:

while (sleep(delay_secs)) {
	print_sym_table();
}

At the end of print_sym_table(), poll is used without sleep delay
to check if we have something from stdin.

It means that this check is done only every delay_secs, which can
be higher that 2 secs if the user defined a custom refresh rate.

We can drop sleep() here and directly use poll to wait between
refresh periods, so that the reaction after the user stops perf top
after typing "Enter" is immediate and doesn't suffer from the
delay_secs latency.

Nb: poll doesn't add any overhead that can parasite perf top measures
since it sleeps the entire timeout here.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244141284-7507-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 31c00ba..28cbde4 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -204,7 +204,7 @@ static void print_sym_table(void)
 			list_remove_active_sym(syme);
 	}
 
-	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+	puts(CONSOLE_CLEAR);
 
 	printf(
 "------------------------------------------------------------------------------\n");
@@ -278,23 +278,21 @@ static void print_sym_table(void)
 		color_fprintf(stdout, color, "%4.1f%%", pcnt);
 		printf(" - %016llx : %s\n", sym->start, sym->name);
 	}
-
-	{
-		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-
-		if (poll(&stdin_poll, 1, 0) == 1) {
-			printf("key pressed - exiting.\n");
-			exit(0);
-		}
-	}
 }
 
 static void *display_thread(void *arg)
 {
+	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+	int delay_msecs = delay_secs * 1000;
+
 	printf("PerfTop refresh period: %d seconds\n", delay_secs);
 
-	while (!sleep(delay_secs))
+	do {
 		print_sym_table();
+	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
+
+	printf("key pressed - exiting.\n");
+	exit(0);
 
 	return NULL;
 }
-- 
cgit v0.10.2


From 9ac995457b2a148ed9bb8860e8b7cb869327b102 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 4 Jun 2009 13:54:00 -0300
Subject: perf report: Add -vvv to print the list of threads and its mmaps

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5d19121..1a1028d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -234,6 +234,13 @@ static int map__overlap(struct map *l, struct map *r)
 	return 0;
 }
 
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
@@ -264,6 +271,18 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
 static struct rb_root threads;
 static struct thread *last_match;
 
@@ -355,6 +374,20 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
 /*
  * histogram, sorted on item, collects counts
  */
@@ -1126,6 +1159,9 @@ more:
 	if (dump_trace)
 		return 0;
 
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
 
-- 
cgit v0.10.2


From 172124e220f1854acc99ee394671781b8b5e2120 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 4 Jun 2009 22:34:44 +0200
Subject: Revert "block: implement blkdev_readpages"

This reverts commit db2dbb12dc47a50c7a4c5678f526014063e486f6.

It apparently causes problems with partition table read-ahead
on archs with large page sizes. Until that problem is diagnosed
further, just drop the readpages support on block devices.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a29b4dc..2dfc6cd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -331,12 +331,6 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
-			struct list_head *pages, unsigned nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
-}
-
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1405,7 +1399,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
-	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
-- 
cgit v0.10.2


From 76a0f40fd6eff1bce3b91925cea7587b3399fe80 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2009 22:15:58 +0200
Subject: perf_counter tools: Fix warn_unused_result warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix warnings for return values that we don't care about:

 util/quote.c:222: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result
 util/quote.c:235: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result
 util/quote.c: In function ‘write_name_quotedpfx’:
 util/quote.c:290: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244146558-8635-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
index 7a49fcf..f18c521 100644
--- a/Documentation/perf_counter/util/quote.c
+++ b/Documentation/perf_counter/util/quote.c
@@ -201,8 +201,9 @@ static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
 	} while (0)
 #define EMITBUF(s, l)                           \
 	do {                                        \
+		int __ret;				\
 		if (sb) strbuf_add(sb, (s), (l));       \
-		if (fp) fwrite((s), (l), 1, fp);        \
+		if (fp) __ret = fwrite((s), (l), 1, fp);        \
 		count += (l);                           \
 	} while (0)
 
@@ -287,7 +288,9 @@ extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
 		quote_c_style(name, NULL, fp, 1);
 		fputc('"', fp);
 	} else {
-		fwrite(pfx, pfxlen, 1, fp);
+		int ret;
+
+		ret = fwrite(pfx, pfxlen, 1, fp);
 		fputs(name, fp);
 	}
 	fputc(terminator, fp);
-- 
cgit v0.10.2


From 03f5d8bcf094a5e3b501bd2ae1553656efa8d1be Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 9 Jun 2009 00:17:05 -0400
Subject: ext4: Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle()

Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle(). This
seems to be a relict from some old days and setting disksize in this
function does not make much sense.  Currently it was set only by
ext4_getblk().  Since the parameter has some effect only if create ==
1, it is easy to check by grepping through the sources that the three
callers which end up calling ext4_getblk() with create == 1
(ext4_append, ext4_quota_write, ext4_mkdir) do the right thing and set
disksize themselves.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4311cc8..59657ff 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -323,15 +323,13 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Update the ext4_inode_info i_disksize field */
-#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		0x0004
 	/* Caller is from the delayed allocation writeout path,
 	   so set the magic i_delalloc_reserve_flag after taking the 
 	   inode allocation semaphore for */
-#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0008
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* Call ext4_da_update_reserve_space() after successfully 
 	   allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0010
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
 
 
 /*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d4e99e9..9c35a7b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2779,7 +2779,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
-	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2969,14 +2968,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) {
-		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > EXT4_I(inode)->i_disksize)
-			EXT4_I(inode)->i_disksize = disksize;
-	}
-
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 14c00ff..17ed0d2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -933,11 +933,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
-	loff_t disksize;
-
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
@@ -1003,19 +1000,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext4_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by i_data_sem.  Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext4_get_block() -bzzz
-	*/
-	if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) {
-		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > ei->i_disksize)
-			ei->i_disksize = disksize;
-	}
-	if (err)
+	else 
 		goto cleanup;
 
 	set_buffer_new(bh_result);
@@ -1321,7 +1306,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
-	int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE;
+	int flags = 0;
 
 	J_ASSERT(handle != NULL || create == 0);
 
@@ -2153,9 +2138,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 
 	/*
-	 * Update on-disk size along with block allocation we don't
-	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
-	 * within already allocated block -bzzz
+	 * Update on-disk size along with block allocation.
 	 */
 	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
 	if (disksize > i_size_read(mpd->inode))
-- 
cgit v0.10.2


From b31e15527a9bb71b6a11a425d17ce139a62f5af5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 4 Jun 2009 17:36:36 -0400
Subject: ext4: Change all super.c messages to print the device

This patch changes ext4 super.c to include the device name with all
warning/error messages, by using a new utility function ext4_msg.
It's a rather large patch, but very mechanic. I left debug printks
alone.

This is a straightforward port of a patch which Andi Kleen did for
ext3.

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 59657ff..cc7d5ed 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1401,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_msg(struct super_block *, const char *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
 				const char *, const char *, ...)
 	__attribute__ ((format (printf, 4, 5)));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0a97b1a..c191d0f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -306,7 +306,7 @@ static void ext4_handle_error(struct super_block *sb)
 			jbd2_journal_abort(journal, -EIO);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
-		printk(KERN_CRIT "Remounting filesystem read-only\n");
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext4_commit_super(sb, 1);
@@ -399,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
 	va_list args;
 
-	printk(KERN_CRIT "ext4_abort called.\n");
-
 	va_start(args, fmt);
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
@@ -413,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function,
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
@@ -421,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function,
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
+void ext4_msg (struct super_block * sb, const char *prefix,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
 void ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
@@ -499,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
@@ -510,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -546,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n",
-	       le32_to_cpu(sbi->s_es->s_last_orphan));
+	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+		 le32_to_cpu(sbi->s_es->s_last_orphan));
 
 	printk(KERN_ERR "sb_info orphan list:\n");
 	list_for_each(l, &sbi->s_orphan) {
@@ -678,8 +688,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 static void ext4_destroy_inode(struct inode *inode)
 {
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-		printk("EXT4 Inode %p: orphan list check failed!\n",
-			EXT4_I(inode));
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, EXT4_I(inode));
 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
 				EXT4_I(inode), sizeof(struct ext4_inode_info),
 				true);
@@ -1239,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk(KERN_ERR "EXT4 (no)user_xattr options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1253,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk(KERN_ERR "EXT4 (no)acl options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
 			break;
 #endif
 		case Opt_journal_update:
@@ -1264,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb,
 			   user to specify an existing inode to be the
 			   journal file. */
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot specify journal on remount");
 				return 0;
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					"Cannot specify journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1327,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb,
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
 						!= data_opt) {
-					printk(KERN_ERR
-						"EXT4-fs: cannot change data "
-						"mode on remount\n");
+					ext4_msg(sb, KERN_ERR,
+						"Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
@@ -1359,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot change journaled "
-				       "quota options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR,
+				       "Cannot change journaled "
+				       "quota options when quota turned on");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
 			if (!qname) {
-				printk(KERN_ERR
-					"EXT4-fs: not enough memory for "
-					"storing quotafile name.\n");
+				ext4_msg(sb, KERN_ERR,
+					"Not enough memory for "
+					"storing quotafile name");
 				return 0;
 			}
 			if (sbi->s_qf_names[qtype] &&
 			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				printk(KERN_ERR
-					"EXT4-fs: %s quota file already "
-					"specified.\n", QTYPE2NAME(qtype));
+				ext4_msg(sb, KERN_ERR,
+					"%s quota file already "
+					"specified", QTYPE2NAME(qtype));
 				kfree(qname);
 				return 0;
 			}
 			sbi->s_qf_names[qtype] = qname;
 			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				printk(KERN_ERR
-					"EXT4-fs: quotafile must be on "
-					"filesystem root.\n");
+				ext4_msg(sb, KERN_ERR,
+					"quotafile must be on "
+					"filesystem root");
 				kfree(sbi->s_qf_names[qtype]);
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
@@ -1398,9 +1406,9 @@ set_qf_name:
 clear_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			/*
@@ -1417,9 +1425,9 @@ clear_qf_name:
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			sbi->s_jquota_fmt = qfmt;
@@ -1435,8 +1443,8 @@ set_qf_format:
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_loaded(sb)) {
-				printk(KERN_ERR "EXT4-fs: Cannot change quota "
-					"options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR, "Cannot change quota "
+					"options when quota turned on");
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1447,8 +1455,8 @@ set_qf_format:
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-				"EXT4-fs: quota options not supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				"quota options not supported");
 			break;
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -1456,9 +1464,8 @@ set_qf_format:
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
-			printk(KERN_ERR
-				"EXT4-fs: journaled quota options not "
-				"supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				"journaled quota options not supported");
 			break;
 		case Opt_noquota:
 			break;
@@ -1483,8 +1490,9 @@ set_qf_format:
 			break;
 		case Opt_resize:
 			if (!is_remount) {
-				printk("EXT4-fs: resize option only available "
-					"for remount\n");
+				ext4_msg(sb, KERN_ERR,
+					"resize option only available "
+					"for remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option) != 0)
@@ -1526,8 +1534,9 @@ set_qf_format:
 			if (option < 0 || option > (1 << 30))
 				return 0;
 			if (!is_power_of_2(option)) {
-				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-				       " must be a power of 2\n");
+				ext4_msg(sb, KERN_ERR,
+					 "EXT4-fs: inode_readahead_blks"
+					 " must be a power of 2");
 				return 0;
 			}
 			sbi->s_inode_readahead_blks = option;
@@ -1554,9 +1563,9 @@ set_qf_format:
 				set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		default:
-			printk(KERN_ERR
-			       "EXT4-fs: Unrecognized mount option \"%s\" "
-			       "or missing value\n", p);
+			ext4_msg(sb, KERN_ERR,
+			       "Unrecognized mount option \"%s\" "
+			       "or missing value", p);
 			return 0;
 		}
 	}
@@ -1574,21 +1583,21 @@ set_qf_format:
 				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
 		    (sbi->s_qf_names[GRPQUOTA] &&
 				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-			printk(KERN_ERR "EXT4-fs: old and new quota "
-					"format mixing.\n");
+			ext4_msg(sb, KERN_ERR, "old and new quota "
+					"format mixing");
 			return 0;
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
-					"not specified.\n");
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+					"not specified");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
 					"specified with no journaling "
-					"enabled.\n");
+					"enabled");
 			return 0;
 		}
 	}
@@ -1603,31 +1612,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
-		       "forcing read-only mode\n");
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
-		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: mounting fs with errors, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: maximal mount count reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: checktime reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
 	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
@@ -1649,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			sbi->s_mount_opt);
 
 	if (EXT4_SB(sb)->s_journal) {
-		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+		ext4_msg(sb, KERN_INFO, "%s journal on %s",
+		       EXT4_SB(sb)->s_journal->j_inode ? "internal" :
 		       "external", EXT4_SB(sb)->s_journal->j_devname);
 	} else {
-		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+		ext4_msg(sb, KERN_INFO, "no journal");
 	}
 	return res;
 }
@@ -1688,8 +1697,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 			memset(sbi->s_flex_groups, 0, size);
 	}
 	if (sbi->s_flex_groups == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory for "
-				"%u flex groups\n", flex_group_count);
+		ext4_msg(sb, KERN_ERR, "not enough memory for "
+				"%u flex groups", flex_group_count);
 		goto failed;
 	}
 
@@ -1775,32 +1784,32 @@ static int ext4_check_descriptors(struct super_block *sb)
 
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Block bitmap for group %u not in group "
-			       "(block %llu)!\n", i, block_bitmap);
+			       "(block %llu)!", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode bitmap for group %u not in group "
-			       "(block %llu)!\n", i, inode_bitmap);
+			       "(block %llu)!", i, inode_bitmap);
 			return 0;
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode table for group %u not in group "
-			       "(block %llu)!\n", i, inode_table);
+			       "(block %llu)!", i, inode_table);
 			return 0;
 		}
 		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %u failed (%u!=%u)\n",
-			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
-			       gdp)), le16_to_cpu(gdp->bg_checksum));
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
+				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+				     gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
 				ext4_unlock_group(sb, i);
 				return 0;
@@ -1847,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (bdev_read_only(sb->s_bdev)) {
-		printk(KERN_ERR "EXT4-fs: write access "
-			"unavailable, skipping orphan cleanup.\n");
+		ext4_msg(sb, KERN_ERR, "write access "
+			"unavailable, skipping orphan cleanup");
 		return;
 	}
 
@@ -1862,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (s_flags & MS_RDONLY) {
-		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
 		sb->s_flags &= ~MS_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
@@ -1874,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
 			if (ret < 0)
-				printk(KERN_ERR
-					"EXT4-fs: Cannot turn on journaled "
-					"quota: error %d\n", ret);
+				ext4_msg(sb, KERN_ERR,
+					"Cannot turn on journaled "
+					"quota: error %d", ret);
 		}
 	}
 #endif
@@ -1893,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		vfs_dq_init(inode);
 		if (inode->i_nlink) {
-			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %lld bytes\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: truncating inode %lu to %lld bytes",
 				__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
 		} else {
-			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %lu\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: deleting unreferenced inode %lu",
 				__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
@@ -1914,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
 	if (nr_orphans)
-		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
-		       sb->s_id, PLURAL(nr_orphans));
+		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+		       PLURAL(nr_orphans));
 	if (nr_truncates)
-		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
-		       sb->s_id, PLURAL(nr_truncates));
+		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -2307,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
-		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
 		goto out_fail;
 	}
 
@@ -2323,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logical_sb_block))) {
-		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
 		goto out_fail;
 	}
 	/*
@@ -2393,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+		       "feature flags set on rev 0 fs, "
+		       "running e2fsck is recommended");
 
 	/*
 	 * Check feature flags regardless of the revision level, since we
@@ -2404,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
 			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount RDWR because of "
+			"unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
 			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
@@ -2427,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
-			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LBD.\n", sb->s_id);
+					"without CONFIG_LBD");
 			goto failed_mount;
 		}
 	}
@@ -2437,16 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR
-		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
-		       blocksize, sb->s_id);
+		ext4_msg(sb, KERN_ERR,
+		       "Unsupported filesystem blocksize %d", blocksize);
 		goto failed_mount;
 	}
 
 	if (sb->s_blocksize != blocksize) {
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+			ext4_msg(sb, KERN_ERR, "bad block size %d",
 					blocksize);
 			goto failed_mount;
 		}
@@ -2456,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
 		if (!bh) {
-			printk(KERN_ERR
-			       "EXT4-fs: Can't read superblock on 2nd try.\n");
+			ext4_msg(sb, KERN_ERR,
+			       "Can't read superblock on 2nd try");
 			goto failed_mount;
 		}
 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			printk(KERN_ERR
-			       "EXT4-fs: Magic mismatch, very weird !\n");
+			ext4_msg(sb, KERN_ERR,
+			       "Magic mismatch, very weird!");
 			goto failed_mount;
 		}
 	}
@@ -2482,8 +2491,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported inode size: %d\n",
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported inode size: %d",
 			       sbi->s_inode_size);
 			goto failed_mount;
 		}
@@ -2496,8 +2505,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
 		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
 		    !is_power_of_2(sbi->s_desc_size)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported descriptor size %lu\n",
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported descriptor size %lu",
 			       sbi->s_desc_size);
 			goto failed_mount;
 		}
@@ -2537,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
-		printk(KERN_ERR
-		       "EXT4-fs: #blocks per group too big: %lu\n",
+		ext4_msg(sb, KERN_ERR,
+		       "#blocks per group too big: %lu",
 		       sbi->s_blocks_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_inodes_per_group > blocksize * 8) {
-		printk(KERN_ERR
-		       "EXT4-fs: #inodes per group too big: %lu\n",
+		ext4_msg(sb, KERN_ERR,
+		       "#inodes per group too big: %lu",
 		       sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
 
 	if (ext4_blocks_count(es) >
 		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
-			" too large to mount safely\n", sb->s_id);
+		ext4_msg(sb, KERN_ERR, "filesystem"
+			" too large to mount safely");
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
-					"enabled\n");
+			ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
 		goto failed_mount;
 	}
 
@@ -2565,8 +2573,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/* check blocks count against device size */
 	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
 	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
-		       "exceeds size of device (%llu blocks)\n",
+		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)",
 		       ext4_blocks_count(es), blocks_count);
 		goto failed_mount;
 	}
@@ -2576,10 +2584,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * of the filesystem.
 	 */
 	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
-		       "block %u is beyond end of filesystem (%llu)\n",
-		       le32_to_cpu(es->s_first_data_block),
-		       ext4_blocks_count(es));
+                ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+			 "block %u is beyond end of filesystem (%llu)",
+			 le32_to_cpu(es->s_first_data_block),
+			 ext4_blocks_count(es));
 		goto failed_mount;
 	}
 	blocks_count = (ext4_blocks_count(es) -
@@ -2587,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
 	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
-		printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
 		       "(block count %llu, first data block %u, "
-		       "blocks per group %lu)\n", sbi->s_groups_count,
+		       "blocks per group %lu)", sbi->s_groups_count,
 		       ext4_blocks_count(es),
 		       le32_to_cpu(es->s_first_data_block),
 		       EXT4_BLOCKS_PER_GROUP(sb));
@@ -2601,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory\n");
+		ext4_msg(sb, KERN_ERR, "not enough memory");
 		goto failed_mount;
 	}
 
@@ -2616,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		block = descriptor_loc(sb, logical_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
 		if (!sbi->s_group_desc[i]) {
-			printk(KERN_ERR "EXT4-fs: "
-			       "can't read group descriptor %d\n", i);
+			ext4_msg(sb, KERN_ERR,
+			       "can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
 	}
 	if (!ext4_check_descriptors(sb)) {
-		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 		goto failed_mount2;
 	}
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		if (!ext4_fill_flex_info(sb)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unable to initialize "
-			       "flex_bg meta info!\n");
+			ext4_msg(sb, KERN_ERR,
+			       "unable to initialize "
+			       "flex_bg meta info!");
 			goto failed_mount2;
 		}
 
@@ -2652,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
 	}
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -2692,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount3;
 		if (!(sb->s_flags & MS_RDONLY) &&
 		    EXT4_SB(sb)->s_journal->j_failed_commit) {
-			printk(KERN_CRIT "EXT4-fs error (device %s): "
+			ext4_msg(sb, KERN_CRIT, "error: "
 			       "ext4_fill_super: Journal transaction "
-			       "%u is corrupt\n", sb->s_id,
+			       "%u is corrupt",
 			       EXT4_SB(sb)->s_journal->j_failed_commit);
 			if (test_opt(sb, ERRORS_RO)) {
-				printk(KERN_CRIT
-				       "Mounting filesystem read-only\n");
+				ext4_msg(sb, KERN_CRIT,
+				       "Mounting filesystem read-only");
 				sb->s_flags |= MS_RDONLY;
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2712,8 +2720,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
-		printk(KERN_ERR "EXT4-fs: required journal recovery "
-		       "suppressed and not mounted read-only\n");
+		ext4_msg(sb, KERN_ERR, "required journal recovery "
+		       "suppressed and not mounted read-only");
 		goto failed_mount4;
 	} else {
 		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2726,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext4_blocks_count(es) > 0xffffffffULL &&
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
-		printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
+		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
 		goto failed_mount4;
 	}
 
@@ -2764,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	case EXT4_MOUNT_WRITEBACK_DATA:
 		if (!jbd2_journal_check_available_features
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-			printk(KERN_ERR "EXT4-fs: Journal does not support "
-			       "requested data journaling mode\n");
+			ext4_msg(sb, KERN_ERR, "Journal does not support "
+			       "requested data journaling mode");
 			goto failed_mount4;
 		}
 	default:
@@ -2777,8 +2785,8 @@ no_journal:
 
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-			printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
-				"its supported only with writeback mode\n");
+			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
 	}
@@ -2789,18 +2797,18 @@ no_journal:
 
 	root = ext4_iget(sb, EXT4_ROOT_INO);
 	if (IS_ERR(root)) {
-		printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+		ext4_msg(sb, KERN_ERR, "get root inode failed");
 		ret = PTR_ERR(root);
 		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
-		printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
 		goto failed_mount4;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
-		printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
+		ext4_msg(sb, KERN_ERR, "get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
 		goto failed_mount4;
@@ -2829,29 +2837,29 @@ no_journal:
 							sbi->s_inode_size) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 						       EXT4_GOOD_OLD_INODE_SIZE;
-		printk(KERN_INFO "EXT4-fs: required extra inode space not"
-			"available.\n");
+		ext4_msg(sb, KERN_INFO, "required extra inode space not"
+			 "available");
 	}
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-				"requested data journaling mode\n");
+		ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
+			 "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	} else if (test_opt(sb, DELALLOC))
-		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+		ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: failed to initialize system "
-		       "zone (%d)\n", err);
+		ext4_msg(sb, KERN_ERR, "failed to initialize system "
+			 "zone (%d)\n", err);
 		goto failed_mount4;
 	}
 
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-		       err);
+		ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+			 err);
 		goto failed_mount4;
 	}
 
@@ -2869,7 +2877,7 @@ no_journal:
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
 	if (needs_recovery) {
-		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
+		ext4_msg(sb, KERN_INFO, "recovery complete");
 		ext4_mark_recovery_complete(sb, es);
 	}
 	if (EXT4_SB(sb)->s_journal) {
@@ -2882,20 +2890,18 @@ no_journal:
 	} else
 		descr = "out journal";
 
-	printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
-	       sb->s_id, descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
 
 	lock_kernel();
 	return 0;
 
 cantfind_ext4:
 	if (!silent)
-		printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
 	goto failed_mount;
 
 failed_mount4:
-	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	ext4_msg(sb, KERN_ERR, "mount failed");
 	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
@@ -2973,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 
 	journal_inode = ext4_iget(sb, journal_inum);
 	if (IS_ERR(journal_inode)) {
-		printk(KERN_ERR "EXT4-fs: no journal found.\n");
+		ext4_msg(sb, KERN_ERR, "no journal found");
 		return NULL;
 	}
 	if (!journal_inode->i_nlink) {
 		make_bad_inode(journal_inode);
 		iput(journal_inode);
-		printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
 		return NULL;
 	}
 
 	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
-		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+		ext4_msg(sb, KERN_ERR, "invalid journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
 
 	journal = jbd2_journal_init_inode(journal_inode);
 	if (!journal) {
-		printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
@@ -3017,13 +3023,13 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 
 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 
-	bdev = ext4_blkdev_get(j_dev);
+	bdev = ext4_blkdev_get(j_dev, sb);
 	if (bdev == NULL)
 		return NULL;
 
 	if (bd_claim(bdev, sb)) {
-		printk(KERN_ERR
-			"EXT4-fs: failed to claim external journal device.\n");
+		ext4_msg(sb, KERN_ERR,
+			"failed to claim external journal device");
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
@@ -3031,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	blocksize = sb->s_blocksize;
 	hblock = bdev_hardsect_size(bdev);
 	if (blocksize < hblock) {
-		printk(KERN_ERR
-			"EXT4-fs: blocksize too small for journal device.\n");
+		ext4_msg(sb, KERN_ERR,
+			"blocksize too small for journal device");
 		goto out_bdev;
 	}
 
@@ -3040,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
 	set_blocksize(bdev, blocksize);
 	if (!(bh = __bread(bdev, sb_block, blocksize))) {
-		printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
-		       "external journal\n");
+		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+		       "external journal");
 		goto out_bdev;
 	}
 
@@ -3049,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
 	    !(le32_to_cpu(es->s_feature_incompat) &
 	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-		printk(KERN_ERR "EXT4-fs: external journal has "
-					"bad superblock\n");
+		ext4_msg(sb, KERN_ERR, "external journal has "
+					"bad superblock");
 		brelse(bh);
 		goto out_bdev;
 	}
 
 	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-		printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
 		brelse(bh);
 		goto out_bdev;
 	}
@@ -3068,19 +3074,19 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
 					start, len, blocksize);
 	if (!journal) {
-		printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+		ext4_msg(sb, KERN_ERR, "failed to create device journal");
 		goto out_bdev;
 	}
 	journal->j_private = sb;
 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
 	wait_on_buffer(journal->j_sb_buffer);
 	if (!buffer_uptodate(journal->j_sb_buffer)) {
-		printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
 		goto out_journal;
 	}
 	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-		printk(KERN_ERR "EXT4-fs: External journal has more than one "
-					"user (unsupported) - %d\n",
+		ext4_msg(sb, KERN_ERR, "External journal has more than one "
+					"user (unsupported) - %d",
 			be32_to_cpu(journal->j_superblock->s_nr_users));
 		goto out_journal;
 	}
@@ -3109,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb,
 
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
-			"numbers have changed\n");
+		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
 		journal_dev = new_decode_dev(journal_devnum);
 	} else
 		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3124,21 +3130,21 @@ static int ext4_load_journal(struct super_block *sb,
 	 */
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		if (sb->s_flags & MS_RDONLY) {
-			printk(KERN_INFO "EXT4-fs: INFO: recovery "
-					"required on readonly filesystem.\n");
+			ext4_msg(sb, KERN_INFO, "INFO: recovery "
+					"required on readonly filesystem");
 			if (really_read_only) {
-				printk(KERN_ERR "EXT4-fs: write access "
-					"unavailable, cannot proceed.\n");
+				ext4_msg(sb, KERN_ERR, "write access "
+					"unavailable, cannot proceed");
 				return -EROFS;
 			}
-			printk(KERN_INFO "EXT4-fs: write access will "
-			       "be enabled during recovery.\n");
+			ext4_msg(sb, KERN_INFO, "write access will "
+			       "be enabled during recovery");
 		}
 	}
 
 	if (journal_inum && journal_dev) {
-		printk(KERN_ERR "EXT4-fs: filesystem has both journal "
-		       "and inode journals!\n");
+		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+		       "and inode journals!");
 		return -EINVAL;
 	}
 
@@ -3151,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb,
 	}
 
 	if (journal->j_flags & JBD2_BARRIER)
-		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+		ext4_msg(sb, KERN_INFO, "barriers enabled");
 	else
-		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+		ext4_msg(sb, KERN_INFO, "barriers disabled");
 
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = jbd2_journal_update_format(journal);
 		if (err)  {
-			printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+			ext4_msg(sb, KERN_ERR, "error updating journal");
 			jbd2_journal_destroy(journal);
 			return err;
 		}
@@ -3170,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb,
 		err = jbd2_journal_load(journal);
 
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+		ext4_msg(sb, KERN_ERR, "error loading journal");
 		jbd2_journal_destroy(journal);
 		return err;
 	}
@@ -3206,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT4-fs: previous I/O error to "
-		       "superblock detected for %s.\n", sb->s_id);
+		ext4_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
@@ -3230,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 
 		error = buffer_write_io_error(sbh);
 		if (error) {
-			printk(KERN_ERR "EXT4-fs: I/O error while writing "
-			       "superblock for %s.\n", sb->s_id);
+			ext4_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
 			clear_buffer_write_io_error(sbh);
 			set_buffer_uptodate(sbh);
 		}
@@ -3478,9 +3484,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+				ext4_msg(sb, KERN_WARNING, "couldn't "
 				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n", sb->s_id,
+				       "optional features (%x)",
 				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
 					~EXT4_FEATURE_RO_COMPAT_SUPP));
 				err = -EROFS;
@@ -3496,9 +3502,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 					ext4_get_group_desc(sb, g, NULL);
 
 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
-					printk(KERN_ERR
-	       "EXT4-fs: ext4_remount: "
-		"Checksum for group %u failed (%u!=%u)\n",
+					ext4_msg(sb, KERN_ERR,
+	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
 					err = -EINVAL;
@@ -3512,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * require a full umount/remount for now.
 			 */
 			if (es->s_last_orphan) {
-				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+				ext4_msg(sb, KERN_WARNING, "Couldn't "
 				       "remount RDWR because of unprocessed "
 				       "orphan inode list.  Please "
-				       "umount/remount instead.\n",
-				       sb->s_id);
+				       "umount/remount instead");
 				err = -EINVAL;
 				goto restore_opts;
 			}
@@ -3772,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	if (EXT4_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not in fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			printk(KERN_WARNING
-				"EXT4-fs: Quota file not on filesystem root. "
-				"Journaled quota will not work.\n");
+			ext4_msg(sb, KERN_WARNING,
+				"Quota file not on filesystem root. "
+				"Journaled quota will not work");
 	}
 
 	/*
@@ -3857,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	handle_t *handle = journal_current_handle();
 
 	if (EXT4_SB(sb)->s_journal && !handle) {
-		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
-			" cancelled because transaction is not started.\n",
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
@@ -3930,10 +3934,10 @@ static struct file_system_type ext4_fs_type = {
 static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
 			  const char *dev_name, void *data,struct vfsmount *mnt)
 {
-	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
-	       "to mount using ext4\n");
-	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
-	       "will go away by 2.6.31\n");
+	printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
+	       "to mount using ext4\n", dev_name);
+	printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
+	       "will go away by 2.6.31\n", dev_name);
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-- 
cgit v0.10.2


From 04288f42033607099cebf5ca15ce8dcec3a9688b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Thu, 4 Jun 2009 13:53:10 -0400
Subject: integrity: ima audit dentry_open failure

Until we start appraising measurements, the ima_path_check()
return code should always be 0.

- Update the ima_path_check() return code comment
- Instead of the pr_info, audit the dentry_open failure

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index a2eb233..6f61187 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -149,8 +149,8 @@ static void ima_update_counts(struct ima_iint_cache *iint, int mask)
  *	- Opening a file for read when already open for write,
  * 	  could result in a file measurement error.
  *
- * Return 0 on success, an error code on failure.
- * (Based on the results of appraise_measurement().)
+ * Always return 0 and audit dentry_open failures.
+ * (Return code will be based upon measurement appraisal.)
  */
 int ima_path_check(struct path *path, int mask, int update_counts)
 {
@@ -189,8 +189,13 @@ int ima_path_check(struct path *path, int mask, int update_counts)
 		file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE,
 				   current_cred());
 		if (IS_ERR(file)) {
-			pr_info("%s dentry_open failed\n", dentry->d_name.name);
-			rc = PTR_ERR(file);
+			int audit_info = 0;
+
+			integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
+					    dentry->d_name.name,
+					    "add_measurement",
+					    "dentry_open failed",
+					    1, audit_info);
 			file = NULL;
 			goto out;
 		}
-- 
cgit v0.10.2


From 730c586ad5228c339949b2eb4e72b80ae167abc4 Mon Sep 17 00:00:00 2001
From: Salman Qazi <sqazi@google.com>
Date: Thu, 4 Jun 2009 15:20:39 -0700
Subject: drivers/char/mem.c: avoid OOM lockup during large reads from
 /dev/zero

While running 20 parallel instances of dd as follows:

  #!/bin/bash
  for i in `seq 1 20`; do
           dd if=/dev/zero of=/export/hda3/dd_$i bs=1073741824 count=1 &
  done
  wait

on a 16G machine, we noticed that rather than just killing the processes,
the entire kernel went down.  Stracing dd reveals that it first does an
mmap2, which makes 1GB worth of zero page mappings.  Then it performs a
read on those pages from /dev/zero, and finally it performs a write.

The machine died during the reads.  Looking at the code, it was noticed
that /dev/zero's read operation had been changed by
557ed1fa2620dc119adb86b34c614e152a629a80 ("remove ZERO_PAGE") from giving
zero page mappings to actually zeroing the page.

The zeroing of the pages causes physical pages to be allocated to the
process.  But, when the process exhausts all the memory that it can, the
kernel cannot kill it, as it is still in the kernel mode allocating more
memory.  Consequently, the kernel eventually crashes.

To fix this, I propose that when a fatal signal is pending during
/dev/zero read operation, we simply return and let the user process die.

Signed-off-by: Salman Qazi <sqazi@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Modified error return and comment trivially.  - Linus]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 8f05c38..65e12bc 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -694,6 +694,9 @@ static ssize_t read_zero(struct file * file, char __user * buf,
 		written += chunk - unwritten;
 		if (unwritten)
 			break;
+		/* Consider changing this to just 'signal_pending()' with lots of testing */
+		if (fatal_signal_pending(current))
+			return written ? written : -EINTR;
 		buf += chunk;
 		count -= chunk;
 		cond_resched();
-- 
cgit v0.10.2


From 087eb437051b3de817720f9c80c440fc9e7dcce8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Jun 2009 16:29:07 -0700
Subject: ptrace: tracehook_report_clone: fix false positives

The "trace || CLONE_PTRACE" check in tracehook_report_clone() is not right,

- If the untraced task does clone(CLONE_PTRACE) the new child is not traced,
  we must not queue SIGSTOP.

- If we forked the traced task, but the tracer exits and untraces both the
  forking task and the new child (after copy_process() drops tasklist_lock),
  we should not queue SIGSTOP too.

Change the code to check task_ptrace() != 0 instead. This is still racy, but
the race is harmless.

We can race with another tracer attaching to this child, or the tracer can
exit and detach in parallel. But giwen that we didn't do wake_up_new_task()
yet, the child must have the pending SIGSTOP anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index c7aa154..eb96603 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -259,14 +259,12 @@ static inline void tracehook_finish_clone(struct task_struct *child,
 
 /**
  * tracehook_report_clone - in parent, new child is about to start running
- * @trace:		return value from tracehook_prepare_clone()
  * @regs:		parent's user register state
  * @clone_flags:	flags from parent's system call
  * @pid:		new child's PID in the parent's namespace
  * @child:		new child task
  *
- * Called after a child is set up, but before it has been started
- * running.  @trace is the value returned by tracehook_prepare_clone().
+ * Called after a child is set up, but before it has been started running.
  * This is not a good place to block, because the child has not started
  * yet.  Suspend the child here if desired, and then block in
  * tracehook_report_clone_complete().  This must prevent the child from
@@ -276,13 +274,14 @@ static inline void tracehook_finish_clone(struct task_struct *child,
  *
  * Called with no locks held, but the child cannot run until this returns.
  */
-static inline void tracehook_report_clone(int trace, struct pt_regs *regs,
+static inline void tracehook_report_clone(struct pt_regs *regs,
 					  unsigned long clone_flags,
 					  pid_t pid, struct task_struct *child)
 {
-	if (unlikely(trace) || unlikely(clone_flags & CLONE_PTRACE)) {
+	if (unlikely(task_ptrace(child))) {
 		/*
-		 * The child starts up with an immediate SIGSTOP.
+		 * It doesn't matter who attached/attaching to this
+		 * task, the pending SIGSTOP is right in any case.
 		 */
 		sigaddset(&child->pending.signal, SIGSTOP);
 		set_tsk_thread_flag(child, TIF_SIGPENDING);
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd..875ffbd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags,
 		}
 
 		audit_finish_fork(p);
-		tracehook_report_clone(trace, regs, clone_flags, nr, p);
+		tracehook_report_clone(regs, clone_flags, nr, p);
 
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
-- 
cgit v0.10.2


From 08f67461c609ad96bf26732b590569e02e322019 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 4 Jun 2009 16:29:08 -0700
Subject: kbuild: fix detection of CONFIG_FRAME_WARN=0

The checking of CONFIG_FRAME_WARN in the top level Makefile forgot to
actually derefence the variable thus leading to an always true check.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Makefile b/Makefile
index 610d1c3..1065154 100644
--- a/Makefile
+++ b/Makefile
@@ -533,7 +533,7 @@ endif
 
 include $(srctree)/arch/$(SRCARCH)/Makefile
 
-ifneq (CONFIG_FRAME_WARN,0)
+ifneq ($(CONFIG_FRAME_WARN),0)
 KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
 endif
 
-- 
cgit v0.10.2


From edaba2c5334492f82d39ec35637c6dea5176a977 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Jun 2009 16:29:09 -0700
Subject: ptrace: revert "ptrace_detach: the wrong wakeup breaks the
 ERESTARTxxx logic"

Commit 95a3540da9c81a5987be810e1d9a83640a366bd5 ("ptrace_detach: the wrong
wakeup breaks the ERESTARTxxx logic") removed the "extra"
wake_up_process() from ptrace_detach(), but as Jan pointed out this breaks
the compatibility.

I believe the changelog is right and this wake_up() is wrong in many
ways, but GDB assumes that ptrace(PTRACE_DETACH, child, 0, 0) always
wakes up the tracee.

Despite the fact this breaks SIGNAL_STOP_STOPPED/group_stop_count logic,
and despite the fact this wake_up_process() can break another
assumption: PTRACE_DETACH with SIGSTOP should leave the tracee in
TASK_STOPPED case.  Because the untraced child can dequeue SIGSTOP and
call do_signal_stop() before ptrace_detach() calls wake_up_process().

Revert this change for now.  We need some fixes even if we we want to keep
the current behaviour, but these fixes are not for 2.6.30.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0692ab5..42c3178 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -304,6 +304,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	if (child->ptrace) {
 		child->exit_code = data;
 		dead = __ptrace_detach(current, child);
+		if (!child->exit_state)
+			wake_up_process(child);
 	}
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v0.10.2


From 1938a150c25bf7c2c47182e753a1038945b70b0e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 5 Jun 2009 01:00:26 -0400
Subject: ext4: Avoid leaking blocks after a block allocation failure

We should add inode to the orphan list in the same transaction
as block allocation.  This ensures that if we crash after a failed
block allocation and before we do a vmtruncate we don't leak block
(ie block marked as used in bitmap but not claimed by the inode).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
CC:  Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17ed0d2..8d21588 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1459,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	int ret, needed_blocks;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
@@ -1470,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		   "dev %s ino %lu pos %llu len %u flags %u",
 		   inode->i_sb->s_id, inode->i_ino,
 		   (unsigned long long) pos, len, flags);
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
  	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1503,15 +1508,30 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		ext4_journal_stop(handle);
 		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before
+		 * truncate finishes
 		 */
 		if (pos + len > inode->i_size)
+			ext4_orphan_add(handle, inode);
+
+		ext4_journal_stop(handle);
+		if (pos + len > inode->i_size) {
 			vmtruncate(inode, inode->i_size);
+			/* 
+			 * If vmtruncate failed early the inode might
+			 * still be on the orphan list; we need to
+			 * make sure the inode is removed from the
+			 * orphan list in that case.
+			 */
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+		}
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-- 
cgit v0.10.2


From f8514083cd61daef12fba5ef883ad9352c450428 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 5 Jun 2009 00:56:49 -0400
Subject: ext4: truncate the file properly if we fail to copy data from
 userspace

In generic_perform_write if we fail to copy the user data we don't
update the inode->i_size.  We should truncate the file in the above
case so that we don't have blocks allocated outside inode->i_size.  Add
the inode to orphan list in the same transaction as block allocation
This ensures that if we crash in between the recovery would do the
truncate.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
CC:  Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d21588..2c10d34 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1549,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 	return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
+static int ext4_generic_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	int i_size_changed = 0;
+	struct inode *inode = mapping->host;
+	handle_t *handle = ext4_journal_current_handle();
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		i_size_changed = 1;
+	}
+
+	if (pos + copied >  EXT4_I(inode)->i_disksize) {
+		/* We need to mark inode dirty even if
+		 * new_i_size is less that inode->i_size
+		 * bu greater than i_disksize.(hint delalloc)
+		 */
+		ext4_update_i_disksize(inode, (pos + copied));
+		i_size_changed = 1;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		ext4_mark_inode_dirty(handle, inode);
+
+	return copied;
+}
+
 /*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
@@ -1572,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file,
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT4_I(inode)->i_disksize) {
-			ext4_update_i_disksize(inode, new_i_size);
-			/* We need to mark inode dirty even if
-			 * new_i_size is less that inode->i_size
-			 * bu greater than i_disksize.(hint delalloc)
-			 */
-			ext4_mark_inode_dirty(handle, inode);
-		}
-
-		ret2 = generic_write_end(file, mapping, pos, len, copied,
+		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
+		if (pos + len > inode->i_size)
+			/* if we have allocated more blocks and copied
+			 * less. We will have blocks allocated outside
+			 * inode->i_size. So truncate them
+			 */
+			ext4_orphan_add(handle, inode);
 		if (ret2 < 0)
 			ret = ret2;
 	}
@@ -1594,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+
 	return ret ? ret : copied;
 }
 
@@ -1605,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file,
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
-	loff_t new_i_size;
 
 	trace_mark(ext4_writeback_write_end,
 		   "dev %s ino %lu pos %llu len %u copied %u",
 		   inode->i_sb->s_id, inode->i_ino,
 		   (unsigned long long) pos, len, copied);
-	new_i_size = pos + copied;
-	if (new_i_size > EXT4_I(inode)->i_disksize) {
-		ext4_update_i_disksize(inode, new_i_size);
-		/* We need to mark inode dirty even if
-		 * new_i_size is less that inode->i_size
-		 * bu greater than i_disksize.(hint delalloc)
-		 */
-		ext4_mark_inode_dirty(handle, inode);
-	}
-
-	ret2 = generic_write_end(file, mapping, pos, len, copied,
+	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
+	if (pos + len > inode->i_size)
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+
 	if (ret2 < 0)
 		ret = ret2;
 
@@ -1631,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
 	return ret ? ret : copied;
 }
 
@@ -1675,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file,
 	}
 
 	unlock_page(page);
+	page_cache_release(page);
+	if (pos + len > inode->i_size)
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	page_cache_release(page);
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
 
 	return ret ? ret : copied;
 }
-- 
cgit v0.10.2


From 6dc5f2a41759987e35e757ef00192e7b424563bb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 5 Jun 2009 12:36:28 +1000
Subject: perf_counter: Fix lockup with interrupting counters

Commit 8e3747c1 ("perf_counter: Change data head from u32 to u64")
changed the type of 'head' in struct perf_mmap_data from atomic_t
to atomic_long_t, but missed converting one use of atomic_read on
it to atomic_long_read.  The effect of using atomic_read rather than
atomic_long_read on powerpc (and other big-endian architectures) is
that we get the high half of the 64-bit quantity, resulting in the
cmpxchg retry loop in perf_output_begin spinning forever as soon as
data->head becomes non-zero.  On little-endian architectures such as
x86 we would get the low half, resulting in a lockup once data->head
becomes greater than 4G.

This fixes it by using atomic_long_read rather than atomic_read.

[ Impact: fix perfcounter lockup on PowerPC / big-endian systems ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18984.33964.21541.743096@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 195712e..a5d3e2a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2234,7 +2234,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_lock(handle);
 
 	do {
-		offset = head = atomic_read(&data->head);
+		offset = head = atomic_long_read(&data->head);
 		head += size;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
-- 
cgit v0.10.2


From ee7b31fe5c5da8a038b96e54ae9fbd5dcab3b1da Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 5 Jun 2009 11:37:35 +0800
Subject: perf_counter tools: Fix incorrect printf formats

Otherwise the code does not compile on 32-bit boxes.

builtin-report.c: In function 'map__fprintf':
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 3 has type 'uint64_t'
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 4 has type 'uint64_t'
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 5 has type 'uint64_t'

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090605033735.GA20451@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 1a1028d..eb5424f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -236,7 +236,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %lx-%lx %lx %s\n",
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
-- 
cgit v0.10.2


From f6d03139d745198b434f65a28aabed524f415a4c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 5 Jun 2009 07:18:57 +0100
Subject: GFS2: Fix locking issue mounting gfs2meta fs

This patch uses sget() to get a reference to the
existing gfs2 sb when mouting the gfs2meta filesystem
(in fact thats just another mount of the gfs2
filesystem with a different root and this interface
is for backward compatibility).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-by: Benjamin Marzinski <bmarzins@redhat.com>
Tested-by: Benjamin Marzinski <bmarzins@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2cd1164..9da161c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1273,9 +1273,20 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
 }
 
+static int test_meta_super(struct super_block *s, void *ptr)
+{
+	struct block_device *bdev = ptr;
+	return (bdev == s->s_bdev);
+}
+
+static int set_meta_super(struct super_block *s, void *ptr)
+{
+	return -EINVAL;
+}
+
 static struct super_block *get_gfs2_sb(const char *dev_name)
 {
-	struct super_block *sb;
+	struct super_block *s;
 	struct path path;
 	int error;
 
@@ -1283,30 +1294,27 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return NULL;
+		return ERR_PTR(-ENOENT);
 	}
-	sb = path.dentry->d_inode->i_sb;
-	if (sb && (sb->s_type == &gfs2_fs_type))
-		atomic_inc(&sb->s_active);
-	else
-		sb = NULL;
+	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
-	return sb;
+	return s;
 }
 
 static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *sb = NULL;
+	struct super_block *s;
 	struct gfs2_sbd *sdp;
 
-	sb = get_gfs2_sb(dev_name);
-	if (!sb) {
+	s = get_gfs2_sb(dev_name);
+	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		return -ENOENT;
+		return PTR_ERR(s);
 	}
-	sdp = sb->s_fs_info;
-	mnt->mnt_sb = sb;
+	sdp = s->s_fs_info;
+	mnt->mnt_sb = s;
 	mnt->mnt_root = dget(sdp->sd_master_dir);
 	return 0;
 }
-- 
cgit v0.10.2


From f250c030a87273f8838a2302bee7c2b4d03e9151 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 13:18:41 +0200
Subject: perf record: Split out counter creation into a helper function

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index bf59df5..7f2d7ce 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -336,65 +336,71 @@ static void synthesize_events(void)
 	closedir(proc);
 }
 
-static void open_counters(int cpu, pid_t pid)
+static int group_fd;
+
+static void create_counter(int counter, int cpu, pid_t pid)
 {
 	struct perf_counter_attr attr;
-	int counter, group_fd;
 	int track = 1;
 
-	if (pid > 0) {
-		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_events(pid);
-	}
+	memset(&attr, 0, sizeof(attr));
+	attr.config		= event_id[counter];
+	attr.sample_period	= event_count[counter];
+	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.mmap		= track;
+	attr.comm		= track;
+	attr.inherit	= (cpu < 0) && inherit;
 
-	group_fd = -1;
-	for (counter = 0; counter < nr_counters; counter++) {
+	track = 0; /* only the first counter needs these */
 
-		memset(&attr, 0, sizeof(attr));
-		attr.config		= event_id[counter];
-		attr.sample_period	= event_count[counter];
-		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-		attr.mmap		= track;
-		attr.comm		= track;
-		attr.inherit	= (cpu < 0) && inherit;
+	fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
 
-		track = 0; // only the first counter needs these
+	if (fd[nr_cpu][counter] < 0) {
+		int err = errno;
 
-		fd[nr_cpu][counter] =
-			sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
+		error("syscall returned with %d (%s)\n",
+				fd[nr_cpu][counter], strerror(err));
+		if (err == EPERM)
+			printf("Are you root?\n");
+		exit(-1);
+	}
+	assert(fd[nr_cpu][counter] >= 0);
+	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
 
-		if (fd[nr_cpu][counter] < 0) {
-			int err = errno;
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[nr_cpu][counter];
+
+	event_array[nr_poll].fd = fd[nr_cpu][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[nr_cpu][counter].counter = counter;
+	mmap_array[nr_cpu][counter].prev = 0;
+	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+		exit(-1);
+	}
+}
 
-			error("syscall returned with %d (%s)\n",
-					fd[nr_cpu][counter], strerror(err));
-			if (err == EPERM)
-				printf("Are you root?\n");
-			exit(-1);
-		}
-		assert(fd[nr_cpu][counter] >= 0);
-		fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+static void open_counters(int cpu, pid_t pid)
+{
+	int counter;
 
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = fd[nr_cpu][counter];
-
-		event_array[nr_poll].fd = fd[nr_cpu][counter];
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
-
-		mmap_array[nr_cpu][counter].counter = counter;
-		mmap_array[nr_cpu][counter].prev = 0;
-		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
-		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-			exit(-1);
-		}
+	if (pid > 0) {
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_events(pid);
 	}
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter, cpu, pid);
+
 	nr_cpu++;
 }
 
-- 
cgit v0.10.2


From cf1f45744c6fa3501e0a6f0ddc418f0ef27e725b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 13:27:02 +0200
Subject: perf record, top: Implement --freq

Support frequency-based profiling and make it the default.

(Also add a Hz printout in perf top.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 7f2d7ce..e2301f3 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -27,6 +27,7 @@ static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				= 0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 128;
+static int			freq				= 0;
 static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
@@ -347,9 +348,10 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
 	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.freq		= freq;
 	attr.mmap		= track;
 	attr.comm		= track;
-	attr.inherit	= (cpu < 0) && inherit;
+	attr.inherit		= (cpu < 0) && inherit;
 
 	track = 0; /* only the first counter needs these */
 
@@ -520,6 +522,8 @@ static const struct option options[] = {
 		    "output file name"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
 	OPT_END()
@@ -540,6 +544,10 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 28cbde4..2fee595 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -74,8 +74,8 @@ static int			nr_cpus				=  0;
 static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=  16;
-static int			freq				= 0;
+static unsigned int		mmap_pages			= 16;
+static int			freq				=  0;
 
 static char			*sym_filter;
 static unsigned long		filter_start;
@@ -212,8 +212,13 @@ static void print_sym_table(void)
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
-	if (nr_counters == 1)
-		printf("%d ", event_count[0]);
+	if (nr_counters == 1) {
+		printf("%d", event_count[0]);
+		if (freq)
+			printf("Hz ");
+		else
+			printf(" ");
+	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (counter)
-- 
cgit v0.10.2


From f7b6eb3fa07269da20dbbde8ba37a0273fdbd9c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:51 +0200
Subject: x86: Set context.vdso before installing the mapping

In order to make arch_vma_name() work from inside
install_special_mapping() we need to set the context.vdso
before calling it.

( This is needed for performance counters to be able to track
  this special executable area. )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f11..58bc00f 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 			goto up_fail;
 	}
 
-	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
 		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
 
   up_fail:
+	if (ret)
+		current->mm->context.vdso = NULL;
+
 	up_write(&mm->mmap_sem);
 
 	return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf..93b7a29 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -115,15 +115,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto up_fail;
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
 				      vdso_pages);
-	if (ret)
+	if (ret) {
+		current->mm->context.vdso = NULL;
 		goto up_fail;
+	}
 
-	current->mm->context.vdso = (void *)addr;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
-- 
cgit v0.10.2


From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:55 +0200
Subject: perf_counter: Generate mmap events for install_special_mapping()

In order to track the vdso also generate mmap events for
install_special_mapping().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6ca403a..40dc0e2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -617,8 +617,13 @@ static inline int is_software_counter(struct perf_counter *counter)
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 
-extern void perf_counter_mmap(unsigned long addr, unsigned long len,
-			      unsigned long pgoff, struct file *file);
+extern void __perf_counter_mmap(struct vm_area_struct *vma);
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_EXEC)
+		__perf_counter_mmap(vma);
+}
 
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
@@ -668,10 +673,7 @@ static inline void
 perf_swcounter_event(u32 event, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
 
-static inline void
-perf_counter_mmap(unsigned long addr, unsigned long len,
-		  unsigned long pgoff, struct file *file)		{ }
-
+static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a5d3e2a..37a5a24 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2255,7 +2255,7 @@ out:
 }
 
 static void perf_output_copy(struct perf_output_handle *handle,
-			     void *buf, unsigned int len)
+			     const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
 	unsigned int offset;
@@ -2681,9 +2681,10 @@ void perf_counter_comm(struct task_struct *task)
  */
 
 struct perf_mmap_event {
-	struct file	*file;
-	char		*file_name;
-	int		file_size;
+	struct vm_area_struct	*vma;
+
+	const char		*file_name;
+	int			file_size;
 
 	struct {
 		struct perf_event_header	header;
@@ -2744,11 +2745,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
-	struct file *file = mmap_event->file;
+	struct vm_area_struct *vma = mmap_event->vma;
+	struct file *file = vma->vm_file;
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	char *name;
+	const char *name;
 
 	if (file) {
 		buf = kzalloc(PATH_MAX, GFP_KERNEL);
@@ -2762,6 +2764,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			goto got_name;
 		}
 	} else {
+		name = arch_vma_name(mmap_event->vma);
+		if (name)
+			goto got_name;
+
+		if (!vma->vm_mm) {
+			name = strncpy(tmp, "[vdso]", sizeof(tmp));
+			goto got_name;
+		}
+
 		name = strncpy(tmp, "//anon", sizeof(tmp));
 		goto got_name;
 	}
@@ -2791,8 +2802,7 @@ got_name:
 	kfree(buf);
 }
 
-void perf_counter_mmap(unsigned long addr, unsigned long len,
-		       unsigned long pgoff, struct file *file)
+void __perf_counter_mmap(struct vm_area_struct *vma)
 {
 	struct perf_mmap_event mmap_event;
 
@@ -2800,12 +2810,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 		return;
 
 	mmap_event = (struct perf_mmap_event){
-		.file   = file,
+		.vma	= vma,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
-			.start  = addr,
-			.len    = len,
-			.pgoff  = pgoff,
+			.start  = vma->vm_start,
+			.len    = vma->vm_end - vma->vm_start,
+			.pgoff  = vma->vm_pgoff,
 		},
 	};
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 6451ce2..8101de4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1220,8 +1220,7 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
-	if (vm_flags & VM_EXEC)
-		perf_counter_mmap(addr, len, pgoff, file);
+	perf_counter_mmap(vma);
 
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2309,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm,
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
+	perf_counter_mmap(vma);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From fc54db5105d01ad691a7d747064c7890e17f936c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:59 +0200
Subject: perf report: Deal with maps

In order to deal with [vdso] maps generalize the ip->symbol path
a bit and allow to override some bits with custom functions.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index eb5424f..9783d1e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -79,6 +79,7 @@ typedef union event_union {
 
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
+static struct dso *vdso;
 
 static void dsos__add(struct dso *dso)
 {
@@ -136,6 +137,11 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
 static int load_kernel(void)
 {
 	int err;
@@ -151,6 +157,14 @@ static int load_kernel(void)
 	} else
 		dsos__add(kernel_dso);
 
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
 	return err;
 }
 
@@ -173,9 +187,20 @@ struct map {
 	uint64_t	 start;
 	uint64_t	 end;
 	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
 	struct dso	 *dso;
 };
 
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
 static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
@@ -201,6 +226,11 @@ static struct map *map__new(struct mmap_event *event)
 		self->dso = dsos__findnew(filename);
 		if (self->dso == NULL)
 			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
 	}
 	return self;
 out_delete:
@@ -917,8 +947,8 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 
 		map = thread__find_map(thread, ip);
 		if (map != NULL) {
+			ip = map->map_ip(map, ip);
 			dso = map->dso;
-			ip -= map->start + map->pgoff;
 		} else {
 			/*
 			 * If this is outside of all known maps,
@@ -938,7 +968,10 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	}
 
 	if (show & show_mask) {
-		struct symbol *sym = dso__find_symbol(dso, ip);
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
 
 		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 			fprintf(stderr,
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 15d5cf9..a06bbfba 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -45,6 +45,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 		strcpy(self->name, name);
 		self->syms = RB_ROOT;
 		self->sym_priv_size = sym_priv_size;
+		self->find_symbol = dso__find_symbol;
 	}
 
 	return self;
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 8dd8522..e23cc31 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -16,6 +16,7 @@ struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
 	unsigned int	 sym_priv_size;
+	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
 	char		 name[0];
 };
 
-- 
cgit v0.10.2


From 8edd4286f99f78fe07fe9196e69d5643da86cada Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 14:13:18 +0200
Subject: perf report: Display user/kernel differentiator

Before:

    25.96%  copy_user_generic_string
    15.23%  two_op
    15.19%  one_op
     6.92%  enough_duration
     1.23%  alloc_pages_current
     1.14%  acpi_os_read_port
     1.08%  _spin_lock

After:

    25.96%  [k] copy_user_generic_string
    15.23%  [.] two_op
    15.19%  [.] one_op
     6.92%  [.] enough_duration
     1.23%  [k] alloc_pages_current
     1.14%  [k] acpi_os_read_port
     1.08%  [k] _spin_lock

The '[k]' differentiator is a quick clue that it's a kernel symbol,
without having to bring in the full dso column.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9783d1e..ca303fd 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -504,7 +504,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header 	= "         Command",
+	.header		= "         Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -569,10 +569,12 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	if (verbose)
 		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
 
-	if (self->sym)
-		ret += fprintf(fp, "%s", self->sym->name);
-	else
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
 		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
 
 	return ret;
 }
@@ -586,9 +588,9 @@ static struct sort_entry sort_sym = {
 static int sort__need_collapse = 0;
 
 struct sort_dimension {
-	char *name;
-	struct sort_entry *entry;
-	int taken;
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
 };
 
 static struct sort_dimension sort_dimensions[] = {
-- 
cgit v0.10.2


From 2debbc836696f2a815d02630230584a1754a5022 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 14:29:10 +0200
Subject: perf_counter tools: Clarify events/samples naming

A number of places said 'events' while they should say 'samples'.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index e2301f3..d4ad3057 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -65,7 +65,7 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
-static long events;
+static long samples;
 static struct timeval last_read, this_read;
 
 static __u64 bytes_written;
@@ -83,7 +83,7 @@ static void mmap_read(struct mmap_data *md)
 
 	/*
 	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
+	 * the writer will bite our tail and mess up the samples under us.
 	 *
 	 * If we somehow ended up ahead of the head, we got messed up.
 	 *
@@ -109,7 +109,7 @@ static void mmap_read(struct mmap_data *md)
 	last_read = this_read;
 
 	if (old != head)
-		events++;
+		samples++;
 
 	size = head - old;
 
@@ -257,7 +257,7 @@ out_failure:
 	exit(EXIT_FAILURE);
 }
 
-static void pid_synthesize_mmap_events(pid_t pid)
+static void pid_synthesize_mmap_samples(pid_t pid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -315,7 +315,7 @@ static void pid_synthesize_mmap_events(pid_t pid)
 	fclose(fp);
 }
 
-static void synthesize_events(void)
+static void synthesize_samples(void)
 {
 	DIR *proc;
 	struct dirent dirent, *next;
@@ -331,7 +331,7 @@ static void synthesize_events(void)
 			continue;
 
 		pid_synthesize_comm_event(pid, 1);
-		pid_synthesize_mmap_events(pid);
+		pid_synthesize_mmap_samples(pid);
 	}
 
 	closedir(proc);
@@ -396,7 +396,7 @@ static void open_counters(int cpu, pid_t pid)
 
 	if (pid > 0) {
 		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_events(pid);
+		pid_synthesize_mmap_samples(pid);
 	}
 
 	group_fd = -1;
@@ -469,17 +469,17 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (system_wide)
-		synthesize_events();
+		synthesize_samples();
 
 	while (!done) {
-		int hits = events;
+		int hits = samples;
 
 		for (i = 0; i < nr_cpu; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (hits == events)
+		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
@@ -487,7 +487,7 @@ static int __cmd_record(int argc, const char **argv)
 	 * Approximate RIP event size: 24 bytes.
 	 */
 	fprintf(stderr,
-		"[ perf record: Captured and wrote %.3f MB %s (~%lld events) ]\n",
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
 		(double)bytes_written / 1024.0 / 1024.0,
 		output_name,
 		bytes_written / 24);
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ca303fd..5af105c 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -857,7 +857,7 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 
 	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 2fee595..ff7e13c4 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -137,8 +137,8 @@ static double sym_weight(const struct sym_entry *sym)
 	return weight;
 }
 
-static long			events;
-static long			userspace_events;
+static long			samples;
+static long			userspace_samples;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
 static void __list_insert_active_sym(struct sym_entry *syme)
@@ -177,14 +177,14 @@ static void print_sym_table(void)
 {
 	int printed = 0, j;
 	int counter;
-	float events_per_sec = events/delay_secs;
-	float kevents_per_sec = (events-userspace_events)/delay_secs;
-	float sum_kevents = 0.0;
+	float samples_per_sec = samples/delay_secs;
+	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float sum_ksamples = 0.0;
 	struct sym_entry *syme, *n;
 	struct rb_root tmp = RB_ROOT;
 	struct rb_node *nd;
 
-	events = userspace_events = 0;
+	samples = userspace_samples = 0;
 
 	/* Sort the active symbols */
 	pthread_mutex_lock(&active_symbols_lock);
@@ -196,7 +196,7 @@ static void print_sym_table(void)
 		if (syme->snap_count != 0) {
 			syme->weight = sym_weight(syme);
 			rb_insert_active_sym(&tmp, syme);
-			sum_kevents += syme->snap_count;
+			sum_ksamples += syme->snap_count;
 
 			for (j = 0; j < nr_counters; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
@@ -209,8 +209,8 @@ static void print_sym_table(void)
 	printf(
 "------------------------------------------------------------------------------\n");
 	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
-		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
+		samples_per_sec,
+		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
 	if (nr_counters == 1) {
 		printf("%d", event_count[0]);
@@ -246,12 +246,12 @@ static void print_sym_table(void)
 	printf("------------------------------------------------------------------------------\n\n");
 
 	if (nr_counters == 1)
-		printf("             events    pcnt");
+		printf("             samples    pcnt");
 	else
-		printf("  weight     events    pcnt");
+		printf("  weight     samples    pcnt");
 
 	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   _____   ________________   _______________\n\n"
+	       	       "  ______     _______   _____   ________________   _______________\n\n"
 	);
 
 	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
@@ -263,8 +263,8 @@ static void print_sym_table(void)
 		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
 
-		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
-					 sum_kevents));
+		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
+					 sum_ksamples));
 
 		/*
 		 * We color high-overhead entries in red, low-overhead
@@ -276,9 +276,9 @@ static void print_sym_table(void)
 			color = PERF_COLOR_GREEN;
 
 		if (nr_counters == 1)
-			printf("%19.2f - ", syme->weight);
+			printf("%20.2f - ", syme->weight);
 		else
-			printf("%8.1f %10ld - ", syme->weight, syme->snap_count);
+			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
 
 		color_fprintf(stdout, color, "%4.1f%%", pcnt);
 		printf(" - %016llx : %s\n", sym->start, sym->name);
@@ -318,7 +318,7 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
 		return 1;
 
 	syme = dso__sym_priv(self, sym);
-	/* Tag events to be skipped. */
+	/* Tag samples to be skipped. */
 	if (!strcmp("default_idle", name) ||
 	    !strcmp("cpu_idle", name) ||
 	    !strcmp("enter_idle", name) ||
@@ -405,15 +405,15 @@ static void record_ip(uint64_t ip, int counter)
 		}
 	}
 
-	events--;
+	samples--;
 }
 
 static void process_event(uint64_t ip, int counter)
 {
-	events++;
+	samples++;
 
 	if (ip < min_ip || ip > max_ip) {
-		userspace_events++;
+		userspace_samples++;
 		return;
 	}
 
@@ -451,7 +451,7 @@ static void mmap_read(struct mmap_data *md)
 
 	/*
 	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
+	 * the writer will bite our tail and mess up the samples under us.
 	 *
 	 * If we somehow ended up ahead of the head, we got messed up.
 	 *
@@ -608,14 +608,14 @@ static int __cmd_top(void)
 	}
 
 	while (1) {
-		int hits = events;
+		int hits = samples;
 
 		for (i = 0; i < nr_cpus; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (hits == events)
+		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
-- 
cgit v0.10.2


From e9e10124e269a39de089c5e0d9936fae2ff889b2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Fri, 5 Jun 2009 11:56:18 +0100
Subject: ivtv: Fix PCI DMA direction

The ivtv stream buffers may be for receive or for send but the attached
sg handle is always destined cpu->device.  We flush it correctly but the
allocation is wrongly done with the same type as the buffers.

See bug: http://bugzilla.kernel.org/show_bug.cgi?id=13385

(Note this doesn't close the bug - it fixes the ivtv part and in turn
the logging next shows up some rather alarming DMA sg list warnings in
libata)

Signed-off-by: Alan Cox <alan@linux.intel.com>
Acked-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/media/video/ivtv/ivtv-queue.c b/drivers/media/video/ivtv/ivtv-queue.c
index ff7b7de..7fde36e 100644
--- a/drivers/media/video/ivtv/ivtv-queue.c
+++ b/drivers/media/video/ivtv/ivtv-queue.c
@@ -230,7 +230,8 @@ int ivtv_stream_alloc(struct ivtv_stream *s)
 		return -ENOMEM;
 	}
 	if (ivtv_might_use_dma(s)) {
-		s->sg_handle = pci_map_single(itv->pdev, s->sg_dma, sizeof(struct ivtv_sg_element), s->dma);
+		s->sg_handle = pci_map_single(itv->pdev, s->sg_dma,
+				sizeof(struct ivtv_sg_element), PCI_DMA_TODEVICE);
 		ivtv_stream_sync_for_cpu(s);
 	}
 
-- 
cgit v0.10.2


From 136107a76fe5f62906162f730834477b71cf131e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 17:56:21 +0200
Subject: perf_counter tools: Remove -march=native

Turns out that neither PowerPC nor older x86 compilers know this switch
...

and since it does not make a measurable difference, just omit it.

Reported-by: Paul Mackerras <paulus@samba.org>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index c9ec458..5b99f04 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6 -march=native
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
cgit v0.10.2


From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:44:52 +0200
Subject: perf_counter: Change PERF_SAMPLE_CONFIG into PERF_SAMPLE_ID

The purpose of PERF_SAMPLE_CONFIG was to identify the counters,
since then we've added counter ids, use those instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 40dc0e2..9cea32a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -104,7 +104,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_ADDR		= 1U << 3,
 	PERF_SAMPLE_GROUP		= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
-	PERF_SAMPLE_CONFIG		= 1U << 6,
+	PERF_SAMPLE_ID			= 1U << 6,
 	PERF_SAMPLE_CPU			= 1U << 7,
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 37a5a24..e75b91a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2392,8 +2392,8 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_CONFIG) {
-		header.type |= PERF_SAMPLE_CONFIG;
+	if (sample_type & PERF_SAMPLE_ID) {
+		header.type |= PERF_SAMPLE_ID;
 		header.size += sizeof(u64);
 	}
 
@@ -2439,8 +2439,8 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, addr);
 
-	if (sample_type & PERF_SAMPLE_CONFIG)
-		perf_output_put(&handle, counter->attr.config);
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(&handle, counter->id);
 
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
-- 
cgit v0.10.2


From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 15:05:43 +0200
Subject: perf_counter: Add PERF_SAMPLE_PERIOD

In order to allow easy tracking of the period, also provide means of
adding it to the sample data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9cea32a..6bc2500 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -106,6 +106,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
 	PERF_SAMPLE_ID			= 1U << 6,
 	PERF_SAMPLE_CPU			= 1U << 7,
+	PERF_SAMPLE_PERIOD		= 1U << 8,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
+	 *	u64				id;
 	 *	u64				sample_period;
 	 * };
 	 */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e75b91a..f839066 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2404,6 +2404,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		cpu_entry.cpu = raw_smp_processor_id();
 	}
 
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		header.type |= PERF_SAMPLE_PERIOD;
+		header.size += sizeof(u64);
+	}
+
 	if (sample_type & PERF_SAMPLE_GROUP) {
 		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
@@ -2445,6 +2450,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
 
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(&handle, counter->hw.sample_period);
+
 	/*
 	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
 	 */
@@ -2835,6 +2843,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 	struct {
 		struct perf_event_header	header;
 		u64				time;
+		u64				id;
 		u64				period;
 	} freq_event = {
 		.header = {
@@ -2843,6 +2852,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 			.size = sizeof(freq_event),
 		},
 		.time = sched_clock(),
+		.id = counter->id,
 		.period = period,
 	};
 
-- 
cgit v0.10.2


From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 18:01:29 +0200
Subject: perf_counter: Fix frequency adjustment for < HZ

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bc2500..4f9d39e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -373,6 +373,9 @@ struct hw_perf_counter {
 	u64				sample_period;
 	atomic64_t			period_left;
 	u64				interrupts;
+
+	u64				freq_count;
+	u64				freq_interrupts;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f839066..47c92fb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1187,8 +1187,9 @@ static void perf_log_period(struct perf_counter *counter, u64 period);
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
 	u64 interrupts, sample_period;
-	u64 events, period;
+	u64 events, period, freq;
 	s64 delta;
 
 	spin_lock(&ctx->lock);
@@ -1196,8 +1197,10 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 			continue;
 
-		interrupts = counter->hw.interrupts;
-		counter->hw.interrupts = 0;
+		hwc = &counter->hw;
+
+		interrupts = hwc->interrupts;
+		hwc->interrupts = 0;
 
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
@@ -1208,20 +1211,35 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
-		events = HZ * interrupts * counter->hw.sample_period;
+		if (counter->attr.sample_freq < HZ) {
+			freq = counter->attr.sample_freq;
+
+			hwc->freq_count += freq;
+			hwc->freq_interrupts += interrupts;
+
+			if (hwc->freq_count < HZ)
+				continue;
+
+			interrupts = hwc->freq_interrupts;
+			hwc->freq_interrupts = 0;
+			hwc->freq_count -= HZ;
+		} else
+			freq = HZ;
+
+		events = freq * interrupts * hwc->sample_period;
 		period = div64_u64(events, counter->attr.sample_freq);
 
-		delta = (s64)(1 + period - counter->hw.sample_period);
+		delta = (s64)(1 + period - hwc->sample_period);
 		delta >>= 1;
 
-		sample_period = counter->hw.sample_period + delta;
+		sample_period = hwc->sample_period + delta;
 
 		if (!sample_period)
 			sample_period = 1;
 
 		perf_log_period(counter, sample_period);
 
-		counter->hw.sample_period = sample_period;
+		hwc->sample_period = sample_period;
 	}
 	spin_unlock(&ctx->lock);
 }
-- 
cgit v0.10.2


From b2fef0762fdb65cf8702eea93f4e58abeb0ecefc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 18:07:51 +0200
Subject: perf_counter tools: Sample and display frequency adjustment changes

To allow the debugging of frequency-adjusting counters, sample
those adjustments and display them in perf report -D.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index d4ad3057..43ddab3 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -347,7 +347,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	memset(&attr, 0, sizeof(attr));
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
-	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
 	attr.freq		= freq;
 	attr.mmap		= track;
 	attr.comm		= track;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5af105c..242e09f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -69,12 +69,20 @@ struct fork_event {
 	__u32 pid, ppid;
 };
 
-typedef union event_union {
+struct period_event {
 	struct perf_event_header header;
-	struct ip_event ip;
-	struct mmap_event mmap;
-	struct comm_event comm;
-	struct fork_event fork;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
 } event_t;
 
 static LIST_HEAD(dsos);
@@ -1053,6 +1061,19 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 }
 
 static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
@@ -1068,11 +1089,12 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_FORK:
 		return process_fork_event(event, offset, head);
 
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
 	/*
 	 * We dont process them right now but they are fine:
 	 */
 
-	case PERF_EVENT_PERIOD:
 	case PERF_EVENT_THROTTLE:
 	case PERF_EVENT_UNTHROTTLE:
 		return 0;
@@ -1157,6 +1179,11 @@ more:
 
 	size = event->header.size;
 
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
 	if (!size || process_event(event, offset, head) < 0) {
 
 		dprintf("%p [%p]: skipping unknown header type: %d\n",
-- 
cgit v0.10.2


From 1dba15e74aba5a90c1f2557f37e5d09f8a2df643 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 18:37:22 +0200
Subject: perf record: Set frequency correctly

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 43ddab3..c22ea0c 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -348,7 +348,10 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
 	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
-	attr.freq		= freq;
+	if (freq) {
+		attr.freq		= 1;
+		attr.sample_freq	= freq;
+	}
 	attr.mmap		= track;
 	attr.comm		= track;
 	attr.inherit		= (cpu < 0) && inherit;
@@ -544,10 +547,6 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
-	if (freq) {
-		default_interval = freq;
-		freq = 1;
-	}
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
-- 
cgit v0.10.2


From 27704a16c9e0fb4c6b04344c7c4c40ac16148ec0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 5 Jun 2009 10:21:52 -0700
Subject: Revert "drm: don't associate _DRM_DRIVER maps with a master"

This reverts commit 6c51d1cfa0a370b48a157163340190cf5fd2346b, which
apparently causes DRI initialization failures on Radeons.

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Requested-by: Dave Airlie <airlied@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c
index 80a2575..0411d91 100644
--- a/drivers/gpu/drm/drm_bufs.c
+++ b/drivers/gpu/drm/drm_bufs.c
@@ -371,8 +371,7 @@ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset,
 	list->user_token = list->hash.key << PAGE_SHIFT;
 	mutex_unlock(&dev->struct_mutex);
 
-	if (!(map->flags & _DRM_DRIVER))
-		list->master = dev->primary->master;
+	list->master = dev->primary->master;
 	*maplist = list;
 	return 0;
 	}
-- 
cgit v0.10.2


From 2c701b10283b58937201004276319ef9d9051b5d Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@codemonkey.org.uk>
Date: Fri, 5 Jun 2009 12:37:07 -0400
Subject: [CPUFREQ] powernow-k8: check space_id of _PCT registers to be FFH

The powernow-k8 driver checks to see that the Performance Control/Status
Registers are declared as FFH (functional fixed hardware) by the BIOS.
However, this check got broken in the commit:
 0e64a0c982c06a6b8f5e2a7f29eb108fdf257b2f
 [CPUFREQ] checkpatch cleanups for powernow-k8

Fix based on an original patch from Naga Chumbalkar.

Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f6b32d1..35dc8fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -835,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 {
 	struct cpufreq_frequency_table *powernow_table;
 	int ret_val = -ENODEV;
-	acpi_integer space_id;
+	acpi_integer control, status;
 
 	if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
 		dprintk("register performance failed: bad ACPI data\n");
@@ -848,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 		goto err_out;
 	}
 
-	space_id = data->acpi_data.control_register.space_id;
-	if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-		(space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+	control = data->acpi_data.control_register.space_id;
+	status = data->acpi_data.status_register.space_id;
+
+	if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+	    (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
 		dprintk("Invalid control/status registers (%x - %x)\n",
-			data->acpi_data.control_register.space_id,
-			space_id);
+			control, status);
 		goto err_out;
 	}
 
-- 
cgit v0.10.2


From 45bc955bb1324a46c9539550cc615994e6d0a43d Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Fri, 5 Jun 2009 10:41:39 -0400
Subject: pata_legacy: wait for async probing

The basic problem here that pata_legacy attaches the host, sees if it found
any devices and detaches it if none were found.  With async probing, it's not
waiting until discovery is finished before deciding it has no devices and
trying the detach leading to this warning:

ata1: PATA max PIO4 cmd 0x1f0 ctl 0x3f6 irq 14
------------[ cut here ]------------
WARNING: at drivers/ata/libata-core.c:6222 ata_host_detach+0x75/0x90()
Modules linked in:
Pid: 1, comm: swapper Not tainted 2.6.30-rc7 #1
Call Trace:
 [<c01fbb05>] ? ata_host_detach+0x75/0x90
 [<c01fbb05>] ? ata_host_detach+0x75/0x90
 [<c01139b5>] ? warn_slowpath_common+0x45/0x80
 [<c01139fa>] ? warn_slowpath_null+0xa/0x10
 [<c01fbb05>] ? ata_host_detach+0x75/0x90
 [<c02f40e0>] ? legacy_init+0x44e/0x87f
 [<c02f3c92>] ? legacy_init+0x0/0x87f
 [<c0101021>] ? _stext+0x21/0x140
 [<c01890ff>] ? proc_register+0x2f/0x190
 [<c018938c>] ? create_proc_entry+0x5c/0xc0
 [<c0135ebe>] ? register_irq_proc+0x6e/0x90
 [<c02e6484>] ? kernel_init+0x6e/0xbf
 [<c02e6416>] ? kernel_init+0x0/0xbf
 [<c01031d7>] ? kernel_thread_helper+0x7/0x10
---[ end trace ef1ee36e873ae3a0 ]---

Because it detaches before the probe is complete.

One way to fix it would be to put an async_synchronize_full() before looking
for devices, which this patch does.  A better way might be to separate libata
into its own domain and only wait for that.

Reported-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index f72c6c5..6932e56 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -48,6 +48,7 @@
  *
  */
 
+#include <linux/async.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -1028,6 +1029,7 @@ static __init int legacy_init_one(struct legacy_probe *probe)
 				&legacy_sht);
 	if (ret)
 		goto fail;
+	async_synchronize_full();
 	ld->platform_dev = pdev;
 
 	/* Nothing found means we drop the port as its probably not there */
-- 
cgit v0.10.2


From 5f33b3bcd7aac66a51e6bfaf35e8cff4eabafb06 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Mon, 1 Jun 2009 22:42:10 +0300
Subject: pata_efar: fix PIO2 underclocking

Fix the PIO mode 2 using mode 0 timings -- this driver should enable the
fast timing bank starting with PIO2, just like the PIIX/ICH drivers do.
Also, fix/rephrase some comments while at it.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c
index 2085e0a..2a6412f 100644
--- a/drivers/ata/pata_efar.c
+++ b/drivers/ata/pata_efar.c
@@ -22,7 +22,7 @@
 #include <linux/ata.h>
 
 #define DRV_NAME	"pata_efar"
-#define DRV_VERSION	"0.4.4"
+#define DRV_VERSION	"0.4.5"
 
 /**
  *	efar_pre_reset	-	Enable bits
@@ -98,18 +98,17 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev)
 			    { 2, 1 },
 			    { 2, 3 }, };
 
-	if (pio > 2)
-		control |= 1;	/* TIME1 enable */
+	if (pio > 1)
+		control |= 1;	/* TIME */
 	if (ata_pio_need_iordy(adev))	/* PIO 3/4 require IORDY */
-		control |= 2;	/* IE enable */
-	/* Intel specifies that the PPE functionality is for disk only */
+		control |= 2;	/* IE */
+	/* Intel specifies that the prefetch/posting is for disk only */
 	if (adev->class == ATA_DEV_ATA)
-		control |= 4;	/* PPE enable */
+		control |= 4;	/* PPE */
 
 	pci_read_config_word(dev, idetm_port, &idetm_data);
 
-	/* Enable PPE, IE and TIME as appropriate */
-
+	/* Set PPE, IE, and TIME as appropriate */
 	if (adev->devno == 0) {
 		idetm_data &= 0xCCF0;
 		idetm_data |= control;
@@ -129,7 +128,7 @@ static void efar_set_piomode (struct ata_port *ap, struct ata_device *adev)
 		pci_write_config_byte(dev, 0x44, slave_data);
 	}
 
-	idetm_data |= 0x4000;	/* Ensure SITRE is enabled */
+	idetm_data |= 0x4000;	/* Ensure SITRE is set */
 	pci_write_config_word(dev, idetm_port, idetm_data);
 }
 
-- 
cgit v0.10.2


From 9b10ae86d1616f46dabb67c663fe6a9c3a502663 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 30 May 2009 20:50:12 +0900
Subject: ahci: add warning messages for hp laptops with broken suspend

Harddisks on HP dv[4-6] and HDX18 fail to come online after resume on
earlier BIOSen.  Fortunately, HP recently released BIOS updates for
all machines to fix the issue.  Detect old BIOSen, warn the user to
update BIOS on boot and suspend attempts and fail suspend.

Kudos to all the bug reporters.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: kernel.org@epperson.homelinux.net
Cc: emisca@gmail.com
Cc: Gadi Cohen <dragon@wastelands.net>
Cc: Paul Swanson <paul@procursa.com>
Cc: s@ourada.org
Cc: Trevor Davenport <trevor.davenport@gmail.com>
Cc: corruptor1972 <steven_tierney@yahoo.co.uk>
Cc: Victoria Wilson <mail@vwilson.co.uk>
Cc: khiraly <khiraly.list@gmail.com>
Cc: Sean <wollombi@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 08186ec..6b91c26 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -220,6 +220,7 @@ enum {
 	AHCI_HFLAG_NO_HOTPLUG		= (1 << 7), /* ignore PxSERR.DIAG.N */
 	AHCI_HFLAG_SECT255		= (1 << 8), /* max 255 sectors */
 	AHCI_HFLAG_YES_NCQ		= (1 << 9), /* force NCQ cap on */
+	AHCI_HFLAG_NO_SUSPEND		= (1 << 10), /* don't suspend */
 
 	/* ap->flags bits */
 
@@ -2316,9 +2317,17 @@ static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg)
 static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg)
 {
 	struct ata_host *host = dev_get_drvdata(&pdev->dev);
+	struct ahci_host_priv *hpriv = host->private_data;
 	void __iomem *mmio = host->iomap[AHCI_PCI_BAR];
 	u32 ctl;
 
+	if (mesg.event & PM_EVENT_SUSPEND &&
+	    hpriv->flags & AHCI_HFLAG_NO_SUSPEND) {
+		dev_printk(KERN_ERR, &pdev->dev,
+			   "BIOS update required for suspend/resume\n");
+		return -EIO;
+	}
+
 	if (mesg.event & PM_EVENT_SLEEP) {
 		/* AHCI spec rev1.1 section 8.3.3:
 		 * Software must disable interrupts prior to requesting a
@@ -2610,6 +2619,63 @@ static bool ahci_broken_system_poweroff(struct pci_dev *pdev)
 	return false;
 }
 
+static bool ahci_broken_suspend(struct pci_dev *pdev)
+{
+	static const struct dmi_system_id sysids[] = {
+		/*
+		 * On HP dv[4-6] and HDX18 with earlier BIOSen, link
+		 * to the harddisk doesn't become online after
+		 * resuming from STR.  Warn and fail suspend.
+		 */
+		{
+			.ident = "dv4",
+			.matches = {
+				DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+				DMI_MATCH(DMI_PRODUCT_NAME,
+					  "HP Pavilion dv4 Notebook PC"),
+			},
+			.driver_data = "F.30", /* cutoff BIOS version */
+		},
+		{
+			.ident = "dv5",
+			.matches = {
+				DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+				DMI_MATCH(DMI_PRODUCT_NAME,
+					  "HP Pavilion dv5 Notebook PC"),
+			},
+			.driver_data = "F.16", /* cutoff BIOS version */
+		},
+		{
+			.ident = "dv6",
+			.matches = {
+				DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+				DMI_MATCH(DMI_PRODUCT_NAME,
+					  "HP Pavilion dv6 Notebook PC"),
+			},
+			.driver_data = "F.21",	/* cutoff BIOS version */
+		},
+		{
+			.ident = "HDX18",
+			.matches = {
+				DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+				DMI_MATCH(DMI_PRODUCT_NAME,
+					  "HP HDX18 Notebook PC"),
+			},
+			.driver_data = "F.23",	/* cutoff BIOS version */
+		},
+		{ }	/* terminate list */
+	};
+	const struct dmi_system_id *dmi = dmi_first_match(sysids);
+	const char *ver;
+
+	if (!dmi || pdev->bus->number || pdev->devfn != PCI_DEVFN(0x1f, 2))
+		return false;
+
+	ver = dmi_get_system_info(DMI_BIOS_VERSION);
+
+	return !ver || strcmp(ver, dmi->driver_data) < 0;
+}
+
 static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	static int printed_version;
@@ -2715,6 +2781,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			"quirky BIOS, skipping spindown on poweroff\n");
 	}
 
+	if (ahci_broken_suspend(pdev)) {
+		hpriv->flags |= AHCI_HFLAG_NO_SUSPEND;
+		dev_printk(KERN_WARNING, &pdev->dev,
+			   "BIOS update required for suspend/resume\n");
+	}
+
 	/* CAP.NP sometimes indicate the index of the last enabled
 	 * port, at other times, that of the last possible port, so
 	 * determining the maximum port number requires looking at
-- 
cgit v0.10.2


From 65e31643115349fd7a81acbe75ec4a54d5df8aad Mon Sep 17 00:00:00 2001
From: Ville Syrjala <syrjala@sci.fi>
Date: Tue, 19 May 2009 01:37:44 +0300
Subject: ata_piix: Add HP Compaq nc6000 to the broken poweroff list

HP Compaq nc6000 suffers from the double disk spindown issue.
Add it to the broken poweroff DMI list.

Signed-off-by: Ville Syrjala <syrjala@sci.fi>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index d51a17c..1aeb708 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1455,6 +1455,15 @@ static bool piix_broken_system_poweroff(struct pci_dev *pdev)
 			/* PCI slot number of the controller */
 			.driver_data = (void *)0x1FUL,
 		},
+		{
+			.ident = "HP Compaq nc6000",
+			.matches = {
+				DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+				DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nc6000"),
+			},
+			/* PCI slot number of the controller */
+			.driver_data = (void *)0x1FUL,
+		},
 
 		{ }	/* terminate list */
 	};
-- 
cgit v0.10.2


From 2f335a02b3c816e77e7df1d15b12e3bbb8f4c8f0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 5 Jun 2009 19:31:01 +0200
Subject: perf top: Fix zero or negative refresh delay

If perf top is executed with a zero value for the refresh rate,
we get a division by zero exception while computing samples_per_sec.

Also a zero refresh rate is not possible, neither do we want to
accept negative values.

[ Impact: fix division by zero in perf top ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244223061-5399-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index ff7e13c4..b2f480b 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -693,6 +693,9 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
+	if (delay_secs < 1)
+		delay_secs = 1;
+
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
-- 
cgit v0.10.2


From cd24f8c1e7e27a2c6051a9a338d4704a2431dbf0 Mon Sep 17 00:00:00 2001
From: Kevin Hilman <khilman@deeprootsystems.com>
Date: Fri, 5 Jun 2009 18:48:08 +0100
Subject: mtd: davinci nand: update clock naming

DaVinci clock support has been updated in mainline.
Update clock names accordingly.

Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c
index 0119220..02700f7 100644
--- a/drivers/mtd/nand/davinci_nand.c
+++ b/drivers/mtd/nand/davinci_nand.c
@@ -407,16 +407,17 @@ static int __init nand_davinci_probe(struct platform_device *pdev)
 	}
 	info->chip.ecc.mode = ecc_mode;
 
-	info->clk = clk_get(&pdev->dev, "AEMIFCLK");
+	info->clk = clk_get(&pdev->dev, "aemif");
 	if (IS_ERR(info->clk)) {
 		ret = PTR_ERR(info->clk);
-		dev_dbg(&pdev->dev, "unable to get AEMIFCLK, err %d\n", ret);
+		dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
 		goto err_clk;
 	}
 
 	ret = clk_enable(info->clk);
 	if (ret < 0) {
-		dev_dbg(&pdev->dev, "unable to enable AEMIFCLK, err %d\n", ret);
+		dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n",
+			ret);
 		goto err_clk_enable;
 	}
 
-- 
cgit v0.10.2


From a3cb900cc408977a11519bc7c760f3e499079589 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 13 May 2009 15:02:27 +0100
Subject: [libata] pata_ali: Use IGN_SIMPLEX

Some ALi devices report simplex if they have been disabled and re-enabled, and
restoring the byte does not work. Ignore it - the needed supporting logic is
already present for the SATA ULi ports.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c
index 751b7ea..fc9c5d6 100644
--- a/drivers/ata/pata_ali.c
+++ b/drivers/ata/pata_ali.c
@@ -497,14 +497,16 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	};
 	/* Revision 0x20 added DMA */
 	static const struct ata_port_info info_20 = {
-		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48,
+		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 |
+							ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.port_ops = &ali_20_port_ops
 	};
 	/* Revision 0x20 with support logic added UDMA */
 	static const struct ata_port_info info_20_udma = {
-		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48,
+		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 |
+							ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.udma_mask = ATA_UDMA2,
@@ -512,7 +514,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	};
 	/* Revision 0xC2 adds UDMA66 */
 	static const struct ata_port_info info_c2 = {
-		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48,
+		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 |
+							ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.udma_mask = ATA_UDMA4,
@@ -520,7 +523,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	};
 	/* Revision 0xC3 is UDMA66 for now */
 	static const struct ata_port_info info_c3 = {
-		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48,
+		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 |
+							ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.udma_mask = ATA_UDMA4,
@@ -528,7 +532,8 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	};
 	/* Revision 0xC4 is UDMA100 */
 	static const struct ata_port_info info_c4 = {
-		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48,
+		.flags = ATA_FLAG_SLAVE_POSS | ATA_FLAG_PIO_LBA48 |
+							ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.udma_mask = ATA_UDMA5,
@@ -536,7 +541,7 @@ static int ali_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	};
 	/* Revision 0xC5 is UDMA133 with LBA48 DMA */
 	static const struct ata_port_info info_c5 = {
-		.flags = ATA_FLAG_SLAVE_POSS,
+		.flags = ATA_FLAG_SLAVE_POSS | 	ATA_FLAG_IGN_SIMPLEX,
 		.pio_mask = ATA_PIO4,
 		.mwdma_mask = ATA_MWDMA2,
 		.udma_mask = ATA_UDMA6,
-- 
cgit v0.10.2


From fe2245c905631a3a353504fc04388ce3dfaf9d9e Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Sun, 5 Jul 2009 15:50:52 -0500
Subject: x86: enable GART-IOMMU only after setting up protection methods

The current code to set up the GART as an IOMMU enables GART
translations before it removes the aperture from the kernel memory
map, sets the GART PTEs to UC, sets up the guard and scratch
pages, or does a wbinvd().  This leaves the possibility of cache
aliasing open and can cause system crashes.

Re-order the code so as to enable the GART translations only
after all safeguards are in place and the tlb has been flushed.

AMD has tested this patch on both Istanbul systems and 1st
generation Opteron systems with APG enabled and seen no adverse
effects.  Istanbul systems with HT Assist enabled sometimes
see MCE errors due to cache artifacts with the unmodified
code.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Cc: <stable@kernel.org>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: akpm@linux-foundation.org
Cc: jbarnes@virtuousgeek.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 1e8920d..cfd9f90 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -658,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 	agp_gatt_table = gatt;
 
-	enable_gart_translations();
-
 	error = sysdev_class_register(&gart_sysdev_class);
 	if (!error)
 		error = sysdev_register(&device_gart);
@@ -816,6 +814,14 @@ void __init gart_iommu_init(void)
 	 * the pages as Not-Present:
 	 */
 	wbinvd();
+	
+	/*
+	 * Now all caches are flushed and we can safely enable
+	 * GART hardware.  Doing it early leaves the possibility
+	 * of stale cache entries that can lead to GART PTE
+	 * errors.
+	 */
+	enable_gart_translations();
 
 	/*
 	 * Try to workaround a bug (thanks to BenH):
-- 
cgit v0.10.2


From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 09:58:57 +0200
Subject: perf_counter: Separate out attr->type from attr->config

Counter type is a frequently used value and we do a lot of
bit juggling by encoding and decoding it from attr->config.

Clean this up by creating a separate attr->type field.

Also clean up the various similarly complex user-space bits
all around counter attribute management.

The net improvement is significant, and it will be easier
to add a new major type (which is what triggered this cleanup).

(This changes the ABI, all tools are adapted.)
(PowerPC build-tested.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index c22ea0c..130fd88 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -20,10 +20,10 @@
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static long			default_interval = 100000;
-static long			event_count[MAX_COUNTERS];
-
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static long			default_interval		= 100000;
+
 static int			nr_cpus				= 0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 128;
@@ -38,22 +38,44 @@ static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
 
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
+static long			samples;
+static struct timeval		last_read;
+static struct timeval		this_read;
+
+static __u64			bytes_written;
+
+static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
+
+static int			nr_poll;
+static int			nr_cpu;
+
+struct mmap_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
 };
 
+
 struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
 };
 
+static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
 static unsigned int mmap_read_head(struct mmap_data *md)
 {
 	struct perf_counter_mmap_page *pc = md->base;
@@ -65,11 +87,6 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
-static long samples;
-static struct timeval last_read, this_read;
-
-static __u64 bytes_written;
-
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -157,29 +174,6 @@ static void sig_handler(int sig)
 	done = 1;
 }
 
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int nr_poll;
-static int nr_cpu;
-
-struct mmap_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	__u64				start;
-	__u64				len;
-	__u64				pgoff;
-	char				filename[PATH_MAX];
-};
-
-struct comm_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	char				comm[16];
-};
-
 static void pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
@@ -341,24 +335,21 @@ static int group_fd;
 
 static void create_counter(int counter, int cpu, pid_t pid)
 {
-	struct perf_counter_attr attr;
+	struct perf_counter_attr *attr = attrs + counter;
 	int track = 1;
 
-	memset(&attr, 0, sizeof(attr));
-	attr.config		= event_id[counter];
-	attr.sample_period	= event_count[counter];
-	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
 	if (freq) {
-		attr.freq		= 1;
-		attr.sample_freq	= freq;
+		attr->freq		= 1;
+		attr->sample_freq	= freq;
 	}
-	attr.mmap		= track;
-	attr.comm		= track;
-	attr.inherit		= (cpu < 0) && inherit;
+	attr->mmap		= track;
+	attr->comm		= track;
+	attr->inherit		= (cpu < 0) && inherit;
 
 	track = 0; /* only the first counter needs these */
 
-	fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
+	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
 
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
@@ -542,16 +533,14 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 1;
-		event_id[0] = 0;
-	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
+		if (attrs[counter].sample_period)
 			continue;
 
-		event_count[counter] = default_interval;
+		attrs[counter].sample_period = default_interval;
 	}
 
 	return __cmd_record(argc, argv);
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 4fc0d80..9711e55 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -44,23 +44,22 @@
 
 #include <sys/prctl.h>
 
-static int			system_wide			=  0;
-static int			inherit				=  1;
+static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
-static __u64			default_event_id[MAX_COUNTERS]	= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
 
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
 };
 
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
+static int			system_wide			=  0;
+static int			inherit				=  1;
+
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int			target_pid			= -1;
@@ -86,22 +85,16 @@ static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
-	struct perf_counter_attr attr;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.config		= event_id[counter];
-	attr.sample_type	= 0;
-	attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
-	attr.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+	struct perf_counter_attr *attr = attrs + counter;
 
 	if (scale)
-		attr.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&attr, -1, cpu, -1, 0);
+			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
 			if (fd[cpu][counter] < 0) {
 				printf("perfstat error: syscall returned with %d (%s)\n",
 						fd[cpu][counter], strerror(errno));
@@ -109,10 +102,10 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		attr.inherit	= inherit;
-		attr.disabled	= 1;
+		attr->inherit	= inherit;
+		attr->disabled	= 1;
 
-		fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0);
+		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
 		if (fd[0][counter] < 0) {
 			printf("perfstat error: syscall returned with %d (%s)\n",
 					fd[0][counter], strerror(errno));
@@ -126,9 +119,13 @@ static void create_perfstat_counter(int counter)
  */
 static inline int nsec_counter(int counter)
 {
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK))
+	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
+		return 0;
+
+	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
 		return 1;
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+
+	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		return 1;
 
 	return 0;
@@ -177,7 +174,8 @@ static void read_counter(int counter)
 	/*
 	 * Save the full runtime - to allow normalization during printout:
 	 */
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		runtime_nsecs = count[0];
 }
 
@@ -203,8 +201,8 @@ static void print_counter(int counter)
 
 		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
-		if (event_id[counter] ==
-				EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
 
 			fprintf(stderr, " # %11.3f CPU utilization factor",
 				(double)count[0] / (double)walltime_nsecs);
@@ -300,8 +298,6 @@ static char events_help_msg[EVENTS_HELP_MAX];
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
-	OPT_INTEGER('c', "count", &default_interval,
-		    "event period to sample"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
@@ -315,27 +311,19 @@ static const struct option options[] = {
 
 int cmd_stat(int argc, const char **argv, const char *prefix)
 {
-	int counter;
-
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	create_events_help(events_help_msg);
-	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	memcpy(attrs, default_attrs, sizeof(attrs));
 
 	argc = parse_options(argc, argv, options, stat_usage, 0);
 	if (!argc)
 		usage_with_options(stat_usage, options);
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 8;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
 
-		event_count[counter] = default_interval;
-	}
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b2f480b..98a6d53 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -48,22 +48,11 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-static int			system_wide			=  0;
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static __u64			default_event_id[MAX_COUNTERS]		= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+static int			system_wide			=  0;
 
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
-};
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			default_interval		= 100000;
 
 static __u64			count_filter			=  5;
 static int			print_entries			= 15;
@@ -85,15 +74,6 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
 /*
  * Symbols
  */
@@ -112,7 +92,7 @@ struct sym_entry {
 
 struct sym_entry		*sym_filter_entry;
 
-struct dso *kernel_dso;
+struct dso			*kernel_dso;
 
 /*
  * Symbols will be added here in record_ip and will get out
@@ -213,7 +193,7 @@ static void print_sym_table(void)
 		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
 	if (nr_counters == 1) {
-		printf("%d", event_count[0]);
+		printf("%Ld", attrs[0].sample_period);
 		if (freq)
 			printf("Hz ");
 		else
@@ -421,10 +401,10 @@ static void process_event(uint64_t ip, int counter)
 }
 
 struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
 };
 
 static unsigned int mmap_read_head(struct mmap_data *md)
@@ -539,7 +519,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int __cmd_top(void)
 {
-	struct perf_counter_attr attr;
+	struct perf_counter_attr *attr;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
@@ -553,13 +533,12 @@ static int __cmd_top(void)
 			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
-			memset(&attr, 0, sizeof(attr));
-			attr.config		= event_id[counter];
-			attr.sample_period	= event_count[counter];
-			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr.freq		= freq;
+			attr = attrs + counter;
 
-			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
+			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+			attr->freq		= freq;
+
+			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 
@@ -670,7 +649,6 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	create_events_help(events_help_msg);
-	memcpy(event_id, default_event_id, sizeof(default_event_id));
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
@@ -688,19 +666,22 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 		profile_cpu = -1;
 	}
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 1;
-		event_id[0] = 0;
-	}
 
 	if (delay_secs < 1)
 		delay_secs = 1;
 
+	parse_symbols();
+
+	/*
+	 * Fill in the ones not specifically initialized via -c:
+	 */
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
+		if (attrs[counter].sample_period)
 			continue;
 
-		event_count[counter] = default_interval;
+		attrs[counter].sample_period = default_interval;
 	}
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -710,7 +691,5 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 	if (target_pid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
-	parse_symbols();
-
 	return __cmd_top();
 }
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 10622a4..af0a504 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -64,6 +64,4 @@ sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
 #define MAX_COUNTERS			256
 #define MAX_NR_CPUS			256
 
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
-
 #endif
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 2fdfd1d..eb56bd9 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -6,37 +6,39 @@
 #include "exec_cmd.h"
 #include "string.h"
 
-int nr_counters;
+int					nr_counters;
 
-__u64			event_id[MAX_COUNTERS]		= { };
-int			event_mask[MAX_COUNTERS];
+struct perf_counter_attr		attrs[MAX_COUNTERS];
 
 struct event_symbol {
-	__u64 event;
-	char *symbol;
+	__u8	type;
+	__u64	config;
+	char	*symbol;
 };
 
+#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+
 static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
+  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
+  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
+  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
+  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
+  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
+  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
+
+  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
+  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
 };
 
 #define __PERF_COUNTER_FIELD(config, name) \
@@ -67,27 +69,26 @@ static char *sw_event_names[] = {
 	"major faults",
 };
 
-char *event_name(int ctr)
+char *event_name(int counter)
 {
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
+	__u64 config = attrs[counter].config;
+	int type = attrs[counter].type;
 	static char buf[32];
 
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+	if (attrs[counter].type == PERF_TYPE_RAW) {
+		sprintf(buf, "raw 0x%llx", config);
 		return buf;
 	}
 
 	switch (type) {
 	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
+		if (config < PERF_HW_EVENTS_MAX)
+			return hw_event_names[config];
 		return "unknown-hardware";
 
 	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
+		if (config < PERF_SW_EVENTS_MAX)
+			return sw_event_names[config];
 		return "unknown-software";
 
 	default:
@@ -101,15 +102,19 @@ char *event_name(int ctr)
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static __u64 match_event_symbols(const char *str)
+static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
 {
 	__u64 config, id;
 	int type;
 	unsigned int i;
 	const char *sep, *pstr;
 
-	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0)
-		return config | PERF_COUNTER_RAW_MASK;
+	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
+		attr->type = PERF_TYPE_RAW;
+		attr->config = config;
+
+		return 0;
+	}
 
 	pstr = str;
 	sep = strchr(pstr, ':');
@@ -121,35 +126,45 @@ static __u64 match_event_symbols(const char *str)
 		if (sep) {
 			pstr = sep + 1;
 			if (strchr(pstr, 'k'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
+				attr->exclude_user = 1;
 			if (strchr(pstr, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+				attr->exclude_kernel = 1;
 		}
-		return EID(type, id);
+		attr->type = type;
+		attr->config = id;
+
+		return 0;
 	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
+			     strlen(event_symbols[i].symbol))) {
+
+			attr->type = event_symbols[i].type;
+			attr->config = event_symbols[i].config;
+
+			return 0;
+		}
 	}
 
-	return ~0ULL;
+	return -EINVAL;
 }
 
 int parse_events(const struct option *opt, const char *str, int unset)
 {
-	__u64 config;
+	struct perf_counter_attr attr;
+	int ret;
 
+	memset(&attr, 0, sizeof(attr));
 again:
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
+	ret = match_event_symbols(str, &attr);
+	if (ret < 0)
+		return ret;
 
-	event_id[nr_counters] = config;
+	attrs[nr_counters] = attr;
 	nr_counters++;
 
 	str = strstr(str, ",");
@@ -168,7 +183,6 @@ void create_events_help(char *events_help_msg)
 {
 	unsigned int i;
 	char *str;
-	__u64 e;
 
 	str = events_help_msg;
 
@@ -178,9 +192,8 @@ void create_events_help(char *events_help_msg)
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		int type, id;
 
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
+		type = event_symbols[i].type;
+		id = event_symbols[i].config;
 
 		if (i)
 			str += sprintf(str, "|");
@@ -191,4 +204,3 @@ void create_events_help(char *events_help_msg)
 
 	str += sprintf(str, "|rNNN]");
 }
-
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 0da306b..542971c 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -3,12 +3,9 @@
  * Parse symbolic events/counts passed in as options:
  */
 
-extern int nr_counters;
-extern __u64			event_id[MAX_COUNTERS];
-extern int			event_mask[MAX_COUNTERS];
+extern int			nr_counters;
 
-#define EVENT_MASK_KERNEL	1
-#define EVENT_MASK_USER		2
+extern struct perf_counter_attr attrs[MAX_COUNTERS];
 
 extern char *event_name(int ctr);
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 232b00a..4786ad9 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->attr)) {
-		ev = perf_event_id(&counter->attr);
+	if (counter->attr.type != PERF_TYPE_RAW) {
+		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->attr);
+		ev = counter->attr.config;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8f53f3a..430e048 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(attr)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
+	if (attr->type == PERF_TYPE_RAW) {
+		hwc->config |= x86_pmu.raw_event(attr->config);
 	} else {
-		if (perf_event_id(attr) >= x86_pmu.max_events)
+		if (attr->config >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
+		hwc->config |= x86_pmu.event_map(attr->config);
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4f9d39e..f794c69 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,26 +73,6 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-#define __PERF_COUNTER_MASK(name)			\
-	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
-	 PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW_BITS		1
-#define PERF_COUNTER_RAW_SHIFT		63
-#define PERF_COUNTER_RAW_MASK		__PERF_COUNTER_MASK(RAW)
-
-#define PERF_COUNTER_CONFIG_BITS	63
-#define PERF_COUNTER_CONFIG_SHIFT	0
-#define PERF_COUNTER_CONFIG_MASK	__PERF_COUNTER_MASK(CONFIG)
-
-#define PERF_COUNTER_TYPE_BITS		7
-#define PERF_COUNTER_TYPE_SHIFT		56
-#define PERF_COUNTER_TYPE_MASK		__PERF_COUNTER_MASK(TYPE)
-
-#define PERF_COUNTER_EVENT_BITS		56
-#define PERF_COUNTER_EVENT_SHIFT	0
-#define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
-
 /*
  * Bits that can be set in attr.sample_type to request information
  * in the overflow packets.
@@ -125,10 +105,13 @@ enum perf_counter_read_format {
  */
 struct perf_counter_attr {
 	/*
-	 * The MSB of the config word signifies if the rest contains cpu
-	 * specific (raw) counter configuration data, if unset, the next
-	 * 7 bits are an event type and the rest of the bits are the event
-	 * identifier.
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	__u32			type;
+	__u32			__reserved_1;
+
+	/*
+	 * Type specific configuration information.
 	 */
 	__u64			config;
 
@@ -152,12 +135,11 @@ struct perf_counter_attr {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 53;
+				__reserved_2   : 53;
 
 	__u32			wakeup_events;	/* wakeup every n events */
-	__u32			__reserved_2;
+	__u32			__reserved_3;
 
-	__u64			__reserved_3;
 	__u64			__reserved_4;
 };
 
@@ -278,8 +260,8 @@ enum perf_event_type {
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u32				pid, ppid;
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
 	 * };
 	 */
 	PERF_EVENT_FORK			= 7,
@@ -331,27 +313,6 @@ enum perf_event_type {
 
 struct task_struct;
 
-static inline u64 perf_event_raw(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_RAW_MASK;
-}
-
-static inline u64 perf_event_config(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_CONFIG_MASK;
-}
-
-static inline u64 perf_event_type(struct perf_counter_attr *attr)
-{
-	return (attr->config & PERF_COUNTER_TYPE_MASK) >>
-		PERF_COUNTER_TYPE_SHIFT;
-}
-
-static inline u64 perf_event_id(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_EVENT_MASK;
-}
-
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !perf_event_raw(&counter->attr) &&
-		perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE;
+	return (counter->attr.type != PERF_TYPE_RAW) &&
+		(counter->attr.type != PERF_TYPE_HARDWARE);
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 47c92fb..75ae767 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter,
 				enum perf_event_types type,
 				u32 event, struct pt_regs *regs)
 {
-	u64 event_config;
-
-	event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;
-
 	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (counter->attr.config != event_config)
+	if (counter->attr.type != type)
+		return 0;
+	if (counter->attr.config != event)
 		return 0;
 
 	if (regs) {
@@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (perf_event_id(&counter->attr)) {
+	switch (counter->attr.config) {
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
@@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
-	if (perf_event_raw(attr)) {
+	if (attr->type == PERF_TYPE_RAW) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
-	switch (perf_event_type(attr)) {
+	switch (attr->type) {
 	case PERF_TYPE_HARDWARE:
 		pmu = hw_perf_counter_init(counter);
 		break;
-- 
cgit v0.10.2


From 460bcf57b128ce1c0dd553d905fedc097f9955c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 07:37:56 -0400
Subject: Fix nobh_truncate_page() to not pass stack garbage to get_block()

The nobh_truncate_page() function is used by ext2, exofs, and jfs.  Of
these three, only ext2 and jfs's get_block() function pays attention
to bh->b_size --- which is normally always the filesystem blocksize
except when the get_block() function is called by either
mpage_readpage(), mpage_readpages(), or the direct I/O routines in
fs/direct_io.c.

Unfortunately, nobh_truncate_page() does not initialize map_bh before
calling the filesystem-supplied get_block() function.  So ext2 and jfs
will try to calculate the number of blocks to map by taking stack
garbage and shifting it left by inode->i_blkbits.  This should be
*mostly* harmless (except the filesystem will do some unnneeded work)
unless the stack garbage is less than filesystem's blocksize, in which
case maxblocks will be zero, and the attempt to find out whether or
not the filesystem has a hole at a given logical block will fail, and
the page cache entry might not get zero'ed out.

Also if the stack garbage in in map_bh->state happens to have the
BH_Mapped bit set, there could be an attempt to call readpage() on a
non-existent page, which could cause nobh_truncate_page() to return an
error when it should not.

Fix this by initializing map_bh->state and map_bh->size.

Fortunately, it's probably fairly unlikely that ext2 and jfs users
mount with nobh these days.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/buffer.c b/fs/buffer.c
index aed2977..4910612 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2736,6 +2736,8 @@ has_buffers:
 		pos += blocksize;
 	}
 
+	map_bh.b_size = blocksize;
+	map_bh.b_state = 0;
 	err = get_block(inode, iblock, &map_bh, 0);
 	if (err)
 		goto unlock;
-- 
cgit v0.10.2


From 72a43d63cb51057393edfbcfc4596066205ad15d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 13 May 2009 19:13:40 +0100
Subject: ext3/4 with synchronous writes gets wedged by Postfix

OK, that's probably the easiest way to do that, as much as I don't like it...
Since iget() et.al. will not accept I_FREEING (will wait to go away
and restart), and since we'd better have serialization between new/free
on fs data structures anyway, we can afford simply skipping I_FREEING
et.al. in insert_inode_locked().

We do that from new_inode, so it won't race with free_inode in any interesting
ways and it won't race with iget (of any origin; nfsd or in case of fs
corruption a lookup) since both still will wait for I_LOCK.

Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Jan Kara <jack@suse.cz>
Tested-by: David Watson <dbwatson@ukfsn.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/inode.c b/fs/inode.c
index 0571983..a4876e5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1053,13 +1053,22 @@ int insert_inode_locked(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
 		spin_lock(&inode_lock);
-		old = find_inode_fast(sb, head, ino);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_ino != ino)
+				continue;
+			if (old->i_sb != sb)
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
@@ -1081,14 +1090,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 {
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
+
 		spin_lock(&inode_lock);
-		old = find_inode(sb, head, test, data);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_sb != sb)
+				continue;
+			if (!test(old, data))
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
-- 
cgit v0.10.2


From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 20:22:46 +0200
Subject: perf_counter: Implement generalized cache event types

Extend generic event enumeration with the PERF_TYPE_HW_CACHE
method.

This is a 3-dimensional space:

       { L1-D, L1-I, L2, ITLB, DTLB, BPU } x
       { load, store, prefetch } x
       { accesses, misses }

User-space passes in the 3 coordinates and the kernel provides
a counter. (if the hardware supports that type and if the
combination makes sense.)

Combinations that make no sense produce a -EINVAL.
Combinations that are not supported by the hardware produce -ENOTSUP.

Extend the tools to deal with this, and rewrite the event symbol
parsing code with various popular aliases for the units and
access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are
both valid aliases.

( x86 is supported for now, with the Nehalem event table filled in,
  and with Core2 and Atom having placeholder tables. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index eb56bd9..de9a77c 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -6,6 +6,8 @@
 #include "exec_cmd.h"
 #include "string.h"
 
+extern char *strcasestr(const char *haystack, const char *needle);
+
 int					nr_counters;
 
 struct perf_counter_attr		attrs[MAX_COUNTERS];
@@ -17,6 +19,7 @@ struct event_symbol {
 };
 
 #define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+#define CR(x, y) .type = PERF_TYPE_##x, .config = y
 
 static struct event_symbol event_symbols[] = {
   { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
@@ -69,6 +72,28 @@ static char *sw_event_names[] = {
 	"major faults",
 };
 
+#define MAX_ALIASES 8
+
+static char *hw_cache [][MAX_ALIASES] = {
+	{ "l1-d" ,	"l1d" ,	"l1", "l1-data-cache"			},
+	{ "l1-i" ,	"l1i" ,	"l1-instruction-cache"		},
+	{ "l2"  , },
+	{ "dtlb", },
+	{ "itlb", },
+	{ "bpu" , "btb", "branch-cache", NULL },
+};
+
+static char *hw_cache_op [][MAX_ALIASES] = {
+	{ "read"	, "load" },
+	{ "write"	, "store" },
+	{ "prefetch"	, "speculative-read", "speculative-load" },
+};
+
+static char *hw_cache_result [][MAX_ALIASES] = {
+	{ "access", "ops" },
+	{ "miss", },
+};
+
 char *event_name(int counter)
 {
 	__u64 config = attrs[counter].config;
@@ -86,6 +111,30 @@ char *event_name(int counter)
 			return hw_event_names[config];
 		return "unknown-hardware";
 
+	case PERF_TYPE_HW_CACHE: {
+		__u8 cache_type, cache_op, cache_result;
+		static char name[100];
+
+		cache_type   = (config >>  0) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_MAX)
+			return "unknown-ext-hardware-cache-type";
+
+		cache_op     = (config >>  8) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX)
+			return "unknown-ext-hardware-cache-op-type";
+
+		cache_result = (config >> 16) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX)
+			return "unknown-ext-hardware-cache-result-type";
+
+		sprintf(name, "%s:%s:%s",
+			hw_cache[cache_type][0],
+			hw_cache_op[cache_op][0],
+			hw_cache_result[cache_result][0]);
+
+		return name;
+	}
+
 	case PERF_TYPE_SOFTWARE:
 		if (config < PERF_SW_EVENTS_MAX)
 			return sw_event_names[config];
@@ -98,11 +147,60 @@ char *event_name(int counter)
 	return "unknown";
 }
 
+static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
+{
+	int i, j;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < MAX_ALIASES; j++) {
+			if (!names[i][j])
+				break;
+			if (strcasestr(str, names[i][j]))
+				return i;
+		}
+	}
+
+	return 0;
+}
+
+static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
+{
+	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
+
+	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
+	/*
+	 * No fallback - if we cannot get a clear cache type
+	 * then bail out:
+	 */
+	if (cache_type == -1)
+		return -EINVAL;
+
+	cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+	/*
+	 * Fall back to reads:
+	 */
+	if (cache_type == -1)
+		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
+
+	cache_result = parse_aliases(str, hw_cache_result,
+					PERF_COUNT_HW_CACHE_RESULT_MAX);
+	/*
+	 * Fall back to accesses:
+	 */
+	if (cache_result == -1)
+		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
+
+	attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
+	attr->type = PERF_TYPE_HW_CACHE;
+
+	return 0;
+}
+
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
+static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
 {
 	__u64 config, id;
 	int type;
@@ -147,7 +245,7 @@ static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
 		}
 	}
 
-	return -EINVAL;
+	return parse_generic_hw_symbols(str, attr);
 }
 
 int parse_events(const struct option *opt, const char *str, int unset)
@@ -160,7 +258,7 @@ again:
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	ret = match_event_symbols(str, &attr);
+	ret = parse_event_symbols(str, &attr);
 	if (ret < 0)
 		return ret;
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 430e048..e86679f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event)
 	return intel_perfmon_event_map[event];
 }
 
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
+		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
+static const u64 atom_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
 static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
@@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void)
 	return x86_pmu.handle_irq != NULL;
 }
 
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	u64 config, val;
+
+	config = attr->config;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+	if (val == 0)
+		return -ENOENT;
+
+	if (val == -1)
+		return -EINVAL;
+
+	hwc->config |= val;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->sample_period);
+	counter->destroy = hw_perf_counter_destroy;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
 	if (attr->type == PERF_TYPE_RAW) {
 		hwc->config |= x86_pmu.raw_event(attr->config);
-	} else {
-		if (attr->config >= x86_pmu.max_events)
-			return -EINVAL;
-		/*
-		 * The generic map:
-		 */
-		hwc->config |= x86_pmu.event_map(attr->config);
+		return 0;
 	}
 
-	counter->destroy = hw_perf_counter_destroy;
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+	/*
+	 * The generic map:
+	 */
+	hwc->config |= x86_pmu.event_map(attr->config);
 
 	return 0;
 }
@@ -989,6 +1147,33 @@ static int intel_pmu_init(void)
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
+	/*
+	 * Nehalem:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 17:
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Core2 event tables\n");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Nehalem/Corei7 event tables\n");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Atom event tables\n");
+		break;
+	}
 	return 0;
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f794c69..3586df8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -28,6 +28,7 @@ enum perf_event_types {
 	PERF_TYPE_HARDWARE		= 0,
 	PERF_TYPE_SOFTWARE		= 1,
 	PERF_TYPE_TRACEPOINT		= 2,
+	PERF_TYPE_HW_CACHE		= 3,
 
 	/*
 	 * available TYPE space, raw is the max value.
@@ -56,6 +57,39 @@ enum attr_ids {
 };
 
 /*
+ * Generalized hardware cache counters:
+ *
+ *       { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D,
+	PERF_COUNT_HW_CACHE_L1I,
+	PERF_COUNT_HW_CACHE_L2,
+	PERF_COUNT_HW_CACHE_DTLB,
+	PERF_COUNT_HW_CACHE_ITLB,
+	PERF_COUNT_HW_CACHE_BPU,
+
+	PERF_COUNT_HW_CACHE_MAX,
+};
+
+enum hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ,
+	PERF_COUNT_HW_CACHE_OP_WRITE,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,
+};
+
+enum hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS,
+	PERF_COUNT_HW_CACHE_RESULT_MISS,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,
+};
+
+/*
  * Special "software" counters provided by the kernel, even if the hardware
  * does not support performance counters. These counters measure various
  * physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75ae767..5eacaaf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 
 	switch (attr->type) {
 	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
 		pmu = hw_perf_counter_init(counter);
 		break;
 
-- 
cgit v0.10.2


From 86847b62f0781ccc97a79936c9ed9dc818cff67b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Jun 2009 12:24:17 +0200
Subject: perf_counter tools: Add 'perf list' to list available events

perf list: List all the available event types which can be used in
-e (--event) options.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 5b99f04..32c0bb2 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -328,6 +328,7 @@ BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
+BUILTIN_OBJS += builtin-list.o
 
 PERFLIBS = $(LIB_FILE)
 EXTLIBS =
diff --git a/Documentation/perf_counter/builtin-list.c b/Documentation/perf_counter/builtin-list.c
new file mode 100644
index 0000000..fe60e37
--- /dev/null
+++ b/Documentation/perf_counter/builtin-list.c
@@ -0,0 +1,20 @@
+/*
+ * builtin-list.c
+ *
+ * Builtin list command: list all event types
+ *
+ * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+int cmd_list(int argc, const char **argv, const char *prefix)
+{
+	print_events();
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 130fd88..aeab9c4 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -495,11 +495,10 @@ static const char * const record_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
@@ -527,8 +526,6 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 {
 	int counter;
 
-	create_events_help(events_help_msg);
-
 	argc = parse_options(argc, argv, options, record_usage, 0);
 	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 9711e55..2cbf5a1 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -293,18 +293,17 @@ static const char * const stat_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "stat events on existing pid"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('l', "scale", &scale,
+	OPT_BOOLEAN('S', "scale", &scale,
 			    "scale/normalize counters"),
 	OPT_END()
 };
@@ -313,8 +312,6 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 {
 	page_size = sysconf(_SC_PAGE_SIZE);
 
-	create_events_help(events_help_msg);
-
 	memcpy(attrs, default_attrs, sizeof(attrs));
 
 	argc = parse_options(argc, argv, options, stat_usage, 0);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 98a6d53..f2e7312 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -606,11 +606,10 @@ static const char * const top_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_INTEGER('p', "pid", &target_pid,
@@ -648,8 +647,6 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
-	create_events_help(events_help_msg);
-
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
 		usage_with_options(top_usage, options);
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 5bfea57..e7de47d 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -20,4 +20,6 @@ extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
+
 #endif
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index ec7edb7..9ac7565 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -258,6 +258,7 @@ static void handle_internal_command(int argc, const char **argv)
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
 		{ "help", cmd_help, 0 },
+		{ "list", cmd_list, 0 },
 		{ "record", cmd_record, 0 },
 		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index de9a77c..150fbd2 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -274,31 +274,43 @@ again:
 	return 0;
 }
 
+static const char * const event_type_descriptors[] = {
+	"",
+	"Hardware event",
+	"Software event",
+	"Tracepoint event",
+	"Hardware cache event",
+};
+
 /*
- * Create the help text for the event symbols:
+ * Print the help text for the event symbols:
  */
-void create_events_help(char *events_help_msg)
+void print_events(void)
 {
-	unsigned int i;
-	char *str;
+	struct event_symbol *syms = event_symbols;
+	unsigned int i, type, prev_type = -1;
 
-	str = events_help_msg;
+	fprintf(stderr, "\n");
+	fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
 
-	str += sprintf(str,
-	"event name: [");
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
+		type = syms->type + 1;
+		if (type > ARRAY_SIZE(event_type_descriptors))
+			type = 0;
 
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		type = event_symbols[i].type;
-		id = event_symbols[i].config;
+		if (type != prev_type)
+			fprintf(stderr, "\n");
 
-		if (i)
-			str += sprintf(str, "|");
+		fprintf(stderr, "  %-30s [%s]\n", syms->symbol,
+			event_type_descriptors[type]);
 
-		str += sprintf(str, "%s",
-				event_symbols[i].symbol);
+		prev_type = type;
 	}
 
-	str += sprintf(str, "|rNNN]");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "  %-30s [raw hardware event descriptor]\n",
+		"rNNN");
+	fprintf(stderr, "\n");
+
+	exit(129);
 }
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 542971c..e3d5529 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -13,5 +13,5 @@ extern int parse_events(const struct option *opt, const char *str, int unset);
 
 #define EVENTS_HELP_MAX (128*1024)
 
-extern void create_events_help(char *help_msg);
+extern void print_events(void);
 
-- 
cgit v0.10.2


From 8faf3b547593bf6ea10df631e73204975273c4e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf_counter tools: Fix cache-event printout

Also standardize the cache printout (so that it can be pasted back
into the command) and sort out the aliases.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 150fbd2..e0820b4 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -53,45 +53,45 @@ static struct event_symbol event_symbols[] = {
 #define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
 
 static char *hw_event_names[] = {
-	"CPU cycles",
+	"cycles",
 	"instructions",
-	"cache references",
-	"cache misses",
+	"cache-references",
+	"cache-misses",
 	"branches",
-	"branch misses",
-	"bus cycles",
+	"branch-misses",
+	"bus-cycles",
 };
 
 static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
+	"cpu-clock-ticks",
+	"task-clock-ticks",
+	"page-faults",
+	"context-switches",
+	"CPU-migrations",
+	"minor-faults",
+	"major-faults",
 };
 
 #define MAX_ALIASES 8
 
 static char *hw_cache [][MAX_ALIASES] = {
-	{ "l1-d" ,	"l1d" ,	"l1", "l1-data-cache"			},
-	{ "l1-i" ,	"l1i" ,	"l1-instruction-cache"		},
-	{ "l2"  , },
-	{ "dtlb", },
-	{ "itlb", },
-	{ "bpu" , "btb", "branch-cache", NULL },
+	{ "L1-data"		, "l1-d", "l1d", "l1"				},
+	{ "L1-instruction"	, "l1-i", "l1i"					},
+	{ "L2"			, "l2"						},
+	{ "Data-TLB"		, "dtlb", "d-tlb"				},
+	{ "Instruction-TLB"	, "itlb", "i-tlb"				},
+	{ "Branch"		, "bpu" , "btb", "bpc"				},
 };
 
 static char *hw_cache_op [][MAX_ALIASES] = {
-	{ "read"	, "load" },
-	{ "write"	, "store" },
-	{ "prefetch"	, "speculative-read", "speculative-load" },
+	{ "Load"		, "read"					},
+	{ "Store"		, "write"					},
+	{ "Prefetch"		, "speculative-read", "speculative-load"	},
 };
 
 static char *hw_cache_result [][MAX_ALIASES] = {
-	{ "access", "ops" },
-	{ "miss", },
+	{ "Reference"		, "ops", "access"				},
+	{ "Miss"								},
 };
 
 char *event_name(int counter)
@@ -120,14 +120,14 @@ char *event_name(int counter)
 			return "unknown-ext-hardware-cache-type";
 
 		cache_op     = (config >>  8) & 0xff;
-		if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX)
-			return "unknown-ext-hardware-cache-op-type";
+		if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
+			return "unknown-ext-hardware-cache-op";
 
 		cache_result = (config >> 16) & 0xff;
-		if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX)
-			return "unknown-ext-hardware-cache-result-type";
+		if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
+			return "unknown-ext-hardware-cache-result";
 
-		sprintf(name, "%s:%s:%s",
+		sprintf(name, "%s-Cache-%s-%ses",
 			hw_cache[cache_type][0],
 			hw_cache_op[cache_op][0],
 			hw_cache_result[cache_result][0]);
-- 
cgit v0.10.2


From 386b05e3a2f3c5b0a9c5575060421cca0911648a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Jun 2009 14:56:33 +0200
Subject: perf_counter tools: Add help for perf list

Also update other areas of the help texts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/Documentation/perf_counter/Documentation/perf-list.txt
new file mode 100644
index 0000000..aa55a71
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-list.txt
@@ -0,0 +1,25 @@
+perf-list(1)
+==============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+None
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index 4d3416f..1dbc1ee 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -26,26 +26,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -a::
         system-wide collection
@@ -55,4 +39,4 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index a340e7b..5d95784 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -25,26 +25,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -i::
 --inherit::
@@ -79,4 +63,4 @@ $ perf stat -- make -j
 
 SEE ALSO
 --------
-linkperf:perf-tops[1]
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index 15251e4..c8eb7cf 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -23,26 +23,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -a::
         system-wide collection
@@ -52,4 +36,4 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/Documentation/perf_counter/Documentation/perf.txt
index e3d8b38..69c8325 100644
--- a/Documentation/perf_counter/Documentation/perf.txt
+++ b/Documentation/perf_counter/Documentation/perf.txt
@@ -20,4 +20,5 @@ and software features (software counters, tracepoints) as well.
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1], linkperf:perf-report[1]
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
-- 
cgit v0.10.2


From 502fc5c72a886ff9d4d7a596e65ecc4dd5e4d458 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 03:20:49 +0100
Subject: perf_counter tools: Uniform help printouts

Also add perf list to command-list.txt.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index a3894bf..0f32dc3 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -284,7 +284,7 @@ void list_common_cmds_help(void)
 			longest = strlen(common_cmds[i].name);
 	}
 
-	puts("The most commonly used perf commands are:");
+	puts(" The most commonly used perf commands are:");
 	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
 		printf("   %s   ", common_cmds[i].name);
 		mput_char(' ', longest - strlen(common_cmds[i].name));
@@ -426,16 +426,16 @@ int cmd_help(int argc, const char **argv, const char *prefix)
 			builtin_help_usage, 0);
 
 	if (show_all) {
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_commands("perf commands", &main_cmds, &other_cmds);
-		printf("%s\n", perf_more_info_string);
+		printf(" %s\n\n", perf_more_info_string);
 		return 0;
 	}
 
 	if (!argv[0]) {
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_common_cmds_help();
-		printf("\n%s\n", perf_more_info_string);
+		printf("\n %s\n\n", perf_more_info_string);
 		return 0;
 	}
 
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 4390292..f0b922c 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,7 +1,9 @@
+#
 # List of known perf commands.
-# command name				category [deprecated] [common]
-perf-record                             mainporcelain common
-perf-report                             mainporcelain common
-perf-stat                               mainporcelain common
-perf-top                                mainporcelain common
-
+# command name			category [deprecated] [common]
+#
+perf-record			mainporcelain common
+perf-report			mainporcelain common
+perf-stat			mainporcelain common
+perf-top			mainporcelain common
+perf-list			mainporcelain common
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 9ac7565..161824f 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -384,9 +384,9 @@ int main(int argc, const char **argv)
 			argv[0] += 2;
 	} else {
 		/* The user didn't specify a command; give them help */
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_common_cmds_help();
-		printf("\n%s\n", perf_more_info_string);
+		printf("\n %s\n\n", perf_more_info_string);
 		exit(1);
 	}
 	cmd = argv[0];
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index 551b6bc..e4d3533 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -385,7 +385,7 @@ int usage_with_options_internal(const char * const *usagestr,
 	if (!usagestr)
 		return PARSE_OPT_HELP;
 
-	fprintf(stderr, "usage: %s\n", *usagestr++);
+	fprintf(stderr, "\n usage: %s\n", *usagestr++);
 	while (*usagestr && **usagestr)
 		fprintf(stderr, "   or: %s\n", *usagestr++);
 	while (*usagestr) {
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
index 7a10421..2cad286 100644
--- a/Documentation/perf_counter/util/usage.c
+++ b/Documentation/perf_counter/util/usage.c
@@ -14,7 +14,7 @@ static void report(const char *prefix, const char *err, va_list params)
 
 static NORETURN void usage_builtin(const char *err)
 {
-	fprintf(stderr, "usage: %s\n", err);
+	fprintf(stderr, "\n usage: %s\n", err);
 	exit(129);
 }
 
-- 
cgit v0.10.2


From 6e6b754ffdb6415723686c733f13275397e44422 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 15 Apr 2008 22:39:31 +0200
Subject: perf_counter tools: Tidy up manpage details

Also fix a misalignment in usage string printing.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
index f85fed5..5143918 100644
--- a/Documentation/perf_counter/Documentation/perf-help.txt
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -1,5 +1,5 @@
 perf-help(1)
-===========
+============
 
 NAME
 ----
diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/Documentation/perf_counter/Documentation/perf-list.txt
index aa55a71..8290b94 100644
--- a/Documentation/perf_counter/Documentation/perf-list.txt
+++ b/Documentation/perf_counter/Documentation/perf-list.txt
@@ -1,5 +1,5 @@
 perf-list(1)
-==============
+============
 
 NAME
 ----
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
 SYNOPSIS
 --------
 [verse]
-'perf list
+'perf list'
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 5d95784..c368a72 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -1,5 +1,5 @@
 perf-stat(1)
-==========
+============
 
 NAME
 ----
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index c8eb7cf..539d012 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -1,5 +1,5 @@
 perf-top(1)
-==========
+===========
 
 NAME
 ----
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index e4d3533..b3affb1 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -387,7 +387,7 @@ int usage_with_options_internal(const char * const *usagestr,
 
 	fprintf(stderr, "\n usage: %s\n", *usagestr++);
 	while (*usagestr && **usagestr)
-		fprintf(stderr, "   or: %s\n", *usagestr++);
+		fprintf(stderr, "    or: %s\n", *usagestr++);
 	while (*usagestr) {
 		fprintf(stderr, "%s%s\n",
 				**usagestr ? "    " : "",
-- 
cgit v0.10.2


From 8035e4288078cb806e7dd6bafe4d3e54d44cab3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 15:19:13 +0200
Subject: perf_counter tools: Prepare for 'perf annotate'

Prepare for the 'perf annotate' implementation by splitting off
builtin-annotate.c from builtin-report.c.

( We keep this commit separate to ease the later librarization
  of the facilities that perf-report and perf-annotate shares. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/Documentation/perf_counter/Documentation/perf-annotate.txt
new file mode 100644
index 0000000..a9d6d5e
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-annotate.txt
@@ -0,0 +1,26 @@
+perf-annotate(1)
+==============
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and annotate functions
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf record.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 32c0bb2..0cbd5d6 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -323,12 +323,13 @@ LIB_OBJS += util/symbol.o
 LIB_OBJS += util/color.o
 LIB_OBJS += util/pager.o
 
+BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
-BUILTIN_OBJS += builtin-list.o
 
 PERFLIBS = $(LIB_FILE)
 EXTLIBS =
diff --git a/Documentation/perf_counter/builtin-annotate.c b/Documentation/perf_counter/builtin-annotate.c
new file mode 100644
index 0000000..d656484
--- /dev/null
+++ b/Documentation/perf_counter/builtin-annotate.c
@@ -0,0 +1,1291 @@
+/*
+ * builtin-annotate.c
+ *
+ * Builtin annotate command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+static int		full_paths;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
+{
+	int n = 0;
+
+	while (pathname[n] == cwd[n] && n < cwdlen)
+		++n;
+
+	return n;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	struct dso	 *dso;
+};
+
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+
+		if (cwd) {
+			int n = strcommon(filename);
+
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		if (percent < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		ret = color_fprintf(fp, color, "   %6.2f%%",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
+		ret += se->print(fp, self);
+	}
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count++;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static size_t output__fprintf(FILE *fp, uint64_t total_samples)
+{
+	struct hist_entry *pos;
+	struct sort_entry *se;
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	fprintf(fp, "\n");
+	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, "  %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int i;
+
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
+	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
+
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
+	}
+
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# (For more details, try: perf annotate --sort comm,dso,symbol)\n");
+		fprintf(fp, "#\n");
+	}
+	fprintf(fp, "\n");
+
+	return ret;
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total++;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_annotate(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+	output__fprintf(stdout, total);
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf annotate [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(annotate_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_annotate(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+
+	setup_sorting();
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(annotate_usage, options);
+
+	setup_pager();
+
+	return __cmd_annotate();
+}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index e7de47d..51d1682 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index f0b922c..eebce30 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -2,8 +2,9 @@
 # List of known perf commands.
 # command name			category [deprecated] [common]
 #
+perf-annotate			mainporcelain common
+perf-list			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
 perf-top			mainporcelain common
-perf-list			mainporcelain common
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 161824f..4eb7259 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -263,6 +263,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
+		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
 	};
 	int i;
@@ -402,9 +403,11 @@ int main(int argc, const char **argv)
 	while (1) {
 		static int done_help = 0;
 		static int was_alias = 0;
+
 		was_alias = run_argv(&argc, &argv);
 		if (errno != ENOENT)
 			break;
+
 		if (was_alias) {
 			fprintf(stderr, "Expansion of alias '%s' failed; "
 				"'%s' is not a perf-command\n",
-- 
cgit v0.10.2


From 0b73da3f40128eab6ca2a07508f424029a1edaeb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 15:48:52 +0200
Subject: perf_counter tools: Add 'perf annotate' feature

Add new perf sub-command to display annotated source code:

 $ perf annotate decode_tree_entry

------------------------------------------------
 Percent |	Source code & Disassembly of /home/mingo/git/git
------------------------------------------------
         :
         :	/home/mingo/git/git:     file format elf64-x86-64
         :
         :
         :	Disassembly of section .text:
         :
         :	00000000004a0da0 <decode_tree_entry>:
         :		*modep = mode;
         :		return str;
         :	}
         :
         :	static void decode_tree_entry(struct tree_desc *desc, const char *buf, unsigned long size)
         :	{
    3.82 :	  4a0da0:	41 54                	push   %r12
         :		const char *path;
         :		unsigned int mode, len;
         :
         :		if (size < 24 || buf[size - 21])
    0.17 :	  4a0da2:	48 83 fa 17          	cmp    $0x17,%rdx
         :		*modep = mode;
         :		return str;
         :	}
         :
         :	static void decode_tree_entry(struct tree_desc *desc, const char *buf, unsigned long size)
         :	{
    0.00 :	  4a0da6:	49 89 fc             	mov    %rdi,%r12
    0.00 :	  4a0da9:	55                   	push   %rbp
    3.37 :	  4a0daa:	53                   	push   %rbx
         :		const char *path;
         :		unsigned int mode, len;
         :
         :		if (size < 24 || buf[size - 21])
    0.08 :	  4a0dab:	76 73                	jbe    4a0e20 <decode_tree_entry+0x80>
    0.00 :	  4a0dad:	80 7c 16 eb 00       	cmpb   $0x0,-0x15(%rsi,%rdx,1)
    3.48 :	  4a0db2:	75 6c                	jne    4a0e20 <decode_tree_entry+0x80>
         :	static const char *get_mode(const char *str, unsigned int *modep)
         :	{
         :		unsigned char c;
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
    1.94 :	  4a0db4:	0f b6 06             	movzbl (%rsi),%eax
    0.39 :	  4a0db7:	3c 20                	cmp    $0x20,%al
    0.00 :	  4a0db9:	74 65                	je     4a0e20 <decode_tree_entry+0x80>
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    0.06 :	  4a0dbb:	89 c2                	mov    %eax,%edx
         :			if (c < '0' || c > '7')
    1.99 :	  4a0dbd:	31 ed                	xor    %ebp,%ebp
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    1.74 :	  4a0dbf:	48 8d 5e 01          	lea    0x1(%rsi),%rbx
         :			if (c < '0' || c > '7')
    0.00 :	  4a0dc3:	8d 42 d0             	lea    -0x30(%rdx),%eax
    0.17 :	  4a0dc6:	3c 07                	cmp    $0x7,%al
    0.00 :	  4a0dc8:	76 0d                	jbe    4a0dd7 <decode_tree_entry+0x37>
    0.00 :	  4a0dca:	eb 54                	jmp    4a0e20 <decode_tree_entry+0x80>
    0.00 :	  4a0dcc:	0f 1f 40 00          	nopl   0x0(%rax)
   16.57 :	  4a0dd0:	8d 42 d0             	lea    -0x30(%rdx),%eax
    0.14 :	  4a0dd3:	3c 07                	cmp    $0x7,%al
    0.00 :	  4a0dd5:	77 49                	ja     4a0e20 <decode_tree_entry+0x80>
         :				return NULL;
         :			mode = (mode << 3) + (c - '0');
    3.12 :	  4a0dd7:	0f b6 c2             	movzbl %dl,%eax
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    0.00 :	  4a0dda:	0f b6 13             	movzbl (%rbx),%edx
   16.74 :	  4a0ddd:	48 83 c3 01          	add    $0x1,%rbx
         :			if (c < '0' || c > '7')
         :				return NULL;
         :			mode = (mode << 3) + (c - '0');

The first column is the percentage of samples that arrived on that
particular line - relative to the total cost of the function.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/Documentation/perf_counter/Documentation/perf-annotate.txt
index a9d6d5e..c9dcade 100644
--- a/Documentation/perf_counter/Documentation/perf-annotate.txt
+++ b/Documentation/perf_counter/Documentation/perf-annotate.txt
@@ -3,7 +3,7 @@ perf-annotate(1)
 
 NAME
 ----
-perf-annotate - Read perf.data (created by perf record) and annotate functions
+perf-annotate - Read perf.data (created by perf record) and display annotated code
 
 SYNOPSIS
 --------
@@ -12,8 +12,11 @@ SYNOPSIS
 
 DESCRIPTION
 -----------
-This command displays the performance counter profile information recorded
-via perf record.
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
 
 OPTIONS
 -------
diff --git a/Documentation/perf_counter/builtin-annotate.c b/Documentation/perf_counter/builtin-annotate.c
index d656484..116a397 100644
--- a/Documentation/perf_counter/builtin-annotate.c
+++ b/Documentation/perf_counter/builtin-annotate.c
@@ -28,7 +28,7 @@
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
 
-static char		default_sort_order[] = "comm,dso";
+static char		default_sort_order[] = "comm,symbol";
 static char		*sort_order = default_sort_order;
 
 static int		input;
@@ -38,7 +38,6 @@ static int		dump_trace = 0;
 #define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
 
 static int		verbose;
-static int		full_paths;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -89,6 +88,7 @@ static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 static struct dso *vdso;
 
+
 static void dsos__add(struct dso *dso)
 {
 	list_add_tail(&dso->node, &dsos);
@@ -176,20 +176,6 @@ static int load_kernel(void)
 	return err;
 }
 
-static char __cwd[PATH_MAX];
-static char *cwd = __cwd;
-static int cwdlen;
-
-static int strcommon(const char *pathname)
-{
-	int n = 0;
-
-	while (pathname[n] == cwd[n] && n < cwdlen)
-		++n;
-
-	return n;
-}
-
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -215,17 +201,6 @@ static struct map *map__new(struct mmap_event *event)
 
 	if (self != NULL) {
 		const char *filename = event->filename;
-		char newfilename[PATH_MAX];
-
-		if (cwd) {
-			int n = strcommon(filename);
-
-			if (n == cwdlen) {
-				snprintf(newfilename, sizeof(newfilename),
-					 ".%s", filename + n);
-				filename = newfilename;
-			}
-		}
 
 		self->start = event->start;
 		self->end   = event->start + event->len;
@@ -669,44 +644,36 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 	return cmp;
 }
 
-static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+/*
+ * collect histogram counts
+ */
+static void hist_hit(struct hist_entry *he, uint64_t ip)
 {
-	struct sort_entry *se;
-	size_t ret;
+	unsigned int sym_size, offset;
+	struct symbol *sym = he->sym;
 
-	if (total_samples) {
-		double percent = self->count * 100.0 / total_samples;
-		char *color = PERF_COLOR_NORMAL;
+	he->count++;
 
-		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
-		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		if (percent < 0.5)
-			color = PERF_COLOR_GREEN;
+	if (!sym || !sym->hist)
+		return;
 
-		ret = color_fprintf(fp, color, "   %6.2f%%",
-				(self->count * 100.0) / total_samples);
-	} else
-		ret = fprintf(fp, "%12d ", self->count);
+	sym_size = sym->end - sym->start;
+	offset = ip - sym->start;
 
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		fprintf(fp, "  ");
-		ret += se->print(fp, self);
-	}
+	if (offset >= sym_size)
+		return;
 
-	ret += fprintf(fp, "\n");
+	sym->hist_sum++;
+	sym->hist[offset]++;
 
-	return ret;
+	if (verbose >= 3)
+		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
+			(void *)he->sym->start,
+			he->sym->name,
+			(void *)ip, ip - he->sym->start,
+			sym->hist[offset]);
 }
 
-/*
- * collect histogram counts
- */
-
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		struct symbol *sym, uint64_t ip, char level)
@@ -732,7 +699,8 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			he->count++;
+			hist_hit(he, ip);
+
 			return 0;
 		}
 
@@ -856,50 +824,6 @@ static void output__resort(void)
 	}
 }
 
-static size_t output__fprintf(FILE *fp, uint64_t total_samples)
-{
-	struct hist_entry *pos;
-	struct sort_entry *se;
-	struct rb_node *nd;
-	size_t ret = 0;
-
-	fprintf(fp, "\n");
-	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
-	fprintf(fp, "#\n");
-
-	fprintf(fp, "# Overhead");
-	list_for_each_entry(se, &hist_entry__sort_list, list)
-		fprintf(fp, "  %s", se->header);
-	fprintf(fp, "\n");
-
-	fprintf(fp, "# ........");
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		int i;
-
-		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header); i++)
-			fprintf(fp, ".");
-	}
-	fprintf(fp, "\n");
-
-	fprintf(fp, "#\n");
-
-	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct hist_entry, rb_node);
-		ret += hist_entry__fprintf(fp, pos, total_samples);
-	}
-
-	if (!strcmp(sort_order, default_sort_order)) {
-		fprintf(fp, "#\n");
-		fprintf(fp, "# (For more details, try: perf annotate --sort comm,dso,symbol)\n");
-		fprintf(fp, "#\n");
-	}
-	fprintf(fp, "\n");
-
-	return ret;
-}
-
 static void register_idle_thread(void)
 {
 	struct thread *thread = threads__findnew(0);
@@ -1106,6 +1030,149 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static int
+parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
+{
+	char *line = NULL, *tmp, *tmp2;
+	unsigned int offset;
+	size_t line_len;
+	__u64 line_ip;
+	int ret;
+	char *c;
+
+	if (getline(&line, &line_len, file) < 0)
+		return -1;
+	if (!line)
+		return -1;
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+	offset = 0;
+	ret = -2;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hexa addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		unsigned int hits = 0;
+		double percent = 0.0;
+		char *color = PERF_COLOR_NORMAL;
+
+		offset = line_ip - start;
+		if (offset < len)
+			hits = sym->hist[offset];
+
+		if (sym->hist_sum)
+			percent = 100.0 * hits / sym->hist_sum;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		else {
+			if (percent > 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		color_fprintf(stdout, color, " %7.2f", percent);
+		printf(" :	");
+		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
+	} else {
+		if (!*line)
+			printf("         :\n");
+		else
+			printf("         :	%s\n", line);
+	}
+
+	return 0;
+}
+
+static void annotate_sym(struct dso *dso, struct symbol *sym)
+{
+	char *filename = dso->name;
+	uint64_t start, end, len;
+	char command[PATH_MAX*2];
+	FILE *file;
+
+	if (!filename)
+		return;
+	if (dso == kernel_dso)
+		filename = vmlinux;
+
+	printf("\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
+	start = sym->obj_start;
+	if (!start)
+		start = sym->start;
+
+	end = start + sym->end - sym->start + 1;
+	len = sym->end - sym->start;
+
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+
+	if (verbose >= 3)
+		printf("doing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		if (parse_line(file, sym, start, len) < 0)
+			break;
+	}
+
+	pclose(file);
+}
+
+static void find_annotations(void)
+{
+	struct rb_node *nd;
+	struct dso *dso;
+	int count = 0;
+
+	list_for_each_entry(dso, &dsos, node) {
+
+		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
+			struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+
+			if (sym->hist) {
+				annotate_sym(dso, sym);
+				count++;
+			}
+		}
+	}
+
+	if (!count)
+		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
+}
+
 static int __cmd_annotate(void)
 {
 	int ret, rc = EXIT_FAILURE;
@@ -1140,16 +1207,6 @@ static int __cmd_annotate(void)
 		return EXIT_FAILURE;
 	}
 
-	if (!full_paths) {
-		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
-		}
-		cwdlen = strlen(cwd);
-	} else {
-		cwd = NULL;
-		cwdlen = 0;
-	}
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
@@ -1229,7 +1286,8 @@ more:
 
 	collapse__resort();
 	output__resort();
-	output__fprintf(stdout, total);
+
+	find_annotations();
 
 	return rc;
 }
@@ -1242,15 +1300,13 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
+		    "symbol to annotate"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
-	OPT_BOOLEAN('P', "full-paths", &full_paths,
-		    "Don't shorten the pathnames taking into account the cwd"),
 	OPT_END()
 };
 
@@ -1279,10 +1335,18 @@ int cmd_annotate(int argc, const char **argv, const char *prefix)
 
 	setup_sorting();
 
-	/*
-	 * Any (unrecognized) arguments left?
-	 */
-	if (argc)
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+
+		sym_hist_filter = argv[0];
+	}
+
+	if (!sym_hist_filter)
 		usage_with_options(annotate_usage, options);
 
 	setup_pager();
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index a06bbfba..23f4f7b 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -7,21 +7,36 @@
 #include <gelf.h>
 #include <elf.h>
 
+const char *sym_hist_filter;
+
 static struct symbol *symbol__new(uint64_t start, uint64_t len,
-				  const char *name, unsigned int priv_size)
+				  const char *name, unsigned int priv_size,
+				  uint64_t obj_start, int verbose)
 {
 	size_t namelen = strlen(name) + 1;
-	struct symbol *self = malloc(priv_size + sizeof(*self) + namelen);
+	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
 
-	if (self != NULL) {
-		if (priv_size) {
-			memset(self, 0, priv_size);
-			self = ((void *)self) + priv_size;
-		}
-		self->start = start;
-		self->end   = start + len - 1;
-		memcpy(self->name, name, namelen);
+	if (!self)
+		return NULL;
+
+	if (verbose >= 2)
+		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
+			(__u64)start, len, name, self->hist, (void *)obj_start);
+
+	self->obj_start= obj_start;
+	self->hist = NULL;
+	self->hist_sum = 0;
+
+	if (sym_hist_filter && !strcmp(name, sym_hist_filter))
+		self->hist = calloc(sizeof(__u64), len);
+
+	if (priv_size) {
+		memset(self, 0, priv_size);
+		self = ((void *)self) + priv_size;
 	}
+	self->start = start;
+	self->end   = start + len - 1;
+	memcpy(self->name, name, namelen);
 
 	return self;
 }
@@ -166,7 +181,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verb
 		 * Well fix up the end later, when we have all sorted.
 		 */
 		sym = symbol__new(start, 0xdead, line + len + 2,
-				  self->sym_priv_size);
+				  self->sym_priv_size, 0, verbose);
 
 		if (sym == NULL)
 			goto out_delete_line;
@@ -272,7 +287,7 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
 				       GElf_Shdr *shdr_dynsym,
-				       size_t dynsym_idx)
+				       size_t dynsym_idx, int verbose)
 {
 	uint32_t nr_rel_entries, idx;
 	GElf_Sym sym;
@@ -335,7 +350,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				 "%s@plt", elf_sym__name(&sym, symstrs));
 
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size);
+					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
 				return -1;
 
@@ -353,7 +368,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				 "%s@plt", elf_sym__name(&sym, symstrs));
 
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size);
+					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
 				return -1;
 
@@ -410,7 +425,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	if (sec_dynsym != NULL) {
 		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
 						 sec_dynsym, &shdr,
-						 dynsym_idx);
+						 dynsym_idx, verbose);
 		if (nr < 0)
 			goto out_elf_end;
 	}
@@ -444,6 +459,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
+		uint64_t obj_start;
 
 		if (!elf_sym__is_function(&sym))
 			continue;
@@ -453,11 +469,13 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 			goto out_elf_end;
 
 		gelf_getshdr(sec, &shdr);
+		obj_start = sym.st_value;
+
 		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
 
 		f = symbol__new(sym.st_value, sym.st_size,
 				elf_sym__name(&sym, symstrs),
-				self->sym_priv_size);
+				self->sym_priv_size, obj_start, verbose);
 		if (!f)
 			goto out_elf_end;
 
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index e23cc31..4839d68 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -9,6 +9,9 @@ struct symbol {
 	struct rb_node	rb_node;
 	__u64		start;
 	__u64		end;
+	__u64		obj_start;
+	__u64		hist_sum;
+	__u64		*hist;
 	char		name[0];
 };
 
@@ -20,6 +23,8 @@ struct dso {
 	char		 name[0];
 };
 
+const char *sym_hist_filter;
+
 typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
 
 struct dso *dso__new(const char *name, unsigned int sym_priv_size);
-- 
cgit v0.10.2


From 864709302a80f26fa9da3be5b47304f0b8bae192 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 20:33:43 +0200
Subject: perf_counter tools: Move from Documentation/perf_counter/ to
 tools/perf/

Several people have suggested that 'perf' has become a full-fledged
tool that should be moved out of Documentation/. Move it to the
(new) tools/ directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
deleted file mode 100644
index d69a759..0000000
--- a/Documentation/perf_counter/.gitignore
+++ /dev/null
@@ -1,16 +0,0 @@
-PERF-BUILD-OPTIONS
-PERF-CFLAGS
-PERF-GUI-VARS
-PERF-VERSION-FILE
-perf
-perf-help
-perf-record
-perf-report
-perf-stat
-perf-top
-perf*.1
-perf*.xml
-common-cmds.h
-tags
-TAGS
-cscope*
diff --git a/Documentation/perf_counter/Documentation/Makefile b/Documentation/perf_counter/Documentation/Makefile
deleted file mode 100644
index 5457192..0000000
--- a/Documentation/perf_counter/Documentation/Makefile
+++ /dev/null
@@ -1,300 +0,0 @@
-MAN1_TXT= \
-	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
-		$(wildcard perf-*.txt)) \
-	perf.txt
-MAN5_TXT=
-MAN7_TXT=
-
-MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
-MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
-MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
-
-DOC_HTML=$(MAN_HTML)
-
-ARTICLES =
-# with their own formatting rules.
-SP_ARTICLES =
-API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
-SP_ARTICLES += $(API_DOCS)
-SP_ARTICLES += technical/api-index
-
-DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
-
-DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
-DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
-DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
-
-prefix?=$(HOME)
-bindir?=$(prefix)/bin
-htmldir?=$(prefix)/share/doc/perf-doc
-pdfdir?=$(prefix)/share/doc/perf-doc
-mandir?=$(prefix)/share/man
-man1dir=$(mandir)/man1
-man5dir=$(mandir)/man5
-man7dir=$(mandir)/man7
-# DESTDIR=
-
-ASCIIDOC=asciidoc
-ASCIIDOC_EXTRA =
-MANPAGE_XSL = manpage-normal.xsl
-XMLTO_EXTRA =
-INSTALL?=install
-RM ?= rm -f
-DOC_REF = origin/man
-HTML_REF = origin/html
-
-infodir?=$(prefix)/share/info
-MAKEINFO=makeinfo
-INSTALL_INFO=install-info
-DOCBOOK2X_TEXI=docbook2x-texi
-DBLATEX=dblatex
-ifndef PERL_PATH
-	PERL_PATH = /usr/bin/perl
-endif
-
--include ../config.mak.autogen
--include ../config.mak
-
-#
-# For asciidoc ...
-#	-7.1.2,	no extra settings are needed.
-#	8.0-,	set ASCIIDOC8.
-#
-
-#
-# For docbook-xsl ...
-#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
-#	1.69.0,		no extra settings are needed?
-#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
-#	1.71.1,		no extra settings are needed?
-#	1.72.0,		set DOCBOOK_XSL_172.
-#	1.73.0-,	set ASCIIDOC_NO_ROFF
-#
-
-#
-# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
-# of 'the ".ft C" problem' in your generated manpages, and you
-# instead ended up with weird characters around callouts, try
-# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
-#
-
-ifdef ASCIIDOC8
-ASCIIDOC_EXTRA += -a asciidoc7compatible
-endif
-ifdef DOCBOOK_XSL_172
-ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
-MANPAGE_XSL = manpage-1.72.xsl
-else
-	ifdef ASCIIDOC_NO_ROFF
-	# docbook-xsl after 1.72 needs the regular XSL, but will not
-	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
-	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
-	endif
-endif
-ifdef MAN_BOLD_LITERAL
-XMLTO_EXTRA += -m manpage-bold-literal.xsl
-endif
-ifdef DOCBOOK_SUPPRESS_SP
-XMLTO_EXTRA += -m manpage-suppress-sp.xsl
-endif
-
-SHELL_PATH ?= $(SHELL)
-# Shell quote;
-SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
-
-#
-# Please note that there is a minor bug in asciidoc.
-# The version after 6.0.3 _will_ include the patch found here:
-#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
-#
-# Until that version is released you may have to apply the patch
-# yourself - yes, all 6 characters of it!
-#
-
-QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
-QUIET_SUBDIR1  =
-
-ifneq ($(findstring $(MAKEFLAGS),w),w)
-PRINT_DIR = --no-print-directory
-else # "make -w"
-NO_SUBDIR = :
-endif
-
-ifneq ($(findstring $(MAKEFLAGS),s),s)
-ifndef V
-	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@;
-	QUIET_XMLTO	= @echo '   ' XMLTO $@;
-	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@;
-	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@;
-	QUIET_DBLATEX	= @echo '   ' DBLATEX $@;
-	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@;
-	QUIET_GEN	= @echo '   ' GEN $@;
-	QUIET_STDERR	= 2> /dev/null
-	QUIET_SUBDIR0	= +@subdir=
-	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
-			  $(MAKE) $(PRINT_DIR) -C $$subdir
-	export V
-endif
-endif
-
-all: html man
-
-html: $(DOC_HTML)
-
-$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
-
-man: man1 man5 man7
-man1: $(DOC_MAN1)
-man5: $(DOC_MAN5)
-man7: $(DOC_MAN7)
-
-info: perf.info perfman.info
-
-pdf: user-manual.pdf
-
-install: install-man
-
-install-man: man
-	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
-#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
-#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
-	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
-#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
-#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
-
-install-info: info
-	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
-	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
-	if test -r $(DESTDIR)$(infodir)/dir; then \
-	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
-	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
-	else \
-	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
-	fi
-
-install-pdf: pdf
-	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
-	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
-
-install-html: html
-	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
-
-../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
-
--include ../PERF-VERSION-FILE
-
-#
-# Determine "include::" file references in asciidoc files.
-#
-doc.dep : $(wildcard *.txt) build-docdep.perl
-	$(QUIET_GEN)$(RM) $@+ $@ && \
-	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
-	mv $@+ $@
-
--include doc.dep
-
-cmds_txt = cmds-ancillaryinterrogators.txt \
-	cmds-ancillarymanipulators.txt \
-	cmds-mainporcelain.txt \
-	cmds-plumbinginterrogators.txt \
-	cmds-plumbingmanipulators.txt \
-	cmds-synchingrepositories.txt \
-	cmds-synchelpers.txt \
-	cmds-purehelpers.txt \
-	cmds-foreignscminterface.txt
-
-$(cmds_txt): cmd-list.made
-
-cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
-	$(QUIET_GEN)$(RM) $@ && \
-	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
-	date >$@
-
-clean:
-	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
-	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
-	$(RM) howto-index.txt howto/*.html doc.dep
-	$(RM) technical/api-*.html technical/api-index.txt
-	$(RM) $(cmds_txt) *.made
-
-$(MAN_HTML): %.html : %.txt
-	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
-	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
-		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
-	mv $@+ $@
-
-%.1 %.5 %.7 : %.xml
-	$(QUIET_XMLTO)$(RM) $@ && \
-	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
-
-%.xml : %.txt
-	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
-	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
-		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
-	mv $@+ $@
-
-XSLT = docbook.xsl
-XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
-
-user-manual.html: user-manual.xml
-	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
-
-perf.info: user-manual.texi
-	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
-
-user-manual.texi: user-manual.xml
-	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
-	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
-	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
-	rm $@++ && \
-	mv $@+ $@
-
-user-manual.pdf: user-manual.xml
-	$(QUIET_DBLATEX)$(RM) $@+ $@ && \
-	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
-	mv $@+ $@
-
-perfman.texi: $(MAN_XML) cat-texi.perl
-	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
-	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
-		--to-stdout $(xml) &&) true) > $@++ && \
-	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
-	rm $@++ && \
-	mv $@+ $@
-
-perfman.info: perfman.texi
-	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
-
-$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
-	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
-	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
-	mv $@+ $@
-
-howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
-	$(QUIET_GEN)$(RM) $@+ $@ && \
-	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
-	mv $@+ $@
-
-$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
-	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
-
-WEBDOC_DEST = /pub/software/tools/perf/docs
-
-$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
-	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
-	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
-	mv $@+ $@
-
-install-webdoc : html
-	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
-
-quick-install: quick-install-man
-
-quick-install-man:
-	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
-
-quick-install-html:
-	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
-
-.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/Documentation/perf_counter/Documentation/asciidoc.conf b/Documentation/perf_counter/Documentation/asciidoc.conf
deleted file mode 100644
index 356b23a..0000000
--- a/Documentation/perf_counter/Documentation/asciidoc.conf
+++ /dev/null
@@ -1,91 +0,0 @@
-## linkperf: macro
-#
-# Usage: linkperf:command[manpage-section]
-#
-# Note, {0} is the manpage section, while {target} is the command.
-#
-# Show PERF link as: <command>(<section>); if section is defined, else just show
-# the command.
-
-[macros]
-(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
-
-[attributes]
-asterisk=&#42;
-plus=&#43;
-caret=&#94;
-startsb=&#91;
-endsb=&#93;
-tilde=&#126;
-
-ifdef::backend-docbook[]
-[linkperf-inlinemacro]
-{0%{target}}
-{0#<citerefentry>}
-{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
-{0#</citerefentry>}
-endif::backend-docbook[]
-
-ifdef::backend-docbook[]
-ifndef::perf-asciidoc-no-roff[]
-# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
-# v1.72 breaks with this because it replaces dots not in roff requests.
-[listingblock]
-<example><title>{title}</title>
-<literallayout>
-ifdef::doctype-manpage[]
-&#10;.ft C&#10;
-endif::doctype-manpage[]
-|
-ifdef::doctype-manpage[]
-&#10;.ft&#10;
-endif::doctype-manpage[]
-</literallayout>
-{title#}</example>
-endif::perf-asciidoc-no-roff[]
-
-ifdef::perf-asciidoc-no-roff[]
-ifdef::doctype-manpage[]
-# The following two small workarounds insert a simple paragraph after screen
-[listingblock]
-<example><title>{title}</title>
-<literallayout>
-|
-</literallayout><simpara></simpara>
-{title#}</example>
-
-[verseblock]
-<formalpara{id? id="{id}"}><title>{title}</title><para>
-{title%}<literallayout{id? id="{id}"}>
-{title#}<literallayout>
-|
-</literallayout>
-{title#}</para></formalpara>
-{title%}<simpara></simpara>
-endif::doctype-manpage[]
-endif::perf-asciidoc-no-roff[]
-endif::backend-docbook[]
-
-ifdef::doctype-manpage[]
-ifdef::backend-docbook[]
-[header]
-template::[header-declarations]
-<refentry>
-<refmeta>
-<refentrytitle>{mantitle}</refentrytitle>
-<manvolnum>{manvolnum}</manvolnum>
-<refmiscinfo class="source">perf</refmiscinfo>
-<refmiscinfo class="version">{perf_version}</refmiscinfo>
-<refmiscinfo class="manual">perf Manual</refmiscinfo>
-</refmeta>
-<refnamediv>
-  <refname>{manname}</refname>
-  <refpurpose>{manpurpose}</refpurpose>
-</refnamediv>
-endif::backend-docbook[]
-endif::doctype-manpage[]
-
-ifdef::backend-xhtml11[]
-[linkperf-inlinemacro]
-<a href="{target}.html">{target}{0?({0})}</a>
-endif::backend-xhtml11[]
diff --git a/Documentation/perf_counter/Documentation/manpage-1.72.xsl b/Documentation/perf_counter/Documentation/manpage-1.72.xsl
deleted file mode 100644
index b4d315c..0000000
--- a/Documentation/perf_counter/Documentation/manpage-1.72.xsl
+++ /dev/null
@@ -1,14 +0,0 @@
-<!-- manpage-1.72.xsl:
-     special settings for manpages rendered from asciidoc+docbook
-     handles peculiarities in docbook-xsl 1.72.0 -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-		version="1.0">
-
-<xsl:import href="manpage-base.xsl"/>
-
-<!-- these are the special values for the roff control characters
-     needed for docbook-xsl 1.72.0 -->
-<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
-<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
-
-</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-base.xsl b/Documentation/perf_counter/Documentation/manpage-base.xsl
deleted file mode 100644
index a264fa61..0000000
--- a/Documentation/perf_counter/Documentation/manpage-base.xsl
+++ /dev/null
@@ -1,35 +0,0 @@
-<!-- manpage-base.xsl:
-     special formatting for manpages rendered from asciidoc+docbook -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-		version="1.0">
-
-<!-- these params silence some output from xmlto -->
-<xsl:param name="man.output.quietly" select="1"/>
-<xsl:param name="refentry.meta.get.quietly" select="1"/>
-
-<!-- convert asciidoc callouts to man page format;
-     git.docbook.backslash and git.docbook.dot params
-     must be supplied by another XSL file or other means -->
-<xsl:template match="co">
-	<xsl:value-of select="concat(
-			      $git.docbook.backslash,'fB(',
-			      substring-after(@id,'-'),')',
-			      $git.docbook.backslash,'fR')"/>
-</xsl:template>
-<xsl:template match="calloutlist">
-	<xsl:value-of select="$git.docbook.dot"/>
-	<xsl:text>sp&#10;</xsl:text>
-	<xsl:apply-templates/>
-	<xsl:text>&#10;</xsl:text>
-</xsl:template>
-<xsl:template match="callout">
-	<xsl:value-of select="concat(
-			      $git.docbook.backslash,'fB',
-			      substring-after(@arearefs,'-'),
-			      '. ',$git.docbook.backslash,'fR')"/>
-	<xsl:apply-templates/>
-	<xsl:value-of select="$git.docbook.dot"/>
-	<xsl:text>br&#10;</xsl:text>
-</xsl:template>
-
-</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl b/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
deleted file mode 100644
index 608eb5d..0000000
--- a/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
+++ /dev/null
@@ -1,17 +0,0 @@
-<!-- manpage-bold-literal.xsl:
-     special formatting for manpages rendered from asciidoc+docbook -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-		version="1.0">
-
-<!-- render literal text as bold (instead of plain or monospace);
-     this makes literal text easier to distinguish in manpages
-     viewed on a tty -->
-<xsl:template match="literal">
-	<xsl:value-of select="$git.docbook.backslash"/>
-	<xsl:text>fB</xsl:text>
-	<xsl:apply-templates/>
-	<xsl:value-of select="$git.docbook.backslash"/>
-	<xsl:text>fR</xsl:text>
-</xsl:template>
-
-</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-normal.xsl b/Documentation/perf_counter/Documentation/manpage-normal.xsl
deleted file mode 100644
index a48f5b1..0000000
--- a/Documentation/perf_counter/Documentation/manpage-normal.xsl
+++ /dev/null
@@ -1,13 +0,0 @@
-<!-- manpage-normal.xsl:
-     special settings for manpages rendered from asciidoc+docbook
-     handles anything we want to keep away from docbook-xsl 1.72.0 -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-		version="1.0">
-
-<xsl:import href="manpage-base.xsl"/>
-
-<!-- these are the normal values for the roff control characters -->
-<xsl:param name="git.docbook.backslash">\</xsl:param>
-<xsl:param name="git.docbook.dot"	>.</xsl:param>
-
-</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl b/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
deleted file mode 100644
index a63c763..0000000
--- a/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
+++ /dev/null
@@ -1,21 +0,0 @@
-<!-- manpage-suppress-sp.xsl:
-     special settings for manpages rendered from asciidoc+docbook
-     handles erroneous, inline .sp in manpage output of some
-     versions of docbook-xsl -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-		version="1.0">
-
-<!-- attempt to work around spurious .sp at the tail of the line
-     that some versions of docbook stylesheets seem to add -->
-<xsl:template match="simpara">
-  <xsl:variable name="content">
-    <xsl:apply-templates/>
-  </xsl:variable>
-  <xsl:value-of select="normalize-space($content)"/>
-  <xsl:if test="not(ancestor::authorblurb) and
-                not(ancestor::personblurb)">
-    <xsl:text>&#10;&#10;</xsl:text>
-  </xsl:if>
-</xsl:template>
-
-</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/Documentation/perf_counter/Documentation/perf-annotate.txt
deleted file mode 100644
index c9dcade..0000000
--- a/Documentation/perf_counter/Documentation/perf-annotate.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-perf-annotate(1)
-==============
-
-NAME
-----
-perf-annotate - Read perf.data (created by perf record) and display annotated code
-
-SYNOPSIS
---------
-[verse]
-'perf annotate' [-i <file> | --input=file] symbol_name
-
-DESCRIPTION
------------
-This command reads the input file and displays an annotated version of the
-code. If the object file has debug symbols then the source code will be
-displayed alongside assembly code.
-
-If there is no debug info in the object, then annotated assembly is displayed.
-
-OPTIONS
--------
--i::
---input=::
-        Input file name. (default: perf.data)
-
-SEE ALSO
---------
-linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
deleted file mode 100644
index 5143918..0000000
--- a/Documentation/perf_counter/Documentation/perf-help.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-perf-help(1)
-============
-
-NAME
-----
-perf-help - display help information about perf
-
-SYNOPSIS
---------
-'perf help' [-a|--all] [COMMAND]
-
-DESCRIPTION
------------
-
-With no options and no COMMAND given, the synopsis of the 'perf'
-command and a list of the most commonly used perf commands are printed
-on the standard output.
-
-If the option '--all' or '-a' is given, then all available commands are
-printed on the standard output.
-
-If a perf command is named, a manual page for that command is brought
-up. The 'man' program is used by default for this purpose, but this
-can be overridden by other options or configuration variables.
-
-Note that `perf --help ...` is identical to `perf help ...` because the
-former is internally converted into the latter.
-
-OPTIONS
--------
--a::
---all::
-	Prints all the available commands on the standard output. This
-	option supersedes any other option.
-
-PERF
-----
-Part of the linkperf:perf[1] suite
diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/Documentation/perf_counter/Documentation/perf-list.txt
deleted file mode 100644
index 8290b94..0000000
--- a/Documentation/perf_counter/Documentation/perf-list.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-perf-list(1)
-============
-
-NAME
-----
-perf-list - List all symbolic event types
-
-SYNOPSIS
---------
-[verse]
-'perf list'
-
-DESCRIPTION
------------
-This command displays the symbolic event types which can be selected in the
-various perf commands with the -e option.
-
-OPTIONS
--------
-None
-
-SEE ALSO
---------
-linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
deleted file mode 100644
index 1dbc1ee..0000000
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-perf-record(1)
-==============
-
-NAME
-----
-perf-record - Run a command and record its profile into perf.data
-
-SYNOPSIS
---------
-[verse]
-'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
-'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
-
-DESCRIPTION
------------
-This command runs a command and gathers a performance counter profile
-from it, into perf.data - without displaying anything.
-
-This file can then be inspected later on, using 'perf report'.
-
-
-OPTIONS
--------
-<command>...::
-	Any command you can specify in a shell.
-
--e::
---event=::
-	Select the PMU event. Selection can be a symbolic event name
-	(use 'perf list' to list all events) or a raw PMU
-	event (eventsel+umask) in the form of rNNN where NNN is a
-	 hexadecimal event descriptor.
-
--a::
-        system-wide collection
-
--l::
-        scale counter values
-
-SEE ALSO
---------
-linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
deleted file mode 100644
index 52d3fc6..0000000
--- a/Documentation/perf_counter/Documentation/perf-report.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-perf-report(1)
-==============
-
-NAME
-----
-perf-report - Read perf.data (created by perf record) and display the profile
-
-SYNOPSIS
---------
-[verse]
-'perf report' [-i <file> | --input=file]
-
-DESCRIPTION
------------
-This command displays the performance counter profile information recorded
-via perf report.
-
-OPTIONS
--------
--i::
---input=::
-        Input file name. (default: perf.data)
-
-SEE ALSO
---------
-linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
deleted file mode 100644
index c368a72..0000000
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-perf-stat(1)
-============
-
-NAME
-----
-perf-stat - Run a command and gather performance counter statistics
-
-SYNOPSIS
---------
-[verse]
-'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
-'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
-
-DESCRIPTION
------------
-This command runs a command and gathers performance counter statistics
-from it.
-
-
-OPTIONS
--------
-<command>...::
-	Any command you can specify in a shell.
-
-
--e::
---event=::
-	Select the PMU event. Selection can be a symbolic event name
-	(use 'perf list' to list all events) or a raw PMU
-	event (eventsel+umask) in the form of rNNN where NNN is a
-	 hexadecimal event descriptor.
-
--i::
---inherit::
-        child tasks inherit counters
--p::
---pid=<pid>::
-        stat events on existing pid
-
--a::
-        system-wide collection
-
--l::
-        scale counter values
-
-EXAMPLES
---------
-
-$ perf stat -- make -j
-
- Performance counter stats for 'make -j':
-
-    8117.370256  task clock ticks     #      11.281 CPU utilization factor
-            678  context switches     #       0.000 M/sec
-            133  CPU migrations       #       0.000 M/sec
-         235724  pagefaults           #       0.029 M/sec
-    24821162526  CPU cycles           #    3057.784 M/sec
-    18687303457  instructions         #    2302.138 M/sec
-      172158895  cache references     #      21.209 M/sec
-       27075259  cache misses         #       3.335 M/sec
-
- Wall-clock time elapsed:   719.554352 msecs
-
-SEE ALSO
---------
-linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
deleted file mode 100644
index 539d012..0000000
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-perf-top(1)
-===========
-
-NAME
-----
-perf-top - Run a command and profile it
-
-SYNOPSIS
---------
-[verse]
-'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
-
-DESCRIPTION
------------
-This command runs a command and gathers a performance counter profile
-from it.
-
-
-OPTIONS
--------
-<command>...::
-	Any command you can specify in a shell.
-
--e::
---event=::
-	Select the PMU event. Selection can be a symbolic event name
-	(use 'perf list' to list all events) or a raw PMU
-	event (eventsel+umask) in the form of rNNN where NNN is a
-	 hexadecimal event descriptor.
-
--a::
-        system-wide collection
-
--l::
-        scale counter values
-
-SEE ALSO
---------
-linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/Documentation/perf_counter/Documentation/perf.txt
deleted file mode 100644
index 69c8325..0000000
--- a/Documentation/perf_counter/Documentation/perf.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-perf(1)
-=======
-
-NAME
-----
-perf - Performance analysis tools for Linux
-
-SYNOPSIS
---------
-[verse]
-'perf' [--version] [--help] COMMAND [ARGS]
-
-DESCRIPTION
------------
-Performance counters for Linux are are a new kernel-based subsystem
-that provide a framework for all things performance analysis. It
-covers hardware level (CPU/PMU, Performance Monitoring Unit) features
-and software features (software counters, tracepoints) as well.
-
-SEE ALSO
---------
-linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1], linkperf:perf-report[1],
-linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
deleted file mode 100644
index 0cbd5d6..0000000
--- a/Documentation/perf_counter/Makefile
+++ /dev/null
@@ -1,929 +0,0 @@
-# The default target of this Makefile is...
-all::
-
-# Define V=1 to have a more verbose compile.
-#
-# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
-# or vsnprintf() return -1 instead of number of characters which would
-# have been written to the final string if enough space had been available.
-#
-# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
-# when attempting to read from an fopen'ed directory.
-#
-# Define NO_OPENSSL environment variable if you do not have OpenSSL.
-# This also implies MOZILLA_SHA1.
-#
-# Define CURLDIR=/foo/bar if your curl header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define EXPATDIR=/foo/bar if your expat header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
-#
-# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
-# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
-#
-# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
-# do not support the 'size specifiers' introduced by C99, namely ll, hh,
-# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
-# some C compilers supported these specifiers prior to C99 as an extension.
-#
-# Define NO_STRCASESTR if you don't have strcasestr.
-#
-# Define NO_MEMMEM if you don't have memmem.
-#
-# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
-# If your compiler also does not support long long or does not have
-# strtoull, define NO_STRTOULL.
-#
-# Define NO_SETENV if you don't have setenv in the C library.
-#
-# Define NO_UNSETENV if you don't have unsetenv in the C library.
-#
-# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
-#
-# Define NO_SYS_SELECT_H if you don't have sys/select.h.
-#
-# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
-# Enable it on Windows.  By default, symrefs are still used.
-#
-# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
-# tests.  These tests take up a significant amount of the total test time
-# but are not needed unless you plan to talk to SVN repos.
-#
-# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
-# installed in /sw, but don't want PERF to link against any libraries
-# installed there.  If defined you may specify your own (or Fink's)
-# include directories and library directories by defining CFLAGS
-# and LDFLAGS appropriately.
-#
-# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
-# have DarwinPorts installed in /opt/local, but don't want PERF to
-# link against any libraries installed there.  If defined you may
-# specify your own (or DarwinPort's) include directories and
-# library directories by defining CFLAGS and LDFLAGS appropriately.
-#
-# Define PPC_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for PowerPC.
-#
-# Define ARM_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for ARM.
-#
-# Define MOZILLA_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
-# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
-# choice) has very fast version optimized for i586.
-#
-# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
-#
-# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
-#
-# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
-# Patrick Mauritz).
-#
-# Define NO_MMAP if you want to avoid mmap.
-#
-# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
-#
-# Define NO_PREAD if you have a problem with pread() system call (e.g.
-# cygwin.dll before v1.5.22).
-#
-# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
-# generally faster on your platform than accessing the working directory.
-#
-# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
-# the executable mode bit, but doesn't really do so.
-#
-# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
-#
-# Define NO_SOCKADDR_STORAGE if your platform does not have struct
-# sockaddr_storage.
-#
-# Define NO_ICONV if your libc does not properly support iconv.
-#
-# Define OLD_ICONV if your library has an old iconv(), where the second
-# (input buffer pointer) parameter is declared with type (const char **).
-#
-# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
-#
-# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
-# that tells runtime paths to dynamic libraries;
-# "-Wl,-rpath=/path/lib" is used instead.
-#
-# Define USE_NSEC below if you want perf to care about sub-second file mtimes
-# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
-# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
-# randomly break unless your underlying filesystem supports those sub-second
-# times (my ext3 doesn't).
-#
-# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
-# "st_ctim"
-#
-# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
-# available.  This automatically turns USE_NSEC off.
-#
-# Define USE_STDEV below if you want perf to care about the underlying device
-# change being considered an inode change from the update-index perspective.
-#
-# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
-# field that counts the on-disk footprint in 512-byte blocks.
-#
-# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
-#
-# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
-#
-# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
-# MakeMaker (e.g. using ActiveState under Cygwin).
-#
-# Define NO_PERL if you do not want Perl scripts or libraries at all.
-#
-# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
-# is a simplified version of the merge sort used in glibc. This is
-# recommended if Git triggers O(n^2) behavior in your platform's qsort().
-#
-# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
-# your external grep (e.g., if your system lacks grep, if its grep is
-# broken, or spawning external process is slower than built-in grep perf has).
-
-PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-	@$(SHELL_PATH) util/PERF-VERSION-GEN
--include PERF-VERSION-FILE
-
-uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
-uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
-uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
-uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
-uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
-uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
-
-# CFLAGS and LDFLAGS are for the users to override from the command line.
-
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
-LDFLAGS = -lpthread -lrt -lelf
-ALL_CFLAGS = $(CFLAGS)
-ALL_LDFLAGS = $(LDFLAGS)
-STRIP ?= strip
-
-# Among the variables below, these:
-#   perfexecdir
-#   template_dir
-#   mandir
-#   infodir
-#   htmldir
-#   ETC_PERFCONFIG (but not sysconfdir)
-# can be specified as a relative path some/where/else;
-# this is interpreted as relative to $(prefix) and "perf" at
-# runtime figures out where they are based on the path to the executable.
-# This can help installing the suite in a relocatable way.
-
-prefix = $(HOME)
-bindir_relative = bin
-bindir = $(prefix)/$(bindir_relative)
-mandir = share/man
-infodir = share/info
-perfexecdir = libexec/perf-core
-sharedir = $(prefix)/share
-template_dir = share/perf-core/templates
-htmldir = share/doc/perf-doc
-ifeq ($(prefix),/usr)
-sysconfdir = /etc
-ETC_PERFCONFIG = $(sysconfdir)/perfconfig
-else
-sysconfdir = $(prefix)/etc
-ETC_PERFCONFIG = etc/perfconfig
-endif
-lib = lib
-# DESTDIR=
-
-export prefix bindir sharedir sysconfdir
-
-CC = gcc
-AR = ar
-RM = rm -f
-TAR = tar
-FIND = find
-INSTALL = install
-RPMBUILD = rpmbuild
-PTHREAD_LIBS = -lpthread
-
-# sparse is architecture-neutral, which means that we need to tell it
-# explicitly what architecture to check for. Fix this up for yours..
-SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
-
-
-
-### --- END CONFIGURATION SECTION ---
-
-# Those must not be GNU-specific; they are shared with perl/ which may
-# be built by a different compiler. (Note that this is an artifact now
-# but it still might be nice to keep that distinction.)
-BASIC_CFLAGS =
-BASIC_LDFLAGS =
-
-# Guard against environment variables
-BUILTIN_OBJS =
-BUILT_INS =
-COMPAT_CFLAGS =
-COMPAT_OBJS =
-LIB_H =
-LIB_OBJS =
-SCRIPT_PERL =
-SCRIPT_SH =
-TEST_PROGRAMS =
-
-#
-# No scripts right now:
-#
-
-# SCRIPT_SH += perf-am.sh
-
-#
-# No Perl scripts right now:
-#
-
-# SCRIPT_PERL += perf-add--interactive.perl
-
-SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
-	  $(patsubst %.perl,%,$(SCRIPT_PERL))
-
-# Empty...
-EXTRA_PROGRAMS =
-
-# ... and all the rest that could be moved out of bindir to perfexecdir
-PROGRAMS += $(EXTRA_PROGRAMS)
-
-#
-# Single 'perf' binary right now:
-#
-PROGRAMS += perf
-
-# List built-in command $C whose implementation cmd_$C() is not in
-# builtin-$C.o but is linked in as part of some other command.
-#
-# None right now:
-#
-# BUILT_INS += perf-init $X
-
-# what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
-
-# what 'all' will build but not install in perfexecdir
-OTHER_PROGRAMS = perf$X
-
-# Set paths to tools early so that they can be used for version tests.
-ifndef SHELL_PATH
-	SHELL_PATH = /bin/sh
-endif
-ifndef PERL_PATH
-	PERL_PATH = /usr/bin/perl
-endif
-
-export PERL_PATH
-
-LIB_FILE=libperf.a
-
-LIB_H += ../../include/linux/perf_counter.h
-LIB_H += perf.h
-LIB_H += util/list.h
-LIB_H += util/rbtree.h
-LIB_H += util/levenshtein.h
-LIB_H += util/parse-options.h
-LIB_H += util/parse-events.h
-LIB_H += util/quote.h
-LIB_H += util/util.h
-LIB_H += util/help.h
-LIB_H += util/strbuf.h
-LIB_H += util/string.h
-LIB_H += util/run-command.h
-LIB_H += util/sigchain.h
-LIB_H += util/symbol.h
-LIB_H += util/color.h
-
-LIB_OBJS += util/abspath.o
-LIB_OBJS += util/alias.o
-LIB_OBJS += util/config.o
-LIB_OBJS += util/ctype.o
-LIB_OBJS += util/environment.o
-LIB_OBJS += util/exec_cmd.o
-LIB_OBJS += util/help.o
-LIB_OBJS += util/levenshtein.o
-LIB_OBJS += util/parse-options.o
-LIB_OBJS += util/parse-events.o
-LIB_OBJS += util/path.o
-LIB_OBJS += util/rbtree.o
-LIB_OBJS += util/run-command.o
-LIB_OBJS += util/quote.o
-LIB_OBJS += util/strbuf.o
-LIB_OBJS += util/string.o
-LIB_OBJS += util/usage.o
-LIB_OBJS += util/wrapper.o
-LIB_OBJS += util/sigchain.o
-LIB_OBJS += util/symbol.o
-LIB_OBJS += util/color.o
-LIB_OBJS += util/pager.o
-
-BUILTIN_OBJS += builtin-annotate.o
-BUILTIN_OBJS += builtin-help.o
-BUILTIN_OBJS += builtin-list.o
-BUILTIN_OBJS += builtin-record.o
-BUILTIN_OBJS += builtin-report.o
-BUILTIN_OBJS += builtin-stat.o
-BUILTIN_OBJS += builtin-top.o
-
-PERFLIBS = $(LIB_FILE)
-EXTLIBS =
-
-#
-# Platform specific tweaks
-#
-
-# We choose to avoid "if .. else if .. else .. endif endif"
-# because maintaining the nesting to match is a pain.  If
-# we had "elif" things would have been much nicer...
-
--include config.mak.autogen
--include config.mak
-
-ifeq ($(uname_S),Darwin)
-	ifndef NO_FINK
-		ifeq ($(shell test -d /sw/lib && echo y),y)
-			BASIC_CFLAGS += -I/sw/include
-			BASIC_LDFLAGS += -L/sw/lib
-		endif
-	endif
-	ifndef NO_DARWIN_PORTS
-		ifeq ($(shell test -d /opt/local/lib && echo y),y)
-			BASIC_CFLAGS += -I/opt/local/include
-			BASIC_LDFLAGS += -L/opt/local/lib
-		endif
-	endif
-	PTHREAD_LIBS =
-endif
-
-ifndef CC_LD_DYNPATH
-	ifdef NO_R_TO_GCC_LINKER
-		# Some gcc does not accept and pass -R to the linker to specify
-		# the runtime dynamic library path.
-		CC_LD_DYNPATH = -Wl,-rpath,
-	else
-		CC_LD_DYNPATH = -R
-	endif
-endif
-
-ifdef ZLIB_PATH
-	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
-	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
-endif
-EXTLIBS += -lz
-
-ifdef NEEDS_SOCKET
-	EXTLIBS += -lsocket
-endif
-ifdef NEEDS_NSL
-	EXTLIBS += -lnsl
-endif
-ifdef NO_D_TYPE_IN_DIRENT
-	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
-endif
-ifdef NO_D_INO_IN_DIRENT
-	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
-endif
-ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
-	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
-endif
-ifdef USE_NSEC
-	BASIC_CFLAGS += -DUSE_NSEC
-endif
-ifdef USE_ST_TIMESPEC
-	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
-endif
-ifdef NO_NSEC
-	BASIC_CFLAGS += -DNO_NSEC
-endif
-ifdef NO_C99_FORMAT
-	BASIC_CFLAGS += -DNO_C99_FORMAT
-endif
-ifdef SNPRINTF_RETURNS_BOGUS
-	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
-	COMPAT_OBJS += compat/snprintf.o
-endif
-ifdef FREAD_READS_DIRECTORIES
-	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
-	COMPAT_OBJS += compat/fopen.o
-endif
-ifdef NO_SYMLINK_HEAD
-	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
-endif
-ifdef NO_STRCASESTR
-	COMPAT_CFLAGS += -DNO_STRCASESTR
-	COMPAT_OBJS += compat/strcasestr.o
-endif
-ifdef NO_STRTOUMAX
-	COMPAT_CFLAGS += -DNO_STRTOUMAX
-	COMPAT_OBJS += compat/strtoumax.o
-endif
-ifdef NO_STRTOULL
-	COMPAT_CFLAGS += -DNO_STRTOULL
-endif
-ifdef NO_SETENV
-	COMPAT_CFLAGS += -DNO_SETENV
-	COMPAT_OBJS += compat/setenv.o
-endif
-ifdef NO_MKDTEMP
-	COMPAT_CFLAGS += -DNO_MKDTEMP
-	COMPAT_OBJS += compat/mkdtemp.o
-endif
-ifdef NO_UNSETENV
-	COMPAT_CFLAGS += -DNO_UNSETENV
-	COMPAT_OBJS += compat/unsetenv.o
-endif
-ifdef NO_SYS_SELECT_H
-	BASIC_CFLAGS += -DNO_SYS_SELECT_H
-endif
-ifdef NO_MMAP
-	COMPAT_CFLAGS += -DNO_MMAP
-	COMPAT_OBJS += compat/mmap.o
-else
-	ifdef USE_WIN32_MMAP
-		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
-		COMPAT_OBJS += compat/win32mmap.o
-	endif
-endif
-ifdef NO_PREAD
-	COMPAT_CFLAGS += -DNO_PREAD
-	COMPAT_OBJS += compat/pread.o
-endif
-ifdef NO_FAST_WORKING_DIRECTORY
-	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
-endif
-ifdef NO_TRUSTABLE_FILEMODE
-	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
-endif
-ifdef NO_IPV6
-	BASIC_CFLAGS += -DNO_IPV6
-endif
-ifdef NO_UINTMAX_T
-	BASIC_CFLAGS += -Duintmax_t=uint32_t
-endif
-ifdef NO_SOCKADDR_STORAGE
-ifdef NO_IPV6
-	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
-else
-	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
-endif
-endif
-ifdef NO_INET_NTOP
-	LIB_OBJS += compat/inet_ntop.o
-endif
-ifdef NO_INET_PTON
-	LIB_OBJS += compat/inet_pton.o
-endif
-
-ifdef NO_ICONV
-	BASIC_CFLAGS += -DNO_ICONV
-endif
-
-ifdef OLD_ICONV
-	BASIC_CFLAGS += -DOLD_ICONV
-endif
-
-ifdef NO_DEFLATE_BOUND
-	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
-endif
-
-ifdef PPC_SHA1
-	SHA1_HEADER = "ppc/sha1.h"
-	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
-else
-ifdef ARM_SHA1
-	SHA1_HEADER = "arm/sha1.h"
-	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
-else
-ifdef MOZILLA_SHA1
-	SHA1_HEADER = "mozilla-sha1/sha1.h"
-	LIB_OBJS += mozilla-sha1/sha1.o
-else
-	SHA1_HEADER = <openssl/sha.h>
-	EXTLIBS += $(LIB_4_CRYPTO)
-endif
-endif
-endif
-ifdef NO_PERL_MAKEMAKER
-	export NO_PERL_MAKEMAKER
-endif
-ifdef NO_HSTRERROR
-	COMPAT_CFLAGS += -DNO_HSTRERROR
-	COMPAT_OBJS += compat/hstrerror.o
-endif
-ifdef NO_MEMMEM
-	COMPAT_CFLAGS += -DNO_MEMMEM
-	COMPAT_OBJS += compat/memmem.o
-endif
-ifdef INTERNAL_QSORT
-	COMPAT_CFLAGS += -DINTERNAL_QSORT
-	COMPAT_OBJS += compat/qsort.o
-endif
-ifdef RUNTIME_PREFIX
-	COMPAT_CFLAGS += -DRUNTIME_PREFIX
-endif
-
-ifdef DIR_HAS_BSD_GROUP_SEMANTICS
-	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
-endif
-ifdef NO_EXTERNAL_GREP
-	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
-endif
-
-ifeq ($(PERL_PATH),)
-NO_PERL=NoThanks
-endif
-
-QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
-QUIET_SUBDIR1  =
-
-ifneq ($(findstring $(MAKEFLAGS),w),w)
-PRINT_DIR = --no-print-directory
-else # "make -w"
-NO_SUBDIR = :
-endif
-
-ifneq ($(findstring $(MAKEFLAGS),s),s)
-ifndef V
-	QUIET_CC       = @echo '   ' CC $@;
-	QUIET_AR       = @echo '   ' AR $@;
-	QUIET_LINK     = @echo '   ' LINK $@;
-	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
-	QUIET_GEN      = @echo '   ' GEN $@;
-	QUIET_SUBDIR0  = +@subdir=
-	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
-			 $(MAKE) $(PRINT_DIR) -C $$subdir
-	export V
-	export QUIET_GEN
-	export QUIET_BUILT_IN
-endif
-endif
-
-ifdef ASCIIDOC8
-	export ASCIIDOC8
-endif
-
-# Shell quote (do not use $(call) to accommodate ancient setups);
-
-SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
-ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
-
-DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
-bindir_SQ = $(subst ','\'',$(bindir))
-bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
-mandir_SQ = $(subst ','\'',$(mandir))
-infodir_SQ = $(subst ','\'',$(infodir))
-perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
-template_dir_SQ = $(subst ','\'',$(template_dir))
-htmldir_SQ = $(subst ','\'',$(htmldir))
-prefix_SQ = $(subst ','\'',$(prefix))
-
-SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
-PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
-
-LIBS = $(PERFLIBS) $(EXTLIBS)
-
-BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
-	$(COMPAT_CFLAGS)
-LIB_OBJS += $(COMPAT_OBJS)
-
-ALL_CFLAGS += $(BASIC_CFLAGS)
-ALL_LDFLAGS += $(BASIC_LDFLAGS)
-
-export TAR INSTALL DESTDIR SHELL_PATH
-
-
-### Build rules
-
-SHELL = $(SHELL_PATH)
-
-all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
-ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
-endif
-
-all::
-
-please_set_SHELL_PATH_to_a_more_modern_shell:
-	@$$(:)
-
-shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
-
-strip: $(PROGRAMS) perf$X
-	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
-
-perf.o: perf.c common-cmds.h PERF-CFLAGS
-	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
-		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		$(ALL_CFLAGS) -c $(filter %.c,$^)
-
-perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
-		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
-
-builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
-		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-$(BUILT_INS): perf$X
-	$(QUIET_BUILT_IN)$(RM) $@ && \
-	ln perf$X $@ 2>/dev/null || \
-	ln -s perf$X $@ 2>/dev/null || \
-	cp perf$X $@
-
-common-cmds.h: util/generate-cmdlist.sh command-list.txt
-
-common-cmds.h: $(wildcard Documentation/perf-*.txt)
-	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
-
-$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
-	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
-	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
-	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
-	    $@.sh >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-
-configure: configure.ac
-	$(QUIET_GEN)$(RM) $@ $<+ && \
-	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
-	    $< > $<+ && \
-	autoconf -o $@ $<+ && \
-	$(RM) $<+
-
-# These can record PERF_VERSION
-perf.o perf.spec \
-	$(patsubst %.sh,%,$(SCRIPT_SH)) \
-	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	: PERF-VERSION-FILE
-
-%.o: %.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-%.s: %.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
-%.o: %.S
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-
-util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
-		'-DBINDIR="$(bindir_relative_SQ)"' \
-		'-DPREFIX="$(prefix_SQ)"' \
-		$<
-
-builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
-
-util/config.o: util/config.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-perf-%$X: %.o $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
-
-$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
-builtin-revert.o wt-status.o: wt-status.h
-
-$(LIB_FILE): $(LIB_OBJS)
-	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
-
-doc:
-	$(MAKE) -C Documentation all
-
-man:
-	$(MAKE) -C Documentation man
-
-html:
-	$(MAKE) -C Documentation html
-
-info:
-	$(MAKE) -C Documentation info
-
-pdf:
-	$(MAKE) -C Documentation pdf
-
-TAGS:
-	$(RM) TAGS
-	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
-
-tags:
-	$(RM) tags
-	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a
-
-cscope:
-	$(RM) cscope*
-	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b
-
-### Detect prefix changes
-TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
-             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
-
-PERF-CFLAGS: .FORCE-PERF-CFLAGS
-	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
-		echo 1>&2 "    * new build flags or prefix"; \
-		echo "$$FLAGS" >PERF-CFLAGS; \
-            fi
-
-# We need to apply sq twice, once to protect from the shell
-# that runs PERF-BUILD-OPTIONS, and then again to protect it
-# and the first level quoting from the shell that runs "echo".
-PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
-	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
-	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
-	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
-	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
-
-### Testing rules
-
-#
-# None right now:
-#
-# TEST_PROGRAMS += test-something$X
-
-all:: $(TEST_PROGRAMS)
-
-# GNU make supports exporting all variables by "export" without parameters.
-# However, the environment gets quite big, and some programs have problems
-# with that.
-
-export NO_SVN_TESTS
-
-check: common-cmds.h
-	if sparse; \
-	then \
-		for i in *.c */*.c; \
-		do \
-			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
-		done; \
-	else \
-		echo 2>&1 "Did you mean 'make test'?"; \
-		exit 1; \
-	fi
-
-remove-dashes:
-	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
-
-### Installation rules
-
-ifneq ($(filter /%,$(firstword $(template_dir))),)
-template_instdir = $(template_dir)
-else
-template_instdir = $(prefix)/$(template_dir)
-endif
-export template_instdir
-
-ifneq ($(filter /%,$(firstword $(perfexecdir))),)
-perfexec_instdir = $(perfexecdir)
-else
-perfexec_instdir = $(prefix)/$(perfexecdir)
-endif
-perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
-export perfexec_instdir
-
-install: all
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
-ifdef BUILT_INS
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
-endif
-endif
-
-install-doc:
-	$(MAKE) -C Documentation install
-
-install-man:
-	$(MAKE) -C Documentation install-man
-
-install-html:
-	$(MAKE) -C Documentation install-html
-
-install-info:
-	$(MAKE) -C Documentation install-info
-
-install-pdf:
-	$(MAKE) -C Documentation install-pdf
-
-quick-install-doc:
-	$(MAKE) -C Documentation quick-install
-
-quick-install-man:
-	$(MAKE) -C Documentation quick-install-man
-
-quick-install-html:
-	$(MAKE) -C Documentation quick-install-html
-
-
-### Maintainer's dist rules
-#
-# None right now
-#
-#
-# perf.spec: perf.spec.in
-#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
-#	mv $@+ $@
-#
-# PERF_TARNAME=perf-$(PERF_VERSION)
-# dist: perf.spec perf-archive$(X) configure
-#	./perf-archive --format=tar \
-#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
-#	@mkdir -p $(PERF_TARNAME)
-#	@cp perf.spec configure $(PERF_TARNAME)
-#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-#	$(TAR) rf $(PERF_TARNAME).tar \
-#		$(PERF_TARNAME)/perf.spec \
-#		$(PERF_TARNAME)/configure \
-#		$(PERF_TARNAME)/version
-#	@$(RM) -r $(PERF_TARNAME)
-#	gzip -f -9 $(PERF_TARNAME).tar
-#
-# htmldocs = perf-htmldocs-$(PERF_VERSION)
-# manpages = perf-manpages-$(PERF_VERSION)
-# dist-doc:
-#	$(RM) -r .doc-tmp-dir
-#	mkdir .doc-tmp-dir
-#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-#	gzip -n -9 -f $(htmldocs).tar
-#	:
-#	$(RM) -r .doc-tmp-dir
-#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-#	$(MAKE) -C Documentation DESTDIR=./ \
-#		man1dir=../.doc-tmp-dir/man1 \
-#		man5dir=../.doc-tmp-dir/man5 \
-#		man7dir=../.doc-tmp-dir/man7 \
-#		install
-#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-#	gzip -n -9 -f $(manpages).tar
-#	$(RM) -r .doc-tmp-dir
-#
-# rpm: dist
-#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
-
-### Cleaning rules
-
-distclean: clean
-#	$(RM) configure
-
-clean:
-	$(RM) *.o */*.o $(LIB_FILE)
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
-	$(RM) $(TEST_PROGRAMS)
-	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
-	$(RM) -r autom4te.cache
-	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
-	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
-	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
-	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
-	$(MAKE) -C Documentation/ clean
-	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
-
-.PHONY: all install clean strip
-.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-.PHONY: .FORCE-PERF-BUILD-OPTIONS
-
-### Make sure built-ins do not have dups and listed in perf.c
-#
-check-builtins::
-	./check-builtins.sh
-
-### Test suite coverage testing
-#
-# None right now
-#
-# .PHONY: coverage coverage-clean coverage-build coverage-report
-#
-# coverage:
-#	$(MAKE) coverage-build
-#	$(MAKE) coverage-report
-#
-# coverage-clean:
-#	rm -f *.gcda *.gcno
-#
-# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
-#
-# coverage-build: coverage-clean
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-#		-j1 test
-#
-# coverage-report:
-#	gcov -b *.c */*.c
-#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-#		| tee coverage-untested-functions
diff --git a/Documentation/perf_counter/builtin-annotate.c b/Documentation/perf_counter/builtin-annotate.c
deleted file mode 100644
index 116a397..0000000
--- a/Documentation/perf_counter/builtin-annotate.c
+++ /dev/null
@@ -1,1355 +0,0 @@
-/*
- * builtin-annotate.c
- *
- * Builtin annotate command: Analyze the perf.data input file,
- * look up and read DSOs and symbol information and display
- * a histogram of results, along various sorting keys.
- */
-#include "builtin.h"
-
-#include "util/util.h"
-
-#include "util/color.h"
-#include "util/list.h"
-#include "util/cache.h"
-#include "util/rbtree.h"
-#include "util/symbol.h"
-#include "util/string.h"
-
-#include "perf.h"
-
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-
-#define SHOW_KERNEL	1
-#define SHOW_USER	2
-#define SHOW_HV		4
-
-static char		const *input_name = "perf.data";
-static char		*vmlinux = NULL;
-
-static char		default_sort_order[] = "comm,symbol";
-static char		*sort_order = default_sort_order;
-
-static int		input;
-static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
-
-static int		dump_trace = 0;
-#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
-
-static int		verbose;
-
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
-
-struct ip_event {
-	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
-};
-
-struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
-};
-
-struct comm_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	char comm[16];
-};
-
-struct fork_event {
-	struct perf_event_header header;
-	__u32 pid, ppid;
-};
-
-struct period_event {
-	struct perf_event_header header;
-	__u64 time;
-	__u64 id;
-	__u64 sample_period;
-};
-
-typedef union event_union {
-	struct perf_event_header	header;
-	struct ip_event			ip;
-	struct mmap_event		mmap;
-	struct comm_event		comm;
-	struct fork_event		fork;
-	struct period_event		period;
-} event_t;
-
-static LIST_HEAD(dsos);
-static struct dso *kernel_dso;
-static struct dso *vdso;
-
-
-static void dsos__add(struct dso *dso)
-{
-	list_add_tail(&dso->node, &dsos);
-}
-
-static struct dso *dsos__find(const char *name)
-{
-	struct dso *pos;
-
-	list_for_each_entry(pos, &dsos, node)
-		if (strcmp(pos->name, name) == 0)
-			return pos;
-	return NULL;
-}
-
-static struct dso *dsos__findnew(const char *name)
-{
-	struct dso *dso = dsos__find(name);
-	int nr;
-
-	if (dso)
-		return dso;
-
-	dso = dso__new(name, 0);
-	if (!dso)
-		goto out_delete_dso;
-
-	nr = dso__load(dso, NULL, verbose);
-	if (nr < 0) {
-		if (verbose)
-			fprintf(stderr, "Failed to open: %s\n", name);
-		goto out_delete_dso;
-	}
-	if (!nr && verbose) {
-		fprintf(stderr,
-		"No symbols found in: %s, maybe install a debug package?\n",
-				name);
-	}
-
-	dsos__add(dso);
-
-	return dso;
-
-out_delete_dso:
-	dso__delete(dso);
-	return NULL;
-}
-
-static void dsos__fprintf(FILE *fp)
-{
-	struct dso *pos;
-
-	list_for_each_entry(pos, &dsos, node)
-		dso__fprintf(pos, fp);
-}
-
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
-{
-	return dso__find_symbol(kernel_dso, ip);
-}
-
-static int load_kernel(void)
-{
-	int err;
-
-	kernel_dso = dso__new("[kernel]", 0);
-	if (!kernel_dso)
-		return -1;
-
-	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
-	if (err) {
-		dso__delete(kernel_dso);
-		kernel_dso = NULL;
-	} else
-		dsos__add(kernel_dso);
-
-	vdso = dso__new("[vdso]", 0);
-	if (!vdso)
-		return -1;
-
-	vdso->find_symbol = vdso__find_symbol;
-
-	dsos__add(vdso);
-
-	return err;
-}
-
-struct map {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
-	struct dso	 *dso;
-};
-
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
-{
-	return ip - map->start + map->pgoff;
-}
-
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
-{
-	return ip;
-}
-
-static struct map *map__new(struct mmap_event *event)
-{
-	struct map *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		const char *filename = event->filename;
-
-		self->start = event->start;
-		self->end   = event->start + event->len;
-		self->pgoff = event->pgoff;
-
-		self->dso = dsos__findnew(filename);
-		if (self->dso == NULL)
-			goto out_delete;
-
-		if (self->dso == vdso)
-			self->map_ip = vdso__map_ip;
-		else
-			self->map_ip = map__map_ip;
-	}
-	return self;
-out_delete:
-	free(self);
-	return NULL;
-}
-
-static struct map *map__clone(struct map *self)
-{
-	struct map *map = malloc(sizeof(*self));
-
-	if (!map)
-		return NULL;
-
-	memcpy(map, self, sizeof(*self));
-
-	return map;
-}
-
-static int map__overlap(struct map *l, struct map *r)
-{
-	if (l->start > r->start) {
-		struct map *t = l;
-		l = r;
-		r = t;
-	}
-
-	if (l->end > r->start)
-		return 1;
-
-	return 0;
-}
-
-static size_t map__fprintf(struct map *self, FILE *fp)
-{
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
-		       self->start, self->end, self->pgoff, self->dso->name);
-}
-
-
-struct thread {
-	struct rb_node	 rb_node;
-	struct list_head maps;
-	pid_t		 pid;
-	char		 *comm;
-};
-
-static struct thread *thread__new(pid_t pid)
-{
-	struct thread *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->pid = pid;
-		self->comm = malloc(32);
-		if (self->comm)
-			snprintf(self->comm, 32, ":%d", self->pid);
-		INIT_LIST_HEAD(&self->maps);
-	}
-
-	return self;
-}
-
-static int thread__set_comm(struct thread *self, const char *comm)
-{
-	if (self->comm)
-		free(self->comm);
-	self->comm = strdup(comm);
-	return self->comm ? 0 : -ENOMEM;
-}
-
-static size_t thread__fprintf(struct thread *self, FILE *fp)
-{
-	struct map *pos;
-	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
-
-	list_for_each_entry(pos, &self->maps, node)
-		ret += map__fprintf(pos, fp);
-
-	return ret;
-}
-
-
-static struct rb_root threads;
-static struct thread *last_match;
-
-static struct thread *threads__findnew(pid_t pid)
-{
-	struct rb_node **p = &threads.rb_node;
-	struct rb_node *parent = NULL;
-	struct thread *th;
-
-	/*
-	 * Font-end cache - PID lookups come in blocks,
-	 * so most of the time we dont have to look up
-	 * the full rbtree:
-	 */
-	if (last_match && last_match->pid == pid)
-		return last_match;
-
-	while (*p != NULL) {
-		parent = *p;
-		th = rb_entry(parent, struct thread, rb_node);
-
-		if (th->pid == pid) {
-			last_match = th;
-			return th;
-		}
-
-		if (pid < th->pid)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	th = thread__new(pid);
-	if (th != NULL) {
-		rb_link_node(&th->rb_node, parent, p);
-		rb_insert_color(&th->rb_node, &threads);
-		last_match = th;
-	}
-
-	return th;
-}
-
-static void thread__insert_map(struct thread *self, struct map *map)
-{
-	struct map *pos, *tmp;
-
-	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
-		if (map__overlap(pos, map)) {
-			list_del_init(&pos->node);
-			/* XXX leaks dsos */
-			free(pos);
-		}
-	}
-
-	list_add_tail(&map->node, &self->maps);
-}
-
-static int thread__fork(struct thread *self, struct thread *parent)
-{
-	struct map *map;
-
-	if (self->comm)
-		free(self->comm);
-	self->comm = strdup(parent->comm);
-	if (!self->comm)
-		return -ENOMEM;
-
-	list_for_each_entry(map, &parent->maps, node) {
-		struct map *new = map__clone(map);
-		if (!new)
-			return -ENOMEM;
-		thread__insert_map(self, new);
-	}
-
-	return 0;
-}
-
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
-{
-	struct map *pos;
-
-	if (self == NULL)
-		return NULL;
-
-	list_for_each_entry(pos, &self->maps, node)
-		if (ip >= pos->start && ip <= pos->end)
-			return pos;
-
-	return NULL;
-}
-
-static size_t threads__fprintf(FILE *fp)
-{
-	size_t ret = 0;
-	struct rb_node *nd;
-
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *pos = rb_entry(nd, struct thread, rb_node);
-
-		ret += thread__fprintf(pos, fp);
-	}
-
-	return ret;
-}
-
-/*
- * histogram, sorted on item, collects counts
- */
-
-static struct rb_root hist;
-
-struct hist_entry {
-	struct rb_node	 rb_node;
-
-	struct thread	 *thread;
-	struct map	 *map;
-	struct dso	 *dso;
-	struct symbol	 *sym;
-	uint64_t	 ip;
-	char		 level;
-
-	uint32_t	 count;
-};
-
-/*
- * configurable sorting bits
- */
-
-struct sort_entry {
-	struct list_head list;
-
-	char *header;
-
-	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-	size_t	(*print)(FILE *fp, struct hist_entry *);
-};
-
-/* --sort pid */
-
-static int64_t
-sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	return right->thread->pid - left->thread->pid;
-}
-
-static size_t
-sort__thread_print(FILE *fp, struct hist_entry *self)
-{
-	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
-}
-
-static struct sort_entry sort_thread = {
-	.header = "         Command:  Pid",
-	.cmp	= sort__thread_cmp,
-	.print	= sort__thread_print,
-};
-
-/* --sort comm */
-
-static int64_t
-sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	return right->thread->pid - left->thread->pid;
-}
-
-static int64_t
-sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
-{
-	char *comm_l = left->thread->comm;
-	char *comm_r = right->thread->comm;
-
-	if (!comm_l || !comm_r) {
-		if (!comm_l && !comm_r)
-			return 0;
-		else if (!comm_l)
-			return -1;
-		else
-			return 1;
-	}
-
-	return strcmp(comm_l, comm_r);
-}
-
-static size_t
-sort__comm_print(FILE *fp, struct hist_entry *self)
-{
-	return fprintf(fp, "%16s", self->thread->comm);
-}
-
-static struct sort_entry sort_comm = {
-	.header		= "         Command",
-	.cmp		= sort__comm_cmp,
-	.collapse	= sort__comm_collapse,
-	.print		= sort__comm_print,
-};
-
-/* --sort dso */
-
-static int64_t
-sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	struct dso *dso_l = left->dso;
-	struct dso *dso_r = right->dso;
-
-	if (!dso_l || !dso_r) {
-		if (!dso_l && !dso_r)
-			return 0;
-		else if (!dso_l)
-			return -1;
-		else
-			return 1;
-	}
-
-	return strcmp(dso_l->name, dso_r->name);
-}
-
-static size_t
-sort__dso_print(FILE *fp, struct hist_entry *self)
-{
-	if (self->dso)
-		return fprintf(fp, "%-25s", self->dso->name);
-
-	return fprintf(fp, "%016llx         ", (__u64)self->ip);
-}
-
-static struct sort_entry sort_dso = {
-	.header = "Shared Object            ",
-	.cmp	= sort__dso_cmp,
-	.print	= sort__dso_print,
-};
-
-/* --sort symbol */
-
-static int64_t
-sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	uint64_t ip_l, ip_r;
-
-	if (left->sym == right->sym)
-		return 0;
-
-	ip_l = left->sym ? left->sym->start : left->ip;
-	ip_r = right->sym ? right->sym->start : right->ip;
-
-	return (int64_t)(ip_r - ip_l);
-}
-
-static size_t
-sort__sym_print(FILE *fp, struct hist_entry *self)
-{
-	size_t ret = 0;
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
-
-	if (self->sym) {
-		ret += fprintf(fp, "[%c] %s",
-			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
-	} else {
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
-	}
-
-	return ret;
-}
-
-static struct sort_entry sort_sym = {
-	.header = "Symbol",
-	.cmp	= sort__sym_cmp,
-	.print	= sort__sym_print,
-};
-
-static int sort__need_collapse = 0;
-
-struct sort_dimension {
-	char			*name;
-	struct sort_entry	*entry;
-	int			taken;
-};
-
-static struct sort_dimension sort_dimensions[] = {
-	{ .name = "pid",	.entry = &sort_thread,	},
-	{ .name = "comm",	.entry = &sort_comm,	},
-	{ .name = "dso",	.entry = &sort_dso,	},
-	{ .name = "symbol",	.entry = &sort_sym,	},
-};
-
-static LIST_HEAD(hist_entry__sort_list);
-
-static int sort_dimension__add(char *tok)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
-		struct sort_dimension *sd = &sort_dimensions[i];
-
-		if (sd->taken)
-			continue;
-
-		if (strncasecmp(tok, sd->name, strlen(tok)))
-			continue;
-
-		if (sd->entry->collapse)
-			sort__need_collapse = 1;
-
-		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
-		sd->taken = 1;
-
-		return 0;
-	}
-
-	return -ESRCH;
-}
-
-static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	struct sort_entry *se;
-	int64_t cmp = 0;
-
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		cmp = se->cmp(left, right);
-		if (cmp)
-			break;
-	}
-
-	return cmp;
-}
-
-static int64_t
-hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
-{
-	struct sort_entry *se;
-	int64_t cmp = 0;
-
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		int64_t (*f)(struct hist_entry *, struct hist_entry *);
-
-		f = se->collapse ?: se->cmp;
-
-		cmp = f(left, right);
-		if (cmp)
-			break;
-	}
-
-	return cmp;
-}
-
-/*
- * collect histogram counts
- */
-static void hist_hit(struct hist_entry *he, uint64_t ip)
-{
-	unsigned int sym_size, offset;
-	struct symbol *sym = he->sym;
-
-	he->count++;
-
-	if (!sym || !sym->hist)
-		return;
-
-	sym_size = sym->end - sym->start;
-	offset = ip - sym->start;
-
-	if (offset >= sym_size)
-		return;
-
-	sym->hist_sum++;
-	sym->hist[offset]++;
-
-	if (verbose >= 3)
-		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
-			(void *)he->sym->start,
-			he->sym->name,
-			(void *)ip, ip - he->sym->start,
-			sym->hist[offset]);
-}
-
-static int
-hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
-{
-	struct rb_node **p = &hist.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *he;
-	struct hist_entry entry = {
-		.thread	= thread,
-		.map	= map,
-		.dso	= dso,
-		.sym	= sym,
-		.ip	= ip,
-		.level	= level,
-		.count	= 1,
-	};
-	int cmp;
-
-	while (*p != NULL) {
-		parent = *p;
-		he = rb_entry(parent, struct hist_entry, rb_node);
-
-		cmp = hist_entry__cmp(&entry, he);
-
-		if (!cmp) {
-			hist_hit(he, ip);
-
-			return 0;
-		}
-
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	he = malloc(sizeof(*he));
-	if (!he)
-		return -ENOMEM;
-	*he = entry;
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &hist);
-
-	return 0;
-}
-
-static void hist_entry__free(struct hist_entry *he)
-{
-	free(he);
-}
-
-/*
- * collapse the histogram
- */
-
-static struct rb_root collapse_hists;
-
-static void collapse__insert_entry(struct hist_entry *he)
-{
-	struct rb_node **p = &collapse_hists.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *iter;
-	int64_t cmp;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hist_entry, rb_node);
-
-		cmp = hist_entry__collapse(iter, he);
-
-		if (!cmp) {
-			iter->count += he->count;
-			hist_entry__free(he);
-			return;
-		}
-
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &collapse_hists);
-}
-
-static void collapse__resort(void)
-{
-	struct rb_node *next;
-	struct hist_entry *n;
-
-	if (!sort__need_collapse)
-		return;
-
-	next = rb_first(&hist);
-	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
-
-		rb_erase(&n->rb_node, &hist);
-		collapse__insert_entry(n);
-	}
-}
-
-/*
- * reverse the map, sort on count.
- */
-
-static struct rb_root output_hists;
-
-static void output__insert_entry(struct hist_entry *he)
-{
-	struct rb_node **p = &output_hists.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *iter;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hist_entry, rb_node);
-
-		if (he->count > iter->count)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &output_hists);
-}
-
-static void output__resort(void)
-{
-	struct rb_node *next;
-	struct hist_entry *n;
-	struct rb_root *tree = &hist;
-
-	if (sort__need_collapse)
-		tree = &collapse_hists;
-
-	next = rb_first(tree);
-
-	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
-
-		rb_erase(&n->rb_node, tree);
-		output__insert_entry(n);
-	}
-}
-
-static void register_idle_thread(void)
-{
-	struct thread *thread = threads__findnew(0);
-
-	if (thread == NULL ||
-			thread__set_comm(thread, "[idle]")) {
-		fprintf(stderr, "problem inserting idle task.\n");
-		exit(-1);
-	}
-}
-
-static unsigned long total = 0,
-		     total_mmap = 0,
-		     total_comm = 0,
-		     total_fork = 0,
-		     total_unknown = 0;
-
-static int
-process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	char level;
-	int show = 0;
-	struct dso *dso = NULL;
-	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
-	struct map *map = NULL;
-
-	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->header.misc,
-		event->ip.pid,
-		(void *)(long)ip);
-
-	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-	if (thread == NULL) {
-		fprintf(stderr, "problem processing %d event, skipping it.\n",
-			event->header.type);
-		return -1;
-	}
-
-	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-		show = SHOW_KERNEL;
-		level = 'k';
-
-		dso = kernel_dso;
-
-		dprintf(" ...... dso: %s\n", dso->name);
-
-	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-
-		show = SHOW_USER;
-		level = '.';
-
-		map = thread__find_map(thread, ip);
-		if (map != NULL) {
-			ip = map->map_ip(map, ip);
-			dso = map->dso;
-		} else {
-			/*
-			 * If this is outside of all known maps,
-			 * and is a negative address, try to look it
-			 * up in the kernel dso, as it might be a
-			 * vsyscall (which executes in user-mode):
-			 */
-			if ((long long)ip < 0)
-				dso = kernel_dso;
-		}
-		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
-
-	} else {
-		show = SHOW_HV;
-		level = 'H';
-		dprintf(" ...... dso: [hypervisor]\n");
-	}
-
-	if (show & show_mask) {
-		struct symbol *sym = NULL;
-
-		if (dso)
-			sym = dso->find_symbol(dso, ip);
-
-		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
-			fprintf(stderr,
-		"problem incrementing symbol count, skipping event\n");
-			return -1;
-		}
-	}
-	total++;
-
-	return 0;
-}
-
-static int
-process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->mmap.pid);
-	struct map *map = map__new(&event->mmap);
-
-	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->mmap.pid,
-		(void *)(long)event->mmap.start,
-		(void *)(long)event->mmap.len,
-		(void *)(long)event->mmap.pgoff,
-		event->mmap.filename);
-
-	if (thread == NULL || map == NULL) {
-		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
-		return 0;
-	}
-
-	thread__insert_map(thread, map);
-	total_mmap++;
-
-	return 0;
-}
-
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->comm.pid);
-
-	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->comm.comm, event->comm.pid);
-
-	if (thread == NULL ||
-	    thread__set_comm(thread, event->comm.comm)) {
-		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
-		return -1;
-	}
-	total_comm++;
-
-	return 0;
-}
-
-static int
-process_fork_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->fork.pid);
-	struct thread *parent = threads__findnew(event->fork.ppid);
-
-	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->fork.pid, event->fork.ppid);
-
-	if (!thread || !parent || thread__fork(thread, parent)) {
-		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
-		return -1;
-	}
-	total_fork++;
-
-	return 0;
-}
-
-static int
-process_period_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->period.time,
-		event->period.id,
-		event->period.sample_period);
-
-	return 0;
-}
-
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
-		return process_overflow_event(event, offset, head);
-
-	switch (event->header.type) {
-	case PERF_EVENT_MMAP:
-		return process_mmap_event(event, offset, head);
-
-	case PERF_EVENT_COMM:
-		return process_comm_event(event, offset, head);
-
-	case PERF_EVENT_FORK:
-		return process_fork_event(event, offset, head);
-
-	case PERF_EVENT_PERIOD:
-		return process_period_event(event, offset, head);
-	/*
-	 * We dont process them right now but they are fine:
-	 */
-
-	case PERF_EVENT_THROTTLE:
-	case PERF_EVENT_UNTHROTTLE:
-		return 0;
-
-	default:
-		return -1;
-	}
-
-	return 0;
-}
-
-static int
-parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
-{
-	char *line = NULL, *tmp, *tmp2;
-	unsigned int offset;
-	size_t line_len;
-	__u64 line_ip;
-	int ret;
-	char *c;
-
-	if (getline(&line, &line_len, file) < 0)
-		return -1;
-	if (!line)
-		return -1;
-
-	c = strchr(line, '\n');
-	if (c)
-		*c = 0;
-
-	line_ip = -1;
-	offset = 0;
-	ret = -2;
-
-	/*
-	 * Strip leading spaces:
-	 */
-	tmp = line;
-	while (*tmp) {
-		if (*tmp != ' ')
-			break;
-		tmp++;
-	}
-
-	if (*tmp) {
-		/*
-		 * Parse hexa addresses followed by ':'
-		 */
-		line_ip = strtoull(tmp, &tmp2, 16);
-		if (*tmp2 != ':')
-			line_ip = -1;
-	}
-
-	if (line_ip != -1) {
-		unsigned int hits = 0;
-		double percent = 0.0;
-		char *color = PERF_COLOR_NORMAL;
-
-		offset = line_ip - start;
-		if (offset < len)
-			hits = sym->hist[offset];
-
-		if (sym->hist_sum)
-			percent = 100.0 * hits / sym->hist_sum;
-
-		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
-		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		else {
-			if (percent > 0.5)
-				color = PERF_COLOR_GREEN;
-		}
-
-		color_fprintf(stdout, color, " %7.2f", percent);
-		printf(" :	");
-		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
-	} else {
-		if (!*line)
-			printf("         :\n");
-		else
-			printf("         :	%s\n", line);
-	}
-
-	return 0;
-}
-
-static void annotate_sym(struct dso *dso, struct symbol *sym)
-{
-	char *filename = dso->name;
-	uint64_t start, end, len;
-	char command[PATH_MAX*2];
-	FILE *file;
-
-	if (!filename)
-		return;
-	if (dso == kernel_dso)
-		filename = vmlinux;
-
-	printf("\n------------------------------------------------\n");
-	printf(" Percent |	Source code & Disassembly of %s\n", filename);
-	printf("------------------------------------------------\n");
-
-	if (verbose >= 2)
-		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
-
-	start = sym->obj_start;
-	if (!start)
-		start = sym->start;
-
-	end = start + sym->end - sym->start + 1;
-	len = sym->end - sym->start;
-
-	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
-
-	if (verbose >= 3)
-		printf("doing: %s\n", command);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	while (!feof(file)) {
-		if (parse_line(file, sym, start, len) < 0)
-			break;
-	}
-
-	pclose(file);
-}
-
-static void find_annotations(void)
-{
-	struct rb_node *nd;
-	struct dso *dso;
-	int count = 0;
-
-	list_for_each_entry(dso, &dsos, node) {
-
-		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
-			struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
-
-			if (sym->hist) {
-				annotate_sym(dso, sym);
-				count++;
-			}
-		}
-	}
-
-	if (!count)
-		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
-}
-
-static int __cmd_annotate(void)
-{
-	int ret, rc = EXIT_FAILURE;
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	event_t *event;
-	uint32_t size;
-	char *buf;
-
-	register_idle_thread();
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	if (load_kernel() < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
-	}
-
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	size = event->header.size;
-	if (!size)
-		size = 8;
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-	size = event->header.size;
-
-	dprintf("%p [%p]: event: %d\n",
-			(void *)(offset + head),
-			(void *)(long)event->header.size,
-			event->header.type);
-
-	if (!size || process_event(event, offset, head) < 0) {
-
-		dprintf("%p [%p]: skipping unknown header type: %d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.type);
-
-		total_unknown++;
-
-		/*
-		 * assume we lost track of the stream, check alignment, and
-		 * increment a single u64 in the hope to catch on again 'soon'.
-		 */
-
-		if (unlikely(head & 7))
-			head &= ~7ULL;
-
-		size = 8;
-	}
-
-	head += size;
-
-	if (offset + head < stat.st_size)
-		goto more;
-
-	rc = EXIT_SUCCESS;
-	close(input);
-
-	dprintf("      IP events: %10ld\n", total);
-	dprintf("    mmap events: %10ld\n", total_mmap);
-	dprintf("    comm events: %10ld\n", total_comm);
-	dprintf("    fork events: %10ld\n", total_fork);
-	dprintf(" unknown events: %10ld\n", total_unknown);
-
-	if (dump_trace)
-		return 0;
-
-	if (verbose >= 3)
-		threads__fprintf(stdout);
-
-	if (verbose >= 2)
-		dsos__fprintf(stdout);
-
-	collapse__resort();
-	output__resort();
-
-	find_annotations();
-
-	return rc;
-}
-
-static const char * const annotate_usage[] = {
-	"perf annotate [<options>] <command>",
-	NULL
-};
-
-static const struct option options[] = {
-	OPT_STRING('i', "input", &input_name, "file",
-		    "input file name"),
-	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
-		    "symbol to annotate"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
-		    "be more verbose (show symbol address, etc)"),
-	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
-		    "dump raw trace in ASCII"),
-	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_END()
-};
-
-static void setup_sorting(void)
-{
-	char *tmp, *tok, *str = strdup(sort_order);
-
-	for (tok = strtok_r(str, ", ", &tmp);
-			tok; tok = strtok_r(NULL, ", ", &tmp)) {
-		if (sort_dimension__add(tok) < 0) {
-			error("Unknown --sort key: `%s'", tok);
-			usage_with_options(annotate_usage, options);
-		}
-	}
-
-	free(str);
-}
-
-int cmd_annotate(int argc, const char **argv, const char *prefix)
-{
-	symbol__init();
-
-	page_size = getpagesize();
-
-	argc = parse_options(argc, argv, options, annotate_usage, 0);
-
-	setup_sorting();
-
-	if (argc) {
-		/*
-		 * Special case: if there's an argument left then assume tha
-		 * it's a symbol filter:
-		 */
-		if (argc > 1)
-			usage_with_options(annotate_usage, options);
-
-		sym_hist_filter = argv[0];
-	}
-
-	if (!sym_hist_filter)
-		usage_with_options(annotate_usage, options);
-
-	setup_pager();
-
-	return __cmd_annotate();
-}
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
deleted file mode 100644
index 0f32dc3..0000000
--- a/Documentation/perf_counter/builtin-help.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * builtin-help.c
- *
- * Builtin help command
- */
-#include "util/cache.h"
-#include "builtin.h"
-#include "util/exec_cmd.h"
-#include "common-cmds.h"
-#include "util/parse-options.h"
-#include "util/run-command.h"
-#include "util/help.h"
-
-static struct man_viewer_list {
-	struct man_viewer_list *next;
-	char name[FLEX_ARRAY];
-} *man_viewer_list;
-
-static struct man_viewer_info_list {
-	struct man_viewer_info_list *next;
-	const char *info;
-	char name[FLEX_ARRAY];
-} *man_viewer_info_list;
-
-enum help_format {
-	HELP_FORMAT_MAN,
-	HELP_FORMAT_INFO,
-	HELP_FORMAT_WEB,
-};
-
-static int show_all = 0;
-static enum help_format help_format = HELP_FORMAT_MAN;
-static struct option builtin_help_options[] = {
-	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
-	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
-	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
-			HELP_FORMAT_WEB),
-	OPT_SET_INT('i', "info", &help_format, "show info page",
-			HELP_FORMAT_INFO),
-	OPT_END(),
-};
-
-static const char * const builtin_help_usage[] = {
-	"perf help [--all] [--man|--web|--info] [command]",
-	NULL
-};
-
-static enum help_format parse_help_format(const char *format)
-{
-	if (!strcmp(format, "man"))
-		return HELP_FORMAT_MAN;
-	if (!strcmp(format, "info"))
-		return HELP_FORMAT_INFO;
-	if (!strcmp(format, "web") || !strcmp(format, "html"))
-		return HELP_FORMAT_WEB;
-	die("unrecognized help format '%s'", format);
-}
-
-static const char *get_man_viewer_info(const char *name)
-{
-	struct man_viewer_info_list *viewer;
-
-	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
-	{
-		if (!strcasecmp(name, viewer->name))
-			return viewer->info;
-	}
-	return NULL;
-}
-
-static int check_emacsclient_version(void)
-{
-	struct strbuf buffer = STRBUF_INIT;
-	struct child_process ec_process;
-	const char *argv_ec[] = { "emacsclient", "--version", NULL };
-	int version;
-
-	/* emacsclient prints its version number on stderr */
-	memset(&ec_process, 0, sizeof(ec_process));
-	ec_process.argv = argv_ec;
-	ec_process.err = -1;
-	ec_process.stdout_to_stderr = 1;
-	if (start_command(&ec_process)) {
-		fprintf(stderr, "Failed to start emacsclient.\n");
-		return -1;
-	}
-	strbuf_read(&buffer, ec_process.err, 20);
-	close(ec_process.err);
-
-	/*
-	 * Don't bother checking return value, because "emacsclient --version"
-	 * seems to always exits with code 1.
-	 */
-	finish_command(&ec_process);
-
-	if (prefixcmp(buffer.buf, "emacsclient")) {
-		fprintf(stderr, "Failed to parse emacsclient version.\n");
-		strbuf_release(&buffer);
-		return -1;
-	}
-
-	strbuf_remove(&buffer, 0, strlen("emacsclient"));
-	version = atoi(buffer.buf);
-
-	if (version < 22) {
-		fprintf(stderr,
-			"emacsclient version '%d' too old (< 22).\n",
-			version);
-		strbuf_release(&buffer);
-		return -1;
-	}
-
-	strbuf_release(&buffer);
-	return 0;
-}
-
-static void exec_woman_emacs(const char* path, const char *page)
-{
-	if (!check_emacsclient_version()) {
-		/* This works only with emacsclient version >= 22. */
-		struct strbuf man_page = STRBUF_INIT;
-
-		if (!path)
-			path = "emacsclient";
-		strbuf_addf(&man_page, "(woman \"%s\")", page);
-		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
-		warning("failed to exec '%s': %s", path, strerror(errno));
-	}
-}
-
-static void exec_man_konqueror(const char* path, const char *page)
-{
-	const char *display = getenv("DISPLAY");
-	if (display && *display) {
-		struct strbuf man_page = STRBUF_INIT;
-		const char *filename = "kfmclient";
-
-		/* It's simpler to launch konqueror using kfmclient. */
-		if (path) {
-			const char *file = strrchr(path, '/');
-			if (file && !strcmp(file + 1, "konqueror")) {
-				char *new = strdup(path);
-				char *dest = strrchr(new, '/');
-
-				/* strlen("konqueror") == strlen("kfmclient") */
-				strcpy(dest + 1, "kfmclient");
-				path = new;
-			}
-			if (file)
-				filename = file;
-		} else
-			path = "kfmclient";
-		strbuf_addf(&man_page, "man:%s(1)", page);
-		execlp(path, filename, "newTab", man_page.buf, NULL);
-		warning("failed to exec '%s': %s", path, strerror(errno));
-	}
-}
-
-static void exec_man_man(const char* path, const char *page)
-{
-	if (!path)
-		path = "man";
-	execlp(path, "man", page, NULL);
-	warning("failed to exec '%s': %s", path, strerror(errno));
-}
-
-static void exec_man_cmd(const char *cmd, const char *page)
-{
-	struct strbuf shell_cmd = STRBUF_INIT;
-	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
-	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
-	warning("failed to exec '%s': %s", cmd, strerror(errno));
-}
-
-static void add_man_viewer(const char *name)
-{
-	struct man_viewer_list **p = &man_viewer_list;
-	size_t len = strlen(name);
-
-	while (*p)
-		p = &((*p)->next);
-	*p = calloc(1, (sizeof(**p) + len + 1));
-	strncpy((*p)->name, name, len);
-}
-
-static int supported_man_viewer(const char *name, size_t len)
-{
-	return (!strncasecmp("man", name, len) ||
-		!strncasecmp("woman", name, len) ||
-		!strncasecmp("konqueror", name, len));
-}
-
-static void do_add_man_viewer_info(const char *name,
-				   size_t len,
-				   const char *value)
-{
-	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
-
-	strncpy(new->name, name, len);
-	new->info = strdup(value);
-	new->next = man_viewer_info_list;
-	man_viewer_info_list = new;
-}
-
-static int add_man_viewer_path(const char *name,
-			       size_t len,
-			       const char *value)
-{
-	if (supported_man_viewer(name, len))
-		do_add_man_viewer_info(name, len, value);
-	else
-		warning("'%s': path for unsupported man viewer.\n"
-			"Please consider using 'man.<tool>.cmd' instead.",
-			name);
-
-	return 0;
-}
-
-static int add_man_viewer_cmd(const char *name,
-			      size_t len,
-			      const char *value)
-{
-	if (supported_man_viewer(name, len))
-		warning("'%s': cmd for supported man viewer.\n"
-			"Please consider using 'man.<tool>.path' instead.",
-			name);
-	else
-		do_add_man_viewer_info(name, len, value);
-
-	return 0;
-}
-
-static int add_man_viewer_info(const char *var, const char *value)
-{
-	const char *name = var + 4;
-	const char *subkey = strrchr(name, '.');
-
-	if (!subkey)
-		return error("Config with no key for man viewer: %s", name);
-
-	if (!strcmp(subkey, ".path")) {
-		if (!value)
-			return config_error_nonbool(var);
-		return add_man_viewer_path(name, subkey - name, value);
-	}
-	if (!strcmp(subkey, ".cmd")) {
-		if (!value)
-			return config_error_nonbool(var);
-		return add_man_viewer_cmd(name, subkey - name, value);
-	}
-
-	warning("'%s': unsupported man viewer sub key.", subkey);
-	return 0;
-}
-
-static int perf_help_config(const char *var, const char *value, void *cb)
-{
-	if (!strcmp(var, "help.format")) {
-		if (!value)
-			return config_error_nonbool(var);
-		help_format = parse_help_format(value);
-		return 0;
-	}
-	if (!strcmp(var, "man.viewer")) {
-		if (!value)
-			return config_error_nonbool(var);
-		add_man_viewer(value);
-		return 0;
-	}
-	if (!prefixcmp(var, "man."))
-		return add_man_viewer_info(var, value);
-
-	return perf_default_config(var, value, cb);
-}
-
-static struct cmdnames main_cmds, other_cmds;
-
-void list_common_cmds_help(void)
-{
-	int i, longest = 0;
-
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
-		if (longest < strlen(common_cmds[i].name))
-			longest = strlen(common_cmds[i].name);
-	}
-
-	puts(" The most commonly used perf commands are:");
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
-		printf("   %s   ", common_cmds[i].name);
-		mput_char(' ', longest - strlen(common_cmds[i].name));
-		puts(common_cmds[i].help);
-	}
-}
-
-static int is_perf_command(const char *s)
-{
-	return is_in_cmdlist(&main_cmds, s) ||
-		is_in_cmdlist(&other_cmds, s);
-}
-
-static const char *prepend(const char *prefix, const char *cmd)
-{
-	size_t pre_len = strlen(prefix);
-	size_t cmd_len = strlen(cmd);
-	char *p = malloc(pre_len + cmd_len + 1);
-	memcpy(p, prefix, pre_len);
-	strcpy(p + pre_len, cmd);
-	return p;
-}
-
-static const char *cmd_to_page(const char *perf_cmd)
-{
-	if (!perf_cmd)
-		return "perf";
-	else if (!prefixcmp(perf_cmd, "perf"))
-		return perf_cmd;
-	else if (is_perf_command(perf_cmd))
-		return prepend("perf-", perf_cmd);
-	else
-		return prepend("perf-", perf_cmd);
-}
-
-static void setup_man_path(void)
-{
-	struct strbuf new_path = STRBUF_INIT;
-	const char *old_path = getenv("MANPATH");
-
-	/* We should always put ':' after our path. If there is no
-	 * old_path, the ':' at the end will let 'man' to try
-	 * system-wide paths after ours to find the manual page. If
-	 * there is old_path, we need ':' as delimiter. */
-	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
-	strbuf_addch(&new_path, ':');
-	if (old_path)
-		strbuf_addstr(&new_path, old_path);
-
-	setenv("MANPATH", new_path.buf, 1);
-
-	strbuf_release(&new_path);
-}
-
-static void exec_viewer(const char *name, const char *page)
-{
-	const char *info = get_man_viewer_info(name);
-
-	if (!strcasecmp(name, "man"))
-		exec_man_man(info, page);
-	else if (!strcasecmp(name, "woman"))
-		exec_woman_emacs(info, page);
-	else if (!strcasecmp(name, "konqueror"))
-		exec_man_konqueror(info, page);
-	else if (info)
-		exec_man_cmd(info, page);
-	else
-		warning("'%s': unknown man viewer.", name);
-}
-
-static void show_man_page(const char *perf_cmd)
-{
-	struct man_viewer_list *viewer;
-	const char *page = cmd_to_page(perf_cmd);
-	const char *fallback = getenv("PERF_MAN_VIEWER");
-
-	setup_man_path();
-	for (viewer = man_viewer_list; viewer; viewer = viewer->next)
-	{
-		exec_viewer(viewer->name, page); /* will return when unable */
-	}
-	if (fallback)
-		exec_viewer(fallback, page);
-	exec_viewer("man", page);
-	die("no man viewer handled the request");
-}
-
-static void show_info_page(const char *perf_cmd)
-{
-	const char *page = cmd_to_page(perf_cmd);
-	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
-	execlp("info", "info", "perfman", page, NULL);
-}
-
-static void get_html_page_path(struct strbuf *page_path, const char *page)
-{
-	struct stat st;
-	const char *html_path = system_path(PERF_HTML_PATH);
-
-	/* Check that we have a perf documentation directory. */
-	if (stat(mkpath("%s/perf.html", html_path), &st)
-	    || !S_ISREG(st.st_mode))
-		die("'%s': not a documentation directory.", html_path);
-
-	strbuf_init(page_path, 0);
-	strbuf_addf(page_path, "%s/%s.html", html_path, page);
-}
-
-/*
- * If open_html is not defined in a platform-specific way (see for
- * example compat/mingw.h), we use the script web--browse to display
- * HTML.
- */
-#ifndef open_html
-static void open_html(const char *path)
-{
-	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
-}
-#endif
-
-static void show_html_page(const char *perf_cmd)
-{
-	const char *page = cmd_to_page(perf_cmd);
-	struct strbuf page_path; /* it leaks but we exec bellow */
-
-	get_html_page_path(&page_path, page);
-
-	open_html(page_path.buf);
-}
-
-int cmd_help(int argc, const char **argv, const char *prefix)
-{
-	const char *alias;
-	load_command_list("perf-", &main_cmds, &other_cmds);
-
-	perf_config(perf_help_config, NULL);
-
-	argc = parse_options(argc, argv, builtin_help_options,
-			builtin_help_usage, 0);
-
-	if (show_all) {
-		printf("\n usage: %s\n\n", perf_usage_string);
-		list_commands("perf commands", &main_cmds, &other_cmds);
-		printf(" %s\n\n", perf_more_info_string);
-		return 0;
-	}
-
-	if (!argv[0]) {
-		printf("\n usage: %s\n\n", perf_usage_string);
-		list_common_cmds_help();
-		printf("\n %s\n\n", perf_more_info_string);
-		return 0;
-	}
-
-	alias = alias_lookup(argv[0]);
-	if (alias && !is_perf_command(argv[0])) {
-		printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
-		return 0;
-	}
-
-	switch (help_format) {
-	case HELP_FORMAT_MAN:
-		show_man_page(argv[0]);
-		break;
-	case HELP_FORMAT_INFO:
-		show_info_page(argv[0]);
-		break;
-	case HELP_FORMAT_WEB:
-		show_html_page(argv[0]);
-		break;
-	}
-
-	return 0;
-}
diff --git a/Documentation/perf_counter/builtin-list.c b/Documentation/perf_counter/builtin-list.c
deleted file mode 100644
index fe60e37..0000000
--- a/Documentation/perf_counter/builtin-list.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * builtin-list.c
- *
- * Builtin list command: list all event types
- *
- * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- */
-#include "builtin.h"
-
-#include "perf.h"
-
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-
-int cmd_list(int argc, const char **argv, const char *prefix)
-{
-	print_events();
-	return 0;
-}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
deleted file mode 100644
index aeab9c4..0000000
--- a/Documentation/perf_counter/builtin-record.c
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * builtin-record.c
- *
- * Builtin record command: Record the profile of a workload
- * (or a CPU, or a PID) into the perf.data output file - for
- * later analysis via perf report.
- */
-#include "builtin.h"
-
-#include "perf.h"
-
-#include "util/util.h"
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-#include "util/string.h"
-
-#include <unistd.h>
-#include <sched.h>
-
-#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
-#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
-
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static long			default_interval		= 100000;
-
-static int			nr_cpus				= 0;
-static unsigned int		page_size;
-static unsigned int		mmap_pages			= 128;
-static int			freq				= 0;
-static int			output;
-static const char		*output_name			= "perf.data";
-static int			group				= 0;
-static unsigned int		realtime_prio			= 0;
-static int			system_wide			= 0;
-static pid_t			target_pid			= -1;
-static int			inherit				= 1;
-static int			force				= 0;
-static int			append_file			= 0;
-
-static long			samples;
-static struct timeval		last_read;
-static struct timeval		this_read;
-
-static __u64			bytes_written;
-
-static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
-
-static int			nr_poll;
-static int			nr_cpu;
-
-struct mmap_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	__u64				start;
-	__u64				len;
-	__u64				pgoff;
-	char				filename[PATH_MAX];
-};
-
-struct comm_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	char				comm[16];
-};
-
-
-struct mmap_data {
-	int			counter;
-	void			*base;
-	unsigned int		mask;
-	unsigned int		prev;
-};
-
-static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_counter_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-static void mmap_read(struct mmap_data *md)
-{
-	unsigned int head = mmap_read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
-	unsigned long size;
-	void *buf;
-	int diff;
-
-	gettimeofday(&this_read, NULL);
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		struct timeval iv;
-		unsigned long msecs;
-
-		timersub(&this_read, &last_read, &iv);
-		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-		fprintf(stderr, "WARNING: failed to keep up with mmap data."
-				"  Last read %lu msecs ago.\n", msecs);
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	last_read = this_read;
-
-	if (old != head)
-		samples++;
-
-	size = head - old;
-
-	if ((old & md->mask) + size != (head & md->mask)) {
-		buf = &data[old & md->mask];
-		size = md->mask + 1 - (old & md->mask);
-		old += size;
-
-		while (size) {
-			int ret = write(output, buf, size);
-
-			if (ret < 0)
-				die("failed to write");
-
-			size -= ret;
-			buf += ret;
-
-			bytes_written += ret;
-		}
-	}
-
-	buf = &data[old & md->mask];
-	size = head - old;
-	old += size;
-
-	while (size) {
-		int ret = write(output, buf, size);
-
-		if (ret < 0)
-			die("failed to write");
-
-		size -= ret;
-		buf += ret;
-
-		bytes_written += ret;
-	}
-
-	md->prev = old;
-}
-
-static volatile int done = 0;
-
-static void sig_handler(int sig)
-{
-	done = 1;
-}
-
-static void pid_synthesize_comm_event(pid_t pid, int full)
-{
-	struct comm_event comm_ev;
-	char filename[PATH_MAX];
-	char bf[BUFSIZ];
-	int fd, ret;
-	size_t size;
-	char *field, *sep;
-	DIR *tasks;
-	struct dirent dirent, *next;
-
-	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
-
-	fd = open(filename, O_RDONLY);
-	if (fd < 0) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
-	}
-	if (read(fd, bf, sizeof(bf)) < 0) {
-		fprintf(stderr, "couldn't read %s\n", filename);
-		exit(EXIT_FAILURE);
-	}
-	close(fd);
-
-	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
-	memset(&comm_ev, 0, sizeof(comm_ev));
-	field = strchr(bf, '(');
-	if (field == NULL)
-		goto out_failure;
-	sep = strchr(++field, ')');
-	if (sep == NULL)
-		goto out_failure;
-	size = sep - field;
-	memcpy(comm_ev.comm, field, size++);
-
-	comm_ev.pid = pid;
-	comm_ev.header.type = PERF_EVENT_COMM;
-	size = ALIGN(size, sizeof(uint64_t));
-	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
-
-	if (!full) {
-		comm_ev.tid = pid;
-
-		ret = write(output, &comm_ev, comm_ev.header.size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
-		return;
-	}
-
-	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
-
-	tasks = opendir(filename);
-	while (!readdir_r(tasks, &dirent, &next) && next) {
-		char *end;
-		pid = strtol(dirent.d_name, &end, 10);
-		if (*end)
-			continue;
-
-		comm_ev.tid = pid;
-
-		ret = write(output, &comm_ev, comm_ev.header.size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
-	}
-	closedir(tasks);
-	return;
-
-out_failure:
-	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
-		filename);
-	exit(EXIT_FAILURE);
-}
-
-static void pid_synthesize_mmap_samples(pid_t pid)
-{
-	char filename[PATH_MAX];
-	FILE *fp;
-
-	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
-
-	fp = fopen(filename, "r");
-	if (fp == NULL) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
-	}
-	while (1) {
-		char bf[BUFSIZ], *pbf = bf;
-		struct mmap_event mmap_ev = {
-			.header.type = PERF_EVENT_MMAP,
-		};
-		int n;
-		size_t size;
-		if (fgets(bf, sizeof(bf), fp) == NULL)
-			break;
-
-		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-		n = hex2u64(pbf, &mmap_ev.start);
-		if (n < 0)
-			continue;
-		pbf += n + 1;
-		n = hex2u64(pbf, &mmap_ev.len);
-		if (n < 0)
-			continue;
-		pbf += n + 3;
-		if (*pbf == 'x') { /* vm_exec */
-			char *execname = strrchr(bf, ' ');
-
-			if (execname == NULL || execname[1] != '/')
-				continue;
-
-			execname += 1;
-			size = strlen(execname);
-			execname[size - 1] = '\0'; /* Remove \n */
-			memcpy(mmap_ev.filename, execname, size);
-			size = ALIGN(size, sizeof(uint64_t));
-			mmap_ev.len -= mmap_ev.start;
-			mmap_ev.header.size = (sizeof(mmap_ev) -
-					       (sizeof(mmap_ev.filename) - size));
-			mmap_ev.pid = pid;
-			mmap_ev.tid = pid;
-
-			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
-				perror("failed to write");
-				exit(-1);
-			}
-		}
-	}
-
-	fclose(fp);
-}
-
-static void synthesize_samples(void)
-{
-	DIR *proc;
-	struct dirent dirent, *next;
-
-	proc = opendir("/proc");
-
-	while (!readdir_r(proc, &dirent, &next) && next) {
-		char *end;
-		pid_t pid;
-
-		pid = strtol(dirent.d_name, &end, 10);
-		if (*end) /* only interested in proper numerical dirents */
-			continue;
-
-		pid_synthesize_comm_event(pid, 1);
-		pid_synthesize_mmap_samples(pid);
-	}
-
-	closedir(proc);
-}
-
-static int group_fd;
-
-static void create_counter(int counter, int cpu, pid_t pid)
-{
-	struct perf_counter_attr *attr = attrs + counter;
-	int track = 1;
-
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
-	if (freq) {
-		attr->freq		= 1;
-		attr->sample_freq	= freq;
-	}
-	attr->mmap		= track;
-	attr->comm		= track;
-	attr->inherit		= (cpu < 0) && inherit;
-
-	track = 0; /* only the first counter needs these */
-
-	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
-
-	if (fd[nr_cpu][counter] < 0) {
-		int err = errno;
-
-		error("syscall returned with %d (%s)\n",
-				fd[nr_cpu][counter], strerror(err));
-		if (err == EPERM)
-			printf("Are you root?\n");
-		exit(-1);
-	}
-	assert(fd[nr_cpu][counter] >= 0);
-	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
-
-	/*
-	 * First counter acts as the group leader:
-	 */
-	if (group && group_fd == -1)
-		group_fd = fd[nr_cpu][counter];
-
-	event_array[nr_poll].fd = fd[nr_cpu][counter];
-	event_array[nr_poll].events = POLLIN;
-	nr_poll++;
-
-	mmap_array[nr_cpu][counter].counter = counter;
-	mmap_array[nr_cpu][counter].prev = 0;
-	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
-	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		exit(-1);
-	}
-}
-
-static void open_counters(int cpu, pid_t pid)
-{
-	int counter;
-
-	if (pid > 0) {
-		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_samples(pid);
-	}
-
-	group_fd = -1;
-	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter, cpu, pid);
-
-	nr_cpu++;
-}
-
-static int __cmd_record(int argc, const char **argv)
-{
-	int i, counter;
-	struct stat st;
-	pid_t pid;
-	int flags;
-	int ret;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (!stat(output_name, &st) && !force && !append_file) {
-		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
-				output_name);
-		exit(-1);
-	}
-
-	flags = O_CREAT|O_RDWR;
-	if (append_file)
-		flags |= O_APPEND;
-	else
-		flags |= O_TRUNC;
-
-	output = open(output_name, flags, S_IRUSR|S_IWUSR);
-	if (output < 0) {
-		perror("failed to create output file");
-		exit(-1);
-	}
-
-	if (!system_wide) {
-		open_counters(-1, target_pid != -1 ? target_pid : getpid());
-	} else for (i = 0; i < nr_cpus; i++)
-		open_counters(i, target_pid);
-
-	signal(SIGCHLD, sig_handler);
-	signal(SIGINT, sig_handler);
-
-	if (target_pid == -1 && argc) {
-		pid = fork();
-		if (pid < 0)
-			perror("failed to fork");
-
-		if (!pid) {
-			if (execvp(argv[0], (char **)argv)) {
-				perror(argv[0]);
-				exit(-1);
-			}
-		}
-	}
-
-	if (realtime_prio) {
-		struct sched_param param;
-
-		param.sched_priority = realtime_prio;
-		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-			printf("Could not set realtime priority.\n");
-			exit(-1);
-		}
-	}
-
-	if (system_wide)
-		synthesize_samples();
-
-	while (!done) {
-		int hits = samples;
-
-		for (i = 0; i < nr_cpu; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
-
-		if (hits == samples)
-			ret = poll(event_array, nr_poll, 100);
-	}
-
-	/*
-	 * Approximate RIP event size: 24 bytes.
-	 */
-	fprintf(stderr,
-		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
-		(double)bytes_written / 1024.0 / 1024.0,
-		output_name,
-		bytes_written / 24);
-
-	return 0;
-}
-
-static const char * const record_usage[] = {
-	"perf record [<options>] [<command>]",
-	"perf record [<options>] -- <command> [<options>]",
-	NULL
-};
-
-static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
-		     "event selector. use 'perf list' to list available events",
-		     parse_events),
-	OPT_INTEGER('p', "pid", &target_pid,
-		    "record events on existing pid"),
-	OPT_INTEGER('r', "realtime", &realtime_prio,
-		    "collect data with this RT SCHED_FIFO priority"),
-	OPT_BOOLEAN('a', "all-cpus", &system_wide,
-			    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('A', "append", &append_file,
-			    "append to the output file to do incremental profiling"),
-	OPT_BOOLEAN('f', "force", &force,
-			"overwrite existing data file"),
-	OPT_LONG('c', "count", &default_interval,
-		    "event period to sample"),
-	OPT_STRING('o', "output", &output_name, "file",
-		    "output file name"),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
-	OPT_INTEGER('F', "freq", &freq,
-		    "profile at this frequency"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
-	OPT_END()
-};
-
-int cmd_record(int argc, const char **argv, const char *prefix)
-{
-	int counter;
-
-	argc = parse_options(argc, argv, options, record_usage, 0);
-	if (!argc && target_pid == -1 && !system_wide)
-		usage_with_options(record_usage, options);
-
-	if (!nr_counters)
-		nr_counters = 1;
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (attrs[counter].sample_period)
-			continue;
-
-		attrs[counter].sample_period = default_interval;
-	}
-
-	return __cmd_record(argc, argv);
-}
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
deleted file mode 100644
index 242e09f..0000000
--- a/Documentation/perf_counter/builtin-report.c
+++ /dev/null
@@ -1,1291 +0,0 @@
-/*
- * builtin-report.c
- *
- * Builtin report command: Analyze the perf.data input file,
- * look up and read DSOs and symbol information and display
- * a histogram of results, along various sorting keys.
- */
-#include "builtin.h"
-
-#include "util/util.h"
-
-#include "util/color.h"
-#include "util/list.h"
-#include "util/cache.h"
-#include "util/rbtree.h"
-#include "util/symbol.h"
-#include "util/string.h"
-
-#include "perf.h"
-
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-
-#define SHOW_KERNEL	1
-#define SHOW_USER	2
-#define SHOW_HV		4
-
-static char		const *input_name = "perf.data";
-static char		*vmlinux = NULL;
-
-static char		default_sort_order[] = "comm,dso";
-static char		*sort_order = default_sort_order;
-
-static int		input;
-static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
-
-static int		dump_trace = 0;
-#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
-
-static int		verbose;
-static int		full_paths;
-
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
-
-struct ip_event {
-	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
-};
-
-struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
-};
-
-struct comm_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	char comm[16];
-};
-
-struct fork_event {
-	struct perf_event_header header;
-	__u32 pid, ppid;
-};
-
-struct period_event {
-	struct perf_event_header header;
-	__u64 time;
-	__u64 id;
-	__u64 sample_period;
-};
-
-typedef union event_union {
-	struct perf_event_header	header;
-	struct ip_event			ip;
-	struct mmap_event		mmap;
-	struct comm_event		comm;
-	struct fork_event		fork;
-	struct period_event		period;
-} event_t;
-
-static LIST_HEAD(dsos);
-static struct dso *kernel_dso;
-static struct dso *vdso;
-
-static void dsos__add(struct dso *dso)
-{
-	list_add_tail(&dso->node, &dsos);
-}
-
-static struct dso *dsos__find(const char *name)
-{
-	struct dso *pos;
-
-	list_for_each_entry(pos, &dsos, node)
-		if (strcmp(pos->name, name) == 0)
-			return pos;
-	return NULL;
-}
-
-static struct dso *dsos__findnew(const char *name)
-{
-	struct dso *dso = dsos__find(name);
-	int nr;
-
-	if (dso)
-		return dso;
-
-	dso = dso__new(name, 0);
-	if (!dso)
-		goto out_delete_dso;
-
-	nr = dso__load(dso, NULL, verbose);
-	if (nr < 0) {
-		if (verbose)
-			fprintf(stderr, "Failed to open: %s\n", name);
-		goto out_delete_dso;
-	}
-	if (!nr && verbose) {
-		fprintf(stderr,
-		"No symbols found in: %s, maybe install a debug package?\n",
-				name);
-	}
-
-	dsos__add(dso);
-
-	return dso;
-
-out_delete_dso:
-	dso__delete(dso);
-	return NULL;
-}
-
-static void dsos__fprintf(FILE *fp)
-{
-	struct dso *pos;
-
-	list_for_each_entry(pos, &dsos, node)
-		dso__fprintf(pos, fp);
-}
-
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
-{
-	return dso__find_symbol(kernel_dso, ip);
-}
-
-static int load_kernel(void)
-{
-	int err;
-
-	kernel_dso = dso__new("[kernel]", 0);
-	if (!kernel_dso)
-		return -1;
-
-	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
-	if (err) {
-		dso__delete(kernel_dso);
-		kernel_dso = NULL;
-	} else
-		dsos__add(kernel_dso);
-
-	vdso = dso__new("[vdso]", 0);
-	if (!vdso)
-		return -1;
-
-	vdso->find_symbol = vdso__find_symbol;
-
-	dsos__add(vdso);
-
-	return err;
-}
-
-static char __cwd[PATH_MAX];
-static char *cwd = __cwd;
-static int cwdlen;
-
-static int strcommon(const char *pathname)
-{
-	int n = 0;
-
-	while (pathname[n] == cwd[n] && n < cwdlen)
-		++n;
-
-	return n;
-}
-
-struct map {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
-	struct dso	 *dso;
-};
-
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
-{
-	return ip - map->start + map->pgoff;
-}
-
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
-{
-	return ip;
-}
-
-static struct map *map__new(struct mmap_event *event)
-{
-	struct map *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		const char *filename = event->filename;
-		char newfilename[PATH_MAX];
-
-		if (cwd) {
-			int n = strcommon(filename);
-
-			if (n == cwdlen) {
-				snprintf(newfilename, sizeof(newfilename),
-					 ".%s", filename + n);
-				filename = newfilename;
-			}
-		}
-
-		self->start = event->start;
-		self->end   = event->start + event->len;
-		self->pgoff = event->pgoff;
-
-		self->dso = dsos__findnew(filename);
-		if (self->dso == NULL)
-			goto out_delete;
-
-		if (self->dso == vdso)
-			self->map_ip = vdso__map_ip;
-		else
-			self->map_ip = map__map_ip;
-	}
-	return self;
-out_delete:
-	free(self);
-	return NULL;
-}
-
-static struct map *map__clone(struct map *self)
-{
-	struct map *map = malloc(sizeof(*self));
-
-	if (!map)
-		return NULL;
-
-	memcpy(map, self, sizeof(*self));
-
-	return map;
-}
-
-static int map__overlap(struct map *l, struct map *r)
-{
-	if (l->start > r->start) {
-		struct map *t = l;
-		l = r;
-		r = t;
-	}
-
-	if (l->end > r->start)
-		return 1;
-
-	return 0;
-}
-
-static size_t map__fprintf(struct map *self, FILE *fp)
-{
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
-		       self->start, self->end, self->pgoff, self->dso->name);
-}
-
-
-struct thread {
-	struct rb_node	 rb_node;
-	struct list_head maps;
-	pid_t		 pid;
-	char		 *comm;
-};
-
-static struct thread *thread__new(pid_t pid)
-{
-	struct thread *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->pid = pid;
-		self->comm = malloc(32);
-		if (self->comm)
-			snprintf(self->comm, 32, ":%d", self->pid);
-		INIT_LIST_HEAD(&self->maps);
-	}
-
-	return self;
-}
-
-static int thread__set_comm(struct thread *self, const char *comm)
-{
-	if (self->comm)
-		free(self->comm);
-	self->comm = strdup(comm);
-	return self->comm ? 0 : -ENOMEM;
-}
-
-static size_t thread__fprintf(struct thread *self, FILE *fp)
-{
-	struct map *pos;
-	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
-
-	list_for_each_entry(pos, &self->maps, node)
-		ret += map__fprintf(pos, fp);
-
-	return ret;
-}
-
-
-static struct rb_root threads;
-static struct thread *last_match;
-
-static struct thread *threads__findnew(pid_t pid)
-{
-	struct rb_node **p = &threads.rb_node;
-	struct rb_node *parent = NULL;
-	struct thread *th;
-
-	/*
-	 * Font-end cache - PID lookups come in blocks,
-	 * so most of the time we dont have to look up
-	 * the full rbtree:
-	 */
-	if (last_match && last_match->pid == pid)
-		return last_match;
-
-	while (*p != NULL) {
-		parent = *p;
-		th = rb_entry(parent, struct thread, rb_node);
-
-		if (th->pid == pid) {
-			last_match = th;
-			return th;
-		}
-
-		if (pid < th->pid)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	th = thread__new(pid);
-	if (th != NULL) {
-		rb_link_node(&th->rb_node, parent, p);
-		rb_insert_color(&th->rb_node, &threads);
-		last_match = th;
-	}
-
-	return th;
-}
-
-static void thread__insert_map(struct thread *self, struct map *map)
-{
-	struct map *pos, *tmp;
-
-	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
-		if (map__overlap(pos, map)) {
-			list_del_init(&pos->node);
-			/* XXX leaks dsos */
-			free(pos);
-		}
-	}
-
-	list_add_tail(&map->node, &self->maps);
-}
-
-static int thread__fork(struct thread *self, struct thread *parent)
-{
-	struct map *map;
-
-	if (self->comm)
-		free(self->comm);
-	self->comm = strdup(parent->comm);
-	if (!self->comm)
-		return -ENOMEM;
-
-	list_for_each_entry(map, &parent->maps, node) {
-		struct map *new = map__clone(map);
-		if (!new)
-			return -ENOMEM;
-		thread__insert_map(self, new);
-	}
-
-	return 0;
-}
-
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
-{
-	struct map *pos;
-
-	if (self == NULL)
-		return NULL;
-
-	list_for_each_entry(pos, &self->maps, node)
-		if (ip >= pos->start && ip <= pos->end)
-			return pos;
-
-	return NULL;
-}
-
-static size_t threads__fprintf(FILE *fp)
-{
-	size_t ret = 0;
-	struct rb_node *nd;
-
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *pos = rb_entry(nd, struct thread, rb_node);
-
-		ret += thread__fprintf(pos, fp);
-	}
-
-	return ret;
-}
-
-/*
- * histogram, sorted on item, collects counts
- */
-
-static struct rb_root hist;
-
-struct hist_entry {
-	struct rb_node	 rb_node;
-
-	struct thread	 *thread;
-	struct map	 *map;
-	struct dso	 *dso;
-	struct symbol	 *sym;
-	uint64_t	 ip;
-	char		 level;
-
-	uint32_t	 count;
-};
-
-/*
- * configurable sorting bits
- */
-
-struct sort_entry {
-	struct list_head list;
-
-	char *header;
-
-	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-	size_t	(*print)(FILE *fp, struct hist_entry *);
-};
-
-/* --sort pid */
-
-static int64_t
-sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	return right->thread->pid - left->thread->pid;
-}
-
-static size_t
-sort__thread_print(FILE *fp, struct hist_entry *self)
-{
-	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
-}
-
-static struct sort_entry sort_thread = {
-	.header = "         Command:  Pid",
-	.cmp	= sort__thread_cmp,
-	.print	= sort__thread_print,
-};
-
-/* --sort comm */
-
-static int64_t
-sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	return right->thread->pid - left->thread->pid;
-}
-
-static int64_t
-sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
-{
-	char *comm_l = left->thread->comm;
-	char *comm_r = right->thread->comm;
-
-	if (!comm_l || !comm_r) {
-		if (!comm_l && !comm_r)
-			return 0;
-		else if (!comm_l)
-			return -1;
-		else
-			return 1;
-	}
-
-	return strcmp(comm_l, comm_r);
-}
-
-static size_t
-sort__comm_print(FILE *fp, struct hist_entry *self)
-{
-	return fprintf(fp, "%16s", self->thread->comm);
-}
-
-static struct sort_entry sort_comm = {
-	.header		= "         Command",
-	.cmp		= sort__comm_cmp,
-	.collapse	= sort__comm_collapse,
-	.print		= sort__comm_print,
-};
-
-/* --sort dso */
-
-static int64_t
-sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	struct dso *dso_l = left->dso;
-	struct dso *dso_r = right->dso;
-
-	if (!dso_l || !dso_r) {
-		if (!dso_l && !dso_r)
-			return 0;
-		else if (!dso_l)
-			return -1;
-		else
-			return 1;
-	}
-
-	return strcmp(dso_l->name, dso_r->name);
-}
-
-static size_t
-sort__dso_print(FILE *fp, struct hist_entry *self)
-{
-	if (self->dso)
-		return fprintf(fp, "%-25s", self->dso->name);
-
-	return fprintf(fp, "%016llx         ", (__u64)self->ip);
-}
-
-static struct sort_entry sort_dso = {
-	.header = "Shared Object            ",
-	.cmp	= sort__dso_cmp,
-	.print	= sort__dso_print,
-};
-
-/* --sort symbol */
-
-static int64_t
-sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	uint64_t ip_l, ip_r;
-
-	if (left->sym == right->sym)
-		return 0;
-
-	ip_l = left->sym ? left->sym->start : left->ip;
-	ip_r = right->sym ? right->sym->start : right->ip;
-
-	return (int64_t)(ip_r - ip_l);
-}
-
-static size_t
-sort__sym_print(FILE *fp, struct hist_entry *self)
-{
-	size_t ret = 0;
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
-
-	if (self->sym) {
-		ret += fprintf(fp, "[%c] %s",
-			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
-	} else {
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
-	}
-
-	return ret;
-}
-
-static struct sort_entry sort_sym = {
-	.header = "Symbol",
-	.cmp	= sort__sym_cmp,
-	.print	= sort__sym_print,
-};
-
-static int sort__need_collapse = 0;
-
-struct sort_dimension {
-	char			*name;
-	struct sort_entry	*entry;
-	int			taken;
-};
-
-static struct sort_dimension sort_dimensions[] = {
-	{ .name = "pid",	.entry = &sort_thread,	},
-	{ .name = "comm",	.entry = &sort_comm,	},
-	{ .name = "dso",	.entry = &sort_dso,	},
-	{ .name = "symbol",	.entry = &sort_sym,	},
-};
-
-static LIST_HEAD(hist_entry__sort_list);
-
-static int sort_dimension__add(char *tok)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
-		struct sort_dimension *sd = &sort_dimensions[i];
-
-		if (sd->taken)
-			continue;
-
-		if (strncasecmp(tok, sd->name, strlen(tok)))
-			continue;
-
-		if (sd->entry->collapse)
-			sort__need_collapse = 1;
-
-		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
-		sd->taken = 1;
-
-		return 0;
-	}
-
-	return -ESRCH;
-}
-
-static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
-{
-	struct sort_entry *se;
-	int64_t cmp = 0;
-
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		cmp = se->cmp(left, right);
-		if (cmp)
-			break;
-	}
-
-	return cmp;
-}
-
-static int64_t
-hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
-{
-	struct sort_entry *se;
-	int64_t cmp = 0;
-
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		int64_t (*f)(struct hist_entry *, struct hist_entry *);
-
-		f = se->collapse ?: se->cmp;
-
-		cmp = f(left, right);
-		if (cmp)
-			break;
-	}
-
-	return cmp;
-}
-
-static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
-{
-	struct sort_entry *se;
-	size_t ret;
-
-	if (total_samples) {
-		double percent = self->count * 100.0 / total_samples;
-		char *color = PERF_COLOR_NORMAL;
-
-		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
-		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		if (percent < 0.5)
-			color = PERF_COLOR_GREEN;
-
-		ret = color_fprintf(fp, color, "   %6.2f%%",
-				(self->count * 100.0) / total_samples);
-	} else
-		ret = fprintf(fp, "%12d ", self->count);
-
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		fprintf(fp, "  ");
-		ret += se->print(fp, self);
-	}
-
-	ret += fprintf(fp, "\n");
-
-	return ret;
-}
-
-/*
- * collect histogram counts
- */
-
-static int
-hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
-{
-	struct rb_node **p = &hist.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *he;
-	struct hist_entry entry = {
-		.thread	= thread,
-		.map	= map,
-		.dso	= dso,
-		.sym	= sym,
-		.ip	= ip,
-		.level	= level,
-		.count	= 1,
-	};
-	int cmp;
-
-	while (*p != NULL) {
-		parent = *p;
-		he = rb_entry(parent, struct hist_entry, rb_node);
-
-		cmp = hist_entry__cmp(&entry, he);
-
-		if (!cmp) {
-			he->count++;
-			return 0;
-		}
-
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	he = malloc(sizeof(*he));
-	if (!he)
-		return -ENOMEM;
-	*he = entry;
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &hist);
-
-	return 0;
-}
-
-static void hist_entry__free(struct hist_entry *he)
-{
-	free(he);
-}
-
-/*
- * collapse the histogram
- */
-
-static struct rb_root collapse_hists;
-
-static void collapse__insert_entry(struct hist_entry *he)
-{
-	struct rb_node **p = &collapse_hists.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *iter;
-	int64_t cmp;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hist_entry, rb_node);
-
-		cmp = hist_entry__collapse(iter, he);
-
-		if (!cmp) {
-			iter->count += he->count;
-			hist_entry__free(he);
-			return;
-		}
-
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &collapse_hists);
-}
-
-static void collapse__resort(void)
-{
-	struct rb_node *next;
-	struct hist_entry *n;
-
-	if (!sort__need_collapse)
-		return;
-
-	next = rb_first(&hist);
-	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
-
-		rb_erase(&n->rb_node, &hist);
-		collapse__insert_entry(n);
-	}
-}
-
-/*
- * reverse the map, sort on count.
- */
-
-static struct rb_root output_hists;
-
-static void output__insert_entry(struct hist_entry *he)
-{
-	struct rb_node **p = &output_hists.rb_node;
-	struct rb_node *parent = NULL;
-	struct hist_entry *iter;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hist_entry, rb_node);
-
-		if (he->count > iter->count)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &output_hists);
-}
-
-static void output__resort(void)
-{
-	struct rb_node *next;
-	struct hist_entry *n;
-	struct rb_root *tree = &hist;
-
-	if (sort__need_collapse)
-		tree = &collapse_hists;
-
-	next = rb_first(tree);
-
-	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
-
-		rb_erase(&n->rb_node, tree);
-		output__insert_entry(n);
-	}
-}
-
-static size_t output__fprintf(FILE *fp, uint64_t total_samples)
-{
-	struct hist_entry *pos;
-	struct sort_entry *se;
-	struct rb_node *nd;
-	size_t ret = 0;
-
-	fprintf(fp, "\n");
-	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
-	fprintf(fp, "#\n");
-
-	fprintf(fp, "# Overhead");
-	list_for_each_entry(se, &hist_entry__sort_list, list)
-		fprintf(fp, "  %s", se->header);
-	fprintf(fp, "\n");
-
-	fprintf(fp, "# ........");
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		int i;
-
-		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header); i++)
-			fprintf(fp, ".");
-	}
-	fprintf(fp, "\n");
-
-	fprintf(fp, "#\n");
-
-	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct hist_entry, rb_node);
-		ret += hist_entry__fprintf(fp, pos, total_samples);
-	}
-
-	if (!strcmp(sort_order, default_sort_order)) {
-		fprintf(fp, "#\n");
-		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
-		fprintf(fp, "#\n");
-	}
-	fprintf(fp, "\n");
-
-	return ret;
-}
-
-static void register_idle_thread(void)
-{
-	struct thread *thread = threads__findnew(0);
-
-	if (thread == NULL ||
-			thread__set_comm(thread, "[idle]")) {
-		fprintf(stderr, "problem inserting idle task.\n");
-		exit(-1);
-	}
-}
-
-static unsigned long total = 0,
-		     total_mmap = 0,
-		     total_comm = 0,
-		     total_fork = 0,
-		     total_unknown = 0;
-
-static int
-process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	char level;
-	int show = 0;
-	struct dso *dso = NULL;
-	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
-	struct map *map = NULL;
-
-	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->header.misc,
-		event->ip.pid,
-		(void *)(long)ip);
-
-	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-	if (thread == NULL) {
-		fprintf(stderr, "problem processing %d event, skipping it.\n",
-			event->header.type);
-		return -1;
-	}
-
-	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-		show = SHOW_KERNEL;
-		level = 'k';
-
-		dso = kernel_dso;
-
-		dprintf(" ...... dso: %s\n", dso->name);
-
-	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-
-		show = SHOW_USER;
-		level = '.';
-
-		map = thread__find_map(thread, ip);
-		if (map != NULL) {
-			ip = map->map_ip(map, ip);
-			dso = map->dso;
-		} else {
-			/*
-			 * If this is outside of all known maps,
-			 * and is a negative address, try to look it
-			 * up in the kernel dso, as it might be a
-			 * vsyscall (which executes in user-mode):
-			 */
-			if ((long long)ip < 0)
-				dso = kernel_dso;
-		}
-		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
-
-	} else {
-		show = SHOW_HV;
-		level = 'H';
-		dprintf(" ...... dso: [hypervisor]\n");
-	}
-
-	if (show & show_mask) {
-		struct symbol *sym = NULL;
-
-		if (dso)
-			sym = dso->find_symbol(dso, ip);
-
-		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
-			fprintf(stderr,
-		"problem incrementing symbol count, skipping event\n");
-			return -1;
-		}
-	}
-	total++;
-
-	return 0;
-}
-
-static int
-process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->mmap.pid);
-	struct map *map = map__new(&event->mmap);
-
-	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->mmap.pid,
-		(void *)(long)event->mmap.start,
-		(void *)(long)event->mmap.len,
-		(void *)(long)event->mmap.pgoff,
-		event->mmap.filename);
-
-	if (thread == NULL || map == NULL) {
-		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
-		return 0;
-	}
-
-	thread__insert_map(thread, map);
-	total_mmap++;
-
-	return 0;
-}
-
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->comm.pid);
-
-	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->comm.comm, event->comm.pid);
-
-	if (thread == NULL ||
-	    thread__set_comm(thread, event->comm.comm)) {
-		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
-		return -1;
-	}
-	total_comm++;
-
-	return 0;
-}
-
-static int
-process_fork_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	struct thread *thread = threads__findnew(event->fork.pid);
-	struct thread *parent = threads__findnew(event->fork.ppid);
-
-	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->fork.pid, event->fork.ppid);
-
-	if (!thread || !parent || thread__fork(thread, parent)) {
-		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
-		return -1;
-	}
-	total_fork++;
-
-	return 0;
-}
-
-static int
-process_period_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->period.time,
-		event->period.id,
-		event->period.sample_period);
-
-	return 0;
-}
-
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
-		return process_overflow_event(event, offset, head);
-
-	switch (event->header.type) {
-	case PERF_EVENT_MMAP:
-		return process_mmap_event(event, offset, head);
-
-	case PERF_EVENT_COMM:
-		return process_comm_event(event, offset, head);
-
-	case PERF_EVENT_FORK:
-		return process_fork_event(event, offset, head);
-
-	case PERF_EVENT_PERIOD:
-		return process_period_event(event, offset, head);
-	/*
-	 * We dont process them right now but they are fine:
-	 */
-
-	case PERF_EVENT_THROTTLE:
-	case PERF_EVENT_UNTHROTTLE:
-		return 0;
-
-	default:
-		return -1;
-	}
-
-	return 0;
-}
-
-static int __cmd_report(void)
-{
-	int ret, rc = EXIT_FAILURE;
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	event_t *event;
-	uint32_t size;
-	char *buf;
-
-	register_idle_thread();
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	if (load_kernel() < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
-	}
-
-	if (!full_paths) {
-		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
-		}
-		cwdlen = strlen(cwd);
-	} else {
-		cwd = NULL;
-		cwdlen = 0;
-	}
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	size = event->header.size;
-	if (!size)
-		size = 8;
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-	size = event->header.size;
-
-	dprintf("%p [%p]: event: %d\n",
-			(void *)(offset + head),
-			(void *)(long)event->header.size,
-			event->header.type);
-
-	if (!size || process_event(event, offset, head) < 0) {
-
-		dprintf("%p [%p]: skipping unknown header type: %d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.type);
-
-		total_unknown++;
-
-		/*
-		 * assume we lost track of the stream, check alignment, and
-		 * increment a single u64 in the hope to catch on again 'soon'.
-		 */
-
-		if (unlikely(head & 7))
-			head &= ~7ULL;
-
-		size = 8;
-	}
-
-	head += size;
-
-	if (offset + head < stat.st_size)
-		goto more;
-
-	rc = EXIT_SUCCESS;
-	close(input);
-
-	dprintf("      IP events: %10ld\n", total);
-	dprintf("    mmap events: %10ld\n", total_mmap);
-	dprintf("    comm events: %10ld\n", total_comm);
-	dprintf("    fork events: %10ld\n", total_fork);
-	dprintf(" unknown events: %10ld\n", total_unknown);
-
-	if (dump_trace)
-		return 0;
-
-	if (verbose >= 3)
-		threads__fprintf(stdout);
-
-	if (verbose >= 2)
-		dsos__fprintf(stdout);
-
-	collapse__resort();
-	output__resort();
-	output__fprintf(stdout, total);
-
-	return rc;
-}
-
-static const char * const report_usage[] = {
-	"perf report [<options>] <command>",
-	NULL
-};
-
-static const struct option options[] = {
-	OPT_STRING('i', "input", &input_name, "file",
-		    "input file name"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
-		    "be more verbose (show symbol address, etc)"),
-	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
-		    "dump raw trace in ASCII"),
-	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
-	OPT_BOOLEAN('P', "full-paths", &full_paths,
-		    "Don't shorten the pathnames taking into account the cwd"),
-	OPT_END()
-};
-
-static void setup_sorting(void)
-{
-	char *tmp, *tok, *str = strdup(sort_order);
-
-	for (tok = strtok_r(str, ", ", &tmp);
-			tok; tok = strtok_r(NULL, ", ", &tmp)) {
-		if (sort_dimension__add(tok) < 0) {
-			error("Unknown --sort key: `%s'", tok);
-			usage_with_options(report_usage, options);
-		}
-	}
-
-	free(str);
-}
-
-int cmd_report(int argc, const char **argv, const char *prefix)
-{
-	symbol__init();
-
-	page_size = getpagesize();
-
-	argc = parse_options(argc, argv, options, report_usage, 0);
-
-	setup_sorting();
-
-	/*
-	 * Any (unrecognized) arguments left?
-	 */
-	if (argc)
-		usage_with_options(report_usage, options);
-
-	setup_pager();
-
-	return __cmd_report();
-}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
deleted file mode 100644
index 2cbf5a1..0000000
--- a/Documentation/perf_counter/builtin-stat.c
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * builtin-stat.c
- *
- * Builtin stat command: Give a precise performance counters summary
- * overview about any workload, CPU or specific PID.
- *
- * Sample output:
-
-   $ perf stat ~/hackbench 10
-   Time: 0.104
-
-    Performance counter stats for '/home/mingo/hackbench':
-
-       1255.538611  task clock ticks     #      10.143 CPU utilization factor
-             54011  context switches     #       0.043 M/sec
-               385  CPU migrations       #       0.000 M/sec
-             17755  pagefaults           #       0.014 M/sec
-        3808323185  CPU cycles           #    3033.219 M/sec
-        1575111190  instructions         #    1254.530 M/sec
-          17367895  cache references     #      13.833 M/sec
-           7674421  cache misses         #       6.112 M/sec
-
-    Wall-clock time elapsed:   123.786620 msecs
-
- *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- *
- * Improvements and fixes by:
- *
- *   Arjan van de Ven <arjan@linux.intel.com>
- *   Yanmin Zhang <yanmin.zhang@intel.com>
- *   Wu Fengguang <fengguang.wu@intel.com>
- *   Mike Galbraith <efault@gmx.de>
- *   Paul Mackerras <paulus@samba.org>
- *
- * Released under the GPL v2. (and only v2, not any later version)
- */
-
-#include "perf.h"
-#include "builtin.h"
-#include "util/util.h"
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-
-#include <sys/prctl.h>
-
-static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
-
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
-
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
-};
-
-static int			system_wide			=  0;
-static int			inherit				=  1;
-
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int			target_pid			= -1;
-static int			nr_cpus				=  0;
-static unsigned int		page_size;
-
-static int			scale				=  1;
-
-static const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
-static __u64			event_res[MAX_COUNTERS][3];
-static __u64			event_scaled[MAX_COUNTERS];
-
-static __u64			runtime_nsecs;
-static __u64			walltime_nsecs;
-
-static void create_perfstat_counter(int counter)
-{
-	struct perf_counter_attr *attr = attrs + counter;
-
-	if (scale)
-		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
-				    PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-		}
-	} else {
-		attr->inherit	= inherit;
-		attr->disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-/*
- * Does the counter have nsecs as a unit?
- */
-static inline int nsec_counter(int counter)
-{
-	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
-		return 0;
-
-	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
-		return 1;
-
-	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
-		return 1;
-
-	return 0;
-}
-
-/*
- * Read out the results of a single counter:
- */
-static void read_counter(int counter)
-{
-	__u64 *count, single_count[3];
-	ssize_t res;
-	int cpu, nv;
-	int scaled;
-
-	count = event_res[counter];
-
-	count[0] = count[1] = count[2] = 0;
-
-	nv = scale ? 3 : 1;
-	for (cpu = 0; cpu < nr_cpus; cpu ++) {
-		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
-		assert(res == nv * sizeof(__u64));
-
-		count[0] += single_count[0];
-		if (scale) {
-			count[1] += single_count[1];
-			count[2] += single_count[2];
-		}
-	}
-
-	scaled = 0;
-	if (scale) {
-		if (count[2] == 0) {
-			event_scaled[counter] = -1;
-			count[0] = 0;
-			return;
-		}
-
-		if (count[2] < count[1]) {
-			event_scaled[counter] = 1;
-			count[0] = (unsigned long long)
-				((double)count[0] * count[1] / count[2] + 0.5);
-		}
-	}
-	/*
-	 * Save the full runtime - to allow normalization during printout:
-	 */
-	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
-		runtime_nsecs = count[0];
-}
-
-/*
- * Print out the results of a single counter:
- */
-static void print_counter(int counter)
-{
-	__u64 *count;
-	int scaled;
-
-	count = event_res[counter];
-	scaled = event_scaled[counter];
-
-	if (scaled == -1) {
-		fprintf(stderr, " %14s  %-20s\n",
-			"<not counted>", event_name(counter));
-		return;
-	}
-
-	if (nsec_counter(counter)) {
-		double msecs = (double)count[0] / 1000000;
-
-		fprintf(stderr, " %14.6f  %-20s",
-			msecs, event_name(counter));
-		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
-
-			fprintf(stderr, " # %11.3f CPU utilization factor",
-				(double)count[0] / (double)walltime_nsecs);
-		}
-	} else {
-		fprintf(stderr, " %14Ld  %-20s",
-			count[0], event_name(counter));
-		if (runtime_nsecs)
-			fprintf(stderr, " # %11.3f M/sec",
-				(double)count[0]/runtime_nsecs*1000.0);
-	}
-	if (scaled)
-		fprintf(stderr, "  (scaled from %.2f%%)",
-			(double) count[2] / count[1] * 100);
-	fprintf(stderr, "\n");
-}
-
-static int do_perfstat(int argc, const char **argv)
-{
-	unsigned long long t0, t1;
-	int counter;
-	int status;
-	int pid;
-	int i;
-
-	if (!system_wide)
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-
-	if (!pid) {
-		if (execvp(argv[0], (char **)argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-
-	while (wait(&status) >= 0)
-		;
-
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	walltime_nsecs = t1 - t0;
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
-
-	for (i = 1; i < argc; i++)
-		fprintf(stderr, " %s", argv[i]);
-
-	fprintf(stderr, "\':\n");
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++)
-		read_counter(counter);
-
-	for (counter = 0; counter < nr_counters; counter++)
-		print_counter(counter);
-
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-
-static void skip_signal(int signo)
-{
-}
-
-static const char * const stat_usage[] = {
-	"perf stat [<options>] <command>",
-	NULL
-};
-
-static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
-		     "event selector. use 'perf list' to list available events",
-		     parse_events),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
-	OPT_INTEGER('p', "pid", &target_pid,
-		    "stat events on existing pid"),
-	OPT_BOOLEAN('a', "all-cpus", &system_wide,
-			    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('S', "scale", &scale,
-			    "scale/normalize counters"),
-	OPT_END()
-};
-
-int cmd_stat(int argc, const char **argv, const char *prefix)
-{
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	memcpy(attrs, default_attrs, sizeof(attrs));
-
-	argc = parse_options(argc, argv, options, stat_usage, 0);
-	if (!argc)
-		usage_with_options(stat_usage, options);
-
-	if (!nr_counters)
-		nr_counters = 8;
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	/*
-	 * We dont want to block the signals - that would cause
-	 * child tasks to inherit that and Ctrl-C would not work.
-	 * What we want is for Ctrl-C to work in the exec()-ed
-	 * task, but being ignored by perf stat itself:
-	 */
-	signal(SIGINT,  skip_signal);
-	signal(SIGALRM, skip_signal);
-	signal(SIGABRT, skip_signal);
-
-	return do_perfstat(argc, argv);
-}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
deleted file mode 100644
index f2e7312..0000000
--- a/Documentation/perf_counter/builtin-top.c
+++ /dev/null
@@ -1,692 +0,0 @@
-/*
- * builtin-top.c
- *
- * Builtin top command: Display a continuously updated profile of
- * any workload, CPU or specific PID.
- *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- *
- * Improvements and fixes by:
- *
- *   Arjan van de Ven <arjan@linux.intel.com>
- *   Yanmin Zhang <yanmin.zhang@intel.com>
- *   Wu Fengguang <fengguang.wu@intel.com>
- *   Mike Galbraith <efault@gmx.de>
- *   Paul Mackerras <paulus@samba.org>
- *
- * Released under the GPL v2. (and only v2, not any later version)
- */
-#include "builtin.h"
-
-#include "perf.h"
-
-#include "util/symbol.h"
-#include "util/color.h"
-#include "util/util.h"
-#include "util/rbtree.h"
-#include "util/parse-options.h"
-#include "util/parse-events.h"
-
-#include <assert.h>
-#include <fcntl.h>
-
-#include <stdio.h>
-
-#include <errno.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int			system_wide			=  0;
-
-static int			default_interval		= 100000;
-
-static __u64			count_filter			=  5;
-static int			print_entries			= 15;
-
-static int			target_pid			= -1;
-static int			profile_cpu			= -1;
-static int			nr_cpus				=  0;
-static unsigned int		realtime_prio			=  0;
-static int			group				=  0;
-static unsigned int		page_size;
-static unsigned int		mmap_pages			= 16;
-static int			freq				=  0;
-
-static char			*sym_filter;
-static unsigned long		filter_start;
-static unsigned long		filter_end;
-
-static int			delay_secs			=  2;
-static int			zero;
-static int			dump_symtab;
-
-/*
- * Symbols
- */
-
-static uint64_t			min_ip;
-static uint64_t			max_ip = -1ll;
-
-struct sym_entry {
-	struct rb_node		rb_node;
-	struct list_head	node;
-	unsigned long		count[MAX_COUNTERS];
-	unsigned long		snap_count;
-	double			weight;
-	int			skip;
-};
-
-struct sym_entry		*sym_filter_entry;
-
-struct dso			*kernel_dso;
-
-/*
- * Symbols will be added here in record_ip and will get out
- * after decayed.
- */
-static LIST_HEAD(active_symbols);
-static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
-
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym)
-{
-	double weight = sym->snap_count;
-	int counter;
-
-	for (counter = 1; counter < nr_counters-1; counter++)
-		weight *= sym->count[counter];
-
-	weight /= (sym->count[counter] + 1);
-
-	return weight;
-}
-
-static long			samples;
-static long			userspace_samples;
-static const char		CONSOLE_CLEAR[] = "[H[2J";
-
-static void __list_insert_active_sym(struct sym_entry *syme)
-{
-	list_add(&syme->node, &active_symbols);
-}
-
-static void list_remove_active_sym(struct sym_entry *syme)
-{
-	pthread_mutex_lock(&active_symbols_lock);
-	list_del_init(&syme->node);
-	pthread_mutex_unlock(&active_symbols_lock);
-}
-
-static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
-{
-	struct rb_node **p = &tree->rb_node;
-	struct rb_node *parent = NULL;
-	struct sym_entry *iter;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct sym_entry, rb_node);
-
-		if (se->weight > iter->weight)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&se->rb_node, parent, p);
-	rb_insert_color(&se->rb_node, tree);
-}
-
-static void print_sym_table(void)
-{
-	int printed = 0, j;
-	int counter;
-	float samples_per_sec = samples/delay_secs;
-	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
-	float sum_ksamples = 0.0;
-	struct sym_entry *syme, *n;
-	struct rb_root tmp = RB_ROOT;
-	struct rb_node *nd;
-
-	samples = userspace_samples = 0;
-
-	/* Sort the active symbols */
-	pthread_mutex_lock(&active_symbols_lock);
-	syme = list_entry(active_symbols.next, struct sym_entry, node);
-	pthread_mutex_unlock(&active_symbols_lock);
-
-	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
-		syme->snap_count = syme->count[0];
-		if (syme->snap_count != 0) {
-			syme->weight = sym_weight(syme);
-			rb_insert_active_sym(&tmp, syme);
-			sum_ksamples += syme->snap_count;
-
-			for (j = 0; j < nr_counters; j++)
-				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
-		} else
-			list_remove_active_sym(syme);
-	}
-
-	puts(CONSOLE_CLEAR);
-
-	printf(
-"------------------------------------------------------------------------------\n");
-	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
-		samples_per_sec,
-		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
-
-	if (nr_counters == 1) {
-		printf("%Ld", attrs[0].sample_period);
-		if (freq)
-			printf("Hz ");
-		else
-			printf(" ");
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (counter)
-			printf("/");
-
-		printf("%s", event_name(counter));
-	}
-
-	printf( "], ");
-
-	if (target_pid != -1)
-		printf(" (target_pid: %d", target_pid);
-	else
-		printf(" (all");
-
-	if (profile_cpu != -1)
-		printf(", cpu: %d)\n", profile_cpu);
-	else {
-		if (target_pid != -1)
-			printf(")\n");
-		else
-			printf(", %d CPUs)\n", nr_cpus);
-	}
-
-	printf("------------------------------------------------------------------------------\n\n");
-
-	if (nr_counters == 1)
-		printf("             samples    pcnt");
-	else
-		printf("  weight     samples    pcnt");
-
-	printf("         RIP          kernel function\n"
-	       	       "  ______     _______   _____   ________________   _______________\n\n"
-	);
-
-	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
-		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
-		struct symbol *sym = (struct symbol *)(syme + 1);
-		char *color = PERF_COLOR_NORMAL;
-		double pcnt;
-
-		if (++printed > print_entries || syme->snap_count < count_filter)
-			continue;
-
-		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
-					 sum_ksamples));
-
-		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
-		 */
-		if (pcnt >= 5.0)
-			color = PERF_COLOR_RED;
-		if (pcnt < 0.5)
-			color = PERF_COLOR_GREEN;
-
-		if (nr_counters == 1)
-			printf("%20.2f - ", syme->weight);
-		else
-			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
-
-		color_fprintf(stdout, color, "%4.1f%%", pcnt);
-		printf(" - %016llx : %s\n", sym->start, sym->name);
-	}
-}
-
-static void *display_thread(void *arg)
-{
-	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-	int delay_msecs = delay_secs * 1000;
-
-	printf("PerfTop refresh period: %d seconds\n", delay_secs);
-
-	do {
-		print_sym_table();
-	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
-
-	printf("key pressed - exiting.\n");
-	exit(0);
-
-	return NULL;
-}
-
-static int symbol_filter(struct dso *self, struct symbol *sym)
-{
-	static int filter_match;
-	struct sym_entry *syme;
-	const char *name = sym->name;
-
-	if (!strcmp(name, "_text") ||
-	    !strcmp(name, "_etext") ||
-	    !strcmp(name, "_sinittext") ||
-	    !strncmp("init_module", name, 11) ||
-	    !strncmp("cleanup_module", name, 14) ||
-	    strstr(name, "_text_start") ||
-	    strstr(name, "_text_end"))
-		return 1;
-
-	syme = dso__sym_priv(self, sym);
-	/* Tag samples to be skipped. */
-	if (!strcmp("default_idle", name) ||
-	    !strcmp("cpu_idle", name) ||
-	    !strcmp("enter_idle", name) ||
-	    !strcmp("exit_idle", name) ||
-	    !strcmp("mwait_idle", name))
-		syme->skip = 1;
-
-	if (filter_match == 1) {
-		filter_end = sym->start;
-		filter_match = -1;
-		if (filter_end - filter_start > 10000) {
-			fprintf(stderr,
-				"hm, too large filter symbol <%s> - skipping.\n",
-				sym_filter);
-			fprintf(stderr, "symbol filter start: %016lx\n",
-				filter_start);
-			fprintf(stderr, "                end: %016lx\n",
-				filter_end);
-			filter_end = filter_start = 0;
-			sym_filter = NULL;
-			sleep(1);
-		}
-	}
-
-	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
-		filter_match = 1;
-		filter_start = sym->start;
-	}
-
-
-	return 0;
-}
-
-static int parse_symbols(void)
-{
-	struct rb_node *node;
-	struct symbol  *sym;
-
-	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
-	if (kernel_dso == NULL)
-		return -1;
-
-	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
-		goto out_delete_dso;
-
-	node = rb_first(&kernel_dso->syms);
-	sym = rb_entry(node, struct symbol, rb_node);
-	min_ip = sym->start;
-
-	node = rb_last(&kernel_dso->syms);
-	sym = rb_entry(node, struct symbol, rb_node);
-	max_ip = sym->end;
-
-	if (dump_symtab)
-		dso__fprintf(kernel_dso, stderr);
-
-	return 0;
-
-out_delete_dso:
-	dso__delete(kernel_dso);
-	kernel_dso = NULL;
-	return -1;
-}
-
-#define TRACE_COUNT     3
-
-/*
- * Binary search in the histogram table and record the hit:
- */
-static void record_ip(uint64_t ip, int counter)
-{
-	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
-
-	if (sym != NULL) {
-		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
-
-		if (!syme->skip) {
-			syme->count[counter]++;
-			pthread_mutex_lock(&active_symbols_lock);
-			if (list_empty(&syme->node) || !syme->node.next)
-				__list_insert_active_sym(syme);
-			pthread_mutex_unlock(&active_symbols_lock);
-			return;
-		}
-	}
-
-	samples--;
-}
-
-static void process_event(uint64_t ip, int counter)
-{
-	samples++;
-
-	if (ip < min_ip || ip > max_ip) {
-		userspace_samples++;
-		return;
-	}
-
-	record_ip(ip, counter);
-}
-
-struct mmap_data {
-	int			counter;
-	void			*base;
-	unsigned int		mask;
-	unsigned int		prev;
-};
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_counter_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-struct timeval last_read, this_read;
-
-static void mmap_read(struct mmap_data *md)
-{
-	unsigned int head = mmap_read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
-	int diff;
-
-	gettimeofday(&this_read, NULL);
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		struct timeval iv;
-		unsigned long msecs;
-
-		timersub(&this_read, &last_read, &iv);
-		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-		fprintf(stderr, "WARNING: failed to keep up with mmap data."
-				"  Last read %lu msecs ago.\n", msecs);
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	last_read = this_read;
-
-	for (; old != head;) {
-		struct ip_event {
-			struct perf_event_header header;
-			__u64 ip;
-			__u32 pid, target_pid;
-		};
-		struct mmap_event {
-			struct perf_event_header header;
-			__u32 pid, target_pid;
-			__u64 start;
-			__u64 len;
-			__u64 pgoff;
-			char filename[PATH_MAX];
-		};
-
-		typedef union event_union {
-			struct perf_event_header header;
-			struct ip_event ip;
-			struct mmap_event mmap;
-		} event_t;
-
-		event_t *event = (event_t *)&data[old & md->mask];
-
-		event_t event_copy;
-
-		size_t size = event->header.size;
-
-		/*
-		 * Event straddles the mmap boundary -- header should always
-		 * be inside due to u64 alignment of output.
-		 */
-		if ((old & md->mask) + size != ((old + size) & md->mask)) {
-			unsigned int offset = old;
-			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = &event_copy;
-
-			do {
-				cpy = min(md->mask + 1 - (offset & md->mask), len);
-				memcpy(dst, &data[offset & md->mask], cpy);
-				offset += cpy;
-				dst += cpy;
-				len -= cpy;
-			} while (len);
-
-			event = &event_copy;
-		}
-
-		old += size;
-
-		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_SAMPLE_IP)
-				process_event(event->ip.ip, md->counter);
-		}
-	}
-
-	md->prev = old;
-}
-
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int __cmd_top(void)
-{
-	struct perf_counter_attr *attr;
-	pthread_t thread;
-	int i, counter, group_fd, nr_poll = 0;
-	unsigned int cpu;
-	int ret;
-
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			cpu	= profile_cpu;
-			if (target_pid == -1 && profile_cpu == -1)
-				cpu = i;
-
-			attr = attrs + counter;
-
-			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr->freq		= freq;
-
-			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-
-				error("syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED)
-				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		}
-	}
-
-	if (pthread_create(&thread, NULL, display_thread, NULL)) {
-		printf("Could not create display thread.\n");
-		exit(-1);
-	}
-
-	if (realtime_prio) {
-		struct sched_param param;
-
-		param.sched_priority = realtime_prio;
-		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-			printf("Could not set realtime priority.\n");
-			exit(-1);
-		}
-	}
-
-	while (1) {
-		int hits = samples;
-
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
-
-		if (hits == samples)
-			ret = poll(event_array, nr_poll, 100);
-	}
-
-	return 0;
-}
-
-static const char * const top_usage[] = {
-	"perf top [<options>]",
-	NULL
-};
-
-static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
-		     "event selector. use 'perf list' to list available events",
-		     parse_events),
-	OPT_INTEGER('c', "count", &default_interval,
-		    "event period to sample"),
-	OPT_INTEGER('p', "pid", &target_pid,
-		    "profile events on existing pid"),
-	OPT_BOOLEAN('a', "all-cpus", &system_wide,
-			    "system-wide collection from all CPUs"),
-	OPT_INTEGER('C', "CPU", &profile_cpu,
-		    "CPU to profile on"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
-	OPT_INTEGER('r', "realtime", &realtime_prio,
-		    "collect data with this RT SCHED_FIFO priority"),
-	OPT_INTEGER('d', "delay", &delay_secs,
-		    "number of seconds to delay between refreshes"),
-	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
-			    "dump the symbol table used for profiling"),
-	OPT_INTEGER('f', "count-filter", &count_filter,
-		    "only display functions with more events than this"),
-	OPT_BOOLEAN('g', "group", &group,
-			    "put the counters into a counter group"),
-	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
-		    "only display symbols matchig this pattern"),
-	OPT_BOOLEAN('z', "zero", &group,
-		    "zero history across updates"),
-	OPT_INTEGER('F', "freq", &freq,
-		    "profile at this frequency"),
-	OPT_INTEGER('E', "entries", &print_entries,
-		    "display this many functions"),
-	OPT_END()
-};
-
-int cmd_top(int argc, const char **argv, const char *prefix)
-{
-	int counter;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	argc = parse_options(argc, argv, options, top_usage, 0);
-	if (argc)
-		usage_with_options(top_usage, options);
-
-	if (freq) {
-		default_interval = freq;
-		freq = 1;
-	}
-
-	/* CPU and PID are mutually exclusive */
-	if (target_pid != -1 && profile_cpu != -1) {
-		printf("WARNING: PID switch overriding CPU\n");
-		sleep(1);
-		profile_cpu = -1;
-	}
-
-	if (!nr_counters)
-		nr_counters = 1;
-
-	if (delay_secs < 1)
-		delay_secs = 1;
-
-	parse_symbols();
-
-	/*
-	 * Fill in the ones not specifically initialized via -c:
-	 */
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (attrs[counter].sample_period)
-			continue;
-
-		attrs[counter].sample_period = default_interval;
-	}
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (target_pid != -1 || profile_cpu != -1)
-		nr_cpus = 1;
-
-	return __cmd_top();
-}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
deleted file mode 100644
index 51d1682..0000000
--- a/Documentation/perf_counter/builtin.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef BUILTIN_H
-#define BUILTIN_H
-
-#include "util/util.h"
-#include "util/strbuf.h"
-
-extern const char perf_version_string[];
-extern const char perf_usage_string[];
-extern const char perf_more_info_string[];
-
-extern void list_common_cmds_help(void);
-extern const char *help_unknown_cmd(const char *cmd);
-extern void prune_packed_objects(int);
-extern int read_line_with_nul(char *buf, int size, FILE *file);
-extern int check_pager_config(const char *cmd);
-
-extern int cmd_annotate(int argc, const char **argv, const char *prefix);
-extern int cmd_help(int argc, const char **argv, const char *prefix);
-extern int cmd_record(int argc, const char **argv, const char *prefix);
-extern int cmd_report(int argc, const char **argv, const char *prefix);
-extern int cmd_stat(int argc, const char **argv, const char *prefix);
-extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_version(int argc, const char **argv, const char *prefix);
-extern int cmd_list(int argc, const char **argv, const char *prefix);
-
-#endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
deleted file mode 100644
index eebce30..0000000
--- a/Documentation/perf_counter/command-list.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# List of known perf commands.
-# command name			category [deprecated] [common]
-#
-perf-annotate			mainporcelain common
-perf-list			mainporcelain common
-perf-record			mainporcelain common
-perf-report			mainporcelain common
-perf-stat			mainporcelain common
-perf-top			mainporcelain common
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
deleted file mode 100644
index d325076..0000000
--- a/Documentation/perf_counter/design.txt
+++ /dev/null
@@ -1,442 +0,0 @@
-
-Performance Counters for Linux
-------------------------------
-
-Performance counters are special hardware registers available on most modern
-CPUs. These registers count the number of certain types of hw events: such
-as instructions executed, cachemisses suffered, or branches mis-predicted -
-without slowing down the kernel or applications. These registers can also
-trigger interrupts when a threshold number of events have passed - and can
-thus be used to profile the code that runs on that CPU.
-
-The Linux Performance Counter subsystem provides an abstraction of these
-hardware capabilities. It provides per task and per CPU counters, counter
-groups, and it provides event capabilities on top of those.  It
-provides "virtual" 64-bit counters, regardless of the width of the
-underlying hardware counters.
-
-Performance counters are accessed via special file descriptors.
-There's one file descriptor per virtual counter used.
-
-The special file descriptor is opened via the perf_counter_open()
-system call:
-
-   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
-			     pid_t pid, int cpu, int group_fd,
-			     unsigned long flags);
-
-The syscall returns the new fd. The fd can be used via the normal
-VFS system calls: read() can be used to read the counter, fcntl()
-can be used to set the blocking mode, etc.
-
-Multiple counters can be kept open at a time, and the counters
-can be poll()ed.
-
-When creating a new counter fd, 'perf_counter_hw_event' is:
-
-struct perf_counter_hw_event {
-        /*
-         * The MSB of the config word signifies if the rest contains cpu
-         * specific (raw) counter configuration data, if unset, the next
-         * 7 bits are an event type and the rest of the bits are the event
-         * identifier.
-         */
-        __u64                   config;
-
-        __u64                   irq_period;
-        __u32                   record_type;
-        __u32                   read_format;
-
-        __u64                   disabled       :  1, /* off by default        */
-                                inherit        :  1, /* children inherit it   */
-                                pinned         :  1, /* must always be on PMU */
-                                exclusive      :  1, /* only group on PMU     */
-                                exclude_user   :  1, /* don't count user      */
-                                exclude_kernel :  1, /* ditto kernel          */
-                                exclude_hv     :  1, /* ditto hypervisor      */
-                                exclude_idle   :  1, /* don't count when idle */
-                                mmap           :  1, /* include mmap data     */
-                                munmap         :  1, /* include munmap data   */
-                                comm           :  1, /* include comm data     */
-
-                                __reserved_1   : 52;
-
-        __u32                   extra_config_len;
-        __u32                   wakeup_events;  /* wakeup every n events */
-
-        __u64                   __reserved_2;
-        __u64                   __reserved_3;
-};
-
-The 'config' field specifies what the counter should count.  It
-is divided into 3 bit-fields:
-
-raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
-type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
-event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
-
-If 'raw_type' is 1, then the counter will count a hardware event
-specified by the remaining 63 bits of event_config.  The encoding is
-machine-specific.
-
-If 'raw_type' is 0, then the 'type' field says what kind of counter
-this is, with the following encoding:
-
-enum perf_event_types {
-	PERF_TYPE_HARDWARE		= 0,
-	PERF_TYPE_SOFTWARE		= 1,
-	PERF_TYPE_TRACEPOINT		= 2,
-};
-
-A counter of PERF_TYPE_HARDWARE will count the hardware event
-specified by 'event_id':
-
-/*
- * Generalized performance counter event types, used by the hw_event.event_id
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_ids {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		= 0,
-	PERF_COUNT_INSTRUCTIONS		= 1,
-	PERF_COUNT_CACHE_REFERENCES	= 2,
-	PERF_COUNT_CACHE_MISSES		= 3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_BRANCH_MISSES	= 5,
-	PERF_COUNT_BUS_CYCLES		= 6,
-};
-
-These are standardized types of events that work relatively uniformly
-on all CPUs that implement Performance Counters support under Linux,
-although there may be variations (e.g., different CPUs might count
-cache references and misses at different levels of the cache hierarchy).
-If a CPU is not able to count the selected event, then the system call
-will return -EINVAL.
-
-More hw_event_types are supported as well, but they are CPU-specific
-and accessed as raw events.  For example, to count "External bus
-cycles while bus lock signal asserted" events on Intel Core CPUs, pass
-in a 0x4064 event_id value and set hw_event.raw_type to 1.
-
-A counter of type PERF_TYPE_SOFTWARE will count one of the available
-software events, selected by 'event_id':
-
-/*
- * Special "software" counters provided by the kernel, even if the hardware
- * does not support performance counters. These counters measure various
- * physical and sw events of the kernel (and allow the profiling of them as
- * well):
- */
-enum sw_event_ids {
-	PERF_COUNT_CPU_CLOCK		= 0,
-	PERF_COUNT_TASK_CLOCK		= 1,
-	PERF_COUNT_PAGE_FAULTS		= 2,
-	PERF_COUNT_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
-};
-
-Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
-tracer is available, and event_id values can be obtained from
-/debug/tracing/events/*/*/id
-
-
-Counters come in two flavours: counting counters and sampling
-counters.  A "counting" counter is one that is used for counting the
-number of events that occur, and is characterised by having
-irq_period = 0.
-
-
-A read() on a counter returns the current value of the counter and possible
-additional values as specified by 'read_format', each value is a u64 (8 bytes)
-in size.
-
-/*
- * Bits that can be set in hw_event.read_format to request that
- * reads on the counter should return the indicated quantities,
- * in increasing order of bit value, after the counter value.
- */
-enum perf_counter_read_format {
-        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
-        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
-};
-
-Using these additional values one can establish the overcommit ratio for a
-particular counter allowing one to take the round-robin scheduling effect
-into account.
-
-
-A "sampling" counter is one that is set up to generate an interrupt
-every N events, where N is given by 'irq_period'.  A sampling counter
-has irq_period > 0. The record_type controls what data is recorded on each
-interrupt:
-
-/*
- * Bits that can be set in hw_event.record_type to request information
- * in the overflow packets.
- */
-enum perf_counter_record_format {
-        PERF_RECORD_IP          = 1U << 0,
-        PERF_RECORD_TID         = 1U << 1,
-        PERF_RECORD_TIME        = 1U << 2,
-        PERF_RECORD_ADDR        = 1U << 3,
-        PERF_RECORD_GROUP       = 1U << 4,
-        PERF_RECORD_CALLCHAIN   = 1U << 5,
-};
-
-Such (and other) events will be recorded in a ring-buffer, which is
-available to user-space using mmap() (see below).
-
-The 'disabled' bit specifies whether the counter starts out disabled
-or enabled.  If it is initially disabled, it can be enabled by ioctl
-or prctl (see below).
-
-The 'inherit' bit, if set, specifies that this counter should count
-events on descendant tasks as well as the task specified.  This only
-applies to new descendents, not to any existing descendents at the
-time the counter is created (nor to any new descendents of existing
-descendents).
-
-The 'pinned' bit, if set, specifies that the counter should always be
-on the CPU if at all possible.  It only applies to hardware counters
-and only to group leaders.  If a pinned counter cannot be put onto the
-CPU (e.g. because there are not enough hardware counters or because of
-a conflict with some other event), then the counter goes into an
-'error' state, where reads return end-of-file (i.e. read() returns 0)
-until the counter is subsequently enabled or disabled.
-
-The 'exclusive' bit, if set, specifies that when this counter's group
-is on the CPU, it should be the only group using the CPU's counters.
-In future, this will allow sophisticated monitoring programs to supply
-extra configuration information via 'extra_config_len' to exploit
-advanced features of the CPU's Performance Monitor Unit (PMU) that are
-not otherwise accessible and that might disrupt other hardware
-counters.
-
-The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
-way to request that counting of events be restricted to times when the
-CPU is in user, kernel and/or hypervisor mode.
-
-The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
-operations, these can be used to relate userspace IP addresses to actual
-code, even after the mapping (or even the whole process) is gone,
-these events are recorded in the ring-buffer (see below).
-
-The 'comm' bit allows tracking of process comm data on process creation.
-This too is recorded in the ring-buffer (see below).
-
-The 'pid' parameter to the perf_counter_open() system call allows the
-counter to be specific to a task:
-
- pid == 0: if the pid parameter is zero, the counter is attached to the
- current task.
-
- pid > 0: the counter is attached to a specific task (if the current task
- has sufficient privilege to do so)
-
- pid < 0: all tasks are counted (per cpu counters)
-
-The 'cpu' parameter allows a counter to be made specific to a CPU:
-
- cpu >= 0: the counter is restricted to a specific CPU
- cpu == -1: the counter counts on all CPUs
-
-(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
-
-A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
-events of that task and 'follows' that task to whatever CPU the task
-gets schedule to. Per task counters can be created by any user, for
-their own tasks.
-
-A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
-all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
-
-The 'flags' parameter is currently unused and must be zero.
-
-The 'group_fd' parameter allows counter "groups" to be set up.  A
-counter group has one counter which is the group "leader".  The leader
-is created first, with group_fd = -1 in the perf_counter_open call
-that creates it.  The rest of the group members are created
-subsequently, with group_fd giving the fd of the group leader.
-(A single counter on its own is created with group_fd = -1 and is
-considered to be a group with only 1 member.)
-
-A counter group is scheduled onto the CPU as a unit, that is, it will
-only be put onto the CPU if all of the counters in the group can be
-put onto the CPU.  This means that the values of the member counters
-can be meaningfully compared, added, divided (to get ratios), etc.,
-with each other, since they have counted events for the same set of
-executed instructions.
-
-
-Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap
-tracking are logged into a ring-buffer. This ring-buffer is created and
-accessed through mmap().
-
-The mmap size should be 1+2^n pages, where the first page is a meta-data page
-(struct perf_counter_mmap_page) that contains various bits of information such
-as where the ring-buffer head is.
-
-/*
- * Structure of the page that can be mapped via mmap
- */
-struct perf_counter_mmap_page {
-        __u32   version;                /* version number of this structure */
-        __u32   compat_version;         /* lowest version this is compat with */
-
-        /*
-         * Bits needed to read the hw counters in user-space.
-         *
-         *   u32 seq;
-         *   s64 count;
-         *
-         *   do {
-         *     seq = pc->lock;
-         *
-         *     barrier()
-         *     if (pc->index) {
-         *       count = pmc_read(pc->index - 1);
-         *       count += pc->offset;
-         *     } else
-         *       goto regular_read;
-         *
-         *     barrier();
-         *   } while (pc->lock != seq);
-         *
-         * NOTE: for obvious reason this only works on self-monitoring
-         *       processes.
-         */
-        __u32   lock;                   /* seqlock for synchronization */
-        __u32   index;                  /* hardware counter identifier */
-        __s64   offset;                 /* add to hardware counter value */
-
-        /*
-         * Control data for the mmap() data buffer.
-         *
-         * User-space reading this value should issue an rmb(), on SMP capable
-         * platforms, after reading this value -- see perf_counter_wakeup().
-         */
-        __u32   data_head;              /* head in the data section */
-};
-
-NOTE: the hw-counter userspace bits are arch specific and are currently only
-      implemented on powerpc.
-
-The following 2^n pages are the ring-buffer which contains events of the form:
-
-#define PERF_EVENT_MISC_KERNEL          (1 << 0)
-#define PERF_EVENT_MISC_USER            (1 << 1)
-#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
-
-struct perf_event_header {
-        __u32   type;
-        __u16   misc;
-        __u16   size;
-};
-
-enum perf_event_type {
-
-        /*
-         * The MMAP events record the PROT_EXEC mappings so that we can
-         * correlate userspace IPs to code. They have the following structure:
-         *
-         * struct {
-         *      struct perf_event_header        header;
-         *
-         *      u32                             pid, tid;
-         *      u64                             addr;
-         *      u64                             len;
-         *      u64                             pgoff;
-         *      char                            filename[];
-         * };
-         */
-        PERF_EVENT_MMAP                 = 1,
-        PERF_EVENT_MUNMAP               = 2,
-
-        /*
-         * struct {
-         *      struct perf_event_header        header;
-         *
-         *      u32                             pid, tid;
-         *      char                            comm[];
-         * };
-         */
-        PERF_EVENT_COMM                 = 3,
-
-        /*
-         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-         * will be PERF_RECORD_*
-         *
-         * struct {
-         *      struct perf_event_header        header;
-         *
-         *      { u64                   ip;       } && PERF_RECORD_IP
-         *      { u32                   pid, tid; } && PERF_RECORD_TID
-         *      { u64                   time;     } && PERF_RECORD_TIME
-         *      { u64                   addr;     } && PERF_RECORD_ADDR
-         *
-         *      { u64                   nr;
-         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
-         *
-         *      { u16                   nr,
-         *                              hv,
-         *                              kernel,
-         *                              user;
-         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
-         * };
-         */
-};
-
-NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
-      on x86.
-
-Notification of new events is possible through poll()/select()/epoll() and
-fcntl() managing signals.
-
-Normally a notification is generated for every page filled, however one can
-additionally set perf_counter_hw_event.wakeup_events to generate one every
-so many counter overflow events.
-
-Future work will include a splice() interface to the ring-buffer.
-
-
-Counters can be enabled and disabled in two ways: via ioctl and via
-prctl.  When a counter is disabled, it doesn't count or generate
-events but does continue to exist and maintain its count value.
-
-An individual counter or counter group can be enabled with
-
-	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
-
-or disabled with
-
-	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
-
-Enabling or disabling the leader of a group enables or disables the
-whole group; that is, while the group leader is disabled, none of the
-counters in the group will count.  Enabling or disabling a member of a
-group other than the leader only affects that counter - disabling an
-non-leader stops that counter from counting but doesn't affect any
-other counter.
-
-Additionally, non-inherited overflow counters can use
-
-	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
-
-to enable a counter for 'nr' events, after which it gets disabled again.
-
-A process can enable or disable all the counter groups that are
-attached to it, using prctl:
-
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-
-This applies to all counters on the current process, whether created
-by this process or by another, and doesn't affect any counters that
-this process has created on other processes.  It only enables or
-disables the group leaders, not any other members in the groups.
-
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
deleted file mode 100644
index 4eb7259..0000000
--- a/Documentation/perf_counter/perf.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * perf.c
- *
- * Performance analysis utility.
- *
- * This is the main hub from which the sub-commands (perf stat,
- * perf top, perf record, perf report, etc.) are started.
- */
-#include "builtin.h"
-
-#include "util/exec_cmd.h"
-#include "util/cache.h"
-#include "util/quote.h"
-#include "util/run-command.h"
-
-const char perf_usage_string[] =
-	"perf [--version] [--help] COMMAND [ARGS]";
-
-const char perf_more_info_string[] =
-	"See 'perf help COMMAND' for more information on a specific command.";
-
-static int use_pager = -1;
-struct pager_config {
-	const char *cmd;
-	int val;
-};
-
-static int pager_command_config(const char *var, const char *value, void *data)
-{
-	struct pager_config *c = data;
-	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
-		c->val = perf_config_bool(var, value);
-	return 0;
-}
-
-/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
-int check_pager_config(const char *cmd)
-{
-	struct pager_config c;
-	c.cmd = cmd;
-	c.val = -1;
-	perf_config(pager_command_config, &c);
-	return c.val;
-}
-
-static void commit_pager_choice(void) {
-	switch (use_pager) {
-	case 0:
-		setenv("PERF_PAGER", "cat", 1);
-		break;
-	case 1:
-		/* setup_pager(); */
-		break;
-	default:
-		break;
-	}
-}
-
-static int handle_options(const char*** argv, int* argc, int* envchanged)
-{
-	int handled = 0;
-
-	while (*argc > 0) {
-		const char *cmd = (*argv)[0];
-		if (cmd[0] != '-')
-			break;
-
-		/*
-		 * For legacy reasons, the "version" and "help"
-		 * commands can be written with "--" prepended
-		 * to make them look like flags.
-		 */
-		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
-			break;
-
-		/*
-		 * Check remaining flags.
-		 */
-		if (!prefixcmp(cmd, "--exec-path")) {
-			cmd += 11;
-			if (*cmd == '=')
-				perf_set_argv_exec_path(cmd + 1);
-			else {
-				puts(perf_exec_path());
-				exit(0);
-			}
-		} else if (!strcmp(cmd, "--html-path")) {
-			puts(system_path(PERF_HTML_PATH));
-			exit(0);
-		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
-			use_pager = 1;
-		} else if (!strcmp(cmd, "--no-pager")) {
-			use_pager = 0;
-			if (envchanged)
-				*envchanged = 1;
-		} else if (!strcmp(cmd, "--perf-dir")) {
-			if (*argc < 2) {
-				fprintf(stderr, "No directory given for --perf-dir.\n" );
-				usage(perf_usage_string);
-			}
-			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
-			if (envchanged)
-				*envchanged = 1;
-			(*argv)++;
-			(*argc)--;
-			handled++;
-		} else if (!prefixcmp(cmd, "--perf-dir=")) {
-			setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
-			if (envchanged)
-				*envchanged = 1;
-		} else if (!strcmp(cmd, "--work-tree")) {
-			if (*argc < 2) {
-				fprintf(stderr, "No directory given for --work-tree.\n" );
-				usage(perf_usage_string);
-			}
-			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
-			if (envchanged)
-				*envchanged = 1;
-			(*argv)++;
-			(*argc)--;
-		} else if (!prefixcmp(cmd, "--work-tree=")) {
-			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
-			if (envchanged)
-				*envchanged = 1;
-		} else {
-			fprintf(stderr, "Unknown option: %s\n", cmd);
-			usage(perf_usage_string);
-		}
-
-		(*argv)++;
-		(*argc)--;
-		handled++;
-	}
-	return handled;
-}
-
-static int handle_alias(int *argcp, const char ***argv)
-{
-	int envchanged = 0, ret = 0, saved_errno = errno;
-	int count, option_count;
-	const char** new_argv;
-	const char *alias_command;
-	char *alias_string;
-
-	alias_command = (*argv)[0];
-	alias_string = alias_lookup(alias_command);
-	if (alias_string) {
-		if (alias_string[0] == '!') {
-			if (*argcp > 1) {
-				struct strbuf buf;
-
-				strbuf_init(&buf, PATH_MAX);
-				strbuf_addstr(&buf, alias_string);
-				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
-				free(alias_string);
-				alias_string = buf.buf;
-			}
-			ret = system(alias_string + 1);
-			if (ret >= 0 && WIFEXITED(ret) &&
-			    WEXITSTATUS(ret) != 127)
-				exit(WEXITSTATUS(ret));
-			die("Failed to run '%s' when expanding alias '%s'",
-			    alias_string + 1, alias_command);
-		}
-		count = split_cmdline(alias_string, &new_argv);
-		if (count < 0)
-			die("Bad alias.%s string", alias_command);
-		option_count = handle_options(&new_argv, &count, &envchanged);
-		if (envchanged)
-			die("alias '%s' changes environment variables\n"
-				 "You can use '!perf' in the alias to do this.",
-				 alias_command);
-		memmove(new_argv - option_count, new_argv,
-				count * sizeof(char *));
-		new_argv -= option_count;
-
-		if (count < 1)
-			die("empty alias for %s", alias_command);
-
-		if (!strcmp(alias_command, new_argv[0]))
-			die("recursive alias: %s", alias_command);
-
-		new_argv = realloc(new_argv, sizeof(char*) *
-				    (count + *argcp + 1));
-		/* insert after command name */
-		memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
-		new_argv[count+*argcp] = NULL;
-
-		*argv = new_argv;
-		*argcp += count - 1;
-
-		ret = 1;
-	}
-
-	errno = saved_errno;
-
-	return ret;
-}
-
-const char perf_version_string[] = PERF_VERSION;
-
-#define RUN_SETUP	(1<<0)
-#define USE_PAGER	(1<<1)
-/*
- * require working tree to be present -- anything uses this needs
- * RUN_SETUP for reading from the configuration file.
- */
-#define NEED_WORK_TREE	(1<<2)
-
-struct cmd_struct {
-	const char *cmd;
-	int (*fn)(int, const char **, const char *);
-	int option;
-};
-
-static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
-{
-	int status;
-	struct stat st;
-	const char *prefix;
-
-	prefix = NULL;
-	if (p->option & RUN_SETUP)
-		prefix = NULL; /* setup_perf_directory(); */
-
-	if (use_pager == -1 && p->option & RUN_SETUP)
-		use_pager = check_pager_config(p->cmd);
-	if (use_pager == -1 && p->option & USE_PAGER)
-		use_pager = 1;
-	commit_pager_choice();
-
-	if (p->option & NEED_WORK_TREE)
-		/* setup_work_tree() */;
-
-	status = p->fn(argc, argv, prefix);
-	if (status)
-		return status & 0xff;
-
-	/* Somebody closed stdout? */
-	if (fstat(fileno(stdout), &st))
-		return 0;
-	/* Ignore write errors for pipes and sockets.. */
-	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
-		return 0;
-
-	/* Check for ENOSPC and EIO errors.. */
-	if (fflush(stdout))
-		die("write failure on standard output: %s", strerror(errno));
-	if (ferror(stdout))
-		die("unknown write failure on standard output");
-	if (fclose(stdout))
-		die("close failed on standard output: %s", strerror(errno));
-	return 0;
-}
-
-static void handle_internal_command(int argc, const char **argv)
-{
-	const char *cmd = argv[0];
-	static struct cmd_struct commands[] = {
-		{ "help", cmd_help, 0 },
-		{ "list", cmd_list, 0 },
-		{ "record", cmd_record, 0 },
-		{ "report", cmd_report, 0 },
-		{ "stat", cmd_stat, 0 },
-		{ "top", cmd_top, 0 },
-		{ "annotate", cmd_annotate, 0 },
-		{ "version", cmd_version, 0 },
-	};
-	int i;
-	static const char ext[] = STRIP_EXTENSION;
-
-	if (sizeof(ext) > 1) {
-		i = strlen(argv[0]) - strlen(ext);
-		if (i > 0 && !strcmp(argv[0] + i, ext)) {
-			char *argv0 = strdup(argv[0]);
-			argv[0] = cmd = argv0;
-			argv0[i] = '\0';
-		}
-	}
-
-	/* Turn "perf cmd --help" into "perf help cmd" */
-	if (argc > 1 && !strcmp(argv[1], "--help")) {
-		argv[1] = argv[0];
-		argv[0] = cmd = "help";
-	}
-
-	for (i = 0; i < ARRAY_SIZE(commands); i++) {
-		struct cmd_struct *p = commands+i;
-		if (strcmp(p->cmd, cmd))
-			continue;
-		exit(run_builtin(p, argc, argv));
-	}
-}
-
-static void execv_dashed_external(const char **argv)
-{
-	struct strbuf cmd = STRBUF_INIT;
-	const char *tmp;
-	int status;
-
-	strbuf_addf(&cmd, "perf-%s", argv[0]);
-
-	/*
-	 * argv[0] must be the perf command, but the argv array
-	 * belongs to the caller, and may be reused in
-	 * subsequent loop iterations. Save argv[0] and
-	 * restore it on error.
-	 */
-	tmp = argv[0];
-	argv[0] = cmd.buf;
-
-	/*
-	 * if we fail because the command is not found, it is
-	 * OK to return. Otherwise, we just pass along the status code.
-	 */
-	status = run_command_v_opt(argv, 0);
-	if (status != -ERR_RUN_COMMAND_EXEC) {
-		if (IS_RUN_COMMAND_ERR(status))
-			die("unable to run '%s'", argv[0]);
-		exit(-status);
-	}
-	errno = ENOENT; /* as if we called execvp */
-
-	argv[0] = tmp;
-
-	strbuf_release(&cmd);
-}
-
-static int run_argv(int *argcp, const char ***argv)
-{
-	int done_alias = 0;
-
-	while (1) {
-		/* See if it's an internal command */
-		handle_internal_command(*argcp, *argv);
-
-		/* .. then try the external ones */
-		execv_dashed_external(*argv);
-
-		/* It could be an alias -- this works around the insanity
-		 * of overriding "perf log" with "perf show" by having
-		 * alias.log = show
-		 */
-		if (done_alias || !handle_alias(argcp, argv))
-			break;
-		done_alias = 1;
-	}
-
-	return done_alias;
-}
-
-
-int main(int argc, const char **argv)
-{
-	const char *cmd;
-
-	cmd = perf_extract_argv0_path(argv[0]);
-	if (!cmd)
-		cmd = "perf-help";
-
-	/*
-	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
-	 *
-	 *  - cannot take flags in between the "perf" and the "xxxx".
-	 *  - cannot execute it externally (since it would just do
-	 *    the same thing over again)
-	 *
-	 * So we just directly call the internal command handler, and
-	 * die if that one cannot handle it.
-	 */
-	if (!prefixcmp(cmd, "perf-")) {
-		cmd += 5;
-		argv[0] = cmd;
-		handle_internal_command(argc, argv);
-		die("cannot handle %s internally", cmd);
-	}
-
-	/* Look for flags.. */
-	argv++;
-	argc--;
-	handle_options(&argv, &argc, NULL);
-	commit_pager_choice();
-	if (argc > 0) {
-		if (!prefixcmp(argv[0], "--"))
-			argv[0] += 2;
-	} else {
-		/* The user didn't specify a command; give them help */
-		printf("\n usage: %s\n\n", perf_usage_string);
-		list_common_cmds_help();
-		printf("\n %s\n\n", perf_more_info_string);
-		exit(1);
-	}
-	cmd = argv[0];
-
-	/*
-	 * We use PATH to find perf commands, but we prepend some higher
-	 * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
-	 * environment, and the $(perfexecdir) from the Makefile at build
-	 * time.
-	 */
-	setup_path();
-
-	while (1) {
-		static int done_help = 0;
-		static int was_alias = 0;
-
-		was_alias = run_argv(&argc, &argv);
-		if (errno != ENOENT)
-			break;
-
-		if (was_alias) {
-			fprintf(stderr, "Expansion of alias '%s' failed; "
-				"'%s' is not a perf-command\n",
-				cmd, argv[0]);
-			exit(1);
-		}
-		if (!done_help) {
-			cmd = argv[0] = help_unknown_cmd(cmd);
-			done_help = 1;
-		} else
-			break;
-	}
-
-	fprintf(stderr, "Failed to run command '%s': %s\n",
-		cmd, strerror(errno));
-
-	return 1;
-}
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
deleted file mode 100644
index af0a504..0000000
--- a/Documentation/perf_counter/perf.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef _PERF_PERF_H
-#define _PERF_PERF_H
-
-#if defined(__x86_64__) || defined(__i386__)
-#include "../../arch/x86/include/asm/unistd.h"
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#include "../../arch/powerpc/include/asm/unistd.h"
-#define rmb()		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#include <time.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-
-#include "../../include/linux/perf_counter.h"
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#ifndef NSEC_PER_SEC
-# define NSEC_PER_SEC			1000000000ULL
-#endif
-
-static inline unsigned long long rdclock(void)
-{
-	struct timespec ts;
-
-	clock_gettime(CLOCK_MONOTONIC, &ts);
-	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
-}
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-static inline int
-sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
-		      pid_t pid, int cpu, int group_fd,
-		      unsigned long flags)
-{
-	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
-		       group_fd, flags);
-}
-
-#define MAX_COUNTERS			256
-#define MAX_NR_CPUS			256
-
-#endif
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
deleted file mode 100755
index c561d15..0000000
--- a/Documentation/perf_counter/util/PERF-VERSION-GEN
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/sh
-
-GVF=PERF-VERSION-FILE
-DEF_VER=v0.0.1.PERF
-
-LF='
-'
-
-# First see if there is a version file (included in release tarballs),
-# then try git-describe, then default.
-if test -f version
-then
-	VN=$(cat version) || VN="$DEF_VER"
-elif test -d .git -o -f .git &&
-	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
-	case "$VN" in
-	*$LF*) (exit 1) ;;
-	v[0-9]*)
-		git update-index -q --refresh
-		test -z "$(git diff-index --name-only HEAD --)" ||
-		VN="$VN-dirty" ;;
-	esac
-then
-	VN=$(echo "$VN" | sed -e 's/-/./g');
-else
-	VN="$DEF_VER"
-fi
-
-VN=$(expr "$VN" : v*'\(.*\)')
-
-if test -r $GVF
-then
-	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
-else
-	VC=unset
-fi
-test "$VN" = "$VC" || {
-	echo >&2 "PERF_VERSION = $VN"
-	echo "PERF_VERSION = $VN" >$GVF
-}
-
-
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
deleted file mode 100644
index 61d33b8..0000000
--- a/Documentation/perf_counter/util/abspath.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "cache.h"
-
-/*
- * Do not use this for inspecting *tracked* content.  When path is a
- * symlink to a directory, we do not want to say it is a directory when
- * dealing with tracked content in the working tree.
- */
-static int is_directory(const char *path)
-{
-	struct stat st;
-	return (!stat(path, &st) && S_ISDIR(st.st_mode));
-}
-
-/* We allow "recursive" symbolic links. Only within reason, though. */
-#define MAXDEPTH 5
-
-const char *make_absolute_path(const char *path)
-{
-	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
-	char cwd[1024] = "";
-	int buf_index = 1, len;
-
-	int depth = MAXDEPTH;
-	char *last_elem = NULL;
-	struct stat st;
-
-	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
-		die ("Too long path: %.*s", 60, path);
-
-	while (depth--) {
-		if (!is_directory(buf)) {
-			char *last_slash = strrchr(buf, '/');
-			if (last_slash) {
-				*last_slash = '\0';
-				last_elem = xstrdup(last_slash + 1);
-			} else {
-				last_elem = xstrdup(buf);
-				*buf = '\0';
-			}
-		}
-
-		if (*buf) {
-			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
-				die ("Could not get current working directory");
-
-			if (chdir(buf))
-				die ("Could not switch to '%s'", buf);
-		}
-		if (!getcwd(buf, PATH_MAX))
-			die ("Could not get current working directory");
-
-		if (last_elem) {
-			int len = strlen(buf);
-			if (len + strlen(last_elem) + 2 > PATH_MAX)
-				die ("Too long path name: '%s/%s'",
-						buf, last_elem);
-			buf[len] = '/';
-			strcpy(buf + len + 1, last_elem);
-			free(last_elem);
-			last_elem = NULL;
-		}
-
-		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
-			len = readlink(buf, next_buf, PATH_MAX);
-			if (len < 0)
-				die ("Invalid symlink: %s", buf);
-			if (PATH_MAX <= len)
-				die("symbolic link too long: %s", buf);
-			next_buf[len] = '\0';
-			buf = next_buf;
-			buf_index = 1 - buf_index;
-			next_buf = bufs[buf_index];
-		} else
-			break;
-	}
-
-	if (*cwd && chdir(cwd))
-		die ("Could not change back to '%s'", cwd);
-
-	return buf;
-}
-
-static const char *get_pwd_cwd(void)
-{
-	static char cwd[PATH_MAX + 1];
-	char *pwd;
-	struct stat cwd_stat, pwd_stat;
-	if (getcwd(cwd, PATH_MAX) == NULL)
-		return NULL;
-	pwd = getenv("PWD");
-	if (pwd && strcmp(pwd, cwd)) {
-		stat(cwd, &cwd_stat);
-		if (!stat(pwd, &pwd_stat) &&
-		    pwd_stat.st_dev == cwd_stat.st_dev &&
-		    pwd_stat.st_ino == cwd_stat.st_ino) {
-			strlcpy(cwd, pwd, PATH_MAX);
-		}
-	}
-	return cwd;
-}
-
-const char *make_nonrelative_path(const char *path)
-{
-	static char buf[PATH_MAX + 1];
-
-	if (is_absolute_path(path)) {
-		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
-			die("Too long path: %.*s", 60, path);
-	} else {
-		const char *cwd = get_pwd_cwd();
-		if (!cwd)
-			die("Cannot determine the current working directory");
-		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
-			die("Too long path: %.*s", 60, path);
-	}
-	return buf;
-}
diff --git a/Documentation/perf_counter/util/alias.c b/Documentation/perf_counter/util/alias.c
deleted file mode 100644
index 9b3dd2b..0000000
--- a/Documentation/perf_counter/util/alias.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "cache.h"
-
-static const char *alias_key;
-static char *alias_val;
-
-static int alias_lookup_cb(const char *k, const char *v, void *cb)
-{
-	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
-		if (!v)
-			return config_error_nonbool(k);
-		alias_val = strdup(v);
-		return 0;
-	}
-	return 0;
-}
-
-char *alias_lookup(const char *alias)
-{
-	alias_key = alias;
-	alias_val = NULL;
-	perf_config(alias_lookup_cb, NULL);
-	return alias_val;
-}
-
-int split_cmdline(char *cmdline, const char ***argv)
-{
-	int src, dst, count = 0, size = 16;
-	char quoted = 0;
-
-	*argv = malloc(sizeof(char*) * size);
-
-	/* split alias_string */
-	(*argv)[count++] = cmdline;
-	for (src = dst = 0; cmdline[src];) {
-		char c = cmdline[src];
-		if (!quoted && isspace(c)) {
-			cmdline[dst++] = 0;
-			while (cmdline[++src]
-					&& isspace(cmdline[src]))
-				; /* skip */
-			if (count >= size) {
-				size += 16;
-				*argv = realloc(*argv, sizeof(char*) * size);
-			}
-			(*argv)[count++] = cmdline + dst;
-		} else if (!quoted && (c == '\'' || c == '"')) {
-			quoted = c;
-			src++;
-		} else if (c == quoted) {
-			quoted = 0;
-			src++;
-		} else {
-			if (c == '\\' && quoted != '\'') {
-				src++;
-				c = cmdline[src];
-				if (!c) {
-					free(*argv);
-					*argv = NULL;
-					return error("cmdline ends with \\");
-				}
-			}
-			cmdline[dst++] = c;
-			src++;
-		}
-	}
-
-	cmdline[dst] = 0;
-
-	if (quoted) {
-		free(*argv);
-		*argv = NULL;
-		return error("unclosed quote");
-	}
-
-	return count;
-}
-
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
deleted file mode 100644
index 393d614..0000000
--- a/Documentation/perf_counter/util/cache.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef CACHE_H
-#define CACHE_H
-
-#include "util.h"
-#include "strbuf.h"
-
-#define PERF_DIR_ENVIRONMENT "PERF_DIR"
-#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
-#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
-#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
-#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
-#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
-#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
-#define CONFIG_ENVIRONMENT "PERF_CONFIG"
-#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
-#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
-#define PERFATTRIBUTES_FILE ".perfattributes"
-#define INFOATTRIBUTES_FILE "info/attributes"
-#define ATTRIBUTE_MACRO_PREFIX "[attr]"
-
-typedef int (*config_fn_t)(const char *, const char *, void *);
-extern int perf_default_config(const char *, const char *, void *);
-extern int perf_config_from_file(config_fn_t fn, const char *, void *);
-extern int perf_config(config_fn_t fn, void *);
-extern int perf_parse_ulong(const char *, unsigned long *);
-extern int perf_config_int(const char *, const char *);
-extern unsigned long perf_config_ulong(const char *, const char *);
-extern int perf_config_bool_or_int(const char *, const char *, int *);
-extern int perf_config_bool(const char *, const char *);
-extern int perf_config_string(const char **, const char *, const char *);
-extern int perf_config_set(const char *, const char *);
-extern int perf_config_set_multivar(const char *, const char *, const char *, int);
-extern int perf_config_rename_section(const char *, const char *);
-extern const char *perf_etc_perfconfig(void);
-extern int check_repository_format_version(const char *var, const char *value, void *cb);
-extern int perf_config_system(void);
-extern int perf_config_global(void);
-extern int config_error_nonbool(const char *);
-extern const char *config_exclusive_filename;
-
-#define MAX_PERFNAME (1000)
-extern char perf_default_email[MAX_PERFNAME];
-extern char perf_default_name[MAX_PERFNAME];
-extern int user_ident_explicitly_given;
-
-extern const char *perf_log_output_encoding;
-extern const char *perf_mailmap_file;
-
-/* IO helper functions */
-extern void maybe_flush_or_die(FILE *, const char *);
-extern int copy_fd(int ifd, int ofd);
-extern int copy_file(const char *dst, const char *src, int mode);
-extern ssize_t read_in_full(int fd, void *buf, size_t count);
-extern ssize_t write_in_full(int fd, const void *buf, size_t count);
-extern void write_or_die(int fd, const void *buf, size_t count);
-extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
-extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
-extern void fsync_or_die(int fd, const char *);
-
-/* pager.c */
-extern void setup_pager(void);
-extern const char *pager_program;
-extern int pager_in_use(void);
-extern int pager_use_color;
-
-extern const char *editor_program;
-extern const char *excludes_file;
-
-char *alias_lookup(const char *alias);
-int split_cmdline(char *cmdline, const char ***argv);
-
-#define alloc_nr(x) (((x)+16)*3/2)
-
-/*
- * Realloc the buffer pointed at by variable 'x' so that it can hold
- * at least 'nr' entries; the number of entries currently allocated
- * is 'alloc', using the standard growing factor alloc_nr() macro.
- *
- * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
- */
-#define ALLOC_GROW(x, nr, alloc) \
-	do { \
-		if ((nr) > alloc) { \
-			if (alloc_nr(alloc) < (nr)) \
-				alloc = (nr); \
-			else \
-				alloc = alloc_nr(alloc); \
-			x = xrealloc((x), alloc * sizeof(*(x))); \
-		} \
-	} while(0)
-
-
-static inline int is_absolute_path(const char *path)
-{
-	return path[0] == '/';
-}
-
-const char *make_absolute_path(const char *path);
-const char *make_nonrelative_path(const char *path);
-const char *make_relative_path(const char *abs, const char *base);
-int normalize_path_copy(char *dst, const char *src);
-int longest_ancestor_length(const char *path, const char *prefix_list);
-char *strip_path_suffix(const char *path, const char *suffix);
-
-extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
-extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
-/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
-extern int perf_mkstemp(char *path, size_t len, const char *template);
-
-extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
-extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
-extern char *perf_pathdup(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
-
-extern size_t strlcpy(char *dest, const char *src, size_t size);
-
-#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/util/color.c b/Documentation/perf_counter/util/color.c
deleted file mode 100644
index 9a8c20c..0000000
--- a/Documentation/perf_counter/util/color.c
+++ /dev/null
@@ -1,241 +0,0 @@
-#include "cache.h"
-#include "color.h"
-
-int perf_use_color_default = -1;
-
-static int parse_color(const char *name, int len)
-{
-	static const char * const color_names[] = {
-		"normal", "black", "red", "green", "yellow",
-		"blue", "magenta", "cyan", "white"
-	};
-	char *end;
-	int i;
-	for (i = 0; i < ARRAY_SIZE(color_names); i++) {
-		const char *str = color_names[i];
-		if (!strncasecmp(name, str, len) && !str[len])
-			return i - 1;
-	}
-	i = strtol(name, &end, 10);
-	if (end - name == len && i >= -1 && i <= 255)
-		return i;
-	return -2;
-}
-
-static int parse_attr(const char *name, int len)
-{
-	static const int attr_values[] = { 1, 2, 4, 5, 7 };
-	static const char * const attr_names[] = {
-		"bold", "dim", "ul", "blink", "reverse"
-	};
-	int i;
-	for (i = 0; i < ARRAY_SIZE(attr_names); i++) {
-		const char *str = attr_names[i];
-		if (!strncasecmp(name, str, len) && !str[len])
-			return attr_values[i];
-	}
-	return -1;
-}
-
-void color_parse(const char *value, const char *var, char *dst)
-{
-	color_parse_mem(value, strlen(value), var, dst);
-}
-
-void color_parse_mem(const char *value, int value_len, const char *var,
-		char *dst)
-{
-	const char *ptr = value;
-	int len = value_len;
-	int attr = -1;
-	int fg = -2;
-	int bg = -2;
-
-	if (!strncasecmp(value, "reset", len)) {
-		strcpy(dst, PERF_COLOR_RESET);
-		return;
-	}
-
-	/* [fg [bg]] [attr] */
-	while (len > 0) {
-		const char *word = ptr;
-		int val, wordlen = 0;
-
-		while (len > 0 && !isspace(word[wordlen])) {
-			wordlen++;
-			len--;
-		}
-
-		ptr = word + wordlen;
-		while (len > 0 && isspace(*ptr)) {
-			ptr++;
-			len--;
-		}
-
-		val = parse_color(word, wordlen);
-		if (val >= -1) {
-			if (fg == -2) {
-				fg = val;
-				continue;
-			}
-			if (bg == -2) {
-				bg = val;
-				continue;
-			}
-			goto bad;
-		}
-		val = parse_attr(word, wordlen);
-		if (val < 0 || attr != -1)
-			goto bad;
-		attr = val;
-	}
-
-	if (attr >= 0 || fg >= 0 || bg >= 0) {
-		int sep = 0;
-
-		*dst++ = '\033';
-		*dst++ = '[';
-		if (attr >= 0) {
-			*dst++ = '0' + attr;
-			sep++;
-		}
-		if (fg >= 0) {
-			if (sep++)
-				*dst++ = ';';
-			if (fg < 8) {
-				*dst++ = '3';
-				*dst++ = '0' + fg;
-			} else {
-				dst += sprintf(dst, "38;5;%d", fg);
-			}
-		}
-		if (bg >= 0) {
-			if (sep++)
-				*dst++ = ';';
-			if (bg < 8) {
-				*dst++ = '4';
-				*dst++ = '0' + bg;
-			} else {
-				dst += sprintf(dst, "48;5;%d", bg);
-			}
-		}
-		*dst++ = 'm';
-	}
-	*dst = 0;
-	return;
-bad:
-	die("bad color value '%.*s' for variable '%s'", value_len, value, var);
-}
-
-int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
-{
-	if (value) {
-		if (!strcasecmp(value, "never"))
-			return 0;
-		if (!strcasecmp(value, "always"))
-			return 1;
-		if (!strcasecmp(value, "auto"))
-			goto auto_color;
-	}
-
-	/* Missing or explicit false to turn off colorization */
-	if (!perf_config_bool(var, value))
-		return 0;
-
-	/* any normal truth value defaults to 'auto' */
- auto_color:
-	if (stdout_is_tty < 0)
-		stdout_is_tty = isatty(1);
-	if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
-		char *term = getenv("TERM");
-		if (term && strcmp(term, "dumb"))
-			return 1;
-	}
-	return 0;
-}
-
-int perf_color_default_config(const char *var, const char *value, void *cb)
-{
-	if (!strcmp(var, "color.ui")) {
-		perf_use_color_default = perf_config_colorbool(var, value, -1);
-		return 0;
-	}
-
-	return perf_default_config(var, value, cb);
-}
-
-static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
-		va_list args, const char *trail)
-{
-	int r = 0;
-
-	/*
-	 * Auto-detect:
-	 */
-	if (perf_use_color_default < 0) {
-		if (isatty(1) || pager_in_use())
-			perf_use_color_default = 1;
-		else
-			perf_use_color_default = 0;
-	}
-
-	if (perf_use_color_default && *color)
-		r += fprintf(fp, "%s", color);
-	r += vfprintf(fp, fmt, args);
-	if (perf_use_color_default && *color)
-		r += fprintf(fp, "%s", PERF_COLOR_RESET);
-	if (trail)
-		r += fprintf(fp, "%s", trail);
-	return r;
-}
-
-
-
-int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
-{
-	va_list args;
-	int r;
-
-	va_start(args, fmt);
-	r = color_vfprintf(fp, color, fmt, args, NULL);
-	va_end(args);
-	return r;
-}
-
-int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
-{
-	va_list args;
-	int r;
-	va_start(args, fmt);
-	r = color_vfprintf(fp, color, fmt, args, "\n");
-	va_end(args);
-	return r;
-}
-
-/*
- * This function splits the buffer by newlines and colors the lines individually.
- *
- * Returns 0 on success.
- */
-int color_fwrite_lines(FILE *fp, const char *color,
-		size_t count, const char *buf)
-{
-	if (!*color)
-		return fwrite(buf, count, 1, fp) != 1;
-	while (count) {
-		char *p = memchr(buf, '\n', count);
-		if (p != buf && (fputs(color, fp) < 0 ||
-				fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
-				fputs(PERF_COLOR_RESET, fp) < 0))
-			return -1;
-		if (!p)
-			return 0;
-		if (fputc('\n', fp) < 0)
-			return -1;
-		count -= p + 1 - buf;
-		buf = p + 1;
-	}
-	return 0;
-}
-
-
diff --git a/Documentation/perf_counter/util/color.h b/Documentation/perf_counter/util/color.h
deleted file mode 100644
index 5abfd37..0000000
--- a/Documentation/perf_counter/util/color.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef COLOR_H
-#define COLOR_H
-
-/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
-#define COLOR_MAXLEN 24
-
-#define PERF_COLOR_NORMAL	""
-#define PERF_COLOR_RESET	"\033[m"
-#define PERF_COLOR_BOLD		"\033[1m"
-#define PERF_COLOR_RED		"\033[31m"
-#define PERF_COLOR_GREEN	"\033[32m"
-#define PERF_COLOR_YELLOW	"\033[33m"
-#define PERF_COLOR_BLUE		"\033[34m"
-#define PERF_COLOR_MAGENTA	"\033[35m"
-#define PERF_COLOR_CYAN		"\033[36m"
-#define PERF_COLOR_BG_RED	"\033[41m"
-
-/*
- * This variable stores the value of color.ui
- */
-extern int perf_use_color_default;
-
-
-/*
- * Use this instead of perf_default_config if you need the value of color.ui.
- */
-int perf_color_default_config(const char *var, const char *value, void *cb);
-
-int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
-void color_parse(const char *value, const char *var, char *dst);
-void color_parse_mem(const char *value, int len, const char *var, char *dst);
-int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
-int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
-int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
-
-#endif /* COLOR_H */
diff --git a/Documentation/perf_counter/util/config.c b/Documentation/perf_counter/util/config.c
deleted file mode 100644
index 3dd13fa..0000000
--- a/Documentation/perf_counter/util/config.c
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * GIT - The information manager from hell
- *
- * Copyright (C) Linus Torvalds, 2005
- * Copyright (C) Johannes Schindelin, 2005
- *
- */
-#include "util.h"
-#include "cache.h"
-#include "exec_cmd.h"
-
-#define MAXNAME (256)
-
-static FILE *config_file;
-static const char *config_file_name;
-static int config_linenr;
-static int config_file_eof;
-
-const char *config_exclusive_filename = NULL;
-
-static int get_next_char(void)
-{
-	int c;
-	FILE *f;
-
-	c = '\n';
-	if ((f = config_file) != NULL) {
-		c = fgetc(f);
-		if (c == '\r') {
-			/* DOS like systems */
-			c = fgetc(f);
-			if (c != '\n') {
-				ungetc(c, f);
-				c = '\r';
-			}
-		}
-		if (c == '\n')
-			config_linenr++;
-		if (c == EOF) {
-			config_file_eof = 1;
-			c = '\n';
-		}
-	}
-	return c;
-}
-
-static char *parse_value(void)
-{
-	static char value[1024];
-	int quote = 0, comment = 0, len = 0, space = 0;
-
-	for (;;) {
-		int c = get_next_char();
-		if (len >= sizeof(value) - 1)
-			return NULL;
-		if (c == '\n') {
-			if (quote)
-				return NULL;
-			value[len] = 0;
-			return value;
-		}
-		if (comment)
-			continue;
-		if (isspace(c) && !quote) {
-			space = 1;
-			continue;
-		}
-		if (!quote) {
-			if (c == ';' || c == '#') {
-				comment = 1;
-				continue;
-			}
-		}
-		if (space) {
-			if (len)
-				value[len++] = ' ';
-			space = 0;
-		}
-		if (c == '\\') {
-			c = get_next_char();
-			switch (c) {
-			case '\n':
-				continue;
-			case 't':
-				c = '\t';
-				break;
-			case 'b':
-				c = '\b';
-				break;
-			case 'n':
-				c = '\n';
-				break;
-			/* Some characters escape as themselves */
-			case '\\': case '"':
-				break;
-			/* Reject unknown escape sequences */
-			default:
-				return NULL;
-			}
-			value[len++] = c;
-			continue;
-		}
-		if (c == '"') {
-			quote = 1-quote;
-			continue;
-		}
-		value[len++] = c;
-	}
-}
-
-static inline int iskeychar(int c)
-{
-	return isalnum(c) || c == '-';
-}
-
-static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
-{
-	int c;
-	char *value;
-
-	/* Get the full name */
-	for (;;) {
-		c = get_next_char();
-		if (config_file_eof)
-			break;
-		if (!iskeychar(c))
-			break;
-		name[len++] = tolower(c);
-		if (len >= MAXNAME)
-			return -1;
-	}
-	name[len] = 0;
-	while (c == ' ' || c == '\t')
-		c = get_next_char();
-
-	value = NULL;
-	if (c != '\n') {
-		if (c != '=')
-			return -1;
-		value = parse_value();
-		if (!value)
-			return -1;
-	}
-	return fn(name, value, data);
-}
-
-static int get_extended_base_var(char *name, int baselen, int c)
-{
-	do {
-		if (c == '\n')
-			return -1;
-		c = get_next_char();
-	} while (isspace(c));
-
-	/* We require the format to be '[base "extension"]' */
-	if (c != '"')
-		return -1;
-	name[baselen++] = '.';
-
-	for (;;) {
-		int c = get_next_char();
-		if (c == '\n')
-			return -1;
-		if (c == '"')
-			break;
-		if (c == '\\') {
-			c = get_next_char();
-			if (c == '\n')
-				return -1;
-		}
-		name[baselen++] = c;
-		if (baselen > MAXNAME / 2)
-			return -1;
-	}
-
-	/* Final ']' */
-	if (get_next_char() != ']')
-		return -1;
-	return baselen;
-}
-
-static int get_base_var(char *name)
-{
-	int baselen = 0;
-
-	for (;;) {
-		int c = get_next_char();
-		if (config_file_eof)
-			return -1;
-		if (c == ']')
-			return baselen;
-		if (isspace(c))
-			return get_extended_base_var(name, baselen, c);
-		if (!iskeychar(c) && c != '.')
-			return -1;
-		if (baselen > MAXNAME / 2)
-			return -1;
-		name[baselen++] = tolower(c);
-	}
-}
-
-static int perf_parse_file(config_fn_t fn, void *data)
-{
-	int comment = 0;
-	int baselen = 0;
-	static char var[MAXNAME];
-
-	/* U+FEFF Byte Order Mark in UTF8 */
-	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
-	const unsigned char *bomptr = utf8_bom;
-
-	for (;;) {
-		int c = get_next_char();
-		if (bomptr && *bomptr) {
-			/* We are at the file beginning; skip UTF8-encoded BOM
-			 * if present. Sane editors won't put this in on their
-			 * own, but e.g. Windows Notepad will do it happily. */
-			if ((unsigned char) c == *bomptr) {
-				bomptr++;
-				continue;
-			} else {
-				/* Do not tolerate partial BOM. */
-				if (bomptr != utf8_bom)
-					break;
-				/* No BOM at file beginning. Cool. */
-				bomptr = NULL;
-			}
-		}
-		if (c == '\n') {
-			if (config_file_eof)
-				return 0;
-			comment = 0;
-			continue;
-		}
-		if (comment || isspace(c))
-			continue;
-		if (c == '#' || c == ';') {
-			comment = 1;
-			continue;
-		}
-		if (c == '[') {
-			baselen = get_base_var(var);
-			if (baselen <= 0)
-				break;
-			var[baselen++] = '.';
-			var[baselen] = 0;
-			continue;
-		}
-		if (!isalpha(c))
-			break;
-		var[baselen] = tolower(c);
-		if (get_value(fn, data, var, baselen+1) < 0)
-			break;
-	}
-	die("bad config file line %d in %s", config_linenr, config_file_name);
-}
-
-static int parse_unit_factor(const char *end, unsigned long *val)
-{
-	if (!*end)
-		return 1;
-	else if (!strcasecmp(end, "k")) {
-		*val *= 1024;
-		return 1;
-	}
-	else if (!strcasecmp(end, "m")) {
-		*val *= 1024 * 1024;
-		return 1;
-	}
-	else if (!strcasecmp(end, "g")) {
-		*val *= 1024 * 1024 * 1024;
-		return 1;
-	}
-	return 0;
-}
-
-static int perf_parse_long(const char *value, long *ret)
-{
-	if (value && *value) {
-		char *end;
-		long val = strtol(value, &end, 0);
-		unsigned long factor = 1;
-		if (!parse_unit_factor(end, &factor))
-			return 0;
-		*ret = val * factor;
-		return 1;
-	}
-	return 0;
-}
-
-int perf_parse_ulong(const char *value, unsigned long *ret)
-{
-	if (value && *value) {
-		char *end;
-		unsigned long val = strtoul(value, &end, 0);
-		if (!parse_unit_factor(end, &val))
-			return 0;
-		*ret = val;
-		return 1;
-	}
-	return 0;
-}
-
-static void die_bad_config(const char *name)
-{
-	if (config_file_name)
-		die("bad config value for '%s' in %s", name, config_file_name);
-	die("bad config value for '%s'", name);
-}
-
-int perf_config_int(const char *name, const char *value)
-{
-	long ret = 0;
-	if (!perf_parse_long(value, &ret))
-		die_bad_config(name);
-	return ret;
-}
-
-unsigned long perf_config_ulong(const char *name, const char *value)
-{
-	unsigned long ret;
-	if (!perf_parse_ulong(value, &ret))
-		die_bad_config(name);
-	return ret;
-}
-
-int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
-{
-	*is_bool = 1;
-	if (!value)
-		return 1;
-	if (!*value)
-		return 0;
-	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
-		return 1;
-	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
-		return 0;
-	*is_bool = 0;
-	return perf_config_int(name, value);
-}
-
-int perf_config_bool(const char *name, const char *value)
-{
-	int discard;
-	return !!perf_config_bool_or_int(name, value, &discard);
-}
-
-int perf_config_string(const char **dest, const char *var, const char *value)
-{
-	if (!value)
-		return config_error_nonbool(var);
-	*dest = strdup(value);
-	return 0;
-}
-
-static int perf_default_core_config(const char *var, const char *value)
-{
-	/* Add other config variables here and to Documentation/config.txt. */
-	return 0;
-}
-
-int perf_default_config(const char *var, const char *value, void *dummy)
-{
-	if (!prefixcmp(var, "core."))
-		return perf_default_core_config(var, value);
-
-	/* Add other config variables here and to Documentation/config.txt. */
-	return 0;
-}
-
-int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
-{
-	int ret;
-	FILE *f = fopen(filename, "r");
-
-	ret = -1;
-	if (f) {
-		config_file = f;
-		config_file_name = filename;
-		config_linenr = 1;
-		config_file_eof = 0;
-		ret = perf_parse_file(fn, data);
-		fclose(f);
-		config_file_name = NULL;
-	}
-	return ret;
-}
-
-const char *perf_etc_perfconfig(void)
-{
-	static const char *system_wide;
-	if (!system_wide)
-		system_wide = system_path(ETC_PERFCONFIG);
-	return system_wide;
-}
-
-static int perf_env_bool(const char *k, int def)
-{
-	const char *v = getenv(k);
-	return v ? perf_config_bool(k, v) : def;
-}
-
-int perf_config_system(void)
-{
-	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
-}
-
-int perf_config_global(void)
-{
-	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
-}
-
-int perf_config(config_fn_t fn, void *data)
-{
-	int ret = 0, found = 0;
-	char *repo_config = NULL;
-	const char *home = NULL;
-
-	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
-	if (config_exclusive_filename)
-		return perf_config_from_file(fn, config_exclusive_filename, data);
-	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
-		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
-					    data);
-		found += 1;
-	}
-
-	home = getenv("HOME");
-	if (perf_config_global() && home) {
-		char *user_config = strdup(mkpath("%s/.perfconfig", home));
-		if (!access(user_config, R_OK)) {
-			ret += perf_config_from_file(fn, user_config, data);
-			found += 1;
-		}
-		free(user_config);
-	}
-
-	repo_config = perf_pathdup("config");
-	if (!access(repo_config, R_OK)) {
-		ret += perf_config_from_file(fn, repo_config, data);
-		found += 1;
-	}
-	free(repo_config);
-	if (found == 0)
-		return -1;
-	return ret;
-}
-
-/*
- * Find all the stuff for perf_config_set() below.
- */
-
-#define MAX_MATCHES 512
-
-static struct {
-	int baselen;
-	char* key;
-	int do_not_match;
-	regex_t* value_regex;
-	int multi_replace;
-	size_t offset[MAX_MATCHES];
-	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
-	int seen;
-} store;
-
-static int matches(const char* key, const char* value)
-{
-	return !strcmp(key, store.key) &&
-		(store.value_regex == NULL ||
-		 (store.do_not_match ^
-		  !regexec(store.value_regex, value, 0, NULL, 0)));
-}
-
-static int store_aux(const char* key, const char* value, void *cb)
-{
-	const char *ep;
-	size_t section_len;
-
-	switch (store.state) {
-	case KEY_SEEN:
-		if (matches(key, value)) {
-			if (store.seen == 1 && store.multi_replace == 0) {
-				warning("%s has multiple values", key);
-			} else if (store.seen >= MAX_MATCHES) {
-				error("too many matches for %s", key);
-				return 1;
-			}
-
-			store.offset[store.seen] = ftell(config_file);
-			store.seen++;
-		}
-		break;
-	case SECTION_SEEN:
-		/*
-		 * What we are looking for is in store.key (both
-		 * section and var), and its section part is baselen
-		 * long.  We found key (again, both section and var).
-		 * We would want to know if this key is in the same
-		 * section as what we are looking for.  We already
-		 * know we are in the same section as what should
-		 * hold store.key.
-		 */
-		ep = strrchr(key, '.');
-		section_len = ep - key;
-
-		if ((section_len != store.baselen) ||
-		    memcmp(key, store.key, section_len+1)) {
-			store.state = SECTION_END_SEEN;
-			break;
-		}
-
-		/*
-		 * Do not increment matches: this is no match, but we
-		 * just made sure we are in the desired section.
-		 */
-		store.offset[store.seen] = ftell(config_file);
-		/* fallthru */
-	case SECTION_END_SEEN:
-	case START:
-		if (matches(key, value)) {
-			store.offset[store.seen] = ftell(config_file);
-			store.state = KEY_SEEN;
-			store.seen++;
-		} else {
-			if (strrchr(key, '.') - key == store.baselen &&
-			      !strncmp(key, store.key, store.baselen)) {
-					store.state = SECTION_SEEN;
-					store.offset[store.seen] = ftell(config_file);
-			}
-		}
-	}
-	return 0;
-}
-
-static int store_write_section(int fd, const char* key)
-{
-	const char *dot;
-	int i, success;
-	struct strbuf sb = STRBUF_INIT;
-
-	dot = memchr(key, '.', store.baselen);
-	if (dot) {
-		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
-		for (i = dot - key + 1; i < store.baselen; i++) {
-			if (key[i] == '"' || key[i] == '\\')
-				strbuf_addch(&sb, '\\');
-			strbuf_addch(&sb, key[i]);
-		}
-		strbuf_addstr(&sb, "\"]\n");
-	} else {
-		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
-	}
-
-	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
-	strbuf_release(&sb);
-
-	return success;
-}
-
-static int store_write_pair(int fd, const char* key, const char* value)
-{
-	int i, success;
-	int length = strlen(key + store.baselen + 1);
-	const char *quote = "";
-	struct strbuf sb = STRBUF_INIT;
-
-	/*
-	 * Check to see if the value needs to be surrounded with a dq pair.
-	 * Note that problematic characters are always backslash-quoted; this
-	 * check is about not losing leading or trailing SP and strings that
-	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
-	 * configuration parser.
-	 */
-	if (value[0] == ' ')
-		quote = "\"";
-	for (i = 0; value[i]; i++)
-		if (value[i] == ';' || value[i] == '#')
-			quote = "\"";
-	if (i && value[i - 1] == ' ')
-		quote = "\"";
-
-	strbuf_addf(&sb, "\t%.*s = %s",
-		    length, key + store.baselen + 1, quote);
-
-	for (i = 0; value[i]; i++)
-		switch (value[i]) {
-		case '\n':
-			strbuf_addstr(&sb, "\\n");
-			break;
-		case '\t':
-			strbuf_addstr(&sb, "\\t");
-			break;
-		case '"':
-		case '\\':
-			strbuf_addch(&sb, '\\');
-		default:
-			strbuf_addch(&sb, value[i]);
-			break;
-		}
-	strbuf_addf(&sb, "%s\n", quote);
-
-	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
-	strbuf_release(&sb);
-
-	return success;
-}
-
-static ssize_t find_beginning_of_line(const char* contents, size_t size,
-	size_t offset_, int* found_bracket)
-{
-	size_t equal_offset = size, bracket_offset = size;
-	ssize_t offset;
-
-contline:
-	for (offset = offset_-2; offset > 0
-			&& contents[offset] != '\n'; offset--)
-		switch (contents[offset]) {
-			case '=': equal_offset = offset; break;
-			case ']': bracket_offset = offset; break;
-		}
-	if (offset > 0 && contents[offset-1] == '\\') {
-		offset_ = offset;
-		goto contline;
-	}
-	if (bracket_offset < equal_offset) {
-		*found_bracket = 1;
-		offset = bracket_offset+1;
-	} else
-		offset++;
-
-	return offset;
-}
-
-int perf_config_set(const char* key, const char* value)
-{
-	return perf_config_set_multivar(key, value, NULL, 0);
-}
-
-/*
- * If value==NULL, unset in (remove from) config,
- * if value_regex!=NULL, disregard key/value pairs where value does not match.
- * if multi_replace==0, nothing, or only one matching key/value is replaced,
- *     else all matching key/values (regardless how many) are removed,
- *     before the new pair is written.
- *
- * Returns 0 on success.
- *
- * This function does this:
- *
- * - it locks the config file by creating ".perf/config.lock"
- *
- * - it then parses the config using store_aux() as validator to find
- *   the position on the key/value pair to replace. If it is to be unset,
- *   it must be found exactly once.
- *
- * - the config file is mmap()ed and the part before the match (if any) is
- *   written to the lock file, then the changed part and the rest.
- *
- * - the config file is removed and the lock file rename()d to it.
- *
- */
-int perf_config_set_multivar(const char* key, const char* value,
-	const char* value_regex, int multi_replace)
-{
-	int i, dot;
-	int fd = -1, in_fd;
-	int ret = 0;
-	char* config_filename;
-	const char* last_dot = strrchr(key, '.');
-
-	if (config_exclusive_filename)
-		config_filename = strdup(config_exclusive_filename);
-	else
-		config_filename = perf_pathdup("config");
-
-	/*
-	 * Since "key" actually contains the section name and the real
-	 * key name separated by a dot, we have to know where the dot is.
-	 */
-
-	if (last_dot == NULL) {
-		error("key does not contain a section: %s", key);
-		ret = 2;
-		goto out_free;
-	}
-	store.baselen = last_dot - key;
-
-	store.multi_replace = multi_replace;
-
-	/*
-	 * Validate the key and while at it, lower case it for matching.
-	 */
-	store.key = malloc(strlen(key) + 1);
-	dot = 0;
-	for (i = 0; key[i]; i++) {
-		unsigned char c = key[i];
-		if (c == '.')
-			dot = 1;
-		/* Leave the extended basename untouched.. */
-		if (!dot || i > store.baselen) {
-			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
-				error("invalid key: %s", key);
-				free(store.key);
-				ret = 1;
-				goto out_free;
-			}
-			c = tolower(c);
-		} else if (c == '\n') {
-			error("invalid key (newline): %s", key);
-			free(store.key);
-			ret = 1;
-			goto out_free;
-		}
-		store.key[i] = c;
-	}
-	store.key[i] = 0;
-
-	/*
-	 * If .perf/config does not exist yet, write a minimal version.
-	 */
-	in_fd = open(config_filename, O_RDONLY);
-	if ( in_fd < 0 ) {
-		free(store.key);
-
-		if ( ENOENT != errno ) {
-			error("opening %s: %s", config_filename,
-			      strerror(errno));
-			ret = 3; /* same as "invalid config file" */
-			goto out_free;
-		}
-		/* if nothing to unset, error out */
-		if (value == NULL) {
-			ret = 5;
-			goto out_free;
-		}
-
-		store.key = (char*)key;
-		if (!store_write_section(fd, key) ||
-		    !store_write_pair(fd, key, value))
-			goto write_err_out;
-	} else {
-		struct stat st;
-		char* contents;
-		size_t contents_sz, copy_begin, copy_end;
-		int i, new_line = 0;
-
-		if (value_regex == NULL)
-			store.value_regex = NULL;
-		else {
-			if (value_regex[0] == '!') {
-				store.do_not_match = 1;
-				value_regex++;
-			} else
-				store.do_not_match = 0;
-
-			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
-			if (regcomp(store.value_regex, value_regex,
-					REG_EXTENDED)) {
-				error("invalid pattern: %s", value_regex);
-				free(store.value_regex);
-				ret = 6;
-				goto out_free;
-			}
-		}
-
-		store.offset[0] = 0;
-		store.state = START;
-		store.seen = 0;
-
-		/*
-		 * After this, store.offset will contain the *end* offset
-		 * of the last match, or remain at 0 if no match was found.
-		 * As a side effect, we make sure to transform only a valid
-		 * existing config file.
-		 */
-		if (perf_config_from_file(store_aux, config_filename, NULL)) {
-			error("invalid config file %s", config_filename);
-			free(store.key);
-			if (store.value_regex != NULL) {
-				regfree(store.value_regex);
-				free(store.value_regex);
-			}
-			ret = 3;
-			goto out_free;
-		}
-
-		free(store.key);
-		if (store.value_regex != NULL) {
-			regfree(store.value_regex);
-			free(store.value_regex);
-		}
-
-		/* if nothing to unset, or too many matches, error out */
-		if ((store.seen == 0 && value == NULL) ||
-				(store.seen > 1 && multi_replace == 0)) {
-			ret = 5;
-			goto out_free;
-		}
-
-		fstat(in_fd, &st);
-		contents_sz = xsize_t(st.st_size);
-		contents = mmap(NULL, contents_sz, PROT_READ,
-			MAP_PRIVATE, in_fd, 0);
-		close(in_fd);
-
-		if (store.seen == 0)
-			store.seen = 1;
-
-		for (i = 0, copy_begin = 0; i < store.seen; i++) {
-			if (store.offset[i] == 0) {
-				store.offset[i] = copy_end = contents_sz;
-			} else if (store.state != KEY_SEEN) {
-				copy_end = store.offset[i];
-			} else
-				copy_end = find_beginning_of_line(
-					contents, contents_sz,
-					store.offset[i]-2, &new_line);
-
-			if (copy_end > 0 && contents[copy_end-1] != '\n')
-				new_line = 1;
-
-			/* write the first part of the config */
-			if (copy_end > copy_begin) {
-				if (write_in_full(fd, contents + copy_begin,
-						  copy_end - copy_begin) <
-				    copy_end - copy_begin)
-					goto write_err_out;
-				if (new_line &&
-				    write_in_full(fd, "\n", 1) != 1)
-					goto write_err_out;
-			}
-			copy_begin = store.offset[i];
-		}
-
-		/* write the pair (value == NULL means unset) */
-		if (value != NULL) {
-			if (store.state == START) {
-				if (!store_write_section(fd, key))
-					goto write_err_out;
-			}
-			if (!store_write_pair(fd, key, value))
-				goto write_err_out;
-		}
-
-		/* write the rest of the config */
-		if (copy_begin < contents_sz)
-			if (write_in_full(fd, contents + copy_begin,
-					  contents_sz - copy_begin) <
-			    contents_sz - copy_begin)
-				goto write_err_out;
-
-		munmap(contents, contents_sz);
-	}
-
-	ret = 0;
-
-out_free:
-	free(config_filename);
-	return ret;
-
-write_err_out:
-	goto out_free;
-
-}
-
-/*
- * Call this to report error for your variable that should not
- * get a boolean value (i.e. "[my] var" means "true").
- */
-int config_error_nonbool(const char *var)
-{
-	return error("Missing value for '%s'", var);
-}
diff --git a/Documentation/perf_counter/util/ctype.c b/Documentation/perf_counter/util/ctype.c
deleted file mode 100644
index b90ec00..0000000
--- a/Documentation/perf_counter/util/ctype.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Sane locale-independent, ASCII ctype.
- *
- * No surprises, and works with signed and unsigned chars.
- */
-#include "cache.h"
-
-enum {
-	S = GIT_SPACE,
-	A = GIT_ALPHA,
-	D = GIT_DIGIT,
-	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
-	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
-};
-
-unsigned char sane_ctype[256] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
-	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
-	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
-	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
-	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
-	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
-	/* Nothing in the 128.. range */
-};
diff --git a/Documentation/perf_counter/util/environment.c b/Documentation/perf_counter/util/environment.c
deleted file mode 100644
index 275b0ee..0000000
--- a/Documentation/perf_counter/util/environment.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * We put all the perf config variables in this same object
- * file, so that programs can link against the config parser
- * without having to link against all the rest of perf.
- */
-#include "cache.h"
-
-const char *pager_program;
-int pager_use_color = 1;
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
deleted file mode 100644
index d392922..0000000
--- a/Documentation/perf_counter/util/exec_cmd.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include "cache.h"
-#include "exec_cmd.h"
-#include "quote.h"
-#define MAX_ARGS	32
-
-extern char **environ;
-static const char *argv_exec_path;
-static const char *argv0_path;
-
-const char *system_path(const char *path)
-{
-#ifdef RUNTIME_PREFIX
-	static const char *prefix;
-#else
-	static const char *prefix = PREFIX;
-#endif
-	struct strbuf d = STRBUF_INIT;
-
-	if (is_absolute_path(path))
-		return path;
-
-#ifdef RUNTIME_PREFIX
-	assert(argv0_path);
-	assert(is_absolute_path(argv0_path));
-
-	if (!prefix &&
-	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
-	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
-	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
-		prefix = PREFIX;
-		fprintf(stderr, "RUNTIME_PREFIX requested, "
-				"but prefix computation failed.  "
-				"Using static fallback '%s'.\n", prefix);
-	}
-#endif
-
-	strbuf_addf(&d, "%s/%s", prefix, path);
-	path = strbuf_detach(&d, NULL);
-	return path;
-}
-
-const char *perf_extract_argv0_path(const char *argv0)
-{
-	const char *slash;
-
-	if (!argv0 || !*argv0)
-		return NULL;
-	slash = argv0 + strlen(argv0);
-
-	while (argv0 <= slash && !is_dir_sep(*slash))
-		slash--;
-
-	if (slash >= argv0) {
-		argv0_path = strndup(argv0, slash - argv0);
-		return slash + 1;
-	}
-
-	return argv0;
-}
-
-void perf_set_argv_exec_path(const char *exec_path)
-{
-	argv_exec_path = exec_path;
-	/*
-	 * Propagate this setting to external programs.
-	 */
-	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
-}
-
-
-/* Returns the highest-priority, location to look for perf programs. */
-const char *perf_exec_path(void)
-{
-	const char *env;
-
-	if (argv_exec_path)
-		return argv_exec_path;
-
-	env = getenv(EXEC_PATH_ENVIRONMENT);
-	if (env && *env) {
-		return env;
-	}
-
-	return system_path(PERF_EXEC_PATH);
-}
-
-static void add_path(struct strbuf *out, const char *path)
-{
-	if (path && *path) {
-		if (is_absolute_path(path))
-			strbuf_addstr(out, path);
-		else
-			strbuf_addstr(out, make_nonrelative_path(path));
-
-		strbuf_addch(out, PATH_SEP);
-	}
-}
-
-void setup_path(void)
-{
-	const char *old_path = getenv("PATH");
-	struct strbuf new_path = STRBUF_INIT;
-
-	add_path(&new_path, perf_exec_path());
-	add_path(&new_path, argv0_path);
-
-	if (old_path)
-		strbuf_addstr(&new_path, old_path);
-	else
-		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
-
-	setenv("PATH", new_path.buf, 1);
-
-	strbuf_release(&new_path);
-}
-
-const char **prepare_perf_cmd(const char **argv)
-{
-	int argc;
-	const char **nargv;
-
-	for (argc = 0; argv[argc]; argc++)
-		; /* just counting */
-	nargv = malloc(sizeof(*nargv) * (argc + 2));
-
-	nargv[0] = "perf";
-	for (argc = 0; argv[argc]; argc++)
-		nargv[argc + 1] = argv[argc];
-	nargv[argc + 1] = NULL;
-	return nargv;
-}
-
-int execv_perf_cmd(const char **argv) {
-	const char **nargv = prepare_perf_cmd(argv);
-
-	/* execvp() can only ever return if it fails */
-	execvp("perf", (char **)nargv);
-
-	free(nargv);
-	return -1;
-}
-
-
-int execl_perf_cmd(const char *cmd,...)
-{
-	int argc;
-	const char *argv[MAX_ARGS + 1];
-	const char *arg;
-	va_list param;
-
-	va_start(param, cmd);
-	argv[0] = cmd;
-	argc = 1;
-	while (argc < MAX_ARGS) {
-		arg = argv[argc++] = va_arg(param, char *);
-		if (!arg)
-			break;
-	}
-	va_end(param);
-	if (MAX_ARGS <= argc)
-		return error("too many args to run %s", cmd);
-
-	argv[argc] = NULL;
-	return execv_perf_cmd(argv);
-}
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
deleted file mode 100644
index effe25e..0000000
--- a/Documentation/perf_counter/util/exec_cmd.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef PERF_EXEC_CMD_H
-#define PERF_EXEC_CMD_H
-
-extern void perf_set_argv_exec_path(const char *exec_path);
-extern const char *perf_extract_argv0_path(const char *path);
-extern const char *perf_exec_path(void);
-extern void setup_path(void);
-extern const char **prepare_perf_cmd(const char **argv);
-extern int execv_perf_cmd(const char **argv); /* NULL terminated */
-extern int execl_perf_cmd(const char *cmd, ...);
-extern const char *system_path(const char *path);
-
-#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
deleted file mode 100755
index f06f6fd..0000000
--- a/Documentation/perf_counter/util/generate-cmdlist.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-echo "/* Automatically generated by $0 */
-struct cmdname_help
-{
-    char name[16];
-    char help[80];
-};
-
-static struct cmdname_help common_cmds[] = {"
-
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "};"
diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
deleted file mode 100644
index 6653f7d..0000000
--- a/Documentation/perf_counter/util/help.c
+++ /dev/null
@@ -1,367 +0,0 @@
-#include "cache.h"
-#include "../builtin.h"
-#include "exec_cmd.h"
-#include "levenshtein.h"
-#include "help.h"
-
-/* most GUI terminals set COLUMNS (although some don't export it) */
-static int term_columns(void)
-{
-	char *col_string = getenv("COLUMNS");
-	int n_cols;
-
-	if (col_string && (n_cols = atoi(col_string)) > 0)
-		return n_cols;
-
-#ifdef TIOCGWINSZ
-	{
-		struct winsize ws;
-		if (!ioctl(1, TIOCGWINSZ, &ws)) {
-			if (ws.ws_col)
-				return ws.ws_col;
-		}
-	}
-#endif
-
-	return 80;
-}
-
-void add_cmdname(struct cmdnames *cmds, const char *name, int len)
-{
-	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
-
-	ent->len = len;
-	memcpy(ent->name, name, len);
-	ent->name[len] = 0;
-
-	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
-	cmds->names[cmds->cnt++] = ent;
-}
-
-static void clean_cmdnames(struct cmdnames *cmds)
-{
-	int i;
-	for (i = 0; i < cmds->cnt; ++i)
-		free(cmds->names[i]);
-	free(cmds->names);
-	cmds->cnt = 0;
-	cmds->alloc = 0;
-}
-
-static int cmdname_compare(const void *a_, const void *b_)
-{
-	struct cmdname *a = *(struct cmdname **)a_;
-	struct cmdname *b = *(struct cmdname **)b_;
-	return strcmp(a->name, b->name);
-}
-
-static void uniq(struct cmdnames *cmds)
-{
-	int i, j;
-
-	if (!cmds->cnt)
-		return;
-
-	for (i = j = 1; i < cmds->cnt; i++)
-		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
-			cmds->names[j++] = cmds->names[i];
-
-	cmds->cnt = j;
-}
-
-void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
-{
-	int ci, cj, ei;
-	int cmp;
-
-	ci = cj = ei = 0;
-	while (ci < cmds->cnt && ei < excludes->cnt) {
-		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
-		if (cmp < 0)
-			cmds->names[cj++] = cmds->names[ci++];
-		else if (cmp == 0)
-			ci++, ei++;
-		else if (cmp > 0)
-			ei++;
-	}
-
-	while (ci < cmds->cnt)
-		cmds->names[cj++] = cmds->names[ci++];
-
-	cmds->cnt = cj;
-}
-
-static void pretty_print_string_list(struct cmdnames *cmds, int longest)
-{
-	int cols = 1, rows;
-	int space = longest + 1; /* min 1 SP between words */
-	int max_cols = term_columns() - 1; /* don't print *on* the edge */
-	int i, j;
-
-	if (space < max_cols)
-		cols = max_cols / space;
-	rows = (cmds->cnt + cols - 1) / cols;
-
-	for (i = 0; i < rows; i++) {
-		printf("  ");
-
-		for (j = 0; j < cols; j++) {
-			int n = j * rows + i;
-			int size = space;
-			if (n >= cmds->cnt)
-				break;
-			if (j == cols-1 || n + rows >= cmds->cnt)
-				size = 1;
-			printf("%-*s", size, cmds->names[n]->name);
-		}
-		putchar('\n');
-	}
-}
-
-static int is_executable(const char *name)
-{
-	struct stat st;
-
-	if (stat(name, &st) || /* stat, not lstat */
-	    !S_ISREG(st.st_mode))
-		return 0;
-
-#ifdef __MINGW32__
-	/* cannot trust the executable bit, peek into the file instead */
-	char buf[3] = { 0 };
-	int n;
-	int fd = open(name, O_RDONLY);
-	st.st_mode &= ~S_IXUSR;
-	if (fd >= 0) {
-		n = read(fd, buf, 2);
-		if (n == 2)
-			/* DOS executables start with "MZ" */
-			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
-				st.st_mode |= S_IXUSR;
-		close(fd);
-	}
-#endif
-	return st.st_mode & S_IXUSR;
-}
-
-static void list_commands_in_dir(struct cmdnames *cmds,
-					 const char *path,
-					 const char *prefix)
-{
-	int prefix_len;
-	DIR *dir = opendir(path);
-	struct dirent *de;
-	struct strbuf buf = STRBUF_INIT;
-	int len;
-
-	if (!dir)
-		return;
-	if (!prefix)
-		prefix = "perf-";
-	prefix_len = strlen(prefix);
-
-	strbuf_addf(&buf, "%s/", path);
-	len = buf.len;
-
-	while ((de = readdir(dir)) != NULL) {
-		int entlen;
-
-		if (prefixcmp(de->d_name, prefix))
-			continue;
-
-		strbuf_setlen(&buf, len);
-		strbuf_addstr(&buf, de->d_name);
-		if (!is_executable(buf.buf))
-			continue;
-
-		entlen = strlen(de->d_name) - prefix_len;
-		if (has_extension(de->d_name, ".exe"))
-			entlen -= 4;
-
-		add_cmdname(cmds, de->d_name + prefix_len, entlen);
-	}
-	closedir(dir);
-	strbuf_release(&buf);
-}
-
-void load_command_list(const char *prefix,
-		struct cmdnames *main_cmds,
-		struct cmdnames *other_cmds)
-{
-	const char *env_path = getenv("PATH");
-	const char *exec_path = perf_exec_path();
-
-	if (exec_path) {
-		list_commands_in_dir(main_cmds, exec_path, prefix);
-		qsort(main_cmds->names, main_cmds->cnt,
-		      sizeof(*main_cmds->names), cmdname_compare);
-		uniq(main_cmds);
-	}
-
-	if (env_path) {
-		char *paths, *path, *colon;
-		path = paths = strdup(env_path);
-		while (1) {
-			if ((colon = strchr(path, PATH_SEP)))
-				*colon = 0;
-			if (!exec_path || strcmp(path, exec_path))
-				list_commands_in_dir(other_cmds, path, prefix);
-
-			if (!colon)
-				break;
-			path = colon + 1;
-		}
-		free(paths);
-
-		qsort(other_cmds->names, other_cmds->cnt,
-		      sizeof(*other_cmds->names), cmdname_compare);
-		uniq(other_cmds);
-	}
-	exclude_cmds(other_cmds, main_cmds);
-}
-
-void list_commands(const char *title, struct cmdnames *main_cmds,
-		   struct cmdnames *other_cmds)
-{
-	int i, longest = 0;
-
-	for (i = 0; i < main_cmds->cnt; i++)
-		if (longest < main_cmds->names[i]->len)
-			longest = main_cmds->names[i]->len;
-	for (i = 0; i < other_cmds->cnt; i++)
-		if (longest < other_cmds->names[i]->len)
-			longest = other_cmds->names[i]->len;
-
-	if (main_cmds->cnt) {
-		const char *exec_path = perf_exec_path();
-		printf("available %s in '%s'\n", title, exec_path);
-		printf("----------------");
-		mput_char('-', strlen(title) + strlen(exec_path));
-		putchar('\n');
-		pretty_print_string_list(main_cmds, longest);
-		putchar('\n');
-	}
-
-	if (other_cmds->cnt) {
-		printf("%s available from elsewhere on your $PATH\n", title);
-		printf("---------------------------------------");
-		mput_char('-', strlen(title));
-		putchar('\n');
-		pretty_print_string_list(other_cmds, longest);
-		putchar('\n');
-	}
-}
-
-int is_in_cmdlist(struct cmdnames *c, const char *s)
-{
-	int i;
-	for (i = 0; i < c->cnt; i++)
-		if (!strcmp(s, c->names[i]->name))
-			return 1;
-	return 0;
-}
-
-static int autocorrect;
-static struct cmdnames aliases;
-
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
-{
-	if (!strcmp(var, "help.autocorrect"))
-		autocorrect = perf_config_int(var,value);
-	/* Also use aliases for command lookup */
-	if (!prefixcmp(var, "alias."))
-		add_cmdname(&aliases, var + 6, strlen(var + 6));
-
-	return perf_default_config(var, value, cb);
-}
-
-static int levenshtein_compare(const void *p1, const void *p2)
-{
-	const struct cmdname *const *c1 = p1, *const *c2 = p2;
-	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
-	int l1 = (*c1)->len;
-	int l2 = (*c2)->len;
-	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
-}
-
-static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
-{
-	int i;
-	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
-
-	for (i = 0; i < old->cnt; i++)
-		cmds->names[cmds->cnt++] = old->names[i];
-	free(old->names);
-	old->cnt = 0;
-	old->names = NULL;
-}
-
-const char *help_unknown_cmd(const char *cmd)
-{
-	int i, n = 0, best_similarity = 0;
-	struct cmdnames main_cmds, other_cmds;
-
-	memset(&main_cmds, 0, sizeof(main_cmds));
-	memset(&other_cmds, 0, sizeof(main_cmds));
-	memset(&aliases, 0, sizeof(aliases));
-
-	perf_config(perf_unknown_cmd_config, NULL);
-
-	load_command_list("perf-", &main_cmds, &other_cmds);
-
-	add_cmd_list(&main_cmds, &aliases);
-	add_cmd_list(&main_cmds, &other_cmds);
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(main_cmds.names), cmdname_compare);
-	uniq(&main_cmds);
-
-	if (main_cmds.cnt) {
-		/* This reuses cmdname->len for similarity index */
-		for (i = 0; i < main_cmds.cnt; ++i)
-			main_cmds.names[i]->len =
-				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
-
-		qsort(main_cmds.names, main_cmds.cnt,
-		      sizeof(*main_cmds.names), levenshtein_compare);
-
-		best_similarity = main_cmds.names[0]->len;
-		n = 1;
-		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-			++n;
-	}
-
-	if (autocorrect && n == 1) {
-		const char *assumed = main_cmds.names[0]->name;
-
-		main_cmds.names[0] = NULL;
-		clean_cmdnames(&main_cmds);
-		fprintf(stderr, "WARNING: You called a Git program named '%s', "
-			"which does not exist.\n"
-			"Continuing under the assumption that you meant '%s'\n",
-			cmd, assumed);
-		if (autocorrect > 0) {
-			fprintf(stderr, "in %0.1f seconds automatically...\n",
-				(float)autocorrect/10.0);
-			poll(NULL, 0, autocorrect * 100);
-		}
-		return assumed;
-	}
-
-	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
-
-	if (main_cmds.cnt && best_similarity < 6) {
-		fprintf(stderr, "\nDid you mean %s?\n",
-			n < 2 ? "this": "one of these");
-
-		for (i = 0; i < n; i++)
-			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
-	}
-
-	exit(1);
-}
-
-int cmd_version(int argc, const char **argv, const char *prefix)
-{
-	printf("perf version %s\n", perf_version_string);
-	return 0;
-}
diff --git a/Documentation/perf_counter/util/help.h b/Documentation/perf_counter/util/help.h
deleted file mode 100644
index 56bc154..0000000
--- a/Documentation/perf_counter/util/help.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef HELP_H
-#define HELP_H
-
-struct cmdnames {
-	int alloc;
-	int cnt;
-	struct cmdname {
-		size_t len; /* also used for similarity index in help.c */
-		char name[FLEX_ARRAY];
-	} **names;
-};
-
-static inline void mput_char(char c, unsigned int num)
-{
-	while(num--)
-		putchar(c);
-}
-
-void load_command_list(const char *prefix,
-		struct cmdnames *main_cmds,
-		struct cmdnames *other_cmds);
-void add_cmdname(struct cmdnames *cmds, const char *name, int len);
-/* Here we require that excludes is a sorted list. */
-void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
-int is_in_cmdlist(struct cmdnames *c, const char *s);
-void list_commands(const char *title, struct cmdnames *main_cmds,
-		   struct cmdnames *other_cmds);
-
-#endif /* HELP_H */
diff --git a/Documentation/perf_counter/util/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
deleted file mode 100644
index e521d15..0000000
--- a/Documentation/perf_counter/util/levenshtein.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "cache.h"
-#include "levenshtein.h"
-
-/*
- * This function implements the Damerau-Levenshtein algorithm to
- * calculate a distance between strings.
- *
- * Basically, it says how many letters need to be swapped, substituted,
- * deleted from, or added to string1, at least, to get string2.
- *
- * The idea is to build a distance matrix for the substrings of both
- * strings.  To avoid a large space complexity, only the last three rows
- * are kept in memory (if swaps had the same or higher cost as one deletion
- * plus one insertion, only two rows would be needed).
- *
- * At any stage, "i + 1" denotes the length of the current substring of
- * string1 that the distance is calculated for.
- *
- * row2 holds the current row, row1 the previous row (i.e. for the substring
- * of string1 of length "i"), and row0 the row before that.
- *
- * In other words, at the start of the big loop, row2[j + 1] contains the
- * Damerau-Levenshtein distance between the substring of string1 of length
- * "i" and the substring of string2 of length "j + 1".
- *
- * All the big loop does is determine the partial minimum-cost paths.
- *
- * It does so by calculating the costs of the path ending in characters
- * i (in string1) and j (in string2), respectively, given that the last
- * operation is a substition, a swap, a deletion, or an insertion.
- *
- * This implementation allows the costs to be weighted:
- *
- * - w (as in "sWap")
- * - s (as in "Substitution")
- * - a (for insertion, AKA "Add")
- * - d (as in "Deletion")
- *
- * Note that this algorithm calculates a distance _iff_ d == a.
- */
-int levenshtein(const char *string1, const char *string2,
-		int w, int s, int a, int d)
-{
-	int len1 = strlen(string1), len2 = strlen(string2);
-	int *row0 = malloc(sizeof(int) * (len2 + 1));
-	int *row1 = malloc(sizeof(int) * (len2 + 1));
-	int *row2 = malloc(sizeof(int) * (len2 + 1));
-	int i, j;
-
-	for (j = 0; j <= len2; j++)
-		row1[j] = j * a;
-	for (i = 0; i < len1; i++) {
-		int *dummy;
-
-		row2[0] = (i + 1) * d;
-		for (j = 0; j < len2; j++) {
-			/* substitution */
-			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
-			/* swap */
-			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
-					string1[i] == string2[j - 1] &&
-					row2[j + 1] > row0[j - 1] + w)
-				row2[j + 1] = row0[j - 1] + w;
-			/* deletion */
-			if (row2[j + 1] > row1[j + 1] + d)
-				row2[j + 1] = row1[j + 1] + d;
-			/* insertion */
-			if (row2[j + 1] > row2[j] + a)
-				row2[j + 1] = row2[j] + a;
-		}
-
-		dummy = row0;
-		row0 = row1;
-		row1 = row2;
-		row2 = dummy;
-	}
-
-	i = row1[len2];
-	free(row0);
-	free(row1);
-	free(row2);
-
-	return i;
-}
diff --git a/Documentation/perf_counter/util/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
deleted file mode 100644
index 0173abe..0000000
--- a/Documentation/perf_counter/util/levenshtein.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef LEVENSHTEIN_H
-#define LEVENSHTEIN_H
-
-int levenshtein(const char *string1, const char *string2,
-	int swap_penalty, int substition_penalty,
-	int insertion_penalty, int deletion_penalty);
-
-#endif
diff --git a/Documentation/perf_counter/util/list.h b/Documentation/perf_counter/util/list.h
deleted file mode 100644
index e2548e8..0000000
--- a/Documentation/perf_counter/util/list.h
+++ /dev/null
@@ -1,603 +0,0 @@
-#ifndef _LINUX_LIST_H
-#define _LINUX_LIST_H
-/*
-  Copyright (C) Cast of dozens, comes from the Linux kernel
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-*/
-
-#include <stddef.h>
-
-/*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1 ((void *)0x00100100)
-#define LIST_POISON2 ((void *)0x00200200)
-
-/**
- * container_of - cast a member of a structure out to the containing structure
- * @ptr:	the pointer to the member.
- * @type:	the type of the container struct this is embedded in.
- * @member:	the name of the member within the struct.
- *
- */
-#define container_of(ptr, type, member) ({			\
-        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
-        (type *)( (char *)__mptr - offsetof(type,member) );})
-
-/*
- * Simple doubly linked list implementation.
- *
- * Some of the internal functions ("__xxx") are useful when
- * manipulating whole lists rather than single entries, as
- * sometimes we already know the next/prev entries and we can
- * generate better code by using them directly rather than
- * using the generic single-entry routines.
- */
-
-struct list_head {
-	struct list_head *next, *prev;
-};
-
-#define LIST_HEAD_INIT(name) { &(name), &(name) }
-
-#define LIST_HEAD(name) \
-	struct list_head name = LIST_HEAD_INIT(name)
-
-static inline void INIT_LIST_HEAD(struct list_head *list)
-{
-	list->next = list;
-	list->prev = list;
-}
-
-/*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
- */
-static inline void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next)
-{
-	next->prev = new;
-	new->next = next;
-	new->prev = prev;
-	prev->next = new;
-}
-
-/**
- * list_add - add a new entry
- * @new: new entry to be added
- * @head: list head to add it after
- *
- * Insert a new entry after the specified head.
- * This is good for implementing stacks.
- */
-static inline void list_add(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head, head->next);
-}
-
-/**
- * list_add_tail - add a new entry
- * @new: new entry to be added
- * @head: list head to add it before
- *
- * Insert a new entry before the specified head.
- * This is useful for implementing queues.
- */
-static inline void list_add_tail(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head->prev, head);
-}
-
-/*
- * Delete a list entry by making the prev/next entries
- * point to each other.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
- */
-static inline void __list_del(struct list_head * prev, struct list_head * next)
-{
-	next->prev = prev;
-	prev->next = next;
-}
-
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-static inline void list_del(struct list_head *entry)
-{
-	__list_del(entry->prev, entry->next);
-	entry->next = LIST_POISON1;
-	entry->prev = LIST_POISON2;
-}
-
-/**
- * list_del_range - deletes range of entries from list.
- * @beging: first element in the range to delete from the list.
- * @beging: first element in the range to delete from the list.
- * Note: list_empty on the range of entries does not return true after this,
- * the entries is in an undefined state.
- */
-static inline void list_del_range(struct list_head *begin,
-				  struct list_head *end)
-{
-	begin->prev->next = end->next;
-	end->next->prev = begin->prev;
-}
-
-/**
- * list_replace - replace old entry by new one
- * @old : the element to be replaced
- * @new : the new element to insert
- * Note: if 'old' was empty, it will be overwritten.
- */
-static inline void list_replace(struct list_head *old,
-				struct list_head *new)
-{
-	new->next = old->next;
-	new->next->prev = new;
-	new->prev = old->prev;
-	new->prev->next = new;
-}
-
-static inline void list_replace_init(struct list_head *old,
-					struct list_head *new)
-{
-	list_replace(old, new);
-	INIT_LIST_HEAD(old);
-}
-
-/**
- * list_del_init - deletes entry from list and reinitialize it.
- * @entry: the element to delete from the list.
- */
-static inline void list_del_init(struct list_head *entry)
-{
-	__list_del(entry->prev, entry->next);
-	INIT_LIST_HEAD(entry);
-}
-
-/**
- * list_move - delete from one list and add as another's head
- * @list: the entry to move
- * @head: the head that will precede our entry
- */
-static inline void list_move(struct list_head *list, struct list_head *head)
-{
-        __list_del(list->prev, list->next);
-        list_add(list, head);
-}
-
-/**
- * list_move_tail - delete from one list and add as another's tail
- * @list: the entry to move
- * @head: the head that will follow our entry
- */
-static inline void list_move_tail(struct list_head *list,
-				  struct list_head *head)
-{
-        __list_del(list->prev, list->next);
-        list_add_tail(list, head);
-}
-
-/**
- * list_is_last - tests whether @list is the last entry in list @head
- * @list: the entry to test
- * @head: the head of the list
- */
-static inline int list_is_last(const struct list_head *list,
-				const struct list_head *head)
-{
-	return list->next == head;
-}
-
-/**
- * list_empty - tests whether a list is empty
- * @head: the list to test.
- */
-static inline int list_empty(const struct list_head *head)
-{
-	return head->next == head;
-}
-
-/**
- * list_empty_careful - tests whether a list is empty and not being modified
- * @head: the list to test
- *
- * Description:
- * tests whether a list is empty _and_ checks that no other CPU might be
- * in the process of modifying either member (next or prev)
- *
- * NOTE: using list_empty_careful() without synchronization
- * can only be safe if the only activity that can happen
- * to the list entry is list_del_init(). Eg. it cannot be used
- * if another CPU could re-list_add() it.
- */
-static inline int list_empty_careful(const struct list_head *head)
-{
-	struct list_head *next = head->next;
-	return (next == head) && (next == head->prev);
-}
-
-static inline void __list_splice(struct list_head *list,
-				 struct list_head *head)
-{
-	struct list_head *first = list->next;
-	struct list_head *last = list->prev;
-	struct list_head *at = head->next;
-
-	first->prev = head;
-	head->next = first;
-
-	last->next = at;
-	at->prev = last;
-}
-
-/**
- * list_splice - join two lists
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- */
-static inline void list_splice(struct list_head *list, struct list_head *head)
-{
-	if (!list_empty(list))
-		__list_splice(list, head);
-}
-
-/**
- * list_splice_init - join two lists and reinitialise the emptied list.
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- *
- * The list at @list is reinitialised
- */
-static inline void list_splice_init(struct list_head *list,
-				    struct list_head *head)
-{
-	if (!list_empty(list)) {
-		__list_splice(list, head);
-		INIT_LIST_HEAD(list);
-	}
-}
-
-/**
- * list_entry - get the struct for this entry
- * @ptr:	the &struct list_head pointer.
- * @type:	the type of the struct this is embedded in.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_entry(ptr, type, member) \
-	container_of(ptr, type, member)
-
-/**
- * list_first_entry - get the first element from a list
- * @ptr:       the list head to take the element from.
- * @type:      the type of the struct this is embedded in.
- * @member:    the name of the list_struct within the struct.
- *
- * Note, that list is expected to be not empty.
- */
-#define list_first_entry(ptr, type, member) \
-	list_entry((ptr)->next, type, member)
-
-/**
- * list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- */
-#define list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); \
-        	pos = pos->next)
-
-/**
- * __list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- *
- * This variant differs from list_for_each() in that it's the
- * simplest possible list iteration code, no prefetching is done.
- * Use this for code that knows the list to be very short (empty
- * or 1 entry) most of the time.
- */
-#define __list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); pos = pos->next)
-
-/**
- * list_for_each_prev	-	iterate over a list backwards
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- */
-#define list_for_each_prev(pos, head) \
-	for (pos = (head)->prev; pos != (head); \
-        	pos = pos->prev)
-
-/**
- * list_for_each_safe - iterate over a list safe against removal of list entry
- * @pos:	the &struct list_head to use as a loop cursor.
- * @n:		another &struct list_head to use as temporary storage
- * @head:	the head for your list.
- */
-#define list_for_each_safe(pos, n, head) \
-	for (pos = (head)->next, n = pos->next; pos != (head); \
-		pos = n, n = pos->next)
-
-/**
- * list_for_each_entry	-	iterate over list of given type
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry(pos, head, member)				\
-	for (pos = list_entry((head)->next, typeof(*pos), member);	\
-	     &pos->member != (head); 	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_reverse - iterate backwards over list of given type.
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry_reverse(pos, head, member)			\
-	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
-	     &pos->member != (head); 	\
-	     pos = list_entry(pos->member.prev, typeof(*pos), member))
-
-/**
- * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
- * @pos:	the type * to use as a start point
- * @head:	the head of the list
- * @member:	the name of the list_struct within the struct.
- *
- * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
- */
-#define list_prepare_entry(pos, head, member) \
-	((pos) ? : list_entry(head, typeof(*pos), member))
-
-/**
- * list_for_each_entry_continue - continue iteration over list of given type
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Continue to iterate over list of given type, continuing after
- * the current position.
- */
-#define list_for_each_entry_continue(pos, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
-	     &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_from - iterate over list of given type from the current point
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type, continuing from current position.
- */
-#define list_for_each_entry_from(pos, head, member) 			\
-	for (; &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry_safe(pos, n, head, member)			\
-	for (pos = list_entry((head)->next, typeof(*pos), member),	\
-		n = list_entry(pos->member.next, typeof(*pos), member);	\
-	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_continue
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type, continuing after current point,
- * safe against removal of list entry.
- */
-#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
-		n = list_entry(pos->member.next, typeof(*pos), member);		\
-	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_from
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type from current point, safe against
- * removal of list entry.
- */
-#define list_for_each_entry_safe_from(pos, n, head, member) 			\
-	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
-	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_reverse
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate backwards over list of given type, safe against removal
- * of list entry.
- */
-#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
-	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
-		n = list_entry(pos->member.prev, typeof(*pos), member);	\
-	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
-
-/*
- * Double linked lists with a single pointer list head.
- * Mostly useful for hash tables where the two pointer list head is
- * too wasteful.
- * You lose the ability to access the tail in O(1).
- */
-
-struct hlist_head {
-	struct hlist_node *first;
-};
-
-struct hlist_node {
-	struct hlist_node *next, **pprev;
-};
-
-#define HLIST_HEAD_INIT { .first = NULL }
-#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
-#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
-static inline void INIT_HLIST_NODE(struct hlist_node *h)
-{
-	h->next = NULL;
-	h->pprev = NULL;
-}
-
-static inline int hlist_unhashed(const struct hlist_node *h)
-{
-	return !h->pprev;
-}
-
-static inline int hlist_empty(const struct hlist_head *h)
-{
-	return !h->first;
-}
-
-static inline void __hlist_del(struct hlist_node *n)
-{
-	struct hlist_node *next = n->next;
-	struct hlist_node **pprev = n->pprev;
-	*pprev = next;
-	if (next)
-		next->pprev = pprev;
-}
-
-static inline void hlist_del(struct hlist_node *n)
-{
-	__hlist_del(n);
-	n->next = LIST_POISON1;
-	n->pprev = LIST_POISON2;
-}
-
-static inline void hlist_del_init(struct hlist_node *n)
-{
-	if (!hlist_unhashed(n)) {
-		__hlist_del(n);
-		INIT_HLIST_NODE(n);
-	}
-}
-
-static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
-{
-	struct hlist_node *first = h->first;
-	n->next = first;
-	if (first)
-		first->pprev = &n->next;
-	h->first = n;
-	n->pprev = &h->first;
-}
-
-/* next must be != NULL */
-static inline void hlist_add_before(struct hlist_node *n,
-					struct hlist_node *next)
-{
-	n->pprev = next->pprev;
-	n->next = next;
-	next->pprev = &n->next;
-	*(n->pprev) = n;
-}
-
-static inline void hlist_add_after(struct hlist_node *n,
-					struct hlist_node *next)
-{
-	next->next = n->next;
-	n->next = next;
-	next->pprev = &n->next;
-
-	if(next->next)
-		next->next->pprev  = &next->next;
-}
-
-#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
-
-#define hlist_for_each(pos, head) \
-	for (pos = (head)->first; pos; \
-	     pos = pos->next)
-
-#define hlist_for_each_safe(pos, n, head) \
-	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
-	     pos = n)
-
-/**
- * hlist_for_each_entry	- iterate over list of given type
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry(tpos, pos, head, member)			 \
-	for (pos = (head)->first;					 \
-	     pos && 			 \
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-	     pos = pos->next)
-
-/**
- * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @member:	the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry_continue(tpos, pos, member)		 \
-	for (pos = (pos)->next;						 \
-	     pos && 			 \
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-	     pos = pos->next)
-
-/**
- * hlist_for_each_entry_from - iterate over a hlist continuing from current point
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @member:	the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry_from(tpos, pos, member)			 \
-	for (; pos && 			 \
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-	     pos = pos->next)
-
-/**
- * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @n:		another &struct hlist_node to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
-	for (pos = (head)->first;					 \
-	     pos && ({ n = pos->next; 1; }) && 				 \
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-	     pos = n)
-
-#endif
diff --git a/Documentation/perf_counter/util/pager.c b/Documentation/perf_counter/util/pager.c
deleted file mode 100644
index a28bcca..0000000
--- a/Documentation/perf_counter/util/pager.c
+++ /dev/null
@@ -1,99 +0,0 @@
-#include "cache.h"
-#include "run-command.h"
-#include "sigchain.h"
-
-/*
- * This is split up from the rest of git so that we can do
- * something different on Windows.
- */
-
-static int spawned_pager;
-
-#ifndef __MINGW32__
-static void pager_preexec(void)
-{
-	/*
-	 * Work around bug in "less" by not starting it until we
-	 * have real input
-	 */
-	fd_set in;
-
-	FD_ZERO(&in);
-	FD_SET(0, &in);
-	select(1, &in, NULL, &in, NULL);
-
-	setenv("LESS", "FRSX", 0);
-}
-#endif
-
-static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
-static struct child_process pager_process;
-
-static void wait_for_pager(void)
-{
-	fflush(stdout);
-	fflush(stderr);
-	/* signal EOF to pager */
-	close(1);
-	close(2);
-	finish_command(&pager_process);
-}
-
-static void wait_for_pager_signal(int signo)
-{
-	wait_for_pager();
-	sigchain_pop(signo);
-	raise(signo);
-}
-
-void setup_pager(void)
-{
-	const char *pager = getenv("PERF_PAGER");
-
-	if (!isatty(1))
-		return;
-	if (!pager) {
-		if (!pager_program)
-			perf_config(perf_default_config, NULL);
-		pager = pager_program;
-	}
-	if (!pager)
-		pager = getenv("PAGER");
-	if (!pager)
-		pager = "less";
-	else if (!*pager || !strcmp(pager, "cat"))
-		return;
-
-	spawned_pager = 1; /* means we are emitting to terminal */
-
-	/* spawn the pager */
-	pager_argv[2] = pager;
-	pager_process.argv = pager_argv;
-	pager_process.in = -1;
-#ifndef __MINGW32__
-	pager_process.preexec_cb = pager_preexec;
-#endif
-	if (start_command(&pager_process))
-		return;
-
-	/* original process continues, but writes to the pipe */
-	dup2(pager_process.in, 1);
-	if (isatty(2))
-		dup2(pager_process.in, 2);
-	close(pager_process.in);
-
-	/* this makes sure that the parent terminates after the pager */
-	sigchain_push_common(wait_for_pager_signal);
-	atexit(wait_for_pager);
-}
-
-int pager_in_use(void)
-{
-	const char *env;
-
-	if (spawned_pager)
-		return 1;
-
-	env = getenv("PERF_PAGER_IN_USE");
-	return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
-}
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
deleted file mode 100644
index e0820b4..0000000
--- a/Documentation/perf_counter/util/parse-events.c
+++ /dev/null
@@ -1,316 +0,0 @@
-
-#include "../perf.h"
-#include "util.h"
-#include "parse-options.h"
-#include "parse-events.h"
-#include "exec_cmd.h"
-#include "string.h"
-
-extern char *strcasestr(const char *haystack, const char *needle);
-
-int					nr_counters;
-
-struct perf_counter_attr		attrs[MAX_COUNTERS];
-
-struct event_symbol {
-	__u8	type;
-	__u64	config;
-	char	*symbol;
-};
-
-#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
-#define CR(x, y) .type = PERF_TYPE_##x, .config = y
-
-static struct event_symbol event_symbols[] = {
-  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
-  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
-  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
-  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
-  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
-  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
-  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
-
-  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
-  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static char *hw_event_names[] = {
-	"cycles",
-	"instructions",
-	"cache-references",
-	"cache-misses",
-	"branches",
-	"branch-misses",
-	"bus-cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu-clock-ticks",
-	"task-clock-ticks",
-	"page-faults",
-	"context-switches",
-	"CPU-migrations",
-	"minor-faults",
-	"major-faults",
-};
-
-#define MAX_ALIASES 8
-
-static char *hw_cache [][MAX_ALIASES] = {
-	{ "L1-data"		, "l1-d", "l1d", "l1"				},
-	{ "L1-instruction"	, "l1-i", "l1i"					},
-	{ "L2"			, "l2"						},
-	{ "Data-TLB"		, "dtlb", "d-tlb"				},
-	{ "Instruction-TLB"	, "itlb", "i-tlb"				},
-	{ "Branch"		, "bpu" , "btb", "bpc"				},
-};
-
-static char *hw_cache_op [][MAX_ALIASES] = {
-	{ "Load"		, "read"					},
-	{ "Store"		, "write"					},
-	{ "Prefetch"		, "speculative-read", "speculative-load"	},
-};
-
-static char *hw_cache_result [][MAX_ALIASES] = {
-	{ "Reference"		, "ops", "access"				},
-	{ "Miss"								},
-};
-
-char *event_name(int counter)
-{
-	__u64 config = attrs[counter].config;
-	int type = attrs[counter].type;
-	static char buf[32];
-
-	if (attrs[counter].type == PERF_TYPE_RAW) {
-		sprintf(buf, "raw 0x%llx", config);
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (config < PERF_HW_EVENTS_MAX)
-			return hw_event_names[config];
-		return "unknown-hardware";
-
-	case PERF_TYPE_HW_CACHE: {
-		__u8 cache_type, cache_op, cache_result;
-		static char name[100];
-
-		cache_type   = (config >>  0) & 0xff;
-		if (cache_type > PERF_COUNT_HW_CACHE_MAX)
-			return "unknown-ext-hardware-cache-type";
-
-		cache_op     = (config >>  8) & 0xff;
-		if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
-			return "unknown-ext-hardware-cache-op";
-
-		cache_result = (config >> 16) & 0xff;
-		if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
-			return "unknown-ext-hardware-cache-result";
-
-		sprintf(name, "%s-Cache-%s-%ses",
-			hw_cache[cache_type][0],
-			hw_cache_op[cache_op][0],
-			hw_cache_result[cache_result][0]);
-
-		return name;
-	}
-
-	case PERF_TYPE_SOFTWARE:
-		if (config < PERF_SW_EVENTS_MAX)
-			return sw_event_names[config];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
-{
-	int i, j;
-
-	for (i = 0; i < size; i++) {
-		for (j = 0; j < MAX_ALIASES; j++) {
-			if (!names[i][j])
-				break;
-			if (strcasestr(str, names[i][j]))
-				return i;
-		}
-	}
-
-	return 0;
-}
-
-static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
-{
-	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
-
-	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
-	/*
-	 * No fallback - if we cannot get a clear cache type
-	 * then bail out:
-	 */
-	if (cache_type == -1)
-		return -EINVAL;
-
-	cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
-	/*
-	 * Fall back to reads:
-	 */
-	if (cache_type == -1)
-		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
-
-	cache_result = parse_aliases(str, hw_cache_result,
-					PERF_COUNT_HW_CACHE_RESULT_MAX);
-	/*
-	 * Fall back to accesses:
-	 */
-	if (cache_result == -1)
-		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
-
-	attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
-	attr->type = PERF_TYPE_HW_CACHE;
-
-	return 0;
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-	const char *sep, *pstr;
-
-	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
-		attr->type = PERF_TYPE_RAW;
-		attr->config = config;
-
-		return 0;
-	}
-
-	pstr = str;
-	sep = strchr(pstr, ':');
-	if (sep) {
-		type = atoi(pstr);
-		pstr = sep + 1;
-		id = atoi(pstr);
-		sep = strchr(pstr, ':');
-		if (sep) {
-			pstr = sep + 1;
-			if (strchr(pstr, 'k'))
-				attr->exclude_user = 1;
-			if (strchr(pstr, 'u'))
-				attr->exclude_kernel = 1;
-		}
-		attr->type = type;
-		attr->config = id;
-
-		return 0;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol))) {
-
-			attr->type = event_symbols[i].type;
-			attr->config = event_symbols[i].config;
-
-			return 0;
-		}
-	}
-
-	return parse_generic_hw_symbols(str, attr);
-}
-
-int parse_events(const struct option *opt, const char *str, int unset)
-{
-	struct perf_counter_attr attr;
-	int ret;
-
-	memset(&attr, 0, sizeof(attr));
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	ret = parse_event_symbols(str, &attr);
-	if (ret < 0)
-		return ret;
-
-	attrs[nr_counters] = attr;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-static const char * const event_type_descriptors[] = {
-	"",
-	"Hardware event",
-	"Software event",
-	"Tracepoint event",
-	"Hardware cache event",
-};
-
-/*
- * Print the help text for the event symbols:
- */
-void print_events(void)
-{
-	struct event_symbol *syms = event_symbols;
-	unsigned int i, type, prev_type = -1;
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
-		type = syms->type + 1;
-		if (type > ARRAY_SIZE(event_type_descriptors))
-			type = 0;
-
-		if (type != prev_type)
-			fprintf(stderr, "\n");
-
-		fprintf(stderr, "  %-30s [%s]\n", syms->symbol,
-			event_type_descriptors[type]);
-
-		prev_type = type;
-	}
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, "  %-30s [raw hardware event descriptor]\n",
-		"rNNN");
-	fprintf(stderr, "\n");
-
-	exit(129);
-}
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
deleted file mode 100644
index e3d5529..0000000
--- a/Documentation/perf_counter/util/parse-events.h
+++ /dev/null
@@ -1,17 +0,0 @@
-
-/*
- * Parse symbolic events/counts passed in as options:
- */
-
-extern int			nr_counters;
-
-extern struct perf_counter_attr attrs[MAX_COUNTERS];
-
-extern char *event_name(int ctr);
-
-extern int parse_events(const struct option *opt, const char *str, int unset);
-
-#define EVENTS_HELP_MAX (128*1024)
-
-extern void print_events(void);
-
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
deleted file mode 100644
index b3affb1..0000000
--- a/Documentation/perf_counter/util/parse-options.c
+++ /dev/null
@@ -1,508 +0,0 @@
-#include "util.h"
-#include "parse-options.h"
-#include "cache.h"
-
-#define OPT_SHORT 1
-#define OPT_UNSET 2
-
-static int opterror(const struct option *opt, const char *reason, int flags)
-{
-	if (flags & OPT_SHORT)
-		return error("switch `%c' %s", opt->short_name, reason);
-	if (flags & OPT_UNSET)
-		return error("option `no-%s' %s", opt->long_name, reason);
-	return error("option `%s' %s", opt->long_name, reason);
-}
-
-static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
-		   int flags, const char **arg)
-{
-	if (p->opt) {
-		*arg = p->opt;
-		p->opt = NULL;
-	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
-		*arg = (const char *)opt->defval;
-	} else if (p->argc > 1) {
-		p->argc--;
-		*arg = *++p->argv;
-	} else
-		return opterror(opt, "requires a value", flags);
-	return 0;
-}
-
-static int get_value(struct parse_opt_ctx_t *p,
-		     const struct option *opt, int flags)
-{
-	const char *s, *arg = NULL;
-	const int unset = flags & OPT_UNSET;
-
-	if (unset && p->opt)
-		return opterror(opt, "takes no value", flags);
-	if (unset && (opt->flags & PARSE_OPT_NONEG))
-		return opterror(opt, "isn't available", flags);
-
-	if (!(flags & OPT_SHORT) && p->opt) {
-		switch (opt->type) {
-		case OPTION_CALLBACK:
-			if (!(opt->flags & PARSE_OPT_NOARG))
-				break;
-			/* FALLTHROUGH */
-		case OPTION_BOOLEAN:
-		case OPTION_BIT:
-		case OPTION_SET_INT:
-		case OPTION_SET_PTR:
-			return opterror(opt, "takes no value", flags);
-		default:
-			break;
-		}
-	}
-
-	switch (opt->type) {
-	case OPTION_BIT:
-		if (unset)
-			*(int *)opt->value &= ~opt->defval;
-		else
-			*(int *)opt->value |= opt->defval;
-		return 0;
-
-	case OPTION_BOOLEAN:
-		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
-		return 0;
-
-	case OPTION_SET_INT:
-		*(int *)opt->value = unset ? 0 : opt->defval;
-		return 0;
-
-	case OPTION_SET_PTR:
-		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
-		return 0;
-
-	case OPTION_STRING:
-		if (unset)
-			*(const char **)opt->value = NULL;
-		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
-			*(const char **)opt->value = (const char *)opt->defval;
-		else
-			return get_arg(p, opt, flags, (const char **)opt->value);
-		return 0;
-
-	case OPTION_CALLBACK:
-		if (unset)
-			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
-		if (opt->flags & PARSE_OPT_NOARG)
-			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
-			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
-		if (get_arg(p, opt, flags, &arg))
-			return -1;
-		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
-
-	case OPTION_INTEGER:
-		if (unset) {
-			*(int *)opt->value = 0;
-			return 0;
-		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
-			*(int *)opt->value = opt->defval;
-			return 0;
-		}
-		if (get_arg(p, opt, flags, &arg))
-			return -1;
-		*(int *)opt->value = strtol(arg, (char **)&s, 10);
-		if (*s)
-			return opterror(opt, "expects a numerical value", flags);
-		return 0;
-
-	case OPTION_LONG:
-		if (unset) {
-			*(long *)opt->value = 0;
-			return 0;
-		}
-		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
-			*(long *)opt->value = opt->defval;
-			return 0;
-		}
-		if (get_arg(p, opt, flags, &arg))
-			return -1;
-		*(long *)opt->value = strtol(arg, (char **)&s, 10);
-		if (*s)
-			return opterror(opt, "expects a numerical value", flags);
-		return 0;
-
-	default:
-		die("should not happen, someone must be hit on the forehead");
-	}
-}
-
-static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
-{
-	for (; options->type != OPTION_END; options++) {
-		if (options->short_name == *p->opt) {
-			p->opt = p->opt[1] ? p->opt + 1 : NULL;
-			return get_value(p, options, OPT_SHORT);
-		}
-	}
-	return -2;
-}
-
-static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
-                          const struct option *options)
-{
-	const char *arg_end = strchr(arg, '=');
-	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
-	int abbrev_flags = 0, ambiguous_flags = 0;
-
-	if (!arg_end)
-		arg_end = arg + strlen(arg);
-
-	for (; options->type != OPTION_END; options++) {
-		const char *rest;
-		int flags = 0;
-
-		if (!options->long_name)
-			continue;
-
-		rest = skip_prefix(arg, options->long_name);
-		if (options->type == OPTION_ARGUMENT) {
-			if (!rest)
-				continue;
-			if (*rest == '=')
-				return opterror(options, "takes no value", flags);
-			if (*rest)
-				continue;
-			p->out[p->cpidx++] = arg - 2;
-			return 0;
-		}
-		if (!rest) {
-			/* abbreviated? */
-			if (!strncmp(options->long_name, arg, arg_end - arg)) {
-is_abbreviated:
-				if (abbrev_option) {
-					/*
-					 * If this is abbreviated, it is
-					 * ambiguous. So when there is no
-					 * exact match later, we need to
-					 * error out.
-					 */
-					ambiguous_option = abbrev_option;
-					ambiguous_flags = abbrev_flags;
-				}
-				if (!(flags & OPT_UNSET) && *arg_end)
-					p->opt = arg_end + 1;
-				abbrev_option = options;
-				abbrev_flags = flags;
-				continue;
-			}
-			/* negated and abbreviated very much? */
-			if (!prefixcmp("no-", arg)) {
-				flags |= OPT_UNSET;
-				goto is_abbreviated;
-			}
-			/* negated? */
-			if (strncmp(arg, "no-", 3))
-				continue;
-			flags |= OPT_UNSET;
-			rest = skip_prefix(arg + 3, options->long_name);
-			/* abbreviated and negated? */
-			if (!rest && !prefixcmp(options->long_name, arg + 3))
-				goto is_abbreviated;
-			if (!rest)
-				continue;
-		}
-		if (*rest) {
-			if (*rest != '=')
-				continue;
-			p->opt = rest + 1;
-		}
-		return get_value(p, options, flags);
-	}
-
-	if (ambiguous_option)
-		return error("Ambiguous option: %s "
-			"(could be --%s%s or --%s%s)",
-			arg,
-			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
-			ambiguous_option->long_name,
-			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
-			abbrev_option->long_name);
-	if (abbrev_option)
-		return get_value(p, abbrev_option, abbrev_flags);
-	return -2;
-}
-
-static void check_typos(const char *arg, const struct option *options)
-{
-	if (strlen(arg) < 3)
-		return;
-
-	if (!prefixcmp(arg, "no-")) {
-		error ("did you mean `--%s` (with two dashes ?)", arg);
-		exit(129);
-	}
-
-	for (; options->type != OPTION_END; options++) {
-		if (!options->long_name)
-			continue;
-		if (!prefixcmp(options->long_name, arg)) {
-			error ("did you mean `--%s` (with two dashes ?)", arg);
-			exit(129);
-		}
-	}
-}
-
-void parse_options_start(struct parse_opt_ctx_t *ctx,
-			 int argc, const char **argv, int flags)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	ctx->argc = argc - 1;
-	ctx->argv = argv + 1;
-	ctx->out  = argv;
-	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
-	ctx->flags = flags;
-	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
-	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
-		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
-}
-
-static int usage_with_options_internal(const char * const *,
-				       const struct option *, int);
-
-int parse_options_step(struct parse_opt_ctx_t *ctx,
-		       const struct option *options,
-		       const char * const usagestr[])
-{
-	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
-
-	/* we must reset ->opt, unknown short option leave it dangling */
-	ctx->opt = NULL;
-
-	for (; ctx->argc; ctx->argc--, ctx->argv++) {
-		const char *arg = ctx->argv[0];
-
-		if (*arg != '-' || !arg[1]) {
-			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
-				break;
-			ctx->out[ctx->cpidx++] = ctx->argv[0];
-			continue;
-		}
-
-		if (arg[1] != '-') {
-			ctx->opt = arg + 1;
-			if (internal_help && *ctx->opt == 'h')
-				return parse_options_usage(usagestr, options);
-			switch (parse_short_opt(ctx, options)) {
-			case -1:
-				return parse_options_usage(usagestr, options);
-			case -2:
-				goto unknown;
-			}
-			if (ctx->opt)
-				check_typos(arg + 1, options);
-			while (ctx->opt) {
-				if (internal_help && *ctx->opt == 'h')
-					return parse_options_usage(usagestr, options);
-				switch (parse_short_opt(ctx, options)) {
-				case -1:
-					return parse_options_usage(usagestr, options);
-				case -2:
-					/* fake a short option thing to hide the fact that we may have
-					 * started to parse aggregated stuff
-					 *
-					 * This is leaky, too bad.
-					 */
-					ctx->argv[0] = strdup(ctx->opt - 1);
-					*(char *)ctx->argv[0] = '-';
-					goto unknown;
-				}
-			}
-			continue;
-		}
-
-		if (!arg[2]) { /* "--" */
-			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
-				ctx->argc--;
-				ctx->argv++;
-			}
-			break;
-		}
-
-		if (internal_help && !strcmp(arg + 2, "help-all"))
-			return usage_with_options_internal(usagestr, options, 1);
-		if (internal_help && !strcmp(arg + 2, "help"))
-			return parse_options_usage(usagestr, options);
-		switch (parse_long_opt(ctx, arg + 2, options)) {
-		case -1:
-			return parse_options_usage(usagestr, options);
-		case -2:
-			goto unknown;
-		}
-		continue;
-unknown:
-		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
-			return PARSE_OPT_UNKNOWN;
-		ctx->out[ctx->cpidx++] = ctx->argv[0];
-		ctx->opt = NULL;
-	}
-	return PARSE_OPT_DONE;
-}
-
-int parse_options_end(struct parse_opt_ctx_t *ctx)
-{
-	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
-	ctx->out[ctx->cpidx + ctx->argc] = NULL;
-	return ctx->cpidx + ctx->argc;
-}
-
-int parse_options(int argc, const char **argv, const struct option *options,
-		  const char * const usagestr[], int flags)
-{
-	struct parse_opt_ctx_t ctx;
-
-	parse_options_start(&ctx, argc, argv, flags);
-	switch (parse_options_step(&ctx, options, usagestr)) {
-	case PARSE_OPT_HELP:
-		exit(129);
-	case PARSE_OPT_DONE:
-		break;
-	default: /* PARSE_OPT_UNKNOWN */
-		if (ctx.argv[0][1] == '-') {
-			error("unknown option `%s'", ctx.argv[0] + 2);
-		} else {
-			error("unknown switch `%c'", *ctx.opt);
-		}
-		usage_with_options(usagestr, options);
-	}
-
-	return parse_options_end(&ctx);
-}
-
-#define USAGE_OPTS_WIDTH 24
-#define USAGE_GAP         2
-
-int usage_with_options_internal(const char * const *usagestr,
-				const struct option *opts, int full)
-{
-	if (!usagestr)
-		return PARSE_OPT_HELP;
-
-	fprintf(stderr, "\n usage: %s\n", *usagestr++);
-	while (*usagestr && **usagestr)
-		fprintf(stderr, "    or: %s\n", *usagestr++);
-	while (*usagestr) {
-		fprintf(stderr, "%s%s\n",
-				**usagestr ? "    " : "",
-				*usagestr);
-		usagestr++;
-	}
-
-	if (opts->type != OPTION_GROUP)
-		fputc('\n', stderr);
-
-	for (; opts->type != OPTION_END; opts++) {
-		size_t pos;
-		int pad;
-
-		if (opts->type == OPTION_GROUP) {
-			fputc('\n', stderr);
-			if (*opts->help)
-				fprintf(stderr, "%s\n", opts->help);
-			continue;
-		}
-		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
-			continue;
-
-		pos = fprintf(stderr, "    ");
-		if (opts->short_name)
-			pos += fprintf(stderr, "-%c", opts->short_name);
-		if (opts->long_name && opts->short_name)
-			pos += fprintf(stderr, ", ");
-		if (opts->long_name)
-			pos += fprintf(stderr, "--%s", opts->long_name);
-
-		switch (opts->type) {
-		case OPTION_ARGUMENT:
-			break;
-		case OPTION_INTEGER:
-			if (opts->flags & PARSE_OPT_OPTARG)
-				if (opts->long_name)
-					pos += fprintf(stderr, "[=<n>]");
-				else
-					pos += fprintf(stderr, "[<n>]");
-			else
-				pos += fprintf(stderr, " <n>");
-			break;
-		case OPTION_CALLBACK:
-			if (opts->flags & PARSE_OPT_NOARG)
-				break;
-			/* FALLTHROUGH */
-		case OPTION_STRING:
-			if (opts->argh) {
-				if (opts->flags & PARSE_OPT_OPTARG)
-					if (opts->long_name)
-						pos += fprintf(stderr, "[=<%s>]", opts->argh);
-					else
-						pos += fprintf(stderr, "[<%s>]", opts->argh);
-				else
-					pos += fprintf(stderr, " <%s>", opts->argh);
-			} else {
-				if (opts->flags & PARSE_OPT_OPTARG)
-					if (opts->long_name)
-						pos += fprintf(stderr, "[=...]");
-					else
-						pos += fprintf(stderr, "[...]");
-				else
-					pos += fprintf(stderr, " ...");
-			}
-			break;
-		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
-			break;
-		}
-
-		if (pos <= USAGE_OPTS_WIDTH)
-			pad = USAGE_OPTS_WIDTH - pos;
-		else {
-			fputc('\n', stderr);
-			pad = USAGE_OPTS_WIDTH;
-		}
-		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
-	}
-	fputc('\n', stderr);
-
-	return PARSE_OPT_HELP;
-}
-
-void usage_with_options(const char * const *usagestr,
-			const struct option *opts)
-{
-	usage_with_options_internal(usagestr, opts, 0);
-	exit(129);
-}
-
-int parse_options_usage(const char * const *usagestr,
-			const struct option *opts)
-{
-	return usage_with_options_internal(usagestr, opts, 0);
-}
-
-
-int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
-			   int unset)
-{
-	int *target = opt->value;
-
-	if (unset)
-		/* --no-quiet, --no-verbose */
-		*target = 0;
-	else if (opt->short_name == 'v') {
-		if (*target >= 0)
-			(*target)++;
-		else
-			*target = 1;
-	} else {
-		if (*target <= 0)
-			(*target)--;
-		else
-			*target = -1;
-	}
-	return 0;
-}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
deleted file mode 100644
index a1039a6..0000000
--- a/Documentation/perf_counter/util/parse-options.h
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef PARSE_OPTIONS_H
-#define PARSE_OPTIONS_H
-
-enum parse_opt_type {
-	/* special types */
-	OPTION_END,
-	OPTION_ARGUMENT,
-	OPTION_GROUP,
-	/* options with no arguments */
-	OPTION_BIT,
-	OPTION_BOOLEAN, /* _INCR would have been a better name */
-	OPTION_SET_INT,
-	OPTION_SET_PTR,
-	/* options with arguments (usually) */
-	OPTION_STRING,
-	OPTION_INTEGER,
-	OPTION_LONG,
-	OPTION_CALLBACK,
-};
-
-enum parse_opt_flags {
-	PARSE_OPT_KEEP_DASHDASH = 1,
-	PARSE_OPT_STOP_AT_NON_OPTION = 2,
-	PARSE_OPT_KEEP_ARGV0 = 4,
-	PARSE_OPT_KEEP_UNKNOWN = 8,
-	PARSE_OPT_NO_INTERNAL_HELP = 16,
-};
-
-enum parse_opt_option_flags {
-	PARSE_OPT_OPTARG  = 1,
-	PARSE_OPT_NOARG   = 2,
-	PARSE_OPT_NONEG   = 4,
-	PARSE_OPT_HIDDEN  = 8,
-	PARSE_OPT_LASTARG_DEFAULT = 16,
-};
-
-struct option;
-typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
-
-/*
- * `type`::
- *   holds the type of the option, you must have an OPTION_END last in your
- *   array.
- *
- * `short_name`::
- *   the character to use as a short option name, '\0' if none.
- *
- * `long_name`::
- *   the long option name, without the leading dashes, NULL if none.
- *
- * `value`::
- *   stores pointers to the values to be filled.
- *
- * `argh`::
- *   token to explain the kind of argument this option wants. Keep it
- *   homogenous across the repository.
- *
- * `help`::
- *   the short help associated to what the option does.
- *   Must never be NULL (except for OPTION_END).
- *   OPTION_GROUP uses this pointer to store the group header.
- *
- * `flags`::
- *   mask of parse_opt_option_flags.
- *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
- *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
- *   PARSE_OPT_NONEG: says that this option cannot be negated
- *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
- *                    the long one.
- *
- * `callback`::
- *   pointer to the callback to use for OPTION_CALLBACK.
- *
- * `defval`::
- *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
- *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
- *   the value when met.
- *   CALLBACKS can use it like they want.
- */
-struct option {
-	enum parse_opt_type type;
-	int short_name;
-	const char *long_name;
-	void *value;
-	const char *argh;
-	const char *help;
-
-	int flags;
-	parse_opt_cb *callback;
-	intptr_t defval;
-};
-
-#define OPT_END()                   { OPTION_END }
-#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
-#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
-#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
-#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
-#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
-#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
-#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
-#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
-#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
-#define OPT_DATE(s, l, v, h) \
-	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
-	  parse_opt_approxidate_cb }
-#define OPT_CALLBACK(s, l, v, a, h, f) \
-	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
-
-/* parse_options() will filter out the processed options and leave the
- * non-option argments in argv[].
- * Returns the number of arguments left in argv[].
- */
-extern int parse_options(int argc, const char **argv,
-                         const struct option *options,
-                         const char * const usagestr[], int flags);
-
-extern NORETURN void usage_with_options(const char * const *usagestr,
-                                        const struct option *options);
-
-/*----- incremantal advanced APIs -----*/
-
-enum {
-	PARSE_OPT_HELP = -1,
-	PARSE_OPT_DONE,
-	PARSE_OPT_UNKNOWN,
-};
-
-/*
- * It's okay for the caller to consume argv/argc in the usual way.
- * Other fields of that structure are private to parse-options and should not
- * be modified in any way.
- */
-struct parse_opt_ctx_t {
-	const char **argv;
-	const char **out;
-	int argc, cpidx;
-	const char *opt;
-	int flags;
-};
-
-extern int parse_options_usage(const char * const *usagestr,
-			       const struct option *opts);
-
-extern void parse_options_start(struct parse_opt_ctx_t *ctx,
-				int argc, const char **argv, int flags);
-
-extern int parse_options_step(struct parse_opt_ctx_t *ctx,
-			      const struct option *options,
-			      const char * const usagestr[]);
-
-extern int parse_options_end(struct parse_opt_ctx_t *ctx);
-
-
-/*----- some often used options -----*/
-extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
-extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
-extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
-
-#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
-#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
-#define OPT__VERBOSITY(var) \
-	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
-	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
-	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
-	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
-#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
-#define OPT__ABBREV(var)  \
-	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
-	  "use <n> digits to display SHA-1s", \
-	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
-
-extern const char *parse_options_fix_filename(const char *prefix, const char *file);
-
-#endif
diff --git a/Documentation/perf_counter/util/path.c b/Documentation/perf_counter/util/path.c
deleted file mode 100644
index a501a40..0000000
--- a/Documentation/perf_counter/util/path.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * I'm tired of doing "vsnprintf()" etc just to open a
- * file, so here's a "return static buffer with printf"
- * interface for paths.
- *
- * It's obviously not thread-safe. Sue me. But it's quite
- * useful for doing things like
- *
- *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
- *
- * which is what it's designed for.
- */
-#include "cache.h"
-
-static char bad_path[] = "/bad-path/";
-/*
- * Two hacks:
- */
-
-static char *get_perf_dir(void)
-{
-	return ".";
-}
-
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
-	size_t ret = strlen(src);
-
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		memcpy(dest, src, len);
-		dest[len] = '\0';
-	}
-	return ret;
-}
-
-
-static char *get_pathname(void)
-{
-	static char pathname_array[4][PATH_MAX];
-	static int index;
-	return pathname_array[3 & ++index];
-}
-
-static char *cleanup_path(char *path)
-{
-	/* Clean it up */
-	if (!memcmp(path, "./", 2)) {
-		path += 2;
-		while (*path == '/')
-			path++;
-	}
-	return path;
-}
-
-char *mksnpath(char *buf, size_t n, const char *fmt, ...)
-{
-	va_list args;
-	unsigned len;
-
-	va_start(args, fmt);
-	len = vsnprintf(buf, n, fmt, args);
-	va_end(args);
-	if (len >= n) {
-		strlcpy(buf, bad_path, n);
-		return buf;
-	}
-	return cleanup_path(buf);
-}
-
-static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
-{
-	const char *perf_dir = get_perf_dir();
-	size_t len;
-
-	len = strlen(perf_dir);
-	if (n < len + 1)
-		goto bad;
-	memcpy(buf, perf_dir, len);
-	if (len && !is_dir_sep(perf_dir[len-1]))
-		buf[len++] = '/';
-	len += vsnprintf(buf + len, n - len, fmt, args);
-	if (len >= n)
-		goto bad;
-	return cleanup_path(buf);
-bad:
-	strlcpy(buf, bad_path, n);
-	return buf;
-}
-
-char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
-{
-	va_list args;
-	va_start(args, fmt);
-	(void)perf_vsnpath(buf, n, fmt, args);
-	va_end(args);
-	return buf;
-}
-
-char *perf_pathdup(const char *fmt, ...)
-{
-	char path[PATH_MAX];
-	va_list args;
-	va_start(args, fmt);
-	(void)perf_vsnpath(path, sizeof(path), fmt, args);
-	va_end(args);
-	return xstrdup(path);
-}
-
-char *mkpath(const char *fmt, ...)
-{
-	va_list args;
-	unsigned len;
-	char *pathname = get_pathname();
-
-	va_start(args, fmt);
-	len = vsnprintf(pathname, PATH_MAX, fmt, args);
-	va_end(args);
-	if (len >= PATH_MAX)
-		return bad_path;
-	return cleanup_path(pathname);
-}
-
-char *perf_path(const char *fmt, ...)
-{
-	const char *perf_dir = get_perf_dir();
-	char *pathname = get_pathname();
-	va_list args;
-	unsigned len;
-
-	len = strlen(perf_dir);
-	if (len > PATH_MAX-100)
-		return bad_path;
-	memcpy(pathname, perf_dir, len);
-	if (len && perf_dir[len-1] != '/')
-		pathname[len++] = '/';
-	va_start(args, fmt);
-	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
-	va_end(args);
-	if (len >= PATH_MAX)
-		return bad_path;
-	return cleanup_path(pathname);
-}
-
-
-/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
-int perf_mkstemp(char *path, size_t len, const char *template)
-{
-	const char *tmp;
-	size_t n;
-
-	tmp = getenv("TMPDIR");
-	if (!tmp)
-		tmp = "/tmp";
-	n = snprintf(path, len, "%s/%s", tmp, template);
-	if (len <= n) {
-		errno = ENAMETOOLONG;
-		return -1;
-	}
-	return mkstemp(path);
-}
-
-
-const char *make_relative_path(const char *abs, const char *base)
-{
-	static char buf[PATH_MAX + 1];
-	int baselen;
-	if (!base)
-		return abs;
-	baselen = strlen(base);
-	if (prefixcmp(abs, base))
-		return abs;
-	if (abs[baselen] == '/')
-		baselen++;
-	else if (base[baselen - 1] != '/')
-		return abs;
-	strcpy(buf, abs + baselen);
-	return buf;
-}
-
-/*
- * It is okay if dst == src, but they should not overlap otherwise.
- *
- * Performs the following normalizations on src, storing the result in dst:
- * - Ensures that components are separated by '/' (Windows only)
- * - Squashes sequences of '/'.
- * - Removes "." components.
- * - Removes ".." components, and the components the precede them.
- * Returns failure (non-zero) if a ".." component appears as first path
- * component anytime during the normalization. Otherwise, returns success (0).
- *
- * Note that this function is purely textual.  It does not follow symlinks,
- * verify the existence of the path, or make any system calls.
- */
-int normalize_path_copy(char *dst, const char *src)
-{
-	char *dst0;
-
-	if (has_dos_drive_prefix(src)) {
-		*dst++ = *src++;
-		*dst++ = *src++;
-	}
-	dst0 = dst;
-
-	if (is_dir_sep(*src)) {
-		*dst++ = '/';
-		while (is_dir_sep(*src))
-			src++;
-	}
-
-	for (;;) {
-		char c = *src;
-
-		/*
-		 * A path component that begins with . could be
-		 * special:
-		 * (1) "." and ends   -- ignore and terminate.
-		 * (2) "./"           -- ignore them, eat slash and continue.
-		 * (3) ".." and ends  -- strip one and terminate.
-		 * (4) "../"          -- strip one, eat slash and continue.
-		 */
-		if (c == '.') {
-			if (!src[1]) {
-				/* (1) */
-				src++;
-			} else if (is_dir_sep(src[1])) {
-				/* (2) */
-				src += 2;
-				while (is_dir_sep(*src))
-					src++;
-				continue;
-			} else if (src[1] == '.') {
-				if (!src[2]) {
-					/* (3) */
-					src += 2;
-					goto up_one;
-				} else if (is_dir_sep(src[2])) {
-					/* (4) */
-					src += 3;
-					while (is_dir_sep(*src))
-						src++;
-					goto up_one;
-				}
-			}
-		}
-
-		/* copy up to the next '/', and eat all '/' */
-		while ((c = *src++) != '\0' && !is_dir_sep(c))
-			*dst++ = c;
-		if (is_dir_sep(c)) {
-			*dst++ = '/';
-			while (is_dir_sep(c))
-				c = *src++;
-			src--;
-		} else if (!c)
-			break;
-		continue;
-
-	up_one:
-		/*
-		 * dst0..dst is prefix portion, and dst[-1] is '/';
-		 * go up one level.
-		 */
-		dst--;	/* go to trailing '/' */
-		if (dst <= dst0)
-			return -1;
-		/* Windows: dst[-1] cannot be backslash anymore */
-		while (dst0 < dst && dst[-1] != '/')
-			dst--;
-	}
-	*dst = '\0';
-	return 0;
-}
-
-/*
- * path = Canonical absolute path
- * prefix_list = Colon-separated list of absolute paths
- *
- * Determines, for each path in prefix_list, whether the "prefix" really
- * is an ancestor directory of path.  Returns the length of the longest
- * ancestor directory, excluding any trailing slashes, or -1 if no prefix
- * is an ancestor.  (Note that this means 0 is returned if prefix_list is
- * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
- * are not considered to be their own ancestors.  path must be in a
- * canonical form: empty components, or "." or ".." components are not
- * allowed.  prefix_list may be null, which is like "".
- */
-int longest_ancestor_length(const char *path, const char *prefix_list)
-{
-	char buf[PATH_MAX+1];
-	const char *ceil, *colon;
-	int len, max_len = -1;
-
-	if (prefix_list == NULL || !strcmp(path, "/"))
-		return -1;
-
-	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
-		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
-		len = colon - ceil;
-		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
-			continue;
-		strlcpy(buf, ceil, len+1);
-		if (normalize_path_copy(buf, buf) < 0)
-			continue;
-		len = strlen(buf);
-		if (len > 0 && buf[len-1] == '/')
-			buf[--len] = '\0';
-
-		if (!strncmp(path, buf, len) &&
-		    path[len] == '/' &&
-		    len > max_len) {
-			max_len = len;
-		}
-	}
-
-	return max_len;
-}
-
-/* strip arbitrary amount of directory separators at end of path */
-static inline int chomp_trailing_dir_sep(const char *path, int len)
-{
-	while (len && is_dir_sep(path[len - 1]))
-		len--;
-	return len;
-}
-
-/*
- * If path ends with suffix (complete path components), returns the
- * part before suffix (sans trailing directory separators).
- * Otherwise returns NULL.
- */
-char *strip_path_suffix(const char *path, const char *suffix)
-{
-	int path_len = strlen(path), suffix_len = strlen(suffix);
-
-	while (suffix_len) {
-		if (!path_len)
-			return NULL;
-
-		if (is_dir_sep(path[path_len - 1])) {
-			if (!is_dir_sep(suffix[suffix_len - 1]))
-				return NULL;
-			path_len = chomp_trailing_dir_sep(path, path_len);
-			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
-		}
-		else if (path[--path_len] != suffix[--suffix_len])
-			return NULL;
-	}
-
-	if (path_len && !is_dir_sep(path[path_len - 1]))
-		return NULL;
-	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
-}
diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
deleted file mode 100644
index f18c521..0000000
--- a/Documentation/perf_counter/util/quote.c
+++ /dev/null
@@ -1,481 +0,0 @@
-#include "cache.h"
-#include "quote.h"
-
-int quote_path_fully = 1;
-
-/* Help to copy the thing properly quoted for the shell safety.
- * any single quote is replaced with '\'', any exclamation point
- * is replaced with '\!', and the whole thing is enclosed in a
- *
- * E.g.
- *  original     sq_quote     result
- *  name     ==> name      ==> 'name'
- *  a b      ==> a b       ==> 'a b'
- *  a'b      ==> a'\''b    ==> 'a'\''b'
- *  a!b      ==> a'\!'b    ==> 'a'\!'b'
- */
-static inline int need_bs_quote(char c)
-{
-	return (c == '\'' || c == '!');
-}
-
-void sq_quote_buf(struct strbuf *dst, const char *src)
-{
-	char *to_free = NULL;
-
-	if (dst->buf == src)
-		to_free = strbuf_detach(dst, NULL);
-
-	strbuf_addch(dst, '\'');
-	while (*src) {
-		size_t len = strcspn(src, "'!");
-		strbuf_add(dst, src, len);
-		src += len;
-		while (need_bs_quote(*src)) {
-			strbuf_addstr(dst, "'\\");
-			strbuf_addch(dst, *src++);
-			strbuf_addch(dst, '\'');
-		}
-	}
-	strbuf_addch(dst, '\'');
-	free(to_free);
-}
-
-void sq_quote_print(FILE *stream, const char *src)
-{
-	char c;
-
-	fputc('\'', stream);
-	while ((c = *src++)) {
-		if (need_bs_quote(c)) {
-			fputs("'\\", stream);
-			fputc(c, stream);
-			fputc('\'', stream);
-		} else {
-			fputc(c, stream);
-		}
-	}
-	fputc('\'', stream);
-}
-
-void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
-{
-	int i;
-
-	/* Copy into destination buffer. */
-	strbuf_grow(dst, 255);
-	for (i = 0; argv[i]; ++i) {
-		strbuf_addch(dst, ' ');
-		sq_quote_buf(dst, argv[i]);
-		if (maxlen && dst->len > maxlen)
-			die("Too many or long arguments");
-	}
-}
-
-char *sq_dequote_step(char *arg, char **next)
-{
-	char *dst = arg;
-	char *src = arg;
-	char c;
-
-	if (*src != '\'')
-		return NULL;
-	for (;;) {
-		c = *++src;
-		if (!c)
-			return NULL;
-		if (c != '\'') {
-			*dst++ = c;
-			continue;
-		}
-		/* We stepped out of sq */
-		switch (*++src) {
-		case '\0':
-			*dst = 0;
-			if (next)
-				*next = NULL;
-			return arg;
-		case '\\':
-			c = *++src;
-			if (need_bs_quote(c) && *++src == '\'') {
-				*dst++ = c;
-				continue;
-			}
-		/* Fallthrough */
-		default:
-			if (!next || !isspace(*src))
-				return NULL;
-			do {
-				c = *++src;
-			} while (isspace(c));
-			*dst = 0;
-			*next = src;
-			return arg;
-		}
-	}
-}
-
-char *sq_dequote(char *arg)
-{
-	return sq_dequote_step(arg, NULL);
-}
-
-int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
-{
-	char *next = arg;
-
-	if (!*arg)
-		return 0;
-	do {
-		char *dequoted = sq_dequote_step(next, &next);
-		if (!dequoted)
-			return -1;
-		ALLOC_GROW(*argv, *nr + 1, *alloc);
-		(*argv)[(*nr)++] = dequoted;
-	} while (next);
-
-	return 0;
-}
-
-/* 1 means: quote as octal
- * 0 means: quote as octal if (quote_path_fully)
- * -1 means: never quote
- * c: quote as "\\c"
- */
-#define X8(x)   x, x, x, x, x, x, x, x
-#define X16(x)  X8(x), X8(x)
-static signed char const sq_lookup[256] = {
-	/*           0    1    2    3    4    5    6    7 */
-	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
-	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
-	/* 0x10 */ X16(1),
-	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
-	/* 0x28 */ X16(-1), X16(-1), X16(-1),
-	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
-	/* 0x60 */ X16(-1), X8(-1),
-	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
-	/* 0x80 */ /* set to 0 */
-};
-
-static inline int sq_must_quote(char c)
-{
-	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
-}
-
-/* returns the longest prefix not needing a quote up to maxlen if positive.
-   This stops at the first \0 because it's marked as a character needing an
-   escape */
-static size_t next_quote_pos(const char *s, ssize_t maxlen)
-{
-	size_t len;
-	if (maxlen < 0) {
-		for (len = 0; !sq_must_quote(s[len]); len++);
-	} else {
-		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
-	}
-	return len;
-}
-
-/*
- * C-style name quoting.
- *
- * (1) if sb and fp are both NULL, inspect the input name and counts the
- *     number of bytes that are needed to hold c_style quoted version of name,
- *     counting the double quotes around it but not terminating NUL, and
- *     returns it.
- *     However, if name does not need c_style quoting, it returns 0.
- *
- * (2) if sb or fp are not NULL, it emits the c_style quoted version
- *     of name, enclosed with double quotes if asked and needed only.
- *     Return value is the same as in (1).
- */
-static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
-                                    struct strbuf *sb, FILE *fp, int no_dq)
-{
-#undef EMIT
-#define EMIT(c)                                 \
-	do {                                        \
-		if (sb) strbuf_addch(sb, (c));          \
-		if (fp) fputc((c), fp);                 \
-		count++;                                \
-	} while (0)
-#define EMITBUF(s, l)                           \
-	do {                                        \
-		int __ret;				\
-		if (sb) strbuf_add(sb, (s), (l));       \
-		if (fp) __ret = fwrite((s), (l), 1, fp);        \
-		count += (l);                           \
-	} while (0)
-
-	size_t len, count = 0;
-	const char *p = name;
-
-	for (;;) {
-		int ch;
-
-		len = next_quote_pos(p, maxlen);
-		if (len == maxlen || !p[len])
-			break;
-
-		if (!no_dq && p == name)
-			EMIT('"');
-
-		EMITBUF(p, len);
-		EMIT('\\');
-		p += len;
-		ch = (unsigned char)*p++;
-		if (sq_lookup[ch] >= ' ') {
-			EMIT(sq_lookup[ch]);
-		} else {
-			EMIT(((ch >> 6) & 03) + '0');
-			EMIT(((ch >> 3) & 07) + '0');
-			EMIT(((ch >> 0) & 07) + '0');
-		}
-	}
-
-	EMITBUF(p, len);
-	if (p == name)   /* no ending quote needed */
-		return 0;
-
-	if (!no_dq)
-		EMIT('"');
-	return count;
-}
-
-size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
-{
-	return quote_c_style_counted(name, -1, sb, fp, nodq);
-}
-
-void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
-{
-	if (quote_c_style(prefix, NULL, NULL, 0) ||
-	    quote_c_style(path, NULL, NULL, 0)) {
-		if (!nodq)
-			strbuf_addch(sb, '"');
-		quote_c_style(prefix, sb, NULL, 1);
-		quote_c_style(path, sb, NULL, 1);
-		if (!nodq)
-			strbuf_addch(sb, '"');
-	} else {
-		strbuf_addstr(sb, prefix);
-		strbuf_addstr(sb, path);
-	}
-}
-
-void write_name_quoted(const char *name, FILE *fp, int terminator)
-{
-	if (terminator) {
-		quote_c_style(name, NULL, fp, 0);
-	} else {
-		fputs(name, fp);
-	}
-	fputc(terminator, fp);
-}
-
-extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
-                                 const char *name, FILE *fp, int terminator)
-{
-	int needquote = 0;
-
-	if (terminator) {
-		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
-			|| name[next_quote_pos(name, -1)];
-	}
-	if (needquote) {
-		fputc('"', fp);
-		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
-		quote_c_style(name, NULL, fp, 1);
-		fputc('"', fp);
-	} else {
-		int ret;
-
-		ret = fwrite(pfx, pfxlen, 1, fp);
-		fputs(name, fp);
-	}
-	fputc(terminator, fp);
-}
-
-/* quote path as relative to the given prefix */
-char *quote_path_relative(const char *in, int len,
-			  struct strbuf *out, const char *prefix)
-{
-	int needquote;
-
-	if (len < 0)
-		len = strlen(in);
-
-	/* "../" prefix itself does not need quoting, but "in" might. */
-	needquote = next_quote_pos(in, len) < len;
-	strbuf_setlen(out, 0);
-	strbuf_grow(out, len);
-
-	if (needquote)
-		strbuf_addch(out, '"');
-	if (prefix) {
-		int off = 0;
-		while (prefix[off] && off < len && prefix[off] == in[off])
-			if (prefix[off] == '/') {
-				prefix += off + 1;
-				in += off + 1;
-				len -= off + 1;
-				off = 0;
-			} else
-				off++;
-
-		for (; *prefix; prefix++)
-			if (*prefix == '/')
-				strbuf_addstr(out, "../");
-	}
-
-	quote_c_style_counted (in, len, out, NULL, 1);
-
-	if (needquote)
-		strbuf_addch(out, '"');
-	if (!out->len)
-		strbuf_addstr(out, "./");
-
-	return out->buf;
-}
-
-/*
- * C-style name unquoting.
- *
- * Quoted should point at the opening double quote.
- * + Returns 0 if it was able to unquote the string properly, and appends the
- *   result in the strbuf `sb'.
- * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
- *   that this function will allocate memory in the strbuf, so calling
- *   strbuf_release is mandatory whichever result unquote_c_style returns.
- *
- * Updates endp pointer to point at one past the ending double quote if given.
- */
-int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
-{
-	size_t oldlen = sb->len, len;
-	int ch, ac;
-
-	if (*quoted++ != '"')
-		return -1;
-
-	for (;;) {
-		len = strcspn(quoted, "\"\\");
-		strbuf_add(sb, quoted, len);
-		quoted += len;
-
-		switch (*quoted++) {
-		  case '"':
-			if (endp)
-				*endp = quoted;
-			return 0;
-		  case '\\':
-			break;
-		  default:
-			goto error;
-		}
-
-		switch ((ch = *quoted++)) {
-		case 'a': ch = '\a'; break;
-		case 'b': ch = '\b'; break;
-		case 'f': ch = '\f'; break;
-		case 'n': ch = '\n'; break;
-		case 'r': ch = '\r'; break;
-		case 't': ch = '\t'; break;
-		case 'v': ch = '\v'; break;
-
-		case '\\': case '"':
-			break; /* verbatim */
-
-		/* octal values with first digit over 4 overflow */
-		case '0': case '1': case '2': case '3':
-					ac = ((ch - '0') << 6);
-			if ((ch = *quoted++) < '0' || '7' < ch)
-				goto error;
-					ac |= ((ch - '0') << 3);
-			if ((ch = *quoted++) < '0' || '7' < ch)
-				goto error;
-					ac |= (ch - '0');
-					ch = ac;
-					break;
-				default:
-			goto error;
-			}
-		strbuf_addch(sb, ch);
-		}
-
-  error:
-	strbuf_setlen(sb, oldlen);
-	return -1;
-}
-
-/* quoting as a string literal for other languages */
-
-void perl_quote_print(FILE *stream, const char *src)
-{
-	const char sq = '\'';
-	const char bq = '\\';
-	char c;
-
-	fputc(sq, stream);
-	while ((c = *src++)) {
-		if (c == sq || c == bq)
-			fputc(bq, stream);
-		fputc(c, stream);
-	}
-	fputc(sq, stream);
-}
-
-void python_quote_print(FILE *stream, const char *src)
-{
-	const char sq = '\'';
-	const char bq = '\\';
-	const char nl = '\n';
-	char c;
-
-	fputc(sq, stream);
-	while ((c = *src++)) {
-		if (c == nl) {
-			fputc(bq, stream);
-			fputc('n', stream);
-			continue;
-		}
-		if (c == sq || c == bq)
-			fputc(bq, stream);
-		fputc(c, stream);
-	}
-	fputc(sq, stream);
-}
-
-void tcl_quote_print(FILE *stream, const char *src)
-{
-	char c;
-
-	fputc('"', stream);
-	while ((c = *src++)) {
-		switch (c) {
-		case '[': case ']':
-		case '{': case '}':
-		case '$': case '\\': case '"':
-			fputc('\\', stream);
-		default:
-			fputc(c, stream);
-			break;
-		case '\f':
-			fputs("\\f", stream);
-			break;
-		case '\r':
-			fputs("\\r", stream);
-			break;
-		case '\n':
-			fputs("\\n", stream);
-			break;
-		case '\t':
-			fputs("\\t", stream);
-			break;
-		case '\v':
-			fputs("\\v", stream);
-			break;
-		}
-	}
-	fputc('"', stream);
-}
diff --git a/Documentation/perf_counter/util/quote.h b/Documentation/perf_counter/util/quote.h
deleted file mode 100644
index 5dfad89..0000000
--- a/Documentation/perf_counter/util/quote.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef QUOTE_H
-#define QUOTE_H
-
-#include <stddef.h>
-#include <stdio.h>
-
-/* Help to copy the thing properly quoted for the shell safety.
- * any single quote is replaced with '\'', any exclamation point
- * is replaced with '\!', and the whole thing is enclosed in a
- * single quote pair.
- *
- * For example, if you are passing the result to system() as an
- * argument:
- *
- * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
- *
- * would be appropriate.  If the system() is going to call ssh to
- * run the command on the other side:
- *
- * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
- * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd));
- *
- * Note that the above examples leak memory!  Remember to free result from
- * sq_quote() in a real application.
- *
- * sq_quote_buf() writes to an existing buffer of specified size; it
- * will return the number of characters that would have been written
- * excluding the final null regardless of the buffer size.
- */
-
-extern void sq_quote_print(FILE *stream, const char *src);
-
-extern void sq_quote_buf(struct strbuf *, const char *src);
-extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
-
-/* This unwraps what sq_quote() produces in place, but returns
- * NULL if the input does not look like what sq_quote would have
- * produced.
- */
-extern char *sq_dequote(char *);
-
-/*
- * Same as the above, but can be used to unwrap many arguments in the
- * same string separated by space. "next" is changed to point to the
- * next argument that should be passed as first parameter. When there
- * is no more argument to be dequoted, "next" is updated to point to NULL.
- */
-extern char *sq_dequote_step(char *arg, char **next);
-extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
-
-extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
-extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
-extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
-
-extern void write_name_quoted(const char *name, FILE *, int terminator);
-extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
-                                 const char *name, FILE *, int terminator);
-
-/* quote path as relative to the given prefix */
-char *quote_path_relative(const char *in, int len,
-			  struct strbuf *out, const char *prefix);
-
-/* quoting as a string literal for other languages */
-extern void perl_quote_print(FILE *stream, const char *src);
-extern void python_quote_print(FILE *stream, const char *src);
-extern void tcl_quote_print(FILE *stream, const char *src);
-
-#endif
diff --git a/Documentation/perf_counter/util/rbtree.c b/Documentation/perf_counter/util/rbtree.c
deleted file mode 100644
index b15ba9c..0000000
--- a/Documentation/perf_counter/util/rbtree.c
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-  linux/lib/rbtree.c
-*/
-
-#include "rbtree.h"
-
-static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *right = node->rb_right;
-	struct rb_node *parent = rb_parent(node);
-
-	if ((node->rb_right = right->rb_left))
-		rb_set_parent(right->rb_left, node);
-	right->rb_left = node;
-
-	rb_set_parent(right, parent);
-
-	if (parent)
-	{
-		if (node == parent->rb_left)
-			parent->rb_left = right;
-		else
-			parent->rb_right = right;
-	}
-	else
-		root->rb_node = right;
-	rb_set_parent(node, right);
-}
-
-static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *left = node->rb_left;
-	struct rb_node *parent = rb_parent(node);
-
-	if ((node->rb_left = left->rb_right))
-		rb_set_parent(left->rb_right, node);
-	left->rb_right = node;
-
-	rb_set_parent(left, parent);
-
-	if (parent)
-	{
-		if (node == parent->rb_right)
-			parent->rb_right = left;
-		else
-			parent->rb_left = left;
-	}
-	else
-		root->rb_node = left;
-	rb_set_parent(node, left);
-}
-
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *parent, *gparent;
-
-	while ((parent = rb_parent(node)) && rb_is_red(parent))
-	{
-		gparent = rb_parent(parent);
-
-		if (parent == gparent->rb_left)
-		{
-			{
-				register struct rb_node *uncle = gparent->rb_right;
-				if (uncle && rb_is_red(uncle))
-				{
-					rb_set_black(uncle);
-					rb_set_black(parent);
-					rb_set_red(gparent);
-					node = gparent;
-					continue;
-				}
-			}
-
-			if (parent->rb_right == node)
-			{
-				register struct rb_node *tmp;
-				__rb_rotate_left(parent, root);
-				tmp = parent;
-				parent = node;
-				node = tmp;
-			}
-
-			rb_set_black(parent);
-			rb_set_red(gparent);
-			__rb_rotate_right(gparent, root);
-		} else {
-			{
-				register struct rb_node *uncle = gparent->rb_left;
-				if (uncle && rb_is_red(uncle))
-				{
-					rb_set_black(uncle);
-					rb_set_black(parent);
-					rb_set_red(gparent);
-					node = gparent;
-					continue;
-				}
-			}
-
-			if (parent->rb_left == node)
-			{
-				register struct rb_node *tmp;
-				__rb_rotate_right(parent, root);
-				tmp = parent;
-				parent = node;
-				node = tmp;
-			}
-
-			rb_set_black(parent);
-			rb_set_red(gparent);
-			__rb_rotate_left(gparent, root);
-		}
-	}
-
-	rb_set_black(root->rb_node);
-}
-
-static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
-			     struct rb_root *root)
-{
-	struct rb_node *other;
-
-	while ((!node || rb_is_black(node)) && node != root->rb_node)
-	{
-		if (parent->rb_left == node)
-		{
-			other = parent->rb_right;
-			if (rb_is_red(other))
-			{
-				rb_set_black(other);
-				rb_set_red(parent);
-				__rb_rotate_left(parent, root);
-				other = parent->rb_right;
-			}
-			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-			    (!other->rb_right || rb_is_black(other->rb_right)))
-			{
-				rb_set_red(other);
-				node = parent;
-				parent = rb_parent(node);
-			}
-			else
-			{
-				if (!other->rb_right || rb_is_black(other->rb_right))
-				{
-					rb_set_black(other->rb_left);
-					rb_set_red(other);
-					__rb_rotate_right(other, root);
-					other = parent->rb_right;
-				}
-				rb_set_color(other, rb_color(parent));
-				rb_set_black(parent);
-				rb_set_black(other->rb_right);
-				__rb_rotate_left(parent, root);
-				node = root->rb_node;
-				break;
-			}
-		}
-		else
-		{
-			other = parent->rb_left;
-			if (rb_is_red(other))
-			{
-				rb_set_black(other);
-				rb_set_red(parent);
-				__rb_rotate_right(parent, root);
-				other = parent->rb_left;
-			}
-			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-			    (!other->rb_right || rb_is_black(other->rb_right)))
-			{
-				rb_set_red(other);
-				node = parent;
-				parent = rb_parent(node);
-			}
-			else
-			{
-				if (!other->rb_left || rb_is_black(other->rb_left))
-				{
-					rb_set_black(other->rb_right);
-					rb_set_red(other);
-					__rb_rotate_left(other, root);
-					other = parent->rb_left;
-				}
-				rb_set_color(other, rb_color(parent));
-				rb_set_black(parent);
-				rb_set_black(other->rb_left);
-				__rb_rotate_right(parent, root);
-				node = root->rb_node;
-				break;
-			}
-		}
-	}
-	if (node)
-		rb_set_black(node);
-}
-
-void rb_erase(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *child, *parent;
-	int color;
-
-	if (!node->rb_left)
-		child = node->rb_right;
-	else if (!node->rb_right)
-		child = node->rb_left;
-	else
-	{
-		struct rb_node *old = node, *left;
-
-		node = node->rb_right;
-		while ((left = node->rb_left) != NULL)
-			node = left;
-		child = node->rb_right;
-		parent = rb_parent(node);
-		color = rb_color(node);
-
-		if (child)
-			rb_set_parent(child, parent);
-		if (parent == old) {
-			parent->rb_right = child;
-			parent = node;
-		} else
-			parent->rb_left = child;
-
-		node->rb_parent_color = old->rb_parent_color;
-		node->rb_right = old->rb_right;
-		node->rb_left = old->rb_left;
-
-		if (rb_parent(old))
-		{
-			if (rb_parent(old)->rb_left == old)
-				rb_parent(old)->rb_left = node;
-			else
-				rb_parent(old)->rb_right = node;
-		} else
-			root->rb_node = node;
-
-		rb_set_parent(old->rb_left, node);
-		if (old->rb_right)
-			rb_set_parent(old->rb_right, node);
-		goto color;
-	}
-
-	parent = rb_parent(node);
-	color = rb_color(node);
-
-	if (child)
-		rb_set_parent(child, parent);
-	if (parent)
-	{
-		if (parent->rb_left == node)
-			parent->rb_left = child;
-		else
-			parent->rb_right = child;
-	}
-	else
-		root->rb_node = child;
-
- color:
-	if (color == RB_BLACK)
-		__rb_erase_color(child, parent, root);
-}
-
-/*
- * This function returns the first node (in sort order) of the tree.
- */
-struct rb_node *rb_first(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_left)
-		n = n->rb_left;
-	return n;
-}
-
-struct rb_node *rb_last(const struct rb_root *root)
-{
-	struct rb_node	*n;
-
-	n = root->rb_node;
-	if (!n)
-		return NULL;
-	while (n->rb_right)
-		n = n->rb_right;
-	return n;
-}
-
-struct rb_node *rb_next(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (rb_parent(node) == node)
-		return NULL;
-
-	/* If we have a right-hand child, go down and then left as far
-	   as we can. */
-	if (node->rb_right) {
-		node = node->rb_right; 
-		while (node->rb_left)
-			node=node->rb_left;
-		return (struct rb_node *)node;
-	}
-
-	/* No right-hand children.  Everything down and left is
-	   smaller than us, so any 'next' node must be in the general
-	   direction of our parent. Go up the tree; any time the
-	   ancestor is a right-hand child of its parent, keep going
-	   up. First time it's a left-hand child of its parent, said
-	   parent is our 'next' node. */
-	while ((parent = rb_parent(node)) && node == parent->rb_right)
-		node = parent;
-
-	return parent;
-}
-
-struct rb_node *rb_prev(const struct rb_node *node)
-{
-	struct rb_node *parent;
-
-	if (rb_parent(node) == node)
-		return NULL;
-
-	/* If we have a left-hand child, go down and then right as far
-	   as we can. */
-	if (node->rb_left) {
-		node = node->rb_left; 
-		while (node->rb_right)
-			node=node->rb_right;
-		return (struct rb_node *)node;
-	}
-
-	/* No left-hand children. Go up till we find an ancestor which
-	   is a right-hand child of its parent */
-	while ((parent = rb_parent(node)) && node == parent->rb_left)
-		node = parent;
-
-	return parent;
-}
-
-void rb_replace_node(struct rb_node *victim, struct rb_node *new,
-		     struct rb_root *root)
-{
-	struct rb_node *parent = rb_parent(victim);
-
-	/* Set the surrounding nodes to point to the replacement */
-	if (parent) {
-		if (victim == parent->rb_left)
-			parent->rb_left = new;
-		else
-			parent->rb_right = new;
-	} else {
-		root->rb_node = new;
-	}
-	if (victim->rb_left)
-		rb_set_parent(victim->rb_left, new);
-	if (victim->rb_right)
-		rb_set_parent(victim->rb_right, new);
-
-	/* Copy the pointers/colour from the victim to the replacement */
-	*new = *victim;
-}
diff --git a/Documentation/perf_counter/util/rbtree.h b/Documentation/perf_counter/util/rbtree.h
deleted file mode 100644
index 6bdc488..0000000
--- a/Documentation/perf_counter/util/rbtree.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-  Red Black Trees
-  (C) 1999  Andrea Arcangeli <andrea@suse.de>
-  
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-  linux/include/linux/rbtree.h
-
-  To use rbtrees you'll have to implement your own insert and search cores.
-  This will avoid us to use callbacks and to drop drammatically performances.
-  I know it's not the cleaner way,  but in C (not in C++) to get
-  performances and genericity...
-
-  Some example of insert and search follows here. The search is a plain
-  normal search over an ordered tree. The insert instead must be implemented
-  int two steps: as first thing the code must insert the element in
-  order as a red leaf in the tree, then the support library function
-  rb_insert_color() must be called. Such function will do the
-  not trivial work to rebalance the rbtree if necessary.
-
------------------------------------------------------------------------
-static inline struct page * rb_search_page_cache(struct inode * inode,
-						 unsigned long offset)
-{
-	struct rb_node * n = inode->i_rb_page_cache.rb_node;
-	struct page * page;
-
-	while (n)
-	{
-		page = rb_entry(n, struct page, rb_page_cache);
-
-		if (offset < page->offset)
-			n = n->rb_left;
-		else if (offset > page->offset)
-			n = n->rb_right;
-		else
-			return page;
-	}
-	return NULL;
-}
-
-static inline struct page * __rb_insert_page_cache(struct inode * inode,
-						   unsigned long offset,
-						   struct rb_node * node)
-{
-	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
-	struct rb_node * parent = NULL;
-	struct page * page;
-
-	while (*p)
-	{
-		parent = *p;
-		page = rb_entry(parent, struct page, rb_page_cache);
-
-		if (offset < page->offset)
-			p = &(*p)->rb_left;
-		else if (offset > page->offset)
-			p = &(*p)->rb_right;
-		else
-			return page;
-	}
-
-	rb_link_node(node, parent, p);
-
-	return NULL;
-}
-
-static inline struct page * rb_insert_page_cache(struct inode * inode,
-						 unsigned long offset,
-						 struct rb_node * node)
-{
-	struct page * ret;
-	if ((ret = __rb_insert_page_cache(inode, offset, node)))
-		goto out;
-	rb_insert_color(node, &inode->i_rb_page_cache);
- out:
-	return ret;
-}
------------------------------------------------------------------------
-*/
-
-#ifndef	_LINUX_RBTREE_H
-#define	_LINUX_RBTREE_H
-
-#include <stddef.h>
-
-/**
- * container_of - cast a member of a structure out to the containing structure
- * @ptr:	the pointer to the member.
- * @type:	the type of the container struct this is embedded in.
- * @member:	the name of the member within the struct.
- *
- */
-#define container_of(ptr, type, member) ({			\
-	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
-	(type *)( (char *)__mptr - offsetof(type,member) );})
-
-struct rb_node
-{
-	unsigned long  rb_parent_color;
-#define	RB_RED		0
-#define	RB_BLACK	1
-	struct rb_node *rb_right;
-	struct rb_node *rb_left;
-} __attribute__((aligned(sizeof(long))));
-    /* The alignment might seem pointless, but allegedly CRIS needs it */
-
-struct rb_root
-{
-	struct rb_node *rb_node;
-};
-
-
-#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
-#define rb_color(r)   ((r)->rb_parent_color & 1)
-#define rb_is_red(r)   (!rb_color(r))
-#define rb_is_black(r) rb_color(r)
-#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
-#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
-	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
-}
-static inline void rb_set_color(struct rb_node *rb, int color)
-{
-	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
-}
-
-#define RB_ROOT	(struct rb_root) { NULL, }
-#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
-
-#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
-#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
-
-extern void rb_insert_color(struct rb_node *, struct rb_root *);
-extern void rb_erase(struct rb_node *, struct rb_root *);
-
-/* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_next(const struct rb_node *);
-extern struct rb_node *rb_prev(const struct rb_node *);
-extern struct rb_node *rb_first(const struct rb_root *);
-extern struct rb_node *rb_last(const struct rb_root *);
-
-/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
-			    struct rb_root *root);
-
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
-				struct rb_node ** rb_link)
-{
-	node->rb_parent_color = (unsigned long )parent;
-	node->rb_left = node->rb_right = NULL;
-
-	*rb_link = node;
-}
-
-#endif	/* _LINUX_RBTREE_H */
diff --git a/Documentation/perf_counter/util/run-command.c b/Documentation/perf_counter/util/run-command.c
deleted file mode 100644
index b2f5e85..0000000
--- a/Documentation/perf_counter/util/run-command.c
+++ /dev/null
@@ -1,395 +0,0 @@
-#include "cache.h"
-#include "run-command.h"
-#include "exec_cmd.h"
-
-static inline void close_pair(int fd[2])
-{
-	close(fd[0]);
-	close(fd[1]);
-}
-
-static inline void dup_devnull(int to)
-{
-	int fd = open("/dev/null", O_RDWR);
-	dup2(fd, to);
-	close(fd);
-}
-
-int start_command(struct child_process *cmd)
-{
-	int need_in, need_out, need_err;
-	int fdin[2], fdout[2], fderr[2];
-
-	/*
-	 * In case of errors we must keep the promise to close FDs
-	 * that have been passed in via ->in and ->out.
-	 */
-
-	need_in = !cmd->no_stdin && cmd->in < 0;
-	if (need_in) {
-		if (pipe(fdin) < 0) {
-			if (cmd->out > 0)
-				close(cmd->out);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->in = fdin[1];
-	}
-
-	need_out = !cmd->no_stdout
-		&& !cmd->stdout_to_stderr
-		&& cmd->out < 0;
-	if (need_out) {
-		if (pipe(fdout) < 0) {
-			if (need_in)
-				close_pair(fdin);
-			else if (cmd->in)
-				close(cmd->in);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->out = fdout[0];
-	}
-
-	need_err = !cmd->no_stderr && cmd->err < 0;
-	if (need_err) {
-		if (pipe(fderr) < 0) {
-			if (need_in)
-				close_pair(fdin);
-			else if (cmd->in)
-				close(cmd->in);
-			if (need_out)
-				close_pair(fdout);
-			else if (cmd->out)
-				close(cmd->out);
-			return -ERR_RUN_COMMAND_PIPE;
-		}
-		cmd->err = fderr[0];
-	}
-
-#ifndef __MINGW32__
-	fflush(NULL);
-	cmd->pid = fork();
-	if (!cmd->pid) {
-		if (cmd->no_stdin)
-			dup_devnull(0);
-		else if (need_in) {
-			dup2(fdin[0], 0);
-			close_pair(fdin);
-		} else if (cmd->in) {
-			dup2(cmd->in, 0);
-			close(cmd->in);
-		}
-
-		if (cmd->no_stderr)
-			dup_devnull(2);
-		else if (need_err) {
-			dup2(fderr[1], 2);
-			close_pair(fderr);
-		}
-
-		if (cmd->no_stdout)
-			dup_devnull(1);
-		else if (cmd->stdout_to_stderr)
-			dup2(2, 1);
-		else if (need_out) {
-			dup2(fdout[1], 1);
-			close_pair(fdout);
-		} else if (cmd->out > 1) {
-			dup2(cmd->out, 1);
-			close(cmd->out);
-		}
-
-		if (cmd->dir && chdir(cmd->dir))
-			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
-			    cmd->dir, strerror(errno));
-		if (cmd->env) {
-			for (; *cmd->env; cmd->env++) {
-				if (strchr(*cmd->env, '='))
-					putenv((char*)*cmd->env);
-				else
-					unsetenv(*cmd->env);
-			}
-		}
-		if (cmd->preexec_cb)
-			cmd->preexec_cb();
-		if (cmd->perf_cmd) {
-			execv_perf_cmd(cmd->argv);
-		} else {
-			execvp(cmd->argv[0], (char *const*) cmd->argv);
-		}
-		exit(127);
-	}
-#else
-	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
-	const char **sargv = cmd->argv;
-	char **env = environ;
-
-	if (cmd->no_stdin) {
-		s0 = dup(0);
-		dup_devnull(0);
-	} else if (need_in) {
-		s0 = dup(0);
-		dup2(fdin[0], 0);
-	} else if (cmd->in) {
-		s0 = dup(0);
-		dup2(cmd->in, 0);
-	}
-
-	if (cmd->no_stderr) {
-		s2 = dup(2);
-		dup_devnull(2);
-	} else if (need_err) {
-		s2 = dup(2);
-		dup2(fderr[1], 2);
-	}
-
-	if (cmd->no_stdout) {
-		s1 = dup(1);
-		dup_devnull(1);
-	} else if (cmd->stdout_to_stderr) {
-		s1 = dup(1);
-		dup2(2, 1);
-	} else if (need_out) {
-		s1 = dup(1);
-		dup2(fdout[1], 1);
-	} else if (cmd->out > 1) {
-		s1 = dup(1);
-		dup2(cmd->out, 1);
-	}
-
-	if (cmd->dir)
-		die("chdir in start_command() not implemented");
-	if (cmd->env) {
-		env = copy_environ();
-		for (; *cmd->env; cmd->env++)
-			env = env_setenv(env, *cmd->env);
-	}
-
-	if (cmd->perf_cmd) {
-		cmd->argv = prepare_perf_cmd(cmd->argv);
-	}
-
-	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
-
-	if (cmd->env)
-		free_environ(env);
-	if (cmd->perf_cmd)
-		free(cmd->argv);
-
-	cmd->argv = sargv;
-	if (s0 >= 0)
-		dup2(s0, 0), close(s0);
-	if (s1 >= 0)
-		dup2(s1, 1), close(s1);
-	if (s2 >= 0)
-		dup2(s2, 2), close(s2);
-#endif
-
-	if (cmd->pid < 0) {
-		int err = errno;
-		if (need_in)
-			close_pair(fdin);
-		else if (cmd->in)
-			close(cmd->in);
-		if (need_out)
-			close_pair(fdout);
-		else if (cmd->out)
-			close(cmd->out);
-		if (need_err)
-			close_pair(fderr);
-		return err == ENOENT ?
-			-ERR_RUN_COMMAND_EXEC :
-			-ERR_RUN_COMMAND_FORK;
-	}
-
-	if (need_in)
-		close(fdin[0]);
-	else if (cmd->in)
-		close(cmd->in);
-
-	if (need_out)
-		close(fdout[1]);
-	else if (cmd->out)
-		close(cmd->out);
-
-	if (need_err)
-		close(fderr[1]);
-
-	return 0;
-}
-
-static int wait_or_whine(pid_t pid)
-{
-	for (;;) {
-		int status, code;
-		pid_t waiting = waitpid(pid, &status, 0);
-
-		if (waiting < 0) {
-			if (errno == EINTR)
-				continue;
-			error("waitpid failed (%s)", strerror(errno));
-			return -ERR_RUN_COMMAND_WAITPID;
-		}
-		if (waiting != pid)
-			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
-		if (WIFSIGNALED(status))
-			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
-
-		if (!WIFEXITED(status))
-			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
-		code = WEXITSTATUS(status);
-		switch (code) {
-		case 127:
-			return -ERR_RUN_COMMAND_EXEC;
-		case 0:
-			return 0;
-		default:
-			return -code;
-		}
-	}
-}
-
-int finish_command(struct child_process *cmd)
-{
-	return wait_or_whine(cmd->pid);
-}
-
-int run_command(struct child_process *cmd)
-{
-	int code = start_command(cmd);
-	if (code)
-		return code;
-	return finish_command(cmd);
-}
-
-static void prepare_run_command_v_opt(struct child_process *cmd,
-				      const char **argv,
-				      int opt)
-{
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->argv = argv;
-	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
-	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
-	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
-}
-
-int run_command_v_opt(const char **argv, int opt)
-{
-	struct child_process cmd;
-	prepare_run_command_v_opt(&cmd, argv, opt);
-	return run_command(&cmd);
-}
-
-int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
-{
-	struct child_process cmd;
-	prepare_run_command_v_opt(&cmd, argv, opt);
-	cmd.dir = dir;
-	cmd.env = env;
-	return run_command(&cmd);
-}
-
-#ifdef __MINGW32__
-static __stdcall unsigned run_thread(void *data)
-{
-	struct async *async = data;
-	return async->proc(async->fd_for_proc, async->data);
-}
-#endif
-
-int start_async(struct async *async)
-{
-	int pipe_out[2];
-
-	if (pipe(pipe_out) < 0)
-		return error("cannot create pipe: %s", strerror(errno));
-	async->out = pipe_out[0];
-
-#ifndef __MINGW32__
-	/* Flush stdio before fork() to avoid cloning buffers */
-	fflush(NULL);
-
-	async->pid = fork();
-	if (async->pid < 0) {
-		error("fork (async) failed: %s", strerror(errno));
-		close_pair(pipe_out);
-		return -1;
-	}
-	if (!async->pid) {
-		close(pipe_out[0]);
-		exit(!!async->proc(pipe_out[1], async->data));
-	}
-	close(pipe_out[1]);
-#else
-	async->fd_for_proc = pipe_out[1];
-	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
-	if (!async->tid) {
-		error("cannot create thread: %s", strerror(errno));
-		close_pair(pipe_out);
-		return -1;
-	}
-#endif
-	return 0;
-}
-
-int finish_async(struct async *async)
-{
-#ifndef __MINGW32__
-	int ret = 0;
-
-	if (wait_or_whine(async->pid))
-		ret = error("waitpid (async) failed");
-#else
-	DWORD ret = 0;
-	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
-		ret = error("waiting for thread failed: %lu", GetLastError());
-	else if (!GetExitCodeThread(async->tid, &ret))
-		ret = error("cannot get thread exit code: %lu", GetLastError());
-	CloseHandle(async->tid);
-#endif
-	return ret;
-}
-
-int run_hook(const char *index_file, const char *name, ...)
-{
-	struct child_process hook;
-	const char **argv = NULL, *env[2];
-	char index[PATH_MAX];
-	va_list args;
-	int ret;
-	size_t i = 0, alloc = 0;
-
-	if (access(perf_path("hooks/%s", name), X_OK) < 0)
-		return 0;
-
-	va_start(args, name);
-	ALLOC_GROW(argv, i + 1, alloc);
-	argv[i++] = perf_path("hooks/%s", name);
-	while (argv[i-1]) {
-		ALLOC_GROW(argv, i + 1, alloc);
-		argv[i++] = va_arg(args, const char *);
-	}
-	va_end(args);
-
-	memset(&hook, 0, sizeof(hook));
-	hook.argv = argv;
-	hook.no_stdin = 1;
-	hook.stdout_to_stderr = 1;
-	if (index_file) {
-		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
-		env[0] = index;
-		env[1] = NULL;
-		hook.env = env;
-	}
-
-	ret = start_command(&hook);
-	free(argv);
-	if (ret) {
-		warning("Could not spawn %s", argv[0]);
-		return ret;
-	}
-	ret = finish_command(&hook);
-	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
-		warning("%s exited due to uncaught signal", argv[0]);
-
-	return ret;
-}
diff --git a/Documentation/perf_counter/util/run-command.h b/Documentation/perf_counter/util/run-command.h
deleted file mode 100644
index 328289f..0000000
--- a/Documentation/perf_counter/util/run-command.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef RUN_COMMAND_H
-#define RUN_COMMAND_H
-
-enum {
-	ERR_RUN_COMMAND_FORK = 10000,
-	ERR_RUN_COMMAND_EXEC,
-	ERR_RUN_COMMAND_PIPE,
-	ERR_RUN_COMMAND_WAITPID,
-	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
-	ERR_RUN_COMMAND_WAITPID_SIGNAL,
-	ERR_RUN_COMMAND_WAITPID_NOEXIT,
-};
-#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
-
-struct child_process {
-	const char **argv;
-	pid_t pid;
-	/*
-	 * Using .in, .out, .err:
-	 * - Specify 0 for no redirections (child inherits stdin, stdout,
-	 *   stderr from parent).
-	 * - Specify -1 to have a pipe allocated as follows:
-	 *     .in: returns the writable pipe end; parent writes to it,
-	 *          the readable pipe end becomes child's stdin
-	 *     .out, .err: returns the readable pipe end; parent reads from
-	 *          it, the writable pipe end becomes child's stdout/stderr
-	 *   The caller of start_command() must close the returned FDs
-	 *   after it has completed reading from/writing to it!
-	 * - Specify > 0 to set a channel to a particular FD as follows:
-	 *     .in: a readable FD, becomes child's stdin
-	 *     .out: a writable FD, becomes child's stdout/stderr
-	 *     .err > 0 not supported
-	 *   The specified FD is closed by start_command(), even in case
-	 *   of errors!
-	 */
-	int in;
-	int out;
-	int err;
-	const char *dir;
-	const char *const *env;
-	unsigned no_stdin:1;
-	unsigned no_stdout:1;
-	unsigned no_stderr:1;
-	unsigned perf_cmd:1; /* if this is to be perf sub-command */
-	unsigned stdout_to_stderr:1;
-	void (*preexec_cb)(void);
-};
-
-int start_command(struct child_process *);
-int finish_command(struct child_process *);
-int run_command(struct child_process *);
-
-extern int run_hook(const char *index_file, const char *name, ...);
-
-#define RUN_COMMAND_NO_STDIN 1
-#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
-#define RUN_COMMAND_STDOUT_TO_STDERR 4
-int run_command_v_opt(const char **argv, int opt);
-
-/*
- * env (the environment) is to be formatted like environ: "VAR=VALUE".
- * To unset an environment variable use just "VAR".
- */
-int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
-
-/*
- * The purpose of the following functions is to feed a pipe by running
- * a function asynchronously and providing output that the caller reads.
- *
- * It is expected that no synchronization and mutual exclusion between
- * the caller and the feed function is necessary so that the function
- * can run in a thread without interfering with the caller.
- */
-struct async {
-	/*
-	 * proc writes to fd and closes it;
-	 * returns 0 on success, non-zero on failure
-	 */
-	int (*proc)(int fd, void *data);
-	void *data;
-	int out;	/* caller reads from here and closes it */
-#ifndef __MINGW32__
-	pid_t pid;
-#else
-	HANDLE tid;
-	int fd_for_proc;
-#endif
-};
-
-int start_async(struct async *async);
-int finish_async(struct async *async);
-
-#endif
diff --git a/Documentation/perf_counter/util/sigchain.c b/Documentation/perf_counter/util/sigchain.c
deleted file mode 100644
index 1118b99..0000000
--- a/Documentation/perf_counter/util/sigchain.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "sigchain.h"
-#include "cache.h"
-
-#define SIGCHAIN_MAX_SIGNALS 32
-
-struct sigchain_signal {
-	sigchain_fun *old;
-	int n;
-	int alloc;
-};
-static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
-
-static void check_signum(int sig)
-{
-	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
-		die("BUG: signal out of range: %d", sig);
-}
-
-int sigchain_push(int sig, sigchain_fun f)
-{
-	struct sigchain_signal *s = signals + sig;
-	check_signum(sig);
-
-	ALLOC_GROW(s->old, s->n + 1, s->alloc);
-	s->old[s->n] = signal(sig, f);
-	if (s->old[s->n] == SIG_ERR)
-		return -1;
-	s->n++;
-	return 0;
-}
-
-int sigchain_pop(int sig)
-{
-	struct sigchain_signal *s = signals + sig;
-	check_signum(sig);
-	if (s->n < 1)
-		return 0;
-
-	if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
-		return -1;
-	s->n--;
-	return 0;
-}
-
-void sigchain_push_common(sigchain_fun f)
-{
-	sigchain_push(SIGINT, f);
-	sigchain_push(SIGHUP, f);
-	sigchain_push(SIGTERM, f);
-	sigchain_push(SIGQUIT, f);
-	sigchain_push(SIGPIPE, f);
-}
diff --git a/Documentation/perf_counter/util/sigchain.h b/Documentation/perf_counter/util/sigchain.h
deleted file mode 100644
index 618083bc..0000000
--- a/Documentation/perf_counter/util/sigchain.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef SIGCHAIN_H
-#define SIGCHAIN_H
-
-typedef void (*sigchain_fun)(int);
-
-int sigchain_push(int sig, sigchain_fun f);
-int sigchain_pop(int sig);
-
-void sigchain_push_common(sigchain_fun f);
-
-#endif /* SIGCHAIN_H */
diff --git a/Documentation/perf_counter/util/strbuf.c b/Documentation/perf_counter/util/strbuf.c
deleted file mode 100644
index eaba093..0000000
--- a/Documentation/perf_counter/util/strbuf.c
+++ /dev/null
@@ -1,359 +0,0 @@
-#include "cache.h"
-
-int prefixcmp(const char *str, const char *prefix)
-{
-	for (; ; str++, prefix++)
-		if (!*prefix)
-			return 0;
-		else if (*str != *prefix)
-			return (unsigned char)*prefix - (unsigned char)*str;
-}
-
-/*
- * Used as the default ->buf value, so that people can always assume
- * buf is non NULL and ->buf is NUL terminated even for a freshly
- * initialized strbuf.
- */
-char strbuf_slopbuf[1];
-
-void strbuf_init(struct strbuf *sb, size_t hint)
-{
-	sb->alloc = sb->len = 0;
-	sb->buf = strbuf_slopbuf;
-	if (hint)
-		strbuf_grow(sb, hint);
-}
-
-void strbuf_release(struct strbuf *sb)
-{
-	if (sb->alloc) {
-		free(sb->buf);
-		strbuf_init(sb, 0);
-	}
-}
-
-char *strbuf_detach(struct strbuf *sb, size_t *sz)
-{
-	char *res = sb->alloc ? sb->buf : NULL;
-	if (sz)
-		*sz = sb->len;
-	strbuf_init(sb, 0);
-	return res;
-}
-
-void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
-{
-	strbuf_release(sb);
-	sb->buf   = buf;
-	sb->len   = len;
-	sb->alloc = alloc;
-	strbuf_grow(sb, 0);
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_grow(struct strbuf *sb, size_t extra)
-{
-	if (sb->len + extra + 1 <= sb->len)
-		die("you want to use way too much memory");
-	if (!sb->alloc)
-		sb->buf = NULL;
-	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
-}
-
-void strbuf_trim(struct strbuf *sb)
-{
-	char *b = sb->buf;
-	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
-		sb->len--;
-	while (sb->len > 0 && isspace(*b)) {
-		b++;
-		sb->len--;
-	}
-	memmove(sb->buf, b, sb->len);
-	sb->buf[sb->len] = '\0';
-}
-void strbuf_rtrim(struct strbuf *sb)
-{
-	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
-		sb->len--;
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_ltrim(struct strbuf *sb)
-{
-	char *b = sb->buf;
-	while (sb->len > 0 && isspace(*b)) {
-		b++;
-		sb->len--;
-	}
-	memmove(sb->buf, b, sb->len);
-	sb->buf[sb->len] = '\0';
-}
-
-void strbuf_tolower(struct strbuf *sb)
-{
-	int i;
-	for (i = 0; i < sb->len; i++)
-		sb->buf[i] = tolower(sb->buf[i]);
-}
-
-struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
-{
-	int alloc = 2, pos = 0;
-	char *n, *p;
-	struct strbuf **ret;
-	struct strbuf *t;
-
-	ret = calloc(alloc, sizeof(struct strbuf *));
-	p = n = sb->buf;
-	while (n < sb->buf + sb->len) {
-		int len;
-		n = memchr(n, delim, sb->len - (n - sb->buf));
-		if (pos + 1 >= alloc) {
-			alloc = alloc * 2;
-			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
-		}
-		if (!n)
-			n = sb->buf + sb->len - 1;
-		len = n - p + 1;
-		t = malloc(sizeof(struct strbuf));
-		strbuf_init(t, len);
-		strbuf_add(t, p, len);
-		ret[pos] = t;
-		ret[++pos] = NULL;
-		p = ++n;
-	}
-	return ret;
-}
-
-void strbuf_list_free(struct strbuf **sbs)
-{
-	struct strbuf **s = sbs;
-
-	while (*s) {
-		strbuf_release(*s);
-		free(*s++);
-	}
-	free(sbs);
-}
-
-int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
-{
-	int len = a->len < b->len ? a->len: b->len;
-	int cmp = memcmp(a->buf, b->buf, len);
-	if (cmp)
-		return cmp;
-	return a->len < b->len ? -1: a->len != b->len;
-}
-
-void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
-				   const void *data, size_t dlen)
-{
-	if (pos + len < pos)
-		die("you want to use way too much memory");
-	if (pos > sb->len)
-		die("`pos' is too far after the end of the buffer");
-	if (pos + len > sb->len)
-		die("`pos + len' is too far after the end of the buffer");
-
-	if (dlen >= len)
-		strbuf_grow(sb, dlen - len);
-	memmove(sb->buf + pos + dlen,
-			sb->buf + pos + len,
-			sb->len - pos - len);
-	memcpy(sb->buf + pos, data, dlen);
-	strbuf_setlen(sb, sb->len + dlen - len);
-}
-
-void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
-{
-	strbuf_splice(sb, pos, 0, data, len);
-}
-
-void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
-{
-	strbuf_splice(sb, pos, len, NULL, 0);
-}
-
-void strbuf_add(struct strbuf *sb, const void *data, size_t len)
-{
-	strbuf_grow(sb, len);
-	memcpy(sb->buf + sb->len, data, len);
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
-{
-	strbuf_grow(sb, len);
-	memcpy(sb->buf + sb->len, sb->buf + pos, len);
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
-{
-	int len;
-	va_list ap;
-
-	if (!strbuf_avail(sb))
-		strbuf_grow(sb, 64);
-	va_start(ap, fmt);
-	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
-	va_end(ap);
-	if (len < 0)
-		die("your vsnprintf is broken");
-	if (len > strbuf_avail(sb)) {
-		strbuf_grow(sb, len);
-		va_start(ap, fmt);
-		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
-		va_end(ap);
-		if (len > strbuf_avail(sb)) {
-			die("this should not happen, your snprintf is broken");
-		}
-	}
-	strbuf_setlen(sb, sb->len + len);
-}
-
-void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
-		   void *context)
-{
-	for (;;) {
-		const char *percent;
-		size_t consumed;
-
-		percent = strchrnul(format, '%');
-		strbuf_add(sb, format, percent - format);
-		if (!*percent)
-			break;
-		format = percent + 1;
-
-		consumed = fn(sb, format, context);
-		if (consumed)
-			format += consumed;
-		else
-			strbuf_addch(sb, '%');
-	}
-}
-
-size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
-		void *context)
-{
-	struct strbuf_expand_dict_entry *e = context;
-	size_t len;
-
-	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
-		if (!strncmp(placeholder, e->placeholder, len)) {
-			if (e->value)
-				strbuf_addstr(sb, e->value);
-			return len;
-		}
-	}
-	return 0;
-}
-
-size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
-{
-	size_t res;
-	size_t oldalloc = sb->alloc;
-
-	strbuf_grow(sb, size);
-	res = fread(sb->buf + sb->len, 1, size, f);
-	if (res > 0)
-		strbuf_setlen(sb, sb->len + res);
-	else if (res < 0 && oldalloc == 0)
-		strbuf_release(sb);
-	return res;
-}
-
-ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
-{
-	size_t oldlen = sb->len;
-	size_t oldalloc = sb->alloc;
-
-	strbuf_grow(sb, hint ? hint : 8192);
-	for (;;) {
-		ssize_t cnt;
-
-		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
-		if (cnt < 0) {
-			if (oldalloc == 0)
-				strbuf_release(sb);
-			else
-				strbuf_setlen(sb, oldlen);
-			return -1;
-		}
-		if (!cnt)
-			break;
-		sb->len += cnt;
-		strbuf_grow(sb, 8192);
-	}
-
-	sb->buf[sb->len] = '\0';
-	return sb->len - oldlen;
-}
-
-#define STRBUF_MAXLINK (2*PATH_MAX)
-
-int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
-{
-	size_t oldalloc = sb->alloc;
-
-	if (hint < 32)
-		hint = 32;
-
-	while (hint < STRBUF_MAXLINK) {
-		int len;
-
-		strbuf_grow(sb, hint);
-		len = readlink(path, sb->buf, hint);
-		if (len < 0) {
-			if (errno != ERANGE)
-				break;
-		} else if (len < hint) {
-			strbuf_setlen(sb, len);
-			return 0;
-		}
-
-		/* .. the buffer was too small - try again */
-		hint *= 2;
-	}
-	if (oldalloc == 0)
-		strbuf_release(sb);
-	return -1;
-}
-
-int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
-{
-	int ch;
-
-	strbuf_grow(sb, 0);
-	if (feof(fp))
-		return EOF;
-
-	strbuf_reset(sb);
-	while ((ch = fgetc(fp)) != EOF) {
-		if (ch == term)
-			break;
-		strbuf_grow(sb, 1);
-		sb->buf[sb->len++] = ch;
-	}
-	if (ch == EOF && sb->len == 0)
-		return EOF;
-
-	sb->buf[sb->len] = '\0';
-	return 0;
-}
-
-int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
-{
-	int fd, len;
-
-	fd = open(path, O_RDONLY);
-	if (fd < 0)
-		return -1;
-	len = strbuf_read(sb, fd, hint);
-	close(fd);
-	if (len < 0)
-		return -1;
-
-	return len;
-}
diff --git a/Documentation/perf_counter/util/strbuf.h b/Documentation/perf_counter/util/strbuf.h
deleted file mode 100644
index 9ee908a..0000000
--- a/Documentation/perf_counter/util/strbuf.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef STRBUF_H
-#define STRBUF_H
-
-/*
- * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
- * long, overflow safe strings.
- *
- * Strbufs has some invariants that are very important to keep in mind:
- *
- * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
- *    build complex strings/buffers whose final size isn't easily known.
- *
- *    It is NOT legal to copy the ->buf pointer away.
- *    `strbuf_detach' is the operation that detachs a buffer from its shell
- *    while keeping the shell valid wrt its invariants.
- *
- * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
- *    allocated. The extra byte is used to store a '\0', allowing the ->buf
- *    member to be a valid C-string. Every strbuf function ensure this
- *    invariant is preserved.
- *
- *    Note that it is OK to "play" with the buffer directly if you work it
- *    that way:
- *
- *    strbuf_grow(sb, SOME_SIZE);
- *       ... Here, the memory array starting at sb->buf, and of length
- *       ... strbuf_avail(sb) is all yours, and you are sure that
- *       ... strbuf_avail(sb) is at least SOME_SIZE.
- *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
- *
- *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
- *
- *    Doing so is safe, though if it has to be done in many places, adding the
- *    missing API to the strbuf module is the way to go.
- *
- *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
- *         even if it's true in the current implementation. Alloc is somehow a
- *         "private" member that should not be messed with.
- */
-
-#include <assert.h>
-
-extern char strbuf_slopbuf[];
-struct strbuf {
-	size_t alloc;
-	size_t len;
-	char *buf;
-};
-
-#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
-
-/*----- strbuf life cycle -----*/
-extern void strbuf_init(struct strbuf *, size_t);
-extern void strbuf_release(struct strbuf *);
-extern char *strbuf_detach(struct strbuf *, size_t *);
-extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
-static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
-	struct strbuf tmp = *a;
-	*a = *b;
-	*b = tmp;
-}
-
-/*----- strbuf size related -----*/
-static inline size_t strbuf_avail(const struct strbuf *sb) {
-	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
-}
-
-extern void strbuf_grow(struct strbuf *, size_t);
-
-static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
-	if (!sb->alloc)
-		strbuf_grow(sb, 0);
-	assert(len < sb->alloc);
-	sb->len = len;
-	sb->buf[len] = '\0';
-}
-#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
-
-/*----- content related -----*/
-extern void strbuf_trim(struct strbuf *);
-extern void strbuf_rtrim(struct strbuf *);
-extern void strbuf_ltrim(struct strbuf *);
-extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
-extern void strbuf_tolower(struct strbuf *);
-
-extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
-extern void strbuf_list_free(struct strbuf **);
-
-/*----- add data in your buffer -----*/
-static inline void strbuf_addch(struct strbuf *sb, int c) {
-	strbuf_grow(sb, 1);
-	sb->buf[sb->len++] = c;
-	sb->buf[sb->len] = '\0';
-}
-
-extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
-extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
-
-/* splice pos..pos+len with given data */
-extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
-                          const void *, size_t);
-
-extern void strbuf_add(struct strbuf *, const void *, size_t);
-static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
-	strbuf_add(sb, s, strlen(s));
-}
-static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
-	strbuf_add(sb, sb2->buf, sb2->len);
-}
-extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
-
-typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
-extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
-struct strbuf_expand_dict_entry {
-	const char *placeholder;
-	const char *value;
-};
-extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
-
-__attribute__((format(printf,2,3)))
-extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
-
-extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
-/* XXX: if read fails, any partial read is undone */
-extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
-extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
-extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
-
-extern int strbuf_getline(struct strbuf *, FILE *, int);
-
-extern void stripspace(struct strbuf *buf, int skip_comments);
-extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
-
-extern int strbuf_branchname(struct strbuf *sb, const char *name);
-extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
-
-#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/util/string.c b/Documentation/perf_counter/util/string.c
deleted file mode 100644
index ec33c0c..0000000
--- a/Documentation/perf_counter/util/string.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "string.h"
-
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-int hex2u64(const char *ptr, __u64 *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
diff --git a/Documentation/perf_counter/util/string.h b/Documentation/perf_counter/util/string.h
deleted file mode 100644
index 72812c1..0000000
--- a/Documentation/perf_counter/util/string.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _PERF_STRING_H_
-#define _PERF_STRING_H_
-
-#include <linux/types.h>
-
-int hex2u64(const char *ptr, __u64 *val);
-
-#endif
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
deleted file mode 100644
index 23f4f7b..0000000
--- a/Documentation/perf_counter/util/symbol.c
+++ /dev/null
@@ -1,574 +0,0 @@
-#include "util.h"
-#include "../perf.h"
-#include "string.h"
-#include "symbol.h"
-
-#include <libelf.h>
-#include <gelf.h>
-#include <elf.h>
-
-const char *sym_hist_filter;
-
-static struct symbol *symbol__new(uint64_t start, uint64_t len,
-				  const char *name, unsigned int priv_size,
-				  uint64_t obj_start, int verbose)
-{
-	size_t namelen = strlen(name) + 1;
-	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
-
-	if (!self)
-		return NULL;
-
-	if (verbose >= 2)
-		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
-			(__u64)start, len, name, self->hist, (void *)obj_start);
-
-	self->obj_start= obj_start;
-	self->hist = NULL;
-	self->hist_sum = 0;
-
-	if (sym_hist_filter && !strcmp(name, sym_hist_filter))
-		self->hist = calloc(sizeof(__u64), len);
-
-	if (priv_size) {
-		memset(self, 0, priv_size);
-		self = ((void *)self) + priv_size;
-	}
-	self->start = start;
-	self->end   = start + len - 1;
-	memcpy(self->name, name, namelen);
-
-	return self;
-}
-
-static void symbol__delete(struct symbol *self, unsigned int priv_size)
-{
-	free(((void *)self) - priv_size);
-}
-
-static size_t symbol__fprintf(struct symbol *self, FILE *fp)
-{
-	return fprintf(fp, " %llx-%llx %s\n",
-		       self->start, self->end, self->name);
-}
-
-struct dso *dso__new(const char *name, unsigned int sym_priv_size)
-{
-	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		strcpy(self->name, name);
-		self->syms = RB_ROOT;
-		self->sym_priv_size = sym_priv_size;
-		self->find_symbol = dso__find_symbol;
-	}
-
-	return self;
-}
-
-static void dso__delete_symbols(struct dso *self)
-{
-	struct symbol *pos;
-	struct rb_node *next = rb_first(&self->syms);
-
-	while (next) {
-		pos = rb_entry(next, struct symbol, rb_node);
-		next = rb_next(&pos->rb_node);
-		rb_erase(&pos->rb_node, &self->syms);
-		symbol__delete(pos, self->sym_priv_size);
-	}
-}
-
-void dso__delete(struct dso *self)
-{
-	dso__delete_symbols(self);
-	free(self);
-}
-
-static void dso__insert_symbol(struct dso *self, struct symbol *sym)
-{
-	struct rb_node **p = &self->syms.rb_node;
-	struct rb_node *parent = NULL;
-	const uint64_t ip = sym->start;
-	struct symbol *s;
-
-	while (*p != NULL) {
-		parent = *p;
-		s = rb_entry(parent, struct symbol, rb_node);
-		if (ip < s->start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&sym->rb_node, parent, p);
-	rb_insert_color(&sym->rb_node, &self->syms);
-}
-
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
-{
-	struct rb_node *n;
-
-	if (self == NULL)
-		return NULL;
-
-	n = self->syms.rb_node;
-
-	while (n) {
-		struct symbol *s = rb_entry(n, struct symbol, rb_node);
-
-		if (ip < s->start)
-			n = n->rb_left;
-		else if (ip > s->end)
-			n = n->rb_right;
-		else
-			return s;
-	}
-
-	return NULL;
-}
-
-size_t dso__fprintf(struct dso *self, FILE *fp)
-{
-	size_t ret = fprintf(fp, "dso: %s\n", self->name);
-
-	struct rb_node *nd;
-	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
-		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
-		ret += symbol__fprintf(pos, fp);
-	}
-
-	return ret;
-}
-
-static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
-{
-	struct rb_node *nd, *prevnd;
-	char *line = NULL;
-	size_t n;
-	FILE *file = fopen("/proc/kallsyms", "r");
-
-	if (file == NULL)
-		goto out_failure;
-
-	while (!feof(file)) {
-		__u64 start;
-		struct symbol *sym;
-		int line_len, len;
-		char symbol_type;
-
-		line_len = getline(&line, &n, file);
-		if (line_len < 0)
-			break;
-
-		if (!line)
-			goto out_failure;
-
-		line[--line_len] = '\0'; /* \n */
-
-		len = hex2u64(line, &start);
-
-		len++;
-		if (len + 2 >= line_len)
-			continue;
-
-		symbol_type = toupper(line[len]);
-		/*
-		 * We're interested only in code ('T'ext)
-		 */
-		if (symbol_type != 'T' && symbol_type != 'W')
-			continue;
-		/*
-		 * Well fix up the end later, when we have all sorted.
-		 */
-		sym = symbol__new(start, 0xdead, line + len + 2,
-				  self->sym_priv_size, 0, verbose);
-
-		if (sym == NULL)
-			goto out_delete_line;
-
-		if (filter && filter(self, sym))
-			symbol__delete(sym, self->sym_priv_size);
-		else
-			dso__insert_symbol(self, sym);
-	}
-
-	/*
-	 * Now that we have all sorted out, just set the ->end of all
-	 * symbols
-	 */
-	prevnd = rb_first(&self->syms);
-
-	if (prevnd == NULL)
-		goto out_delete_line;
-
-	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
-			      *curr = rb_entry(nd, struct symbol, rb_node);
-
-		prev->end = curr->start - 1;
-		prevnd = nd;
-	}
-
-	free(line);
-	fclose(file);
-
-	return 0;
-
-out_delete_line:
-	free(line);
-out_failure:
-	return -1;
-}
-
-/**
- * elf_symtab__for_each_symbol - iterate thru all the symbols
- *
- * @self: struct elf_symtab instance to iterate
- * @index: uint32_t index
- * @sym: GElf_Sym iterator
- */
-#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
-	for (index = 0, gelf_getsym(syms, index, &sym);\
-	     index < nr_syms; \
-	     index++, gelf_getsym(syms, index, &sym))
-
-static inline uint8_t elf_sym__type(const GElf_Sym *sym)
-{
-	return GELF_ST_TYPE(sym->st_info);
-}
-
-static inline int elf_sym__is_function(const GElf_Sym *sym)
-{
-	return elf_sym__type(sym) == STT_FUNC &&
-	       sym->st_name != 0 &&
-	       sym->st_shndx != SHN_UNDEF &&
-	       sym->st_size != 0;
-}
-
-static inline const char *elf_sym__name(const GElf_Sym *sym,
-					const Elf_Data *symstrs)
-{
-	return symstrs->d_buf + sym->st_name;
-}
-
-static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
-				    GElf_Shdr *shp, const char *name,
-				    size_t *index)
-{
-	Elf_Scn *sec = NULL;
-	size_t cnt = 1;
-
-	while ((sec = elf_nextscn(elf, sec)) != NULL) {
-		char *str;
-
-		gelf_getshdr(sec, shp);
-		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
-		if (!strcmp(name, str)) {
-			if (index)
-				*index = cnt;
-			break;
-		}
-		++cnt;
-	}
-
-	return sec;
-}
-
-#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
-	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
-	     idx < nr_entries; \
-	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
-
-#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
-	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
-	     idx < nr_entries; \
-	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
-
-static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
-				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
-				       GElf_Shdr *shdr_dynsym,
-				       size_t dynsym_idx, int verbose)
-{
-	uint32_t nr_rel_entries, idx;
-	GElf_Sym sym;
-	__u64 plt_offset;
-	GElf_Shdr shdr_plt;
-	struct symbol *f;
-	GElf_Shdr shdr_rel_plt;
-	Elf_Data *reldata, *syms, *symstrs;
-	Elf_Scn *scn_plt_rel, *scn_symstrs;
-	char sympltname[1024];
-	int nr = 0, symidx;
-
-	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
-					  ".rela.plt", NULL);
-	if (scn_plt_rel == NULL) {
-		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
-						  ".rel.plt", NULL);
-		if (scn_plt_rel == NULL)
-			return 0;
-	}
-
-	if (shdr_rel_plt.sh_link != dynsym_idx)
-		return 0;
-
-	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
-		return 0;
-
-	/*
-	 * Fetch the relocation section to find the indexes to the GOT
-	 * and the symbols in the .dynsym they refer to.
-	 */
-	reldata = elf_getdata(scn_plt_rel, NULL);
-	if (reldata == NULL)
-		return -1;
-
-	syms = elf_getdata(scn_dynsym, NULL);
-	if (syms == NULL)
-		return -1;
-
-	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
-	if (scn_symstrs == NULL)
-		return -1;
-
-	symstrs = elf_getdata(scn_symstrs, NULL);
-	if (symstrs == NULL)
-		return -1;
-
-	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
-	plt_offset = shdr_plt.sh_offset;
-
-	if (shdr_rel_plt.sh_type == SHT_RELA) {
-		GElf_Rela pos_mem, *pos;
-
-		elf_section__for_each_rela(reldata, pos, pos_mem, idx,
-					   nr_rel_entries) {
-			symidx = GELF_R_SYM(pos->r_info);
-			plt_offset += shdr_plt.sh_entsize;
-			gelf_getsym(syms, symidx, &sym);
-			snprintf(sympltname, sizeof(sympltname),
-				 "%s@plt", elf_sym__name(&sym, symstrs));
-
-			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size, 0, verbose);
-			if (!f)
-				return -1;
-
-			dso__insert_symbol(self, f);
-			++nr;
-		}
-	} else if (shdr_rel_plt.sh_type == SHT_REL) {
-		GElf_Rel pos_mem, *pos;
-		elf_section__for_each_rel(reldata, pos, pos_mem, idx,
-					  nr_rel_entries) {
-			symidx = GELF_R_SYM(pos->r_info);
-			plt_offset += shdr_plt.sh_entsize;
-			gelf_getsym(syms, symidx, &sym);
-			snprintf(sympltname, sizeof(sympltname),
-				 "%s@plt", elf_sym__name(&sym, symstrs));
-
-			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size, 0, verbose);
-			if (!f)
-				return -1;
-
-			dso__insert_symbol(self, f);
-			++nr;
-		}
-	} else {
-		/*
-		 * TODO: There are still one more shdr_rel_plt.sh_type
-		 * I have to investigate, but probably should be ignored.
-		 */
-	}
-
-	return nr;
-}
-
-static int dso__load_sym(struct dso *self, int fd, const char *name,
-			 symbol_filter_t filter, int verbose)
-{
-	Elf_Data *symstrs;
-	uint32_t nr_syms;
-	int err = -1;
-	uint32_t index;
-	GElf_Ehdr ehdr;
-	GElf_Shdr shdr;
-	Elf_Data *syms;
-	GElf_Sym sym;
-	Elf_Scn *sec, *sec_dynsym;
-	Elf *elf;
-	size_t dynsym_idx;
-	int nr = 0;
-
-	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
-	if (elf == NULL) {
-		if (verbose)
-			fprintf(stderr, "%s: cannot read %s ELF file.\n",
-				__func__, name);
-		goto out_close;
-	}
-
-	if (gelf_getehdr(elf, &ehdr) == NULL) {
-		if (verbose)
-			fprintf(stderr, "%s: cannot get elf header.\n", __func__);
-		goto out_elf_end;
-	}
-
-	/*
-	 * We need to check if we have a .dynsym, so that we can handle the
-	 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
-	 * .dynsym or .symtab)
-	 */
-	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
-					 ".dynsym", &dynsym_idx);
-	if (sec_dynsym != NULL) {
-		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
-						 sec_dynsym, &shdr,
-						 dynsym_idx, verbose);
-		if (nr < 0)
-			goto out_elf_end;
-	}
-
-	/*
-	 * But if we have a full .symtab (that is a superset of .dynsym) we
-	 * should add the symbols not in the .dynsyn
-	 */
-	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-	if (sec == NULL) {
-		if (sec_dynsym == NULL)
-			goto out_elf_end;
-
-		sec = sec_dynsym;
-		gelf_getshdr(sec, &shdr);
-	}
-
-	syms = elf_getdata(sec, NULL);
-	if (syms == NULL)
-		goto out_elf_end;
-
-	sec = elf_getscn(elf, shdr.sh_link);
-	if (sec == NULL)
-		goto out_elf_end;
-
-	symstrs = elf_getdata(sec, NULL);
-	if (symstrs == NULL)
-		goto out_elf_end;
-
-	nr_syms = shdr.sh_size / shdr.sh_entsize;
-
-	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
-		struct symbol *f;
-		uint64_t obj_start;
-
-		if (!elf_sym__is_function(&sym))
-			continue;
-
-		sec = elf_getscn(elf, sym.st_shndx);
-		if (!sec)
-			goto out_elf_end;
-
-		gelf_getshdr(sec, &shdr);
-		obj_start = sym.st_value;
-
-		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-
-		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs),
-				self->sym_priv_size, obj_start, verbose);
-		if (!f)
-			goto out_elf_end;
-
-		if (filter && filter(self, f))
-			symbol__delete(f, self->sym_priv_size);
-		else {
-			dso__insert_symbol(self, f);
-			nr++;
-		}
-	}
-
-	err = nr;
-out_elf_end:
-	elf_end(elf);
-out_close:
-	return err;
-}
-
-int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
-{
-	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
-	char *name = malloc(size);
-	int variant = 0;
-	int ret = -1;
-	int fd;
-
-	if (!name)
-		return -1;
-
-more:
-	do {
-		switch (variant) {
-		case 0: /* Fedora */
-			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
-			break;
-		case 1: /* Ubuntu */
-			snprintf(name, size, "/usr/lib/debug%s", self->name);
-			break;
-		case 2: /* Sane people */
-			snprintf(name, size, "%s", self->name);
-			break;
-
-		default:
-			goto out;
-		}
-		variant++;
-
-		fd = open(name, O_RDONLY);
-	} while (fd < 0);
-
-	ret = dso__load_sym(self, fd, name, filter, verbose);
-	close(fd);
-
-	/*
-	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
-	 */
-	if (!ret)
-		goto more;
-
-out:
-	free(name);
-	return ret;
-}
-
-static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
-			     symbol_filter_t filter, int verbose)
-{
-	int err, fd = open(vmlinux, O_RDONLY);
-
-	if (fd < 0)
-		return -1;
-
-	err = dso__load_sym(self, fd, vmlinux, filter, verbose);
-	close(fd);
-
-	return err;
-}
-
-int dso__load_kernel(struct dso *self, const char *vmlinux,
-		     symbol_filter_t filter, int verbose)
-{
-	int err = -1;
-
-	if (vmlinux)
-		err = dso__load_vmlinux(self, vmlinux, filter, verbose);
-
-	if (err)
-		err = dso__load_kallsyms(self, filter, verbose);
-
-	return err;
-}
-
-void symbol__init(void)
-{
-	elf_version(EV_CURRENT);
-}
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
deleted file mode 100644
index 4839d68..0000000
--- a/Documentation/perf_counter/util/symbol.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef _PERF_SYMBOL_
-#define _PERF_SYMBOL_ 1
-
-#include <linux/types.h>
-#include "list.h"
-#include "rbtree.h"
-
-struct symbol {
-	struct rb_node	rb_node;
-	__u64		start;
-	__u64		end;
-	__u64		obj_start;
-	__u64		hist_sum;
-	__u64		*hist;
-	char		name[0];
-};
-
-struct dso {
-	struct list_head node;
-	struct rb_root	 syms;
-	unsigned int	 sym_priv_size;
-	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
-	char		 name[0];
-};
-
-const char *sym_hist_filter;
-
-typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
-
-struct dso *dso__new(const char *name, unsigned int sym_priv_size);
-void dso__delete(struct dso *self);
-
-static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
-{
-	return ((void *)sym) - self->sym_priv_size;
-}
-
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
-
-int dso__load_kernel(struct dso *self, const char *vmlinux,
-		     symbol_filter_t filter, int verbose);
-int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
-
-size_t dso__fprintf(struct dso *self, FILE *fp);
-
-void symbol__init(void);
-#endif /* _PERF_SYMBOL_ */
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
deleted file mode 100644
index 2cad286..0000000
--- a/Documentation/perf_counter/util/usage.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * GIT - The information manager from hell
- *
- * Copyright (C) Linus Torvalds, 2005
- */
-#include "util.h"
-
-static void report(const char *prefix, const char *err, va_list params)
-{
-	char msg[1024];
-	vsnprintf(msg, sizeof(msg), err, params);
-	fprintf(stderr, "%s%s\n", prefix, msg);
-}
-
-static NORETURN void usage_builtin(const char *err)
-{
-	fprintf(stderr, "\n usage: %s\n", err);
-	exit(129);
-}
-
-static NORETURN void die_builtin(const char *err, va_list params)
-{
-	report("fatal: ", err, params);
-	exit(128);
-}
-
-static void error_builtin(const char *err, va_list params)
-{
-	report("error: ", err, params);
-}
-
-static void warn_builtin(const char *warn, va_list params)
-{
-	report("warning: ", warn, params);
-}
-
-/* If we are in a dlopen()ed .so write to a global variable would segfault
- * (ugh), so keep things static. */
-static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
-static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
-static void (*error_routine)(const char *err, va_list params) = error_builtin;
-static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
-
-void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
-{
-	die_routine = routine;
-}
-
-void usage(const char *err)
-{
-	usage_routine(err);
-}
-
-void die(const char *err, ...)
-{
-	va_list params;
-
-	va_start(params, err);
-	die_routine(err, params);
-	va_end(params);
-}
-
-int error(const char *err, ...)
-{
-	va_list params;
-
-	va_start(params, err);
-	error_routine(err, params);
-	va_end(params);
-	return -1;
-}
-
-void warning(const char *warn, ...)
-{
-	va_list params;
-
-	va_start(params, warn);
-	warn_routine(warn, params);
-	va_end(params);
-}
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
deleted file mode 100644
index 76590a1..0000000
--- a/Documentation/perf_counter/util/util.h
+++ /dev/null
@@ -1,410 +0,0 @@
-#ifndef GIT_COMPAT_UTIL_H
-#define GIT_COMPAT_UTIL_H
-
-#define _FILE_OFFSET_BITS 64
-
-#ifndef FLEX_ARRAY
-/*
- * See if our compiler is known to support flexible array members.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
-# define FLEX_ARRAY /* empty */
-#elif defined(__GNUC__)
-# if (__GNUC__ >= 3)
-#  define FLEX_ARRAY /* empty */
-# else
-#  define FLEX_ARRAY 0 /* older GNU extension */
-# endif
-#endif
-
-/*
- * Otherwise, default to safer but a bit wasteful traditional style
- */
-#ifndef FLEX_ARRAY
-# define FLEX_ARRAY 1
-#endif
-#endif
-
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-
-#ifdef __GNUC__
-#define TYPEOF(x) (__typeof__(x))
-#else
-#define TYPEOF(x)
-#endif
-
-#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
-#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
-
-/* Approximation of the length of the decimal representation of this type. */
-#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
-
-#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
-#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
-#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
-#endif
-#define _ALL_SOURCE 1
-#define _GNU_SOURCE 1
-#define _BSD_SOURCE 1
-
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <sys/time.h>
-#include <time.h>
-#include <signal.h>
-#include <fnmatch.h>
-#include <assert.h>
-#include <regex.h>
-#include <utime.h>
-#ifndef __MINGW32__
-#include <sys/wait.h>
-#include <sys/poll.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#ifndef NO_SYS_SELECT_H
-#include <sys/select.h>
-#endif
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <arpa/inet.h>
-#include <netdb.h>
-#include <pwd.h>
-#include <inttypes.h>
-#if defined(__CYGWIN__)
-#undef _XOPEN_SOURCE
-#include <grp.h>
-#define _XOPEN_SOURCE 600
-#include "compat/cygwin.h"
-#else
-#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
-#include <grp.h>
-#define _ALL_SOURCE 1
-#endif
-#else 	/* __MINGW32__ */
-/* pull in Windows compatibility stuff */
-#include "compat/mingw.h"
-#endif	/* __MINGW32__ */
-
-#ifndef NO_ICONV
-#include <iconv.h>
-#endif
-
-#ifndef NO_OPENSSL
-#include <openssl/ssl.h>
-#include <openssl/err.h>
-#endif
-
-/* On most systems <limits.h> would have given us this, but
- * not on some systems (e.g. GNU/Hurd).
- */
-#ifndef PATH_MAX
-#define PATH_MAX 4096
-#endif
-
-#ifndef PRIuMAX
-#define PRIuMAX "llu"
-#endif
-
-#ifndef PRIu32
-#define PRIu32 "u"
-#endif
-
-#ifndef PRIx32
-#define PRIx32 "x"
-#endif
-
-#ifndef PATH_SEP
-#define PATH_SEP ':'
-#endif
-
-#ifndef STRIP_EXTENSION
-#define STRIP_EXTENSION ""
-#endif
-
-#ifndef has_dos_drive_prefix
-#define has_dos_drive_prefix(path) 0
-#endif
-
-#ifndef is_dir_sep
-#define is_dir_sep(c) ((c) == '/')
-#endif
-
-#ifdef __GNUC__
-#define NORETURN __attribute__((__noreturn__))
-#else
-#define NORETURN
-#ifndef __attribute__
-#define __attribute__(x)
-#endif
-#endif
-
-/* General helper functions */
-extern void usage(const char *err) NORETURN;
-extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
-extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
-extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
-
-extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
-
-extern int prefixcmp(const char *str, const char *prefix);
-extern time_t tm_to_time_t(const struct tm *tm);
-
-static inline const char *skip_prefix(const char *str, const char *prefix)
-{
-	size_t len = strlen(prefix);
-	return strncmp(str, prefix, len) ? NULL : str + len;
-}
-
-#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
-
-#ifndef PROT_READ
-#define PROT_READ 1
-#define PROT_WRITE 2
-#define MAP_PRIVATE 1
-#define MAP_FAILED ((void*)-1)
-#endif
-
-#define mmap git_mmap
-#define munmap git_munmap
-extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
-extern int git_munmap(void *start, size_t length);
-
-#else /* NO_MMAP || USE_WIN32_MMAP */
-
-#include <sys/mman.h>
-
-#endif /* NO_MMAP || USE_WIN32_MMAP */
-
-#ifdef NO_MMAP
-
-/* This value must be multiple of (pagesize * 2) */
-#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
-
-#else /* NO_MMAP */
-
-/* This value must be multiple of (pagesize * 2) */
-#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
-	(sizeof(void*) >= 8 \
-		?  1 * 1024 * 1024 * 1024 \
-		: 32 * 1024 * 1024)
-
-#endif /* NO_MMAP */
-
-#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
-#define on_disk_bytes(st) ((st).st_size)
-#else
-#define on_disk_bytes(st) ((st).st_blocks * 512)
-#endif
-
-#define DEFAULT_PACKED_GIT_LIMIT \
-	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
-
-#ifdef NO_PREAD
-#define pread git_pread
-extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
-#endif
-/*
- * Forward decl that will remind us if its twin in cache.h changes.
- * This function is used in compat/pread.c.  But we can't include
- * cache.h there.
- */
-extern ssize_t read_in_full(int fd, void *buf, size_t count);
-
-#ifdef NO_SETENV
-#define setenv gitsetenv
-extern int gitsetenv(const char *, const char *, int);
-#endif
-
-#ifdef NO_MKDTEMP
-#define mkdtemp gitmkdtemp
-extern char *gitmkdtemp(char *);
-#endif
-
-#ifdef NO_UNSETENV
-#define unsetenv gitunsetenv
-extern void gitunsetenv(const char *);
-#endif
-
-#ifdef NO_STRCASESTR
-#define strcasestr gitstrcasestr
-extern char *gitstrcasestr(const char *haystack, const char *needle);
-#endif
-
-#ifdef NO_STRLCPY
-#define strlcpy gitstrlcpy
-extern size_t gitstrlcpy(char *, const char *, size_t);
-#endif
-
-#ifdef NO_STRTOUMAX
-#define strtoumax gitstrtoumax
-extern uintmax_t gitstrtoumax(const char *, char **, int);
-#endif
-
-#ifdef NO_HSTRERROR
-#define hstrerror githstrerror
-extern const char *githstrerror(int herror);
-#endif
-
-#ifdef NO_MEMMEM
-#define memmem gitmemmem
-void *gitmemmem(const void *haystack, size_t haystacklen,
-                const void *needle, size_t needlelen);
-#endif
-
-#ifdef FREAD_READS_DIRECTORIES
-#ifdef fopen
-#undef fopen
-#endif
-#define fopen(a,b) git_fopen(a,b)
-extern FILE *git_fopen(const char*, const char*);
-#endif
-
-#ifdef SNPRINTF_RETURNS_BOGUS
-#define snprintf git_snprintf
-extern int git_snprintf(char *str, size_t maxsize,
-			const char *format, ...);
-#define vsnprintf git_vsnprintf
-extern int git_vsnprintf(char *str, size_t maxsize,
-			 const char *format, va_list ap);
-#endif
-
-#ifdef __GLIBC_PREREQ
-#if __GLIBC_PREREQ(2, 1)
-#define HAVE_STRCHRNUL
-#endif
-#endif
-
-#ifndef HAVE_STRCHRNUL
-#define strchrnul gitstrchrnul
-static inline char *gitstrchrnul(const char *s, int c)
-{
-	while (*s && *s != c)
-		s++;
-	return (char *)s;
-}
-#endif
-
-/*
- * Wrappers:
- */
-extern char *xstrdup(const char *str);
-extern void *xmalloc(size_t size);
-extern void *xmemdupz(const void *data, size_t len);
-extern char *xstrndup(const char *str, size_t len);
-extern void *xrealloc(void *ptr, size_t size);
-extern void *xcalloc(size_t nmemb, size_t size);
-extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
-extern ssize_t xread(int fd, void *buf, size_t len);
-extern ssize_t xwrite(int fd, const void *buf, size_t len);
-extern int xdup(int fd);
-extern FILE *xfdopen(int fd, const char *mode);
-extern int xmkstemp(char *template);
-
-static inline size_t xsize_t(off_t len)
-{
-	return (size_t)len;
-}
-
-static inline int has_extension(const char *filename, const char *ext)
-{
-	size_t len = strlen(filename);
-	size_t extlen = strlen(ext);
-	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
-}
-
-/* Sane ctype - no locale, and works with signed chars */
-#undef isascii
-#undef isspace
-#undef isdigit
-#undef isalpha
-#undef isalnum
-#undef tolower
-#undef toupper
-extern unsigned char sane_ctype[256];
-#define GIT_SPACE 0x01
-#define GIT_DIGIT 0x02
-#define GIT_ALPHA 0x04
-#define GIT_GLOB_SPECIAL 0x08
-#define GIT_REGEX_SPECIAL 0x10
-#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
-#define isascii(x) (((x) & ~0x7f) == 0)
-#define isspace(x) sane_istest(x,GIT_SPACE)
-#define isdigit(x) sane_istest(x,GIT_DIGIT)
-#define isalpha(x) sane_istest(x,GIT_ALPHA)
-#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
-#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
-#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
-#define tolower(x) sane_case((unsigned char)(x), 0x20)
-#define toupper(x) sane_case((unsigned char)(x), 0)
-
-static inline int sane_case(int x, int high)
-{
-	if (sane_istest(x, GIT_ALPHA))
-		x = (x & ~0x20) | high;
-	return x;
-}
-
-static inline int strtoul_ui(char const *s, int base, unsigned int *result)
-{
-	unsigned long ul;
-	char *p;
-
-	errno = 0;
-	ul = strtoul(s, &p, base);
-	if (errno || *p || p == s || (unsigned int) ul != ul)
-		return -1;
-	*result = ul;
-	return 0;
-}
-
-static inline int strtol_i(char const *s, int base, int *result)
-{
-	long ul;
-	char *p;
-
-	errno = 0;
-	ul = strtol(s, &p, base);
-	if (errno || *p || p == s || (int) ul != ul)
-		return -1;
-	*result = ul;
-	return 0;
-}
-
-#ifdef INTERNAL_QSORT
-void git_qsort(void *base, size_t nmemb, size_t size,
-	       int(*compar)(const void *, const void *));
-#define qsort git_qsort
-#endif
-
-#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
-# define FORCE_DIR_SET_GID S_ISGID
-#else
-# define FORCE_DIR_SET_GID 0
-#endif
-
-#ifdef NO_NSEC
-#undef USE_NSEC
-#define ST_CTIME_NSEC(st) 0
-#define ST_MTIME_NSEC(st) 0
-#else
-#ifdef USE_ST_TIMESPEC
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
-#else
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
-#endif
-#endif
-
-#endif
diff --git a/Documentation/perf_counter/util/wrapper.c b/Documentation/perf_counter/util/wrapper.c
deleted file mode 100644
index 6350d65..0000000
--- a/Documentation/perf_counter/util/wrapper.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Various trivial helper wrappers around standard functions
- */
-#include "cache.h"
-
-/*
- * There's no pack memory to release - but stay close to the Git
- * version so wrap this away:
- */
-static inline void release_pack_memory(size_t size, int flag)
-{
-}
-
-char *xstrdup(const char *str)
-{
-	char *ret = strdup(str);
-	if (!ret) {
-		release_pack_memory(strlen(str) + 1, -1);
-		ret = strdup(str);
-		if (!ret)
-			die("Out of memory, strdup failed");
-	}
-	return ret;
-}
-
-void *xmalloc(size_t size)
-{
-	void *ret = malloc(size);
-	if (!ret && !size)
-		ret = malloc(1);
-	if (!ret) {
-		release_pack_memory(size, -1);
-		ret = malloc(size);
-		if (!ret && !size)
-			ret = malloc(1);
-		if (!ret)
-			die("Out of memory, malloc failed");
-	}
-#ifdef XMALLOC_POISON
-	memset(ret, 0xA5, size);
-#endif
-	return ret;
-}
-
-/*
- * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
- * "data" to the allocated memory, zero terminates the allocated memory,
- * and returns a pointer to the allocated memory. If the allocation fails,
- * the program dies.
- */
-void *xmemdupz(const void *data, size_t len)
-{
-	char *p = xmalloc(len + 1);
-	memcpy(p, data, len);
-	p[len] = '\0';
-	return p;
-}
-
-char *xstrndup(const char *str, size_t len)
-{
-	char *p = memchr(str, '\0', len);
-	return xmemdupz(str, p ? p - str : len);
-}
-
-void *xrealloc(void *ptr, size_t size)
-{
-	void *ret = realloc(ptr, size);
-	if (!ret && !size)
-		ret = realloc(ptr, 1);
-	if (!ret) {
-		release_pack_memory(size, -1);
-		ret = realloc(ptr, size);
-		if (!ret && !size)
-			ret = realloc(ptr, 1);
-		if (!ret)
-			die("Out of memory, realloc failed");
-	}
-	return ret;
-}
-
-void *xcalloc(size_t nmemb, size_t size)
-{
-	void *ret = calloc(nmemb, size);
-	if (!ret && (!nmemb || !size))
-		ret = calloc(1, 1);
-	if (!ret) {
-		release_pack_memory(nmemb * size, -1);
-		ret = calloc(nmemb, size);
-		if (!ret && (!nmemb || !size))
-			ret = calloc(1, 1);
-		if (!ret)
-			die("Out of memory, calloc failed");
-	}
-	return ret;
-}
-
-void *xmmap(void *start, size_t length,
-	int prot, int flags, int fd, off_t offset)
-{
-	void *ret = mmap(start, length, prot, flags, fd, offset);
-	if (ret == MAP_FAILED) {
-		if (!length)
-			return NULL;
-		release_pack_memory(length, fd);
-		ret = mmap(start, length, prot, flags, fd, offset);
-		if (ret == MAP_FAILED)
-			die("Out of memory? mmap failed: %s", strerror(errno));
-	}
-	return ret;
-}
-
-/*
- * xread() is the same a read(), but it automatically restarts read()
- * operations with a recoverable error (EAGAIN and EINTR). xread()
- * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
- */
-ssize_t xread(int fd, void *buf, size_t len)
-{
-	ssize_t nr;
-	while (1) {
-		nr = read(fd, buf, len);
-		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
-			continue;
-		return nr;
-	}
-}
-
-/*
- * xwrite() is the same a write(), but it automatically restarts write()
- * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
- * GUARANTEE that "len" bytes is written even if the operation is successful.
- */
-ssize_t xwrite(int fd, const void *buf, size_t len)
-{
-	ssize_t nr;
-	while (1) {
-		nr = write(fd, buf, len);
-		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
-			continue;
-		return nr;
-	}
-}
-
-ssize_t read_in_full(int fd, void *buf, size_t count)
-{
-	char *p = buf;
-	ssize_t total = 0;
-
-	while (count > 0) {
-		ssize_t loaded = xread(fd, p, count);
-		if (loaded <= 0)
-			return total ? total : loaded;
-		count -= loaded;
-		p += loaded;
-		total += loaded;
-	}
-
-	return total;
-}
-
-ssize_t write_in_full(int fd, const void *buf, size_t count)
-{
-	const char *p = buf;
-	ssize_t total = 0;
-
-	while (count > 0) {
-		ssize_t written = xwrite(fd, p, count);
-		if (written < 0)
-			return -1;
-		if (!written) {
-			errno = ENOSPC;
-			return -1;
-		}
-		count -= written;
-		p += written;
-		total += written;
-	}
-
-	return total;
-}
-
-int xdup(int fd)
-{
-	int ret = dup(fd);
-	if (ret < 0)
-		die("dup failed: %s", strerror(errno));
-	return ret;
-}
-
-FILE *xfdopen(int fd, const char *mode)
-{
-	FILE *stream = fdopen(fd, mode);
-	if (stream == NULL)
-		die("Out of memory? fdopen failed: %s", strerror(errno));
-	return stream;
-}
-
-int xmkstemp(char *template)
-{
-	int fd;
-
-	fd = mkstemp(template);
-	if (fd < 0)
-		die("Unable to create temporary file: %s", strerror(errno));
-	return fd;
-}
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
new file mode 100644
index 0000000..d69a759
--- /dev/null
+++ b/tools/perf/.gitignore
@@ -0,0 +1,16 @@
+PERF-BUILD-OPTIONS
+PERF-CFLAGS
+PERF-GUI-VARS
+PERF-VERSION-FILE
+perf
+perf-help
+perf-record
+perf-report
+perf-stat
+perf-top
+perf*.1
+perf*.xml
+common-cmds.h
+tags
+TAGS
+cscope*
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
new file mode 100644
index 0000000..5457192
--- /dev/null
+++ b/tools/perf/Documentation/Makefile
@@ -0,0 +1,300 @@
+MAN1_TXT= \
+	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+		$(wildcard perf-*.txt)) \
+	perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+DOC_HTML=$(MAN_HTML)
+
+ARTICLES =
+# with their own formatting rules.
+SP_ARTICLES =
+API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
+SP_ARTICLES += $(API_DOCS)
+SP_ARTICLES += technical/api-index
+
+DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+
+DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+prefix?=$(HOME)
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+# DESTDIR=
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA =
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+#
+# For asciidoc ...
+#	-7.1.2,	no extra settings are needed.
+#	8.0-,	set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#	1.69.0,		no extra settings are needed?
+#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
+#	1.71.1,		no extra settings are needed?
+#	1.72.0,		set DOCBOOK_XSL_172.
+#	1.73.0-,	set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+	ifdef ASCIIDOC_NO_ROFF
+	# docbook-xsl after 1.72 needs the regular XSL, but will not
+	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
+	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+	endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@;
+	QUIET_XMLTO	= @echo '   ' XMLTO $@;
+	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@;
+	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@;
+	QUIET_DBLATEX	= @echo '   ' DBLATEX $@;
+	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@;
+	QUIET_GEN	= @echo '   ' GEN $@;
+	QUIET_STDERR	= 2> /dev/null
+	QUIET_SUBDIR0	= +@subdir=
+	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			  $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+endif
+endif
+
+all: html man
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: perf.info perfman.info
+
+pdf: user-manual.pdf
+
+install: install-man
+
+install-man: man
+	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-info: info
+	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
+	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+	if test -r $(DESTDIR)$(infodir)/dir; then \
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+	else \
+	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+	fi
+
+install-pdf: pdf
+	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
+	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
+
+install-html: html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+
+-include ../PERF-VERSION-FILE
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+doc.dep : $(wildcard *.txt) build-docdep.perl
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+	mv $@+ $@
+
+-include doc.dep
+
+cmds_txt = cmds-ancillaryinterrogators.txt \
+	cmds-ancillarymanipulators.txt \
+	cmds-mainporcelain.txt \
+	cmds-plumbinginterrogators.txt \
+	cmds-plumbingmanipulators.txt \
+	cmds-synchingrepositories.txt \
+	cmds-synchelpers.txt \
+	cmds-purehelpers.txt \
+	cmds-foreignscminterface.txt
+
+$(cmds_txt): cmd-list.made
+
+cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+	$(QUIET_GEN)$(RM) $@ && \
+	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
+	date >$@
+
+clean:
+	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
+	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
+	$(RM) howto-index.txt howto/*.html doc.dep
+	$(RM) technical/api-*.html technical/api-index.txt
+	$(RM) $(cmds_txt) *.made
+
+$(MAN_HTML): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+%.1 %.5 %.7 : %.xml
+	$(QUIET_XMLTO)$(RM) $@ && \
+	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+%.xml : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+user-manual.html: user-manual.xml
+	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
+
+perf.info: user-manual.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+
+user-manual.texi: user-manual.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+user-manual.pdf: user-manual.xml
+	$(QUIET_DBLATEX)$(RM) $@+ $@ && \
+	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
+	mv $@+ $@
+
+perfman.texi: $(MAN_XML) cat-texi.perl
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+		--to-stdout $(xml) &&) true) > $@++ && \
+	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+perfman.info: perfman.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+	mv $@+ $@
+
+howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
+	mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+	mv $@+ $@
+
+install-webdoc : html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+quick-install: quick-install-man
+
+quick-install-man:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+quick-install-html:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+
+.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/tools/perf/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf
new file mode 100644
index 0000000..356b23a
--- /dev/null
+++ b/tools/perf/Documentation/asciidoc.conf
@@ -0,0 +1,91 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>); if section is defined, else just show
+# the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]
diff --git a/tools/perf/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl
new file mode 100644
index 0000000..b4d315c
--- /dev/null
+++ b/tools/perf/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl
new file mode 100644
index 0000000..a264fa61
--- /dev/null
+++ b/tools/perf/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB(',
+			      substring-after(@id,'-'),')',
+			      $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>sp&#10;</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB',
+			      substring-after(@arearefs,'-'),
+			      '. ',$git.docbook.backslash,'fR')"/>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl
new file mode 100644
index 0000000..608eb5d
--- /dev/null
+++ b/tools/perf/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fB</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl
new file mode 100644
index 0000000..a48f5b1
--- /dev/null
+++ b/tools/perf/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"	>.</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl
new file mode 100644
index 0000000..a63c763
--- /dev/null
+++ b/tools/perf/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
new file mode 100644
index 0000000..c9dcade
--- /dev/null
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -0,0 +1,29 @@
+perf-annotate(1)
+==============
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and display annotated code
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt
new file mode 100644
index 0000000..5143918
--- /dev/null
+++ b/tools/perf/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
+perf-help(1)
+============
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+	Prints all the available commands on the standard output. This
+	option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
new file mode 100644
index 0000000..8290b94
--- /dev/null
+++ b/tools/perf/Documentation/perf-list.txt
@@ -0,0 +1,25 @@
+perf-list(1)
+============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list'
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+None
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
new file mode 100644
index 0000000..1dbc1ee
--- /dev/null
+++ b/tools/perf/Documentation/perf-record.txt
@@ -0,0 +1,42 @@
+perf-record(1)
+==============
+
+NAME
+----
+perf-record - Run a command and record its profile into perf.data
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into perf.data - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
new file mode 100644
index 0000000..52d3fc6
--- /dev/null
+++ b/tools/perf/Documentation/perf-report.txt
@@ -0,0 +1,26 @@
+perf-report(1)
+==============
+
+NAME
+----
+perf-report - Read perf.data (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf report.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
new file mode 100644
index 0000000..c368a72
--- /dev/null
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -0,0 +1,66 @@
+perf-stat(1)
+============
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
+
+-i::
+--inherit::
+        child tasks inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+EXAMPLES
+--------
+
+$ perf stat -- make -j
+
+ Performance counter stats for 'make -j':
+
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
+
+ Wall-clock time elapsed:   719.554352 msecs
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
new file mode 100644
index 0000000..539d012
--- /dev/null
+++ b/tools/perf/Documentation/perf-top.txt
@@ -0,0 +1,39 @@
+perf-top(1)
+===========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
new file mode 100644
index 0000000..69c8325
--- /dev/null
+++ b/tools/perf/Documentation/perf.txt
@@ -0,0 +1,24 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are are a new kernel-based subsystem
+that provide a framework for all things performance analysis. It
+covers hardware level (CPU/PMU, Performance Monitoring Unit) features
+and software features (software counters, tracepoints) as well.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
new file mode 100644
index 0000000..0cbd5d6
--- /dev/null
+++ b/tools/perf/Makefile
@@ -0,0 +1,929 @@
+# The default target of this Makefile is...
+all::
+
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
+# or vsnprintf() return -1 instead of number of characters which would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
+# when attempting to read from an fopen'ed directory.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.  These tests take up a significant amount of the total test time
+# but are not needed unless you plan to talk to SVN repos.
+#
+# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
+# installed in /sw, but don't want PERF to link against any libraries
+# installed there.  If defined you may specify your own (or Fink's)
+# include directories and library directories by defining CFLAGS
+# and LDFLAGS appropriately.
+#
+# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
+# have DarwinPorts installed in /opt/local, but don't want PERF to
+# link against any libraries installed there.  If defined you may
+# specify your own (or DarwinPort's) include directories and
+# library directories by defining CFLAGS and LDFLAGS appropriately.
+#
+# Define PPC_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for PowerPC.
+#
+# Define ARM_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for ARM.
+#
+# Define MOZILLA_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
+# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
+# choice) has very fast version optimized for i586.
+#
+# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
+#
+# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
+#
+# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
+# Patrick Mauritz).
+#
+# Define NO_MMAP if you want to avoid mmap.
+#
+# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
+#
+# Define NO_PREAD if you have a problem with pread() system call (e.g.
+# cygwin.dll before v1.5.22).
+#
+# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
+# generally faster on your platform than accessing the working directory.
+#
+# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
+# the executable mode bit, but doesn't really do so.
+#
+# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
+#
+# Define NO_SOCKADDR_STORAGE if your platform does not have struct
+# sockaddr_storage.
+#
+# Define NO_ICONV if your libc does not properly support iconv.
+#
+# Define OLD_ICONV if your library has an old iconv(), where the second
+# (input buffer pointer) parameter is declared with type (const char **).
+#
+# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+#
+# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
+# that tells runtime paths to dynamic libraries;
+# "-Wl,-rpath=/path/lib" is used instead.
+#
+# Define USE_NSEC below if you want perf to care about sub-second file mtimes
+# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
+# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
+# randomly break unless your underlying filesystem supports those sub-second
+# times (my ext3 doesn't).
+#
+# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
+# "st_ctim"
+#
+# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
+# available.  This automatically turns USE_NSEC off.
+#
+# Define USE_STDEV below if you want perf to care about the underlying device
+# change being considered an inode change from the update-index perspective.
+#
+# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
+# field that counts the on-disk footprint in 512-byte blocks.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
+# MakeMaker (e.g. using ActiveState under Cygwin).
+#
+# Define NO_PERL if you do not want Perl scripts or libraries at all.
+#
+# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
+# is a simplified version of the merge sort used in glibc. This is
+# recommended if Git triggers O(n^2) behavior in your platform's qsort().
+#
+# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
+# your external grep (e.g., if your system lacks grep, if its grep is
+# broken, or spawning external process is slower than built-in grep perf has).
+
+PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	@$(SHELL_PATH) util/PERF-VERSION-GEN
+-include PERF-VERSION-FILE
+
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
+uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
+uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
+uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
+uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
+
+# CFLAGS and LDFLAGS are for the users to override from the command line.
+
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+LDFLAGS = -lpthread -lrt -lelf
+ALL_CFLAGS = $(CFLAGS)
+ALL_LDFLAGS = $(LDFLAGS)
+STRIP ?= strip
+
+# Among the variables below, these:
+#   perfexecdir
+#   template_dir
+#   mandir
+#   infodir
+#   htmldir
+#   ETC_PERFCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "perf" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+perfexecdir = libexec/perf-core
+sharedir = $(prefix)/share
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_PERFCONFIG = etc/perfconfig
+endif
+lib = lib
+# DESTDIR=
+
+export prefix bindir sharedir sysconfdir
+
+CC = gcc
+AR = ar
+RM = rm -f
+TAR = tar
+FIND = find
+INSTALL = install
+RPMBUILD = rpmbuild
+PTHREAD_LIBS = -lpthread
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours..
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+
+
+### --- END CONFIGURATION SECTION ---
+
+# Those must not be GNU-specific; they are shared with perl/ which may
+# be built by a different compiler. (Note that this is an artifact now
+# but it still might be nice to keep that distinction.)
+BASIC_CFLAGS =
+BASIC_LDFLAGS =
+
+# Guard against environment variables
+BUILTIN_OBJS =
+BUILT_INS =
+COMPAT_CFLAGS =
+COMPAT_OBJS =
+LIB_H =
+LIB_OBJS =
+SCRIPT_PERL =
+SCRIPT_SH =
+TEST_PROGRAMS =
+
+#
+# No scripts right now:
+#
+
+# SCRIPT_SH += perf-am.sh
+
+#
+# No Perl scripts right now:
+#
+
+# SCRIPT_PERL += perf-add--interactive.perl
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
+	  $(patsubst %.perl,%,$(SCRIPT_PERL))
+
+# Empty...
+EXTRA_PROGRAMS =
+
+# ... and all the rest that could be moved out of bindir to perfexecdir
+PROGRAMS += $(EXTRA_PROGRAMS)
+
+#
+# Single 'perf' binary right now:
+#
+PROGRAMS += perf
+
+# List built-in command $C whose implementation cmd_$C() is not in
+# builtin-$C.o but is linked in as part of some other command.
+#
+# None right now:
+#
+# BUILT_INS += perf-init $X
+
+# what 'all' will build and 'install' will install, in perfexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = perf$X
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+	SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+LIB_FILE=libperf.a
+
+LIB_H += ../../include/linux/perf_counter.h
+LIB_H += perf.h
+LIB_H += util/list.h
+LIB_H += util/rbtree.h
+LIB_H += util/levenshtein.h
+LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/help.h
+LIB_H += util/strbuf.h
+LIB_H += util/string.h
+LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
+LIB_H += util/symbol.h
+LIB_H += util/color.h
+
+LIB_OBJS += util/abspath.o
+LIB_OBJS += util/alias.o
+LIB_OBJS += util/config.o
+LIB_OBJS += util/ctype.o
+LIB_OBJS += util/environment.o
+LIB_OBJS += util/exec_cmd.o
+LIB_OBJS += util/help.o
+LIB_OBJS += util/levenshtein.o
+LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/parse-events.o
+LIB_OBJS += util/path.o
+LIB_OBJS += util/rbtree.o
+LIB_OBJS += util/run-command.o
+LIB_OBJS += util/quote.o
+LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/string.o
+LIB_OBJS += util/usage.o
+LIB_OBJS += util/wrapper.o
+LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/symbol.o
+LIB_OBJS += util/color.o
+LIB_OBJS += util/pager.o
+
+BUILTIN_OBJS += builtin-annotate.o
+BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-list.o
+BUILTIN_OBJS += builtin-record.o
+BUILTIN_OBJS += builtin-report.o
+BUILTIN_OBJS += builtin-stat.o
+BUILTIN_OBJS += builtin-top.o
+
+PERFLIBS = $(LIB_FILE)
+EXTLIBS =
+
+#
+# Platform specific tweaks
+#
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+-include config.mak.autogen
+-include config.mak
+
+ifeq ($(uname_S),Darwin)
+	ifndef NO_FINK
+		ifeq ($(shell test -d /sw/lib && echo y),y)
+			BASIC_CFLAGS += -I/sw/include
+			BASIC_LDFLAGS += -L/sw/lib
+		endif
+	endif
+	ifndef NO_DARWIN_PORTS
+		ifeq ($(shell test -d /opt/local/lib && echo y),y)
+			BASIC_CFLAGS += -I/opt/local/include
+			BASIC_LDFLAGS += -L/opt/local/lib
+		endif
+	endif
+	PTHREAD_LIBS =
+endif
+
+ifndef CC_LD_DYNPATH
+	ifdef NO_R_TO_GCC_LINKER
+		# Some gcc does not accept and pass -R to the linker to specify
+		# the runtime dynamic library path.
+		CC_LD_DYNPATH = -Wl,-rpath,
+	else
+		CC_LD_DYNPATH = -R
+	endif
+endif
+
+ifdef ZLIB_PATH
+	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
+	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
+endif
+EXTLIBS += -lz
+
+ifdef NEEDS_SOCKET
+	EXTLIBS += -lsocket
+endif
+ifdef NEEDS_NSL
+	EXTLIBS += -lnsl
+endif
+ifdef NO_D_TYPE_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
+endif
+ifdef NO_D_INO_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
+endif
+ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
+endif
+ifdef USE_NSEC
+	BASIC_CFLAGS += -DUSE_NSEC
+endif
+ifdef USE_ST_TIMESPEC
+	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
+endif
+ifdef NO_NSEC
+	BASIC_CFLAGS += -DNO_NSEC
+endif
+ifdef NO_C99_FORMAT
+	BASIC_CFLAGS += -DNO_C99_FORMAT
+endif
+ifdef SNPRINTF_RETURNS_BOGUS
+	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
+	COMPAT_OBJS += compat/snprintf.o
+endif
+ifdef FREAD_READS_DIRECTORIES
+	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
+	COMPAT_OBJS += compat/fopen.o
+endif
+ifdef NO_SYMLINK_HEAD
+	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
+endif
+ifdef NO_STRCASESTR
+	COMPAT_CFLAGS += -DNO_STRCASESTR
+	COMPAT_OBJS += compat/strcasestr.o
+endif
+ifdef NO_STRTOUMAX
+	COMPAT_CFLAGS += -DNO_STRTOUMAX
+	COMPAT_OBJS += compat/strtoumax.o
+endif
+ifdef NO_STRTOULL
+	COMPAT_CFLAGS += -DNO_STRTOULL
+endif
+ifdef NO_SETENV
+	COMPAT_CFLAGS += -DNO_SETENV
+	COMPAT_OBJS += compat/setenv.o
+endif
+ifdef NO_MKDTEMP
+	COMPAT_CFLAGS += -DNO_MKDTEMP
+	COMPAT_OBJS += compat/mkdtemp.o
+endif
+ifdef NO_UNSETENV
+	COMPAT_CFLAGS += -DNO_UNSETENV
+	COMPAT_OBJS += compat/unsetenv.o
+endif
+ifdef NO_SYS_SELECT_H
+	BASIC_CFLAGS += -DNO_SYS_SELECT_H
+endif
+ifdef NO_MMAP
+	COMPAT_CFLAGS += -DNO_MMAP
+	COMPAT_OBJS += compat/mmap.o
+else
+	ifdef USE_WIN32_MMAP
+		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
+		COMPAT_OBJS += compat/win32mmap.o
+	endif
+endif
+ifdef NO_PREAD
+	COMPAT_CFLAGS += -DNO_PREAD
+	COMPAT_OBJS += compat/pread.o
+endif
+ifdef NO_FAST_WORKING_DIRECTORY
+	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
+endif
+ifdef NO_TRUSTABLE_FILEMODE
+	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
+endif
+ifdef NO_IPV6
+	BASIC_CFLAGS += -DNO_IPV6
+endif
+ifdef NO_UINTMAX_T
+	BASIC_CFLAGS += -Duintmax_t=uint32_t
+endif
+ifdef NO_SOCKADDR_STORAGE
+ifdef NO_IPV6
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
+else
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
+endif
+endif
+ifdef NO_INET_NTOP
+	LIB_OBJS += compat/inet_ntop.o
+endif
+ifdef NO_INET_PTON
+	LIB_OBJS += compat/inet_pton.o
+endif
+
+ifdef NO_ICONV
+	BASIC_CFLAGS += -DNO_ICONV
+endif
+
+ifdef OLD_ICONV
+	BASIC_CFLAGS += -DOLD_ICONV
+endif
+
+ifdef NO_DEFLATE_BOUND
+	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
+endif
+
+ifdef PPC_SHA1
+	SHA1_HEADER = "ppc/sha1.h"
+	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
+ifdef ARM_SHA1
+	SHA1_HEADER = "arm/sha1.h"
+	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+else
+ifdef MOZILLA_SHA1
+	SHA1_HEADER = "mozilla-sha1/sha1.h"
+	LIB_OBJS += mozilla-sha1/sha1.o
+else
+	SHA1_HEADER = <openssl/sha.h>
+	EXTLIBS += $(LIB_4_CRYPTO)
+endif
+endif
+endif
+ifdef NO_PERL_MAKEMAKER
+	export NO_PERL_MAKEMAKER
+endif
+ifdef NO_HSTRERROR
+	COMPAT_CFLAGS += -DNO_HSTRERROR
+	COMPAT_OBJS += compat/hstrerror.o
+endif
+ifdef NO_MEMMEM
+	COMPAT_CFLAGS += -DNO_MEMMEM
+	COMPAT_OBJS += compat/memmem.o
+endif
+ifdef INTERNAL_QSORT
+	COMPAT_CFLAGS += -DINTERNAL_QSORT
+	COMPAT_OBJS += compat/qsort.o
+endif
+ifdef RUNTIME_PREFIX
+	COMPAT_CFLAGS += -DRUNTIME_PREFIX
+endif
+
+ifdef DIR_HAS_BSD_GROUP_SEMANTICS
+	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
+endif
+ifdef NO_EXTERNAL_GREP
+	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
+endif
+
+ifeq ($(PERL_PATH),)
+NO_PERL=NoThanks
+endif
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_CC       = @echo '   ' CC $@;
+	QUIET_AR       = @echo '   ' AR $@;
+	QUIET_LINK     = @echo '   ' LINK $@;
+	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
+	QUIET_GEN      = @echo '   ' GEN $@;
+	QUIET_SUBDIR0  = +@subdir=
+	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			 $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+	export QUIET_GEN
+	export QUIET_BUILT_IN
+endif
+endif
+
+ifdef ASCIIDOC8
+	export ASCIIDOC8
+endif
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+
+SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
+
+LIBS = $(PERFLIBS) $(EXTLIBS)
+
+BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
+	$(COMPAT_CFLAGS)
+LIB_OBJS += $(COMPAT_OBJS)
+
+ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_LDFLAGS += $(BASIC_LDFLAGS)
+
+export TAR INSTALL DESTDIR SHELL_PATH
+
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+endif
+
+all::
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+	@$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) perf$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
+
+perf.o: perf.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		$(ALL_CFLAGS) -c $(filter %.c,$^)
+
+perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
+		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+
+builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
+		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(BUILT_INS): perf$X
+	$(QUIET_BUILT_IN)$(RM) $@ && \
+	ln perf$X $@ 2>/dev/null || \
+	ln -s perf$X $@ 2>/dev/null || \
+	cp perf$X $@
+
+common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+common-cmds.h: $(wildcard Documentation/perf-*.txt)
+	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    $@.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+configure: configure.ac
+	$(QUIET_GEN)$(RM) $@ $<+ && \
+	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+	    $< > $<+ && \
+	autoconf -o $@ $<+ && \
+	$(RM) $<+
+
+# These can record PERF_VERSION
+perf.o perf.spec \
+	$(patsubst %.sh,%,$(SCRIPT_SH)) \
+	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	: PERF-VERSION-FILE
+
+%.o: %.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+%.s: %.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+%.o: %.S
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+
+util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
+		'-DBINDIR="$(bindir_relative_SQ)"' \
+		'-DPREFIX="$(prefix_SQ)"' \
+		$<
+
+builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+
+util/config.o: util/config.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+perf-%$X: %.o $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+builtin-revert.o wt-status.o: wt-status.h
+
+$(LIB_FILE): $(LIB_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
+TAGS:
+	$(RM) TAGS
+	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+	$(RM) tags
+	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+	$(RM) cscope*
+	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+PERF-CFLAGS: .FORCE-PERF-CFLAGS
+	@FLAGS='$(TRACK_CFLAGS)'; \
+	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
+		echo 1>&2 "    * new build flags or prefix"; \
+		echo "$$FLAGS" >PERF-CFLAGS; \
+            fi
+
+# We need to apply sq twice, once to protect from the shell
+# that runs PERF-BUILD-OPTIONS, and then again to protect it
+# and the first level quoting from the shell that runs "echo".
+PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
+	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
+	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
+	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
+	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
+
+### Testing rules
+
+#
+# None right now:
+#
+# TEST_PROGRAMS += test-something$X
+
+all:: $(TEST_PROGRAMS)
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+export NO_SVN_TESTS
+
+check: common-cmds.h
+	if sparse; \
+	then \
+		for i in *.c */*.c; \
+		do \
+			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+		done; \
+	else \
+		echo 2>&1 "Did you mean 'make test'?"; \
+		exit 1; \
+	fi
+
+remove-dashes:
+	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
+
+### Installation rules
+
+ifneq ($(filter /%,$(firstword $(template_dir))),)
+template_instdir = $(template_dir)
+else
+template_instdir = $(prefix)/$(template_dir)
+endif
+export template_instdir
+
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
+else
+perfexec_instdir = $(prefix)/$(perfexecdir)
+endif
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+export perfexec_instdir
+
+install: all
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+ifdef BUILT_INS
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
+endif
+endif
+
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
+### Maintainer's dist rules
+#
+# None right now
+#
+#
+# perf.spec: perf.spec.in
+#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
+#	mv $@+ $@
+#
+# PERF_TARNAME=perf-$(PERF_VERSION)
+# dist: perf.spec perf-archive$(X) configure
+#	./perf-archive --format=tar \
+#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+#	@mkdir -p $(PERF_TARNAME)
+#	@cp perf.spec configure $(PERF_TARNAME)
+#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+#	$(TAR) rf $(PERF_TARNAME).tar \
+#		$(PERF_TARNAME)/perf.spec \
+#		$(PERF_TARNAME)/configure \
+#		$(PERF_TARNAME)/version
+#	@$(RM) -r $(PERF_TARNAME)
+#	gzip -f -9 $(PERF_TARNAME).tar
+#
+# htmldocs = perf-htmldocs-$(PERF_VERSION)
+# manpages = perf-manpages-$(PERF_VERSION)
+# dist-doc:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir .doc-tmp-dir
+#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+#	gzip -n -9 -f $(htmldocs).tar
+#	:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+#	$(MAKE) -C Documentation DESTDIR=./ \
+#		man1dir=../.doc-tmp-dir/man1 \
+#		man5dir=../.doc-tmp-dir/man5 \
+#		man7dir=../.doc-tmp-dir/man7 \
+#		install
+#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+#	gzip -n -9 -f $(manpages).tar
+#	$(RM) -r .doc-tmp-dir
+#
+# rpm: dist
+#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
+
+### Cleaning rules
+
+distclean: clean
+#	$(RM) configure
+
+clean:
+	$(RM) *.o */*.o $(LIB_FILE)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
+	$(RM) $(TEST_PROGRAMS)
+	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+	$(RM) -r autom4te.cache
+	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
+	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
+	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
+	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
+
+.PHONY: all install clean strip
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: .FORCE-PERF-BUILD-OPTIONS
+
+### Make sure built-ins do not have dups and listed in perf.c
+#
+check-builtins::
+	./check-builtins.sh
+
+### Test suite coverage testing
+#
+# None right now
+#
+# .PHONY: coverage coverage-clean coverage-build coverage-report
+#
+# coverage:
+#	$(MAKE) coverage-build
+#	$(MAKE) coverage-report
+#
+# coverage-clean:
+#	rm -f *.gcda *.gcno
+#
+# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+#
+# coverage-build: coverage-clean
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+#		-j1 test
+#
+# coverage-report:
+#	gcov -b *.c */*.c
+#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
+#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+#		| tee coverage-untested-functions
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
new file mode 100644
index 0000000..116a397
--- /dev/null
+++ b/tools/perf/builtin-annotate.c
@@ -0,0 +1,1355 @@
+/*
+ * builtin-annotate.c
+ *
+ * Builtin annotate command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
+
+static char		default_sort_order[] = "comm,symbol";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	struct dso	 *dso;
+};
+
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+/*
+ * collect histogram counts
+ */
+static void hist_hit(struct hist_entry *he, uint64_t ip)
+{
+	unsigned int sym_size, offset;
+	struct symbol *sym = he->sym;
+
+	he->count++;
+
+	if (!sym || !sym->hist)
+		return;
+
+	sym_size = sym->end - sym->start;
+	offset = ip - sym->start;
+
+	if (offset >= sym_size)
+		return;
+
+	sym->hist_sum++;
+	sym->hist[offset]++;
+
+	if (verbose >= 3)
+		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
+			(void *)he->sym->start,
+			he->sym->name,
+			(void *)ip, ip - he->sym->start,
+			sym->hist[offset]);
+}
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			hist_hit(he, ip);
+
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total++;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
+{
+	char *line = NULL, *tmp, *tmp2;
+	unsigned int offset;
+	size_t line_len;
+	__u64 line_ip;
+	int ret;
+	char *c;
+
+	if (getline(&line, &line_len, file) < 0)
+		return -1;
+	if (!line)
+		return -1;
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+	offset = 0;
+	ret = -2;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hexa addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		unsigned int hits = 0;
+		double percent = 0.0;
+		char *color = PERF_COLOR_NORMAL;
+
+		offset = line_ip - start;
+		if (offset < len)
+			hits = sym->hist[offset];
+
+		if (sym->hist_sum)
+			percent = 100.0 * hits / sym->hist_sum;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		else {
+			if (percent > 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		color_fprintf(stdout, color, " %7.2f", percent);
+		printf(" :	");
+		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
+	} else {
+		if (!*line)
+			printf("         :\n");
+		else
+			printf("         :	%s\n", line);
+	}
+
+	return 0;
+}
+
+static void annotate_sym(struct dso *dso, struct symbol *sym)
+{
+	char *filename = dso->name;
+	uint64_t start, end, len;
+	char command[PATH_MAX*2];
+	FILE *file;
+
+	if (!filename)
+		return;
+	if (dso == kernel_dso)
+		filename = vmlinux;
+
+	printf("\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
+	start = sym->obj_start;
+	if (!start)
+		start = sym->start;
+
+	end = start + sym->end - sym->start + 1;
+	len = sym->end - sym->start;
+
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+
+	if (verbose >= 3)
+		printf("doing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		if (parse_line(file, sym, start, len) < 0)
+			break;
+	}
+
+	pclose(file);
+}
+
+static void find_annotations(void)
+{
+	struct rb_node *nd;
+	struct dso *dso;
+	int count = 0;
+
+	list_for_each_entry(dso, &dsos, node) {
+
+		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
+			struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+
+			if (sym->hist) {
+				annotate_sym(dso, sym);
+				count++;
+			}
+		}
+	}
+
+	if (!count)
+		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
+}
+
+static int __cmd_annotate(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+
+	find_annotations();
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf annotate [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
+		    "symbol to annotate"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(annotate_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_annotate(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+
+	setup_sorting();
+
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+
+		sym_hist_filter = argv[0];
+	}
+
+	if (!sym_hist_filter)
+		usage_with_options(annotate_usage, options);
+
+	setup_pager();
+
+	return __cmd_annotate();
+}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
new file mode 100644
index 0000000..0f32dc3
--- /dev/null
+++ b/tools/perf/builtin-help.c
@@ -0,0 +1,461 @@
+/*
+ * builtin-help.c
+ *
+ * Builtin help command
+ */
+#include "util/cache.h"
+#include "builtin.h"
+#include "util/exec_cmd.h"
+#include "common-cmds.h"
+#include "util/parse-options.h"
+#include "util/run-command.h"
+#include "util/help.h"
+
+static struct man_viewer_list {
+	struct man_viewer_list *next;
+	char name[FLEX_ARRAY];
+} *man_viewer_list;
+
+static struct man_viewer_info_list {
+	struct man_viewer_info_list *next;
+	const char *info;
+	char name[FLEX_ARRAY];
+} *man_viewer_info_list;
+
+enum help_format {
+	HELP_FORMAT_MAN,
+	HELP_FORMAT_INFO,
+	HELP_FORMAT_WEB,
+};
+
+static int show_all = 0;
+static enum help_format help_format = HELP_FORMAT_MAN;
+static struct option builtin_help_options[] = {
+	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+			HELP_FORMAT_WEB),
+	OPT_SET_INT('i', "info", &help_format, "show info page",
+			HELP_FORMAT_INFO),
+	OPT_END(),
+};
+
+static const char * const builtin_help_usage[] = {
+	"perf help [--all] [--man|--web|--info] [command]",
+	NULL
+};
+
+static enum help_format parse_help_format(const char *format)
+{
+	if (!strcmp(format, "man"))
+		return HELP_FORMAT_MAN;
+	if (!strcmp(format, "info"))
+		return HELP_FORMAT_INFO;
+	if (!strcmp(format, "web") || !strcmp(format, "html"))
+		return HELP_FORMAT_WEB;
+	die("unrecognized help format '%s'", format);
+}
+
+static const char *get_man_viewer_info(const char *name)
+{
+	struct man_viewer_info_list *viewer;
+
+	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
+	{
+		if (!strcasecmp(name, viewer->name))
+			return viewer->info;
+	}
+	return NULL;
+}
+
+static int check_emacsclient_version(void)
+{
+	struct strbuf buffer = STRBUF_INIT;
+	struct child_process ec_process;
+	const char *argv_ec[] = { "emacsclient", "--version", NULL };
+	int version;
+
+	/* emacsclient prints its version number on stderr */
+	memset(&ec_process, 0, sizeof(ec_process));
+	ec_process.argv = argv_ec;
+	ec_process.err = -1;
+	ec_process.stdout_to_stderr = 1;
+	if (start_command(&ec_process)) {
+		fprintf(stderr, "Failed to start emacsclient.\n");
+		return -1;
+	}
+	strbuf_read(&buffer, ec_process.err, 20);
+	close(ec_process.err);
+
+	/*
+	 * Don't bother checking return value, because "emacsclient --version"
+	 * seems to always exits with code 1.
+	 */
+	finish_command(&ec_process);
+
+	if (prefixcmp(buffer.buf, "emacsclient")) {
+		fprintf(stderr, "Failed to parse emacsclient version.\n");
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_remove(&buffer, 0, strlen("emacsclient"));
+	version = atoi(buffer.buf);
+
+	if (version < 22) {
+		fprintf(stderr,
+			"emacsclient version '%d' too old (< 22).\n",
+			version);
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_release(&buffer);
+	return 0;
+}
+
+static void exec_woman_emacs(const char* path, const char *page)
+{
+	if (!check_emacsclient_version()) {
+		/* This works only with emacsclient version >= 22. */
+		struct strbuf man_page = STRBUF_INIT;
+
+		if (!path)
+			path = "emacsclient";
+		strbuf_addf(&man_page, "(woman \"%s\")", page);
+		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_konqueror(const char* path, const char *page)
+{
+	const char *display = getenv("DISPLAY");
+	if (display && *display) {
+		struct strbuf man_page = STRBUF_INIT;
+		const char *filename = "kfmclient";
+
+		/* It's simpler to launch konqueror using kfmclient. */
+		if (path) {
+			const char *file = strrchr(path, '/');
+			if (file && !strcmp(file + 1, "konqueror")) {
+				char *new = strdup(path);
+				char *dest = strrchr(new, '/');
+
+				/* strlen("konqueror") == strlen("kfmclient") */
+				strcpy(dest + 1, "kfmclient");
+				path = new;
+			}
+			if (file)
+				filename = file;
+		} else
+			path = "kfmclient";
+		strbuf_addf(&man_page, "man:%s(1)", page);
+		execlp(path, filename, "newTab", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_man(const char* path, const char *page)
+{
+	if (!path)
+		path = "man";
+	execlp(path, "man", page, NULL);
+	warning("failed to exec '%s': %s", path, strerror(errno));
+}
+
+static void exec_man_cmd(const char *cmd, const char *page)
+{
+	struct strbuf shell_cmd = STRBUF_INIT;
+	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
+	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
+	warning("failed to exec '%s': %s", cmd, strerror(errno));
+}
+
+static void add_man_viewer(const char *name)
+{
+	struct man_viewer_list **p = &man_viewer_list;
+	size_t len = strlen(name);
+
+	while (*p)
+		p = &((*p)->next);
+	*p = calloc(1, (sizeof(**p) + len + 1));
+	strncpy((*p)->name, name, len);
+}
+
+static int supported_man_viewer(const char *name, size_t len)
+{
+	return (!strncasecmp("man", name, len) ||
+		!strncasecmp("woman", name, len) ||
+		!strncasecmp("konqueror", name, len));
+}
+
+static void do_add_man_viewer_info(const char *name,
+				   size_t len,
+				   const char *value)
+{
+	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+
+	strncpy(new->name, name, len);
+	new->info = strdup(value);
+	new->next = man_viewer_info_list;
+	man_viewer_info_list = new;
+}
+
+static int add_man_viewer_path(const char *name,
+			       size_t len,
+			       const char *value)
+{
+	if (supported_man_viewer(name, len))
+		do_add_man_viewer_info(name, len, value);
+	else
+		warning("'%s': path for unsupported man viewer.\n"
+			"Please consider using 'man.<tool>.cmd' instead.",
+			name);
+
+	return 0;
+}
+
+static int add_man_viewer_cmd(const char *name,
+			      size_t len,
+			      const char *value)
+{
+	if (supported_man_viewer(name, len))
+		warning("'%s': cmd for supported man viewer.\n"
+			"Please consider using 'man.<tool>.path' instead.",
+			name);
+	else
+		do_add_man_viewer_info(name, len, value);
+
+	return 0;
+}
+
+static int add_man_viewer_info(const char *var, const char *value)
+{
+	const char *name = var + 4;
+	const char *subkey = strrchr(name, '.');
+
+	if (!subkey)
+		return error("Config with no key for man viewer: %s", name);
+
+	if (!strcmp(subkey, ".path")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_path(name, subkey - name, value);
+	}
+	if (!strcmp(subkey, ".cmd")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_cmd(name, subkey - name, value);
+	}
+
+	warning("'%s': unsupported man viewer sub key.", subkey);
+	return 0;
+}
+
+static int perf_help_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.format")) {
+		if (!value)
+			return config_error_nonbool(var);
+		help_format = parse_help_format(value);
+		return 0;
+	}
+	if (!strcmp(var, "man.viewer")) {
+		if (!value)
+			return config_error_nonbool(var);
+		add_man_viewer(value);
+		return 0;
+	}
+	if (!prefixcmp(var, "man."))
+		return add_man_viewer_info(var, value);
+
+	return perf_default_config(var, value, cb);
+}
+
+static struct cmdnames main_cmds, other_cmds;
+
+void list_common_cmds_help(void)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts(" The most commonly used perf commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %s   ", common_cmds[i].name);
+		mput_char(' ', longest - strlen(common_cmds[i].name));
+		puts(common_cmds[i].help);
+	}
+}
+
+static int is_perf_command(const char *s)
+{
+	return is_in_cmdlist(&main_cmds, s) ||
+		is_in_cmdlist(&other_cmds, s);
+}
+
+static const char *prepend(const char *prefix, const char *cmd)
+{
+	size_t pre_len = strlen(prefix);
+	size_t cmd_len = strlen(cmd);
+	char *p = malloc(pre_len + cmd_len + 1);
+	memcpy(p, prefix, pre_len);
+	strcpy(p + pre_len, cmd);
+	return p;
+}
+
+static const char *cmd_to_page(const char *perf_cmd)
+{
+	if (!perf_cmd)
+		return "perf";
+	else if (!prefixcmp(perf_cmd, "perf"))
+		return perf_cmd;
+	else if (is_perf_command(perf_cmd))
+		return prepend("perf-", perf_cmd);
+	else
+		return prepend("perf-", perf_cmd);
+}
+
+static void setup_man_path(void)
+{
+	struct strbuf new_path = STRBUF_INIT;
+	const char *old_path = getenv("MANPATH");
+
+	/* We should always put ':' after our path. If there is no
+	 * old_path, the ':' at the end will let 'man' to try
+	 * system-wide paths after ours to find the manual page. If
+	 * there is old_path, we need ':' as delimiter. */
+	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
+	strbuf_addch(&new_path, ':');
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+
+	setenv("MANPATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+static void exec_viewer(const char *name, const char *page)
+{
+	const char *info = get_man_viewer_info(name);
+
+	if (!strcasecmp(name, "man"))
+		exec_man_man(info, page);
+	else if (!strcasecmp(name, "woman"))
+		exec_woman_emacs(info, page);
+	else if (!strcasecmp(name, "konqueror"))
+		exec_man_konqueror(info, page);
+	else if (info)
+		exec_man_cmd(info, page);
+	else
+		warning("'%s': unknown man viewer.", name);
+}
+
+static void show_man_page(const char *perf_cmd)
+{
+	struct man_viewer_list *viewer;
+	const char *page = cmd_to_page(perf_cmd);
+	const char *fallback = getenv("PERF_MAN_VIEWER");
+
+	setup_man_path();
+	for (viewer = man_viewer_list; viewer; viewer = viewer->next)
+	{
+		exec_viewer(viewer->name, page); /* will return when unable */
+	}
+	if (fallback)
+		exec_viewer(fallback, page);
+	exec_viewer("man", page);
+	die("no man viewer handled the request");
+}
+
+static void show_info_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
+	execlp("info", "info", "perfman", page, NULL);
+}
+
+static void get_html_page_path(struct strbuf *page_path, const char *page)
+{
+	struct stat st;
+	const char *html_path = system_path(PERF_HTML_PATH);
+
+	/* Check that we have a perf documentation directory. */
+	if (stat(mkpath("%s/perf.html", html_path), &st)
+	    || !S_ISREG(st.st_mode))
+		die("'%s': not a documentation directory.", html_path);
+
+	strbuf_init(page_path, 0);
+	strbuf_addf(page_path, "%s/%s.html", html_path, page);
+}
+
+/*
+ * If open_html is not defined in a platform-specific way (see for
+ * example compat/mingw.h), we use the script web--browse to display
+ * HTML.
+ */
+#ifndef open_html
+static void open_html(const char *path)
+{
+	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+}
+#endif
+
+static void show_html_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	struct strbuf page_path; /* it leaks but we exec bellow */
+
+	get_html_page_path(&page_path, page);
+
+	open_html(page_path.buf);
+}
+
+int cmd_help(int argc, const char **argv, const char *prefix)
+{
+	const char *alias;
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	perf_config(perf_help_config, NULL);
+
+	argc = parse_options(argc, argv, builtin_help_options,
+			builtin_help_usage, 0);
+
+	if (show_all) {
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_commands("perf commands", &main_cmds, &other_cmds);
+		printf(" %s\n\n", perf_more_info_string);
+		return 0;
+	}
+
+	if (!argv[0]) {
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n %s\n\n", perf_more_info_string);
+		return 0;
+	}
+
+	alias = alias_lookup(argv[0]);
+	if (alias && !is_perf_command(argv[0])) {
+		printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
+		return 0;
+	}
+
+	switch (help_format) {
+	case HELP_FORMAT_MAN:
+		show_man_page(argv[0]);
+		break;
+	case HELP_FORMAT_INFO:
+		show_info_page(argv[0]);
+		break;
+	case HELP_FORMAT_WEB:
+		show_html_page(argv[0]);
+		break;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
new file mode 100644
index 0000000..fe60e37
--- /dev/null
+++ b/tools/perf/builtin-list.c
@@ -0,0 +1,20 @@
+/*
+ * builtin-list.c
+ *
+ * Builtin list command: list all event types
+ *
+ * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+int cmd_list(int argc, const char **argv, const char *prefix)
+{
+	print_events();
+	return 0;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
new file mode 100644
index 0000000..aeab9c4
--- /dev/null
+++ b/tools/perf/builtin-record.c
@@ -0,0 +1,544 @@
+/*
+ * builtin-record.c
+ *
+ * Builtin record command: Record the profile of a workload
+ * (or a CPU, or a PID) into the perf.data output file - for
+ * later analysis via perf report.
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+#include "util/string.h"
+
+#include <unistd.h>
+#include <sched.h>
+
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static long			default_interval		= 100000;
+
+static int			nr_cpus				= 0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 128;
+static int			freq				= 0;
+static int			output;
+static const char		*output_name			= "perf.data";
+static int			group				= 0;
+static unsigned int		realtime_prio			= 0;
+static int			system_wide			= 0;
+static pid_t			target_pid			= -1;
+static int			inherit				= 1;
+static int			force				= 0;
+static int			append_file			= 0;
+
+static long			samples;
+static struct timeval		last_read;
+static struct timeval		this_read;
+
+static __u64			bytes_written;
+
+static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
+
+static int			nr_poll;
+static int			nr_cpu;
+
+struct mmap_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
+};
+
+
+struct mmap_data {
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
+};
+
+static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	if (old != head)
+		samples++;
+
+	size = head - old;
+
+	if ((old & md->mask) + size != (head & md->mask)) {
+		buf = &data[old & md->mask];
+		size = md->mask + 1 - (old & md->mask);
+		old += size;
+
+		while (size) {
+			int ret = write(output, buf, size);
+
+			if (ret < 0)
+				die("failed to write");
+
+			size -= ret;
+			buf += ret;
+
+			bytes_written += ret;
+		}
+	}
+
+	buf = &data[old & md->mask];
+	size = head - old;
+	old += size;
+
+	while (size) {
+		int ret = write(output, buf, size);
+
+		if (ret < 0)
+			die("failed to write");
+
+		size -= ret;
+		buf += ret;
+
+		bytes_written += ret;
+	}
+
+	md->prev = old;
+}
+
+static volatile int done = 0;
+
+static void sig_handler(int sig)
+{
+	done = 1;
+}
+
+static void pid_synthesize_comm_event(pid_t pid, int full)
+{
+	struct comm_event comm_ev;
+	char filename[PATH_MAX];
+	char bf[BUFSIZ];
+	int fd, ret;
+	size_t size;
+	char *field, *sep;
+	DIR *tasks;
+	struct dirent dirent, *next;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	if (read(fd, bf, sizeof(bf)) < 0) {
+		fprintf(stderr, "couldn't read %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	close(fd);
+
+	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
+	memset(&comm_ev, 0, sizeof(comm_ev));
+	field = strchr(bf, '(');
+	if (field == NULL)
+		goto out_failure;
+	sep = strchr(++field, ')');
+	if (sep == NULL)
+		goto out_failure;
+	size = sep - field;
+	memcpy(comm_ev.comm, field, size++);
+
+	comm_ev.pid = pid;
+	comm_ev.header.type = PERF_EVENT_COMM;
+	size = ALIGN(size, sizeof(uint64_t));
+	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
+
+	if (!full) {
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		return;
+	}
+
+	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+	tasks = opendir(filename);
+	while (!readdir_r(tasks, &dirent, &next) && next) {
+		char *end;
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end)
+			continue;
+
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+	}
+	closedir(tasks);
+	return;
+
+out_failure:
+	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+		filename);
+	exit(EXIT_FAILURE);
+}
+
+static void pid_synthesize_mmap_samples(pid_t pid)
+{
+	char filename[PATH_MAX];
+	FILE *fp;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	while (1) {
+		char bf[BUFSIZ], *pbf = bf;
+		struct mmap_event mmap_ev = {
+			.header.type = PERF_EVENT_MMAP,
+		};
+		int n;
+		size_t size;
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			break;
+
+		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+		n = hex2u64(pbf, &mmap_ev.start);
+		if (n < 0)
+			continue;
+		pbf += n + 1;
+		n = hex2u64(pbf, &mmap_ev.len);
+		if (n < 0)
+			continue;
+		pbf += n + 3;
+		if (*pbf == 'x') { /* vm_exec */
+			char *execname = strrchr(bf, ' ');
+
+			if (execname == NULL || execname[1] != '/')
+				continue;
+
+			execname += 1;
+			size = strlen(execname);
+			execname[size - 1] = '\0'; /* Remove \n */
+			memcpy(mmap_ev.filename, execname, size);
+			size = ALIGN(size, sizeof(uint64_t));
+			mmap_ev.len -= mmap_ev.start;
+			mmap_ev.header.size = (sizeof(mmap_ev) -
+					       (sizeof(mmap_ev.filename) - size));
+			mmap_ev.pid = pid;
+			mmap_ev.tid = pid;
+
+			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+		}
+	}
+
+	fclose(fp);
+}
+
+static void synthesize_samples(void)
+{
+	DIR *proc;
+	struct dirent dirent, *next;
+
+	proc = opendir("/proc");
+
+	while (!readdir_r(proc, &dirent, &next) && next) {
+		char *end;
+		pid_t pid;
+
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end) /* only interested in proper numerical dirents */
+			continue;
+
+		pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_samples(pid);
+	}
+
+	closedir(proc);
+}
+
+static int group_fd;
+
+static void create_counter(int counter, int cpu, pid_t pid)
+{
+	struct perf_counter_attr *attr = attrs + counter;
+	int track = 1;
+
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
+	if (freq) {
+		attr->freq		= 1;
+		attr->sample_freq	= freq;
+	}
+	attr->mmap		= track;
+	attr->comm		= track;
+	attr->inherit		= (cpu < 0) && inherit;
+
+	track = 0; /* only the first counter needs these */
+
+	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
+
+	if (fd[nr_cpu][counter] < 0) {
+		int err = errno;
+
+		error("syscall returned with %d (%s)\n",
+				fd[nr_cpu][counter], strerror(err));
+		if (err == EPERM)
+			printf("Are you root?\n");
+		exit(-1);
+	}
+	assert(fd[nr_cpu][counter] >= 0);
+	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[nr_cpu][counter];
+
+	event_array[nr_poll].fd = fd[nr_cpu][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[nr_cpu][counter].counter = counter;
+	mmap_array[nr_cpu][counter].prev = 0;
+	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+		exit(-1);
+	}
+}
+
+static void open_counters(int cpu, pid_t pid)
+{
+	int counter;
+
+	if (pid > 0) {
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_samples(pid);
+	}
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter, cpu, pid);
+
+	nr_cpu++;
+}
+
+static int __cmd_record(int argc, const char **argv)
+{
+	int i, counter;
+	struct stat st;
+	pid_t pid;
+	int flags;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (!stat(output_name, &st) && !force && !append_file) {
+		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
+				output_name);
+		exit(-1);
+	}
+
+	flags = O_CREAT|O_RDWR;
+	if (append_file)
+		flags |= O_APPEND;
+	else
+		flags |= O_TRUNC;
+
+	output = open(output_name, flags, S_IRUSR|S_IWUSR);
+	if (output < 0) {
+		perror("failed to create output file");
+		exit(-1);
+	}
+
+	if (!system_wide) {
+		open_counters(-1, target_pid != -1 ? target_pid : getpid());
+	} else for (i = 0; i < nr_cpus; i++)
+		open_counters(i, target_pid);
+
+	signal(SIGCHLD, sig_handler);
+	signal(SIGINT, sig_handler);
+
+	if (target_pid == -1 && argc) {
+		pid = fork();
+		if (pid < 0)
+			perror("failed to fork");
+
+		if (!pid) {
+			if (execvp(argv[0], (char **)argv)) {
+				perror(argv[0]);
+				exit(-1);
+			}
+		}
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	if (system_wide)
+		synthesize_samples();
+
+	while (!done) {
+		int hits = samples;
+
+		for (i = 0; i < nr_cpu; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == samples)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	/*
+	 * Approximate RIP event size: 24 bytes.
+	 */
+	fprintf(stderr,
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
+		(double)bytes_written / 1024.0 / 1024.0,
+		output_name,
+		bytes_written / 24);
+
+	return 0;
+}
+
+static const char * const record_usage[] = {
+	"perf record [<options>] [<command>]",
+	"perf record [<options>] -- <command> [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "record events on existing pid"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('A', "append", &append_file,
+			    "append to the output file to do incremental profiling"),
+	OPT_BOOLEAN('f', "force", &force,
+			"overwrite existing data file"),
+	OPT_LONG('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_END()
+};
+
+int cmd_record(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	argc = parse_options(argc, argv, options, record_usage, 0);
+	if (!argc && target_pid == -1 && !system_wide)
+		usage_with_options(record_usage, options);
+
+	if (!nr_counters)
+		nr_counters = 1;
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (attrs[counter].sample_period)
+			continue;
+
+		attrs[counter].sample_period = default_interval;
+	}
+
+	return __cmd_record(argc, argv);
+}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
new file mode 100644
index 0000000..242e09f
--- /dev/null
+++ b/tools/perf/builtin-report.c
@@ -0,0 +1,1291 @@
+/*
+ * builtin-report.c
+ *
+ * Builtin report command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+static int		full_paths;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
+{
+	int n = 0;
+
+	while (pathname[n] == cwd[n] && n < cwdlen)
+		++n;
+
+	return n;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	struct dso	 *dso;
+};
+
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+
+		if (cwd) {
+			int n = strcommon(filename);
+
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		if (percent < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		ret = color_fprintf(fp, color, "   %6.2f%%",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
+		ret += se->print(fp, self);
+	}
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count++;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static size_t output__fprintf(FILE *fp, uint64_t total_samples)
+{
+	struct hist_entry *pos;
+	struct sort_entry *se;
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	fprintf(fp, "\n");
+	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, "  %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int i;
+
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
+	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
+
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
+	}
+
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
+		fprintf(fp, "#\n");
+	}
+	fprintf(fp, "\n");
+
+	return ret;
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total++;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_report(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+	output__fprintf(stdout, total);
+
+	return rc;
+}
+
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(report_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_report(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, report_usage, 0);
+
+	setup_sorting();
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
+	setup_pager();
+
+	return __cmd_report();
+}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
new file mode 100644
index 0000000..2cbf5a1
--- /dev/null
+++ b/tools/perf/builtin-stat.c
@@ -0,0 +1,339 @@
+/*
+ * builtin-stat.c
+ *
+ * Builtin stat command: Give a precise performance counters summary
+ * overview about any workload, CPU or specific PID.
+ *
+ * Sample output:
+
+   $ perf stat ~/hackbench 10
+   Time: 0.104
+
+    Performance counter stats for '/home/mingo/hackbench':
+
+       1255.538611  task clock ticks     #      10.143 CPU utilization factor
+             54011  context switches     #       0.043 M/sec
+               385  CPU migrations       #       0.000 M/sec
+             17755  pagefaults           #       0.014 M/sec
+        3808323185  CPU cycles           #    3033.219 M/sec
+        1575111190  instructions         #    1254.530 M/sec
+          17367895  cache references     #      13.833 M/sec
+           7674421  cache misses         #       6.112 M/sec
+
+    Wall-clock time elapsed:   123.786620 msecs
+
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "perf.h"
+#include "builtin.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <sys/prctl.h>
+
+static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
+
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
+};
+
+static int			system_wide			=  0;
+static int			inherit				=  1;
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			target_pid			= -1;
+static int			nr_cpus				=  0;
+static unsigned int		page_size;
+
+static int			scale				=  1;
+
+static const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static __u64			event_res[MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_COUNTERS];
+
+static __u64			runtime_nsecs;
+static __u64			walltime_nsecs;
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_attr *attr = attrs + counter;
+
+	if (scale)
+		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+				    PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+		}
+	} else {
+		attr->inherit	= inherit;
+		attr->disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+/*
+ * Does the counter have nsecs as a unit?
+ */
+static inline int nsec_counter(int counter)
+{
+	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
+		return 0;
+
+	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
+		return 1;
+
+	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read out the results of a single counter:
+ */
+static void read_counter(int counter)
+{
+	__u64 *count, single_count[3];
+	ssize_t res;
+	int cpu, nv;
+	int scaled;
+
+	count = event_res[counter];
+
+	count[0] = count[1] = count[2] = 0;
+
+	nv = scale ? 3 : 1;
+	for (cpu = 0; cpu < nr_cpus; cpu ++) {
+		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
+		assert(res == nv * sizeof(__u64));
+
+		count[0] += single_count[0];
+		if (scale) {
+			count[1] += single_count[1];
+			count[2] += single_count[2];
+		}
+	}
+
+	scaled = 0;
+	if (scale) {
+		if (count[2] == 0) {
+			event_scaled[counter] = -1;
+			count[0] = 0;
+			return;
+		}
+
+		if (count[2] < count[1]) {
+			event_scaled[counter] = 1;
+			count[0] = (unsigned long long)
+				((double)count[0] * count[1] / count[2] + 0.5);
+		}
+	}
+	/*
+	 * Save the full runtime - to allow normalization during printout:
+	 */
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+		runtime_nsecs = count[0];
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 *count;
+	int scaled;
+
+	count = event_res[counter];
+	scaled = event_scaled[counter];
+
+	if (scaled == -1) {
+		fprintf(stderr, " %14s  %-20s\n",
+			"<not counted>", event_name(counter));
+		return;
+	}
+
+	if (nsec_counter(counter)) {
+		double msecs = (double)count[0] / 1000000;
+
+		fprintf(stderr, " %14.6f  %-20s",
+			msecs, event_name(counter));
+		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
+
+			fprintf(stderr, " # %11.3f CPU utilization factor",
+				(double)count[0] / (double)walltime_nsecs);
+		}
+	} else {
+		fprintf(stderr, " %14Ld  %-20s",
+			count[0], event_name(counter));
+		if (runtime_nsecs)
+			fprintf(stderr, " # %11.3f M/sec",
+				(double)count[0]/runtime_nsecs*1000.0);
+	}
+	if (scaled)
+		fprintf(stderr, "  (scaled from %.2f%%)",
+			(double) count[2] / count[1] * 100);
+	fprintf(stderr, "\n");
+}
+
+static int do_perfstat(int argc, const char **argv)
+{
+	unsigned long long t0, t1;
+	int counter;
+	int status;
+	int pid;
+	int i;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+
+	if (!pid) {
+		if (execvp(argv[0], (char **)argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+
+	while (wait(&status) >= 0)
+		;
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	walltime_nsecs = t1 - t0;
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
+
+	for (i = 1; i < argc; i++)
+		fprintf(stderr, " %s", argv[i]);
+
+	fprintf(stderr, "\':\n");
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
+	for (counter = 0; counter < nr_counters; counter++)
+		print_counter(counter);
+
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+static void skip_signal(int signo)
+{
+}
+
+static const char * const stat_usage[] = {
+	"perf stat [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "stat events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('S', "scale", &scale,
+			    "scale/normalize counters"),
+	OPT_END()
+};
+
+int cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	memcpy(attrs, default_attrs, sizeof(attrs));
+
+	argc = parse_options(argc, argv, options, stat_usage, 0);
+	if (!argc)
+		usage_with_options(stat_usage, options);
+
+	if (!nr_counters)
+		nr_counters = 8;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	/*
+	 * We dont want to block the signals - that would cause
+	 * child tasks to inherit that and Ctrl-C would not work.
+	 * What we want is for Ctrl-C to work in the exec()-ed
+	 * task, but being ignored by perf stat itself:
+	 */
+	signal(SIGINT,  skip_signal);
+	signal(SIGALRM, skip_signal);
+	signal(SIGABRT, skip_signal);
+
+	return do_perfstat(argc, argv);
+}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
new file mode 100644
index 0000000..f2e7312
--- /dev/null
+++ b/tools/perf/builtin-top.c
@@ -0,0 +1,692 @@
+/*
+ * builtin-top.c
+ *
+ * Builtin top command: Display a continuously updated profile of
+ * any workload, CPU or specific PID.
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/symbol.h"
+#include "util/color.h"
+#include "util/util.h"
+#include "util/rbtree.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <assert.h>
+#include <fcntl.h>
+
+#include <stdio.h>
+
+#include <errno.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			system_wide			=  0;
+
+static int			default_interval		= 100000;
+
+static __u64			count_filter			=  5;
+static int			print_entries			= 15;
+
+static int			target_pid			= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static unsigned int		realtime_prio			=  0;
+static int			group				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 16;
+static int			freq				=  0;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+/*
+ * Symbols
+ */
+
+static uint64_t			min_ip;
+static uint64_t			max_ip = -1ll;
+
+struct sym_entry {
+	struct rb_node		rb_node;
+	struct list_head	node;
+	unsigned long		count[MAX_COUNTERS];
+	unsigned long		snap_count;
+	double			weight;
+	int			skip;
+};
+
+struct sym_entry		*sym_filter_entry;
+
+struct dso			*kernel_dso;
+
+/*
+ * Symbols will be added here in record_ip and will get out
+ * after decayed.
+ */
+static LIST_HEAD(active_symbols);
+static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight = sym->snap_count;
+	int counter;
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static long			samples;
+static long			userspace_samples;
+static const char		CONSOLE_CLEAR[] = "[H[2J";
+
+static void __list_insert_active_sym(struct sym_entry *syme)
+{
+	list_add(&syme->node, &active_symbols);
+}
+
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+	pthread_mutex_lock(&active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&active_symbols_lock);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (se->weight > iter->weight)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
+
+static void print_sym_table(void)
+{
+	int printed = 0, j;
+	int counter;
+	float samples_per_sec = samples/delay_secs;
+	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float sum_ksamples = 0.0;
+	struct sym_entry *syme, *n;
+	struct rb_root tmp = RB_ROOT;
+	struct rb_node *nd;
+
+	samples = userspace_samples = 0;
+
+	/* Sort the active symbols */
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		syme->snap_count = syme->count[0];
+		if (syme->snap_count != 0) {
+			syme->weight = sym_weight(syme);
+			rb_insert_active_sym(&tmp, syme);
+			sum_ksamples += syme->snap_count;
+
+			for (j = 0; j < nr_counters; j++)
+				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
+		} else
+			list_remove_active_sym(syme);
+	}
+
+	puts(CONSOLE_CLEAR);
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+		samples_per_sec,
+		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
+
+	if (nr_counters == 1) {
+		printf("%Ld", attrs[0].sample_period);
+		if (freq)
+			printf("Hz ");
+		else
+			printf(" ");
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		printf("%s", event_name(counter));
+	}
+
+	printf( "], ");
+
+	if (target_pid != -1)
+		printf(" (target_pid: %d", target_pid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (target_pid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             samples    pcnt");
+	else
+		printf("  weight     samples    pcnt");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     _______   _____   ________________   _______________\n\n"
+	);
+
+	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+		struct symbol *sym = (struct symbol *)(syme + 1);
+		char *color = PERF_COLOR_NORMAL;
+		double pcnt;
+
+		if (++printed > print_entries || syme->snap_count < count_filter)
+			continue;
+
+		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
+					 sum_ksamples));
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (pcnt >= 5.0)
+			color = PERF_COLOR_RED;
+		if (pcnt < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		if (nr_counters == 1)
+			printf("%20.2f - ", syme->weight);
+		else
+			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
+
+		color_fprintf(stdout, color, "%4.1f%%", pcnt);
+		printf(" - %016llx : %s\n", sym->start, sym->name);
+	}
+}
+
+static void *display_thread(void *arg)
+{
+	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+	int delay_msecs = delay_secs * 1000;
+
+	printf("PerfTop refresh period: %d seconds\n", delay_secs);
+
+	do {
+		print_sym_table();
+	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
+
+	printf("key pressed - exiting.\n");
+	exit(0);
+
+	return NULL;
+}
+
+static int symbol_filter(struct dso *self, struct symbol *sym)
+{
+	static int filter_match;
+	struct sym_entry *syme;
+	const char *name = sym->name;
+
+	if (!strcmp(name, "_text") ||
+	    !strcmp(name, "_etext") ||
+	    !strcmp(name, "_sinittext") ||
+	    !strncmp("init_module", name, 11) ||
+	    !strncmp("cleanup_module", name, 14) ||
+	    strstr(name, "_text_start") ||
+	    strstr(name, "_text_end"))
+		return 1;
+
+	syme = dso__sym_priv(self, sym);
+	/* Tag samples to be skipped. */
+	if (!strcmp("default_idle", name) ||
+	    !strcmp("cpu_idle", name) ||
+	    !strcmp("enter_idle", name) ||
+	    !strcmp("exit_idle", name) ||
+	    !strcmp("mwait_idle", name))
+		syme->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = sym->start;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			fprintf(stderr,
+				"hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			fprintf(stderr, "symbol filter start: %016lx\n",
+				filter_start);
+			fprintf(stderr, "                end: %016lx\n",
+				filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+
+	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
+		filter_match = 1;
+		filter_start = sym->start;
+	}
+
+
+	return 0;
+}
+
+static int parse_symbols(void)
+{
+	struct rb_node *node;
+	struct symbol  *sym;
+
+	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
+	if (kernel_dso == NULL)
+		return -1;
+
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
+		goto out_delete_dso;
+
+	node = rb_first(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	min_ip = sym->start;
+
+	node = rb_last(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	max_ip = sym->end;
+
+	if (dump_symtab)
+		dso__fprintf(kernel_dso, stderr);
+
+	return 0;
+
+out_delete_dso:
+	dso__delete(kernel_dso);
+	kernel_dso = NULL;
+	return -1;
+}
+
+#define TRACE_COUNT     3
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(uint64_t ip, int counter)
+{
+	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
+
+	if (sym != NULL) {
+		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
+
+		if (!syme->skip) {
+			syme->count[counter]++;
+			pthread_mutex_lock(&active_symbols_lock);
+			if (list_empty(&syme->node) || !syme->node.next)
+				__list_insert_active_sym(syme);
+			pthread_mutex_unlock(&active_symbols_lock);
+			return;
+		}
+	}
+
+	samples--;
+}
+
+static void process_event(uint64_t ip, int counter)
+{
+	samples++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_samples++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+struct mmap_data {
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	for (; old != head;) {
+		struct ip_event {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, target_pid;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, target_pid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
+
+		size_t size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
+
+		old += size;
+
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_SAMPLE_IP)
+				process_event(event->ip.ip, md->counter);
+		}
+	}
+
+	md->prev = old;
+}
+
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int __cmd_top(void)
+{
+	struct perf_counter_attr *attr;
+	pthread_t thread;
+	int i, counter, group_fd, nr_poll = 0;
+	unsigned int cpu;
+	int ret;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			cpu	= profile_cpu;
+			if (target_pid == -1 && profile_cpu == -1)
+				cpu = i;
+
+			attr = attrs + counter;
+
+			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+			attr->freq		= freq;
+
+			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+
+				error("syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED)
+				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+		}
+	}
+
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	while (1) {
+		int hits = samples;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == samples)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
+
+static const char * const top_usage[] = {
+	"perf top [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "profile events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_INTEGER('C', "CPU", &profile_cpu,
+		    "CPU to profile on"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_INTEGER('d', "delay", &delay_secs,
+		    "number of seconds to delay between refreshes"),
+	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+			    "dump the symbol table used for profiling"),
+	OPT_INTEGER('f', "count-filter", &count_filter,
+		    "only display functions with more events than this"),
+	OPT_BOOLEAN('g', "group", &group,
+			    "put the counters into a counter group"),
+	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+		    "only display symbols matchig this pattern"),
+	OPT_BOOLEAN('z', "zero", &group,
+		    "zero history across updates"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
+	OPT_INTEGER('E', "entries", &print_entries,
+		    "display this many functions"),
+	OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	argc = parse_options(argc, argv, options, top_usage, 0);
+	if (argc)
+		usage_with_options(top_usage, options);
+
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
+
+	/* CPU and PID are mutually exclusive */
+	if (target_pid != -1 && profile_cpu != -1) {
+		printf("WARNING: PID switch overriding CPU\n");
+		sleep(1);
+		profile_cpu = -1;
+	}
+
+	if (!nr_counters)
+		nr_counters = 1;
+
+	if (delay_secs < 1)
+		delay_secs = 1;
+
+	parse_symbols();
+
+	/*
+	 * Fill in the ones not specifically initialized via -c:
+	 */
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (attrs[counter].sample_period)
+			continue;
+
+		attrs[counter].sample_period = default_interval;
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (target_pid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	return __cmd_top();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
new file mode 100644
index 0000000..51d1682
--- /dev/null
+++ b/tools/perf/builtin.h
@@ -0,0 +1,26 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util/util.h"
+#include "util/strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
+extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
new file mode 100644
index 0000000..eebce30
--- /dev/null
+++ b/tools/perf/command-list.txt
@@ -0,0 +1,10 @@
+#
+# List of known perf commands.
+# command name			category [deprecated] [common]
+#
+perf-annotate			mainporcelain common
+perf-list			mainporcelain common
+perf-record			mainporcelain common
+perf-report			mainporcelain common
+perf-stat			mainporcelain common
+perf-top			mainporcelain common
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
new file mode 100644
index 0000000..d325076
--- /dev/null
+++ b/tools/perf/design.txt
@@ -0,0 +1,442 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events: such
+as instructions executed, cachemisses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events have passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.  It
+provides "virtual" 64-bit counters, regardless of the width of the
+underlying hardware counters.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+			     pid_t pid, int cpu, int group_fd,
+			     unsigned long flags);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+struct perf_counter_hw_event {
+        /*
+         * The MSB of the config word signifies if the rest contains cpu
+         * specific (raw) counter configuration data, if unset, the next
+         * 7 bits are an event type and the rest of the bits are the event
+         * identifier.
+         */
+        __u64                   config;
+
+        __u64                   irq_period;
+        __u32                   record_type;
+        __u32                   read_format;
+
+        __u64                   disabled       :  1, /* off by default        */
+                                inherit        :  1, /* children inherit it   */
+                                pinned         :  1, /* must always be on PMU */
+                                exclusive      :  1, /* only group on PMU     */
+                                exclude_user   :  1, /* don't count user      */
+                                exclude_kernel :  1, /* ditto kernel          */
+                                exclude_hv     :  1, /* ditto hypervisor      */
+                                exclude_idle   :  1, /* don't count when idle */
+                                mmap           :  1, /* include mmap data     */
+                                munmap         :  1, /* include munmap data   */
+                                comm           :  1, /* include comm data     */
+
+                                __reserved_1   : 52;
+
+        __u32                   extra_config_len;
+        __u32                   wakeup_events;  /* wakeup every n events */
+
+        __u64                   __reserved_2;
+        __u64                   __reserved_3;
+};
+
+The 'config' field specifies what the counter should count.  It
+is divided into 3 bit-fields:
+
+raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
+type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
+
+If 'raw_type' is 1, then the counter will count a hardware event
+specified by the remaining 63 bits of event_config.  The encoding is
+machine-specific.
+
+If 'raw_type' is 0, then the 'type' field says what kind of counter
+this is, with the following encoding:
+
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+};
+
+A counter of PERF_TYPE_HARDWARE will count the hardware event
+specified by 'event_id':
+
+/*
+ * Generalized performance counter event types, used by the hw_event.event_id
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_ids {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		= 0,
+	PERF_COUNT_INSTRUCTIONS		= 1,
+	PERF_COUNT_CACHE_REFERENCES	= 2,
+	PERF_COUNT_CACHE_MISSES		= 3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_BRANCH_MISSES	= 5,
+	PERF_COUNT_BUS_CYCLES		= 6,
+};
+
+These are standardized types of events that work relatively uniformly
+on all CPUs that implement Performance Counters support under Linux,
+although there may be variations (e.g., different CPUs might count
+cache references and misses at different levels of the cache hierarchy).
+If a CPU is not able to count the selected event, then the system call
+will return -EINVAL.
+
+More hw_event_types are supported as well, but they are CPU-specific
+and accessed as raw events.  For example, to count "External bus
+cycles while bus lock signal asserted" events on Intel Core CPUs, pass
+in a 0x4064 event_id value and set hw_event.raw_type to 1.
+
+A counter of type PERF_TYPE_SOFTWARE will count one of the available
+software events, selected by 'event_id':
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum sw_event_ids {
+	PERF_COUNT_CPU_CLOCK		= 0,
+	PERF_COUNT_TASK_CLOCK		= 1,
+	PERF_COUNT_PAGE_FAULTS		= 2,
+	PERF_COUNT_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
+};
+
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
+Counters come in two flavours: counting counters and sampling
+counters.  A "counting" counter is one that is used for counting the
+number of events that occur, and is characterised by having
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possible
+additional values as specified by 'read_format', each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
+        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
+
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
+
+/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
+};
+
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendents, not to any existing descendents at the
+time the counter is created (nor to any new descendents of existing
+descendents).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In future, this will allow sophisticated monitoring programs to supply
+extra configuration information via 'extra_config_len' to exploit
+advanced features of the CPU's Performance Monitor Unit (PMU) that are
+not otherwise accessible and that might disrupt other hardware
+counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations, these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone,
+these events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
+
+The 'pid' parameter to the perf_counter_open() system call allows the
+counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
+
+A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets schedule to. Per task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
+all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+
+Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap
+tracking are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier()
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reason this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * User-space reading this value should issue an rmb(), on SMP capable
+         * platforms, after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+      on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter - disabling an
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+Additionally, non-inherited overflow counters can use
+
+	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
+
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
new file mode 100644
index 0000000..4eb7259
--- /dev/null
+++ b/tools/perf/perf.c
@@ -0,0 +1,428 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
+#include "builtin.h"
+
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
+
+const char perf_usage_string[] =
+	"perf [--version] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+	"See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+	const char *cmd;
+	int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+	struct pager_config *c = data;
+	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+		c->val = perf_config_bool(var, value);
+	return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+	struct pager_config c;
+	c.cmd = cmd;
+	c.val = -1;
+	perf_config(pager_command_config, &c);
+	return c.val;
+}
+
+static void commit_pager_choice(void) {
+	switch (use_pager) {
+	case 0:
+		setenv("PERF_PAGER", "cat", 1);
+		break;
+	case 1:
+		/* setup_pager(); */
+		break;
+	default:
+		break;
+	}
+}
+
+static int handle_options(const char*** argv, int* argc, int* envchanged)
+{
+	int handled = 0;
+
+	while (*argc > 0) {
+		const char *cmd = (*argv)[0];
+		if (cmd[0] != '-')
+			break;
+
+		/*
+		 * For legacy reasons, the "version" and "help"
+		 * commands can be written with "--" prepended
+		 * to make them look like flags.
+		 */
+		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+			break;
+
+		/*
+		 * Check remaining flags.
+		 */
+		if (!prefixcmp(cmd, "--exec-path")) {
+			cmd += 11;
+			if (*cmd == '=')
+				perf_set_argv_exec_path(cmd + 1);
+			else {
+				puts(perf_exec_path());
+				exit(0);
+			}
+		} else if (!strcmp(cmd, "--html-path")) {
+			puts(system_path(PERF_HTML_PATH));
+			exit(0);
+		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+			use_pager = 1;
+		} else if (!strcmp(cmd, "--no-pager")) {
+			use_pager = 0;
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--perf-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --perf-dir.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+			handled++;
+		} else if (!prefixcmp(cmd, "--perf-dir=")) {
+			setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--work-tree")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --work-tree.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--work-tree=")) {
+			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", cmd);
+			usage(perf_usage_string);
+		}
+
+		(*argv)++;
+		(*argc)--;
+		handled++;
+	}
+	return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+	int envchanged = 0, ret = 0, saved_errno = errno;
+	int count, option_count;
+	const char** new_argv;
+	const char *alias_command;
+	char *alias_string;
+
+	alias_command = (*argv)[0];
+	alias_string = alias_lookup(alias_command);
+	if (alias_string) {
+		if (alias_string[0] == '!') {
+			if (*argcp > 1) {
+				struct strbuf buf;
+
+				strbuf_init(&buf, PATH_MAX);
+				strbuf_addstr(&buf, alias_string);
+				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+				free(alias_string);
+				alias_string = buf.buf;
+			}
+			ret = system(alias_string + 1);
+			if (ret >= 0 && WIFEXITED(ret) &&
+			    WEXITSTATUS(ret) != 127)
+				exit(WEXITSTATUS(ret));
+			die("Failed to run '%s' when expanding alias '%s'",
+			    alias_string + 1, alias_command);
+		}
+		count = split_cmdline(alias_string, &new_argv);
+		if (count < 0)
+			die("Bad alias.%s string", alias_command);
+		option_count = handle_options(&new_argv, &count, &envchanged);
+		if (envchanged)
+			die("alias '%s' changes environment variables\n"
+				 "You can use '!perf' in the alias to do this.",
+				 alias_command);
+		memmove(new_argv - option_count, new_argv,
+				count * sizeof(char *));
+		new_argv -= option_count;
+
+		if (count < 1)
+			die("empty alias for %s", alias_command);
+
+		if (!strcmp(alias_command, new_argv[0]))
+			die("recursive alias: %s", alias_command);
+
+		new_argv = realloc(new_argv, sizeof(char*) *
+				    (count + *argcp + 1));
+		/* insert after command name */
+		memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
+		new_argv[count+*argcp] = NULL;
+
+		*argv = new_argv;
+		*argcp += count - 1;
+
+		ret = 1;
+	}
+
+	errno = saved_errno;
+
+	return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP	(1<<0)
+#define USE_PAGER	(1<<1)
+/*
+ * require working tree to be present -- anything uses this needs
+ * RUN_SETUP for reading from the configuration file.
+ */
+#define NEED_WORK_TREE	(1<<2)
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	int option;
+};
+
+static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+{
+	int status;
+	struct stat st;
+	const char *prefix;
+
+	prefix = NULL;
+	if (p->option & RUN_SETUP)
+		prefix = NULL; /* setup_perf_directory(); */
+
+	if (use_pager == -1 && p->option & RUN_SETUP)
+		use_pager = check_pager_config(p->cmd);
+	if (use_pager == -1 && p->option & USE_PAGER)
+		use_pager = 1;
+	commit_pager_choice();
+
+	if (p->option & NEED_WORK_TREE)
+		/* setup_work_tree() */;
+
+	status = p->fn(argc, argv, prefix);
+	if (status)
+		return status & 0xff;
+
+	/* Somebody closed stdout? */
+	if (fstat(fileno(stdout), &st))
+		return 0;
+	/* Ignore write errors for pipes and sockets.. */
+	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
+		return 0;
+
+	/* Check for ENOSPC and EIO errors.. */
+	if (fflush(stdout))
+		die("write failure on standard output: %s", strerror(errno));
+	if (ferror(stdout))
+		die("unknown write failure on standard output");
+	if (fclose(stdout))
+		die("close failed on standard output: %s", strerror(errno));
+	return 0;
+}
+
+static void handle_internal_command(int argc, const char **argv)
+{
+	const char *cmd = argv[0];
+	static struct cmd_struct commands[] = {
+		{ "help", cmd_help, 0 },
+		{ "list", cmd_list, 0 },
+		{ "record", cmd_record, 0 },
+		{ "report", cmd_report, 0 },
+		{ "stat", cmd_stat, 0 },
+		{ "top", cmd_top, 0 },
+		{ "annotate", cmd_annotate, 0 },
+		{ "version", cmd_version, 0 },
+	};
+	int i;
+	static const char ext[] = STRIP_EXTENSION;
+
+	if (sizeof(ext) > 1) {
+		i = strlen(argv[0]) - strlen(ext);
+		if (i > 0 && !strcmp(argv[0] + i, ext)) {
+			char *argv0 = strdup(argv[0]);
+			argv[0] = cmd = argv0;
+			argv0[i] = '\0';
+		}
+	}
+
+	/* Turn "perf cmd --help" into "perf help cmd" */
+	if (argc > 1 && !strcmp(argv[1], "--help")) {
+		argv[1] = argv[0];
+		argv[0] = cmd = "help";
+	}
+
+	for (i = 0; i < ARRAY_SIZE(commands); i++) {
+		struct cmd_struct *p = commands+i;
+		if (strcmp(p->cmd, cmd))
+			continue;
+		exit(run_builtin(p, argc, argv));
+	}
+}
+
+static void execv_dashed_external(const char **argv)
+{
+	struct strbuf cmd = STRBUF_INIT;
+	const char *tmp;
+	int status;
+
+	strbuf_addf(&cmd, "perf-%s", argv[0]);
+
+	/*
+	 * argv[0] must be the perf command, but the argv array
+	 * belongs to the caller, and may be reused in
+	 * subsequent loop iterations. Save argv[0] and
+	 * restore it on error.
+	 */
+	tmp = argv[0];
+	argv[0] = cmd.buf;
+
+	/*
+	 * if we fail because the command is not found, it is
+	 * OK to return. Otherwise, we just pass along the status code.
+	 */
+	status = run_command_v_opt(argv, 0);
+	if (status != -ERR_RUN_COMMAND_EXEC) {
+		if (IS_RUN_COMMAND_ERR(status))
+			die("unable to run '%s'", argv[0]);
+		exit(-status);
+	}
+	errno = ENOENT; /* as if we called execvp */
+
+	argv[0] = tmp;
+
+	strbuf_release(&cmd);
+}
+
+static int run_argv(int *argcp, const char ***argv)
+{
+	int done_alias = 0;
+
+	while (1) {
+		/* See if it's an internal command */
+		handle_internal_command(*argcp, *argv);
+
+		/* .. then try the external ones */
+		execv_dashed_external(*argv);
+
+		/* It could be an alias -- this works around the insanity
+		 * of overriding "perf log" with "perf show" by having
+		 * alias.log = show
+		 */
+		if (done_alias || !handle_alias(argcp, argv))
+			break;
+		done_alias = 1;
+	}
+
+	return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+	const char *cmd;
+
+	cmd = perf_extract_argv0_path(argv[0]);
+	if (!cmd)
+		cmd = "perf-help";
+
+	/*
+	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+	 *
+	 *  - cannot take flags in between the "perf" and the "xxxx".
+	 *  - cannot execute it externally (since it would just do
+	 *    the same thing over again)
+	 *
+	 * So we just directly call the internal command handler, and
+	 * die if that one cannot handle it.
+	 */
+	if (!prefixcmp(cmd, "perf-")) {
+		cmd += 5;
+		argv[0] = cmd;
+		handle_internal_command(argc, argv);
+		die("cannot handle %s internally", cmd);
+	}
+
+	/* Look for flags.. */
+	argv++;
+	argc--;
+	handle_options(&argv, &argc, NULL);
+	commit_pager_choice();
+	if (argc > 0) {
+		if (!prefixcmp(argv[0], "--"))
+			argv[0] += 2;
+	} else {
+		/* The user didn't specify a command; give them help */
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n %s\n\n", perf_more_info_string);
+		exit(1);
+	}
+	cmd = argv[0];
+
+	/*
+	 * We use PATH to find perf commands, but we prepend some higher
+	 * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
+	 * environment, and the $(perfexecdir) from the Makefile at build
+	 * time.
+	 */
+	setup_path();
+
+	while (1) {
+		static int done_help = 0;
+		static int was_alias = 0;
+
+		was_alias = run_argv(&argc, &argv);
+		if (errno != ENOENT)
+			break;
+
+		if (was_alias) {
+			fprintf(stderr, "Expansion of alias '%s' failed; "
+				"'%s' is not a perf-command\n",
+				cmd, argv[0]);
+			exit(1);
+		}
+		if (!done_help) {
+			cmd = argv[0] = help_unknown_cmd(cmd);
+			done_help = 1;
+		} else
+			break;
+	}
+
+	fprintf(stderr, "Failed to run command '%s': %s\n",
+		cmd, strerror(errno));
+
+	return 1;
+}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
new file mode 100644
index 0000000..af0a504
--- /dev/null
+++ b/tools/perf/perf.h
@@ -0,0 +1,67 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#ifndef NSEC_PER_SEC
+# define NSEC_PER_SEC			1000000000ULL
+#endif
+
+static inline unsigned long long rdclock(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+static inline int
+sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
+		      pid_t pid, int cpu, int group_fd,
+		      unsigned long flags)
+{
+	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
+		       group_fd, flags);
+}
+
+#define MAX_COUNTERS			256
+#define MAX_NR_CPUS			256
+
+#endif
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
new file mode 100755
index 0000000..c561d15
--- /dev/null
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "PERF_VERSION = $VN"
+	echo "PERF_VERSION = $VN" >$GVF
+}
+
+
diff --git a/tools/perf/util/abspath.c b/tools/perf/util/abspath.c
new file mode 100644
index 0000000..61d33b8
--- /dev/null
+++ b/tools/perf/util/abspath.c
@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+static int is_directory(const char *path)
+{
+	struct stat st;
+	return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+	char cwd[1024] = "";
+	int buf_index = 1, len;
+
+	int depth = MAXDEPTH;
+	char *last_elem = NULL;
+	struct stat st;
+
+	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+		die ("Too long path: %.*s", 60, path);
+
+	while (depth--) {
+		if (!is_directory(buf)) {
+			char *last_slash = strrchr(buf, '/');
+			if (last_slash) {
+				*last_slash = '\0';
+				last_elem = xstrdup(last_slash + 1);
+			} else {
+				last_elem = xstrdup(buf);
+				*buf = '\0';
+			}
+		}
+
+		if (*buf) {
+			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+				die ("Could not get current working directory");
+
+			if (chdir(buf))
+				die ("Could not switch to '%s'", buf);
+		}
+		if (!getcwd(buf, PATH_MAX))
+			die ("Could not get current working directory");
+
+		if (last_elem) {
+			int len = strlen(buf);
+			if (len + strlen(last_elem) + 2 > PATH_MAX)
+				die ("Too long path name: '%s/%s'",
+						buf, last_elem);
+			buf[len] = '/';
+			strcpy(buf + len + 1, last_elem);
+			free(last_elem);
+			last_elem = NULL;
+		}
+
+		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+			len = readlink(buf, next_buf, PATH_MAX);
+			if (len < 0)
+				die ("Invalid symlink: %s", buf);
+			if (PATH_MAX <= len)
+				die("symbolic link too long: %s", buf);
+			next_buf[len] = '\0';
+			buf = next_buf;
+			buf_index = 1 - buf_index;
+			next_buf = bufs[buf_index];
+		} else
+			break;
+	}
+
+	if (*cwd && chdir(cwd))
+		die ("Could not change back to '%s'", cwd);
+
+	return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}
diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c
new file mode 100644
index 0000000..9b3dd2b
--- /dev/null
+++ b/tools/perf/util/alias.c
@@ -0,0 +1,77 @@
+#include "cache.h"
+
+static const char *alias_key;
+static char *alias_val;
+
+static int alias_lookup_cb(const char *k, const char *v, void *cb)
+{
+	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
+		if (!v)
+			return config_error_nonbool(k);
+		alias_val = strdup(v);
+		return 0;
+	}
+	return 0;
+}
+
+char *alias_lookup(const char *alias)
+{
+	alias_key = alias;
+	alias_val = NULL;
+	perf_config(alias_lookup_cb, NULL);
+	return alias_val;
+}
+
+int split_cmdline(char *cmdline, const char ***argv)
+{
+	int src, dst, count = 0, size = 16;
+	char quoted = 0;
+
+	*argv = malloc(sizeof(char*) * size);
+
+	/* split alias_string */
+	(*argv)[count++] = cmdline;
+	for (src = dst = 0; cmdline[src];) {
+		char c = cmdline[src];
+		if (!quoted && isspace(c)) {
+			cmdline[dst++] = 0;
+			while (cmdline[++src]
+					&& isspace(cmdline[src]))
+				; /* skip */
+			if (count >= size) {
+				size += 16;
+				*argv = realloc(*argv, sizeof(char*) * size);
+			}
+			(*argv)[count++] = cmdline + dst;
+		} else if (!quoted && (c == '\'' || c == '"')) {
+			quoted = c;
+			src++;
+		} else if (c == quoted) {
+			quoted = 0;
+			src++;
+		} else {
+			if (c == '\\' && quoted != '\'') {
+				src++;
+				c = cmdline[src];
+				if (!c) {
+					free(*argv);
+					*argv = NULL;
+					return error("cmdline ends with \\");
+				}
+			}
+			cmdline[dst++] = c;
+			src++;
+		}
+	}
+
+	cmdline[dst] = 0;
+
+	if (quoted) {
+		free(*argv);
+		*argv = NULL;
+		return error("unclosed quote");
+	}
+
+	return count;
+}
+
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
new file mode 100644
index 0000000..393d614
--- /dev/null
+++ b/tools/perf/util/cache.h
@@ -0,0 +1,119 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include "util.h"
+#include "strbuf.h"
+
+#define PERF_DIR_ENVIRONMENT "PERF_DIR"
+#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
+#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
+#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
+#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
+#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
+#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
+#define CONFIG_ENVIRONMENT "PERF_CONFIG"
+#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
+#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
+#define PERFATTRIBUTES_FILE ".perfattributes"
+#define INFOATTRIBUTES_FILE "info/attributes"
+#define ATTRIBUTE_MACRO_PREFIX "[attr]"
+
+typedef int (*config_fn_t)(const char *, const char *, void *);
+extern int perf_default_config(const char *, const char *, void *);
+extern int perf_config_from_file(config_fn_t fn, const char *, void *);
+extern int perf_config(config_fn_t fn, void *);
+extern int perf_parse_ulong(const char *, unsigned long *);
+extern int perf_config_int(const char *, const char *);
+extern unsigned long perf_config_ulong(const char *, const char *);
+extern int perf_config_bool_or_int(const char *, const char *, int *);
+extern int perf_config_bool(const char *, const char *);
+extern int perf_config_string(const char **, const char *, const char *);
+extern int perf_config_set(const char *, const char *);
+extern int perf_config_set_multivar(const char *, const char *, const char *, int);
+extern int perf_config_rename_section(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
+extern int check_repository_format_version(const char *var, const char *value, void *cb);
+extern int perf_config_system(void);
+extern int perf_config_global(void);
+extern int config_error_nonbool(const char *);
+extern const char *config_exclusive_filename;
+
+#define MAX_PERFNAME (1000)
+extern char perf_default_email[MAX_PERFNAME];
+extern char perf_default_name[MAX_PERFNAME];
+extern int user_ident_explicitly_given;
+
+extern const char *perf_log_output_encoding;
+extern const char *perf_mailmap_file;
+
+/* IO helper functions */
+extern void maybe_flush_or_die(FILE *, const char *);
+extern int copy_fd(int ifd, int ofd);
+extern int copy_file(const char *dst, const char *src, int mode);
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+extern ssize_t write_in_full(int fd, const void *buf, size_t count);
+extern void write_or_die(int fd, const void *buf, size_t count);
+extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
+extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
+extern void fsync_or_die(int fd, const char *);
+
+/* pager.c */
+extern void setup_pager(void);
+extern const char *pager_program;
+extern int pager_in_use(void);
+extern int pager_use_color;
+
+extern const char *editor_program;
+extern const char *excludes_file;
+
+char *alias_lookup(const char *alias);
+int split_cmdline(char *cmdline, const char ***argv);
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+
+static inline int is_absolute_path(const char *path)
+{
+	return path[0] == '/';
+}
+
+const char *make_absolute_path(const char *path);
+const char *make_nonrelative_path(const char *path);
+const char *make_relative_path(const char *abs, const char *base);
+int normalize_path_copy(char *dst, const char *src);
+int longest_ancestor_length(const char *path, const char *prefix_list);
+char *strip_path_suffix(const char *path, const char *suffix);
+
+extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+extern int perf_mkstemp(char *path, size_t len, const char *template);
+
+extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_pathdup(const char *fmt, ...)
+	__attribute__((format (printf, 1, 2)));
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+#endif /* CACHE_H */
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
new file mode 100644
index 0000000..9a8c20c
--- /dev/null
+++ b/tools/perf/util/color.c
@@ -0,0 +1,241 @@
+#include "cache.h"
+#include "color.h"
+
+int perf_use_color_default = -1;
+
+static int parse_color(const char *name, int len)
+{
+	static const char * const color_names[] = {
+		"normal", "black", "red", "green", "yellow",
+		"blue", "magenta", "cyan", "white"
+	};
+	char *end;
+	int i;
+	for (i = 0; i < ARRAY_SIZE(color_names); i++) {
+		const char *str = color_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return i - 1;
+	}
+	i = strtol(name, &end, 10);
+	if (end - name == len && i >= -1 && i <= 255)
+		return i;
+	return -2;
+}
+
+static int parse_attr(const char *name, int len)
+{
+	static const int attr_values[] = { 1, 2, 4, 5, 7 };
+	static const char * const attr_names[] = {
+		"bold", "dim", "ul", "blink", "reverse"
+	};
+	int i;
+	for (i = 0; i < ARRAY_SIZE(attr_names); i++) {
+		const char *str = attr_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return attr_values[i];
+	}
+	return -1;
+}
+
+void color_parse(const char *value, const char *var, char *dst)
+{
+	color_parse_mem(value, strlen(value), var, dst);
+}
+
+void color_parse_mem(const char *value, int value_len, const char *var,
+		char *dst)
+{
+	const char *ptr = value;
+	int len = value_len;
+	int attr = -1;
+	int fg = -2;
+	int bg = -2;
+
+	if (!strncasecmp(value, "reset", len)) {
+		strcpy(dst, PERF_COLOR_RESET);
+		return;
+	}
+
+	/* [fg [bg]] [attr] */
+	while (len > 0) {
+		const char *word = ptr;
+		int val, wordlen = 0;
+
+		while (len > 0 && !isspace(word[wordlen])) {
+			wordlen++;
+			len--;
+		}
+
+		ptr = word + wordlen;
+		while (len > 0 && isspace(*ptr)) {
+			ptr++;
+			len--;
+		}
+
+		val = parse_color(word, wordlen);
+		if (val >= -1) {
+			if (fg == -2) {
+				fg = val;
+				continue;
+			}
+			if (bg == -2) {
+				bg = val;
+				continue;
+			}
+			goto bad;
+		}
+		val = parse_attr(word, wordlen);
+		if (val < 0 || attr != -1)
+			goto bad;
+		attr = val;
+	}
+
+	if (attr >= 0 || fg >= 0 || bg >= 0) {
+		int sep = 0;
+
+		*dst++ = '\033';
+		*dst++ = '[';
+		if (attr >= 0) {
+			*dst++ = '0' + attr;
+			sep++;
+		}
+		if (fg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (fg < 8) {
+				*dst++ = '3';
+				*dst++ = '0' + fg;
+			} else {
+				dst += sprintf(dst, "38;5;%d", fg);
+			}
+		}
+		if (bg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (bg < 8) {
+				*dst++ = '4';
+				*dst++ = '0' + bg;
+			} else {
+				dst += sprintf(dst, "48;5;%d", bg);
+			}
+		}
+		*dst++ = 'm';
+	}
+	*dst = 0;
+	return;
+bad:
+	die("bad color value '%.*s' for variable '%s'", value_len, value, var);
+}
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
+{
+	if (value) {
+		if (!strcasecmp(value, "never"))
+			return 0;
+		if (!strcasecmp(value, "always"))
+			return 1;
+		if (!strcasecmp(value, "auto"))
+			goto auto_color;
+	}
+
+	/* Missing or explicit false to turn off colorization */
+	if (!perf_config_bool(var, value))
+		return 0;
+
+	/* any normal truth value defaults to 'auto' */
+ auto_color:
+	if (stdout_is_tty < 0)
+		stdout_is_tty = isatty(1);
+	if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
+		char *term = getenv("TERM");
+		if (term && strcmp(term, "dumb"))
+			return 1;
+	}
+	return 0;
+}
+
+int perf_color_default_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "color.ui")) {
+		perf_use_color_default = perf_config_colorbool(var, value, -1);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
+static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
+		va_list args, const char *trail)
+{
+	int r = 0;
+
+	/*
+	 * Auto-detect:
+	 */
+	if (perf_use_color_default < 0) {
+		if (isatty(1) || pager_in_use())
+			perf_use_color_default = 1;
+		else
+			perf_use_color_default = 0;
+	}
+
+	if (perf_use_color_default && *color)
+		r += fprintf(fp, "%s", color);
+	r += vfprintf(fp, fmt, args);
+	if (perf_use_color_default && *color)
+		r += fprintf(fp, "%s", PERF_COLOR_RESET);
+	if (trail)
+		r += fprintf(fp, "%s", trail);
+	return r;
+}
+
+
+
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, NULL);
+	va_end(args);
+	return r;
+}
+
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, "\n");
+	va_end(args);
+	return r;
+}
+
+/*
+ * This function splits the buffer by newlines and colors the lines individually.
+ *
+ * Returns 0 on success.
+ */
+int color_fwrite_lines(FILE *fp, const char *color,
+		size_t count, const char *buf)
+{
+	if (!*color)
+		return fwrite(buf, count, 1, fp) != 1;
+	while (count) {
+		char *p = memchr(buf, '\n', count);
+		if (p != buf && (fputs(color, fp) < 0 ||
+				fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
+				fputs(PERF_COLOR_RESET, fp) < 0))
+			return -1;
+		if (!p)
+			return 0;
+		if (fputc('\n', fp) < 0)
+			return -1;
+		count -= p + 1 - buf;
+		buf = p + 1;
+	}
+	return 0;
+}
+
+
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
new file mode 100644
index 0000000..5abfd37
--- /dev/null
+++ b/tools/perf/util/color.h
@@ -0,0 +1,36 @@
+#ifndef COLOR_H
+#define COLOR_H
+
+/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
+#define COLOR_MAXLEN 24
+
+#define PERF_COLOR_NORMAL	""
+#define PERF_COLOR_RESET	"\033[m"
+#define PERF_COLOR_BOLD		"\033[1m"
+#define PERF_COLOR_RED		"\033[31m"
+#define PERF_COLOR_GREEN	"\033[32m"
+#define PERF_COLOR_YELLOW	"\033[33m"
+#define PERF_COLOR_BLUE		"\033[34m"
+#define PERF_COLOR_MAGENTA	"\033[35m"
+#define PERF_COLOR_CYAN		"\033[36m"
+#define PERF_COLOR_BG_RED	"\033[41m"
+
+/*
+ * This variable stores the value of color.ui
+ */
+extern int perf_use_color_default;
+
+
+/*
+ * Use this instead of perf_default_config if you need the value of color.ui.
+ */
+int perf_color_default_config(const char *var, const char *value, void *cb);
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
+void color_parse(const char *value, const char *var, char *dst);
+void color_parse_mem(const char *value, int len, const char *var, char *dst);
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
+int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
+
+#endif /* COLOR_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
new file mode 100644
index 0000000..3dd13fa
--- /dev/null
+++ b/tools/perf/util/config.c
@@ -0,0 +1,873 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ * Copyright (C) Johannes Schindelin, 2005
+ *
+ */
+#include "util.h"
+#include "cache.h"
+#include "exec_cmd.h"
+
+#define MAXNAME (256)
+
+static FILE *config_file;
+static const char *config_file_name;
+static int config_linenr;
+static int config_file_eof;
+
+const char *config_exclusive_filename = NULL;
+
+static int get_next_char(void)
+{
+	int c;
+	FILE *f;
+
+	c = '\n';
+	if ((f = config_file) != NULL) {
+		c = fgetc(f);
+		if (c == '\r') {
+			/* DOS like systems */
+			c = fgetc(f);
+			if (c != '\n') {
+				ungetc(c, f);
+				c = '\r';
+			}
+		}
+		if (c == '\n')
+			config_linenr++;
+		if (c == EOF) {
+			config_file_eof = 1;
+			c = '\n';
+		}
+	}
+	return c;
+}
+
+static char *parse_value(void)
+{
+	static char value[1024];
+	int quote = 0, comment = 0, len = 0, space = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (len >= sizeof(value) - 1)
+			return NULL;
+		if (c == '\n') {
+			if (quote)
+				return NULL;
+			value[len] = 0;
+			return value;
+		}
+		if (comment)
+			continue;
+		if (isspace(c) && !quote) {
+			space = 1;
+			continue;
+		}
+		if (!quote) {
+			if (c == ';' || c == '#') {
+				comment = 1;
+				continue;
+			}
+		}
+		if (space) {
+			if (len)
+				value[len++] = ' ';
+			space = 0;
+		}
+		if (c == '\\') {
+			c = get_next_char();
+			switch (c) {
+			case '\n':
+				continue;
+			case 't':
+				c = '\t';
+				break;
+			case 'b':
+				c = '\b';
+				break;
+			case 'n':
+				c = '\n';
+				break;
+			/* Some characters escape as themselves */
+			case '\\': case '"':
+				break;
+			/* Reject unknown escape sequences */
+			default:
+				return NULL;
+			}
+			value[len++] = c;
+			continue;
+		}
+		if (c == '"') {
+			quote = 1-quote;
+			continue;
+		}
+		value[len++] = c;
+	}
+}
+
+static inline int iskeychar(int c)
+{
+	return isalnum(c) || c == '-';
+}
+
+static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
+{
+	int c;
+	char *value;
+
+	/* Get the full name */
+	for (;;) {
+		c = get_next_char();
+		if (config_file_eof)
+			break;
+		if (!iskeychar(c))
+			break;
+		name[len++] = tolower(c);
+		if (len >= MAXNAME)
+			return -1;
+	}
+	name[len] = 0;
+	while (c == ' ' || c == '\t')
+		c = get_next_char();
+
+	value = NULL;
+	if (c != '\n') {
+		if (c != '=')
+			return -1;
+		value = parse_value();
+		if (!value)
+			return -1;
+	}
+	return fn(name, value, data);
+}
+
+static int get_extended_base_var(char *name, int baselen, int c)
+{
+	do {
+		if (c == '\n')
+			return -1;
+		c = get_next_char();
+	} while (isspace(c));
+
+	/* We require the format to be '[base "extension"]' */
+	if (c != '"')
+		return -1;
+	name[baselen++] = '.';
+
+	for (;;) {
+		int c = get_next_char();
+		if (c == '\n')
+			return -1;
+		if (c == '"')
+			break;
+		if (c == '\\') {
+			c = get_next_char();
+			if (c == '\n')
+				return -1;
+		}
+		name[baselen++] = c;
+		if (baselen > MAXNAME / 2)
+			return -1;
+	}
+
+	/* Final ']' */
+	if (get_next_char() != ']')
+		return -1;
+	return baselen;
+}
+
+static int get_base_var(char *name)
+{
+	int baselen = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (config_file_eof)
+			return -1;
+		if (c == ']')
+			return baselen;
+		if (isspace(c))
+			return get_extended_base_var(name, baselen, c);
+		if (!iskeychar(c) && c != '.')
+			return -1;
+		if (baselen > MAXNAME / 2)
+			return -1;
+		name[baselen++] = tolower(c);
+	}
+}
+
+static int perf_parse_file(config_fn_t fn, void *data)
+{
+	int comment = 0;
+	int baselen = 0;
+	static char var[MAXNAME];
+
+	/* U+FEFF Byte Order Mark in UTF8 */
+	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
+	const unsigned char *bomptr = utf8_bom;
+
+	for (;;) {
+		int c = get_next_char();
+		if (bomptr && *bomptr) {
+			/* We are at the file beginning; skip UTF8-encoded BOM
+			 * if present. Sane editors won't put this in on their
+			 * own, but e.g. Windows Notepad will do it happily. */
+			if ((unsigned char) c == *bomptr) {
+				bomptr++;
+				continue;
+			} else {
+				/* Do not tolerate partial BOM. */
+				if (bomptr != utf8_bom)
+					break;
+				/* No BOM at file beginning. Cool. */
+				bomptr = NULL;
+			}
+		}
+		if (c == '\n') {
+			if (config_file_eof)
+				return 0;
+			comment = 0;
+			continue;
+		}
+		if (comment || isspace(c))
+			continue;
+		if (c == '#' || c == ';') {
+			comment = 1;
+			continue;
+		}
+		if (c == '[') {
+			baselen = get_base_var(var);
+			if (baselen <= 0)
+				break;
+			var[baselen++] = '.';
+			var[baselen] = 0;
+			continue;
+		}
+		if (!isalpha(c))
+			break;
+		var[baselen] = tolower(c);
+		if (get_value(fn, data, var, baselen+1) < 0)
+			break;
+	}
+	die("bad config file line %d in %s", config_linenr, config_file_name);
+}
+
+static int parse_unit_factor(const char *end, unsigned long *val)
+{
+	if (!*end)
+		return 1;
+	else if (!strcasecmp(end, "k")) {
+		*val *= 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "m")) {
+		*val *= 1024 * 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "g")) {
+		*val *= 1024 * 1024 * 1024;
+		return 1;
+	}
+	return 0;
+}
+
+static int perf_parse_long(const char *value, long *ret)
+{
+	if (value && *value) {
+		char *end;
+		long val = strtol(value, &end, 0);
+		unsigned long factor = 1;
+		if (!parse_unit_factor(end, &factor))
+			return 0;
+		*ret = val * factor;
+		return 1;
+	}
+	return 0;
+}
+
+int perf_parse_ulong(const char *value, unsigned long *ret)
+{
+	if (value && *value) {
+		char *end;
+		unsigned long val = strtoul(value, &end, 0);
+		if (!parse_unit_factor(end, &val))
+			return 0;
+		*ret = val;
+		return 1;
+	}
+	return 0;
+}
+
+static void die_bad_config(const char *name)
+{
+	if (config_file_name)
+		die("bad config value for '%s' in %s", name, config_file_name);
+	die("bad config value for '%s'", name);
+}
+
+int perf_config_int(const char *name, const char *value)
+{
+	long ret = 0;
+	if (!perf_parse_long(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+unsigned long perf_config_ulong(const char *name, const char *value)
+{
+	unsigned long ret;
+	if (!perf_parse_ulong(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
+{
+	*is_bool = 1;
+	if (!value)
+		return 1;
+	if (!*value)
+		return 0;
+	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
+		return 1;
+	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
+		return 0;
+	*is_bool = 0;
+	return perf_config_int(name, value);
+}
+
+int perf_config_bool(const char *name, const char *value)
+{
+	int discard;
+	return !!perf_config_bool_or_int(name, value, &discard);
+}
+
+int perf_config_string(const char **dest, const char *var, const char *value)
+{
+	if (!value)
+		return config_error_nonbool(var);
+	*dest = strdup(value);
+	return 0;
+}
+
+static int perf_default_core_config(const char *var, const char *value)
+{
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_default_config(const char *var, const char *value, void *dummy)
+{
+	if (!prefixcmp(var, "core."))
+		return perf_default_core_config(var, value);
+
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
+{
+	int ret;
+	FILE *f = fopen(filename, "r");
+
+	ret = -1;
+	if (f) {
+		config_file = f;
+		config_file_name = filename;
+		config_linenr = 1;
+		config_file_eof = 0;
+		ret = perf_parse_file(fn, data);
+		fclose(f);
+		config_file_name = NULL;
+	}
+	return ret;
+}
+
+const char *perf_etc_perfconfig(void)
+{
+	static const char *system_wide;
+	if (!system_wide)
+		system_wide = system_path(ETC_PERFCONFIG);
+	return system_wide;
+}
+
+static int perf_env_bool(const char *k, int def)
+{
+	const char *v = getenv(k);
+	return v ? perf_config_bool(k, v) : def;
+}
+
+int perf_config_system(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
+}
+
+int perf_config_global(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
+}
+
+int perf_config(config_fn_t fn, void *data)
+{
+	int ret = 0, found = 0;
+	char *repo_config = NULL;
+	const char *home = NULL;
+
+	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+	if (config_exclusive_filename)
+		return perf_config_from_file(fn, config_exclusive_filename, data);
+	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
+					    data);
+		found += 1;
+	}
+
+	home = getenv("HOME");
+	if (perf_config_global() && home) {
+		char *user_config = strdup(mkpath("%s/.perfconfig", home));
+		if (!access(user_config, R_OK)) {
+			ret += perf_config_from_file(fn, user_config, data);
+			found += 1;
+		}
+		free(user_config);
+	}
+
+	repo_config = perf_pathdup("config");
+	if (!access(repo_config, R_OK)) {
+		ret += perf_config_from_file(fn, repo_config, data);
+		found += 1;
+	}
+	free(repo_config);
+	if (found == 0)
+		return -1;
+	return ret;
+}
+
+/*
+ * Find all the stuff for perf_config_set() below.
+ */
+
+#define MAX_MATCHES 512
+
+static struct {
+	int baselen;
+	char* key;
+	int do_not_match;
+	regex_t* value_regex;
+	int multi_replace;
+	size_t offset[MAX_MATCHES];
+	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
+	int seen;
+} store;
+
+static int matches(const char* key, const char* value)
+{
+	return !strcmp(key, store.key) &&
+		(store.value_regex == NULL ||
+		 (store.do_not_match ^
+		  !regexec(store.value_regex, value, 0, NULL, 0)));
+}
+
+static int store_aux(const char* key, const char* value, void *cb)
+{
+	const char *ep;
+	size_t section_len;
+
+	switch (store.state) {
+	case KEY_SEEN:
+		if (matches(key, value)) {
+			if (store.seen == 1 && store.multi_replace == 0) {
+				warning("%s has multiple values", key);
+			} else if (store.seen >= MAX_MATCHES) {
+				error("too many matches for %s", key);
+				return 1;
+			}
+
+			store.offset[store.seen] = ftell(config_file);
+			store.seen++;
+		}
+		break;
+	case SECTION_SEEN:
+		/*
+		 * What we are looking for is in store.key (both
+		 * section and var), and its section part is baselen
+		 * long.  We found key (again, both section and var).
+		 * We would want to know if this key is in the same
+		 * section as what we are looking for.  We already
+		 * know we are in the same section as what should
+		 * hold store.key.
+		 */
+		ep = strrchr(key, '.');
+		section_len = ep - key;
+
+		if ((section_len != store.baselen) ||
+		    memcmp(key, store.key, section_len+1)) {
+			store.state = SECTION_END_SEEN;
+			break;
+		}
+
+		/*
+		 * Do not increment matches: this is no match, but we
+		 * just made sure we are in the desired section.
+		 */
+		store.offset[store.seen] = ftell(config_file);
+		/* fallthru */
+	case SECTION_END_SEEN:
+	case START:
+		if (matches(key, value)) {
+			store.offset[store.seen] = ftell(config_file);
+			store.state = KEY_SEEN;
+			store.seen++;
+		} else {
+			if (strrchr(key, '.') - key == store.baselen &&
+			      !strncmp(key, store.key, store.baselen)) {
+					store.state = SECTION_SEEN;
+					store.offset[store.seen] = ftell(config_file);
+			}
+		}
+	}
+	return 0;
+}
+
+static int store_write_section(int fd, const char* key)
+{
+	const char *dot;
+	int i, success;
+	struct strbuf sb = STRBUF_INIT;
+
+	dot = memchr(key, '.', store.baselen);
+	if (dot) {
+		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
+		for (i = dot - key + 1; i < store.baselen; i++) {
+			if (key[i] == '"' || key[i] == '\\')
+				strbuf_addch(&sb, '\\');
+			strbuf_addch(&sb, key[i]);
+		}
+		strbuf_addstr(&sb, "\"]\n");
+	} else {
+		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
+	}
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static int store_write_pair(int fd, const char* key, const char* value)
+{
+	int i, success;
+	int length = strlen(key + store.baselen + 1);
+	const char *quote = "";
+	struct strbuf sb = STRBUF_INIT;
+
+	/*
+	 * Check to see if the value needs to be surrounded with a dq pair.
+	 * Note that problematic characters are always backslash-quoted; this
+	 * check is about not losing leading or trailing SP and strings that
+	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
+	 * configuration parser.
+	 */
+	if (value[0] == ' ')
+		quote = "\"";
+	for (i = 0; value[i]; i++)
+		if (value[i] == ';' || value[i] == '#')
+			quote = "\"";
+	if (i && value[i - 1] == ' ')
+		quote = "\"";
+
+	strbuf_addf(&sb, "\t%.*s = %s",
+		    length, key + store.baselen + 1, quote);
+
+	for (i = 0; value[i]; i++)
+		switch (value[i]) {
+		case '\n':
+			strbuf_addstr(&sb, "\\n");
+			break;
+		case '\t':
+			strbuf_addstr(&sb, "\\t");
+			break;
+		case '"':
+		case '\\':
+			strbuf_addch(&sb, '\\');
+		default:
+			strbuf_addch(&sb, value[i]);
+			break;
+		}
+	strbuf_addf(&sb, "%s\n", quote);
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static ssize_t find_beginning_of_line(const char* contents, size_t size,
+	size_t offset_, int* found_bracket)
+{
+	size_t equal_offset = size, bracket_offset = size;
+	ssize_t offset;
+
+contline:
+	for (offset = offset_-2; offset > 0
+			&& contents[offset] != '\n'; offset--)
+		switch (contents[offset]) {
+			case '=': equal_offset = offset; break;
+			case ']': bracket_offset = offset; break;
+		}
+	if (offset > 0 && contents[offset-1] == '\\') {
+		offset_ = offset;
+		goto contline;
+	}
+	if (bracket_offset < equal_offset) {
+		*found_bracket = 1;
+		offset = bracket_offset+1;
+	} else
+		offset++;
+
+	return offset;
+}
+
+int perf_config_set(const char* key, const char* value)
+{
+	return perf_config_set_multivar(key, value, NULL, 0);
+}
+
+/*
+ * If value==NULL, unset in (remove from) config,
+ * if value_regex!=NULL, disregard key/value pairs where value does not match.
+ * if multi_replace==0, nothing, or only one matching key/value is replaced,
+ *     else all matching key/values (regardless how many) are removed,
+ *     before the new pair is written.
+ *
+ * Returns 0 on success.
+ *
+ * This function does this:
+ *
+ * - it locks the config file by creating ".perf/config.lock"
+ *
+ * - it then parses the config using store_aux() as validator to find
+ *   the position on the key/value pair to replace. If it is to be unset,
+ *   it must be found exactly once.
+ *
+ * - the config file is mmap()ed and the part before the match (if any) is
+ *   written to the lock file, then the changed part and the rest.
+ *
+ * - the config file is removed and the lock file rename()d to it.
+ *
+ */
+int perf_config_set_multivar(const char* key, const char* value,
+	const char* value_regex, int multi_replace)
+{
+	int i, dot;
+	int fd = -1, in_fd;
+	int ret = 0;
+	char* config_filename;
+	const char* last_dot = strrchr(key, '.');
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+
+	/*
+	 * Since "key" actually contains the section name and the real
+	 * key name separated by a dot, we have to know where the dot is.
+	 */
+
+	if (last_dot == NULL) {
+		error("key does not contain a section: %s", key);
+		ret = 2;
+		goto out_free;
+	}
+	store.baselen = last_dot - key;
+
+	store.multi_replace = multi_replace;
+
+	/*
+	 * Validate the key and while at it, lower case it for matching.
+	 */
+	store.key = malloc(strlen(key) + 1);
+	dot = 0;
+	for (i = 0; key[i]; i++) {
+		unsigned char c = key[i];
+		if (c == '.')
+			dot = 1;
+		/* Leave the extended basename untouched.. */
+		if (!dot || i > store.baselen) {
+			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
+				error("invalid key: %s", key);
+				free(store.key);
+				ret = 1;
+				goto out_free;
+			}
+			c = tolower(c);
+		} else if (c == '\n') {
+			error("invalid key (newline): %s", key);
+			free(store.key);
+			ret = 1;
+			goto out_free;
+		}
+		store.key[i] = c;
+	}
+	store.key[i] = 0;
+
+	/*
+	 * If .perf/config does not exist yet, write a minimal version.
+	 */
+	in_fd = open(config_filename, O_RDONLY);
+	if ( in_fd < 0 ) {
+		free(store.key);
+
+		if ( ENOENT != errno ) {
+			error("opening %s: %s", config_filename,
+			      strerror(errno));
+			ret = 3; /* same as "invalid config file" */
+			goto out_free;
+		}
+		/* if nothing to unset, error out */
+		if (value == NULL) {
+			ret = 5;
+			goto out_free;
+		}
+
+		store.key = (char*)key;
+		if (!store_write_section(fd, key) ||
+		    !store_write_pair(fd, key, value))
+			goto write_err_out;
+	} else {
+		struct stat st;
+		char* contents;
+		size_t contents_sz, copy_begin, copy_end;
+		int i, new_line = 0;
+
+		if (value_regex == NULL)
+			store.value_regex = NULL;
+		else {
+			if (value_regex[0] == '!') {
+				store.do_not_match = 1;
+				value_regex++;
+			} else
+				store.do_not_match = 0;
+
+			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
+			if (regcomp(store.value_regex, value_regex,
+					REG_EXTENDED)) {
+				error("invalid pattern: %s", value_regex);
+				free(store.value_regex);
+				ret = 6;
+				goto out_free;
+			}
+		}
+
+		store.offset[0] = 0;
+		store.state = START;
+		store.seen = 0;
+
+		/*
+		 * After this, store.offset will contain the *end* offset
+		 * of the last match, or remain at 0 if no match was found.
+		 * As a side effect, we make sure to transform only a valid
+		 * existing config file.
+		 */
+		if (perf_config_from_file(store_aux, config_filename, NULL)) {
+			error("invalid config file %s", config_filename);
+			free(store.key);
+			if (store.value_regex != NULL) {
+				regfree(store.value_regex);
+				free(store.value_regex);
+			}
+			ret = 3;
+			goto out_free;
+		}
+
+		free(store.key);
+		if (store.value_regex != NULL) {
+			regfree(store.value_regex);
+			free(store.value_regex);
+		}
+
+		/* if nothing to unset, or too many matches, error out */
+		if ((store.seen == 0 && value == NULL) ||
+				(store.seen > 1 && multi_replace == 0)) {
+			ret = 5;
+			goto out_free;
+		}
+
+		fstat(in_fd, &st);
+		contents_sz = xsize_t(st.st_size);
+		contents = mmap(NULL, contents_sz, PROT_READ,
+			MAP_PRIVATE, in_fd, 0);
+		close(in_fd);
+
+		if (store.seen == 0)
+			store.seen = 1;
+
+		for (i = 0, copy_begin = 0; i < store.seen; i++) {
+			if (store.offset[i] == 0) {
+				store.offset[i] = copy_end = contents_sz;
+			} else if (store.state != KEY_SEEN) {
+				copy_end = store.offset[i];
+			} else
+				copy_end = find_beginning_of_line(
+					contents, contents_sz,
+					store.offset[i]-2, &new_line);
+
+			if (copy_end > 0 && contents[copy_end-1] != '\n')
+				new_line = 1;
+
+			/* write the first part of the config */
+			if (copy_end > copy_begin) {
+				if (write_in_full(fd, contents + copy_begin,
+						  copy_end - copy_begin) <
+				    copy_end - copy_begin)
+					goto write_err_out;
+				if (new_line &&
+				    write_in_full(fd, "\n", 1) != 1)
+					goto write_err_out;
+			}
+			copy_begin = store.offset[i];
+		}
+
+		/* write the pair (value == NULL means unset) */
+		if (value != NULL) {
+			if (store.state == START) {
+				if (!store_write_section(fd, key))
+					goto write_err_out;
+			}
+			if (!store_write_pair(fd, key, value))
+				goto write_err_out;
+		}
+
+		/* write the rest of the config */
+		if (copy_begin < contents_sz)
+			if (write_in_full(fd, contents + copy_begin,
+					  contents_sz - copy_begin) <
+			    contents_sz - copy_begin)
+				goto write_err_out;
+
+		munmap(contents, contents_sz);
+	}
+
+	ret = 0;
+
+out_free:
+	free(config_filename);
+	return ret;
+
+write_err_out:
+	goto out_free;
+
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+	return error("Missing value for '%s'", var);
+}
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
new file mode 100644
index 0000000..b90ec00
--- /dev/null
+++ b/tools/perf/util/ctype.c
@@ -0,0 +1,26 @@
+/*
+ * Sane locale-independent, ASCII ctype.
+ *
+ * No surprises, and works with signed and unsigned chars.
+ */
+#include "cache.h"
+
+enum {
+	S = GIT_SPACE,
+	A = GIT_ALPHA,
+	D = GIT_DIGIT,
+	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
+	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
+};
+
+unsigned char sane_ctype[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
+	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
+	/* Nothing in the 128.. range */
+};
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c
new file mode 100644
index 0000000..275b0ee
--- /dev/null
+++ b/tools/perf/util/environment.c
@@ -0,0 +1,9 @@
+/*
+ * We put all the perf config variables in this same object
+ * file, so that programs can link against the config parser
+ * without having to link against all the rest of perf.
+ */
+#include "cache.h"
+
+const char *pager_program;
+int pager_use_color = 1;
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
new file mode 100644
index 0000000..d392922
--- /dev/null
+++ b/tools/perf/util/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS	32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+	static const char *prefix;
+#else
+	static const char *prefix = PREFIX;
+#endif
+	struct strbuf d = STRBUF_INIT;
+
+	if (is_absolute_path(path))
+		return path;
+
+#ifdef RUNTIME_PREFIX
+	assert(argv0_path);
+	assert(is_absolute_path(argv0_path));
+
+	if (!prefix &&
+	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+		prefix = PREFIX;
+		fprintf(stderr, "RUNTIME_PREFIX requested, "
+				"but prefix computation failed.  "
+				"Using static fallback '%s'.\n", prefix);
+	}
+#endif
+
+	strbuf_addf(&d, "%s/%s", prefix, path);
+	path = strbuf_detach(&d, NULL);
+	return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return slash + 1;
+	}
+
+	return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority, location to look for perf programs. */
+const char *perf_exec_path(void)
+{
+	const char *env;
+
+	if (argv_exec_path)
+		return argv_exec_path;
+
+	env = getenv(EXEC_PATH_ENVIRONMENT);
+	if (env && *env) {
+		return env;
+	}
+
+	return system_path(PERF_EXEC_PATH);
+}
+
+static void add_path(struct strbuf *out, const char *path)
+{
+	if (path && *path) {
+		if (is_absolute_path(path))
+			strbuf_addstr(out, path);
+		else
+			strbuf_addstr(out, make_nonrelative_path(path));
+
+		strbuf_addch(out, PATH_SEP);
+	}
+}
+
+void setup_path(void)
+{
+	const char *old_path = getenv("PATH");
+	struct strbuf new_path = STRBUF_INIT;
+
+	add_path(&new_path, perf_exec_path());
+	add_path(&new_path, argv0_path);
+
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+	else
+		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+	setenv("PATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+const char **prepare_perf_cmd(const char **argv)
+{
+	int argc;
+	const char **nargv;
+
+	for (argc = 0; argv[argc]; argc++)
+		; /* just counting */
+	nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+	nargv[0] = "perf";
+	for (argc = 0; argv[argc]; argc++)
+		nargv[argc + 1] = argv[argc];
+	nargv[argc + 1] = NULL;
+	return nargv;
+}
+
+int execv_perf_cmd(const char **argv) {
+	const char **nargv = prepare_perf_cmd(argv);
+
+	/* execvp() can only ever return if it fails */
+	execvp("perf", (char **)nargv);
+
+	free(nargv);
+	return -1;
+}
+
+
+int execl_perf_cmd(const char *cmd,...)
+{
+	int argc;
+	const char *argv[MAX_ARGS + 1];
+	const char *arg;
+	va_list param;
+
+	va_start(param, cmd);
+	argv[0] = cmd;
+	argc = 1;
+	while (argc < MAX_ARGS) {
+		arg = argv[argc++] = va_arg(param, char *);
+		if (!arg)
+			break;
+	}
+	va_end(param);
+	if (MAX_ARGS <= argc)
+		return error("too many args to run %s", cmd);
+
+	argv[argc] = NULL;
+	return execv_perf_cmd(argv);
+}
diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h
new file mode 100644
index 0000000..effe25e
--- /dev/null
+++ b/tools/perf/util/exec_cmd.h
@@ -0,0 +1,13 @@
+#ifndef PERF_EXEC_CMD_H
+#define PERF_EXEC_CMD_H
+
+extern void perf_set_argv_exec_path(const char *exec_path);
+extern const char *perf_extract_argv0_path(const char *path);
+extern const char *perf_exec_path(void);
+extern void setup_path(void);
+extern const char **prepare_perf_cmd(const char **argv);
+extern int execv_perf_cmd(const char **argv); /* NULL terminated */
+extern int execl_perf_cmd(const char *cmd, ...);
+extern const char *system_path(const char *path);
+
+#endif /* PERF_EXEC_CMD_H */
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
new file mode 100755
index 0000000..f06f6fd
--- /dev/null
+++ b/tools/perf/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/perf-'"$cmd"'/H
+     ${
+            x
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+	    p
+     }' "Documentation/perf-$cmd.txt"
+done
+echo "};"
diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c
new file mode 100644
index 0000000..6653f7d
--- /dev/null
+++ b/tools/perf/util/help.c
@@ -0,0 +1,367 @@
+#include "cache.h"
+#include "../builtin.h"
+#include "exec_cmd.h"
+#include "levenshtein.h"
+#include "help.h"
+
+/* most GUI terminals set COLUMNS (although some don't export it) */
+static int term_columns(void)
+{
+	char *col_string = getenv("COLUMNS");
+	int n_cols;
+
+	if (col_string && (n_cols = atoi(col_string)) > 0)
+		return n_cols;
+
+#ifdef TIOCGWINSZ
+	{
+		struct winsize ws;
+		if (!ioctl(1, TIOCGWINSZ, &ws)) {
+			if (ws.ws_col)
+				return ws.ws_col;
+		}
+	}
+#endif
+
+	return 80;
+}
+
+void add_cmdname(struct cmdnames *cmds, const char *name, int len)
+{
+	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
+
+	ent->len = len;
+	memcpy(ent->name, name, len);
+	ent->name[len] = 0;
+
+	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
+	cmds->names[cmds->cnt++] = ent;
+}
+
+static void clean_cmdnames(struct cmdnames *cmds)
+{
+	int i;
+	for (i = 0; i < cmds->cnt; ++i)
+		free(cmds->names[i]);
+	free(cmds->names);
+	cmds->cnt = 0;
+	cmds->alloc = 0;
+}
+
+static int cmdname_compare(const void *a_, const void *b_)
+{
+	struct cmdname *a = *(struct cmdname **)a_;
+	struct cmdname *b = *(struct cmdname **)b_;
+	return strcmp(a->name, b->name);
+}
+
+static void uniq(struct cmdnames *cmds)
+{
+	int i, j;
+
+	if (!cmds->cnt)
+		return;
+
+	for (i = j = 1; i < cmds->cnt; i++)
+		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
+			cmds->names[j++] = cmds->names[i];
+
+	cmds->cnt = j;
+}
+
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
+{
+	int ci, cj, ei;
+	int cmp;
+
+	ci = cj = ei = 0;
+	while (ci < cmds->cnt && ei < excludes->cnt) {
+		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
+		if (cmp < 0)
+			cmds->names[cj++] = cmds->names[ci++];
+		else if (cmp == 0)
+			ci++, ei++;
+		else if (cmp > 0)
+			ei++;
+	}
+
+	while (ci < cmds->cnt)
+		cmds->names[cj++] = cmds->names[ci++];
+
+	cmds->cnt = cj;
+}
+
+static void pretty_print_string_list(struct cmdnames *cmds, int longest)
+{
+	int cols = 1, rows;
+	int space = longest + 1; /* min 1 SP between words */
+	int max_cols = term_columns() - 1; /* don't print *on* the edge */
+	int i, j;
+
+	if (space < max_cols)
+		cols = max_cols / space;
+	rows = (cmds->cnt + cols - 1) / cols;
+
+	for (i = 0; i < rows; i++) {
+		printf("  ");
+
+		for (j = 0; j < cols; j++) {
+			int n = j * rows + i;
+			int size = space;
+			if (n >= cmds->cnt)
+				break;
+			if (j == cols-1 || n + rows >= cmds->cnt)
+				size = 1;
+			printf("%-*s", size, cmds->names[n]->name);
+		}
+		putchar('\n');
+	}
+}
+
+static int is_executable(const char *name)
+{
+	struct stat st;
+
+	if (stat(name, &st) || /* stat, not lstat */
+	    !S_ISREG(st.st_mode))
+		return 0;
+
+#ifdef __MINGW32__
+	/* cannot trust the executable bit, peek into the file instead */
+	char buf[3] = { 0 };
+	int n;
+	int fd = open(name, O_RDONLY);
+	st.st_mode &= ~S_IXUSR;
+	if (fd >= 0) {
+		n = read(fd, buf, 2);
+		if (n == 2)
+			/* DOS executables start with "MZ" */
+			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
+				st.st_mode |= S_IXUSR;
+		close(fd);
+	}
+#endif
+	return st.st_mode & S_IXUSR;
+}
+
+static void list_commands_in_dir(struct cmdnames *cmds,
+					 const char *path,
+					 const char *prefix)
+{
+	int prefix_len;
+	DIR *dir = opendir(path);
+	struct dirent *de;
+	struct strbuf buf = STRBUF_INIT;
+	int len;
+
+	if (!dir)
+		return;
+	if (!prefix)
+		prefix = "perf-";
+	prefix_len = strlen(prefix);
+
+	strbuf_addf(&buf, "%s/", path);
+	len = buf.len;
+
+	while ((de = readdir(dir)) != NULL) {
+		int entlen;
+
+		if (prefixcmp(de->d_name, prefix))
+			continue;
+
+		strbuf_setlen(&buf, len);
+		strbuf_addstr(&buf, de->d_name);
+		if (!is_executable(buf.buf))
+			continue;
+
+		entlen = strlen(de->d_name) - prefix_len;
+		if (has_extension(de->d_name, ".exe"))
+			entlen -= 4;
+
+		add_cmdname(cmds, de->d_name + prefix_len, entlen);
+	}
+	closedir(dir);
+	strbuf_release(&buf);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds)
+{
+	const char *env_path = getenv("PATH");
+	const char *exec_path = perf_exec_path();
+
+	if (exec_path) {
+		list_commands_in_dir(main_cmds, exec_path, prefix);
+		qsort(main_cmds->names, main_cmds->cnt,
+		      sizeof(*main_cmds->names), cmdname_compare);
+		uniq(main_cmds);
+	}
+
+	if (env_path) {
+		char *paths, *path, *colon;
+		path = paths = strdup(env_path);
+		while (1) {
+			if ((colon = strchr(path, PATH_SEP)))
+				*colon = 0;
+			if (!exec_path || strcmp(path, exec_path))
+				list_commands_in_dir(other_cmds, path, prefix);
+
+			if (!colon)
+				break;
+			path = colon + 1;
+		}
+		free(paths);
+
+		qsort(other_cmds->names, other_cmds->cnt,
+		      sizeof(*other_cmds->names), cmdname_compare);
+		uniq(other_cmds);
+	}
+	exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < main_cmds->cnt; i++)
+		if (longest < main_cmds->names[i]->len)
+			longest = main_cmds->names[i]->len;
+	for (i = 0; i < other_cmds->cnt; i++)
+		if (longest < other_cmds->names[i]->len)
+			longest = other_cmds->names[i]->len;
+
+	if (main_cmds->cnt) {
+		const char *exec_path = perf_exec_path();
+		printf("available %s in '%s'\n", title, exec_path);
+		printf("----------------");
+		mput_char('-', strlen(title) + strlen(exec_path));
+		putchar('\n');
+		pretty_print_string_list(main_cmds, longest);
+		putchar('\n');
+	}
+
+	if (other_cmds->cnt) {
+		printf("%s available from elsewhere on your $PATH\n", title);
+		printf("---------------------------------------");
+		mput_char('-', strlen(title));
+		putchar('\n');
+		pretty_print_string_list(other_cmds, longest);
+		putchar('\n');
+	}
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+	int i;
+	for (i = 0; i < c->cnt; i++)
+		if (!strcmp(s, c->names[i]->name))
+			return 1;
+	return 0;
+}
+
+static int autocorrect;
+static struct cmdnames aliases;
+
+static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.autocorrect"))
+		autocorrect = perf_config_int(var,value);
+	/* Also use aliases for command lookup */
+	if (!prefixcmp(var, "alias."))
+		add_cmdname(&aliases, var + 6, strlen(var + 6));
+
+	return perf_default_config(var, value, cb);
+}
+
+static int levenshtein_compare(const void *p1, const void *p2)
+{
+	const struct cmdname *const *c1 = p1, *const *c2 = p2;
+	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
+	int l1 = (*c1)->len;
+	int l2 = (*c2)->len;
+	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+	int i;
+	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+	for (i = 0; i < old->cnt; i++)
+		cmds->names[cmds->cnt++] = old->names[i];
+	free(old->names);
+	old->cnt = 0;
+	old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+	int i, n = 0, best_similarity = 0;
+	struct cmdnames main_cmds, other_cmds;
+
+	memset(&main_cmds, 0, sizeof(main_cmds));
+	memset(&other_cmds, 0, sizeof(main_cmds));
+	memset(&aliases, 0, sizeof(aliases));
+
+	perf_config(perf_unknown_cmd_config, NULL);
+
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	add_cmd_list(&main_cmds, &aliases);
+	add_cmd_list(&main_cmds, &other_cmds);
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(main_cmds.names), cmdname_compare);
+	uniq(&main_cmds);
+
+	if (main_cmds.cnt) {
+		/* This reuses cmdname->len for similarity index */
+		for (i = 0; i < main_cmds.cnt; ++i)
+			main_cmds.names[i]->len =
+				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+		qsort(main_cmds.names, main_cmds.cnt,
+		      sizeof(*main_cmds.names), levenshtein_compare);
+
+		best_similarity = main_cmds.names[0]->len;
+		n = 1;
+		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+			++n;
+	}
+
+	if (autocorrect && n == 1) {
+		const char *assumed = main_cmds.names[0]->name;
+
+		main_cmds.names[0] = NULL;
+		clean_cmdnames(&main_cmds);
+		fprintf(stderr, "WARNING: You called a Git program named '%s', "
+			"which does not exist.\n"
+			"Continuing under the assumption that you meant '%s'\n",
+			cmd, assumed);
+		if (autocorrect > 0) {
+			fprintf(stderr, "in %0.1f seconds automatically...\n",
+				(float)autocorrect/10.0);
+			poll(NULL, 0, autocorrect * 100);
+		}
+		return assumed;
+	}
+
+	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+	if (main_cmds.cnt && best_similarity < 6) {
+		fprintf(stderr, "\nDid you mean %s?\n",
+			n < 2 ? "this": "one of these");
+
+		for (i = 0; i < n; i++)
+			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+	}
+
+	exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("perf version %s\n", perf_version_string);
+	return 0;
+}
diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h
new file mode 100644
index 0000000..56bc154
--- /dev/null
+++ b/tools/perf/util/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+	int alloc;
+	int cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[FLEX_ARRAY];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c
new file mode 100644
index 0000000..e521d15
--- /dev/null
+++ b/tools/perf/util/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substition, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+		int w, int s, int a, int d)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row0 = malloc(sizeof(int) * (len2 + 1));
+	int *row1 = malloc(sizeof(int) * (len2 + 1));
+	int *row2 = malloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 0; j <= len2; j++)
+		row1[j] = j * a;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = (i + 1) * d;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+			/* swap */
+			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+					string1[i] == string2[j - 1] &&
+					row2[j + 1] > row0[j - 1] + w)
+				row2[j + 1] = row0[j - 1] + w;
+			/* deletion */
+			if (row2[j + 1] > row1[j + 1] + d)
+				row2[j + 1] = row1[j + 1] + d;
+			/* insertion */
+			if (row2[j + 1] > row2[j] + a)
+				row2[j + 1] = row2[j] + a;
+		}
+
+		dummy = row0;
+		row0 = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row0);
+	free(row1);
+	free(row2);
+
+	return i;
+}
diff --git a/tools/perf/util/levenshtein.h b/tools/perf/util/levenshtein.h
new file mode 100644
index 0000000..0173abe
--- /dev/null
+++ b/tools/perf/util/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+	int swap_penalty, int substition_penalty,
+	int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/tools/perf/util/list.h b/tools/perf/util/list.h
new file mode 100644
index 0000000..e2548e8
--- /dev/null
+++ b/tools/perf/util/list.h
@@ -0,0 +1,603 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/*
+  Copyright (C) Cast of dozens, comes from the Linux kernel
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+*/
+
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *)0x00100100)
+#define LIST_POISON2 ((void *)0x00200200)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @beging: first element in the range to delete from the list.
+ * @beging: first element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries is in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+				  struct list_head *end)
+{
+	begin->prev->next = end->next;
+	end->next->prev = begin->prev;
+}
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *next = head->next;
+	return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+				 struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	first->prev = head;
+	head->next = first;
+
+	last->next = at;
+	at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:       the list head to take the element from.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_first_entry(ptr, type, member) \
+	list_entry((ptr)->next, type, member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); \
+        	pos = pos->next)
+
+/**
+ * __list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; pos != (head); \
+        	pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_entry((head)->next, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos:	the type * to use as a start point
+ * @head:	the head of the list
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+	((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->next, typeof(*pos), member),	\
+		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
+		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+	h->next = NULL;
+	h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+	return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+	struct hlist_node *next = n->next;
+	struct hlist_node **pprev = n->pprev;
+	*pprev = next;
+	if (next)
+		next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->next = LIST_POISON1;
+	n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+	if (!hlist_unhashed(n)) {
+		__hlist_del(n);
+		INIT_HLIST_NODE(n);
+	}
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+	struct hlist_node *first = h->first;
+	n->next = first;
+	if (first)
+		first->pprev = &n->next;
+	h->first = n;
+	n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next;
+	next->pprev = &n->next;
+	*(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	next->next = n->next;
+	n->next = next;
+	next->pprev = &n->next;
+
+	if(next->next)
+		next->next->pprev  = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+	for (pos = (head)->first; pos; \
+	     pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+	     pos = n)
+
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)			 \
+	for (pos = (head)->first;					 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)		 \
+	for (pos = (pos)->next;						 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)			 \
+	for (; pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
+	for (pos = (head)->first;					 \
+	     pos && ({ n = pos->next; 1; }) && 				 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = n)
+
+#endif
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c
new file mode 100644
index 0000000..a28bcca
--- /dev/null
+++ b/tools/perf/util/pager.c
@@ -0,0 +1,99 @@
+#include "cache.h"
+#include "run-command.h"
+#include "sigchain.h"
+
+/*
+ * This is split up from the rest of git so that we can do
+ * something different on Windows.
+ */
+
+static int spawned_pager;
+
+#ifndef __MINGW32__
+static void pager_preexec(void)
+{
+	/*
+	 * Work around bug in "less" by not starting it until we
+	 * have real input
+	 */
+	fd_set in;
+
+	FD_ZERO(&in);
+	FD_SET(0, &in);
+	select(1, &in, NULL, &in, NULL);
+
+	setenv("LESS", "FRSX", 0);
+}
+#endif
+
+static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
+static struct child_process pager_process;
+
+static void wait_for_pager(void)
+{
+	fflush(stdout);
+	fflush(stderr);
+	/* signal EOF to pager */
+	close(1);
+	close(2);
+	finish_command(&pager_process);
+}
+
+static void wait_for_pager_signal(int signo)
+{
+	wait_for_pager();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
+void setup_pager(void)
+{
+	const char *pager = getenv("PERF_PAGER");
+
+	if (!isatty(1))
+		return;
+	if (!pager) {
+		if (!pager_program)
+			perf_config(perf_default_config, NULL);
+		pager = pager_program;
+	}
+	if (!pager)
+		pager = getenv("PAGER");
+	if (!pager)
+		pager = "less";
+	else if (!*pager || !strcmp(pager, "cat"))
+		return;
+
+	spawned_pager = 1; /* means we are emitting to terminal */
+
+	/* spawn the pager */
+	pager_argv[2] = pager;
+	pager_process.argv = pager_argv;
+	pager_process.in = -1;
+#ifndef __MINGW32__
+	pager_process.preexec_cb = pager_preexec;
+#endif
+	if (start_command(&pager_process))
+		return;
+
+	/* original process continues, but writes to the pipe */
+	dup2(pager_process.in, 1);
+	if (isatty(2))
+		dup2(pager_process.in, 2);
+	close(pager_process.in);
+
+	/* this makes sure that the parent terminates after the pager */
+	sigchain_push_common(wait_for_pager_signal);
+	atexit(wait_for_pager);
+}
+
+int pager_in_use(void)
+{
+	const char *env;
+
+	if (spawned_pager)
+		return 1;
+
+	env = getenv("PERF_PAGER_IN_USE");
+	return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
+}
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
new file mode 100644
index 0000000..e0820b4
--- /dev/null
+++ b/tools/perf/util/parse-events.c
@@ -0,0 +1,316 @@
+
+#include "../perf.h"
+#include "util.h"
+#include "parse-options.h"
+#include "parse-events.h"
+#include "exec_cmd.h"
+#include "string.h"
+
+extern char *strcasestr(const char *haystack, const char *needle);
+
+int					nr_counters;
+
+struct perf_counter_attr		attrs[MAX_COUNTERS];
+
+struct event_symbol {
+	__u8	type;
+	__u64	config;
+	char	*symbol;
+};
+
+#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+#define CR(x, y) .type = PERF_TYPE_##x, .config = y
+
+static struct event_symbol event_symbols[] = {
+  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
+  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
+  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
+  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
+  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
+  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
+  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
+
+  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
+  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static char *hw_event_names[] = {
+	"cycles",
+	"instructions",
+	"cache-references",
+	"cache-misses",
+	"branches",
+	"branch-misses",
+	"bus-cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu-clock-ticks",
+	"task-clock-ticks",
+	"page-faults",
+	"context-switches",
+	"CPU-migrations",
+	"minor-faults",
+	"major-faults",
+};
+
+#define MAX_ALIASES 8
+
+static char *hw_cache [][MAX_ALIASES] = {
+	{ "L1-data"		, "l1-d", "l1d", "l1"				},
+	{ "L1-instruction"	, "l1-i", "l1i"					},
+	{ "L2"			, "l2"						},
+	{ "Data-TLB"		, "dtlb", "d-tlb"				},
+	{ "Instruction-TLB"	, "itlb", "i-tlb"				},
+	{ "Branch"		, "bpu" , "btb", "bpc"				},
+};
+
+static char *hw_cache_op [][MAX_ALIASES] = {
+	{ "Load"		, "read"					},
+	{ "Store"		, "write"					},
+	{ "Prefetch"		, "speculative-read", "speculative-load"	},
+};
+
+static char *hw_cache_result [][MAX_ALIASES] = {
+	{ "Reference"		, "ops", "access"				},
+	{ "Miss"								},
+};
+
+char *event_name(int counter)
+{
+	__u64 config = attrs[counter].config;
+	int type = attrs[counter].type;
+	static char buf[32];
+
+	if (attrs[counter].type == PERF_TYPE_RAW) {
+		sprintf(buf, "raw 0x%llx", config);
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (config < PERF_HW_EVENTS_MAX)
+			return hw_event_names[config];
+		return "unknown-hardware";
+
+	case PERF_TYPE_HW_CACHE: {
+		__u8 cache_type, cache_op, cache_result;
+		static char name[100];
+
+		cache_type   = (config >>  0) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_MAX)
+			return "unknown-ext-hardware-cache-type";
+
+		cache_op     = (config >>  8) & 0xff;
+		if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
+			return "unknown-ext-hardware-cache-op";
+
+		cache_result = (config >> 16) & 0xff;
+		if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
+			return "unknown-ext-hardware-cache-result";
+
+		sprintf(name, "%s-Cache-%s-%ses",
+			hw_cache[cache_type][0],
+			hw_cache_op[cache_op][0],
+			hw_cache_result[cache_result][0]);
+
+		return name;
+	}
+
+	case PERF_TYPE_SOFTWARE:
+		if (config < PERF_SW_EVENTS_MAX)
+			return sw_event_names[config];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
+{
+	int i, j;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < MAX_ALIASES; j++) {
+			if (!names[i][j])
+				break;
+			if (strcasestr(str, names[i][j]))
+				return i;
+		}
+	}
+
+	return 0;
+}
+
+static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
+{
+	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
+
+	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
+	/*
+	 * No fallback - if we cannot get a clear cache type
+	 * then bail out:
+	 */
+	if (cache_type == -1)
+		return -EINVAL;
+
+	cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+	/*
+	 * Fall back to reads:
+	 */
+	if (cache_type == -1)
+		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
+
+	cache_result = parse_aliases(str, hw_cache_result,
+					PERF_COUNT_HW_CACHE_RESULT_MAX);
+	/*
+	 * Fall back to accesses:
+	 */
+	if (cache_result == -1)
+		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
+
+	attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
+	attr->type = PERF_TYPE_HW_CACHE;
+
+	return 0;
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+	const char *sep, *pstr;
+
+	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
+		attr->type = PERF_TYPE_RAW;
+		attr->config = config;
+
+		return 0;
+	}
+
+	pstr = str;
+	sep = strchr(pstr, ':');
+	if (sep) {
+		type = atoi(pstr);
+		pstr = sep + 1;
+		id = atoi(pstr);
+		sep = strchr(pstr, ':');
+		if (sep) {
+			pstr = sep + 1;
+			if (strchr(pstr, 'k'))
+				attr->exclude_user = 1;
+			if (strchr(pstr, 'u'))
+				attr->exclude_kernel = 1;
+		}
+		attr->type = type;
+		attr->config = id;
+
+		return 0;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol))) {
+
+			attr->type = event_symbols[i].type;
+			attr->config = event_symbols[i].config;
+
+			return 0;
+		}
+	}
+
+	return parse_generic_hw_symbols(str, attr);
+}
+
+int parse_events(const struct option *opt, const char *str, int unset)
+{
+	struct perf_counter_attr attr;
+	int ret;
+
+	memset(&attr, 0, sizeof(attr));
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	ret = parse_event_symbols(str, &attr);
+	if (ret < 0)
+		return ret;
+
+	attrs[nr_counters] = attr;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+static const char * const event_type_descriptors[] = {
+	"",
+	"Hardware event",
+	"Software event",
+	"Tracepoint event",
+	"Hardware cache event",
+};
+
+/*
+ * Print the help text for the event symbols:
+ */
+void print_events(void)
+{
+	struct event_symbol *syms = event_symbols;
+	unsigned int i, type, prev_type = -1;
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
+		type = syms->type + 1;
+		if (type > ARRAY_SIZE(event_type_descriptors))
+			type = 0;
+
+		if (type != prev_type)
+			fprintf(stderr, "\n");
+
+		fprintf(stderr, "  %-30s [%s]\n", syms->symbol,
+			event_type_descriptors[type]);
+
+		prev_type = type;
+	}
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, "  %-30s [raw hardware event descriptor]\n",
+		"rNNN");
+	fprintf(stderr, "\n");
+
+	exit(129);
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
new file mode 100644
index 0000000..e3d5529
--- /dev/null
+++ b/tools/perf/util/parse-events.h
@@ -0,0 +1,17 @@
+
+/*
+ * Parse symbolic events/counts passed in as options:
+ */
+
+extern int			nr_counters;
+
+extern struct perf_counter_attr attrs[MAX_COUNTERS];
+
+extern char *event_name(int ctr);
+
+extern int parse_events(const struct option *opt, const char *str, int unset);
+
+#define EVENTS_HELP_MAX (128*1024)
+
+extern void print_events(void);
+
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
new file mode 100644
index 0000000..b3affb1
--- /dev/null
+++ b/tools/perf/util/parse-options.c
@@ -0,0 +1,508 @@
+#include "util.h"
+#include "parse-options.h"
+#include "cache.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return error("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return error("option `no-%s' %s", opt->long_name, reason);
+	return error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		   int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		     const struct option *opt, int flags)
+{
+	const char *s, *arg = NULL;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+			/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_BIT:
+		case OPTION_SET_INT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_INT:
+		*(int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags, (const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(int *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
+	case OPTION_LONG:
+		if (unset) {
+			*(long *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(long *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(long *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+                          const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value", flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return error("Ambiguous option: %s "
+			"(could be --%s%s or --%s%s)",
+			arg,
+			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+			ambiguous_option->long_name,
+			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+			abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		error ("did you mean `--%s` (with two dashes ?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			error ("did you mean `--%s` (with two dashes ?)", arg);
+			exit(129);
+		}
+	}
+}
+
+void parse_options_start(struct parse_opt_ctx_t *ctx,
+			 int argc, const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc - 1;
+	ctx->argv = argv + 1;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int usage_with_options_internal(const char * const *,
+				       const struct option *, int);
+
+int parse_options_step(struct parse_opt_ctx_t *ctx,
+		       const struct option *options,
+		       const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt, unknown short option leave it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr, options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr, options);
+				case -2:
+					/* fake a short option thing to hide the fact that we may have
+					 * started to parse aggregated stuff
+					 *
+					 * This is leaky, too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options, 1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		  const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			error("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			error("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+int usage_with_options_internal(const char * const *usagestr,
+				const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "\n usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "    or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_INTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			else
+				pos += fprintf(stderr, " <n>");
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+			/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				else
+					pos += fprintf(stderr, " <%s>", opts->argh);
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				else
+					pos += fprintf(stderr, " ...");
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+			break;
+		}
+
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+			const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+			const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+
+int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
+			   int unset)
+{
+	int *target = opt->value;
+
+	if (unset)
+		/* --no-quiet, --no-verbose */
+		*target = 0;
+	else if (opt->short_name == 'v') {
+		if (*target >= 0)
+			(*target)++;
+		else
+			*target = 1;
+	} else {
+		if (*target <= 0)
+			(*target)--;
+		else
+			*target = -1;
+	}
+	return 0;
+}
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
new file mode 100644
index 0000000..a1039a6
--- /dev/null
+++ b/tools/perf/util/parse-options.h
@@ -0,0 +1,174 @@
+#ifndef PARSE_OPTIONS_H
+#define PARSE_OPTIONS_H
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN, /* _INCR would have been a better name */
+	OPTION_SET_INT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_LONG,
+	OPTION_CALLBACK,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
+	  parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option argments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremantal advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+			       const struct option *opts);
+
+extern void parse_options_start(struct parse_opt_ctx_t *ctx,
+				int argc, const char **argv, int flags);
+
+extern int parse_options_step(struct parse_opt_ctx_t *ctx,
+			      const struct option *options,
+			      const char * const usagestr[]);
+
+extern int parse_options_end(struct parse_opt_ctx_t *ctx);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var)  \
+	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+	  "use <n> digits to display SHA-1s", \
+	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+#endif
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c
new file mode 100644
index 0000000..a501a40
--- /dev/null
+++ b/tools/perf/util/path.c
@@ -0,0 +1,353 @@
+/*
+ * I'm tired of doing "vsnprintf()" etc just to open a
+ * file, so here's a "return static buffer with printf"
+ * interface for paths.
+ *
+ * It's obviously not thread-safe. Sue me. But it's quite
+ * useful for doing things like
+ *
+ *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
+ *
+ * which is what it's designed for.
+ */
+#include "cache.h"
+
+static char bad_path[] = "/bad-path/";
+/*
+ * Two hacks:
+ */
+
+static char *get_perf_dir(void)
+{
+	return ".";
+}
+
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
+
+
+static char *get_pathname(void)
+{
+	static char pathname_array[4][PATH_MAX];
+	static int index;
+	return pathname_array[3 & ++index];
+}
+
+static char *cleanup_path(char *path)
+{
+	/* Clean it up */
+	if (!memcmp(path, "./", 2)) {
+		path += 2;
+		while (*path == '/')
+			path++;
+	}
+	return path;
+}
+
+char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+
+	va_start(args, fmt);
+	len = vsnprintf(buf, n, fmt, args);
+	va_end(args);
+	if (len >= n) {
+		strlcpy(buf, bad_path, n);
+		return buf;
+	}
+	return cleanup_path(buf);
+}
+
+static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
+{
+	const char *perf_dir = get_perf_dir();
+	size_t len;
+
+	len = strlen(perf_dir);
+	if (n < len + 1)
+		goto bad;
+	memcpy(buf, perf_dir, len);
+	if (len && !is_dir_sep(perf_dir[len-1]))
+		buf[len++] = '/';
+	len += vsnprintf(buf + len, n - len, fmt, args);
+	if (len >= n)
+		goto bad;
+	return cleanup_path(buf);
+bad:
+	strlcpy(buf, bad_path, n);
+	return buf;
+}
+
+char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(buf, n, fmt, args);
+	va_end(args);
+	return buf;
+}
+
+char *perf_pathdup(const char *fmt, ...)
+{
+	char path[PATH_MAX];
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(path, sizeof(path), fmt, args);
+	va_end(args);
+	return xstrdup(path);
+}
+
+char *mkpath(const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+	char *pathname = get_pathname();
+
+	va_start(args, fmt);
+	len = vsnprintf(pathname, PATH_MAX, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+char *perf_path(const char *fmt, ...)
+{
+	const char *perf_dir = get_perf_dir();
+	char *pathname = get_pathname();
+	va_list args;
+	unsigned len;
+
+	len = strlen(perf_dir);
+	if (len > PATH_MAX-100)
+		return bad_path;
+	memcpy(pathname, perf_dir, len);
+	if (len && perf_dir[len-1] != '/')
+		pathname[len++] = '/';
+	va_start(args, fmt);
+	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+int perf_mkstemp(char *path, size_t len, const char *template)
+{
+	const char *tmp;
+	size_t n;
+
+	tmp = getenv("TMPDIR");
+	if (!tmp)
+		tmp = "/tmp";
+	n = snprintf(path, len, "%s/%s", tmp, template);
+	if (len <= n) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+	return mkstemp(path);
+}
+
+
+const char *make_relative_path(const char *abs, const char *base)
+{
+	static char buf[PATH_MAX + 1];
+	int baselen;
+	if (!base)
+		return abs;
+	baselen = strlen(base);
+	if (prefixcmp(abs, base))
+		return abs;
+	if (abs[baselen] == '/')
+		baselen++;
+	else if (base[baselen - 1] != '/')
+		return abs;
+	strcpy(buf, abs + baselen);
+	return buf;
+}
+
+/*
+ * It is okay if dst == src, but they should not overlap otherwise.
+ *
+ * Performs the following normalizations on src, storing the result in dst:
+ * - Ensures that components are separated by '/' (Windows only)
+ * - Squashes sequences of '/'.
+ * - Removes "." components.
+ * - Removes ".." components, and the components the precede them.
+ * Returns failure (non-zero) if a ".." component appears as first path
+ * component anytime during the normalization. Otherwise, returns success (0).
+ *
+ * Note that this function is purely textual.  It does not follow symlinks,
+ * verify the existence of the path, or make any system calls.
+ */
+int normalize_path_copy(char *dst, const char *src)
+{
+	char *dst0;
+
+	if (has_dos_drive_prefix(src)) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	dst0 = dst;
+
+	if (is_dir_sep(*src)) {
+		*dst++ = '/';
+		while (is_dir_sep(*src))
+			src++;
+	}
+
+	for (;;) {
+		char c = *src;
+
+		/*
+		 * A path component that begins with . could be
+		 * special:
+		 * (1) "." and ends   -- ignore and terminate.
+		 * (2) "./"           -- ignore them, eat slash and continue.
+		 * (3) ".." and ends  -- strip one and terminate.
+		 * (4) "../"          -- strip one, eat slash and continue.
+		 */
+		if (c == '.') {
+			if (!src[1]) {
+				/* (1) */
+				src++;
+			} else if (is_dir_sep(src[1])) {
+				/* (2) */
+				src += 2;
+				while (is_dir_sep(*src))
+					src++;
+				continue;
+			} else if (src[1] == '.') {
+				if (!src[2]) {
+					/* (3) */
+					src += 2;
+					goto up_one;
+				} else if (is_dir_sep(src[2])) {
+					/* (4) */
+					src += 3;
+					while (is_dir_sep(*src))
+						src++;
+					goto up_one;
+				}
+			}
+		}
+
+		/* copy up to the next '/', and eat all '/' */
+		while ((c = *src++) != '\0' && !is_dir_sep(c))
+			*dst++ = c;
+		if (is_dir_sep(c)) {
+			*dst++ = '/';
+			while (is_dir_sep(c))
+				c = *src++;
+			src--;
+		} else if (!c)
+			break;
+		continue;
+
+	up_one:
+		/*
+		 * dst0..dst is prefix portion, and dst[-1] is '/';
+		 * go up one level.
+		 */
+		dst--;	/* go to trailing '/' */
+		if (dst <= dst0)
+			return -1;
+		/* Windows: dst[-1] cannot be backslash anymore */
+		while (dst0 < dst && dst[-1] != '/')
+			dst--;
+	}
+	*dst = '\0';
+	return 0;
+}
+
+/*
+ * path = Canonical absolute path
+ * prefix_list = Colon-separated list of absolute paths
+ *
+ * Determines, for each path in prefix_list, whether the "prefix" really
+ * is an ancestor directory of path.  Returns the length of the longest
+ * ancestor directory, excluding any trailing slashes, or -1 if no prefix
+ * is an ancestor.  (Note that this means 0 is returned if prefix_list is
+ * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
+ * are not considered to be their own ancestors.  path must be in a
+ * canonical form: empty components, or "." or ".." components are not
+ * allowed.  prefix_list may be null, which is like "".
+ */
+int longest_ancestor_length(const char *path, const char *prefix_list)
+{
+	char buf[PATH_MAX+1];
+	const char *ceil, *colon;
+	int len, max_len = -1;
+
+	if (prefix_list == NULL || !strcmp(path, "/"))
+		return -1;
+
+	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
+		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
+		len = colon - ceil;
+		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
+			continue;
+		strlcpy(buf, ceil, len+1);
+		if (normalize_path_copy(buf, buf) < 0)
+			continue;
+		len = strlen(buf);
+		if (len > 0 && buf[len-1] == '/')
+			buf[--len] = '\0';
+
+		if (!strncmp(path, buf, len) &&
+		    path[len] == '/' &&
+		    len > max_len) {
+			max_len = len;
+		}
+	}
+
+	return max_len;
+}
+
+/* strip arbitrary amount of directory separators at end of path */
+static inline int chomp_trailing_dir_sep(const char *path, int len)
+{
+	while (len && is_dir_sep(path[len - 1]))
+		len--;
+	return len;
+}
+
+/*
+ * If path ends with suffix (complete path components), returns the
+ * part before suffix (sans trailing directory separators).
+ * Otherwise returns NULL.
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+	int path_len = strlen(path), suffix_len = strlen(suffix);
+
+	while (suffix_len) {
+		if (!path_len)
+			return NULL;
+
+		if (is_dir_sep(path[path_len - 1])) {
+			if (!is_dir_sep(suffix[suffix_len - 1]))
+				return NULL;
+			path_len = chomp_trailing_dir_sep(path, path_len);
+			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+		}
+		else if (path[--path_len] != suffix[--suffix_len])
+			return NULL;
+	}
+
+	if (path_len && !is_dir_sep(path[path_len - 1]))
+		return NULL;
+	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c
new file mode 100644
index 0000000..f18c521
--- /dev/null
+++ b/tools/perf/util/quote.c
@@ -0,0 +1,481 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+	return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+	char *to_free = NULL;
+
+	if (dst->buf == src)
+		to_free = strbuf_detach(dst, NULL);
+
+	strbuf_addch(dst, '\'');
+	while (*src) {
+		size_t len = strcspn(src, "'!");
+		strbuf_add(dst, src, len);
+		src += len;
+		while (need_bs_quote(*src)) {
+			strbuf_addstr(dst, "'\\");
+			strbuf_addch(dst, *src++);
+			strbuf_addch(dst, '\'');
+		}
+	}
+	strbuf_addch(dst, '\'');
+	free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('\'', stream);
+	while ((c = *src++)) {
+		if (need_bs_quote(c)) {
+			fputs("'\\", stream);
+			fputc(c, stream);
+			fputc('\'', stream);
+		} else {
+			fputc(c, stream);
+		}
+	}
+	fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+	int i;
+
+	/* Copy into destination buffer. */
+	strbuf_grow(dst, 255);
+	for (i = 0; argv[i]; ++i) {
+		strbuf_addch(dst, ' ');
+		sq_quote_buf(dst, argv[i]);
+		if (maxlen && dst->len > maxlen)
+			die("Too many or long arguments");
+	}
+}
+
+char *sq_dequote_step(char *arg, char **next)
+{
+	char *dst = arg;
+	char *src = arg;
+	char c;
+
+	if (*src != '\'')
+		return NULL;
+	for (;;) {
+		c = *++src;
+		if (!c)
+			return NULL;
+		if (c != '\'') {
+			*dst++ = c;
+			continue;
+		}
+		/* We stepped out of sq */
+		switch (*++src) {
+		case '\0':
+			*dst = 0;
+			if (next)
+				*next = NULL;
+			return arg;
+		case '\\':
+			c = *++src;
+			if (need_bs_quote(c) && *++src == '\'') {
+				*dst++ = c;
+				continue;
+			}
+		/* Fallthrough */
+		default:
+			if (!next || !isspace(*src))
+				return NULL;
+			do {
+				c = *++src;
+			} while (isspace(c));
+			*dst = 0;
+			*next = src;
+			return arg;
+		}
+	}
+}
+
+char *sq_dequote(char *arg)
+{
+	return sq_dequote_step(arg, NULL);
+}
+
+int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
+{
+	char *next = arg;
+
+	if (!*arg)
+		return 0;
+	do {
+		char *dequoted = sq_dequote_step(next, &next);
+		if (!dequoted)
+			return -1;
+		ALLOC_GROW(*argv, *nr + 1, *alloc);
+		(*argv)[(*nr)++] = dequoted;
+	} while (next);
+
+	return 0;
+}
+
+/* 1 means: quote as octal
+ * 0 means: quote as octal if (quote_path_fully)
+ * -1 means: never quote
+ * c: quote as "\\c"
+ */
+#define X8(x)   x, x, x, x, x, x, x, x
+#define X16(x)  X8(x), X8(x)
+static signed char const sq_lookup[256] = {
+	/*           0    1    2    3    4    5    6    7 */
+	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
+	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
+	/* 0x10 */ X16(1),
+	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
+	/* 0x28 */ X16(-1), X16(-1), X16(-1),
+	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
+	/* 0x60 */ X16(-1), X8(-1),
+	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
+	/* 0x80 */ /* set to 0 */
+};
+
+static inline int sq_must_quote(char c)
+{
+	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
+}
+
+/* returns the longest prefix not needing a quote up to maxlen if positive.
+   This stops at the first \0 because it's marked as a character needing an
+   escape */
+static size_t next_quote_pos(const char *s, ssize_t maxlen)
+{
+	size_t len;
+	if (maxlen < 0) {
+		for (len = 0; !sq_must_quote(s[len]); len++);
+	} else {
+		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
+	}
+	return len;
+}
+
+/*
+ * C-style name quoting.
+ *
+ * (1) if sb and fp are both NULL, inspect the input name and counts the
+ *     number of bytes that are needed to hold c_style quoted version of name,
+ *     counting the double quotes around it but not terminating NUL, and
+ *     returns it.
+ *     However, if name does not need c_style quoting, it returns 0.
+ *
+ * (2) if sb or fp are not NULL, it emits the c_style quoted version
+ *     of name, enclosed with double quotes if asked and needed only.
+ *     Return value is the same as in (1).
+ */
+static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
+                                    struct strbuf *sb, FILE *fp, int no_dq)
+{
+#undef EMIT
+#define EMIT(c)                                 \
+	do {                                        \
+		if (sb) strbuf_addch(sb, (c));          \
+		if (fp) fputc((c), fp);                 \
+		count++;                                \
+	} while (0)
+#define EMITBUF(s, l)                           \
+	do {                                        \
+		int __ret;				\
+		if (sb) strbuf_add(sb, (s), (l));       \
+		if (fp) __ret = fwrite((s), (l), 1, fp);        \
+		count += (l);                           \
+	} while (0)
+
+	size_t len, count = 0;
+	const char *p = name;
+
+	for (;;) {
+		int ch;
+
+		len = next_quote_pos(p, maxlen);
+		if (len == maxlen || !p[len])
+			break;
+
+		if (!no_dq && p == name)
+			EMIT('"');
+
+		EMITBUF(p, len);
+		EMIT('\\');
+		p += len;
+		ch = (unsigned char)*p++;
+		if (sq_lookup[ch] >= ' ') {
+			EMIT(sq_lookup[ch]);
+		} else {
+			EMIT(((ch >> 6) & 03) + '0');
+			EMIT(((ch >> 3) & 07) + '0');
+			EMIT(((ch >> 0) & 07) + '0');
+		}
+	}
+
+	EMITBUF(p, len);
+	if (p == name)   /* no ending quote needed */
+		return 0;
+
+	if (!no_dq)
+		EMIT('"');
+	return count;
+}
+
+size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
+{
+	return quote_c_style_counted(name, -1, sb, fp, nodq);
+}
+
+void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
+{
+	if (quote_c_style(prefix, NULL, NULL, 0) ||
+	    quote_c_style(path, NULL, NULL, 0)) {
+		if (!nodq)
+			strbuf_addch(sb, '"');
+		quote_c_style(prefix, sb, NULL, 1);
+		quote_c_style(path, sb, NULL, 1);
+		if (!nodq)
+			strbuf_addch(sb, '"');
+	} else {
+		strbuf_addstr(sb, prefix);
+		strbuf_addstr(sb, path);
+	}
+}
+
+void write_name_quoted(const char *name, FILE *fp, int terminator)
+{
+	if (terminator) {
+		quote_c_style(name, NULL, fp, 0);
+	} else {
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *fp, int terminator)
+{
+	int needquote = 0;
+
+	if (terminator) {
+		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
+			|| name[next_quote_pos(name, -1)];
+	}
+	if (needquote) {
+		fputc('"', fp);
+		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
+		quote_c_style(name, NULL, fp, 1);
+		fputc('"', fp);
+	} else {
+		int ret;
+
+		ret = fwrite(pfx, pfxlen, 1, fp);
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix)
+{
+	int needquote;
+
+	if (len < 0)
+		len = strlen(in);
+
+	/* "../" prefix itself does not need quoting, but "in" might. */
+	needquote = next_quote_pos(in, len) < len;
+	strbuf_setlen(out, 0);
+	strbuf_grow(out, len);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (prefix) {
+		int off = 0;
+		while (prefix[off] && off < len && prefix[off] == in[off])
+			if (prefix[off] == '/') {
+				prefix += off + 1;
+				in += off + 1;
+				len -= off + 1;
+				off = 0;
+			} else
+				off++;
+
+		for (; *prefix; prefix++)
+			if (*prefix == '/')
+				strbuf_addstr(out, "../");
+	}
+
+	quote_c_style_counted (in, len, out, NULL, 1);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (!out->len)
+		strbuf_addstr(out, "./");
+
+	return out->buf;
+}
+
+/*
+ * C-style name unquoting.
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+	size_t oldlen = sb->len, len;
+	int ch, ac;
+
+	if (*quoted++ != '"')
+		return -1;
+
+	for (;;) {
+		len = strcspn(quoted, "\"\\");
+		strbuf_add(sb, quoted, len);
+		quoted += len;
+
+		switch (*quoted++) {
+		  case '"':
+			if (endp)
+				*endp = quoted;
+			return 0;
+		  case '\\':
+			break;
+		  default:
+			goto error;
+		}
+
+		switch ((ch = *quoted++)) {
+		case 'a': ch = '\a'; break;
+		case 'b': ch = '\b'; break;
+		case 'f': ch = '\f'; break;
+		case 'n': ch = '\n'; break;
+		case 'r': ch = '\r'; break;
+		case 't': ch = '\t'; break;
+		case 'v': ch = '\v'; break;
+
+		case '\\': case '"':
+			break; /* verbatim */
+
+		/* octal values with first digit over 4 overflow */
+		case '0': case '1': case '2': case '3':
+					ac = ((ch - '0') << 6);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= ((ch - '0') << 3);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= (ch - '0');
+					ch = ac;
+					break;
+				default:
+			goto error;
+			}
+		strbuf_addch(sb, ch);
+		}
+
+  error:
+	strbuf_setlen(sb, oldlen);
+	return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	const char nl = '\n';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == nl) {
+			fputc(bq, stream);
+			fputc('n', stream);
+			continue;
+		}
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('"', stream);
+	while ((c = *src++)) {
+		switch (c) {
+		case '[': case ']':
+		case '{': case '}':
+		case '$': case '\\': case '"':
+			fputc('\\', stream);
+		default:
+			fputc(c, stream);
+			break;
+		case '\f':
+			fputs("\\f", stream);
+			break;
+		case '\r':
+			fputs("\\r", stream);
+			break;
+		case '\n':
+			fputs("\\n", stream);
+			break;
+		case '\t':
+			fputs("\\t", stream);
+			break;
+		case '\v':
+			fputs("\\v", stream);
+			break;
+		}
+	}
+	fputc('"', stream);
+}
diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h
new file mode 100644
index 0000000..5dfad89
--- /dev/null
+++ b/tools/perf/util/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free result from
+ * sq_quote() in a real application.
+ *
+ * sq_quote_buf() writes to an existing buffer of specified size; it
+ * will return the number of characters that would have been written
+ * excluding the final null regardless of the buffer size.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/tools/perf/util/rbtree.c b/tools/perf/util/rbtree.c
new file mode 100644
index 0000000..b15ba9c
--- /dev/null
+++ b/tools/perf/util/rbtree.c
@@ -0,0 +1,383 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include "rbtree.h"
+
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *right = node->rb_right;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_right = right->rb_left))
+		rb_set_parent(right->rb_left, node);
+	right->rb_left = node;
+
+	rb_set_parent(right, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_left)
+			parent->rb_left = right;
+		else
+			parent->rb_right = right;
+	}
+	else
+		root->rb_node = right;
+	rb_set_parent(node, right);
+}
+
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *left = node->rb_left;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_left = left->rb_right))
+		rb_set_parent(left->rb_right, node);
+	left->rb_right = node;
+
+	rb_set_parent(left, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_right)
+			parent->rb_right = left;
+		else
+			parent->rb_left = left;
+	}
+	else
+		root->rb_node = left;
+	rb_set_parent(node, left);
+}
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *parent, *gparent;
+
+	while ((parent = rb_parent(node)) && rb_is_red(parent))
+	{
+		gparent = rb_parent(parent);
+
+		if (parent == gparent->rb_left)
+		{
+			{
+				register struct rb_node *uncle = gparent->rb_right;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_right == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_left(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_right(gparent, root);
+		} else {
+			{
+				register struct rb_node *uncle = gparent->rb_left;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_left == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_right(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_left(gparent, root);
+		}
+	}
+
+	rb_set_black(root->rb_node);
+}
+
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+			     struct rb_root *root)
+{
+	struct rb_node *other;
+
+	while ((!node || rb_is_black(node)) && node != root->rb_node)
+	{
+		if (parent->rb_left == node)
+		{
+			other = parent->rb_right;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_left(parent, root);
+				other = parent->rb_right;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_right || rb_is_black(other->rb_right))
+				{
+					rb_set_black(other->rb_left);
+					rb_set_red(other);
+					__rb_rotate_right(other, root);
+					other = parent->rb_right;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_right);
+				__rb_rotate_left(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+		else
+		{
+			other = parent->rb_left;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_right(parent, root);
+				other = parent->rb_left;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_left || rb_is_black(other->rb_left))
+				{
+					rb_set_black(other->rb_right);
+					rb_set_red(other);
+					__rb_rotate_left(other, root);
+					other = parent->rb_left;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_left);
+				__rb_rotate_right(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+	}
+	if (node)
+		rb_set_black(node);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *child, *parent;
+	int color;
+
+	if (!node->rb_left)
+		child = node->rb_right;
+	else if (!node->rb_right)
+		child = node->rb_left;
+	else
+	{
+		struct rb_node *old = node, *left;
+
+		node = node->rb_right;
+		while ((left = node->rb_left) != NULL)
+			node = left;
+		child = node->rb_right;
+		parent = rb_parent(node);
+		color = rb_color(node);
+
+		if (child)
+			rb_set_parent(child, parent);
+		if (parent == old) {
+			parent->rb_right = child;
+			parent = node;
+		} else
+			parent->rb_left = child;
+
+		node->rb_parent_color = old->rb_parent_color;
+		node->rb_right = old->rb_right;
+		node->rb_left = old->rb_left;
+
+		if (rb_parent(old))
+		{
+			if (rb_parent(old)->rb_left == old)
+				rb_parent(old)->rb_left = node;
+			else
+				rb_parent(old)->rb_right = node;
+		} else
+			root->rb_node = node;
+
+		rb_set_parent(old->rb_left, node);
+		if (old->rb_right)
+			rb_set_parent(old->rb_right, node);
+		goto color;
+	}
+
+	parent = rb_parent(node);
+	color = rb_color(node);
+
+	if (child)
+		rb_set_parent(child, parent);
+	if (parent)
+	{
+		if (parent->rb_left == node)
+			parent->rb_left = child;
+		else
+			parent->rb_right = child;
+	}
+	else
+		root->rb_node = child;
+
+ color:
+	if (color == RB_BLACK)
+		__rb_erase_color(child, parent, root);
+}
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a right-hand child, go down and then left as far
+	   as we can. */
+	if (node->rb_right) {
+		node = node->rb_right; 
+		while (node->rb_left)
+			node=node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/* No right-hand children.  Everything down and left is
+	   smaller than us, so any 'next' node must be in the general
+	   direction of our parent. Go up the tree; any time the
+	   ancestor is a right-hand child of its parent, keep going
+	   up. First time it's a left-hand child of its parent, said
+	   parent is our 'next' node. */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left; 
+		while (node->rb_right)
+			node=node->rb_right;
+		return (struct rb_node *)node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (parent) {
+		if (victim == parent->rb_left)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else {
+		root->rb_node = new;
+	}
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
diff --git a/tools/perf/util/rbtree.h b/tools/perf/util/rbtree.h
new file mode 100644
index 0000000..6bdc488
--- /dev/null
+++ b/tools/perf/util/rbtree.h
@@ -0,0 +1,171 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  Some example of insert and search follows here. The search is a plain
+  normal search over an ordered tree. The insert instead must be implemented
+  int two steps: as first thing the code must insert the element in
+  order as a red leaf in the tree, then the support library function
+  rb_insert_color() must be called. Such function will do the
+  not trivial work to rebalance the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+						 unsigned long offset)
+{
+	struct rb_node * n = inode->i_rb_page_cache.rb_node;
+	struct page * page;
+
+	while (n)
+	{
+		page = rb_entry(n, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			n = n->rb_left;
+		else if (offset > page->offset)
+			n = n->rb_right;
+		else
+			return page;
+	}
+	return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+						   unsigned long offset,
+						   struct rb_node * node)
+{
+	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
+	struct rb_node * parent = NULL;
+	struct page * page;
+
+	while (*p)
+	{
+		parent = *p;
+		page = rb_entry(parent, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			p = &(*p)->rb_left;
+		else if (offset > page->offset)
+			p = &(*p)->rb_right;
+		else
+			return page;
+	}
+
+	rb_link_node(node, parent, p);
+
+	return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+						 unsigned long offset,
+						 struct rb_node * node)
+{
+	struct page * ret;
+	if ((ret = __rb_insert_page_cache(inode, offset, node)))
+		goto out;
+	rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+	return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef	_LINUX_RBTREE_H
+#define	_LINUX_RBTREE_H
+
+#include <stddef.h>
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+struct rb_node
+{
+	unsigned long  rb_parent_color;
+#define	RB_RED		0
+#define	RB_BLACK	1
+	struct rb_node *rb_right;
+	struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root
+{
+	struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
+}
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
+#define RB_ROOT	(struct rb_root) { NULL, }
+#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
+#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+			    struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
+{
+	node->rb_parent_color = (unsigned long )parent;
+	node->rb_left = node->rb_right = NULL;
+
+	*rb_link = node;
+}
+
+#endif	/* _LINUX_RBTREE_H */
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c
new file mode 100644
index 0000000..b2f5e85
--- /dev/null
+++ b/tools/perf/util/run-command.c
@@ -0,0 +1,395 @@
+#include "cache.h"
+#include "run-command.h"
+#include "exec_cmd.h"
+
+static inline void close_pair(int fd[2])
+{
+	close(fd[0]);
+	close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+	int fd = open("/dev/null", O_RDWR);
+	dup2(fd, to);
+	close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+	int need_in, need_out, need_err;
+	int fdin[2], fdout[2], fderr[2];
+
+	/*
+	 * In case of errors we must keep the promise to close FDs
+	 * that have been passed in via ->in and ->out.
+	 */
+
+	need_in = !cmd->no_stdin && cmd->in < 0;
+	if (need_in) {
+		if (pipe(fdin) < 0) {
+			if (cmd->out > 0)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->in = fdin[1];
+	}
+
+	need_out = !cmd->no_stdout
+		&& !cmd->stdout_to_stderr
+		&& cmd->out < 0;
+	if (need_out) {
+		if (pipe(fdout) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->out = fdout[0];
+	}
+
+	need_err = !cmd->no_stderr && cmd->err < 0;
+	if (need_err) {
+		if (pipe(fderr) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			if (need_out)
+				close_pair(fdout);
+			else if (cmd->out)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->err = fderr[0];
+	}
+
+#ifndef __MINGW32__
+	fflush(NULL);
+	cmd->pid = fork();
+	if (!cmd->pid) {
+		if (cmd->no_stdin)
+			dup_devnull(0);
+		else if (need_in) {
+			dup2(fdin[0], 0);
+			close_pair(fdin);
+		} else if (cmd->in) {
+			dup2(cmd->in, 0);
+			close(cmd->in);
+		}
+
+		if (cmd->no_stderr)
+			dup_devnull(2);
+		else if (need_err) {
+			dup2(fderr[1], 2);
+			close_pair(fderr);
+		}
+
+		if (cmd->no_stdout)
+			dup_devnull(1);
+		else if (cmd->stdout_to_stderr)
+			dup2(2, 1);
+		else if (need_out) {
+			dup2(fdout[1], 1);
+			close_pair(fdout);
+		} else if (cmd->out > 1) {
+			dup2(cmd->out, 1);
+			close(cmd->out);
+		}
+
+		if (cmd->dir && chdir(cmd->dir))
+			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
+			    cmd->dir, strerror(errno));
+		if (cmd->env) {
+			for (; *cmd->env; cmd->env++) {
+				if (strchr(*cmd->env, '='))
+					putenv((char*)*cmd->env);
+				else
+					unsetenv(*cmd->env);
+			}
+		}
+		if (cmd->preexec_cb)
+			cmd->preexec_cb();
+		if (cmd->perf_cmd) {
+			execv_perf_cmd(cmd->argv);
+		} else {
+			execvp(cmd->argv[0], (char *const*) cmd->argv);
+		}
+		exit(127);
+	}
+#else
+	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
+	const char **sargv = cmd->argv;
+	char **env = environ;
+
+	if (cmd->no_stdin) {
+		s0 = dup(0);
+		dup_devnull(0);
+	} else if (need_in) {
+		s0 = dup(0);
+		dup2(fdin[0], 0);
+	} else if (cmd->in) {
+		s0 = dup(0);
+		dup2(cmd->in, 0);
+	}
+
+	if (cmd->no_stderr) {
+		s2 = dup(2);
+		dup_devnull(2);
+	} else if (need_err) {
+		s2 = dup(2);
+		dup2(fderr[1], 2);
+	}
+
+	if (cmd->no_stdout) {
+		s1 = dup(1);
+		dup_devnull(1);
+	} else if (cmd->stdout_to_stderr) {
+		s1 = dup(1);
+		dup2(2, 1);
+	} else if (need_out) {
+		s1 = dup(1);
+		dup2(fdout[1], 1);
+	} else if (cmd->out > 1) {
+		s1 = dup(1);
+		dup2(cmd->out, 1);
+	}
+
+	if (cmd->dir)
+		die("chdir in start_command() not implemented");
+	if (cmd->env) {
+		env = copy_environ();
+		for (; *cmd->env; cmd->env++)
+			env = env_setenv(env, *cmd->env);
+	}
+
+	if (cmd->perf_cmd) {
+		cmd->argv = prepare_perf_cmd(cmd->argv);
+	}
+
+	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
+
+	if (cmd->env)
+		free_environ(env);
+	if (cmd->perf_cmd)
+		free(cmd->argv);
+
+	cmd->argv = sargv;
+	if (s0 >= 0)
+		dup2(s0, 0), close(s0);
+	if (s1 >= 0)
+		dup2(s1, 1), close(s1);
+	if (s2 >= 0)
+		dup2(s2, 2), close(s2);
+#endif
+
+	if (cmd->pid < 0) {
+		int err = errno;
+		if (need_in)
+			close_pair(fdin);
+		else if (cmd->in)
+			close(cmd->in);
+		if (need_out)
+			close_pair(fdout);
+		else if (cmd->out)
+			close(cmd->out);
+		if (need_err)
+			close_pair(fderr);
+		return err == ENOENT ?
+			-ERR_RUN_COMMAND_EXEC :
+			-ERR_RUN_COMMAND_FORK;
+	}
+
+	if (need_in)
+		close(fdin[0]);
+	else if (cmd->in)
+		close(cmd->in);
+
+	if (need_out)
+		close(fdout[1]);
+	else if (cmd->out)
+		close(cmd->out);
+
+	if (need_err)
+		close(fderr[1]);
+
+	return 0;
+}
+
+static int wait_or_whine(pid_t pid)
+{
+	for (;;) {
+		int status, code;
+		pid_t waiting = waitpid(pid, &status, 0);
+
+		if (waiting < 0) {
+			if (errno == EINTR)
+				continue;
+			error("waitpid failed (%s)", strerror(errno));
+			return -ERR_RUN_COMMAND_WAITPID;
+		}
+		if (waiting != pid)
+			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
+		if (WIFSIGNALED(status))
+			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
+
+		if (!WIFEXITED(status))
+			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
+		code = WEXITSTATUS(status);
+		switch (code) {
+		case 127:
+			return -ERR_RUN_COMMAND_EXEC;
+		case 0:
+			return 0;
+		default:
+			return -code;
+		}
+	}
+}
+
+int finish_command(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid);
+}
+
+int run_command(struct child_process *cmd)
+{
+	int code = start_command(cmd);
+	if (code)
+		return code;
+	return finish_command(cmd);
+}
+
+static void prepare_run_command_v_opt(struct child_process *cmd,
+				      const char **argv,
+				      int opt)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->argv = argv;
+	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
+	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	return run_command(&cmd);
+}
+
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	cmd.dir = dir;
+	cmd.env = env;
+	return run_command(&cmd);
+}
+
+#ifdef __MINGW32__
+static __stdcall unsigned run_thread(void *data)
+{
+	struct async *async = data;
+	return async->proc(async->fd_for_proc, async->data);
+}
+#endif
+
+int start_async(struct async *async)
+{
+	int pipe_out[2];
+
+	if (pipe(pipe_out) < 0)
+		return error("cannot create pipe: %s", strerror(errno));
+	async->out = pipe_out[0];
+
+#ifndef __MINGW32__
+	/* Flush stdio before fork() to avoid cloning buffers */
+	fflush(NULL);
+
+	async->pid = fork();
+	if (async->pid < 0) {
+		error("fork (async) failed: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+	if (!async->pid) {
+		close(pipe_out[0]);
+		exit(!!async->proc(pipe_out[1], async->data));
+	}
+	close(pipe_out[1]);
+#else
+	async->fd_for_proc = pipe_out[1];
+	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
+	if (!async->tid) {
+		error("cannot create thread: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+int finish_async(struct async *async)
+{
+#ifndef __MINGW32__
+	int ret = 0;
+
+	if (wait_or_whine(async->pid))
+		ret = error("waitpid (async) failed");
+#else
+	DWORD ret = 0;
+	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
+		ret = error("waiting for thread failed: %lu", GetLastError());
+	else if (!GetExitCodeThread(async->tid, &ret))
+		ret = error("cannot get thread exit code: %lu", GetLastError());
+	CloseHandle(async->tid);
+#endif
+	return ret;
+}
+
+int run_hook(const char *index_file, const char *name, ...)
+{
+	struct child_process hook;
+	const char **argv = NULL, *env[2];
+	char index[PATH_MAX];
+	va_list args;
+	int ret;
+	size_t i = 0, alloc = 0;
+
+	if (access(perf_path("hooks/%s", name), X_OK) < 0)
+		return 0;
+
+	va_start(args, name);
+	ALLOC_GROW(argv, i + 1, alloc);
+	argv[i++] = perf_path("hooks/%s", name);
+	while (argv[i-1]) {
+		ALLOC_GROW(argv, i + 1, alloc);
+		argv[i++] = va_arg(args, const char *);
+	}
+	va_end(args);
+
+	memset(&hook, 0, sizeof(hook));
+	hook.argv = argv;
+	hook.no_stdin = 1;
+	hook.stdout_to_stderr = 1;
+	if (index_file) {
+		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+		env[0] = index;
+		env[1] = NULL;
+		hook.env = env;
+	}
+
+	ret = start_command(&hook);
+	free(argv);
+	if (ret) {
+		warning("Could not spawn %s", argv[0]);
+		return ret;
+	}
+	ret = finish_command(&hook);
+	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+		warning("%s exited due to uncaught signal", argv[0]);
+
+	return ret;
+}
diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h
new file mode 100644
index 0000000..328289f
--- /dev/null
+++ b/tools/perf/util/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
+ *
+ * It is expected that no synchronization and mutual exclusion between
+ * the caller and the feed function is necessary so that the function
+ * can run in a thread without interfering with the caller.
+ */
+struct async {
+	/*
+	 * proc writes to fd and closes it;
+	 * returns 0 on success, non-zero on failure
+	 */
+	int (*proc)(int fd, void *data);
+	void *data;
+	int out;	/* caller reads from here and closes it */
+#ifndef __MINGW32__
+	pid_t pid;
+#else
+	HANDLE tid;
+	int fd_for_proc;
+#endif
+};
+
+int start_async(struct async *async);
+int finish_async(struct async *async);
+
+#endif
diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c
new file mode 100644
index 0000000..1118b99
--- /dev/null
+++ b/tools/perf/util/sigchain.c
@@ -0,0 +1,52 @@
+#include "sigchain.h"
+#include "cache.h"
+
+#define SIGCHAIN_MAX_SIGNALS 32
+
+struct sigchain_signal {
+	sigchain_fun *old;
+	int n;
+	int alloc;
+};
+static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
+
+static void check_signum(int sig)
+{
+	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
+		die("BUG: signal out of range: %d", sig);
+}
+
+int sigchain_push(int sig, sigchain_fun f)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+
+	ALLOC_GROW(s->old, s->n + 1, s->alloc);
+	s->old[s->n] = signal(sig, f);
+	if (s->old[s->n] == SIG_ERR)
+		return -1;
+	s->n++;
+	return 0;
+}
+
+int sigchain_pop(int sig)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+	if (s->n < 1)
+		return 0;
+
+	if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
+		return -1;
+	s->n--;
+	return 0;
+}
+
+void sigchain_push_common(sigchain_fun f)
+{
+	sigchain_push(SIGINT, f);
+	sigchain_push(SIGHUP, f);
+	sigchain_push(SIGTERM, f);
+	sigchain_push(SIGQUIT, f);
+	sigchain_push(SIGPIPE, f);
+}
diff --git a/tools/perf/util/sigchain.h b/tools/perf/util/sigchain.h
new file mode 100644
index 0000000..618083bc
--- /dev/null
+++ b/tools/perf/util/sigchain.h
@@ -0,0 +1,11 @@
+#ifndef SIGCHAIN_H
+#define SIGCHAIN_H
+
+typedef void (*sigchain_fun)(int);
+
+int sigchain_push(int sig, sigchain_fun f);
+int sigchain_pop(int sig);
+
+void sigchain_push_common(sigchain_fun f);
+
+#endif /* SIGCHAIN_H */
diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c
new file mode 100644
index 0000000..eaba093
--- /dev/null
+++ b/tools/perf/util/strbuf.c
@@ -0,0 +1,359 @@
+#include "cache.h"
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++)
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+/*
+ * Used as the default ->buf value, so that people can always assume
+ * buf is non NULL and ->buf is NUL terminated even for a freshly
+ * initialized strbuf.
+ */
+char strbuf_slopbuf[1];
+
+void strbuf_init(struct strbuf *sb, size_t hint)
+{
+	sb->alloc = sb->len = 0;
+	sb->buf = strbuf_slopbuf;
+	if (hint)
+		strbuf_grow(sb, hint);
+}
+
+void strbuf_release(struct strbuf *sb)
+{
+	if (sb->alloc) {
+		free(sb->buf);
+		strbuf_init(sb, 0);
+	}
+}
+
+char *strbuf_detach(struct strbuf *sb, size_t *sz)
+{
+	char *res = sb->alloc ? sb->buf : NULL;
+	if (sz)
+		*sz = sb->len;
+	strbuf_init(sb, 0);
+	return res;
+}
+
+void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
+{
+	strbuf_release(sb);
+	sb->buf   = buf;
+	sb->len   = len;
+	sb->alloc = alloc;
+	strbuf_grow(sb, 0);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_grow(struct strbuf *sb, size_t extra)
+{
+	if (sb->len + extra + 1 <= sb->len)
+		die("you want to use way too much memory");
+	if (!sb->alloc)
+		sb->buf = NULL;
+	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+}
+
+void strbuf_trim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+void strbuf_rtrim(struct strbuf *sb)
+{
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_ltrim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_tolower(struct strbuf *sb)
+{
+	int i;
+	for (i = 0; i < sb->len; i++)
+		sb->buf[i] = tolower(sb->buf[i]);
+}
+
+struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
+{
+	int alloc = 2, pos = 0;
+	char *n, *p;
+	struct strbuf **ret;
+	struct strbuf *t;
+
+	ret = calloc(alloc, sizeof(struct strbuf *));
+	p = n = sb->buf;
+	while (n < sb->buf + sb->len) {
+		int len;
+		n = memchr(n, delim, sb->len - (n - sb->buf));
+		if (pos + 1 >= alloc) {
+			alloc = alloc * 2;
+			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
+		}
+		if (!n)
+			n = sb->buf + sb->len - 1;
+		len = n - p + 1;
+		t = malloc(sizeof(struct strbuf));
+		strbuf_init(t, len);
+		strbuf_add(t, p, len);
+		ret[pos] = t;
+		ret[++pos] = NULL;
+		p = ++n;
+	}
+	return ret;
+}
+
+void strbuf_list_free(struct strbuf **sbs)
+{
+	struct strbuf **s = sbs;
+
+	while (*s) {
+		strbuf_release(*s);
+		free(*s++);
+	}
+	free(sbs);
+}
+
+int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
+{
+	int len = a->len < b->len ? a->len: b->len;
+	int cmp = memcmp(a->buf, b->buf, len);
+	if (cmp)
+		return cmp;
+	return a->len < b->len ? -1: a->len != b->len;
+}
+
+void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
+				   const void *data, size_t dlen)
+{
+	if (pos + len < pos)
+		die("you want to use way too much memory");
+	if (pos > sb->len)
+		die("`pos' is too far after the end of the buffer");
+	if (pos + len > sb->len)
+		die("`pos + len' is too far after the end of the buffer");
+
+	if (dlen >= len)
+		strbuf_grow(sb, dlen - len);
+	memmove(sb->buf + pos + dlen,
+			sb->buf + pos + len,
+			sb->len - pos - len);
+	memcpy(sb->buf + pos, data, dlen);
+	strbuf_setlen(sb, sb->len + dlen - len);
+}
+
+void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
+{
+	strbuf_splice(sb, pos, 0, data, len);
+}
+
+void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_splice(sb, pos, len, NULL, 0);
+}
+
+void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, data, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, sb->buf + pos, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+{
+	int len;
+	va_list ap;
+
+	if (!strbuf_avail(sb))
+		strbuf_grow(sb, 64);
+	va_start(ap, fmt);
+	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+	va_end(ap);
+	if (len < 0)
+		die("your vsnprintf is broken");
+	if (len > strbuf_avail(sb)) {
+		strbuf_grow(sb, len);
+		va_start(ap, fmt);
+		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+		va_end(ap);
+		if (len > strbuf_avail(sb)) {
+			die("this should not happen, your snprintf is broken");
+		}
+	}
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
+		   void *context)
+{
+	for (;;) {
+		const char *percent;
+		size_t consumed;
+
+		percent = strchrnul(format, '%');
+		strbuf_add(sb, format, percent - format);
+		if (!*percent)
+			break;
+		format = percent + 1;
+
+		consumed = fn(sb, format, context);
+		if (consumed)
+			format += consumed;
+		else
+			strbuf_addch(sb, '%');
+	}
+}
+
+size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
+		void *context)
+{
+	struct strbuf_expand_dict_entry *e = context;
+	size_t len;
+
+	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
+		if (!strncmp(placeholder, e->placeholder, len)) {
+			if (e->value)
+				strbuf_addstr(sb, e->value);
+			return len;
+		}
+	}
+	return 0;
+}
+
+size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
+{
+	size_t res;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, size);
+	res = fread(sb->buf + sb->len, 1, size, f);
+	if (res > 0)
+		strbuf_setlen(sb, sb->len + res);
+	else if (res < 0 && oldalloc == 0)
+		strbuf_release(sb);
+	return res;
+}
+
+ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
+{
+	size_t oldlen = sb->len;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, hint ? hint : 8192);
+	for (;;) {
+		ssize_t cnt;
+
+		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
+		if (cnt < 0) {
+			if (oldalloc == 0)
+				strbuf_release(sb);
+			else
+				strbuf_setlen(sb, oldlen);
+			return -1;
+		}
+		if (!cnt)
+			break;
+		sb->len += cnt;
+		strbuf_grow(sb, 8192);
+	}
+
+	sb->buf[sb->len] = '\0';
+	return sb->len - oldlen;
+}
+
+#define STRBUF_MAXLINK (2*PATH_MAX)
+
+int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
+{
+	size_t oldalloc = sb->alloc;
+
+	if (hint < 32)
+		hint = 32;
+
+	while (hint < STRBUF_MAXLINK) {
+		int len;
+
+		strbuf_grow(sb, hint);
+		len = readlink(path, sb->buf, hint);
+		if (len < 0) {
+			if (errno != ERANGE)
+				break;
+		} else if (len < hint) {
+			strbuf_setlen(sb, len);
+			return 0;
+		}
+
+		/* .. the buffer was too small - try again */
+		hint *= 2;
+	}
+	if (oldalloc == 0)
+		strbuf_release(sb);
+	return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+	int ch;
+
+	strbuf_grow(sb, 0);
+	if (feof(fp))
+		return EOF;
+
+	strbuf_reset(sb);
+	while ((ch = fgetc(fp)) != EOF) {
+		if (ch == term)
+			break;
+		strbuf_grow(sb, 1);
+		sb->buf[sb->len++] = ch;
+	}
+	if (ch == EOF && sb->len == 0)
+		return EOF;
+
+	sb->buf[sb->len] = '\0';
+	return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+	int fd, len;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	len = strbuf_read(sb, fd, hint);
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	return len;
+}
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
new file mode 100644
index 0000000..9ee908a
--- /dev/null
+++ b/tools/perf/util/strbuf.h
@@ -0,0 +1,137 @@
+#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
+ * long, overflow safe strings.
+ *
+ * Strbufs has some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensure this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somehow a
+ *         "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+	size_t alloc;
+	size_t len;
+	char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+	struct strbuf tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
+}
+
+extern void strbuf_grow(struct strbuf *, size_t);
+
+static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
+	if (!sb->alloc)
+		strbuf_grow(sb, 0);
+	assert(len < sb->alloc);
+	sb->len = len;
+	sb->buf[len] = '\0';
+}
+#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
+
+/*----- content related -----*/
+extern void strbuf_trim(struct strbuf *);
+extern void strbuf_rtrim(struct strbuf *);
+extern void strbuf_ltrim(struct strbuf *);
+extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
+extern void strbuf_tolower(struct strbuf *);
+
+extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
+extern void strbuf_list_free(struct strbuf **);
+
+/*----- add data in your buffer -----*/
+static inline void strbuf_addch(struct strbuf *sb, int c) {
+	strbuf_grow(sb, 1);
+	sb->buf[sb->len++] = c;
+	sb->buf[sb->len] = '\0';
+}
+
+extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
+extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
+
+/* splice pos..pos+len with given data */
+extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
+                          const void *, size_t);
+
+extern void strbuf_add(struct strbuf *, const void *, size_t);
+static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
+	strbuf_add(sb, s, strlen(s));
+}
+static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
+	strbuf_add(sb, sb2->buf, sb2->len);
+}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
+
+typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
+struct strbuf_expand_dict_entry {
+	const char *placeholder;
+	const char *value;
+};
+extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
+
+__attribute__((format(printf,2,3)))
+extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+
+extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
+/* XXX: if read fails, any partial read is undone */
+extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
+extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
+extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
+
+extern int strbuf_getline(struct strbuf *, FILE *, int);
+
+extern void stripspace(struct strbuf *buf, int skip_comments);
+extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
+
+extern int strbuf_branchname(struct strbuf *sb, const char *name);
+extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
+
+#endif /* STRBUF_H */
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
new file mode 100644
index 0000000..ec33c0c
--- /dev/null
+++ b/tools/perf/util/string.c
@@ -0,0 +1,34 @@
+#include "string.h"
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2u64(const char *ptr, __u64 *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
new file mode 100644
index 0000000..72812c1
--- /dev/null
+++ b/tools/perf/util/string.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_STRING_H_
+#define _PERF_STRING_H_
+
+#include <linux/types.h>
+
+int hex2u64(const char *ptr, __u64 *val);
+
+#endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
new file mode 100644
index 0000000..23f4f7b
--- /dev/null
+++ b/tools/perf/util/symbol.c
@@ -0,0 +1,574 @@
+#include "util.h"
+#include "../perf.h"
+#include "string.h"
+#include "symbol.h"
+
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+
+const char *sym_hist_filter;
+
+static struct symbol *symbol__new(uint64_t start, uint64_t len,
+				  const char *name, unsigned int priv_size,
+				  uint64_t obj_start, int verbose)
+{
+	size_t namelen = strlen(name) + 1;
+	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
+
+	if (!self)
+		return NULL;
+
+	if (verbose >= 2)
+		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
+			(__u64)start, len, name, self->hist, (void *)obj_start);
+
+	self->obj_start= obj_start;
+	self->hist = NULL;
+	self->hist_sum = 0;
+
+	if (sym_hist_filter && !strcmp(name, sym_hist_filter))
+		self->hist = calloc(sizeof(__u64), len);
+
+	if (priv_size) {
+		memset(self, 0, priv_size);
+		self = ((void *)self) + priv_size;
+	}
+	self->start = start;
+	self->end   = start + len - 1;
+	memcpy(self->name, name, namelen);
+
+	return self;
+}
+
+static void symbol__delete(struct symbol *self, unsigned int priv_size)
+{
+	free(((void *)self) - priv_size);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+	return fprintf(fp, " %llx-%llx %s\n",
+		       self->start, self->end, self->name);
+}
+
+struct dso *dso__new(const char *name, unsigned int sym_priv_size)
+{
+	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		strcpy(self->name, name);
+		self->syms = RB_ROOT;
+		self->sym_priv_size = sym_priv_size;
+		self->find_symbol = dso__find_symbol;
+	}
+
+	return self;
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+	struct symbol *pos;
+	struct rb_node *next = rb_first(&self->syms);
+
+	while (next) {
+		pos = rb_entry(next, struct symbol, rb_node);
+		next = rb_next(&pos->rb_node);
+		rb_erase(&pos->rb_node, &self->syms);
+		symbol__delete(pos, self->sym_priv_size);
+	}
+}
+
+void dso__delete(struct dso *self)
+{
+	dso__delete_symbols(self);
+	free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+	struct rb_node **p = &self->syms.rb_node;
+	struct rb_node *parent = NULL;
+	const uint64_t ip = sym->start;
+	struct symbol *s;
+
+	while (*p != NULL) {
+		parent = *p;
+		s = rb_entry(parent, struct symbol, rb_node);
+		if (ip < s->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&sym->rb_node, parent, p);
+	rb_insert_color(&sym->rb_node, &self->syms);
+}
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+{
+	struct rb_node *n;
+
+	if (self == NULL)
+		return NULL;
+
+	n = self->syms.rb_node;
+
+	while (n) {
+		struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+		if (ip < s->start)
+			n = n->rb_left;
+		else if (ip > s->end)
+			n = n->rb_right;
+		else
+			return s;
+	}
+
+	return NULL;
+}
+
+size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+	size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+	struct rb_node *nd;
+	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
+		ret += symbol__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
+{
+	struct rb_node *nd, *prevnd;
+	char *line = NULL;
+	size_t n;
+	FILE *file = fopen("/proc/kallsyms", "r");
+
+	if (file == NULL)
+		goto out_failure;
+
+	while (!feof(file)) {
+		__u64 start;
+		struct symbol *sym;
+		int line_len, len;
+		char symbol_type;
+
+		line_len = getline(&line, &n, file);
+		if (line_len < 0)
+			break;
+
+		if (!line)
+			goto out_failure;
+
+		line[--line_len] = '\0'; /* \n */
+
+		len = hex2u64(line, &start);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		symbol_type = toupper(line[len]);
+		/*
+		 * We're interested only in code ('T'ext)
+		 */
+		if (symbol_type != 'T' && symbol_type != 'W')
+			continue;
+		/*
+		 * Well fix up the end later, when we have all sorted.
+		 */
+		sym = symbol__new(start, 0xdead, line + len + 2,
+				  self->sym_priv_size, 0, verbose);
+
+		if (sym == NULL)
+			goto out_delete_line;
+
+		if (filter && filter(self, sym))
+			symbol__delete(sym, self->sym_priv_size);
+		else
+			dso__insert_symbol(self, sym);
+	}
+
+	/*
+	 * Now that we have all sorted out, just set the ->end of all
+	 * symbols
+	 */
+	prevnd = rb_first(&self->syms);
+
+	if (prevnd == NULL)
+		goto out_delete_line;
+
+	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+			      *curr = rb_entry(nd, struct symbol, rb_node);
+
+		prev->end = curr->start - 1;
+		prevnd = nd;
+	}
+
+	free(line);
+	fclose(file);
+
+	return 0;
+
+out_delete_line:
+	free(line);
+out_failure:
+	return -1;
+}
+
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+	for (index = 0, gelf_getsym(syms, index, &sym);\
+	     index < nr_syms; \
+	     index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+	return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline int elf_sym__is_function(const GElf_Sym *sym)
+{
+	return elf_sym__type(sym) == STT_FUNC &&
+	       sym->st_name != 0 &&
+	       sym->st_shndx != SHN_UNDEF &&
+	       sym->st_size != 0;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+					const Elf_Data *symstrs)
+{
+	return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+				    GElf_Shdr *shp, const char *name,
+				    size_t *index)
+{
+	Elf_Scn *sec = NULL;
+	size_t cnt = 1;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *str;
+
+		gelf_getshdr(sec, shp);
+		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+		if (!strcmp(name, str)) {
+			if (index)
+				*index = cnt;
+			break;
+		}
+		++cnt;
+	}
+
+	return sec;
+}
+
+#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
+
+#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
+
+static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
+				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
+				       GElf_Shdr *shdr_dynsym,
+				       size_t dynsym_idx, int verbose)
+{
+	uint32_t nr_rel_entries, idx;
+	GElf_Sym sym;
+	__u64 plt_offset;
+	GElf_Shdr shdr_plt;
+	struct symbol *f;
+	GElf_Shdr shdr_rel_plt;
+	Elf_Data *reldata, *syms, *symstrs;
+	Elf_Scn *scn_plt_rel, *scn_symstrs;
+	char sympltname[1024];
+	int nr = 0, symidx;
+
+	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+					  ".rela.plt", NULL);
+	if (scn_plt_rel == NULL) {
+		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+						  ".rel.plt", NULL);
+		if (scn_plt_rel == NULL)
+			return 0;
+	}
+
+	if (shdr_rel_plt.sh_link != dynsym_idx)
+		return 0;
+
+	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
+		return 0;
+
+	/*
+	 * Fetch the relocation section to find the indexes to the GOT
+	 * and the symbols in the .dynsym they refer to.
+	 */
+	reldata = elf_getdata(scn_plt_rel, NULL);
+	if (reldata == NULL)
+		return -1;
+
+	syms = elf_getdata(scn_dynsym, NULL);
+	if (syms == NULL)
+		return -1;
+
+	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+	if (scn_symstrs == NULL)
+		return -1;
+
+	symstrs = elf_getdata(scn_symstrs, NULL);
+	if (symstrs == NULL)
+		return -1;
+
+	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+	plt_offset = shdr_plt.sh_offset;
+
+	if (shdr_rel_plt.sh_type == SHT_RELA) {
+		GElf_Rela pos_mem, *pos;
+
+		elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+					   nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size, 0, verbose);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else if (shdr_rel_plt.sh_type == SHT_REL) {
+		GElf_Rel pos_mem, *pos;
+		elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+					  nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size, 0, verbose);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else {
+		/*
+		 * TODO: There are still one more shdr_rel_plt.sh_type
+		 * I have to investigate, but probably should be ignored.
+		 */
+	}
+
+	return nr;
+}
+
+static int dso__load_sym(struct dso *self, int fd, const char *name,
+			 symbol_filter_t filter, int verbose)
+{
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec, *sec_dynsym;
+	Elf *elf;
+	size_t dynsym_idx;
+	int nr = 0;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		if (verbose)
+			fprintf(stderr, "%s: cannot read %s ELF file.\n",
+				__func__, name);
+		goto out_close;
+	}
+
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		if (verbose)
+			fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	/*
+	 * We need to check if we have a .dynsym, so that we can handle the
+	 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+	 * .dynsym or .symtab)
+	 */
+	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
+					 ".dynsym", &dynsym_idx);
+	if (sec_dynsym != NULL) {
+		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
+						 sec_dynsym, &shdr,
+						 dynsym_idx, verbose);
+		if (nr < 0)
+			goto out_elf_end;
+	}
+
+	/*
+	 * But if we have a full .symtab (that is a superset of .dynsym) we
+	 * should add the symbols not in the .dynsyn
+	 */
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL) {
+		if (sec_dynsym == NULL)
+			goto out_elf_end;
+
+		sec = sec_dynsym;
+		gelf_getshdr(sec, &shdr);
+	}
+
+	syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
+		uint64_t obj_start;
+
+		if (!elf_sym__is_function(&sym))
+			continue;
+
+		sec = elf_getscn(elf, sym.st_shndx);
+		if (!sec)
+			goto out_elf_end;
+
+		gelf_getshdr(sec, &shdr);
+		obj_start = sym.st_value;
+
+		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+		f = symbol__new(sym.st_value, sym.st_size,
+				elf_sym__name(&sym, symstrs),
+				self->sym_priv_size, obj_start, verbose);
+		if (!f)
+			goto out_elf_end;
+
+		if (filter && filter(self, f))
+			symbol__delete(f, self->sym_priv_size);
+		else {
+			dso__insert_symbol(self, f);
+			nr++;
+		}
+	}
+
+	err = nr;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	return err;
+}
+
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
+{
+	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+	char *name = malloc(size);
+	int variant = 0;
+	int ret = -1;
+	int fd;
+
+	if (!name)
+		return -1;
+
+more:
+	do {
+		switch (variant) {
+		case 0: /* Fedora */
+			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+			break;
+		case 1: /* Ubuntu */
+			snprintf(name, size, "/usr/lib/debug%s", self->name);
+			break;
+		case 2: /* Sane people */
+			snprintf(name, size, "%s", self->name);
+			break;
+
+		default:
+			goto out;
+		}
+		variant++;
+
+		fd = open(name, O_RDONLY);
+	} while (fd < 0);
+
+	ret = dso__load_sym(self, fd, name, filter, verbose);
+	close(fd);
+
+	/*
+	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+	 */
+	if (!ret)
+		goto more;
+
+out:
+	free(name);
+	return ret;
+}
+
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
+			     symbol_filter_t filter, int verbose)
+{
+	int err, fd = open(vmlinux, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	err = dso__load_sym(self, fd, vmlinux, filter, verbose);
+	close(fd);
+
+	return err;
+}
+
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter, int verbose)
+{
+	int err = -1;
+
+	if (vmlinux)
+		err = dso__load_vmlinux(self, vmlinux, filter, verbose);
+
+	if (err)
+		err = dso__load_kallsyms(self, filter, verbose);
+
+	return err;
+}
+
+void symbol__init(void)
+{
+	elf_version(EV_CURRENT);
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
new file mode 100644
index 0000000..4839d68
--- /dev/null
+++ b/tools/perf/util/symbol.h
@@ -0,0 +1,47 @@
+#ifndef _PERF_SYMBOL_
+#define _PERF_SYMBOL_ 1
+
+#include <linux/types.h>
+#include "list.h"
+#include "rbtree.h"
+
+struct symbol {
+	struct rb_node	rb_node;
+	__u64		start;
+	__u64		end;
+	__u64		obj_start;
+	__u64		hist_sum;
+	__u64		*hist;
+	char		name[0];
+};
+
+struct dso {
+	struct list_head node;
+	struct rb_root	 syms;
+	unsigned int	 sym_priv_size;
+	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
+	char		 name[0];
+};
+
+const char *sym_hist_filter;
+
+typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
+
+struct dso *dso__new(const char *name, unsigned int sym_priv_size);
+void dso__delete(struct dso *self);
+
+static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
+{
+	return ((void *)sym) - self->sym_priv_size;
+}
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
+
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter, int verbose);
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
+
+size_t dso__fprintf(struct dso *self, FILE *fp);
+
+void symbol__init(void);
+#endif /* _PERF_SYMBOL_ */
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
new file mode 100644
index 0000000..2cad286
--- /dev/null
+++ b/tools/perf/util/usage.c
@@ -0,0 +1,80 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ */
+#include "util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, "%s%s\n", prefix, msg);
+}
+
+static NORETURN void usage_builtin(const char *err)
+{
+	fprintf(stderr, "\n usage: %s\n", err);
+	exit(129);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report("fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report("error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report("warning: ", warn, params);
+}
+
+/* If we are in a dlopen()ed .so write to a global variable would segfault
+ * (ugh), so keep things static. */
+static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
+static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
+static void (*error_routine)(const char *err, va_list params) = error_builtin;
+static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
+
+void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
+{
+	die_routine = routine;
+}
+
+void usage(const char *err)
+{
+	usage_routine(err);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_routine(err, params);
+	va_end(params);
+}
+
+int error(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_routine(err, params);
+	va_end(params);
+	return -1;
+}
+
+void warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_routine(warn, params);
+	va_end(params);
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
new file mode 100644
index 0000000..76590a1
--- /dev/null
+++ b/tools/perf/util/util.h
@@ -0,0 +1,410 @@
+#ifndef GIT_COMPAT_UTIL_H
+#define GIT_COMPAT_UTIL_H
+
+#define _FILE_OFFSET_BITS 64
+
+#ifndef FLEX_ARRAY
+/*
+ * See if our compiler is known to support flexible array members.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+# define FLEX_ARRAY /* empty */
+#elif defined(__GNUC__)
+# if (__GNUC__ >= 3)
+#  define FLEX_ARRAY /* empty */
+# else
+#  define FLEX_ARRAY 0 /* older GNU extension */
+# endif
+#endif
+
+/*
+ * Otherwise, default to safer but a bit wasteful traditional style
+ */
+#ifndef FLEX_ARRAY
+# define FLEX_ARRAY 1
+#endif
+#endif
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
+
+/* Approximation of the length of the decimal representation of this type. */
+#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
+#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
+#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
+#endif
+#define _ALL_SOURCE 1
+#define _GNU_SOURCE 1
+#define _BSD_SOURCE 1
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <assert.h>
+#include <regex.h>
+#include <utime.h>
+#ifndef __MINGW32__
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#ifndef NO_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <inttypes.h>
+#if defined(__CYGWIN__)
+#undef _XOPEN_SOURCE
+#include <grp.h>
+#define _XOPEN_SOURCE 600
+#include "compat/cygwin.h"
+#else
+#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
+#include <grp.h>
+#define _ALL_SOURCE 1
+#endif
+#else 	/* __MINGW32__ */
+/* pull in Windows compatibility stuff */
+#include "compat/mingw.h"
+#endif	/* __MINGW32__ */
+
+#ifndef NO_ICONV
+#include <iconv.h>
+#endif
+
+#ifndef NO_OPENSSL
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#endif
+
+/* On most systems <limits.h> would have given us this, but
+ * not on some systems (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef PRIuMAX
+#define PRIuMAX "llu"
+#endif
+
+#ifndef PRIu32
+#define PRIu32 "u"
+#endif
+
+#ifndef PRIx32
+#define PRIx32 "x"
+#endif
+
+#ifndef PATH_SEP
+#define PATH_SEP ':'
+#endif
+
+#ifndef STRIP_EXTENSION
+#define STRIP_EXTENSION ""
+#endif
+
+#ifndef has_dos_drive_prefix
+#define has_dos_drive_prefix(path) 0
+#endif
+
+#ifndef is_dir_sep
+#define is_dir_sep(c) ((c) == '/')
+#endif
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+/* General helper functions */
+extern void usage(const char *err) NORETURN;
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+extern int prefixcmp(const char *str, const char *prefix);
+extern time_t tm_to_time_t(const struct tm *tm);
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
+
+#ifndef PROT_READ
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define MAP_PRIVATE 1
+#define MAP_FAILED ((void*)-1)
+#endif
+
+#define mmap git_mmap
+#define munmap git_munmap
+extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern int git_munmap(void *start, size_t length);
+
+#else /* NO_MMAP || USE_WIN32_MMAP */
+
+#include <sys/mman.h>
+
+#endif /* NO_MMAP || USE_WIN32_MMAP */
+
+#ifdef NO_MMAP
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
+
+#else /* NO_MMAP */
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
+	(sizeof(void*) >= 8 \
+		?  1 * 1024 * 1024 * 1024 \
+		: 32 * 1024 * 1024)
+
+#endif /* NO_MMAP */
+
+#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+#define on_disk_bytes(st) ((st).st_size)
+#else
+#define on_disk_bytes(st) ((st).st_blocks * 512)
+#endif
+
+#define DEFAULT_PACKED_GIT_LIMIT \
+	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
+
+#ifdef NO_PREAD
+#define pread git_pread
+extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
+#endif
+/*
+ * Forward decl that will remind us if its twin in cache.h changes.
+ * This function is used in compat/pread.c.  But we can't include
+ * cache.h there.
+ */
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+
+#ifdef NO_SETENV
+#define setenv gitsetenv
+extern int gitsetenv(const char *, const char *, int);
+#endif
+
+#ifdef NO_MKDTEMP
+#define mkdtemp gitmkdtemp
+extern char *gitmkdtemp(char *);
+#endif
+
+#ifdef NO_UNSETENV
+#define unsetenv gitunsetenv
+extern void gitunsetenv(const char *);
+#endif
+
+#ifdef NO_STRCASESTR
+#define strcasestr gitstrcasestr
+extern char *gitstrcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifdef NO_STRLCPY
+#define strlcpy gitstrlcpy
+extern size_t gitstrlcpy(char *, const char *, size_t);
+#endif
+
+#ifdef NO_STRTOUMAX
+#define strtoumax gitstrtoumax
+extern uintmax_t gitstrtoumax(const char *, char **, int);
+#endif
+
+#ifdef NO_HSTRERROR
+#define hstrerror githstrerror
+extern const char *githstrerror(int herror);
+#endif
+
+#ifdef NO_MEMMEM
+#define memmem gitmemmem
+void *gitmemmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifdef FREAD_READS_DIRECTORIES
+#ifdef fopen
+#undef fopen
+#endif
+#define fopen(a,b) git_fopen(a,b)
+extern FILE *git_fopen(const char*, const char*);
+#endif
+
+#ifdef SNPRINTF_RETURNS_BOGUS
+#define snprintf git_snprintf
+extern int git_snprintf(char *str, size_t maxsize,
+			const char *format, ...);
+#define vsnprintf git_vsnprintf
+extern int git_vsnprintf(char *str, size_t maxsize,
+			 const char *format, va_list ap);
+#endif
+
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 1)
+#define HAVE_STRCHRNUL
+#endif
+#endif
+
+#ifndef HAVE_STRCHRNUL
+#define strchrnul gitstrchrnul
+static inline char *gitstrchrnul(const char *s, int c)
+{
+	while (*s && *s != c)
+		s++;
+	return (char *)s;
+}
+#endif
+
+/*
+ * Wrappers:
+ */
+extern char *xstrdup(const char *str);
+extern void *xmalloc(size_t size);
+extern void *xmemdupz(const void *data, size_t len);
+extern char *xstrndup(const char *str, size_t len);
+extern void *xrealloc(void *ptr, size_t size);
+extern void *xcalloc(size_t nmemb, size_t size);
+extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern ssize_t xread(int fd, void *buf, size_t len);
+extern ssize_t xwrite(int fd, const void *buf, size_t len);
+extern int xdup(int fd);
+extern FILE *xfdopen(int fd, const char *mode);
+extern int xmkstemp(char *template);
+
+static inline size_t xsize_t(off_t len)
+{
+	return (size_t)len;
+}
+
+static inline int has_extension(const char *filename, const char *ext)
+{
+	size_t len = strlen(filename);
+	size_t extlen = strlen(ext);
+	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isalpha
+#undef isalnum
+#undef tolower
+#undef toupper
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE 0x01
+#define GIT_DIGIT 0x02
+#define GIT_ALPHA 0x04
+#define GIT_GLOB_SPECIAL 0x08
+#define GIT_REGEX_SPECIAL 0x10
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+static inline int sane_case(int x, int high)
+{
+	if (sane_istest(x, GIT_ALPHA))
+		x = (x & ~0x20) | high;
+	return x;
+}
+
+static inline int strtoul_ui(char const *s, int base, unsigned int *result)
+{
+	unsigned long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtoul(s, &p, base);
+	if (errno || *p || p == s || (unsigned int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+static inline int strtol_i(char const *s, int base, int *result)
+{
+	long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtol(s, &p, base);
+	if (errno || *p || p == s || (int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+#ifdef INTERNAL_QSORT
+void git_qsort(void *base, size_t nmemb, size_t size,
+	       int(*compar)(const void *, const void *));
+#define qsort git_qsort
+#endif
+
+#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
+# define FORCE_DIR_SET_GID S_ISGID
+#else
+# define FORCE_DIR_SET_GID 0
+#endif
+
+#ifdef NO_NSEC
+#undef USE_NSEC
+#define ST_CTIME_NSEC(st) 0
+#define ST_MTIME_NSEC(st) 0
+#else
+#ifdef USE_ST_TIMESPEC
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
+#else
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
+#endif
+#endif
+
+#endif
diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c
new file mode 100644
index 0000000..6350d65
--- /dev/null
+++ b/tools/perf/util/wrapper.c
@@ -0,0 +1,206 @@
+/*
+ * Various trivial helper wrappers around standard functions
+ */
+#include "cache.h"
+
+/*
+ * There's no pack memory to release - but stay close to the Git
+ * version so wrap this away:
+ */
+static inline void release_pack_memory(size_t size, int flag)
+{
+}
+
+char *xstrdup(const char *str)
+{
+	char *ret = strdup(str);
+	if (!ret) {
+		release_pack_memory(strlen(str) + 1, -1);
+		ret = strdup(str);
+		if (!ret)
+			die("Out of memory, strdup failed");
+	}
+	return ret;
+}
+
+void *xmalloc(size_t size)
+{
+	void *ret = malloc(size);
+	if (!ret && !size)
+		ret = malloc(1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = malloc(size);
+		if (!ret && !size)
+			ret = malloc(1);
+		if (!ret)
+			die("Out of memory, malloc failed");
+	}
+#ifdef XMALLOC_POISON
+	memset(ret, 0xA5, size);
+#endif
+	return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+	char *p = xmalloc(len + 1);
+	memcpy(p, data, len);
+	p[len] = '\0';
+	return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+	char *p = memchr(str, '\0', len);
+	return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ret = calloc(nmemb, size);
+	if (!ret && (!nmemb || !size))
+		ret = calloc(1, 1);
+	if (!ret) {
+		release_pack_memory(nmemb * size, -1);
+		ret = calloc(nmemb, size);
+		if (!ret && (!nmemb || !size))
+			ret = calloc(1, 1);
+		if (!ret)
+			die("Out of memory, calloc failed");
+	}
+	return ret;
+}
+
+void *xmmap(void *start, size_t length,
+	int prot, int flags, int fd, off_t offset)
+{
+	void *ret = mmap(start, length, prot, flags, fd, offset);
+	if (ret == MAP_FAILED) {
+		if (!length)
+			return NULL;
+		release_pack_memory(length, fd);
+		ret = mmap(start, length, prot, flags, fd, offset);
+		if (ret == MAP_FAILED)
+			die("Out of memory? mmap failed: %s", strerror(errno));
+	}
+	return ret;
+}
+
+/*
+ * xread() is the same a read(), but it automatically restarts read()
+ * operations with a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = read(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+/*
+ * xwrite() is the same a write(), but it automatically restarts write()
+ * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes is written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = write(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t loaded = xread(fd, p, count);
+		if (loaded <= 0)
+			return total ? total : loaded;
+		count -= loaded;
+		p += loaded;
+		total += loaded;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t written = xwrite(fd, p, count);
+		if (written < 0)
+			return -1;
+		if (!written) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= written;
+		p += written;
+		total += written;
+	}
+
+	return total;
+}
+
+int xdup(int fd)
+{
+	int ret = dup(fd);
+	if (ret < 0)
+		die("dup failed: %s", strerror(errno));
+	return ret;
+}
+
+FILE *xfdopen(int fd, const char *mode)
+{
+	FILE *stream = fdopen(fd, mode);
+	if (stream == NULL)
+		die("Out of memory? fdopen failed: %s", strerror(errno));
+	return stream;
+}
+
+int xmkstemp(char *template)
+{
+	int fd;
+
+	fd = mkstemp(template);
+	if (fd < 0)
+		die("Unable to create temporary file: %s", strerror(errno));
+	return fd;
+}
-- 
cgit v0.10.2


From 7d37a0cbd68c875fa984fa97bcf5c7f4b7950b6d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Jun 2009 20:36:38 +0200
Subject: perf_counter tools: Warning fixes on 32-bit

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 116a397..4a3c279 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -668,9 +668,9 @@ static void hist_hit(struct hist_entry *he, uint64_t ip)
 
 	if (verbose >= 3)
 		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
-			(void *)he->sym->start,
+			(void *)(unsigned long)he->sym->start,
 			he->sym->name,
-			(void *)ip, ip - he->sym->start,
+			(void *)(unsigned long)ip, ip - he->sym->start,
 			sym->hist[offset]);
 }
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 23f4f7b..253821d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -21,7 +21,7 @@ static struct symbol *symbol__new(uint64_t start, uint64_t len,
 
 	if (verbose >= 2)
 		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
-			(__u64)start, len, name, self->hist, (void *)obj_start);
+			(__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
 
 	self->obj_start= obj_start;
 	self->hist = NULL;
-- 
cgit v0.10.2


From 8953645fec933f992223286ad407dc371ac2caa5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:04:17 +0200
Subject: perf_counter tools: Fix error condition in parse_aliases()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc warned about this bug:

util/parse-events.c: In function ‘parse_generic_hw_symbols’:
util/parse-events.c:175: warning: comparison is always false due to limited range of data type
util/parse-events.c:182: warning: comparison is always false due to limited range of data type
util/parse-events.c:190: warning: comparison is always false due to limited range of data type

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index e0820b4..f18a9a0 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -160,12 +160,12 @@ static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
 		}
 	}
 
-	return 0;
+	return -1;
 }
 
 static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
 {
-	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
+	int cache_type = -1, cache_op = 0, cache_result = 0;
 
 	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
 	/*
@@ -179,8 +179,8 @@ static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *a
 	/*
 	 * Fall back to reads:
 	 */
-	if (cache_type == -1)
-		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
+	if (cache_op == -1)
+		cache_op = PERF_COUNT_HW_CACHE_OP_READ;
 
 	cache_result = parse_aliases(str, hw_cache_result,
 					PERF_COUNT_HW_CACHE_RESULT_MAX);
-- 
cgit v0.10.2


From 39273ee9756917129de3190d469b0b120f87e763 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:17:03 +0200
Subject: perf annotate: Automatically pick up vmlinux in the local directory

Right now kernel debug info does not get resolved by default, because
we dont know where to look for the vmlinux.

The -k option can be used for that - but if no option is given, pick
up vmlinux files in the current directory - in case a kernel hacker
runs profiling from the source directory that the kernel was built in.

The real solution would be to embedd the location (and perhaps the
date/timestamp) of the vmlinux file in /proc/kallsyms, so that
tools can pick it up automatically.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 4a3c279..80c5aa0 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -26,7 +26,7 @@
 #define SHOW_HV		4
 
 static char		const *input_name = "perf.data";
-static char		*vmlinux = NULL;
+static char		*vmlinux = "vmlinux";
 
 static char		default_sort_order[] = "comm,symbol";
 static char		*sort_order = default_sort_order;
-- 
cgit v0.10.2


From e9fbc9dc9214d6a9de7d62627be5414804fd7b9f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Jun 2009 21:22:33 +0200
Subject: perf_counter tools: Initialize a stack variable before use

the "perf report" utility crashed in some circumstances
because the "sym" stack variable was not initialized before used
(as also proven by valgrind).

With this fix both the crash goes away and valgrind no longer complains.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 253821d..158588c 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -457,6 +457,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
+	memset(&sym, 0, sizeof(sym));
+
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
 		uint64_t obj_start;
-- 
cgit v0.10.2


From 23b87116c7c4f73597965218b66041acbdb4e79f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:25:29 +0200
Subject: perf annotate: Fix command line help text

Arjan noticed this bug in the perf annotate help output:

    -s, --symbol <file>   symbol to annotate

that should be <symbol> instead.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 80c5aa0..0e23fe9 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -1300,7 +1300,7 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
-	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
+	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
 		    "symbol to annotate"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
-- 
cgit v0.10.2


From 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 24 May 2009 18:45:15 -0400
Subject: cifs: make overriding of ownership conditional on new mount options

We have a bit of a problem with the uid= option. The basic issue is that
it means too many things and has too many side-effects.

It's possible to allow an unprivileged user to mount a filesystem if the
user owns the mountpoint, /bin/mount is setuid root, and the mount is
set up in /etc/fstab with the "user" option.

When doing this though, /bin/mount automatically adds the "uid=" and
"gid=" options to the share. This is fortunate since the correct uid=
option is needed in order to tell the upcall what user's credcache to
use when generating the SPNEGO blob.

On a mount without unix extensions this is fine -- you generally will
want the files to be owned by the "owner" of the mount. The problem
comes in on a mount with unix extensions. With those enabled, the
uid/gid options cause the ownership of files to be overriden even though
the server is sending along the ownership info.

This means that it's not possible to have a mount by an unprivileged
user that shows the server's file ownership info. The result is also
inode permissions that have no reflection at all on the server. You
simply cannot separate ownership from the mode in this fashion.

This behavior also makes MultiuserMount option less usable. Once you
pass in the uid= option for a mount, then you can't use unix ownership
info and allow someone to share the mount.

While I'm not thrilled with it, the only solution I can see is to stop
making uid=/gid= force the overriding of ownership on mounts, and to add
new mount options that turn this behavior on.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 74b5a87..10151f8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1096,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname,
 				return 1;
 			}
 		} else if (strnicmp(data, "uid", 3) == 0) {
-			if (value && *value) {
+			if (value && *value)
 				vol->linux_uid =
 					simple_strtoul(value, &value, 0);
+		} else if (strnicmp(data, "forceuid", 8) == 0) {
 				vol->override_uid = 1;
-			}
 		} else if (strnicmp(data, "gid", 3) == 0) {
-			if (value && *value) {
+			if (value && *value)
 				vol->linux_gid =
 					simple_strtoul(value, &value, 0);
+		} else if (strnicmp(data, "forcegid", 8) == 0) {
 				vol->override_gid = 1;
-			}
 		} else if (strnicmp(data, "file_mode", 4) == 0) {
 			if (value && *value) {
 				vol->file_mode =
-- 
cgit v0.10.2


From f0472d0ec89bef2ea4432828c3daa1b26ef569aa Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 6 Jun 2009 21:09:39 +0000
Subject: [CIFS] Add mention of new mount parm (forceuid) to cifs readme

Also update fs/cifs/CHANGES

Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 227c681..b486898 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -2,7 +2,10 @@ Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
 client generated ones by default (mount option "serverino" turned
-on by default if server supports it).
+on by default if server supports it).  Add forceuid and forcegid
+mount options (so that when negotiating unix extensions specifying
+which uid mounted does not immediately force the server's reported
+uids to be overridden).
 
 Version 1.58
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 6d1608f..ad92921 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,7 +262,8 @@ A partial list of the supported mount options follows:
 		mount.	
   domain	Set the SMB/CIFS workgroup name prepended to the
 		username during CIFS session establishment
-  uid		Set the default uid for inodes. For mounts to servers
+  forceuid	Set the default uid for inodes based on the uid
+		passed in. For mounts to servers
 		which do support the CIFS Unix extensions, such as a
 		properly configured Samba server, the server provides
 		the uid, gid and mode so this parameter should  not be
@@ -292,6 +293,12 @@ A partial list of the supported mount options follows:
 		the client.  Note that the mount.cifs helper must be
 		at version 1.10 or higher to support specifying the uid
 		(or gid) in non-numeric form.
+  forcegid	(similar to above but for the groupid instead of uid)
+  uid		Set the default uid for inodes, and indicate to the
+		cifs kernel driver which local user mounted . If the server
+		supports the unix extensions the default uid is
+		not used to fill in the owner fields of inodes (files)
+		unless the "forceuid" parameter is specified.
   gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
-- 
cgit v0.10.2


From f07502dae230a2c3b65381fd1b06e8a18b2c7525 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Sat, 6 Jun 2009 21:18:09 +0100
Subject: integrity: fix IMA inode leak

CONFIG_IMA=y inode activity leaks iint_cache and radix_tree_node objects
until the system runs out of memory.  Nowhere is calling ima_inode_free()
a.k.a. ima_iint_delete().  Fix that by calling it from destroy_inode().

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/inode.c b/fs/inode.c
index a4876e5..bca0c61 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -219,6 +219,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 void destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	ima_inode_free(inode);
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
-- 
cgit v0.10.2


From 2f01190aa62fe9dd0a98205927b9f09fd191c017 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Jun 2009 23:10:43 +0200
Subject: perf top: Wait for a minimal set of events before reading first
 snapshot

The first snapshot reading often occur before any events have
been read in the mapped perfcounter files.

Just wait until we have at least one event before starting the
snapshot, or the delay before the first set of entries to be
displayed may be long in case of low refresh rate.

Note: we could also use a semaphore to wait before
"print_entries" number of eveents is reached, but again this
value is tunable and we can't ensure we will even reach it.
Also we could base on a default mimimum set of entries for the
first refresh, say 15, but again, the minimal sample is
tunable, and we could end up displaying nothing until we have a
minimal default set of events, which can take some time in case
of high samples filters.

Hence this simple solution which partially covers the default
case.

[ Impact: fix display artifacts in perf top ]

Signed-off-by: Frederic Weisbecker <fweisbeec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1244322643-6447-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index f2e7312..fdc1d58 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -420,7 +420,7 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 
 struct timeval last_read, this_read;
 
-static void mmap_read(struct mmap_data *md)
+static void mmap_read_counter(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
@@ -517,6 +517,16 @@ static void mmap_read(struct mmap_data *md)
 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
+static void mmap_read(void)
+{
+	int i, counter;
+
+	for (i = 0; i < nr_cpus; i++) {
+		for (counter = 0; counter < nr_counters; counter++)
+			mmap_read_counter(&mmap_array[i][counter]);
+	}
+}
+
 static int __cmd_top(void)
 {
 	struct perf_counter_attr *attr;
@@ -571,6 +581,11 @@ static int __cmd_top(void)
 		}
 	}
 
+	/* Wait for a minimal set of events before starting the snapshot */
+	poll(event_array, nr_poll, 100);
+
+	mmap_read();
+
 	if (pthread_create(&thread, NULL, display_thread, NULL)) {
 		printf("Could not create display thread.\n");
 		exit(-1);
@@ -589,10 +604,7 @@ static int __cmd_top(void)
 	while (1) {
 		int hits = samples;
 
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
+		mmap_read();
 
 		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
-- 
cgit v0.10.2


From 7caf6a49bb17d0377210693af5737563b31aa5ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 5 Jun 2009 12:01:35 +0200
Subject: dma-debug: change hash_bucket_find from first-fit to best-fit

Some device drivers map the same physical address multiple times to a
dma address. Without an IOMMU this results in the same dma address being
put into the dma-debug hash multiple times. With a first-fit match in
hash_bucket_find() this function may return the wrong dma_debug_entry.

This can result in false positive warnings. This patch fixes it by
changing the first-fit behavior of hash_bucket_find() into a best-fit
algorithm.

Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reported-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: lethal@linux-sh.org
Cc: just.for.lkml@googlemail.com
Cc: hancockrwd@gmail.com
Cc: jens.axboe@oracle.com
Cc: bharrosh@panasas.com
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <20090605104132.GE24836@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d..8fcc09c 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -186,15 +186,50 @@ static void put_hash_bucket(struct hash_bucket *bucket,
 static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
 						struct dma_debug_entry *ref)
 {
-	struct dma_debug_entry *entry;
+	struct dma_debug_entry *entry, *ret = NULL;
+	int matches = 0, match_lvl, last_lvl = 0;
 
 	list_for_each_entry(entry, &bucket->list, list) {
-		if ((entry->dev_addr == ref->dev_addr) &&
-		    (entry->dev == ref->dev))
+		if ((entry->dev_addr != ref->dev_addr) ||
+		    (entry->dev != ref->dev))
+			continue;
+
+		/*
+		 * Some drivers map the same physical address multiple
+		 * times. Without a hardware IOMMU this results in the
+		 * same device addresses being put into the dma-debug
+		 * hash multiple times too. This can result in false
+		 * positives being reported. Therfore we implement a
+		 * best-fit algorithm here which returns the entry from
+		 * the hash which fits best to the reference value
+		 * instead of the first-fit.
+		 */
+		matches += 1;
+		match_lvl = 0;
+		entry->size      == ref->size      ? ++match_lvl : match_lvl;
+		entry->type      == ref->type      ? ++match_lvl : match_lvl;
+		entry->direction == ref->direction ? ++match_lvl : match_lvl;
+
+		if (match_lvl == 3) {
+			/* perfect-fit - return the result */
 			return entry;
+		} else if (match_lvl > last_lvl) {
+			/*
+			 * We found an entry that fits better then the
+			 * previous one
+			 */
+			last_lvl = match_lvl;
+			ret      = entry;
+		}
 	}
 
-	return NULL;
+	/*
+	 * If we have multiple matches but no perfect-fit, just return
+	 * NULL.
+	 */
+	ret = (matches == 1) ? ret : NULL;
+
+	return ret;
 }
 
 /*
-- 
cgit v0.10.2


From 5095f59bda6793a7b8f0856096d6893fe98e0e51 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 5 Jun 2009 23:27:17 +0530
Subject: x86: cpu_debug: Remove model information to reduce encoding-decoding

Remove model information, encoding/decoding and reduce bookkeeping.

This, besides removing a lot of code and cleaning up the code, also
enables these features on many more CPUs that were enumerated before.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
LKML-Reference: <1244224637.8212.6.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 2228020..d96c1ee 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
 	CPU_VALUE_BIT,				/* value		*/
 };
 
-#define	CPU_FILE_VALUE			(1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel	Processor Families/Processor Number Series
- * --------------------------	------------------------------------------
- * 05_01, 05_02, 05_04		Pentium, Pentium with MMX
- *
- * 06_01			Pentium Pro
- * 06_03, 06_05			Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B	Pentium III Xeon, Pentum III
- *
- * 06_09, 060D			Pentium M
- *
- * 06_0E			Core Duo, Core Solo
- *
- * 06_0F			Xeon 3000, 3200, 5100, 5300, 7300 series,
- *				Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- *				Pentium dual-core
- * 06_17			Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C			Atom
- *
- * 0F_00, 0F_01, 0F_02		Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04			Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06			Xeon 7100, 5000 Series, Xeon MP,
- *				Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
-	CPU_NONE,
-/* Intel */
-	CPU_INTEL_PENTIUM_BIT,
-	CPU_INTEL_P6_BIT,
-	CPU_INTEL_PENTIUM_M_BIT,
-	CPU_INTEL_CORE_BIT,
-	CPU_INTEL_CORE2_BIT,
-	CPU_INTEL_ATOM_BIT,
-	CPU_INTEL_XEON_P4_BIT,
-	CPU_INTEL_XEON_MP_BIT,
-/* AMD */
-	CPU_AMD_K6_BIT,
-	CPU_AMD_K7_BIT,
-	CPU_AMD_K8_BIT,
-	CPU_AMD_0F_BIT,
-	CPU_AMD_10_BIT,
-	CPU_AMD_11_BIT,
-};
-
-#define	CPU_INTEL_PENTIUM	(1 << CPU_INTEL_PENTIUM_BIT)
-#define	CPU_INTEL_P6		(1 << CPU_INTEL_P6_BIT)
-#define	CPU_INTEL_PENTIUM_M	(1 << CPU_INTEL_PENTIUM_M_BIT)
-#define	CPU_INTEL_CORE		(1 << CPU_INTEL_CORE_BIT)
-#define	CPU_INTEL_CORE2		(1 << CPU_INTEL_CORE2_BIT)
-#define	CPU_INTEL_ATOM		(1 << CPU_INTEL_ATOM_BIT)
-#define	CPU_INTEL_XEON_P4	(1 << CPU_INTEL_XEON_P4_BIT)
-#define	CPU_INTEL_XEON_MP	(1 << CPU_INTEL_XEON_MP_BIT)
-
-#define	CPU_INTEL_PX		(CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define	CPU_INTEL_COREX		(CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define	CPU_INTEL_XEON		(CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define	CPU_CO_AT		(CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define	CPU_C2_AT		(CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define	CPU_CX_AT		(CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define	CPU_CX_XE		(CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define	CPU_P6_XE		(CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define	CPU_PM_CO_AT		(CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define	CPU_C2_AT_XE		(CPU_C2_AT | CPU_INTEL_XEON)
-#define	CPU_CX_AT_XE		(CPU_CX_AT | CPU_INTEL_XEON)
-#define	CPU_P6_CX_AT		(CPU_INTEL_P6 | CPU_CX_AT)
-#define	CPU_P6_CX_XE		(CPU_P6_XE | CPU_INTEL_COREX)
-#define	CPU_P6_CX_AT_XE		(CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define	CPU_PM_CX_AT_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define	CPU_PM_CX_AT		(CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define	CPU_PM_CX_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define	CPU_PX_CX_AT		(CPU_INTEL_PX | CPU_CX_AT)
-#define	CPU_PX_CX_AT_XE		(CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define	CPU_INTEL_ALL		(CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define	CPU_AMD_K6		(1 << CPU_AMD_K6_BIT)
-#define	CPU_AMD_K7		(1 << CPU_AMD_K7_BIT)
-#define	CPU_AMD_K8		(1 << CPU_AMD_K8_BIT)
-#define	CPU_AMD_0F		(1 << CPU_AMD_0F_BIT)
-#define	CPU_AMD_10		(1 << CPU_AMD_10_BIT)
-#define	CPU_AMD_11		(1 << CPU_AMD_11_BIT)
-
-#define	CPU_K10_PLUS		(CPU_AMD_10 | CPU_AMD_11)
-#define	CPU_K0F_PLUS		(CPU_AMD_0F | CPU_K10_PLUS)
-#define	CPU_K8_PLUS		(CPU_AMD_K8 | CPU_K0F_PLUS)
-#define	CPU_K7_PLUS		(CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define	CPU_AMD_ALL		(CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define	CPU_ALL			(CPU_INTEL_ALL | CPU_AMD_ALL)
+#define	CPU_FILE_VALUE		(1 << CPU_VALUE_BIT)
 
 #define MAX_CPU_FILES		512
 
@@ -220,7 +122,6 @@ struct cpu_debug_range {
 	unsigned		min;		/* Register range min	*/
 	unsigned		max;		/* Register range max	*/
 	unsigned		flag;		/* Supported flags	*/
-	unsigned		model;		/* Supported models	*/
 };
 
 #endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab..86afe13 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
 
 static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
 static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
 static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
 
 static DEFINE_MUTEX(cpu_debug_lock);
 
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
 	{ "value",	CPU_REG_ALL,	1	},
 };
 
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
-	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		},
-	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		},
-	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		},
-	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	},
-	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		},
-	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		},
-
-	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		},
-	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		},
-	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		},
-	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		},
-
-	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		},
-	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		},
-	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		},
-	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		},
-
-	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		},
-	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		},
-	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		},
-
-	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		},
-	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		},
-	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		},
-	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		},
-
-	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		},
-	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		},
-
-	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		},
-	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		},
-	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		},
-	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		},
-	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		},
-	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		},
-	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		},
-	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		},
-
-	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		},
-	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		},
-	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		},
-	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		},
-	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		},
-	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		},
-
-	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		},
-	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		},
-
-	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		},
-	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		},
-	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		},
-	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		},
-	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		},
-
-	{ 0x00000400, 0x00000402, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		},
-	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		},
-	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		},
-
-	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		},
-	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		},
-
-	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	},
-
-	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		},
-	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		},
-	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		},
-	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		},
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+	{ 0x00000000, 0x00000001, CPU_MC,	},
+	{ 0x00000006, 0x00000007, CPU_MONITOR,	},
+	{ 0x00000010, 0x00000010, CPU_TIME,	},
+	{ 0x00000011, 0x00000013, CPU_PMC,	},
+	{ 0x00000017, 0x00000017, CPU_PLATFORM,	},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	},
+	{ 0x0000002A, 0x0000002B, CPU_POWERON,	},
+	{ 0x0000002C, 0x0000002C, CPU_FREQ,	},
+	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	},
+	{ 0x00000040, 0x00000047, CPU_LBRANCH,	},
+	{ 0x00000060, 0x00000067, CPU_LBRANCH,	},
+	{ 0x00000079, 0x00000079, CPU_BIOS,	},
+	{ 0x00000088, 0x0000008A, CPU_CACHE,	},
+	{ 0x0000008B, 0x0000008B, CPU_BIOS,	},
+	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	},
+	{ 0x000000C1, 0x000000C4, CPU_PMC,	},
+	{ 0x000000CD, 0x000000CD, CPU_FREQ,	},
+	{ 0x000000E7, 0x000000E8, CPU_PERF,	},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	},
+
+	{ 0x00000116, 0x0000011E, CPU_CACHE,	},
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	},
+	{ 0x00000179, 0x0000017B, CPU_MC,	},
+	{ 0x00000186, 0x00000189, CPU_PMC,	},
+	{ 0x00000198, 0x00000199, CPU_PERF,	},
+	{ 0x0000019A, 0x0000019A, CPU_TIME,	},
+	{ 0x0000019B, 0x0000019D, CPU_THERM,	},
+	{ 0x000001A0, 0x000001A0, CPU_MISC,	},
+	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	},
+	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	},
+	{ 0x000001DA, 0x000001E0, CPU_LBRANCH,	},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	},
+	{ 0x00000277, 0x00000277, CPU_PAT,	},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	},
+
+	{ 0x00000300, 0x00000311, CPU_PMC,	},
+	{ 0x00000345, 0x00000345, CPU_PMC,	},
+	{ 0x00000360, 0x00000371, CPU_PMC,	},
+	{ 0x0000038D, 0x00000390, CPU_PMC,	},
+	{ 0x000003A0, 0x000003BE, CPU_PMC,	},
+	{ 0x000003C0, 0x000003CD, CPU_PMC,	},
+	{ 0x000003E0, 0x000003E1, CPU_PMC,	},
+	{ 0x000003F0, 0x000003F2, CPU_PMC,	},
+
+	{ 0x00000400, 0x00000417, CPU_MC,	},
+	{ 0x00000480, 0x0000048B, CPU_VMX,	},
+
+	{ 0x00000600, 0x00000600, CPU_DEBUG,	},
+	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	},
+	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	},
+
+	{ 0x000107CC, 0x000107D3, CPU_PMC,	},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	},
+	{ 0xC0000081, 0xC0000084, CPU_CALL,	},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	},
+	{ 0xC0000103, 0xC0000103, CPU_TIME,	},
+
+	{ 0xC0010000, 0xC0010007, CPU_PMC,	},
+	{ 0xC0010010, 0xC0010010, CPU_CONF,	},
+	{ 0xC0010015, 0xC0010015, CPU_CONF,	},
+	{ 0xC0010016, 0xC001001A, CPU_MTRR,	},
+	{ 0xC001001D, 0xC001001D, CPU_MTRR,	},
+	{ 0xC001001F, 0xC001001F, CPU_CONF,	},
+	{ 0xC0010030, 0xC0010035, CPU_BIOS,	},
+	{ 0xC0010044, 0xC0010048, CPU_MC,	},
+	{ 0xC0010050, 0xC0010056, CPU_SMM,	},
+	{ 0xC0010058, 0xC0010058, CPU_CONF,	},
+	{ 0xC0010060, 0xC0010060, CPU_CACHE,	},
+	{ 0xC0010061, 0xC0010068, CPU_SMM,	},
+	{ 0xC0010069, 0xC001006B, CPU_SMM,	},
+	{ 0xC0010070, 0xC0010071, CPU_SMM,	},
+	{ 0xC0010111, 0xC0010113, CPU_SMM,	},
+	{ 0xC0010114, 0xC0010118, CPU_SVM,	},
+	{ 0xC0010140, 0xC0010141, CPU_OSVM,	},
+	{ 0xC0011022, 0xC0011023, CPU_CONF,	},
 };
 
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
-	{ 0x00000000, 0x00000001, CPU_MC,	CPU_K10_PLUS,		},
-	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		},
-	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		},
-	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		},
-	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		},
-	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		},
-
-	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		},
-	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		},
-	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		},
-	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		},
-
-	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		},
-	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		},
-
-	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		},
-
-	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		},
-	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		},
-	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		},
-	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		},
-
-	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		},
-	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		},
-	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		},
-	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		},
-	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		},
-	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		},
-	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		},
-	{ 0xC0010058, 0xC0010058, CPU_CONF,	CPU_K10_PLUS,		},
-	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		},
-	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		},
-	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		},
-	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		},
-	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		},
-	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		},
-	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		},
-	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		},
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
-	int flag;
-
-	switch (model) {
-	case 0x0501:
-	case 0x0502:
-	case 0x0504:
-		flag = CPU_INTEL_PENTIUM;
-		break;
-	case 0x0601:
-	case 0x0603:
-	case 0x0605:
-	case 0x0607:
-	case 0x0608:
-	case 0x060A:
-	case 0x060B:
-		flag = CPU_INTEL_P6;
-		break;
-	case 0x0609:
-	case 0x060D:
-		flag = CPU_INTEL_PENTIUM_M;
-		break;
-	case 0x060E:
-		flag = CPU_INTEL_CORE;
-		break;
-	case 0x060F:
-	case 0x0617:
-		flag = CPU_INTEL_CORE2;
-		break;
-	case 0x061C:
-		flag = CPU_INTEL_ATOM;
-		break;
-	case 0x0F00:
-	case 0x0F01:
-	case 0x0F02:
-	case 0x0F03:
-	case 0x0F04:
-		flag = CPU_INTEL_XEON_P4;
-		break;
-	case 0x0F06:
-		flag = CPU_INTEL_XEON_MP;
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
-	int flag;
-
-	switch (model >> 8) {
-	case 0x6:
-		flag = CPU_AMD_K6;
-		break;
-	case 0x7:
-		flag = CPU_AMD_K7;
-		break;
-	case 0x8:
-		flag = CPU_AMD_K8;
-		break;
-	case 0xf:
-		flag = CPU_AMD_0F;
-		break;
-	case 0x10:
-		flag = CPU_AMD_10;
-		break;
-	case 0x11:
-		flag = CPU_AMD_11;
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
-	int flag;
-
-	flag = per_cpu(cpu_model, cpu);
-
-	switch (flag >> 16) {
-	case X86_VENDOR_INTEL:
-		flag = get_intel_modelflag(flag);
-		break;
-	case X86_VENDOR_AMD:
-		flag = get_amd_modelflag(flag & 0xffff);
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
-	int index;
-
-	switch (per_cpu(cpu_model, cpu) >> 16) {
-	case X86_VENDOR_INTEL:
-		index = ARRAY_SIZE(cpu_intel_range);
-		break;
-	case X86_VENDOR_AMD:
-		index = ARRAY_SIZE(cpu_amd_range);
-		break;
-	default:
-		index = 0;
-		break;
-	}
-
-	return index;
-}
-
 static int is_typeflag_valid(unsigned cpu, unsigned flag)
 {
-	unsigned vendor, modelflag;
-	int i, index;
+	int i;
 
 	/* Standard Registers should be always valid */
 	if (flag >= CPU_TSS)
 		return 1;
 
-	modelflag = per_cpu(cpu_modelflag, cpu);
-	vendor = per_cpu(cpu_model, cpu) >> 16;
-	index = get_cpu_range_count(cpu);
-
-	for (i = 0; i < index; i++) {
-		switch (vendor) {
-		case X86_VENDOR_INTEL:
-			if ((cpu_intel_range[i].model & modelflag) &&
-			    (cpu_intel_range[i].flag & flag))
-				return 1;
-			break;
-		case X86_VENDOR_AMD:
-			if ((cpu_amd_range[i].model & modelflag) &&
-			    (cpu_amd_range[i].flag & flag))
-				return 1;
-			break;
-		}
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+		if (cpu_reg_range[i].flag == flag)
+			return 1;
 	}
 
 	/* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
 static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
 			      int index, unsigned flag)
 {
-	unsigned modelflag;
-
-	modelflag = per_cpu(cpu_modelflag, cpu);
-	*max = 0;
-	switch (per_cpu(cpu_model, cpu) >> 16) {
-	case X86_VENDOR_INTEL:
-		if ((cpu_intel_range[index].model & modelflag) &&
-		    (cpu_intel_range[index].flag & flag)) {
-			*min = cpu_intel_range[index].min;
-			*max = cpu_intel_range[index].max;
-		}
-		break;
-	case X86_VENDOR_AMD:
-		if ((cpu_amd_range[index].model & modelflag) &&
-		    (cpu_amd_range[index].flag & flag)) {
-			*min = cpu_amd_range[index].min;
-			*max = cpu_amd_range[index].max;
-		}
-		break;
-	}
+	if (cpu_reg_range[index].flag == flag) {
+		*min = cpu_reg_range[index].min;
+		*max = cpu_reg_range[index].max;
+	} else
+		*max = 0;
 
 	return *max;
 }
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
 	unsigned msr, msr_min, msr_max;
 	struct cpu_private *priv;
 	u32 low, high;
-	int i, range;
+	int i;
 
 	if (seq) {
 		priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
 		}
 	}
 
-	range = get_cpu_range_count(cpu);
-
-	for (i = 0; i < range; i++) {
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
 		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
 			continue;
 
@@ -788,13 +569,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
 {
 	struct dentry *cpu_dentry = NULL;
 	unsigned reg, reg_min, reg_max;
-	int i, range, err = 0;
+	int i, err = 0;
 	char reg_dir[12];
 	u32 low, high;
 
-	range = get_cpu_range_count(cpu);
-
-	for (i = 0; i < range; i++) {
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
 		if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
 				   cpu_base[type].flag))
 			continue;
@@ -850,10 +629,6 @@ static int cpu_init_cpu(void)
 		cpui = &cpu_data(cpu);
 		if (!cpu_has(cpui, X86_FEATURE_MSR))
 			continue;
-		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
-					   (cpui->x86 << 8) |
-					   (cpui->x86_model));
-		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
 
 		sprintf(cpu_dir, "cpu%d", cpu);
 		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
-- 
cgit v0.10.2


From 521a415c9f6d4e5463807ce6d36598acabcd204f Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Sun, 7 Jun 2009 13:52:50 +0200
Subject: pdc202xx_old: fix 'pdc20246_dma_ops'

Commit ac95beedf8bc97b24f9540d4da9952f07221c023 (ide: add struct ide_port_ops
(take 2)) erroneously converted the driver's dma_timeout() and dma_lost_irq()
methods to call the driver's resetproc() method regardless of whether it was
defined for this specific controller while it hadn't been defined and hence
called for PDC20246. So the dma_clear() method, the successor of dma_timeout(),
shouldn't exist and the dma_lost_irq() method should be standard for PDC20246.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index 248a54b..8df2630 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -328,9 +328,8 @@ static const struct ide_dma_ops pdc20246_dma_ops = {
 	.dma_start		= ide_dma_start,
 	.dma_end		= ide_dma_end,
 	.dma_test_irq		= pdc202xx_dma_test_irq,
-	.dma_lost_irq		= pdc202xx_dma_lost_irq,
+	.dma_lost_irq		= ide_dma_lost_irq,
 	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_clear		= pdc202xx_reset,
 	.dma_sff_read_status	= ide_dma_sff_read_status,
 };
 
-- 
cgit v0.10.2


From 669165daad2ec839df85b8c5f7bc155e76a2f404 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Sun, 7 Jun 2009 13:52:50 +0200
Subject: pdc202xx_old: fix resetproc() method

pdc202xx_reset() calls pdc202xx_reset_host() twice, for both channels, while
that function actually twiddles the single, shared software reset bit -- the
net effect is a duplicated reset and horrendous 4 second delay happening not
only on a channel reset but also when dma_lost_irq() and dma_clear() methods
are called.  Fold pdc202xx_reset_host() into pdc202xx_reset(), fix printk(),
and move it before the actual reset...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index 8df2630..b3bc96f 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -1,6 +1,6 @@
 /*
  *  Copyright (C) 1998-2002		Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2006-2007		MontaVista Software, Inc.
+ *  Copyright (C) 2006-2007, 2009	MontaVista Software, Inc.
  *  Copyright (C) 2007			Bartlomiej Zolnierkiewicz
  *
  *  Portions Copyright (C) 1999 Promise Technology, Inc.
@@ -227,28 +227,19 @@ somebody_else:
 	return (dma_stat & 4) == 4;	/* return 1 if INTR asserted */
 }
 
-static void pdc202xx_reset_host (ide_hwif_t *hwif)
+static void pdc202xx_reset(ide_drive_t *drive)
 {
+	ide_hwif_t *hwif	= drive->hwif;
 	unsigned long high_16	= hwif->extra_base - 16;
 	u8 udma_speed_flag	= inb(high_16 | 0x001f);
 
+	printk(KERN_WARNING "PDC202xx: software reset...\n");
+
 	outb(udma_speed_flag | 0x10, high_16 | 0x001f);
 	mdelay(100);
 	outb(udma_speed_flag & ~0x10, high_16 | 0x001f);
 	mdelay(2000);	/* 2 seconds ?! */
 
-	printk(KERN_WARNING "PDC202XX: %s channel reset.\n",
-		hwif->channel ? "Secondary" : "Primary");
-}
-
-static void pdc202xx_reset (ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	ide_hwif_t *mate	= hwif->mate;
-
-	pdc202xx_reset_host(hwif);
-	pdc202xx_reset_host(mate);
-
 	ide_set_max_pio(drive);
 }
 
-- 
cgit v0.10.2


From 4a4aca641bc4598e77b866804f47c651ec4a764d Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Fri, 5 Jun 2009 12:02:38 +0200
Subject: x86: Add quirk for reboot stalls on a Dell Optiplex 360

The Dell Optiplex 360 hangs on reboot, just like the Optiplex 330, so
the same quirk is needed.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Steve Conklin <steve.conklin@canonical.com>
Cc: Leann Ogasawara <leann.ogasawara@canonical.com>
Cc: <stable@kernel.org>
LKML-Reference: <200906051202.38311.jdelvare@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e..d2d1ce8 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
 		},
 	},
+	{   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 360",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+		},
+	},
 	{	/* Handle problems with rebooting on Dell 2400's */
 		.callback = set_bios_reboot,
 		.ident = "Dell PowerEdge 2400",
-- 
cgit v0.10.2


From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 7 Jun 2009 16:48:40 +0400
Subject: x86, apic: Fix dummy apic read operation together with broken MP
 handling

Ingo Molnar reported that read_apic is buggy novadays:

[    0.000000] Using APIC driver default
[    0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs
[    0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic"
[    0.000000] APIC: disable apic facility
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b()
[    0.000000] Hardware name: HP OmniBook PC

Indeed we still rely on apic->read operation for SMP compiled
kernel. And instead of disfigure the SMP code with #ifdef we
allow to call apic->read. To capture any unexpected results
we check for apic->read being called for sane reason via
WARN_ON_ONCE but(!) instead of OR we should use AND logical
operation (thanks Yinghai for spotting the root of the problem).

Along with that we could be have bad MP table and we are
to fix it that way no SMP started and no complains about
BIOS bug if apic was just disabled via command line.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20090607124840.GD4547@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e82488d..a4c9cf0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v)
 
 static u32 native_apic_read_dummy(u32 reg)
 {
-	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+	WARN_ON_ONCE((cpu_has_apic && !disable_apic));
 	return 0;
 }
 
@@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void)
 	new_apicid = read_apic_id();
 	if (boot_cpu_physical_apicid != new_apicid) {
 		boot_cpu_physical_apicid = new_apicid;
+		/*
+		 * yeah -- we lie about apic_version
+		 * in case if apic was disabled via boot option
+		 * but it's not a problem for SMP compiled kernel
+		 * since smp_sanity_check is prepared for such a case
+		 * and disable smp mode
+		 */
 		apic_version[new_apicid] =
 			 GET_APIC_VERSION(apic_read(APIC_LVR));
 	}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d2e8de9..7c80007 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 */
 	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
 	    !cpu_has_apic) {
-		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-			boot_cpu_physical_apicid);
-		printk(KERN_ERR "... forcing use of dummy APIC emulation."
+		if (!disable_apic) {
+			pr_err("BIOS bug, local APIC #%d not detected!...\n",
+				boot_cpu_physical_apicid);
+			pr_err("... forcing use of dummy APIC emulation."
 				"(tell your hw vendor)\n");
+		}
 		smpboot_clear_io_apic();
 		arch_disable_smp_support();
 		return -1;
-- 
cgit v0.10.2


From a4046f8d299e00e9855ae292527c2d66a42670eb Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 7 Jun 2009 12:19:37 +0400
Subject: x86, nmi: Use predefined numbers instead of hardcoded one

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090607081937.GC4547@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a5..c972644 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
 	 * but since they are power of two we could use a
 	 * cheaper way --cvg
 	 */
-	return nmi_watchdog & 0x3;
+	return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
 }
 #endif
 
-- 
cgit v0.10.2


From 3aa6b186f86c5d06d6d92d14311ffed51f091f40 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Sun, 7 Jun 2009 16:23:48 +0200
Subject: x86: Fix non-lazy GS handling in sys_vm86()

This fixes a stack corruption panic or null dereference oops
due to a bad GS in resume_userspace() when returning from
sys_vm86() and calling lockdep_sys_exit().

Only a problem when CONFIG_LOCKDEP and CONFIG_CC_STACKPROTECTOR
enabled.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Cc: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <1244384628.2323.4.camel@bimbo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e..6a17769 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs.pt.ds = 0;
 	info->regs.pt.es = 0;
 	info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+	info->regs.pt.gs = 0;
+#endif
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
 		"mov  %2, %%gs\n\t"
+#endif
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-- 
cgit v0.10.2


From aeef50bc0483fa70ce0bddb686ec84a274b7f3d4 Mon Sep 17 00:00:00 2001
From: "Figo.zhang" <figo1802@gmail.com>
Date: Sun, 7 Jun 2009 22:30:36 +0800
Subject: x86, microcode: Simplify vfree() use

vfree() does its own 'NULL' check, so no need for check before
calling it.

In v2, remove the stray newline.

[ Impact: cleanup ]

Signed-off-by: Figo.zhang <figo1802@gmail.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1244385036.3402.11.camel@myhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c8be20f..366baa1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -241,10 +241,8 @@ static int install_equiv_cpu_table(const u8 *buf)
 
 static void free_equiv_cpu_table(void)
 {
-	if (equiv_cpu_table) {
-		vfree(equiv_cpu_table);
-		equiv_cpu_table = NULL;
-	}
+	vfree(equiv_cpu_table);
+	equiv_cpu_table = NULL;
 }
 
 static enum ucode_state
@@ -279,8 +277,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 		mc_header = (struct microcode_header_amd *)mc;
 		if (get_matching_microcode(cpu, mc, new_rev)) {
-			if (new_mc)
-				vfree(new_mc);
+			vfree(new_mc);
 			new_rev = mc_header->patch_id;
 			new_mc  = mc;
 		} else
@@ -292,8 +289,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 	if (new_mc) {
 		if (!leftover) {
-			if (uci->mc)
-				vfree(uci->mc);
+			vfree(uci->mc);
 			uci->mc = new_mc;
 			pr_debug("microcode: CPU%d found a matching microcode "
 				 "update with version 0x%x (current=0x%x)\n",
-- 
cgit v0.10.2


From 743ee1f80434138495bbb95ffb897acf46b51d54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:06:46 +0200
Subject: perf stat: Continue even on counter creation error

Before:

 $ perf stat ~/hackbench 5

 error: syscall returned with -1 (No such device)

After:

 $ perf stat ~/hackbench 5
 Time: 1.640

 Performance counter stats for '/home/mingo/hackbench 5':

    6524.570382  task-clock-ticks     #       3.838 CPU utilization factor
          35704  context-switches     #       0.005 M/sec
            191  CPU-migrations       #       0.000 M/sec
           8958  page-faults          #       0.001 M/sec
  <not counted>  cycles
  <not counted>  instructions
  <not counted>  cache-references
  <not counted>  cache-misses

 Wall-clock time elapsed:  1699.999995 msecs

Also add -v (--verbose) option to allow the printing of failed
counter opens.

Plus dont print 'inf' if wall-time is zero (due to jiffies granularity),
instead skip the printing of the CPU utilization factor.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2cbf5a1..184ff95 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,7 @@ static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
 static int			system_wide			=  0;
 static int			inherit				=  1;
+static int			verbose				=  0;
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
@@ -83,7 +84,7 @@ static __u64			event_scaled[MAX_COUNTERS];
 static __u64			runtime_nsecs;
 static __u64			walltime_nsecs;
 
-static void create_perfstat_counter(int counter)
+static void create_perf_stat_counter(int counter)
 {
 	struct perf_counter_attr *attr = attrs + counter;
 
@@ -95,10 +96,8 @@ static void create_perfstat_counter(int counter)
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
 			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
+			if (fd[cpu][counter] < 0 && verbose) {
+				printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
 			}
 		}
 	} else {
@@ -106,10 +105,8 @@ static void create_perfstat_counter(int counter)
 		attr->disabled	= 1;
 
 		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
+		if (fd[0][counter] < 0 && verbose) {
+			printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno));
 		}
 	}
 }
@@ -147,6 +144,9 @@ static void read_counter(int counter)
 
 	nv = scale ? 3 : 1;
 	for (cpu = 0; cpu < nr_cpus; cpu ++) {
+		if (fd[cpu][counter] < 0)
+			continue;
+
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
 		assert(res == nv * sizeof(__u64));
 
@@ -204,8 +204,9 @@ static void print_counter(int counter)
 		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
 
-			fprintf(stderr, " # %11.3f CPU utilization factor",
-				(double)count[0] / (double)walltime_nsecs);
+			if (walltime_nsecs)
+				fprintf(stderr, " # %11.3f CPU utilization factor",
+					(double)count[0] / (double)walltime_nsecs);
 		}
 	} else {
 		fprintf(stderr, " %14Ld  %-20s",
@@ -220,7 +221,7 @@ static void print_counter(int counter)
 	fprintf(stderr, "\n");
 }
 
-static int do_perfstat(int argc, const char **argv)
+static int do_perf_stat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
@@ -232,7 +233,7 @@ static int do_perfstat(int argc, const char **argv)
 		nr_cpus = 1;
 
 	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
+		create_perf_stat_counter(counter);
 
 	/*
 	 * Enable counters and exec the command:
@@ -305,6 +306,8 @@ static const struct option options[] = {
 			    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('S', "scale", &scale,
 			    "scale/normalize counters"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
@@ -335,5 +338,5 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
 
-	return do_perfstat(argc, argv);
+	return do_perf_stat(argc, argv);
 }
-- 
cgit v0.10.2


From aa853f85d9ed593672d0f24a98c72a2518cb63e6 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Sat, 6 Jun 2009 10:17:57 +0100
Subject: [ARM] 5543/1: arm: serial amba: add missing declaration in serial.h

This header is sometimes included in the uncompress stage to get
register values, but no <linux/amba/bus.h> can be included there.
So declare "struct amba_device" here before using it in a prototype.

Signed-off-by: Alessandro Rubini <rubini@unipv.it>
Acked-by: Andrea Gallo <andrea.gallo@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 48ee32a..64a982e 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -159,6 +159,7 @@
 #define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
 
 #ifndef __ASSEMBLY__
+struct amba_device; /* in uncompress this is included but amba/bus.h is not */
 struct amba_pl010_data {
 	void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl);
 };
-- 
cgit v0.10.2


From 716c69fecacd42f2a304a97158e04af2786a3f65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:31:52 +0200
Subject: perf top: Fall back to cpu-clock-tick hrtimer sampling if no cycle
 counter available

On architectures/CPUs without PMU support but with perfcounters
enabled 'perf top' currently fails because it cannot create a
cycle based hw-perfcounter.

Fall back to the cpu-clock-tick sw-perfcounter in this case, which
is hrtimer based and will always work (as long as perfcounters
is enabled).

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index fdc1d58..6da30a1 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -527,58 +527,81 @@ static void mmap_read(void)
 	}
 }
 
-static int __cmd_top(void)
+int nr_poll;
+int group_fd;
+
+static void start_counter(int i, int counter)
 {
 	struct perf_counter_attr *attr;
-	pthread_t thread;
-	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
+
+	cpu = profile_cpu;
+	if (target_pid == -1 && profile_cpu == -1)
+		cpu = i;
+
+	attr = attrs + counter;
+
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr->freq		= freq;
+
+try_again:
+	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+
+	if (fd[i][counter] < 0) {
+		int err = errno;
+
+		error("sys_perf_counter_open() syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+
+		if (err == EPERM)
+			die(" No permission - are you root?\n");
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even if no PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+
+			warning(" ... trying to fall back to cpu-clock-ticks\n");
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_CPU_CLOCK;
+			goto try_again;
+		}
+		exit(-1);
+	}
+	assert(fd[i][counter] >= 0);
+	fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[i][counter];
+
+	event_array[nr_poll].fd = fd[i][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[i][counter].counter = counter;
+	mmap_array[i][counter].prev = 0;
+	mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[i][counter], 0);
+	if (mmap_array[i][counter].base == MAP_FAILED)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+}
+
+static int __cmd_top(void)
+{
+	pthread_t thread;
+	int i, counter;
 	int ret;
 
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			cpu	= profile_cpu;
-			if (target_pid == -1 && profile_cpu == -1)
-				cpu = i;
-
-			attr = attrs + counter;
-
-			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr->freq		= freq;
-
-			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-
-				error("syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED)
-				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		}
+		for (counter = 0; counter < nr_counters; counter++)
+			start_counter(i, counter);
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
index 2cad286..e16bf9a7 100644
--- a/tools/perf/util/usage.c
+++ b/tools/perf/util/usage.c
@@ -9,29 +9,29 @@ static void report(const char *prefix, const char *err, va_list params)
 {
 	char msg[1024];
 	vsnprintf(msg, sizeof(msg), err, params);
-	fprintf(stderr, "%s%s\n", prefix, msg);
+	fprintf(stderr, " %s%s\n", prefix, msg);
 }
 
 static NORETURN void usage_builtin(const char *err)
 {
-	fprintf(stderr, "\n usage: %s\n", err);
+	fprintf(stderr, "\n Usage: %s\n", err);
 	exit(129);
 }
 
 static NORETURN void die_builtin(const char *err, va_list params)
 {
-	report("fatal: ", err, params);
+	report(" Fatal: ", err, params);
 	exit(128);
 }
 
 static void error_builtin(const char *err, va_list params)
 {
-	report("error: ", err, params);
+	report(" Error: ", err, params);
 }
 
 static void warn_builtin(const char *warn, va_list params)
 {
-	report("warning: ", warn, params);
+	report(" Warning: ", warn, params);
 }
 
 /* If we are in a dlopen()ed .so write to a global variable would segfault
-- 
cgit v0.10.2


From 3da297a60f7e8840f79f7d0b343af078890939ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:39:02 +0200
Subject: perf record: Fall back to cpu-clock-ticks if no PMU

On architectures/CPUs without PMU support but with perfcounters
enabled 'perf record' currently fails because it cannot create a
cycle based hw-perfcounter.

Fall back to the cpu-clock-tick sw-perfcounter in this case, which
is hrtimer based and will always work (as long as perfcounters
are enabled).

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index aeab9c4..8786629 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,7 @@ static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
+static int			verbose				= 0;
 
 static long			samples;
 static struct timeval		last_read;
@@ -349,17 +350,35 @@ static void create_counter(int counter, int cpu, pid_t pid)
 
 	track = 0; /* only the first counter needs these */
 
+try_again:
 	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
 
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
 
-		error("syscall returned with %d (%s)\n",
+		if (verbose)
+			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
 				fd[nr_cpu][counter], strerror(err));
 		if (err == EPERM)
-			printf("Are you root?\n");
+			die("Permission error - are you root?\n");
+
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even if no PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_CPU_CLOCK;
+			goto try_again;
+		}
 		exit(-1);
 	}
+
 	assert(fd[nr_cpu][counter] >= 0);
 	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
 
@@ -519,6 +538,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6da30a1..1f8c97d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -65,6 +65,7 @@ static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			freq				=  0;
+static int			verbose				=  0;
 
 static char			*sym_filter;
 static unsigned long		filter_start;
@@ -550,11 +551,12 @@ try_again:
 	if (fd[i][counter] < 0) {
 		int err = errno;
 
-		error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-			fd[i][counter], strerror(err));
+		if (verbose)
+			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
+				fd[i][counter], strerror(err));
 
 		if (err == EPERM)
-			die(" No permission - are you root?\n");
+			die("No permission - are you root?\n");
 		/*
 		 * If it's cycles then fall back to hrtimer
 		 * based cpu-clock-tick sw counter, which
@@ -563,7 +565,9 @@ try_again:
 		if (attr->type == PERF_TYPE_HARDWARE
 			&& attr->config == PERF_COUNT_CPU_CYCLES) {
 
-			warning(" ... trying to fall back to cpu-clock-ticks\n");
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+
 			attr->type = PERF_TYPE_SOFTWARE;
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
@@ -673,6 +677,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('E', "entries", &print_entries,
 		    "display this many functions"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
-- 
cgit v0.10.2


From 30c806a094493beb7691bc7957dfa02dee96230a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:46:24 +0200
Subject: perf_counter tools: Handle kernels with !CONFIG_PERF_COUNTER

If perf is run on a !CONFIG_PERF_COUNTER kernel right now it
bails out with no messages or with confusing messages.

Standardize this case some more and explain the situation.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 8786629..deaee42 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -356,9 +356,6 @@ try_again:
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
 
-		if (verbose)
-			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-				fd[nr_cpu][counter], strerror(err));
 		if (err == EPERM)
 			die("Permission error - are you root?\n");
 
@@ -376,6 +373,10 @@ try_again:
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
 		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[nr_cpu][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
 		exit(-1);
 	}
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1f8c97d..be1698f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -551,10 +551,6 @@ try_again:
 	if (fd[i][counter] < 0) {
 		int err = errno;
 
-		if (verbose)
-			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-				fd[i][counter], strerror(err));
-
 		if (err == EPERM)
 			die("No permission - are you root?\n");
 		/*
@@ -572,6 +568,10 @@ try_again:
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
 		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
 		exit(-1);
 	}
 	assert(fd[i][counter] >= 0);
-- 
cgit v0.10.2


From a14832ff977e78d1982cdf78cdabb1f2320d9ac8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:58:23 +0200
Subject: perf report: Print more expressive message in case of file open error

Before:

 $ perf report
 failed to open file: No such file or directory

After:

 $ perf report
  failed to open file: perf.data  (try 'perf record' first)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 242e09f..f053a74 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1120,7 +1120,10 @@ static int __cmd_report(void)
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
-		perror("failed to open file");
+		fprintf(stderr, " failed to open file: %s", input_name);
+		if (!strcmp(input_name, "perf.data"))
+			fprintf(stderr, "  (try 'perf record' first)");
+		fprintf(stderr, "\n");
 		exit(-1);
 	}
 
-- 
cgit v0.10.2


From e779898aa74cd2e97216368b3f3689ceffe8aeed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 18:14:46 +0200
Subject: perf stat: Print out instructins/cycle metric

Before:

     7549326754  cycles               #    3201.811 M/sec
    10007594937  instructions         #    4244.408 M/sec

After:

     7542051194  cycles               #    3201.996 M/sec
    10007743852  instructions         #    4248.811 M/sec # 1.327 per cycle

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 184ff95..8085509 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -83,6 +83,7 @@ static __u64			event_scaled[MAX_COUNTERS];
 
 static __u64			runtime_nsecs;
 static __u64			walltime_nsecs;
+static __u64			runtime_cycles;
 
 static void create_perf_stat_counter(int counter)
 {
@@ -177,6 +178,9 @@ static void read_counter(int counter)
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		runtime_nsecs = count[0];
+	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
+		attrs[counter].config == PERF_COUNT_CPU_CYCLES)
+		runtime_cycles = count[0];
 }
 
 /*
@@ -214,6 +218,13 @@ static void print_counter(int counter)
 		if (runtime_nsecs)
 			fprintf(stderr, " # %11.3f M/sec",
 				(double)count[0]/runtime_nsecs*1000.0);
+		if (runtime_cycles &&
+			attrs[counter].type == PERF_TYPE_HARDWARE &&
+				attrs[counter].config == PERF_COUNT_INSTRUCTIONS) {
+
+			fprintf(stderr, " # %1.3f per cycle",
+				(double)count[0] / (double)runtime_cycles);
+		}
 	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
-- 
cgit v0.10.2


From a4c0364be3f43d3e17fe19270f8b3d64881606e6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 6 Jun 2009 12:34:39 +0300
Subject: KVM: Explicity initialize cpus_hardware_enabled

Under CONFIG_MAXSMP, cpus_hardware_enabled is allocated from the heap and
not statically initialized.  This causes a crash on reboot when kvm thinks
vmx is enabled on random nonexistent cpus and accesses nonexistent percpu
lists.

Fix by explicitly clearing the variable.

Cc: stable@kernel.org
Reported-and-tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1ecbe23..4293528 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2305,6 +2305,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 		r = -ENOMEM;
 		goto out_free_0;
 	}
+	cpumask_clear(cpus_hardware_enabled);
 
 	r = kvm_arch_hardware_setup();
 	if (r < 0)
-- 
cgit v0.10.2


From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 07:42:04 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add Core2
 support

Fill in core2_hw_cache_event_id[] with the Core2 model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e86679f..b1f71ff 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static const u64 atom_hw_cache_event_ids
-- 
cgit v0.10.2


From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 09:30:41 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add Atom
 support

Fill in core2_hw_cache_event_id[] with the Atom model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

( Note: these are straight from the Intel manuals - not tested yet.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b1f71ff..71590e0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static u64 intel_pmu_raw_event(u64 event)
-- 
cgit v0.10.2


From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: perf_counter: Clean up x86 boot messages

Standardize and tidy up all the messages we print during
perfcounter initialization.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 71590e0..0339d19 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1298,23 +1298,22 @@ static int intel_pmu_init(void)
 	if (version < 2)
 		return -ENODEV;
 
-	x86_pmu = intel_pmu;
-	x86_pmu.version = version;
-	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
 	 * assume at least 3 counters:
 	 */
-	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-	x86_pmu.counter_bits = eax.split.bit_width;
-	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	/*
-	 * Nehalem:
+	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
 	case 17:
@@ -1322,7 +1321,7 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Core2 event tables\n");
+		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
@@ -1330,14 +1329,14 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Nehalem/Corei7 event tables\n");
+		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Atom event tables\n");
+		pr_cont("Atom events, ");
 		break;
 	}
 	return 0;
@@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void)
 {
 	int err;
 
+	pr_info("Performance Counters: ");
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		err = intel_pmu_init();
@@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void)
 	default:
 		return;
 	}
-	if (err != 0)
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
 		return;
+	}
 
-	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
-	pr_info("... version:         %d\n", x86_pmu.version);
-	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
-	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
 	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
-	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
-
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
 	perf_counter_mask |=
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-
 	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
 }
 
 static inline void x86_pmu_read(struct perf_counter *counter)
-- 
cgit v0.10.2


From 312325094785566a0e42a88c1bf6e7eb54c5d70e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:07:08 +0200
Subject: dma-debug: comment style fixes

Last patch series introduced some new comment which does not fit the
Kernel comment style guidelines. Fix it with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 77053d9..b8a61ff 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -542,7 +542,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 
 	write_lock_irqsave(&driver_name_lock, flags);
 
-	/* Now handle the string we got from userspace very carefully.
+	/*
+	 * Now handle the string we got from userspace very carefully.
 	 * The rules are:
 	 *         - only use the first token we got
 	 *         - token delimiter is everything looking like a space
@@ -551,7 +552,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 */
 	if (!isalnum(buf[0])) {
 		/*
-		   If the first character userspace gave us is not
+		 * If the first character userspace gave us is not
 		 * alphanumerical then assume the filter should be
 		 * switched off.
 		 */
-- 
cgit v0.10.2


From c17e2cf7376a2010b8b114fdeebd4e340a5e9cb2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:19:29 +0200
Subject: dma-debug: code style fixes

This patch changes the recent updates to dma-debug to conform with
coding style guidelines of Linux and the -tip tree.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index b8a61ff..9561825 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -501,8 +501,8 @@ out_err:
 static ssize_t filter_read(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
-	unsigned long flags;
 	char buf[NAME_MAX_LEN + 1];
+	unsigned long flags;
 	int len;
 
 	if (!current_driver_name[0])
@@ -523,9 +523,9 @@ static ssize_t filter_read(struct file *file, char __user *user_buf,
 static ssize_t filter_write(struct file *file, const char __user *userbuf,
 			    size_t count, loff_t *ppos)
 {
-	unsigned long flags;
 	char buf[NAME_MAX_LEN];
-	size_t len = NAME_MAX_LEN - 1;
+	unsigned long flags;
+	size_t len;
 	int i;
 
 	/*
@@ -534,7 +534,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 * disabled. Since copy_from_user can fault and may sleep we
 	 * need to copy to temporary buffer first
 	 */
-	len = min(count, len);
+	len = min(count, NAME_MAX_LEN - 1);
 	if (copy_from_user(buf, userbuf, len))
 		return -EFAULT;
 
@@ -1040,18 +1040,19 @@ EXPORT_SYMBOL(debug_dma_map_sg);
 
 static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
 {
-	struct dma_debug_entry *entry;
+	struct dma_debug_entry *entry, ref;
 	struct hash_bucket *bucket;
 	unsigned long flags;
-	int mapped_ents = 0;
-	struct dma_debug_entry ref;
+	int mapped_ents;
 
-	ref.dev = dev;
+	ref.dev      = dev;
 	ref.dev_addr = sg_dma_address(s);
-	ref.size = sg_dma_len(s),
+	ref.size     = sg_dma_len(s),
+
+	bucket       = get_hash_bucket(&ref, &flags);
+	entry        = hash_bucket_find(bucket, &ref);
+	mapped_ents  = 0;
 
-	bucket = get_hash_bucket(&ref, &flags);
-	entry = hash_bucket_find(bucket, &ref);
 	if (entry)
 		mapped_ents = entry->sg_mapped_ents;
 	put_hash_bucket(bucket, &flags);
-- 
cgit v0.10.2


From e7ed70eedccc78e79ce6da2155e9caf90aff4003 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:39:24 +0200
Subject: dma-debug: use pr_* instead of printk(KERN_* ...)

The pr_* macros are shorter than the old printk(KERN_ ...) variant.
Change the dma-debug code to use the new macros and save a few
unnecessary line breaks. If lines don't break the source code can also
be grepped more easily.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 9561825..24c4a2c 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -139,7 +139,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 {
 #ifdef CONFIG_STACKTRACE
 	if (entry) {
-		printk(KERN_WARNING "Mapped at:\n");
+		pr_warning("Mapped at:\n");
 		print_stack_trace(&entry->stacktrace, 0);
 	}
 #endif
@@ -377,8 +377,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	spin_lock_irqsave(&free_entries_lock, flags);
 
 	if (list_empty(&free_entries)) {
-		printk(KERN_ERR "DMA-API: debugging out of memory "
-				"- disabling\n");
+		pr_err("DMA-API: debugging out of memory - disabling\n");
 		global_disable = true;
 		goto out;
 	}
@@ -483,8 +482,7 @@ static int prealloc_memory(u32 num_entries)
 	num_free_entries = num_entries;
 	min_free_entries = num_entries;
 
-	printk(KERN_INFO "DMA-API: preallocated %d debug entries\n",
-			num_entries);
+	pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
 
 	return 0;
 
@@ -534,7 +532,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 * disabled. Since copy_from_user can fault and may sleep we
 	 * need to copy to temporary buffer first
 	 */
-	len = min(count, NAME_MAX_LEN - 1);
+	len = min(count, (size_t)(NAME_MAX_LEN - 1));
 	if (copy_from_user(buf, userbuf, len))
 		return -EFAULT;
 
@@ -557,8 +555,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 		 * switched off.
 		 */
 		if (current_driver_name[0])
-			printk(KERN_INFO "DMA-API: switching off dma-debug "
-					 "driver filter\n");
+			pr_info("DMA-API: switching off dma-debug driver filter\n");
 		current_driver_name[0] = 0;
 		current_driver = NULL;
 		goto out_unlock;
@@ -576,8 +573,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	current_driver_name[i] = 0;
 	current_driver = NULL;
 
-	printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n",
-	       current_driver_name);
+	pr_info("DMA-API: enable driver filter for driver [%s]\n",
+		current_driver_name);
 
 out_unlock:
 	write_unlock_irqrestore(&driver_name_lock, flags);
@@ -594,7 +591,7 @@ static int dma_debug_fs_init(void)
 {
 	dma_debug_dent = debugfs_create_dir("dma-api", NULL);
 	if (!dma_debug_dent) {
-		printk(KERN_ERR "DMA-API: can not create debugfs directory\n");
+		pr_err("DMA-API: can not create debugfs directory\n");
 		return -ENOMEM;
 	}
 
@@ -693,7 +690,7 @@ void dma_debug_add_bus(struct bus_type *bus)
 
 	nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
 	if (nb == NULL) {
-		printk(KERN_ERR "dma_debug_add_bus: out of memory\n");
+		pr_err("dma_debug_add_bus: out of memory\n");
 		return;
 	}
 
@@ -718,8 +715,7 @@ void dma_debug_init(u32 num_entries)
 	}
 
 	if (dma_debug_fs_init() != 0) {
-		printk(KERN_ERR "DMA-API: error creating debugfs entries "
-				"- disabling\n");
+		pr_err("DMA-API: error creating debugfs entries - disabling\n");
 		global_disable = true;
 
 		return;
@@ -729,8 +725,7 @@ void dma_debug_init(u32 num_entries)
 		num_entries = req_entries;
 
 	if (prealloc_memory(num_entries) != 0) {
-		printk(KERN_ERR "DMA-API: debugging out of memory error "
-				"- disabled\n");
+		pr_err("DMA-API: debugging out of memory error - disabled\n");
 		global_disable = true;
 
 		return;
@@ -738,7 +733,7 @@ void dma_debug_init(u32 num_entries)
 
 	nr_total_entries = num_free_entries;
 
-	printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
+	pr_info("DMA-API: debugging enabled by kernel config\n");
 }
 
 static __init int dma_debug_cmdline(char *str)
@@ -747,8 +742,7 @@ static __init int dma_debug_cmdline(char *str)
 		return -EINVAL;
 
 	if (strncmp(str, "off", 3) == 0) {
-		printk(KERN_INFO "DMA-API: debugging disabled on kernel "
-				 "command line\n");
+		pr_info("DMA-API: debugging disabled on kernel command line\n");
 		global_disable = true;
 	}
 
@@ -1239,8 +1233,8 @@ static int __init dma_debug_driver_setup(char *str)
 	}
 
 	if (current_driver_name[0])
-		printk(KERN_INFO "DMA-API: enable driver filter for "
-				 "driver [%s]\n", current_driver_name);
+		pr_info("DMA-API: enable driver filter for driver [%s]\n",
+			current_driver_name);
 
 
 	return 1;
-- 
cgit v0.10.2


From be81c6ea23b8b471141734ef4bc005f5127aaf43 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:46:19 +0200
Subject: dma-debug: disable/enable irqs only once in device_dma_allocations

There is no need to disable/enable irqs on each loop iteration. Just
disable irqs for the whole time the loop runs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 24c4a2c..27b369d 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -649,15 +649,19 @@ static int device_dma_allocations(struct device *dev)
 	unsigned long flags;
 	int count = 0, i;
 
+	local_irq_save(flags);
+
 	for (i = 0; i < HASH_SIZE; ++i) {
-		spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+		spin_lock(&dma_entry_hash[i].lock);
 		list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
 			if (entry->dev == dev)
 				count += 1;
 		}
-		spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+		spin_unlock(&dma_entry_hash[i].lock);
 	}
 
+	local_irq_restore(flags);
+
 	return count;
 }
 
-- 
cgit v0.10.2


From 0bf841281e58d0b3cc9fe9dc4383df7694bde6bd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:53:46 +0200
Subject: dma-debug: simplify logic in driver_filter()

This patch makes the driver_filter function more readable by
reorganizing the code. The removal of a code code block to an upper
indentation level makes hard-to-read line-wraps unnecessary.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 27b369d..ad65fc0 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -147,6 +147,10 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 
 static bool driver_filter(struct device *dev)
 {
+	struct device_driver *drv;
+	unsigned long flags;
+	bool ret;
+
 	/* driver filter off */
 	if (likely(!current_driver_name[0]))
 		return true;
@@ -155,32 +159,28 @@ static bool driver_filter(struct device *dev)
 	if (current_driver && dev->driver == current_driver)
 		return true;
 
-	/* driver filter on but not yet initialized */
-	if (!current_driver && current_driver_name[0]) {
-		struct device_driver *drv = get_driver(dev->driver);
-		unsigned long flags;
-		bool ret = false;
-
-		if (!drv)
-			return false;
-
-		/* lock to protect against change of current_driver_name */
-		read_lock_irqsave(&driver_name_lock, flags);
+	if (current_driver || !current_driver_name[0])
+		return false;
 
-		if (drv->name &&
-		    strncmp(current_driver_name, drv->name,
-			    NAME_MAX_LEN-1) == 0) {
-			current_driver = drv;
-			ret = true;
-		}
+	/* driver filter on but not yet initialized */
+	drv = get_driver(dev->driver);
+	if (!drv)
+		return false;
 
-		read_unlock_irqrestore(&driver_name_lock, flags);
-		put_driver(drv);
+	/* lock to protect against change of current_driver_name */
+	read_lock_irqsave(&driver_name_lock, flags);
 
-		return ret;
+	ret = false;
+	if (drv->name &&
+	    strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+		current_driver = drv;
+		ret = true;
 	}
 
-	return false;
+	read_unlock_irqrestore(&driver_name_lock, flags);
+	put_driver(drv);
+
+	return ret;
 }
 
 #define err_printk(dev, entry, format, arg...) do {		\
-- 
cgit v0.10.2


From e36b80b658d471be5a8a40f00e2c7614524b86a2 Mon Sep 17 00:00:00 2001
From: Matthieu Castet <castet.matthieu@free.fr>
Date: Fri, 22 May 2009 22:25:04 +0200
Subject: SSB: BCM47xx: Export ssb_watchdog_timer_set

this patch export ssb_watchdog_timer_set to allow to use it in a Linux
watchdog driver.

Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
Acked-by : Michael Buesch <mb@bu3sch.de>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/drivers/ssb/embedded.c b/drivers/ssb/embedded.c
index 7dc3a6b..a0e0d24 100644
--- a/drivers/ssb/embedded.c
+++ b/drivers/ssb/embedded.c
@@ -29,6 +29,7 @@ int ssb_watchdog_timer_set(struct ssb_bus *bus, u32 ticks)
 	}
 	return -ENODEV;
 }
+EXPORT_SYMBOL(ssb_watchdog_timer_set);
 
 u32 ssb_gpio_in(struct ssb_bus *bus, u32 mask)
 {
-- 
cgit v0.10.2


From e082f188f774544bc2c2edf51176157503c98fe4 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Tue, 2 Jun 2009 19:05:28 +0100
Subject: MIPS: Sibyte: Honor CONFIG_CMDLINE

Original patch by Imre Kaloz <kaloz@openwrt.org>.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/sibyte/cfe/setup.c b/arch/mips/sibyte/cfe/setup.c
index 3de30f7..eb5396c 100644
--- a/arch/mips/sibyte/cfe/setup.c
+++ b/arch/mips/sibyte/cfe/setup.c
@@ -288,13 +288,7 @@ void __init prom_init(void)
 	 */
 	cfe_cons_handle = cfe_getstdhandle(CFE_STDHANDLE_CONSOLE);
 	if (cfe_getenv("LINUX_CMDLINE", arcs_cmdline, CL_SIZE) < 0) {
-		if (argc < 0) {
-			/*
-			 * It's OK for direct boot to not provide a
-			 *  command line
-			 */
-			strcpy(arcs_cmdline, "root=/dev/ram0 ");
-		} else {
+		if (argc >= 0) {
 			/* The loader should have set the command line */
 			/* too early for panic to do any good */
 			printk("LINUX_CMDLINE not defined in cfe.");
-- 
cgit v0.10.2


From c9d89d97f0d174b9154820dd5c6726d1c794cd99 Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Tue, 2 Jun 2009 23:15:10 +0900
Subject: MIPS: Kconfig: Remove "Support for" from Cavium system type

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Acked-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 09b1287..28119e6 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -593,7 +593,7 @@ config WR_PPMC
 	  board, which is based on GT64120 bridge chip.
 
 config CAVIUM_OCTEON_SIMULATOR
-	bool "Support for the Cavium Networks Octeon Simulator"
+	bool "Cavium Networks Octeon Simulator"
 	select CEVT_R4K
 	select 64BIT_PHYS_ADDR
 	select DMA_COHERENT
@@ -607,7 +607,7 @@ config CAVIUM_OCTEON_SIMULATOR
 	  hardware.
 
 config CAVIUM_OCTEON_REFERENCE_BOARD
-	bool "Support for the Cavium Networks Octeon reference board"
+	bool "Cavium Networks Octeon reference board"
 	select CEVT_R4K
 	select 64BIT_PHYS_ADDR
 	select DMA_COHERENT
-- 
cgit v0.10.2


From e25bfc9243f2eab12a2ce92b7f4b8a2e3e6949a6 Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Tue, 2 Jun 2009 23:17:07 +0900
Subject: MIPS: Cobalt: PCI bus is always required to obtain the board ID

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 28119e6..25f3b0a 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -72,6 +72,7 @@ config MIPS_COBALT
 	select IRQ_CPU
 	select IRQ_GT641XX
 	select PCI_GT64XXX_PCI0
+	select PCI
 	select SYS_HAS_CPU_NEVADA
 	select SYS_HAS_EARLY_PRINTK
 	select SYS_SUPPORTS_32BIT_KERNEL
-- 
cgit v0.10.2


From 3a553147eaad5d4de90ab1f695aa13ddbea684ec Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Thu, 4 Jun 2009 18:05:49 +0530
Subject: MIPS: ioctl.h: Fix headers_check warnings

Make ioctl.h compatible with asm-generic/ioctl.h and userspace

fix the following 'make headers_check' warning:

  usr/include/asm-mips/ioctl.h:64: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/include/asm/ioctl.h b/arch/mips/include/asm/ioctl.h
index 85067e2..9161634 100644
--- a/arch/mips/include/asm/ioctl.h
+++ b/arch/mips/include/asm/ioctl.h
@@ -60,12 +60,16 @@
 	 ((nr)   << _IOC_NRSHIFT) | \
 	 ((size) << _IOC_SIZESHIFT))
 
+#ifdef __KERNEL__
 /* provoke compile error for invalid uses of size argument */
 extern unsigned int __invalid_size_argument_for_IOC;
 #define _IOC_TYPECHECK(t) \
 	((sizeof(t) == sizeof(t[1]) && \
 	  sizeof(t) < (1 << _IOC_SIZEBITS)) ? \
 	  sizeof(t) : __invalid_size_argument_for_IOC)
+#else
+#define _IOC_TYPECHECK(t)	(sizeof(t))
+#endif
 
 /* used to create numbers */
 #define _IO(type, nr)		_IOC(_IOC_NONE, (type), (nr), 0)
-- 
cgit v0.10.2


From 5636919b5c909fee54a6ef5226475ecae012ad02 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Sat, 28 Feb 2009 09:44:28 +0000
Subject: MIPS: Outline udelay and fix a few issues.

Outlining fixes the issue were on certain CPUs such as the R10000 family
the delay loop would need an extra cycle if it overlaps a cacheline
boundary.

The rewrite also fixes build errors with GCC 4.4 which was changed in
way incompatible with the kernel's inline assembly.

Relying on pure C for computation of the delay value removes the need for
explicit.  The price we pay is a slight slowdown of the computation - to
be fixed on another day.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h
index 744cd8f..1260443 100644
--- a/arch/mips/include/asm/cpu-info.h
+++ b/arch/mips/include/asm/cpu-info.h
@@ -39,8 +39,8 @@ struct cache_desc {
 #define MIPS_CACHE_PINDEX	0x00000020	/* Physically indexed cache */
 
 struct cpuinfo_mips {
-	unsigned long		udelay_val;
-	unsigned long		asid_cache;
+	unsigned int		udelay_val;
+	unsigned int		asid_cache;
 
 	/*
 	 * Capability and feature descriptor structure for MIPS CPU
diff --git a/arch/mips/include/asm/delay.h b/arch/mips/include/asm/delay.h
index b0bccd2..a07e51b 100644
--- a/arch/mips/include/asm/delay.h
+++ b/arch/mips/include/asm/delay.h
@@ -11,94 +11,12 @@
 #ifndef _ASM_DELAY_H
 #define _ASM_DELAY_H
 
-#include <linux/param.h>
-#include <linux/smp.h>
+extern void __delay(unsigned int loops);
+extern void __ndelay(unsigned int ns);
+extern void __udelay(unsigned int us);
 
-#include <asm/compiler.h>
-#include <asm/war.h>
-
-static inline void __delay(unsigned long loops)
-{
-	if (sizeof(long) == 4)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	subu	%0, 1					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops));
-	else if (sizeof(long) == 8 && !DADDI_WAR)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	dsubu	%0, 1					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops));
-	else if (sizeof(long) == 8 && DADDI_WAR)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	dsubu	%0, %2					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops), "r" (1));
-}
-
-
-/*
- * Division by multiplication: you don't have to worry about
- * loss of precision.
- *
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-
-static inline void __udelay(unsigned long usecs, unsigned long lpj)
-{
-	unsigned long hi, lo;
-
-	/*
-	 * The rates of 128 is rounded wrongly by the catchall case
-	 * for 64-bit.  Excessive precission?  Probably ...
-	 */
-#if defined(CONFIG_64BIT) && (HZ == 128)
-	usecs *= 0x0008637bd05af6c7UL;		/* 2**64 / (1000000 / HZ) */
-#elif defined(CONFIG_64BIT)
-	usecs *= (0x8000000000000000UL / (500000 / HZ));
-#else /* 32-bit junk follows here */
-	usecs *= (unsigned long) (((0x8000000000000000ULL / (500000 / HZ)) +
-	                           0x80000000ULL) >> 32);
-#endif
-
-	if (sizeof(long) == 4)
-		__asm__("multu\t%2, %3"
-		: "=h" (usecs), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-	else if (sizeof(long) == 8 && !R4000_WAR)
-		__asm__("dmultu\t%2, %3"
-		: "=h" (usecs), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-	else if (sizeof(long) == 8 && R4000_WAR)
-		__asm__("dmultu\t%3, %4\n\tmfhi\t%0"
-		: "=r" (usecs), "=h" (hi), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-
-	__delay(usecs);
-}
-
-#define __udelay_val cpu_data[raw_smp_processor_id()].udelay_val
-
-#define udelay(usecs) __udelay((usecs), __udelay_val)
+#define ndelay(ns) __udelay(ns)
+#define udelay(us) __udelay(us)
 
 /* make sure "usecs *= ..." in udelay do not overflow. */
 #if HZ >= 1000
diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c
index 26760ca..e0a4ac1 100644
--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -42,7 +42,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	seq_printf(m, fmt, __cpu_name[n],
 	                           (version >> 4) & 0x0f, version & 0x0f,
 	                           (fp_vers >> 4) & 0x0f, fp_vers & 0x0f);
-	seq_printf(m, "BogoMIPS\t\t: %lu.%02lu\n",
+	seq_printf(m, "BogoMIPS\t\t: %u.%02u\n",
 	              cpu_data[n].udelay_val / (500000/HZ),
 	              (cpu_data[n].udelay_val / (5000/HZ)) % 100);
 	seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no");
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index c13c7ad..2adead5 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -2,8 +2,8 @@
 # Makefile for MIPS-specific library files..
 #
 
-lib-y	+= csum_partial.o memcpy.o memcpy-inatomic.o memset.o strlen_user.o \
-	   strncpy_user.o strnlen_user.o uncached.o
+lib-y	+= csum_partial.o delay.o memcpy.o memcpy-inatomic.o memset.o \
+	   strlen_user.o strncpy_user.o strnlen_user.o uncached.o
 
 obj-y			+= iomap.o
 obj-$(CONFIG_PCI)	+= iomap-pci.o
diff --git a/arch/mips/lib/delay.c b/arch/mips/lib/delay.c
new file mode 100644
index 0000000..f69c6b5
--- /dev/null
+++ b/arch/mips/lib/delay.c
@@ -0,0 +1,56 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1994 by Waldorf Electronics
+ * Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle
+ * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
+ */
+#include <linux/module.h>
+#include <linux/param.h>
+#include <linux/smp.h>
+
+#include <asm/compiler.h>
+#include <asm/war.h>
+
+inline void __delay(unsigned int loops)
+{
+	__asm__ __volatile__ (
+	"	.set	noreorder				\n"
+	"	.align	3					\n"
+	"1:	bnez	%0, 1b					\n"
+	"	subu	%0, 1					\n"
+	"	.set	reorder					\n"
+	: "=r" (loops)
+	: "0" (loops));
+}
+EXPORT_SYMBOL(__delay);
+
+/*
+ * Division by multiplication: you don't have to worry about
+ * loss of precision.
+ *
+ * Use only for very small delays ( < 1 msec).  Should probably use a
+ * lookup table, really, as the multiplications take much too long with
+ * short delays.  This is a "reasonable" implementation, though (and the
+ * first constant multiplications gets optimized away if the delay is
+ * a constant)
+ */
+
+void __udelay(unsigned long us)
+{
+	unsigned int lpj = current_cpu_data.udelay_val;
+
+	__delay((us * 0x000010c7 * HZ * lpj) >> 32);
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long ns)
+{
+	unsigned int lpj = current_cpu_data.udelay_val;
+
+	__delay((us * 0x00000005 * HZ * lpj) >> 32);
+}
+EXPORT_SYMBOL(__ndelay);
-- 
cgit v0.10.2


From 5284c6b99ea017f73c09b50f34a637ff9d5d26a0 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Mon, 8 Jun 2009 12:31:00 +0100
Subject: pata_netcell: Fix typo

The previous patch submission had a I typo I didn't catch but Bartlomiej
noted. Guess this proves the point about any patch being risky late in an rc

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/ata/pata_netcell.c b/drivers/ata/pata_netcell.c
index 9a69809..f0d52f7 100644
--- a/drivers/ata/pata_netcell.c
+++ b/drivers/ata/pata_netcell.c
@@ -26,7 +26,7 @@ static unsigned int netcell_read_id(struct ata_device *adev,
 	unsigned int err_mask = ata_do_dev_read_id(adev, tf, id);
 	/* Firmware forgets to mark words 85-87 valid */
 	if (err_mask == 0)
-		id[ATA_ID_CSF_DEFAULT] |= 0x0400;
+		id[ATA_ID_CSF_DEFAULT] |= 0x4000;
 	return err_mask;
 }
 
-- 
cgit v0.10.2


From 9aee2286071c23c535fe9928eec1a26e0bcf256d Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Mon, 8 Jun 2009 12:41:35 -0400
Subject: ext4: fix dx_map_entry to support 256k directory blocks

The dx_map_entry structure doesn't support over 64KB block size by
current usage of its member("offs"). Because "offs" treats an offset
of copies of the ext4_dir_entry_2 structure as is. This member size is
16 bits. But real offset for over 64KB(256KB) block size needs 18
bits. However, real offset keeps 4 byte boundary, so lower 2 bits is
not used.

Therefore, we do the following to fix this limitation:
For "store":
	we divide the real offset by 4 and then store this result to "offs"
	member.
For "use":
	we multiply "offs" member by 4 and then use this result
	as real offset.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f2bc160..07eb664 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -749,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 			ext4fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
 			map_tail->hash = h.hash;
-			map_tail->offs = (u16) ((char *) de - base);
+			map_tail->offs = ((char *) de - base)>>2;
 			map_tail->size = le16_to_cpu(de->rec_len);
 			count++;
 			cond_resched();
@@ -1147,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
-- 
cgit v0.10.2


From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 8 Jun 2009 10:44:05 -0500
Subject: x86, UV: Fix macros for multiple coherency domains

Fix bug in the SGI UV macros that support systems with multiple
coherency domains.  The macros used for referencing global MMR
(chipset registers) are failing to correctly "or" the NASID
(node identifier) bits that reside above M+N. These high bits
are supplied automatically by the chipset for memory accesses
coming from the processor socket.

However, the bits must be present for references to the special
global MMR space used to map chipset registers. (See uv_hub.h
for more details ...)

The bug results in references to invalid/incorrect nodes.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: <stable@kernel.org>
LKML-Reference: <20090608154405.GA16395@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea..341070f 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
 struct uv_hub_info_s {
 	unsigned long		global_mmr_base;
 	unsigned long		gpa_mask;
+	unsigned int		gnode_extra;
 	unsigned long		gnode_upper;
 	unsigned long		lowmem_remap_top;
 	unsigned long		lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  * 		p -  PNODE (local part of nsids, right shifted 1)
  */
 #define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)		(((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p)		((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p)		(UV_PNODE_TO_GNODE(p) << 1)
 
 #define UV_LOCAL_MMR_BASE		0xf4000000UL
 #define UV_GLOBAL_MMR32_BASE		0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_GLOBAL_MMR32_PNODE_BITS(p)	((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
 
 #define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
-	((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+	((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
 
 #define UV_APIC_PNODE_SHIFT	6
 
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda693..39f2af4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
 	union uvh_node_id_u node_id;
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
 	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
-	int max_pnode = 0;
+	int gnode_extra, max_pnode = 0;
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask;
 
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
+	pnode_mask = (1 << n_val) - 1;
+	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+	gnode_upper = ((unsigned long)gnode_extra  << m_val);
+	printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+			n_val, m_val, gnode_upper, gnode_extra);
+
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
 	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -607,11 +614,6 @@ void __init uv_system_init(void)
 		}
 	}
 
-	pnode_mask = (1 << n_val) - 1;
-	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-	gnode_upper = (((unsigned long)node_id.s.node_id) &
-		       ~((1 << n_val) - 1)) << m_val;
-
 	uv_bios_init();
 	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
 			    &sn_coherency_id, &sn_region_size);
@@ -634,6 +636,7 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+		uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
 		uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
 		uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
-- 
cgit v0.10.2


From c9690998ef48ffefeccb91c70a7739eebdea57f9 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 8 Jun 2009 19:09:39 +0200
Subject: x86: memtest: remove 64-bit division

Using gcc 3.3.5 a "make allmodconfig" + "CONFIG_KVM=n"
triggers a build error:

 arch/x86/mm/built-in.o(.init.text+0x43f7): In function `__change_page_attr':
 arch/x86/mm/pageattr.c:114: undefined reference to `__udivdi3'
 make: *** [.tmp_vmlinux1] Error 1

The culprit turned out to be a division in arch/x86/mm/memtest.c
For more info see this thread:

  http://marc.info/?l=linux-kernel&m=124416232620683

The patch entirely removes the division that caused the build
error.

[ Impact: build fix with certain GCC versions ]

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: xiyou.wangcong@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <20090608170939.GB12431@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be..c0bedcd 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 {
-	u64 i, count;
-	u64 *start;
+	u64 *p;
+	void *start, *end;
 	u64 start_bad, last_bad;
 	u64 start_phys_aligned;
 	size_t incr;
 
 	incr = sizeof(pattern);
 	start_phys_aligned = ALIGN(start_phys, incr);
-	count = (size - (start_phys_aligned - start_phys))/incr;
 	start = __va(start_phys_aligned);
+	end = start + size - (start_phys_aligned - start_phys);
 	start_bad = 0;
 	last_bad = 0;
 
-	for (i = 0; i < count; i++)
-		start[i] = pattern;
-	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-		if (*start == pattern)
+	for (p = start; p < end; p++)
+		*p = pattern;
+	for (p = start; p < end; p++, start_phys_aligned += incr) {
+		if (*p == pattern)
 			continue;
 		if (start_phys_aligned == last_bad + incr) {
 			last_bad += incr;
-- 
cgit v0.10.2


From 3af968e066d593bc4dacc021715f3e95ddf0996f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Jun 2009 12:31:53 -0700
Subject: async: Fix lack of boot-time console due to insufficient
 synchronization

Our async work synchronization was broken by "async: make sure
independent async domains can't accidentally entangle" (commit
d5a877e8dd409d8c702986d06485c374b705d340), because it would report
the wrong lowest active async ID when there was both running and
pending async work.

This caused things like no being able to read the root filesystem,
resulting in missing console devices and inability to run 'init',
causing a boot-time panic.

This fixes it by properly returning the lowest pending async ID: if
there is any running async work, that will have a lower ID than any
pending work, and we should _not_ look at the pending work list.

There were alternative patches from Jaswinder and James, but this one
also cleans up the code by removing the pointless 'ret' variable and
the unnecesary testing for an empty list around 'for_each_entry()' (if
the list is empty, the for_each_entry() thing just won't execute).

Fixes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13474
Reported-and-tested-by: Chris Clayton <chris2553@googlemail.com>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/async.c b/kernel/async.c
index 5054030..27235f5 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -92,23 +92,18 @@ extern int initcall_debug;
 static async_cookie_t  __lowest_in_progress(struct list_head *running)
 {
 	struct async_entry *entry;
-	async_cookie_t ret = next_cookie; /* begin with "infinity" value */
 
 	if (!list_empty(running)) {
 		entry = list_first_entry(running,
 			struct async_entry, list);
-		ret = entry->cookie;
+		return entry->cookie;
 	}
 
-	if (!list_empty(&async_pending)) {
-		list_for_each_entry(entry, &async_pending, list)
-			if (entry->running == running) {
-				ret = entry->cookie;
-				break;
-			}
-	}
+	list_for_each_entry(entry, &async_pending, list)
+		if (entry->running == running)
+			return entry->cookie;
 
-	return ret;
+	return next_cookie;	/* "infinity" value */
 }
 
 static async_cookie_t  lowest_in_progress(struct list_head *running)
-- 
cgit v0.10.2


From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 22:33:10 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add AMD
 support

Fill in amd_hw_cache_event_id[] with the AMD CPU specific events,
for family 0x0f, 0x10 and 0x11.

There's apparently no distinction between load and store events, so
we only fill in the load events.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0339d19..93af821 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
+static const u64 amd_0f_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * AMD Performance Monitor K7 and later.
  */
@@ -1345,6 +1436,17 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
+
+	switch (boot_cpu_data.x86) {
+	case 0x0f:
+	case 0x10:
+	case 0x11:
+		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("AMD Family 0f/10/11 events, ");
+		break;
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 19:10:25 +0200
Subject: perf_counter, x86: Clean up hw_cache_event ids copies

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 93af821..56001fe 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1409,23 +1409,20 @@ static int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case 17:
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Atom events, ");
 		break;
-- 
cgit v0.10.2


From dab5855b12411334355ba21349a06700e4ae7a3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Jun 2009 21:11:57 +0300
Subject: perf_counter: Add mmap event hooks to mprotect()

Some JIT compilers allocate memory for generated code with
posix_memalign() + mprotect() so we need to hook into mprotect()
to make sure 'perf' is aware that we're executing code in
anonymous memory.

[ penberg@cs.helsinki.fi: move the hook to sys_mprotect() ]
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <Pine.LNX.4.64.0906082111030.12407@melkki.cs.Helsinki.FI>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b..d80311b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/perf_counter.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
+		perf_counter_mmap(vma);
 		nstart = tmp;
 
 		if (nstart < prev->vm_end)
-- 
cgit v0.10.2


From 80d496be89ed7dede5abee5c057634e80a31c82d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 8 Jun 2009 21:12:48 +0300
Subject: perf report: Add support for profiling JIT generated code

This patch adds support for profiling JIT generated code to 'perf
report'. A JIT compiler is required to generate a "/tmp/perf-$PID.map"
symbols map that is parsed when looking and displaying symbols.

Thanks to Peter Zijlstra for his help with this patch!

Example "perf report" output with the Jato JIT:

 #
 # (40311 samples)
 #
 # Overhead           Command  Shared Object              Symbol
 # ........  ................  .........................  ......
 #
     97.80%              jato  /tmp/perf-11915.map        [.] Fibonacci.fib(I)I
      0.56%              jato  00000000b7fa023b           0x000000b7fa023b
      0.45%              jato  /tmp/perf-11915.map        [.] Fibonacci.main([Ljava/lang/String;)V
      0.38%              jato  [kernel]                   [k] get_page_from_freelist
      0.06%              jato  [kernel]                   [k] kunmap_atomic
      0.05%              jato  ./jato                     [.] utf8Hash
      0.04%              jato  ./jato                     [.] executeJava
      0.04%              jato  ./jato                     [.] defineClass

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: a.p.zijlstra@chello.nl
Cc: acme@redhat.com
LKML-Reference: <Pine.LNX.4.64.0906082111590.12407@melkki.cs.Helsinki.FI>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f053a74..61d8718 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -209,6 +209,11 @@ static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
 	return ip;
 }
 
+static inline int is_anon_memory(const char *filename)
+{
+     return strcmp(filename, "//anon") == 0;
+}
+
 static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
@@ -216,6 +221,7 @@ static struct map *map__new(struct mmap_event *event)
 	if (self != NULL) {
 		const char *filename = event->filename;
 		char newfilename[PATH_MAX];
+		int anon;
 
 		if (cwd) {
 			int n = strcommon(filename);
@@ -227,6 +233,13 @@ static struct map *map__new(struct mmap_event *event)
 			}
 		}
 
+		anon = is_anon_memory(filename);
+
+		if (anon) {
+			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
+			filename = newfilename;
+		}
+
 		self->start = event->start;
 		self->end   = event->start + event->len;
 		self->pgoff = event->pgoff;
@@ -235,7 +248,7 @@ static struct map *map__new(struct mmap_event *event)
 		if (self->dso == NULL)
 			goto out_delete;
 
-		if (self->dso == vdso)
+		if (self->dso == vdso || anon)
 			self->map_ip = vdso__map_ip;
 		else
 			self->map_ip = map__map_ip;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 158588c..32dd47d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -220,6 +220,68 @@ out_failure:
 	return -1;
 }
 
+static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose)
+{
+	char *line = NULL;
+	size_t n;
+	FILE *file;
+	int nr_syms = 0;
+
+	file = fopen(self->name, "r");
+	if (file == NULL)
+		goto out_failure;
+
+	while (!feof(file)) {
+		__u64 start, size;
+		struct symbol *sym;
+		int line_len, len;
+
+		line_len = getline(&line, &n, file);
+		if (line_len < 0)
+			break;
+
+		if (!line)
+			goto out_failure;
+
+		line[--line_len] = '\0'; /* \n */
+
+		len = hex2u64(line, &start);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		len += hex2u64(line + len, &size);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		sym = symbol__new(start, size, line + len,
+				  self->sym_priv_size, start, verbose);
+
+		if (sym == NULL)
+			goto out_delete_line;
+
+		if (filter && filter(self, sym))
+			symbol__delete(sym, self->sym_priv_size);
+		else {
+			dso__insert_symbol(self, sym);
+			nr_syms++;
+		}
+	}
+
+	free(line);
+	fclose(file);
+
+	return nr_syms;
+
+out_delete_line:
+	free(line);
+out_failure:
+	return -1;
+}
+
 /**
  * elf_symtab__for_each_symbol - iterate thru all the symbols
  *
@@ -507,6 +569,9 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 	if (!name)
 		return -1;
 
+	if (strncmp(self->name, "/tmp/perf-", 10) == 0)
+		return dso__load_perf_map(self, filter, verbose);
+
 more:
 	do {
 		switch (variant) {
-- 
cgit v0.10.2


From aefcf37b82886260d8540c9fb815e613c8977e06 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Jun 2009 23:15:28 +0200
Subject: perf_counter tools: Standardize color printing

The rule is:

 - high overhead: red
 -  mid overhead: green
 -  low overhead: normal (white/black)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 0e23fe9..3334a8b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -1085,8 +1085,9 @@ parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
 			percent = 100.0 * hits / sym->hist_sum;
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
 		if (percent >= 5.0)
 			color = PERF_COLOR_RED;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 61d8718..0b18cb9 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -693,13 +693,16 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 		char *color = PERF_COLOR_NORMAL;
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
-		if (percent >= 5.0)
+		if (percent >= 5.0) {
 			color = PERF_COLOR_RED;
-		if (percent < 0.5)
-			color = PERF_COLOR_GREEN;
+		} else {
+			if (percent >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
 
 		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index be1698f..8ba2480 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -248,13 +248,16 @@ static void print_sym_table(void)
 					 sum_ksamples));
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
-		if (pcnt >= 5.0)
+		if (pcnt >= 5.0) {
 			color = PERF_COLOR_RED;
-		if (pcnt < 0.5)
-			color = PERF_COLOR_GREEN;
+		} else {
+			if (pcnt >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
 
 		if (nr_counters == 1)
 			printf("%20.2f - ", syme->weight);
-- 
cgit v0.10.2


From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 Jun 2009 18:18:39 +0200
Subject: ring-buffer: pass in lockdep class key for reader_lock

On Sun, 7 Jun 2009, Ingo Molnar wrote:
> Testing tracer sched_switch: <6>Starting ring buffer hammer
> PASSED
> Testing tracer sysprof: PASSED
> Testing tracer function: PASSED
> Testing tracer irqsoff:
> =============================================
> PASSED
> Testing tracer preemptoff: PASSED
> Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ]
> PASSED
> Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760
> ---------------------------------------------
> rb_consumer/431 is trying to acquire lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70
>
> but task is already holding lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0
>
> other info that might help us debug this:
> 1 lock held by rb_consumer/431:
>  #0:  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0

The ring buffer is a generic structure, and can be used outside of
ftrace. If ftrace traces within the use of the ring buffer, it can produce
false positives with lockdep.

This patch passes in a static lock key into the allocation of the ring
buffer, so that different ring buffers will have their own lock class.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1244477919.13761.9042.camel@twins>

[ store key in ring buffer descriptor ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f134582..8670f15 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -105,7 +105,19 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
  * size is in bytes for each per CPU buffer.
  */
 struct ring_buffer *
-ring_buffer_alloc(unsigned long size, unsigned flags);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc(size, flags)			\
+({							\
+	static struct lock_class_key __key;		\
+	__ring_buffer_alloc((size), (flags), &__key);	\
+})
+
 void ring_buffer_free(struct ring_buffer *buffer);
 
 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7102d7a..22878b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -426,6 +426,8 @@ struct ring_buffer {
 	atomic_t			record_disabled;
 	cpumask_var_t			cpumask;
 
+	struct lock_class_key		*reader_lock_key;
+
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
@@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
 	spin_lock_init(&cpu_buffer->reader_lock);
+	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
 	int bsize;
@@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
+	buffer->reader_lock_key = key;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	kfree(buffer);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
-- 
cgit v0.10.2


From 5bf1692f65c12a8aa359dc883468284ffc3c4587 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Fri, 5 Jun 2009 14:44:58 +0900
Subject: TOMOYO: Remove unused field.

TOMOYO 2.2.0 is not using total_len field of "struct tomoyo_path_info".

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 6d25612..a44f655 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -428,7 +428,6 @@ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
 	const char *name = ptr->name;
 	const int len = strlen(name);
 
-	ptr->total_len = len;
 	ptr->const_len = tomoyo_const_part_length(name);
 	ptr->is_dir = len && (name[len - 1] == '/');
 	ptr->is_patterned = (ptr->const_len < len);
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 678f4ff..d8b9504 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -35,7 +35,6 @@ struct tomoyo_page_buffer {
 struct tomoyo_path_info {
 	const char *name;
 	u32 hash;          /* = full_name_hash(name, strlen(name)) */
-	u16 total_len;     /* = strlen(name)                       */
 	u16 const_len;     /* = tomoyo_const_part_length(name)     */
 	bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
 	bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
-- 
cgit v0.10.2


From c3fa109a5894077d1eaf8731ea741a15dd117b3c Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Mon, 8 Jun 2009 12:37:39 +0900
Subject: TOMOYO: Add description of lists and structures.

This patch adds some descriptions of lists and structures.
This patch contains no code changes.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index a44f655..fdd1f4b 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -28,7 +28,13 @@ static const char *tomoyo_mode_2[4] = {
 	"disabled", "enabled", "enabled", "enabled"
 };
 
-/* Table for profile. */
+/*
+ * tomoyo_control_array is a static data which contains
+ *
+ *  (1) functionality name used by /sys/kernel/security/tomoyo/profile .
+ *  (2) initial values for "struct tomoyo_profile".
+ *  (3) max values for "struct tomoyo_profile".
+ */
 static struct {
 	const char *keyword;
 	unsigned int current_value;
@@ -39,7 +45,13 @@ static struct {
 	[TOMOYO_VERBOSE]          = { "TOMOYO_VERBOSE",      1,       1 },
 };
 
-/* Profile table. Memory is allocated as needed. */
+/*
+ * tomoyo_profile is a structure which is used for holding the mode of access
+ * controls. TOMOYO has 4 modes: disabled, learning, permissive, enforcing.
+ * An administrator can define up to 256 profiles.
+ * The ->profile of "struct tomoyo_domain_info" is used for remembering
+ * the profile's number (0 - 255) assigned to that domain.
+ */
 static struct tomoyo_profile {
 	unsigned int value[TOMOYO_MAX_CONTROL_INDEX];
 	const struct tomoyo_path_info *comment;
@@ -1006,7 +1018,19 @@ static int tomoyo_read_profile(struct tomoyo_io_buffer *head)
 	return 0;
 }
 
-/* Structure for policy manager. */
+/*
+ * tomoyo_policy_manager_entry is a structure which is used for holding list of
+ * domainnames or programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_policy_manager_list .
+ *  (2) "manager" is a domainname or a program's pathname.
+ *  (3) "is_domain" is a bool which is true if "manager" is a domainname, false
+ *      otherwise.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_policy_manager_entry {
 	struct list_head list;
 	/* A path to program or a domainname. */
@@ -1015,7 +1039,36 @@ struct tomoyo_policy_manager_entry {
 	bool is_deleted; /* True if this entry is deleted. */
 };
 
-/* The list for "struct tomoyo_policy_manager_entry". */
+/*
+ * tomoyo_policy_manager_list is used for holding list of domainnames or
+ * programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * An entry is added by
+ *
+ * # echo '<kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a domainname)
+ *
+ *  or
+ *
+ * # echo '/usr/lib/ccs/editpolicy' > /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a program's location)
+ *
+ * and is deleted by
+ *
+ * # echo 'delete <kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ *  or
+ *
+ * # echo 'delete /usr/lib/ccs/editpolicy' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/manager
+ */
 static LIST_HEAD(tomoyo_policy_manager_list);
 static DECLARE_RWSEM(tomoyo_policy_manager_list_lock);
 
@@ -2124,7 +2177,13 @@ static ssize_t tomoyo_write(struct file *file, const char __user *buf,
 	return tomoyo_write_control(file, buf, count);
 }
 
-/* Operations for /sys/kernel/security/tomoyo/ interface. */
+/*
+ * tomoyo_operations is a "struct file_operations" which is used for handling
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * Some files under /sys/kernel/security/tomoyo/ directory accept open(O_RDWR).
+ * See tomoyo_io_buffer for internals.
+ */
 static const struct file_operations tomoyo_operations = {
 	.open    = tomoyo_open,
 	.release = tomoyo_release,
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index d8b9504..6d6ba09 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -26,12 +26,40 @@
 struct dentry;
 struct vfsmount;
 
-/* Temporary buffer for holding pathnames. */
+/*
+ * tomoyo_page_buffer is a structure which is used for holding a pathname
+ * obtained from "struct dentry" and "struct vfsmount" pair.
+ * As of now, it is 4096 bytes. If users complain that 4096 bytes is too small
+ * (because TOMOYO escapes non ASCII printable characters using \ooo format),
+ * we will make the buffer larger.
+ */
 struct tomoyo_page_buffer {
 	char buffer[4096];
 };
 
-/* Structure for holding a token. */
+/*
+ * tomoyo_path_info is a structure which is used for holding a string data
+ * used by TOMOYO.
+ * This structure has several fields for supporting pattern matching.
+ *
+ * (1) "name" is the '\0' terminated string data.
+ * (2) "hash" is full_name_hash(name, strlen(name)).
+ *     This allows tomoyo_pathcmp() to compare by hash before actually compare
+ *     using strcmp().
+ * (3) "const_len" is the length of the initial segment of "name" which
+ *     consists entirely of non wildcard characters. In other words, the length
+ *     which we can compare two strings using strncmp().
+ * (4) "is_dir" is a bool which is true if "name" ends with "/",
+ *     false otherwise.
+ *     TOMOYO distinguishes directory and non-directory. A directory ends with
+ *     "/" and non-directory does not end with "/".
+ * (5) "is_patterned" is a bool which is true if "name" contains wildcard
+ *     characters, false otherwise. This allows TOMOYO to use "hash" and
+ *     strcmp() for string comparison if "is_patterned" is false.
+ * (6) "depth" is calculated using the number of "/" characters in "name".
+ *     This allows TOMOYO to avoid comparing two pathnames which never match
+ *     (e.g. whether "/var/www/html/index.html" matches "/tmp/sh-thd-\$").
+ */
 struct tomoyo_path_info {
 	const char *name;
 	u32 hash;          /* = full_name_hash(name, strlen(name)) */
@@ -50,7 +78,20 @@ struct tomoyo_path_info {
  */
 #define TOMOYO_MAX_PATHNAME_LEN 4000
 
-/* Structure for holding requested pathname. */
+/*
+ * tomoyo_path_info_with_data is a structure which is used for holding a
+ * pathname obtained from "struct dentry" and "struct vfsmount" pair.
+ *
+ * "struct tomoyo_path_info_with_data" consists of "struct tomoyo_path_info"
+ * and buffer for the pathname, while "struct tomoyo_page_buffer" consists of
+ * buffer for the pathname only.
+ *
+ * "struct tomoyo_path_info_with_data" is intended to allow TOMOYO to release
+ * both "struct tomoyo_path_info" and buffer for the pathname by single kfree()
+ * so that we don't need to return two pointers to the caller. If the caller
+ * puts "struct tomoyo_path_info" on stack memory, we will be able to remove
+ * "struct tomoyo_path_info_with_data".
+ */
 struct tomoyo_path_info_with_data {
 	/* Keep "head" first, for this pointer is passed to tomoyo_free(). */
 	struct tomoyo_path_info head;
@@ -60,7 +101,15 @@ struct tomoyo_path_info_with_data {
 };
 
 /*
- * Common header for holding ACL entries.
+ * tomoyo_acl_info is a structure which is used for holding
+ *
+ *  (1) "list" which is linked to the ->acl_info_list of
+ *      "struct tomoyo_domain_info"
+ *  (2) "type" which tells
+ *      (a) type & 0x7F : type of the entry (either
+ *          "struct tomoyo_single_path_acl_record" or
+ *          "struct tomoyo_double_path_acl_record")
+ *      (b) type & 0x80 : whether the entry is marked as "deleted".
  *
  * Packing "struct tomoyo_acl_info" allows
  * "struct tomoyo_single_path_acl_record" to embed "u16" and
@@ -80,7 +129,28 @@ struct tomoyo_acl_info {
 /* This ACL entry is deleted.           */
 #define TOMOYO_ACL_DELETED        0x80
 
-/* Structure for domain information. */
+/*
+ * tomoyo_domain_info is a structure which is used for holding permissions
+ * (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_list .
+ *  (2) "acl_info_list" which is linked to "struct tomoyo_acl_info".
+ *  (3) "domainname" which holds the name of the domain.
+ *  (4) "profile" which remembers profile number assigned to this domain.
+ *  (5) "is_deleted" is a bool which is true if this domain is marked as
+ *      "deleted", false otherwise.
+ *  (6) "quota_warned" is a bool which is used for suppressing warning message
+ *      when learning mode learned too much entries.
+ *  (7) "flags" which remembers this domain's attributes.
+ *
+ * A domain's lifecycle is an analogy of files on / directory.
+ * Multiple domains with the same domainname cannot be created (as with
+ * creating files with the same filename fails with -EEXIST).
+ * If a process reached a domain, that process can reside in that domain after
+ * that domain is marked as "deleted" (as with a process can access an already
+ * open()ed file after that file was unlink()ed).
+ */
 struct tomoyo_domain_info {
 	struct list_head list;
 	struct list_head acl_info_list;
@@ -107,10 +177,18 @@ struct tomoyo_domain_info {
 #define TOMOYO_DOMAIN_FLAGS_TRANSITION_FAILED        2
 
 /*
- * Structure for "allow_read/write", "allow_execute", "allow_read",
- * "allow_write", "allow_create", "allow_unlink", "allow_mkdir", "allow_rmdir",
- * "allow_mkfifo", "allow_mksock", "allow_mkblock", "allow_mkchar",
- * "allow_truncate", "allow_symlink" and "allow_rewrite" directive.
+ * tomoyo_single_path_acl_record is a structure which is used for holding an
+ * entry with one pathname operation (e.g. open(), mkdir()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename" is the pathname.
+ *
+ * Directives held by this structure are "allow_read/write", "allow_execute",
+ * "allow_read", "allow_write", "allow_create", "allow_unlink", "allow_mkdir",
+ * "allow_rmdir", "allow_mkfifo", "allow_mksock", "allow_mkblock",
+ * "allow_mkchar", "allow_truncate", "allow_symlink" and "allow_rewrite".
  */
 struct tomoyo_single_path_acl_record {
 	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_SINGLE_PATH_ACL */
@@ -119,7 +197,18 @@ struct tomoyo_single_path_acl_record {
 	const struct tomoyo_path_info *filename;
 };
 
-/* Structure for "allow_rename" and "allow_link" directive. */
+/*
+ * tomoyo_double_path_acl_record is a structure which is used for holding an
+ * entry with two pathnames operation (i.e. link() and rename()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename1" is the source/old pathname.
+ *  (4) "filename2" is the destination/new pathname.
+ *
+ * Directives held by this structure are "allow_rename" and "allow_link".
+ */
 struct tomoyo_double_path_acl_record {
 	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_DOUBLE_PATH_ACL */
 	u8 perm;
@@ -152,7 +241,29 @@ struct tomoyo_double_path_acl_record {
 #define TOMOYO_VERBOSE                       2
 #define TOMOYO_MAX_CONTROL_INDEX             3
 
-/* Structure for reading/writing policy via securityfs interfaces. */
+/*
+ * tomoyo_io_buffer is a structure which is used for reading and modifying
+ * configuration via /sys/kernel/security/tomoyo/ interface.
+ * It has many fields. ->read_var1 , ->read_var2 , ->write_var1 are used as
+ * cursors.
+ *
+ * Since the content of /sys/kernel/security/tomoyo/domain_policy is a list of
+ * "struct tomoyo_domain_info" entries and each "struct tomoyo_domain_info"
+ * entry has a list of "struct tomoyo_acl_info", we need two cursors when
+ * reading (one is for traversing tomoyo_domain_list and the other is for
+ * traversing "struct tomoyo_acl_info"->acl_info_list ).
+ *
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "select ", TOMOYO seeks the cursor ->read_var1 and ->write_var1 to the
+ * domain with the domainname specified by the rest of that line (NULL is set
+ * if seek failed).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "delete ", TOMOYO deletes an entry or a domain specified by the rest of that
+ * line (->write_var1 is set to NULL if a domain was deleted).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * neither "select " nor "delete ", an entry or a domain specified by that line
+ * is appended.
+ */
 struct tomoyo_io_buffer {
 	int (*read) (struct tomoyo_io_buffer *);
 	int (*write) (struct tomoyo_io_buffer *);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index eb75401..1d8b169 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -19,11 +19,63 @@
 /* The initial domain. */
 struct tomoyo_domain_info tomoyo_kernel_domain;
 
-/* The list for "struct tomoyo_domain_info". */
+/*
+ * tomoyo_domain_list is used for holding list of domains.
+ * The ->acl_info_list of "struct tomoyo_domain_info" is used for holding
+ * permissions (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ *
+ * An entry is added by
+ *
+ * # ( echo "<kernel>"; echo "allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # ( echo "<kernel>"; echo "delete allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/domain_policy
+ *
+ * A domain is added by
+ *
+ * # echo "<kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # echo "delete <kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all domains are retrieved by
+ *
+ * # grep '^<kernel>' /sys/kernel/security/tomoyo/domain_policy
+ *
+ * Normally, a domainname is monotonically getting longer because a domainname
+ * which the process will belong to if an execve() operation succeeds is
+ * defined as a concatenation of "current domainname" + "pathname passed to
+ * execve()".
+ * See tomoyo_domain_initializer_list and tomoyo_domain_keeper_list for
+ * exceptions.
+ */
 LIST_HEAD(tomoyo_domain_list);
 DECLARE_RWSEM(tomoyo_domain_list_lock);
 
-/* Structure for "initialize_domain" and "no_initialize_domain" keyword. */
+/*
+ * tomoyo_domain_initializer_entry is a structure which is used for holding
+ * "initialize_domain" and "no_initialize_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_initializer_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname". This field is NULL if "from" clause is not specified.
+ *  (3) "program" which is a program's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_initialize_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_initializer_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *domainname;    /* This may be NULL */
@@ -34,7 +86,23 @@ struct tomoyo_domain_initializer_entry {
 	bool is_last_name;
 };
 
-/* Structure for "keep_domain" and "no_keep_domain" keyword. */
+/*
+ * tomoyo_domain_keeper_entry is a structure which is used for holding
+ * "keep_domain" and "no_keep_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_keeper_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname".
+ *  (3) "program" which is a program's pathname.
+ *      This field is NULL if "from" clause is not specified.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_initialize_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_keeper_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *domainname;
@@ -45,7 +113,16 @@ struct tomoyo_domain_keeper_entry {
 	bool is_last_name;
 };
 
-/* Structure for "alias" keyword. */
+/*
+ * tomoyo_alias_entry is a structure which is used for holding "alias" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_alias_list .
+ *  (2) "original_name" which is a dereferenced pathname.
+ *  (3) "aliased_name" which is a symlink's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_alias_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *original_name;
@@ -92,7 +169,42 @@ const char *tomoyo_get_last_name(const struct tomoyo_domain_info *domain)
 	return cp0;
 }
 
-/* The list for "struct tomoyo_domain_initializer_entry". */
+/*
+ * tomoyo_domain_initializer_list is used for holding list of programs which
+ * triggers reinitialization of domainname. Normally, a domainname is
+ * monotonically getting longer. But sometimes, we restart daemon programs.
+ * It would be convenient for us that "a daemon started upon system boot" and
+ * "the daemon restarted from console" belong to the same domain. Thus, TOMOYO
+ * provides a way to shorten domainnames.
+ *
+ * An entry is added by
+ *
+ * # echo 'initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^initialize_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, /usr/sbin/httpd will belong to
+ * "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may specify a domainname using "from" keyword.
+ * "initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" executed from "<kernel> /etc/rc.d/init.d/httpd"
+ * domain to belong to "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may add "no_" prefix to "initialize_domain".
+ * "initialize_domain /usr/sbin/httpd" and
+ * "no_initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" to belong to "<kernel> /usr/sbin/httpd" domain
+ * unless executed from "<kernel> /etc/rc.d/init.d/httpd" domain.
+ */
 static LIST_HEAD(tomoyo_domain_initializer_list);
 static DECLARE_RWSEM(tomoyo_domain_initializer_list_lock);
 
@@ -268,7 +380,44 @@ static bool tomoyo_is_domain_initializer(const struct tomoyo_path_info *
 	return flag;
 }
 
-/* The list for "struct tomoyo_domain_keeper_entry". */
+/*
+ * tomoyo_domain_keeper_list is used for holding list of domainnames which
+ * suppresses domain transition. Normally, a domainname is monotonically
+ * getting longer. But sometimes, we want to suppress domain transition.
+ * It would be convenient for us that programs executed from a login session
+ * belong to the same domain. Thus, TOMOYO provides a way to suppress domain
+ * transition.
+ *
+ * An entry is added by
+ *
+ * # echo 'keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^keep_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process which belongs to
+ * "<kernel> /usr/sbin/sshd /bin/bash" domain will remain in that domain,
+ * unless explicitly specified by "initialize_domain" or "no_keep_domain".
+ *
+ * You may specify a program using "from" keyword.
+ * "keep_domain /bin/pwd from <kernel> /usr/sbin/sshd /bin/bash"
+ * will cause "/bin/pwd" executed from "<kernel> /usr/sbin/sshd /bin/bash"
+ * domain to remain in "<kernel> /usr/sbin/sshd /bin/bash" domain.
+ *
+ * You may add "no_" prefix to "keep_domain".
+ * "keep_domain <kernel> /usr/sbin/sshd /bin/bash" and
+ * "no_keep_domain /usr/bin/passwd from <kernel> /usr/sbin/sshd /bin/bash" will
+ * cause "/usr/bin/passwd" to belong to
+ * "<kernel> /usr/sbin/sshd /bin/bash /usr/bin/passwd" domain, unless
+ * explicitly specified by "initialize_domain".
+ */
 static LIST_HEAD(tomoyo_domain_keeper_list);
 static DECLARE_RWSEM(tomoyo_domain_keeper_list_lock);
 
@@ -437,7 +586,36 @@ static bool tomoyo_is_domain_keeper(const struct tomoyo_path_info *domainname,
 	return flag;
 }
 
-/* The list for "struct tomoyo_alias_entry". */
+/*
+ * tomoyo_alias_list is used for holding list of symlink's pathnames which are
+ * allowed to be passed to an execve() request. Normally, the domainname which
+ * the current process will belong to after execve() succeeds is calculated
+ * using dereferenced pathnames. But some programs behave differently depending
+ * on the name passed to argv[0]. For busybox, calculating domainname using
+ * dereferenced pathnames will cause all programs in the busybox to belong to
+ * the same domain. Thus, TOMOYO provides a way to allow use of symlink's
+ * pathname for checking execve()'s permission and calculating domainname which
+ * the current process will belong to after execve() succeeds.
+ *
+ * An entry is added by
+ *
+ * # echo 'alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^alias /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if /bin/cat is a symlink to /bin/busybox and execution
+ * of /bin/cat is requested, permission is checked for /bin/cat rather than
+ * /bin/busybox and domainname which the current process will belong to after
+ * execve() succeeds is calculated using /bin/cat rather than /bin/busybox .
+ */
 static LIST_HEAD(tomoyo_alias_list);
 static DECLARE_RWSEM(tomoyo_alias_list_lock);
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index ab0cd35..5ae3a57 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -14,21 +14,50 @@
 #include "realpath.h"
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
 
-/* Structure for "allow_read" keyword. */
+/*
+ * tomoyo_globally_readable_file_entry is a structure which is used for holding
+ * "allow_read" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_globally_readable_list .
+ *  (2) "filename" is a pathname which is allowed to open(O_RDONLY).
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_globally_readable_file_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *filename;
 	bool is_deleted;
 };
 
-/* Structure for "file_pattern" keyword. */
+/*
+ * tomoyo_pattern_entry is a structure which is used for holding
+ * "tomoyo_pattern_list" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_pattern_list .
+ *  (2) "pattern" is a pathname pattern which is used for converting pathnames
+ *      to pathname patterns during learning mode.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_pattern_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *pattern;
 	bool is_deleted;
 };
 
-/* Structure for "deny_rewrite" keyword. */
+/*
+ * tomoyo_no_rewrite_entry is a structure which is used for holding
+ * "deny_rewrite" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_no_rewrite_list .
+ *  (2) "pattern" is a pathname which is by default not permitted to modify
+ *      already existing content.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_no_rewrite_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *pattern;
@@ -141,7 +170,31 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 					 struct tomoyo_domain_info *
 					 const domain, const bool is_delete);
 
-/* The list for "struct tomoyo_globally_readable_file_entry". */
+/*
+ * tomoyo_globally_readable_list is used for holding list of pathnames which
+ * are by default allowed to be open()ed for reading by any process.
+ *
+ * An entry is added by
+ *
+ * # echo 'allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^allow_read /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process is allowed to
+ * open("/lib/libc-2.5.so", O_RDONLY).
+ * One exception is, if the domain which current process belongs to is marked
+ * as "ignore_global_allow_read", current process can't do so unless explicitly
+ * given "allow_read /lib/libc-2.5.so" to the domain which current process
+ * belongs to.
+ */
 static LIST_HEAD(tomoyo_globally_readable_list);
 static DECLARE_RWSEM(tomoyo_globally_readable_list_lock);
 
@@ -256,7 +309,35 @@ bool tomoyo_read_globally_readable_policy(struct tomoyo_io_buffer *head)
 	return done;
 }
 
-/* The list for "struct tomoyo_pattern_entry". */
+/* tomoyo_pattern_list is used for holding list of pathnames which are used for
+ * converting pathnames to pathname patterns during learning mode.
+ *
+ * An entry is added by
+ *
+ * # echo 'file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^file_pattern /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process which belongs to a domain which is in
+ * learning mode requested open("/proc/1/mounts", O_RDONLY),
+ * "allow_read /proc/\$/mounts" is automatically added to the domain which that
+ * process belongs to.
+ *
+ * It is not a desirable behavior that we have to use /proc/\$/ instead of
+ * /proc/self/ when current process needs to access only current process's
+ * information. As of now, LSM version of TOMOYO is using __d_path() for
+ * calculating pathname. Non LSM version of TOMOYO is using its own function
+ * which pretends as if /proc/self/ is not a symlink; so that we can forbid
+ * current process from accessing other process's information.
+ */
 static LIST_HEAD(tomoyo_pattern_list);
 static DECLARE_RWSEM(tomoyo_pattern_list_lock);
 
@@ -377,7 +458,35 @@ bool tomoyo_read_file_pattern(struct tomoyo_io_buffer *head)
 	return done;
 }
 
-/* The list for "struct tomoyo_no_rewrite_entry". */
+/*
+ * tomoyo_no_rewrite_list is used for holding list of pathnames which are by
+ * default forbidden to modify already written content of a file.
+ *
+ * An entry is added by
+ *
+ * # echo 'deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^deny_rewrite /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process requested to rewrite /var/log/messages ,
+ * the process can't rewrite unless the domain which that process belongs to
+ * has "allow_rewrite /var/log/messages" entry.
+ *
+ * It is not a desirable behavior that we have to add "\040(deleted)" suffix
+ * when we want to allow rewriting already unlink()ed file. As of now,
+ * LSM version of TOMOYO is using __d_path() for calculating pathname.
+ * Non LSM version of TOMOYO is using its own function which doesn't append
+ * " (deleted)" suffix if the file is already unlink()ed; so that we don't
+ * need to worry whether the file is already unlink()ed or not.
+ */
 static LIST_HEAD(tomoyo_no_rewrite_list);
 static DECLARE_RWSEM(tomoyo_no_rewrite_list_lock);
 
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 3948f6b..5f2e332 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -265,7 +265,16 @@ static unsigned int tomoyo_quota_for_savename;
  */
 #define TOMOYO_MAX_HASH 256
 
-/* Structure for string data. */
+/*
+ * tomoyo_name_entry is a structure which is used for linking
+ * "struct tomoyo_path_info" into tomoyo_name_list .
+ *
+ * Since tomoyo_name_list manages a list of strings which are shared by
+ * multiple processes (whereas "struct tomoyo_path_info" inside
+ * "struct tomoyo_path_info_with_data" is not shared), a reference counter will
+ * be added to "struct tomoyo_name_entry" rather than "struct tomoyo_path_info"
+ * when TOMOYO starts supporting garbage collector.
+ */
 struct tomoyo_name_entry {
 	struct list_head list;
 	struct tomoyo_path_info entry;
@@ -279,10 +288,10 @@ struct tomoyo_free_memory_block_list {
 };
 
 /*
- * The list for "struct tomoyo_name_entry".
- *
- * This list is updated only inside tomoyo_save_name(), thus
- * no global mutex exists.
+ * tomoyo_name_list is used for holding string data used by TOMOYO.
+ * Since same string data is likely used for multiple times (e.g.
+ * "/lib/libc-2.5.so"), TOMOYO shares string data in the form of
+ * "const struct tomoyo_path_info *".
  */
 static struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
 
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index e42be5c..3194d09 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -262,6 +262,10 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred)
 	return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, flags);
 }
 
+/*
+ * tomoyo_security_ops is a "struct security_operations" which is used for
+ * registering TOMOYO.
+ */
 static struct security_operations tomoyo_security_ops = {
 	.name                = "tomoyo",
 	.cred_prepare        = tomoyo_cred_prepare,
-- 
cgit v0.10.2


From dbb66c4be020b01dc2f3d7c609ddb0e09d2c0af7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 9 Jun 2009 05:47:10 +0200
Subject: block: needs to set the residual length of a bidi request

Tejun's "block: set rq->resid_len to blk_rq_bytes() on issue" patch
seems to be incomplete; It doesn't set rq->resid_len to blk_rq_bytes()
for a bidi request (req->next_rq). As a result, all bidi users are
broken.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 7ae83a1..03c5a64 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1846,6 +1846,9 @@ void blk_start_request(struct request *req)
 	 * resid_len to full count and add the timeout handler.
 	 */
 	req->resid_len = blk_rq_bytes(req);
+	if (unlikely(blk_bidi_rq(req)))
+		req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
+
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
-- 
cgit v0.10.2


From 40df6ae4277a67e97aa0a8bd8e293fdbb00e5623 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 15:59:38 -0500
Subject: cciss: Use schedule_timeout_uninterruptible in SCSI error handling
 code

Use schedule_timeout_uninterruptible instead of schedule_timeout in the
scsi error handling code when waiting between TUR polls since we are not
interested in nor want to be interrupted by signals.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 2edfc9b..134fdf2 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1608,7 +1608,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Wait for a bit.  do this first, because if we send
 		 * the TUR right away, the reset will just abort it.
 		 */
-		schedule_timeout_interruptible(waittime);
+		schedule_timeout_uninterruptible(waittime);
 		count++;
 
 		/* Increase wait time with each try, up to a point. */
-- 
cgit v0.10.2


From 5390cfc3fea49d015ae1eed8551c0bf00489b50e Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:01:11 -0500
Subject: cciss: factor out core of sendcmd_withirq() for use by SCSI error
 handling code

Factor the core of sendcmd_withirq out to provide a simpler interface
which provides access to full error information.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e7d0095..74fc85a 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2278,114 +2278,123 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 	return status;
 }
 
-static int sendcmd_withirq(__u8 cmd,
-			   int ctlr,
-			   void *buff,
-			   size_t size,
-			   unsigned int use_unit_num,
-			   unsigned int log_unit, __u8 page_code, int cmd_type)
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 {
-	ctlr_info_t *h = hba[ctlr];
-	CommandList_struct *c;
+	DECLARE_COMPLETION_ONSTACK(wait);
 	u64bit buff_dma_handle;
 	unsigned long flags;
-	int return_status;
-	DECLARE_COMPLETION_ONSTACK(wait);
+	int return_status = IO_OK;
 
-	if ((c = cmd_alloc(h, 0)) == NULL)
-		return -ENOMEM;
-	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-				 log_unit, page_code, NULL, cmd_type);
-	if (return_status != IO_OK) {
-		cmd_free(h, c, 0);
-		return return_status;
-	}
-      resend_cmd2:
+resend_cmd2:
 	c->waiting = &wait;
-
 	/* Put the request on the tail of the queue and send it */
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
 	addQ(&h->reqQ, c);
 	h->Qdepth++;
 	start_io(h);
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 
 	wait_for_completion(&wait);
 
-	if (c->err_info->CommandStatus != 0) {	/* an error has occurred */
-		switch (c->err_info->CommandStatus) {
-		case CMD_TARGET_STATUS:
-			printk(KERN_WARNING "cciss: cmd %p has "
-			       " completed with errors\n", c);
-			if (c->err_info->ScsiStatus) {
-				printk(KERN_WARNING "cciss: cmd %p "
-				       "has SCSI Status = %x\n",
-				       c, c->err_info->ScsiStatus);
-			}
+	if (c->err_info->CommandStatus == 0)
+		goto command_done;
 
-			break;
-		case CMD_DATA_UNDERRUN:
-		case CMD_DATA_OVERRUN:
-			/* expected for inquire and report lun commands */
-			break;
-		case CMD_INVALID:
-			printk(KERN_WARNING "cciss: Cmd %p is "
-			       "reported invalid\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_PROTOCOL_ERR:
-			printk(KERN_WARNING "cciss: cmd %p has "
-			       "protocol error \n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_HARDWARE_ERR:
-			printk(KERN_WARNING "cciss: cmd %p had "
-			       " hardware error\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_CONNECTION_LOST:
-			printk(KERN_WARNING "cciss: cmd %p had "
-			       "connection lost\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_ABORTED:
-			printk(KERN_WARNING "cciss: cmd %p was "
-			       "aborted\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_ABORT_FAILED:
-			printk(KERN_WARNING "cciss: cmd %p reports "
-			       "abort failed\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_UNSOLICITED_ABORT:
+	switch (c->err_info->CommandStatus) {
+	case CMD_TARGET_STATUS:
+		printk(KERN_WARNING "cciss: cmd 0x%02x "
+		"has completed with errors\n", c->Request.CDB[0]);
+		if (c->err_info->ScsiStatus) {
+			printk(KERN_WARNING "cciss: cmd 0x%02x "
+			       "has SCSI Status = %x\n",
+			       c->Request.CDB[0], c->err_info->ScsiStatus);
+		}
+		break;
+	case CMD_DATA_UNDERRUN:
+	case CMD_DATA_OVERRUN:
+		/* expected for inquiry and report lun commands */
+		break;
+	case CMD_INVALID:
+		printk(KERN_WARNING "cciss: Cmd 0x%02x is "
+		       "reported invalid\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_PROTOCOL_ERR:
+		printk(KERN_WARNING "cciss: cmd 0x%02x has "
+		       "protocol error \n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_HARDWARE_ERR:
+		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		       " hardware error\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_CONNECTION_LOST:
+		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		       "connection lost\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_ABORTED:
+		printk(KERN_WARNING "cciss: cmd 0x%02x was "
+		       "aborted\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_ABORT_FAILED:
+		printk(KERN_WARNING "cciss: cmd 0x%02x reports "
+		       "abort failed\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_UNSOLICITED_ABORT:
+		printk(KERN_WARNING
+		       "cciss%d: unsolicited abort 0x%02x\n", h->ctlr,
+			c->Request.CDB[0]);
+		if (c->retry_count < MAX_CMD_RETRIES) {
 			printk(KERN_WARNING
-			       "cciss%d: unsolicited abort %p\n", ctlr, c);
-			if (c->retry_count < MAX_CMD_RETRIES) {
-				printk(KERN_WARNING
-				       "cciss%d: retrying %p\n", ctlr, c);
-				c->retry_count++;
-				/* erase the old error information */
-				memset(c->err_info, 0,
-				       sizeof(ErrorInfo_struct));
-				return_status = IO_OK;
-				INIT_COMPLETION(wait);
-				goto resend_cmd2;
-			}
-			return_status = IO_ERROR;
-			break;
-		default:
-			printk(KERN_WARNING "cciss: cmd %p returned "
-			       "unknown status %x\n", c,
-			       c->err_info->CommandStatus);
-			return_status = IO_ERROR;
+			       "cciss%d: retrying 0x%02x\n", h->ctlr,
+				c->Request.CDB[0]);
+			c->retry_count++;
+			/* erase the old error information */
+			memset(c->err_info, 0,
+			       sizeof(ErrorInfo_struct));
+			return_status = IO_OK;
+			INIT_COMPLETION(wait);
+			goto resend_cmd2;
 		}
+		return_status = IO_ERROR;
+		break;
+	default:
+		printk(KERN_WARNING "cciss: cmd 0x%02x returned "
+		       "unknown status %x\n", c->Request.CDB[0],
+		       c->err_info->CommandStatus);
+		return_status = IO_ERROR;
 	}
+
+command_done:
 	/* unlock the buffers from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
 	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
+	return return_status;
+}
+
+static int sendcmd_withirq(__u8 cmd,
+			   int ctlr,
+			   void *buff,
+			   size_t size,
+			   unsigned int use_unit_num,
+			   unsigned int log_unit, __u8 page_code, int cmd_type)
+{
+	ctlr_info_t *h = hba[ctlr];
+	CommandList_struct *c;
+	int return_status;
+
+	c = cmd_alloc(h, 0);
+	if (!c)
+		return -ENOMEM;
+	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
+				 log_unit, page_code, NULL, cmd_type);
+	if (return_status == IO_OK)
+		return_status = sendcmd_withirq_core(h, c);
 	cmd_free(h, c, 0);
 	return return_status;
 }
-- 
cgit v0.10.2


From b57695fe131b13d3f2460cfeb9175cff673ed337 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:02:17 -0500
Subject: cciss: simplify interface of sendcmd() and sendcmd_withirq()

Simplify interfaces of sendcmd() and sendcmd_withirq() so that they
provide only one way to address commands instead of three ways.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 74fc85a..885ea1e 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -180,11 +180,10 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
 					   __u32);
 static void start_io(ctlr_info_t *h);
 static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-		   unsigned int use_unit_num, unsigned int log_unit,
 		   __u8 page_code, unsigned char *scsi3addr, int cmd_type);
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
-			   unsigned int use_unit_num, unsigned int log_unit,
-			   __u8 page_code, int cmd_type);
+			__u8 page_code, unsigned char scsi3addr[],
+			int cmd_type);
 
 static void fail_all_cmds(unsigned long ctlr);
 static int scan_thread(void *data);
@@ -1520,6 +1519,15 @@ static void cciss_softirq_done(struct request *rq)
 	spin_unlock_irqrestore(&h->lock, flags);
 }
 
+static void log_unit_to_scsi3addr(ctlr_info_t *h, unsigned char scsi3addr[],
+	uint32_t log_unit)
+{
+	log_unit = h->drv[log_unit].LunID & 0x03fff;
+	memset(&scsi3addr[4], 0, 4);
+	memcpy(&scsi3addr[0], &log_unit, 4);
+	scsi3addr[3] |= 0x40;
+}
+
 /* This function gets the SCSI vendor, model, and revision of a logical drive
  * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
  * they cannot be read.
@@ -1529,6 +1537,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 {
 	int rc;
 	InquiryData_struct *inq_buf;
+	unsigned char scsi3addr[8];
 
 	*vendor = '\0';
 	*model = '\0';
@@ -1538,14 +1547,15 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 	if (!inq_buf)
 		return;
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
-				     sizeof(InquiryData_struct), 1, logvol,
-				     0, TYPE_CMD);
+			     sizeof(InquiryData_struct), 0,
+				scsi3addr, TYPE_CMD);
 	else
 		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 1, logvol, 0, NULL,
-			     TYPE_CMD);
+			     sizeof(InquiryData_struct), 0,
+				scsi3addr, TYPE_CMD);
 	if (rc == IO_OK) {
 		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
 		vendor[VENDOR_LEN] = '\0';
@@ -1570,6 +1580,7 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 #define PAGE_83_INQ_BYTES 64
 	int rc;
 	unsigned char *buf;
+	unsigned char scsi3addr[8];
 
 	if (buflen > 16)
 		buflen = 16;
@@ -1578,12 +1589,13 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 	if (!buf)
 		return;
 	memset(serial_no, 0, buflen);
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 1, logvol, 0x83, TYPE_CMD);
+			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	else
 		rc = sendcmd(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 1, logvol, 0x83, NULL, TYPE_CMD);
+			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	if (rc == IO_OK)
 		memcpy(serial_no, &buf[8], buflen);
 	kfree(buf);
@@ -1902,8 +1914,8 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 		goto mem_msg;
 
 	return_code = sendcmd_withirq(CISS_REPORT_LOG, ctlr, ld_buff,
-				      sizeof(ReportLunData_struct), 0,
-				      0, 0, TYPE_CMD);
+				      sizeof(ReportLunData_struct),
+				      0, CTLR_LUNID, TYPE_CMD);
 
 	if (return_code == IO_OK)
 		listlength = be32_to_cpu(*(__be32 *) ld_buff->LUNListLength);
@@ -2112,11 +2124,9 @@ static int deregister_disk(ctlr_info_t *h, int drv_index,
 	return 0;
 }
 
-static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_t size, unsigned int use_unit_num,	/* 0: address the controller,
-															   1: address logical volume log_unit,
-															   2: periph device address is scsi3addr */
-		    unsigned int log_unit, __u8 page_code,
-		    unsigned char *scsi3addr, int cmd_type)
+static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+		size_t size, __u8 page_code, unsigned char *scsi3addr,
+		int cmd_type)
 {
 	ctlr_info_t *h = hba[ctlr];
 	u64bit buff_dma_handle;
@@ -2132,27 +2142,12 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 		c->Header.SGTotal = 0;
 	}
 	c->Header.Tag.lower = c->busaddr;
+	memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
 
 	c->Request.Type.Type = cmd_type;
 	if (cmd_type == TYPE_CMD) {
 		switch (cmd) {
 		case CISS_INQUIRY:
-			/* If the logical unit number is 0 then, this is going
-			   to controller so It's a physical command
-			   mode = 0 target = 0.  So we have nothing to write.
-			   otherwise, if use_unit_num == 1,
-			   mode = 1(volume set addressing) target = LUNID
-			   otherwise, if use_unit_num == 2,
-			   mode = 0(periph dev addr) target = scsi3addr */
-			if (use_unit_num == 1) {
-				c->Header.LUN.LogDev.VolId =
-				    h->drv[log_unit].LunID;
-				c->Header.LUN.LogDev.Mode = 1;
-			} else if (use_unit_num == 2) {
-				memcpy(c->Header.LUN.LunAddrBytes, scsi3addr,
-				       8);
-				c->Header.LUN.LogDev.Mode = 0;
-			}
 			/* are we trying to read a vital product page */
 			if (page_code != 0) {
 				c->Request.CDB[1] = 0x01;
@@ -2182,8 +2177,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			break;
 
 		case CCISS_READ_CAPACITY:
-			c->Header.LUN.LogDev.VolId = h->drv[log_unit].LunID;
-			c->Header.LUN.LogDev.Mode = 1;
 			c->Request.CDBLen = 10;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_READ;
@@ -2191,8 +2184,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[0] = cmd;
 			break;
 		case CCISS_READ_CAPACITY_16:
-			c->Header.LUN.LogDev.VolId = h->drv[log_unit].LunID;
-			c->Header.LUN.LogDev.Mode = 1;
 			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_READ;
@@ -2215,7 +2206,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
 			break;
 		case TEST_UNIT_READY:
-			memcpy(c->Header. LUN.LunAddrBytes, scsi3addr, 8);
 			c->Request.CDBLen = 6;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_NONE;
@@ -2239,7 +2229,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			memcpy(&c->Request.CDB[4], buff, 8);
 			break;
 		case 1:	/* RESET message */
-			memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
 			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_NONE;
@@ -2307,6 +2296,9 @@ resend_cmd2:
 			printk(KERN_WARNING "cciss: cmd 0x%02x "
 			       "has SCSI Status = %x\n",
 			       c->Request.CDB[0], c->err_info->ScsiStatus);
+			if (c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION)
+				printk(KERN_WARNING "sense key = 0x%02x\n",
+					0xf & c->err_info->SenseInfo[2]);
 		}
 		break;
 	case CMD_DATA_UNDERRUN:
@@ -2377,12 +2369,9 @@ command_done:
 	return return_status;
 }
 
-static int sendcmd_withirq(__u8 cmd,
-			   int ctlr,
-			   void *buff,
-			   size_t size,
-			   unsigned int use_unit_num,
-			   unsigned int log_unit, __u8 page_code, int cmd_type)
+static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
+			   __u8 page_code, unsigned char scsi3addr[],
+			int cmd_type)
 {
 	ctlr_info_t *h = hba[ctlr];
 	CommandList_struct *c;
@@ -2391,8 +2380,8 @@ static int sendcmd_withirq(__u8 cmd,
 	c = cmd_alloc(h, 0);
 	if (!c)
 		return -ENOMEM;
-	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-				 log_unit, page_code, NULL, cmd_type);
+	return_status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
+		scsi3addr, cmd_type);
 	if (return_status == IO_OK)
 		return_status = sendcmd_withirq_core(h, c);
 	cmd_free(h, c, 0);
@@ -2407,15 +2396,17 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 {
 	int return_code;
 	unsigned long t;
+	unsigned char scsi3addr[8];
 
 	memset(inq_buff, 0, sizeof(InquiryData_struct));
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
-					      inq_buff, sizeof(*inq_buff), 1,
-					      logvol, 0xC1, TYPE_CMD);
+					      inq_buff, sizeof(*inq_buff),
+					      0xC1, scsi3addr, TYPE_CMD);
 	else
 		return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
-				      sizeof(*inq_buff), 1, logvol, 0xC1, NULL,
+				      sizeof(*inq_buff), 0xC1, scsi3addr,
 				      TYPE_CMD);
 	if (return_code == IO_OK) {
 		if (inq_buff->data_byte[8] == 0xFF) {
@@ -2456,6 +2447,7 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 {
 	ReadCapdata_struct *buf;
 	int return_code;
+	unsigned char scsi3addr[8];
 
 	buf = kzalloc(sizeof(ReadCapdata_struct), GFP_KERNEL);
 	if (!buf) {
@@ -2463,14 +2455,15 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 		return;
 	}
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
 				ctlr, buf, sizeof(ReadCapdata_struct),
-					1, logvol, 0, TYPE_CMD);
+					0, scsi3addr, TYPE_CMD);
 	else
 		return_code = sendcmd(CCISS_READ_CAPACITY,
 				ctlr, buf, sizeof(ReadCapdata_struct),
-					1, logvol, 0, NULL, TYPE_CMD);
+					0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2490,6 +2483,7 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 {
 	ReadCapdata_struct_16 *buf;
 	int return_code;
+	unsigned char scsi3addr[8];
 
 	buf = kzalloc(sizeof(ReadCapdata_struct_16), GFP_KERNEL);
 	if (!buf) {
@@ -2497,15 +2491,16 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 		return;
 	}
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq) {
 		return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
 			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				1, logvol, 0, TYPE_CMD);
+				0, scsi3addr, TYPE_CMD);
 	}
 	else {
 		return_code = sendcmd(CCISS_READ_CAPACITY_16,
 			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				1, logvol, 0, NULL, TYPE_CMD);
+				0, scsi3addr, TYPE_CMD);
 	}
 	if (return_code == IO_OK) {
 		*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
@@ -2760,10 +2755,6 @@ cleanup1:
  * Used at init time, and during SCSI error recovery.
  */
 static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-	unsigned int use_unit_num,/* 0: address the controller,
-				     1: address logical volume log_unit,
-				     2: periph device address is scsi3addr */
-	unsigned int log_unit,
 	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
 {
 	CommandList_struct *c;
@@ -2774,8 +2765,8 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
 		printk(KERN_WARNING "cciss: unable to get memory");
 		return IO_ERROR;
 	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-			  log_unit, page_code, scsi3addr, cmd_type);
+	status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
+		scsi3addr, cmd_type);
 	if (status == IO_OK)
 		status = sendcmd_core(hba[ctlr], c);
 	cmd_free(hba[ctlr], c, 1);
@@ -4076,7 +4067,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	}
 
 	return_code = sendcmd_withirq(CISS_INQUIRY, i, inq_buff,
-		sizeof(InquiryData_struct), 0, 0 , 0, TYPE_CMD);
+		sizeof(InquiryData_struct), 0, CTLR_LUNID, TYPE_CMD);
 	if (return_code == IO_OK) {
 		hba[i]->firm_ver[0] = inq_buff->data_byte[32];
 		hba[i]->firm_ver[1] = inq_buff->data_byte[33];
@@ -4157,8 +4148,8 @@ static void cciss_shutdown(struct pci_dev *pdev)
 	/* sendcmd will turn off interrupt, and send the flush...
 	 * To write all data in the battery backed cache to disks */
 	memset(flush_buf, 0, 4);
-	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, 0, 0, NULL,
-			      TYPE_CMD);
+	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
+		CTLR_LUNID, TYPE_CMD);
 	if (return_code == IO_OK) {
 		printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
 	} else {
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 40b1b92..cd665b0 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -217,6 +217,8 @@ typedef union _LUNAddr_struct {
   LogDevAddr_struct  LogDev;
 } LUNAddr_struct;
 
+#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
+
 typedef struct _CommandListHeader_struct {
   BYTE              ReplyQueue;
   BYTE              SGList;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 134fdf2..007ab8a 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -44,26 +44,9 @@
 #define CCISS_ABORT_MSG 0x00
 #define CCISS_RESET_MSG 0x01
 
-/* some prototypes... */ 
-static int sendcmd(
-	__u8	cmd,
-	int	ctlr,
-	void	*buff,
-	size_t	size,
-	unsigned int use_unit_num, /* 0: address the controller,
-				      1: address logical volume log_unit, 
-				      2: address is in scsi3addr */
-	unsigned int log_unit,
-	__u8	page_code,
-	unsigned char *scsi3addr,
-	int cmd_type);
-
 static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	size_t size,
-	unsigned int use_unit_num, /* 0: address the controller,
-				      1: address logical volume log_unit,
-				      2: periph device address is scsi3addr */
-	unsigned int log_unit, __u8 page_code, unsigned char *scsi3addr,
+	__u8 page_code, unsigned char *scsi3addr,
 	int cmd_type);
 
 static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
@@ -1616,7 +1599,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 			waittime = waittime * 2;
 
 		/* Send the Test Unit Ready */
-		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0, 0, 0,
+		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0,
 			lunaddr, TYPE_CMD);
 		if (rc == 0) {
 			rc = sendcmd_core(h, c);
@@ -1680,7 +1663,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, lunaddr,
+	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
 	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
@@ -1708,8 +1691,7 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 	cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_to_abort == NULL) /* paranoia */
 		return FAILED;
-	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 
-		0, 2, 0, 0, 
+	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 0, 0,
 		(unsigned char *) &cmd_to_abort->Header.LUN.LunAddrBytes[0], 
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-- 
cgit v0.10.2


From 3c2ab40296894d1f7ad9714550fdf9b96d4e9ee6 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:04:35 -0500
Subject: cciss: factor out fix target status processing code from sendcmd
 functions

Factor out code to process target status of completed commands in sendcmd()
and sendcmd_withirq_core(), and fix problem that bad target status was ignored in
sendcmd_withirq_core.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 885ea1e..2d12883 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2267,6 +2267,31 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	return status;
 }
 
+static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
+{
+	switch (c->err_info->ScsiStatus) {
+	case SAM_STAT_GOOD:
+		return IO_OK;
+	case SAM_STAT_CHECK_CONDITION:
+		switch (0xf & c->err_info->SenseInfo[2]) {
+		case 0: return IO_OK; /* no sense */
+		case 1: return IO_OK; /* recovered error */
+		default:
+			printk(KERN_WARNING "cciss%d: cmd 0x%02x "
+				"check condition, sense key = 0x%02x\n",
+				h->ctlr, c->Request.CDB[0],
+				c->err_info->SenseInfo[2]);
+		}
+		break;
+	default:
+		printk(KERN_WARNING "cciss%d: cmd 0x%02x"
+			"scsi status = 0x%02x\n", h->ctlr,
+			c->Request.CDB[0], c->err_info->ScsiStatus);
+		break;
+	}
+	return IO_ERROR;
+}
+
 static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
@@ -2290,16 +2315,7 @@ resend_cmd2:
 
 	switch (c->err_info->CommandStatus) {
 	case CMD_TARGET_STATUS:
-		printk(KERN_WARNING "cciss: cmd 0x%02x "
-		"has completed with errors\n", c->Request.CDB[0]);
-		if (c->err_info->ScsiStatus) {
-			printk(KERN_WARNING "cciss: cmd 0x%02x "
-			       "has SCSI Status = %x\n",
-			       c->Request.CDB[0], c->err_info->ScsiStatus);
-			if (c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION)
-				printk(KERN_WARNING "sense key = 0x%02x\n",
-					0xf & c->err_info->SenseInfo[2]);
-		}
+		return_status = check_target_status(h, c);
 		break;
 	case CMD_DATA_UNDERRUN:
 	case CMD_DATA_OVERRUN:
@@ -2710,33 +2726,29 @@ resend_cmd1:
 			printk(KERN_WARNING "cciss%d: retried %p too many "
 				"times\n", h->ctlr, c);
 			status = IO_ERROR;
-			goto cleanup1;
+			break;
 		}
 
 		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
 			printk(KERN_WARNING "cciss%d: command could not be "
 				"aborted.\n", h->ctlr);
 			status = IO_ERROR;
-			goto cleanup1;
+			break;
 		}
 
-		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
-		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
-			c->Request.CDB[0], c->err_info->CommandStatus);
 		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
-			printk(KERN_WARNING "Target status = 0x%02x\n",
-			c->err_info->ScsiStatus);
-			if (c->err_info->ScsiStatus == 2) /* chk cond */
-				printk(KERN_WARNING "Sense key = 0x%02x\n",
-					0xf & c->err_info->SenseInfo[2]);
+			status = check_target_status(h, c);
+			break;
 		}
 
+		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
+		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
+			c->Request.CDB[0], c->err_info->CommandStatus);
 		status = IO_ERROR;
-		goto cleanup1;
+		break;
 
 	} while (1);
 
-cleanup1:
 	/* unlock the data buffer from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-- 
cgit v0.10.2


From 789a424ad1352b335960e7c56494d0410577fa61 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:05:56 -0500
Subject: cciss: separate error processing and command retrying code in
 sendcmd_withirq_core()

Separate the error processing from sendcmd_withirq_core from the code
which retries commands.  The rationale for this is that the SCSI error
handling code can then be made to use sendcmd_withirq_core, but avoid
retrying commands.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 2d12883..7001707 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2292,26 +2292,12 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
 	return IO_ERROR;
 }
 
-static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
-	u64bit buff_dma_handle;
-	unsigned long flags;
 	int return_status = IO_OK;
 
-resend_cmd2:
-	c->waiting = &wait;
-	/* Put the request on the tail of the queue and send it */
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-	addQ(&h->reqQ, c);
-	h->Qdepth++;
-	start_io(h);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
-
-	wait_for_completion(&wait);
-
-	if (c->err_info->CommandStatus == 0)
-		goto command_done;
+	if (c->err_info->CommandStatus == CMD_SUCCESS)
+		return IO_OK;
 
 	switch (c->err_info->CommandStatus) {
 	case CMD_TARGET_STATUS:
@@ -2322,7 +2308,7 @@ resend_cmd2:
 		/* expected for inquiry and report lun commands */
 		break;
 	case CMD_INVALID:
-		printk(KERN_WARNING "cciss: Cmd 0x%02x is "
+		printk(KERN_WARNING "cciss: cmd 0x%02x is "
 		       "reported invalid\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
@@ -2355,19 +2341,7 @@ resend_cmd2:
 		printk(KERN_WARNING
 		       "cciss%d: unsolicited abort 0x%02x\n", h->ctlr,
 			c->Request.CDB[0]);
-		if (c->retry_count < MAX_CMD_RETRIES) {
-			printk(KERN_WARNING
-			       "cciss%d: retrying 0x%02x\n", h->ctlr,
-				c->Request.CDB[0]);
-			c->retry_count++;
-			/* erase the old error information */
-			memset(c->err_info, 0,
-			       sizeof(ErrorInfo_struct));
-			return_status = IO_OK;
-			INIT_COMPLETION(wait);
-			goto resend_cmd2;
-		}
-		return_status = IO_ERROR;
+		return_status = IO_NEEDS_RETRY;
 		break;
 	default:
 		printk(KERN_WARNING "cciss: cmd 0x%02x returned "
@@ -2375,6 +2349,44 @@ resend_cmd2:
 		       c->err_info->CommandStatus);
 		return_status = IO_ERROR;
 	}
+	return return_status;
+}
+
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+	int attempt_retry)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	u64bit buff_dma_handle;
+	unsigned long flags;
+	int return_status = IO_OK;
+
+resend_cmd2:
+	c->waiting = &wait;
+	/* Put the request on the tail of the queue and send it */
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	addQ(&h->reqQ, c);
+	h->Qdepth++;
+	start_io(h);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	wait_for_completion(&wait);
+
+	if (c->err_info->CommandStatus == 0 || !attempt_retry)
+		goto command_done;
+
+	return_status = process_sendcmd_error(h, c);
+
+	if (return_status == IO_NEEDS_RETRY &&
+		c->retry_count < MAX_CMD_RETRIES) {
+		printk(KERN_WARNING "cciss%d: retrying 0x%02x\n", h->ctlr,
+			c->Request.CDB[0]);
+		c->retry_count++;
+		/* erase the old error information */
+		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+		return_status = IO_OK;
+		INIT_COMPLETION(wait);
+		goto resend_cmd2;
+	}
 
 command_done:
 	/* unlock the buffers from DMA */
@@ -2399,7 +2411,8 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 	return_status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
 		scsi3addr, cmd_type);
 	if (return_status == IO_OK)
-		return_status = sendcmd_withirq_core(h, c);
+		return_status = sendcmd_withirq_core(h, c, 1);
+
 	cmd_free(h, c, 0);
 	return return_status;
 }
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index dd1926d..25cd58e 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -11,6 +11,7 @@
 
 #define IO_OK		0
 #define IO_ERROR	1
+#define IO_NEEDS_RETRY  3
 
 #define VENDOR_LEN	8
 #define MODEL_LEN	16
-- 
cgit v0.10.2


From 85cc61ae41084cb6d8ecc6c9e01ac4563005c8ac Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:07:45 -0500
Subject: cciss: change SCSI error handling routines to work with interrupts
 enabled.

Change cciss scsi error handling routines to work with interrupts enabled.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 7001707..9a9db6d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -184,6 +184,9 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 			__u8 page_code, unsigned char scsi3addr[],
 			int cmd_type);
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+	int attempt_retry);
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c);
 
 static void fail_all_cmds(unsigned long ctlr);
 static int scan_thread(void *data);
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 007ab8a..d2eb489 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -49,8 +49,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	__u8 page_code, unsigned char *scsi3addr,
 	int cmd_type);
 
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
-
 static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
 static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
 
@@ -1601,11 +1599,10 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Send the Test Unit Ready */
 		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0,
 			lunaddr, TYPE_CMD);
-		if (rc == 0) {
-			rc = sendcmd_core(h, c);
-			/* sendcmd turned off interrupts, turn 'em back on. */
-			h->access.set_intr_mask(h, CCISS_INTR_ON);
-		}
+		if (rc == 0)
+			rc = sendcmd_withirq_core(h, c, 0);
+
+		(void) process_sendcmd_error(h, c);
 
 		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
 			break;
@@ -1663,10 +1660,8 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
+	rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
 		TYPE_MSG);
-	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
 	if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
 		return SUCCESS;
 	printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
@@ -1677,6 +1672,7 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 {
 	int rc;
 	CommandList_struct *cmd_to_abort;
+	unsigned char lunaddr[8];
 	ctlr_info_t **c;
 	int ctlr;
 
@@ -1691,11 +1687,9 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 	cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_to_abort == NULL) /* paranoia */
 		return FAILED;
-	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 0, 0,
-		(unsigned char *) &cmd_to_abort->Header.LUN.LunAddrBytes[0], 
-		TYPE_MSG);
-	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
+	memcpy(lunaddr, &cmd_to_abort->Header.LUN.LunAddrBytes[0], 8);
+	rc = sendcmd_withirq(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag,
+		0, 0, lunaddr, TYPE_MSG);
 	if (rc == 0)
 		return SUCCESS;
 	return FAILED;
-- 
cgit v0.10.2


From 72f9f1324fc4cd450c92e4600a710231b0445c75 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:09:32 -0500
Subject: cciss: Remove no longer needed sendcmd reject processing code

Now that the cciss SCSI error handling routines operate with interrupts
enabled, we no longer need to maintain the list of command completions that
sendcmd() might inadvertantly scoop up, since now it only runs at driver init
time, and there won't be any other commands for it to scoop up.  So we
can remove that list and the code that adds to it and processes it.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9a9db6d..b22cec9 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2612,51 +2612,6 @@ static unsigned long pollcomplete(int ctlr)
 	return 1;
 }
 
-static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
-{
-	/* We get in here if sendcmd() is polling for completions
-	   and gets some command back that it wasn't expecting --
-	   something other than that which it just sent down.
-	   Ordinarily, that shouldn't happen, but it can happen when
-	   the scsi tape stuff gets into error handling mode, and
-	   starts using sendcmd() to try to abort commands and
-	   reset tape drives.  In that case, sendcmd may pick up
-	   completions of commands that were sent to logical drives
-	   through the block i/o system, or cciss ioctls completing, etc.
-	   In that case, we need to save those completions for later
-	   processing by the interrupt handler.
-	 */
-
-#ifdef CONFIG_CISS_SCSI_TAPE
-	struct sendcmd_reject_list *srl = &hba[ctlr]->scsi_rejects;
-
-	/* If it's not the scsi tape stuff doing error handling, (abort */
-	/* or reset) then we don't expect anything weird. */
-	if (cmd != CCISS_RESET_MSG && cmd != CCISS_ABORT_MSG) {
-#endif
-		printk(KERN_WARNING "cciss cciss%d: SendCmd "
-		       "Invalid command list address returned! (%lx)\n",
-		       ctlr, complete);
-		/* not much we can do. */
-#ifdef CONFIG_CISS_SCSI_TAPE
-		return 1;
-	}
-
-	/* We've sent down an abort or reset, but something else
-	   has completed */
-	if (srl->ncompletions >= (hba[ctlr]->nr_cmds + 2)) {
-		/* Uh oh.  No room to save it for later... */
-		printk(KERN_WARNING "cciss%d: Sendcmd: Invalid command addr, "
-		       "reject list overflow, command lost!\n", ctlr);
-		return 1;
-	}
-	/* Save it for later */
-	srl->complete[srl->ncompletions] = complete;
-	srl->ncompletions++;
-#endif
-	return 0;
-}
-
 /* Send command c to controller h and poll for it to complete.
  * Turns interrupts off on the board.  Used at driver init time
  * and during SCSI error recovery.
@@ -2701,11 +2656,10 @@ resend_cmd1:
 			break;
 		}
 
-		/* If it's not the cmd we're looking for, save it for later */
+		/* Make sure it's the command we're expecting. */
 		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
-			if (add_sendcmd_reject(c->Request.CDB[0],
-				h->ctlr, complete) != 0)
-				BUG(); /* we are hosed if we get here. */
+			printk(KERN_WARNING "cciss%d: Unexpected command "
+				"completion.\n", h->ctlr);
 			continue;
 		}
 
@@ -2770,11 +2724,6 @@ resend_cmd1:
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
 	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	/* if we saved some commands for later, process them now. */
-	if (h->scsi_rejects.ncompletions > 0)
-		do_cciss_intr(0, h);
-#endif
 	return status;
 }
 
@@ -3195,44 +3144,18 @@ startio:
 
 static inline unsigned long get_next_completion(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	/* Any rejects from sendcmd() lying around? Process them first */
-	if (h->scsi_rejects.ncompletions == 0)
-		return h->access.command_completed(h);
-	else {
-		struct sendcmd_reject_list *srl;
-		int n;
-		srl = &h->scsi_rejects;
-		n = --srl->ncompletions;
-		/* printk("cciss%d: processing saved reject\n", h->ctlr); */
-		printk("p");
-		return srl->complete[n];
-	}
-#else
 	return h->access.command_completed(h);
-#endif
 }
 
 static inline int interrupt_pending(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	return (h->access.intr_pending(h)
-		|| (h->scsi_rejects.ncompletions > 0));
-#else
 	return h->access.intr_pending(h);
-#endif
 }
 
 static inline long interrupt_not_for_us(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	return (((h->access.intr_pending(h) == 0) ||
-		 (h->interrupts_enabled == 0))
-		&& (h->scsi_rejects.ncompletions == 0));
-#else
 	return (((h->access.intr_pending(h) == 0) ||
 		 (h->interrupts_enabled == 0)));
-#endif
 }
 
 static irqreturn_t do_cciss_intr(int irq, void *dev_id)
@@ -4054,15 +3977,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		printk(KERN_ERR "cciss: out of memory");
 		goto clean4;
 	}
-#ifdef CONFIG_CISS_SCSI_TAPE
-	hba[i]->scsi_rejects.complete =
-	    kmalloc(sizeof(hba[i]->scsi_rejects.complete[0]) *
-		    (hba[i]->nr_cmds + 5), GFP_KERNEL);
-	if (hba[i]->scsi_rejects.complete == NULL) {
-		printk(KERN_ERR "cciss: out of memory");
-		goto clean4;
-	}
-#endif
 	spin_lock_init(&hba[i]->lock);
 
 	/* Initialize the pdev driver private data.
@@ -4122,9 +4036,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 clean4:
 	kfree(inq_buff);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	kfree(hba[i]->scsi_rejects.complete);
-#endif
 	kfree(hba[i]->cmd_pool_bits);
 	if (hba[i]->cmd_pool)
 		pci_free_consistent(hba[i]->pdev,
@@ -4242,9 +4153,6 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 			    hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
 	kfree(hba[i]->cmd_pool_bits);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	kfree(hba[i]->scsi_rejects.complete);
-#endif
 	/*
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 25cd58e..06a5db2 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -53,14 +53,6 @@ typedef struct _drive_info_struct
 	char rev[REV_LEN + 1];       /* SCSI revision string */
 } drive_info_struct;
 
-#ifdef CONFIG_CISS_SCSI_TAPE
-
-struct sendcmd_reject_list {
-	int ncompletions;
-	unsigned long *complete; /* array of NR_CMDS tags */
-};
-
-#endif
 struct ctlr_info 
 {
 	int	ctlr;
@@ -128,7 +120,6 @@ struct ctlr_info
 	void *scsi_ctlr; /* ptr to structure containing scsi related stuff */
 	/* list of block side commands the scsi error handling sucked up */
 	/* and saved for later processing */
-	struct sendcmd_reject_list scsi_rejects;
 #endif
 	unsigned char alive;
 	struct completion *rescan_wait;
-- 
cgit v0.10.2


From 3969251b80a7b143d01576780073fe0cc9ef6253 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:10:57 -0500
Subject: cciss: decode unit attention in SCSI error handling code

Make SCSI reset error handler decode unit attention ASC
and after a target reset wait for a unit attention that indicates
a reset occurred rather than just for any old unit attention.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index d2eb489..3315268 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1604,16 +1604,25 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 
 		(void) process_sendcmd_error(h, c);
 
-		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
-			break;
+		if (rc != 0)
+			goto retry_tur;
 
-		if (rc == 0 &&
-			c->err_info->CommandStatus == CMD_TARGET_STATUS &&
-			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION &&
-			(c->err_info->SenseInfo[2] == NO_SENSE ||
-			c->err_info->SenseInfo[2] == UNIT_ATTENTION))
+		if (c->err_info->CommandStatus == CMD_SUCCESS)
 			break;
 
+		if (c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION) {
+			if (c->err_info->SenseInfo[2] == NO_SENSE)
+				break;
+			if (c->err_info->SenseInfo[2] == UNIT_ATTENTION) {
+				unsigned char asc;
+				asc = c->err_info->SenseInfo[12];
+				check_for_unit_attention(h, c);
+				if (asc == POWER_OR_RESET)
+					break;
+			}
+		}
+retry_tur:
 		printk(KERN_WARNING "cciss%d: Waiting %d secs "
 			"for device to become ready.\n",
 			h->ctlr, waittime / HZ);
-- 
cgit v0.10.2


From 9df1bb9b516daeece159ab7fb262d01a0359247c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 9 Jun 2009 06:22:57 +0200
Subject: Revert "block: Fix bounce limit setting in DM"

This reverts commit a05c0205ba031c01bba33a21bf0a35920eb64833.

DM doesn't need to access the bounce_pfn directly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9acd0b7..8d33934 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -194,23 +194,6 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
 /**
- * blk_queue_bounce_pfn - set the bounce buffer limit for queue
- * @q: the request queue for the device
- * @pfn: max address
- *
- * Description:
- *    This function is similar to blk_queue_bounce_limit except it
- *    neither changes allocation flags, nor does it set up the ISA DMA
- *    pool. This function should only be used by stacking drivers.
- *    Hardware drivers should use blk_queue_bounce_limit instead.
- */
-void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
-{
-	q->limits.bounce_pfn = pfn;
-}
-EXPORT_SYMBOL(blk_queue_bounce_pfn);
-
-/**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
  * @max_sectors:  max sectors in the usual 512b unit
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ca1604..e9a73bb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 989aa17..5e740a1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,7 +910,6 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
cgit v0.10.2


From 77634f33d4078542cf1087995cced0ffdae25aa2 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 9 Jun 2009 06:23:22 +0200
Subject: block: Add missing bounce_pfn stacking and fix comments

DM no longer needs to set limits explicitly when calling blk_stack_limits.
Let the latter automatically deal with bounce_pfn scaling.

Fix kerneldoc variable names.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d33934..1c4df9b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -445,7 +445,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top)
- * @bdev:  the underlying queue limits (bottom)
+ * @b:  the underlying queue limits (bottom)
  * @offset:  offset to beginning of data within component device
  *
  * Description:
@@ -458,6 +458,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 {
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
@@ -504,7 +505,7 @@ EXPORT_SYMBOL(blk_stack_limits);
 
 /**
  * disk_stack_limits - adjust queue limits for stacked drivers
- * @t:	MD/DM gendisk (top)
+ * @disk:  MD/DM gendisk (top)
  * @bdev:  the underlying block device (bottom)
  * @offset:  offset to beginning of data within component device
  *
-- 
cgit v0.10.2


From f001a70cdc61c01452d42e8b32fd7c7842ef62d5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 9 Jun 2009 14:30:31 +1000
Subject: md/raid5: use conf->raid_disks in preference to mddev->raid_disk

mddev->raid_disks can be changed and any time by a request from
user-space.  It is a suggestion as to what number of raid_disks is
desired.

conf->raid_disks can only be changed by the raid5 module with suitable
locks in place.  It is a statement as to the current number of
raid_disks.

There are two places where the latter should be used, but the former
is used.  This can lead to a crash when reshaping an array.

This patch changes to mddev-> to conf->

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d400ae..75469e6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3288,7 +3288,7 @@ static void unplug_slaves(mddev_t *mddev)
 	int i;
 
 	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
+	for (i = 0; i < conf->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
 			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -4034,7 +4034,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	 * We don't need to check the 'failed' flag as when that gets set,
 	 * recovery aborts.
 	 */
-	for (i=0; i<mddev->raid_disks; i++)
+	for (i = 0; i < conf->raid_disks; i++)
 		if (conf->disks[i].rdev == NULL)
 			still_degraded = 1;
 
-- 
cgit v0.10.2


From a8c906ca3f63d05f0d25490cf82276f73c6fe095 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 9 Jun 2009 14:39:59 +1000
Subject: md/raid5 - avoid deadlocks in get_active_stripe during reshape

md has functionality to 'quiesce' and array so that all pending
IO completed and no new IO starts.  This is used to achieve a
stable state before making internal changes.

Currently this quiescing applies equally to normal IO, resync
IO, and reshape IO.
However there is a problem with applying it to reshape IO.
Reshape can have multiple 'stripe_heads' that must be active together.
If the quiesce come between allocating the first and the last of
such a collection, then we deadlock, as the last will not be allocated
until the quiesce is lifted, the quiesce will not be lifted until the
first (which has been allocated) gets used, and that first cannot be
used until the last is allocated.

It is not necessary to inhibit reshape IO when a quiesce is
requested.  Those places in the code that require a full quiesce will
ensure the reshape thread is not running at all.

So allow reshape requests to get access to new stripe_heads without
being blocked by a 'quiesce'.

This only affects in-place reshapes (i.e. where the array does not
grow or shrink) and these are only newly supported.  So this patch is
not needed in earlier kernels.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 75469e6..59f2ec0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -362,7 +362,7 @@ static void raid5_unplug_device(struct request_queue *q);
 
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
-		  int previous, int noblock)
+		  int previous, int noblock, int noquiesce)
 {
 	struct stripe_head *sh;
 
@@ -372,7 +372,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 
 	do {
 		wait_event_lock_irq(conf->wait_for_stripe,
-				    conf->quiesce == 0,
+				    conf->quiesce == 0 || noquiesce,
 				    conf->device_lock, /* nothing */);
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
@@ -2671,7 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			sector_t bn = compute_blocknr(sh, i, 1);
 			sector_t s = raid5_compute_sector(conf, bn, 0,
 							  &dd_idx, NULL);
-			sh2 = get_active_stripe(conf, s, 0, 1);
+			sh2 = get_active_stripe(conf, s, 0, 1, 1);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
 				 * have been requested.  When later blocks
@@ -2944,7 +2944,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
 		struct stripe_head *sh2
-			= get_active_stripe(conf, sh->sector, 1, 1);
+			= get_active_stripe(conf, sh->sector, 1, 1, 1);
 		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
 			/* sh cannot be written until sh2 has been read.
 			 * so arrange for sh to be delayed a little
@@ -3189,7 +3189,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		struct stripe_head *sh2
-			= get_active_stripe(conf, sh->sector, 1, 1);
+			= get_active_stripe(conf, sh->sector, 1, 1, 1);
 		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
 			/* sh cannot be written until sh2 has been read.
 			 * so arrange for sh to be delayed a little
@@ -3675,7 +3675,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			(unsigned long long)logical_sector);
 
 		sh = get_active_stripe(conf, new_sector, previous,
-				       (bi->bi_rw&RWA_MASK));
+				       (bi->bi_rw&RWA_MASK), 0);
 		if (sh) {
 			if (unlikely(previous)) {
 				/* expansion might have moved on while waiting for a
@@ -3873,7 +3873,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
 		int skipped = 0;
-		sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
+		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
 		/* If any of this stripe is beyond the end of the old
@@ -3922,7 +3922,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
-		sh = get_active_stripe(conf, first_sector, 1, 0);
+		sh = get_active_stripe(conf, first_sector, 1, 0, 1);
 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
@@ -4022,9 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 
-	sh = get_active_stripe(conf, sector_nr, 0, 1);
+	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, 0, 0);
+		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -4086,7 +4086,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;
 
-		sh = get_active_stripe(conf, sector, 0, 1);
+		sh = get_active_stripe(conf, sector, 0, 1, 0);
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-- 
cgit v0.10.2


From 0e6e0271a210817e202c8a4bfffbde3e3c0616d1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 9 Jun 2009 16:32:22 +1000
Subject: md/raid5: fix bug in reshape code when chunk_size decreases.

Now that we support changing the chunksize, we calculate
"reshape_sectors" to be the max of number of sectors in old
and new chunk size.
However there is one please where we still use 'chunksize'
rather than 'reshape_sectors'.
This causes a reshape that reduces the size of chunks to freeze.

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59f2ec0..bb37fb1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3916,7 +3916,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
 				     1, &dd_idx, NULL);
 	last_sector =
-		raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
 					    *(new_data_disks) - 1),
 				     1, &dd_idx, NULL);
 	if (last_sector >= mddev->dev_sectors)
-- 
cgit v0.10.2


From 29150078d7a1758df8c7a6cd2ec066ac65e1fab9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 9 Jun 2009 10:54:18 +0200
Subject: amd-iommu: remove BUS_NOTIFY_BOUND_DRIVER handling

Handling this event causes device assignment in KVM to fail because the
device gets re-attached as soon as the pci-stub registers as the driver
for the device.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8510e90..8187260 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1145,17 +1145,6 @@ static int device_change_notifier(struct notifier_block *nb,
 			  "to a non-dma-ops domain\n", dev_name(dev));
 
 	switch (action) {
-	case BUS_NOTIFY_BOUND_DRIVER:
-		if (domain)
-			goto out;
-		dma_domain = find_protection_domain(devid);
-		if (!dma_domain)
-			dma_domain = iommu->default_dom;
-		attach_device(iommu, &dma_domain->domain, devid);
-		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
-			    "%d for device %s\n",
-			    dma_domain->domain.id, dev_name(dev));
-		break;
 	case BUS_NOTIFY_UNBOUND_DRIVER:
 		if (!domain)
 			goto out;
-- 
cgit v0.10.2


From 71ff3bca2f70264effe8cbbdd5bc10cf6be5f2f0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 13:47:33 -0700
Subject: amd-iommu: detach device explicitly before attaching it to a new
 domain

This fixes a bug with a device that could not be assigned to a KVM guest
because it is still assigned to a dma_ops protection domain.

[chrisw: simply remove WARN_ON(), will always fire since dev->driver
will be pci-sub]

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8187260..772e910 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2073,7 +2073,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
 	old_domain = domain_for_device(devid);
 	if (old_domain)
-		return -EBUSY;
+		detach_device(old_domain, devid);
 
 	attach_device(iommu, domain, devid);
 
-- 
cgit v0.10.2


From e9a22a13c71986851e931bdfa054f68839ff8576 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 9 Jun 2009 12:00:37 +0200
Subject: amd-iommu: remove unnecessary "AMD IOMMU: " prefix

That prefix is already included in the DUMP_printk macro. So there is no
need to repeat it in the format string.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 772e910..1c60554 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1266,9 +1266,8 @@ static int get_device_resources(struct device *dev,
 			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		attach_device(*iommu, *domain, *bdf);
-		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
-				"%d for device %s\n",
-				(*domain)->id, dev_name(dev));
+		DUMP_printk("Using protection domain %d for device %s\n",
+			    (*domain)->id, dev_name(dev));
 	}
 
 	if (domain_for_device(_bdf) == NULL)
-- 
cgit v0.10.2


From fdd7b4c3302c93f6833e338903ea77245eb510b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 9 Jun 2009 04:01:02 -0700
Subject: r8169: fix crash when large packets are received

Michael Tokarev reported receiving a large packet could crash
a machine with RTL8169 NIC.
( original thread at http://lkml.org/lkml/2009/6/8/192 )

Problem is this driver tells that NIC frames up to 16383 bytes
can be received but provides skb to rx ring allocated with
smaller sizes (1536 bytes in case standard 1500 bytes MTU is used)

When a frame larger than what was allocated by driver is received,
dma transfert can occurs past the end of buffer and corrupt
kernel memory.

Fix is to tell to NIC what is the maximum size a frame can be.

This bug is very old, (before git introduction, linux-2.6.10), and
should be backported to stable versions.

Reported-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 8247a94..3b19e0c 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -66,7 +66,6 @@ static const int multicast_filter_limit = 32;
 #define RX_DMA_BURST	6	/* Maximum PCI burst, '6' is 1024 */
 #define TX_DMA_BURST	6	/* Maximum PCI burst, '6' is 1024 */
 #define EarlyTxThld	0x3F	/* 0x3F means NO early transmit */
-#define RxPacketMaxSize	0x3FE8	/* 16K - 1 - ETH_HLEN - VLAN - CRC... */
 #define SafeMtu		0x1c20	/* ... actually life sucks beyond ~7k */
 #define InterFrameGap	0x03	/* 3 means InterFrameGap = the shortest one */
 
@@ -2357,10 +2356,10 @@ static u16 rtl_rw_cpluscmd(void __iomem *ioaddr)
 	return cmd;
 }
 
-static void rtl_set_rx_max_size(void __iomem *ioaddr)
+static void rtl_set_rx_max_size(void __iomem *ioaddr, unsigned int rx_buf_sz)
 {
 	/* Low hurts. Let's disable the filtering. */
-	RTL_W16(RxMaxSize, 16383);
+	RTL_W16(RxMaxSize, rx_buf_sz);
 }
 
 static void rtl8169_set_magic_reg(void __iomem *ioaddr, unsigned mac_version)
@@ -2407,7 +2406,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr);
+	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
 
 	if ((tp->mac_version == RTL_GIGA_MAC_VER_01) ||
 	    (tp->mac_version == RTL_GIGA_MAC_VER_02) ||
@@ -2668,7 +2667,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr);
+	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
 
 	tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1;
 
@@ -2846,7 +2845,7 @@ static void rtl_hw_start_8101(struct net_device *dev)
 
 	RTL_W8(EarlyTxThres, EarlyTxThld);
 
-	rtl_set_rx_max_size(ioaddr);
+	rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz);
 
 	tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW;
 
-- 
cgit v0.10.2


From 52ea3a56a3268bc2a5a7c75e98c81463004e38ef Mon Sep 17 00:00:00 2001
From: Minoru Usui <usui@mxm.nes.nec.co.jp>
Date: Tue, 9 Jun 2009 04:03:09 -0700
Subject: cls_cgroup: Fix oops when user send improperly 'tc filter add'
 request

I found a bug in cls_cgroup_change() in cls_cgroup.c.
cls_cgroup_change() expected tca[TCA_OPTIONS] was set from user space properly,
but tc in iproute2-2.6.29-1 (which I used) didn't set it.

In the current source code of tc in git, it set tca[TCA_OPTIONS].

  git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git

If we always use a newest iproute2 in git when we use cls_cgroup,
we don't face this oops probably.
But I think, kernel shouldn't panic regardless of use program's behaviour.

Signed-off-by: Minoru Usui <usui@mxm.nes.nec.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index cc29b44..e5becb9 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -167,6 +167,9 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
 	struct tcf_exts e;
 	int err;
 
+	if (!tca[TCA_OPTIONS])
+		return -EINVAL;
+
 	if (head == NULL) {
 		if (!handle)
 			return -EINVAL;
-- 
cgit v0.10.2


From 1d589bb16b825b3a7b4edd34d997f1f1f953033d Mon Sep 17 00:00:00 2001
From: john cooper <john.cooper@redhat.com>
Date: Tue, 9 Jun 2009 14:41:40 +0200
Subject: Add serial number support for virtio_blk, V4a

This patch extracts the opaque data from pci i/o
region 0 via the added VIRTIO_BLK_F_IDENTIFY
field.  By convention this data takes the form of
that returned by an ATA IDENTIFY DEVICE command,
however the driver (except for structure size)
makes no interpretation of the data.  The structure
data is copied wholesale to userspace via a
HDIO_GET_IDENTITY ioctl command (eg: hdparm -i <dev>).

Signed-off-by: john cooper <john.cooper@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c4845b1..c0facaa 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -171,11 +171,43 @@ static void do_virtblk_request(struct request_queue *q)
 		vblk->vq->vq_ops->kick(vblk->vq);
 }
 
+/* return ATA identify data
+ */
+static int virtblk_identify(struct gendisk *disk, void *argp)
+{
+	struct virtio_blk *vblk = disk->private_data;
+	void *opaque;
+	int err = -ENOMEM;
+
+	opaque = kmalloc(VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+	if (!opaque)
+		goto out;
+
+	err = virtio_config_buf(vblk->vdev, VIRTIO_BLK_F_IDENTIFY,
+		offsetof(struct virtio_blk_config, identify), opaque,
+		VIRTIO_BLK_ID_BYTES);
+
+	if (err)
+		goto out_kfree;
+
+	if (copy_to_user(argp, opaque, VIRTIO_BLK_ID_BYTES))
+		err = -EFAULT;
+
+out_kfree:
+	kfree(opaque);
+out:
+	return err;
+}
+
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct virtio_blk *vblk = disk->private_data;
+	void __user *argp = (void __user *)data;
+
+	if (cmd == HDIO_GET_IDENTITY)
+		return virtblk_identify(disk, argp);
 
 	/*
 	 * Only allow the generic SCSI ioctls if the host can support it.
@@ -183,8 +215,7 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
 		return -ENOIOCTLCMD;
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
-			      (void __user *)data);
+	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
 }
 
 /* We provide getgeo only to please some old bootloader/partitioning tools */
@@ -390,7 +421,7 @@ static struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
 	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_SCSI,
+	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 4dbcbc1..be7d255 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -16,6 +16,9 @@
 #define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
+#define VIRTIO_BLK_F_IDENTIFY	8	/* ATA IDENTIFY supported */
+
+#define VIRTIO_BLK_ID_BYTES	(sizeof(__u16[256]))	/* IDENTIFY DATA */
 
 struct virtio_blk_config
 {
@@ -33,6 +36,7 @@ struct virtio_blk_config
 	} geometry;
 	/* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
 	__u32 blk_size;
+	__u8 identify[VIRTIO_BLK_ID_BYTES];
 } __attribute__((packed));
 
 /* These two define direction. */
-- 
cgit v0.10.2


From 0281b5dc0350cbf6dd21ed558a33cccce77abc02 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 6 Jun 2009 14:50:36 -0700
Subject: cpumask: introduce zalloc_cpumask_var

So can get cpumask_var with cpumask_clear

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9f31538..c5ac87c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1022,6 +1022,8 @@ typedef struct cpumask *cpumask_var_t;
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
+bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
+bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
@@ -1040,6 +1042,19 @@ static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
 	return true;
 }
 
+static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	cpumask_clear(*mask);
+	return true;
+}
+
+static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
+					  int node)
+{
+	cpumask_clear(*mask);
+	return true;
+}
+
 static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 {
 }
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 1f71b97..eb23aaa 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -119,6 +119,12 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 }
 EXPORT_SYMBOL(alloc_cpumask_var_node);
 
+bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
+{
+	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
+}
+EXPORT_SYMBOL(zalloc_cpumask_var_node);
+
 /**
  * alloc_cpumask_var - allocate a struct cpumask
  * @mask: pointer to cpumask_var_t where the cpumask is returned
@@ -135,6 +141,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 }
 EXPORT_SYMBOL(alloc_cpumask_var);
 
+bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(zalloc_cpumask_var);
+
 /**
  * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
  * @mask: pointer to cpumask_var_t where the cpumask is returned
-- 
cgit v0.10.2


From eaa958402ea40851097d051f52ba1bb7a885efe9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 6 Jun 2009 14:51:36 -0700
Subject: cpumask: alloc zeroed cpumask for static cpumask_var_ts

These are defined as static cpumask_var_t so if MAXSMP is not used,
they are cleared already.  Avoid surprises when MAXSMP is enabled.

Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 54b6de2..752e8c6 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -550,7 +550,7 @@ static int __init acpi_cpufreq_early_init(void)
 		return -ENOMEM;
 	}
 	for_each_possible_cpu(i) {
-		if (!alloc_cpumask_var_node(
+		if (!zalloc_cpumask_var_node(
 			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
 			GFP_KERNEL, cpu_to_node(i))) {
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index a8363e5..d47c775 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -322,7 +322,7 @@ static int powernow_acpi_init(void)
 		goto err0;
 	}
 
-	if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
+	if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
 								GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 35dc8fb..cf52215 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -887,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
 
-	if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
 		printk(KERN_ERR PFX
 				"unable to alloc powernow_k8_data cpumask\n");
 		ret_val = -ENOMEM;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc..55c831e 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
 
 	if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
 		return -ENOMEM;
-	if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
+	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
 		free_cpumask_var(saved_mask);
 		return -ENOMEM;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 6fb0b35..09dd1d4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -1163,7 +1163,7 @@ static __init int mce_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
 
 	err = mce_init_banks();
 	if (err)
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c337..8c7b03b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -832,7 +832,7 @@ static int __init uv_bau_init(void)
 		return 0;
 
 	for_each_possible_cpu(cur_cpu)
-		alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
+		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
 	uv_bau_retry_limit = 1;
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 45ad328..23f0fb8 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -844,7 +844,7 @@ static int acpi_processor_add(struct acpi_device *device)
 	if (!pr)
 		return -ENOMEM;
 
-	if (!alloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
 		kfree(pr);
 		return -ENOMEM;
 	}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 47d2ad0..6e2ec0b 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -808,7 +808,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 		ret = -ENOMEM;
 		goto nomem_out;
 	}
-	if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
 		free_cpumask_var(policy->cpus);
 		kfree(policy);
 		ret = -ENOMEM;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89..344712a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 		vec->count = 0;
 		if (bootmem)
 			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
 			goto cleanup;
 	}
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f2c66f8..9bf0d2a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)
 	unsigned int i;
 
 	for_each_possible_cpu(i)
-		alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
 					GFP_KERNEL, cpu_to_node(i));
 }
 #endif /* CONFIG_SMP */
diff --git a/kernel/smp.c b/kernel/smp.c
index 858baac..ad63d85 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return NOTIFY_BAD;
 		break;
-- 
cgit v0.10.2


From 8437a617708d014d6f220df201a24960e00d57b1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 6 Jun 2009 14:52:35 -0700
Subject: kvm: fix kvm reboot crash when MAXSMP is used

one system was found there is crash during reboot then kvm/MAXSMP
Sending all processes the KILL signal...                              done
Please stand by while rebooting the system...
[ 1721.856538] md: stopping all md devices.
[ 1722.852139] kvm: exiting hardware virtualization
[ 1722.854601] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1722.872219] IP: [<ffffffff8102c6b6>] hardware_disable+0x4c/0xb4
[ 1722.877955] PGD 0
[ 1722.880042] Oops: 0000 [#1] SMP
[ 1722.892548] last sysfs file: /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/host0/target0:2:0/0:2:0:0/vendor
[ 1722.900977] CPU 9
[ 1722.912606] Modules linked in:
[ 1722.914226] Pid: 0, comm: swapper Not tainted 2.6.30-rc7-tip-01843-g2305324-dirty #299 ...
[ 1722.932589] RIP: 0010:[<ffffffff8102c6b6>]  [<ffffffff8102c6b6>] hardware_disable+0x4c/0xb4
[ 1722.942709] RSP: 0018:ffffc900010b6ed8  EFLAGS: 00010046
[ 1722.956121] RAX: 0000000000000000 RBX: ffffc9000e253140 RCX: 0000000000000009
[ 1722.972202] RDX: 000000000000b020 RSI: ffffc900010c3220 RDI: ffffffffffffd790
[ 1722.977399] RBP: ffffc900010b6f08 R08: 0000000000000000 R09: 0000000000000000
[ 1722.995149] R10: 00000000000004b8 R11: 966912b6c78fddbd R12: 0000000000000009
[ 1723.011551] R13: 000000000000b020 R14: 0000000000000009 R15: 0000000000000000
[ 1723.019898] FS:  0000000000000000(0000) GS:ffffc900010b3000(0000) knlGS:0000000000000000
[ 1723.034389] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[ 1723.041164] CR2: 0000000000000000 CR3: 0000000001001000 CR4: 00000000000006e0
[ 1723.056192] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1723.072546] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 1723.080562] Process swapper (pid: 0, threadinfo ffff88107e464000, task ffff88047e5a2550)
[ 1723.096144] Stack:
[ 1723.099071]  0000000000000046 ffffc9000e253168 966912b6c78fddbd ffffc9000e253140
[ 1723.115471]  ffff880c7d4304d0 ffffc9000e253168 ffffc900010b6f28 ffffffff81011022
[ 1723.132428]  ffffc900010b6f48 966912b6c78fddbd ffffc900010b6f48 ffffffff8100b83b
[ 1723.141973] Call Trace:
[ 1723.142981]  <IRQ> <0> [<ffffffff81011022>] kvm_arch_hardware_disable+0x26/0x3c
[ 1723.158153]  [<ffffffff8100b83b>] hardware_disable+0x3f/0x55
[ 1723.172168]  [<ffffffff810b95f6>] generic_smp_call_function_interrupt+0x76/0x13c
[ 1723.178836]  [<ffffffff8104cbea>] smp_call_function_interrupt+0x3a/0x5e
[ 1723.194689]  [<ffffffff81035bf3>] call_function_interrupt+0x13/0x20
[ 1723.199750]  <EOI> <0> [<ffffffff814ad3b4>] ? acpi_idle_enter_c1+0xd3/0xf4
[ 1723.217508]  [<ffffffff814ad3ae>] ? acpi_idle_enter_c1+0xcd/0xf4
[ 1723.232172]  [<ffffffff814ad4bc>] ? acpi_idle_enter_bm+0xe7/0x2ce
[ 1723.235141]  [<ffffffff81a8d93f>] ? __atomic_notifier_call_chain+0x0/0xac
[ 1723.253381]  [<ffffffff818c3dff>] ? menu_select+0x58/0xd2
[ 1723.258179]  [<ffffffff818c2c9d>] ? cpuidle_idle_call+0xa4/0xf3
[ 1723.272828]  [<ffffffff81034085>] ? cpu_idle+0xb8/0x101
[ 1723.277085]  [<ffffffff81a80163>] ? start_secondary+0x1bc/0x1d7
[ 1723.293708] Code: b0 00 00 65 48 8b 04 25 28 00 00 00 48 89 45 e0 31 c0 48 8b 04 cd 30 ee 27 82 49 89 cc 49 89 d5 48 8b 04 10 48 8d b8 90 d7 ff ff <48> 8b 87 70 28 00 00 48 8d 98 90 d7 ff ff eb 16 e8 e9 fe ff ff
[ 1723.335524] RIP  [<ffffffff8102c6b6>] hardware_disable+0x4c/0xb4
[ 1723.342076]  RSP <ffffc900010b6ed8>
[ 1723.352021] CR2: 0000000000000000
[ 1723.354348] ---[ end trace e2aec53dae150aa1 ]---

it turns out that we need clear cpus_hardware_enabled in that case.

Reported-and-tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4293528..4d0dd39 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2301,7 +2301,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
 	bad_pfn = page_to_pfn(bad_page);
 
-	if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
 	}
-- 
cgit v0.10.2


From c1d4c41f2fdfe66dea957b76d005affba3e56b26 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 9 Jun 2009 15:17:37 +0200
Subject: bsg: setting rq->bio to NULL

Due to commit 1cd96c242a829d52f7a5ae98f554ca9775429685 ("block: WARN
in __blk_put_request() for potential bio leak"), BSG SMP requests get
the false warnings:

WARNING: at block/blk-core.c:1068 __blk_put_request+0x52/0xc0()

This sets rq->bio to NULL to avoid that false warnings.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/bsg.c b/block/bsg.c
index 206060e..dd81be4 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -315,6 +315,7 @@ out:
 	blk_put_request(rq);
 	if (next_rq) {
 		blk_rq_unmap_user(next_rq->bio);
+		next_rq->bio = NULL;
 		blk_put_request(next_rq);
 	}
 	return ERR_PTR(ret);
@@ -448,6 +449,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 		hdr->dout_resid = rq->data_len;
 		hdr->din_resid = rq->next_rq->data_len;
 		blk_rq_unmap_user(bidi_bio);
+		rq->next_rq->bio = NULL;
 		blk_put_request(rq->next_rq);
 	} else if (rq_data_dir(rq) == READ)
 		hdr->din_resid = rq->data_len;
@@ -466,6 +468,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	blk_rq_unmap_user(bio);
 	if (rq->cmd != rq->__cmd)
 		kfree(rq->cmd);
+	rq->bio = NULL;
 	blk_put_request(rq);
 
 	return ret;
-- 
cgit v0.10.2


From 42937e81a82b6bbc51a309c83da140b3a7ca5945 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 8 Jun 2009 15:55:09 +0200
Subject: x86: Detect use of extended APIC ID for AMD CPUs

Booting a 32-bit kernel on Magny-Cours results in the following panic:

  ...
  Using APIC driver default
  ...
  Overriding APIC driver with bigsmp
  ...
  Getting VERSION: 80050010
  Getting VERSION: 80050010
  Getting ID: 10000000
  Getting ID: ef000000
  Getting LVT0: 700
  Getting LVT1: 10000
  Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (16 vs 0)
  Pid: 1, comm: swapper Not tainted 2.6.30-rcX #2
  Call Trace:
   [<c05194da>] ? panic+0x38/0xd3
   [<c0743102>] ? native_smp_prepare_cpus+0x259/0x31f
   [<c073b19d>] ? kernel_init+0x3e/0x141
   [<c073b15f>] ? kernel_init+0x0/0x141
   [<c020325f>] ? kernel_thread_helper+0x7/0x10

The reason is that default_get_apic_id handled extension of local APIC
ID field just in case of XAPIC.

Thus for this AMD CPU, default_get_apic_id() returns 0 and
bigsmp_get_apic_id() returns 16 which leads to the respective kernel
panic.

This patch introduces a Linux specific feature flag to indicate
support for extended APIC id (8 bits instead of 4 bits width) and sets
the flag on AMD CPUs if applicable.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: <stable@kernel.org>
LKML-Reference: <20090608135509.GA12431@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83..9b2c049 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -410,7 +410,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
 {
 	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 
-	if (APIC_XAPIC(ver))
+	if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
 		return (x >> 24) & 0xFF;
 	else
 		return (x >> 24) & 0x0F;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c..78dee4f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -94,6 +94,7 @@
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC	(3*32+24) /* TSC does not stop in C states */
 #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID	(3*32+26) /* has extended APICID (8 bits) */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459..0802e15 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
+#include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 		    (c->x86_model == 8 && c->x86_mask >= 8))
 			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+	/* check CPU config space for extended APIC ID */
+	if (c->x86 >= 0xf) {
+		unsigned int val;
+		val = read_pci_config(0, 24, 0, 0x68);
+		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+	}
+#endif
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-- 
cgit v0.10.2


From 0eab928221bac8895a0b494a16a8810002bd8645 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 9 Jun 2009 09:54:40 -0400
Subject: ext4: Don't treat a truncation of a zero-length file as
 replace-via-truncate

If a non-existent file is opened via O_WRONLY|O_CREAT|O_TRUNC, there's
no need to treat this as a true file truncation, so we shouldn't
activate the replace-via-truncate hueristic.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c10d34..875db94 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4127,7 +4127,8 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+	if (ei->i_disksize && inode->i_size == 0 &&
+	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-- 
cgit v0.10.2


From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Tue, 9 Jun 2009 21:15:53 +0800
Subject: perf_counter, x86: Correct some event and umask values for Intel
 processors

Correct some event and UMASK values according to Intel SDM,
in the Nehalem and Atom tables.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 56001fe..40978aa 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
 		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
 	},
 	[ C(OP_WRITE) ] = {
@@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
@@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
 	},
 	[ C(OP_PREFETCH) ] = {
-- 
cgit v0.10.2


From 0b8c3d5ab000c22889af7f9409799a6cdc31a2b2 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 9 Jun 2009 10:40:50 -0400
Subject: x86: Clear TS in irq_ts_save() when in an atomic section

The dynamic FPU context allocation changes caused the padlock driver
to generate the below warning. Fix it by masking TS when doing padlock
encryption operations in an atomic section.

This solves:

BUG: sleeping function called from invalid context at mm/slub.c:1602
in_atomic(): 1, irqs_disabled(): 0, pid: 82, name: cryptomgr_test
Pid: 82, comm: cryptomgr_test Not tainted 2.6.29.4-168.test7.fc11.x86_64 #1
Call Trace:
[<ffffffff8103ff16>] __might_sleep+0x10b/0x110
[<ffffffff810cd3b2>] kmem_cache_alloc+0x37/0xf1
[<ffffffff81018505>] init_fpu+0x49/0x8a
[<ffffffff81012a83>] math_state_restore+0x3e/0xbc
[<ffffffff813ac6d0>] do_device_not_available+0x9/0xb
[<ffffffff810123ab>] device_not_available+0x1b/0x20
[<ffffffffa001c066>] ? aes_crypt+0x66/0x74 [padlock_aes]
[<ffffffff8119a51a>] ? blkcipher_walk_next+0x257/0x2e0
[<ffffffff8119a731>] ? blkcipher_walk_first+0x18e/0x19d
[<ffffffffa001c1fe>] aes_encrypt+0x9d/0xe5 [padlock_aes]
[<ffffffffa0027253>] crypt+0x6b/0x114 [xts]
[<ffffffffa001c161>] ? aes_encrypt+0x0/0xe5 [padlock_aes]
[<ffffffffa001c161>] ? aes_encrypt+0x0/0xe5 [padlock_aes]
[<ffffffffa0027390>] encrypt+0x49/0x4b [xts]
[<ffffffff81199acc>] async_encrypt+0x3c/0x3e
[<ffffffff8119dafc>] test_skcipher+0x1da/0x658
[<ffffffff811979c3>] ? crypto_spawn_tfm+0x8e/0xb1
[<ffffffff8119672d>] ? __crypto_alloc_tfm+0x11b/0x15f
[<ffffffff811979c3>] ? crypto_spawn_tfm+0x8e/0xb1
[<ffffffff81199dbe>] ? skcipher_geniv_init+0x2b/0x47
[<ffffffff8119a905>] ? async_chainiv_init+0x5c/0x61
[<ffffffff8119dfdd>] alg_test_skcipher+0x63/0x9b
[<ffffffff8119e1bc>] alg_test+0x12d/0x175
[<ffffffff8119c488>] cryptomgr_test+0x38/0x54
[<ffffffff8119c450>] ? cryptomgr_test+0x0/0x54
[<ffffffff8105c6c9>] kthread+0x4d/0x78
[<ffffffff8101264a>] child_rip+0xa/0x20
[<ffffffff81011f67>] ? restore_args+0x0/0x30
[<ffffffff8105c67c>] ? kthread+0x0/0x78
[<ffffffff81012640>] ? child_rip+0x0/0x20

Signed-off-by: Chuck Ebbert <cebbert@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20090609104050.50158cfe@dhcp-100-2-144.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e51..4aab52f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -305,18 +305,18 @@ static inline void kernel_fpu_end(void)
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
  * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
  * should use them only in the context of irq_ts_save/restore()
  */
 static inline int irq_ts_save(void)
 {
 	/*
-	 * If we are in process context, we are ok to take a spurious DNA fault.
-	 * Otherwise, doing clts() in process context require pre-emption to
-	 * be disabled or some heavy lifting like kernel_fpu_begin()
+	 * If in process context and not atomic, we can take a spurious DNA fault.
+	 * Otherwise, doing clts() in process context requires disabling preemption
+	 * or some heavy lifting like kernel_fpu_begin()
 	 */
-	if (!in_interrupt())
+	if (!in_atomic())
 		return 0;
 
 	if (read_cr0() & X86_CR0_TS) {
-- 
cgit v0.10.2


From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 5 Jun 2009 14:11:30 -0400
Subject: ring-buffer: fix ret in rb_add_time_stamp

The update of ret got mistakenly added to the if statement of
rb_try_to_discard. The variable ret should be 1 on commit and zero
otherwise.

[ Impact: fix compiler warning and real bug ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 22878b0..2e642b2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			/* Darn, this is just wasted space */
 			event->time_delta = 0;
 			event->array[0] = 0;
-			ret = 0;
 		}
+		ret = 0;
 	}
 
 	*delta = 0;
-- 
cgit v0.10.2


From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Jun 2009 13:43:05 +0800
Subject: tracing/events: convert block trace points to TRACE_EVENT()

TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions
  ...

Cons:

  - no dev_t info for the output of plug, unplug_timer and unplug_io events.
    no dev_t info for getrq and sleeprq events if bio == NULL.
    no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.

    This is mainly because we can't get the deivce from a request queue.
    But this may change in the future.

  - A packet command is converted to a string in TP_assign, not TP_print.
    While blktrace do the convertion just before output.

    Since pc requests should be rather rare, this is not a big issue.

  - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT
    has a unique format, which means we have some unused data in a trace entry.

    The overhead is minimized by using __dynamic_array() instead of __array().

I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing:

      dd                   dd + ioctl blktrace       dd + TRACE_EVENT (splice)
1     7.36s, 42.7 MB/s     7.50s, 42.0 MB/s          7.41s, 42.5 MB/s
2     7.43s, 42.3 MB/s     7.48s, 42.1 MB/s          7.43s, 42.4 MB/s
3     7.38s, 42.6 MB/s     7.45s, 42.2 MB/s          7.41s, 42.5 MB/s

So the overhead of tracing is very small, and no regression when using
those trace events vs blktrace.

And the binary output of TRACE_EVENT is much smaller than blktrace:

 # ls -l -h
 -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0
 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1
 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out

Following are some comparisons between TRACE_EVENT and blktrace:

plug:
  kjournald-480   [000]   303.084981: block_plug: [kjournald]
  kjournald-480   [000]   303.084981:   8,0    P   N [kjournald]

unplug_io:
  kblockd/0-118   [000]   300.052973: block_unplug_io: [kblockd/0] 1
  kblockd/0-118   [000]   300.052974:   8,0    U   N [kblockd/0] 1

remap:
  kjournald-480   [000]   303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384
  kjournald-480   [000]   303.085043:   8,0    A   W 102736992 + 8 <- (8,8) 33384

bio_backmerge:
  kjournald-480   [000]   303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
  kjournald-480   [000]   303.085086:   8,0    M   W 102737032 + 8 [kjournald]

getrq:
  kjournald-480   [000]   303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084975:   8,0    G   W 102736984 + 8 [kjournald]

  bash-2066  [001]  1072.953770:   8,0    G   N [bash]
  bash-2066  [001]  1072.953773: block_getrq: 0,0 N 0 + 0 [bash]

rq_complete:
  konsole-2065  [001]   300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
  konsole-2065  [001]   300.053191:   8,0    C   W 103669040 + 16 [0]

  ksoftirqd/1-7   [001]  1072.953811:   8,0    C   N (5a 00 08 00 00 00 00 00 24 00) [0]
  ksoftirqd/1-7   [001]  1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]

rq_insert:
  kjournald-480   [000]   303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084986:   8,0    I   W 102736984 + 8 [kjournald]

Changelog from v2 -> v3:

- use the newly introduced __dynamic_array().

Changelog from v1 -> v2:

- use __string() instead of __array() to minimize the memory required
  to store hex dump of rq->cmd().

- support large pc requests.

- add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT.

- some cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index 1306de9..9475bf9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,22 +28,14 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>
 
 #include "blk.h"
 
-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
 
diff --git a/block/elevator.c b/block/elevator.c
index 7073a90..e220f0c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,17 +33,16 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
+#include <trace/events/block.h>
+
 #include "blk.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
-DEFINE_TRACE(block_rq_abort);
-
 /*
  * Merge hash stuff.
  */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e2ee4a7..3fd8b1e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -53,8 +54,6 @@ struct dm_target_io {
 	union map_info info;
 };
 
-DEFINE_TRACE(block_bio_complete);
-
 /*
  * For request-based dm.
  * One of these is allocated per request.
diff --git a/fs/bio.c b/fs/bio.c
index 9871164..740699c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 82b4636..c7ec31d 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
+#ifdef CONFIG_EVENT_TRACING
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+	return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING */
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644
index 5b12efa..0000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-	TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-	      TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from),
-	      TP_ARGS(q, bio, dev, from));
-
-#endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644
index 0000000..a99d1e5
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,483 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,         comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,		comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors	   = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned,	nr_sector	)
+		__field( int,		error		)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+        ),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			      bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+        ),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			    bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio,
+		 unsigned int new_sector),
+
+	TP_ARGS(q, bio, new_sector),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev				)
+		__field( sector_t,	sector				)
+		__field( sector_t,	new_sector			)
+		__array( char,		rwbs,		6		)
+		__array( char,		comm,		TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->new_sector	= new_sector;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu / %llu [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->new_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, bio, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  __entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 06b8585..844164d 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55..7bd6a98 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+		if (i == end && end != len - 1) {
+			sprintf(buf, " ..");
+			break;
+		}
+	}
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+	int i = 0;
+
+	if (rw & WRITE)
+		rwbs[i++] = 'W';
+	else if (rw & 1 << BIO_RW_DISCARD)
+		rwbs[i++] = 'D';
+	else if (bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (rw & 1 << BIO_RW_AHEAD)
+		rwbs[i++] = 'A';
+	if (rw & 1 << BIO_RW_BARRIER)
+		rwbs[i++] = 'B';
+	if (rw & 1 << BIO_RW_SYNCIO)
+		rwbs[i++] = 'S';
+	if (rw & 1 << BIO_RW_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+	int rw = rq->cmd_flags & 0x03;
+	int bytes;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq))
+		bytes = rq->data_len;
+	else
+		bytes = rq->hard_nr_sectors << 9;
+
+	blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272..65f5e17 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,16 +14,15 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/block.h>
+
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
 static mempool_t *page_pool, *isa_page_pool;
 
-DEFINE_TRACE(block_bio_bounce);
-
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-- 
cgit v0.10.2


From 6556d1df88fe68f9836beeb43342a336691cb67c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 9 Jun 2009 14:04:26 -0400
Subject: tracing: fix the block trace points print size

The sector field is either u64 or unsigned long depending on
the arch. This patch casts the sector to unsigned long long to
prevent the printf warnings.

[ Impact: remove compile warnings ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a99d1e5..53effd4 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -37,7 +37,8 @@ TRACE_EVENT(block_rq_abort,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 
 TRACE_EVENT(block_rq_insert,
@@ -71,7 +72,8 @@ TRACE_EVENT(block_rq_insert,
 	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __entry->bytes, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_rq_issue,
@@ -105,7 +107,8 @@ TRACE_EVENT(block_rq_issue,
 	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __entry->bytes, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_rq_requeue,
@@ -137,7 +140,8 @@ TRACE_EVENT(block_rq_requeue,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 
 TRACE_EVENT(block_rq_complete,
@@ -169,7 +173,8 @@ TRACE_EVENT(block_rq_complete,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 TRACE_EVENT(block_bio_bounce,
 
@@ -195,7 +200,8 @@ TRACE_EVENT(block_bio_bounce,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_complete,
@@ -221,7 +227,8 @@ TRACE_EVENT(block_bio_complete,
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->error)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->error)
 );
 
 TRACE_EVENT(block_bio_backmerge,
@@ -248,7 +255,8 @@ TRACE_EVENT(block_bio_backmerge,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_frontmerge,
@@ -275,7 +283,8 @@ TRACE_EVENT(block_bio_frontmerge,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_queue,
@@ -302,7 +311,8 @@ TRACE_EVENT(block_bio_queue,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_getrq,
@@ -330,7 +340,8 @@ TRACE_EVENT(block_getrq,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_sleeprq,
@@ -358,7 +369,8 @@ TRACE_EVENT(block_sleeprq,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_plug,
@@ -441,7 +453,9 @@ TRACE_EVENT(block_split,
 
 	TP_printk("%d,%d %s %llu / %llu [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->new_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  (unsigned long long)__entry->new_sector,
+		  __entry->comm)
 );
 
 TRACE_EVENT(block_remap,
@@ -471,9 +485,10 @@ TRACE_EVENT(block_remap,
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector,
 		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
-		  __entry->old_sector)
+		  (unsigned long long)__entry->old_sector)
 );
 
 #endif /* _TRACE_BLOCK_H */
-- 
cgit v0.10.2


From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 8 Jun 2009 19:09:45 -0400
Subject: tracing: add trace_seq_vprint interface

The code to update the print formats for events requires a vprintf
format in the trace_seq. This patch adds that interface.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index ba9627f..c68bccb 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -27,6 +27,8 @@ trace_seq_init(struct trace_seq *s)
 #ifdef CONFIG_TRACING
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+	__attribute__ ((format (printf, 2, 0)));
 extern int
 trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 425725c..c05aff4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
 
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
-- 
cgit v0.10.2


From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 9 Jun 2009 17:29:07 -0400
Subject: tracing: add protection around module events unload

When reading the trace buffer, there is a race that when a module
is unloaded it removes events that is stilled referenced in the buffers.
This patch adds the protection around the unloading of the events
from modules and the reading of the trace buffers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c81f9c..aa08be6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod)
 	struct ftrace_event_call *call, *p;
 	bool found = false;
 
+	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
 			ftrace_event_enable_disable(call, 0);
 			if (call->event)
-				unregister_ftrace_event(call->event);
+				__unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
 			trace_destroy_fields(call);
@@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod)
 	 */
 	if (found)
 		tracing_reset_current_online_cpus();
+	up_write(&trace_event_mutex);
 }
 
 static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c05aff4..7938f3a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
 EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
@@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event)
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
 
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+	hlist_del(&event->node);
+	list_del(&event->list);
+	return 0;
+}
+
 /**
  * unregister_ftrace_event - remove a no longer used event
  * @event: the event to remove
@@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
 int unregister_ftrace_event(struct trace_event *event)
 {
 	down_write(&trace_event_mutex);
-	hlist_del(&event->node);
-	list_del(&event->list);
+	__unregister_ftrace_event(event);
 	up_write(&trace_event_mutex);
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ac240e7..d38bec4 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type);
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
 
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
-- 
cgit v0.10.2


From 35f2c2f6f6ae13ef23c4f68e6d3073753077ca43 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 9 Jun 2009 17:48:56 +0900
Subject: nommu: Provide mmap_min_addr definition.

With the "security: use mmap_min_addr indepedently of security models"
change, mmap_min_addr is used in common areas, which susbsequently blows
up the nommu build. This stubs in the definition in the nommu case as
well.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

--

 mm/nommu.c |    3 +++
 1 file changed, 3 insertions(+)
Signed-off-by: James Morris <jmorris@namei.org>

diff --git a/mm/nommu.c b/mm/nommu.c
index b571ef7..2fd2ad5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
-- 
cgit v0.10.2


From 586c7e6a280580fd94b662bf486f9bb31098d14b Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 9 Jun 2009 16:26:23 -0700
Subject: shm: fix unused warnings on nommu

The massive nommu update (8feae131) resulted in these warnings:
ipc/shm.c: In function `sys_shmdt':
ipc/shm.c:974: warning: unused variable `size'
ipc/shm.c:972: warning: unused variable `next'

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/shm.c b/ipc/shm.c
index faa46da..4259716 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -969,10 +969,13 @@ SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 {
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *next;
+	struct vm_area_struct *vma;
 	unsigned long addr = (unsigned long)shmaddr;
-	loff_t size = 0;
 	int retval = -EINVAL;
+#ifdef CONFIG_MMU
+	loff_t size = 0;
+	struct vm_area_struct *next;
+#endif
 
 	if (addr & ~PAGE_MASK)
 		return retval;
-- 
cgit v0.10.2


From 463aea1a1c49f1a7d4b50656dfd6c8bb33358b1b Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 9 Jun 2009 16:26:24 -0700
Subject: autofs4: remove hashed check in validate_wait()

The recent ->lookup() deadlock correction required the directory inode
mutex to be dropped while waiting for expire completion.  We were
concerned about side effects from this change and one has been identified.

I saw several error messages.

They cause autofs to become quite confused and don't really point to the
actual problem.

Things like:

handle_packet_missing_direct:1376: can't find map entry for (43,1827932)

which is usually totally fatal (although in this case it wouldn't be
except that I treat is as such because it normally is).

do_mount_direct: direct trigger not valid or already mounted
/test/nested/g3c/s1/ss1

which is recoverable, however if this problem is at play it can cause
autofs to become quite confused as to the dependencies in the mount tree
because mount triggers end up mounted multiple times.  It's hard to
accurately check for this over mounting case and automount shouldn't need
to if the kernel module is doing its job.

There was one other message, similar in consequence of this last one but I
can't locate a log example just now.

When checking if a mount has already completed prior to adding a new mount
request to the wait queue we check if the dentry is hashed and, if so, if
it is a mount point.  But, if a mount successfully completed while we
slept on the wait queue mutex the dentry must exist for the mount to have
completed so the test is not really needed.

Mounts can also be done on top of a global root dentry, so for the above
case, where a mount request completes and the wait queue entry has already
been removed, the hashed test returning false can cause an incorrect
callback to the daemon.  Also, d_mountpoint() is not sufficient to check
if a mount has completed for the multi-mount case when we don't have a
real mount at the base of the tree.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index eeb2468..2341375 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait,
 	 */
 	if (notify == NFY_MOUNT) {
 		/*
-		 * If the dentry isn't hashed just go ahead and try the
-		 * mount again with a new wait (not much else we can do).
-		*/
-		if (!d_unhashed(dentry)) {
-			/*
-			 * But if the dentry is hashed, that means that we
-			 * got here through the revalidate path.  Thus, we
-			 * need to check if the dentry has been mounted
-			 * while we waited on the wq_mutex. If it has,
-			 * simply return success.
-			 */
-			if (d_mountpoint(dentry))
-				return 0;
-		}
+		 * If the dentry was successfully mounted while we slept
+		 * on the wait queue mutex we can return success. If it
+		 * isn't mounted (doesn't have submounts for the case of
+		 * a multi-mount with no mount at it's base) we can
+		 * continue on and create a new request.
+		 */
+		if (have_submounts(dentry))
+			return 0;
 	}
 
 	return 1;
-- 
cgit v0.10.2


From a61d90d75d0f9e86432c45b496b4b0fbf0fd03dc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 9 Jun 2009 16:26:26 -0700
Subject: jbd: fix race in buffer processing in commit code

In commit code, we scan buffers attached to a transaction.  During this
scan, we sometimes have to drop j_list_lock and then we recheck whether
the journal buffer head didn't get freed by journal_try_to_free_buffers().
 But checking for buffer_jbd(bh) isn't enough because a new journal head
could get attached to our buffer head.  So add a check whether the journal
head remained the same and whether it's still at the same transaction and
list.

This is a nasty bug and can cause problems like memory corruption (use after
free) or trigger various assertions in JBD code (observed).

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <stable@kernel.org>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 06560c5..618e21c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -241,7 +241,7 @@ write_out_data:
 			spin_lock(&journal->j_list_lock);
 		}
 		/* Someone already cleaned up the buffer? */
-		if (!buffer_jbd(bh)
+		if (!buffer_jbd(bh) || bh2jh(bh) != jh
 			|| jh->b_transaction != commit_transaction
 			|| jh->b_jlist != BJ_SyncData) {
 			jbd_unlock_bh_state(bh);
@@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal)
 			spin_lock(&journal->j_list_lock);
 			continue;
 		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
+		    jh->b_transaction == commit_transaction &&
+		    jh->b_jlist == BJ_Locked) {
 			__journal_unfile_buffer(jh);
 			jbd_unlock_bh_state(bh);
 			journal_remove_journal_head(bh);
-- 
cgit v0.10.2


From 96050dfb25966612008dcea7d342e91fa01e993c Mon Sep 17 00:00:00 2001
From: Peter Botha <peterb@goldcircle.co.za>
Date: Tue, 9 Jun 2009 17:16:32 -0700
Subject: char: mxser, fix ISA board lookup

There's a bug in the mxser kernel module that still appears in the
2.6.29.4 kernel.

mxser_get_ISA_conf takes a ioaddress as its first argument, by passing the
not of the ioaddr, you're effectively passing 0 which means it won't be
able to talk to an ISA card.  I have tested this, and removing the !
fixes the problem.

Cc: "Peter Botha" <peterb@goldcircle.co.za>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index a420e8d..13f8871 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -2711,7 +2711,7 @@ static int __init mxser_module_init(void)
 			continue;
 
 		brd = &mxser_boards[m];
-		retval = mxser_get_ISA_conf(!ioaddr[b], brd);
+		retval = mxser_get_ISA_conf(ioaddr[b], brd);
 		if (retval <= 0) {
 			brd->info = NULL;
 			continue;
-- 
cgit v0.10.2


From 07a2039b8eb0af4ff464efd3dfd95de5c02648c6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Jun 2009 20:05:27 -0700
Subject: Linux 2.6.30


diff --git a/Makefile b/Makefile
index 1065154..03373bb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 30
-EXTRAVERSION = -rc8
+EXTRAVERSION =
 NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
-- 
cgit v0.10.2


From 2b83868723d090078ac0e2120e06a1cc94dbaef0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Jun 2009 20:40:25 -0700
Subject: Make /dev/zero reads interruptible by signals

This helps with bad latencies for large reads from /dev/zero, but might
conceivably break some application that "knows" that a read of /dev/zero
cannot return early.  So do this early in the merge window to give us
maximal test coverage, even if the patch is totally trivial.

Obviously, no well-behaved application should ever depend on the read
being uninterruptible, but hey, bugs happen.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 65e12bc..f96d0be 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf,
 		written += chunk - unwritten;
 		if (unwritten)
 			break;
-		/* Consider changing this to just 'signal_pending()' with lots of testing */
-		if (fatal_signal_pending(current))
-			return written ? written : -EINTR;
+		if (signal_pending(current))
+			return written ? written : -ERESTARTSYS;
 		buf += chunk;
 		count -= chunk;
 		cond_resched();
-- 
cgit v0.10.2


From c1d0d32a603ed06377f404adf2c538de33bb3634 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 10 Jun 2009 09:48:33 +0300
Subject: sh: plug vsyscall dir in to archclean.

The vsyscall targets are presently not cleaned up, so just handle it in
the archclean rule.

Reported-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 0fe35cf..75d049b0 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -205,6 +205,7 @@ archprepare: maketools
 
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
+	$(Q)$(MAKE) $(clean)=arch/sh/kernel/vsyscall
 
 define archhelp
 	@echo '* zImage 	           - Compressed kernel image'
-- 
cgit v0.10.2


From 40bc9a27e00d6c8c7e4dc2865c02d7402a950472 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Jun 2009 09:09:40 +0100
Subject: GFS2: Fix cache coherency between truncate and O_DIRECT read

If a page was partially zeroed as the result of a truncate, then it was
not being correctly marked dirty. This resulted in the deleted data
reappearing if the file was read back via direct I/O.

Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1153a07..3297635 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1012,7 +1012,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	zero_user(page, offset, length);
-
+	mark_buffer_dirty(bh);
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
-- 
cgit v0.10.2


From 3e7c73e9b15eab73e9cf72daf3931925da8afcff Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Feb 2009 21:46:19 +0200
Subject: KVM: VMX: Don't use highmem pages for the msr and pio bitmaps

Highmem pages are a pain, and saving three lowmem pages on i386 isn't worth
the extra code.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb48133..b20c9e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -111,9 +111,9 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-static struct page *vmx_msr_bitmap;
+static unsigned long *vmx_io_bitmap_a;
+static unsigned long *vmx_io_bitmap_b;
+static unsigned long *vmx_msr_bitmap;
 
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -2082,9 +2082,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
-	void *va;
+	int f = sizeof(unsigned long);
 
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
@@ -2094,16 +2094,14 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
 	 * have the write-low and read-high bitmap offsets the wrong way round.
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
-	va = kmap(msr_bitmap);
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, va + 0x000); /* read-low */
-		__clear_bit(msr, va + 0x800); /* write-low */
+		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, va + 0x400); /* read-high */
-		__clear_bit(msr, va + 0xc00); /* write-high */
+		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
 	}
-	kunmap(msr_bitmap);
 }
 
 /*
@@ -2121,11 +2119,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	u32 exec_control;
 
 	/* I/O */
-	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
-	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap));
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
@@ -3695,20 +3693,19 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-	void *va;
 	int r;
 
-	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_io_bitmap_a)
 		return -ENOMEM;
 
-	vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_io_bitmap_b) {
 		r = -ENOMEM;
 		goto out;
 	}
 
-	vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap) {
 		r = -ENOMEM;
 		goto out1;
@@ -3718,18 +3715,12 @@ static int __init vmx_init(void)
 	 * Allow direct access to the PC debug port (it is often used for I/O
 	 * delays, but the vmexits simply slow things down).
 	 */
-	va = kmap(vmx_io_bitmap_a);
-	memset(va, 0xff, PAGE_SIZE);
-	clear_bit(0x80, va);
-	kunmap(vmx_io_bitmap_a);
+	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+	clear_bit(0x80, vmx_io_bitmap_a);
 
-	va = kmap(vmx_io_bitmap_b);
-	memset(va, 0xff, PAGE_SIZE);
-	kunmap(vmx_io_bitmap_b);
+	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-	va = kmap(vmx_msr_bitmap);
-	memset(va, 0xff, PAGE_SIZE);
-	kunmap(vmx_msr_bitmap);
+	memset(vmx_msr_bitmap, 0xff, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
@@ -3762,19 +3753,19 @@ static int __init vmx_init(void)
 	return 0;
 
 out2:
-	__free_page(vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap);
 out1:
-	__free_page(vmx_io_bitmap_b);
+	free_page((unsigned long)vmx_io_bitmap_b);
 out:
-	__free_page(vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_io_bitmap_a);
 	return r;
 }
 
 static void __exit vmx_exit(void)
 {
-	__free_page(vmx_msr_bitmap);
-	__free_page(vmx_io_bitmap_b);
-	__free_page(vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_io_bitmap_b);
+	free_page((unsigned long)vmx_io_bitmap_a);
 
 	kvm_exit();
 }
-- 
cgit v0.10.2


From 5897297bc228fc3c85fdc421fd5c487f9a99821a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Feb 2009 22:26:47 +0200
Subject: KVM: VMX: Don't intercept MSR_KERNEL_GS_BASE

Windows 2008 accesses this MSR often on context switch intensive workloads;
since we run in guest context with the guest MSR value loaded (so swapgs can
work correctly), we can simply disable interception of rdmsr/wrmsr for this
MSR.

A complication occurs since in legacy mode, we run with the host MSR value
loaded. In this case we enable interception.  This means we need two MSR
bitmaps, one for legacy mode and one for long mode.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b20c9e4..b5eae7a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -113,7 +113,8 @@ static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap;
+static unsigned long *vmx_msr_bitmap_legacy;
+static unsigned long *vmx_msr_bitmap_longmode;
 
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -812,6 +813,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
 	int save_nmsrs;
+	unsigned long *msr_bitmap;
 
 	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
@@ -847,6 +849,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 #endif
 	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+
+	if (cpu_has_vmx_msr_bitmap()) {
+		if (is_long_mode(&vmx->vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy;
+
+		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+	}
 }
 
 /*
@@ -2082,7 +2093,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
 	int f = sizeof(unsigned long);
 
@@ -2104,6 +2115,13 @@ static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	}
 }
 
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+	if (!longmode_only)
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -2123,7 +2141,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap));
+		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
@@ -3705,12 +3723,18 @@ static int __init vmx_init(void)
 		goto out;
 	}
 
-	vmx_msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap) {
+	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy) {
 		r = -ENOMEM;
 		goto out1;
 	}
 
+	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode) {
+		r = -ENOMEM;
+		goto out2;
+	}
+
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
 	 * delays, but the vmexits simply slow things down).
@@ -3720,19 +3744,21 @@ static int __init vmx_init(void)
 
 	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-	memset(vmx_msr_bitmap, 0xff, PAGE_SIZE);
+	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out2;
+		goto out3;
 
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
+	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
 	if (vm_need_ept()) {
 		bypass_guest_pf = 0;
@@ -3752,8 +3778,10 @@ static int __init vmx_init(void)
 
 	return 0;
 
+out3:
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
 out2:
-	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
 out1:
 	free_page((unsigned long)vmx_io_bitmap_b);
 out:
@@ -3763,7 +3791,8 @@ out:
 
 static void __exit vmx_exit(void)
 {
-	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
 
-- 
cgit v0.10.2


From cf9e4e15e8f6306b2559979269ead7c02e6b2b95 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:36 +0800
Subject: KVM: Split IOAPIC structure

Prepared for reuse ioapic_redir_entry for MSI.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2b8318c..b84aca3 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -40,4 +40,21 @@ typedef unsigned long  hfn_t;
 
 typedef hfn_t pfn_t;
 
+union kvm_ioapic_redirect_entry {
+	u64 bits;
+	struct {
+		u8 vector;
+		u8 delivery_mode:3;
+		u8 dest_mode:1;
+		u8 delivery_status:1;
+		u8 polarity:1;
+		u8 remote_irr:1;
+		u8 trig_mode:1;
+		u8 mask:1;
+		u8 reserve:7;
+		u8 reserved[4];
+		u8 dest_id;
+	} fields;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c3b99de..8128013 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
 
 static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
 {
-	union ioapic_redir_entry *pent;
+	union kvm_ioapic_redirect_entry *pent;
 	int injected = -1;
 
 	pent = &ioapic->redirtbl[idx];
@@ -284,7 +284,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 {
 	u32 old_irr = ioapic->irr;
 	u32 mask = 1 << irq;
-	union ioapic_redir_entry entry;
+	union kvm_ioapic_redirect_entry entry;
 	int ret = 1;
 
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
@@ -305,7 +305,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin,
 				    int trigger_mode)
 {
-	union ioapic_redir_entry *ent;
+	union kvm_ioapic_redirect_entry *ent;
 
 	ent = &ioapic->redirtbl[pin];
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a34bd5e..008ec87 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -40,22 +40,7 @@ struct kvm_ioapic {
 	u32 id;
 	u32 irr;
 	u32 pad;
-	union ioapic_redir_entry {
-		u64 bits;
-		struct {
-			u8 vector;
-			u8 delivery_mode:3;
-			u8 dest_mode:1;
-			u8 delivery_status:1;
-			u8 polarity:1;
-			u8 remote_irr:1;
-			u8 trig_mode:1;
-			u8 mask:1;
-			u8 reserve:7;
-			u8 reserved[4];
-			u8 dest_id;
-		} fields;
-	} redirtbl[IOAPIC_NUM_PINS];
+	union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
-- 
cgit v0.10.2


From 116191b69b608d0f1513e3abe71d6a46800f2bd6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:37 +0800
Subject: KVM: Unify the delivery of IOAPIC and MSI interrupts

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 894a56e..1a2f98f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -352,6 +352,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask);
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 8128013..883fd0d 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -203,79 +203,56 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
-	u8 dest = ioapic->redirtbl[irq].fields.dest_id;
-	u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
-	u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
-	u8 vector = ioapic->redirtbl[irq].fields.vector;
-	u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
-	u32 deliver_bitmask;
+	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+	unsigned long deliver_bitmask;
 	struct kvm_vcpu *vcpu;
 	int vcpu_id, r = -1;
 
 	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
 		     "vector=%x trig_mode=%x\n",
-		     dest, dest_mode, delivery_mode, vector, trig_mode);
+		     entry.fields.dest, entry.fields.dest_mode,
+		     entry.fields.delivery_mode, entry.fields.vector,
+		     entry.fields.trig_mode);
 
-	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
-							  dest_mode);
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
 	if (!deliver_bitmask) {
 		ioapic_debug("no target on destination\n");
 		return 0;
 	}
 
-	switch (delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
-				deliver_bitmask);
+	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
-		if (irq == 0)
-			vcpu = ioapic->kvm->vcpus[0];
+	if (irq == 0)
+		deliver_bitmask = 1;
 #endif
-		if (vcpu != NULL)
-			r = ioapic_inj_irq(ioapic, vcpu, vector,
-				       trig_mode, delivery_mode);
-		else
-			ioapic_debug("null lowest prio vcpu: "
-				     "mask=%x vector=%x delivery_mode=%x\n",
-				     deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
-		break;
-	case IOAPIC_FIXED:
-#ifdef CONFIG_X86
-		if (irq == 0)
-			deliver_bitmask = 1;
-#endif
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
+
+	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+		if (!(deliver_bitmask & (1 << vcpu_id)))
+			continue;
+		deliver_bitmask &= ~(1 << vcpu_id);
+		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		if (vcpu) {
+			if (entry.fields.delivery_mode ==
+					IOAPIC_LOWEST_PRIORITY ||
+			    entry.fields.delivery_mode == IOAPIC_FIXED) {
 				if (r < 0)
 					r = 0;
-				r += ioapic_inj_irq(ioapic, vcpu, vector,
-					       trig_mode, delivery_mode);
-			}
-		}
-		break;
-	case IOAPIC_NMI:
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
-				ioapic_inj_nmi(vcpu);
+				r += ioapic_inj_irq(ioapic, vcpu,
+						    entry.fields.vector,
+						    entry.fields.trig_mode,
+						    entry.fields.delivery_mode);
+			} else if (entry.fields.delivery_mode == IOAPIC_NMI) {
 				r = 1;
-			}
-			else
-				ioapic_debug("NMI to vcpu %d failed\n",
-						vcpu->vcpu_id);
-		}
-		break;
-	default:
-		printk(KERN_WARNING "Unsupported delivery mode %d\n",
-		       delivery_mode);
-		break;
+				ioapic_inj_nmi(vcpu);
+			} else
+				ioapic_debug("unsupported delivery mode %x!\n",
+					     entry.fields.delivery_mode);
+		} else
+			ioapic_debug("null destination vcpu: "
+				     "mask=%x vector=%x delivery_mode=%x\n",
+				     entry.fields.deliver_bitmask,
+				     entry.fields.vector,
+				     entry.fields.delivery_mode);
 	}
 	return r;
 }
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 864ac54..aec7a0d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,53 +43,74 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask)
+{
+	struct kvm_vcpu *vcpu;
+
+	*deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+				entry->fields.dest_id, entry->fields.dest_mode);
+	switch (entry->fields.delivery_mode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
+				entry->fields.vector, *deliver_bitmask);
+		*deliver_bitmask = 1 << vcpu->vcpu_id;
+		break;
+	case IOAPIC_FIXED:
+	case IOAPIC_NMI:
+		break;
+	default:
+		if (printk_ratelimit())
+			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
+				entry->fields.delivery_mode);
+		*deliver_bitmask = 0;
+	}
+}
+
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
 	int vcpu_id, r = -1;
 	struct kvm_vcpu *vcpu;
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
-	int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK)
-			>> MSI_ADDR_DEST_ID_SHIFT;
-	int vector = (e->msi.data & MSI_DATA_VECTOR_MASK)
-			>> MSI_DATA_VECTOR_SHIFT;
-	int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
-				(unsigned long *)&e->msi.address_lo);
-	int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
-				(unsigned long *)&e->msi.data);
-	int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
-				(unsigned long *)&e->msi.data);
-	u32 deliver_bitmask;
+	union kvm_ioapic_redirect_entry entry;
+	unsigned long deliver_bitmask;
 
 	BUG_ON(!ioapic);
 
-	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
-				dest_id, dest_mode);
-	/* IOAPIC delivery mode value is the same as MSI here */
-	switch (delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
-				deliver_bitmask);
-		if (vcpu != NULL)
-			r = kvm_apic_set_irq(vcpu, vector, trig_mode);
-		else
-			printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
-		break;
-	case IOAPIC_FIXED:
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
-				if (r < 0)
-					r = 0;
-				r += kvm_apic_set_irq(vcpu, vector, trig_mode);
-			}
+	entry.bits = 0;
+	entry.fields.dest_id = (e->msi.address_lo &
+			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+	entry.fields.vector = (e->msi.data &
+			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+	entry.fields.dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+			(unsigned long *)&e->msi.address_lo);
+	entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+			(unsigned long *)&e->msi.data);
+	entry.fields.delivery_mode = test_bit(
+			MSI_DATA_DELIVERY_MODE_SHIFT,
+			(unsigned long *)&e->msi.data);
+
+	/* TODO Deal with RH bit of MSI message address */
+
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
+
+	if (!deliver_bitmask) {
+		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
+		return -1;
+	}
+	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+		if (!(deliver_bitmask & (1 << vcpu_id)))
+			continue;
+		deliver_bitmask &= ~(1 << vcpu_id);
+		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		if (vcpu) {
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
+					      entry.fields.trig_mode);
 		}
-		break;
-	default:
-		break;
 	}
 	return r;
 }
-- 
cgit v0.10.2


From e5871be0f5d6847bc9585c997acb1b917c168f03 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:38 +0800
Subject: KVM: Change API of kvm_ioapic_get_delivery_bitmask

In order to use with bit ops.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 883fd0d..3b537129 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -161,22 +161,22 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 	kvm_vcpu_kick(vcpu);
 }
 
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				    u8 dest_mode)
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				     u8 dest_mode, unsigned long *mask)
 {
-	u32 mask = 0;
 	int i;
 	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
 	ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
 
+	*mask = 0;
 	if (dest_mode == 0) {	/* Physical mode. */
 		if (dest == 0xFF) {	/* Broadcast. */
 			for (i = 0; i < KVM_MAX_VCPUS; ++i)
 				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					mask |= 1 << i;
-			return mask;
+					*mask |= 1 << i;
+			return;
 		}
 		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 			vcpu = kvm->vcpus[i];
@@ -184,7 +184,7 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 				continue;
 			if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
 				if (vcpu->arch.apic)
-					mask = 1 << i;
+					*mask = 1 << i;
 				break;
 			}
 		}
@@ -195,10 +195,9 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 				continue;
 			if (vcpu->arch.apic &&
 			    kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
-				mask |= 1 << vcpu->vcpu_id;
+				*mask |= 1 << vcpu->vcpu_id;
 		}
-	ioapic_debug("mask %x\n", mask);
-	return mask;
+	ioapic_debug("mask %x\n", *mask);
 }
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 008ec87..f395798 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				u8 dest_mode);
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				     u8 dest_mode, unsigned long *mask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index aec7a0d..e8ff89c 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -49,8 +49,9 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 {
 	struct kvm_vcpu *vcpu;
 
-	*deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
-				entry->fields.dest_id, entry->fields.dest_mode);
+	kvm_ioapic_get_delivery_bitmask(ioapic, entry->fields.dest_id,
+					entry->fields.dest_mode,
+					deliver_bitmask);
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
-- 
cgit v0.10.2


From 110c2faeba1f1994bcb1de55b9c31f4147dbfdb6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:39 +0800
Subject: KVM: Update intr delivery func to accept unsigned long* bitmap

Would be used with bit ops, and would be easily extended if KVM_MAX_VCPUS is
increased.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2..6aa8d20 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -409,7 +409,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 }
 
 static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap)
+				       unsigned long *bitmap)
 {
 	int last;
 	int next;
@@ -421,7 +421,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 	do {
 		if (++next == KVM_MAX_VCPUS)
 			next = 0;
-		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
+		if (kvm->vcpus[next] == NULL || !test_bit(next, bitmap))
 			continue;
 		apic = kvm->vcpus[next]->arch.apic;
 		if (apic && apic_enabled(apic))
@@ -437,7 +437,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long bitmap)
+		unsigned long *bitmap)
 {
 	struct kvm_lapic *apic;
 
@@ -508,7 +508,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, &lpr_map);
 		if (target != NULL)
 			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index f395798..7275f87 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -65,7 +65,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap);
+				       unsigned long *bitmap);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e8ff89c..d4421cd 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -55,7 +55,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
-				entry->fields.vector, *deliver_bitmask);
+				entry->fields.vector, deliver_bitmask);
 		*deliver_bitmask = 1 << vcpu->vcpu_id;
 		break;
 	case IOAPIC_FIXED:
-- 
cgit v0.10.2


From bfd349d073b2838a6a031f057d25e266619b7093 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:40 +0800
Subject: KVM: bit ops for deliver_bitmap

It's also convenient when we extend KVM supported vcpu number in the future.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6aa8d20..afc59b2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -483,9 +483,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 	struct kvm_vcpu *target;
 	struct kvm_vcpu *vcpu;
-	unsigned long lpr_map = 0;
+	DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS);
 	int i;
 
+	bitmap_zero(lpr_map, KVM_MAX_VCPUS);
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
@@ -500,7 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		if (vcpu->arch.apic &&
 		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
 			if (delivery_mode == APIC_DM_LOWEST)
-				set_bit(vcpu->vcpu_id, &lpr_map);
+				__set_bit(vcpu->vcpu_id, lpr_map);
 			else
 				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
 						  vector, level, trig_mode);
@@ -508,7 +509,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, &lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
 		if (target != NULL)
 			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 3b537129..7c2cb2b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -203,7 +203,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
-	unsigned long deliver_bitmask;
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	struct kvm_vcpu *vcpu;
 	int vcpu_id, r = -1;
 
@@ -213,22 +213,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		     entry.fields.delivery_mode, entry.fields.vector,
 		     entry.fields.trig_mode);
 
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
-	if (!deliver_bitmask) {
-		ioapic_debug("no target on destination\n");
-		return 0;
-	}
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 
 	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
 	if (irq == 0)
-		deliver_bitmask = 1;
+		__set_bit(0, deliver_bitmask);
+	else
 #endif
+		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
+
+	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
+		ioapic_debug("no target on destination\n");
+		return 0;
+	}
 
-	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-		if (!(deliver_bitmask & (1 << vcpu_id)))
-			continue;
-		deliver_bitmask &= ~(1 << vcpu_id);
+	while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+			< KVM_MAX_VCPUS) {
+		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
 			if (entry.fields.delivery_mode ==
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d4421cd..d165e05 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -56,7 +56,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
 				entry->fields.vector, deliver_bitmask);
-		*deliver_bitmask = 1 << vcpu->vcpu_id;
+		__set_bit(vcpu->vcpu_id, deliver_bitmask);
 		break;
 	case IOAPIC_FIXED:
 	case IOAPIC_NMI:
@@ -76,10 +76,12 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	struct kvm_vcpu *vcpu;
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
 	union kvm_ioapic_redirect_entry entry;
-	unsigned long deliver_bitmask;
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 
 	BUG_ON(!ioapic);
 
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
@@ -95,16 +97,15 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
 	/* TODO Deal with RH bit of MSI message address */
 
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
 
-	if (!deliver_bitmask) {
+	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
 		return -1;
 	}
-	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-		if (!(deliver_bitmask & (1 << vcpu_id)))
-			continue;
-		deliver_bitmask &= ~(1 << vcpu_id);
+	while ((vcpu_id = find_first_bit(deliver_bitmask,
+					KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) {
+		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
 			if (r < 0)
-- 
cgit v0.10.2


From c1e01514296e8a4a43ff0c88dcff635cb90feb5f Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:26 +0800
Subject: KVM: Ioctls for init MSI-X entry

Introduce KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY two ioctls.

This two ioctls are used by userspace to specific guest device MSI-X entry
number and correlate MSI-X entry with GSI during the initialization stage.

MSI-X should be well initialzed before enabling.

Don't support change MSI-X entry number for now.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8cc1379..78cdee8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -487,6 +487,10 @@ struct kvm_irq_routing {
 #define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
 #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
 				     struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR \
+			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY \
+			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 
 /*
  * ioctls for vcpu fds
@@ -607,4 +611,18 @@ struct kvm_assigned_irq {
 #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
 #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
 
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		512
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of entry in the MSI-X table */
+	__u16 padding[3];
+};
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a2f98f..432edc2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -319,6 +319,12 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+struct kvm_guest_msix_entry {
+	u32 vector;
+	u16 entry;
+	u16 flags;
+};
+
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct work_struct interrupt_work;
@@ -326,13 +332,17 @@ struct kvm_assigned_dev_kernel {
 	int assigned_dev_id;
 	int host_busnr;
 	int host_devfn;
+	unsigned int entries_nr;
 	int host_irq;
 	bool host_irq_disabled;
+	struct msix_entry *host_msix_entries;
 	int guest_irq;
+	struct kvm_guest_msix_entry *guest_msix_entries;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
 #define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
 #define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
 #define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
+#define KVM_ASSIGNED_DEV_MSIX		((1 << 2) | (1 << 10))
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d0dd39..1ceb969 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1593,6 +1593,88 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+				    struct kvm_assigned_msix_nr *entry_nr)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry_nr->assigned_dev_id);
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	if (adev->entries_nr == 0) {
+		adev->entries_nr = entry_nr->entry_nr;
+		if (adev->entries_nr == 0 ||
+		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+			r = -EINVAL;
+			goto msix_nr_out;
+		}
+
+		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+						entry_nr->entry_nr,
+						GFP_KERNEL);
+		if (!adev->host_msix_entries) {
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+		adev->guest_msix_entries = kzalloc(
+				sizeof(struct kvm_guest_msix_entry) *
+				entry_nr->entry_nr, GFP_KERNEL);
+		if (!adev->guest_msix_entries) {
+			kfree(adev->host_msix_entries);
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+	} else /* Not allowed set MSI-X number twice */
+		r = -EINVAL;
+msix_nr_out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+				       struct kvm_assigned_msix_entry *entry)
+{
+	int r = 0, i;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry->assigned_dev_id);
+
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_entry_out;
+	}
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->guest_msix_entries[i].vector == 0 ||
+		    adev->guest_msix_entries[i].entry == entry->entry) {
+			adev->guest_msix_entries[i].entry = entry->entry;
+			adev->guest_msix_entries[i].vector = entry->gsi;
+			adev->host_msix_entries[i].entry = entry->entry;
+			break;
+		}
+	if (i == adev->entries_nr) {
+		r = -ENOSPC;
+		goto msix_entry_out;
+	}
+
+msix_entry_out:
+	mutex_unlock(&kvm->lock);
+
+	return r;
+}
+#endif
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -1917,7 +1999,29 @@ static long kvm_vm_ioctl(struct file *filp,
 		vfree(entries);
 		break;
 	}
+#ifdef __KVM_HAVE_MSIX
+	case KVM_ASSIGN_SET_MSIX_NR: {
+		struct kvm_assigned_msix_nr entry_nr;
+		r = -EFAULT;
+		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_SET_MSIX_ENTRY: {
+		struct kvm_assigned_msix_entry entry;
+		r = -EFAULT;
+		if (copy_from_user(&entry, argp, sizeof entry))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+		if (r)
+			goto out;
+		break;
+	}
 #endif
+#endif /* KVM_CAP_IRQ_ROUTING */
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
-- 
cgit v0.10.2


From 2350bd1f62c8706c22b8e58c3bfff10806c0a31b Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:27 +0800
Subject: KVM: Add MSI-X interrupt injection logic

We have to handle more than one interrupt with one handler for MSI-X. Avi
suggested to use a flag to indicate the pending. So here is it.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 432edc2..3832243 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -319,6 +319,7 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+#define KVM_ASSIGNED_MSIX_PENDING		0x1
 struct kvm_guest_msix_entry {
 	u32 vector;
 	u16 entry;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1ceb969..8bd44d6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,25 +95,69 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *h
 	return NULL;
 }
 
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+				    *assigned_dev, int irq)
+{
+	int i, index;
+	struct msix_entry *host_msix_entries;
+
+	host_msix_entries = assigned_dev->host_msix_entries;
+
+	index = -1;
+	for (i = 0; i < assigned_dev->entries_nr; i++)
+		if (irq == host_msix_entries[i].vector) {
+			index = i;
+			break;
+		}
+	if (index < 0) {
+		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+		return 0;
+	}
+
+	return index;
+}
+
 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev;
+	struct kvm *kvm;
+	int irq, i;
 
 	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
 				    interrupt_work);
+	kvm = assigned_dev->kvm;
 
 	/* This is taken to safely inject irq inside the guest. When
 	 * the interrupt injection (or the ioapic code) uses a
 	 * finer-grained lock, update this
 	 */
-	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
-
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) {
-		enable_irq(assigned_dev->host_irq);
-		assigned_dev->host_irq_disabled = false;
+	mutex_lock(&kvm->lock);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+		struct kvm_guest_msix_entry *guest_entries =
+			assigned_dev->guest_msix_entries;
+		for (i = 0; i < assigned_dev->entries_nr; i++) {
+			if (!(guest_entries[i].flags &
+					KVM_ASSIGNED_MSIX_PENDING))
+				continue;
+			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id,
+				    guest_entries[i].vector, 1);
+			irq = assigned_dev->host_msix_entries[i].vector;
+			if (irq != 0)
+				enable_irq(irq);
+			assigned_dev->host_irq_disabled = false;
+		}
+	} else {
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+		if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_GUEST_MSI) {
+			enable_irq(assigned_dev->host_irq);
+			assigned_dev->host_irq_disabled = false;
+		}
 	}
+
 	mutex_unlock(&assigned_dev->kvm->lock);
 }
 
@@ -122,6 +166,14 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
+	if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) {
+		int index = find_index_from_host_irq(assigned_dev, irq);
+		if (index < 0)
+			return IRQ_HANDLED;
+		assigned_dev->guest_msix_entries[index].flags |=
+			KVM_ASSIGNED_MSIX_PENDING;
+	}
+
 	schedule_work(&assigned_dev->interrupt_work);
 
 	disable_irq_nosync(irq);
-- 
cgit v0.10.2


From d510d6cc653bc4b3094ea73afe12600d0ab445b3 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:28 +0800
Subject: KVM: Enable MSI-X for KVM assigned device

This patch finally enable MSI-X.

What we need for MSI-X:
1. Intercept one page in MMIO region of device. So that we can get guest desired
MSI-X table and set up the real one. Now this have been done by guest, and
transfer to kernel using ioctl KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY.

2. Information for incoming interrupt. Now one device can have more than one
interrupt, and they are all handled by one workqueue structure. So we need to
identify them. The previous patch enable gsi_msg_pending_bitmap get this done.

3. Mapping from host IRQ to guest gsi as well as guest gsi to real MSI/MSI-X
message address/data. We used same entry number for the host and guest here, so
that it's easy to find the correlated guest gsi.

What we lack for now:
1. The PCI spec said nothing can existed with MSI-X table in the same page of
MMIO region, except pending bits. The patch ignore pending bits as the first
step (so they are always 0 - no pending).

2. The PCI spec allowed to change MSI-X table dynamically. That means, the OS
can enable MSI-X, then mask one MSI-X entry, modify it, and unmask it. The patch
didn't support this, and Linux also don't work in this way.

3. The patch didn't implement MSI-X mask all and mask single entry. I would
implement the former in driver/pci/msi.c later. And for single entry, userspace
should have reposibility to handle it.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf..125be8b 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 78cdee8..640835e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -409,6 +409,9 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_DEASSIGNMENT 27
 #endif
+#ifdef __KVM_HAVE_MSIX
+#define KVM_CAP_DEVICE_MSIX 28
+#endif
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
 
@@ -611,6 +614,11 @@ struct kvm_assigned_irq {
 #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
 #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
 
+#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION  (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\
+					KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX  (1 << 1)
+#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX    (1 << 2)
+
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
 	__u16 entry_nr;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8bd44d6..3bed827 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -236,13 +236,33 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	 * now, the kvm state is still legal for probably we also have to wait
 	 * interrupt_work done.
 	 */
-	disable_irq_nosync(assigned_dev->host_irq);
-	cancel_work_sync(&assigned_dev->interrupt_work);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+		int i;
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			disable_irq_nosync(assigned_dev->
+					   host_msix_entries[i].vector);
+
+		cancel_work_sync(&assigned_dev->interrupt_work);
 
-	free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			free_irq(assigned_dev->host_msix_entries[i].vector,
+				 (void *)assigned_dev);
 
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
-		pci_disable_msi(assigned_dev->dev);
+		assigned_dev->entries_nr = 0;
+		kfree(assigned_dev->host_msix_entries);
+		kfree(assigned_dev->guest_msix_entries);
+		pci_disable_msix(assigned_dev->dev);
+	} else {
+		/* Deal with MSI and INTx */
+		disable_irq_nosync(assigned_dev->host_irq);
+		cancel_work_sync(&assigned_dev->interrupt_work);
+
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+		if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_HOST_MSI)
+			pci_disable_msi(assigned_dev->dev);
+	}
 
 	assigned_dev->irq_requested_type = 0;
 }
@@ -373,6 +393,60 @@ static int assigned_device_update_msi(struct kvm *kvm,
 }
 #endif
 
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_update_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *adev,
+			struct kvm_assigned_irq *airq)
+{
+	/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+	int i, r;
+
+	adev->ack_notifier.gsi = -1;
+
+	if (irqchip_in_kernel(kvm)) {
+		if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
+			return -ENOTTY;
+
+		if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) {
+			/* Guest disable MSI-X */
+			kvm_free_assigned_irq(kvm, adev);
+			if (msi2intx) {
+				pci_enable_msi(adev->dev);
+				if (adev->dev->msi_enabled)
+					return assigned_device_update_msi(kvm,
+							adev, airq);
+			}
+			return assigned_device_update_intx(kvm, adev, airq);
+		}
+
+		/* host_msix_entries and guest_msix_entries should have been
+		 * initialized */
+		if (adev->entries_nr == 0)
+			return -EINVAL;
+
+		kvm_free_assigned_irq(kvm, adev);
+
+		r = pci_enable_msix(adev->dev, adev->host_msix_entries,
+				    adev->entries_nr);
+		if (r)
+			return r;
+
+		for (i = 0; i < adev->entries_nr; i++) {
+			r = request_irq((adev->host_msix_entries + i)->vector,
+					kvm_assigned_dev_intr, 0,
+					"kvm_assigned_msix_device",
+					(void *)adev);
+			if (r)
+				return r;
+		}
+	}
+
+	adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX;
+
+	return 0;
+}
+#endif
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -417,12 +491,24 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		}
 	}
 
-	if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
+	if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX)
+		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX;
+	else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
 		 (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
 		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;
 
 	changed_flags = assigned_irq->flags ^ current_flags;
 
+#ifdef __KVM_HAVE_MSIX
+	if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) {
+		r = assigned_device_update_msix(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to execute "
+					"MSI-X action!\n");
+			goto out_release;
+		}
+	} else
+#endif
 	if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
 	    (msi2intx && match->dev->msi_enabled)) {
 #ifdef CONFIG_X86
-- 
cgit v0.10.2


From 2dea4c84bc936731668b5a7a9fba5b436a422668 Mon Sep 17 00:00:00 2001
From: "Matt T. Yourst" <yourst@users.sourceforge.net>
Date: Tue, 24 Feb 2009 15:28:00 -0300
Subject: KVM: x86: silence preempt warning on kvm_write_guest_time

This issue just appeared in kvm-84 when running on 2.6.28.7 (x86-64)
with PREEMPT enabled.

We're getting syslog warnings like this many (but not all) times qemu
tells KVM to run the VCPU:

BUG: using smp_processor_id() in preemptible [00000000] code:
qemu-system-x86/28938
caller is kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
Pid: 28938, comm: qemu-system-x86 2.6.28.7-mtyrel-64bit
Call Trace:
debug_smp_processor_id+0xf7/0x100
kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
? __wake_up+0x4e/0x70
? wake_futex+0x27/0x40
kvm_vcpu_ioctl+0x2e9/0x5a0 [kvm]
enqueue_hrtimer+0x8a/0x110
_spin_unlock_irqrestore+0x27/0x50
vfs_ioctl+0x31/0xa0
do_vfs_ioctl+0x74/0x480
sys_futex+0xb4/0x140
sys_ioctl+0x99/0xa0
system_call_fastpath+0x16/0x1b

As it turns out, the call trace is messed up due to gcc's inlining, but
I isolated the problem anyway: kvm_write_guest_time() is being used in a
non-thread-safe manner on preemptable kernels.

Basically kvm_write_guest_time()'s body needs to be surrounded by
preempt_disable() and preempt_enable(), since the kernel won't let us
query any per-CPU data (indirectly using smp_processor_id()) without
preemption disabled. The attached patch fixes this issue by disabling
preemption inside kvm_write_guest_time().

[marcelo: surround only __get_cpu_var calls since the warning
is harmless]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e91..43e049a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -634,10 +634,12 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	if ((!vcpu->time_page))
 		return;
 
+	preempt_disable();
 	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
 		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
 		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	}
+	preempt_enable();
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-- 
cgit v0.10.2


From a90ede7b17d122acd58e6e1ff911be9dcf5263cc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 11 Feb 2009 22:45:42 -0200
Subject: KVM: x86: paravirt skip pit-through-ioapic boot check

Skip the test which checks if the PIT is properly routed when
using the IOAPIC, aimed at buggy hardware.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019dd..057173d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <asm/timer.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
 		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
 		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
 	}
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
 }
 
 void __init kvm_guest_init(void)
-- 
cgit v0.10.2


From b95b51d580bff9376850eef29d34c3aa08c26db7 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 26 Feb 2009 13:55:33 +0100
Subject: KVM: declare ioapic functions only on affected hardware

Since "KVM: Unify the delivery of IOAPIC and MSI interrupts"
I get the following warnings:

  CC [M]  arch/s390/kvm/kvm-s390.o
In file included from arch/s390/kvm/kvm-s390.c:22:
include/linux/kvm_host.h:357: warning: 'struct kvm_ioapic' declared inside parameter list
include/linux/kvm_host.h:357: warning: its scope is only this definition or declaration, which is probably not what you want

This patch limits IOAPIC functions for architectures that have one.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3832243..3b91ec9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -363,9 +363,11 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
+#ifdef __KVM_HAVE_IOAPIC
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
+#endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
-- 
cgit v0.10.2


From 5a05d54554f19a128306eca7f7f5ed31f7d7eeb9 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:39 -0300
Subject: KVM: PIT: remove unused scheduled variable

Unused.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92..3eddae6 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -208,7 +208,6 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 		wake_up_interruptible(&vcpu0->wq);
 
 	hrtimer_add_expires_ns(&pt->timer, pt->period);
-	pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
 	if (pt->period)
 		ps->channels[0].count_load_time = ktime_get();
 
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b..521accf 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -7,7 +7,6 @@ struct kvm_kpit_timer {
 	struct hrtimer timer;
 	int irq;
 	s64 period; /* unit: ns */
-	s64 scheduled;
 	atomic_t pending;
 	bool reinject;
 };
-- 
cgit v0.10.2


From fd66842370e32f3bbe429677280a326c07e508c1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:40 -0300
Subject: KVM: PIT: remove usage of count_load_time for channel 0

We can infer elapsed time from hrtimer_expires_remaining.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3eddae6..09ae841 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,32 @@ static int pit_get_gate(struct kvm *kvm, int channel)
 	return kvm->arch.vpit->pit_state.channels[channel].gate;
 }
 
+static s64 __kpit_elapsed(struct kvm *kvm)
+{
+	s64 elapsed;
+	ktime_t remaining;
+	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+	remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+	if (ktime_to_ns(remaining) < 0)
+		remaining = ktime_set(0, 0);
+
+	elapsed = ps->pit_timer.period;
+	if (ktime_to_ns(remaining) <= ps->pit_timer.period)
+		elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+
+	return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+			int channel)
+{
+	if (channel == 0)
+		return __kpit_elapsed(kvm);
+
+	return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
 static int pit_get_count(struct kvm *kvm, int channel)
 {
 	struct kvm_kpit_channel_state *c =
@@ -107,7 +133,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
 
 	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
 
-	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	t = kpit_elapsed(kvm, c, channel);
 	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
@@ -137,7 +163,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
 
 	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
 
-	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	t = kpit_elapsed(kvm, c, channel);
 	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
@@ -208,8 +234,6 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 		wake_up_interruptible(&vcpu0->wq);
 
 	hrtimer_add_expires_ns(&pt->timer, pt->period);
-	if (pt->period)
-		ps->channels[0].count_load_time = ktime_get();
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -305,11 +329,12 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	if (val == 0)
 		val = 0x10000;
 
-	ps->channels[channel].count_load_time = ktime_get();
 	ps->channels[channel].count = val;
 
-	if (channel != 0)
+	if (channel != 0) {
+		ps->channels[channel].count_load_time = ktime_get();
 		return;
+	}
 
 	/* Two types of timer
 	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
-- 
cgit v0.10.2


From d3c7b77d1a6e7a0a27035a7ba723a3455317883e Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:41 -0300
Subject: KVM: unify part of generic timer handling

Hide the internals of vcpu awakening / injection from the in-kernel
emulated timers. This makes future changes in this logic easier and
decreases the distance to more generic timer handling.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292..b43c4ef 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-	i8254.o
+	i8254.o timer.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 09ae841..4e2e3f2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -219,25 +219,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
 	}
 }
 
-static int __pit_timer_fn(struct kvm_kpit_state *ps)
-{
-	struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
-	struct kvm_kpit_timer *pt = &ps->pit_timer;
-
-	if (!atomic_inc_and_test(&pt->pending))
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-
-	if (!pt->reinject)
-		atomic_set(&pt->pending, 1);
-
-	if (vcpu0 && waitqueue_active(&vcpu0->wq))
-		wake_up_interruptible(&vcpu0->wq);
-
-	hrtimer_add_expires_ns(&pt->timer, pt->period);
-
-	return (pt->period == 0 ? 0 : 1);
-}
-
 int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -258,21 +239,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 	spin_unlock(&ps->inject_lock);
 }
 
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
-	struct kvm_kpit_state *ps;
-	int restart_timer = 0;
-
-	ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
-
-	restart_timer = __pit_timer_fn(ps);
-
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
-
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -286,15 +252,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+static void destroy_pit_timer(struct kvm_timer *pt)
 {
 	pr_debug("pit: execute del timer!\n");
 	hrtimer_cancel(&pt->timer);
 }
 
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+	struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+						 pit_timer);
+	return ps->is_periodic;
+}
+
+struct kvm_timer_ops kpit_ops = {
+	.is_periodic = kpit_is_periodic,
+};
+
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
-	struct kvm_kpit_timer *pt = &ps->pit_timer;
+	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -304,7 +281,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
 	pt->period = (is_period == 0) ? 0 : interval;
-	pt->timer.function = pit_timer_fn;
+	ps->is_periodic = is_period;
+
+	pt->timer.function = kvm_timer_fn;
+	pt->t_ops = &kpit_ops;
+	pt->kvm = ps->pit->kvm;
+	pt->vcpu_id = 0;
+
 	atomic_set(&pt->pending, 0);
 	ps->irq_ack = 1;
 
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 521accf..bbd863f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,14 +3,6 @@
 
 #include "iodev.h"
 
-struct kvm_kpit_timer {
-	struct hrtimer timer;
-	int irq;
-	s64 period; /* unit: ns */
-	atomic_t pending;
-	bool reinject;
-};
-
 struct kvm_kpit_channel_state {
 	u32 count; /* can be 65536 */
 	u16 latched_count;
@@ -29,7 +21,8 @@ struct kvm_kpit_channel_state {
 
 struct kvm_kpit_state {
 	struct kvm_kpit_channel_state channels[3];
-	struct kvm_kpit_timer pit_timer;
+	struct kvm_timer pit_timer;
+	bool is_periodic;
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 0000000..26bd6ba
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
+
+struct kvm_timer {
+	struct hrtimer timer;
+	s64 period; 				/* unit: ns */
+	atomic_t pending;			/* accumulated triggered timers */
+	bool reinject;
+	struct kvm_timer_ops *t_ops;
+	struct kvm *kvm;
+	int vcpu_id;
+};
+
+struct kvm_timer_ops {
+        bool (*is_periodic)(struct kvm_timer *);
+};
+
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc59b2..27ca43e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -528,12 +528,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 	if (apic_get_reg(apic, APIC_TMICT) == 0)
 		return 0;
 
-	remaining = hrtimer_expires_remaining(&apic->timer.dev);
+	remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
 	if (ktime_to_ns(remaining) < 0)
 		remaining = ktime_set(0, 0);
 
-	ns = mod_64(ktime_to_ns(remaining), apic->timer.period);
-	tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+	ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+	tmcct = div64_u64(ns,
+			 (APIC_BUS_CYCLE_NS * apic->divide_count));
 
 	return tmcct;
 }
@@ -620,25 +621,25 @@ static void update_divide_count(struct kvm_lapic *apic)
 	tdcr = apic_get_reg(apic, APIC_TDCR);
 	tmp1 = tdcr & 0xf;
 	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
-	apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+	apic->divide_count = 0x1 << (tmp2 & 0x7);
 
 	apic_debug("timer divide count is 0x%x\n",
-				   apic->timer.divide_count);
+				   apic->lapic_timer.divide_count);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-	ktime_t now = apic->timer.dev.base->get_time();
+	ktime_t now = apic->lapic_timer.timer.base->get_time();
 
-	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
-	atomic_set(&apic->timer.pending, 0);
+	apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+		    APIC_BUS_CYCLE_NS * apic->divide_count;
+	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (!apic->timer.period)
+	if (!apic->lapic_timer.period)
 		return;
 
-	hrtimer_start(&apic->timer.dev,
-		      ktime_add_ns(now, apic->timer.period),
+	hrtimer_start(&apic->lapic_timer.timer,
+		      ktime_add_ns(now, apic->lapic_timer.period),
 		      HRTIMER_MODE_ABS);
 
 	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -647,9 +648,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   "expire @ 0x%016" PRIx64 ".\n", __func__,
 			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
 			   apic_get_reg(apic, APIC_TMICT),
-			   apic->timer.period,
+			   apic->lapic_timer.period,
 			   ktime_to_ns(ktime_add_ns(now,
-					apic->timer.period)));
+					apic->lapic_timer.period)));
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -731,7 +732,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
 					     lvt_val | APIC_LVT_MASKED);
 			}
-			atomic_set(&apic->timer.pending, 0);
+			atomic_set(&apic->lapic_timer.pending, 0);
 
 		}
 		break;
@@ -763,7 +764,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		break;
 
 	case APIC_TMICT:
-		hrtimer_cancel(&apic->timer.dev);
+		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
 		return;
@@ -803,7 +804,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.apic)
 		return;
 
-	hrtimer_cancel(&vcpu->arch.apic->timer.dev);
+	hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
 
 	if (vcpu->arch.apic->regs_page)
 		__free_page(vcpu->arch.apic->regs_page);
@@ -881,7 +882,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	ASSERT(apic != NULL);
 
 	/* Stop the timer in case it's a reset to an active apic */
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&apic->lapic_timer.timer);
 
 	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -906,7 +907,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	update_divide_count(apic);
-	atomic_set(&apic->timer.pending, 0);
+	atomic_set(&apic->lapic_timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
@@ -937,22 +938,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
  *----------------------------------------------------------------------
  */
 
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
+static bool lapic_is_periodic(struct kvm_timer *ktimer)
 {
-	int result = 0;
-	wait_queue_head_t *q = &apic->vcpu->wq;
-
-	if(!atomic_inc_and_test(&apic->timer.pending))
-		set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-	if (waitqueue_active(q))
-		wake_up_interruptible(q);
-
-	if (apic_lvtt_period(apic)) {
-		result = 1;
-		hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
-	}
-	return result;
+	struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
+					      lapic_timer);
+	return apic_lvtt_period(apic);
 }
 
 int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -960,7 +950,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *lapic = vcpu->arch.apic;
 
 	if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
-		return atomic_read(&lapic->timer.pending);
+		return atomic_read(&lapic->lapic_timer.pending);
 
 	return 0;
 }
@@ -987,20 +977,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
-	struct kvm_lapic *apic;
-	int restart_timer = 0;
-
-	apic = container_of(data, struct kvm_lapic, timer.dev);
-
-	restart_timer = __apic_timer_fn(apic);
-
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
+struct kvm_timer_ops lapic_timer_ops = {
+	.is_periodic = lapic_is_periodic,
+};
 
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
 {
@@ -1025,8 +1004,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	memset(apic->regs, 0, PAGE_SIZE);
 	apic->vcpu = vcpu;
 
-	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	apic->timer.dev.function = apic_timer_fn;
+	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_ABS);
+	apic->lapic_timer.timer.function = kvm_timer_fn;
+	apic->lapic_timer.t_ops = &lapic_timer_ops;
+	apic->lapic_timer.kvm = vcpu->kvm;
+	apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
 	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
@@ -1079,9 +1063,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic && atomic_read(&apic->timer.pending) > 0) {
+	if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
 		if (kvm_apic_local_deliver(apic, APIC_LVTT))
-			atomic_dec(&apic->timer.pending);
+			atomic_dec(&apic->lapic_timer.pending);
 	}
 }
 
@@ -1107,7 +1091,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 			     MSR_IA32_APICBASE_BASE;
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
 	apic_update_ppr(apic);
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&apic->lapic_timer.timer);
 	update_divide_count(apic);
 	start_apic_timer(apic);
 }
@@ -1120,7 +1104,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 	if (!apic)
 		return;
 
-	timer = &apic->timer.dev;
+	timer = &apic->lapic_timer.timer;
 	if (hrtimer_cancel(timer))
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee..2fc0d3c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
 #define __KVM_X86_LAPIC_H
 
 #include "iodev.h"
+#include "kvm_timer.h"
 
 #include <linux/kvm_host.h>
 
 struct kvm_lapic {
 	unsigned long base_address;
 	struct kvm_io_device dev;
-	struct {
-		atomic_t pending;
-		s64 period;	/* unit: ns */
-		u32 divide_count;
-		struct hrtimer dev;
-	} timer;
+	struct kvm_timer lapic_timer;
+	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	struct page *regs_page;
 	void *regs;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 0000000..86dbac0
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+	int restart_timer = 0;
+	wait_queue_head_t *q = &vcpu->wq;
+
+	/* FIXME: this code should not know anything about vcpus */
+	if (!atomic_inc_and_test(&ktimer->pending))
+		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+	if (!ktimer->reinject)
+		atomic_set(&ktimer->pending, 1);
+
+	if (waitqueue_active(q))
+		wake_up_interruptible(q);
+
+	if (ktimer->t_ops->is_periodic(ktimer)) {
+		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+		restart_timer = 1;
+	}
+
+	return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+	int restart_timer;
+	struct kvm_vcpu *vcpu;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+	vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+	if (!vcpu)
+		return HRTIMER_NORESTART;
+
+	restart_timer = __kvm_timer_fn(vcpu, ktimer);
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
-- 
cgit v0.10.2


From 3f5e06f8799adca3e7e1bbafe1cd780a3e69f604 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.zhang@intel.com>
Date: Mon, 2 Mar 2009 22:06:41 -0500
Subject: KVM: ia64: fix compilation error in kvm_get_lowest_prio_vcpu

Modify the arg of kvm_get_lowest_prio_vcpu().
Make it consistent with its declaration.

Signed-off-by: Yang Zhang <yang.zhang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d20a5db..774f0d7 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1837,7 +1837,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap)
+				       unsigned long *bitmap)
 {
 	struct kvm_vcpu *lvcpu = kvm->vcpus[0];
 	int i;
-- 
cgit v0.10.2


From 74a3a8f152053394a016518cc2f2fee216897fa4 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 4 Mar 2009 13:33:02 +0800
Subject: KVM: Merge kvm_ioapic_get_delivery_bitmask into
 kvm_get_intr_delivery_bitmask

Gleb fixed bitmap ops usage in kvm_ioapic_get_delivery_bitmask.

Sheng merged two functions, as well as fixed several issues in
kvm_get_intr_delivery_bitmask
1. deliver_bitmask is a bitmap rather than a unsigned long intereger.
2. Lowest priority target bitmap wrong calculated by mistake.
3. Prevent potential NULL reference.
4. Declaration in include/kvm_host.h caused powerpc compilation warning.
5. Add warning for guest broadcast interrupt with lowest priority delivery mode.
6. Removed duplicate bitmap clean up in caller of kvm_get_intr_delivery_bitmask.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 7c2cb2b..ea268a8 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -161,45 +161,6 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 	kvm_vcpu_kick(vcpu);
 }
 
-void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				     u8 dest_mode, unsigned long *mask)
-{
-	int i;
-	struct kvm *kvm = ioapic->kvm;
-	struct kvm_vcpu *vcpu;
-
-	ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
-
-	*mask = 0;
-	if (dest_mode == 0) {	/* Physical mode. */
-		if (dest == 0xFF) {	/* Broadcast. */
-			for (i = 0; i < KVM_MAX_VCPUS; ++i)
-				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					*mask |= 1 << i;
-			return;
-		}
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
-				if (vcpu->arch.apic)
-					*mask = 1 << i;
-				break;
-			}
-		}
-	} else if (dest != 0)	/* Logical mode, MDA non-zero. */
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (vcpu->arch.apic &&
-			    kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
-				*mask |= 1 << vcpu->vcpu_id;
-		}
-	ioapic_debug("mask %x\n", *mask);
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
@@ -213,13 +174,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		     entry.fields.delivery_mode, entry.fields.vector,
 		     entry.fields.trig_mode);
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-
 	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
-	if (irq == 0)
+	if (irq == 0) {
+                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 		__set_bit(0, deliver_bitmask);
-	else
+        } else
 #endif
 		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7275f87..c8032ab 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				     u8 dest_mode, unsigned long *mask);
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d165e05..1c6ff6d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -47,15 +47,54 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask)
 {
+	int i;
+	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
-	kvm_ioapic_get_delivery_bitmask(ioapic, entry->fields.dest_id,
-					entry->fields.dest_mode,
-					deliver_bitmask);
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+
+	if (entry->fields.dest_mode == 0) {	/* Physical mode. */
+		if (entry->fields.dest_id == 0xFF) {	/* Broadcast. */
+			for (i = 0; i < KVM_MAX_VCPUS; ++i)
+				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
+					__set_bit(i, deliver_bitmask);
+			/* Lowest priority shouldn't combine with broadcast */
+			if (entry->fields.delivery_mode ==
+			    IOAPIC_LOWEST_PRIORITY && printk_ratelimit())
+				printk(KERN_INFO "kvm: apic: phys broadcast "
+						  "and lowest prio\n");
+			return;
+		}
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (kvm_apic_match_physical_addr(vcpu->arch.apic,
+					entry->fields.dest_id)) {
+				if (vcpu->arch.apic)
+					__set_bit(i, deliver_bitmask);
+				break;
+			}
+		}
+	} else if (entry->fields.dest_id != 0) /* Logical mode, MDA non-zero. */
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (vcpu->arch.apic &&
+			    kvm_apic_match_logical_addr(vcpu->arch.apic,
+					entry->fields.dest_id))
+				__set_bit(i, deliver_bitmask);
+		}
+
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
+		/* Select one in deliver_bitmask */
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
 				entry->fields.vector, deliver_bitmask);
+		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+		if (!vcpu)
+			return;
 		__set_bit(vcpu->vcpu_id, deliver_bitmask);
 		break;
 	case IOAPIC_FIXED:
@@ -65,7 +104,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 		if (printk_ratelimit())
 			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
 				entry->fields.delivery_mode);
-		*deliver_bitmask = 0;
+		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 	}
 }
 
@@ -80,8 +119,6 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
 	BUG_ON(!ioapic);
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
-- 
cgit v0.10.2


From f5a1e9f89504f57b2b45645a7239dc8a8ddb0f4c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 5 Mar 2009 13:12:29 +0100
Subject: KVM: MMU: remove call to kvm_mmu_pte_write from walk_addr

There is no reason to update the shadow pte here because the guest pte
is only changed to dirty state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd7020..855eb71 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -209,7 +209,6 @@ walk:
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
-- 
cgit v0.10.2


From 6da7e3f643cf7099965d75fda8606b9d3a8650b9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:44 +0200
Subject: KVM: APIC: kvm_apic_set_irq deliver all kinds of interrupts

Get rid of ioapic_inj_irq() and ioapic_inj_nmi() functions.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 4542651..5608488 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -585,7 +585,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
-static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
 #endif /* __ASSEMBLY__*/
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 774f0d7..99d6d17 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -292,13 +292,13 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm,
 {
 	switch (dm) {
 	case SAPIC_FIXED:
-		kvm_apic_set_irq(vcpu, vector, 0);
+		kvm_apic_set_irq(vcpu, vector, dm, 0);
 		break;
 	case SAPIC_NMI:
-		kvm_apic_set_irq(vcpu, 2, 0);
+		kvm_apic_set_irq(vcpu, 2, dm, 0);
 		break;
 	case SAPIC_EXTINT:
-		kvm_apic_set_irq(vcpu, 0, 0);
+		kvm_apic_set_irq(vcpu, 0, dm, 0);
 		break;
 	case SAPIC_INIT:
 	case SAPIC_PMI:
@@ -1813,7 +1813,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
 {
 
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index 6d6cbcb..cbcfaa6 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -20,6 +20,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 27ca43e..a42f968 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,30 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+			     int vector, int level, int trig_mode);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
+	int lapic_dmode;
 
-	if (!apic_test_and_set_irr(vec, apic)) {
-		/* a new pending irq is set in IRR */
-		if (trig)
-			apic_set_vector(vec, apic->regs + APIC_TMR);
-		else
-			apic_clear_vector(vec, apic->regs + APIC_TMR);
-		kvm_vcpu_kick(apic->vcpu);
-		return 1;
+	switch (dmode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		lapic_dmode = APIC_DM_LOWEST;
+		break;
+	case IOAPIC_FIXED:
+		lapic_dmode = APIC_DM_FIXED;
+		break;
+	case IOAPIC_NMI:
+		lapic_dmode = APIC_DM_NMI;
+		break;
+	default:
+		printk(KERN_DEBUG"Ignoring delivery mode %d\n", dmode);
+		return 0;
+		break;
 	}
-	return 0;
+	return __apic_accept_irq(apic, lapic_dmode, vec, 1, trig);
 }
 
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -327,7 +337,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode)
 {
-	int orig_irr, result = 0;
+	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
@@ -337,10 +347,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
-		orig_irr = apic_test_and_set_irr(vector, apic);
-		if (orig_irr && trig_mode) {
-			apic_debug("level trig mode repeatedly for vector %d",
-				   vector);
+		result = !apic_test_and_set_irr(vector, apic);
+		if (!result) {
+			if (trig_mode)
+				apic_debug("level trig mode repeatedly for "
+						"vector %d", vector);
 			break;
 		}
 
@@ -349,10 +360,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			apic_set_vector(vector, apic->regs + APIC_TMR);
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
-
 		kvm_vcpu_kick(vcpu);
-
-		result = (orig_irr == 0);
 		break;
 
 	case APIC_DM_REMRD:
@@ -364,12 +372,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_NMI:
+		result = 1;
 		kvm_inject_nmi(vcpu);
 		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_INIT:
 		if (level) {
+			result = 1;
 			if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 				printk(KERN_DEBUG
 				       "INIT on a runnable vcpu %d\n",
@@ -386,6 +396,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			result = 1;
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
 			kvm_vcpu_kick(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2fc0d3c..1b0e3c0 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,7 +31,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ea268a8..d4a7948 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,25 +142,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-static int ioapic_inj_irq(struct kvm_ioapic *ioapic,
-			   struct kvm_vcpu *vcpu,
-			   u8 vector, u8 trig_mode, u8 delivery_mode)
-{
-	ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
-		     delivery_mode);
-
-	ASSERT((delivery_mode == IOAPIC_FIXED) ||
-	       (delivery_mode == IOAPIC_LOWEST_PRIORITY));
-
-	return kvm_apic_set_irq(vcpu, vector, trig_mode);
-}
-
-static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
-{
-	kvm_inject_nmi(vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
@@ -193,21 +174,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
-			if (entry.fields.delivery_mode ==
-					IOAPIC_LOWEST_PRIORITY ||
-			    entry.fields.delivery_mode == IOAPIC_FIXED) {
-				if (r < 0)
-					r = 0;
-				r += ioapic_inj_irq(ioapic, vcpu,
-						    entry.fields.vector,
-						    entry.fields.trig_mode,
-						    entry.fields.delivery_mode);
-			} else if (entry.fields.delivery_mode == IOAPIC_NMI) {
-				r = 1;
-				ioapic_inj_nmi(vcpu);
-			} else
-				ioapic_debug("unsupported delivery mode %x!\n",
-					     entry.fields.delivery_mode);
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu,
+					entry.fields.vector,
+					entry.fields.trig_mode,
+					entry.fields.delivery_mode);
 		} else
 			ioapic_debug("null destination vcpu: "
 				     "mask=%x vector=%x delivery_mode=%x\n",
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 1c6ff6d..325c668 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -148,6 +148,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
+					      entry.fields.dest_mode,
 					      entry.fields.trig_mode);
 		}
 	}
-- 
cgit v0.10.2


From a53c17d21c46a752f5ac6695376481bc27865b04 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:49 +0200
Subject: KVM: ioapic/msi interrupt delivery consolidation

ioapic_deliver() and kvm_set_msi() have code duplication. Move
the code into ioapic_deliver_entry() function and call it from
both places.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b91ec9..ec9d078 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -364,7 +364,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
 #ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index d4a7948..b71c044 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,54 +142,57 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
 {
-	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
 	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	struct kvm_vcpu *vcpu;
-	int vcpu_id, r = -1;
+	int i, r = -1;
 
-	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
-		     "vector=%x trig_mode=%x\n",
-		     entry.fields.dest, entry.fields.dest_mode,
-		     entry.fields.delivery_mode, entry.fields.vector,
-		     entry.fields.trig_mode);
-
-	/* Always delivery PIT interrupt to vcpu 0 */
-#ifdef CONFIG_X86
-	if (irq == 0) {
-                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		__set_bit(0, deliver_bitmask);
-        } else
-#endif
-		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask);
 
 	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		ioapic_debug("no target on destination\n");
-		return 0;
+		return r;
 	}
 
-	while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
 			< KVM_MAX_VCPUS) {
-		__clear_bit(vcpu_id, deliver_bitmask);
-		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		struct kvm_vcpu *vcpu = kvm->vcpus[i];
+		__clear_bit(i, deliver_bitmask);
 		if (vcpu) {
 			if (r < 0)
 				r = 0;
-			r += kvm_apic_set_irq(vcpu,
-					entry.fields.vector,
-					entry.fields.trig_mode,
-					entry.fields.delivery_mode);
+			r += kvm_apic_set_irq(vcpu, e->fields.vector,
+					e->fields.delivery_mode,
+					e->fields.trig_mode);
 		} else
 			ioapic_debug("null destination vcpu: "
 				     "mask=%x vector=%x delivery_mode=%x\n",
-				     entry.fields.deliver_bitmask,
-				     entry.fields.vector,
-				     entry.fields.delivery_mode);
+				     e->fields.deliver_bitmask,
+				     e->fields.vector, e->fields.delivery_mode);
 	}
 	return r;
 }
 
+static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+
+	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+		     "vector=%x trig_mode=%x\n",
+		     entry.fields.dest, entry.fields.dest_mode,
+		     entry.fields.delivery_mode, entry.fields.vector,
+		     entry.fields.trig_mode);
+
+#ifdef CONFIG_X86
+	/* Always delivery PIT interrupt to vcpu 0 */
+	if (irq == 0) {
+		entry.fields.dest_mode = 0; /* Physical mode. */
+		entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
+	}
+#endif
+	return ioapic_deliver_entry(ioapic->kvm, &entry);
+}
+
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 {
 	u32 old_irr = ioapic->irr;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index c8032ab..bedeea5 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,8 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
-
+int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 325c668..35397a5 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,12 +43,11 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask)
 {
 	int i;
-	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
 	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
@@ -90,7 +89,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		/* Select one in deliver_bitmask */
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
+		vcpu = kvm_get_lowest_prio_vcpu(kvm,
 				entry->fields.vector, deliver_bitmask);
 		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 		if (!vcpu)
@@ -111,13 +110,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
-	int vcpu_id, r = -1;
-	struct kvm_vcpu *vcpu;
-	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
 	union kvm_ioapic_redirect_entry entry;
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-
-	BUG_ON(!ioapic);
 
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
@@ -133,26 +126,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 			(unsigned long *)&e->msi.data);
 
 	/* TODO Deal with RH bit of MSI message address */
-
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
-
-	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
-		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
-		return -1;
-	}
-	while ((vcpu_id = find_first_bit(deliver_bitmask,
-					KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) {
-		__clear_bit(vcpu_id, deliver_bitmask);
-		vcpu = ioapic->kvm->vcpus[vcpu_id];
-		if (vcpu) {
-			if (r < 0)
-				r = 0;
-			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
-					      entry.fields.dest_mode,
-					      entry.fields.trig_mode);
-		}
-	}
-	return r;
+	return ioapic_deliver_entry(kvm, &entry);
 }
 
 /* This should be called with the kvm->lock mutex held
-- 
cgit v0.10.2


From 343f94fe4d16ec898da77720c03da9e09f8523d2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:54 +0200
Subject: KVM: consolidate ioapic/ipi interrupt delivery logic

Use kvm_apic_match_dest() in kvm_get_intr_delivery_bitmask() instead
of duplicating the same code. Use kvm_get_intr_delivery_bitmask() in
apic_send_ipi() to figure out ipi destination instead of reimplementing
the logic.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 99d6d17..8eea9cb 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1852,6 +1852,14 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 	return lvcpu;
 }
 
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode)
+{
+	return (dest_mode == 0) ?
+		kvm_apic_match_physical_addr(target, dest) :
+		kvm_apic_match_logical_addr(target, dest);
+}
+
 static int find_highest_bits(int *dat)
 {
 	u32  bits, bitnum;
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index cbcfaa6..31602e7 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -20,6 +20,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a42f968..998862a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -260,7 +260,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
 {
-	return kvm_apic_id(apic) == dest;
+	return dest == 0xff || kvm_apic_id(apic) == dest;
 }
 
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -289,37 +289,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 	return result;
 }
 
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, int dest, int dest_mode)
 {
 	int result = 0;
 	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
-		   "dest_mode 0x%x, short_hand 0x%x",
+		   "dest_mode 0x%x, short_hand 0x%x\n",
 		   target, source, dest, dest_mode, short_hand);
 
 	ASSERT(!target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0) {
+		if (dest_mode == 0)
 			/* Physical mode. */
-			if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
-				result = 1;
-		} else
+			result = kvm_apic_match_physical_addr(target, dest);
+		else
 			/* Logical mode. */
 			result = kvm_apic_match_logical_addr(target, dest);
 		break;
 	case APIC_DEST_SELF:
-		if (target == source)
-			result = 1;
+		result = (target == source);
 		break;
 	case APIC_DEST_ALLINC:
 		result = 1;
 		break;
 	case APIC_DEST_ALLBUT:
-		if (target != source)
-			result = 1;
+		result = (target != source);
 		break;
 	default:
 		printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -492,38 +489,26 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
 	unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-	struct kvm_vcpu *target;
-	struct kvm_vcpu *vcpu;
-	DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS);
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	int i;
 
-	bitmap_zero(lpr_map, KVM_MAX_VCPUS);
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
 		   icr_high, icr_low, short_hand, dest,
 		   trig_mode, level, dest_mode, delivery_mode, vector);
 
-	for (i = 0; i < KVM_MAX_VCPUS; i++) {
-		vcpu = apic->vcpu->kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-
-		if (vcpu->arch.apic &&
-		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
-			if (delivery_mode == APIC_DM_LOWEST)
-				__set_bit(vcpu->vcpu_id, lpr_map);
-			else
-				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
-						  vector, level, trig_mode);
-		}
-	}
-
-	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
-		if (target != NULL)
-			__apic_accept_irq(target->arch.apic, delivery_mode,
-					  vector, level, trig_mode);
+	kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode,
+			delivery_mode == APIC_DM_LOWEST, short_hand,
+			deliver_bitmask);
+
+	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+			< KVM_MAX_VCPUS) {
+		struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i];
+		__clear_bit(i, deliver_bitmask);
+		if (vcpu)
+			__apic_accept_irq(vcpu->arch.apic, delivery_mode,
+					vector, level, trig_mode);
 	}
 }
 
@@ -930,16 +915,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+bool kvm_apic_present(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->arch.apic;
-	int ret = 0;
-
-	if (!apic)
-		return 0;
-	ret = apic_enabled(apic);
+	return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
+}
 
-	return ret;
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1b0e3c0..b66dc14 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -37,6 +37,8 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
+bool kvm_lapic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ec9d078..fb60f31 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -363,11 +363,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
-#ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask);
-#endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index b71c044..43969bb 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -147,7 +147,10 @@ int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
 	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	int i, r = -1;
 
-	kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id,
+			e->fields.dest_mode,
+			e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY,
+			0, deliver_bitmask);
 
 	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		ioapic_debug("no target on destination\n");
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index bedeea5..d996c7a 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -65,13 +65,15 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap);
+		unsigned long *bitmap);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask);
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
+		int dest_id, int dest_mode, bool low_prio, int short_hand,
+		unsigned long *deliver_bitmask);
 int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 35397a5..e43701c 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,67 +43,35 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask)
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
+		int dest_id, int dest_mode, bool low_prio, int short_hand,
+		unsigned long *deliver_bitmask)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 
+	if (dest_mode == 0 && dest_id == 0xff && low_prio)
+		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+
 	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+	for (i = 0; i < KVM_MAX_VCPUS; i++) {
+		vcpu = kvm->vcpus[i];
 
-	if (entry->fields.dest_mode == 0) {	/* Physical mode. */
-		if (entry->fields.dest_id == 0xFF) {	/* Broadcast. */
-			for (i = 0; i < KVM_MAX_VCPUS; ++i)
-				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					__set_bit(i, deliver_bitmask);
-			/* Lowest priority shouldn't combine with broadcast */
-			if (entry->fields.delivery_mode ==
-			    IOAPIC_LOWEST_PRIORITY && printk_ratelimit())
-				printk(KERN_INFO "kvm: apic: phys broadcast "
-						  "and lowest prio\n");
-			return;
-		}
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (kvm_apic_match_physical_addr(vcpu->arch.apic,
-					entry->fields.dest_id)) {
-				if (vcpu->arch.apic)
-					__set_bit(i, deliver_bitmask);
-				break;
-			}
-		}
-	} else if (entry->fields.dest_id != 0) /* Logical mode, MDA non-zero. */
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (vcpu->arch.apic &&
-			    kvm_apic_match_logical_addr(vcpu->arch.apic,
-					entry->fields.dest_id))
-				__set_bit(i, deliver_bitmask);
-		}
+		if (!vcpu || !kvm_apic_present(vcpu))
+			continue;
 
-	switch (entry->fields.delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		/* Select one in deliver_bitmask */
-		vcpu = kvm_get_lowest_prio_vcpu(kvm,
-				entry->fields.vector, deliver_bitmask);
-		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		if (!vcpu)
-			return;
-		__set_bit(vcpu->vcpu_id, deliver_bitmask);
-		break;
-	case IOAPIC_FIXED:
-	case IOAPIC_NMI:
-		break;
-	default:
-		if (printk_ratelimit())
-			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
-				entry->fields.delivery_mode);
+		if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id,
+					dest_mode))
+			continue;
+
+		__set_bit(i, deliver_bitmask);
+	}
+
+	if (low_prio) {
+		vcpu = kvm_get_lowest_prio_vcpu(kvm, 0, deliver_bitmask);
 		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+		if (vcpu)
+			__set_bit(vcpu->vcpu_id, deliver_bitmask);
 	}
 }
 
-- 
cgit v0.10.2


From e1035715ef8d3171e29f9c6aee6f40d57b3fead5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:59 +0200
Subject: KVM: change the way how lowest priority vcpu is calculated

The new way does not require additional loop over vcpus to calculate
the one with lowest priority as one is chosen during delivery bitmap
construction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8eea9cb..1887a93 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1836,20 +1836,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 	return 0;
 }
 
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 {
-	struct kvm_vcpu *lvcpu = kvm->vcpus[0];
-	int i;
-
-	for (i = 1; i < kvm->arch.online_vcpus; i++) {
-		if (!kvm->vcpus[i])
-			continue;
-		if (lvcpu->arch.xtp > kvm->vcpus[i]->arch.xtp)
-			lvcpu = kvm->vcpus[i];
-	}
-
-	return lvcpu;
+	return vcpu1->arch.xtp - vcpu2->arch.xtp;
 }
 
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index 31602e7..e42109e 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -22,6 +22,7 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58..4627627 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_vcpu_arch {
 	u64 shadow_efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
 	u64 ia32_misc_enable_msr;
@@ -400,7 +401,6 @@ struct kvm_arch{
 	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
 
-	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 998862a..814466f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,8 +338,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
-	case APIC_DM_FIXED:
 	case APIC_DM_LOWEST:
+		vcpu->arch.apic_arb_prio++;
+	case APIC_DM_FIXED:
 		/* FIXME add logic for vcpu on reset */
 		if (unlikely(!apic_enabled(apic)))
 			break;
@@ -416,43 +417,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	return result;
 }
 
-static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 {
-	int last;
-	int next;
-	struct kvm_lapic *apic = NULL;
-
-	last = kvm->arch.round_robin_prev_vcpu;
-	next = last;
-
-	do {
-		if (++next == KVM_MAX_VCPUS)
-			next = 0;
-		if (kvm->vcpus[next] == NULL || !test_bit(next, bitmap))
-			continue;
-		apic = kvm->vcpus[next]->arch.apic;
-		if (apic && apic_enabled(apic))
-			break;
-		apic = NULL;
-	} while (next != last);
-	kvm->arch.round_robin_prev_vcpu = next;
-
-	if (!apic)
-		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-
-	return apic;
-}
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long *bitmap)
-{
-	struct kvm_lapic *apic;
-
-	apic = kvm_apic_round_robin(kvm, vector, bitmap);
-	if (apic)
-		return apic->vcpu;
-	return NULL;
+	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
 static void apic_set_eoi(struct kvm_lapic *apic)
@@ -908,6 +875,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
 
+	vcpu->arch.apic_arb_prio = 0;
+
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
 		   vcpu, kvm_apic_id(apic),
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index d996c7a..e7bc92d 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -64,10 +64,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
 
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long *bitmap);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e43701c..f5e059b 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -47,7 +47,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
 		int dest_id, int dest_mode, bool low_prio, int short_hand,
 		unsigned long *deliver_bitmask)
 {
-	int i;
+	int i, lowest = -1;
 	struct kvm_vcpu *vcpu;
 
 	if (dest_mode == 0 && dest_id == 0xff && low_prio)
@@ -64,15 +64,18 @@ void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
 					dest_mode))
 			continue;
 
-		__set_bit(i, deliver_bitmask);
+		if (!low_prio) {
+			__set_bit(i, deliver_bitmask);
+		} else {
+			if (lowest < 0)
+				lowest = i;
+			if (kvm_apic_compare_prio(vcpu, kvm->vcpus[lowest]) < 0)
+				lowest = i;
+		}
 	}
 
-	if (low_prio) {
-		vcpu = kvm_get_lowest_prio_vcpu(kvm, 0, deliver_bitmask);
-		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		if (vcpu)
-			__set_bit(vcpu->vcpu_id, deliver_bitmask);
-	}
+	if (lowest != -1)
+		__set_bit(lowest, deliver_bitmask);
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-- 
cgit v0.10.2


From 58c2dde17d6eb6c8c0566e52d184aa16755d890f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:35:04 +0200
Subject: KVM: APIC: get rid of deliver_bitmask

Deliver interrupt during destination matching loop.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 1887a93..acf43ec 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -283,6 +283,18 @@ static int handle_sal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 }
 
+static int __apic_accept_irq(struct kvm_vcpu *vcpu, uint64_t vector)
+{
+	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+
+	if (!test_and_set_bit(vector, &vpd->irr[0])) {
+		vcpu->arch.irq_new_pending = 1;
+		kvm_vcpu_kick(vcpu);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  *  offset: address offset to IPI space.
  *  value:  deliver value.
@@ -292,20 +304,20 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm,
 {
 	switch (dm) {
 	case SAPIC_FIXED:
-		kvm_apic_set_irq(vcpu, vector, dm, 0);
 		break;
 	case SAPIC_NMI:
-		kvm_apic_set_irq(vcpu, 2, dm, 0);
+		vector = 2;
 		break;
 	case SAPIC_EXTINT:
-		kvm_apic_set_irq(vcpu, 0, dm, 0);
+		vector = 0;
 		break;
 	case SAPIC_INIT:
 	case SAPIC_PMI:
 	default:
 		printk(KERN_ERR"kvm: Unimplemented Deliver reserved IPI!\n");
-		break;
+		return;
 	}
+	__apic_accept_irq(vcpu, vector);
 }
 
 static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
@@ -1813,17 +1825,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
-
-	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
-
-	if (!test_and_set_bit(vec, &vpd->irr[0])) {
-		vcpu->arch.irq_new_pending = 1;
-		kvm_vcpu_kick(vcpu);
-		return 1;
-	}
-	return 0;
+	return __apic_accept_irq(vcpu, irq->vector);
 }
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
@@ -1844,6 +1848,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode)
 {
+	struct kvm_lapic *target = vcpu->arch.apic;
 	return (dest_mode == 0) ?
 		kvm_apic_match_physical_addr(target, dest) :
 		kvm_apic_match_logical_addr(target, dest);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index e42109e..ee541ce 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -23,7 +23,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+#define kvm_apic_present(x) (true)
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 814466f..dd934d2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -199,27 +199,12 @@ EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	int lapic_dmode;
 
-	switch (dmode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		lapic_dmode = APIC_DM_LOWEST;
-		break;
-	case IOAPIC_FIXED:
-		lapic_dmode = APIC_DM_FIXED;
-		break;
-	case IOAPIC_NMI:
-		lapic_dmode = APIC_DM_NMI;
-		break;
-	default:
-		printk(KERN_DEBUG"Ignoring delivery mode %d\n", dmode);
-		return 0;
-		break;
-	}
-	return __apic_accept_irq(apic, lapic_dmode, vec, 1, trig);
+	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+			irq->level, irq->trig_mode);
 }
 
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -447,36 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 {
 	u32 icr_low = apic_get_reg(apic, APIC_ICR);
 	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+	struct kvm_lapic_irq irq;
 
-	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
-	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
-	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
-	unsigned int level = icr_low & APIC_INT_ASSERT;
-	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
-	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
-	unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	int i;
+	irq.vector = icr_low & APIC_VECTOR_MASK;
+	irq.delivery_mode = icr_low & APIC_MODE_MASK;
+	irq.dest_mode = icr_low & APIC_DEST_MASK;
+	irq.level = icr_low & APIC_INT_ASSERT;
+	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	irq.shorthand = icr_low & APIC_SHORT_MASK;
+	irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
 
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-		   icr_high, icr_low, short_hand, dest,
-		   trig_mode, level, dest_mode, delivery_mode, vector);
-
-	kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode,
-			delivery_mode == APIC_DM_LOWEST, short_hand,
-			deliver_bitmask);
-
-	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
-			< KVM_MAX_VCPUS) {
-		struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i];
-		__clear_bit(i, deliver_bitmask);
-		if (vcpu)
-			__apic_accept_irq(vcpu->arch.apic, delivery_mode,
-					vector, level, trig_mode);
-	}
+		   icr_high, icr_low, irq.shorthand, irq.dest,
+		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+		   irq.vector);
+
+	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index b66dc14..a587f83 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,14 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
-bool kvm_lapic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index b84aca3..fb46efb 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -57,4 +57,14 @@ union kvm_ioapic_redirect_entry {
 	} fields;
 };
 
+struct kvm_lapic_irq {
+	u32 vector;
+	u32 delivery_mode;
+	u32 dest_mode;
+	u32 level;
+	u32 trig_mode;
+	u32 shorthand;
+	u32 dest_id;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 43969bb..1eddae9 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,58 +142,33 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
-{
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	int i, r = -1;
-
-	kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id,
-			e->fields.dest_mode,
-			e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY,
-			0, deliver_bitmask);
-
-	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
-		ioapic_debug("no target on destination\n");
-		return r;
-	}
-
-	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
-			< KVM_MAX_VCPUS) {
-		struct kvm_vcpu *vcpu = kvm->vcpus[i];
-		__clear_bit(i, deliver_bitmask);
-		if (vcpu) {
-			if (r < 0)
-				r = 0;
-			r += kvm_apic_set_irq(vcpu, e->fields.vector,
-					e->fields.delivery_mode,
-					e->fields.trig_mode);
-		} else
-			ioapic_debug("null destination vcpu: "
-				     "mask=%x vector=%x delivery_mode=%x\n",
-				     e->fields.deliver_bitmask,
-				     e->fields.vector, e->fields.delivery_mode);
-	}
-	return r;
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
-	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+	struct kvm_lapic_irq irqe;
 
 	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
 		     "vector=%x trig_mode=%x\n",
-		     entry.fields.dest, entry.fields.dest_mode,
-		     entry.fields.delivery_mode, entry.fields.vector,
-		     entry.fields.trig_mode);
+		     entry->fields.dest, entry->fields.dest_mode,
+		     entry->fields.delivery_mode, entry->fields.vector,
+		     entry->fields.trig_mode);
+
+	irqe.dest_id = entry->fields.dest_id;
+	irqe.vector = entry->fields.vector;
+	irqe.dest_mode = entry->fields.dest_mode;
+	irqe.trig_mode = entry->fields.trig_mode;
+	irqe.delivery_mode = entry->fields.delivery_mode << 8;
+	irqe.level = 1;
+	irqe.shorthand = 0;
 
 #ifdef CONFIG_X86
 	/* Always delivery PIT interrupt to vcpu 0 */
 	if (irq == 0) {
-		entry.fields.dest_mode = 0; /* Physical mode. */
-		entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
+		irqe.dest_mode = 0; /* Physical mode. */
+		irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
 	}
 #endif
-	return ioapic_deliver_entry(ioapic->kvm, &entry);
+	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
 }
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index e7bc92d..7080b71 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,8 +71,6 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
-		int dest_id, int dest_mode, bool low_prio, int short_hand,
-		unsigned long *deliver_bitmask);
-int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index f5e059b..4fa1f60 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -22,6 +22,9 @@
 #include <linux/kvm_host.h>
 
 #include <asm/msidef.h>
+#ifdef CONFIG_IA64
+#include <asm/iosapic.h>
+#endif
 
 #include "irq.h"
 
@@ -43,61 +46,71 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
-		int dest_id, int dest_mode, bool low_prio, int short_hand,
-		unsigned long *deliver_bitmask)
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
 {
-	int i, lowest = -1;
-	struct kvm_vcpu *vcpu;
+#ifdef CONFIG_IA64
+	return irq->delivery_mode ==
+		(IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
+#else
+	return irq->delivery_mode == APIC_DM_LOWEST;
+#endif
+}
 
-	if (dest_mode == 0 && dest_id == 0xff && low_prio)
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq)
+{
+	int i, r = -1;
+	struct kvm_vcpu *vcpu, *lowest = NULL;
+
+	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+			kvm_is_dm_lowest_prio(irq))
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 	for (i = 0; i < KVM_MAX_VCPUS; i++) {
 		vcpu = kvm->vcpus[i];
 
 		if (!vcpu || !kvm_apic_present(vcpu))
 			continue;
 
-		if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id,
-					dest_mode))
+		if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+					irq->dest_id, irq->dest_mode))
 			continue;
 
-		if (!low_prio) {
-			__set_bit(i, deliver_bitmask);
+		if (!kvm_is_dm_lowest_prio(irq)) {
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu, irq);
 		} else {
-			if (lowest < 0)
-				lowest = i;
-			if (kvm_apic_compare_prio(vcpu, kvm->vcpus[lowest]) < 0)
-				lowest = i;
+			if (!lowest)
+				lowest = vcpu;
+			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+				lowest = vcpu;
 		}
 	}
 
-	if (lowest != -1)
-		__set_bit(lowest, deliver_bitmask);
+	if (lowest)
+		r = kvm_apic_set_irq(lowest, irq);
+
+	return r;
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
-	union kvm_ioapic_redirect_entry entry;
+	struct kvm_lapic_irq irq;
 
-	entry.bits = 0;
-	entry.fields.dest_id = (e->msi.address_lo &
+	irq.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
-	entry.fields.vector = (e->msi.data &
+	irq.vector = (e->msi.data &
 			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
-	entry.fields.dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
-			(unsigned long *)&e->msi.address_lo);
-	entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
-			(unsigned long *)&e->msi.data);
-	entry.fields.delivery_mode = test_bit(
-			MSI_DATA_DELIVERY_MODE_SHIFT,
-			(unsigned long *)&e->msi.data);
+	irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+	irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+	irq.delivery_mode = e->msi.data & 0x700;
+	irq.level = 1;
+	irq.shorthand = 0;
 
 	/* TODO Deal with RH bit of MSI message address */
-	return ioapic_deliver_entry(kvm, &entry);
+	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
 /* This should be called with the kvm->lock mutex held
-- 
cgit v0.10.2


From 0c72ea7fb8a39b4bba071b19f5f835af5b5e538a Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:52 -0600
Subject: KVM: ia64: Map in SN2 RTC registers to the VMM module

On SN2, map in the SN2 RTC registers to the VMM module, needed for ITC
emulation.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 5608488..1243995 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -371,6 +371,7 @@ struct kvm_vcpu_arch {
 	int last_run_cpu;
 	int vmm_tr_slot;
 	int vm_tr_slot;
+	int sn_rtc_tr_slot;
 
 #define KVM_MP_STATE_RUNNABLE          0
 #define KVM_MP_STATE_UNINITIALIZED     1
@@ -465,6 +466,7 @@ struct kvm_arch {
 	unsigned long	vmm_init_rr;
 
 	int		online_vcpus;
+	int		is_sn2;
 
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7a9bff4..0a9cc73 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -146,6 +146,8 @@
 #define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
 #define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
+#define PAGE_KERNEL_UC	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX | \
+				 _PAGE_MA_UC)
 
 # ifndef __ASSEMBLY__
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index acf43ec..14a3fab 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -41,6 +41,9 @@
 #include <asm/div64.h>
 #include <asm/tlb.h>
 #include <asm/elf.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/shub_mmr.h>
 
 #include "misc.h"
 #include "vti.h"
@@ -119,8 +122,7 @@ void kvm_arch_hardware_enable(void *garbage)
 	unsigned long saved_psr;
 	int slot;
 
-	pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base),
-				PAGE_KERNEL));
+	pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base), PAGE_KERNEL));
 	local_irq_save(saved_psr);
 	slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
 	local_irq_restore(saved_psr);
@@ -425,6 +427,23 @@ static int handle_switch_rr6(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int kvm_sn2_setup_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned long pte, rtc_phys_addr, map_addr;
+	int slot;
+
+	map_addr = KVM_VMM_BASE + (1UL << KVM_VMM_SHIFT);
+	rtc_phys_addr = LOCAL_MMR_OFFSET | SH_RTC;
+	pte = pte_val(mk_pte_phys(rtc_phys_addr, PAGE_KERNEL_UC));
+	slot = ia64_itr_entry(0x3, map_addr, pte, PAGE_SHIFT);
+	vcpu->arch.sn_rtc_tr_slot = slot;
+	if (slot < 0) {
+		printk(KERN_ERR "Mayday mayday! RTC mapping failed!\n");
+		slot = 0;
+	}
+	return slot;
+}
+
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
 
@@ -563,18 +582,29 @@ static int kvm_insert_vmm_mapping(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out;
 	vcpu->arch.vm_tr_slot = r;
+
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (kvm->arch.is_sn2) {
+		r = kvm_sn2_setup_mappings(vcpu);
+		if (r < 0)
+			goto out;
+	}
+#endif
+
 	r = 0;
 out:
 	return r;
-
 }
 
 static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu)
 {
-
+	struct kvm *kvm = vcpu->kvm;
 	ia64_ptr_entry(0x3, vcpu->arch.vmm_tr_slot);
 	ia64_ptr_entry(0x3, vcpu->arch.vm_tr_slot);
-
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (kvm->arch.is_sn2)
+		ia64_ptr_entry(0x3, vcpu->arch.sn_rtc_tr_slot);
+#endif
 }
 
 static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
@@ -800,6 +830,9 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	if (IS_ERR(kvm))
 		return ERR_PTR(-ENOMEM);
+
+	kvm->arch.is_sn2 = ia64_platform_is("sn2");
+
 	kvm_init_vm(kvm);
 
 	kvm->arch.online_vcpus = 0;
-- 
cgit v0.10.2


From c6c9fcdf0fff7da77c088be11e3c4d0181b36941 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:53 -0600
Subject: KVM: ia64: Create inline function kvm_get_itc() to centralize ITC
 reading.

Move all reading of special register 'AR_ITC' into two functions, one
in the kernel and one in the VMM module. When running on SN2, base the
result on the RTC rather the system ITC, as the ITC isn't
synchronized.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 14a3fab..ebbd995 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -68,6 +68,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+static unsigned long kvm_get_itc(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (vcpu->kvm->arch.is_sn2)
+		return rtc_time();
+	else
+#endif
+		return ia64_getreg(_IA64_REG_AR_ITC);
+}
+
 static void kvm_flush_icache(unsigned long start, unsigned long len)
 {
 	int l;
@@ -457,7 +467,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 
 	if (irqchip_in_kernel(vcpu->kvm)) {
 
-		vcpu_now_itc = ia64_getreg(_IA64_REG_AR_ITC) + vcpu->arch.itc_offset;
+		vcpu_now_itc = kvm_get_itc(vcpu) + vcpu->arch.itc_offset;
 
 		if (time_after(vcpu_now_itc, vpd->itm)) {
 			vcpu->arch.timer_check = 1;
@@ -929,7 +939,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	RESTORE_REGS(saved_gp);
 
 	vcpu->arch.irq_new_pending = 1;
-	vcpu->arch.itc_offset = regs->saved_itc - ia64_getreg(_IA64_REG_AR_ITC);
+	vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
 	vcpu_put(vcpu);
@@ -1210,7 +1220,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		regs->cr_iip = PALE_RESET_ENTRY;
 
 		/*Initialize itc offset for vcpus*/
-		itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
+		itc_offset = 0UL - kvm_get_itc(vcpu);
 		for (i = 0; i < kvm->arch.online_vcpus; i++) {
 			v = (struct kvm_vcpu *)((char *)vcpu +
 					sizeof(struct kvm_vcpu_data) * i);
@@ -1472,7 +1482,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	}
 	for (i = 0; i < 4; i++)
 		regs->insvc[i] = vcpu->arch.insvc[i];
-	regs->saved_itc = vcpu->arch.itc_offset + ia64_getreg(_IA64_REG_AR_ITC);
+	regs->saved_itc = vcpu->arch.itc_offset + kvm_get_itc(vcpu);
 	SAVE_REGS(xtp);
 	SAVE_REGS(metaphysical_rr0);
 	SAVE_REGS(metaphysical_rr4);
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index a18ee17..a2c6c15 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -788,13 +788,29 @@ void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
 		setfpreg(reg, val, regs);   /* FIXME: handle NATs later*/
 }
 
+/*
+ * The Altix RTC is mapped specially here for the vmm module
+ */
+#define SN_RTC_BASE	(u64 *)(KVM_VMM_BASE+(1UL<<KVM_VMM_SHIFT))
+static long kvm_get_itc(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	struct kvm *kvm = (struct kvm *)KVM_VM_BASE;
+
+	if (kvm->arch.is_sn2)
+		return (*SN_RTC_BASE);
+	else
+#endif
+		return ia64_getreg(_IA64_REG_AR_ITC);
+}
+
 /************************************************************************
  * lsapic timer
  ***********************************************************************/
 u64 vcpu_get_itc(struct kvm_vcpu *vcpu)
 {
 	unsigned long guest_itc;
-	guest_itc = VMX(vcpu, itc_offset) + ia64_getreg(_IA64_REG_AR_ITC);
+	guest_itc = VMX(vcpu, itc_offset) + kvm_get_itc(vcpu);
 
 	if (guest_itc >= VMX(vcpu, last_itc)) {
 		VMX(vcpu, last_itc) = guest_itc;
@@ -809,7 +825,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 	struct kvm_vcpu *v;
 	struct kvm *kvm;
 	int i;
-	long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC);
+	long itc_offset = val - kvm_get_itc(vcpu);
 	unsigned long vitv = VCPU(vcpu, itv);
 
 	kvm = (struct kvm *)KVM_VM_BASE;
-- 
cgit v0.10.2


From ce17c643738bebcacf8d19d8cab7dd3eb96f9f32 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:54 -0600
Subject: KVM: ia64: SN2 adjust emulated ITC frequency to match RTC frequency

On SN2 do not pass down the real ITC frequency, but rather patch the
values to match the SN2 RTC frequency.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
index a8ae52e..e4b8231 100644
--- a/arch/ia64/kvm/kvm_fw.c
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -21,6 +21,9 @@
 
 #include <linux/kvm_host.h>
 #include <linux/smp.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/shub_mmr.h>
 
 #include "vti.h"
 #include "misc.h"
@@ -188,12 +191,35 @@ static struct ia64_pal_retval pal_freq_base(struct kvm_vcpu *vcpu)
 	return result;
 }
 
-static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+/*
+ * On the SGI SN2, the ITC isn't stable. Emulation backed by the SN2
+ * RTC is used instead. This function patches the ratios from SAL
+ * to match the RTC before providing them to the guest.
+ */
+static void sn2_patch_itc_freq_ratios(struct ia64_pal_retval *result)
 {
+	struct pal_freq_ratio *ratio;
+	unsigned long sal_freq, sal_drift, factor;
+
+	result->status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
+					    &sal_freq, &sal_drift);
+	ratio = (struct pal_freq_ratio *)&result->v2;
+	factor = ((sal_freq * 3) + (sn_rtc_cycles_per_second / 2)) /
+		sn_rtc_cycles_per_second;
+
+	ratio->num = 3;
+	ratio->den = factor;
+}
 
+static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+{
 	struct ia64_pal_retval result;
 
 	PAL_CALL(result, PAL_FREQ_RATIOS, 0, 0, 0);
+
+	if (vcpu->kvm->arch.is_sn2)
+		sn2_patch_itc_freq_ratios(&result);
+
 	return result;
 }
 
-- 
cgit v0.10.2


From 0b5d7a2ccb98f0403b1969295429724af9dc9d8e Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:55 -0600
Subject: KVM: ia64: Drop in SN2 replacement of fast path ITC emulation fault
 handler

Copy in SN2 RTC based ITC emulation for fast exit. The two versions
have the same size, so a dropin is simpler than patching the branch
instruction to hit the SN2 version.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 1243995..589536f 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -580,6 +580,8 @@ struct kvm_vmm_info{
 	kvm_vmm_entry 	*vmm_entry;
 	kvm_tramp_entry *tramp_entry;
 	unsigned long 	vmm_ivt;
+	unsigned long	patch_mov_ar;
+	unsigned long	patch_mov_ar_sn2;
 };
 
 int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ebbd995..4623a90 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1671,8 +1671,37 @@ out:
 	return 0;
 }
 
+
+/*
+ * On SN2, the ITC isn't stable, so copy in fast path code to use the
+ * SN2 RTC, replacing the ITC based default verion.
+ */
+static void kvm_patch_vmm(struct kvm_vmm_info *vmm_info,
+			  struct module *module)
+{
+	unsigned long new_ar, new_ar_sn2;
+	unsigned long module_base;
+
+	if (!ia64_platform_is("sn2"))
+		return;
+
+	module_base = (unsigned long)module->module_core;
+
+	new_ar = kvm_vmm_base + vmm_info->patch_mov_ar - module_base;
+	new_ar_sn2 = kvm_vmm_base + vmm_info->patch_mov_ar_sn2 - module_base;
+
+	printk(KERN_INFO "kvm: Patching ITC emulation to use SGI SN2 RTC "
+	       "as source\n");
+
+	/*
+	 * Copy the SN2 version of mov_ar into place. They are both
+	 * the same size, so 6 bundles is sufficient (6 * 0x10).
+	 */
+	memcpy((void *)new_ar, (void *)new_ar_sn2, 0x60);
+}
+
 static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
-						struct module *module)
+			    struct module *module)
 {
 	unsigned long module_base;
 	unsigned long vmm_size;
@@ -1694,6 +1723,7 @@ static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
 		return -EFAULT;
 
 	memcpy((void *)kvm_vmm_base, (void *)module_base, vmm_size);
+	kvm_patch_vmm(vmm_info, module);
 	kvm_flush_icache(kvm_vmm_base, vmm_size);
 
 	/*Recalculate kvm_vmm_info based on new VMM*/
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index 32254ce..f793be3 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -11,6 +11,7 @@
 
 #include <asm/asmmacro.h>
 #include <asm/processor.h>
+#include <asm/kvm_host.h>
 
 #include "vti.h"
 #include "asm-offsets.h"
@@ -140,6 +141,35 @@ GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 	;;
 END(kvm_asm_mov_from_ar)
 
+/*
+ * Special SGI SN2 optimized version of mov_from_ar using the SN2 RTC
+ * clock as it's source for emulating the ITC. This version will be
+ * copied on top of the original version if the host is determined to
+ * be an SN2.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_from_ar_sn2)
+	add r18=VMM_VCPU_ITC_OFS_OFFSET, r21
+	movl r19 = (KVM_VMM_BASE+(1<<KVM_VMM_SHIFT))
+
+	add r16=VMM_VCPU_LAST_ITC_OFFSET,r21
+	extr.u r17=r25,6,7
+	mov r24=b0
+	;;
+	ld8 r18=[r18]
+	ld8 r19=[r19]
+	addl r20=@gprel(asm_mov_to_reg),gp
+	;;
+	add r19=r19,r18
+	shladd r17=r17,4,r20
+	;;
+	adds r30=kvm_resume_to_guest-asm_mov_to_reg,r20
+	st8 [r16] = r19
+	mov b0=r17
+	br.sptk.few b0
+	;;
+END(kvm_asm_mov_from_ar_sn2)
+
+
 
 // mov r1=rr[r3]
 GLOBAL_ENTRY(kvm_asm_mov_from_rr)
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 9eee5c0..f4b4c89 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -30,15 +30,19 @@ MODULE_AUTHOR("Intel");
 MODULE_LICENSE("GPL");
 
 extern char kvm_ia64_ivt;
+extern char kvm_asm_mov_from_ar;
+extern char kvm_asm_mov_from_ar_sn2;
 extern fpswa_interface_t *vmm_fpswa_interface;
 
 long vmm_sanity = 1;
 
 struct kvm_vmm_info vmm_info = {
-	.module	     = THIS_MODULE,
-	.vmm_entry   = vmm_entry,
-	.tramp_entry = vmm_trampoline,
-	.vmm_ivt     = (unsigned long)&kvm_ia64_ivt,
+	.module			= THIS_MODULE,
+	.vmm_entry		= vmm_entry,
+	.tramp_entry		= vmm_trampoline,
+	.vmm_ivt		= (unsigned long)&kvm_ia64_ivt,
+	.patch_mov_ar		= (unsigned long)&kvm_asm_mov_from_ar,
+	.patch_mov_ar_sn2	= (unsigned long)&kvm_asm_mov_from_ar_sn2,
 };
 
 static int __init  kvm_vmm_init(void)
-- 
cgit v0.10.2


From 386eb6e8b3caface8a0514da70a47c05cabb5b96 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Mar 2009 22:51:09 +0100
Subject: KVM: make 'lapic_timer_ops' and 'kpit_ops' static

Fix this sparse warnings:
  arch/x86/kvm/lapic.c:916:22: warning: symbol 'lapic_timer_ops' was not declared. Should it be static?
  arch/x86/kvm/i8254.c:268:22: warning: symbol 'kpit_ops' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4e2e3f2..cf09bb6 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -265,7 +265,7 @@ static bool kpit_is_periodic(struct kvm_timer *ktimer)
 	return ps->is_periodic;
 }
 
-struct kvm_timer_ops kpit_ops = {
+static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dd934d2..4d76bb6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -913,7 +913,7 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
-struct kvm_timer_ops lapic_timer_ops = {
+static struct kvm_timer_ops lapic_timer_ops = {
 	.is_periodic = lapic_is_periodic,
 };
 
-- 
cgit v0.10.2


From e56d532f20c890a06bbe7cd479f4201e3a03cd73 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 12 Mar 2009 21:45:39 +0800
Subject: KVM: Device assignment framework rework

After discussion with Marcelo, we decided to rework device assignment framework
together. The old problems are kernel logic is unnecessary complex. So Marcelo
suggest to split it into a more elegant way:

1. Split host IRQ assign and guest IRQ assign. And userspace determine the
combination. Also discard msi2intx parameter, userspace can specific
KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_INTX in assigned_irq->flags to
enable MSI to INTx convertion.

2. Split assign IRQ and deassign IRQ. Import two new ioctls:
KVM_ASSIGN_DEV_IRQ and KVM_DEASSIGN_DEV_IRQ.

This patch also fixed the reversed _IOR vs _IOW in definition(by deprecated the
old interface).

[avi: replace homemade bitcount() by hweight_long()]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43e049a..41123fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1026,6 +1026,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
+	case KVM_CAP_ASSIGN_DEV_IRQ:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 640835e..644e3a9 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -412,6 +412,7 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_MSIX
 #define KVM_CAP_DEVICE_MSIX 28
 #endif
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
 
@@ -485,8 +486,10 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
 				   struct kvm_assigned_pci_dev)
 #define KVM_SET_GSI_ROUTING       _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
 			    struct kvm_assigned_irq)
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
 #define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
 #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
 				     struct kvm_assigned_pci_dev)
@@ -494,6 +497,7 @@ struct kvm_irq_routing {
 			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 
 /*
  * ioctls for vcpu fds
@@ -584,6 +588,8 @@ struct kvm_debug_guest {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
 	__u32 busnr;
@@ -594,6 +600,17 @@ struct kvm_assigned_pci_dev {
 	};
 };
 
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK   0xff00
+
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
 	__u32 host_irq;
@@ -609,15 +626,6 @@ struct kvm_assigned_irq {
 	};
 };
 
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-
-#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
-
-#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION  (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\
-					KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX  (1 << 1)
-#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX    (1 << 2)
 
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb60f31..40e49ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -339,11 +339,6 @@ struct kvm_assigned_dev_kernel {
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	struct kvm_guest_msix_entry *guest_msix_entries;
-#define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
-#define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
-#define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
-#define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
-#define KVM_ASSIGNED_DEV_MSIX		((1 << 2) | (1 << 10))
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3bed827..792fb7f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -41,6 +41,7 @@
 #include <linux/pagemap.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/bitops.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -60,9 +61,6 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int msi2intx = 1;
-module_param(msi2intx, bool, 0);
-
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
@@ -132,7 +130,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&kvm->lock);
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		struct kvm_guest_msix_entry *guest_entries =
 			assigned_dev->guest_msix_entries;
 		for (i = 0; i < assigned_dev->entries_nr; i++) {
@@ -152,7 +150,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
 			    assigned_dev->guest_irq, 1);
 		if (assigned_dev->irq_requested_type &
-				KVM_ASSIGNED_DEV_GUEST_MSI) {
+				KVM_DEV_IRQ_GUEST_MSI) {
 			enable_irq(assigned_dev->host_irq);
 			assigned_dev->host_irq_disabled = false;
 		}
@@ -166,7 +164,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
-	if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int index = find_index_from_host_irq(assigned_dev, irq);
 		if (index < 0)
 			return IRQ_HANDLED;
@@ -204,22 +202,22 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	}
 }
 
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void kvm_free_assigned_irq(struct kvm *kvm,
-				  struct kvm_assigned_dev_kernel *assigned_dev)
+static void deassign_guest_irq(struct kvm *kvm,
+			       struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	if (!irqchip_in_kernel(kvm))
-		return;
-
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
+	assigned_dev->ack_notifier.gsi = -1;
 
 	if (assigned_dev->irq_source_id != -1)
 		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 	assigned_dev->irq_source_id = -1;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
 
-	if (!assigned_dev->irq_requested_type)
-		return;
-
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+			      struct kvm_assigned_dev_kernel *assigned_dev)
+{
 	/*
 	 * In kvm_free_device_irq, cancel_work_sync return true if:
 	 * 1. work is scheduled, and then cancelled.
@@ -236,7 +234,7 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	 * now, the kvm state is still legal for probably we also have to wait
 	 * interrupt_work done.
 	 */
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int i;
 		for (i = 0; i < assigned_dev->entries_nr; i++)
 			disable_irq_nosync(assigned_dev->
@@ -259,14 +257,41 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
-		if (assigned_dev->irq_requested_type &
-				KVM_ASSIGNED_DEV_HOST_MSI)
+		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
 			pci_disable_msi(assigned_dev->dev);
 	}
 
-	assigned_dev->irq_requested_type = 0;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *assigned_dev,
+			    unsigned long irq_requested_type)
+{
+	unsigned long guest_irq_type, host_irq_type;
+
+	if (!irqchip_in_kernel(kvm))
+		return -EINVAL;
+	/* no irq assignment to deassign */
+	if (!assigned_dev->irq_requested_type)
+		return -ENXIO;
+
+	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+	if (host_irq_type)
+		deassign_host_irq(kvm, assigned_dev);
+	if (guest_irq_type)
+		deassign_guest_irq(kvm, assigned_dev);
+
+	return 0;
 }
 
+static void kvm_free_assigned_irq(struct kvm *kvm,
+				  struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
 
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
@@ -298,256 +323,244 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 	}
 }
 
-static int assigned_device_update_intx(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
 {
-	adev->guest_irq = airq->guest_irq;
-	adev->ack_notifier.gsi = airq->guest_irq;
-
-	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
-		return 0;
-
-	if (irqchip_in_kernel(kvm)) {
-		if (!msi2intx &&
-		    (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) {
-			free_irq(adev->host_irq, (void *)adev);
-			pci_disable_msi(adev->dev);
-		}
+	dev->host_irq = dev->dev->irq;
+	/* Even though this is PCI, we don't want to use shared
+	 * interrupts. Sharing host devices with guest-assigned devices
+	 * on the same interrupt line is not a happy situation: there
+	 * are going to be long delays in accepting, acking, etc.
+	 */
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+			0, "kvm_assigned_intx_device", (void *)dev))
+		return -EIO;
+	return 0;
+}
 
-		if (!capable(CAP_SYS_RAWIO))
-			return -EPERM;
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+					   struct kvm_assigned_dev_kernel *dev)
+{
+	int r;
 
-		if (airq->host_irq)
-			adev->host_irq = airq->host_irq;
-		else
-			adev->host_irq = adev->dev->irq;
+	if (!dev->dev->msi_enabled) {
+		r = pci_enable_msi(dev->dev);
+		if (r)
+			return r;
+	}
 
-		/* Even though this is PCI, we don't want to use shared
-		 * interrupts. Sharing host devices with guest-assigned devices
-		 * on the same interrupt line is not a happy situation: there
-		 * are going to be long delays in accepting, acking, etc.
-		 */
-		if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
-				0, "kvm_assigned_intx_device", (void *)adev))
-			return -EIO;
+	dev->host_irq = dev->dev->irq;
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+			"kvm_assigned_msi_device", (void *)dev)) {
+		pci_disable_msi(dev->dev);
+		return -EIO;
 	}
 
-	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
-				   KVM_ASSIGNED_DEV_HOST_INTX;
 	return 0;
 }
+#endif
 
-#ifdef CONFIG_X86
-static int assigned_device_update_msi(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
 {
-	int r;
+	int i, r = -EINVAL;
 
-	adev->guest_irq = airq->guest_irq;
-	if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
-		/* x86 don't care upper address of guest msi message addr */
-		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
-		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
-		adev->ack_notifier.gsi = -1;
-	} else if (msi2intx) {
-		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
-		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
-		adev->ack_notifier.gsi = airq->guest_irq;
-	} else {
-		/*
-		 * Guest require to disable device MSI, we disable MSI and
-		 * re-enable INTx by default again. Notice it's only for
-		 * non-msi2intx.
-		 */
-		assigned_device_update_intx(kvm, adev, airq);
-		return 0;
-	}
+	/* host_msix_entries and guest_msix_entries should have been
+	 * initialized */
+	if (dev->entries_nr == 0)
+		return r;
 
-	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
-		return 0;
+	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+	if (r)
+		return r;
 
-	if (irqchip_in_kernel(kvm)) {
-		if (!msi2intx) {
-			if (adev->irq_requested_type &
-					KVM_ASSIGNED_DEV_HOST_INTX)
-				free_irq(adev->host_irq, (void *)adev);
+	for (i = 0; i < dev->entries_nr; i++) {
+		r = request_irq(dev->host_msix_entries[i].vector,
+				kvm_assigned_dev_intr, 0,
+				"kvm_assigned_msix_device",
+				(void *)dev);
+		/* FIXME: free requested_irq's on failure */
+		if (r)
+			return r;
+	}
 
-			r = pci_enable_msi(adev->dev);
-			if (r)
-				return r;
-		}
+	return 0;
+}
 
-		adev->host_irq = adev->dev->irq;
-		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
-				"kvm_assigned_msi_device", (void *)adev))
-			return -EIO;
-	}
+#endif
 
-	if (!msi2intx)
-		adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *dev,
+				struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = irq->guest_irq;
+	return 0;
+}
 
-	adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
 	return 0;
 }
 #endif
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+			   struct kvm_assigned_dev_kernel *dev,
+			   __u32 host_irq_type)
+{
+	int r = -EEXIST;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+		return r;
 
+	switch (host_irq_type) {
+	case KVM_DEV_IRQ_HOST_INTX:
+		r = assigned_device_enable_host_intx(kvm, dev);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_HOST_MSI:
+		r = assigned_device_enable_host_msi(kvm, dev);
+		break;
+#endif
 #ifdef __KVM_HAVE_MSIX
-static int assigned_device_update_msix(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
-{
-	/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-	int i, r;
-
-	adev->ack_notifier.gsi = -1;
-
-	if (irqchip_in_kernel(kvm)) {
-		if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
-			return -ENOTTY;
-
-		if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) {
-			/* Guest disable MSI-X */
-			kvm_free_assigned_irq(kvm, adev);
-			if (msi2intx) {
-				pci_enable_msi(adev->dev);
-				if (adev->dev->msi_enabled)
-					return assigned_device_update_msi(kvm,
-							adev, airq);
-			}
-			return assigned_device_update_intx(kvm, adev, airq);
-		}
+	case KVM_DEV_IRQ_HOST_MSIX:
+		r = assigned_device_enable_host_msix(kvm, dev);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+	}
 
-		/* host_msix_entries and guest_msix_entries should have been
-		 * initialized */
-		if (adev->entries_nr == 0)
-			return -EINVAL;
+	if (!r)
+		dev->irq_requested_type |= host_irq_type;
 
-		kvm_free_assigned_irq(kvm, adev);
+	return r;
+}
 
-		r = pci_enable_msix(adev->dev, adev->host_msix_entries,
-				    adev->entries_nr);
-		if (r)
-			return r;
+static int assign_guest_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *dev,
+			    struct kvm_assigned_irq *irq,
+			    unsigned long guest_irq_type)
+{
+	int id;
+	int r = -EEXIST;
 
-		for (i = 0; i < adev->entries_nr; i++) {
-			r = request_irq((adev->host_msix_entries + i)->vector,
-					kvm_assigned_dev_intr, 0,
-					"kvm_assigned_msix_device",
-					(void *)adev);
-			if (r)
-				return r;
-		}
+	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+		return r;
+
+	id = kvm_request_irq_source_id(kvm);
+	if (id < 0)
+		return id;
+
+	dev->irq_source_id = id;
+
+	switch (guest_irq_type) {
+	case KVM_DEV_IRQ_GUEST_INTX:
+		r = assigned_device_enable_guest_intx(kvm, dev, irq);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_GUEST_MSI:
+		r = assigned_device_enable_guest_msi(kvm, dev, irq);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_GUEST_MSIX:
+		r = assigned_device_enable_guest_msix(kvm, dev, irq);
+		break;
+#endif
+	default:
+		r = -EINVAL;
 	}
 
-	adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX;
+	if (!r) {
+		dev->irq_requested_type |= guest_irq_type;
+		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+	} else
+		kvm_free_irq_source_id(kvm, dev->irq_source_id);
 
-	return 0;
+	return r;
 }
-#endif
 
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq
-				   *assigned_irq)
+				   struct kvm_assigned_irq *assigned_irq)
 {
-	int r = 0;
+	int r = -EINVAL;
 	struct kvm_assigned_dev_kernel *match;
-	u32 current_flags = 0, changed_flags;
+	unsigned long host_irq_type, guest_irq_type;
 
-	mutex_lock(&kvm->lock);
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
 
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	r = -ENODEV;
 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
 				      assigned_irq->assigned_dev_id);
-	if (!match) {
-		mutex_unlock(&kvm->lock);
-		return -EINVAL;
-	}
-
-	if (!match->irq_requested_type) {
-		INIT_WORK(&match->interrupt_work,
-				kvm_assigned_dev_interrupt_work_handler);
-		if (irqchip_in_kernel(kvm)) {
-			/* Register ack nofitier */
-			match->ack_notifier.gsi = -1;
-			match->ack_notifier.irq_acked =
-					kvm_assigned_dev_ack_irq;
-			kvm_register_irq_ack_notifier(kvm,
-					&match->ack_notifier);
-
-			/* Request IRQ source ID */
-			r = kvm_request_irq_source_id(kvm);
-			if (r < 0)
-				goto out_release;
-			else
-				match->irq_source_id = r;
-
-#ifdef CONFIG_X86
-			/* Determine host device irq type, we can know the
-			 * result from dev->msi_enabled */
-			if (msi2intx)
-				pci_enable_msi(match->dev);
-#endif
-		}
-	}
+	if (!match)
+		goto out;
 
-	if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX)
-		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX;
-	else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
-		 (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
-		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;
+	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
 
-	changed_flags = assigned_irq->flags ^ current_flags;
+	r = -EINVAL;
+	/* can only assign one type at a time */
+	if (hweight_long(host_irq_type) > 1)
+		goto out;
+	if (hweight_long(guest_irq_type) > 1)
+		goto out;
+	if (host_irq_type == 0 && guest_irq_type == 0)
+		goto out;
 
-#ifdef __KVM_HAVE_MSIX
-	if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) {
-		r = assigned_device_update_msix(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to execute "
-					"MSI-X action!\n");
-			goto out_release;
-		}
-	} else
-#endif
-	if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
-	    (msi2intx && match->dev->msi_enabled)) {
-#ifdef CONFIG_X86
-		r = assigned_device_update_msi(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to enable "
-					"MSI device!\n");
-			goto out_release;
-		}
-#else
-		r = -ENOTTY;
-#endif
-	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
-		/* Host device IRQ 0 means don't support INTx */
-		if (!msi2intx) {
-			printk(KERN_WARNING
-			       "kvm: wait device to enable MSI!\n");
-			r = 0;
-		} else {
-			printk(KERN_WARNING
-			       "kvm: failed to enable MSI device!\n");
-			r = -ENOTTY;
-			goto out_release;
-		}
-	} else {
-		/* Non-sharing INTx mode */
-		r = assigned_device_update_intx(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to enable "
-					"INTx device!\n");
-			goto out_release;
-		}
-	}
+	r = 0;
+	if (host_irq_type)
+		r = assign_host_irq(kvm, match, host_irq_type);
+	if (r)
+		goto out;
 
+	if (guest_irq_type)
+		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
 	mutex_unlock(&kvm->lock);
 	return r;
-out_release:
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+					 struct kvm_assigned_irq
+					 *assigned_irq)
+{
+	int r = -ENODEV;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match)
+		goto out;
+
+	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
 	mutex_unlock(&kvm->lock);
-	kvm_free_assigned_device(kvm, match);
 	return r;
 }
 
@@ -565,7 +578,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 				      assigned_dev->assigned_dev_id);
 	if (match) {
 		/* device already assigned */
-		r = -EINVAL;
+		r = -EEXIST;
 		goto out;
 	}
 
@@ -604,6 +617,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->dev = dev;
 	match->irq_source_id = -1;
 	match->kvm = kvm;
+	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
@@ -2084,6 +2100,11 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_ASSIGN_IRQ: {
+		r = -EOPNOTSUPP;
+		break;
+	}
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+	case KVM_ASSIGN_DEV_IRQ: {
 		struct kvm_assigned_irq assigned_irq;
 
 		r = -EFAULT;
@@ -2094,6 +2115,18 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+	case KVM_DEASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
 #endif
 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
 	case KVM_DEASSIGN_PCI_DEVICE: {
@@ -2596,9 +2629,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
-#ifndef CONFIG_X86
-	msi2intx = 0;
-#endif
 
 	return 0;
 
-- 
cgit v0.10.2


From f00be0cae4e6ad0a8c7be381c6d9be3586800b3e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 19 Mar 2009 12:20:36 +0200
Subject: KVM: MMU: do not free active mmu pages in free_mmu_pages()

free_mmu_pages() should only undo what alloc_mmu_pages() does.
Free mmu pages from the generic VM destruction function, kvm_destroy_vm().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e..8aac67c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2638,14 +2638,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu_page *sp;
-
-	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
-				  struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, sp);
-		cond_resched();
-	}
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 792fb7f..934dd1c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1032,6 +1032,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#else
+	kvm_arch_flush_shadow(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	mmdrop(mm);
-- 
cgit v0.10.2


From 7fe29e0faacb650d31b9e9f538203a157bec821d Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Fri, 20 Mar 2009 12:39:00 +0530
Subject: KVM: x86: Ignore reads to EVNTSEL MSRs

We ignore writes to the performance counters and performance event
selector registers already. Kaspersky antivirus reads the eventsel
MSR causing it to crash with the current behaviour.

Return 0 as data when the eventsel registers are read to stop the
crash.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41123fc..c0ae5e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -895,6 +895,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_LASTINTFROMIP:
 	case MSR_IA32_LASTINTTOIP:
 	case MSR_VM_HSAVE_PA:
+	case MSR_P6_EVNTSEL0:
+	case MSR_P6_EVNTSEL1:
 		data = 0;
 		break;
 	case MSR_MTRRcap:
-- 
cgit v0.10.2


From 61c50edfcd40be9126579f9cec68c789b6089998 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 22 Mar 2009 12:37:05 +0200
Subject: KVM: SVM: Remove duplicate code in svm_do_inject_vector()

svm_do_inject_vector() reimplements pop_irq().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c..5b35ebd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2346,15 +2346,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
-	struct kvm_vcpu *vcpu = &svm->vcpu;
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	svm_inject_irq(svm, irq);
+	svm_inject_irq(svm, pop_irq(&svm->vcpu));
 }
 
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-- 
cgit v0.10.2


From fe4c7b1914ac46af751d256f5a20c2e12dcbaaae Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 11:23:18 +0200
Subject: KVM: reuse (pop|push)_irq from svm.c in vmx.c

The prioritized bit vector manipulation functions are useful in both vmx and
svm.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5b35ebd..aa528db 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -132,24 +133,6 @@ static inline u32 svm_has(u32 feat)
 	return svm_features & feat;
 }
 
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-	set_bit(irq, vcpu->arch.irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
 static inline void clgi(void)
 {
 	asm volatile (__ex(SVM_CLGI));
@@ -1116,7 +1099,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(kvm) &&
 	    is_external_interrupt(exit_int_info)) {
 		event_injection = true;
-		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+		kvm_push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 	}
 
 	fault_address  = svm->vmcb->control.exit_info_2;
@@ -2336,7 +2319,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 	if ((control->int_ctl & V_IRQ_MASK)
 	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
 		control->int_ctl &= ~V_IRQ_MASK;
-		push_irq(&svm->vcpu, control->int_vector);
+		kvm_push_irq(&svm->vcpu, control->int_vector);
 	}
 
 	svm->vcpu.arch.interrupt_window_open =
@@ -2346,7 +2329,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
-	svm_inject_irq(svm, pop_irq(&svm->vcpu));
+	svm_inject_irq(svm, kvm_pop_irq(&svm->vcpu));
 }
 
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b5eae7a..2c0a2ed 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2489,18 +2489,6 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 				 GUEST_INTR_STATE_MOV_SS)));
 }
 
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	kvm_queue_interrupt(vcpu, irq);
-}
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
@@ -2534,7 +2522,7 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-			kvm_do_inject_irq(vcpu);
+			kvm_queue_interrupt(vcpu, kvm_pop_irq(vcpu));
 
 		if (vcpu->arch.interrupt.pending)
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
@@ -2619,8 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		set_bit(irq, vcpu->arch.irq_pending);
-		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+		kvm_push_irq(vcpu, irq);
 	}
 
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78..2ab6791 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,4 +19,22 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.interrupt.pending = false;
 }
 
+static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
+{
+	int word_index = __ffs(vcpu->arch.irq_summary);
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+	int irq = word_index * BITS_PER_LONG + bit_index;
+
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);
+	return irq;
+}
+
+static inline void kvm_push_irq(struct kvm_vcpu *vcpu, u8 irq)
+{
+        set_bit(irq, vcpu->arch.irq_pending);
+        set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+}
+
 #endif
-- 
cgit v0.10.2


From c1f8bc04c6f8576553dc87abe7562e868433a19f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 15:41:17 +0200
Subject: KVM: VMX: Make module parameters readable

Useful to see how the module was loaded.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2c0a2ed..469787ce 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,19 +39,19 @@ MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
 static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
+module_param(bypass_guest_pf, bool, S_IRUGO);
 
 static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0);
+module_param(enable_vpid, bool, 0444);
 
 static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, 0);
+module_param(flexpriority_enabled, bool, S_IRUGO);
 
 static int enable_ept = 1;
-module_param(enable_ept, bool, 0);
+module_param(enable_ept, bool, S_IRUGO);
 
 static int emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, 0);
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 struct vmcs {
 	u32 revision_id;
-- 
cgit v0.10.2


From 6062d012ed23c29672bb0f93ebcfb8e556def726 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:35:17 +0200
Subject: KVM: VMX: Rename kvm_handle_exit() to vmx_handle_exit()

It is a static vmx-specific function.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 469787ce..85f4fd5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3162,7 +3162,7 @@ static const int kvm_vmx_max_exit_handlers =
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3681,7 +3681,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.tlb_flush = vmx_flush_tlb,
 
 	.run = vmx_vcpu_run,
-	.handle_exit = kvm_handle_exit,
+	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = vmx_patch_hypercall,
 	.get_irq = vmx_get_irq,
-- 
cgit v0.10.2


From 736caefe1511d9d1116ed4ffb0ea95b7368beb1f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:39:48 +0200
Subject: KVM: VMX: Simplify module parameter names

Instead of 'enable_vpid=1', use a simple 'vpid=1'.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 85f4fd5..a69ba6b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,13 +42,13 @@ static int bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, S_IRUGO);
 
 static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0444);
+module_param_named(vpid, enable_vpid, bool, 0444);
 
 static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
 static int enable_ept = 1;
-module_param(enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, S_IRUGO);
 
 static int emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-- 
cgit v0.10.2


From 4462d21a615dfd0c0f672c10832a011d6f280d5a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:53:37 +0200
Subject: KVM: VMX: Annotate module parameters as __read_mostly

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a69ba6b..f4b6c4b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -38,19 +38,19 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int bypass_guest_pf = 1;
+static int __read_mostly bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, S_IRUGO);
 
-static int enable_vpid = 1;
+static int __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int flexpriority_enabled = 1;
+static int __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int enable_ept = 1;
+static int __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int emulate_invalid_guest_state = 0;
+static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 struct vmcs {
-- 
cgit v0.10.2


From 919818abc2ca0721f1cd296fbc24601d9044f993 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:01:29 +0200
Subject: KVM: VMX: Zero the vpid module parameter if vpid is not supported

This allows reading back how the hardware is configured.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f4b6c4b..9b97c8e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1202,6 +1202,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		      vmx_capability.ept, vmx_capability.vpid);
 	}
 
+	if (!cpu_has_vmx_vpid())
+		enable_vpid = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
@@ -2082,7 +2085,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	int vpid;
 
 	vmx->vpid = 0;
-	if (!enable_vpid || !cpu_has_vmx_vpid())
+	if (!enable_vpid)
 		return;
 	spin_lock(&vmx_vpid_lock);
 	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-- 
cgit v0.10.2


From 575ff2dcb25608d53737d1126ee0e7e4d6f11752 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:25:15 +0200
Subject: KVM: VMX: Zero ept module parameter if ept is not present

Allows reading back hardware capability.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b97c8e..2f65120 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -265,7 +265,7 @@ static inline int cpu_has_vmx_ept(void)
 
 static inline int vm_need_ept(void)
 {
-	return (cpu_has_vmx_ept() && enable_ept);
+	return enable_ept;
 }
 
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -1205,6 +1205,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 
+	if (!cpu_has_vmx_ept())
+		enable_ept = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-- 
cgit v0.10.2


From 089d034e0c4538d2436512fa64782b91008d4a7c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:26:32 +0200
Subject: KVM: VMX: Fold vm_need_ept() into callers

Trivial.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2f65120..da6461d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -263,11 +263,6 @@ static inline int cpu_has_vmx_ept(void)
 		SECONDARY_EXEC_ENABLE_EPT);
 }
 
-static inline int vm_need_ept(void)
-{
-	return enable_ept;
-}
-
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
@@ -382,7 +377,7 @@ static inline void ept_sync_global(void)
 
 static inline void ept_sync_context(u64 eptp)
 {
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		if (cpu_has_vmx_invept_context())
 			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 		else
@@ -392,7 +387,7 @@ static inline void ept_sync_context(u64 eptp)
 
 static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 {
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		if (cpu_has_vmx_invept_individual_addr())
 			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
 					eptp, gpa);
@@ -491,7 +486,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	}
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
-	if (vm_need_ept())
+	if (enable_ept)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -1502,7 +1497,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	vpid_sync_vcpu_all(to_vmx(vcpu));
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
 }
 
@@ -1587,7 +1582,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 #endif
 
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
 	vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1616,7 +1611,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	u64 eptp;
 
 	guest_cr3 = cr3;
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		eptp = construct_eptp(cr3);
 		vmcs_write64(EPT_POINTER, eptp);
 		ept_sync_context(eptp);
@@ -1637,7 +1632,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	vcpu->arch.cr4 = cr4;
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_update_paging_mode_cr4(&hw_cr4, vcpu);
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1999,7 +1994,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	pfn_t identity_map_pfn;
 	u32 tmp;
 
-	if (!vm_need_ept())
+	if (!enable_ept)
 		return 1;
 	if (unlikely(!kvm->arch.ept_identity_pagetable)) {
 		printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2163,7 +2158,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				CPU_BASED_CR8_LOAD_EXITING;
 #endif
 	}
-	if (!vm_need_ept())
+	if (!enable_ept)
 		exec_control |= CPU_BASED_CR3_STORE_EXITING |
 				CPU_BASED_CR3_LOAD_EXITING  |
 				CPU_BASED_INVLPG_EXITING;
@@ -2176,7 +2171,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 		if (vmx->vpid == 0)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-		if (!vm_need_ept())
+		if (!enable_ept)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
@@ -2637,7 +2632,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
-		if (vm_need_ept())
+		if (enable_ept)
 			BUG();
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
@@ -3187,7 +3182,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
-	if (vm_need_ept() && is_paging(vcpu)) {
+	if (enable_ept && is_paging(vcpu)) {
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 		ept_load_pdptrs(vcpu);
 	}
@@ -3602,7 +3597,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		if (alloc_apic_access_page(kvm) != 0)
 			goto free_vmcs;
 
-	if (vm_need_ept())
+	if (enable_ept)
 		if (alloc_identity_pagetable(kvm) != 0)
 			goto free_vmcs;
 
@@ -3753,7 +3748,7 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK);
-- 
cgit v0.10.2


From 09cec754885f900f6aab23801878c0cd217ee1d6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 15:11:44 +0200
Subject: KVM: Timer event should not unconditionally unhalt vcpu.

Currently timer events are processed before entering guest mode. Move it
to main vcpu event loop since timer events should be processed even while
vcpu is halted.  Timer may cause interrupt/nmi to be injected and only then
vcpu will be unhalted.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 4623a90..d2a90fd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -488,10 +488,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 		hrtimer_cancel(p_ht);
 		vcpu->arch.ht_active = 0;
 
-		if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+		if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests) ||
+				kvm_cpu_has_pending_timer(vcpu))
 			if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
-				vcpu->arch.mp_state =
-					KVM_MP_STATE_RUNNABLE;
+				vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
 			return -EINTR;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0ae5e6..8fca7a4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3133,9 +3133,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-	kvm_inject_pending_timer_irqs(vcpu);
-
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3235,6 +3232,7 @@ out:
 	return r;
 }
 
+
 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
@@ -3261,29 +3259,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			kvm_vcpu_block(vcpu);
 			down_read(&vcpu->kvm->slots_lock);
 			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
-				if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+			{
+				switch(vcpu->arch.mp_state) {
+				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
-							KVM_MP_STATE_RUNNABLE;
-			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-				r = -EINTR;
+						KVM_MP_STATE_RUNNABLE;
+				case KVM_MP_STATE_RUNNABLE:
+					break;
+				case KVM_MP_STATE_SIPI_RECEIVED:
+				default:
+					r = -EINTR;
+					break;
+				}
+			}
 		}
 
-		if (r > 0) {
-			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_INTR;
-				++vcpu->stat.request_irq_exits;
-			}
-			if (signal_pending(current)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_INTR;
-				++vcpu->stat.signal_exits;
-			}
-			if (need_resched()) {
-				up_read(&vcpu->kvm->slots_lock);
-				kvm_resched(vcpu);
-				down_read(&vcpu->kvm->slots_lock);
-			}
+		if (r <= 0)
+			break;
+
+		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		if (kvm_cpu_has_pending_timer(vcpu))
+			kvm_inject_pending_timer_irqs(vcpu);
+
+		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+			r = -EINTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.request_irq_exits;
+		}
+		if (signal_pending(current)) {
+			r = -EINTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.signal_exits;
+		}
+		if (need_resched()) {
+			up_read(&vcpu->kvm->slots_lock);
+			kvm_resched(vcpu);
+			down_read(&vcpu->kvm->slots_lock);
 		}
 	}
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 934dd1c..a1a4272 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1611,11 +1611,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_cpu_has_interrupt(vcpu) ||
-		    kvm_cpu_has_pending_timer(vcpu) ||
-		    kvm_arch_vcpu_runnable(vcpu)) {
+				kvm_arch_vcpu_runnable(vcpu)) {
 			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
 		}
+		if (kvm_cpu_has_pending_timer(vcpu))
+			break;
 		if (signal_pending(current))
 			break;
 
-- 
cgit v0.10.2


From 78646121e9a2fcf7977cc15966420e572a450bc3 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 12:12:11 +0200
Subject: KVM: Fix interrupt unhalting a vcpu when it shouldn't

kvm_vcpu_block() unhalts vpu on an interrupt/timer without checking
if interrupt window is actually opened.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d2a90fd..3bf0a34 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1963,6 +1963,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.timer_fired;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9057335..2cf915e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -41,6 +41,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	return !!(v->arch.pending_exceptions);
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE);
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 0189356..4ed4c3a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -318,6 +318,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return rc;
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4627627..8351c4d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -521,7 +521,7 @@ struct kvm_x86_ops {
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
-
+	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	int (*get_mt_mask_shift)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index aa528db..de74104 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2270,6 +2270,15 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 		vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
 }
 
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	return (vmcb->save.rflags & X86_EFLAGS_IF) &&
+		!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
 static void svm_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2649,6 +2658,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
 	.inject_pending_vectors = do_interrupt_requests,
+	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da6461d..b9e06b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2490,6 +2490,12 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 				 GUEST_INTR_STATE_MOV_SS)));
 }
 
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	vmx_update_window_states(vcpu);
+	return vcpu->arch.interrupt_window_open;
+}
+
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
@@ -3691,7 +3697,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
 	.inject_pending_vectors = do_interrupt_requests,
-
+	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask_shift = vmx_get_mt_mask_shift,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8fca7a4..5bbcad3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4475,3 +4475,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
 	put_cpu();
 }
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->interrupt_allowed(vcpu);
+}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40e49ed..72d5684 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -298,6 +298,7 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a1a4272..63d5fa2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1610,7 +1610,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-		if (kvm_cpu_has_interrupt(vcpu) ||
+		if ((kvm_arch_interrupt_allowed(vcpu) &&
+					kvm_cpu_has_interrupt(vcpu)) ||
 				kvm_arch_vcpu_runnable(vcpu)) {
 			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
-- 
cgit v0.10.2


From 7d433b9f942606f66da8ef68b8baecd2915c5627 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Mar 2009 14:27:47 +0200
Subject: KVM: VMX: Make flexpriority module parameter reflect hardware
 capability

If the hardware does not support flexpriority, zero the module parameter.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b9e06b0..37ae13d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -237,9 +237,7 @@ static inline int cpu_has_secondary_exec_ctrls(void)
 
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-	return flexpriority_enabled
-		&& (vmcs_config.cpu_based_2nd_exec_ctrl &
-		    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	return flexpriority_enabled;
 }
 
 static inline int cpu_has_vmx_invept_individual_addr(void)
@@ -1203,6 +1201,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_ept())
 		enable_ept = 0;
 
+	if (!(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		flexpriority_enabled = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-- 
cgit v0.10.2


From f9c617f61127615dd054f3f159213bdd12451cab Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Mar 2009 10:08:52 +0800
Subject: KVM: VMX: Correct wrong vmcs field sizes

EXIT_QUALIFICATION and GUEST_LINEAR_ADDRESS are natural width, not 64-bit.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 37ae13d..aba41ae 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2991,7 +2991,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	kvm_mmu_invlpg(vcpu, exit_qualification);
 	skip_emulated_instruction(vcpu);
@@ -3007,11 +3007,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification;
+	unsigned long exit_qualification;
 	enum emulation_result er;
 	unsigned long offset;
 
-	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	offset = exit_qualification & 0xffful;
 
 	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3062,11 +3062,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification;
+	unsigned long exit_qualification;
 	gpa_t gpa;
 	int gla_validity;
 
-	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	if (exit_qualification & (1 << 6)) {
 		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3078,7 +3078,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
 		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
 			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
-			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+			vmcs_readl(GUEST_LINEAR_ADDRESS));
 		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
 			(long unsigned int)exit_qualification);
 		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-- 
cgit v0.10.2


From a8b876b1a469cb364fee16ba3aef01613a1231cc Mon Sep 17 00:00:00 2001
From: Eddie Dong <eddie.dong@intel.com>
Date: Thu, 26 Mar 2009 15:28:40 +0800
Subject: KVM: MMU: Fix comment in page_fault()

The original one is for the code before refactoring.

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 855eb71..eae9499 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -379,7 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return r;
 
 	/*
-	 * Look up the shadow pte for the faulting address.
+	 * Look up the guest pte for the faulting address.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
-- 
cgit v0.10.2


From 362c1055e58ecd25a9393c520ab263c80b147497 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.zhang@intel.com>
Date: Mon, 23 Mar 2009 03:31:04 -0400
Subject: KVM: ia64: enable external interrupt in vmm

Currently, the interrupt enable bit is cleared when in
the vmm.  This patch sets the bit and the external interrupts can
be dealt with when in the vmm.  This improves the I/O performance.

Signed-off-by: Yang Zhang <yang.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index b1dc809..a8f84da 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -652,20 +652,25 @@ void  kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
 		unsigned long isr, unsigned long iim)
 {
 	struct kvm_vcpu *v = current_vcpu;
+	long psr;
 
 	if (ia64_psr(regs)->cpl == 0) {
 		/* Allow hypercalls only when cpl = 0.  */
 		if (iim == DOMN_PAL_REQUEST) {
+			local_irq_save(psr);
 			set_pal_call_data(v);
 			vmm_transition(v);
 			get_pal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		} else if (iim == DOMN_SAL_REQUEST) {
+			local_irq_save(psr);
 			set_sal_call_data(v);
 			vmm_transition(v);
 			get_sal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		}
 	}
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 3ef1a01..40920c6 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -95,7 +95,7 @@ GLOBAL_ENTRY(kvm_vmm_panic)
 	;;
 	srlz.i    // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -249,7 +249,7 @@ ENTRY(kvm_break_fault)
 	;;
 	srlz.i         // guarantee that interruption collection is on
 	;;
-	//(p15)ssm psr.i               // restore psr.i
+	(p15)ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -439,7 +439,7 @@ kvm_dispatch_vexirq:
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	adds r3=8,r2                // set up second base pointer
 	;;
 	KVM_SAVE_REST
@@ -819,7 +819,7 @@ ENTRY(kvm_dtlb_miss_dispatch)
 	;;
 	srlz.i     // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -842,7 +842,7 @@ ENTRY(kvm_itlb_miss_dispatch)
 	;;
 	srlz.i   // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -871,7 +871,7 @@ ENTRY(kvm_dispatch_reflection)
 	;;
 	srlz.i   // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -898,7 +898,7 @@ ENTRY(kvm_dispatch_virtualization_fault)
 	;;
 	srlz.i    // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -920,7 +920,7 @@ ENTRY(kvm_dispatch_interrupt)
 	;;
 	srlz.i
 	;;
-	//(p15) ssm psr.i
+	(p15) ssm psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -1333,7 +1333,7 @@ hostret =   r24
 	;;
 (p7)    srlz.i
 	;;
-//(p6)    ssm psr.i
+(p6)    ssm psr.i
 	;;
 	mov rp=rpsave
 	mov ar.pfs=pfssave
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 2c2501f..4290a42 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -254,7 +254,8 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte)
 			"(p7) st8 [%2]=r9;;"
 			"ssm psr.ic;;"
 			"srlz.d;;"
-			/* "ssm psr.i;;" Once interrupts in vmm open, need fix*/
+			"ssm psr.i;;"
+			"srlz.d;;"
 			: "=r"(ret) : "r"(iha), "r"(pte):"memory");
 
 	return ret;
-- 
cgit v0.10.2


From 82725b20e22fb85377f61a16f6d0d5cfc28b45d3 Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Mon, 30 Mar 2009 16:21:08 +0800
Subject: KVM: MMU: Emulate #PF error code of reserved bits violation

Detect, indicate, and propagate page faults where reserved bits are set.
Take care to handle the different paging modes, each of which has different
sets of reserved bits.

[avi: fix pte reserved bits for efer.nxe=0]

Signed-off-by: Eddie Dong <eddie.dong@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8351c4d..548b97d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,7 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 
 	u64 *pae_root;
+	u64 rsvd_bits_mask[2][4];
 };
 
 struct kvm_vcpu_arch {
@@ -791,5 +792,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8aac67c..b2c8e28 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
 #define PFERR_PRESENT_MASK (1U << 0)
 #define PFERR_WRITE_MASK (1U << 1)
 #define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
 #define PT_DIRECTORY_LEVEL 2
@@ -179,6 +180,11 @@ static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mt_mask;
 
+static inline u64 rsvd_bits(int s, int e)
+{
+	return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
 	shadow_trap_nonpresent_pte = trap_pte;
@@ -2151,6 +2157,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2159,6 +2173,55 @@ static void paging_free(struct kvm_vcpu *vcpu)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+{
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	u64 exb_bit_rsvd = 0;
+
+	if (!is_nx(vcpu))
+		exb_bit_rsvd = rsvd_bits(63, 63);
+	switch (level) {
+	case PT32_ROOT_LEVEL:
+		/* no rsvd bits for 2 level 4K page table entries */
+		context->rsvd_bits_mask[0][1] = 0;
+		context->rsvd_bits_mask[0][0] = 0;
+		if (is_cpuid_PSE36())
+			/* 36bits PSE 4MB page */
+			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+		else
+			/* 32 bits PSE 4MB page */
+			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	case PT32E_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62);		/* PDE */
+		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62); 	/* PTE */
+		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62) |
+			rsvd_bits(13, 20);		/* large page */
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	case PT64_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51);
+		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+		context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	}
+}
+
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2242,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 
 static int paging64_init_context(struct kvm_vcpu *vcpu)
 {
+	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
 	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
 }
 
@@ -2186,6 +2250,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
+	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2266,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
+	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
 	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
@@ -2221,12 +2287,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
+		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 	}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index eae9499..09782a9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
+	int rsvd_fault = 0;
 
 	pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
@@ -157,6 +158,10 @@ walk:
 		if (!is_present_pte(pte))
 			goto not_present;
 
+		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+		if (rsvd_fault)
+			goto access_error;
+
 		if (write_fault && !is_writeble_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 				goto access_error;
@@ -232,6 +237,8 @@ err:
 		walker->error_code |= PFERR_USER_MASK;
 	if (fetch_fault)
 		walker->error_code |= PFERR_FETCH_MASK;
+	if (rsvd_fault)
+		walker->error_code |= PFERR_RSVD_MASK;
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bbcad3..df86668 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3017,6 +3017,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 	return best;
 }
 
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best)
+		return best->eax & 0xff;
+	return 36;
+}
+
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
 	u32 function, index;
-- 
cgit v0.10.2


From 9645bb56b31a1b70ab9e470387b5264cafc04aa9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 31 Mar 2009 11:31:54 +0300
Subject: KVM: MMU: Use different shadows when EFER.NXE changes

A pte that is shadowed when the guest EFER.NXE=1 is not valid when
EFER.NXE=0; if bit 63 is set, the pte should cause a fault, and since the
shadow EFER always has NX enabled, this won't happen.

Fix by using a different shadow page table for different EFER.NXE bits.  This
allows vcpus to run correctly with different values of EFER.NXE, and for
transitions on this bit to be handled correctly without requiring a full
flush.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 548b97d..3fc4623 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned cr4_pge:1;
+		unsigned nxe:1;
 	};
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index df86668..007fadd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -523,6 +523,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	efer |= vcpu->arch.shadow_efer & EFER_LMA;
 
 	vcpu->arch.shadow_efer = efer;
+
+	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
 }
 
 void kvm_enable_efer_bits(u64 mask)
-- 
cgit v0.10.2


From 7a6ce84c74a3064f6b6dda9e2d55c509cd4e78e4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 16:47:44 +0800
Subject: KVM: remove pointless conditional before kfree() in lapic
 initialization

Remove pointless conditional before kfree().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 007fadd..7fe83fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1592,8 +1592,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
-	if (lapic)
-		kfree(lapic);
+	kfree(lapic);
 	return r;
 }
 
-- 
cgit v0.10.2


From 045471563df4b8723202a66ae54d58788b0f8e88 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 1 Apr 2009 15:52:31 +0800
Subject: KVM: VMX: Clean up Flex Priority related

And clean paranthes on returns.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aba41ae..1caa1fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,61 +216,69 @@ static inline int is_external_interrupt(u32 intr_info)
 
 static inline int cpu_has_vmx_msr_bitmap(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 }
 
 static inline int cpu_has_vmx_tpr_shadow(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
 static inline int vm_need_tpr_shadow(struct kvm *kvm)
 {
-	return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 }
 
 static inline int cpu_has_secondary_exec_ctrls(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl &
-		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+	return vmcs_config.cpu_based_exec_ctrl &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 }
 
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-	return flexpriority_enabled;
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+	return cpu_has_vmx_tpr_shadow() &&
+		cpu_has_vmx_virtualize_apic_accesses();
 }
 
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
 }
 
 static inline int cpu_has_vmx_invept_context(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
 }
 
 static inline int cpu_has_vmx_invept_global(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
 }
 
 static inline int cpu_has_vmx_ept(void)
 {
-	return (vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_ENABLE_EPT);
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_EPT;
 }
 
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
-	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
-		(irqchip_in_kernel(kvm)));
+	return flexpriority_enabled &&
+		(cpu_has_vmx_virtualize_apic_accesses()) &&
+		(irqchip_in_kernel(kvm));
 }
 
 static inline int cpu_has_vmx_vpid(void)
 {
-	return (vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_ENABLE_VPID);
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_VPID;
 }
 
 static inline int cpu_has_virtual_nmis(void)
@@ -278,6 +286,11 @@ static inline int cpu_has_virtual_nmis(void)
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool report_flexpriority(void)
+{
+	return flexpriority_enabled;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1201,7 +1214,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_ept())
 		enable_ept = 0;
 
-	if (!(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
 	min = 0;
@@ -3655,7 +3668,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.check_processor_compatibility = vmx_check_processor_compat,
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
-	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+	.cpu_has_accelerated_tpr = report_flexpriority,
 
 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
-- 
cgit v0.10.2


From 93ba03c2e2aba23b042cc15eef83b7a66d3ac17a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 1 Apr 2009 15:52:32 +0800
Subject: KVM: VMX: Fix feature testing

The testing of feature is too early now, before vmcs_config complete initialization.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1caa1fc..7d7b0d6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1208,15 +1208,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		      vmx_capability.ept, vmx_capability.vpid);
 	}
 
-	if (!cpu_has_vmx_vpid())
-		enable_vpid = 0;
-
-	if (!cpu_has_vmx_ept())
-		enable_ept = 0;
-
-	if (!cpu_has_vmx_flexpriority())
-		flexpriority_enabled = 0;
-
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
@@ -1320,6 +1311,15 @@ static __init int hardware_setup(void)
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
+	if (!cpu_has_vmx_vpid())
+		enable_vpid = 0;
+
+	if (!cpu_has_vmx_ept())
+		enable_ept = 0;
+
+	if (!cpu_has_vmx_flexpriority())
+		flexpriority_enabled = 0;
+
 	return alloc_kvm_area();
 }
 
-- 
cgit v0.10.2


From 20c466b56168ddccf034c136510d73e4a0e18605 Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Tue, 31 Mar 2009 23:03:45 +0800
Subject: KVM: Use rsvd_bits_mask in load_pdptrs()

Also remove bit 5-6 from rsvd_bits_mask per latest SDM.

Signed-off-by: Eddie Dong <Eddie.Dong@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b2c8e28..da3ad3c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -225,11 +225,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
 	return vcpu->arch.shadow_efer & EFER_NX;
 }
 
-static int is_present_pte(unsigned long pte)
-{
-	return pte & PT_PRESENT_MASK;
-}
-
 static int is_shadow_present_pte(u64 pte)
 {
 	return pte != shadow_trap_nonpresent_pte
@@ -2195,6 +2190,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][0] = ~0ull;
 		break;
 	case PT32E_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][2] =
+			rsvd_bits(maxphyaddr, 63) |
+			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62);		/* PDE */
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab214..3494a2f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
+static inline int is_present_pte(unsigned long pte)
+{
+	return pte & PT_PRESENT_MASK;
+}
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fe83fe..70ee81e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -234,7 +234,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 		goto out;
 	}
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+		if (is_present_pte(pdpte[i]) &&
+		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 			ret = 0;
 			goto out;
 		}
-- 
cgit v0.10.2


From 7b4a25cb296e2a73d2e87a4af65361d45d450a27 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:08 +0300
Subject: KVM: VMX: Fix handling of a fault during NMI unblocked due to IRET

Bit 12 is undefined in any of the following cases:
 If the VM exit sets the valid bit in the IDT-vectoring information field.
 If the VM exit is due to a double fault.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7d7b0d6..631f9b7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3272,36 +3272,41 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
-	u32 idt_vectoring_info;
+	u32 idt_vectoring_info = vmx->idt_vectoring_info;
 	bool unblock_nmi;
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
 	u32 error;
 
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 		/*
-		 * SDM 3: 25.7.1.2
+		 * SDM 3: 27.7.1.2 (September 2008)
 		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
 		 * a guest IRET fault.
+		 * SDM 3: 23.2.2 (September 2008)
+		 * Bit 12 is undefined in any of the following cases:
+		 *  If the VM exit sets the valid bit in the IDT-vectoring
+		 *   information field.
+		 *  If the VM exit is due to a double fault.
 		 */
-		if (unblock_nmi && vector != DF_VECTOR)
+		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+		    vector != DF_VECTOR && !idtv_info_valid)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
 	} else if (unlikely(vmx->soft_vnmi_blocked))
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
-	idt_vectoring_info = vmx->idt_vectoring_info;
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 	if (vmx->vcpu.arch.nmi_injected) {
 		/*
-		 * SDM 3: 25.7.1.2
+		 * SDM 3: 27.7.1.2 (September 2008)
 		 * Clear bit "block by NMI" before VM entry if a NMI delivery
 		 * faulted.
 		 */
-- 
cgit v0.10.2


From 37b96e988053c4dd21811b0408a12f8f60b4d0c8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:13 +0300
Subject: KVM: VMX: Rewrite vmx_complete_interrupt()'s twisted maze of if()
 statements

...with a more straightforward switch().

Also fix a bug when NMI could be dropped on exit. Although this should
never happen in practice, since NMIs can only be injected, never triggered
internally by the guest like exceptions.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 631f9b7..577aa95 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3277,7 +3277,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
-	u32 error;
 
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -3302,34 +3301,42 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
+	vmx->vcpu.arch.nmi_injected = false;
+	kvm_clear_exception_queue(&vmx->vcpu);
+	kvm_clear_interrupt_queue(&vmx->vcpu);
+
+	if (!idtv_info_valid)
+		return;
+
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
-	if (vmx->vcpu.arch.nmi_injected) {
+
+	switch(type) {
+	case INTR_TYPE_NMI_INTR:
+		vmx->vcpu.arch.nmi_injected = true;
 		/*
 		 * SDM 3: 27.7.1.2 (September 2008)
-		 * Clear bit "block by NMI" before VM entry if a NMI delivery
-		 * faulted.
+		 * Clear bit "block by NMI" before VM entry if a NMI
+		 * delivery faulted.
 		 */
-		if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
-			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-					GUEST_INTR_STATE_NMI);
-		else
-			vmx->vcpu.arch.nmi_injected = false;
-	}
-	kvm_clear_exception_queue(&vmx->vcpu);
-	if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
-				type == INTR_TYPE_SOFT_EXCEPTION)) {
+		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_NMI);
+		break;
+	case INTR_TYPE_HARD_EXCEPTION:
+	case INTR_TYPE_SOFT_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
-			kvm_queue_exception_e(&vmx->vcpu, vector, error);
+			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
-	}
-	kvm_clear_interrupt_queue(&vmx->vcpu);
-	if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+		break;
+	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
+		break;
+	default:
+		break;
 	}
 }
 
-- 
cgit v0.10.2


From 8843419048e500f8f38df555bca1bf7948804b7f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:18 +0300
Subject: KVM: VMX: Do not zero idt_vectoring_info in
 vmx_complete_interrupts().

We will need it later in task_switch().
Code in handle_exception() is dead. is_external_interrupt(vect_info)
will always be false since idt_vectoring_info is zeroed in
vmx_complete_interrupts().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 577aa95..e4ad9d3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2626,11 +2626,6 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
 		       "intr info 0x%x\n", __func__, vect_info, intr_info);
 
-	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
-		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		kvm_push_irq(vcpu, irq);
-	}
-
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -3329,11 +3324,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
-		vmx->idt_vectoring_info = 0;
 		break;
 	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector);
-		vmx->idt_vectoring_info = 0;
 		break;
 	default:
 		break;
-- 
cgit v0.10.2


From b237ac37a149e8b56436fabf093532483bff13b0 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:24 +0300
Subject: KVM: Fix task switch back link handling.

Back link is written to a wrong TSS now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70ee81e..adcf738 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3717,7 +3717,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
 	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
 	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
 }
 
 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3814,8 +3813,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
 }
 
 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u32 old_tss_base,
-		       struct desc_struct *nseg_desc)
+			      u16 old_tss_sel, u32 old_tss_base,
+			      struct desc_struct *nseg_desc)
 {
 	struct tss_segment_16 tss_segment_16;
 	int ret = 0;
@@ -3834,6 +3833,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
 			   &tss_segment_16, sizeof tss_segment_16))
 		goto out;
 
+	if (old_tss_sel != 0xffff) {
+		tss_segment_16.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm,
+				    get_tss_base_addr(vcpu, nseg_desc),
+				    &tss_segment_16.prev_task_link,
+				    sizeof tss_segment_16.prev_task_link))
+			goto out;
+	}
+
 	if (load_state_from_tss16(vcpu, &tss_segment_16))
 		goto out;
 
@@ -3843,7 +3852,7 @@ out:
 }
 
 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u32 old_tss_base,
+		       u16 old_tss_sel, u32 old_tss_base,
 		       struct desc_struct *nseg_desc)
 {
 	struct tss_segment_32 tss_segment_32;
@@ -3863,6 +3872,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
 			   &tss_segment_32, sizeof tss_segment_32))
 		goto out;
 
+	if (old_tss_sel != 0xffff) {
+		tss_segment_32.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm,
+				    get_tss_base_addr(vcpu, nseg_desc),
+				    &tss_segment_32.prev_task_link,
+				    sizeof tss_segment_32.prev_task_link))
+			goto out;
+	}
+
 	if (load_state_from_tss32(vcpu, &tss_segment_32))
 		goto out;
 
@@ -3918,12 +3937,17 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
 	if (nseg_desc.type & 8)
-		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
-					 &nseg_desc);
+		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+					 old_tss_base, &nseg_desc);
 	else
-		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
-					 &nseg_desc);
+		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+					 old_tss_base, &nseg_desc);
 
 	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
 		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
-- 
cgit v0.10.2


From 64a7ec066813443440bfc9f60a9e76a47cfa6b2b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:29 +0300
Subject: KVM: Fix unneeded instruction skipping during task switching.

There is no need to skip instruction if the reason for a task switch
is a task gate in IDT and access to it is caused by an external even.
The problem  is currently solved only for VMX since there is no reliable
way to skip an instruction in SVM. We should emulate it instead.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75..85574b7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EVTINJ_VALID_ERR (1 << 11)
 
 #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
 
 #define	SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
 #define	SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de74104..bba67b7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1825,17 +1825,28 @@ static int task_switch_interception(struct vcpu_svm *svm,
 				    struct kvm_run *kvm_run)
 {
 	u16 tss_selector;
+	int reason;
+	int int_type = svm->vmcb->control.exit_int_info &
+		SVM_EXITINTINFO_TYPE_MASK;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
 	if (svm->vmcb->control.exit_info_2 &
 	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
-		return kvm_task_switch(&svm->vcpu, tss_selector,
-				       TASK_SWITCH_IRET);
-	if (svm->vmcb->control.exit_info_2 &
-	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
-		return kvm_task_switch(&svm->vcpu, tss_selector,
-				       TASK_SWITCH_JMP);
-	return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
+		reason = TASK_SWITCH_IRET;
+	else if (svm->vmcb->control.exit_info_2 &
+		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+		reason = TASK_SWITCH_JMP;
+	else if (svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID)
+		reason = TASK_SWITCH_GATE;
+	else
+		reason = TASK_SWITCH_CALL;
+
+
+	if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT)
+		skip_emulated_instruction(&svm->vcpu);
+
+	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e4ad9d3..c6997c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3038,22 +3038,40 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
 	u16 tss_selector;
-	int reason;
+	int reason, type, idt_v;
+
+	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	reason = (u32)exit_qualification >> 30;
-	if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
-	    (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-	    (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
-	    == INTR_TYPE_NMI_INTR) {
-		vcpu->arch.nmi_injected = false;
-		if (cpu_has_virtual_nmis())
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
+	if (reason == TASK_SWITCH_GATE && idt_v) {
+		switch (type) {
+		case INTR_TYPE_NMI_INTR:
+			vcpu->arch.nmi_injected = false;
+			if (cpu_has_virtual_nmis())
+				vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+					      GUEST_INTR_STATE_NMI);
+			break;
+		case INTR_TYPE_EXT_INTR:
+			kvm_clear_interrupt_queue(vcpu);
+			break;
+		case INTR_TYPE_HARD_EXCEPTION:
+		case INTR_TYPE_SOFT_EXCEPTION:
+			kvm_clear_exception_queue(vcpu);
+			break;
+		default:
+			break;
+		}
 	}
 	tss_selector = exit_qualification;
 
+	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+		       type != INTR_TYPE_EXT_INTR &&
+		       type != INTR_TYPE_NMI_INTR))
+		skip_emulated_instruction(vcpu);
+
 	if (!kvm_task_switch(vcpu, tss_selector, reason))
 		return 0;
 
@@ -3306,7 +3324,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
-	switch(type) {
+	switch (type) {
 	case INTR_TYPE_NMI_INTR:
 		vmx->vcpu.arch.nmi_injected = true;
 		/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index adcf738..bb04f11 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3935,7 +3935,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
 	}
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
 
 	/* set back link to prev task only if NT bit is set in eflags
 	   note that old_tss_sel is not used afetr this point */
-- 
cgit v0.10.2


From 4c26b4cd6ff6c3f7534f2aea9615a561c372ed05 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 2 Apr 2009 10:28:37 +0800
Subject: KVM: MMU: Discard reserved bits checking on PDE bit 7-8

1. It's related to a Linux kernel bug which fixed by Ingo on
07a66d7c53a538e1a9759954a82bb6c07365eff9. The original code exists for quite a
long time, and it would convert a PDE for large page into a normal PDE. But it
fail to fit normal PDE well.  With the code before Ingo's fix, the kernel would
fall reserved bit checking with bit 8 - the remaining global bit of PTE. So the
kernel would receive a double-fault.

2. After discussion, we decide to discard PDE bit 7-8 reserved checking for now.
For this marked as reserved in SDM, but didn't checked by the processor in
fact...

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da3ad3c..b582add 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2194,7 +2194,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 			rsvd_bits(maxphyaddr, 63) |
 			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 62);		/* PDE */
+			rsvd_bits(maxphyaddr, 62);	/* PDE */
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62); 	/* PTE */
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
@@ -2208,13 +2208,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
 		context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
+			rsvd_bits(maxphyaddr, 51) |
+			rsvd_bits(13, 20);		/* large page */
 		context->rsvd_bits_mask[1][0] = ~0ull;
 		break;
 	}
-- 
cgit v0.10.2


From 2d033196541959d91802d5a62e63771448101557 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 2 Apr 2009 15:51:46 +0300
Subject: KVM: x86 emulator: fix call near emulation

The length of pushed on to the stack return address depends on operand
size not address size.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749..d7c9f6f 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1792,7 +1792,6 @@ special_insn:
 		}
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
-		c->op_bytes = c->ad_bytes;
 		emulate_push(ctxt);
 		break;
 	}
-- 
cgit v0.10.2


From 2906e79f21a559e49611e1e188c4993cd45d9ce1 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiantao" <xiantao.zhang@intel.com>
Date: Thu, 9 Apr 2009 21:37:28 +0800
Subject: KVM: ia64: make kvm depend on CONFIG_MODULES.

Since kvm-intel modue can't be built-in, make kvm depend on
CONFIG_MODULES.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 0a2d6b8..64d5209 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -23,7 +23,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM && EXPERIMENTAL
+	depends on HAVE_KVM && MODULES && EXPERIMENTAL
 	# for device assignment:
 	depends on PCI
 	select PREEMPT_NOTIFIERS
-- 
cgit v0.10.2


From ede2ccc51742059d356d419260460cbbf3e36273 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 8 Apr 2009 13:14:19 -0300
Subject: KVM: PIT: fix count read and mode 0 handling

Commit 46ee278652f4cbd51013471b64c7897ba9bcd1b1 causes Solaris 10
to hang on boot.

Assuming that PIT counter reads should return 0 for an expired timer
is wrong: when it is active, the counter never stops (see comment on
__kpit_elapsed).

Also arm a one shot timer for mode 0.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index cf09bb6..4d6f0d2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,13 +104,18 @@ static s64 __kpit_elapsed(struct kvm *kvm)
 	ktime_t remaining;
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 
+	/*
+	 * The Counter does not stop when it reaches zero. In
+	 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+	 * the highest count, either FFFF hex for binary counting
+	 * or 9999 for BCD counting, and continues counting.
+	 * Modes 2 and 3 are periodic; the Counter reloads
+	 * itself with the initial count and continues counting
+	 * from there.
+	 */
 	remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
-	if (ktime_to_ns(remaining) < 0)
-		remaining = ktime_set(0, 0);
-
-	elapsed = ps->pit_timer.period;
-	if (ktime_to_ns(remaining) <= ps->pit_timer.period)
-		elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+	elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+	elapsed = mod_64(elapsed, ps->pit_timer.period);
 
 	return elapsed;
 }
@@ -280,7 +285,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	pt->period = (is_period == 0) ? 0 : interval;
+	pt->period = interval;
 	ps->is_periodic = is_period;
 
 	pt->timer.function = kvm_timer_fn;
@@ -304,10 +309,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
 
 	/*
-	 * Though spec said the state of 8254 is undefined after power-up,
-	 * seems some tricky OS like Windows XP depends on IRQ0 interrupt
-	 * when booting up.
-	 * So here setting initialize rate for it, and not a specific number
+	 * The largest possible initial count is 0; this is equivalent
+	 * to 216 for binary counting and 104 for BCD counting.
 	 */
 	if (val == 0)
 		val = 0x10000;
@@ -322,6 +325,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	/* Two types of timer
 	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
 	switch (ps->channels[0].mode) {
+	case 0:
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-- 
cgit v0.10.2


From 2f8b9ee14eb439008e0c5131116ea6baa40dba50 Mon Sep 17 00:00:00 2001
From: nathan binkert <nate@binkert.org>
Date: Fri, 27 Mar 2009 21:53:05 -0700
Subject: KVM: Make kvm header C++ friendly

Two things needed fixing: 1) g++ does not allow a named structure type
within an anonymous union and 2) Avoid name clash between two padding
fields within the same struct by giving them different names as is
done elsewhere in the header.

Signed-off-by: Nathan Binkert <nate@binkert.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 644e3a9..3db5d8d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -119,7 +119,7 @@ struct kvm_run {
 			__u32 error_code;
 		} ex;
 		/* KVM_EXIT_IO */
-		struct kvm_io {
+		struct {
 #define KVM_EXIT_IO_IN  0
 #define KVM_EXIT_IO_OUT 1
 			__u8 direction;
@@ -224,10 +224,10 @@ struct kvm_interrupt {
 /* for KVM_GET_DIRTY_LOG */
 struct kvm_dirty_log {
 	__u32 slot;
-	__u32 padding;
+	__u32 padding1;
 	union {
 		void __user *dirty_bitmap; /* one bit per page */
-		__u64 padding;
+		__u64 padding2;
 	};
 };
 
-- 
cgit v0.10.2


From c2d0ee46e6e633a3c23ecbcb9b03ad731906cd79 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sun, 5 Apr 2009 14:54:47 -0300
Subject: KVM: MMU: remove global page optimization logic

Complexity to fix it not worthwhile the gains, as discussed
in http://article.gmane.org/gmane.comp.emulators.kvm.devel/28649.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fc4623..0e3a7c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -213,7 +213,6 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
-	bool global;
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -395,7 +394,6 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
-	struct list_head oos_global_pages;
 	struct iommu_domain *iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
@@ -425,7 +423,6 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
-	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -640,7 +637,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b582add..b39ec62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1075,18 +1075,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
-static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	list_del(&sp->oos_link);
-	--kvm->stat.mmu_unsync_global;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
 	sp->unsync = 0;
-	if (sp->global)
-		kvm_unlink_unsync_global(kvm, sp);
 	--kvm->stat.mmu_unsync;
 }
 
@@ -1249,7 +1241,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
 	sp->role = role;
-	sp->global = 0;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1647,11 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-	if (sp->global) {
-		list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
-		++vcpu->kvm->stat.mmu_unsync_global;
-	} else
-		kvm_mmu_mark_parents_unsync(vcpu, sp);
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
 
 	mmu_convert_notrap(sp);
 	return 0;
@@ -1678,21 +1665,12 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    int global, gfn_t gfn, pfn_t pfn, bool speculative,
+		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
 	u64 mt_mask = shadow_mt_mask;
-	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
-
-	if (!global && sp->global) {
-		sp->global = 0;
-		if (sp->unsync) {
-			kvm_unlink_unsync_global(vcpu->kvm, sp);
-			kvm_mmu_mark_parents_unsync(vcpu, sp);
-		}
-	}
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1766,8 +1744,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, int global,
-			 gfn_t gfn, pfn_t pfn, bool speculative)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1796,7 +1774,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			was_rmapped = 1;
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, global, gfn, pfn, speculative, true)) {
+		      dirty, largepage, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1844,7 +1822,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		    || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
-				     largepage, 0, gfn, pfn, false);
+				     largepage, gfn, pfn, false);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
@@ -2015,15 +1993,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void mmu_sync_global(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_page *sp, *n;
-
-	list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
-		kvm_sync_page(vcpu, sp);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -2031,13 +2000,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_global(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 09782a9..258e459 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -268,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage,
-		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
-		     pfn, true);
+		     gpte_to_gfn(gpte), pfn, true);
 }
 
 /*
@@ -303,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 				     user_fault, write_fault,
 				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 				     ptwrite, largepage,
-				     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
 				     gw->gfn, pfn, false);
 			break;
 		}
@@ -592,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
+			 is_dirty_pte(gpte), 0, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bb04f11..b5ac1b7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,7 +108,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -322,7 +321,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
-	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
@@ -371,7 +369,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
 	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
-	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -4364,7 +4361,6 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-- 
cgit v0.10.2


From a5f868bd45a64ce1f502d228723a5b6c357790cd Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:14 +0300
Subject: KVM: x86 emulator: Add decoding of 16bit second immediate argument

Such as segment number in lcall/ljmp

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d7c9f6f..c015063 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -76,6 +76,7 @@
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
+#define Src2Imm16   (4<<29)
 #define Src2Mask    (7<<29)
 
 enum {
@@ -1072,6 +1073,12 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		break;
+	case Src2Imm16:
+		c->src2.type = OP_IMM;
+		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.bytes = 2;
+		c->src2.val = insn_fetch(u16, 2, c->eip);
+		break;
 	case Src2One:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
-- 
cgit v0.10.2


From 0654169e7309f2f68ec4bea9257b14b05ea94d7d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:20 +0300
Subject: KVM: x86 emulator: Add lcall decoding

No emulation yet.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c015063..71b4bee 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -154,7 +154,8 @@ static u32 opcode_table[256] = {
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
-	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+	0, 0, SrcImm | Src2Imm16, 0,
+	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
 	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-- 
cgit v0.10.2


From 782b877c8073a9ef307ad6638ee472b8336b2b85 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:25 +0300
Subject: KVM: x86 emulator: Complete ljmp decoding at decode stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 71b4bee..8779cf2 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -193,7 +193,7 @@ static u32 opcode_table[256] = {
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
 	ImplicitOps | Stack, SrcImm | ImplicitOps,
-	ImplicitOps, SrcImmByte | ImplicitOps,
+	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xF0 - 0xF7 */
@@ -1805,30 +1805,15 @@ special_insn:
 	}
 	case 0xe9: /* jmp rel */
 		goto jmp;
-	case 0xea: /* jmp far */ {
-		uint32_t eip;
-		uint16_t sel;
-
-		switch (c->op_bytes) {
-		case 2:
-			eip = insn_fetch(u16, 2, c->eip);
-			break;
-		case 4:
-			eip = insn_fetch(u32, 4, c->eip);
-			break;
-		default:
-			DPRINTF("jmp far: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		sel = insn_fetch(u16, 2, c->eip);
-		if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+	case 0xea: /* jmp far */
+		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
+					VCPU_SREG_CS) < 0) {
 			DPRINTF("jmp far: Failed to load CS descriptor\n");
 			goto cannot_emulate;
 		}
 
-		c->eip = eip;
+		c->eip = c->src.val;
 		break;
-	}
 	case 0xeb:
 	      jmp:		/* jmp rel short */
 		jmp_rel(c, c->src.val);
-- 
cgit v0.10.2


From b2833e3cdebfe3ea4d0d1d3ce4d2ff1c42a4f8f4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:30 +0300
Subject: KVM: x86 emulator: Complete short/near jcc decoding in decode stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8779cf2..14b8ee2 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -136,11 +136,11 @@ static u32 opcode_table[256] = {
 	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
 	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
 	/* 0x70 - 0x77 */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	/* 0x78 - 0x7F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	/* 0x80 - 0x87 */
 	Group | Group1_80, Group | Group1_81,
 	Group | Group1_82, Group | Group1_83,
@@ -232,10 +232,8 @@ static u32 twobyte_table[256] = {
 	/* 0x70 - 0x7F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x80 - 0x8F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
@@ -1539,13 +1537,10 @@ special_insn:
 			return -1;
 		}
 		return 0;
-	case 0x70 ... 0x7f: /* jcc (short) */ {
-		int rel = insn_fetch(s8, 1, c->eip);
-
+	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, rel);
+			jmp_rel(c, c->src.val);
 		break;
-	}
 	case 0x80 ... 0x83:	/* Grp1 */
 		switch (c->modrm_reg) {
 		case 0:
@@ -2031,28 +2026,11 @@ twobyte_insn:
 		if (!test_cc(c->b, ctxt->eflags))
 			c->dst.type = OP_NONE; /* no writeback */
 		break;
-	case 0x80 ... 0x8f: /* jnz rel, etc*/ {
-		long int rel;
-
-		switch (c->op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, c->eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, c->eip);
-			break;
-		default:
-			DPRINTF("jnz: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
+	case 0x80 ... 0x8f: /* jnz rel, etc*/
 		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, rel);
+			jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE;
 		break;
-	}
 	case 0xa3:
 	      bt:		/* bt */
 		c->dst.type = OP_NONE;
-- 
cgit v0.10.2


From d53c4777b3a3e5031710d0664851d1309325884b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:36 +0300
Subject: KVM: x86 emulator: Complete decoding of call near in decode stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 14b8ee2..4a9cd4c 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -192,7 +192,7 @@ static u32 opcode_table[256] = {
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
-	ImplicitOps | Stack, SrcImm | ImplicitOps,
+	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
@@ -1781,18 +1781,7 @@ special_insn:
 		io_dir_in = 0;
 		goto do_io;
 	case 0xe8: /* call (near) */ {
-		long int rel;
-		switch (c->op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, c->eip);
-			break;
-		default:
-			DPRINTF("Call: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
+		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
 		emulate_push(ctxt);
-- 
cgit v0.10.2


From 341de7e3728ade102eaadf56af404f4ce865a73d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:41 +0300
Subject: KVM: x86 emulator: Add unsigned byte immediate decode

Extend "Source operand type" opcode description field to 4 bites
to accommodate new option.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 4a9cd4c..0988a13 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
 #define SrcImm      (5<<4)	/* Immediate operand. */
 #define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
 #define SrcOne      (7<<4)	/* Implied '1' */
-#define SrcMask     (7<<4)
+#define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
+#define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
-#define ModRM       (1<<7)
+#define ModRM       (1<<8)
 /* Destination is only written; never read. */
-#define Mov         (1<<8)
-#define BitOp       (1<<9)
-#define MemAbs      (1<<10)      /* Memory operand is absolute displacement */
+#define Mov         (1<<9)
+#define BitOp       (1<<10)
+#define MemAbs      (1<<11)      /* Memory operand is absolute displacement */
 #define String      (1<<12)     /* String instruction (rep capable) */
 #define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
@@ -1044,10 +1045,14 @@ done_prefixes:
 		}
 		break;
 	case SrcImmByte:
+	case SrcImmUByte:
 		c->src.type = OP_IMM;
 		c->src.ptr = (unsigned long *)c->eip;
 		c->src.bytes = 1;
-		c->src.val = insn_fetch(s8, 1, c->eip);
+		if ((c->d & SrcMask) == SrcImmByte)
+			c->src.val = insn_fetch(s8, 1, c->eip);
+		else
+			c->src.val = insn_fetch(u8, 1, c->eip);
 		break;
 	case SrcOne:
 		c->src.bytes = 1;
-- 
cgit v0.10.2


From 84ce66a6865192567172f08ab5269aa380fa6ead Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:46 +0300
Subject: KVM: x86 emulator: Completely decode in/out at decoding stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0988a13..c2f55ca 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -190,8 +190,8 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	ByteOp | SrcImmUByte, SrcImmUByte,
+	ByteOp | SrcImmUByte, SrcImmUByte,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
@@ -1777,12 +1777,12 @@ special_insn:
 		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
-		port = insn_fetch(u8, 1, c->eip);
+		port = c->src.val;
 		io_dir_in = 1;
 		goto do_io;
 	case 0xe6: /* outb */
 	case 0xe7: /* out */
-		port = insn_fetch(u8, 1, c->eip);
+		port = c->src.val;
 		io_dir_in = 0;
 		goto do_io;
 	case 0xe8: /* call (near) */ {
-- 
cgit v0.10.2


From e637b8238ad1783ebdd113bd34cd6982dec1006d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:52 +0300
Subject: KVM: x86 emulator: Decode soft interrupt instructions

Do not emulate them yet.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c2f55ca..d2664fc 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -181,7 +181,8 @@ static u32 opcode_table[256] = {
 	0, ImplicitOps | Stack, 0, 0,
 	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
 	/* 0xC8 - 0xCF */
-	0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
+	0, 0, 0, ImplicitOps | Stack,
+	ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
 	/* 0xD0 - 0xD7 */
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-- 
cgit v0.10.2


From ba8afb6b0a2c7e06da760ffe5d078245058619b5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:57 +0300
Subject: KVM: x86 emulator: Add new mode of instruction emulation: skip

In the new mode instruction is decoded, but not executed. The EIP
is moved to point after the instruction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e3a7c6..cb306cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -562,6 +562,7 @@ enum emulation_result {
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
+#define EMULTYPE_SKIP		    (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b5ac1b7..8beccaa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2413,6 +2413,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (emulation_type & EMULTYPE_SKIP) {
+		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+		return EMULATE_DONE;
+	}
+
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
 	if (vcpu->arch.pio.string)
-- 
cgit v0.10.2


From 8317c298eab14cc20e2de5289743ff0d4c5a6b30 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:37:02 +0300
Subject: KVM: SVM: Skip instruction on a task switch only when appropriate

If a task switch was initiated because off a task gate in IDT and IDT
was accessed because of an external even the instruction should not
be skipped.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bba67b7..8fc6eea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1828,6 +1828,7 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	int reason;
 	int int_type = svm->vmcb->control.exit_int_info &
 		SVM_EXITINTINFO_TYPE_MASK;
+	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -1843,8 +1844,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
 		reason = TASK_SWITCH_CALL;
 
 
-	if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT)
-		skip_emulated_instruction(&svm->vcpu);
+	if (reason != TASK_SWITCH_GATE ||
+	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0,
+					EMULTYPE_SKIP) != EMULATE_DONE)
+			return 0;
+	}
 
 	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
-- 
cgit v0.10.2


From 463656c0007ddccee78db383eeb9e6eac75ccb7f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 12 Apr 2009 15:49:07 +0300
Subject: KVM: Replace kvmclock open-coded get_cpu_var() with the real thing

Suggested by Ingo Molnar.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8beccaa..ffbb2c8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -631,16 +631,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
+	unsigned long this_tsc_khz;
 
 	if ((!vcpu->time_page))
 		return;
 
-	preempt_disable();
-	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
-		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+		vcpu->hv_clock_tsc_khz = this_tsc_khz;
 	}
-	preempt_enable();
+	put_cpu_var(cpu_tsc_khz);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-- 
cgit v0.10.2


From c6b60c6921381130e5288b19f5fdf81152230b37 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 10:43:48 +0200
Subject: KVM: ia64: Don't hold slots_lock in guest mode

Reorder locking to avoid holding the slots_lock when entering
the guest.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang<xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 3bf0a34..f127fb7 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -632,34 +632,22 @@ static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 	vti_set_rr6(vcpu->arch.vmm_rr);
 	return kvm_insert_vmm_mapping(vcpu);
 }
+
 static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
 {
 	kvm_purge_vmm_mapping(vcpu);
 	vti_set_rr6(vcpu->arch.host_rr6);
 }
 
-static int  vti_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	union context *host_ctx, *guest_ctx;
 	int r;
 
-	/*Get host and guest context with guest address space.*/
-	host_ctx = kvm_get_host_context(vcpu);
-	guest_ctx = kvm_get_guest_context(vcpu);
-
-	r = kvm_vcpu_pre_transition(vcpu);
-	if (r < 0)
-		goto out;
-	kvm_vmm_info->tramp_entry(host_ctx, guest_ctx);
-	kvm_vcpu_post_transition(vcpu);
-	r = 0;
-out:
-	return r;
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	int r;
+	/*
+	 * down_read() may sleep and return with interrupts enabled
+	 */
+	down_read(&vcpu->kvm->slots_lock);
 
 again:
 	if (signal_pending(current)) {
@@ -668,23 +656,28 @@ again:
 		goto out;
 	}
 
-	/*
-	 * down_read() may sleep and return with interrupts enabled
-	 */
-	down_read(&vcpu->kvm->slots_lock);
-
 	preempt_disable();
 	local_irq_disable();
 
+	/*Get host and guest context with guest address space.*/
+	host_ctx = kvm_get_host_context(vcpu);
+	guest_ctx = kvm_get_guest_context(vcpu);
+
 	vcpu->guest_mode = 1;
+
+	r = kvm_vcpu_pre_transition(vcpu);
+	if (r < 0)
+		goto vcpu_run_fail;
+
+	up_read(&vcpu->kvm->slots_lock);
 	kvm_guest_enter();
-	r = vti_vcpu_run(vcpu, kvm_run);
-	if (r < 0) {
-		local_irq_enable();
-		preempt_enable();
-		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		goto out;
-	}
+
+	/*
+	 * Transition to the guest
+	 */
+	kvm_vmm_info->tramp_entry(host_ctx, guest_ctx);
+
+	kvm_vcpu_post_transition(vcpu);
 
 	vcpu->arch.launched = 1;
 	vcpu->guest_mode = 0;
@@ -698,9 +691,10 @@ again:
 	 */
 	barrier();
 	kvm_guest_exit();
-	up_read(&vcpu->kvm->slots_lock);
 	preempt_enable();
 
+	down_read(&vcpu->kvm->slots_lock);
+
 	r = kvm_handle_exit(kvm_run, vcpu);
 
 	if (r > 0) {
@@ -709,12 +703,20 @@ again:
 	}
 
 out:
+	up_read(&vcpu->kvm->slots_lock);
 	if (r > 0) {
 		kvm_resched(vcpu);
+		down_read(&vcpu->kvm->slots_lock);
 		goto again;
 	}
 
 	return r;
+
+vcpu_run_fail:
+	local_irq_enable();
+	preempt_enable();
+	kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+	goto out;
 }
 
 static void kvm_set_mmio_data(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 59839dfff5eabca01cc4e20b45797a60a80af8cb Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 16 Apr 2009 08:30:44 -0300
Subject: KVM: x86: check for cr3 validity in ioctl_set_sregs

Matt T. Yourst notes that kvm_arch_vcpu_ioctl_set_sregs lacks validity
checking for the new cr3 value:

"Userspace callers of KVM_SET_SREGS can pass a bogus value of cr3 to
the kernel. This will trigger a NULL pointer access in gfn_to_rmap()
when userspace next tries to call KVM_RUN on the affected VCPU and kvm
attempts to activate the new non-existent page table root.

This happens since kvm only validates that cr3 points to a valid guest
physical memory page when code *inside* the guest sets cr3. However, kvm
currently trusts the userspace caller (e.g. QEMU) on the host machine to
always supply a valid page table root, rather than properly validating
it along with the rest of the reloaded guest state."

http://sourceforge.net/tracker/?func=detail&atid=893831&aid=2687641&group_id=180599

Check for a valid cr3 address in kvm_arch_vcpu_ioctl_set_sregs, triple
fault in case of failure.

Cc: stable@kernel.org
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffbb2c8..2bad49b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3993,7 +3993,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	vcpu->arch.cr2 = sregs->cr2;
 	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-	vcpu->arch.cr3 = sregs->cr3;
+
+	down_read(&vcpu->kvm->slots_lock);
+	if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+		vcpu->arch.cr3 = sregs->cr3;
+	else
+		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+	up_read(&vcpu->kvm->slots_lock);
 
 	kvm_set_cr8(vcpu, sregs->cr8);
 
-- 
cgit v0.10.2


From 64f6afbd4c3317eb6845d770aa345afd3f120471 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 16 Apr 2009 17:59:16 +0800
Subject: KVM: ia64: Flush all TLBs once guest's memory mapping changes.

Flush all vcpu's TLB entries once changes guest's memory mapping.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f127fb7..9addca6 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1631,6 +1631,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
+	kvm_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_dev_ioctl(struct file *filp,
-- 
cgit v0.10.2


From f9b647adda0e821538b6d80e52873d861ffb1799 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 11:24:58 +0200
Subject: KVM: ia64: remove empty function vti_vcpu_load()

vti_vcpu_load() doesn't do anything, so lets get rid of it.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang<xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 9addca6..7263171 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1100,10 +1100,6 @@ static void kvm_free_vmm_area(void)
 	}
 }
 
-static void vti_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-}
-
 static int vti_init_vpd(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -1348,7 +1344,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->kvm = kvm;
 
 	cpu = get_cpu();
-	vti_vcpu_load(vcpu, cpu);
 	r = vti_vcpu_setup(vcpu, id);
 	put_cpu();
 
-- 
cgit v0.10.2


From 457459c3c738dfb37226ba116ba301140da0d1fb Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 16:08:29 +0200
Subject: KVM: ia64: restore irq state before calling kvm_vcpu_init

Make sure to restore the psr after calling kvm_insert_vmm_mapping()
which calls ia64_itr_entry() as it disables local interrupts and
kvm_vcpu_init() may sleep.

Avoids a warning from the lock debugging code.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7263171..5b868db 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1290,6 +1290,7 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id)
 
 	local_irq_save(psr);
 	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
 	if (r)
 		goto fail;
 	r = kvm_vcpu_init(vcpu, vcpu->kvm, id);
@@ -1307,13 +1308,11 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id)
 		goto uninit;
 
 	kvm_purge_vmm_mapping(vcpu);
-	local_irq_restore(psr);
 
 	return 0;
 uninit:
 	kvm_vcpu_uninit(vcpu);
 fail:
-	local_irq_restore(psr);
 	return r;
 }
 
-- 
cgit v0.10.2


From 4d13c3b04f14a9a72ffcdd082acc243e7e56b4e0 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 16:53:13 +0200
Subject: KVM: ia64: preserve int status through call to kvm_insert_vmm_mapping

Preserve interrupt status around call to kvm_insert_vmm_mappin()
in kvm_vcpu_pre_transition().

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5b868db..cf5a193 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -619,6 +619,8 @@ static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 {
+	unsigned long psr;
+	int r;
 	int cpu = smp_processor_id();
 
 	if (vcpu->arch.last_run_cpu != cpu ||
@@ -630,7 +632,10 @@ static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.host_rr6 = ia64_get_rr(RR6);
 	vti_set_rr6(vcpu->arch.vmm_rr);
-	return kvm_insert_vmm_mapping(vcpu);
+	local_irq_save(psr);
+	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
+	return r;
 }
 
 static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 43890ae8bcc5fa88ef3061613efe041b866fbc5a Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Fri, 17 Apr 2009 16:43:27 +0200
Subject: KVM: ia64: ia64 vcpu_reset() do not call kmalloc() with irqs disabled

Restore local irq enabled state before calling kvm_arch_vcpu_init(),
which calls kmalloc(GFP_KERNEL).

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index cf5a193..be4413e 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -2001,6 +2001,7 @@ static int vcpu_reset(struct kvm_vcpu *vcpu)
 	long psr;
 	local_irq_save(psr);
 	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
 	if (r)
 		goto fail;
 
@@ -2013,7 +2014,6 @@ static int vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_purge_vmm_mapping(vcpu);
 	r = 0;
 fail:
-	local_irq_restore(psr);
 	return r;
 }
 
-- 
cgit v0.10.2


From 3438253926822a6bf8487b4f7d82f26a2c0b2388 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sat, 25 Apr 2009 12:43:21 +0200
Subject: KVM: MMU: Fix auditing code

Fix build breakage of hpa lookup in audit_mappings_page. Moreover, make
this function robust against shadow_notrap_nonpresent_pte entries.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b39ec62..3592aea 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3029,11 +3029,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				       " in nonleaf level: levels %d gva %lx"
 				       " level %d pte %llx\n", audit_msg,
 				       vcpu->arch.mmu.root_level, va, level, ent);
-
-			audit_mappings_page(vcpu, ent, va, level - 1);
+			else
+				audit_mappings_page(vcpu, ent, va, level - 1);
 		} else {
 			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-			hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
+			gfn_t gfn = gpa >> PAGE_SHIFT;
+			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
+			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
 
 			if (is_shadow_present_pte(ent)
 			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-- 
cgit v0.10.2


From 8061823a25218174f30c3dd943989e1d72f7d06e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:56 +0300
Subject: KVM: Make kvm_cpu_(has|get)_interrupt() work for userspace irqchip
 too

At the vector level, kernel and userspace irqchip are fairly similar.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed5..11c2757 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
 
 #include "irq.h"
 #include "i8254.h"
+#include "x86.h"
 
 /*
  * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
 	struct kvm_pic *s;
 
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.irq_summary;
+
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);	/* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 	int vector;
 
+	if (!irqchip_in_kernel(v->kvm))
+		return kvm_pop_irq(v);
+
 	vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
 		if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8fc6eea..6eef6d2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2091,8 +2091,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (kvm_run->request_interrupt_window &&
-	    !svm->vcpu.arch.irq_summary) {
+	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+	    kvm_run->request_interrupt_window &&
+	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
 		++svm->vcpu.stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
@@ -2373,7 +2374,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
 		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
 
-	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
+	if (svm->vcpu.arch.interrupt_window_open &&
+	    kvm_cpu_has_interrupt(&svm->vcpu))
 		/*
 		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
 		 */
@@ -2383,7 +2385,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	 * Interrupts blocked.  Wait for unblock.
 	 */
 	if (!svm->vcpu.arch.interrupt_window_open &&
-	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
+	    (kvm_cpu_has_interrupt(&svm->vcpu) ||
+	     kvm_run->request_interrupt_window))
 		svm_set_vintr(svm);
 	else
 		svm_clear_vintr(svm);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6997c0..b3292c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2535,21 +2535,20 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		vmx_inject_nmi(vcpu);
 		if (vcpu->arch.nmi_pending)
 			enable_nmi_window(vcpu);
-		else if (vcpu->arch.irq_summary
-			 || kvm_run->request_interrupt_window)
+		else if (kvm_cpu_has_interrupt(vcpu) ||
+			 kvm_run->request_interrupt_window)
 			enable_irq_window(vcpu);
 		return;
 	}
 
 	if (vcpu->arch.interrupt_window_open) {
-		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-			kvm_queue_interrupt(vcpu, kvm_pop_irq(vcpu));
+		if (kvm_cpu_has_interrupt(vcpu) && !vcpu->arch.interrupt.pending)
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 
 		if (vcpu->arch.interrupt.pending)
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-	}
-	if (!vcpu->arch.interrupt_window_open &&
-	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
+	} else if(kvm_cpu_has_interrupt(vcpu) ||
+		  kvm_run->request_interrupt_window)
 		enable_irq_window(vcpu);
 }
 
@@ -2976,8 +2975,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (kvm_run->request_interrupt_window &&
-	    !vcpu->arch.irq_summary) {
+	if (!irqchip_in_kernel(vcpu->kvm) &&
+	    kvm_run->request_interrupt_window &&
+	    !kvm_cpu_has_interrupt(vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2bad49b..4c2eb7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3069,7 +3069,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->arch.irq_summary &&
+	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->arch.interrupt_window_open &&
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
@@ -3086,7 +3086,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 	else
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
-					 vcpu->arch.irq_summary == 0);
+					 !kvm_cpu_has_interrupt(vcpu));
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 863e8e658ee9ac6e5931b295eb7428456e450a0f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:57 +0300
Subject: KVM: VMX: Consolidate userspace and kernel interrupt injection for
 VMX

Use the same callback to inject irq/nmi events no matter what irqchip is
in use. Only from VMX for now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cb306cf..5edae35 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -517,7 +517,7 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
 	bool (*exception_injected)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6eef6d2..f2933ab 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2298,7 +2298,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 		(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
+static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b3292c1..06252f7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2510,48 +2510,6 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return vcpu->arch.interrupt_window_open;
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	vmx_update_window_states(vcpu);
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-				GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS);
-
-	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-		if (vcpu->arch.interrupt.pending) {
-			enable_nmi_window(vcpu);
-		} else if (vcpu->arch.nmi_window_open) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-		} else {
-			enable_nmi_window(vcpu);
-			return;
-		}
-	}
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending)
-			enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) ||
-			 kvm_run->request_interrupt_window)
-			enable_irq_window(vcpu);
-		return;
-	}
-
-	if (vcpu->arch.interrupt_window_open) {
-		if (kvm_cpu_has_interrupt(vcpu) && !vcpu->arch.interrupt.pending)
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-
-		if (vcpu->arch.interrupt.pending)
-			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-	} else if(kvm_cpu_has_interrupt(vcpu) ||
-		  kvm_run->request_interrupt_window)
-		enable_irq_window(vcpu);
-}
-
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	int ret;
@@ -3351,8 +3309,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
+
 	update_tpr_threshold(vcpu);
 
 	vmx_update_window_states(vcpu);
@@ -3373,25 +3334,25 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			return;
 		}
 	}
+
 	if (vcpu->arch.nmi_injected) {
 		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending)
-			enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu))
-			enable_irq_window(vcpu);
-		return;
+		goto out;
 	}
+
 	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
 		if (vcpu->arch.interrupt_window_open)
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-		else
-			enable_irq_window(vcpu);
 	}
-	if (vcpu->arch.interrupt.pending) {
+
+	if (vcpu->arch.interrupt.pending)
 		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		if (kvm_cpu_has_interrupt(vcpu))
-			enable_irq_window(vcpu);
-	}
+
+out:
+	if (vcpu->arch.nmi_pending)
+		enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		enable_irq_window(vcpu);
 }
 
 /*
@@ -3733,7 +3694,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.queue_exception = vmx_queue_exception,
 	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
+	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c2eb7c..a84c96a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3173,7 +3173,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else if (irqchip_in_kernel(vcpu->kvm))
-		kvm_x86_ops->inject_pending_irq(vcpu);
+		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 	else
 		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
 
-- 
cgit v0.10.2


From 1f21e79aaced0a041e9399346960ce26ae0f5a4e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:58 +0300
Subject: KVM: VMX: Cleanup vmx_intr_assist()

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06252f7..9eb518f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3309,6 +3309,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
+static void vmx_intr_inject(struct kvm_vcpu *vcpu)
+{
+	/* try to reinject previous events if any */
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		return;
+	}
+
+	if (vcpu->arch.interrupt.pending) {
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		return;
+	}
+
+	/* try to inject new event if pending */
+	if (vcpu->arch.nmi_pending) {
+		if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+			vmx_inject_nmi(vcpu);
+		}
+	} else if (kvm_cpu_has_interrupt(vcpu)) {
+		if (vcpu->arch.interrupt_window_open) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		}
+	}
+}
+
 static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
@@ -3323,32 +3351,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 				GUEST_INTR_STATE_STI |
 				GUEST_INTR_STATE_MOV_SS);
 
-	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-		if (vcpu->arch.interrupt.pending) {
-			enable_nmi_window(vcpu);
-		} else if (vcpu->arch.nmi_window_open) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-		} else {
-			enable_nmi_window(vcpu);
-			return;
-		}
-	}
-
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		goto out;
-	}
-
-	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open)
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-	}
-
-	if (vcpu->arch.interrupt.pending)
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	vmx_intr_inject(vcpu);
 
-out:
+	/* enable NMI/IRQ window open exits if needed */
 	if (vcpu->arch.nmi_pending)
 		enable_nmi_window(vcpu);
 	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-- 
cgit v0.10.2


From 5df56646472c42495dd2412c8d8aa72e59efe79a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:59 +0300
Subject: KVM: Use kvm_arch_interrupt_allowed() instead of checking
 interrupt_window_open directly

kvm_arch_interrupt_allowed() also checks IF so drop the check.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a84c96a..ae6250b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3071,8 +3071,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 {
 	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		kvm_run->request_interrupt_window &&
-		vcpu->arch.interrupt_window_open &&
-		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+		kvm_arch_interrupt_allowed(vcpu));
 }
 
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3085,7 +3084,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
-					(vcpu->arch.interrupt_window_open &&
+					(kvm_arch_interrupt_allowed(vcpu) &&
 					 !kvm_cpu_has_interrupt(vcpu));
 }
 
-- 
cgit v0.10.2


From 9222be18f76b4410b4da0d06e1cc21079e64b3ec Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 23 Apr 2009 17:14:37 +0300
Subject: KVM: SVM: Coalesce userspace/kernel irqchip interrupt injection logic

Start to use interrupt/exception queues like VMX does.
This also fix the bug that if exit was caused by a guest
internal exception access to IDT the exception was not
reinjected.

Use EVENTINJ to inject interrupts.  Use VINT only for detecting when IRQ
windows is open again.  EVENTINJ ensures
the interrupt is injected immediately and not delayed.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f2933ab..a80ffaa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -70,7 +70,6 @@ module_param(npt, int, S_IRUGO);
 static int nested = 0;
 module_param(nested, int, S_IRUGO);
 
-static void kvm_reput_irq(struct vcpu_svm *svm);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -199,9 +198,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 
 static bool svm_exception_injected(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+	return false;
 }
 
 static int is_external_interrupt(u32 info)
@@ -978,12 +975,9 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 
 static int svm_get_irq(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
-	if (is_external_interrupt(exit_int_info))
-		return exit_int_info & SVM_EVTINJ_VEC_MASK;
-	return -1;
+	if (!vcpu->arch.interrupt.pending)
+		return -1;
+	return vcpu->arch.interrupt.nr;
 }
 
 static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1090,17 +1084,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 
 static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-	struct kvm *kvm = svm->vcpu.kvm;
 	u64 fault_address;
 	u32 error_code;
-	bool event_injection = false;
-
-	if (!irqchip_in_kernel(kvm) &&
-	    is_external_interrupt(exit_int_info)) {
-		event_injection = true;
-		kvm_push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-	}
 
 	fault_address  = svm->vmcb->control.exit_info_2;
 	error_code = svm->vmcb->control.exit_info_1;
@@ -1120,9 +1105,11 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	 */
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
-
-	if (!npt_enabled && event_injection)
-		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+	else {
+		if (svm->vcpu.arch.interrupt.pending ||
+				svm->vcpu.arch.exception.pending)
+			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+	}
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
@@ -2196,7 +2183,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		}
 	}
 
-	kvm_reput_irq(svm);
 
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2259,13 +2245,19 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 }
 
+static void svm_queue_irq(struct vcpu_svm *svm, unsigned nr)
+{
+	svm->vmcb->control.event_inj = nr |
+		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+}
+
 static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	nested_svm_intr(svm);
 
-	svm_inject_irq(svm, irq);
+	svm_queue_irq(svm, irq);
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -2298,98 +2290,47 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 		(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
-static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb *vmcb = svm->vmcb;
-	int intr_vector = -1;
-
-	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
-	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
-		intr_vector = vmcb->control.exit_int_info &
-			      SVM_EVTINJ_VEC_MASK;
-		vmcb->control.exit_int_info = 0;
-		svm_inject_irq(svm, intr_vector);
-		goto out;
-	}
-
-	if (vmcb->control.int_ctl & V_IRQ_MASK)
-		goto out;
-
-	if (!kvm_cpu_has_interrupt(vcpu))
-		goto out;
-
-	if (nested_svm_intr(svm))
-		goto out;
-
-	if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
-		goto out;
-
-	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
-	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
-		/* unable to deliver irq, set pending irq */
-		svm_set_vintr(svm);
-		svm_inject_irq(svm, 0x0);
-		goto out;
-	}
-	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
-	intr_vector = kvm_cpu_get_interrupt(vcpu);
-	svm_inject_irq(svm, intr_vector);
-out:
-	update_cr8_intercept(vcpu);
+	svm_set_vintr(to_svm(vcpu));
+	svm_inject_irq(to_svm(vcpu), 0x0);
 }
 
-static void kvm_reput_irq(struct vcpu_svm *svm)
+static void svm_intr_inject(struct kvm_vcpu *vcpu)
 {
-	struct vmcb_control_area *control = &svm->vmcb->control;
-
-	if ((control->int_ctl & V_IRQ_MASK)
-	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
-		control->int_ctl &= ~V_IRQ_MASK;
-		kvm_push_irq(&svm->vcpu, control->int_vector);
+	/* try to reinject previous events if any */
+	if (vcpu->arch.interrupt.pending) {
+		svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
+		return;
 	}
 
-	svm->vcpu.arch.interrupt_window_open =
-		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
-
-static void svm_do_inject_vector(struct vcpu_svm *svm)
-{
-	svm_inject_irq(svm, kvm_pop_irq(&svm->vcpu));
+	/* try to inject new event if pending */
+	if (kvm_cpu_has_interrupt(vcpu)) {
+		if (vcpu->arch.interrupt_window_open) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
+		}
+	}
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
+static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb_control_area *control = &svm->vmcb->control;
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
 
 	if (nested_svm_intr(svm))
-		return;
+		goto out;
 
-	svm->vcpu.arch.interrupt_window_open =
-		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
-		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
+	svm->vcpu.arch.interrupt_window_open = svm_interrupt_allowed(vcpu);
 
-	if (svm->vcpu.arch.interrupt_window_open &&
-	    kvm_cpu_has_interrupt(&svm->vcpu))
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
-		svm_do_inject_vector(svm);
+	svm_intr_inject(vcpu);
 
-	/*
-	 * Interrupts blocked.  Wait for unblock.
-	 */
-	if (!svm->vcpu.arch.interrupt_window_open &&
-	    (kvm_cpu_has_interrupt(&svm->vcpu) ||
-	     kvm_run->request_interrupt_window))
-		svm_set_vintr(svm);
-	else
-		svm_clear_vintr(svm);
+	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		enable_irq_window(vcpu);
+
+out:
+	update_cr8_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2429,6 +2370,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+	u8 vector;
+	int type;
+	u32 exitintinfo = svm->vmcb->control.exit_int_info;
+
+	svm->vcpu.arch.nmi_injected = false;
+	kvm_clear_exception_queue(&svm->vcpu);
+	kvm_clear_interrupt_queue(&svm->vcpu);
+
+	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+		return;
+
+	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+	switch (type) {
+	case SVM_EXITINTINFO_TYPE_NMI:
+		svm->vcpu.arch.nmi_injected = true;
+		break;
+	case SVM_EXITINTINFO_TYPE_EXEPT:
+		/* In case of software exception do not reinject an exception
+		   vector, but re-execute and instruction instead */
+		if (vector == BP_VECTOR || vector == OF_VECTOR)
+			break;
+		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+			u32 err = svm->vmcb->control.exit_int_info_err;
+			kvm_queue_exception_e(&svm->vcpu, vector, err);
+
+		} else
+			kvm_queue_exception(&svm->vcpu, vector);
+		break;
+	case SVM_EXITINTINFO_TYPE_INTR:
+		kvm_queue_interrupt(&svm->vcpu, vector);
+		break;
+	default:
+		break;
+	}
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #else
@@ -2557,6 +2538,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
+
+	svm_complete_interrupts(svm);
 }
 
 #undef R
@@ -2678,7 +2661,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.queue_exception = svm_queue_exception,
 	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
+	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
-- 
cgit v0.10.2


From 1cb948ae86f3d95cce58fac51d00766825f5f783 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:02 +0300
Subject: KVM: Remove exception_injected() callback.

It always return false for VMX/SVM now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5edae35..ea3741e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -516,7 +516,6 @@ struct kvm_x86_ops {
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	bool (*exception_injected)(struct kvm_vcpu *vcpu);
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a80ffaa..8fa5a0e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -196,11 +196,6 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	svm->vmcb->control.event_inj_err = error_code;
 }
 
-static bool svm_exception_injected(struct kvm_vcpu *vcpu)
-{
-	return false;
-}
-
 static int is_external_interrupt(u32 info)
 {
 	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -2659,7 +2654,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
 	.queue_exception = svm_queue_exception,
-	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
 	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9eb518f..3186fcf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -789,11 +789,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 }
 
-static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
-{
-	return false;
-}
-
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -3697,7 +3692,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
 	.queue_exception = vmx_queue_exception,
-	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
 	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae6250b..b81970b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3237,8 +3237,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
-	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
-		vcpu->arch.exception.pending = false;
 
 	kvm_lapic_sync_from_vapic(vcpu);
 
-- 
cgit v0.10.2


From 1d6ed0cb95a2f0839e1a31f1971dc37cd60c258a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:03 +0300
Subject: KVM: Remove inject_pending_vectors() callback

It is the same as inject_pending_irq() for VMX/SVM now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea3741e..aa5a54e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -517,8 +517,6 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-				       struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8fa5a0e..2905475 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2655,7 +2655,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_irq = svm_set_irq,
 	.queue_exception = svm_queue_exception,
 	.inject_pending_irq = svm_intr_assist,
-	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3186fcf..9162b4c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3693,7 +3693,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_irq = vmx_inject_irq,
 	.queue_exception = vmx_queue_exception,
 	.inject_pending_irq = vmx_intr_assist,
-	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b81970b..0890df9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3171,10 +3171,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
-	else if (irqchip_in_kernel(vcpu->kvm))
-		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 	else
-		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 
 	kvm_lapic_sync_to_vapic(vcpu);
 
-- 
cgit v0.10.2


From 115666dfc733a6749d99bcf3b2fe3fa253218b36 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:04 +0300
Subject: KVM: Remove kvm_push_irq()

No longer used.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2ab6791..39350b2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -30,11 +30,4 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
 		clear_bit(word_index, &vcpu->arch.irq_summary);
 	return irq;
 }
-
-static inline void kvm_push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-        set_bit(irq, vcpu->arch.irq_pending);
-        set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
 #endif
-- 
cgit v0.10.2


From 615d5193055880d44db92b72403b7549251ac2a6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:05 +0300
Subject: KVM: sync_lapic_to_cr8() should always sync cr8 to V_TPR

Even if IRQ chip is in userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2905475..143818e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2348,7 +2348,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
-		kvm_lapic_set_tpr(vcpu, cr8);
+		kvm_set_cr8(vcpu, cr8);
 	}
 }
 
@@ -2357,9 +2357,6 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
 	cr8 = kvm_get_cr8(vcpu);
 	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-- 
cgit v0.10.2


From 0a5fff192388d2a74aa9ab5e0d394b745df9f225 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:06 +0300
Subject: KVM: Do not report TPR write to userspace if new value bigger or
 equal to a previous one.

Saves many exits to userspace in a case of IRQ chip in userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 143818e..e283a63 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1862,9 +1862,13 @@ static int emulate_on_interception(struct vcpu_svm *svm,
 
 static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
+	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+	/* instruction emulation calls kvm_set_cr8() */
 	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm))
 		return 1;
+	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+		return 1;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9162b4c..51f804c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2724,13 +2724,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
-		case 8:
-			kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
-			skip_emulated_instruction(vcpu);
-			if (irqchip_in_kernel(vcpu->kvm))
-				return 1;
-			kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-			return 0;
+		case 8: {
+				u8 cr8_prev = kvm_get_cr8(vcpu);
+				u8 cr8 = kvm_register_read(vcpu, reg);
+				kvm_set_cr8(vcpu, cr8);
+				skip_emulated_instruction(vcpu);
+				if (irqchip_in_kernel(vcpu->kvm))
+					return 1;
+				if (cr8_prev <= cr8)
+					return 1;
+				kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+				return 0;
+			}
 		};
 		break;
 	case 2: /* clts */
-- 
cgit v0.10.2


From c4282df98ae0993983924c00ed76428a6609d68b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:07 +0300
Subject: KVM: Get rid of arch.interrupt_window_open & arch.nmi_window_open

They are recalculated before each use anyway.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa5a54e..53533ea 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,7 +266,6 @@ struct kvm_mmu {
 
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	int interrupt_window_open;
 	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
@@ -360,7 +359,6 @@ struct kvm_vcpu_arch {
 
 	bool nmi_pending;
 	bool nmi_injected;
-	bool nmi_window_open;
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e283a63..0f53439 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -216,8 +216,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
 	kvm_rip_write(vcpu, svm->next_rip);
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
-	vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
 static int has_svm(void)
@@ -2305,7 +2303,7 @@ static void svm_intr_inject(struct kvm_vcpu *vcpu)
 
 	/* try to inject new event if pending */
 	if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (svm_interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
 		}
@@ -2321,8 +2319,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (nested_svm_intr(svm))
 		goto out;
 
-	svm->vcpu.arch.interrupt_window_open = svm_interrupt_allowed(vcpu);
-
 	svm_intr_inject(vcpu);
 
 	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 51f804c..116eac0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -753,7 +753,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	if (interruptibility & 3)
 		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 			     interruptibility & ~3);
-	vcpu->arch.interrupt_window_open = 1;
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -2482,27 +2481,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
-static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-
-	vcpu->arch.nmi_window_open =
-		!(guest_intr & (GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS |
-				GUEST_INTR_STATE_NMI));
 	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-		vcpu->arch.nmi_window_open = 0;
+		return 0;
 
-	vcpu->arch.interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 !(guest_intr & (GUEST_INTR_STATE_STI |
-				 GUEST_INTR_STATE_MOV_SS)));
+	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
+				GUEST_INTR_STATE_NMI));
 }
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	vmx_update_window_states(vcpu);
-	return vcpu->arch.interrupt_window_open;
+	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3194,9 +3187,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		       __func__, vectoring_info, exit_reason);
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
-			vcpu->arch.nmi_window_open = 1;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
 			   vcpu->arch.nmi_pending) {
 			/*
@@ -3209,7 +3201,6 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			       "state on VCPU %d after 1 s timeout\n",
 			       __func__, vcpu->vcpu_id);
 			vmx->soft_vnmi_blocked = 0;
-			vmx->vcpu.arch.nmi_window_open = 1;
 		}
 	}
 
@@ -3324,13 +3315,13 @@ static void vmx_intr_inject(struct kvm_vcpu *vcpu)
 
 	/* try to inject new event if pending */
 	if (vcpu->arch.nmi_pending) {
-		if (vcpu->arch.nmi_window_open) {
+		if (vmx_nmi_allowed(vcpu)) {
 			vcpu->arch.nmi_pending = false;
 			vcpu->arch.nmi_injected = true;
 			vmx_inject_nmi(vcpu);
 		}
 	} else if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (vmx_interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
 		}
@@ -3344,8 +3335,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	update_tpr_threshold(vcpu);
 
-	vmx_update_window_states(vcpu);
-
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 				GUEST_INTR_STATE_STI |
@@ -3518,8 +3507,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
-	vmx_update_window_states(vcpu);
-
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
-- 
cgit v0.10.2


From 95ba82731374eb1c2af4dd442526c4b314f0e8b6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:08 +0300
Subject: KVM: SVM: Add NMI injection support

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53533ea..dd9ecd3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -512,10 +512,15 @@ struct kvm_x86_ops {
 				unsigned char *hypercall_addr);
 	int (*get_irq)(struct kvm_vcpu *vcpu);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	int (*get_mt_mask_shift)(void);
@@ -763,6 +768,7 @@ enum {
 #define HF_GIF_MASK		(1 << 0)
 #define HF_HIF_MASK		(1 << 1)
 #define HF_VINTR_MASK		(1 << 2)
+#define HF_NMI_MASK		(1 << 3)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0f53439..1807288 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1843,6 +1843,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	++svm->vcpu.stat.nmi_window_exits;
+	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+	return 1;
+}
+
 static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1863,8 +1871,10 @@ static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
-	if (irqchip_in_kernel(svm->vcpu.kvm))
+	if (irqchip_in_kernel(svm->vcpu.kvm)) {
+		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 		return 1;
+	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return 1;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
@@ -2120,6 +2130,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
 	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
 	[SVM_EXIT_CPUID]			= cpuid_interception,
+	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
@@ -2227,6 +2238,21 @@ static void pre_svm_run(struct vcpu_svm *svm)
 		new_asid(svm, svm_data);
 }
 
+static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+	vcpu->arch.hflags |= HF_NMI_MASK;
+	svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	++vcpu->stat.nmi_injections;
+}
 
 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 {
@@ -2242,8 +2268,10 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 }
 
-static void svm_queue_irq(struct vcpu_svm *svm, unsigned nr)
+static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+
 	svm->vmcb->control.event_inj = nr |
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
@@ -2254,28 +2282,26 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 
 	nested_svm_intr(svm);
 
-	svm_queue_irq(svm, irq);
+	svm_queue_irq(vcpu, irq);
 }
 
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb *vmcb = svm->vmcb;
-	int max_irr, tpr;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
+	if (irr == -1)
 		return;
 
-	vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
-
-	max_irr = kvm_lapic_find_highest_irr(vcpu);
-	if (max_irr == -1)
-		return;
-
-	tpr = kvm_lapic_get_cr8(vcpu) << 4;
+	if (tpr >= irr)
+		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+}
 
-	if (tpr >= (max_irr & 0xf0))
-		vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
@@ -2293,39 +2319,12 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 	svm_inject_irq(to_svm(vcpu), 0x0);
 }
 
-static void svm_intr_inject(struct kvm_vcpu *vcpu)
-{
-	/* try to reinject previous events if any */
-	if (vcpu->arch.interrupt.pending) {
-		svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
-		return;
-	}
-
-	/* try to inject new event if pending */
-	if (kvm_cpu_has_interrupt(vcpu)) {
-		if (svm_interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
-		}
-	}
-}
-
-static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
 
-	if (nested_svm_intr(svm))
-		goto out;
-
-	svm_intr_inject(vcpu);
-
-	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 		enable_irq_window(vcpu);
-
-out:
-	update_cr8_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2650,9 +2649,14 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.patch_hypercall = svm_patch_hypercall,
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
+	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
-	.inject_pending_irq = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
+	.nmi_allowed = svm_nmi_allowed,
+	.enable_nmi_window = enable_nmi_window,
+	.enable_irq_window = enable_irq_window,
+	.update_cr8_intercept = update_cr8_intercept,
+	.drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 116eac0..bad2413 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1314,6 +1314,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
+	if (!cpu_has_vmx_tpr_shadow())
+		kvm_x86_ops->update_cr8_intercept = NULL;
+
 	return alloc_kvm_area();
 }
 
@@ -2404,6 +2407,12 @@ out:
 	return ret;
 }
 
+void vmx_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+	vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+			GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3214,21 +3223,14 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
-	int max_irr, tpr;
-
-	if (!vm_need_tpr_shadow(vcpu->kvm))
-		return;
-
-	if (!kvm_lapic_enabled(vcpu) ||
-	    ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
+	if (irr == -1 || tpr < irr) {
 		vmcs_write32(TPR_THRESHOLD, 0);
 		return;
 	}
 
-	tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
-	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
+	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
@@ -3300,55 +3302,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
-static void vmx_intr_inject(struct kvm_vcpu *vcpu)
-{
-	/* try to reinject previous events if any */
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		return;
-	}
-
-	if (vcpu->arch.interrupt.pending) {
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		return;
-	}
-
-	/* try to inject new event if pending */
-	if (vcpu->arch.nmi_pending) {
-		if (vmx_nmi_allowed(vcpu)) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-			vmx_inject_nmi(vcpu);
-		}
-	} else if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vmx_interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		}
-	}
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
-
-	update_tpr_threshold(vcpu);
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-				GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS);
-
-	vmx_intr_inject(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		enable_irq_window(vcpu);
-}
-
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
@@ -3683,9 +3636,15 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.patch_hypercall = vmx_patch_hypercall,
 	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
+	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
-	.inject_pending_irq = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
+	.nmi_allowed = vmx_nmi_allowed,
+	.enable_nmi_window = enable_nmi_window,
+	.enable_irq_window = enable_irq_window,
+	.update_cr8_intercept = update_cr8_intercept,
+	.drop_interrupt_shadow = vmx_drop_interrupt_shadow,
+
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask_shift = vmx_get_mt_mask_shift,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0890df9..96e995c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3114,6 +3114,68 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
 	up_read(&vcpu->kvm->slots_lock);
 }
 
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+	int max_irr, tpr;
+
+	if (!kvm_x86_ops->update_cr8_intercept)
+		return;
+
+	max_irr = kvm_lapic_find_highest_irr(vcpu);
+
+	if (max_irr != -1)
+		max_irr >>= 4;
+
+	tpr = kvm_lapic_get_cr8(vcpu);
+
+	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_irq(struct kvm_vcpu *vcpu)
+{
+	/* try to reinject previous events if any */
+	if (vcpu->arch.nmi_injected) {
+		kvm_x86_ops->set_nmi(vcpu);
+		return;
+	}
+
+	if (vcpu->arch.interrupt.pending) {
+		kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		return;
+	}
+
+	/* try to inject new event if pending */
+	if (vcpu->arch.nmi_pending) {
+		if (kvm_x86_ops->nmi_allowed(vcpu)) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+			kvm_x86_ops->set_nmi(vcpu);
+		}
+	} else if (kvm_cpu_has_interrupt(vcpu)) {
+		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		}
+	}
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		kvm_x86_ops->drop_interrupt_shadow(vcpu);
+
+	inject_irq(vcpu);
+
+	/* enable NMI/IRQ window open exits if needed */
+	if (vcpu->arch.nmi_pending)
+		kvm_x86_ops->enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		kvm_x86_ops->enable_irq_window(vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
@@ -3172,9 +3234,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else
-		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
+		inject_pending_irq(vcpu, kvm_run);
 
-	kvm_lapic_sync_to_vapic(vcpu);
+	if (kvm_lapic_enabled(vcpu)) {
+		if (!vcpu->arch.apic->vapic_addr)
+			update_cr8_intercept(vcpu);
+		else
+			kvm_lapic_sync_to_vapic(vcpu);
+	}
 
 	up_read(&vcpu->kvm->slots_lock);
 
-- 
cgit v0.10.2


From 16d7a191170f0ca48c2c3277017b3e6d275e0711 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:10 +0300
Subject: KVM: Fix userspace IRQ chip migration

Re-put pending IRQ vector into interrupt_bitmap before migration.
Otherwise it will be lost if migration happens in the wrong time.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96e995c..6391721 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3566,17 +3566,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->efer = vcpu->arch.shadow_efer;
 	sregs->apic_base = kvm_get_apic_base(vcpu);
 
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (irqchip_in_kernel(vcpu->kvm))
 		memset(sregs->interrupt_bitmap, 0,
 		       sizeof sregs->interrupt_bitmap);
-		pending_vec = kvm_x86_ops->get_irq(vcpu);
-		if (pending_vec >= 0)
-			set_bit(pending_vec,
-				(unsigned long *)sregs->interrupt_bitmap);
-	} else
+	else
 		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
 		       sizeof sregs->interrupt_bitmap);
 
+	pending_vec = kvm_x86_ops->get_irq(vcpu);
+	if (pending_vec >= 0)
+		set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+
 	vcpu_put(vcpu);
 
 	return 0;
-- 
cgit v0.10.2


From 14d0bc1f7c8226d5088e7182c3b53e0c7e91d1af Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:11 +0300
Subject: KVM: Get rid of get_irq() callback

It just returns pending IRQ vector from the queue for VMX/SVM.
Get IRQ directly from the queue before migration and put it back
after.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd9ecd3..3e94d05 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -510,7 +510,6 @@ struct kvm_x86_ops {
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	int (*get_irq)(struct kvm_vcpu *vcpu);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1807288..d96a6d3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -966,13 +966,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	return 0;
 }
 
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.interrupt.pending)
-		return -1;
-	return vcpu->arch.interrupt.nr;
-}
-
 static void load_host_msrs(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
@@ -2647,7 +2640,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = svm_patch_hypercall,
-	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bad2413..25be53a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,13 +1045,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	return 0;
 }
 
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.interrupt.pending)
-		return -1;
-	return vcpu->arch.interrupt.nr;
-}
-
 static __init int cpu_has_kvm_support(void)
 {
 	return cpu_has_vmx();
@@ -3634,7 +3627,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = vmx_patch_hypercall,
-	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6391721..0f3e04b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3536,7 +3536,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
 	struct descriptor_table dt;
-	int pending_vec;
 
 	vcpu_load(vcpu);
 
@@ -3573,9 +3572,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
 		       sizeof sregs->interrupt_bitmap);
 
-	pending_vec = kvm_x86_ops->get_irq(vcpu);
-	if (pending_vec >= 0)
-		set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+	if (vcpu->arch.interrupt.pending)
+		set_bit(vcpu->arch.interrupt.nr,
+			(unsigned long *)sregs->interrupt_bitmap);
 
 	vcpu_put(vcpu);
 
@@ -4097,9 +4096,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 			max_bits);
 		/* Only pending external irq is handled here */
 		if (pending_vec < max_bits) {
-			kvm_x86_ops->set_irq(vcpu, pending_vec);
-			pr_debug("Set back pending irq %d\n",
-				 pending_vec);
+			kvm_queue_interrupt(vcpu, pending_vec);
+			pr_debug("Set back pending irq %d\n", pending_vec);
 		}
 		kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
-- 
cgit v0.10.2


From 9b62e5b10ff0f98346bcbe4a4fe3a0ca8fa7be30 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Tue, 7 Apr 2009 23:58:56 +0000
Subject: KVM: Wake up waitqueue before calling get_cpu()

This moves the get_cpu() call down to be called after we wake up the
waiters. Therefore the waitqueue locks can safely be rt mutex.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Sven-Thorsten Dietrich <sven@thebigcorporation.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0f3e04b..e271371 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4569,7 +4569,7 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int ipi_pcpu = vcpu->cpu;
-	int cpu = get_cpu();
+	int cpu;
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
@@ -4579,6 +4579,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	 * We may be called synchronously with irqs disabled in guest mode,
 	 * So need not to call smp_call_function_single() in that case.
 	 */
+	cpu = get_cpu();
 	if (vcpu->guest_mode && vcpu->cpu != cpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
 	put_cpu();
-- 
cgit v0.10.2


From 4b12f0de33a64dfc624b2480f55b674f7fa23ef2 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 27 Apr 2009 20:35:42 +0800
Subject: KVM: Replace get_mt_mask_shift with get_mt_mask

Shadow_mt_mask is out of date, now it have only been used as a flag to indicate
if TDP enabled. Get rid of it and use tdp_enabled instead.

Also put memory type logical in kvm_x86_ops->get_mt_mask().

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3e94d05..8a6f6b6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -522,7 +522,7 @@ struct kvm_x86_ops {
 	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-	int (*get_mt_mask_shift)(void);
+	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -536,7 +536,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -550,6 +550,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3592aea..bc614f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,7 +178,6 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mt_mask;
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -199,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
-	shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -1608,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
 	return mtrr_state->def_type;
 }
 
-static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	u8 mtrr;
 
@@ -1618,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 		mtrr = MTRR_TYPE_WRBACK;
 	return mtrr;
 }
+EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -1670,7 +1669,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 {
 	u64 spte;
 	int ret = 0;
-	u64 mt_mask = shadow_mt_mask;
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1690,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		spte |= shadow_user_mask;
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (mt_mask) {
-		if (!kvm_is_mmio_pfn(pfn)) {
-			mt_mask = get_memory_type(vcpu, gfn) <<
-				kvm_x86_ops->get_mt_mask_shift();
-			mt_mask |= VMX_EPT_IGMT_BIT;
-		} else
-			mt_mask = MTRR_TYPE_UNCACHABLE <<
-				kvm_x86_ops->get_mt_mask_shift();
-		spte |= mt_mask;
-	}
+	if (tdp_enabled)
+		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+			kvm_is_mmio_pfn(pfn));
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d96a6d3..6350378 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2589,7 +2589,7 @@ static int get_npt_level(void)
 #endif
 }
 
-static int svm_get_mt_mask_shift(void)
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
 }
@@ -2652,7 +2652,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
-	.get_mt_mask_shift = svm_get_mt_mask_shift,
+	.get_mt_mask = svm_get_mt_mask,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25be53a..59b080c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3577,9 +3577,17 @@ static int get_ept_level(void)
 	return VMX_EPT_DEFAULT_GAW + 1;
 }
 
-static int vmx_get_mt_mask_shift(void)
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	return VMX_EPT_MT_EPTE_SHIFT;
+	u64 ret;
+
+	if (is_mmio)
+		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+	else
+		ret = (kvm_get_guest_memory_type(vcpu, gfn) <<
+			VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IGMT_BIT;
+
+	return ret;
 }
 
 static struct kvm_x86_ops vmx_x86_ops = {
@@ -3639,7 +3647,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
-	.get_mt_mask_shift = vmx_get_mt_mask_shift,
+	.get_mt_mask = vmx_get_mt_mask,
 };
 
 static int __init vmx_init(void)
@@ -3698,8 +3706,7 @@ static int __init vmx_init(void)
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-				VMX_EPT_EXECUTABLE_MASK,
-				VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e271371..dd05682 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2772,7 +2772,7 @@ int kvm_arch_init(void *opaque)
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-			PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
 	for_each_possible_cpu(cpu)
 		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-- 
cgit v0.10.2


From 522c68c4416de3cd3e11a9ff10d58e776a69ae1e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 27 Apr 2009 20:35:43 +0800
Subject: KVM: Enable snooping control for supported hardware

Memory aliases with different memory type is a problem for guest. For the guest
without assigned device, the memory type of guest memory would always been the
same as host(WB); but for the assigned device, some part of memory may be used
as DMA and then set to uncacheable memory type(UC/WC), which would be a conflict of
host memory type then be a potential issue.

Snooping control can guarantee the cache correctness of memory go through the
DMA engine of VT-d.

[avi: fix build on ia64]

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 589536f..5f43697 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -474,6 +474,7 @@ struct kvm_arch {
 
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct hlist_head irq_ack_notifier_list;
 
 	unsigned long irq_sources_bitmap;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a6f6b6..253d8f6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -393,6 +393,7 @@ struct kvm_arch{
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 59b080c..e8a5649 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3581,11 +3581,26 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	u64 ret;
 
+	/* For VT-d and EPT combination
+	 * 1. MMIO: always map as UC
+	 * 2. EPT with VT-d:
+	 *   a. VT-d without snooping control feature: can't guarantee the
+	 *	result, try to trust guest.
+	 *   b. VT-d with snooping control feature: snooping control feature of
+	 *	VT-d engine can guarantee the cache correctness. Just set it
+	 *	to WB to keep consistent with host. So the same as item 3.
+	 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+	 *    consistent with host MTRR
+	 */
 	if (is_mmio)
 		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+	else if (vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+		ret = kvm_get_guest_memory_type(vcpu, gfn) <<
+		      VMX_EPT_MT_EPTE_SHIFT;
 	else
-		ret = (kvm_get_guest_memory_type(vcpu, gfn) <<
-			VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IGMT_BIT;
+		ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
+			| VMX_EPT_IGMT_BIT;
 
 	return ret;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 72d5684..bdce8e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -367,6 +367,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+/* For vcpu->arch.iommu_flags */
+#define KVM_IOMMU_CACHE_COHERENCY	0x1
+
 #ifdef CONFIG_IOMMU_API
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
 			unsigned long npages);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4c40375..1514758 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 	pfn_t pfn;
 	int i, r = 0;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	int flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
+	flags = IOMMU_READ | IOMMU_WRITE;
+	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+		flags |= IOMMU_CACHE;
+
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
 		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
@@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 		r = iommu_map_range(domain,
 				    gfn_to_gpa(gfn),
 				    pfn_to_hpa(pfn),
-				    PAGE_SIZE,
-				    IOMMU_READ | IOMMU_WRITE);
+				    PAGE_SIZE, flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
@@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm,
 {
 	struct pci_dev *pdev = NULL;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int r;
+	int r, last_flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
@@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm,
 		return r;
 	}
 
+	last_flags = kvm->arch.iommu_flags;
+	if (iommu_domain_has_cap(kvm->arch.iommu_domain,
+				 IOMMU_CAP_CACHE_COHERENCY))
+		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+
+	/* Check if need to update IOMMU page table for guest memory */
+	if ((last_flags ^ kvm->arch.iommu_flags) ==
+			KVM_IOMMU_CACHE_COHERENCY) {
+		kvm_iommu_unmap_memslots(kvm);
+		r = kvm_iommu_map_memslots(kvm);
+		if (r)
+			goto out_unmap;
+	}
+
 	printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
 		assigned_dev->host_busnr,
 		PCI_SLOT(assigned_dev->host_devfn),
 		PCI_FUNC(assigned_dev->host_devfn));
 
 	return 0;
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
 }
 
 int kvm_deassign_device(struct kvm *kvm,
-- 
cgit v0.10.2


From 8e1c18157d8772e65247ac55525a05a26b83fffe Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 29 Apr 2009 11:09:04 +0800
Subject: KVM: VMX: Disable VMX when system shutdown

Intel TXT(Trusted Execution Technology) required VMX off for all cpu to work
when system shutdown.

CC: Joseph Cihula <joseph.cihula@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 63d5fa2..29c0afb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2391,15 +2391,15 @@ EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 		      void *v)
 {
-	if (val == SYS_RESTART) {
-		/*
-		 * Some (well, at least mine) BIOSes hang on reboot if
-		 * in vmx root mode.
-		 */
-		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-		kvm_rebooting = true;
-		on_each_cpu(hardware_disable, NULL, 1);
-	}
+	/*
+	 * Some (well, at least mine) BIOSes hang on reboot if
+	 * in vmx root mode.
+	 *
+	 * And Intel TXT required VMX off for all cpu when system shutdown.
+	 */
+	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+	kvm_rebooting = true;
+	on_each_cpu(hardware_disable, NULL, 1);
 	return NOTIFY_OK;
 }
 
-- 
cgit v0.10.2


From efbc100c20f9dc4a219a2a92763d68e4064109f2 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Fri, 1 May 2009 14:15:43 -0700
Subject: KVM: Trivial format fix in setup_routing_entry()

Remove extra tab.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 4fa1f60..a8bd466 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -271,7 +271,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
 			delta = 8;
 			break;
 		case KVM_IRQCHIP_IOAPIC:
-				e->set = kvm_set_ioapic_irq;
+			e->set = kvm_set_ioapic_irq;
 			break;
 		default:
 			goto out;
-- 
cgit v0.10.2


From 9b5843ddd20557b77c73ddbcc814a8c816bf3d3a Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Wed, 29 Apr 2009 17:29:09 -0400
Subject: KVM: fix apic_debug instances

Apparently nobody turned this on in a while...
setting apic_debug to something compilable, generates
some errors. This patch fixes it.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d76bb6..ae99d83 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -445,7 +445,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-		   icr_high, icr_low, irq.shorthand, irq.dest,
+		   icr_high, icr_low, irq.shorthand, irq.dest_id,
 		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
 		   irq.vector);
 
@@ -560,7 +560,7 @@ static void update_divide_count(struct kvm_lapic *apic)
 	apic->divide_count = 0x1 << (tmp2 & 0x7);
 
 	apic_debug("timer divide count is 0x%x\n",
-				   apic->lapic_timer.divide_count);
+				   apic->divide_count);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
-- 
cgit v0.10.2


From b586eb0253083795e58dcbe76665410d4676dc08 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 28 Apr 2009 12:45:43 +0200
Subject: KVM: SVM: Fix cross vendor migration issue in segment segment
 descriptor

On AMD CPUs sometimes the DB bit in the stack segment
descriptor is left as 1, although the whole segment has
been made unusable. Clear it here to pass an Intel VMX
entry check when cross vendor migrating.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6350378..e3ea982 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -803,6 +803,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 		if (!var->unusable)
 			var->type |= 0x1;
 		break;
+	case VCPU_SREG_SS:
+		/* On AMD CPUs sometimes the DB bit in the segment
+		 * descriptor is left as 1, although the whole segment has
+		 * been made unusable. Clear it here to pass an Intel VMX
+		 * entry check when cross vendor migrating.
+		 */
+		if (var->unusable)
+			var->db = 0;
+		break;
 	}
 }
 
-- 
cgit v0.10.2


From fe8e7f83de3f0979f13f57992cb33e4a8041272d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 23 Apr 2009 17:03:48 +0300
Subject: KVM: SVM: Don't reinject event that caused a task switch

If a task switch caused by an event remove it from the event queue.
VMX already does that.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e3ea982..f994c6d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1811,6 +1811,10 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	int int_type = svm->vmcb->control.exit_int_info &
 		SVM_EXITINTINFO_TYPE_MASK;
 	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+	uint32_t type =
+		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+	uint32_t idt_v =
+		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -1820,11 +1824,26 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	else if (svm->vmcb->control.exit_info_2 &
 		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
 		reason = TASK_SWITCH_JMP;
-	else if (svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID)
+	else if (idt_v)
 		reason = TASK_SWITCH_GATE;
 	else
 		reason = TASK_SWITCH_CALL;
 
+	if (reason == TASK_SWITCH_GATE) {
+		switch (type) {
+		case SVM_EXITINTINFO_TYPE_NMI:
+			svm->vcpu.arch.nmi_injected = false;
+			break;
+		case SVM_EXITINTINFO_TYPE_EXEPT:
+			kvm_clear_exception_queue(&svm->vcpu);
+			break;
+		case SVM_EXITINTINFO_TYPE_INTR:
+			kvm_clear_interrupt_queue(&svm->vcpu);
+			break;
+		default:
+			break;
+		}
+	}
 
 	if (reason != TASK_SWITCH_GATE ||
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
@@ -2203,7 +2222,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
 	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-	    exit_code != SVM_EXIT_NPF)
+	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __func__, svm->vmcb->control.exit_int_info,
-- 
cgit v0.10.2


From d6a8c875f35a6e1b3fb3f21e93eabb183b1f39ee Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 20 Apr 2009 18:10:07 +0200
Subject: KVM: Drop request_nmi from stats

The stats entry request_nmi is no longer used as the related user space
interface was dropped. So clean it up.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 253d8f6..ab7de4a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -441,7 +441,6 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
-	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd05682..a0faa48 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
-- 
cgit v0.10.2


From 8d753f369bd28fff1706ffe9fb9fea4fd88cf85b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 11:41:39 +0300
Subject: KVM: Fix cpuid feature misreporting

MTRR, PAT, MCE, and MCA are all supported (to some extent) but not reported.
Vista requires these features, so if userspace relies on kernel cpuid
reporting, it loses support for Vista.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0faa48..da3e65b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1254,9 +1254,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
 		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
 		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+		bit(X86_FEATURE_MCE) |
 		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_MTRR) |
+		bit(X86_FEATURE_PGE) | bit(X86_FEATURE_MCA) |
+		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PAT) |
+		bit(X86_FEATURE_PSE36) |
 		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
 		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
 		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-- 
cgit v0.10.2


From 7faa4ee1c777a5f8e8373430cfd9cb6172aa3503 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 13:55:35 +0300
Subject: KVM: Add AMD cpuid bit: cr8_legacy, abm, misaligned sse, sse4, 3dnow
 prefetch

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da3e65b..95de3d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1247,44 +1247,47 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	entry->flags = 0;
 }
 
+#define F(x) bit(X86_FEATURE_##x)
+
 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			 u32 index, int *nent, int maxnent)
 {
-	const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
-		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-		bit(X86_FEATURE_MCE) |
-		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_MTRR) |
-		bit(X86_FEATURE_PGE) | bit(X86_FEATURE_MCA) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PAT) |
-		bit(X86_FEATURE_PSE36) |
-		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
-		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
-		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-	const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
-		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_PGE) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-		bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
-		bit(X86_FEATURE_SYSCALL) |
-		(is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
 #ifdef CONFIG_X86_64
-		bit(X86_FEATURE_LM) |
+	unsigned f_lm = F(LM);
+#else
+	unsigned f_lm = 0;
 #endif
-		bit(X86_FEATURE_FXSR_OPT) |
-		bit(X86_FEATURE_MMXEXT) |
-		bit(X86_FEATURE_3DNOWEXT) |
-		bit(X86_FEATURE_3DNOW);
-	const u32 kvm_supported_word3_x86_features =
-		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+	/* cpuid 1.edx */
+	const u32 kvm_supported_word0_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+		0 /* Reserved, DS, ACPI */ | F(MMX) |
+		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+		0 /* HTT, TM, Reserved, PBE */;
+	/* cpuid 0x80000001.edx */
+	const u32 kvm_supported_word1_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* Reserved */ |
+		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+		F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+	/* cpuid 1.ecx */
+	const u32 kvm_supported_word4_x86_features =
+		F(XMM3) | F(CX16);
+	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
-		bit(X86_FEATURE_SVM);
+		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+		0 /* SKINIT */ | 0 /* WDT */;
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -1297,7 +1300,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
-		entry->ecx &= kvm_supported_word3_x86_features;
+		entry->ecx &= kvm_supported_word4_x86_features;
 		break;
 	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
 	 * may return different values. This forces us to get_cpu() before
@@ -1359,6 +1362,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	put_cpu();
 }
 
+#undef F
+
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				     struct kvm_cpuid_entry2 __user *entries)
 {
-- 
cgit v0.10.2


From 069ebaa4644521e8e80b6849ace4dee53f93f55e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 14:37:56 +0300
Subject: x86: Add cpu features MOVBE and POPCNT

Add cpu feature bit support for the MOVBE and POPCNT instructions.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c..9c63bf3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -115,6 +115,8 @@
 #define X86_FEATURE_XMM4_1	(4*32+19) /* "sse4_1" SSE-4.1 */
 #define X86_FEATURE_XMM4_2	(4*32+20) /* "sse4_2" SSE-4.2 */
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
-- 
cgit v0.10.2


From d149c731e4f71982247a14409951259f36271dd7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 14:41:56 +0300
Subject: KVM: Update cpuid 1.ecx reporting

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 95de3d3..7592770 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1281,7 +1281,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(CX16);
+		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+		0 /* DS-CPL, VMX, SMX, EST */ |
+		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+		0 /* Reserved, XSAVE, OSXSAVE */;
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
-- 
cgit v0.10.2


From 32f8840064d88cc3f6e85203aec7b6b57bebcb97 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 7 May 2009 17:55:12 -0300
Subject: KVM: use smp_send_reschedule in kvm_vcpu_kick

KVM uses a function call IPI to cause the exit of a guest running on a
physical cpu. For virtual interrupt notification there is no need to
wait on IPI receival, or to execute any function.

This is exactly what the reschedule IPI does, without the overhead
of function IPI. So use it instead of smp_call_function_single in
kvm_vcpu_kick.

Also change the "guest_mode" variable to a bit in vcpu->requests, and
use that to collapse multiple IPI's that would be issued between the
first one and zeroing of guest mode.

This allows kvm_vcpu_kick to called with interrupts disabled.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index acc4d19..b448197 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -610,6 +610,9 @@ static struct irqaction ipi_irqaction = {
 	.name =		"IPI"
 };
 
+/*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
 static struct irqaction resched_irqaction = {
 	.handler =	dummy_handler,
 	.flags =	IRQF_DISABLED,
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index be4413e..80c57b0 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -668,7 +668,7 @@ again:
 	host_ctx = kvm_get_host_context(vcpu);
 	guest_ctx = kvm_get_guest_context(vcpu);
 
-	vcpu->guest_mode = 1;
+	clear_bit(KVM_REQ_KICK, &vcpu->requests);
 
 	r = kvm_vcpu_pre_transition(vcpu);
 	if (r < 0)
@@ -685,7 +685,7 @@ again:
 	kvm_vcpu_post_transition(vcpu);
 
 	vcpu->arch.launched = 1;
-	vcpu->guest_mode = 0;
+	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
 
 	/*
@@ -1879,24 +1879,18 @@ void kvm_arch_hardware_unsetup(void)
 {
 }
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG"vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-	int ipi_pcpu = vcpu->cpu;
-	int cpu = get_cpu();
+	int me;
+	int cpu = vcpu->cpu;
 
 	if (waitqueue_active(&vcpu->wq))
 		wake_up_interruptible(&vcpu->wq);
 
-	if (vcpu->guest_mode && cpu != ipi_pcpu)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+	me = get_cpu();
+	if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+			smp_send_reschedule(cpu);
 	put_cpu();
 }
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea..3b2e55e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
+	/*
+	 * KVM uses this interrupt to force a cpu out of guest mode
+	 */
 }
 
 void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7592770..3c4c327 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3230,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	local_irq_disable();
 
+	clear_bit(KVM_REQ_KICK, &vcpu->requests);
+	smp_mb__after_clear_bit();
+
 	if (vcpu->requests || need_resched() || signal_pending(current)) {
 		local_irq_enable();
 		preempt_enable();
@@ -3237,13 +3240,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
-	vcpu->guest_mode = 1;
-	/*
-	 * Make sure that guest_mode assignment won't happen after
-	 * testing the pending IRQ vector bitmap.
-	 */
-	smp_wmb();
-
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else
@@ -3288,7 +3284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	set_debugreg(vcpu->arch.host_dr6, 6);
 	set_debugreg(vcpu->arch.host_dr7, 7);
 
-	vcpu->guest_mode = 0;
+	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
 
 	++vcpu->stat.exits;
@@ -4571,30 +4567,20 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	       || vcpu->arch.nmi_pending;
 }
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-	int ipi_pcpu = vcpu->cpu;
-	int cpu;
+	int me;
+	int cpu = vcpu->cpu;
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	/*
-	 * We may be called synchronously with irqs disabled in guest mode,
-	 * So need not to call smp_call_function_single() in that case.
-	 */
-	cpu = get_cpu();
-	if (vcpu->guest_mode && vcpu->cpu != cpu)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+	me = get_cpu();
+	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+			smp_send_reschedule(cpu);
 	put_cpu();
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bdce8e1..1618162 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -38,6 +38,7 @@
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
 #define KVM_REQ_KVMCLOCK_UPDATE    8
+#define KVM_REQ_KICK               9
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -72,7 +73,6 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	int   cpu;
 	struct kvm_run *run;
-	int guest_mode;
 	unsigned long requests;
 	unsigned long guest_debug;
 	int fpu_active;
-- 
cgit v0.10.2


From 547de29e5b1662deb05b5f90917902dc0e9ac182 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 7 May 2009 17:55:13 -0300
Subject: KVM: protect assigned dev workqueue, int handler and irq acker

kvm_assigned_dev_ack_irq is vulnerable to a race condition with the
interrupt handler function. It does:

        if (dev->host_irq_disabled) {
                enable_irq(dev->host_irq);
                dev->host_irq_disabled = false;
        }

If an interrupt triggers before the host->dev_irq_disabled assignment,
it will disable the interrupt and set dev->host_irq_disabled to true.

On return to kvm_assigned_dev_ack_irq, dev->host_irq_disabled is set to
false, and the next kvm_assigned_dev_ack_irq call will fail to reenable
it.

Other than that, having the interrupt handler and work handlers run in
parallel sounds like asking for trouble (could not spot any obvious
problem, but better not have to, its fragile).

CC: sheng.yang@intel.com
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1618162..aacc544 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -345,6 +345,7 @@ struct kvm_assigned_dev_kernel {
 	int flags;
 	struct pci_dev *dev;
 	struct kvm *kvm;
+	spinlock_t assigned_dev_lock;
 };
 
 struct kvm_irq_mask_notifier {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29c0afb..687d113 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -42,6 +42,7 @@
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/bitops.h>
+#include <linux/spinlock.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -130,6 +131,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&kvm->lock);
+	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		struct kvm_guest_msix_entry *guest_entries =
 			assigned_dev->guest_msix_entries;
@@ -156,18 +158,21 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 		}
 	}
 
+	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
 	mutex_unlock(&assigned_dev->kvm->lock);
 }
 
 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 {
+	unsigned long flags;
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
+	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int index = find_index_from_host_irq(assigned_dev, irq);
 		if (index < 0)
-			return IRQ_HANDLED;
+			goto out;
 		assigned_dev->guest_msix_entries[index].flags |=
 			KVM_ASSIGNED_MSIX_PENDING;
 	}
@@ -177,6 +182,8 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	disable_irq_nosync(irq);
 	assigned_dev->host_irq_disabled = true;
 
+out:
+	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -184,6 +191,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_assigned_dev_kernel *dev;
+	unsigned long flags;
 
 	if (kian->gsi == -1)
 		return;
@@ -196,10 +204,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	/* The guest irq may be shared so this ack may be
 	 * from another device.
 	 */
+	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
 	if (dev->host_irq_disabled) {
 		enable_irq(dev->host_irq);
 		dev->host_irq_disabled = false;
 	}
+	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -615,6 +625,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->host_devfn = assigned_dev->devfn;
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
+	spin_lock_init(&match->assigned_dev_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-- 
cgit v0.10.2


From 2809f5d2c4cfad171167b131bb2a21ab65eba40f Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 12 May 2009 16:21:05 -0400
Subject: KVM: Replace ->drop_interrupt_shadow() by ->set_interrupt_shadow()

This patch replaces drop_interrupt_shadow with the more
general set_interrupt_shadow, that can either drop or raise
it, depending on its parameter.  It also adds ->get_interrupt_shadow()
for future use.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ab7de4a..16d1481 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -508,6 +508,8 @@ struct kvm_x86_ops {
 	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
@@ -519,7 +521,6 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a15973..be40d6e 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI     2
+
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f994c6d..8b5ffbd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info)
 	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u32 ret = 0;
+
+	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+		ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+	return ret & mask;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (mask == 0)
+		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	else
+		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 
 	kvm_rip_write(vcpu, svm->next_rip);
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	svm_set_interrupt_shadow(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2259,12 +2280,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
 		new_asid(svm, svm_data);
 }
 
-static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-}
-
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2667,6 +2682,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.set_interrupt_shadow = svm_set_interrupt_shadow,
+	.get_interrupt_shadow = svm_get_interrupt_shadow,
 	.patch_hypercall = svm_patch_hypercall,
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
@@ -2676,7 +2693,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
-	.drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649..f3ab27b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,23 +736,45 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	int ret = 0;
+
+	if (interruptibility & GUEST_INTR_STATE_STI)
+		ret |= X86_SHADOW_INT_STI;
+	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+		ret |= X86_SHADOW_INT_MOV_SS;
+
+	return ret & mask;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	u32 interruptibility = interruptibility_old;
+
+	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+	if (mask & X86_SHADOW_INT_MOV_SS)
+		interruptibility |= GUEST_INTR_STATE_MOV_SS;
+	if (mask & X86_SHADOW_INT_STI)
+		interruptibility |= GUEST_INTR_STATE_STI;
+
+	if ((interruptibility != interruptibility_old))
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	unsigned long rip;
-	u32 interruptibility;
 
 	rip = kvm_rip_read(vcpu);
 	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	kvm_rip_write(vcpu, rip);
 
-	/*
-	 * We emulated an instruction, so temporary interrupt blocking
-	 * should be removed, if set.
-	 */
-	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	if (interruptibility & 3)
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     interruptibility & ~3);
+	/* skipping an emulated instruction also counts */
+	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -2400,12 +2422,6 @@ out:
 	return ret;
 }
 
-void vmx_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-	vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-			GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
-}
-
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3649,6 +3665,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.set_interrupt_shadow = vmx_set_interrupt_shadow,
+	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
@@ -3658,7 +3676,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
-	.drop_interrupt_shadow = vmx_drop_interrupt_shadow,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4c327..7475b02 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3178,7 +3178,7 @@ static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_run->request_interrupt_window;
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		kvm_x86_ops->drop_interrupt_shadow(vcpu);
+		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
 
 	inject_irq(vcpu);
 
-- 
cgit v0.10.2


From 310b5d306c1aee7ebe32f702c0e33e7988d50646 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 12 May 2009 16:21:06 -0400
Subject: KVM: Deal with interrupt shadow state for emulated instructions

We currently unblock shadow interrupt state when we skip an instruction,
but failing to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms at vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index be40d6e..b7ed2c4 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -155,6 +155,9 @@ struct x86_emulate_ctxt {
 	int mode;
 	u32 cs_base;
 
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7475b02..48f744f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2379,7 +2379,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			u16 error_code,
 			int emulation_type)
 {
-	int r;
+	int r, shadow_mask;
 	struct decode_cache *c;
 
 	kvm_clear_exception_queue(vcpu);
@@ -2433,6 +2433,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+	if (r == 0)
+		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d2664fc..c1b6c23 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1361,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	return 0;
 }
 
+void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+	/*
+	 * an sti; sti; sequence only disable interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction is an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss
+	 */
+	if (!(int_shadow & mask))
+		ctxt->interruptibility = mask;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -1372,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int io_dir_in;
 	int rc = 0;
 
+	ctxt->interruptibility = 0;
+
 	/* Shadow copy of register state. Committed on successful emulation.
 	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
 	 * modify them.
@@ -1618,6 +1634,9 @@ special_insn:
 		int err;
 
 		sel = c->src.val;
+		if (c->modrm_reg == VCPU_SREG_SS)
+			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
 		if (c->modrm_reg <= 5) {
 			type_bits = (c->modrm_reg == 1) ? 9 : 1;
 			err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1847,6 +1866,7 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfb: /* sti */
+		toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
 		ctxt->eflags |= X86_EFLAGS_IF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
-- 
cgit v0.10.2


From 7c8a83b75a38a807d37f5a4398eca2a42c8cf513 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:43 -0300
Subject: KVM: MMU: protect kvm_mmu_change_mmu_pages with mmu_lock

kvm_handle_hva, called by MMU notifiers, manipulates mmu data only with
the protection of mmu_lock.

Update kvm_mmu_change_mmu_pages callers to take mmu_lock, thus protecting
against kvm_handle_hva.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bc614f9..3ce60ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2723,7 +2723,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
 
-	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
 		u64 *pt;
@@ -2738,7 +2737,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
 	kvm_flush_remote_tlbs(kvm);
-	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48f744f..d2a4eca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1625,10 +1625,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 		return -EINVAL;
 
 	down_write(&kvm->slots_lock);
+	spin_lock(&kvm->mmu_lock);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
+	spin_unlock(&kvm->mmu_lock);
 	up_write(&kvm->slots_lock);
 	return 0;
 }
@@ -1804,7 +1806,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
+		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
+		spin_unlock(&kvm->mmu_lock);
 		kvm_flush_remote_tlbs(kvm);
 		memslot = &kvm->memslots[log->slot];
 		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -4548,12 +4552,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		}
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (!kvm->arch.n_requested_mmu_pages) {
 		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 	}
 
 	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	spin_unlock(&kvm->mmu_lock);
 	kvm_flush_remote_tlbs(kvm);
 
 	return 0;
-- 
cgit v0.10.2


From b43b1901ad282aeb74161837fb403927102687a1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:44 -0300
Subject: KVM: take mmu_lock when updating a deleted slot

kvm_handle_hva relies on mmu_lock protection to safely access
the memslot structures.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 687d113..5fed9bf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1210,8 +1210,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	kvm_free_physmem_slot(&old, npages ? &new : NULL);
 	/* Slot deletion case: we have to update the current slot */
+	spin_lock(&kvm->mmu_lock);
 	if (!npages)
 		*memslot = old;
+	spin_unlock(&kvm->mmu_lock);
 #ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	r = kvm_iommu_map_pages(kvm, base_gfn, npages);
-- 
cgit v0.10.2


From 8986ecc0ef58c96eec48d8502c048f3ab67fd8e2 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:45 -0300
Subject: KVM: x86: check for cr3 validity in mmu_alloc_roots

Verify the cr3 address stored in vcpu->arch.cr3 points to an existant
memslot. If not, inject a triple fault.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ce60ad..5c3d6e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1912,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+	int ret = 0;
+
+	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	gfn_t root_gfn;
@@ -1927,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
 			direct = 1;
+		if (mmu_check_root(vcpu, root_gfn))
+			return 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.root_hpa = root;
-		return;
+		return 0;
 	}
 	direct = !is_paging(vcpu);
 	if (tdp_enabled)
@@ -1950,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
+		if (mmu_check_root(vcpu, root_gfn))
+			return 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -1958,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	return 0;
 }
 
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -1976,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
-		if (root) {
+		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
 			mmu_sync_children(vcpu, sp);
@@ -2311,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 		goto out;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	mmu_alloc_roots(vcpu);
+	r = mmu_alloc_roots(vcpu);
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (r)
+		goto out;
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d2a4eca..3244437 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4568,6 +4568,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 	kvm_mmu_zap_all(kvm);
+	kvm_reload_remote_mmus(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 58f8ac279a8c46eb0a3193edd521ed2e41c4f914 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Tue, 12 May 2009 13:44:06 -0700
Subject: KVM: Expand on "help" info to specify kvm intel and amd module names

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504e..8600a09 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
 
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-intel.
+
 config KVM_AMD
 	tristate "KVM for AMD processors support"
 	depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
 
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-amd.
+
 config KVM_TRACE
 	bool "KVM trace support"
 	depends on KVM && SYSFS
-- 
cgit v0.10.2


From 2668dab794272f0898491acaf1e77e9a005abc0f Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:48 +0200
Subject: KVM: s390: Fix memory slot versus run - v3

This patch fixes an incorrectness in the kvm backend for s390.
In case virtual cpus are being created before the corresponding
memory slot is being registered, we need to update the sie
control blocks for the virtual cpus.

*updates in v3*
In consideration of the s390 memslot constraints locking was changed
to trylock. These locks should never be held, as vcpu's can't run without
the single memslot we just assign when running this code. To ensure this
never deadlocks in case other code changes the code uses trylocks and bail
out if it can't get all locks.

Additionally most of the discussed special conditions for s390 like
only one memslot and no user_alloc are now checked for validity in
kvm_arch_set_memory_region.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f4d56e9..86567e1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -657,6 +657,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot old,
 				int user_alloc)
 {
+	int i;
+
 	/* A few sanity checks. We can have exactly one memory slot which has
 	   to start at guest virtual zero and which has to be located at a
 	   page boundary in userland and which has to end at a page boundary.
@@ -664,7 +666,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	   vmas. It is okay to mmap() and munmap() stuff in this slot after
 	   doing this call at any time */
 
-	if (mem->slot)
+	if (mem->slot || kvm->arch.guest_memsize)
 		return -EINVAL;
 
 	if (mem->guest_phys_addr)
@@ -676,15 +678,39 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		return -EINVAL;
 
+	if (!user_alloc)
+		return -EINVAL;
+
+	/* lock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (!kvm->vcpus[i])
+			continue;
+		if (!mutex_trylock(&kvm->vcpus[i]->mutex))
+			goto fail_out;
+	}
+
 	kvm->arch.guest_origin = mem->userspace_addr;
 	kvm->arch.guest_memsize = mem->memory_size;
 
-	/* FIXME: we do want to interrupt running CPUs and update their memory
-	   configuration now to avoid race conditions. But hey, changing the
-	   memory layout while virtual CPUs are running is usually bad
-	   programming practice. */
+	/* update sie control blocks, and unlock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm->vcpus[i]->arch.sie_block->gmsor =
+				kvm->arch.guest_origin;
+			kvm->vcpus[i]->arch.sie_block->gmslm =
+				kvm->arch.guest_memsize +
+				kvm->arch.guest_origin +
+				VIRTIODESCSPACE - 1ul;
+			mutex_unlock(&kvm->vcpus[i]->mutex);
+		}
+	}
 
 	return 0;
+
+fail_out:
+	for (; i >= 0; i--)
+		mutex_unlock(&kvm->vcpus[i]->mutex);
+	return -EINVAL;
 }
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
-- 
cgit v0.10.2


From ca8723023f25c9a70d76cbd6101f8fb4ffec2fa0 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 12 May 2009 17:21:49 +0200
Subject: KVM: s390: use hrtimer for clock wakeup from idle - v2

This patch reworks the s390 clock comparator wakeup to hrtimer. The clock
comparator is a per-cpu value that is compared against the TOD clock. If
ckc <= TOD an external interrupt 1004 is triggered. Since the clock comparator
and the TOD clock have a much higher resolution than jiffies we should use
hrtimers to trigger the wakeup. This speeds up guest nanosleep for small
values.

Since hrtimers callbacks run in hard-irq context, I added a tasklet to do
the actual work with enabled interrupts.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 54ea39f..a27d0d5 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -13,6 +13,8 @@
 
 #ifndef ASM_KVM_HOST_H
 #define ASM_KVM_HOST_H
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <asm/debug.h>
 #include <asm/cpuid.h>
@@ -210,7 +212,8 @@ struct kvm_vcpu_arch {
 	s390_fp_regs      guest_fpregs;
 	unsigned int      guest_acrs[NUM_ACRS];
 	struct kvm_s390_local_interrupt local_int;
-	struct timer_list ckc_timer;
+	struct hrtimer    ckc_timer;
+	struct tasklet_struct tasklet;
 	union  {
 		cpuid_t	  cpu_id;
 		u64	  stidp_data;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 4ed4c3a..a48830f 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -12,6 +12,8 @@
 
 #include <asm/lowcore.h>
 #include <asm/uaccess.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <linux/signal.h>
 #include "kvm-s390.h"
@@ -361,12 +363,10 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	sltime = (vcpu->arch.sie_block->ckc - now) / (0xf4240000ul / HZ) + 1;
+	sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9;
 
-	vcpu->arch.ckc_timer.expires = jiffies + sltime;
-
-	add_timer(&vcpu->arch.ckc_timer);
-	VCPU_EVENT(vcpu, 5, "enabled wait timer:%llx jiffies", sltime);
+	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
+	VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
 	spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
@@ -389,21 +389,34 @@ no_timer:
 	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
 	spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
-	del_timer(&vcpu->arch.ckc_timer);
+	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
 	return 0;
 }
 
-void kvm_s390_idle_wakeup(unsigned long data)
+void kvm_s390_tasklet(unsigned long parm)
 {
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm;
 
-	spin_lock_bh(&vcpu->arch.local_int.lock);
+	spin_lock(&vcpu->arch.local_int.lock);
 	vcpu->arch.local_int.timer_due = 1;
 	if (waitqueue_active(&vcpu->arch.local_int.wq))
 		wake_up_interruptible(&vcpu->arch.local_int.wq);
-	spin_unlock_bh(&vcpu->arch.local_int.lock);
+	spin_unlock(&vcpu->arch.local_int.lock);
 }
 
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+	tasklet_schedule(&vcpu->arch.tasklet);
+
+	return HRTIMER_NORESTART;
+}
 
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 86567e1..dc3d068 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
@@ -283,8 +284,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
 	vcpu->arch.sie_block->ecb   = 2;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
-	setup_timer(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup,
-		 (unsigned long) vcpu);
+	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
+		     (unsigned long) vcpu);
+	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 	get_cpu_id(&vcpu->arch.cpu_id);
 	vcpu->arch.cpu_id.version = 0xff;
 	return 0;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 00bbe69..748fee8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -14,6 +14,7 @@
 #ifndef ARCH_S390_KVM_S390_H
 #define ARCH_S390_KVM_S390_H
 
+#include <linux/hrtimer.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 
@@ -41,7 +42,8 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
 }
 
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
-void kvm_s390_idle_wakeup(unsigned long data);
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
+void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 int kvm_s390_inject_vm(struct kvm *kvm,
 		struct kvm_s390_interrupt *s390int);
-- 
cgit v0.10.2


From b037a4f34ec51b6c8ccb352a04056c04a4bfc269 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 12 May 2009 17:21:50 +0200
Subject: KVM: s390: optimize float int lock: spin_lock_bh --> spin_lock

The floating interrupt lock is only taken in process context. We can
replace all spin_lock_bh with standard spin_lock calls.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index a48830f..f04f530 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -301,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	}
 
 	if ((!rc) && atomic_read(&fi->active)) {
-		spin_lock_bh(&fi->lock);
+		spin_lock(&fi->lock);
 		list_for_each_entry(inti, &fi->list, list)
 			if (__interrupt_is_deliverable(vcpu, inti)) {
 				rc = 1;
 				break;
 			}
-		spin_unlock_bh(&fi->lock);
+		spin_unlock(&fi->lock);
 	}
 
 	if ((!rc) && (vcpu->arch.sie_block->ckc <
@@ -368,7 +368,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
 	VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
-	spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+	spin_lock(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
 	add_wait_queue(&vcpu->arch.local_int.wq, &wait);
 	while (list_empty(&vcpu->arch.local_int.list) &&
@@ -377,18 +377,18 @@ no_timer:
 		!signal_pending(current)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_bh(&vcpu->arch.local_int.lock);
-		spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_unlock(&vcpu->arch.local_int.float_int->lock);
 		vcpu_put(vcpu);
 		schedule();
 		vcpu_load(vcpu);
-		spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_lock(&vcpu->arch.local_int.float_int->lock);
 		spin_lock_bh(&vcpu->arch.local_int.lock);
 	}
 	__unset_cpu_idle(vcpu);
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
-	spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+	spin_unlock(&vcpu->arch.local_int.float_int->lock);
 	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
 	return 0;
 }
@@ -455,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 	if (atomic_read(&fi->active)) {
 		do {
 			deliver = 0;
-			spin_lock_bh(&fi->lock);
+			spin_lock(&fi->lock);
 			list_for_each_entry_safe(inti, n, &fi->list, list) {
 				if (__interrupt_is_deliverable(vcpu, inti)) {
 					list_del(&inti->list);
@@ -466,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 			}
 			if (list_empty(&fi->list))
 				atomic_set(&fi->active, 0);
-			spin_unlock_bh(&fi->lock);
+			spin_unlock(&fi->lock);
 			if (deliver) {
 				__do_deliver_interrupt(vcpu, inti);
 				kfree(inti);
@@ -531,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 
 	mutex_lock(&kvm->lock);
 	fi = &kvm->arch.float_int;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	list_add_tail(&inti->list, &fi->list);
 	atomic_set(&fi->active, 1);
 	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
@@ -548,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 	if (waitqueue_active(&li->wq))
 		wake_up_interruptible(&li->wq);
 	spin_unlock_bh(&li->lock);
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	mutex_unlock(&kvm->lock);
 	return 0;
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index dc3d068..36c654d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -318,11 +318,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-	spin_lock_bh(&kvm->arch.float_int.lock);
+	spin_lock(&kvm->arch.float_int.lock);
 	kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
 	init_waitqueue_head(&vcpu->arch.local_int.wq);
 	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
-	spin_unlock_bh(&kvm->arch.float_int.lock);
+	spin_unlock(&kvm->arch.float_int.lock);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);
 	if (rc)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4b88834..93ecd06 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -204,11 +204,11 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 	int cpus = 0;
 	int n;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	for (n = 0; n < KVM_MAX_VCPUS; n++)
 		if (fi->local_int[n])
 			cpus++;
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 
 	/* deal with other level 3 hypervisors */
 	if (stsi(mem, 3, 2, 2) == -ENOSYS)
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index f27dbed..3667883 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -52,7 +52,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 	if (cpu_addr >= KVM_MAX_VCPUS)
 		return 3; /* not operational */
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	if (fi->local_int[cpu_addr] == NULL)
 		rc = 3; /* not operational */
 	else if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
@@ -64,7 +64,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 		*reg |= SIGP_STAT_STOPPED;
 		rc = 1; /* status stored */
 	}
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 
 	VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
 	return rc;
@@ -86,7 +86,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 
 	inti->type = KVM_S390_INT_EMERGENCY;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -102,7 +102,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 	return rc;
 }
@@ -123,7 +123,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 
 	inti->type = KVM_S390_SIGP_STOP;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -142,7 +142,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
 	return rc;
 }
@@ -188,7 +188,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 	if (!inti)
 		return 2; /* busy */
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 
 	if ((cpu_addr >= KVM_MAX_VCPUS) || (li == NULL)) {
@@ -220,7 +220,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 out_li:
 	spin_unlock_bh(&li->lock);
 out_fi:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	return rc;
 }
 
-- 
cgit v0.10.2


From abf4a71ed95ff29d696bf04633958b2068ed2e0b Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:51 +0200
Subject: KVM: s390: Unlink vcpu on destroy - v2

This patch makes sure we do unlink a vcpu's sie control block
from the system control area in kvm_arch_vcpu_destroy. This
prevents illegal accesses to the sie control block from other
virtual cpus after free.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 36c654d..628494a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -196,6 +196,10 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
+		(__u64) vcpu->arch.sie_block)
+		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
+	smp_mb();
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
@@ -310,8 +314,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 	vcpu->arch.sie_block->icpua = id;
 	BUG_ON(!kvm->arch.sca);
-	BUG_ON(kvm->arch.sca->cpu[id].sda);
-	kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	if (!kvm->arch.sca->cpu[id].sda)
+		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	else
+		BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 
-- 
cgit v0.10.2


From 3edbcff9bfe2ed632e518b3cfe807d062cee8269 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:52 +0200
Subject: KVM: s390: Sanity check on validity intercept

This patch adds a sanity check for the content of the guest
prefix register content before faulting in the cpu lowcore
that it refers to. The guest might end up in an endless loop
where SIE complains about missing lowcore with incorrect
content of the prefix register without this fix.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9d19803..98997cc 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
 	int viwhy = vcpu->arch.sie_block->ipb >> 16;
+	int rc;
+
 	vcpu->stat.exit_validity++;
-	if (viwhy == 0x37) {
-		fault_in_pages_writeable((char __user *)
-					 vcpu->kvm->arch.guest_origin +
-					 vcpu->arch.sie_block->prefix,
-					 PAGE_SIZE);
-		return 0;
-	}
-	VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
-		   viwhy);
-	return -ENOTSUPP;
+	if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
+		<= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){
+		rc = fault_in_pages_writeable((char __user *)
+			 vcpu->kvm->arch.guest_origin +
+			 vcpu->arch.sie_block->prefix,
+			 2*PAGE_SIZE);
+		if (rc)
+			/* user will receive sigsegv, exit to user */
+			rc = -ENOTSUPP;
+	} else
+		rc = -ENOTSUPP;
+
+	if (rc)
+		VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
+			   viwhy);
+	return rc;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 51e4d5ab28c75d819b3840da11559ad5c1429135 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:53 +0200
Subject: KVM: s390: Verify memory in kvm run

This check verifies that the guest we're trying to run in KVM_RUN
has some memory assigned to it. It enters an endless exception
loop if this is not the case.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 628494a..10bccd1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -487,6 +487,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	vcpu_load(vcpu);
 
+	/* verify, that memory has been registered */
+	if (!vcpu->kvm->arch.guest_memsize) {
+		vcpu_put(vcpu);
+		return -EINVAL;
+	}
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-- 
cgit v0.10.2


From 3298b75c880d6f0fd70750233c0f3e71a72a5bfb Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:46 +0300
Subject: KVM: Unprotect a page if #PF happens during NMI injection.

It is done for exception and interrupt already.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8b5ffbd..ac3d5ba 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1122,8 +1122,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
 	else {
-		if (svm->vcpu.arch.interrupt.pending ||
-				svm->vcpu.arch.exception.pending)
+		if (kvm_event_needs_reinjection(&svm->vcpu))
 			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
 	}
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3ab27b..8981654 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2615,7 +2615,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
-		if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
+		if (kvm_event_needs_reinjection(vcpu))
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 39350b2..21203d4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -30,4 +30,10 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
 		clear_bit(word_index, &vcpu->arch.irq_summary);
 	return irq;
 }
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+		vcpu->arch.nmi_injected;
+}
 #endif
-- 
cgit v0.10.2


From fa9726b0733461781933ab7180aca45e46d0a891 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:47 +0300
Subject: KVM: Do not allow interrupt injection from userspace if there is a
 pending event.

The exception will immediately close the interrupt window.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3244437..96413f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3105,8 +3105,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
-					(kvm_arch_interrupt_allowed(vcpu) &&
-					 !kvm_cpu_has_interrupt(vcpu));
+			kvm_arch_interrupt_allowed(vcpu) &&
+			!kvm_cpu_has_interrupt(vcpu) &&
+			!kvm_event_needs_reinjection(vcpu);
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 923c61bbc6413e87e5f6b0bae663d202a8de0537 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:48 +0300
Subject: KVM: Remove irq_pending bitmap

Only one interrupt vector can be injected from userspace irqchip at
any given time so no need to store it in a bitmap. Put it into interrupt
queue directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 16d1481..977a785 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
 
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 11c2757..96dfbb6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 
 	if (!irqchip_in_kernel(v->kvm))
-		return v->arch.irq_summary;
+		return v->arch.interrupt.pending;
 
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	int vector;
 
 	if (!irqchip_in_kernel(v->kvm))
-		return kvm_pop_irq(v);
+		return v->arch.interrupt.nr;
 
 	vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96413f4..54eec35 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 	vcpu_load(vcpu);
 
-	set_bit(irq->irq, vcpu->arch.irq_pending);
-	set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+	kvm_queue_interrupt(vcpu, irq->irq);
 
 	vcpu_put(vcpu);
 
@@ -3583,12 +3582,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->efer = vcpu->arch.shadow_efer;
 	sregs->apic_base = kvm_get_apic_base(vcpu);
 
-	if (irqchip_in_kernel(vcpu->kvm))
-		memset(sregs->interrupt_bitmap, 0,
-		       sizeof sregs->interrupt_bitmap);
-	else
-		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
-		       sizeof sregs->interrupt_bitmap);
+	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
 
 	if (vcpu->arch.interrupt.pending)
 		set_bit(vcpu->arch.interrupt.nr,
@@ -4058,7 +4052,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
 	int mmu_reset_needed = 0;
-	int i, pending_vec, max_bits;
+	int pending_vec, max_bits;
 	struct descriptor_table dt;
 
 	vcpu_load(vcpu);
@@ -4100,24 +4094,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	if (!irqchip_in_kernel(vcpu->kvm)) {
-		memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
-		       sizeof vcpu->arch.irq_pending);
-		vcpu->arch.irq_summary = 0;
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
-			if (vcpu->arch.irq_pending[i])
-				__set_bit(i, &vcpu->arch.irq_summary);
-	} else {
-		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
-		pending_vec = find_first_bit(
-			(const unsigned long *)sregs->interrupt_bitmap,
-			max_bits);
-		/* Only pending external irq is handled here */
-		if (pending_vec < max_bits) {
-			kvm_queue_interrupt(vcpu, pending_vec);
-			pr_debug("Set back pending irq %d\n", pending_vec);
-		}
-		kvm_pic_clear_isr_ack(vcpu->kvm);
+	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+	pending_vec = find_first_bit(
+		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
+	if (pending_vec < max_bits) {
+		kvm_queue_interrupt(vcpu, pending_vec);
+		pr_debug("Set back pending irq %d\n", pending_vec);
+		if (irqchip_in_kernel(vcpu->kvm))
+			kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
 
 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 21203d4..c1f1a8c 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,18 +19,6 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.interrupt.pending = false;
 }
 
-static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	return irq;
-}
-
 static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
-- 
cgit v0.10.2


From f629cf8485c9e1063fd8b915fa3bde80917400a1 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:49 +0300
Subject: KVM: skip_emulated_instruction() decode instruction if size is not
 known

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ac3d5ba..1315ce0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -228,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if (!svm->next_rip) {
-		printk(KERN_DEBUG "%s: NOP\n", __func__);
+		if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+				EMULATE_DONE)
+			printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
 	}
 	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -1868,11 +1870,8 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	if (reason != TASK_SWITCH_GATE ||
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
 	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
-	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0,
-					EMULTYPE_SKIP) != EMULATE_DONE)
-			return 0;
-	}
+	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+		skip_emulated_instruction(&svm->vcpu);
 
 	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
-- 
cgit v0.10.2


From 66fd3f7f901f29a557a473af595bf11b270b9ac2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:50 +0300
Subject: KVM: Do not re-execute INTn instruction.

Re-inject event instead. This is what Intel suggest. Also use correct
instruction length when re-injecting soft fault/interrupt.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 977a785..1d6c3f7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,8 @@ struct kvm_vcpu_arch {
 	struct kvm_pio_request pio;
 	void *pio_data;
 
+	u8 event_exit_inst_len;
+
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
@@ -328,6 +330,7 @@ struct kvm_vcpu_arch {
 
 	struct kvm_queued_interrupt {
 		bool pending;
+		bool soft;
 		u8 nr;
 	} interrupt;
 
@@ -510,7 +513,7 @@ struct kvm_x86_ops {
 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1315ce0..377c4f1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2310,13 +2310,13 @@ static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+static void svm_set_irq(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	nested_svm_intr(svm);
 
-	svm_queue_irq(vcpu, irq);
+	svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2418,7 +2418,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	case SVM_EXITINTINFO_TYPE_EXEPT:
 		/* In case of software exception do not reinject an exception
 		   vector, but re-execute and instruction instead */
-		if (vector == BP_VECTOR || vector == OF_VECTOR)
+		if (kvm_exception_is_soft(vector))
 			break;
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
@@ -2428,7 +2428,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 			kvm_queue_exception(&svm->vcpu, vector);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
-		kvm_queue_interrupt(&svm->vcpu, vector);
+		kvm_queue_interrupt(&svm->vcpu, vector, false);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8981654..29b49f0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -801,8 +801,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		return;
 	}
 
-	if (nr == BP_VECTOR || nr == OF_VECTOR) {
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+	if (kvm_exception_is_soft(nr)) {
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmx->vcpu.arch.event_exit_inst_len);
 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
 	} else
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -2445,9 +2446,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	uint32_t intr;
+	int irq = vcpu->arch.interrupt.nr;
 
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
@@ -2462,8 +2465,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 		return;
 	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+	intr = irq | INTR_INFO_VALID_MASK;
+	if (vcpu->arch.interrupt.soft) {
+		intr |= INTR_TYPE_SOFT_INTR;
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmx->vcpu.arch.event_exit_inst_len);
+	} else
+		intr |= INTR_TYPE_EXT_INTR;
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3024,6 +3033,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 					      GUEST_INTR_STATE_NMI);
 			break;
 		case INTR_TYPE_EXT_INTR:
+		case INTR_TYPE_SOFT_INTR:
 			kvm_clear_interrupt_queue(vcpu);
 			break;
 		case INTR_TYPE_HARD_EXCEPTION:
@@ -3295,16 +3305,24 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 				GUEST_INTR_STATE_NMI);
 		break;
-	case INTR_TYPE_HARD_EXCEPTION:
 	case INTR_TYPE_SOFT_EXCEPTION:
+		vmx->vcpu.arch.event_exit_inst_len =
+			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		/* fall through */
+	case INTR_TYPE_HARD_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
 			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		break;
+	case INTR_TYPE_SOFT_INTR:
+		vmx->vcpu.arch.event_exit_inst_len =
+			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		/* fall through */
 	case INTR_TYPE_EXT_INTR:
-		kvm_queue_interrupt(&vmx->vcpu, vector);
+		kvm_queue_interrupt(&vmx->vcpu, vector,
+			type == INTR_TYPE_SOFT_INTR);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54eec35..73cfe87 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,7 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 	vcpu_load(vcpu);
 
-	kvm_queue_interrupt(vcpu, irq->irq);
+	kvm_queue_interrupt(vcpu, irq->irq, false);
 
 	vcpu_put(vcpu);
 
@@ -3161,7 +3161,7 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 	}
 
 	if (vcpu->arch.interrupt.pending) {
-		kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		kvm_x86_ops->set_irq(vcpu);
 		return;
 	}
 
@@ -3174,8 +3174,9 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 		}
 	} else if (kvm_cpu_has_interrupt(vcpu)) {
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+					    false);
+			kvm_x86_ops->set_irq(vcpu);
 		}
 	}
 }
@@ -4098,7 +4099,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	pending_vec = find_first_bit(
 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
 	if (pending_vec < max_bits) {
-		kvm_queue_interrupt(vcpu, pending_vec);
+		kvm_queue_interrupt(vcpu, pending_vec, false);
 		pr_debug("Set back pending irq %d\n", pending_vec);
 		if (irqchip_in_kernel(vcpu->kvm))
 			kvm_pic_clear_isr_ack(vcpu->kvm);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c1f1a8c..4c8e10a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.exception.pending = false;
 }
 
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+	bool soft)
 {
 	vcpu->arch.interrupt.pending = true;
+	vcpu->arch.interrupt.soft = soft;
 	vcpu->arch.interrupt.nr = vector;
 }
 
@@ -24,4 +26,9 @@ static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
 		vcpu->arch.nmi_injected;
 }
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
 #endif
-- 
cgit v0.10.2


From 6a8b1d13121f8226783987dc7ddd861ee2245410 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:51 +0300
Subject: KVM: Always request IRQ/NMI window if an interrupt is pending

Currently they are not requested if there is pending exception.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73cfe87..199426c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3152,8 +3152,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_irq(struct kvm_vcpu *vcpu)
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
+
 	/* try to reinject previous events if any */
 	if (vcpu->arch.nmi_injected) {
 		kvm_x86_ops->set_nmi(vcpu);
@@ -3181,26 +3184,11 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
-	inject_irq(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
-}
-
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
 
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3254,6 +3242,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		inject_pending_irq(vcpu, kvm_run);
 
+	/* enable NMI/IRQ window open exits if needed */
+	if (vcpu->arch.nmi_pending)
+		kvm_x86_ops->enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		kvm_x86_ops->enable_irq_window(vcpu);
+
 	if (kvm_lapic_enabled(vcpu)) {
 		if (!vcpu->arch.apic->vapic_addr)
 			update_cr8_intercept(vcpu);
-- 
cgit v0.10.2


From 44c11430b52cbad0a467bc023a802d122dfd285c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:52 +0300
Subject: KVM: inject NMI after IRET from a previous NMI, not before.

If NMI is received during handling of another NMI it should be injected
immediately after IRET from previous NMI handler, but SVM intercept IRET
before instruction execution so we can't inject pending NMI at this
point and there is not way to request exit when NMI window opens. This
patch fix SVM code to open NMI window after IRET by single stepping over
IRET instruction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1d6c3f7..8212943 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
 
+	bool singlestep; /* guest is single stepped by KVM */
 	bool nmi_pending;
 	bool nmi_injected;
 
@@ -771,6 +772,7 @@ enum {
 #define HF_HIF_MASK		(1 << 1)
 #define HF_VINTR_MASK		(1 << 2)
 #define HF_NMI_MASK		(1 << 3)
+#define HF_IRET_MASK		(1 << 4)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 377c4f1..71510e0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -965,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
-	int old_debug = vcpu->guest_debug;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	vcpu->guest_debug = dbg->control;
-
 	svm->vmcb->control.intercept_exceptions &=
 		~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+
+	if (vcpu->arch.singlestep)
+		svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
 		if (vcpu->guest_debug &
 		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -984,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 				1 << BP_VECTOR;
 	} else
 		vcpu->guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+	int old_debug = vcpu->guest_debug;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	vcpu->guest_debug = dbg->control;
+
+	update_db_intercept(vcpu);
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 		svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -1133,14 +1144,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	if (!(svm->vcpu.guest_debug &
-	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+		!svm->vcpu.arch.singlestep) {
 		kvm_queue_exception(&svm->vcpu, DB_VECTOR);
 		return 1;
 	}
-	kvm_run->exit_reason = KVM_EXIT_DEBUG;
-	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
-	kvm_run->debug.arch.exception = DB_VECTOR;
-	return 0;
+
+	if (svm->vcpu.arch.singlestep) {
+		svm->vcpu.arch.singlestep = false;
+		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
+			svm->vmcb->save.rflags &=
+				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		update_db_intercept(&svm->vcpu);
+	}
+
+	if (svm->vcpu.guest_debug &
+	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+		kvm_run->exit_reason = KVM_EXIT_DEBUG;
+		kvm_run->debug.arch.pc =
+			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+		kvm_run->debug.arch.exception = DB_VECTOR;
+		return 0;
+	}
+
+	return 1;
 }
 
 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1887,7 +1914,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	++svm->vcpu.stat.nmi_window_exits;
 	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
-	svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
 
@@ -2357,8 +2384,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
-		enable_irq_window(vcpu);
+	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+	    == HF_NMI_MASK)
+		return; /* IRET will cause a vm exit */
+
+	/* Something prevents NMI from been injected. Single step over
+	   possible problem (IRET or exception injection or interrupt
+	   shadow) */
+	vcpu->arch.singlestep = true;
+	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+	update_db_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2401,6 +2436,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	int type;
 	u32 exitintinfo = svm->vmcb->control.exit_int_info;
 
+	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+
 	svm->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
-- 
cgit v0.10.2


From 36752c9b91f75aa3ff0f214a89f13d806cb2f61f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:53 +0300
Subject: KVM: Do not migrate pending software interrupts.

INTn will be re-executed after migration. If we wanted to migrate
pending software interrupt we would need to migrate interrupt type
and instruction length too, but we do not have all required info on
SVM, so SVM->VMX migration would need to re-execute INTn anyway. To
make it simple never migrate pending soft interrupt.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 199426c..beb806b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3579,7 +3579,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
 
-	if (vcpu->arch.interrupt.pending)
+	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
 
-- 
cgit v0.10.2


From 8db3baa2db34035b2ddb7d0e8b186eb92a056532 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:54 +0300
Subject: KVM: Disable CR8 intercept if tpr patching is active

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index beb806b..249540f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3142,7 +3142,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	if (!kvm_x86_ops->update_cr8_intercept)
 		return;
 
-	max_irr = kvm_lapic_find_highest_irr(vcpu);
+	if (!vcpu->arch.apic->vapic_addr)
+		max_irr = kvm_lapic_find_highest_irr(vcpu);
+	else
+		max_irr = -1;
 
 	if (max_irr != -1)
 		max_irr >>= 4;
@@ -3249,10 +3252,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_x86_ops->enable_irq_window(vcpu);
 
 	if (kvm_lapic_enabled(vcpu)) {
-		if (!vcpu->arch.apic->vapic_addr)
-			update_cr8_intercept(vcpu);
-		else
-			kvm_lapic_sync_to_vapic(vcpu);
+		update_cr8_intercept(vcpu);
+		kvm_lapic_sync_to_vapic(vcpu);
 	}
 
 	up_read(&vcpu->kvm->slots_lock);
-- 
cgit v0.10.2


From 20f65983e30f222e5383f77206e3f571d1d64610 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:55 +0300
Subject: KVM: Move "exit due to NMI" handling into vmx_complete_interrupts()

To save us one reading of VM_EXIT_INTR_INFO.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29b49f0..fe2ce2b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3261,8 +3261,17 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/* We need to handle NMIs before interrupts are enabled */
+	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
+		KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+		asm("int $2");
+	}
+
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
@@ -3363,7 +3372,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 intr_info;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3490,15 +3498,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-	    (intr_info & INTR_INFO_VALID_MASK)) {
-		KVMTRACE_0D(NMI, vcpu, handler);
-		asm("int $2");
-	}
-
 	vmx_complete_interrupts(vmx);
 }
 
-- 
cgit v0.10.2


From 56b237e31abf4d6dbc6e2a0214049b9a23be4883 Mon Sep 17 00:00:00 2001
From: Nitin A Kamble <nitin.a.kamble@intel.com>
Date: Thu, 4 Jun 2009 15:04:08 -0700
Subject: KVM: VMX: Rename rmode.active to rmode.vm86_active

That way the interpretation of rmode.active becomes more clear with
unrestricted guest code.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8212943..eabdc1c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,7 +335,7 @@ struct kvm_vcpu_arch {
 	} interrupt;
 
 	struct {
-		int active;
+		int vm86_active;
 		u8 save_iopl;
 		struct kvm_save_segment {
 			u16 selector;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe2ce2b..c379a34 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -495,7 +495,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 			eb |= 1u << BP_VECTOR;
 	}
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.vm86_active)
 		eb = ~0;
 	if (enable_ept)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -731,7 +731,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.vm86_active)
 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
@@ -788,7 +788,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 	}
 
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = nr;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -1363,7 +1363,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmx->emulation_required = 1;
-	vcpu->arch.rmode.active = 0;
+	vcpu->arch.rmode.vm86_active = 0;
 
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
 	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1425,7 +1425,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmx->emulation_required = 1;
-	vcpu->arch.rmode.active = 1;
+	vcpu->arch.rmode.vm86_active = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
 	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1594,10 +1594,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	vmx_fpu_deactivate(vcpu);
 
-	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+	if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
 		enter_pmode(vcpu);
 
-	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+	if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
 		enter_rmode(vcpu);
 
 #ifdef CONFIG_X86_64
@@ -1655,7 +1655,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	vcpu->arch.cr4 = cr4;
@@ -1738,7 +1738,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 	u32 ar;
 
-	if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+	if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
 		vcpu->arch.rmode.tr.selector = var->selector;
 		vcpu->arch.rmode.tr.base = var->base;
 		vcpu->arch.rmode.tr.limit = var->limit;
@@ -1748,7 +1748,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	vmcs_writel(sf->base, var->base);
 	vmcs_write32(sf->limit, var->limit);
 	vmcs_write16(sf->selector, var->selector);
-	if (vcpu->arch.rmode.active && var->s) {
+	if (vcpu->arch.rmode.vm86_active && var->s) {
 		/*
 		 * Hack real-mode segments into vm86 compatibility.
 		 */
@@ -2317,7 +2317,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	vmx->vcpu.arch.rmode.active = 0;
+	vmx->vcpu.arch.rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
 
@@ -2455,7 +2455,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
 	++vcpu->stat.irq_injections;
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2493,7 +2493,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 
 	++vcpu->stat.nmi_injections;
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = NMI_VECTOR;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2629,7 +2629,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
 
-	if (vcpu->arch.rmode.active &&
+	if (vcpu->arch.rmode.vm86_active &&
 	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
 								error_code)) {
 		if (vcpu->arch.halt_request) {
-- 
cgit v0.10.2


From a0861c02a981c943573478ea13b29b1fb958ee5b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 8 Jun 2009 17:37:09 +0800
Subject: KVM: Add VT-x machine check support

VT-x needs an explicit MC vector intercept to handle machine checks in the
hyper visor.

It also has a special option to catch machine checks that happen
during VT entry.

Do these interceptions and forward them to the Linux machine check
handler. Make it always look like user space is interrupted because
the machine check handler treats kernel/user space differently.

Thanks to Jiang Yunhong for help and testing.

Cc: stable@kernel.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944..11be5ad 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MCE_DURING_VMENTRY	 41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EPT_VIOLATION       48
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 09dd1d4..289cc48 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  out2:
 	atomic_dec(&mce_entry);
 }
+EXPORT_SYMBOL_GPL(do_machine_check);
 
 #ifdef CONFIG_X86_MCE_INTEL
 /***
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c379a34..32d6ae8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,6 +32,7 @@
 #include <asm/desc.h>
 #include <asm/vmx.h>
 #include <asm/virtext.h>
+#include <asm/mce.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -97,6 +98,7 @@ struct vcpu_vmx {
 	int soft_vnmi_blocked;
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
+	u32 exit_reason;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -214,6 +216,13 @@ static inline int is_external_interrupt(u32 intr_info)
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
+static inline int is_machine_check(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_msr_bitmap(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
@@ -485,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	u32 eb;
 
-	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
 	if (!vcpu->fpu_active)
 		eb |= 1u << NM_VECTOR;
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -2582,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+	struct pt_regs regs = {
+		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
+		.flags = X86_EFLAGS_IF,
+	};
+
+	do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	/* already handled by vcpu_run */
+	return 1;
+}
+
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2593,6 +2627,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vect_info = vmx->idt_vectoring_info;
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
+	if (is_machine_check(intr_info))
+		return handle_machine_check(vcpu, kvm_run);
+
 	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
 						!is_page_fault(intr_info))
 		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
@@ -3166,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
+	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -3177,8 +3215,8 @@ static const int kvm_vmx_max_exit_handlers =
  */
 static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
 	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3263,6 +3301,14 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+	/* Handle machine checks before interrupts are enabled */
+	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+		&& is_machine_check(exit_intr_info)))
+		kvm_machine_check();
+
 	/* We need to handle NMIs before interrupts are enabled */
 	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
 	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
-- 
cgit v0.10.2


From 003dec8913d6bebb4ecc989ec04a235cf38f5ea9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Jun 2009 10:31:45 +0100
Subject: GFS2: Merge gfs2_get_sb into gfs2_get_sb_meta

These don't need to be separate functions.

Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 9da161c..f234aba 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1284,9 +1284,11 @@ static int set_meta_super(struct super_block *s, void *ptr)
 	return -EINVAL;
 }
 
-static struct super_block *get_gfs2_sb(const char *dev_name)
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	struct super_block *s;
+	struct gfs2_sbd *sdp;
 	struct path path;
 	int error;
 
@@ -1294,21 +1296,11 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return ERR_PTR(-ENOENT);
+		return error;
 	}
 	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
 		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
-	return s;
-}
-
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	struct super_block *s;
-	struct gfs2_sbd *sdp;
-
-	s = get_gfs2_sb(dev_name);
 	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
 		return PTR_ERR(s);
-- 
cgit v0.10.2


From 04dce7d9d429ea5ea04e9432d1726c930f4d67da Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 10 Jun 2009 16:59:46 +1000
Subject: spinlock: Add missing __raw_spin_lock_flags() stub for UP

This was only defined with CONFIG_DEBUG_SPINLOCK set, but some
obscure arch/powerpc code wants it always.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 938234c..d4841ed 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 #define __raw_spin_is_locked(lock)	((void)(lock), 0)
 /* for sched.c and kernel_lock.c: */
 # define __raw_spin_lock(lock)		do { (void)(lock); } while (0)
+# define __raw_spin_lock_flags(lock, flags)	do { (void)(lock); } while (0)
 # define __raw_spin_unlock(lock)	do { (void)(lock); } while (0)
 # define __raw_spin_trylock(lock)	({ (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
-- 
cgit v0.10.2


From f1db457ce6e2f63cb01022f58c0c023838958bd1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 10 Jun 2009 10:06:24 +0800
Subject: tracing/events: convert block trace points to TRACE_EVENT(), fix
 !CONFIG_BLOCK

Fix building failures when CONFIG_BLOCK == n.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2F1520.8020003@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index c7ec31d..7e4350e 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,7 +218,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
-#ifdef CONFIG_EVENT_TRACING
+#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
 
 static inline int blk_cmd_buf_len(struct request *rq)
 {
@@ -229,7 +229,7 @@ extern void blk_dump_cmd(char *buf, struct request *rq);
 extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
 extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
 
-#endif /* CONFIG_EVENT_TRACING */
+#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
 
 #endif /* __KERNEL__ */
 #endif
-- 
cgit v0.10.2


From 6bc1096d7ab3621b3ffcf06616d1f4e0325d903d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 22 May 2009 12:12:01 +0200
Subject: x86: MSR: add a struct representation of an MSR

Add a struct representing a 64bit MSR pair consisting of a low and high
register part and convert msr_info to use it. Also, rename msr-on-cpu.c
to msr.c.

Side note: Put the cpumask.h include in __KERNEL__ space thus fixing an
allmodconfig build failure in the headers_check target.

CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf62..5e12132 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
 
 #include <asm/asm.h>
 #include <asm/errno.h>
+#include <asm/cpumask.h>
+
+struct msr {
+	union {
+		struct {
+			u32 l;
+			u32 h;
+		};
+		u64 q;
+	};
+};
 
 static inline unsigned long long native_read_tscp(unsigned int *aux)
 {
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa..f9d3563 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for x86 specific library files.
 #
 
-obj-$(CONFIG_SMP) := msr-on-cpu.o
+obj-$(CONFIG_SMP) := msr.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf72..0000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <asm/msr.h>
-
-struct msr_info {
-	u32 msr_no;
-	u32 l, h;
-	int err;
-};
-
-static void __rdmsr_on_cpu(void *info)
-{
-	struct msr_info *rv = info;
-
-	rdmsr(rv->msr_no, rv->l, rv->h);
-}
-
-static void __wrmsr_on_cpu(void *info)
-{
-	struct msr_info *rv = info;
-
-	wrmsr(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-	int err;
-	struct msr_info rv;
-
-	rv.msr_no = msr_no;
-	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
-	*l = rv.l;
-	*h = rv.h;
-
-	return err;
-}
-
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
-	int err;
-	struct msr_info rv;
-
-	rv.msr_no = msr_no;
-	rv.l = l;
-	rv.h = h;
-	err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-
-	return err;
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
-   may not actually exist. */
-static void __rdmsr_safe_on_cpu(void *info)
-{
-	struct msr_info *rv = info;
-
-	rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
-}
-
-static void __wrmsr_safe_on_cpu(void *info)
-{
-	struct msr_info *rv = info;
-
-	rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-	int err;
-	struct msr_info rv;
-
-	rv.msr_no = msr_no;
-	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
-	*l = rv.l;
-	*h = rv.h;
-
-	return err ? err : rv.err;
-}
-
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
-	int err;
-	struct msr_info rv;
-
-	rv.msr_no = msr_no;
-	rv.l = l;
-	rv.h = h;
-	err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
-
-	return err ? err : rv.err;
-}
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
-EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 0000000..cade714
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,97 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+struct msr_info {
+	u32 msr_no;
+	struct msr reg;
+	int err;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rdmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	wrmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	int err;
+	struct msr_info rv;
+
+	rv.msr_no = msr_no;
+	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+	*l = rv.reg.l;
+	*h = rv.reg.h;
+
+	return err;
+}
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	int err;
+	struct msr_info rv;
+
+	rv.msr_no = msr_no;
+	rv.reg.l = l;
+	rv.reg.h = h;
+	err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+	return err;
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+   may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	int err;
+	struct msr_info rv;
+
+	rv.msr_no = msr_no;
+	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+	*l = rv.reg.l;
+	*h = rv.reg.h;
+
+	return err ? err : rv.err;
+}
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	int err;
+	struct msr_info rv;
+
+	rv.msr_no = msr_no;
+	rv.reg.l = l;
+	rv.reg.h = h;
+	err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+	return err ? err : rv.err;
+}
+
+EXPORT_SYMBOL(rdmsr_on_cpu);
+EXPORT_SYMBOL(wrmsr_on_cpu);
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
-- 
cgit v0.10.2


From b034c19f9f61c8b6f2435aa2e77f52348ebde767 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 22 May 2009 13:52:19 +0200
Subject: x86: MSR: add methods for writing of an MSR on several CPUs

Provide for concurrent MSR writes on all the CPUs in the cpumask. Also,
add a temporary workaround for smp_call_function_many which skips the
CPU we're executing on.

Bart: zero out rv struct which is allocated on stack.

CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 5e12132..2260376 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -227,6 +227,8 @@ do {                                                            \
 #ifdef CONFIG_SMP
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
 #else  /*  CONFIG_SMP  */
@@ -240,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	wrmsr(msr_no, l, h);
 	return 0;
 }
+static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+				struct msr *msrs)
+{
+       rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+}
+static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+				struct msr *msrs)
+{
+       wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+}
 static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
 				    u32 *l, u32 *h)
 {
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index cade714..1440b9c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -6,21 +6,37 @@
 struct msr_info {
 	u32 msr_no;
 	struct msr reg;
+	struct msr *msrs;
+	int off;
 	int err;
 };
 
 static void __rdmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
+	struct msr *reg;
+	int this_cpu = raw_smp_processor_id();
 
-	rdmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+	if (rv->msrs)
+		reg = &rv->msrs[this_cpu - rv->off];
+	else
+		reg = &rv->reg;
+
+	rdmsr(rv->msr_no, reg->l, reg->h);
 }
 
 static void __wrmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
+	struct msr *reg;
+	int this_cpu = raw_smp_processor_id();
+
+	if (rv->msrs)
+		reg = &rv->msrs[this_cpu - rv->off];
+	else
+		reg = &rv->reg;
 
-	wrmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+	wrmsr(rv->msr_no, reg->l, reg->h);
 }
 
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
@@ -28,6 +44,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
 	*l = rv.reg.l;
@@ -35,12 +53,15 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	return err;
 }
+EXPORT_SYMBOL(rdmsr_on_cpu);
 
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	rv.reg.l = l;
 	rv.reg.h = h;
@@ -48,6 +69,70 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 
 	return err;
 }
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask:       which CPUs
+ * @msr_no:     which MSR
+ * @msrs:       array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+	struct msr_info rv;
+	int this_cpu;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.off    = cpumask_first(mask);
+	rv.msrs	  = msrs;
+	rv.msr_no = msr_no;
+
+	preempt_disable();
+	/*
+	 * FIXME: handle the CPU we're executing on separately for now until
+	 * smp_call_function_many has been fixed to not skip it.
+	 */
+	this_cpu = raw_smp_processor_id();
+	smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
+
+	smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
+	preempt_enable();
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask:       which CPUs
+ * @msr_no:     which MSR
+ * @msrs:       array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+	struct msr_info rv;
+	int this_cpu;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.off    = cpumask_first(mask);
+	rv.msrs   = msrs;
+	rv.msr_no = msr_no;
+
+	preempt_disable();
+	/*
+	 * FIXME: handle the CPU we're executing on separately for now until
+	 * smp_call_function_many has been fixed to not skip it.
+	 */
+	this_cpu = raw_smp_processor_id();
+	smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
+
+	smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
+	preempt_enable();
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
 
 /* These "safe" variants are slower and should be used when the target MSR
    may not actually exist. */
@@ -70,6 +155,8 @@ int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
 	*l = rv.reg.l;
@@ -77,12 +164,15 @@ int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	return err ? err : rv.err;
 }
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
 
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	rv.reg.l = l;
 	rv.reg.h = h;
@@ -90,8 +180,4 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 
 	return err ? err : rv.err;
 }
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
 EXPORT_SYMBOL(wrmsr_safe_on_cpu);
-- 
cgit v0.10.2


From d357cbb445208ea0c33b268e08a65e53fdbb5e86 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 14 May 2009 17:49:28 +0200
Subject: edac: fold __func__ into edac_debug_printk

This shortens debugfX() calls a bit.

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
CC: Doug Thompson <norsk5@yahoo.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 6ad95c8..48d3b14 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -76,10 +76,11 @@
 extern int edac_debug_level;
 
 #ifndef CONFIG_EDAC_DEBUG_VERBOSE
-#define edac_debug_printk(level, fmt, arg...)                            \
-	do {                                                             \
-		if (level <= edac_debug_level)                           \
-			edac_printk(KERN_DEBUG, EDAC_DEBUG, fmt, ##arg); \
+#define edac_debug_printk(level, fmt, arg...)                           \
+	do {                                                            \
+		if (level <= edac_debug_level)                          \
+			edac_printk(KERN_DEBUG, EDAC_DEBUG,		\
+				    "%s: " fmt, __func__, ##arg);	\
 	} while (0)
 #else  /* CONFIG_EDAC_DEBUG_VERBOSE */
 #define edac_debug_printk(level, fmt, arg...)                            \
-- 
cgit v0.10.2


From cfe40fdb4a46a68e45fa9a5ecbe588e94b89b4f3 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 19:25:34 +0200
Subject: amd64_edac: add driver header

Borislav:
- remove register bit descriptions (complete text in BKDG)
- cleanup and remove excessive/superfluous comments

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
new file mode 100644
index 0000000..6f5d5d6
--- /dev/null
+++ b/drivers/edac/amd64_edac.h
@@ -0,0 +1,628 @@
+/*
+ * AMD64 class Memory Controller kernel module
+ *
+ * Copyright (c) 2009 SoftwareBitMaker.
+ * Copyright (c) 2009 Advanced Micro Devices, Inc.
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ *
+ *	Originally Written by Thayne Harbaugh
+ *
+ *      Changes by Douglas "norsk" Thompson  <dougthompson@xmission.com>:
+ *		- K8 CPU Revision D and greater support
+ *
+ *      Changes by Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com>:
+ *		- Module largely rewritten, with new (and hopefully correct)
+ *		code for dealing with node and chip select interleaving,
+ *		various code cleanup, and bug fixes
+ *		- Added support for memory hoisting using DRAM hole address
+ *		register
+ *
+ *	Changes by Douglas "norsk" Thompson <dougthompson@xmission.com>:
+ *		-K8 Rev (1207) revision support added, required Revision
+ *		specific mini-driver code to support Rev F as well as
+ *		prior revisions
+ *
+ *	Changes by Douglas "norsk" Thompson <dougthompson@xmission.com>:
+ *		-Family 10h revision support added. New PCI Device IDs,
+ *		indicating new changes. Actual registers modified
+ *		were slight, less than the Rev E to Rev F transition
+ *		but changing the PCI Device ID was the proper thing to
+ *		do, as it provides for almost automactic family
+ *		detection. The mods to Rev F required more family
+ *		information detection.
+ *
+ *	Changes/Fixes by Borislav Petkov <borislav.petkov@amd.com>:
+ *		- misc fixes and code cleanups
+ *
+ * This module is based on the following documents
+ * (available from http://www.amd.com/):
+ *
+ *	Title:	BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
+ *		Opteron Processors
+ *	AMD publication #: 26094
+ *`	Revision: 3.26
+ *
+ *	Title:	BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
+ *		Processors
+ *	AMD publication #: 32559
+ *	Revision: 3.00
+ *	Issue Date: May 2006
+ *
+ *	Title:	BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
+ *		Processors
+ *	AMD publication #: 31116
+ *	Revision: 3.00
+ *	Issue Date: September 07, 2007
+ *
+ * Sections in the first 2 documents are no longer in sync with each other.
+ * The Family 10h BKDG was totally re-written from scratch with a new
+ * presentation model.
+ * Therefore, comments that refer to a Document section might be off.
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/mmzone.h>
+#include <linux/edac.h>
+#include "edac_core.h"
+
+#define amd64_printk(level, fmt, arg...) \
+	edac_printk(level, "amd64", fmt, ##arg)
+
+#define amd64_mc_printk(mci, level, fmt, arg...) \
+	edac_mc_chipset_printk(mci, level, "amd64", fmt, ##arg)
+
+/*
+ * Throughout the comments in this code, the following terms are used:
+ *
+ *	SysAddr, DramAddr, and InputAddr
+ *
+ *  These terms come directly from the amd64 documentation
+ * (AMD publication #26094).  They are defined as follows:
+ *
+ *     SysAddr:
+ *         This is a physical address generated by a CPU core or a device
+ *         doing DMA.  If generated by a CPU core, a SysAddr is the result of
+ *         a virtual to physical address translation by the CPU core's address
+ *         translation mechanism (MMU).
+ *
+ *     DramAddr:
+ *         A DramAddr is derived from a SysAddr by subtracting an offset that
+ *         depends on which node the SysAddr maps to and whether the SysAddr
+ *         is within a range affected by memory hoisting.  The DRAM Base
+ *         (section 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers
+ *         determine which node a SysAddr maps to.
+ *
+ *         If the DRAM Hole Address Register (DHAR) is enabled and the SysAddr
+ *         is within the range of addresses specified by this register, then
+ *         a value x from the DHAR is subtracted from the SysAddr to produce a
+ *         DramAddr.  Here, x represents the base address for the node that
+ *         the SysAddr maps to plus an offset due to memory hoisting.  See
+ *         section 3.4.8 and the comments in amd64_get_dram_hole_info() and
+ *         sys_addr_to_dram_addr() below for more information.
+ *
+ *         If the SysAddr is not affected by the DHAR then a value y is
+ *         subtracted from the SysAddr to produce a DramAddr.  Here, y is the
+ *         base address for the node that the SysAddr maps to.  See section
+ *         3.4.4 and the comments in sys_addr_to_dram_addr() below for more
+ *         information.
+ *
+ *     InputAddr:
+ *         A DramAddr is translated to an InputAddr before being passed to the
+ *         memory controller for the node that the DramAddr is associated
+ *         with.  The memory controller then maps the InputAddr to a csrow.
+ *         If node interleaving is not in use, then the InputAddr has the same
+ *         value as the DramAddr.  Otherwise, the InputAddr is produced by
+ *         discarding the bits used for node interleaving from the DramAddr.
+ *         See section 3.4.4 for more information.
+ *
+ *         The memory controller for a given node uses its DRAM CS Base and
+ *         DRAM CS Mask registers to map an InputAddr to a csrow.  See
+ *         sections 3.5.4 and 3.5.5 for more information.
+ */
+
+#define EDAC_AMD64_VERSION		" Ver: 3.2.0 " __DATE__
+#define EDAC_MOD_STR			"amd64_edac"
+
+/* Extended Model from CPUID, for CPU Revision numbers */
+#define OPTERON_CPU_LE_REV_C		0
+#define OPTERON_CPU_REV_D		1
+#define OPTERON_CPU_REV_E		2
+
+/* NPT processors have the following Extended Models */
+#define OPTERON_CPU_REV_F		4
+#define OPTERON_CPU_REV_FA		5
+
+/* Hardware limit on ChipSelect rows per MC and processors per system */
+#define CHIPSELECT_COUNT		8
+#define DRAM_REG_COUNT			8
+
+
+/*
+ * PCI-defined configuration space registers
+ */
+
+
+/*
+ * Function 1 - Address Map
+ */
+#define K8_DRAM_BASE_LOW		0x40
+#define K8_DRAM_LIMIT_LOW		0x44
+#define K8_DHAR				0xf0
+
+#define DHAR_VALID			BIT(0)
+#define F10_DRAM_MEM_HOIST_VALID	BIT(1)
+
+#define DHAR_BASE_MASK			0xff000000
+#define dhar_base(dhar)			(dhar & DHAR_BASE_MASK)
+
+#define K8_DHAR_OFFSET_MASK		0x0000ff00
+#define k8_dhar_offset(dhar)		((dhar & K8_DHAR_OFFSET_MASK) << 16)
+
+#define F10_DHAR_OFFSET_MASK		0x0000ff80
+					/* NOTE: Extra mask bit vs K8 */
+#define f10_dhar_offset(dhar)		((dhar & F10_DHAR_OFFSET_MASK) << 16)
+
+
+/* F10 High BASE/LIMIT registers */
+#define F10_DRAM_BASE_HIGH		0x140
+#define F10_DRAM_LIMIT_HIGH		0x144
+
+
+/*
+ * Function 2 - DRAM controller
+ */
+#define K8_DCSB0			0x40
+#define F10_DCSB1			0x140
+
+#define K8_DCSB_CS_ENABLE		BIT(0)
+#define K8_DCSB_NPT_SPARE		BIT(1)
+#define K8_DCSB_NPT_TESTFAIL		BIT(2)
+
+/*
+ * REV E: select [31:21] and [15:9] from DCSB and the shift amount to form
+ * the address
+ */
+#define REV_E_DCSB_BASE_BITS		(0xFFE0FE00ULL)
+#define REV_E_DCS_SHIFT			4
+#define REV_E_DCSM_COUNT		8
+
+#define REV_F_F1Xh_DCSB_BASE_BITS	(0x1FF83FE0ULL)
+#define REV_F_F1Xh_DCS_SHIFT		8
+
+/*
+ * REV F and later: selects [28:19] and [13:5] from DCSB and the shift amount
+ * to form the address
+ */
+#define REV_F_DCSB_BASE_BITS		(0x1FF83FE0ULL)
+#define REV_F_DCS_SHIFT			8
+#define REV_F_DCSM_COUNT		4
+#define F10_DCSM_COUNT			4
+#define F11_DCSM_COUNT			2
+
+/* DRAM CS Mask Registers */
+#define K8_DCSM0			0x60
+#define F10_DCSM1			0x160
+
+/* REV E: select [29:21] and [15:9] from DCSM */
+#define REV_E_DCSM_MASK_BITS		0x3FE0FE00
+
+/* unused bits [24:20] and [12:0] */
+#define REV_E_DCS_NOTUSED_BITS		0x01F01FFF
+
+/* REV F and later: select [28:19] and [13:5] from DCSM */
+#define REV_F_F1Xh_DCSM_MASK_BITS	0x1FF83FE0
+
+/* unused bits [26:22] and [12:0] */
+#define REV_F_F1Xh_DCS_NOTUSED_BITS	0x07C01FFF
+
+#define DBAM0				0x80
+#define DBAM1				0x180
+
+/* Extract the DIMM 'type' on the i'th DIMM from the DBAM reg value passed */
+#define DBAM_DIMM(i, reg)		((((reg) >> (4*i))) & 0xF)
+
+#define DBAM_MAX_VALUE			11
+
+
+#define F10_DCLR_0			0x90
+#define F10_DCLR_1			0x190
+#define REVE_WIDTH_128			BIT(16)
+#define F10_WIDTH_128			BIT(11)
+
+
+#define F10_DCHR_0			0x94
+#define F10_DCHR_1			0x194
+
+#define F10_DCHR_FOUR_RANK_DIMM		BIT(18)
+#define F10_DCHR_Ddr3Mode		BIT(8)
+#define F10_DCHR_MblMode		BIT(6)
+
+
+#define F10_DCTL_SEL_LOW		0x110
+
+#define dct_sel_baseaddr(pvt)    \
+	((pvt->dram_ctl_select_low) & 0xFFFFF800)
+
+#define dct_sel_interleave_addr(pvt)    \
+	(((pvt->dram_ctl_select_low) >> 6) & 0x3)
+
+enum {
+	F10_DCTL_SEL_LOW_DctSelHiRngEn	= BIT(0),
+	F10_DCTL_SEL_LOW_DctSelIntLvEn	= BIT(2),
+	F10_DCTL_SEL_LOW_DctGangEn	= BIT(4),
+	F10_DCTL_SEL_LOW_DctDatIntLv	= BIT(5),
+	F10_DCTL_SEL_LOW_DramEnable	= BIT(8),
+	F10_DCTL_SEL_LOW_MemCleared	= BIT(10),
+};
+
+#define    dct_high_range_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctSelHiRngEn)
+
+#define dct_interleave_enabled(pvt)	   \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctSelIntLvEn)
+
+#define dct_ganging_enabled(pvt)        \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctGangEn)
+
+#define dct_data_intlv_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctDatIntLv)
+
+#define dct_dram_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DramEnable)
+
+#define dct_memory_cleared(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_MemCleared)
+
+
+#define F10_DCTL_SEL_HIGH		0x114
+
+
+/*
+ * Function 3 - Misc Control
+ */
+#define K8_NBCTL			0x40
+
+/* Correctable ECC error reporting enable */
+#define K8_NBCTL_CECCEn			BIT(0)
+
+/* UnCorrectable ECC error reporting enable */
+#define K8_NBCTL_UECCEn			BIT(1)
+
+#define K8_NBCFG			0x44
+#define K8_NBCFG_CHIPKILL		BIT(23)
+#define K8_NBCFG_ECC_ENABLE		BIT(22)
+
+#define K8_NBSL				0x48
+
+
+#define EXTRACT_HIGH_SYNDROME(x)	(((x) >> 24) & 0xff)
+#define EXTRACT_EXT_ERROR_CODE(x)	(((x) >> 16) & 0x1f)
+
+/* Family F10h: Normalized Extended Error Codes */
+#define F10_NBSL_EXT_ERR_RES		0x0
+#define F10_NBSL_EXT_ERR_CRC		0x1
+#define F10_NBSL_EXT_ERR_SYNC		0x2
+#define F10_NBSL_EXT_ERR_MST		0x3
+#define F10_NBSL_EXT_ERR_TGT		0x4
+#define F10_NBSL_EXT_ERR_GART		0x5
+#define F10_NBSL_EXT_ERR_RMW		0x6
+#define F10_NBSL_EXT_ERR_WDT		0x7
+#define F10_NBSL_EXT_ERR_ECC		0x8
+#define F10_NBSL_EXT_ERR_DEV		0x9
+#define F10_NBSL_EXT_ERR_LINK_DATA	0xA
+
+/* Next two are overloaded values */
+#define F10_NBSL_EXT_ERR_LINK_PROTO	0xB
+#define F10_NBSL_EXT_ERR_L3_PROTO	0xB
+
+#define F10_NBSL_EXT_ERR_NB_ARRAY	0xC
+#define F10_NBSL_EXT_ERR_DRAM_PARITY	0xD
+#define F10_NBSL_EXT_ERR_LINK_RETRY	0xE
+
+/* Next two are overloaded values */
+#define F10_NBSL_EXT_ERR_GART_WALK	0xF
+#define F10_NBSL_EXT_ERR_DEV_WALK	0xF
+
+/* 0x10 to 0x1B: Reserved */
+#define F10_NBSL_EXT_ERR_L3_DATA	0x1C
+#define F10_NBSL_EXT_ERR_L3_TAG		0x1D
+#define F10_NBSL_EXT_ERR_L3_LRU		0x1E
+
+/* K8: Normalized Extended Error Codes */
+#define K8_NBSL_EXT_ERR_ECC		0x0
+#define K8_NBSL_EXT_ERR_CRC		0x1
+#define K8_NBSL_EXT_ERR_SYNC		0x2
+#define K8_NBSL_EXT_ERR_MST		0x3
+#define K8_NBSL_EXT_ERR_TGT		0x4
+#define K8_NBSL_EXT_ERR_GART		0x5
+#define K8_NBSL_EXT_ERR_RMW		0x6
+#define K8_NBSL_EXT_ERR_WDT		0x7
+#define K8_NBSL_EXT_ERR_CHIPKILL_ECC	0x8
+#define K8_NBSL_EXT_ERR_DRAM_PARITY	0xD
+
+#define EXTRACT_ERROR_CODE(x)		((x) & 0xffff)
+#define	TEST_TLB_ERROR(x)		(((x) & 0xFFF0) == 0x0010)
+#define	TEST_MEM_ERROR(x)		(((x) & 0xFF00) == 0x0100)
+#define	TEST_BUS_ERROR(x)		(((x) & 0xF800) == 0x0800)
+#define	EXTRACT_TT_CODE(x)		(((x) >> 2) & 0x3)
+#define	EXTRACT_II_CODE(x)		(((x) >> 2) & 0x3)
+#define	EXTRACT_LL_CODE(x)		(((x) >> 0) & 0x3)
+#define	EXTRACT_RRRR_CODE(x)		(((x) >> 4) & 0xf)
+#define	EXTRACT_TO_CODE(x)		(((x) >> 8) & 0x1)
+#define	EXTRACT_PP_CODE(x)		(((x) >> 9) & 0x3)
+
+/*
+ * The following are for BUS type errors AFTER values have been normalized by
+ * shifting right
+ */
+#define K8_NBSL_PP_SRC			0x0
+#define K8_NBSL_PP_RES			0x1
+#define K8_NBSL_PP_OBS			0x2
+#define K8_NBSL_PP_GENERIC		0x3
+
+
+#define K8_NBSH				0x4C
+
+#define K8_NBSH_VALID_BIT		BIT(31)
+#define K8_NBSH_OVERFLOW		BIT(30)
+#define K8_NBSH_UNCORRECTED_ERR		BIT(29)
+#define K8_NBSH_ERR_ENABLE		BIT(28)
+#define K8_NBSH_MISC_ERR_VALID		BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
+#define K8_NBSH_PCC			BIT(25)
+#define K8_NBSH_CECC			BIT(14)
+#define K8_NBSH_UECC			BIT(13)
+#define K8_NBSH_ERR_SCRUBER		BIT(8)
+#define K8_NBSH_CORE3			BIT(3)
+#define K8_NBSH_CORE2			BIT(2)
+#define K8_NBSH_CORE1			BIT(1)
+#define K8_NBSH_CORE0			BIT(0)
+
+#define EXTRACT_LDT_LINK(x)		(((x) >> 4) & 0x7)
+#define EXTRACT_ERR_CPU_MAP(x)		((x) & 0xF)
+#define EXTRACT_LOW_SYNDROME(x)		(((x) >> 15) & 0xff)
+
+
+#define K8_NBEAL			0x50
+#define K8_NBEAH			0x54
+#define K8_SCRCTRL			0x58
+
+#define F10_NB_CFG_LOW			0x88
+#define	F10_NB_CFG_LOW_ENABLE_EXT_CFG	BIT(14)
+
+#define F10_NB_CFG_HIGH			0x8C
+
+#define F10_ONLINE_SPARE		0xB0
+#define F10_ONLINE_SPARE_SWAPDONE0(x)	((x) & BIT(1))
+#define F10_ONLINE_SPARE_SWAPDONE1(x)	((x) & BIT(3))
+#define F10_ONLINE_SPARE_BADDRAM_CS0(x) (((x) >> 4) & 0x00000007)
+#define F10_ONLINE_SPARE_BADDRAM_CS1(x) (((x) >> 8) & 0x00000007)
+
+#define F10_NB_ARRAY_ADDR		0xB8
+
+#define F10_NB_ARRAY_DRAM_ECC		0x80000000
+
+/* Bits [2:1] are used to select 16-byte section within a 64-byte cacheline  */
+#define SET_NB_ARRAY_ADDRESS(section)	(((section) & 0x3) << 1)
+
+#define F10_NB_ARRAY_DATA		0xBC
+
+#define SET_NB_DRAM_INJECTION_WRITE(word, bits)  \
+					(BIT(((word) & 0xF) + 20) | \
+					BIT(17) |  \
+					((bits) & 0xF))
+
+#define SET_NB_DRAM_INJECTION_READ(word, bits)  \
+					(BIT(((word) & 0xF) + 20) | \
+					BIT(16) |  \
+					((bits) & 0xF))
+
+#define K8_NBCAP			0xE8
+#define K8_NBCAP_CORES			(BIT(12)|BIT(13))
+#define K8_NBCAP_CHIPKILL		BIT(4)
+#define K8_NBCAP_SECDED			BIT(3)
+#define K8_NBCAP_8_NODE			BIT(2)
+#define K8_NBCAP_DUAL_NODE		BIT(1)
+#define K8_NBCAP_DCT_DUAL		BIT(0)
+
+/*
+ * MSR Regs
+ */
+#define K8_MSR_MCGCTL			0x017b
+#define K8_MSR_MCGCTL_NBE		BIT(4)
+
+#define K8_MSR_MC4CTL			0x0410
+#define K8_MSR_MC4STAT			0x0411
+#define K8_MSR_MC4ADDR			0x0412
+
+/* AMD sets the first MC device at device ID 0x18. */
+static inline int get_mc_node_id_from_pdev(struct pci_dev *pdev)
+{
+	return PCI_SLOT(pdev->devfn) - 0x18;
+}
+
+enum amd64_chipset_families {
+	K8_CPUS = 0,
+	F10_CPUS,
+	F11_CPUS,
+};
+
+/*
+ * Structure to hold:
+ *
+ * 1) dynamically read status and error address HW registers
+ * 2) sysfs entered values
+ * 3) MCE values
+ *
+ * Depends on entry into the modules
+ */
+struct amd64_error_info_regs {
+	u32 nbcfg;
+	u32 nbsh;
+	u32 nbsl;
+	u32 nbeah;
+	u32 nbeal;
+};
+
+/* Error injection control structure */
+struct error_injection {
+	u32	section;
+	u32	word;
+	u32	bit_map;
+};
+
+struct amd64_pvt {
+	/* pci_device handles which we utilize */
+	struct pci_dev *addr_f1_ctl;
+	struct pci_dev *dram_f2_ctl;
+	struct pci_dev *misc_f3_ctl;
+
+	int mc_node_id;		/* MC index of this MC node */
+	int ext_model;		/* extended model value of this node */
+
+	struct low_ops *ops;	/* pointer to per PCI Device ID func table */
+
+	int channel_count;
+
+	/* Raw registers */
+	u32 dclr0;		/* DRAM Configuration Low DCT0 reg */
+	u32 dclr1;		/* DRAM Configuration Low DCT1 reg */
+	u32 dchr0;		/* DRAM Configuration High DCT0 reg */
+	u32 dchr1;		/* DRAM Configuration High DCT1 reg */
+	u32 nbcap;		/* North Bridge Capabilities */
+	u32 nbcfg;		/* F10 North Bridge Configuration */
+	u32 ext_nbcfg;		/* Extended F10 North Bridge Configuration */
+	u32 dhar;		/* DRAM Hoist reg */
+	u32 dbam0;		/* DRAM Base Address Mapping reg for DCT0 */
+	u32 dbam1;		/* DRAM Base Address Mapping reg for DCT1 */
+
+	/* DRAM CS Base Address Registers F2x[1,0][5C:40] */
+	u32 dcsb0[CHIPSELECT_COUNT];
+	u32 dcsb1[CHIPSELECT_COUNT];
+
+	/* DRAM CS Mask Registers F2x[1,0][6C:60] */
+	u32 dcsm0[CHIPSELECT_COUNT];
+	u32 dcsm1[CHIPSELECT_COUNT];
+
+	/*
+	 * Decoded parts of DRAM BASE and LIMIT Registers
+	 * F1x[78,70,68,60,58,50,48,40]
+	 */
+	u64 dram_base[DRAM_REG_COUNT];
+	u64 dram_limit[DRAM_REG_COUNT];
+	u8  dram_IntlvSel[DRAM_REG_COUNT];
+	u8  dram_IntlvEn[DRAM_REG_COUNT];
+	u8  dram_DstNode[DRAM_REG_COUNT];
+	u8  dram_rw_en[DRAM_REG_COUNT];
+
+	/*
+	 * The following fields are set at (load) run time, after CPU revision
+	 * has been determined, since the dct_base and dct_mask registers vary
+	 * based on revision
+	 */
+	u32 dcsb_base;		/* DCSB base bits */
+	u32 dcsm_mask;		/* DCSM mask bits */
+	u32 num_dcsm;		/* Number of DCSM registers */
+	u32 dcs_mask_notused;	/* DCSM notused mask bits */
+	u32 dcs_shift;		/* DCSB and DCSM shift value */
+
+	u64 top_mem;		/* top of memory below 4GB */
+	u64 top_mem2;		/* top of memory above 4GB */
+
+	u32 dram_ctl_select_low;	/* DRAM Controller Select Low Reg */
+	u32 dram_ctl_select_high;	/* DRAM Controller Select High Reg */
+	u32 online_spare;               /* On-Line spare Reg */
+
+	/* temp storage for when input is received from sysfs */
+	struct amd64_error_info_regs ctl_error_info;
+
+	/* place to store error injection parameters prior to issue */
+	struct error_injection injection;
+
+	/* Save old hw registers' values before we modified them */
+	u32 nbctl_mcgctl_saved;		/* When true, following 2 are valid */
+	u32 old_nbctl;
+	u32 *old_mcgctl;		/* per core on this node */
+
+	/* MC Type Index value: socket F vs Family 10h */
+	u32 mc_type_index;
+
+	/* misc settings */
+	struct flags {
+		unsigned long cf8_extcfg:1;
+	} flags;
+};
+
+struct scrubrate {
+       u32 scrubval;           /* bit pattern for scrub rate */
+       u32 bandwidth;          /* bandwidth consumed (bytes/sec) */
+};
+
+extern struct scrubrate scrubrates[23];
+extern u32 revf_quad_ddr2_shift[16];
+extern const char *tt_msgs[4];
+extern const char *ll_msgs[4];
+extern const char *rrrr_msgs[16];
+extern const char *to_msgs[2];
+extern const char *pp_msgs[4];
+extern const char *ii_msgs[4];
+extern const char *ext_msgs[32];
+extern const char *htlink_msgs[8];
+
+/*
+ * Each of the PCI Device IDs types have their own set of hardware accessor
+ * functions and per device encoding/decoding logic.
+ */
+struct low_ops {
+	int (*probe_valid_hardware)(struct amd64_pvt *pvt);
+	int (*early_channel_count)(struct amd64_pvt *pvt);
+
+	u64 (*get_error_address)(struct mem_ctl_info *mci,
+			struct amd64_error_info_regs *info);
+	void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
+	void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
+	void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
+					struct amd64_error_info_regs *info,
+					u64 SystemAddr);
+	int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
+};
+
+struct amd64_family_type {
+	const char *ctl_name;
+	u16 addr_f1_ctl;
+	u16 misc_f3_ctl;
+	struct low_ops ops;
+};
+
+static struct amd64_family_type amd64_family_types[];
+
+static inline const char *get_amd_family_name(int index)
+{
+	return amd64_family_types[index].ctl_name;
+}
+
+static inline struct low_ops *family_ops(int index)
+{
+	return &amd64_family_types[index].ops;
+}
+
+/*
+ * For future CPU versions, verify the following as new 'slow' rates appear and
+ * modify the necessary skip values for the supported CPU.
+ */
+#define K8_MIN_SCRUB_RATE_BITS	0x0
+#define F10_MIN_SCRUB_RATE_BITS	0x5
+#define F11_MIN_SCRUB_RATE_BITS	0x6
+
+int amd64_process_error_info(struct mem_ctl_info *mci,
+			     struct amd64_error_info_regs *info,
+			     int handle_errors);
+int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
+			     u64 *hole_offset, u64 *hole_size);
-- 
cgit v0.10.2


From fd3d6780f7358225a6ea1ffb1bb8fd8b2d6df446 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:50:23 +0200
Subject: amd64_edac: add debugging/testing code

This is for dumping different registers and testing the address mapping
logic using the ECC syndromes.

Borislav:

- split sysfs attrs per file
- use more conform names for the sysfs attrs
- fix function return value patterns
- cleanup/fix comments
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
new file mode 100644
index 0000000..0a41b24
--- /dev/null
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -0,0 +1,255 @@
+#include "amd64_edac.h"
+
+/*
+ * accept a hex value and store it into the virtual error register file, field:
+ * nbeal and nbeah. Assume virtual error values have already been set for: NBSL,
+ * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and
+ * CHANNEL
+ */
+static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long long value;
+	int ret = 0;
+
+	ret = strict_strtoull(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBEA= 0x%llx\n", value);
+
+		/* place the value into the virtual error packet */
+		pvt->ctl_error_info.nbeal = (u32) value;
+		value >>= 32;
+		pvt->ctl_error_info.nbeah = (u32) value;
+
+		/* Process the Mapping request */
+		/* TODO: Add race prevention */
+		amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
+static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 value;
+
+	value = pvt->ctl_error_info.nbeah;
+	value <<= 32;
+	value |= pvt->ctl_error_info.nbeal;
+
+	return sprintf(data, "%llx\n", value);
+}
+
+/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
+static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBSL= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbsl = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBSL value written */
+static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 value;
+
+	value = pvt->ctl_error_info.nbsl;
+
+	return sprintf(data, "%x\n", value);
+}
+
+/* store the NBSH (MCA NB Status High) value user desires */
+static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBSH= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbsh = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBSH value written */
+static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 value;
+
+	value = pvt->ctl_error_info.nbsh;
+
+	return sprintf(data, "%x\n", value);
+}
+
+/* accept and store the NBCFG (MCA NB Configuration) value user desires */
+static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBCFG= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbcfg = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* various show routines for the controls of a MCI */
+static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
+}
+
+
+static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->dhar);
+}
+
+
+static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->dbam0);
+}
+
+
+static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%llx\n", pvt->top_mem);
+}
+
+
+static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%llx\n", pvt->top_mem2);
+}
+
+static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
+{
+	u64 hole_base = 0;
+	u64 hole_offset = 0;
+	u64 hole_size = 0;
+
+	amd64_get_dram_hole_info(mci, &hole_base, &hole_offset, &hole_size);
+
+	return sprintf(data, "%llx %llx %llx\n", hole_base, hole_offset,
+						 hole_size);
+}
+
+/*
+ * update NUM_DBG_ATTRS in case you add new members
+ */
+struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
+
+	{
+		.attr = {
+			.name = "nbea_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbea_show,
+		.store = amd64_nbea_store,
+	},
+	{
+		.attr = {
+			.name = "nbsl_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbsl_show,
+		.store = amd64_nbsl_store,
+	},
+	{
+		.attr = {
+			.name = "nbsh_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbsh_show,
+		.store = amd64_nbsh_store,
+	},
+	{
+		.attr = {
+			.name = "nbcfg_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbcfg_show,
+		.store = amd64_nbcfg_store,
+	},
+	{
+		.attr = {
+			.name = "dhar",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_dhar_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "dbam",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_dbam_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "topmem",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_topmem_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "topmem2",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_topmem2_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "dram_hole",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_hole_show,
+		.store = NULL,
+	},
+};
-- 
cgit v0.10.2


From eb919690be994386eac326f8c53c4540602de563 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Tue, 5 May 2009 20:07:11 +0200
Subject: amd64_edac: add DRAM error injection logic using sysfs

Borislav:
- rename sysfs attrs to more conform names
- cleanup/fix comments according to BKDG text
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac_inj.c b/drivers/edac/amd64_edac_inj.c
new file mode 100644
index 0000000..d3675b7
--- /dev/null
+++ b/drivers/edac/amd64_edac_inj.c
@@ -0,0 +1,185 @@
+#include "amd64_edac.h"
+
+/*
+ * store error injection section value which refers to one of 4 16-byte sections
+ * within a 64-byte cacheline
+ *
+ * range: 0..3
+ */
+static ssize_t amd64_inject_section_store(struct mem_ctl_info *mci,
+					  const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+		pvt->injection.section = (u32) value;
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * store error injection word value which refers to one of 9 16-bit word of the
+ * 16-byte (128-bit + ECC bits) section
+ *
+ * range: 0..8
+ */
+static ssize_t amd64_inject_word_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		value = (value <= 8) ? value : 0;
+		pvt->injection.word = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * store 16 bit error injection vector which enables injecting errors to the
+ * corresponding bit within the error injection word above. When used during a
+ * DRAM ECC read, it holds the contents of the of the DRAM ECC bits.
+ */
+static ssize_t amd64_inject_ecc_vector_store(struct mem_ctl_info *mci,
+					     const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+
+		pvt->injection.bit_map = (u32) value & 0xFFFF;
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * Do a DRAM ECC read. Assemble staged values in the pvt area, format into
+ * fields needed by the injection registers and read the NB Array Data Port.
+ */
+static ssize_t amd64_inject_read_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	u32 section, word_bits;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		/* Form value to choose 16-byte section of cacheline */
+		section = F10_NB_ARRAY_DRAM_ECC |
+				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_ADDR, section);
+
+		word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection.word,
+						pvt->injection.bit_map);
+
+		/* Issue 'word' and 'bit' along with the READ request */
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_DATA, word_bits);
+
+		debugf0("section=0x%x word_bits=0x%x\n", section, word_bits);
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * Do a DRAM ECC write. Assemble staged values in the pvt area and format into
+ * fields needed by the injection registers.
+ */
+static ssize_t amd64_inject_write_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	u32 section, word_bits;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		/* Form value to choose 16-byte section of cacheline */
+		section = F10_NB_ARRAY_DRAM_ECC |
+				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_ADDR, section);
+
+		word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection.word,
+						pvt->injection.bit_map);
+
+		/* Issue 'word' and 'bit' along with the READ request */
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_DATA, word_bits);
+
+		debugf0("section=0x%x word_bits=0x%x\n", section, word_bits);
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * update NUM_INJ_ATTRS in case you add new members
+ */
+struct mcidev_sysfs_attribute amd64_inj_attrs[] = {
+
+	{
+		.attr = {
+			.name = "inject_section",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_section_store,
+	},
+	{
+		.attr = {
+			.name = "inject_word",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_word_store,
+	},
+	{
+		.attr = {
+			.name = "inject_ecc_vector",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_ecc_vector_store,
+	},
+	{
+		.attr = {
+			.name = "inject_write",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_write_store,
+	},
+	{
+		.attr = {
+			.name = "inject_read",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_read_store,
+	},
+};
-- 
cgit v0.10.2


From b52401cecedf669211f0b7a5095abcb6d3fc82c2 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Wed, 6 May 2009 17:57:20 +0200
Subject: amd64_edac: add MCA error types

Borislav:
- cleanup comments

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
new file mode 100644
index 0000000..f212ff1
--- /dev/null
+++ b/drivers/edac/amd64_edac_err_types.c
@@ -0,0 +1,161 @@
+#include "amd64_edac.h"
+
+/*
+ * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
+ * for DDR2 DRAM mapping.
+ */
+u32 revf_quad_ddr2_shift[] = {
+	0,	/* 0000b NULL DIMM (128mb) */
+	28,	/* 0001b 256mb */
+	29,	/* 0010b 512mb */
+	29,	/* 0011b 512mb */
+	29,	/* 0100b 512mb */
+	30,	/* 0101b 1gb */
+	30,	/* 0110b 1gb */
+	31,	/* 0111b 2gb */
+	31,	/* 1000b 2gb */
+	32,	/* 1001b 4gb */
+	32,	/* 1010b 4gb */
+	33,	/* 1011b 8gb */
+	0,	/* 1100b future */
+	0,	/* 1101b future */
+	0,	/* 1110b future */
+	0	/* 1111b future */
+};
+
+/*
+ * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
+ * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
+ * or higher value'.
+ *
+ *FIXME: Produce a better mapping/linearisation.
+ */
+
+struct scrubrate scrubrates[] = {
+	{ 0x01, 1600000000UL},
+	{ 0x02, 800000000UL},
+	{ 0x03, 400000000UL},
+	{ 0x04, 200000000UL},
+	{ 0x05, 100000000UL},
+	{ 0x06, 50000000UL},
+	{ 0x07, 25000000UL},
+	{ 0x08, 12284069UL},
+	{ 0x09, 6274509UL},
+	{ 0x0A, 3121951UL},
+	{ 0x0B, 1560975UL},
+	{ 0x0C, 781440UL},
+	{ 0x0D, 390720UL},
+	{ 0x0E, 195300UL},
+	{ 0x0F, 97650UL},
+	{ 0x10, 48854UL},
+	{ 0x11, 24427UL},
+	{ 0x12, 12213UL},
+	{ 0x13, 6101UL},
+	{ 0x14, 3051UL},
+	{ 0x15, 1523UL},
+	{ 0x16, 761UL},
+	{ 0x00, 0UL},        /* scrubbing off */
+};
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+const char *tt_msgs[] = {        /* transaction type */
+	"instruction",
+	"data",
+	"generic",
+	"reserved"
+};
+
+const char *ll_msgs[] = {	/* cache level */
+	"L0",
+	"L1",
+	"L2",
+	"L3/generic"
+};
+
+const char *rrrr_msgs[] = {
+	"generic",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"inst fetch",
+	"prefetch",
+	"evict",
+	"snoop",
+	"reserved RRRR= 9",
+	"reserved RRRR= 10",
+	"reserved RRRR= 11",
+	"reserved RRRR= 12",
+	"reserved RRRR= 13",
+	"reserved RRRR= 14",
+	"reserved RRRR= 15"
+};
+
+const char *pp_msgs[] = {	/* participating processor */
+	"local node originated (SRC)",
+	"local node responded to request (RES)",
+	"local node observed as 3rd party (OBS)",
+	"generic"
+};
+
+const char *to_msgs[] = {
+	"no timeout",
+	"timed out"
+};
+
+const char *ii_msgs[] = {	/* memory or i/o */
+	"mem access",
+	"reserved",
+	"i/o access",
+	"generic"
+};
+
+/* Map the 5 bits of Extended Error code to the string table. */
+const char *ext_msgs[] = {	/* extended error */
+	"K8 ECC error/F10 reserved",	/* 0_0000b */
+	"CRC error",			/* 0_0001b */
+	"sync error",			/* 0_0010b */
+	"mst abort",			/* 0_0011b */
+	"tgt abort",			/* 0_0100b */
+	"GART error",			/* 0_0101b */
+	"RMW error",			/* 0_0110b */
+	"Wdog timer error",		/* 0_0111b */
+	"F10-ECC/K8-Chipkill error",	/* 0_1000b */
+	"DEV Error",			/* 0_1001b */
+	"Link Data error",		/* 0_1010b */
+	"Link or L3 Protocol error",	/* 0_1011b */
+	"NB Array error",		/* 0_1100b */
+	"DRAM Parity error",		/* 0_1101b */
+	"Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
+	"Res 0x0ff error",		/* 0_1111b */
+	"Res 0x100 error",		/* 1_0000b */
+	"Res 0x101 error",		/* 1_0001b */
+	"Res 0x102 error",		/* 1_0010b */
+	"Res 0x103 error",		/* 1_0011b */
+	"Res 0x104 error",		/* 1_0100b */
+	"Res 0x105 error",		/* 1_0101b */
+	"Res 0x106 error",		/* 1_0110b */
+	"Res 0x107 error",		/* 1_0111b */
+	"Res 0x108 error",		/* 1_1000b */
+	"Res 0x109 error",		/* 1_1001b */
+	"Res 0x10A error",		/* 1_1010b */
+	"Res 0x10B error",		/* 1_1011b */
+	"L3 Cache Data error",		/* 1_1100b */
+	"L3 CacheTag error",		/* 1_1101b */
+	"L3 Cache LRU error",		/* 1_1110b */
+	"Res 0x1FF error"		/* 1_1111b */
+};
+
+const char *htlink_msgs[] = {
+	"none",
+	"1",
+	"2",
+	"1 2",
+	"3",
+	"1 3",
+	"2 3",
+	"1 2 3"
+};
-- 
cgit v0.10.2


From 2bc65418724a0ddbb20c9eeb9de8285af7748c16 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 20:11:14 +0200
Subject: amd64_edac: add memory scrubber interface

Borislav:
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
new file mode 100644
index 0000000..6876d43
--- /dev/null
+++ b/drivers/edac/amd64_edac.c
@@ -0,0 +1,130 @@
+#include "amd64_edac.h"
+
+static struct edac_pci_ctl_info *amd64_ctl_pci;
+
+static int report_gart_errors;
+module_param(report_gart_errors, int, 0644);
+
+/*
+ * Set by command line parameter. If BIOS has enabled the ECC, this override is
+ * cleared to prevent re-enabling the hardware by this driver.
+ */
+static int ecc_enable_override;
+module_param(ecc_enable_override, int, 0644);
+
+/* Lookup table for all possible MC control instances */
+struct amd64_pvt;
+static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
+static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
+
+/*
+ * Memory scrubber control interface. For K8, memory scrubbing is handled by
+ * hardware and can involve L2 cache, dcache as well as the main memory. With
+ * F10, this is extended to L3 cache scrubbing on CPU models sporting that
+ * functionality.
+ *
+ * This causes the "units" for the scrubbing speed to vary from 64 byte blocks
+ * (dram) over to cache lines. This is nasty, so we will use bandwidth in
+ * bytes/sec for the setting.
+ *
+ * Currently, we only do dram scrubbing. If the scrubbing is done in software on
+ * other archs, we might not have access to the caches directly.
+ */
+
+/*
+ * scan the scrub rate mapping table for a close or matching bandwidth value to
+ * issue. If requested is too big, then use last maximum value found.
+ */
+static int amd64_search_set_scrub_rate(struct pci_dev *ctl, u32 new_bw,
+				       u32 min_scrubrate)
+{
+	u32 scrubval;
+	int i;
+
+	/*
+	 * map the configured rate (new_bw) to a value specific to the AMD64
+	 * memory controller and apply to register. Search for the first
+	 * bandwidth entry that is greater or equal than the setting requested
+	 * and program that. If at last entry, turn off DRAM scrubbing.
+	 */
+	for (i = 0; i < ARRAY_SIZE(scrubrates); i++) {
+		/*
+		 * skip scrub rates which aren't recommended
+		 * (see F10 BKDG, F3x58)
+		 */
+		if (scrubrates[i].scrubval < min_scrubrate)
+			continue;
+
+		if (scrubrates[i].bandwidth <= new_bw)
+			break;
+
+		/*
+		 * if no suitable bandwidth found, turn off DRAM scrubbing
+		 * entirely by falling back to the last element in the
+		 * scrubrates array.
+		 */
+	}
+
+	scrubval = scrubrates[i].scrubval;
+	if (scrubval)
+		edac_printk(KERN_DEBUG, EDAC_MC,
+			    "Setting scrub rate bandwidth: %u\n",
+			    scrubrates[i].bandwidth);
+	else
+		edac_printk(KERN_DEBUG, EDAC_MC, "Turning scrubbing off.\n");
+
+	pci_write_bits32(ctl, K8_SCRCTRL, scrubval, 0x001F);
+
+	return 0;
+}
+
+static int amd64_set_scrub_rate(struct mem_ctl_info *mci, u32 *bandwidth)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 min_scrubrate = 0x0;
+
+	switch (boot_cpu_data.x86) {
+	case 0xf:
+		min_scrubrate = K8_MIN_SCRUB_RATE_BITS;
+		break;
+	case 0x10:
+		min_scrubrate = F10_MIN_SCRUB_RATE_BITS;
+		break;
+	case 0x11:
+		min_scrubrate = F11_MIN_SCRUB_RATE_BITS;
+		break;
+
+	default:
+		amd64_printk(KERN_ERR, "Unsupported family!\n");
+		break;
+	}
+	return amd64_search_set_scrub_rate(pvt->misc_f3_ctl, *bandwidth,
+			min_scrubrate);
+}
+
+static int amd64_get_scrub_rate(struct mem_ctl_info *mci, u32 *bw)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 scrubval = 0;
+	int status = -1, i, ret = 0;
+
+	ret = pci_read_config_dword(pvt->misc_f3_ctl, K8_SCRCTRL, &scrubval);
+	if (ret)
+		debugf0("Reading K8_SCRCTRL failed\n");
+
+	scrubval = scrubval & 0x001F;
+
+	edac_printk(KERN_DEBUG, EDAC_MC,
+		    "pci-read, sdram scrub control value: %d \n", scrubval);
+
+	for (i = 0; ARRAY_SIZE(scrubrates); i++) {
+		if (scrubrates[i].scrubval == scrubval) {
+			*bw = scrubrates[i].bandwidth;
+			status = 0;
+			break;
+		}
+	}
+
+	return status;
+}
+
-- 
cgit v0.10.2


From 6775763a2377e1ea865299b6daadc875d622de3f Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 15:53:22 +0200
Subject: amd64_edac: add sys addr to memory controller mapping helpers

Borislav:

- cleanup comments
- cleanup debug calls
- simplify find_mc_by_sys_addr's exit path

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 6876d43..a774f34 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -128,3 +128,144 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci, u32 *bw)
 	return status;
 }
 
+/* Map from a CSROW entry to the mask entry that operates on it */
+static inline u32 amd64_map_to_dcs_mask(struct amd64_pvt *pvt, int csrow)
+{
+	return csrow >> (pvt->num_dcsm >> 3);
+}
+
+/* return the 'base' address the i'th CS entry of the 'dct' DRAM controller */
+static u32 amd64_get_dct_base(struct amd64_pvt *pvt, int dct, int csrow)
+{
+	if (dct == 0)
+		return pvt->dcsb0[csrow];
+	else
+		return pvt->dcsb1[csrow];
+}
+
+/*
+ * Return the 'mask' address the i'th CS entry. This function is needed because
+ * there number of DCSM registers on Rev E and prior vs Rev F and later is
+ * different.
+ */
+static u32 amd64_get_dct_mask(struct amd64_pvt *pvt, int dct, int csrow)
+{
+	if (dct == 0)
+		return pvt->dcsm0[amd64_map_to_dcs_mask(pvt, csrow)];
+	else
+		return pvt->dcsm1[amd64_map_to_dcs_mask(pvt, csrow)];
+}
+
+
+/*
+ * In *base and *limit, pass back the full 40-bit base and limit physical
+ * addresses for the node given by node_id.  This information is obtained from
+ * DRAM Base (section 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers. The
+ * base and limit addresses are of type SysAddr, as defined at the start of
+ * section 3.4.4 (p. 70).  They are the lowest and highest physical addresses
+ * in the address range they represent.
+ */
+static void amd64_get_base_and_limit(struct amd64_pvt *pvt, int node_id,
+			       u64 *base, u64 *limit)
+{
+	*base = pvt->dram_base[node_id];
+	*limit = pvt->dram_limit[node_id];
+}
+
+/*
+ * Return 1 if the SysAddr given by sys_addr matches the base/limit associated
+ * with node_id
+ */
+static int amd64_base_limit_match(struct amd64_pvt *pvt,
+					u64 sys_addr, int node_id)
+{
+	u64 base, limit, addr;
+
+	amd64_get_base_and_limit(pvt, node_id, &base, &limit);
+
+	/* The K8 treats this as a 40-bit value.  However, bits 63-40 will be
+	 * all ones if the most significant implemented address bit is 1.
+	 * Here we discard bits 63-40.  See section 3.4.2 of AMD publication
+	 * 24592: AMD x86-64 Architecture Programmer's Manual Volume 1
+	 * Application Programming.
+	 */
+	addr = sys_addr & 0x000000ffffffffffull;
+
+	return (addr >= base) && (addr <= limit);
+}
+
+/*
+ * Attempt to map a SysAddr to a node. On success, return a pointer to the
+ * mem_ctl_info structure for the node that the SysAddr maps to.
+ *
+ * On failure, return NULL.
+ */
+static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
+						u64 sys_addr)
+{
+	struct amd64_pvt *pvt;
+	int node_id;
+	u32 intlv_en, bits;
+
+	/*
+	 * Here we use the DRAM Base (section 3.4.4.1) and DRAM Limit (section
+	 * 3.4.4.2) registers to map the SysAddr to a node ID.
+	 */
+	pvt = mci->pvt_info;
+
+	/*
+	 * The value of this field should be the same for all DRAM Base
+	 * registers.  Therefore we arbitrarily choose to read it from the
+	 * register for node 0.
+	 */
+	intlv_en = pvt->dram_IntlvEn[0];
+
+	if (intlv_en == 0) {
+		for (node_id = 0; ; ) {
+			if (amd64_base_limit_match(pvt, sys_addr, node_id))
+				break;
+
+			if (++node_id >= DRAM_REG_COUNT)
+				goto err_no_match;
+		}
+		goto found;
+	}
+
+	if (unlikely((intlv_en != (0x01 << 8)) &&
+		     (intlv_en != (0x03 << 8)) &&
+		     (intlv_en != (0x07 << 8)))) {
+		amd64_printk(KERN_WARNING, "junk value of 0x%x extracted from "
+			     "IntlvEn field of DRAM Base Register for node 0: "
+			     "This probably indicates a BIOS bug.\n", intlv_en);
+		return NULL;
+	}
+
+	bits = (((u32) sys_addr) >> 12) & intlv_en;
+
+	for (node_id = 0; ; ) {
+		if ((pvt->dram_limit[node_id] & intlv_en) == bits)
+			break;	/* intlv_sel field matches */
+
+		if (++node_id >= DRAM_REG_COUNT)
+			goto err_no_match;
+	}
+
+	/* sanity test for sys_addr */
+	if (unlikely(!amd64_base_limit_match(pvt, sys_addr, node_id))) {
+		amd64_printk(KERN_WARNING,
+			  "%s(): sys_addr 0x%lx falls outside base/limit "
+			  "address range for node %d with node interleaving "
+			  "enabled.\n", __func__, (unsigned long)sys_addr,
+			  node_id);
+		return NULL;
+	}
+
+found:
+	return edac_mc_find(node_id);
+
+err_no_match:
+	debugf2("sys_addr 0x%lx doesn't match any node\n",
+		(unsigned long)sys_addr);
+
+	return NULL;
+}
-- 
cgit v0.10.2


From e2ce7255e84db656853e91536e6023f92ff89f97 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 15:57:12 +0200
Subject: amd64_edac: add functionality to compute the DRAM hole

Borislav:

- cleanup/fix comments, add BKDG refs
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index a774f34..4716fb5 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -269,3 +269,169 @@ err_no_match:
 
 	return NULL;
 }
+
+/*
+ * Extract the DRAM CS base address from selected csrow register.
+ */
+static u64 base_from_dct_base(struct amd64_pvt *pvt, int csrow)
+{
+	return ((u64) (amd64_get_dct_base(pvt, 0, csrow) & pvt->dcsb_base)) <<
+				pvt->dcs_shift;
+}
+
+/*
+ * Extract the mask from the dcsb0[csrow] entry in a CPU revision-specific way.
+ */
+static u64 mask_from_dct_mask(struct amd64_pvt *pvt, int csrow)
+{
+	u64 dcsm_bits, other_bits;
+	u64 mask;
+
+	/* Extract bits from DRAM CS Mask. */
+	dcsm_bits = amd64_get_dct_mask(pvt, 0, csrow) & pvt->dcsm_mask;
+
+	other_bits = pvt->dcsm_mask;
+	other_bits = ~(other_bits << pvt->dcs_shift);
+
+	/*
+	 * The extracted bits from DCSM belong in the spaces represented by
+	 * the cleared bits in other_bits.
+	 */
+	mask = (dcsm_bits << pvt->dcs_shift) | other_bits;
+
+	return mask;
+}
+
+/*
+ * @input_addr is an InputAddr associated with the node given by mci. Return the
+ * csrow that input_addr maps to, or -1 on failure (no csrow claims input_addr).
+ */
+static int input_addr_to_csrow(struct mem_ctl_info *mci, u64 input_addr)
+{
+	struct amd64_pvt *pvt;
+	int csrow;
+	u64 base, mask;
+
+	pvt = mci->pvt_info;
+
+	/*
+	 * Here we use the DRAM CS Base and DRAM CS Mask registers. For each CS
+	 * base/mask register pair, test the condition shown near the start of
+	 * section 3.5.4 (p. 84, BKDG #26094, K8, revA-E).
+	 */
+	for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+
+		/* This DRAM chip select is disabled on this node */
+		if ((pvt->dcsb0[csrow] & K8_DCSB_CS_ENABLE) == 0)
+			continue;
+
+		base = base_from_dct_base(pvt, csrow);
+		mask = ~mask_from_dct_mask(pvt, csrow);
+
+		if ((input_addr & mask) == (base & mask)) {
+			debugf2("InputAddr 0x%lx matches csrow %d (node %d)\n",
+				(unsigned long)input_addr, csrow,
+				pvt->mc_node_id);
+
+			return csrow;
+		}
+	}
+
+	debugf2("no matching csrow for InputAddr 0x%lx (MC node %d)\n",
+		(unsigned long)input_addr, pvt->mc_node_id);
+
+	return -1;
+}
+
+/*
+ * Return the base value defined by the DRAM Base register for the node
+ * represented by mci.  This function returns the full 40-bit value despite the
+ * fact that the register only stores bits 39-24 of the value. See section
+ * 3.4.4.1 (BKDG #26094, K8, revA-E)
+ */
+static inline u64 get_dram_base(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return pvt->dram_base[pvt->mc_node_id];
+}
+
+/*
+ * Obtain info from the DRAM Hole Address Register (section 3.4.8, pub #26094)
+ * for the node represented by mci. Info is passed back in *hole_base,
+ * *hole_offset, and *hole_size.  Function returns 0 if info is valid or 1 if
+ * info is invalid. Info may be invalid for either of the following reasons:
+ *
+ * - The revision of the node is not E or greater.  In this case, the DRAM Hole
+ *   Address Register does not exist.
+ *
+ * - The DramHoleValid bit is cleared in the DRAM Hole Address Register,
+ *   indicating that its contents are not valid.
+ *
+ * The values passed back in *hole_base, *hole_offset, and *hole_size are
+ * complete 32-bit values despite the fact that the bitfields in the DHAR
+ * only represent bits 31-24 of the base and offset values.
+ */
+int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
+			     u64 *hole_offset, u64 *hole_size)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 base;
+
+	/* only revE and later have the DRAM Hole Address Register */
+	if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_E) {
+		debugf1("  revision %d for node %d does not support DHAR\n",
+			pvt->ext_model, pvt->mc_node_id);
+		return 1;
+	}
+
+	/* only valid for Fam10h */
+	if (boot_cpu_data.x86 == 0x10 &&
+	    (pvt->dhar & F10_DRAM_MEM_HOIST_VALID) == 0) {
+		debugf1("  Dram Memory Hoisting is DISABLED on this system\n");
+		return 1;
+	}
+
+	if ((pvt->dhar & DHAR_VALID) == 0) {
+		debugf1("  Dram Memory Hoisting is DISABLED on this node %d\n",
+			pvt->mc_node_id);
+		return 1;
+	}
+
+	/* This node has Memory Hoisting */
+
+	/* +------------------+--------------------+--------------------+-----
+	 * | memory           | DRAM hole          | relocated          |
+	 * | [0, (x - 1)]     | [x, 0xffffffff]    | addresses from     |
+	 * |                  |                    | DRAM hole          |
+	 * |                  |                    | [0x100000000,      |
+	 * |                  |                    |  (0x100000000+     |
+	 * |                  |                    |   (0xffffffff-x))] |
+	 * +------------------+--------------------+--------------------+-----
+	 *
+	 * Above is a diagram of physical memory showing the DRAM hole and the
+	 * relocated addresses from the DRAM hole.  As shown, the DRAM hole
+	 * starts at address x (the base address) and extends through address
+	 * 0xffffffff.  The DRAM Hole Address Register (DHAR) relocates the
+	 * addresses in the hole so that they start at 0x100000000.
+	 */
+
+	base = dhar_base(pvt->dhar);
+
+	*hole_base = base;
+	*hole_size = (0x1ull << 32) - base;
+
+	if (boot_cpu_data.x86 > 0xf)
+		*hole_offset = f10_dhar_offset(pvt->dhar);
+	else
+		*hole_offset = k8_dhar_offset(pvt->dhar);
+
+	debugf1("  DHAR info for node %d base 0x%lx offset 0x%lx size 0x%lx\n",
+		pvt->mc_node_id, (unsigned long)*hole_base,
+		(unsigned long)*hole_offset, (unsigned long)*hole_size);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(amd64_get_dram_hole_info);
+
+
-- 
cgit v0.10.2


From 93c2df58b5b1a434cca8f60067e0e12d1942b7f1 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 20:46:50 +0200
Subject: amd64_edac: add DRAM address type conversion facilities

Borislav:

- cleanup/fix comments, add BKDG refs
- fix function return value patterns
- cleanup dbg calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 4716fb5..28f85c9e 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -434,4 +434,298 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 }
 EXPORT_SYMBOL_GPL(amd64_get_dram_hole_info);
 
+/*
+ * Return the DramAddr that the SysAddr given by @sys_addr maps to.  It is
+ * assumed that sys_addr maps to the node given by mci.
+ *
+ * The first part of section 3.4.4 (p. 70) shows how the DRAM Base (section
+ * 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers are used to translate a
+ * SysAddr to a DramAddr. If the DRAM Hole Address Register (DHAR) is enabled,
+ * then it is also involved in translating a SysAddr to a DramAddr. Sections
+ * 3.4.8 and 3.5.8.2 describe the DHAR and how it is used for memory hoisting.
+ * These parts of the documentation are unclear. I interpret them as follows:
+ *
+ * When node n receives a SysAddr, it processes the SysAddr as follows:
+ *
+ * 1. It extracts the DRAMBase and DRAMLimit values from the DRAM Base and DRAM
+ *    Limit registers for node n. If the SysAddr is not within the range
+ *    specified by the base and limit values, then node n ignores the Sysaddr
+ *    (since it does not map to node n). Otherwise continue to step 2 below.
+ *
+ * 2. If the DramHoleValid bit of the DHAR for node n is clear, the DHAR is
+ *    disabled so skip to step 3 below. Otherwise see if the SysAddr is within
+ *    the range of relocated addresses (starting at 0x100000000) from the DRAM
+ *    hole. If not, skip to step 3 below. Else get the value of the
+ *    DramHoleOffset field from the DHAR. To obtain the DramAddr, subtract the
+ *    offset defined by this value from the SysAddr.
+ *
+ * 3. Obtain the base address for node n from the DRAMBase field of the DRAM
+ *    Base register for node n. To obtain the DramAddr, subtract the base
+ *    address from the SysAddr, as shown near the start of section 3.4.4 (p.70).
+ */
+static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	u64 dram_base, hole_base, hole_offset, hole_size, dram_addr;
+	int ret = 0;
+
+	dram_base = get_dram_base(mci);
+
+	ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
+				      &hole_size);
+	if (!ret) {
+		if ((sys_addr >= (1ull << 32)) &&
+		    (sys_addr < ((1ull << 32) + hole_size))) {
+			/* use DHAR to translate SysAddr to DramAddr */
+			dram_addr = sys_addr - hole_offset;
+
+			debugf2("using DHAR to translate SysAddr 0x%lx to "
+				"DramAddr 0x%lx\n",
+				(unsigned long)sys_addr,
+				(unsigned long)dram_addr);
+
+			return dram_addr;
+		}
+	}
+
+	/*
+	 * Translate the SysAddr to a DramAddr as shown near the start of
+	 * section 3.4.4 (p. 70).  Although sys_addr is a 64-bit value, the k8
+	 * only deals with 40-bit values.  Therefore we discard bits 63-40 of
+	 * sys_addr below.  If bit 39 of sys_addr is 1 then the bits we
+	 * discard are all 1s.  Otherwise the bits we discard are all 0s.  See
+	 * section 3.4.2 of AMD publication 24592: AMD x86-64 Architecture
+	 * Programmer's Manual Volume 1 Application Programming.
+	 */
+	dram_addr = (sys_addr & 0xffffffffffull) - dram_base;
+
+	debugf2("using DRAM Base register to translate SysAddr 0x%lx to "
+		"DramAddr 0x%lx\n", (unsigned long)sys_addr,
+		(unsigned long)dram_addr);
+	return dram_addr;
+}
+
+/*
+ * @intlv_en is the value of the IntlvEn field from a DRAM Base register
+ * (section 3.4.4.1).  Return the number of bits from a SysAddr that are used
+ * for node interleaving.
+ */
+static int num_node_interleave_bits(unsigned intlv_en)
+{
+	static const int intlv_shift_table[] = { 0, 1, 0, 2, 0, 0, 0, 3 };
+	int n;
+
+	BUG_ON(intlv_en > 7);
+	n = intlv_shift_table[intlv_en];
+	return n;
+}
+
+/* Translate the DramAddr given by @dram_addr to an InputAddr. */
+static u64 dram_addr_to_input_addr(struct mem_ctl_info *mci, u64 dram_addr)
+{
+	struct amd64_pvt *pvt;
+	int intlv_shift;
+	u64 input_addr;
+
+	pvt = mci->pvt_info;
+
+	/*
+	 * See the start of section 3.4.4 (p. 70, BKDG #26094, K8, revA-E)
+	 * concerning translating a DramAddr to an InputAddr.
+	 */
+	intlv_shift = num_node_interleave_bits(pvt->dram_IntlvEn[0]);
+	input_addr = ((dram_addr >> intlv_shift) & 0xffffff000ull) +
+	    (dram_addr & 0xfff);
+
+	debugf2("  Intlv Shift=%d DramAddr=0x%lx maps to InputAddr=0x%lx\n",
+		intlv_shift, (unsigned long)dram_addr,
+		(unsigned long)input_addr);
+
+	return input_addr;
+}
+
+/*
+ * Translate the SysAddr represented by @sys_addr to an InputAddr.  It is
+ * assumed that @sys_addr maps to the node given by mci.
+ */
+static u64 sys_addr_to_input_addr(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	u64 input_addr;
+
+	input_addr =
+	    dram_addr_to_input_addr(mci, sys_addr_to_dram_addr(mci, sys_addr));
+
+	debugf2("SysAdddr 0x%lx translates to InputAddr 0x%lx\n",
+		(unsigned long)sys_addr, (unsigned long)input_addr);
+
+	return input_addr;
+}
+
+
+/*
+ * @input_addr is an InputAddr associated with the node represented by mci.
+ * Translate @input_addr to a DramAddr and return the result.
+ */
+static u64 input_addr_to_dram_addr(struct mem_ctl_info *mci, u64 input_addr)
+{
+	struct amd64_pvt *pvt;
+	int node_id, intlv_shift;
+	u64 bits, dram_addr;
+	u32 intlv_sel;
+
+	/*
+	 * Near the start of section 3.4.4 (p. 70, BKDG #26094, K8, revA-E)
+	 * shows how to translate a DramAddr to an InputAddr. Here we reverse
+	 * this procedure. When translating from a DramAddr to an InputAddr, the
+	 * bits used for node interleaving are discarded.  Here we recover these
+	 * bits from the IntlvSel field of the DRAM Limit register (section
+	 * 3.4.4.2) for the node that input_addr is associated with.
+	 */
+	pvt = mci->pvt_info;
+	node_id = pvt->mc_node_id;
+	BUG_ON((node_id < 0) || (node_id > 7));
+
+	intlv_shift = num_node_interleave_bits(pvt->dram_IntlvEn[0]);
+
+	if (intlv_shift == 0) {
+		debugf1("    InputAddr 0x%lx translates to DramAddr of "
+			"same value\n",	(unsigned long)input_addr);
+
+		return input_addr;
+	}
+
+	bits = ((input_addr & 0xffffff000ull) << intlv_shift) +
+	    (input_addr & 0xfff);
+
+	intlv_sel = pvt->dram_IntlvSel[node_id] & ((1 << intlv_shift) - 1);
+	dram_addr = bits + (intlv_sel << 12);
+
+	debugf1("InputAddr 0x%lx translates to DramAddr 0x%lx "
+		"(%d node interleave bits)\n", (unsigned long)input_addr,
+		(unsigned long)dram_addr, intlv_shift);
+
+	return dram_addr;
+}
+
+/*
+ * @dram_addr is a DramAddr that maps to the node represented by mci. Convert
+ * @dram_addr to a SysAddr.
+ */
+static u64 dram_addr_to_sys_addr(struct mem_ctl_info *mci, u64 dram_addr)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 hole_base, hole_offset, hole_size, base, limit, sys_addr;
+	int ret = 0;
+
+	ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
+				      &hole_size);
+	if (!ret) {
+		if ((dram_addr >= hole_base) &&
+		    (dram_addr < (hole_base + hole_size))) {
+			sys_addr = dram_addr + hole_offset;
+
+			debugf1("using DHAR to translate DramAddr 0x%lx to "
+				"SysAddr 0x%lx\n", (unsigned long)dram_addr,
+				(unsigned long)sys_addr);
+
+			return sys_addr;
+		}
+	}
+
+	amd64_get_base_and_limit(pvt, pvt->mc_node_id, &base, &limit);
+	sys_addr = dram_addr + base;
+
+	/*
+	 * The sys_addr we have computed up to this point is a 40-bit value
+	 * because the k8 deals with 40-bit values.  However, the value we are
+	 * supposed to return is a full 64-bit physical address.  The AMD
+	 * x86-64 architecture specifies that the most significant implemented
+	 * address bit through bit 63 of a physical address must be either all
+	 * 0s or all 1s.  Therefore we sign-extend the 40-bit sys_addr to a
+	 * 64-bit value below.  See section 3.4.2 of AMD publication 24592:
+	 * AMD x86-64 Architecture Programmer's Manual Volume 1 Application
+	 * Programming.
+	 */
+	sys_addr |= ~((sys_addr & (1ull << 39)) - 1);
+
+	debugf1("    Node %d, DramAddr 0x%lx to SysAddr 0x%lx\n",
+		pvt->mc_node_id, (unsigned long)dram_addr,
+		(unsigned long)sys_addr);
+
+	return sys_addr;
+}
+
+/*
+ * @input_addr is an InputAddr associated with the node given by mci. Translate
+ * @input_addr to a SysAddr.
+ */
+static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
+					 u64 input_addr)
+{
+	return dram_addr_to_sys_addr(mci,
+				     input_addr_to_dram_addr(mci, input_addr));
+}
+
+/*
+ * Find the minimum and maximum InputAddr values that map to the given @csrow.
+ * Pass back these values in *input_addr_min and *input_addr_max.
+ */
+static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
+			      u64 *input_addr_min, u64 *input_addr_max)
+{
+	struct amd64_pvt *pvt;
+	u64 base, mask;
+
+	pvt = mci->pvt_info;
+	BUG_ON((csrow < 0) || (csrow >= CHIPSELECT_COUNT));
+
+	base = base_from_dct_base(pvt, csrow);
+	mask = mask_from_dct_mask(pvt, csrow);
+
+	*input_addr_min = base & ~mask;
+	*input_addr_max = base | mask | pvt->dcs_mask_notused;
+}
+
+/*
+ * Extract error address from MCA NB Address Low (section 3.6.4.5) and MCA NB
+ * Address High (section 3.6.4.6) register values and return the result. Address
+ * is located in the info structure (nbeah and nbeal), the encoding is device
+ * specific.
+ */
+static u64 extract_error_address(struct mem_ctl_info *mci,
+				 struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return pvt->ops->get_error_address(mci, info);
+}
+
+
+/* Map the Error address to a PAGE and PAGE OFFSET. */
+static inline void error_address_to_page_and_offset(u64 error_address,
+						    u32 *page, u32 *offset)
+{
+	*page = (u32) (error_address >> PAGE_SHIFT);
+	*offset = ((u32) error_address) & ~PAGE_MASK;
+}
+
+/*
+ * @sys_addr is an error address (a SysAddr) extracted from the MCA NB Address
+ * Low (section 3.6.4.5) and MCA NB Address High (section 3.6.4.6) registers
+ * of a node that detected an ECC memory error.  mci represents the node that
+ * the error address maps to (possibly different from the node that detected
+ * the error).  Return the number of the csrow that sys_addr maps to, or -1 on
+ * error.
+ */
+static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	int csrow;
+
+	csrow = input_addr_to_csrow(mci, sys_addr_to_input_addr(mci, sys_addr));
+
+	if (csrow == -1)
+		amd64_mc_printk(mci, KERN_ERR,
+			     "Failed to translate InputAddr to csrow for "
+			     "address 0x%lx\n", (unsigned long)sys_addr);
+	return csrow;
+}
 
-- 
cgit v0.10.2


From 2da11654ea4d32c5700693675dfbd55c70619519 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:09:09 +0200
Subject: amd64_edac: add helper to dump relevant registers

Borislav:

- cleanup/fix comments
- fix function return value patterns
- cleanup dbg calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 28f85c9e..3d6d000 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -729,3 +729,145 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
 	return csrow;
 }
 
+static int get_channel_from_ecc_syndrome(unsigned short syndrome);
+
+static void amd64_cpu_display_info(struct amd64_pvt *pvt)
+{
+	if (boot_cpu_data.x86 == 0x11)
+		edac_printk(KERN_DEBUG, EDAC_MC, "F11h CPU detected\n");
+	else if (boot_cpu_data.x86 == 0x10)
+		edac_printk(KERN_DEBUG, EDAC_MC, "F10h CPU detected\n");
+	else if (boot_cpu_data.x86 == 0xf)
+		edac_printk(KERN_DEBUG, EDAC_MC, "%s detected\n",
+			(pvt->ext_model >= OPTERON_CPU_REV_F) ?
+			"Rev F or later" : "Rev E or earlier");
+	else
+		/* we'll hardly ever ever get here */
+		edac_printk(KERN_ERR, EDAC_MC, "Unknown cpu!\n");
+}
+
+/*
+ * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs
+ * are ECC capable.
+ */
+static enum edac_type amd64_determine_edac_cap(struct amd64_pvt *pvt)
+{
+	int bit;
+	enum dev_type edac_cap = EDAC_NONE;
+
+	bit = (boot_cpu_data.x86 > 0xf || pvt->ext_model >= OPTERON_CPU_REV_F)
+		? 19
+		: 17;
+
+	if (pvt->dclr0 >> BIT(bit))
+		edac_cap = EDAC_FLAG_SECDED;
+
+	return edac_cap;
+}
+
+
+static void f10_debug_display_dimm_sizes(int ctrl, struct amd64_pvt *pvt,
+					 int ganged);
+
+/* Display and decode various NB registers for debug purposes. */
+static void amd64_dump_misc_regs(struct amd64_pvt *pvt)
+{
+	int ganged;
+
+	debugf1("  nbcap:0x%8.08x DctDualCap=%s DualNode=%s 8-Node=%s\n",
+		pvt->nbcap,
+		(pvt->nbcap & K8_NBCAP_DCT_DUAL) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_DUAL_NODE) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_8_NODE) ? "True" : "False");
+	debugf1("    ECC Capable=%s   ChipKill Capable=%s\n",
+		(pvt->nbcap & K8_NBCAP_SECDED) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_CHIPKILL) ? "True" : "False");
+	debugf1("  DramCfg0-low=0x%08x DIMM-ECC=%s Parity=%s Width=%s\n",
+		pvt->dclr0,
+		(pvt->dclr0 & BIT(19)) ?  "Enabled" : "Disabled",
+		(pvt->dclr0 & BIT(8)) ?  "Enabled" : "Disabled",
+		(pvt->dclr0 & BIT(11)) ?  "128b" : "64b");
+	debugf1("    DIMM x4 Present: L0=%s L1=%s L2=%s L3=%s  DIMM Type=%s\n",
+		(pvt->dclr0 & BIT(12)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(13)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(14)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(15)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(16)) ?  "UN-Buffered" : "Buffered");
+
+
+	debugf1("  online-spare: 0x%8.08x\n", pvt->online_spare);
+
+	if (boot_cpu_data.x86 == 0xf) {
+		debugf1("  dhar: 0x%8.08x Base=0x%08x Offset=0x%08x\n",
+			pvt->dhar, dhar_base(pvt->dhar),
+			k8_dhar_offset(pvt->dhar));
+		debugf1("      DramHoleValid=%s\n",
+			(pvt->dhar & DHAR_VALID) ?  "True" : "False");
+
+		debugf1("  dbam-dkt: 0x%8.08x\n", pvt->dbam0);
+
+		/* everything below this point is Fam10h and above */
+		return;
+
+	} else {
+		debugf1("  dhar: 0x%8.08x Base=0x%08x Offset=0x%08x\n",
+			pvt->dhar, dhar_base(pvt->dhar),
+			f10_dhar_offset(pvt->dhar));
+		debugf1("    DramMemHoistValid=%s DramHoleValid=%s\n",
+			(pvt->dhar & F10_DRAM_MEM_HOIST_VALID) ?
+			"True" : "False",
+			(pvt->dhar & DHAR_VALID) ?
+			"True" : "False");
+	}
+
+	/* Only if NOT ganged does dcl1 have valid info */
+	if (!dct_ganging_enabled(pvt)) {
+		debugf1("  DramCfg1-low=0x%08x DIMM-ECC=%s Parity=%s "
+			"Width=%s\n", pvt->dclr1,
+			(pvt->dclr1 & BIT(19)) ?  "Enabled" : "Disabled",
+			(pvt->dclr1 & BIT(8)) ?  "Enabled" : "Disabled",
+			(pvt->dclr1 & BIT(11)) ?  "128b" : "64b");
+		debugf1("    DIMM x4 Present: L0=%s L1=%s L2=%s L3=%s  "
+			"DIMM Type=%s\n",
+			(pvt->dclr1 & BIT(12)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(13)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(14)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(15)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(16)) ?  "UN-Buffered" : "Buffered");
+	}
+
+	/*
+	 * Determine if ganged and then dump memory sizes for first controller,
+	 * and if NOT ganged dump info for 2nd controller.
+	 */
+	ganged = dct_ganging_enabled(pvt);
+
+	f10_debug_display_dimm_sizes(0, pvt, ganged);
+
+	if (!ganged)
+		f10_debug_display_dimm_sizes(1, pvt, ganged);
+}
+
+/* Read in both of DBAM registers */
+static void amd64_read_dbam_reg(struct amd64_pvt *pvt)
+{
+	int err = 0;
+	unsigned int reg;
+
+	reg = DBAM0;
+	err = pci_read_config_dword(pvt->dram_f2_ctl, reg, &pvt->dbam0);
+	if (err)
+		goto err_reg;
+
+	if (boot_cpu_data.x86 >= 0x10) {
+		reg = DBAM1;
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg, &pvt->dbam1);
+
+		if (err)
+			goto err_reg;
+	}
+
+err_reg:
+	debugf0("Error reading F2x%03x.\n", reg);
+}
+
-- 
cgit v0.10.2


From 94be4bff21990674ac418c970c708a1a01cf709f Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:12:00 +0200
Subject: amd64_edac: assign DRAM chip select base and mask in a
 family-specific way

Borislav:

- cleanup/fix comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 3d6d000..5ec44a4 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -871,3 +871,145 @@ err_reg:
 	debugf0("Error reading F2x%03x.\n", reg);
 }
 
+/*
+ * NOTE: CPU Revision Dependent code: Rev E and Rev F
+ *
+ * Set the DCSB and DCSM mask values depending on the CPU revision value. Also
+ * set the shift factor for the DCSB and DCSM values.
+ *
+ * ->dcs_mask_notused, RevE:
+ *
+ * To find the max InputAddr for the csrow, start with the base address and set
+ * all bits that are "don't care" bits in the test at the start of section
+ * 3.5.4 (p. 84).
+ *
+ * The "don't care" bits are all set bits in the mask and all bits in the gaps
+ * between bit ranges [35:25] and [19:13]. The value REV_E_DCS_NOTUSED_BITS
+ * represents bits [24:20] and [12:0], which are all bits in the above-mentioned
+ * gaps.
+ *
+ * ->dcs_mask_notused, RevF and later:
+ *
+ * To find the max InputAddr for the csrow, start with the base address and set
+ * all bits that are "don't care" bits in the test at the start of NPT section
+ * 4.5.4 (p. 87).
+ *
+ * The "don't care" bits are all set bits in the mask and all bits in the gaps
+ * between bit ranges [36:27] and [21:13].
+ *
+ * The value REV_F_F1Xh_DCS_NOTUSED_BITS represents bits [26:22] and [12:0],
+ * which are all bits in the above-mentioned gaps.
+ */
+static void amd64_set_dct_base_and_mask(struct amd64_pvt *pvt)
+{
+	if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+		pvt->dcsb_base		= REV_F_F1Xh_DCSB_BASE_BITS;
+		pvt->dcsm_mask		= REV_F_F1Xh_DCSM_MASK_BITS;
+		pvt->dcs_mask_notused	= REV_F_F1Xh_DCS_NOTUSED_BITS;
+		pvt->dcs_shift		= REV_F_F1Xh_DCS_SHIFT;
+
+		switch (boot_cpu_data.x86) {
+		case 0xf:
+			pvt->num_dcsm = REV_F_DCSM_COUNT;
+			break;
+
+		case 0x10:
+			pvt->num_dcsm = F10_DCSM_COUNT;
+			break;
+
+		case 0x11:
+			pvt->num_dcsm = F11_DCSM_COUNT;
+			break;
+
+		default:
+			amd64_printk(KERN_ERR, "Unsupported family!\n");
+			break;
+		}
+	} else {
+		pvt->dcsb_base		= REV_E_DCSB_BASE_BITS;
+		pvt->dcsm_mask		= REV_E_DCSM_MASK_BITS;
+		pvt->dcs_mask_notused	= REV_E_DCS_NOTUSED_BITS;
+		pvt->dcs_shift		= REV_E_DCS_SHIFT;
+		pvt->num_dcsm		= REV_E_DCSM_COUNT;
+	}
+}
+
+/*
+ * Function 2 Offset F10_DCSB0; read in the DCS Base and DCS Mask hw registers
+ */
+static void amd64_read_dct_base_mask(struct amd64_pvt *pvt)
+{
+	int cs, reg, err = 0;
+
+	amd64_set_dct_base_and_mask(pvt);
+
+	for (cs = 0; cs < CHIPSELECT_COUNT; cs++) {
+		reg = K8_DCSB0 + (cs * 4);
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+						&pvt->dcsb0[cs]);
+		if (unlikely(err))
+			debugf0("Reading K8_DCSB0[%d] failed\n", cs);
+		else
+			debugf0("  DCSB0[%d]=0x%08x reg: F2x%x\n",
+				cs, pvt->dcsb0[cs], reg);
+
+		/* If DCT are NOT ganged, then read in DCT1's base */
+		if (boot_cpu_data.x86 >= 0x10 && !dct_ganging_enabled(pvt)) {
+			reg = F10_DCSB1 + (cs * 4);
+			err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+							&pvt->dcsb1[cs]);
+			if (unlikely(err))
+				debugf0("Reading F10_DCSB1[%d] failed\n", cs);
+			else
+				debugf0("  DCSB1[%d]=0x%08x reg: F2x%x\n",
+					cs, pvt->dcsb1[cs], reg);
+		} else {
+			pvt->dcsb1[cs] = 0;
+		}
+	}
+
+	for (cs = 0; cs < pvt->num_dcsm; cs++) {
+		reg = K8_DCSB0 + (cs * 4);
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+					&pvt->dcsm0[cs]);
+		if (unlikely(err))
+			debugf0("Reading K8_DCSM0 failed\n");
+		else
+			debugf0("    DCSM0[%d]=0x%08x reg: F2x%x\n",
+				cs, pvt->dcsm0[cs], reg);
+
+		/* If DCT are NOT ganged, then read in DCT1's mask */
+		if (boot_cpu_data.x86 >= 0x10 && !dct_ganging_enabled(pvt)) {
+			reg = F10_DCSM1 + (cs * 4);
+			err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+					&pvt->dcsm1[cs]);
+			if (unlikely(err))
+				debugf0("Reading F10_DCSM1[%d] failed\n", cs);
+			else
+				debugf0("    DCSM1[%d]=0x%08x reg: F2x%x\n",
+					cs, pvt->dcsm1[cs], reg);
+		} else
+			pvt->dcsm1[cs] = 0;
+	}
+}
+
+static enum mem_type amd64_determine_memory_type(struct amd64_pvt *pvt)
+{
+	enum mem_type type;
+
+	if (boot_cpu_data.x86 >= 0x10 || pvt->ext_model >= OPTERON_CPU_REV_F) {
+		/* Rev F and later */
+		type = (pvt->dclr0 & BIT(16)) ? MEM_DDR2 : MEM_RDDR2;
+	} else {
+		/* Rev E and earlier */
+		type = (pvt->dclr0 & BIT(18)) ? MEM_DDR : MEM_RDDR;
+	}
+
+	debugf1("  Memory type is: %s\n",
+		(type == MEM_DDR2) ? "MEM_DDR2" :
+		(type == MEM_RDDR2) ? "MEM_RDDR2" :
+		(type == MEM_DDR) ? "MEM_DDR" : "MEM_RDDR");
+
+	return type;
+}
+
-- 
cgit v0.10.2


From ddff876d2022c5e06509f6535bc4fd61c4d6ffd6 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:14:52 +0200
Subject: amd64_edac: add k8-specific methods

Borislav:

- fix/cleanup/move comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5ec44a4..24c031f 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1013,3 +1013,176 @@ static enum mem_type amd64_determine_memory_type(struct amd64_pvt *pvt)
 	return type;
 }
 
+/*
+ * Read the DRAM Configuration Low register. It differs between CG, D & E revs
+ * and the later RevF memory controllers (DDR vs DDR2)
+ *
+ * Return:
+ *      number of memory channels in operation
+ * Pass back:
+ *      contents of the DCL0_LOW register
+ */
+static int k8_early_channel_count(struct amd64_pvt *pvt)
+{
+	int flag, err = 0;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		return err;
+
+	if ((boot_cpu_data.x86_model >> 4) >= OPTERON_CPU_REV_F) {
+		/* RevF (NPT) and later */
+		flag = pvt->dclr0 & F10_WIDTH_128;
+	} else {
+		/* RevE and earlier */
+		flag = pvt->dclr0 & REVE_WIDTH_128;
+	}
+
+	/* not used */
+	pvt->dclr1 = 0;
+
+	return (flag) ? 2 : 1;
+}
+
+/* extract the ERROR ADDRESS for the K8 CPUs */
+static u64 k8_get_error_address(struct mem_ctl_info *mci,
+				struct amd64_error_info_regs *info)
+{
+	return (((u64) (info->nbeah & 0xff)) << 32) +
+			(info->nbeal & ~0x03);
+}
+
+/*
+ * Read the Base and Limit registers for K8 based Memory controllers; extract
+ * fields from the 'raw' reg into separate data fields
+ *
+ * Isolates: BASE, LIMIT, IntlvEn, IntlvSel, RW_EN
+ */
+static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
+{
+	u32 low;
+	u32 off = dram << 3;	/* 8 bytes between DRAM entries */
+	int err;
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl,
+				    K8_DRAM_BASE_LOW + off, &low);
+	if (err)
+		debugf0("Reading K8_DRAM_BASE_LOW failed\n");
+
+	/* Extract parts into separate data entries */
+	pvt->dram_base[dram] = ((u64) low & 0xFFFF0000) << 8;
+	pvt->dram_IntlvEn[dram] = (low >> 8) & 0x7;
+	pvt->dram_rw_en[dram] = (low & 0x3);
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl,
+				    K8_DRAM_LIMIT_LOW + off, &low);
+	if (err)
+		debugf0("Reading K8_DRAM_LIMIT_LOW failed\n");
+
+	/*
+	 * Extract parts into separate data entries. Limit is the HIGHEST memory
+	 * location of the region, so lower 24 bits need to be all ones
+	 */
+	pvt->dram_limit[dram] = (((u64) low & 0xFFFF0000) << 8) | 0x00FFFFFF;
+	pvt->dram_IntlvSel[dram] = (low >> 8) & 0x7;
+	pvt->dram_DstNode[dram] = (low & 0x7);
+}
+
+static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
+					struct amd64_error_info_regs *info,
+					u64 SystemAddress)
+{
+	struct mem_ctl_info *src_mci;
+	unsigned short syndrome;
+	int channel, csrow;
+	u32 page, offset;
+
+	/* Extract the syndrome parts and form a 16-bit syndrome */
+	syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
+	syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+
+	/* CHIPKILL enabled */
+	if (info->nbcfg & K8_NBCFG_CHIPKILL) {
+		channel = get_channel_from_ecc_syndrome(syndrome);
+		if (channel < 0) {
+			/*
+			 * Syndrome didn't map, so we don't know which of the
+			 * 2 DIMMs is in error. So we need to ID 'both' of them
+			 * as suspect.
+			 */
+			amd64_mc_printk(mci, KERN_WARNING,
+				       "unknown syndrome 0x%x - possible error "
+				       "reporting race\n", syndrome);
+			edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+			return;
+		}
+	} else {
+		/*
+		 * non-chipkill ecc mode
+		 *
+		 * The k8 documentation is unclear about how to determine the
+		 * channel number when using non-chipkill memory.  This method
+		 * was obtained from email communication with someone at AMD.
+		 * (Wish the email was placed in this comment - norsk)
+		 */
+		channel = ((SystemAddress & BIT(3)) != 0);
+	}
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
+	if (src_mci) {
+		amd64_mc_printk(mci, KERN_ERR,
+			     "failed to map error address 0x%lx to a node\n",
+			     (unsigned long)SystemAddress);
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		return;
+	}
+
+	/* Now map the SystemAddress to a CSROW */
+	csrow = sys_addr_to_csrow(src_mci, SystemAddress);
+	if (csrow < 0) {
+		edac_mc_handle_ce_no_info(src_mci, EDAC_MOD_STR);
+	} else {
+		error_address_to_page_and_offset(SystemAddress, &page, &offset);
+
+		edac_mc_handle_ce(src_mci, page, offset, syndrome, csrow,
+				  channel, EDAC_MOD_STR);
+	}
+}
+
+/*
+ * determrine the number of PAGES in for this DIMM's size based on its DRAM
+ * Address Mapping.
+ *
+ * First step is to calc the number of bits to shift a value of 1 left to
+ * indicate show many pages. Start with the DBAM value as the starting bits,
+ * then proceed to adjust those shift bits, based on CPU rev and the table.
+ * See BKDG on the DBAM
+ */
+static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
+{
+	int nr_pages;
+
+	if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+		nr_pages = 1 << (revf_quad_ddr2_shift[dram_map] - PAGE_SHIFT);
+	} else {
+		/*
+		 * RevE and less section; this line is tricky. It collapses the
+		 * table used by RevD and later to one that matches revisions CG
+		 * and earlier.
+		 */
+		dram_map -= (pvt->ext_model >= OPTERON_CPU_REV_D) ?
+				(dram_map > 8 ? 4 : (dram_map > 5 ?
+				3 : (dram_map > 2 ? 1 : 0))) : 0;
+
+		/* 25 shift is 32MiB minimum DIMM size in RevE and prior */
+		nr_pages = 1 << (dram_map + 25 - PAGE_SHIFT);
+	}
+
+	return nr_pages;
+}
+
+
-- 
cgit v0.10.2


From 1afd3c98b5e8df68e1840b56c0ced15f314ce30d Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:16:50 +0200
Subject: amd64_edac: add F10h-and-later methods-p1

Borislav:

Fail f10_early_channel_count() if error encountered while reading a NB
register since those cached register contents are accessed afterwards.

- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 24c031f..b4ef2c7 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1185,4 +1185,185 @@ static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
 	return nr_pages;
 }
 
+/*
+ * Get the number of DCT channels in use.
+ *
+ * Return:
+ *	number of Memory Channels in operation
+ * Pass back:
+ *	contents of the DCL0_LOW register
+ */
+static int f10_early_channel_count(struct amd64_pvt *pvt)
+{
+	int err = 0, channels = 0;
+	u32 dbam;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_1, &pvt->dclr1);
+	if (err)
+		goto err_reg;
+
+	/* If we are in 128 bit mode, then we are using 2 channels */
+	if (pvt->dclr0 & F10_WIDTH_128) {
+		debugf0("Data WIDTH is 128 bits - 2 channels\n");
+		channels = 2;
+		return channels;
+	}
+
+	/*
+	 * Need to check if in UN-ganged mode: In such, there are 2 channels,
+	 * but they are NOT in 128 bit mode and thus the above 'dcl0' status bit
+	 * will be OFF.
+	 *
+	 * Need to check DCT0[0] and DCT1[0] to see if only one of them has
+	 * their CSEnable bit on. If so, then SINGLE DIMM case.
+	 */
+	debugf0("Data WIDTH is NOT 128 bits - need more decoding\n");
 
+	/*
+	 * Check DRAM Bank Address Mapping values for each DIMM to see if there
+	 * is more than just one DIMM present in unganged mode. Need to check
+	 * both controllers since DIMMs can be placed in either one.
+	 */
+	channels = 0;
+	err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam);
+	if (err)
+		goto err_reg;
+
+	if (DBAM_DIMM(0, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(1, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(2, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(3, dbam) > 0)
+		channels++;
+
+	/* If more than 2 DIMMs are present, then we have 2 channels */
+	if (channels > 2)
+		channels = 2;
+	else if (channels == 0) {
+		/* No DIMMs on DCT0, so look at DCT1 */
+		err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam);
+		if (err)
+			goto err_reg;
+
+		if (DBAM_DIMM(0, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(1, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(2, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(3, dbam) > 0)
+			channels++;
+
+		if (channels > 2)
+			channels = 2;
+	}
+
+	/* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */
+	if (channels == 0)
+		channels = 1;
+
+	debugf0("DIMM count= %d\n", channels);
+
+	return channels;
+
+err_reg:
+	return -1;
+
+}
+
+static int f10_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
+{
+	return 1 << (revf_quad_ddr2_shift[dram_map] - PAGE_SHIFT);
+}
+
+/* Enable extended configuration access via 0xCF8 feature */
+static void amd64_setup(struct amd64_pvt *pvt)
+{
+	u32 reg;
+
+	pci_read_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, &reg);
+
+	pvt->flags.cf8_extcfg = !!(reg & F10_NB_CFG_LOW_ENABLE_EXT_CFG);
+	reg |= F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	pci_write_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, reg);
+}
+
+/* Restore the extended configuration access via 0xCF8 feature */
+static void amd64_teardown(struct amd64_pvt *pvt)
+{
+	u32 reg;
+
+	pci_read_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, &reg);
+
+	reg &= ~F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	if (pvt->flags.cf8_extcfg)
+		reg |= F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	pci_write_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, reg);
+}
+
+static u64 f10_get_error_address(struct mem_ctl_info *mci,
+			struct amd64_error_info_regs *info)
+{
+	return (((u64) (info->nbeah & 0xffff)) << 32) +
+			(info->nbeal & ~0x01);
+}
+
+/*
+ * Read the Base and Limit registers for F10 based Memory controllers. Extract
+ * fields from the 'raw' reg into separate data fields.
+ *
+ * Isolates: BASE, LIMIT, IntlvEn, IntlvSel, RW_EN.
+ */
+static void f10_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
+{
+	u32 high_offset, low_offset, high_base, low_base, high_limit, low_limit;
+
+	low_offset = K8_DRAM_BASE_LOW + (dram << 3);
+	high_offset = F10_DRAM_BASE_HIGH + (dram << 3);
+
+	/* read the 'raw' DRAM BASE Address register */
+	pci_read_config_dword(pvt->addr_f1_ctl, low_offset, &low_base);
+
+	/* Read from the ECS data register */
+	pci_read_config_dword(pvt->addr_f1_ctl, high_offset, &high_base);
+
+	/* Extract parts into separate data entries */
+	pvt->dram_rw_en[dram] = (low_base & 0x3);
+
+	if (pvt->dram_rw_en[dram] == 0)
+		return;
+
+	pvt->dram_IntlvEn[dram] = (low_base >> 8) & 0x7;
+
+	pvt->dram_base[dram] = (((((u64) high_base & 0x000000FF) << 32) |
+				((u64) low_base & 0xFFFF0000))) << 8;
+
+	low_offset = K8_DRAM_LIMIT_LOW + (dram << 3);
+	high_offset = F10_DRAM_LIMIT_HIGH + (dram << 3);
+
+	/* read the 'raw' LIMIT registers */
+	pci_read_config_dword(pvt->addr_f1_ctl, low_offset, &low_limit);
+
+	/* Read from the ECS data register for the HIGH portion */
+	pci_read_config_dword(pvt->addr_f1_ctl, high_offset, &high_limit);
+
+	debugf0("  HW Regs: BASE=0x%08x-%08x      LIMIT=  0x%08x-%08x\n",
+		high_base, low_base, high_limit, low_limit);
+
+	pvt->dram_DstNode[dram] = (low_limit & 0x7);
+	pvt->dram_IntlvSel[dram] = (low_limit >> 8) & 0x7;
+
+	/*
+	 * Extract address values and form a LIMIT address. Limit is the HIGHEST
+	 * memory location of the region, so low 24 bits need to be all ones.
+	 */
+	low_limit |= 0x0000FFFF;
+	pvt->dram_limit[dram] =
+		((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
+}
-- 
cgit v0.10.2


From 6163b5d4fb45d20e3eb92d627943f26572726889 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:20:17 +0200
Subject: amd64_edac: add F10h-and-later methods-p2

Borislav:

- fix a wrong negation in f10_determine_base_addr_offset()
- fix a wrong mask in f10_determine_base_addr_offset() which should
select DctSelBaseAddr[31:11] and not [31:16] as it was before
- remove StinkyIdentifiers, trivially simplify code.
- fix/cleanup comments
- fix function return value patterns

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index b4ef2c7..744a49a 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1367,3 +1367,199 @@ static void f10_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
 	pvt->dram_limit[dram] =
 		((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
 }
+
+static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
+{
+	int err = 0;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCTL_SEL_LOW,
+				    &pvt->dram_ctl_select_low);
+	if (err) {
+		debugf0("Reading F10_DCTL_SEL_LOW failed\n");
+	} else {
+		debugf0("DRAM_DCTL_SEL_LOW=0x%x  DctSelBaseAddr=0x%x\n",
+			pvt->dram_ctl_select_low, dct_sel_baseaddr(pvt));
+
+		debugf0("  DRAM DCTs are=%s DRAM Is=%s DRAM-Ctl-"
+				"sel-hi-range=%s\n",
+			(dct_ganging_enabled(pvt) ? "GANGED" : "NOT GANGED"),
+			(dct_dram_enabled(pvt) ? "Enabled"   : "Disabled"),
+			(dct_high_range_enabled(pvt) ? "Enabled" : "Disabled"));
+
+		debugf0("  DctDatIntLv=%s MemCleared=%s DctSelIntLvAddr=0x%x\n",
+			(dct_data_intlv_enabled(pvt) ? "Enabled" : "Disabled"),
+			(dct_memory_cleared(pvt) ? "True " : "False "),
+			dct_sel_interleave_addr(pvt));
+	}
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCTL_SEL_HIGH,
+				    &pvt->dram_ctl_select_high);
+	if (err)
+		debugf0("Reading F10_DCTL_SEL_HIGH failed\n");
+}
+
+static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
+				int hi_range_sel, u32 intlv_en)
+{
+	u32 cs, temp, dct_sel_high = (pvt->dram_ctl_select_low >> 1) & 1;
+
+	if (dct_ganging_enabled(pvt))
+		cs = 0;
+	else if (hi_range_sel)
+		cs = dct_sel_high;
+	else if (dct_interleave_enabled(pvt)) {
+		if (dct_sel_interleave_addr(pvt) == 0)
+			cs = sys_addr >> 6 & 1;
+		else if ((dct_sel_interleave_addr(pvt) >> 1) & 1) {
+			temp = hweight_long((u32) ((sys_addr >> 16) & 0x1F)) % 2;
+
+			if (dct_sel_interleave_addr(pvt) & 1)
+				cs = (sys_addr >> 9 & 1) ^ temp;
+			else
+				cs = (sys_addr >> 6 & 1) ^ temp;
+		} else if (intlv_en & 4)
+			cs = sys_addr >> 15 & 1;
+		else if (intlv_en & 2)
+			cs = sys_addr >> 14 & 1;
+		else if (intlv_en & 1)
+			cs = sys_addr >> 13 & 1;
+		else
+			cs = sys_addr >> 12 & 1;
+	} else if (dct_high_range_enabled(pvt) && !dct_ganging_enabled(pvt))
+		cs = ~dct_sel_high & 1;
+	else
+		cs = 0;
+
+	return cs;
+}
+
+static inline u32 f10_map_intlv_en_to_shift(u32 intlv_en)
+{
+	if (intlv_en == 1)
+		return 1;
+	else if (intlv_en == 3)
+		return 2;
+	else if (intlv_en == 7)
+		return 3;
+
+	return 0;
+}
+
+static inline u64 f10_determine_base_addr_offset(u64 sys_addr, int hi_range_sel,
+						 u32 dct_sel_base_addr,
+						 u64 dct_sel_base_off,
+						 u32 hole_en, u32 hole_off,
+						 u64 dram_base)
+{
+	u64 chan_off;
+
+	if (hi_range_sel) {
+		if (!(dct_sel_base_addr & 0xFFFFF800) &&
+		   (hole_en & 1) && (sys_addr >= 0x100000000ULL))
+			chan_off = hole_off << 16;
+		else
+			chan_off = dct_sel_base_off;
+	} else {
+		if ((hole_en & 1) && (sys_addr >= 0x100000000ULL))
+			chan_off = hole_off << 16;
+		else
+			chan_off = dram_base & 0xFFFFF8000000ULL;
+	}
+
+	return (sys_addr & 0x0000FFFFFFFFFFC0ULL) -
+			(chan_off & 0x0000FFFFFF800000ULL);
+}
+
+/* Hack for the time being - Can we get this from BIOS?? */
+#define	CH0SPARE_RANK	0
+#define	CH1SPARE_RANK	1
+
+/*
+ * checks if the csrow passed in is marked as SPARED, if so returns the new
+ * spare row
+ */
+static inline int f10_process_possible_spare(int csrow,
+				u32 cs, struct amd64_pvt *pvt)
+{
+	u32 swap_done;
+	u32 bad_dram_cs;
+
+	/* Depending on channel, isolate respective SPARING info */
+	if (cs) {
+		swap_done = F10_ONLINE_SPARE_SWAPDONE1(pvt->online_spare);
+		bad_dram_cs = F10_ONLINE_SPARE_BADDRAM_CS1(pvt->online_spare);
+		if (swap_done && (csrow == bad_dram_cs))
+			csrow = CH1SPARE_RANK;
+	} else {
+		swap_done = F10_ONLINE_SPARE_SWAPDONE0(pvt->online_spare);
+		bad_dram_cs = F10_ONLINE_SPARE_BADDRAM_CS0(pvt->online_spare);
+		if (swap_done && (csrow == bad_dram_cs))
+			csrow = CH0SPARE_RANK;
+	}
+	return csrow;
+}
+
+/*
+ * Iterate over the DRAM DCT "base" and "mask" registers looking for a
+ * SystemAddr match on the specified 'ChannelSelect' and 'NodeID'
+ *
+ * Return:
+ *	-EINVAL:  NOT FOUND
+ *	0..csrow = Chip-Select Row
+ */
+static int f10_lookup_addr_in_dct(u32 in_addr, u32 nid, u32 cs)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+	u32 cs_base, cs_mask;
+	int cs_found = -EINVAL;
+	int csrow;
+
+	mci = mci_lookup[nid];
+	if (!mci)
+		return cs_found;
+
+	pvt = mci->pvt_info;
+
+	debugf1("InputAddr=0x%x  channelselect=%d\n", in_addr, cs);
+
+	for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+
+		cs_base = amd64_get_dct_base(pvt, cs, csrow);
+		if (!(cs_base & K8_DCSB_CS_ENABLE))
+			continue;
+
+		/*
+		 * We have an ENABLED CSROW, Isolate just the MASK bits of the
+		 * target: [28:19] and [13:5], which map to [36:27] and [21:13]
+		 * of the actual address.
+		 */
+		cs_base &= REV_F_F1Xh_DCSB_BASE_BITS;
+
+		/*
+		 * Get the DCT Mask, and ENABLE the reserved bits: [18:16] and
+		 * [4:0] to become ON. Then mask off bits [28:0] ([36:8])
+		 */
+		cs_mask = amd64_get_dct_mask(pvt, cs, csrow);
+
+		debugf1("    CSROW=%d CSBase=0x%x RAW CSMask=0x%x\n",
+				csrow, cs_base, cs_mask);
+
+		cs_mask = (cs_mask | 0x0007C01F) & 0x1FFFFFFF;
+
+		debugf1("              Final CSMask=0x%x\n", cs_mask);
+		debugf1("    (InputAddr & ~CSMask)=0x%x "
+				"(CSBase & ~CSMask)=0x%x\n",
+				(in_addr & ~cs_mask), (cs_base & ~cs_mask));
+
+		if ((in_addr & ~cs_mask) == (cs_base & ~cs_mask)) {
+			cs_found = f10_process_possible_spare(csrow, cs, pvt);
+
+			debugf1(" MATCH csrow=%d\n", cs_found);
+			break;
+		}
+	}
+	return cs_found;
+}
+
+
-- 
cgit v0.10.2


From f71d0a05001afd10e2be491ca002c55c7df42ed8 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:22:43 +0200
Subject: amd64_edac: add F10h-and-later methods-p3

Borislav:

- compute dct_sel_base_off in f10_match_to_this_node() correctly since
it cannot be assumed that the Reserved bits are zero and they have to be
masked out instead.

- cleanup, remove StinkyIdentifiers, simplify logic
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 744a49a..c2e2c3c 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1398,6 +1398,10 @@ static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
 		debugf0("Reading F10_DCTL_SEL_HIGH failed\n");
 }
 
+/*
+ * determine channel based on the interleaving mode: F10h BKDG, 2.8.9 Memory
+ * Interleaving Modes.
+ */
 static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
 				int hi_range_sel, u32 intlv_en)
 {
@@ -1408,6 +1412,9 @@ static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
 	else if (hi_range_sel)
 		cs = dct_sel_high;
 	else if (dct_interleave_enabled(pvt)) {
+		/*
+		 * see F2x110[DctSelIntLvAddr] - channel interleave mode
+		 */
 		if (dct_sel_interleave_addr(pvt) == 0)
 			cs = sys_addr >> 6 & 1;
 		else if ((dct_sel_interleave_addr(pvt) >> 1) & 1) {
@@ -1445,22 +1452,23 @@ static inline u32 f10_map_intlv_en_to_shift(u32 intlv_en)
 	return 0;
 }
 
-static inline u64 f10_determine_base_addr_offset(u64 sys_addr, int hi_range_sel,
+/* See F10h BKDG, 2.8.10.2 DctSelBaseOffset Programming */
+static inline u64 f10_get_base_addr_offset(u64 sys_addr, int hi_range_sel,
 						 u32 dct_sel_base_addr,
 						 u64 dct_sel_base_off,
-						 u32 hole_en, u32 hole_off,
+						 u32 hole_valid, u32 hole_off,
 						 u64 dram_base)
 {
 	u64 chan_off;
 
 	if (hi_range_sel) {
 		if (!(dct_sel_base_addr & 0xFFFFF800) &&
-		   (hole_en & 1) && (sys_addr >= 0x100000000ULL))
+		   hole_valid && (sys_addr >= 0x100000000ULL))
 			chan_off = hole_off << 16;
 		else
 			chan_off = dct_sel_base_off;
 	} else {
-		if ((hole_en & 1) && (sys_addr >= 0x100000000ULL))
+		if (hole_valid && (sys_addr >= 0x100000000ULL))
 			chan_off = hole_off << 16;
 		else
 			chan_off = dram_base & 0xFFFFF8000000ULL;
@@ -1562,4 +1570,257 @@ static int f10_lookup_addr_in_dct(u32 in_addr, u32 nid, u32 cs)
 	return cs_found;
 }
 
+/* For a given @dram_range, check if @sys_addr falls within it. */
+static int f10_match_to_this_node(struct amd64_pvt *pvt, int dram_range,
+				  u64 sys_addr, int *nid, int *chan_sel)
+{
+	int node_id, cs_found = -EINVAL, high_range = 0;
+	u32 intlv_en, intlv_sel, intlv_shift, hole_off;
+	u32 hole_valid, tmp, dct_sel_base, channel;
+	u64 dram_base, chan_addr, dct_sel_base_off;
+
+	dram_base = pvt->dram_base[dram_range];
+	intlv_en = pvt->dram_IntlvEn[dram_range];
+
+	node_id = pvt->dram_DstNode[dram_range];
+	intlv_sel = pvt->dram_IntlvSel[dram_range];
+
+	debugf1("(dram=%d) Base=0x%llx SystemAddr= 0x%llx Limit=0x%llx\n",
+		dram_range, dram_base, sys_addr, pvt->dram_limit[dram_range]);
+
+	/*
+	 * This assumes that one node's DHAR is the same as all the other
+	 * nodes' DHAR.
+	 */
+	hole_off = (pvt->dhar & 0x0000FF80);
+	hole_valid = (pvt->dhar & 0x1);
+	dct_sel_base_off = (pvt->dram_ctl_select_high & 0xFFFFFC00) << 16;
+
+	debugf1("   HoleOffset=0x%x  HoleValid=0x%x IntlvSel=0x%x\n",
+			hole_off, hole_valid, intlv_sel);
+
+	if (intlv_en ||
+	    (intlv_sel != ((sys_addr >> 12) & intlv_en)))
+		return -EINVAL;
+
+	dct_sel_base = dct_sel_baseaddr(pvt);
+
+	/*
+	 * check whether addresses >= DctSelBaseAddr[47:27] are to be used to
+	 * select between DCT0 and DCT1.
+	 */
+	if (dct_high_range_enabled(pvt) &&
+	   !dct_ganging_enabled(pvt) &&
+	   ((sys_addr >> 27) >= (dct_sel_base >> 11)))
+		high_range = 1;
+
+	channel = f10_determine_channel(pvt, sys_addr, high_range, intlv_en);
+
+	chan_addr = f10_get_base_addr_offset(sys_addr, high_range, dct_sel_base,
+					     dct_sel_base_off, hole_valid,
+					     hole_off, dram_base);
+
+	intlv_shift = f10_map_intlv_en_to_shift(intlv_en);
+
+	/* remove Node ID (in case of memory interleaving) */
+	tmp = chan_addr & 0xFC0;
+
+	chan_addr = ((chan_addr >> intlv_shift) & 0xFFFFFFFFF000ULL) | tmp;
+
+	/* remove channel interleave and hash */
+	if (dct_interleave_enabled(pvt) &&
+	   !dct_high_range_enabled(pvt) &&
+	   !dct_ganging_enabled(pvt)) {
+		if (dct_sel_interleave_addr(pvt) != 1)
+			chan_addr = (chan_addr >> 1) & 0xFFFFFFFFFFFFFFC0ULL;
+		else {
+			tmp = chan_addr & 0xFC0;
+			chan_addr = ((chan_addr & 0xFFFFFFFFFFFFC000ULL) >> 1)
+					| tmp;
+		}
+	}
+
+	debugf1("   (ChannelAddrLong=0x%llx) >> 8 becomes InputAddr=0x%x\n",
+		chan_addr, (u32)(chan_addr >> 8));
+
+	cs_found = f10_lookup_addr_in_dct(chan_addr >> 8, node_id, channel);
+
+	if (cs_found >= 0) {
+		*nid = node_id;
+		*chan_sel = channel;
+	}
+	return cs_found;
+}
+
+static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
+				       int *node, int *chan_sel)
+{
+	int dram_range, cs_found = -EINVAL;
+	u64 dram_base, dram_limit;
+
+	for (dram_range = 0; dram_range < DRAM_REG_COUNT; dram_range++) {
+
+		if (!pvt->dram_rw_en[dram_range])
+			continue;
+
+		dram_base = pvt->dram_base[dram_range];
+		dram_limit = pvt->dram_limit[dram_range];
+
+		if ((dram_base <= sys_addr) && (sys_addr <= dram_limit)) {
+
+			cs_found = f10_match_to_this_node(pvt, dram_range,
+							  sys_addr, node,
+							  chan_sel);
+			if (cs_found >= 0)
+				break;
+		}
+	}
+	return cs_found;
+}
+
+/*
+ * This the F10h reference code from AMD to map a @sys_addr to NodeID,
+ * CSROW, Channel.
+ *
+ * The @sys_addr is usually an error address received from the hardware.
+ */
+static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
+				     struct amd64_error_info_regs *info,
+				     u64 sys_addr)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 page, offset;
+	unsigned short syndrome;
+	int nid, csrow, chan = 0;
+
+	csrow = f10_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
+
+	if (csrow >= 0) {
+		error_address_to_page_and_offset(sys_addr, &page, &offset);
+
+		syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
+		syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+
+		/*
+		 * Is CHIPKILL on? If so, then we can attempt to use the
+		 * syndrome to isolate which channel the error was on.
+		 */
+		if (pvt->nbcfg & K8_NBCFG_CHIPKILL)
+			chan = get_channel_from_ecc_syndrome(syndrome);
+
+		if (chan >= 0) {
+			edac_mc_handle_ce(mci, page, offset, syndrome,
+					csrow, chan, EDAC_MOD_STR);
+		} else {
+			/*
+			 * Channel unknown, report all channels on this
+			 * CSROW as failed.
+			 */
+			for (chan = 0; chan < mci->csrows[csrow].nr_channels;
+								chan++) {
+					edac_mc_handle_ce(mci, page, offset,
+							syndrome,
+							csrow, chan,
+							EDAC_MOD_STR);
+			}
+		}
+
+	} else {
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+	}
+}
+
+/*
+ * Input (@index) is the DBAM DIMM value (1 of 4) used as an index into a shift
+ * table (revf_quad_ddr2_shift) which starts at 128MB DIMM size. Index of 0
+ * indicates an empty DIMM slot, as reported by Hardware on empty slots.
+ *
+ * Normalize to 128MB by subracting 27 bit shift.
+ */
+static int map_dbam_to_csrow_size(int index)
+{
+	int mega_bytes = 0;
+
+	if (index > 0 && index <= DBAM_MAX_VALUE)
+		mega_bytes = ((128 << (revf_quad_ddr2_shift[index]-27)));
+
+	return mega_bytes;
+}
+
+/*
+ * debug routine to display the memory sizes of a DIMM (ganged or not) and it
+ * CSROWs as well
+ */
+static void f10_debug_display_dimm_sizes(int ctrl, struct amd64_pvt *pvt,
+					 int ganged)
+{
+	int dimm, size0, size1;
+	u32 dbam;
+	u32 *dcsb;
+
+	debugf1("  dbam%d: 0x%8.08x  CSROW is %s\n", ctrl,
+			ctrl ? pvt->dbam1 : pvt->dbam0,
+			ganged ? "GANGED - dbam1 not used" : "NON-GANGED");
+
+	dbam = ctrl ? pvt->dbam1 : pvt->dbam0;
+	dcsb = ctrl ? pvt->dcsb1 : pvt->dcsb0;
+
+	/* Dump memory sizes for DIMM and its CSROWs */
+	for (dimm = 0; dimm < 4; dimm++) {
+
+		size0 = 0;
+		if (dcsb[dimm*2] & K8_DCSB_CS_ENABLE)
+			size0 = map_dbam_to_csrow_size(DBAM_DIMM(dimm, dbam));
+
+		size1 = 0;
+		if (dcsb[dimm*2 + 1] & K8_DCSB_CS_ENABLE)
+			size1 = map_dbam_to_csrow_size(DBAM_DIMM(dimm, dbam));
+
+		debugf1("     CTRL-%d DIMM-%d=%5dMB   CSROW-%d=%5dMB "
+				"CSROW-%d=%5dMB\n",
+				ctrl,
+				dimm,
+				size0 + size1,
+				dimm * 2,
+				size0,
+				dimm * 2 + 1,
+				size1);
+	}
+}
+
+/*
+ * Very early hardware probe on pci_probe thread to determine if this module
+ * supports the hardware.
+ *
+ * Return:
+ *      0 for OK
+ *      1 for error
+ */
+static int f10_probe_valid_hardware(struct amd64_pvt *pvt)
+{
+	int ret = 0;
+
+	/*
+	 * If we are on a DDR3 machine, we don't know yet if
+	 * we support that properly at this time
+	 */
+	if ((pvt->dchr0 & F10_DCHR_Ddr3Mode) ||
+	    (pvt->dchr1 & F10_DCHR_Ddr3Mode)) {
+
+		amd64_printk(KERN_WARNING,
+			"%s() This machine is running with DDR3 memory. "
+			"This is not currently supported. "
+			"DCHR0=0x%x DCHR1=0x%x\n",
+			__func__, pvt->dchr0, pvt->dchr1);
+
+		amd64_printk(KERN_WARNING,
+			"   Contact '%s' module MAINTAINER to help add"
+			" support.\n",
+			EDAC_MOD_STR);
+
+		ret = 1;
+
+	}
+	return ret;
+}
 
-- 
cgit v0.10.2


From 4d37607adbff69596a3170cf84badaf26efc59ac Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:25:05 +0200
Subject: amd64_edac: add per-family descriptors

Borislav:

- fix comments
- fix function return value patterns

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index c2e2c3c..e5f4a92 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1824,3 +1824,75 @@ static int f10_probe_valid_hardware(struct amd64_pvt *pvt)
 	return ret;
 }
 
+/*
+ * There currently are 3 types type of MC devices for AMD Athlon/Opterons
+ * (as per PCI DEVICE_IDs):
+ *
+ * Family K8: That is the Athlon64 and Opteron CPUs. They all have the same PCI
+ * DEVICE ID, even though there is differences between the different Revisions
+ * (CG,D,E,F).
+ *
+ * Family F10h and F11h.
+ *
+ */
+static struct amd64_family_type amd64_family_types[] = {
+	[K8_CPUS] = {
+		.ctl_name = "RevF",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_K8_NB_MISC,
+		.ops = {
+			.early_channel_count = k8_early_channel_count,
+			.get_error_address = k8_get_error_address,
+			.read_dram_base_limit = k8_read_dram_base_limit,
+			.map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = k8_dbam_map_to_pages,
+		}
+	},
+	[F10_CPUS] = {
+		.ctl_name = "Family 10h",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_10H_NB_MAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_10H_NB_MISC,
+		.ops = {
+			.probe_valid_hardware = f10_probe_valid_hardware,
+			.early_channel_count = f10_early_channel_count,
+			.get_error_address = f10_get_error_address,
+			.read_dram_base_limit = f10_read_dram_base_limit,
+			.read_dram_ctl_register = f10_read_dram_ctl_register,
+			.map_sysaddr_to_csrow = f10_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = f10_dbam_map_to_pages,
+		}
+	},
+	[F11_CPUS] = {
+		.ctl_name = "Family 11h",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_11H_NB_MAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_11H_NB_MISC,
+		.ops = {
+			.probe_valid_hardware = f10_probe_valid_hardware,
+			.early_channel_count = f10_early_channel_count,
+			.get_error_address = f10_get_error_address,
+			.read_dram_base_limit = f10_read_dram_base_limit,
+			.read_dram_ctl_register = f10_read_dram_ctl_register,
+			.map_sysaddr_to_csrow = f10_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = f10_dbam_map_to_pages,
+		}
+	},
+};
+
+static struct pci_dev *pci_get_related_function(unsigned int vendor,
+						unsigned int device,
+						struct pci_dev *related)
+{
+	struct pci_dev *dev = NULL;
+
+	dev = pci_get_device(vendor, device, dev);
+	while (dev) {
+		if ((dev->bus->number == related->bus->number) &&
+		    (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
+			break;
+		dev = pci_get_device(vendor, device, dev);
+	}
+
+	return dev;
+}
+
+
-- 
cgit v0.10.2


From b1289d6f9d23abab396077abb65d5a23a775cdb0 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:37:05 +0200
Subject: amd64_edac: add ECC chipkill syndrome mapping table

Borislav:

- fix comments
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e5f4a92..feb4986 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1895,4 +1895,139 @@ static struct pci_dev *pci_get_related_function(unsigned int vendor,
 	return dev;
 }
 
+/*
+ * syndrome mapping table for ECC ChipKill devices
+ *
+ * The comment in each row is the token (nibble) number that is in error.
+ * The least significant nibble of the syndrome is the mask for the bits
+ * that are in error (need to be toggled) for the particular nibble.
+ *
+ * Each row contains 16 entries.
+ * The first entry (0th) is the channel number for that row of syndromes.
+ * The remaining 15 entries are the syndromes for the respective Error
+ * bit mask index.
+ *
+ * 1st index entry is 0x0001 mask, indicating that the rightmost bit is the
+ * bit in error.
+ * The 2nd index entry is 0x0010 that the second bit is damaged.
+ * The 3rd index entry is 0x0011 indicating that the rightmost 2 bits
+ * are damaged.
+ * Thus so on until index 15, 0x1111, whose entry has the syndrome
+ * indicating that all 4 bits are damaged.
+ *
+ * A search is performed on this table looking for a given syndrome.
+ *
+ * See the AMD documentation for ECC syndromes. This ECC table is valid
+ * across all the versions of the AMD64 processors.
+ *
+ * A fast lookup is to use the LAST four bits of the 16-bit syndrome as a
+ * COLUMN index, then search all ROWS of that column, looking for a match
+ * with the input syndrome. The ROW value will be the token number.
+ *
+ * The 0'th entry on that row, can be returned as the CHANNEL (0 or 1) of this
+ * error.
+ */
+#define NUMBER_ECC_ROWS  36
+static const unsigned short ecc_chipkill_syndromes[NUMBER_ECC_ROWS][16] = {
+	/* Channel 0 syndromes */
+	{/*0*/  0, 0xe821, 0x7c32, 0x9413, 0xbb44, 0x5365, 0xc776, 0x2f57,
+	   0xdd88, 0x35a9, 0xa1ba, 0x499b, 0x66cc, 0x8eed, 0x1afe, 0xf2df },
+	{/*1*/  0, 0x5d31, 0xa612, 0xfb23, 0x9584, 0xc8b5, 0x3396, 0x6ea7,
+	   0xeac8, 0xb7f9, 0x4cda, 0x11eb, 0x7f4c, 0x227d, 0xd95e, 0x846f },
+	{/*2*/  0, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+	   0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f },
+	{/*3*/  0, 0x2021, 0x3032, 0x1013, 0x4044, 0x6065, 0x7076, 0x5057,
+	   0x8088, 0xa0a9, 0xb0ba, 0x909b, 0xc0cc, 0xe0ed, 0xf0fe, 0xd0df },
+	{/*4*/  0, 0x5041, 0xa082, 0xf0c3, 0x9054, 0xc015, 0x30d6, 0x6097,
+	   0xe0a8, 0xb0e9, 0x402a, 0x106b, 0x70fc, 0x20bd, 0xd07e, 0x803f },
+	{/*5*/  0, 0xbe21, 0xd732, 0x6913, 0x2144, 0x9f65, 0xf676, 0x4857,
+	   0x3288, 0x8ca9, 0xe5ba, 0x5b9b, 0x13cc, 0xaded, 0xc4fe, 0x7adf },
+	{/*6*/  0, 0x4951, 0x8ea2, 0xc7f3, 0x5394, 0x1ac5, 0xdd36, 0x9467,
+	   0xa1e8, 0xe8b9, 0x2f4a, 0x661b, 0xf27c, 0xbb2d, 0x7cde, 0x358f },
+	{/*7*/  0, 0x74e1, 0x9872, 0xec93, 0xd6b4, 0xa255, 0x4ec6, 0x3a27,
+	   0x6bd8, 0x1f39, 0xf3aa, 0x874b, 0xbd6c, 0xc98d, 0x251e, 0x51ff },
+	{/*8*/  0, 0x15c1, 0x2a42, 0x3f83, 0xcef4, 0xdb35, 0xe4b6, 0xf177,
+	   0x4758, 0x5299, 0x6d1a, 0x78db, 0x89ac, 0x9c6d, 0xa3ee, 0xb62f },
+	{/*9*/  0, 0x3d01, 0x1602, 0x2b03, 0x8504, 0xb805, 0x9306, 0xae07,
+	   0xca08, 0xf709, 0xdc0a, 0xe10b, 0x4f0c, 0x720d, 0x590e, 0x640f },
+	{/*a*/  0, 0x9801, 0xec02, 0x7403, 0x6b04, 0xf305, 0x8706, 0x1f07,
+	   0xbd08, 0x2509, 0x510a, 0xc90b, 0xd60c, 0x4e0d, 0x3a0e, 0xa20f },
+	{/*b*/  0, 0xd131, 0x6212, 0xb323, 0x3884, 0xe9b5, 0x5a96, 0x8ba7,
+	   0x1cc8, 0xcdf9, 0x7eda, 0xafeb, 0x244c, 0xf57d, 0x465e, 0x976f },
+	{/*c*/  0, 0xe1d1, 0x7262, 0x93b3, 0xb834, 0x59e5, 0xca56, 0x2b87,
+	   0xdc18, 0x3dc9, 0xae7a, 0x4fab, 0x542c, 0x85fd, 0x164e, 0xf79f },
+	{/*d*/  0, 0x6051, 0xb0a2, 0xd0f3, 0x1094, 0x70c5, 0xa036, 0xc067,
+	   0x20e8, 0x40b9, 0x904a, 0x601b, 0x307c, 0x502d, 0x80de, 0xe08f },
+	{/*e*/  0, 0xa4c1, 0xf842, 0x5c83, 0xe6f4, 0x4235, 0x1eb6, 0xba77,
+	   0x7b58, 0xdf99, 0x831a, 0x27db, 0x9dac, 0x396d, 0x65ee, 0xc12f },
+	{/*f*/  0, 0x11c1, 0x2242, 0x3383, 0xc8f4, 0xd935, 0xeab6, 0xfb77,
+	   0x4c58, 0x5d99, 0x6e1a, 0x7fdb, 0x84ac, 0x956d, 0xa6ee, 0xb72f },
+
+	/* Channel 1 syndromes */
+	{/*10*/ 1, 0x45d1, 0x8a62, 0xcfb3, 0x5e34, 0x1be5, 0xd456, 0x9187,
+	   0xa718, 0xe2c9, 0x2d7a, 0x68ab, 0xf92c, 0xbcfd, 0x734e, 0x369f },
+	{/*11*/ 1, 0x63e1, 0xb172, 0xd293, 0x14b4, 0x7755, 0xa5c6, 0xc627,
+	   0x28d8, 0x4b39, 0x99aa, 0xfa4b, 0x3c6c, 0x5f8d, 0x8d1e, 0xeeff },
+	{/*12*/ 1, 0xb741, 0xd982, 0x6ec3, 0x2254, 0x9515, 0xfbd6, 0x4c97,
+	   0x33a8, 0x84e9, 0xea2a, 0x5d6b, 0x11fc, 0xa6bd, 0xc87e, 0x7f3f },
+	{/*13*/ 1, 0xdd41, 0x6682, 0xbbc3, 0x3554, 0xe815, 0x53d6, 0xce97,
+	   0x1aa8, 0xc7e9, 0x7c2a, 0xa1fb, 0x2ffc, 0xf2bd, 0x497e, 0x943f },
+	{/*14*/ 1, 0x2bd1, 0x3d62, 0x16b3, 0x4f34, 0x64e5, 0x7256, 0x5987,
+	   0x8518, 0xaec9, 0xb87a, 0x93ab, 0xca2c, 0xe1fd, 0xf74e, 0xdc9f },
+	{/*15*/ 1, 0x83c1, 0xc142, 0x4283, 0xa4f4, 0x2735, 0x65b6, 0xe677,
+	   0xf858, 0x7b99, 0x391a, 0xbadb, 0x5cac, 0xdf6d, 0x9dee, 0x1e2f },
+	{/*16*/ 1, 0x8fd1, 0xc562, 0x4ab3, 0xa934, 0x26e5, 0x6c56, 0xe387,
+	   0xfe18, 0x71c9, 0x3b7a, 0xb4ab, 0x572c, 0xd8fd, 0x924e, 0x1d9f },
+	{/*17*/ 1, 0x4791, 0x89e2, 0xce73, 0x5264, 0x15f5, 0xdb86, 0x9c17,
+	   0xa3b8, 0xe429, 0x2a5a, 0x6dcb, 0xf1dc, 0xb64d, 0x783e, 0x3faf },
+	{/*18*/ 1, 0x5781, 0xa9c2, 0xfe43, 0x92a4, 0xc525, 0x3b66, 0x6ce7,
+	   0xe3f8, 0xb479, 0x4a3a, 0x1dbb, 0x715c, 0x26dd, 0xd89e, 0x8f1f },
+	{/*19*/ 1, 0xbf41, 0xd582, 0x6ac3, 0x2954, 0x9615, 0xfcd6, 0x4397,
+	   0x3ea8, 0x81e9, 0xeb2a, 0x546b, 0x17fc, 0xa8bd, 0xc27e, 0x7d3f },
+	{/*1a*/ 1, 0x9891, 0xe1e2, 0x7273, 0x6464, 0xf7f5, 0x8586, 0x1617,
+	   0xb8b8, 0x2b29, 0x595a, 0xcacb, 0xdcdc, 0x4f4d, 0x3d3e, 0xaeaf },
+	{/*1b*/ 1, 0xcce1, 0x4472, 0x8893, 0xfdb4, 0x3f55, 0xb9c6, 0x7527,
+	   0x56d8, 0x9a39, 0x12aa, 0xde4b, 0xab6c, 0x678d, 0xef1e, 0x23ff },
+	{/*1c*/ 1, 0xa761, 0xf9b2, 0x5ed3, 0xe214, 0x4575, 0x1ba6, 0xbcc7,
+	   0x7328, 0xd449, 0x8a9a, 0x2dfb, 0x913c, 0x365d, 0x688e, 0xcfef },
+	{/*1d*/ 1, 0xff61, 0x55b2, 0xaad3, 0x7914, 0x8675, 0x2ca6, 0xd3c7,
+	   0x9e28, 0x6149, 0xcb9a, 0x34fb, 0xe73c, 0x185d, 0xb28e, 0x4def },
+	{/*1e*/ 1, 0x5451, 0xa8a2, 0xfcf3, 0x9694, 0xc2c5, 0x3e36, 0x6a67,
+	   0xebe8, 0xbfb9, 0x434a, 0x171b, 0x7d7c, 0x292d, 0xd5de, 0x818f },
+	{/*1f*/ 1, 0x6fc1, 0xb542, 0xda83, 0x19f4, 0x7635, 0xacb6, 0xc377,
+	   0x2e58, 0x4199, 0x9b1a, 0xf4db, 0x37ac, 0x586d, 0x82ee, 0xed2f },
+
+	/* ECC bits are also in the set of tokens and they too can go bad
+	 * first 2 cover channel 0, while the second 2 cover channel 1
+	 */
+	{/*20*/ 0, 0xbe01, 0xd702, 0x6903, 0x2104, 0x9f05, 0xf606, 0x4807,
+	   0x3208, 0x8c09, 0xe50a, 0x5b0b, 0x130c, 0xad0d, 0xc40e, 0x7a0f },
+	{/*21*/ 0, 0x4101, 0x8202, 0xc303, 0x5804, 0x1905, 0xda06, 0x9b07,
+	   0xac08, 0xed09, 0x2e0a, 0x6f0b, 0x640c, 0xb50d, 0x760e, 0x370f },
+	{/*22*/ 1, 0xc441, 0x4882, 0x8cc3, 0xf654, 0x3215, 0xbed6, 0x7a97,
+	   0x5ba8, 0x9fe9, 0x132a, 0xd76b, 0xadfc, 0x69bd, 0xe57e, 0x213f },
+	{/*23*/ 1, 0x7621, 0x9b32, 0xed13, 0xda44, 0xac65, 0x4176, 0x3757,
+	   0x6f88, 0x19a9, 0xf4ba, 0x829b, 0xb5cc, 0xc3ed, 0x2efe, 0x58df }
+};
+
+/*
+ * Given the syndrome argument, scan each of the channel tables for a syndrome
+ * match. Depending on which table it is found, return the channel number.
+ */
+static int get_channel_from_ecc_syndrome(unsigned short syndrome)
+{
+	int row;
+	int column;
 
+	/* Determine column to scan */
+	column = syndrome & 0xF;
+
+	/* Scan all rows, looking for syndrome, or end of table */
+	for (row = 0; row < NUMBER_ECC_ROWS; row++) {
+		if (ecc_chipkill_syndromes[row][column] == syndrome)
+			return ecc_chipkill_syndromes[row][0];
+	}
+
+	debugf0("syndrome(%x) not found\n", syndrome);
+	return -1;
+}
-- 
cgit v0.10.2


From d27bf6fa369ca0272df10558d2f290d6fc72e675 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Wed, 6 May 2009 17:55:27 +0200
Subject: amd64_edac: add error decoding logic

Borislav:

- fold amd64_error_info_valid() into its only user
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index feb4986..09991c8 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
 	debugf0("syndrome(%x) not found\n", syndrome);
 	return -1;
 }
+
+/*
+ * Check for valid error in the NB Status High register. If so, proceed to read
+ * NB Status Low, NB Address Low and NB Address High registers and store data
+ * into error structure.
+ *
+ * Returns:
+ *	- 1: if hardware regs contains valid error info
+ *	- 0: if no valid error is indicated
+ */
+static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
+				     struct amd64_error_info_regs *regs)
+{
+	struct amd64_pvt *pvt;
+	struct pci_dev *misc_f3_ctl;
+	int err = 0;
+
+	pvt = mci->pvt_info;
+	misc_f3_ctl = pvt->misc_f3_ctl;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSH, &regs->nbsh);
+	if (err)
+		goto err_reg;
+
+	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
+		return 0;
+
+	/* valid error, read remaining error information registers */
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSL, &regs->nbsl);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAL, &regs->nbeal);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAH, &regs->nbeah);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBCFG, &regs->nbcfg);
+	if (err)
+		goto err_reg;
+
+	return 1;
+
+err_reg:
+	debugf0("Reading error info register failed\n");
+	return 0;
+}
+
+/*
+ * This function is called to retrieve the error data from hardware and store it
+ * in the info structure.
+ *
+ * Returns:
+ *	- 1: if a valid error is found
+ *	- 0: if no error is found
+ */
+static int amd64_get_error_info(struct mem_ctl_info *mci,
+				struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs regs;
+
+	pvt = mci->pvt_info;
+
+	if (!amd64_get_error_info_regs(mci, info))
+		return 0;
+
+	/*
+	 * Here's the problem with the K8's EDAC reporting: There are four
+	 * registers which report pieces of error information. They are shared
+	 * between CEs and UEs. Furthermore, contrary to what is stated in the
+	 * BKDG, the overflow bit is never used! Every error always updates the
+	 * reporting registers.
+	 *
+	 * Can you see the race condition? All four error reporting registers
+	 * must be read before a new error updates them! There is no way to read
+	 * all four registers atomically. The best than can be done is to detect
+	 * that a race has occured and then report the error without any kind of
+	 * precision.
+	 *
+	 * What is still positive is that errors are still reported and thus
+	 * problems can still be detected - just not localized because the
+	 * syndrome and address are spread out across registers.
+	 *
+	 * Grrrrr!!!!!  Here's hoping that AMD fixes this in some future K8 rev.
+	 * UEs and CEs should have separate register sets with proper overflow
+	 * bits that are used! At very least the problem can be fixed by
+	 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
+	 * set the overflow bit - unless the current error is CE and the new
+	 * error is UE which would be the only situation for overwriting the
+	 * current values.
+	 */
+
+	regs = *info;
+
+	/* Use info from the second read - most current */
+	if (unlikely(!amd64_get_error_info_regs(mci, info)))
+		return 0;
+
+	/* clear the error bits in hardware */
+	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
+
+	/* Check for the possible race condition */
+	if ((regs.nbsh != info->nbsh) ||
+	     (regs.nbsl != info->nbsl) ||
+	     (regs.nbeah != info->nbeah) ||
+	     (regs.nbeal != info->nbeal)) {
+		amd64_mc_printk(mci, KERN_WARNING,
+				"hardware STATUS read access race condition "
+				"detected!\n");
+		return 0;
+	}
+	return 1;
+}
+
+static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
+					 struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_tt;		/* error code transaction type (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		     "GART TLB event: transaction type(%s), "
+		     "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
+				      struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_rrrr;		/* error code memory transaction (4b) */
+	u32 ec_tt;		/* error code transaction type (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		     "cache hierarchy error: memory transaction type(%s), "
+		     "transaction type(%s), cache level(%s)\n",
+		     rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+
+/*
+ * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
+ * ADDRESS and process.
+ */
+static void amd64_handle_ce(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 SystemAddress;
+
+	/* Ensure that the Error Address is VALID */
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_ERR,
+			"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		"CE ERROR_ADDRESS= 0x%llx\n", SystemAddress);
+
+	pvt->ops->map_sysaddr_to_csrow(mci, info, SystemAddress);
+}
+
+/* Handle any Un-correctable Errors (UEs) */
+static void amd64_handle_ue(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	int csrow;
+	u64 SystemAddress;
+	u32 page, offset;
+	struct mem_ctl_info *log_mci, *src_mci = NULL;
+
+	log_mci = mci;
+
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
+	if (!src_mci) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"ERROR ADDRESS (0x%lx) value NOT mapped to a MC\n",
+			(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	log_mci = src_mci;
+
+	csrow = sys_addr_to_csrow(log_mci, SystemAddress);
+	if (csrow < 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"ERROR_ADDRESS (0x%lx) value NOT mapped to 'csrow'\n",
+			(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+	} else {
+		error_address_to_page_and_offset(SystemAddress, &page, &offset);
+		edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
+	}
+}
+
+static void amd64_decode_bus_error(struct mem_ctl_info *mci,
+				   struct amd64_error_info_regs *info)
+{
+	u32 err_code, ext_ec;
+	u32 ec_pp;		/* error code participating processor (2p) */
+	u32 ec_to;		/* error code timed out (1b) */
+	u32 ec_rrrr;		/* error code memory transaction (4b) */
+	u32 ec_ii;		/* error code memory or I/O (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_ii = EXTRACT_II_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+	ec_to = EXTRACT_TO_CODE(err_code);
+	ec_pp = EXTRACT_PP_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		"BUS ERROR:\n"
+		"  time-out(%s) mem or i/o(%s)\n"
+		"  participating processor(%s)\n"
+		"  memory transaction type(%s)\n"
+		"  cache level(%s) Error Found by: %s\n",
+		to_msgs[ec_to],
+		ii_msgs[ec_ii],
+		pp_msgs[ec_pp],
+		rrrr_msgs[ec_rrrr],
+		ll_msgs[ec_ll],
+		(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"Scrubber" : "Normal Operation");
+
+	/* If this was an 'observed' error, early out */
+	if (ec_pp == K8_NBSL_PP_OBS)
+		return;		/* We aren't the node involved */
+
+	/* Parse out the extended error code for ECC events */
+	switch (ext_ec) {
+	/* F10 changed to one Extended ECC error code */
+	case F10_NBSL_EXT_ERR_RES:		/* Reserved field */
+	case F10_NBSL_EXT_ERR_ECC:		/* F10 ECC ext err code */
+		break;
+
+	default:
+		amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
+					       "handling for this error\n");
+		return;
+	}
+
+	if (info->nbsh & K8_NBSH_CECC)
+		amd64_handle_ce(mci, info);
+	else if (info->nbsh & K8_NBSH_UECC)
+		amd64_handle_ue(mci, info);
+
+	/*
+	 * If main error is CE then overflow must be CE.  If main error is UE
+	 * then overflow is unknown.  We'll call the overflow a CE - if
+	 * panic_on_ue is set then we're already panic'ed and won't arrive
+	 * here. Else, then apparently someone doesn't think that UE's are
+	 * catastrophic.
+	 */
+	if (info->nbsh & K8_NBSH_OVERFLOW)
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
+					  "Error Overflow set");
+}
+
+int amd64_process_error_info(struct mem_ctl_info *mci,
+			     struct amd64_error_info_regs *info,
+			     int handle_errors)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs *regs;
+	u32 err_code, ext_ec;
+	int gart_tlb_error = 0;
+
+	pvt = mci->pvt_info;
+
+	/* If caller doesn't want us to process the error, return */
+	if (!handle_errors)
+		return 1;
+
+	regs = info;
+
+	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
+	debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
+		pvt->mc_node_id, regs->nbeah, regs->nbeal);
+	debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
+		regs->nbsh, regs->nbsl);
+	debugf1("  Valid Error=%s Overflow=%s\n",
+		(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
+	debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
+		(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
+			"True" : "False");
+	debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
+		(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_PCC) ?
+			"True" : "False");
+	debugf1("  CECC=%s UECC=%s Found by Scruber=%s\n",
+		(regs->nbsh & K8_NBSH_CECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_UECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"True" : "False");
+	debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
+		(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
+
+
+	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
+
+	/* Determine which error type:
+	 *	1) GART errors - non-fatal, developmental events
+	 *	2) MEMORY errors
+	 *	3) BUS errors
+	 *	4) Unknown error
+	 */
+	if (TEST_TLB_ERROR(err_code)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+		 *     AMD NPT family 0Fh processors
+		 */
+		if (report_gart_errors == 0)
+			return 1;
+
+		/*
+		 * Only if GART error reporting is requested should we generate
+		 * any logs.
+		 */
+		gart_tlb_error = 1;
+
+		debugf1("GART TLB error\n");
+		amd64_decode_gart_tlb_error(mci, info);
+	} else if (TEST_MEM_ERROR(err_code)) {
+		debugf1("Memory/Cache error\n");
+		amd64_decode_mem_cache_error(mci, info);
+	} else if (TEST_BUS_ERROR(err_code)) {
+		debugf1("Bus (Link/DRAM) error\n");
+		amd64_decode_bus_error(mci, info);
+	} else {
+		/* shouldn't reach here! */
+		amd64_mc_printk(mci, KERN_WARNING,
+			     "%s(): unknown MCE error 0x%x\n", __func__,
+			     err_code);
+	}
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
+	amd64_mc_printk(mci, KERN_ERR,
+		"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+
+	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
+			ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
+			(ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
+			EXTRACT_LDT_LINK(info->nbsh)) {
+
+		amd64_mc_printk(mci, KERN_ERR,
+			"Error on hypertransport link: %s\n",
+			htlink_msgs[
+			EXTRACT_LDT_LINK(info->nbsh)]);
+	}
+
+	/*
+	 * Check the UE bit of the NB status high register, if set generate some
+	 * logs. If NOT a GART error, then process the event as a NO-INFO event.
+	 * If it was a GART error, skip that process.
+	 */
+	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
+		amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
+		if (!gart_tlb_error)
+			edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
+	}
+
+	if (regs->nbsh & K8_NBSH_PCC)
+		amd64_mc_printk(mci, KERN_CRIT,
+			"PCC (processor context corrupt) set\n");
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(amd64_process_error_info);
+
+
-- 
cgit v0.10.2


From 0ec449ee95b20245fef4aa9fa2486456f1540514 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:41:25 +0200
Subject: amd64_edac: add EDAC core-related initializers

Borislav:

- add a amd64_free_mc_sibling_devices() helper instead of opencoding the
  release-path.
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 09991c8..5a6e714 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2455,4 +2455,319 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
 }
 EXPORT_SYMBOL_GPL(amd64_process_error_info);
 
+/*
+ * The main polling 'check' function, called FROM the edac core to perform the
+ * error checking and if an error is encountered, error processing.
+ */
+static void amd64_check(struct mem_ctl_info *mci)
+{
+	struct amd64_error_info_regs info;
+
+	if (amd64_get_error_info(mci, &info))
+		amd64_process_error_info(mci, &info, 1);
+}
+
+/*
+ * Input:
+ *	1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
+ *	2) AMD Family index value
+ *
+ * Ouput:
+ *	Upon return of 0, the following filled in:
+ *
+ *		struct pvt->addr_f1_ctl
+ *		struct pvt->misc_f3_ctl
+ *
+ *	Filled in with related device funcitions of 'dram_f2_ctl'
+ *	These devices are "reserved" via the pci_get_device()
+ *
+ *	Upon return of 1 (error status):
+ *
+ *		Nothing reserved
+ */
+static int amd64_reserve_mc_sibling_devices(struct amd64_pvt *pvt, int mc_idx)
+{
+	const struct amd64_family_type *amd64_dev = &amd64_family_types[mc_idx];
+
+	/* Reserve the ADDRESS MAP Device */
+	pvt->addr_f1_ctl = pci_get_related_function(pvt->dram_f2_ctl->vendor,
+						    amd64_dev->addr_f1_ctl,
+						    pvt->dram_f2_ctl);
+
+	if (!pvt->addr_f1_ctl) {
+		amd64_printk(KERN_ERR, "error address map device not found: "
+			     "vendor %x device 0x%x (broken BIOS?)\n",
+			     PCI_VENDOR_ID_AMD, amd64_dev->addr_f1_ctl);
+		return 1;
+	}
+
+	/* Reserve the MISC Device */
+	pvt->misc_f3_ctl = pci_get_related_function(pvt->dram_f2_ctl->vendor,
+						    amd64_dev->misc_f3_ctl,
+						    pvt->dram_f2_ctl);
+
+	if (!pvt->misc_f3_ctl) {
+		pci_dev_put(pvt->addr_f1_ctl);
+		pvt->addr_f1_ctl = NULL;
+
+		amd64_printk(KERN_ERR, "error miscellaneous device not found: "
+			     "vendor %x device 0x%x (broken BIOS?)\n",
+			     PCI_VENDOR_ID_AMD, amd64_dev->misc_f3_ctl);
+		return 1;
+	}
+
+	debugf1("    Addr Map device PCI Bus ID:\t%s\n",
+		pci_name(pvt->addr_f1_ctl));
+	debugf1("    DRAM MEM-CTL PCI Bus ID:\t%s\n",
+		pci_name(pvt->dram_f2_ctl));
+	debugf1("    Misc device PCI Bus ID:\t%s\n",
+		pci_name(pvt->misc_f3_ctl));
+
+	return 0;
+}
+
+static void amd64_free_mc_sibling_devices(struct amd64_pvt *pvt)
+{
+	pci_dev_put(pvt->addr_f1_ctl);
+	pci_dev_put(pvt->misc_f3_ctl);
+}
+
+/*
+ * Retrieve the hardware registers of the memory controller (this includes the
+ * 'Address Map' and 'Misc' device regs)
+ */
+static void amd64_read_mc_registers(struct amd64_pvt *pvt)
+{
+	u64 msr_val;
+	int dram, err = 0;
+
+	/*
+	 * Retrieve TOP_MEM and TOP_MEM2; no masking off of reserved bits since
+	 * those are Read-As-Zero
+	 */
+	rdmsrl(MSR_K8_TOP_MEM1, msr_val);
+	pvt->top_mem = msr_val >> 23;
+	debugf0("  TOP_MEM=0x%08llx\n", pvt->top_mem);
+
+	/* check first whether TOP_MEM2 is enabled */
+	rdmsrl(MSR_K8_SYSCFG, msr_val);
+	if (msr_val & (1U << 21)) {
+		rdmsrl(MSR_K8_TOP_MEM2, msr_val);
+		pvt->top_mem2 = msr_val >> 23;
+		debugf0("  TOP_MEM2=0x%08llx\n", pvt->top_mem2);
+	} else
+		debugf0("  TOP_MEM2 disabled.\n");
+
+	amd64_cpu_display_info(pvt);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCAP, &pvt->nbcap);
+	if (err)
+		goto err_reg;
+
+	if (pvt->ops->read_dram_ctl_register)
+		pvt->ops->read_dram_ctl_register(pvt);
+
+	for (dram = 0; dram < DRAM_REG_COUNT; dram++) {
+		/*
+		 * Call CPU specific READ function to get the DRAM Base and
+		 * Limit values from the DCT.
+		 */
+		pvt->ops->read_dram_base_limit(pvt, dram);
+
+		/*
+		 * Only print out debug info on rows with both R and W Enabled.
+		 * Normal processing, compiler should optimize this whole 'if'
+		 * debug output block away.
+		 */
+		if (pvt->dram_rw_en[dram] != 0) {
+			debugf1("  DRAM_BASE[%d]: 0x%8.08x-%8.08x "
+				"DRAM_LIMIT:  0x%8.08x-%8.08x\n",
+				dram,
+				(u32)(pvt->dram_base[dram] >> 32),
+				(u32)(pvt->dram_base[dram] & 0xFFFFFFFF),
+				(u32)(pvt->dram_limit[dram] >> 32),
+				(u32)(pvt->dram_limit[dram] & 0xFFFFFFFF));
+			debugf1("        IntlvEn=%s %s %s "
+				"IntlvSel=%d DstNode=%d\n",
+				pvt->dram_IntlvEn[dram] ?
+					"Enabled" : "Disabled",
+				(pvt->dram_rw_en[dram] & 0x2) ? "W" : "!W",
+				(pvt->dram_rw_en[dram] & 0x1) ? "R" : "!R",
+				pvt->dram_IntlvSel[dram],
+				pvt->dram_DstNode[dram]);
+		}
+	}
+
+	amd64_read_dct_base_mask(pvt);
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl, K8_DHAR, &pvt->dhar);
+	if (err)
+		goto err_reg;
+
+	amd64_read_dbam_reg(pvt);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl,
+				F10_ONLINE_SPARE, &pvt->online_spare);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCHR_0, &pvt->dchr0);
+	if (err)
+		goto err_reg;
+
+	if (!dct_ganging_enabled(pvt)) {
+		err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_1,
+						&pvt->dclr1);
+		if (err)
+			goto err_reg;
+
+		err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCHR_1,
+						&pvt->dchr1);
+		if (err)
+			goto err_reg;
+	}
+
+	amd64_dump_misc_regs(pvt);
+
+err_reg:
+	debugf0("Reading an MC register failed\n");
+
+}
+
+/*
+ * NOTE: CPU Revision Dependent code
+ *
+ * Input:
+ *	@csrow_nr ChipSelect Row Number (0..CHIPSELECT_COUNT-1)
+ *	k8 private pointer to -->
+ *			DRAM Bank Address mapping register
+ *			node_id
+ *			DCL register where dual_channel_active is
+ *
+ * The DBAM register consists of 4 sets of 4 bits each definitions:
+ *
+ * Bits:	CSROWs
+ * 0-3		CSROWs 0 and 1
+ * 4-7		CSROWs 2 and 3
+ * 8-11		CSROWs 4 and 5
+ * 12-15	CSROWs 6 and 7
+ *
+ * Values range from: 0 to 15
+ * The meaning of the values depends on CPU revision and dual-channel state,
+ * see relevant BKDG more info.
+ *
+ * The memory controller provides for total of only 8 CSROWs in its current
+ * architecture. Each "pair" of CSROWs normally represents just one DIMM in
+ * single channel or two (2) DIMMs in dual channel mode.
+ *
+ * The following code logic collapses the various tables for CSROW based on CPU
+ * revision.
+ *
+ * Returns:
+ *	The number of PAGE_SIZE pages on the specified CSROW number it
+ *	encompasses
+ *
+ */
+static u32 amd64_csrow_nr_pages(int csrow_nr, struct amd64_pvt *pvt)
+{
+	u32 dram_map, nr_pages;
+
+	/*
+	 * The math on this doesn't look right on the surface because x/2*4 can
+	 * be simplified to x*2 but this expression makes use of the fact that
+	 * it is integral math where 1/2=0. This intermediate value becomes the
+	 * number of bits to shift the DBAM register to extract the proper CSROW
+	 * field.
+	 */
+	dram_map = (pvt->dbam0 >> ((csrow_nr / 2) * 4)) & 0xF;
+
+	nr_pages = pvt->ops->dbam_map_to_pages(pvt, dram_map);
+
+	/*
+	 * If dual channel then double the memory size of single channel.
+	 * Channel count is 1 or 2
+	 */
+	nr_pages <<= (pvt->channel_count - 1);
+
+	debugf0("  (csrow=%d) DBAM map index= %d\n", csrow_nr, dram_map);
+	debugf0("    nr_pages= %u  channel-count = %d\n",
+		nr_pages, pvt->channel_count);
+
+	return nr_pages;
+}
+
+/*
+ * Initialize the array of csrow attribute instances, based on the values
+ * from pci config hardware registers.
+ */
+static int amd64_init_csrows(struct mem_ctl_info *mci)
+{
+	struct csrow_info *csrow;
+	struct amd64_pvt *pvt;
+	u64 input_addr_min, input_addr_max, sys_addr;
+	int i, err = 0, empty = 1;
+
+	pvt = mci->pvt_info;
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &pvt->nbcfg);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	debugf0("NBCFG= 0x%x  CHIPKILL= %s DRAM ECC= %s\n", pvt->nbcfg,
+		(pvt->nbcfg & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(pvt->nbcfg & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"
+		);
+
+	for (i = 0; i < CHIPSELECT_COUNT; i++) {
+		csrow = &mci->csrows[i];
+
+		if ((pvt->dcsb0[i] & K8_DCSB_CS_ENABLE) == 0) {
+			debugf1("----CSROW %d EMPTY for node %d\n", i,
+				pvt->mc_node_id);
+			continue;
+		}
+
+		debugf1("----CSROW %d VALID for MC node %d\n",
+			i, pvt->mc_node_id);
+
+		empty = 0;
+		csrow->nr_pages = amd64_csrow_nr_pages(i, pvt);
+		find_csrow_limits(mci, i, &input_addr_min, &input_addr_max);
+		sys_addr = input_addr_to_sys_addr(mci, input_addr_min);
+		csrow->first_page = (u32) (sys_addr >> PAGE_SHIFT);
+		sys_addr = input_addr_to_sys_addr(mci, input_addr_max);
+		csrow->last_page = (u32) (sys_addr >> PAGE_SHIFT);
+		csrow->page_mask = ~mask_from_dct_mask(pvt, i);
+		/* 8 bytes of resolution */
+
+		csrow->mtype = amd64_determine_memory_type(pvt);
+
+		debugf1("  for MC node %d csrow %d:\n", pvt->mc_node_id, i);
+		debugf1("    input_addr_min: 0x%lx input_addr_max: 0x%lx\n",
+			(unsigned long)input_addr_min,
+			(unsigned long)input_addr_max);
+		debugf1("    sys_addr: 0x%lx  page_mask: 0x%lx\n",
+			(unsigned long)sys_addr, csrow->page_mask);
+		debugf1("    nr_pages: %u  first_page: 0x%lx "
+			"last_page: 0x%lx\n",
+			(unsigned)csrow->nr_pages,
+			csrow->first_page, csrow->last_page);
+
+		/*
+		 * determine whether CHIPKILL or JUST ECC or NO ECC is operating
+		 */
+		if (pvt->nbcfg & K8_NBCFG_ECC_ENABLE)
+			csrow->edac_mode =
+			    (pvt->nbcfg & K8_NBCFG_CHIPKILL) ?
+			    EDAC_S4ECD4ED : EDAC_SECDED;
+		else
+			csrow->edac_mode = EDAC_NONE;
+	}
+
+	return empty;
+}
 
-- 
cgit v0.10.2


From f9431992b6227069bc54800d55531c6f78d276a7 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:46:08 +0200
Subject: amd64_edac: add ECC reporting initializers

Borislav:
- convert to the new {rd|wr}msr_on_cpus interfaces.
- convert pvt->old_mcgctl to a bitmask thus saving some bytes
- fix/cleanup comments
- fix function return value patterns
- add a proper bugfix found by Doug to amd64_check_ecc_enabled where we
  missed checking for the ECC enabled bit in NB CFG.
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5a6e714..3b6c421 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2771,3 +2771,210 @@ static int amd64_init_csrows(struct mem_ctl_info *mci)
 	return empty;
 }
 
+/*
+ * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we"
+ * enable it.
+ */
+static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!ecc_enable_override)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	amd64_printk(KERN_WARNING,
+		"'ecc_enable_override' parameter is active, "
+		"Enabling AMD ECC hardware now: CAUTION\n");
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+
+	/* turn on UECCn and CECCEn bits */
+	pvt->old_nbctl = value & mask;
+	pvt->nbctl_mcgctl_saved = 1;
+
+	value |= mask;
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
+			set_bit(idx, &pvt->old_mcgctl);
+
+		msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	debugf0("NBCFG(1)= 0x%x  CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	if (!(value & K8_NBCFG_ECC_ENABLE)) {
+		amd64_printk(KERN_WARNING,
+			"This node reports that DRAM ECC is "
+			"currently Disabled; ENABLING now\n");
+
+		/* Attempt to turn on DRAM ECC Enable */
+		value |= K8_NBCFG_ECC_ENABLE;
+		pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCFG, value);
+
+		err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+		if (err)
+			debugf0("Reading K8_NBCFG failed\n");
+
+		if (!(value & K8_NBCFG_ECC_ENABLE)) {
+			amd64_printk(KERN_WARNING,
+				"Hardware rejects Enabling DRAM ECC checking\n"
+				"Check memory DIMM configuration\n");
+		} else {
+			amd64_printk(KERN_DEBUG,
+				"Hardware accepted DRAM ECC Enable\n");
+		}
+	}
+	debugf0("NBCFG(2)= 0x%x  CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	pvt->ctl_error_info.nbcfg = value;
+}
+
+static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
+{
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!pvt->nbctl_mcgctl_saved)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+	value &= ~mask;
+	value |= pvt->old_nbctl;
+
+	/* restore the NB Enable MCGCTL bit */
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		msrs[idx].l &= ~K8_MSR_MCGCTL_NBE;
+		msrs[idx].l |=
+			test_bit(idx, &pvt->old_mcgctl) << K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+}
+
+static void check_mcg_ctl(void *ret)
+{
+	u64 msr_val = 0;
+	u8 nbe;
+
+	rdmsrl(MSR_IA32_MCG_CTL, msr_val);
+	nbe = msr_val & K8_MSR_MCGCTL_NBE;
+
+	debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
+		raw_smp_processor_id(), msr_val,
+		(nbe ? "enabled" : "disabled"));
+
+	if (!nbe)
+		*(int *)ret = 0;
+}
+
+/* check MCG_CTL on all the cpus on this node */
+static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask)
+{
+	int ret = 1;
+	preempt_disable();
+	smp_call_function_many(mask, check_mcg_ctl, &ret, 1);
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * EDAC requires that the BIOS have ECC enabled before taking over the
+ * processing of ECC errors. This is because the BIOS can properly initialize
+ * the memory system completely. A command line option allows to force-enable
+ * hardware ECC later in amd64_enable_ecc_error_reporting().
+ */
+static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
+{
+	u32 value;
+	int err = 0, ret = 0;
+	u8 ecc_enabled = 0;
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+
+	ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
+
+	ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id));
+
+	debugf0("K8_NBCFG=0x%x,  DRAM ECC is %s\n", value,
+			(value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled"));
+
+	if (!ecc_enabled || !ret) {
+		if (!ecc_enabled) {
+			amd64_printk(KERN_WARNING, "This node reports that "
+						   "Memory ECC is currently "
+						   "disabled.\n");
+
+			amd64_printk(KERN_WARNING, "bit 0x%lx in register "
+				"F3x%x of the MISC_CONTROL device (%s) "
+				"should be enabled\n", K8_NBCFG_ECC_ENABLE,
+				K8_NBCFG, pci_name(pvt->misc_f3_ctl));
+		}
+		if (!ret) {
+			amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x "
+					"of node %d should be enabled\n",
+					K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL,
+					pvt->mc_node_id);
+		}
+		if (!ecc_enable_override) {
+			amd64_printk(KERN_WARNING, "WARNING: ECC is NOT "
+				"currently enabled by the BIOS. Module "
+				"will NOT be loaded.\n"
+				"    Either Enable ECC in the BIOS, "
+				"or use the 'ecc_enable_override' "
+				"parameter.\n"
+				"    Might be a BIOS bug, if BIOS says "
+				"ECC is enabled\n"
+				"    Use of the override can cause "
+				"unknown side effects.\n");
+			ret = -ENODEV;
+		}
+	} else {
+		amd64_printk(KERN_INFO,
+			"ECC is enabled by BIOS, Proceeding "
+			"with EDAC module initialization\n");
+
+		/* CLEAR the override, since BIOS controlled it */
+		ecc_enable_override = 0;
+	}
+
+	return ret;
+}
+
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 6f5d5d6..e7aa760 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -70,6 +70,7 @@
 #include <linux/slab.h>
 #include <linux/mmzone.h>
 #include <linux/edac.h>
+#include <asm/msr.h>
 #include "edac_core.h"
 
 #define amd64_printk(level, fmt, arg...) \
@@ -549,7 +550,7 @@ struct amd64_pvt {
 	/* Save old hw registers' values before we modified them */
 	u32 nbctl_mcgctl_saved;		/* When true, following 2 are valid */
 	u32 old_nbctl;
-	u32 *old_mcgctl;		/* per core on this node */
+	unsigned long old_mcgctl;	/* per core on this node */
 
 	/* MC Type Index value: socket F vs Family 10h */
 	u32 mc_type_index;
-- 
cgit v0.10.2


From 7d6034d3213e2dd1c0f8678e11064007413011c4 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 20:01:01 +0200
Subject: amd64_edac: add module registration routines

Also, link into Kbuild by adding Kconfig and Makefile entries.

Borislav:
- Kconfig/Makefile splitting
- use zero-sized arrays for the sysfs attrs if not enabled
- rename sysfs attrs to more conform values
- shorten CONFIG_ names
- make multiple structure members assignment vertically aligned
- fix/cleanup comments
- fix function return value patterns
- fix err labels
- fix a memleak bug caught by Ingo
- remove the NUMA dependency and use num_k8_northbrides for initializing
  a driver instance per NB.
- do not copy the pvt contents into the mci struct in
  amd64_init_2nd_stage() and save it in the mci->pvt_info void ptr
  instead.
- cleanup debug calls
- simplify amd64_setup_pci_device()

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 956982f..3dc650a 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -58,6 +58,32 @@ config EDAC_MM_EDAC
 	  occurred so that a particular failing memory module can be
 	  replaced.  If unsure, select 'Y'.
 
+config EDAC_AMD64
+	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
+	depends on EDAC_MM_EDAC && X86 && PCI
+	default m
+	help
+	Support for error detection and correction on the AMD 64
+	Families of Memory Controllers (K8, F10h and F11h)
+
+config EDAC_AMD64_ERROR_INJECTION
+	bool "Sysfs Error Injection facilities"
+	depends on EDAC_AMD64
+	help
+	  Recent Opterons (Family 10h and later) provide for Memory Error
+	  Injection into the ECC detection circuits. The amd64_edac module
+	  allows the operator/user to inject Uncorrectable and Correctable
+	  errors into DRAM.
+
+	  When enabled, in each of the respective memory controller directories
+	  (/sys/devices/system/edac/mc/mcX), there are 3 input files:
+
+	  - inject_section (0..3, 16-byte section of 64-byte cacheline),
+	  - inject_word (0..8, 16-bit word of 16-byte section),
+	  - inject_ecc_vector (hex ecc vector: select bits of inject word)
+
+	  In addition, there are two control files, inject_read and inject_write,
+	  which trigger the DRAM ECC Read and Write respectively.
 
 config EDAC_AMD76X
 	tristate "AMD 76x (760, 762, 768)"
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 5907681..633dc56 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -30,6 +30,13 @@ obj-$(CONFIG_EDAC_I3000)		+= i3000_edac.o
 obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
+
+amd64_edac_mod-y :=  amd64_edac_err_types.o amd64_edac.o
+amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
+amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o
+
+obj-$(CONFIG_EDAC_AMD64)		+= amd64_edac_mod.o
+
 obj-$(CONFIG_EDAC_PASEMI)		+= pasemi_edac.o
 obj-$(CONFIG_EDAC_MPC85XX)		+= mpc85xx_edac.o
 obj-$(CONFIG_EDAC_MV64X60)		+= mv64x60_edac.o
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 3b6c421..c36bf40 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,4 +1,5 @@
 #include "amd64_edac.h"
+#include <asm/k8.h>
 
 static struct edac_pci_ctl_info *amd64_ctl_pci;
 
@@ -2978,3 +2979,376 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
 	return ret;
 }
 
+struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) +
+					  ARRAY_SIZE(amd64_inj_attrs) +
+					  1];
+
+struct mcidev_sysfs_attribute terminator = { .attr = { .name = NULL } };
+
+static void amd64_set_mc_sysfs_attributes(struct mem_ctl_info *mci)
+{
+	unsigned int i = 0, j = 0;
+
+	for (; i < ARRAY_SIZE(amd64_dbg_attrs); i++)
+		sysfs_attrs[i] = amd64_dbg_attrs[i];
+
+	for (j = 0; j < ARRAY_SIZE(amd64_inj_attrs); j++, i++)
+		sysfs_attrs[i] = amd64_inj_attrs[j];
+
+	sysfs_attrs[i] = terminator;
+
+	mci->mc_driver_sysfs_attributes = sysfs_attrs;
+}
+
+static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	mci->mtype_cap		= MEM_FLAG_DDR2 | MEM_FLAG_RDDR2;
+	mci->edac_ctl_cap	= EDAC_FLAG_NONE;
+	mci->edac_cap		= EDAC_FLAG_NONE;
+
+	if (pvt->nbcap & K8_NBCAP_SECDED)
+		mci->edac_ctl_cap |= EDAC_FLAG_SECDED;
+
+	if (pvt->nbcap & K8_NBCAP_CHIPKILL)
+		mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED;
+
+	mci->edac_cap		= amd64_determine_edac_cap(pvt);
+	mci->mod_name		= EDAC_MOD_STR;
+	mci->mod_ver		= EDAC_AMD64_VERSION;
+	mci->ctl_name		= get_amd_family_name(pvt->mc_type_index);
+	mci->dev_name		= pci_name(pvt->dram_f2_ctl);
+	mci->ctl_page_to_phys	= NULL;
+
+	/* IMPORTANT: Set the polling 'check' function in this module */
+	mci->edac_check		= amd64_check;
+
+	/* memory scrubber interface */
+	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
+	mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
+}
+
+/*
+ * Init stuff for this DRAM Controller device.
+ *
+ * Due to a hardware feature on Fam10h CPUs, the Enable Extended Configuration
+ * Space feature MUST be enabled on ALL Processors prior to actually reading
+ * from the ECS registers. Since the loading of the module can occur on any
+ * 'core', and cores don't 'see' all the other processors ECS data when the
+ * others are NOT enabled. Our solution is to first enable ECS access in this
+ * routine on all processors, gather some data in a amd64_pvt structure and
+ * later come back in a finish-setup function to perform that final
+ * initialization. See also amd64_init_2nd_stage() for that.
+ */
+static int amd64_probe_one_instance(struct pci_dev *dram_f2_ctl,
+				    int mc_type_index)
+{
+	struct amd64_pvt *pvt = NULL;
+	int err = 0, ret;
+
+	ret = -ENOMEM;
+	pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
+	if (!pvt)
+		goto err_exit;
+
+	pvt->mc_node_id = get_mc_node_id_from_pdev(dram_f2_ctl);
+
+	pvt->dram_f2_ctl	= dram_f2_ctl;
+	pvt->ext_model		= boot_cpu_data.x86_model >> 4;
+	pvt->mc_type_index	= mc_type_index;
+	pvt->ops		= family_ops(mc_type_index);
+	pvt->old_mcgctl		= 0;
+
+	/*
+	 * We have the dram_f2_ctl device as an argument, now go reserve its
+	 * sibling devices from the PCI system.
+	 */
+	ret = -ENODEV;
+	err = amd64_reserve_mc_sibling_devices(pvt, mc_type_index);
+	if (err)
+		goto err_free;
+
+	ret = -EINVAL;
+	err = amd64_check_ecc_enabled(pvt);
+	if (err)
+		goto err_put;
+
+	/*
+	 * Key operation here: setup of HW prior to performing ops on it. Some
+	 * setup is required to access ECS data. After this is performed, the
+	 * 'teardown' function must be called upon error and normal exit paths.
+	 */
+	if (boot_cpu_data.x86 >= 0x10)
+		amd64_setup(pvt);
+
+	/*
+	 * Save the pointer to the private data for use in 2nd initialization
+	 * stage
+	 */
+	pvt_lookup[pvt->mc_node_id] = pvt;
+
+	return 0;
+
+err_put:
+	amd64_free_mc_sibling_devices(pvt);
+
+err_free:
+	kfree(pvt);
+
+err_exit:
+	return ret;
+}
+
+/*
+ * This is the finishing stage of the init code. Needs to be performed after all
+ * MCs' hardware have been prepped for accessing extended config space.
+ */
+static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
+{
+	int node_id = pvt->mc_node_id;
+	struct mem_ctl_info *mci;
+	int ret, err = 0;
+
+	amd64_read_mc_registers(pvt);
+
+	ret = -ENODEV;
+	if (pvt->ops->probe_valid_hardware) {
+		err = pvt->ops->probe_valid_hardware(pvt);
+		if (err)
+			goto err_exit;
+	}
+
+	/*
+	 * We need to determine how many memory channels there are. Then use
+	 * that information for calculating the size of the dynamic instance
+	 * tables in the 'mci' structure
+	 */
+	pvt->channel_count = pvt->ops->early_channel_count(pvt);
+	if (pvt->channel_count < 0)
+		goto err_exit;
+
+	ret = -ENOMEM;
+	mci = edac_mc_alloc(0, CHIPSELECT_COUNT, pvt->channel_count, node_id);
+	if (!mci)
+		goto err_exit;
+
+	mci->pvt_info = pvt;
+
+	mci->dev = &pvt->dram_f2_ctl->dev;
+	amd64_setup_mci_misc_attributes(mci);
+
+	if (amd64_init_csrows(mci))
+		mci->edac_cap = EDAC_FLAG_NONE;
+
+	amd64_enable_ecc_error_reporting(mci);
+	amd64_set_mc_sysfs_attributes(mci);
+
+	ret = -ENODEV;
+	if (edac_mc_add_mc(mci)) {
+		debugf1("failed edac_mc_add_mc()\n");
+		goto err_add_mc;
+	}
+
+	mci_lookup[node_id] = mci;
+	pvt_lookup[node_id] = NULL;
+	return 0;
+
+err_add_mc:
+	edac_mc_free(mci);
+
+err_exit:
+	debugf0("failure to init 2nd stage: ret=%d\n", ret);
+
+	amd64_restore_ecc_error_reporting(pvt);
+
+	if (boot_cpu_data.x86 > 0xf)
+		amd64_teardown(pvt);
+
+	amd64_free_mc_sibling_devices(pvt);
+
+	kfree(pvt_lookup[pvt->mc_node_id]);
+	pvt_lookup[node_id] = NULL;
+
+	return ret;
+}
+
+
+static int __devinit amd64_init_one_instance(struct pci_dev *pdev,
+				 const struct pci_device_id *mc_type)
+{
+	int ret = 0;
+
+	debugf0("(MC node=%d,mc_type='%s')\n",
+		get_mc_node_id_from_pdev(pdev),
+		get_amd_family_name(mc_type->driver_data));
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0)
+		ret = -EIO;
+	else
+		ret = amd64_probe_one_instance(pdev, mc_type->driver_data);
+
+	if (ret < 0)
+		debugf0("ret=%d\n", ret);
+
+	return ret;
+}
+
+static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+
+	/* Remove from EDAC CORE tracking list */
+	mci = edac_mc_del_mc(&pdev->dev);
+	if (!mci)
+		return;
+
+	pvt = mci->pvt_info;
+
+	amd64_restore_ecc_error_reporting(pvt);
+
+	if (boot_cpu_data.x86 > 0xf)
+		amd64_teardown(pvt);
+
+	amd64_free_mc_sibling_devices(pvt);
+
+	kfree(pvt);
+	mci->pvt_info = NULL;
+
+	mci_lookup[pvt->mc_node_id] = NULL;
+
+	/* Free the EDAC CORE resources */
+	edac_mc_free(mci);
+}
+
+/*
+ * This table is part of the interface for loading drivers for PCI devices. The
+ * PCI core identifies what devices are on a system during boot, and then
+ * inquiry this table to see if this driver is for a given device found.
+ */
+static const struct pci_device_id amd64_pci_table[] __devinitdata = {
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= K8_CPUS
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= F10_CPUS
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_11H_NB_DRAM,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= F11_CPUS
+	},
+	{0, }
+};
+MODULE_DEVICE_TABLE(pci, amd64_pci_table);
+
+static struct pci_driver amd64_pci_driver = {
+	.name		= EDAC_MOD_STR,
+	.probe		= amd64_init_one_instance,
+	.remove		= __devexit_p(amd64_remove_one_instance),
+	.id_table	= amd64_pci_table,
+};
+
+static void amd64_setup_pci_device(void)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+
+	if (amd64_ctl_pci)
+		return;
+
+	mci = mci_lookup[0];
+	if (mci) {
+
+		pvt = mci->pvt_info;
+		amd64_ctl_pci =
+			edac_pci_create_generic_ctl(&pvt->dram_f2_ctl->dev,
+						    EDAC_MOD_STR);
+
+		if (!amd64_ctl_pci) {
+			pr_warning("%s(): Unable to create PCI control\n",
+				   __func__);
+
+			pr_warning("%s(): PCI error report via EDAC not set\n",
+				   __func__);
+			}
+	}
+}
+
+static int __init amd64_edac_init(void)
+{
+	int nb, err = -ENODEV;
+
+	edac_printk(KERN_INFO, EDAC_MOD_STR, EDAC_AMD64_VERSION "\n");
+
+	opstate_init();
+
+	if (cache_k8_northbridges() < 0)
+		goto err_exit;
+
+	err = pci_register_driver(&amd64_pci_driver);
+	if (err)
+		return err;
+
+	/*
+	 * At this point, the array 'pvt_lookup[]' contains pointers to alloc'd
+	 * amd64_pvt structs. These will be used in the 2nd stage init function
+	 * to finish initialization of the MC instances.
+	 */
+	for (nb = 0; nb < num_k8_northbridges; nb++) {
+		if (!pvt_lookup[nb])
+			continue;
+
+		err = amd64_init_2nd_stage(pvt_lookup[nb]);
+		if (err)
+			goto err_exit;
+	}
+
+	amd64_setup_pci_device();
+
+	return 0;
+
+err_exit:
+	debugf0("'finish_setup' stage failed\n");
+	pci_unregister_driver(&amd64_pci_driver);
+
+	return err;
+}
+
+static void __exit amd64_edac_exit(void)
+{
+	if (amd64_ctl_pci)
+		edac_pci_release_generic_ctl(amd64_ctl_pci);
+
+	pci_unregister_driver(&amd64_pci_driver);
+}
+
+module_init(amd64_edac_init);
+module_exit(amd64_edac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("SoftwareBitMaker: Doug Thompson, "
+		"Dave Peterson, Thayne Harbaugh");
+MODULE_DESCRIPTION("MC support for AMD64 memory controllers - "
+		EDAC_AMD64_VERSION);
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index e7aa760..a159957 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -577,6 +577,21 @@ extern const char *ii_msgs[4];
 extern const char *ext_msgs[32];
 extern const char *htlink_msgs[8];
 
+#ifdef CONFIG_EDAC_DEBUG
+#define NUM_DBG_ATTRS 9
+#else
+#define NUM_DBG_ATTRS 0
+#endif
+
+#ifdef CONFIG_EDAC_AMD64_ERROR_INJECTION
+#define NUM_INJ_ATTRS 5
+#else
+#define NUM_INJ_ATTRS 0
+#endif
+
+extern struct mcidev_sysfs_attribute amd64_dbg_attrs[NUM_DBG_ATTRS],
+				     amd64_inj_attrs[NUM_INJ_ATTRS];
+
 /*
  * Each of the PCI Device IDs types have their own set of hardware accessor
  * functions and per device encoding/decoding logic.
-- 
cgit v0.10.2


From 3d373290450b10933dad7c387c42179f54417009 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:18:46 +0200
Subject: amd64_edac: do not enable module by default

While at it, fix a link failure when !K8_NB.

Acked-by: Doug Thompson <dougthompson@xmission.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 3dc650a..eff363d 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -60,11 +60,10 @@ config EDAC_MM_EDAC
 
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
-	depends on EDAC_MM_EDAC && X86 && PCI
-	default m
+	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI
 	help
-	Support for error detection and correction on the AMD 64
-	Families of Memory Controllers (K8, F10h and F11h)
+	  Support for error detection and correction on the AMD 64
+	  Families of Memory Controllers (K8, F10h and F11h)
 
 config EDAC_AMD64_ERROR_INJECTION
 	bool "Sysfs Error Injection facilities"
-- 
cgit v0.10.2


From 9456ffffcf3c2f8fafeb9dee45f1502209602253 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:26:53 +0200
Subject: EDAC: do not enable modules by default

Prevent EDAC compilation units from being built by default and let the
user explicitly select the needed modules.

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index eff363d..ab4f359 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -49,7 +49,6 @@ config EDAC_DEBUG_VERBOSE
 
 config EDAC_MM_EDAC
 	tristate "Main Memory EDAC (Error Detection And Correction) reporting"
-	default y
 	help
 	  Some systems are able to detect and correct errors in main
 	  memory.  EDAC can report statistics on memory error
-- 
cgit v0.10.2


From c476c23b45a41eb4e3ea63af786cc4d74762fe11 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:31:58 +0200
Subject: amd64_edac: add MAINTAINERS entry

Acked-by: Doug Thompson <dougthompson@xmission.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abdd..6362ee8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1978,6 +1978,16 @@ F:	Documentation/edac.txt
 F:	drivers/edac/edac_*
 F:	include/linux/edac.h
 
+EDAC-AMD64
+P:	Doug Thompson
+M:	dougthompson@xmission.com
+P:	Borislav Petkov
+M:	borislav.petkov@amd.com
+L:	bluesmoke-devel@lists.sourceforge.net (moderated for non-subscribers)
+W:	bluesmoke.sourceforge.net
+S:	Supported
+F:	drivers/edac/amd64_edac*
+
 EDAC-E752X
 P:	Mark Gross
 M:	mark.gross@intel.com
-- 
cgit v0.10.2


From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 10 Jun 2009 17:06:12 +0800
Subject: perf_counter/x86: Fix the model number of Intel Core2 processors

Fix the model number of Intel Core2 processors according to the
documentation: Intel Processor Identification with the CPUID
Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Also-Reported-by: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com>
[ Added two more model numbers suggested by Arnd Bergmann ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40978aa..49f2585 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1407,7 +1407,10 @@ static int intel_pmu_init(void)
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 17:
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
-- 
cgit v0.10.2


From 4da646b7b52552f3b43eae27ffa5aa2c200f6db6 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 8 Apr 2009 02:00:13 -0400
Subject: [libata] ahci: use less error-prone array initializers

Also, remove unneeded prototype.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 6b91c26..cd832cb 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -313,7 +313,6 @@ static void ahci_error_handler(struct ata_port *ap);
 static void ahci_post_internal_cmd(struct ata_queued_cmd *qc);
 static int ahci_port_resume(struct ata_port *ap);
 static void ahci_dev_config(struct ata_device *dev);
-static unsigned int ahci_fill_sg(struct ata_queued_cmd *qc, void *cmd_tbl);
 static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag,
 			       u32 opts);
 #ifdef CONFIG_PM
@@ -404,14 +403,14 @@ static struct ata_port_operations ahci_sb600_ops = {
 #define AHCI_HFLAGS(flags)	.private_data	= (void *)(flags)
 
 static const struct ata_port_info ahci_port_info[] = {
-	/* board_ahci */
+	[board_ahci] =
 	{
 		.flags		= AHCI_FLAG_COMMON,
 		.pio_mask	= ATA_PIO4,
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_vt8251 */
+	[board_ahci_vt8251] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_PMP),
 		.flags		= AHCI_FLAG_COMMON,
@@ -419,7 +418,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_vt8251_ops,
 	},
-	/* board_ahci_ign_iferr */
+	[board_ahci_ign_iferr] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_IRQ_IF_ERR),
 		.flags		= AHCI_FLAG_COMMON,
@@ -427,7 +426,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_sb600 */
+	[board_ahci_sb600] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL |
 				 AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI |
@@ -437,7 +436,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_sb600_ops,
 	},
-	/* board_ahci_mv */
+	[board_ahci_mv] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_MSI |
 				 AHCI_HFLAG_MV_PATA | AHCI_HFLAG_NO_PMP),
@@ -447,7 +446,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_sb700, for SB700 and SB800 */
+	[board_ahci_sb700] =	/* for SB700 and SB800 */
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL),
 		.flags		= AHCI_FLAG_COMMON,
@@ -455,7 +454,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_sb600_ops,
 	},
-	/* board_ahci_mcp65 */
+	[board_ahci_mcp65] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_YES_NCQ),
 		.flags		= AHCI_FLAG_COMMON,
@@ -463,7 +462,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_nopmp */
+	[board_ahci_nopmp] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_PMP),
 		.flags		= AHCI_FLAG_COMMON,
-- 
cgit v0.10.2


From 2102d7497393e982bf38ffe8f5fd3d487104880d Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Sun, 15 Feb 2009 23:30:38 +0400
Subject: libata-sff: avoid byte swapping in ata_sff_data_xfer()

Handling of the trailing byte in ata_sff_data_xfer() is suboptimal bacause:

- it always initializes the padding buffer to 0 which is not really needed in
  both the read and write cases;

- it has to use memcpy() to transfer a single byte from/to the padding buffer;

- it uses io{read|write}16() accessors which swap bytes on the big endian CPUs
  and so have to additionally convert the data from/to the little endian format
  instead of using io{read|write}16_rep() accessors which are not supposed to
  change the byte ordering.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bb18415..bbbb1fa 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -727,17 +727,23 @@ unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf,
 	else
 		iowrite16_rep(data_addr, buf, words);
 
-	/* Transfer trailing 1 byte, if any. */
+	/* Transfer trailing byte, if any. */
 	if (unlikely(buflen & 0x01)) {
-		__le16 align_buf[1] = { 0 };
-		unsigned char *trailing_buf = buf + buflen - 1;
+		unsigned char pad[2];
 
+		/* Point buf to the tail of buffer */
+		buf += buflen - 1;
+
+		/*
+		 * Use io*16_rep() accessors here as well to avoid pointlessly
+		 * swapping bytes to and fro on the big endian machines...
+		 */
 		if (rw == READ) {
-			align_buf[0] = cpu_to_le16(ioread16(data_addr));
-			memcpy(trailing_buf, align_buf, 1);
+			ioread16_rep(data_addr, pad, 1);
+			*buf = pad[0];
 		} else {
-			memcpy(align_buf, trailing_buf, 1);
-			iowrite16(le16_to_cpu(align_buf[0]), data_addr);
+			pad[0] = *buf;
+			iowrite16_rep(data_addr, pad, 1);
 		}
 		words++;
 	}
-- 
cgit v0.10.2


From f35b5e7c066a41f60683d2689e52a1336479913b Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Wed, 15 Apr 2009 00:00:54 +0400
Subject: sata_sx4: speed up ECC initialization

ECC initialization takes too long. It writes zeroes by portions
of 4 byte, it takes more than 6 minutes on my machine to initialize
512Mb ECC DIMM module. Change portion to 128Kb - it significantly
reduces initialization time.

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
index eb05a3c..bbcf970 100644
--- a/drivers/ata/sata_sx4.c
+++ b/drivers/ata/sata_sx4.c
@@ -193,6 +193,7 @@ enum {
 					  PDC_TIMER_MASK_INT,
 };
 
+#define ECC_ERASE_BUF_SZ (128 * 1024)
 
 struct pdc_port_priv {
 	u8			dimm_buf[(ATA_PRD_SZ * ATA_MAX_PRD) + 512];
@@ -1280,7 +1281,6 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
 {
 	int speed, size, length;
 	u32 addr, spd0, pci_status;
-	u32 tmp = 0;
 	u32 time_period = 0;
 	u32 tcount = 0;
 	u32 ticks = 0;
@@ -1395,14 +1395,17 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
 	pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
 			  PDC_DIMM_SPD_TYPE, &spd0);
 	if (spd0 == 0x02) {
+		void *buf;
 		VPRINTK("Start ECC initialization\n");
 		addr = 0;
 		length = size * 1024 * 1024;
+		buf = kzalloc(ECC_ERASE_BUF_SZ, GFP_KERNEL);
 		while (addr < length) {
-			pdc20621_put_to_dimm(host, (void *) &tmp, addr,
-					     sizeof(u32));
-			addr += sizeof(u32);
+			pdc20621_put_to_dimm(host, buf, addr,
+					     ECC_ERASE_BUF_SZ);
+			addr += ECC_ERASE_BUF_SZ;
 		}
+		kfree(buf);
 		VPRINTK("Finish ECC initialization\n");
 	}
 	return 0;
-- 
cgit v0.10.2


From 31f80112cc7e7ea4c220d6f62b0a7052754befb3 Mon Sep 17 00:00:00 2001
From: Robert Hancock <hancockrwd@gmail.com>
Date: Mon, 13 Apr 2009 22:57:28 -0600
Subject: sata_sil: enable 32-bit PIO

32-bit PIO seems to work fine on sata_sil hardware (tested on SiI3114) and is
listed as OK in the Silicon Image datasheets. Enable it.

Signed-off-by: Robert Hancock <hancockrwd@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index e67ce8e..030ec07 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -183,7 +183,7 @@ static struct scsi_host_template sil_sht = {
 };
 
 static struct ata_port_operations sil_ops = {
-	.inherits		= &ata_bmdma_port_ops,
+	.inherits		= &ata_bmdma32_port_ops,
 	.dev_config		= sil_dev_config,
 	.set_mode		= sil_set_mode,
 	.bmdma_setup            = sil_bmdma_setup,
-- 
cgit v0.10.2


From 437681800bdaa9feb58cf943dfbbd239c21d3705 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 20 May 2009 09:44:39 +0200
Subject: [libata] get rid of ATA_MAX_QUEUE loop in ata_qc_complete_multiple()
 v2

We very rarely (if ever) complete more than one command in the
sactive mask at the time, even for extremely high IO rates. So
looping over the entire range of possible tags is pointless,
instead use __ffs() to just find the completed tags directly.

Updated to clear the tag from the done_mask instead of shifting
done_mask down as suggested by From: Tejun Heo <htejun@gmail.com>
Verified with a user space tester to produce the same results.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index c924230..ca4d208 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5031,7 +5031,6 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
 {
 	int nr_done = 0;
 	u32 done_mask;
-	int i;
 
 	done_mask = ap->qc_active ^ qc_active;
 
@@ -5041,16 +5040,16 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
 		return -EINVAL;
 	}
 
-	for (i = 0; i < ATA_MAX_QUEUE; i++) {
+	while (done_mask) {
 		struct ata_queued_cmd *qc;
+		unsigned int tag = __ffs(done_mask);
 
-		if (!(done_mask & (1 << i)))
-			continue;
-
-		if ((qc = ata_qc_from_tag(ap, i))) {
+		qc = ata_qc_from_tag(ap, tag);
+		if (qc) {
 			ata_qc_complete(qc);
 			nr_done++;
 		}
+		done_mask &= ~(1 << tag);
 	}
 
 	return nr_done;
-- 
cgit v0.10.2


From d50ce07d6fd8a422757a231f9d7293cbddeaec31 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 May 2009 10:57:41 +0900
Subject: ahci: misc cleanups for EM stuff

Make the following EM related cleanups.

* Use msleep(1) instead of udelay(100) and reduce retry count to 5.

* s/MAX_SLOTS/EM_MAX_SLOTS/, s/MAX_RETRY/EM_MAX_RETRY/

* Make EM constants enums as suggested by Jeff.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index cd832cb..2141a31 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -77,8 +77,6 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
 			      size_t size);
 static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
 					ssize_t size);
-#define MAX_SLOTS 8
-#define MAX_RETRY 15
 
 enum {
 	AHCI_PCI_BAR		= 5,
@@ -231,6 +229,10 @@ enum {
 
 	ICH_MAP				= 0x90, /* ICH MAP register */
 
+	/* em constants */
+	EM_MAX_SLOTS			= 8,
+	EM_MAX_RETRY			= 5,
+
 	/* em_ctl bits */
 	EM_CTL_RST			= (1 << 9), /* Reset */
 	EM_CTL_TM			= (1 << 8), /* Transmit Message */
@@ -282,8 +284,8 @@ struct ahci_port_priv {
 	unsigned int		ncq_saw_dmas:1;
 	unsigned int		ncq_saw_sdb:1;
 	u32 			intr_mask;	/* interrupts to enable */
-	struct ahci_em_priv	em_priv[MAX_SLOTS];/* enclosure management info
-					 	 * per PM slot */
+	/* enclosure management info per PM slot */
+	struct ahci_em_priv	em_priv[EM_MAX_SLOTS];
 };
 
 static int ahci_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
@@ -1140,12 +1142,12 @@ static void ahci_start_port(struct ata_port *ap)
 			emp = &pp->em_priv[link->pmp];
 
 			/* EM Transmit bit maybe busy during init */
-			for (i = 0; i < MAX_RETRY; i++) {
+			for (i = 0; i < EM_MAX_RETRY; i++) {
 				rc = ahci_transmit_led_message(ap,
 							       emp->led_state,
 							       4);
 				if (rc == -EBUSY)
-					udelay(100);
+					msleep(1);
 				else
 					break;
 			}
@@ -1339,7 +1341,7 @@ static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
 
 	/* get the slot number from the message */
 	pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
-	if (pmp < MAX_SLOTS)
+	if (pmp < EM_MAX_SLOTS)
 		emp = &pp->em_priv[pmp];
 	else
 		return -EINVAL;
@@ -1407,7 +1409,7 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
 
 	/* get the slot number from the message */
 	pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
-	if (pmp < MAX_SLOTS)
+	if (pmp < EM_MAX_SLOTS)
 		emp = &pp->em_priv[pmp];
 	else
 		return -EINVAL;
-- 
cgit v0.10.2


From 347979a034539ab20f3bc0c30ac0ccd3c4fd4c2e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 6 May 2009 17:10:08 +0100
Subject: ata_piix: Turn on hotplugging support for older chips

We can't do this for the later ones as they have all sorts of magic boot
time stuff that needs reviewing and the like. However we can do it for the
older ones and it turns out we need to as some IBM docking stations have a
second PIIX series device in them and without this change you can't use it
very well

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 1aeb708..716e369 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1509,8 +1509,8 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
 		dev_printk(KERN_DEBUG, &pdev->dev,
 			   "version " DRV_VERSION "\n");
 
-	/* no hotplugging support (FIXME) */
-	if (!in_module_init)
+	/* no hotplugging support for later devices (FIXME) */
+	if (!in_module_init && ent->driver_data >= ich5_sata)
 		return -ENODEV;
 
 	if (piix_broken_system_poweroff(pdev)) {
-- 
cgit v0.10.2


From 7654db1a9256d746ae4d229ba675f616a5d5e1a1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 6 May 2009 17:10:17 +0100
Subject: ata_piix: Remove stale comment

Combined mode pci quirk hacks went away - so the table to keep in sync
no longer exists.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 716e369..6e165bf 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -223,10 +223,8 @@ static const struct pci_device_id piix_pci_tbl[] = {
 	/* ICH8 Mobile PATA Controller */
 	{ 0x8086, 0x2850, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich_pata_100 },
 
-	/* NOTE: The following PCI ids must be kept in sync with the
-	 * list in drivers/pci/quirks.c.
-	 */
-
+	/* SATA ports */
+	
 	/* 82801EB (ICH5) */
 	{ 0x8086, 0x24d1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich5_sata },
 	/* 82801EB (ICH5) */
-- 
cgit v0.10.2


From ac04527f7947020c5890090b2ac87af4e98d977e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Jun 2009 15:52:39 +0300
Subject: KVM: Disable large pages on misaligned memory slots

If a slots guest physical address and host virtual address unequal (mod
large page size), then we would erronously try to back guest large pages
with host large pages.  Detect this misalignment and diable large page
support for the trouble slot.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5fed9bf..5f865ed 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1086,7 +1086,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 {
 	int r;
 	gfn_t base_gfn;
-	unsigned long npages;
+	unsigned long npages, ugfn;
 	int largepages;
 	unsigned long i;
 	struct kvm_memory_slot *memslot;
@@ -1177,6 +1177,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			new.lpage_info[0].write_count = 1;
 		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
 			new.lpage_info[largepages-1].write_count = 1;
+		ugfn = new.userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, disable large page support for this slot
+		 */
+		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1))
+			for (i = 0; i < largepages; ++i)
+				new.lpage_info[i].write_count = 1;
 	}
 
 	/* Allocate page dirty bitmap if needed */
-- 
cgit v0.10.2


From 09f8ca74ae6c2d78b2c7f6c0751ed0cbe815a3d9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Jun 2009 15:55:21 +0300
Subject: KVM: Prevent overflow in largepages calculation

If userspace specifies a memory slot that is larger than 8 petabytes, it
could overflow the largepages variable.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5f865ed..e211945 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1087,8 +1087,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	int r;
 	gfn_t base_gfn;
 	unsigned long npages, ugfn;
-	int largepages;
-	unsigned long i;
+	unsigned long largepages, i;
 	struct kvm_memory_slot *memslot;
 	struct kvm_memory_slot old, new;
 
-- 
cgit v0.10.2


From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 13:40:57 +0200
Subject: perf_counter: More aggressive frequency adjustment

Also employ the overflow handler to adjust the frequency, this results
in a stable frequency in about 40~50 samples, instead of that many ticks.

This also means we can start sampling at a sample period of 1 without
running head-first into the throttle.

It relies on sched_clock() to accurately measure the time difference
between the overflow NMIs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 49f2585..240ca56 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	if (!hwc->sample_period)
+	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
 
-	atomic64_set(&hwc->period_left, hwc->sample_period);
 	counter->destroy = hw_perf_counter_destroy;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3586df8..282d8cc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -371,6 +371,7 @@ struct hw_perf_counter {
 
 	u64				freq_count;
 	u64				freq_interrupts;
+	u64				freq_stamp;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5eacaaf..51c571e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 static void perf_log_throttle(struct perf_counter *counter, int enable);
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
-static void perf_adjust_freq(struct perf_counter_context *ctx)
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 period, sample_period;
+	s64 delta;
+
+	events *= hwc->sample_period;
+	period = div64_u64(events, counter->attr.sample_freq);
+
+	delta = (s64)(period - hwc->sample_period);
+	delta = (delta + 7) / 8; /* low pass filter */
+
+	sample_period = hwc->sample_period + delta;
+
+	if (!sample_period)
+		sample_period = 1;
+
+	perf_log_period(counter, sample_period);
+
+	hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	u64 interrupts, sample_period;
-	u64 events, period, freq;
-	s64 delta;
+	u64 interrupts, freq;
 
 	spin_lock(&ctx->lock);
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -1202,6 +1222,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		interrupts = hwc->interrupts;
 		hwc->interrupts = 0;
 
+		/*
+		 * unthrottle counters on the tick
+		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
 			counter->pmu->unthrottle(counter);
@@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
+		/*
+		 * if the specified freq < HZ then we need to skip ticks
+		 */
 		if (counter->attr.sample_freq < HZ) {
 			freq = counter->attr.sample_freq;
 
@@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		} else
 			freq = HZ;
 
-		events = freq * interrupts * hwc->sample_period;
-		period = div64_u64(events, counter->attr.sample_freq);
-
-		delta = (s64)(1 + period - hwc->sample_period);
-		delta >>= 1;
-
-		sample_period = hwc->sample_period + delta;
-
-		if (!sample_period)
-			sample_period = 1;
+		perf_adjust_period(counter, freq * interrupts);
 
-		perf_log_period(counter, sample_period);
-
-		hwc->sample_period = sample_period;
+		/*
+		 * In order to avoid being stalled by an (accidental) huge
+		 * sample period, force reset the sample period if we didn't
+		 * get any events in this freq period.
+		 */
+		if (!interrupts) {
+			perf_disable();
+			counter->pmu->disable(counter);
+			atomic_set(&hwc->period_left, 0);
+			counter->pmu->enable(counter);
+			perf_enable();
+		}
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	ctx = curr->perf_counter_ctxp;
 
-	perf_adjust_freq(&cpuctx->ctx);
+	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
-		perf_adjust_freq(ctx);
+		perf_ctx_adjust_freq(ctx);
 
 	perf_counter_cpu_sched_out(cpuctx);
 	if (ctx)
@@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 		counter->attr.sample_freq = value;
 	} else {
+		perf_log_period(counter, value);
+
 		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
-
-		perf_log_period(counter, value);
 	}
 unlock:
 	spin_unlock_irq(&ctx->lock);
@@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
  * event flow.
  */
 
+struct freq_event {
+	struct perf_event_header	header;
+	u64				time;
+	u64				id;
+	u64				period;
+};
+
 static void perf_log_period(struct perf_counter *counter, u64 period)
 {
 	struct perf_output_handle handle;
+	struct freq_event event;
 	int ret;
 
-	struct {
-		struct perf_event_header	header;
-		u64				time;
-		u64				id;
-		u64				period;
-	} freq_event = {
+	if (counter->hw.sample_period == period)
+		return;
+
+	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
+		return;
+
+	event = (struct freq_event) {
 		.header = {
 			.type = PERF_EVENT_PERIOD,
 			.misc = 0,
-			.size = sizeof(freq_event),
+			.size = sizeof(event),
 		},
 		.time = sched_clock(),
 		.id = counter->id,
 		.period = period,
 	};
 
-	if (counter->hw.sample_period == period)
-		return;
-
-	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
+	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
 	if (ret)
 		return;
 
-	perf_output_put(&handle, freq_event);
+	perf_output_put(&handle, event);
 	perf_output_end(&handle);
 }
 
@@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter,
 {
 	int events = atomic_read(&counter->event_limit);
 	int throttle = counter->pmu->unthrottle != NULL;
+	struct hw_perf_counter *hwc = &counter->hw;
 	int ret = 0;
 
 	if (!throttle) {
-		counter->hw.interrupts++;
+		hwc->interrupts++;
 	} else {
-		if (counter->hw.interrupts != MAX_INTERRUPTS) {
-			counter->hw.interrupts++;
-			if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
-				counter->hw.interrupts = MAX_INTERRUPTS;
+		if (hwc->interrupts != MAX_INTERRUPTS) {
+			hwc->interrupts++;
+			if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) {
+				hwc->interrupts = MAX_INTERRUPTS;
 				perf_log_throttle(counter, 0);
 				ret = 1;
 			}
@@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter,
 		}
 	}
 
+	if (counter->attr.freq) {
+		u64 now = sched_clock();
+		s64 delta = now - hwc->freq_stamp;
+
+		hwc->freq_stamp = now;
+
+		if (delta > 0 && delta < TICK_NSEC)
+			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+	}
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * counters
@@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.sample_period = counter->attr.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	pmu = NULL;
 
 	hwc = &counter->hw;
+	hwc->sample_period = attr->sample_period;
 	if (attr->freq && attr->sample_freq)
-		hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq);
-	else
-		hwc->sample_period = attr->sample_period;
+		hwc->sample_period = 1;
+
+	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
@@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter,
 	else
 		child_counter->state = PERF_COUNTER_STATE_OFF;
 
+	if (parent_counter->attr.freq)
+		child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
 	/*
 	 * Link it up in the child's context:
 	 */
-- 
cgit v0.10.2


From 4502d77c1d8f15f20c04b92cb96c12d4e465de29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 15:03:06 +0200
Subject: perf_counter tools: Small frequency related fixes

Create the counter in a disabled state and only enable it after we
mmap() the buffer, this allows us to see the first few samples (and
observe the frequency ramp).

Furthermore, print the period in the verbose report.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index deaee42..a5698ad 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -347,6 +347,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
+	attr->disabled		= 1;
 
 	track = 0; /* only the first counter needs these */
 
@@ -402,6 +403,8 @@ try_again:
 		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
 		exit(-1);
 	}
+
+	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
 }
 
 static void open_counters(int cpu, pid_t pid)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0b18cb9..9a0e31e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -47,6 +47,7 @@ struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
 	__u32 pid, tid;
+	__u64 period;
 };
 
 struct mmap_event {
@@ -943,12 +944,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	uint64_t ip = event->ip.ip;
 	struct map *map = NULL;
 
-	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
 		event->ip.pid,
-		(void *)(long)ip);
+		(void *)(long)ip,
+		(long long)event->ip.period);
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-- 
cgit v0.10.2


From f7b7c26e01e51fe46097e11f179dc71ce7950084 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 15:55:59 +0200
Subject: perf_counter tools: Propagate signals properly

Currently report and stat catch SIGINT (and others) without altering
their exit state. This means that things like:

   while :; do perf stat ./foo ; done

Loops become hard-to-interrupt, because bash never sees perf terminate
due to interruption. Fix this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a5698ad..c10553c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -169,10 +169,21 @@ static void mmap_read(struct mmap_data *md)
 }
 
 static volatile int done = 0;
+static volatile int signr = -1;
 
 static void sig_handler(int sig)
 {
 	done = 1;
+	signr = sig;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
 }
 
 static void pid_synthesize_comm_event(pid_t pid, int full)
@@ -459,6 +470,7 @@ static int __cmd_record(int argc, const char **argv)
 	} else for (i = 0; i < nr_cpus; i++)
 		open_counters(i, target_pid);
 
+	atexit(sig_atexit);
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 8085509..6404906 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -296,8 +296,20 @@ static int do_perf_stat(int argc, const char **argv)
 	return 0;
 }
 
+static volatile int signr = -1;
+
 static void skip_signal(int signo)
 {
+	signr = signo;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
 }
 
 static const char * const stat_usage[] = {
@@ -345,6 +357,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	 * What we want is for Ctrl-C to work in the exec()-ed
 	 * task, but being ignored by perf stat itself:
 	 */
+	atexit(sig_atexit);
 	signal(SIGINT,  skip_signal);
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
-- 
cgit v0.10.2


From 58a09b38cfcd700b796ea07ae3d2e0efbb28b561 Mon Sep 17 00:00:00 2001
From: Shane Huang <shane.huang@amd.com>
Date: Wed, 27 May 2009 15:04:43 +0800
Subject: [libata] ahci: Restore SB600 SATA controller 64 bit DMA

Community reported one SB600 SATA issue(BZ #9412), which led to 64 bit
DMA disablement for all SB600 revisions by driver maintainers with
commits c7a42156d99bcea7f8173ba7a6034bbaa2ecb77c and
4cde32fc4b32e96a99063af3183acdfd54c563f0.

But the root cause is ASUS M2A-VM system BIOS bug in old revisions
like 0901, while forcing into 32bit DMA happens to work as workaround.
Now it's time to withdraw 4cde32fc4b32e96a99063af3183acdfd54c563f0
so as to restore the SB600 SATA 64bit DMA capability.
This patch is also adding the workaround for M2A-VM old BIOS revisions,
but users are suggested to upgrade their system BIOS to the latest one
if they meet this issue.

Signed-off-by: Shane Huang <shane.huang@amd.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 2141a31..15a2303 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -431,8 +431,7 @@ static const struct ata_port_info ahci_port_info[] = {
 	[board_ahci_sb600] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL |
-				 AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI |
-				 AHCI_HFLAG_SECT255),
+				 AHCI_HFLAG_NO_MSI | AHCI_HFLAG_SECT255),
 		.flags		= AHCI_FLAG_COMMON,
 		.pio_mask	= ATA_PIO4,
 		.udma_mask	= ATA_UDMA6,
@@ -2585,6 +2584,51 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
 	}
 }
 
+/*
+ * SB600 ahci controller on ASUS M2A-VM can't do 64bit DMA with older
+ * BIOS.  The oldest version known to be broken is 0901 and working is
+ * 1501 which was released on 2007-10-26.  Force 32bit DMA on anything
+ * older than 1501.  Please read bko#9412 for more info.
+ */
+static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
+{
+	static const struct dmi_system_id sysids[] = {
+		{
+			.ident = "ASUS M2A-VM",
+			.matches = {
+				DMI_MATCH(DMI_BOARD_VENDOR,
+					  "ASUSTeK Computer INC."),
+				DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"),
+			},
+		},
+		{ }
+	};
+	const char *cutoff_mmdd = "10/26";
+	const char *date;
+	int year;
+
+	if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) ||
+	    !dmi_check_system(sysids))
+		return false;
+
+	/*
+	 * Argh.... both version and date are free form strings.
+	 * Let's hope they're using the same date format across
+	 * different versions.
+	 */
+	date = dmi_get_system_info(DMI_BIOS_DATE);
+	year = dmi_get_year(DMI_BIOS_DATE);
+	if (date && strlen(date) >= 10 && date[2] == '/' && date[5] == '/' &&
+	    (year > 2007 ||
+	     (year == 2007 && strncmp(date, cutoff_mmdd, 5) >= 0)))
+		return false;
+
+	dev_printk(KERN_WARNING, &pdev->dev, "ASUS M2A-VM: BIOS too old, "
+		   "forcing 32bit DMA, update BIOS\n");
+
+	return true;
+}
+
 static bool ahci_broken_system_poweroff(struct pci_dev *pdev)
 {
 	static const struct dmi_system_id broken_systems[] = {
@@ -2745,6 +2789,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (board_id == board_ahci_sb700 && pdev->revision >= 0x40)
 		hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL;
 
+	/* apply ASUS M2A_VM quirk */
+	if (ahci_asus_m2a_vm_32bit_only(pdev))
+		hpriv->flags |= AHCI_HFLAG_32BIT_ONLY;
+
 	if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
 		pci_enable_msi(pdev);
 
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 5f1b540..24c84ae 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -596,6 +596,7 @@ int dmi_get_year(int field)
 
 	return year;
 }
+EXPORT_SYMBOL(dmi_get_year);
 
 /**
  *	dmi_walk - Walk the DMI table and get called back for every record
-- 
cgit v0.10.2


From 7f4774b38ee6270bbc6c3015cb3fa6c415ffb340 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 10 Jun 2009 16:29:07 +0900
Subject: sata_nv: use hardreset only for post-boot probing

When I thought it was finally defeated, it came back with vengeance.
The failure cases are ever more convoluted.  Now there is a single
combination which fails boot probing - MCP5x + Intel SSD and there are
two hotplug failure reports on different flavors where softreset fails
to bring up the device.

Through the many bug reports after the switch to hardreset, the
following patterns emerged.

- Softreset during boot always works.

- Hardreset during boot sometimes fails to bring up the link on
  certain comibnations and device signature acquisition is unreliable.

- Hardreset is often necessary after hotplug.

It looks like the old behavior of preferring softreset was somehow
pretty close to the working reset protocol although it could have lost
a device during phy error handling by issuing hardreset.

This patch implements nv_hardreset() which kicks in only for post-boot
(!LOADING) device probing resets.  This should be able to work around
all known problem cases.  This isn't perfect but given the various
hardreset quirks on these controllers, I think this is as good as it
can get.

Tested on mcp5x (swncq), nf3 and ck804 for all both boot, warm and
hot probing cases.

Kudos to all the bug reporters and their painful hours with these damn
controllers.  ;-)

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Robert Hancock <hancockr@shaw.ca>
Reported-by: David Lang <david@lang.hm>
Reported-by: Samo Vodopivec <lament.email.si@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 6cda12b..b2d11f3 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -305,8 +305,8 @@ static irqreturn_t nv_ck804_interrupt(int irq, void *dev_instance);
 static int nv_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
 static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val);
 
-static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class,
-				   unsigned long deadline);
+static int nv_hardreset(struct ata_link *link, unsigned int *class,
+			unsigned long deadline);
 static void nv_nf2_freeze(struct ata_port *ap);
 static void nv_nf2_thaw(struct ata_port *ap);
 static void nv_ck804_freeze(struct ata_port *ap);
@@ -406,49 +406,82 @@ static struct scsi_host_template nv_swncq_sht = {
 	.slave_configure	= nv_swncq_slave_config,
 };
 
-static struct ata_port_operations nv_common_ops = {
+/*
+ * NV SATA controllers have various different problems with hardreset
+ * protocol depending on the specific controller and device.
+ *
+ * GENERIC:
+ *
+ *  bko11195 reports that link doesn't come online after hardreset on
+ *  generic nv's and there have been several other similar reports on
+ *  linux-ide.
+ *
+ *  bko12351#c23 reports that warmplug on MCP61 doesn't work with
+ *  softreset.
+ *
+ * NF2/3:
+ *
+ *  bko3352 reports nf2/3 controllers can't determine device signature
+ *  reliably after hardreset.  The following thread reports detection
+ *  failure on cold boot with the standard debouncing timing.
+ *
+ *  http://thread.gmane.org/gmane.linux.ide/34098
+ *
+ *  bko12176 reports that hardreset fails to bring up the link during
+ *  boot on nf2.
+ *
+ * CK804:
+ *
+ *  For initial probing after boot and hot plugging, hardreset mostly
+ *  works fine on CK804 but curiously, reprobing on the initial port
+ *  by rescanning or rmmod/insmod fails to acquire the initial D2H Reg
+ *  FIS in somewhat undeterministic way.
+ *
+ * SWNCQ:
+ *
+ *  bko12351 reports that when SWNCQ is enabled, for hotplug to work,
+ *  hardreset should be used and hardreset can't report proper
+ *  signature, which suggests that mcp5x is closer to nf2 as long as
+ *  reset quirkiness is concerned.
+ *
+ *  bko12703 reports that boot probing fails for intel SSD with
+ *  hardreset.  Link fails to come online.  Softreset works fine.
+ *
+ * The failures are varied but the following patterns seem true for
+ * all flavors.
+ *
+ * - Softreset during boot always works.
+ *
+ * - Hardreset during boot sometimes fails to bring up the link on
+ *   certain comibnations and device signature acquisition is
+ *   unreliable.
+ *
+ * - Hardreset is often necessary after hotplug.
+ *
+ * So, preferring softreset for boot probing and error handling (as
+ * hardreset might bring down the link) but using hardreset for
+ * post-boot probing should work around the above issues in most
+ * cases.  Define nv_hardreset() which only kicks in for post-boot
+ * probing and use it for all variants.
+ */
+static struct ata_port_operations nv_generic_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 	.lost_interrupt		= ATA_OP_NULL,
 	.scr_read		= nv_scr_read,
 	.scr_write		= nv_scr_write,
+	.hardreset		= nv_hardreset,
 };
 
-/* OSDL bz11195 reports that link doesn't come online after hardreset
- * on generic nv's and there have been several other similar reports
- * on linux-ide.  Disable hardreset for generic nv's.
- */
-static struct ata_port_operations nv_generic_ops = {
-	.inherits		= &nv_common_ops,
-	.hardreset		= ATA_OP_NULL,
-};
-
-/* nf2 is ripe with hardreset related problems.
- *
- * kernel bz#3352 reports nf2/3 controllers can't determine device
- * signature reliably.  The following thread reports detection failure
- * on cold boot with the standard debouncing timing.
- *
- * http://thread.gmane.org/gmane.linux.ide/34098
- *
- * And bz#12176 reports that hardreset simply doesn't work on nf2.
- * Give up on it and just don't do hardreset.
- */
 static struct ata_port_operations nv_nf2_ops = {
 	.inherits		= &nv_generic_ops,
 	.freeze			= nv_nf2_freeze,
 	.thaw			= nv_nf2_thaw,
 };
 
-/* For initial probing after boot and hot plugging, hardreset mostly
- * works fine on CK804 but curiously, reprobing on the initial port by
- * rescanning or rmmod/insmod fails to acquire the initial D2H Reg FIS
- * in somewhat undeterministic way.  Use noclassify hardreset.
- */
 static struct ata_port_operations nv_ck804_ops = {
-	.inherits		= &nv_common_ops,
+	.inherits		= &nv_generic_ops,
 	.freeze			= nv_ck804_freeze,
 	.thaw			= nv_ck804_thaw,
-	.hardreset		= nv_noclassify_hardreset,
 	.host_stop		= nv_ck804_host_stop,
 };
 
@@ -476,19 +509,8 @@ static struct ata_port_operations nv_adma_ops = {
 	.host_stop		= nv_adma_host_stop,
 };
 
-/* Kernel bz#12351 reports that when SWNCQ is enabled, for hotplug to
- * work, hardreset should be used and hardreset can't report proper
- * signature, which suggests that mcp5x is closer to nf2 as long as
- * reset quirkiness is concerned.  Define separate ops for mcp5x with
- * nv_noclassify_hardreset().
- */
-static struct ata_port_operations nv_mcp5x_ops = {
-	.inherits		= &nv_common_ops,
-	.hardreset		= nv_noclassify_hardreset,
-};
-
 static struct ata_port_operations nv_swncq_ops = {
-	.inherits		= &nv_mcp5x_ops,
+	.inherits		= &nv_generic_ops,
 
 	.qc_defer		= ata_std_qc_defer,
 	.qc_prep		= nv_swncq_qc_prep,
@@ -557,7 +579,7 @@ static const struct ata_port_info nv_port_info[] = {
 		.pio_mask	= NV_PIO_MASK,
 		.mwdma_mask	= NV_MWDMA_MASK,
 		.udma_mask	= NV_UDMA_MASK,
-		.port_ops	= &nv_mcp5x_ops,
+		.port_ops	= &nv_generic_ops,
 		.private_data	= NV_PI_PRIV(nv_generic_interrupt, &nv_sht),
 	},
 	/* SWNCQ */
@@ -1559,15 +1581,24 @@ static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val)
 	return 0;
 }
 
-static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class,
-				   unsigned long deadline)
+static int nv_hardreset(struct ata_link *link, unsigned int *class,
+			unsigned long deadline)
 {
-	bool online;
-	int rc;
+	struct ata_eh_context *ehc = &link->eh_context;
 
-	rc = sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
-				 &online, NULL);
-	return online ? -EAGAIN : rc;
+	/* Do hardreset iff it's post-boot probing, please read the
+	 * comment above port ops for details.
+	 */
+	if (!(link->ap->pflags & ATA_PFLAG_LOADING) &&
+	    !ata_dev_enabled(link->device))
+		sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
+				    NULL, NULL);
+	else if (!(ehc->i.flags & ATA_EHI_QUIET))
+		ata_link_printk(link, KERN_INFO,
+				"nv: skipping hardreset on occupied port\n");
+
+	/* device signature acquisition is unreliable */
+	return -EAGAIN;
 }
 
 static void nv_nf2_freeze(struct ata_port *ap)
-- 
cgit v0.10.2


From 517d3cc15b36392e518abab6bacbb72089658313 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 13 May 2009 15:02:42 +0100
Subject: [libata] ata_piix: Enable parallel scan

This patch turns on parallel scanning for the ata_piix driver.
This driver is used on most netbooks (no AHCI for cheap storage it seems).
The scan is the dominating time factor in the kernel boot for these
devices; with this flag it gets cut in half for the device I used
for testing (eeepc).
Alan took a look at the driver source and concluded that it ought to be safe
to do for this driver.  Alan has also checked with the hardware team.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 6e165bf..d0a14cf 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1589,6 +1589,7 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
 		host->ports[1]->mwdma_mask = 0;
 		host->ports[1]->udma_mask = 0;
 	}
+	host->flags |= ATA_HOST_PARALLEL_SCAN;
 
 	pci_set_master(pdev);
 	return ata_pci_sff_activate_host(host, ata_sff_interrupt, &piix_sht);
-- 
cgit v0.10.2


From 5c939df56c3ea018b58e5aa76181284c2053d699 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 27 May 2009 09:16:03 -0400
Subject: btrfs: Fix set/clear_extent_bit for 'end == (u64)-1'

There are some 'start = state->end + 1;' like code in set_extent_bit
and clear_extent_bit. They overflow when end == (u64)-1.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb99..6826018 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
+	u64 last_end;
 	int err;
 	int set = 0;
 
@@ -498,6 +499,7 @@ again:
 	if (state->start > end)
 		goto out;
 	WARN_ON(state->end < start);
+	last_end = state->end;
 
 	/*
 	 *     | ---- desired range ---- |
@@ -524,9 +526,11 @@ again:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			start = state->end + 1;
 			set |= clear_state_bit(tree, state, bits,
 					wake, delete);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
 		} else {
 			start = state->start;
 		}
@@ -552,8 +556,10 @@ again:
 		goto out;
 	}
 
-	start = state->end + 1;
 	set |= clear_state_bit(tree, state, bits, wake, delete);
+	if (last_end == (u64)-1)
+		goto out;
+	start = last_end + 1;
 	goto search_again;
 
 out:
@@ -707,8 +713,10 @@ again:
 			goto out;
 		}
 		set_state_bits(tree, state, bits);
-		start = state->end + 1;
 		merge_state(tree, state);
+		if (last_end == (u64)-1)
+			goto out;
+		start = last_end + 1;
 		goto search_again;
 	}
 
@@ -742,8 +750,10 @@ again:
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, bits);
-			start = state->end + 1;
 			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
 		} else {
 			start = state->start;
 		}
-- 
cgit v0.10.2


From 5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 10 Jun 2009 10:45:14 -0400
Subject: Btrfs: Mixed back reference  (FORWARD ROLLING FORMAT CHANGE)

This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.

When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one.  At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.

The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root.  This commit reduces the
transaction overhead by avoiding the need for dead root records.

When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.

This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.

We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.

This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.

This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.

This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.

The improved balancing code scales significantly better with a large
number of snapshots.

This is a very large commit and was written in a number of
pieces.  But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9421284..a35eb36 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o delayed-ref.o
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f..ecf5f7d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -72,6 +72,9 @@ struct btrfs_inode {
 	 */
 	struct list_head ordered_operations;
 
+	/* node for the red-black tree that links inodes in subvolume root */
+	struct rb_node rb_node;
+
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9..2b96027 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int ret = 0;
 	int level;
-	struct btrfs_root *new_root;
-
-	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
-	if (!new_root)
-		return -ENOMEM;
-
-	memcpy(new_root, root, sizeof(*new_root));
-	new_root->root_key.objectid = new_root_objectid;
+	struct btrfs_disk_key disk_key;
 
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
 
-	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
-				     new_root_objectid, trans->transid,
-				     level, buf->start, 0);
-	if (IS_ERR(cow)) {
-		kfree(new_root);
+	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
+				     new_root_objectid, &disk_key, level,
+				     buf->start, 0);
+	if (IS_ERR(cow))
 		return PTR_ERR(cow);
-	}
 
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
-	btrfs_set_header_owner(cow, new_root_objectid);
-	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, new_root_objectid);
 
 	write_extent_buffer(cow, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
-	kfree(new_root);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		ret = btrfs_inc_ref(trans, root, cow, 1);
+	else
+		ret = btrfs_inc_ref(trans, root, cow, 0);
 
 	if (ret)
 		return ret;
@@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 }
 
 /*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf)
+{
+	/*
+	 * Tree blocks not in refernece counted trees and tree roots
+	 * are never shared. If a block was allocated after the last
+	 * snapshot and the block was not allocated by tree relocation,
+	 * we know the block is not shared.
+	 */
+	if (root->ref_cows &&
+	    buf != root->node && buf != root->commit_root &&
+	    (btrfs_header_generation(buf) <=
+	     btrfs_root_last_snapshot(&root->root_item) ||
+	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+		return 1;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (root->ref_cows &&
+	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+#endif
+	return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct extent_buffer *buf,
+				       struct extent_buffer *cow)
+{
+	u64 refs;
+	u64 owner;
+	u64 flags;
+	u64 new_flags = 0;
+	int ret;
+
+	/*
+	 * Backrefs update rules:
+	 *
+	 * Always use full backrefs for extent pointers in tree block
+	 * allocated by tree relocation.
+	 *
+	 * If a shared tree block is no longer referenced by its owner
+	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+	 * use full backrefs for extent pointers in tree block.
+	 *
+	 * If a tree block is been relocating
+	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+	 * use full backrefs for extent pointers in tree block.
+	 * The reason for this is some operations (such as drop tree)
+	 * are only allowed for blocks use full backrefs.
+	 */
+
+	if (btrfs_block_can_be_shared(root, buf)) {
+		ret = btrfs_lookup_extent_info(trans, root, buf->start,
+					       buf->len, &refs, &flags);
+		BUG_ON(ret);
+		BUG_ON(refs == 0);
+	} else {
+		refs = 1;
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		else
+			flags = 0;
+	}
+
+	owner = btrfs_header_owner(buf);
+	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+	if (refs > 1) {
+		if ((owner == root->root_key.objectid ||
+		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+			ret = btrfs_inc_ref(trans, root, buf, 1);
+			BUG_ON(ret);
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID) {
+				ret = btrfs_dec_ref(trans, root, buf, 0);
+				BUG_ON(ret);
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+				BUG_ON(ret);
+			}
+			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		} else {
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret);
+		}
+		if (new_flags != 0) {
+			ret = btrfs_set_disk_extent_flags(trans, root,
+							  buf->start,
+							  buf->len,
+							  new_flags, 0);
+			BUG_ON(ret);
+		}
+	} else {
+		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret);
+			ret = btrfs_dec_ref(trans, root, buf, 1);
+			BUG_ON(ret);
+		}
+		clean_tree_block(trans, root, buf);
+	}
+	return 0;
+}
+
+/*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
  * dirty and returned locked.  If you modify the block it needs to be marked
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct extent_buffer **cow_ret,
 			     u64 search_start, u64 empty_size)
 {
-	u64 parent_start;
+	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
-	u32 nritems;
-	int ret = 0;
 	int level;
 	int unlock_orig = 0;
+	u64 parent_start;
 
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
 	btrfs_assert_tree_locked(buf);
 
-	if (parent)
-		parent_start = parent->start;
-	else
-		parent_start = 0;
-
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
-	nritems = btrfs_header_nritems(buf);
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len,
-				     parent_start, root->root_key.objectid,
-				     trans->transid, level,
-				     search_start, empty_size);
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+	} else
+		parent_start = 0;
+
+	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
+				     root->root_key.objectid, &disk_key,
+				     level, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
-	btrfs_set_header_owner(cow, root->root_key.objectid);
-	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, root->root_key.objectid);
 
 	write_extent_buffer(cow, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	if (btrfs_header_generation(buf) != trans->transid) {
-		u32 nr_extents;
-		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
-		if (ret)
-			return ret;
-
-		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
-		WARN_ON(ret);
-	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
-		/*
-		 * There are only two places that can drop reference to
-		 * tree blocks owned by living reloc trees, one is here,
-		 * the other place is btrfs_drop_subtree. In both places,
-		 * we check reference count while tree block is locked.
-		 * Furthermore, if reference count is one, it won't get
-		 * increased by someone else.
-		 */
-		u32 refs;
-		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
-					      buf->len, &refs);
-		BUG_ON(ret);
-		if (refs == 1) {
-			ret = btrfs_update_ref(trans, root, buf, cow,
-					       0, nritems);
-			clean_tree_block(trans, root, buf);
-		} else {
-			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
-		}
-		BUG_ON(ret);
-	} else {
-		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
-		if (ret)
-			return ret;
-		clean_tree_block(trans, root, buf);
-	}
-
-	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
-		WARN_ON(ret);
-	}
+	update_ref_for_cow(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			parent_start = buf->start;
+		else
+			parent_start = 0;
 
 		spin_lock(&root->node_lock);
 		root->node = cow;
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root, buf->start,
-					  buf->len, buf->start,
-					  root->root_key.objectid,
-					  btrfs_header_generation(buf),
-					  level, 1);
-		}
+		btrfs_free_extent(trans, root, buf->start, buf->len,
+				  parent_start, root->root_key.objectid,
+				  level, 0);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+
+		WARN_ON(trans->transid != btrfs_header_generation(parent));
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
-		WARN_ON(trans->transid == 0);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  parent_start, btrfs_header_owner(parent),
-				  btrfs_header_generation(parent), level, 1);
+				  parent_start, root->root_key.objectid,
+				  level, 0);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct extent_buffer *buf)
+{
+	if (btrfs_header_generation(buf) == trans->transid &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+		return 0;
+	return 1;
+}
+
 /*
  * cows a single block, see __btrfs_cow_block for the real work.
  * This version of it has extra checks so that a block isn't cow'd more than
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	if (btrfs_header_generation(buf) == trans->transid &&
-	    btrfs_header_owner(buf) == root->root_key.objectid &&
-	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
 		return 0;
 	}
@@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 /*
  * same as comp_keys only with two btrfs_key's
  */
-static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
 {
 	if (k1->objectid > k2->objectid)
 		return 1;
@@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot)
+{
+	return bin_search(eb, key, level, slot);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		root->node = child;
 		spin_unlock(&root->node_lock);
 
-		ret = btrfs_update_extent_ref(trans, root, child->start,
-					      child->len,
-					      mid->start, child->start,
-					      root->root_key.objectid,
-					      trans->transid, level - 1);
-		BUG_ON(ret);
-
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
 
@@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
-					mid->start, root->root_key.objectid,
-					btrfs_header_generation(mid),
-					level, 1);
+					0, root->root_key.objectid, level, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -998,7 +1101,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
 			u64 bytenr = right->start;
-			u64 generation = btrfs_header_generation(parent);
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
@@ -1010,9 +1112,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize, parent->start,
-						 btrfs_header_owner(parent),
-						 generation, level, 1);
+						 blocksize, 0,
+						 root->root_key.objectid,
+						 level, 0);
 			if (wret)
 				ret = wret;
 		} else {
@@ -1047,7 +1149,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	}
 	if (btrfs_header_nritems(mid) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 root_gen = btrfs_header_generation(parent);
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
 
@@ -1059,9 +1160,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret)
 			ret = wret;
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					 parent->start,
-					 btrfs_header_owner(parent),
-					 root_gen, level, 1);
+					 0, root->root_key.objectid,
+					 level, 0);
 		if (wret)
 			ret = wret;
 	} else {
@@ -1437,7 +1537,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks || path->lowest_level)
+	if (path->keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1614,10 +1714,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		lowest_unlock = 2;
 
 again:
-	if (p->skip_locking)
-		b = btrfs_root_node(root);
-	else
-		b = btrfs_lock_root_node(root);
+	if (p->search_commit_root) {
+		b = root->commit_root;
+		extent_buffer_get(b);
+		if (!p->skip_locking)
+			btrfs_tree_lock(b);
+	} else {
+		if (p->skip_locking)
+			b = btrfs_root_node(root);
+		else
+			b = btrfs_lock_root_node(root);
+	}
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1638,11 +1745,9 @@ again:
 			 * then we don't want to set the path blocking,
 			 * so we test it here
 			 */
-			if (btrfs_header_generation(b) == trans->transid &&
-			    btrfs_header_owner(b) == root->root_key.objectid &&
-			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
+			if (!should_cow_block(trans, root, b))
 				goto cow_done;
-			}
+
 			btrfs_set_path_blocking(p);
 
 			wret = btrfs_cow_block(trans, root, b,
@@ -1764,138 +1869,6 @@ done:
 	return ret;
 }
 
-int btrfs_merge_path(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_key *node_keys,
-		     u64 *nodes, int lowest_level)
-{
-	struct extent_buffer *eb;
-	struct extent_buffer *parent;
-	struct btrfs_key key;
-	u64 bytenr;
-	u64 generation;
-	u32 blocksize;
-	int level;
-	int slot;
-	int key_match;
-	int ret;
-
-	eb = btrfs_lock_root_node(root);
-	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
-	BUG_ON(ret);
-
-	btrfs_set_lock_blocking(eb);
-
-	parent = eb;
-	while (1) {
-		level = btrfs_header_level(parent);
-		if (level == 0 || level <= lowest_level)
-			break;
-
-		ret = bin_search(parent, &node_keys[lowest_level], level,
-				 &slot);
-		if (ret && slot > 0)
-			slot--;
-
-		bytenr = btrfs_node_blockptr(parent, slot);
-		if (nodes[level - 1] == bytenr)
-			break;
-
-		blocksize = btrfs_level_size(root, level - 1);
-		generation = btrfs_node_ptr_generation(parent, slot);
-		btrfs_node_key_to_cpu(eb, &key, slot);
-		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
-
-		if (generation == trans->transid) {
-			eb = read_tree_block(root, bytenr, blocksize,
-					     generation);
-			btrfs_tree_lock(eb);
-			btrfs_set_lock_blocking(eb);
-		}
-
-		/*
-		 * if node keys match and node pointer hasn't been modified
-		 * in the running transaction, we can merge the path. for
-		 * blocks owened by reloc trees, the node pointer check is
-		 * skipped, this is because these blocks are fully controlled
-		 * by the space balance code, no one else can modify them.
-		 */
-		if (!nodes[level - 1] || !key_match ||
-		    (generation == trans->transid &&
-		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
-			if (level == 1 || level == lowest_level + 1) {
-				if (generation == trans->transid) {
-					btrfs_tree_unlock(eb);
-					free_extent_buffer(eb);
-				}
-				break;
-			}
-
-			if (generation != trans->transid) {
-				eb = read_tree_block(root, bytenr, blocksize,
-						generation);
-				btrfs_tree_lock(eb);
-				btrfs_set_lock_blocking(eb);
-			}
-
-			ret = btrfs_cow_block(trans, root, eb, parent, slot,
-					      &eb);
-			BUG_ON(ret);
-
-			if (root->root_key.objectid ==
-			    BTRFS_TREE_RELOC_OBJECTID) {
-				if (!nodes[level - 1]) {
-					nodes[level - 1] = eb->start;
-					memcpy(&node_keys[level - 1], &key,
-					       sizeof(node_keys[0]));
-				} else {
-					WARN_ON(1);
-				}
-			}
-
-			btrfs_tree_unlock(parent);
-			free_extent_buffer(parent);
-			parent = eb;
-			continue;
-		}
-
-		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
-		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
-		btrfs_mark_buffer_dirty(parent);
-
-		ret = btrfs_inc_extent_ref(trans, root,
-					nodes[level - 1],
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					level - 1);
-		BUG_ON(ret);
-
-		/*
-		 * If the block was created in the running transaction,
-		 * it's possible this is the last reference to it, so we
-		 * should drop the subtree.
-		 */
-		if (generation == trans->transid) {
-			ret = btrfs_drop_subtree(trans, root, eb, parent);
-			BUG_ON(ret);
-			btrfs_tree_unlock(eb);
-			free_extent_buffer(eb);
-		} else {
-			ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					level - 1, 1);
-			BUG_ON(ret);
-		}
-		break;
-	}
-	btrfs_tree_unlock(parent);
-	free_extent_buffer(parent);
-	return 0;
-}
-
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
@@ -2021,9 +1994,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
 
-	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
-	BUG_ON(ret);
-
 	return ret;
 }
 
@@ -2083,9 +2053,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
 
-	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
-	BUG_ON(ret);
-
 	return ret;
 }
 
@@ -2105,7 +2072,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	struct extent_buffer *c;
 	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
-	int ret;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
@@ -2117,16 +2083,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 		btrfs_node_key(lower, &lower_key, 0);
 
 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-				   root->root_key.objectid, trans->transid,
+				   root->root_key.objectid, &lower_key,
 				   level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
-	memset_extent_buffer(c, 0, 0, root->nodesize);
+	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
 	btrfs_set_header_bytenr(c, c->start);
 	btrfs_set_header_generation(c, trans->transid);
+	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
 	write_extent_buffer(c, root->fs_info->fsid,
@@ -2151,12 +2118,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	root->node = c;
 	spin_unlock(&root->node_lock);
 
-	ret = btrfs_update_extent_ref(trans, root, lower->start,
-				      lower->len, lower->start, c->start,
-				      root->root_key.objectid,
-				      trans->transid, level - 1);
-	BUG_ON(ret);
-
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(old);
 
@@ -2244,20 +2205,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	}
 
 	c_nritems = btrfs_header_nritems(c);
+	mid = (c_nritems + 1) / 2;
+	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_free_block(trans, root, root->nodesize,
-					path->nodes[level + 1]->start,
+	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
 					root->root_key.objectid,
-					trans->transid, level, c->start, 0);
+					&disk_key, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
-	btrfs_set_header_flags(split, btrfs_header_flags(c));
+	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
 	btrfs_set_header_generation(split, trans->transid);
+	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
-	btrfs_set_header_flags(split, 0);
 	write_extent_buffer(split, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(split),
 			    BTRFS_FSID_SIZE);
@@ -2265,7 +2227,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
 
-	mid = (c_nritems + 1) / 2;
 
 	copy_extent_buffer(split, c,
 			   btrfs_node_key_ptr_offset(0),
@@ -2278,16 +2239,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 	btrfs_mark_buffer_dirty(split);
 
-	btrfs_node_key(split, &disk_key, 0);
 	wret = insert_ptr(trans, root, path, &disk_key, split->start,
 			  path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
 		ret = wret;
 
-	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
-	BUG_ON(ret);
-
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		btrfs_tree_unlock(c);
@@ -2360,7 +2317,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	u32 right_nritems;
 	u32 data_end;
 	u32 this_item_size;
-	int ret;
 
 	if (empty)
 		nr = 0;
@@ -2473,9 +2429,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 		btrfs_mark_buffer_dirty(left);
 	btrfs_mark_buffer_dirty(right);
 
-	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
-	BUG_ON(ret);
-
 	btrfs_item_key(right, &disk_key, 0);
 	btrfs_set_node_key(upper, &disk_key, slot + 1);
 	btrfs_mark_buffer_dirty(upper);
@@ -2720,10 +2673,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 
-	ret = btrfs_update_ref(trans, root, right, left,
-			       old_left_nritems, push_items);
-	BUG_ON(ret);
-
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	if (wret)
@@ -2880,9 +2829,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(l);
 	BUG_ON(path->slots[0] != slot);
 
-	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
-	BUG_ON(ret);
-
 	if (mid <= slot) {
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
@@ -2911,6 +2857,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path, int data_size,
 			       int extend)
 {
+	struct btrfs_disk_key disk_key;
 	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
@@ -2918,7 +2865,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	struct extent_buffer *right;
 	int ret = 0;
 	int wret;
-	int double_split;
+	int split;
 	int num_doubles = 0;
 
 	/* first try to make some room by pushing left and right */
@@ -2945,16 +2892,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			return ret;
 	}
 again:
-	double_split = 0;
+	split = 1;
 	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1) / 2;
 
-	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-					path->nodes[1]->start,
+	if (mid <= slot) {
+		if (nritems == 1 ||
+		    leaf_space_used(l, mid, nritems - mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (slot >= nritems) {
+				split = 0;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					split = 2;
+				}
+			}
+		}
+	} else {
+		if (leaf_space_used(l, 0, mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (!extend && data_size && slot == 0) {
+				split = 0;
+			} else if ((extend || !data_size) && slot == 0) {
+				mid = 1;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					split = 2 ;
+				}
+			}
+		}
+	}
+
+	if (split == 0)
+		btrfs_cpu_key_to_disk(&disk_key, ins_key);
+	else
+		btrfs_item_key(l, &disk_key, mid);
+
+	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
-					trans->transid, 0, l->start, 0);
+					&disk_key, 0, l->start, 0);
 	if (IS_ERR(right)) {
 		BUG_ON(1);
 		return PTR_ERR(right);
@@ -2963,6 +2947,7 @@ again:
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
 	btrfs_set_header_generation(right, trans->transid);
+	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
@@ -2973,79 +2958,47 @@ again:
 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
-	if (mid <= slot) {
-		if (nritems == 1 ||
-		    leaf_space_used(l, mid, nritems - mid) + data_size >
-			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (slot >= nritems) {
-				struct btrfs_disk_key disk_key;
-
-				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(right, 0);
-				wret = insert_ptr(trans, root, path,
-						  &disk_key, right->start,
-						  path->slots[1] + 1, 1);
-				if (wret)
-					ret = wret;
+	if (split == 0) {
+		if (mid <= slot) {
+			btrfs_set_header_nritems(right, 0);
+			wret = insert_ptr(trans, root, path,
+					  &disk_key, right->start,
+					  path->slots[1] + 1, 1);
+			if (wret)
+				ret = wret;
 
-				btrfs_tree_unlock(path->nodes[0]);
-				free_extent_buffer(path->nodes[0]);
-				path->nodes[0] = right;
-				path->slots[0] = 0;
-				path->slots[1] += 1;
-				btrfs_mark_buffer_dirty(right);
-				return ret;
-			}
-			mid = slot;
-			if (mid != nritems &&
-			    leaf_space_used(l, mid, nritems - mid) +
-			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
-				double_split = 1;
-			}
-		}
-	} else {
-		if (leaf_space_used(l, 0, mid) + data_size >
-			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (!extend && data_size && slot == 0) {
-				struct btrfs_disk_key disk_key;
-
-				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(right, 0);
-				wret = insert_ptr(trans, root, path,
-						  &disk_key,
-						  right->start,
-						  path->slots[1], 1);
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			path->slots[1] += 1;
+		} else {
+			btrfs_set_header_nritems(right, 0);
+			wret = insert_ptr(trans, root, path,
+					  &disk_key,
+					  right->start,
+					  path->slots[1], 1);
+			if (wret)
+				ret = wret;
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			if (path->slots[1] == 0) {
+				wret = fixup_low_keys(trans, root,
+						path, &disk_key, 1);
 				if (wret)
 					ret = wret;
-				btrfs_tree_unlock(path->nodes[0]);
-				free_extent_buffer(path->nodes[0]);
-				path->nodes[0] = right;
-				path->slots[0] = 0;
-				if (path->slots[1] == 0) {
-					wret = fixup_low_keys(trans, root,
-						      path, &disk_key, 1);
-					if (wret)
-						ret = wret;
-				}
-				btrfs_mark_buffer_dirty(right);
-				return ret;
-			} else if ((extend || !data_size) && slot == 0) {
-				mid = 1;
-			} else {
-				mid = slot;
-				if (mid != nritems &&
-				    leaf_space_used(l, mid, nritems - mid) +
-				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
-					double_split = 1;
-				}
 			}
 		}
+		btrfs_mark_buffer_dirty(right);
+		return ret;
 	}
 
 	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
 	BUG_ON(ret);
 
-	if (double_split) {
+	if (split == 2) {
 		BUG_ON(num_doubles != 0);
 		num_doubles++;
 		goto again;
@@ -3447,7 +3400,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 		/* figure out how many keys we can insert in here */
 		total_data = data_size[0];
 		for (i = 1; i < nr; i++) {
-			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
 				break;
 			total_data += data_size[i];
 		}
@@ -3745,9 +3698,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 /*
  * a helper function to delete the leaf pointed to by path->slots[1] and
- * path->nodes[1].  bytenr is the node block pointer, but since the callers
- * already know it, it is faster to have them pass it down than to
- * read it out of the node again.
+ * path->nodes[1].
  *
  * This deletes the pointer in path->nodes[1] and frees the leaf
  * block extent.  zero is returned if it all worked out, < 0 otherwise.
@@ -3755,15 +3706,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * The path must have already been setup for deleting the leaf, including
  * all the proper balancing.  path->nodes[1] must be locked.
  */
-noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path, u64 bytenr)
+static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *leaf)
 {
 	int ret;
-	u64 root_gen = btrfs_header_generation(path->nodes[1]);
-	u64 parent_start = path->nodes[1]->start;
-	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
 
+	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
 	if (ret)
 		return ret;
@@ -3774,10 +3724,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_extent(trans, root, bytenr,
-				btrfs_level_size(root, 0),
-				parent_start, parent_owner,
-				root_gen, 0, 1);
+	ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
+				0, root->root_key.objectid, 0, 0);
 	return ret;
 }
 /*
@@ -3845,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
-			ret = btrfs_del_leaf(trans, root, path, leaf->start);
+			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
 	} else {
@@ -3884,8 +3832,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path,
-						     leaf->start);
+				ret = btrfs_del_leaf(trans, root, path, leaf);
 				BUG_ON(ret);
 				free_extent_buffer(leaf);
 			} else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4414a5d..ce3ab4e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+#define BTRFS_COMPAT_EXTENT_TREE_V0
+
 /*
  * files bigger than this get some pre-flushing when they are added
  * to the ordered operations list.  That way we limit the total
@@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }
 
 #define BTRFS_FSID_SIZE 16
-#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
+#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+
+#define BTRFS_BACKREF_REV_MAX		256
+#define BTRFS_BACKREF_REV_SHIFT		56
+#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+					 BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV		0
+#define BTRFS_MIXED_BACKREF_REV		1
 
 /*
  * every tree block (leaf or node) starts with this header.
@@ -296,7 +309,6 @@ struct btrfs_header {
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
-#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
 
 /*
  * this is a very generous portion of the super block, giving us
@@ -355,9 +367,12 @@ struct btrfs_super_block {
  * Compat flags that we support.  If any incompat flags are set other than the
  * ones specified below then we will fail to mount
  */
-#define BTRFS_FEATURE_COMPAT_SUPP	0x0
-#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
-#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+
+#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
+#define BTRFS_FEATURE_INCOMPAT_SUPP		\
+	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -421,23 +436,65 @@ struct btrfs_path {
 	unsigned int keep_locks:1;
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
+	unsigned int search_commit_root:1;
 };
 
 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
+
 struct btrfs_extent_item {
+	__le64 refs;
+	__le64 generation;
+	__le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
 	__le32 refs;
 } __attribute__ ((__packed__));
 
-struct btrfs_extent_ref {
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+					sizeof(struct btrfs_item))
+
+#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
+
+struct btrfs_tree_block_info {
+	struct btrfs_disk_key key;
+	u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+	__le64 root;
+	__le64 objectid;
+	__le64 offset;
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+	u8 type;
+	u64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le32 num_refs;
+	__le32 count;
 } __attribute__ ((__packed__));
 
+
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
  * the extent.  The chunk tree uuid field is a way to double check the owner
@@ -695,12 +752,7 @@ struct btrfs_block_group_cache {
 	struct list_head cluster_list;
 };
 
-struct btrfs_leaf_ref_tree {
-	struct rb_root root;
-	struct list_head list;
-	spinlock_t lock;
-};
-
+struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
@@ -831,18 +883,11 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-	/* tree relocation relocated fields */
-	struct list_head dead_reloc_roots;
-	struct btrfs_leaf_ref_tree reloc_ref_tree;
-	struct btrfs_leaf_ref_tree shared_ref_tree;
-
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	atomic_t throttles;
-	atomic_t throttle_gen;
 
 	u64 total_pinned;
 
@@ -861,6 +906,8 @@ struct btrfs_fs_info {
 	 */
 	struct list_head space_info;
 
+	struct reloc_control *reloc_ctl;
+
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -891,7 +938,6 @@ struct btrfs_fs_info {
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
-struct btrfs_dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -899,9 +945,6 @@ struct btrfs_root {
 	spinlock_t node_lock;
 
 	struct extent_buffer *commit_root;
-	struct btrfs_leaf_ref_tree *ref_tree;
-	struct btrfs_leaf_ref_tree ref_tree_struct;
-	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
 
@@ -952,10 +995,15 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
 
+	struct list_head root_list;
+
 	spinlock_t list_lock;
-	struct list_head dead_list;
 	struct list_head orphan_list;
 
+	spinlock_t inode_lock;
+	/* red-black tree that keeps track of in-memory inodes */
+	struct rb_root inode_tree;
+
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
@@ -1017,7 +1065,16 @@ struct btrfs_root {
  * are used, and how many references there are to each block
  */
 #define BTRFS_EXTENT_ITEM_KEY	168
-#define BTRFS_EXTENT_REF_KEY	180
+
+#define BTRFS_TREE_BLOCK_REF_KEY	176
+
+#define BTRFS_EXTENT_DATA_REF_KEY	178
+
+#define BTRFS_EXTENT_REF_V0_KEY		180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY	182
+
+#define BTRFS_SHARED_DATA_REF_KEY	184
 
 /*
  * block groups give us hints into the extent allocation trees.  Which
@@ -1317,24 +1374,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 	return (u8 *)((unsigned long)dev + ptr);
 }
 
-/* struct btrfs_extent_ref */
-BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
 
-BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
-			 generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
-			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
-			 num_refs, 32);
+BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
+
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+					struct btrfs_tree_block_info *item,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+					    struct btrfs_tree_block_info *item,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
 
-/* struct btrfs_extent_item */
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
-			 refs, 32);
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+		   root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+		   objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+		   offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+		   type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+		   offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    type == BTRFS_SHARED_BLOCK_REF_KEY)
+		return sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_SHARED_DATA_REF_KEY)
+		return sizeof(struct btrfs_shared_data_ref) +
+		       sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY)
+		return sizeof(struct btrfs_extent_data_ref) +
+		       offsetof(struct btrfs_extent_inline_ref, offset);
+	BUG();
+	return 0;
+}
+
+BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
 
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
@@ -1558,6 +1658,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 	return (flags & flag) == flag;
 }
 
+static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+{
+	u64 flags = btrfs_header_flags(eb);
+	return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+						int rev)
+{
+	u64 flags = btrfs_header_flags(eb);
+	flags &= ~BTRFS_BACKREF_REV_MASK;
+	flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+	btrfs_set_header_flags(eb, flags);
+}
+
 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1790,39 +1905,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 objectid, u64 bytenr);
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize, u64 parent,
-					     u64 root_objectid,
-					     u64 ref_generation,
-					     int level,
-					     u64 hint,
-					     u64 empty_size);
+					struct btrfs_root *root, u32 blocksize,
+					u64 parent, u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
 					    int level);
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 parent, u64 min_bytes,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data);
-int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins);
-int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1830,18 +1938,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents);
-int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct extent_buffer *buf, u32 nr_extents);
-int btrfs_update_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct extent_buffer *orig_buf,
-		     struct extent_buffer *buf, int start_slot, int nr);
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, int pin);
+		      u64 root_objectid, u64 owner, u64 offset);
+
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1849,13 +1957,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid);
-int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
-			    u64 orig_parent, u64 parent,
-			    u64 root_objectid, u64 ref_generation,
-			    u64 owner_objectid);
+			 u64 root_objectid, u64 owner, u64 offset);
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1867,16 +1970,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
-int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root);
-int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
-int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct extent_buffer *buf, u64 orig_start);
-int btrfs_add_dead_reloc_root(struct btrfs_root *root);
-int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+				struct btrfs_block_group_cache *group);
+
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1891,13 +1987,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
 /* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot);
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-int btrfs_merge_path(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_key *node_keys,
-		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
@@ -1918,6 +2013,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1944,9 +2041,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
-int btrfs_del_leaf(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
@@ -2005,8 +2099,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
-			  struct btrfs_root *latest_root);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_set_root_node(struct btrfs_root_item *item,
+			struct extent_buffer *node);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
@@ -2139,7 +2234,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2147,12 +2241,8 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    struct btrfs_root *root, int wait);
-struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *is_new);
+			 struct btrfs_root *root);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2209,4 +2299,12 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index d6c01c0..84e6781 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -29,27 +29,87 @@
  * add extents in the middle of btrfs_search_slot, and it allows
  * us to buffer up frequently modified backrefs in an rb tree instead
  * of hammering updates on the extent allocation tree.
- *
- * Right now this code is only used for reference counted trees, but
- * the long term goal is to get rid of the similar code for delayed
- * extent tree modifications.
  */
 
 /*
- * entries in the rb tree are ordered by the byte number of the extent
- * and by the byte number of the parent block.
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
+			  struct btrfs_delayed_tree_ref *ref1)
+{
+	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
  */
-static int comp_entry(struct btrfs_delayed_ref_node *ref,
-		      u64 bytenr, u64 parent)
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
+			  struct btrfs_delayed_data_ref *ref1)
 {
-	if (bytenr < ref->bytenr)
+	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+		if (ref1->objectid < ref2->objectid)
+			return -1;
+		if (ref1->objectid > ref2->objectid)
+			return 1;
+		if (ref1->offset < ref2->offset)
+			return -1;
+		if (ref1->offset > ref2->offset)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent,
+ * type of the delayed backrefs and content of delayed backrefs.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref2,
+		      struct btrfs_delayed_ref_node *ref1)
+{
+	if (ref1->bytenr < ref2->bytenr)
 		return -1;
-	if (bytenr > ref->bytenr)
+	if (ref1->bytenr > ref2->bytenr)
 		return 1;
-	if (parent < ref->parent)
+	if (ref1->is_head && ref2->is_head)
+		return 0;
+	if (ref2->is_head)
 		return -1;
-	if (parent > ref->parent)
+	if (ref1->is_head)
 		return 1;
+	if (ref1->type < ref2->type)
+		return -1;
+	if (ref1->type > ref2->type)
+		return 1;
+	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
+				      btrfs_delayed_node_to_tree_ref(ref1));
+	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
+				      btrfs_delayed_node_to_data_ref(ref1));
+	}
+	BUG();
 	return 0;
 }
 
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref,
  * inserted.
  */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-						  u64 bytenr, u64 parent,
 						  struct rb_node *node)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
+	struct btrfs_delayed_ref_node *ins;
 	int cmp;
 
+	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	while (*p) {
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);
 
-		cmp = comp_entry(entry, bytenr, parent);
+		cmp = comp_entry(entry, ins);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 			return entry;
 	}
 
-	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	rb_link_node(node, parent_node, p);
 	rb_insert_color(node, root);
 	return NULL;
 }
 
 /*
- * find an entry based on (bytenr,parent).  This returns the delayed
- * ref if it was able to find one, or NULL if nothing was in that spot
+ * find an head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot
  */
-static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-				  u64 bytenr, u64 parent,
+static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
+				  u64 bytenr,
 				  struct btrfs_delayed_ref_node **last)
 {
 	struct rb_node *n = root->rb_node;
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
 		if (last)
 			*last = entry;
 
-		cmp = comp_entry(entry, bytenr, parent);
+		if (bytenr < entry->bytenr)
+			cmp = -1;
+		else if (bytenr > entry->bytenr)
+			cmp = 1;
+		else if (!btrfs_delayed_ref_is_head(entry))
+			cmp = 1;
+		else
+			cmp = 0;
+
 		if (cmp < 0)
 			n = n->rb_left;
 		else if (cmp > 0)
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+		find_ref_head(&delayed_refs->root, start, &ref);
 		if (ref) {
 			struct btrfs_delayed_ref_node *tmp;
 
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref) {
 		prev_node = rb_prev(&ref->rb_node);
 		if (!prev_node)
@@ -250,25 +318,28 @@ out:
 }
 
 /*
- * helper function to lookup reference count
+ * helper function to lookup reference count and flags of extent.
  *
  * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree.  This way you
- * can check to see what the reference count would be if all of the
- * delayed refs are processed.
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
  */
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs)
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags)
 {
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_path *path;
-	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
 	struct btrfs_key key;
-	u32 num_refs;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -287,37 +358,60 @@ again:
 
 	if (ret == 0) {
 		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_extent_item);
-		num_refs = btrfs_extent_refs(leaf, ei);
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
 	} else {
 		num_refs = 0;
+		extent_flags = 0;
 		ret = 0;
 	}
 
 	spin_lock(&delayed_refs->lock);
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref) {
 		head = btrfs_delayed_node_to_head(ref);
-		if (mutex_trylock(&head->mutex)) {
-			num_refs += ref->ref_mod;
-			mutex_unlock(&head->mutex);
-			*refs = num_refs;
-			goto out;
-		}
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&ref->refs);
+			spin_unlock(&delayed_refs->lock);
 
-		atomic_inc(&ref->refs);
-		spin_unlock(&delayed_refs->lock);
+			btrfs_release_path(root->fs_info->extent_root, path);
 
-		btrfs_release_path(root->fs_info->extent_root, path);
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(ref);
+			goto again;
+		}
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
 
-		mutex_lock(&head->mutex);
+		num_refs += ref->ref_mod;
 		mutex_unlock(&head->mutex);
-		btrfs_put_delayed_ref(ref);
-		goto again;
-	} else {
-		*refs = num_refs;
 	}
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
 out:
 	spin_unlock(&delayed_refs->lock);
 	btrfs_free_path(path);
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 		    struct btrfs_delayed_ref_node *existing,
 		    struct btrfs_delayed_ref_node *update)
 {
-	struct btrfs_delayed_ref *existing_ref;
-	struct btrfs_delayed_ref *ref;
-
-	existing_ref = btrfs_delayed_node_to_ref(existing);
-	ref = btrfs_delayed_node_to_ref(update);
-
-	if (ref->pin)
-		existing_ref->pin = 1;
-
-	if (ref->action != existing_ref->action) {
+	if (update->action != existing->action) {
 		/*
 		 * this is effectively undoing either an add or a
 		 * drop.  We decrement the ref_mod, and if it goes
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 			delayed_refs->num_entries--;
 			if (trans->delayed_ref_updates)
 				trans->delayed_ref_updates--;
+		} else {
+			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 	} else {
-		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
-			/* if we're adding refs, make sure all the
-			 * details match up.  The extent could
-			 * have been totally freed and reallocated
-			 * by a different owner before the delayed
-			 * ref entries were removed.
-			 */
-			existing_ref->owner_objectid = ref->owner_objectid;
-			existing_ref->generation = ref->generation;
-			existing_ref->root = ref->root;
-			existing->num_bytes = update->num_bytes;
-		}
+		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		/*
 		 * the action on the existing ref matches
 		 * the action on the ref we're trying to add.
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);
+	BUG_ON(existing_ref->is_data != ref->is_data);
 
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
 	}
 
+	if (ref->extent_op) {
+		if (!existing_ref->extent_op) {
+			existing_ref->extent_op = ref->extent_op;
+		} else {
+			if (ref->extent_op->update_key) {
+				memcpy(&existing_ref->extent_op->key,
+				       &ref->extent_op->key,
+				       sizeof(ref->extent_op->key));
+				existing_ref->extent_op->update_key = 1;
+			}
+			if (ref->extent_op->update_flags) {
+				existing_ref->extent_op->flags_to_set |=
+					ref->extent_op->flags_to_set;
+				existing_ref->extent_op->update_flags = 1;
+			}
+			kfree(ref->extent_op);
+		}
+	}
 	/*
 	 * update the reference mod on the head to reflect this new operation
 	 */
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 }
 
 /*
- * helper function to actually insert a delayed ref into the rbtree.
+ * helper function to actually insert a head node into the rbtree.
  * this does all the dirty work in terms of maintaining the correct
- * overall modification count in the head node and properly dealing
- * with updating existing nodes as new modifications are queued.
+ * overall modification count.
  */
-static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  struct btrfs_delayed_ref_node *ref,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin)
+static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+					struct btrfs_delayed_ref_node *ref,
+					u64 bytenr, u64 num_bytes,
+					int action, int is_data)
 {
 	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_ref *full_ref;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
-	if (parent == (u64)-1) {
-		if (action == BTRFS_DROP_DELAYED_REF)
-			count_mod = -1;
-		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
-			count_mod = 0;
-	}
+	if (action == BTRFS_UPDATE_DELAYED_HEAD)
+		count_mod = 0;
+	else if (action == BTRFS_DROP_DELAYED_REF)
+		count_mod = -1;
 
 	/*
 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * Once we record must_insert_reserved, switch the action to
 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
 	 */
-	if (action == BTRFS_ADD_DELAYED_EXTENT) {
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		must_insert_reserved = 1;
-		action = BTRFS_ADD_DELAYED_REF;
-	} else {
+	else
 		must_insert_reserved = 0;
-	}
-
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
-	ref->parent = parent;
+	ref->num_bytes = num_bytes;
 	ref->ref_mod = count_mod;
+	ref->type  = 0;
+	ref->action  = 0;
+	ref->is_head = 1;
 	ref->in_tree = 1;
+
+	head_ref = btrfs_delayed_node_to_head(ref);
+	head_ref->must_insert_reserved = must_insert_reserved;
+	head_ref->is_data = is_data;
+
+	INIT_LIST_HEAD(&head_ref->cluster);
+	mutex_init(&head_ref->mutex);
+
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+	if (existing) {
+		update_existing_head_ref(existing, ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		delayed_refs->num_heads++;
+		delayed_refs->num_heads_ready++;
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+					 struct btrfs_delayed_ref_node *ref,
+					 u64 bytenr, u64 num_bytes, u64 parent,
+					 u64 ref_root, int level, int action)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_tree_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
 	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
 
-	if (btrfs_delayed_ref_is_head(ref)) {
-		head_ref = btrfs_delayed_node_to_head(ref);
-		head_ref->must_insert_reserved = must_insert_reserved;
-		INIT_LIST_HEAD(&head_ref->cluster);
-		mutex_init(&head_ref->mutex);
+	full_ref = btrfs_delayed_node_to_tree_ref(ref);
+	if (parent) {
+		full_ref->parent = parent;
+		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
 	} else {
-		full_ref = btrfs_delayed_node_to_ref(ref);
 		full_ref->root = ref_root;
-		full_ref->generation = ref_generation;
-		full_ref->owner_objectid = owner_objectid;
-		full_ref->pin = pin;
-		full_ref->action = action;
+		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
 	}
+	full_ref->level = level;
 
-	existing = tree_insert(&delayed_refs->root, bytenr,
-			       parent, &ref->rb_node);
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
 	if (existing) {
-		if (btrfs_delayed_ref_is_head(ref))
-			update_existing_head_ref(existing, ref);
-		else
-			update_existing_ref(trans, delayed_refs, existing, ref);
+		update_existing_ref(trans, delayed_refs, existing, ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+					 struct btrfs_delayed_ref_node *ref,
+					 u64 bytenr, u64 num_bytes, u64 parent,
+					 u64 ref_root, u64 owner, u64 offset,
+					 int action)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_data_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
+
+	full_ref = btrfs_delayed_node_to_data_ref(ref);
+	if (parent) {
+		full_ref->parent = parent;
+		ref->type = BTRFS_SHARED_DATA_REF_KEY;
+	} else {
+		full_ref->root = ref_root;
+		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+	}
+	full_ref->objectid = owner;
+	full_ref->offset = offset;
 
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+	if (existing) {
+		update_existing_ref(trans, delayed_refs, existing, ref);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
 		kfree(ref);
 	} else {
-		if (btrfs_delayed_ref_is_head(ref)) {
-			delayed_refs->num_heads++;
-			delayed_refs->num_heads_ready++;
-		}
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 }
 
 /*
- * add a delayed ref to the tree.  This does all of the accounting required
+ * add a delayed tree ref.  This does all of the accounting required
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin)
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root,  int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;
 
+	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmalloc(sizeof(*ref), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
 	/*
-	 * the parent = 0 case comes from cases where we don't actually
-	 * know the parent yet.  It will get updated later via a add/drop
-	 * pair.
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
 	 */
-	if (parent == 0)
-		parent = bytenr;
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+				   action, 0);
+	BUG_ON(ret);
+
+	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
+				   parent, ref_root, level, action);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	BUG_ON(extent_op && !extent_op->is_data);
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
 
 	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
 	if (!head_ref) {
 		kfree(ref);
 		return -ENOMEM;
 	}
+
+	head_ref->extent_op = extent_op;
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
-				      (u64)-1, 0, 0, 0, action, pin);
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+				   action, 1);
 	BUG_ON(ret);
 
-	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
-				      parent, ref_root, ref_generation,
-				      owner_objectid, action, pin);
+	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
+				   parent, ref_root, owner, offset, action);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref)
+		return -ENOMEM;
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+				   extent_op->is_data);
 	BUG_ON(ret);
+
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
  *
  * It is the same as doing a ref add and delete in two separate calls.
  */
+#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
+#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff..f6fc67d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node {
 	/* the starting bytenr of the extent */
 	u64 bytenr;
 
-	/* the parent our backref will point to */
-	u64 parent;
-
 	/* the size of the extent */
 	u64 num_bytes;
 
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node {
 	 */
 	int ref_mod;
 
+	unsigned int action:8;
+	unsigned int type:8;
 	/* is this node still in the rbtree? */
+	unsigned int is_head:1;
 	unsigned int in_tree:1;
 };
 
+struct btrfs_delayed_extent_op {
+	struct btrfs_disk_key key;
+	u64 flags_to_set;
+	unsigned int update_key:1;
+	unsigned int update_flags:1;
+	unsigned int is_data:1;
+};
+
 /*
  * the head refs are used to hold a lock on a given extent, which allows us
  * to make sure that only one process is running the delayed refs
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head {
 
 	struct list_head cluster;
 
+	struct btrfs_delayed_extent_op *extent_op;
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head {
 	 * the free has happened.
 	 */
 	unsigned int must_insert_reserved:1;
+	unsigned int is_data:1;
 };
 
-struct btrfs_delayed_ref {
+struct btrfs_delayed_tree_ref {
 	struct btrfs_delayed_ref_node node;
+	union {
+		u64 root;
+		u64 parent;
+	};
+	int level;
+};
 
-	/* the root objectid our ref will point to */
-	u64 root;
-
-	/* the generation for the backref */
-	u64 generation;
-
-	/* owner_objectid of the backref  */
-	u64 owner_objectid;
-
-	/* operation done by this entry in the rbtree */
-	u8 action;
-
-	/* if pin == 1, when the extent is freed it will be pinned until
-	 * transaction commit
-	 */
-	unsigned int pin:1;
+struct btrfs_delayed_data_ref {
+	struct btrfs_delayed_ref_node node;
+	union {
+		u64 root;
+		u64 parent;
+	};
+	u64 objectid;
+	u64 offset;
 };
 
 struct btrfs_delayed_ref_root {
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 	}
 }
 
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin);
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root, int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
  */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-	return node->parent == (u64)-1;
+	return node->is_head;
 }
 
 /*
  * helper functions to cast a node into its container
  */
-static inline struct btrfs_delayed_ref *
-btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_ref, node);
+	return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
 
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_data_ref, node);
 }
 
 static inline struct btrfs_delayed_ref_head *
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(!btrfs_delayed_ref_is_head(node));
 	return container_of(node, struct btrfs_delayed_ref_head, node);
-
 }
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b..7f5c6e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -36,7 +36,6 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
 
@@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
 	root->node = NULL;
 	root->commit_root = NULL;
-	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
+	root->inode_tree.rb_node = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
-	INIT_LIST_HEAD(&root->dead_list);
+	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
-	root->ref_tree = &root->ref_tree_struct;
-
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	root->ref_cows = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, BTRFS_TREE_LOG_OBJECTID,
-				      trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
 	}
 
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
 	root->node = leaf;
-	btrfs_set_header_nritems(root->node, 0);
-	btrfs_set_header_level(root->node, 0);
-	btrfs_set_header_bytenr(root->node, root->node->start);
-	btrfs_set_header_generation(root->node, trans->transid);
-	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
-	btrfs_set_root_generation(&log_root->root_item, trans->transid);
+	btrfs_set_root_node(&log_root->root_item, log_root->node);
 
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
@@ -1144,6 +1140,7 @@ out:
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	}
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_find_dead_roots(fs_info->tree_root,
-					    root->root_key.objectid, root);
+					    root->root_key.objectid);
 		BUG_ON(ret);
 		btrfs_orphan_cleanup(root);
 	}
@@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
-	atomic_set(&fs_info->throttles, 0);
-	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
+	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
@@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
-	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
-	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
-
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_iput;
 	}
 
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+		btrfs_set_super_incompat_flags(disk_super, features);
+	}
+
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
 	if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read the system "
 		       "array on %s\n", sb->s_id);
-		goto fail_sys_array;
+		goto fail_sb_buffer;
 	}
 
 	blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
+	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
 	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
-
+	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
-	dev_root->track_dirty = 1;
 	if (ret)
 		goto fail_extent_root;
+	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_extent_root;
+		goto fail_dev_root;
 
 	csum_root->track_dirty = 1;
 
@@ -1881,7 +1882,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_cleanup_reloc_trees(tree_root);
+		ret = btrfs_recover_relocation(tree_root);
 		BUG_ON(ret);
 	}
 
@@ -1908,14 +1909,19 @@ fail_cleaner:
 
 fail_csum_root:
 	free_extent_buffer(csum_root->node);
+	free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+	free_extent_buffer(dev_root->node);
+	free_extent_buffer(dev_root->commit_root);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
+	free_extent_buffer(extent_root->commit_root);
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
+	free_extent_buffer(tree_root->commit_root);
 fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
-fail_sys_array:
-	free_extent_buffer(dev_root->node);
+	free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2173,6 +2179,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
 	if (root->anon_super.s_dev) {
@@ -2219,10 +2226,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
+
+		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
 			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid, gang[i]);
+						    root_objectid);
 			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
@@ -2278,20 +2287,16 @@ int close_ctree(struct btrfs_root *root)
 		       (unsigned long long)fs_info->total_ref_cache_size);
 	}
 
-	if (fs_info->extent_root->node)
-		free_extent_buffer(fs_info->extent_root->node);
-
-	if (fs_info->tree_root->node)
-		free_extent_buffer(fs_info->tree_root->node);
-
-	if (root->fs_info->chunk_root->node)
-		free_extent_buffer(root->fs_info->chunk_root->node);
-
-	if (root->fs_info->dev_root->node)
-		free_extent_buffer(root->fs_info->dev_root->node);
-
-	if (root->fs_info->csum_root->node)
-		free_extent_buffer(root->fs_info->csum_root->node);
+	free_extent_buffer(fs_info->extent_root->node);
+	free_extent_buffer(fs_info->extent_root->commit_root);
+	free_extent_buffer(fs_info->tree_root->node);
+	free_extent_buffer(fs_info->tree_root->commit_root);
+	free_extent_buffer(root->fs_info->chunk_root->node);
+	free_extent_buffer(root->fs_info->chunk_root->commit_root);
+	free_extent_buffer(root->fs_info->dev_root->node);
+	free_extent_buffer(root->fs_info->dev_root->commit_root);
+	free_extent_buffer(root->fs_info->csum_root->node);
+	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 85315d2..9596b40 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root, NULL);
+	inode = btrfs_iget(sb, &key, root);
 	if (IS_ERR(inode))
 		return (void *)inode;
 
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
 }
 
 const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35af933..a42419c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -30,43 +30,33 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "free-space-cache.h"
 
-#define PENDING_EXTENT_INSERT 0
-#define PENDING_EXTENT_DELETE 1
-#define PENDING_BACKREF_UPDATE 2
-
-struct pending_extent_op {
-	int type;
-	u64 bytenr;
-	u64 num_bytes;
-	u64 parent;
-	u64 orig_parent;
-	u64 generation;
-	u64 orig_generation;
-	int level;
-	struct list_head list;
-	int del;
-};
-
-static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root, u64 parent,
-					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins,
-					 int ref_mod);
 static int update_reserved_extents(struct btrfs_root *root,
 				   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
-static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					u64 bytenr, u64 num_bytes, u64 parent,
-					u64 root_objectid, u64 ref_generation,
-					u64 owner_objectid, int pin,
-					int ref_to_drop);
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extra_op);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins);
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -453,348 +443,1078 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  *    maintenance.  This is actually the same as #2, but with a slightly
  *    different use case.
  *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COW'd through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
  * File extents can be referenced by:
  *
  * - multiple snapshots, subvolumes, or different generations in one subvol
  * - different files inside a single subvolume
  * - different offsets inside a file (bookend extents in file.c)
  *
- * The extent ref structure has fields for:
+ * The extent ref structure for the implicit back refs has fields for:
  *
  * - Objectid of the subvolume root
- * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
- * - number of references holding by parent node (alway 1 for tree blocks)
- *
- * Btree leaf may hold multiple references to a file extent. In most cases,
- * these references are from same file and the corresponding offsets inside
- * the file are close together.
- *
- * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, 1)
+ * - original offset in the file
+ * - how many bookend extents
  *
- * When a leaf is cow'd new references are added for every file extent found
- * in the leaf.  It looks similar to the create case, but trans->transid will
- * be different when the block is cow'd.
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
  *
- *     (root_key.objectid, trans->transid, inode objectid,
- *      number of references in the leaf)
+ * The extent ref structure for the full back refs has field for:
  *
- * When a file extent is removed either during snapshot deletion or
- * file truncation, we find the corresponding back reference and check
- * the following fields:
+ * - number of pointers in the tree leaf
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
- *      inode objectid)
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
  *
- * Btree extents can be referenced by:
- *
- * - Different subvolumes
- * - Different generations of the same subvolume
- *
- * When a tree block is created, back references are inserted:
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
  *
- * (root->root_key.objectid, trans->transid, level, 1)
+ *     (root_key.objectid, inode objectid, offset in file, 1)
  *
- * When a tree block is cow'd, new back references are added for all the
- * blocks it points to. If the tree block isn't in reference counted root,
- * the old back references are removed. These new back references are of
- * the form (trans->transid will have increased since creation):
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
  *
- * (root->root_key.objectid, trans->transid, level, 1)
+ *     (btrfs_header_owner(leaf), inode objectid, offset in file)
  *
- * When a backref is in deleting, the following fields are checked:
+ * Btree extents can be referenced by:
  *
- * if backref was for a tree root:
- *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
- * else
- *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
+ * - Different subvolumes
  *
- * Back Reference Key composing:
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
  *
- * The key objectid corresponds to the first byte in the extent, the key
- * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
- * byte of parent extent. If a extent is tree root, the key offset is set
- * to the key objectid.
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
  */
 
-static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, u64 parent,
-					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid, int del)
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  u64 owner, u32 extra_size)
 {
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_item_v0 *ei0;
+	struct btrfs_extent_ref_v0 *ref0;
+	struct btrfs_tree_block_info *bi;
+	struct extent_buffer *leaf;
 	struct btrfs_key key;
-	struct btrfs_extent_ref *ref;
+	struct btrfs_key found_key;
+	u32 new_size = sizeof(*item);
+	u64 refs;
+	int ret;
+
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
+
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	ei0 = btrfs_item_ptr(leaf, path->slots[0],
+			     struct btrfs_extent_item_v0);
+	refs = btrfs_extent_refs_v0(leaf, ei0);
+
+	if (owner == (u64)-1) {
+		while (1) {
+			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					return ret;
+				BUG_ON(ret > 0);
+				leaf = path->nodes[0];
+			}
+			btrfs_item_key_to_cpu(leaf, &found_key,
+					      path->slots[0]);
+			BUG_ON(key.objectid != found_key.objectid);
+			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
+				path->slots[0]++;
+				continue;
+			}
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					      struct btrfs_extent_ref_v0);
+			owner = btrfs_ref_objectid_v0(leaf, ref0);
+			break;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID)
+		new_size += sizeof(*bi);
+
+	new_size -= sizeof(*ei0);
+	ret = btrfs_search_slot(trans, root, &key, path,
+				new_size + extra_size, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(ret);
+
+	ret = btrfs_extend_item(trans, root, path, new_size);
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, item, refs);
+	/* FIXME: get real generation */
+	btrfs_set_extent_generation(leaf, item, 0);
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_set_extent_flags(leaf, item,
+				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
+				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
+		bi = (struct btrfs_tree_block_info *)(item + 1);
+		/* FIXME: get first key of the block */
+		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+		btrfs_set_tree_block_level(leaf, bi, (int)owner);
+	} else {
+		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+#endif
+
+static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+	u32 high_crc = ~(u32)0;
+	u32 low_crc = ~(u32)0;
+	__le64 lenum;
+
+	lenum = cpu_to_le64(root_objectid);
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(owner);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(offset);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+
+	return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+				     struct btrfs_extent_data_ref *ref)
+{
+	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+				    btrfs_extent_data_ref_objectid(leaf, ref),
+				    btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+				 struct btrfs_extent_data_ref *ref,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		return 0;
+	return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid,
+					   u64 owner, u64 offset)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref;
 	struct extent_buffer *leaf;
-	u64 ref_objectid;
+	u32 nritems;
 	int ret;
+	int recow;
+	int err = -ENOENT;
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = parent;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+	}
+again:
+	recow = 0;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto fail;
+	}
 
-	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
-	if (ret < 0)
-		goto out;
-	if (ret > 0) {
-		ret = -ENOENT;
-		goto out;
+	if (parent) {
+		if (!ret)
+			return 0;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			err = ret;
+			goto fail;
+		}
+		if (!ret)
+			return 0;
+#endif
+		goto fail;
 	}
 
 	leaf = path->nodes[0];
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	ref_objectid = btrfs_ref_objectid(leaf, ref);
-	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != ref_generation ||
-	    (ref_objectid != owner_objectid &&
-	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		ret = -EIO;
-		WARN_ON(1);
-		goto out;
+	nritems = btrfs_header_nritems(leaf);
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				err = ret;
+			if (ret)
+				goto fail;
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			recow = 1;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != bytenr ||
+		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
+			goto fail;
+
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+
+		if (match_extent_data_ref(leaf, ref, root_objectid,
+					  owner, offset)) {
+			if (recow) {
+				btrfs_release_path(root, path);
+				goto again;
+			}
+			err = 0;
+			break;
+		}
+		path->slots[0]++;
 	}
-	ret = 0;
-out:
-	return ret;
+fail:
+	return err;
 }
 
-static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, u64 parent,
-					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid,
-					  int refs_to_add)
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid, u64 owner,
+					   u64 offset, int refs_to_add)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref;
+	u32 size;
 	u32 num_refs;
 	int ret;
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = parent;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+		size = sizeof(struct btrfs_shared_data_ref);
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+		size = sizeof(struct btrfs_extent_data_ref);
+	}
 
-	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		btrfs_set_ref_root(leaf, ref, ref_root);
-		btrfs_set_ref_generation(leaf, ref, ref_generation);
-		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
-	} else if (ret == -EEXIST) {
-		u64 existing_owner;
-
-		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
-		leaf = path->nodes[0];
+	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+	if (ret && ret != -EEXIST)
+		goto fail;
+
+	leaf = path->nodes[0];
+	if (parent) {
+		struct btrfs_shared_data_ref *ref;
 		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		if (btrfs_ref_root(leaf, ref) != ref_root ||
-		    btrfs_ref_generation(leaf, ref) != ref_generation) {
-			ret = -EIO;
-			WARN_ON(1);
-			goto out;
+				     struct btrfs_shared_data_ref);
+		if (ret == 0) {
+			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_shared_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 		}
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		while (ret == -EEXIST) {
+			ref = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_data_ref);
+			if (match_extent_data_ref(leaf, ref, root_objectid,
+						  owner, offset))
+				break;
+			btrfs_release_path(root, path);
+			key.offset++;
+			ret = btrfs_insert_empty_item(trans, root, path, &key,
+						      size);
+			if (ret && ret != -EEXIST)
+				goto fail;
 
-		num_refs = btrfs_ref_num_refs(leaf, ref);
-		BUG_ON(num_refs == 0);
-		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
-
-		existing_owner = btrfs_ref_objectid(leaf, ref);
-		if (existing_owner != owner_objectid &&
-		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
-			btrfs_set_ref_objectid(leaf, ref,
-					BTRFS_MULTIPLE_OBJECTIDS);
+			leaf = path->nodes[0];
+		}
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+		if (ret == 0) {
+			btrfs_set_extent_data_ref_root(leaf, ref,
+						       root_objectid);
+			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_extent_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 		}
-		ret = 0;
-	} else {
-		goto out;
 	}
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-out:
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+fail:
 	btrfs_release_path(root, path);
 	return ret;
 }
 
-static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  int refs_to_drop)
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   int refs_to_drop)
 {
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref1 = NULL;
+	struct btrfs_shared_data_ref *ref2 = NULL;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref;
-	u32 num_refs;
+	u32 num_refs = 0;
 	int ret = 0;
 
 	leaf = path->nodes[0];
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	num_refs = btrfs_ref_num_refs(leaf, ref);
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+	} else {
+		BUG();
+	}
+
 	BUG_ON(num_refs < refs_to_drop);
 	num_refs -= refs_to_drop;
+
 	if (num_refs == 0) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
-		btrfs_set_ref_num_refs(leaf, ref, num_refs);
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		else {
+			struct btrfs_extent_ref_v0 *ref0;
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_extent_ref_v0);
+			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
+		}
+#endif
 		btrfs_mark_buffer_dirty(leaf);
 	}
-	btrfs_release_path(root, path);
 	return ret;
 }
 
-#ifdef BIO_RW_DISCARD
-static void btrfs_issue_discard(struct block_device *bdev,
-				u64 start, u64 len)
+static noinline u32 extent_data_ref_count(struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  struct btrfs_extent_inline_ref *iref)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
-}
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_data_ref *ref1;
+	struct btrfs_shared_data_ref *ref2;
+	u32 num_refs = 0;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (iref) {
+		if (btrfs_extent_inline_ref_type(leaf, iref) ==
+		    BTRFS_EXTENT_DATA_REF_KEY) {
+			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+		} else {
+			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+		}
+	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
 #endif
+	} else {
+		WARN_ON(1);
+	}
+	return num_refs;
+}
 
-static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
-				u64 num_bytes)
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
 {
-#ifdef BIO_RW_DISCARD
+	struct btrfs_key key;
 	int ret;
-	u64 map_length = num_bytes;
-	struct btrfs_multi_bio *multi = NULL;
-
-	/* Tell the block device(s) that the sectors can be discarded */
-	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      bytenr, &map_length, &multi, 0);
-	if (!ret) {
-		struct btrfs_bio_stripe *stripe = multi->stripes;
-		int i;
-
-		if (map_length > num_bytes)
-			map_length = num_bytes;
 
-		for (i = 0; i < multi->num_stripes; i++, stripe++) {
-			btrfs_issue_discard(stripe->dev->bdev,
-					    stripe->physical,
-					    map_length);
-		}
-		kfree(multi);
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
 	}
 
-	return ret;
-#else
-	return 0;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ret == -ENOENT && parent) {
+		btrfs_release_path(root, path);
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0)
+			ret = -ENOENT;
+	}
 #endif
+	return ret;
 }
 
-static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root, u64 bytenr,
-				     u64 num_bytes,
-				     u64 orig_parent, u64 parent,
-				     u64 orig_root, u64 ref_root,
-				     u64 orig_generation, u64 ref_generation,
-				     u64 owner_objectid)
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
 {
+	struct btrfs_key key;
 	int ret;
-	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
-				       orig_parent, parent, orig_root,
-				       ref_root, orig_generation,
-				       ref_generation, owner_objectid, pin);
-	BUG_ON(ret);
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+	btrfs_release_path(root, path);
 	return ret;
 }
 
-int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u64 orig_parent, u64 parent,
-			    u64 ref_root, u64 ref_generation,
-			    u64 owner_objectid)
+static inline int extent_ref_type(u64 parent, u64 owner)
 {
-	int ret;
-	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-		return 0;
-
-	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
-					orig_parent, parent, ref_root,
-					ref_root, ref_generation,
-					ref_generation, owner_objectid);
-	return ret;
+	int type;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (parent > 0)
+			type = BTRFS_SHARED_BLOCK_REF_KEY;
+		else
+			type = BTRFS_TREE_BLOCK_REF_KEY;
+	} else {
+		if (parent > 0)
+			type = BTRFS_SHARED_DATA_REF_KEY;
+		else
+			type = BTRFS_EXTENT_DATA_REF_KEY;
+	}
+	return type;
 }
-static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root, u64 bytenr,
-				  u64 num_bytes,
-				  u64 orig_parent, u64 parent,
-				  u64 orig_root, u64 ref_root,
-				  u64 orig_generation, u64 ref_generation,
-				  u64 owner_objectid)
-{
-	int ret;
 
-	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
-				    ref_generation, owner_objectid,
-				    BTRFS_ADD_DELAYED_REF, 0);
-	BUG_ON(ret);
-	return ret;
-}
+static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
 
-static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 bytenr,
-			  u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid,
-			  int refs_to_add)
 {
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
-	u32 refs;
+	int level;
+	BUG_ON(!path->keep_locks);
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		if (!path->nodes[level])
+			break;
+		btrfs_assert_tree_locked(path->nodes[level]);
+		if (path->slots[level] + 1 >=
+		    btrfs_header_nritems(path->nodes[level]))
+			continue;
+		if (level == 0)
+			btrfs_item_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		else
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		return 0;
+	}
+	return 1;
+}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ *	 items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int insert)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	u64 flags;
+	u64 item_size;
+	unsigned long ptr;
+	unsigned long end;
+	int extra_size;
+	int type;
+	int want;
+	int ret;
+	int err = 0;
 
-	path->reada = 1;
-	path->leave_spinning = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = num_bytes;
 
-	/* first find the extent item and update its reference count */
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
-				path, 0, 1);
+	want = extent_ref_type(parent, owner);
+	if (insert) {
+		extra_size = btrfs_extent_inline_ref_size(want);
+		if (owner >= BTRFS_FIRST_FREE_OBJECTID)
+			path->keep_locks = 1;
+	} else
+		extra_size = -1;
+	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 	if (ret < 0) {
-		btrfs_set_path_blocking(path);
+		err = ret;
+		goto out;
+	}
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		if (!insert) {
+			err = -ENOENT;
+			goto out;
+		}
+		ret = convert_extent_item_v0(trans, root, path, owner,
+					     extra_size);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID && insert &&
+	    item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	err = -ENOENT;
+	while (1) {
+		if (ptr >= end) {
+			WARN_ON(ptr > end);
+			break;
+		}
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		if (want < type)
+			break;
+		if (want > type) {
+			ptr += btrfs_extent_inline_ref_size(type);
+			continue;
+		}
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			if (match_extent_data_ref(leaf, dref, root_objectid,
+						  owner, offset)) {
+				err = 0;
+				break;
+			}
+			if (hash_extent_data_ref_item(leaf, dref) <
+			    hash_extent_data_ref(root_objectid, owner, offset))
+				break;
+		} else {
+			u64 ref_offset;
+			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+			if (parent > 0) {
+				if (parent == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < parent)
+					break;
+			} else {
+				if (root_objectid == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < root_objectid)
+					break;
+			}
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	if (err == -ENOENT && insert) {
+		if (item_size + extra_size >=
+		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+			err = -EAGAIN;
+			goto out;
+		}
+		/*
+		 * To add new inline back ref, we have to make sure
+		 * there is no corresponding back ref item.
+		 * For simplicity, we just do not add new inline back
+		 * ref if there is any kind of item for this block
+		 */
+		if (owner >= BTRFS_FIRST_FREE_OBJECTID &&
+		    find_next_key(path, &key) == 0 && key.objectid == bytenr) {
+			err = -EAGAIN;
+			goto out;
+		}
+	}
+	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+	if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) {
+		path->keep_locks = 0;
+		btrfs_unlock_up_safe(path, 1);
+	}
+	return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path,
+				struct btrfs_extent_inline_ref *iref,
+				u64 parent, u64 root_objectid,
+				u64 owner, u64 offset, int refs_to_add,
+				struct btrfs_delayed_extent_op *extent_op)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	unsigned long ptr;
+	unsigned long end;
+	unsigned long item_offset;
+	u64 refs;
+	int size;
+	int type;
+	int ret;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	item_offset = (unsigned long)iref - (unsigned long)ei;
+
+	type = extent_ref_type(parent, owner);
+	size = btrfs_extent_inline_ref_size(type);
+
+	ret = btrfs_extend_item(trans, root, path, size);
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	refs += refs_to_add;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	ptr = (unsigned long)ei + item_offset;
+	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+	if (ptr < end - size)
+		memmove_extent_buffer(leaf, ptr + size, ptr,
+				      end - size - ptr);
+
+	iref = (struct btrfs_extent_inline_ref *)ptr;
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		struct btrfs_extent_data_ref *dref;
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_shared_data_ref *sref;
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 0);
+	if (ret != -ENOENT)
 		return ret;
+
+	btrfs_release_path(root, path);
+	*ref_ret = NULL;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
+					    root_objectid);
+	} else {
+		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
+					     root_objectid, owner, offset);
 	}
+	return ret;
+}
 
-	if (ret > 0) {
-		WARN_ON(1);
-		btrfs_free_path(path);
-		return -EIO;
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+int update_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 int refs_to_mod,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_data_ref *dref = NULL;
+	struct btrfs_shared_data_ref *sref = NULL;
+	unsigned long ptr;
+	unsigned long end;
+	u32 item_size;
+	int size;
+	int type;
+	int ret;
+	u64 refs;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+	refs += refs_to_mod;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	type = btrfs_extent_inline_ref_type(leaf, iref);
+
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		refs = btrfs_extent_data_ref_count(leaf, dref);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		refs = btrfs_shared_data_ref_count(leaf, sref);
+	} else {
+		refs = 1;
+		BUG_ON(refs_to_mod != -1);
 	}
-	l = path->nodes[0];
 
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	if (key.objectid != bytenr) {
-		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
-		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
-		       (unsigned long long)bytenr,
-		       (unsigned long long)key.objectid);
-		BUG();
+	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+	refs += refs_to_mod;
+
+	if (refs > 0) {
+		if (type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, dref, refs);
+		else
+			btrfs_set_shared_data_ref_count(leaf, sref, refs);
+	} else {
+		size =  btrfs_extent_inline_ref_size(type);
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = (unsigned long)iref;
+		end = (unsigned long)ei + item_size;
+		if (ptr + size < end)
+			memmove_extent_buffer(leaf, ptr, ptr + size,
+					      end - ptr - size);
+		item_size -= size;
+		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+		BUG_ON(ret);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner,
+				 u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, &iref,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 1);
+	if (ret == 0) {
+		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+		ret = update_inline_extent_backref(trans, root, path, iref,
+						   refs_to_add, extent_op);
+	} else if (ret == -ENOENT) {
+		ret = setup_inline_extent_backref(trans, root, path, iref,
+						  parent, root_objectid,
+						  owner, offset, refs_to_add,
+						  extent_op);
 	}
-	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+	return ret;
+}
 
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+static int insert_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add)
+{
+	int ret;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		BUG_ON(refs_to_add != 1);
+		ret = insert_tree_block_ref(trans, root, path, bytenr,
+					    parent, root_objectid);
+	} else {
+		ret = insert_extent_data_ref(trans, root, path, bytenr,
+					     parent, root_objectid,
+					     owner, offset, refs_to_add);
+	}
+	return ret;
+}
 
-	refs = btrfs_extent_refs(l, item);
-	btrfs_set_extent_refs(l, item, refs + refs_to_add);
-	btrfs_unlock_up_safe(path, 1);
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 int refs_to_drop, int is_data)
+{
+	int ret;
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	BUG_ON(!is_data && refs_to_drop != 1);
+	if (iref) {
+		ret = update_inline_extent_backref(trans, root, path, iref,
+						   -refs_to_drop, NULL);
+	} else if (is_data) {
+		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+	} else {
+		ret = btrfs_del_item(trans, root, path);
+	}
+	return ret;
+}
+
+#ifdef BIO_RW_DISCARD
+static void btrfs_issue_discard(struct block_device *bdev,
+				u64 start, u64 len)
+{
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+}
+#endif
+
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+				u64 num_bytes)
+{
+#ifdef BIO_RW_DISCARD
+	int ret;
+	u64 map_length = num_bytes;
+	struct btrfs_multi_bio *multi = NULL;
+
+	/* Tell the block device(s) that the sectors can be discarded */
+	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+			      bytenr, &map_length, &multi, 0);
+	if (!ret) {
+		struct btrfs_bio_stripe *stripe = multi->stripes;
+		int i;
+
+		if (map_length > num_bytes)
+			map_length = num_bytes;
+
+		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+			btrfs_issue_discard(stripe->dev->bdev,
+					    stripe->physical,
+					    map_length);
+		}
+		kfree(multi);
+	}
+
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 owner, u64 offset)
+{
+	int ret;
+	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
+	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_ADD_DELAYED_REF, NULL);
+	} else {
+		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, owner, offset,
+					BTRFS_ADD_DELAYED_REF, NULL);
+	}
+	return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 bytenr, u64 num_bytes,
+				  u64 parent, u64 root_objectid,
+				  u64 owner, u64 offset, int refs_to_add,
+				  struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *item;
+	u64 refs;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	/* this will setup the path even if it fails to insert the back ref */
+	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
+					   path, bytenr, num_bytes, parent,
+					   root_objectid, owner, offset,
+					   refs_to_add, extent_op);
+	if (ret == 0)
+		goto out;
+
+	if (ret != -EAGAIN) {
+		err = ret;
+		goto out;
+	}
 
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, item);
+	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, item);
+
+	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
@@ -802,56 +1522,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 
 	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
-				    path, bytenr, parent,
-				    ref_root, ref_generation,
-				    owner_objectid, refs_to_add);
+				    path, bytenr, parent, root_objectid,
+				    owner, offset, refs_to_add);
 	BUG_ON(ret);
+out:
 	btrfs_free_path(path);
-	return 0;
+	return err;
 }
 
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 ref_root, u64 ref_generation,
-			 u64 owner_objectid)
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
+{
+	int ret = 0;
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
+	u64 flags = 0;
+
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+	ref = btrfs_delayed_node_to_data_ref(node);
+	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+		parent = ref->parent;
+	else
+		ref_root = ref->root;
+
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		if (extent_op) {
+			BUG_ON(extent_op->update_key);
+			flags |= extent_op->flags_to_set;
+		}
+		ret = alloc_reserved_file_extent(trans, root,
+						 parent, ref_root, flags,
+						 ref->objectid, ref->offset,
+						 &ins, node->ref_mod);
+		update_reserved_extents(root, ins.objectid, ins.offset, 0);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent,
+					     ref_root, ref->objectid,
+					     ref->offset, node->ref_mod,
+					     extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent,
+					  ref_root, ref->objectid,
+					  ref->offset, node->ref_mod,
+					  extent_op);
+	} else {
+		BUG();
+	}
+	return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei)
+{
+	u64 flags = btrfs_extent_flags(leaf, ei);
+	if (extent_op->update_flags) {
+		flags |= extent_op->flags_to_set;
+		btrfs_set_extent_flags(leaf, ei, flags);
+	}
+
+	if (extent_op->update_key) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+	}
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_delayed_ref_node *node,
+				 struct btrfs_delayed_extent_op *extent_op)
 {
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	u32 item_size;
 	int ret;
-	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-		return 0;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = node->bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = node->num_bytes;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret > 0) {
+		err = -EIO;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+					     path, (u64)-1, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	__run_delayed_extent_op(extent_op, leaf, ei);
 
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
-				     0, ref_root, 0, ref_generation,
-				     owner_objectid);
-	return ret;
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return err;
 }
 
-static int drop_delayed_ref(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_delayed_ref_node *node)
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
 {
 	int ret = 0;
-	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
 
-	BUG_ON(node->ref_mod == 0);
-	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
-				  node->parent, ref->root, ref->generation,
-				  ref->owner_objectid, ref->pin, node->ref_mod);
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
+	ref = btrfs_delayed_node_to_tree_ref(node);
+	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		parent = ref->parent;
+	else
+		ref_root = ref->root;
+
+	BUG_ON(node->ref_mod != 1);
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		BUG_ON(!extent_op || !extent_op->update_flags ||
+		       !extent_op->update_key);
+		ret = alloc_reserved_tree_block(trans, root,
+						parent, ref_root,
+						extent_op->flags_to_set,
+						&extent_op->key,
+						ref->level, &ins);
+		update_reserved_extents(root, ins.objectid, ins.offset, 0);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent, ref_root,
+					     ref->level, 0, 1, extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent, ref_root,
+					  ref->level, 0, 1, extent_op);
+	} else {
+		BUG();
+	}
 	return ret;
 }
 
+
 /* helper function to actually process a single delayed ref entry */
-static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_delayed_ref_node *node,
-					int insert_reserved)
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_delayed_ref_node *node,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int insert_reserved)
 {
 	int ret;
-	struct btrfs_delayed_ref *ref;
-
-	if (node->parent == (u64)-1) {
+	if (btrfs_delayed_ref_is_head(node)) {
 		struct btrfs_delayed_ref_head *head;
 		/*
 		 * we've hit the end of the chain and we were supposed
@@ -859,44 +1720,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		 * deleted before we ever needed to insert it, so all
 		 * we have to do is clean up the accounting
 		 */
+		BUG_ON(extent_op);
+		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
+			if (head->is_data) {
+				ret = btrfs_del_csums(trans, root,
+						      node->bytenr,
+						      node->num_bytes);
+				BUG_ON(ret);
+			}
+			btrfs_update_pinned_extents(root, node->bytenr,
+						    node->num_bytes, 1);
 			update_reserved_extents(root, node->bytenr,
 						node->num_bytes, 0);
 		}
-		head = btrfs_delayed_node_to_head(node);
 		mutex_unlock(&head->mutex);
 		return 0;
 	}
 
-	ref = btrfs_delayed_node_to_ref(node);
-	if (ref->action == BTRFS_ADD_DELAYED_REF) {
-		if (insert_reserved) {
-			struct btrfs_key ins;
-
-			ins.objectid = node->bytenr;
-			ins.offset = node->num_bytes;
-			ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-			/* record the full extent allocation */
-			ret = __btrfs_alloc_reserved_extent(trans, root,
-					node->parent, ref->root,
-					ref->generation, ref->owner_objectid,
-					&ins, node->ref_mod);
-			update_reserved_extents(root, node->bytenr,
-						node->num_bytes, 0);
-		} else {
-			/* just add one backref */
-			ret = add_extent_ref(trans, root, node->bytenr,
-				     node->num_bytes,
-				     node->parent, ref->root, ref->generation,
-				     ref->owner_objectid, node->ref_mod);
-		}
-		BUG_ON(ret);
-	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
-		WARN_ON(insert_reserved);
-		ret = drop_delayed_ref(trans, root, node);
-	}
-	return 0;
+	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		ret = run_delayed_tree_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		 node->type == BTRFS_SHARED_DATA_REF_KEY)
+		ret = run_delayed_data_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else
+		BUG();
+	return ret;
 }
 
 static noinline struct btrfs_delayed_ref_node *
@@ -919,7 +1771,7 @@ again:
 				rb_node);
 		if (ref->bytenr != head->node.bytenr)
 			break;
-		if (btrfs_delayed_node_to_ref(ref)->action == action)
+		if (ref->action == action)
 			return ref;
 		node = rb_prev(node);
 	}
@@ -937,6 +1789,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
 	int count = 0;
 	int must_insert_reserved = 0;
@@ -975,6 +1828,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		must_insert_reserved = locked_ref->must_insert_reserved;
 		locked_ref->must_insert_reserved = 0;
 
+		extent_op = locked_ref->extent_op;
+		locked_ref->extent_op = NULL;
+
 		/*
 		 * locked_ref is the head node, so we have to go one
 		 * node back for any delayed ref updates
@@ -986,6 +1842,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			 * so that any accounting fixes can happen
 			 */
 			ref = &locked_ref->node;
+
+			if (extent_op && must_insert_reserved) {
+				kfree(extent_op);
+				extent_op = NULL;
+			}
+
+			if (extent_op) {
+				spin_unlock(&delayed_refs->lock);
+
+				ret = run_delayed_extent_op(trans, root,
+							    ref, extent_op);
+				BUG_ON(ret);
+				kfree(extent_op);
+
+				cond_resched();
+				spin_lock(&delayed_refs->lock);
+				continue;
+			}
+
 			list_del_init(&locked_ref->cluster);
 			locked_ref = NULL;
 		}
@@ -993,14 +1868,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
+
 		spin_unlock(&delayed_refs->lock);
 
-		ret = run_one_delayed_ref(trans, root, ref,
+		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
 		BUG_ON(ret);
-		btrfs_put_delayed_ref(ref);
 
+		btrfs_put_delayed_ref(ref);
+		kfree(extent_op);
 		count++;
+
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
@@ -1095,25 +1973,112 @@ out:
 	return 0;
 }
 
-int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 objectid, u64 bytenr)
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int is_data)
+{
+	struct btrfs_delayed_extent_op *extent_op;
+	int ret;
+
+	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+	if (!extent_op)
+		return -ENOMEM;
+
+	extent_op->flags_to_set = flags;
+	extent_op->update_flags = 1;
+	extent_op->update_key = 0;
+	extent_op->is_data = is_data ? 1 : 0;
+
+	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+	if (ret)
+		kfree(extent_op);
+	return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_data_ref *data_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	int ret = 0;
+
+	ret = -ENOENT;
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out;
+
+	if (!mutex_trylock(&head->mutex)) {
+		atomic_inc(&head->node.refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+
+	node = rb_prev(&head->node.rb_node);
+	if (!node)
+		goto out_unlock;
+
+	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+	if (ref->bytenr != bytenr)
+		goto out_unlock;
+
+	ret = 1;
+	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
+		goto out_unlock;
+
+	data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+	node = rb_prev(node);
+	if (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		if (ref->bytenr == bytenr)
+			goto out_unlock;
+	}
+
+	if (data_ref->root != root->root_key.objectid ||
+	    data_ref->objectid != objectid || data_ref->offset != offset)
+		goto out_unlock;
+
+	ret = 0;
+out_unlock:
+	mutex_unlock(&head->mutex);
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					u64 objectid, u64 offset, u64 bytenr)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref_item;
+	struct btrfs_extent_data_ref *ref;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
-	struct btrfs_key found_key;
-	u64 ref_root;
-	u64 last_snapshot;
-	u32 nritems;
+	u32 item_size;
 	int ret;
 
 	key.objectid = bytenr;
 	key.offset = (u64)-1;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
-	path = btrfs_alloc_path();
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -1125,55 +2090,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 
 	path->slots[0]--;
 	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-	if (found_key.objectid != bytenr ||
-	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
+	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
 		goto out;
 
-	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
-	while (1) {
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret < 0)
-				goto out;
-			if (ret == 0)
-				continue;
-			break;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != bytenr)
-			break;
+	ret = 1;
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		goto out;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
-			path->slots[0]++;
-			continue;
-		}
+	if (item_size != sizeof(*ei) +
+	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+		goto out;
 
-		ref_item = btrfs_item_ptr(leaf, path->slots[0],
-					  struct btrfs_extent_ref);
-		ref_root = btrfs_ref_root(leaf, ref_item);
-		if ((ref_root != root->root_key.objectid &&
-		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
-		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
-			ret = 1;
-			goto out;
-		}
-		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
-			ret = 1;
+	if (btrfs_extent_generation(leaf, ei) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		goto out;
+
+	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	if (btrfs_extent_inline_ref_type(leaf, iref) !=
+	    BTRFS_EXTENT_DATA_REF_KEY)
+		goto out;
+
+	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+	if (btrfs_extent_refs(leaf, ei) !=
+	    btrfs_extent_data_ref_count(leaf, ref) ||
+	    btrfs_extent_data_ref_root(leaf, ref) !=
+	    root->root_key.objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_path *path;
+	int ret;
+	int ret2;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOENT;
+
+	do {
+		ret = check_committed_ref(trans, root, path, objectid,
+					  offset, bytenr);
+		if (ret && ret != -ENOENT)
 			goto out;
-		}
 
-		path->slots[0]++;
+		ret2 = check_delayed_ref(trans, root, path, objectid,
+					 offset, bytenr);
+	} while (ret2 == -EAGAIN);
+
+	if (ret2 && ret2 != -ENOENT) {
+		ret = ret2;
+		goto out;
 	}
-	ret = 0;
+
+	if (ret != -ENOENT || ret2 != -ENOENT)
+		ret = 0;
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+#if 0
 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct extent_buffer *buf, u32 nr_extents)
 {
@@ -1291,191 +2284,49 @@ static int refsort_cmp(const void *a_void, const void *b_void)
 		return 1;
 	return 0;
 }
+#endif
 
-
-noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
-			   struct extent_buffer *orig_buf,
-			   struct extent_buffer *buf, u32 *nr_extents)
+			   struct extent_buffer *buf,
+			   int full_backref, int inc)
 {
 	u64 bytenr;
+	u64 num_bytes;
+	u64 parent;
 	u64 ref_root;
-	u64 orig_root;
-	u64 ref_generation;
-	u64 orig_generation;
-	struct refsort *sorted;
 	u32 nritems;
-	u32 nr_file_extents = 0;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int level;
 	int ret = 0;
-	int faili = 0;
-	int refi = 0;
-	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
-	ref_generation = btrfs_header_generation(buf);
-	orig_root = btrfs_header_owner(orig_buf);
-	orig_generation = btrfs_header_generation(orig_buf);
-
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
-	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
-	BUG_ON(!sorted);
-
-	if (root->ref_cows) {
-		process_func = __btrfs_inc_extent_ref;
-	} else {
-		if (level == 0 &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
-			goto out;
-		if (level != 0 &&
-		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
-			goto out;
-		process_func = __btrfs_update_extent_ref;
-	}
-
-	/*
-	 * we make two passes through the items.  In the first pass we
-	 * only record the byte number and slot.  Then we sort based on
-	 * byte number and do the actual work based on the sorted results
-	 */
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-				continue;
-			fi = btrfs_item_ptr(buf, i,
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(buf, fi) ==
-			    BTRFS_FILE_EXTENT_INLINE)
-				continue;
-			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (bytenr == 0)
-				continue;
-
-			nr_file_extents++;
-			sorted[refi].bytenr = bytenr;
-			sorted[refi].slot = i;
-			refi++;
-		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
-			sorted[refi].bytenr = bytenr;
-			sorted[refi].slot = i;
-			refi++;
-		}
-	}
-	/*
-	 * if refi == 0, we didn't actually put anything into the sorted
-	 * array and we're done
-	 */
-	if (refi == 0)
-		goto out;
-
-	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
-
-	for (i = 0; i < refi; i++) {
-		cond_resched();
-		slot = sorted[i].slot;
-		bytenr = sorted[i].bytenr;
-
-		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, slot);
-			fi = btrfs_item_ptr(buf, slot,
-					    struct btrfs_file_extent_item);
-
-			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (bytenr == 0)
-				continue;
-
-			ret = process_func(trans, root, bytenr,
-				   btrfs_file_extent_disk_num_bytes(buf, fi),
-				   orig_buf->start, buf->start,
-				   orig_root, ref_root,
-				   orig_generation, ref_generation,
-				   key.objectid);
-
-			if (ret) {
-				faili = slot;
-				WARN_ON(1);
-				goto fail;
-			}
-		} else {
-			ret = process_func(trans, root, bytenr, buf->len,
-					   orig_buf->start, buf->start,
-					   orig_root, ref_root,
-					   orig_generation, ref_generation,
-					   level - 1);
-			if (ret) {
-				faili = slot;
-				WARN_ON(1);
-				goto fail;
-			}
-		}
-	}
-out:
-	kfree(sorted);
-	if (nr_extents) {
-		if (level == 0)
-			*nr_extents = nr_file_extents;
-		else
-			*nr_extents = nritems;
-	}
-	return 0;
-fail:
-	kfree(sorted);
-	WARN_ON(1);
-	return ret;
-}
-
-int btrfs_update_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct extent_buffer *orig_buf,
-		     struct extent_buffer *buf, int start_slot, int nr)
-
-{
-	u64 bytenr;
-	u64 ref_root;
-	u64 orig_root;
-	u64 ref_generation;
-	u64 orig_generation;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	int i;
-	int ret;
-	int slot;
-	int level;
-
-	BUG_ON(start_slot < 0);
-	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
-
-	ref_root = btrfs_header_owner(buf);
-	ref_generation = btrfs_header_generation(buf);
-	orig_root = btrfs_header_owner(orig_buf);
-	orig_generation = btrfs_header_generation(orig_buf);
-	level = btrfs_header_level(buf);
+	if (!root->ref_cows && level == 0)
+		return 0;
 
-	if (!root->ref_cows) {
-		if (level == 0 &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
-			return 0;
-		if (level != 0 &&
-		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
-			return 0;
-	}
+	if (inc)
+		process_func = btrfs_inc_extent_ref;
+	else
+		process_func = btrfs_free_extent;
 
-	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
-		cond_resched();
+	if (full_backref)
+		parent = buf->start;
+	else
+		parent = 0;
+
+	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, slot);
+			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
-			fi = btrfs_item_ptr(buf, slot,
+			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
@@ -1483,28 +2334,39 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (bytenr == 0)
 				continue;
-			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf, fi),
-				    orig_buf->start, buf->start,
-				    orig_root, ref_root, orig_generation,
-				    ref_generation, key.objectid);
+
+			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+			key.offset -= btrfs_file_extent_offset(buf, fi);
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, key.objectid,
+					   key.offset);
 			if (ret)
 				goto fail;
 		} else {
-			bytenr = btrfs_node_blockptr(buf, slot);
-			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    buf->len, orig_buf->start,
-					    buf->start, orig_root, ref_root,
-					    orig_generation, ref_generation,
-					    level - 1);
+			bytenr = btrfs_node_blockptr(buf, i);
+			num_bytes = btrfs_level_size(root, level - 1);
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, level - 1, 0);
 			if (ret)
 				goto fail;
 		}
 	}
 	return 0;
 fail:
-	WARN_ON(1);
-	return -1;
+	BUG();
+	return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2007,6 +2869,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
+	/* block accounting for super block */
+	spin_lock(&info->delalloc_lock);
+	old_val = btrfs_super_bytes_used(&info->super_copy);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_super_bytes_used(&info->super_copy, old_val);
+
+	/* block accounting for root item */
+	old_val = btrfs_root_used(&root->root_item);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_root_used(&root->root_item, old_val);
+	spin_unlock(&info->delalloc_lock);
+
 	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
@@ -2216,8 +3096,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_owner = btrfs_header_owner(buf);
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
-		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
-		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			*must_clean = buf;
@@ -2235,63 +3113,77 @@ pinit:
 	return 0;
 }
 
-/*
- * remove an extent from the root, returns 0 on success
- */
-static int __free_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, int pin, int mark_free,
-			 int refs_to_drop)
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_path *path;
 	struct btrfs_key key;
+	struct btrfs_path *path;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
 	int ret;
+	int is_data;
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
-	struct btrfs_extent_item *ei;
-	u32 refs;
+	u32 item_size;
+	u64 refs;
 
-	key.objectid = bytenr;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_bytes;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	path->reada = 1;
 	path->leave_spinning = 1;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, parent, root_objectid,
-				    ref_generation, owner_objectid, 1);
+
+	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+	BUG_ON(!is_data && refs_to_drop != 1);
+
+	ret = lookup_extent_backref(trans, extent_root, path, &iref,
+				    bytenr, num_bytes, parent,
+				    root_objectid, owner_objectid,
+				    owner_offset);
 	if (ret == 0) {
-		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
-		while (extent_slot > 0) {
-			extent_slot--;
-			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+		while (extent_slot >= 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
 					      extent_slot);
-			if (found_key.objectid != bytenr)
+			if (key.objectid != bytenr)
 				break;
-			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    found_key.offset == num_bytes) {
+			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes) {
 				found_extent = 1;
 				break;
 			}
 			if (path->slots[0] - extent_slot > 5)
 				break;
+			extent_slot--;
 		}
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
+		if (found_extent && item_size < sizeof(*ei))
+			found_extent = 0;
+#endif
 		if (!found_extent) {
+			BUG_ON(iref);
 			ret = remove_extent_backref(trans, extent_root, path,
-						    refs_to_drop);
+						    NULL, refs_to_drop,
+						    is_data);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
 			path->leave_spinning = 1;
+
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
 			if (ret) {
@@ -2307,82 +3199,98 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "parent %llu root %llu gen %llu owner %llu\n",
+		       "parent %llu root %llu  owner %llu offset %llu\n",
 		       (unsigned long long)bytenr,
 		       (unsigned long long)parent,
 		       (unsigned long long)root_objectid,
-		       (unsigned long long)ref_generation,
-		       (unsigned long long)owner_objectid);
+		       (unsigned long long)owner_objectid,
+		       (unsigned long long)owner_offset);
 	}
 
 	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, extent_slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		BUG_ON(found_extent || extent_slot != path->slots[0]);
+		ret = convert_extent_item_v0(trans, extent_root, path,
+					     owner_objectid, 0);
+		BUG_ON(ret < 0);
+
+		btrfs_release_path(extent_root, path);
+		path->leave_spinning = 1;
+
+		key.objectid = bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = num_bytes;
+
+		ret = btrfs_search_slot(trans, extent_root, &key, path,
+					-1, 1);
+		if (ret) {
+			printk(KERN_ERR "umm, got %d back from search"
+			       ", was looking for %llu\n", ret,
+			       (unsigned long long)bytenr);
+			btrfs_print_leaf(extent_root, path->nodes[0]);
+		}
+		BUG_ON(ret);
+		extent_slot = path->slots[0];
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, extent_slot);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, ei);
-
-	/*
-	 * we're not allowed to delete the extent item if there
-	 * are other delayed ref updates pending
-	 */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+	}
 
+	refs = btrfs_extent_refs(leaf, ei);
 	BUG_ON(refs < refs_to_drop);
 	refs -= refs_to_drop;
-	btrfs_set_extent_refs(leaf, ei, refs);
-	btrfs_mark_buffer_dirty(leaf);
 
-	if (refs == 0 && found_extent &&
-	    path->slots[0] == extent_slot + 1) {
-		struct btrfs_extent_ref *ref;
-		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
-		/* if the back ref and the extent are next to each other
-		 * they get deleted below in one shot
+	if (refs > 0) {
+		if (extent_op)
+			__run_delayed_extent_op(extent_op, leaf, ei);
+		/*
+		 * In the case of inline back ref, reference count will
+		 * be updated by remove_extent_backref
 		 */
-		path->slots[0] = extent_slot;
-		num_to_del = 2;
-	} else if (found_extent) {
-		/* otherwise delete the extent back ref */
-		ret = remove_extent_backref(trans, extent_root, path,
-					    refs_to_drop);
-		BUG_ON(ret);
-		/* if refs are 0, we need to setup the path for deletion */
-		if (refs == 0) {
-			btrfs_release_path(extent_root, path);
-			path->leave_spinning = 1;
-			ret = btrfs_search_slot(trans, extent_root, &key, path,
-						-1, 1);
+		if (iref) {
+			BUG_ON(!found_extent);
+		} else {
+			btrfs_set_extent_refs(leaf, ei, refs);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+		if (found_extent) {
+			ret = remove_extent_backref(trans, extent_root, path,
+						    iref, refs_to_drop,
+						    is_data);
 			BUG_ON(ret);
 		}
-	}
-
-	if (refs == 0) {
-		u64 super_used;
-		u64 root_used;
+	} else {
+		int mark_free = 0;
 		struct extent_buffer *must_clean = NULL;
 
-		if (pin) {
-			ret = pin_down_bytes(trans, root, path,
-				bytenr, num_bytes,
-				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
-				&must_clean);
-			if (ret > 0)
-				mark_free = 1;
-			BUG_ON(ret < 0);
+		if (found_extent) {
+			BUG_ON(is_data && refs_to_drop !=
+			       extent_data_ref_count(root, path, iref));
+			if (iref) {
+				BUG_ON(path->slots[0] != extent_slot);
+			} else {
+				BUG_ON(path->slots[0] != extent_slot + 1);
+				path->slots[0] = extent_slot;
+				num_to_del = 2;
+			}
 		}
 
-		/* block accounting for super block */
-		spin_lock(&info->delalloc_lock);
-		super_used = btrfs_super_bytes_used(&info->super_copy);
-		btrfs_set_super_bytes_used(&info->super_copy,
-					   super_used - num_bytes);
-
-		/* block accounting for root item */
-		root_used = btrfs_root_used(&root->root_item);
-		btrfs_set_root_used(&root->root_item,
-					   root_used - num_bytes);
-		spin_unlock(&info->delalloc_lock);
-
+		ret = pin_down_bytes(trans, root, path, bytenr,
+				     num_bytes, is_data, &must_clean);
+		if (ret > 0)
+			mark_free = 1;
+		BUG_ON(ret < 0);
 		/*
 		 * it is going to be very rare for someone to be waiting
 		 * on the block we're freeing.  del_items might need to
@@ -2403,7 +3311,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			free_extent_buffer(must_clean);
 		}
 
-		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
 		} else {
@@ -2421,34 +3329,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 }
 
 /*
- * remove an extent from the root, returns 0 on success
- */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					u64 bytenr, u64 num_bytes, u64 parent,
-					u64 root_objectid, u64 ref_generation,
-					u64 owner_objectid, int pin,
-					int refs_to_drop)
-{
-	WARN_ON(num_bytes < root->sectorsize);
-
-	/*
-	 * if metadata always pin
-	 * if data pin when any transaction has committed this
-	 */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
-	    ref_generation != trans->transid)
-		pin = 1;
-
-	if (ref_generation != trans->transid)
-		pin = 1;
-
-	return __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation,
-			    owner_objectid, pin, pin == 0, refs_to_drop);
-}
-
-/*
  * when we free an extent, it is possible (and likely) that we free the last
  * delayed ref for that extent as well.  This searches the delayed ref tree for
  * a given extent, and if there are no other delayed refs to be processed, it
@@ -2479,6 +3359,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (ref->bytenr == bytenr)
 		goto out;
 
+	if (head->extent_op) {
+		if (!head->must_insert_reserved)
+			goto out;
+		kfree(head->extent_op);
+		head->extent_op = NULL;
+	}
+
 	/*
 	 * waiting for the lock here would deadlock.  If someone else has it
 	 * locked they are already in the process of dropping it anyway
@@ -2507,7 +3394,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 
 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
-				  &head->node, head->must_insert_reserved);
+				  &head->node, head->extent_op,
+				  head->must_insert_reserved);
 	BUG_ON(ret);
 	btrfs_put_delayed_ref(&head->node);
 	return 0;
@@ -2519,32 +3407,32 @@ out:
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, int pin)
+		      u64 root_objectid, u64 owner, u64 offset)
 {
 	int ret;
 
 	/*
 	 * tree log blocks never actually go into the extent allocation
 	 * tree, just update pinning info and exit early.
-	 *
-	 * data extents referenced by the tree log do need to have
-	 * their reference counts bumped.
 	 */
-	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
-	} else {
-		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
-				       root_objectid, ref_generation,
-				       owner_objectid,
-				       BTRFS_DROP_DELAYED_REF, 1);
+	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_DROP_DELAYED_REF, NULL);
 		BUG_ON(ret);
 		ret = check_ref_cleanup(trans, root, bytenr);
 		BUG_ON(ret);
+	} else {
+		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, owner,
+					offset, BTRFS_DROP_DELAYED_REF, NULL);
+		BUG_ON(ret);
 	}
 	return ret;
 }
@@ -2969,99 +3857,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root, u64 parent,
-					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins,
-					 int ref_mod)
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod)
 {
 	int ret;
-	u64 super_used;
-	u64 root_used;
-	u64 num_bytes = ins->offset;
-	u32 sizes[2];
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_extent_item *extent_item;
-	struct btrfs_extent_ref *ref;
+	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
-	struct btrfs_key keys[2];
-
-	if (parent == 0)
-		parent = ins->objectid;
-
-	/* block accounting for super block */
-	spin_lock(&info->delalloc_lock);
-	super_used = btrfs_super_bytes_used(&info->super_copy);
-	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+	struct extent_buffer *leaf;
+	int type;
+	u32 size;
 
-	/* block accounting for root item */
-	root_used = btrfs_root_used(&root->root_item);
-	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
-	spin_unlock(&info->delalloc_lock);
+	if (parent > 0)
+		type = BTRFS_SHARED_DATA_REF_KEY;
+	else
+		type = BTRFS_EXTENT_DATA_REF_KEY;
 
-	memcpy(&keys[0], ins, sizeof(*ins));
-	keys[1].objectid = ins->objectid;
-	keys[1].type = BTRFS_EXTENT_REF_KEY;
-	keys[1].offset = parent;
-	sizes[0] = sizeof(*extent_item);
-	sizes[1] = sizeof(*ref);
+	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
 	path->leave_spinning = 1;
-	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
-				       sizes, 2);
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
 	BUG_ON(ret);
 
-	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
 				     struct btrfs_extent_item);
-	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
-	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
-			     struct btrfs_extent_ref);
-
-	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
-	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
-	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
+	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_DATA);
+
+	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (parent > 0) {
+		struct btrfs_shared_data_ref *ref;
+		ref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+	}
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-
-	trans->alloc_exclude_start = 0;
-	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
 
-	if (ret)
-		goto out;
-
-	ret = update_block_group(trans, root, ins->objectid,
-				 ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset,
+				 1, 0);
 	if (ret) {
 		printk(KERN_ERR "btrfs update block group failed for %llu "
 		       "%llu\n", (unsigned long long)ins->objectid,
 		       (unsigned long long)ins->offset);
 		BUG();
 	}
-out:
 	return ret;
 }
 
-int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins)
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_tree_block_info *block_info;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
 
-	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
-		return 0;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 
-	ret = btrfs_add_delayed_ref(trans, ins->objectid,
-				    ins->offset, parent, root_objectid,
-				    ref_generation, owner,
-				    BTRFS_ADD_DELAYED_EXTENT, 0);
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
 	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, extent_item, 1);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+
+	btrfs_set_tree_block_key(leaf, block_info, key);
+	btrfs_set_tree_block_level(leaf, block_info, level);
+
+	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	if (parent > 0) {
+		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_SHARED_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_TREE_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	ret = update_block_group(trans, root, ins->objectid, ins->offset,
+				 1, 0);
+	if (ret) {
+		printk(KERN_ERR "btrfs update block group failed for %llu "
+		       "%llu\n", (unsigned long long)ins->objectid,
+		       (unsigned long long)ins->offset);
+		BUG();
+	}
+	return ret;
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins)
+{
+	int ret;
+
+	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
+					 0, root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL);
 	return ret;
 }
 
@@ -3070,10 +4006,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
  * an extent has been allocated and makes sure to clear the free
  * space cache bits as well
  */
-int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins)
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins)
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
@@ -3087,8 +4023,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				      ins->offset);
 	BUG_ON(ret);
 	btrfs_put_block_group(block_group);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins, 1);
+	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+					 0, owner, offset, ins, 1);
 	return ret;
 }
 
@@ -3099,26 +4035,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 parent, u64 min_alloc_size,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data)
+static int alloc_tree_block(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    u64 num_bytes, u64 parent, u64 root_objectid,
+			    struct btrfs_disk_key *key, int level,
+			    u64 empty_size, u64 hint_byte, u64 search_end,
+			    struct btrfs_key *ins)
 {
 	int ret;
-	ret = __btrfs_reserve_extent(trans, root, num_bytes,
-				     min_alloc_size, empty_size, hint_byte,
-				     search_end, ins, data);
+	u64 flags = 0;
+
+	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+				     empty_size, hint_byte, search_end,
+				     ins, 0);
 	BUG_ON(ret);
+
+	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent == 0)
+			parent = ins->objectid;
+		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	} else
+		BUG_ON(parent > 0);
+
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = btrfs_add_delayed_ref(trans, ins->objectid,
-					    ins->offset, parent, root_objectid,
-					    ref_generation, owner_objectid,
-					    BTRFS_ADD_DELAYED_EXTENT, 0);
+		struct btrfs_delayed_extent_op *extent_op;
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+		if (key)
+			memcpy(&extent_op->key, key, sizeof(extent_op->key));
+		else
+			memset(&extent_op->key, 0, sizeof(extent_op->key));
+		extent_op->flags_to_set = flags;
+		extent_op->update_key = 1;
+		extent_op->update_flags = 1;
+		extent_op->is_data = 0;
+
+		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
+					ins->offset, parent, root_objectid,
+					level, BTRFS_ADD_DELAYED_EXTENT,
+					extent_op);
 		BUG_ON(ret);
 	}
-	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	return ret;
 }
 
@@ -3157,21 +4115,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize, u64 parent,
-					     u64 root_objectid,
-					     u64 ref_generation,
-					     int level,
-					     u64 hint,
-					     u64 empty_size)
+					struct btrfs_root *root, u32 blocksize,
+					u64 parent, u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
-				 root_objectid, ref_generation, level,
-				 empty_size, hint, (u64)-1, &ins, 0);
+	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
+			       key, level, empty_size, hint, (u64)-1, &ins);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -3185,32 +4139,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf)
 {
-	u64 leaf_owner;
-	u64 leaf_generation;
-	struct refsort *sorted;
+	u64 disk_bytenr;
+	u64 num_bytes;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
+	u32 nritems;
 	int i;
-	int nritems;
 	int ret;
-	int refi = 0;
-	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
-	leaf_owner = btrfs_header_owner(leaf);
-	leaf_generation = btrfs_header_generation(leaf);
 
-	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
-	/* we do this loop twice.  The first time we build a list
-	 * of the extents we have a reference on, then we sort the list
-	 * by bytenr.  The second time around we actually do the
-	 * extent freeing.
-	 */
 	for (i = 0; i < nritems; i++) {
-		u64 disk_bytenr;
 		cond_resched();
-
 		btrfs_item_key_to_cpu(leaf, &key, i);
 
 		/* only extents have references, skip everything else */
@@ -3230,45 +4171,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 
-		sorted[refi].bytenr = disk_bytenr;
-		sorted[refi].slot = i;
-		refi++;
-	}
-
-	if (refi == 0)
-		goto out;
-
-	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
-
-	for (i = 0; i < refi; i++) {
-		u64 disk_bytenr;
-
-		disk_bytenr = sorted[i].bytenr;
-		slot = sorted[i].slot;
-
-		cond_resched();
-
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-			continue;
-
-		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-
-		ret = btrfs_free_extent(trans, root, disk_bytenr,
-				btrfs_file_extent_disk_num_bytes(leaf, fi),
-				leaf->start, leaf_owner, leaf_generation,
-				key.objectid, 0);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
+					leaf->start, 0, key.objectid, 0);
 		BUG_ON(ret);
-
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
-		cond_resched();
 	}
-out:
-	kfree(sorted);
 	return 0;
 }
 
+#if 0
+
 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
@@ -3311,13 +4223,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+
 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 start,
 				     u64 len, u32 *refs)
 {
 	int ret;
 
-	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
+	ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3352,6 +4265,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+
 /*
  * this is used while deleting old snapshots, and it drops the refs
  * on a whole subtree starting from a level 1 node.
@@ -3645,32 +4559,36 @@ out:
 	cond_resched();
 	return 0;
 }
+#endif
 
 /*
  * helper function for drop_subtree, this function is similar to
  * walk_down_tree. The main difference is that it checks reference
  * counts while tree blocks are locked.
  */
-static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path, int *level)
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level)
 {
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
 	u64 bytenr;
 	u64 ptr_gen;
+	u64 refs;
+	u64 flags;
 	u32 blocksize;
-	u32 refs;
 	int ret;
 
 	cur = path->nodes[*level];
-	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
-				      &refs);
+	ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
+				       &refs, &flags);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
 
+	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
 	while (*level >= 0) {
 		cur = path->nodes[*level];
 		if (*level == 0) {
@@ -3692,16 +4610,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(next);
 		btrfs_set_lock_blocking(next);
 
-		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
-					      &refs);
+		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+					       &refs, &flags);
 		BUG_ON(ret);
 		if (refs > 1) {
 			parent = path->nodes[*level];
 			ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					*level - 1, 1);
+						blocksize, parent->start,
+						btrfs_header_owner(parent),
+						*level - 1, 0);
 			BUG_ON(ret);
 			path->slots[*level]++;
 			btrfs_tree_unlock(next);
@@ -3709,6 +4626,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
 		*level = btrfs_header_level(next);
 		path->nodes[*level] = next;
 		path->slots[*level] = 0;
@@ -3716,13 +4635,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 out:
-	parent = path->nodes[*level + 1];
+	if (path->nodes[*level] == root->node)
+		parent = path->nodes[*level];
+	else
+		parent = path->nodes[*level + 1];
 	bytenr = path->nodes[*level]->start;
 	blocksize = path->nodes[*level]->len;
 
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-			parent->start, btrfs_header_owner(parent),
-			btrfs_header_generation(parent), *level, 1);
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
+				btrfs_header_owner(parent), *level, 0);
 	BUG_ON(ret);
 
 	if (path->locks[*level]) {
@@ -3746,8 +4667,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path,
 				 int *level, int max_level)
 {
-	u64 root_owner;
-	u64 root_gen;
 	struct btrfs_root_item *root_item = &root->root_item;
 	int i;
 	int slot;
@@ -3755,24 +4674,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 
 	for (i = *level; i < max_level && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
-			struct extent_buffer *node;
-			struct btrfs_disk_key disk_key;
-
+		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
 			/*
 			 * there is more work to do in this level.
 			 * Update the drop_progress marker to reflect
 			 * the work we've done so far, and then bump
 			 * the slot number
 			 */
-			node = path->nodes[i];
 			path->slots[i]++;
-			*level = i;
 			WARN_ON(*level == 0);
-			btrfs_node_key(node, &disk_key, path->slots[i]);
-			memcpy(&root_item->drop_progress,
-			       &disk_key, sizeof(disk_key));
-			root_item->drop_level = i;
+			if (max_level == BTRFS_MAX_LEVEL) {
+				btrfs_node_key(path->nodes[i],
+					       &root_item->drop_progress,
+					       path->slots[i]);
+				root_item->drop_level = i;
+			}
+			*level = i;
 			return 0;
 		} else {
 			struct extent_buffer *parent;
@@ -3786,22 +4703,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			else
 				parent = path->nodes[*level + 1];
 
-			root_owner = btrfs_header_owner(parent);
-			root_gen = btrfs_header_generation(parent);
-
-			clean_tree_block(trans, root, path->nodes[*level]);
+			clean_tree_block(trans, root, path->nodes[i]);
 			ret = btrfs_free_extent(trans, root,
-						path->nodes[*level]->start,
-						path->nodes[*level]->len,
-						parent->start, root_owner,
-						root_gen, *level, 1);
+						path->nodes[i]->start,
+						path->nodes[i]->len,
+						parent->start,
+						btrfs_header_owner(parent),
+						*level, 0);
 			BUG_ON(ret);
 			if (path->locks[*level]) {
-				btrfs_tree_unlock(path->nodes[*level]);
-				path->locks[*level] = 0;
+				btrfs_tree_unlock(path->nodes[i]);
+				path->locks[i] = 0;
 			}
-			free_extent_buffer(path->nodes[*level]);
-			path->nodes[*level] = NULL;
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
 			*level = i + 1;
 		}
 	}
@@ -3820,21 +4735,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int wret;
 	int level;
 	struct btrfs_path *path;
-	int i;
-	int orig_level;
 	int update_count;
 	struct btrfs_root_item *root_item = &root->root_item;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
 	level = btrfs_header_level(root->node);
-	orig_level = level;
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
-		path->nodes[level] = root->node;
-		extent_buffer_get(root->node);
+		path->nodes[level] = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
+		path->locks[level] = 1;
 	} else {
 		struct btrfs_key key;
 		struct btrfs_disk_key found_key;
@@ -3856,12 +4768,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		 * unlock our path, this is safe because only this
 		 * function is allowed to delete this snapshot
 		 */
-		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-			if (path->nodes[i] && path->locks[i]) {
-				path->locks[i] = 0;
-				btrfs_tree_unlock(path->nodes[i]);
-			}
-		}
+		btrfs_unlock_up_safe(path, 0);
 	}
 	while (1) {
 		unsigned long update;
@@ -3882,8 +4789,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = -EAGAIN;
 			break;
 		}
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
 		for (update_count = 0; update_count < 16; update_count++) {
 			update = trans->delayed_ref_updates;
 			trans->delayed_ref_updates = 0;
@@ -3893,12 +4798,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 				break;
 		}
 	}
-	for (i = 0; i <= orig_level; i++) {
-		if (path->nodes[i]) {
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
-	}
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -3931,7 +4830,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path->slots[level] = 0;
 
 	while (1) {
-		wret = walk_down_subtree(trans, root, path, &level);
+		wret = walk_down_tree(trans, root, path, &level);
 		if (wret < 0)
 			ret = wret;
 		if (wret != 0)
@@ -3948,6 +4847,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#if 0
 static unsigned long calc_ra(unsigned long start, unsigned long last,
 			     unsigned long nr)
 {
@@ -5429,6 +6329,7 @@ out:
 	kfree(ref_path);
 	return ret;
 }
+#endif
 
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
@@ -5477,7 +6378,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	u64 calc;
 
 	spin_lock(&shrink_block_group->lock);
-	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+	if (btrfs_block_group_used(&shrink_block_group->item) +
+	    shrink_block_group->reserved > 0) {
 		spin_unlock(&shrink_block_group->lock);
 
 		trans = btrfs_start_transaction(root, 1);
@@ -5502,6 +6404,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	return 0;
 }
 
+
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+					 struct btrfs_block_group_cache *group)
+
+{
+	__alloc_chunk_for_shrink(root, group, 1);
+	set_block_group_readonly(group);
+	return 0;
+}
+
+#if 0
 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 u64 objectid, u64 size)
@@ -5781,6 +6694,7 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+#endif
 
 static int find_first_block_group(struct btrfs_root *root,
 		struct btrfs_path *path, struct btrfs_key *key)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc3..0726a73 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
-	u64 leaf_start;
 	u64 ram_bytes = 0;
-	u64 orig_parent = 0;
 	u64 disk_bytenr = 0;
 	u64 orig_locked_end = locked_end;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
-	u64 root_gen;
-	u64 root_owner;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_path *path;
@@ -340,9 +336,6 @@ next_slot:
 		bookend = 0;
 		found_extent = 0;
 		found_inline = 0;
-		leaf_start = 0;
-		root_gen = 0;
-		root_owner = 0;
 		compression = 0;
 		encryption = 0;
 		extent = NULL;
@@ -417,9 +410,6 @@ next_slot:
 		if (found_extent) {
 			read_extent_buffer(leaf, &old, (unsigned long)extent,
 					   sizeof(old));
-			root_gen = btrfs_header_generation(leaf);
-			root_owner = btrfs_header_owner(leaf);
-			leaf_start = leaf->start;
 		}
 
 		if (end < extent_end && end >= key.offset) {
@@ -443,14 +433,14 @@ next_slot:
 				}
 				locked_end = extent_end;
 			}
-			orig_parent = path->nodes[0]->start;
 			disk_bytenr = le64_to_cpu(old.disk_bytenr);
 			if (disk_bytenr != 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 					   disk_bytenr,
-					   le64_to_cpu(old.disk_num_bytes),
-					   orig_parent, root->root_key.objectid,
-					   trans->transid, inode->i_ino);
+					   le64_to_cpu(old.disk_num_bytes), 0,
+					   root->root_key.objectid,
+					   key.objectid, key.offset -
+					   le64_to_cpu(old.offset));
 				BUG_ON(ret);
 			}
 		}
@@ -568,17 +558,6 @@ next_slot:
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 			btrfs_set_lock_blocking(path->nodes[0]);
 
-			if (disk_bytenr != 0) {
-				ret = btrfs_update_extent_ref(trans, root,
-						disk_bytenr,
-						le64_to_cpu(old.disk_num_bytes),
-						orig_parent,
-						leaf->start,
-						root->root_key.objectid,
-						trans->transid, ins.objectid);
-
-				BUG_ON(ret);
-			}
 			path->leave_spinning = 0;
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0)
@@ -594,8 +573,9 @@ next_slot:
 				ret = btrfs_free_extent(trans, root,
 						old_disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
-						leaf_start, root_owner,
-						root_gen, key.objectid, 0);
+						0, root->root_key.objectid,
+						key.objectid, key.offset -
+						le64_to_cpu(old.offset));
 				BUG_ON(ret);
 				*hint_byte = old_disk_bytenr;
 			}
@@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	u64 bytenr;
 	u64 num_bytes;
 	u64 extent_end;
-	u64 extent_offset;
+	u64 orig_offset;
 	u64 other_start;
 	u64 other_end;
 	u64 split = start;
 	u64 locked_end = end;
-	u64 orig_parent;
 	int extent_type;
 	int split_end = 1;
 	int ret;
@@ -703,7 +682,7 @@ again:
 
 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
-	extent_offset = btrfs_file_extent_offset(leaf, fi);
+	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 
 	if (key.offset == start)
 		split = end;
@@ -711,8 +690,6 @@ again:
 	if (key.offset == start && extent_end == end) {
 		int del_nr = 0;
 		int del_slot = 0;
-		u64 leaf_owner = btrfs_header_owner(leaf);
-		u64 leaf_gen = btrfs_header_generation(leaf);
 		other_start = end;
 		other_end = 0;
 		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +698,8 @@ again:
 			del_slot = path->slots[0] + 1;
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						leaf->start, leaf_owner,
-						leaf_gen, inode->i_ino, 0);
+						0, root->root_key.objectid,
+						inode->i_ino, orig_offset);
 			BUG_ON(ret);
 		}
 		other_start = 0;
@@ -733,8 +710,8 @@ again:
 			del_slot = path->slots[0];
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						leaf->start, leaf_owner,
-						leaf_gen, inode->i_ino, 0);
+						0, root->root_key.objectid,
+						inode->i_ino, orig_offset);
 			BUG_ON(ret);
 		}
 		split_end = 0;
@@ -768,13 +745,12 @@ again:
 			locked_end = extent_end;
 		}
 		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
-		extent_offset += split - key.offset;
 	} else  {
 		BUG_ON(key.offset != start);
-		btrfs_set_file_extent_offset(leaf, fi, extent_offset +
-					     split - key.offset);
-		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
 		key.offset = split;
+		btrfs_set_file_extent_offset(leaf, fi, key.offset -
+					     orig_offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
 		btrfs_set_item_key_safe(trans, root, path, &key);
 		extent_end = split;
 	}
@@ -793,7 +769,8 @@ again:
 					    struct btrfs_file_extent_item);
 			key.offset = split;
 			btrfs_set_item_key_safe(trans, root, path, &key);
-			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_offset(leaf, fi, key.offset -
+						     orig_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							other_end - split);
 			goto done;
@@ -815,10 +792,9 @@ again:
 
 	btrfs_mark_buffer_dirty(leaf);
 
-	orig_parent = leaf->start;
-	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-				   orig_parent, root->root_key.objectid,
-				   trans->transid, inode->i_ino);
+	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+				   root->root_key.objectid,
+				   inode->i_ino, orig_offset);
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
@@ -833,20 +809,12 @@ again:
 	btrfs_set_file_extent_type(leaf, fi, extent_type);
 	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
-	btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+	btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
 	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
 	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
 	btrfs_set_file_extent_compression(leaf, fi, 0);
 	btrfs_set_file_extent_encryption(leaf, fi, 0);
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
-
-	if (orig_parent != leaf->start) {
-		ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
-					      orig_parent, leaf->start,
-					      root->root_key.objectid,
-					      trans->transid, inode->i_ino);
-		BUG_ON(ret);
-	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b019..917bf10 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -48,7 +48,6 @@
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
-#include "ref-cache.h"
 #include "compression.h"
 #include "locking.h"
 
@@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 cow_start;
 	u64 cur_offset;
 	u64 extent_end;
+	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
 	int extent_type;
@@ -1005,6 +1005,7 @@ next_slot:
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			extent_offset = btrfs_file_extent_offset(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
 			if (extent_end <= start) {
@@ -1022,9 +1023,10 @@ next_slot:
 			if (btrfs_extent_readonly(root, disk_bytenr))
 				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
-						  disk_bytenr))
+						  found_key.offset -
+						  extent_offset, disk_bytenr))
 				goto out_check;
-			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+			disk_bytenr += extent_offset;
 			disk_bytenr += cur_offset - found_key.offset;
 			num_bytes = min(end + 1, extent_end) - cur_offset;
 			/*
@@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
-					  root->root_key.objectid,
-					  trans->transid, inode->i_ino, &ins);
+	ret = btrfs_alloc_reserved_file_extent(trans, root,
+					root->root_key.objectid,
+					inode->i_ino, file_pos, &ins);
 	BUG_ON(ret);
 	btrfs_free_path(path);
 
@@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * crossing root thing.  we store the inode number in the
 		 * offset of the orphan item.
 		 */
-		inode = btrfs_iget_locked(root->fs_info->sb,
-					  found_key.offset, root);
-		if (!inode)
+		found_key.objectid = found_key.offset;
+		found_key.type = BTRFS_INODE_ITEM_KEY;
+		found_key.offset = 0;
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		if (IS_ERR(inode))
 			break;
 
-		if (inode->i_state & I_NEW) {
-			BTRFS_I(inode)->root = root;
-
-			/* have to set the location manually */
-			BTRFS_I(inode)->location.objectid = inode->i_ino;
-			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-			BTRFS_I(inode)->location.offset = 0;
-
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
-		}
-
 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
@@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
  * read an inode from the btree into the in-memory inode
  */
-void btrfs_read_locked_inode(struct inode *inode)
+static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -2599,9 +2591,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
 	u64 extent_num_bytes = 0;
+	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 root_gen = 0;
-	u64 root_owner = 0;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
@@ -2716,6 +2707,9 @@ search_again:
 				extent_num_bytes =
 					btrfs_file_extent_disk_num_bytes(leaf,
 									 fi);
+				extent_offset = found_key.offset -
+					btrfs_file_extent_offset(leaf, fi);
+
 				/* FIXME blocksize != 4096 */
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
@@ -2723,8 +2717,6 @@ search_again:
 					if (root->ref_cows)
 						inode_sub_bytes(inode, num_dec);
 				}
-				root_gen = btrfs_header_generation(leaf);
-				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			/*
@@ -2768,12 +2760,12 @@ delete:
 		} else {
 			break;
 		}
-		if (found_extent) {
+		if (found_extent && root->ref_cows) {
 			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes,
-						leaf->start, root_owner,
-						root_gen, inode->i_ino, 0);
+						extent_num_bytes, 0,
+						btrfs_header_owner(leaf),
+						inode->i_ino, extent_offset);
 			BUG_ON(ret);
 		}
 next:
@@ -3105,6 +3097,45 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
+static void inode_tree_add(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_inode *entry;
+	struct rb_node **p = &root->inode_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	spin_lock(&root->inode_lock);
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+		if (inode->i_ino < entry->vfs_inode.i_ino)
+			p = &(*p)->rb_left;
+		else if (inode->i_ino > entry->vfs_inode.i_ino)
+			p = &(*p)->rb_right;
+		else {
+			WARN_ON(!(entry->vfs_inode.i_state &
+				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
+			break;
+		}
+	}
+	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
+	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+	spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+		spin_lock(&root->inode_lock);
+		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+		spin_unlock(&root->inode_lock);
+		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+	}
+}
+
 static noinline void init_btrfs_i(struct inode *inode)
 {
 	struct btrfs_inode *bi = BTRFS_I(inode);
@@ -3130,6 +3161,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
+	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3184,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root;
 }
 
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    struct btrfs_root *root, int wait)
-{
-	struct inode *inode;
-	struct btrfs_iget_args args;
-	args.ino = objectid;
-	args.root = root;
-
-	if (wait) {
-		inode = ilookup5(s, objectid, btrfs_find_actor,
-				 (void *)&args);
-	} else {
-		inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
-					(void *)&args);
-	}
-	return inode;
-}
-
-struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(struct super_block *s,
+				       u64 objectid,
+				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
@@ -3188,24 +3203,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *is_new)
+			 struct btrfs_root *root)
 {
 	struct inode *inode;
 
 	inode = btrfs_iget_locked(s, location->objectid, root);
 	if (!inode)
-		return ERR_PTR(-EACCES);
+		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
 		BTRFS_I(inode)->root = root;
 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
 		btrfs_read_locked_inode(inode);
+
+		inode_tree_add(inode);
 		unlock_new_inode(inode);
-		if (is_new)
-			*is_new = 1;
-	} else {
-		if (is_new)
-			*is_new = 0;
 	}
 
 	return inode;
@@ -3218,7 +3230,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret, new;
+	int ret;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3248,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
-		inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
@@ -3631,6 +3643,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
 	insert_inode_hash(inode);
+	inode_tree_add(inode);
 	return inode;
 fail:
 	if (dir)
@@ -4683,6 +4696,7 @@ void btrfs_destroy_inode(struct inode *inode)
 			btrfs_put_ordered_extent(ordered);
 		}
 	}
+	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2624b53..54dfd45 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -82,22 +82,25 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (ret)
 		goto fail;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      objectid, trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto fail;
 	}
 
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(leaf, leaf->start);
 	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
 	write_extent_buffer(leaf, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_item = &root_item.inode;
@@ -125,7 +128,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	btrfs_set_root_dirid(&root_item, new_dirid);
 
 	key.objectid = objectid;
-	key.offset = 1;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
@@ -911,10 +914,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				if (disko) {
 					inode_add_bytes(inode, datal);
 					ret = btrfs_inc_extent_ref(trans, root,
-						   disko, diskl, leaf->start,
-						   root->root_key.objectid,
-						   trans->transid,
-						   inode->i_ino);
+							disko, diskl, 0,
+							root->root_key.objectid,
+							inode->i_ino,
+							new_key.offset - datao);
 					BUG_ON(ret);
 				}
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5f8f218..6d6523d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb,
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
 }
+static void print_extent_data_ref(struct extent_buffer *eb,
+				  struct btrfs_extent_data_ref *ref)
+{
+	printk(KERN_INFO "\t\textent data backref root %llu "
+	       "objectid %llu offset %llu count %u\n",
+	       (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
+	       (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
+	       (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+	       btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_disk_key key;
+	unsigned long end;
+	unsigned long ptr;
+	int type;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 flags;
+	u64 offset;
+
+	if (item_size < sizeof(*ei)) {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		struct btrfs_extent_item_v0 *ei0;
+		BUG_ON(item_size != sizeof(*ei0));
+		ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
+		printk(KERN_INFO "\t\textent refs %u\n",
+		       btrfs_extent_refs_v0(eb, ei0));
+		return;
+#else
+		BUG();
+#endif
+	}
+
+	ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(eb, ei);
+
+	printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
+	       (unsigned long long)btrfs_extent_refs(eb, ei),
+	       (unsigned long long)btrfs_extent_generation(eb, ei),
+	       (unsigned long long)flags);
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		info = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_tree_block_key(eb, info, &key);
+		printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+		       "level %d\n",
+		       (unsigned long long)btrfs_disk_key_objectid(&key),
+		       key.type,
+		       (unsigned long long)btrfs_disk_key_offset(&key),
+		       btrfs_tree_block_level(eb, info));
+		iref = (struct btrfs_extent_inline_ref *)(info + 1);
+	} else {
+		iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	}
+
+	ptr = (unsigned long)iref;
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(eb, iref);
+		offset = btrfs_extent_inline_ref_offset(eb, iref);
+		switch (type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref "
+				"root %llu\n", (unsigned long long)offset);
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref "
+				"parent %llu\n", (unsigned long long)offset);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			print_extent_data_ref(eb, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = (struct btrfs_shared_data_ref *)(iref + 1);
+			printk(KERN_INFO "\t\tshared data backref "
+			       "parent %llu count %u\n",
+			       (unsigned long long)offset,
+			       btrfs_shared_data_ref_count(eb, sref));
+			break;
+		default:
+			BUG();
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	WARN_ON(ptr > end);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
+{
+	struct btrfs_extent_ref_v0 *ref0;
+
+	ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
+	printk("\t\textent back ref root %llu gen %llu "
+		"owner %llu num_refs %lu\n",
+		(unsigned long long)btrfs_ref_root_v0(eb, ref0),
+		(unsigned long long)btrfs_ref_generation_v0(eb, ref0),
+		(unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+		(unsigned long)btrfs_ref_count_v0(eb, ref0));
+}
+#endif
+
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
+	u32 type;
 	u32 nr = btrfs_header_nritems(l);
 	struct btrfs_item *item;
-	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
 	struct btrfs_block_group_item *bi;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_dev_extent *dev_extent;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_dev_extent *dev_extent;
-	u32 type;
 
 	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
-			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk(KERN_INFO "\t\textent data refs %u\n",
-				btrfs_extent_refs(l, ei));
-			break;
-		case BTRFS_EXTENT_REF_KEY:
-			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
-			printk(KERN_INFO "\t\textent back ref root %llu "
-			       "gen %llu owner %llu num_refs %lu\n",
-			       (unsigned long long)btrfs_ref_root(l, ref),
-			       (unsigned long long)btrfs_ref_generation(l, ref),
-			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long)btrfs_ref_num_refs(l, ref));
+			print_extent_item(l, i);
+			break;
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref\n");
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref\n");
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = btrfs_item_ptr(l, i,
+					      struct btrfs_extent_data_ref);
+			print_extent_data_ref(l, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = btrfs_item_ptr(l, i,
+					      struct btrfs_shared_data_ref);
+			printk(KERN_INFO "\t\tshared data backref count %u\n",
+			       btrfs_shared_data_ref_count(l, sref));
 			break;
-
 		case BTRFS_EXTENT_DATA_KEY:
 			fi = btrfs_item_ptr(l, i,
 					    struct btrfs_file_extent_item);
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       (unsigned long long)
 			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
+		case BTRFS_EXTENT_REF_V0_KEY:
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			print_extent_ref_v0(l, i);
+#else
+			BUG();
+#endif
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 0000000..b23dc20
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3711 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+	struct rb_node rb_node;
+	u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+	/* objectid tree block owner */
+	u64 owner;
+	/* list of upper level blocks reference this block */
+	struct list_head upper;
+	/* list of child blocks in the cache */
+	struct list_head lower;
+	/* NULL if this node is not tree root */
+	struct btrfs_root *root;
+	/* extent buffer got by COW the block */
+	struct extent_buffer *eb;
+	/* level of tree block */
+	unsigned int level:8;
+	/* 1 if the block is root of old snapshot */
+	unsigned int old_root:1;
+	/* 1 if no child blocks in the cache */
+	unsigned int lowest:1;
+	/* is the extent buffer locked */
+	unsigned int locked:1;
+	/* has the block been processed */
+	unsigned int processed:1;
+	/* have backrefs of this block been checked */
+	unsigned int checked:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+	struct list_head list[2];
+	struct backref_node *node[2];
+	u64 blockptr;
+};
+
+#define LOWER	0
+#define UPPER	1
+
+struct backref_cache {
+	/* red black tree of all backref nodes in the cache */
+	struct rb_root rb_root;
+	/* list of backref nodes with no child block in the cache */
+	struct list_head pending[BTRFS_MAX_LEVEL];
+	spinlock_t lock;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+	void *data;
+};
+
+struct mapping_tree {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+	struct rb_node rb_node;
+	u64 bytenr;
+	struct btrfs_key key;
+	unsigned int level:8;
+	unsigned int key_ready:1;
+};
+
+/* inode vector */
+#define INODEVEC_SIZE 16
+
+struct inodevec {
+	struct list_head list;
+	struct inode *inode[INODEVEC_SIZE];
+	int nr;
+};
+
+struct reloc_control {
+	/* block group to relocate */
+	struct btrfs_block_group_cache *block_group;
+	/* extent tree */
+	struct btrfs_root *extent_root;
+	/* inode for moving data */
+	struct inode *data_inode;
+	struct btrfs_workers workers;
+	/* tree blocks have been processed */
+	struct extent_io_tree processed_blocks;
+	/* map start of tree root to corresponding reloc tree */
+	struct mapping_tree reloc_root_tree;
+	/* list of reloc trees */
+	struct list_head reloc_roots;
+	u64 search_start;
+	u64 extents_found;
+	u64 extents_skipped;
+	int stage;
+	int create_reloc_root;
+	unsigned int found_file_extent:1;
+	unsigned int found_old_snapshot:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS	0
+#define UPDATE_DATA_PTRS	1
+
+/*
+ * merge reloc tree to corresponding fs tree in worker threads
+ */
+struct async_merge {
+	struct btrfs_work work;
+	struct reloc_control *rc;
+	struct btrfs_root *root;
+	struct completion *done;
+	atomic_t *num_pending;
+};
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+	tree->rb_root.rb_node = NULL;
+	spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+	int i;
+	cache->rb_root.rb_node = NULL;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		INIT_LIST_HEAD(&cache->pending[i]);
+	spin_lock_init(&cache->lock);
+}
+
+static void backref_node_init(struct backref_node *node)
+{
+	memset(node, 0, sizeof(*node));
+	INIT_LIST_HEAD(&node->upper);
+	INIT_LIST_HEAD(&node->lower);
+	RB_CLEAR_NODE(&node->rb_node);
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+				   struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+	struct rb_node *n = root->rb_node;
+	struct tree_entry *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			n = n->rb_left;
+		else if (bytenr > entry->bytenr)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	return NULL;
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+					    struct backref_edge *edges[],
+					    int *index)
+{
+	struct backref_edge *edge;
+	int idx = *index;
+
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx++] = edge;
+		node = edge->node[UPPER];
+	}
+	*index = idx;
+	return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+					      int *index)
+{
+	struct backref_edge *edge;
+	struct backref_node *lower;
+	int idx = *index;
+
+	while (idx > 0) {
+		edge = edges[idx - 1];
+		lower = edge->node[LOWER];
+		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+			idx--;
+			continue;
+		}
+		edge = list_entry(edge->list[LOWER].next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx - 1] = edge;
+		*index = idx;
+		return edge->node[UPPER];
+	}
+	*index = 0;
+	return NULL;
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+	if (node->eb) {
+		if (node->locked) {
+			btrfs_tree_unlock(node->eb);
+			node->locked = 0;
+		}
+		free_extent_buffer(node->eb);
+		node->eb = NULL;
+	}
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+			      struct backref_node *node)
+{
+	BUG_ON(!node->lowest);
+	BUG_ON(!list_empty(&node->upper));
+
+	drop_node_buffer(node);
+	list_del(&node->lower);
+
+	rb_erase(&node->rb_node, &tree->rb_root);
+	kfree(node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+				struct backref_node *node)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+
+	if (!node)
+		return;
+
+	BUG_ON(!node->lowest);
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next, struct backref_edge,
+				  list[LOWER]);
+		upper = edge->node[UPPER];
+		list_del(&edge->list[LOWER]);
+		list_del(&edge->list[UPPER]);
+		kfree(edge);
+		/*
+		 * add the node to pending list if no other
+		 * child block cached.
+		 */
+		if (list_empty(&upper->lower)) {
+			list_add_tail(&upper->lower,
+				      &cache->pending[upper->level]);
+			upper->lowest = 1;
+		}
+	}
+	drop_backref_node(cache, node);
+}
+
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+					  u64 bytenr)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct btrfs_root *root = NULL;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		root = (struct btrfs_root *)node->data;
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+	return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_key key;
+
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	if (is_cowonly_root(root_objectid))
+		key.offset = 0;
+	else
+		key.offset = (u64)-1;
+
+	return btrfs_read_fs_root_no_name(fs_info, &key);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+				  struct extent_buffer *leaf,
+				  struct btrfs_extent_ref_v0 *ref0)
+{
+	struct btrfs_root *root;
+	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+	BUG_ON(IS_ERR(root));
+
+	if (root->ref_cows &&
+	    generation != btrfs_root_generation(&root->root_item))
+		return NULL;
+
+	return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+			unsigned long *ptr, unsigned long *end)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	u32 item_size;
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		return 1;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		return 1;
+	}
+
+	bi = (struct btrfs_tree_block_info *)(ei + 1);
+	*ptr = (unsigned long)(bi + 1);
+	*end = (unsigned long)ei + item_size;
+	return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static struct backref_node *build_backref_tree(struct reloc_control *rc,
+					       struct backref_cache *cache,
+					       struct btrfs_key *node_key,
+					       int level, u64 bytenr)
+{
+	struct btrfs_path *path1;
+	struct btrfs_path *path2;
+	struct extent_buffer *eb;
+	struct btrfs_root *root;
+	struct backref_node *cur;
+	struct backref_node *upper;
+	struct backref_node *lower;
+	struct backref_node *node = NULL;
+	struct backref_node *exist = NULL;
+	struct backref_edge *edge;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	unsigned long end;
+	unsigned long ptr;
+	LIST_HEAD(list);
+	int ret;
+	int err = 0;
+
+	path1 = btrfs_alloc_path();
+	path2 = btrfs_alloc_path();
+	if (!path1 || !path2) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_NOFS);
+	if (!node) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	backref_node_init(node);
+	node->bytenr = bytenr;
+	node->owner = 0;
+	node->level = level;
+	node->lowest = 1;
+	cur = node;
+again:
+	end = 0;
+	ptr = 0;
+	key.objectid = cur->bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path1->search_commit_root = 1;
+	path1->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+				0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	BUG_ON(!ret || !path1->slots[0]);
+
+	path1->slots[0]--;
+
+	WARN_ON(cur->checked);
+	if (!list_empty(&cur->upper)) {
+		/*
+		 * the backref was added previously when processsing
+		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
+		 */
+		BUG_ON(!list_is_singular(&cur->upper));
+		edge = list_entry(cur->upper.next, struct backref_edge,
+				  list[LOWER]);
+		BUG_ON(!list_empty(&edge->list[UPPER]));
+		exist = edge->node[UPPER];
+		/*
+		 * add the upper level block to pending list if we need
+		 * check its backrefs
+		 */
+		if (!exist->checked)
+			list_add_tail(&edge->list[UPPER], &list);
+	} else {
+		exist = NULL;
+	}
+
+	while (1) {
+		cond_resched();
+		eb = path1->nodes[0];
+
+		if (ptr >= end) {
+			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+				ret = btrfs_next_leaf(rc->extent_root, path1);
+				if (ret < 0) {
+					err = ret;
+					goto out;
+				}
+				if (ret > 0)
+					break;
+				eb = path1->nodes[0];
+			}
+
+			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+			if (key.objectid != cur->bytenr) {
+				WARN_ON(exist);
+				break;
+			}
+
+			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+				ret = find_inline_backref(eb, path1->slots[0],
+							  &ptr, &end);
+				if (ret)
+					goto next;
+			}
+		}
+
+		if (ptr < end) {
+			/* update key for inline back ref */
+			struct btrfs_extent_inline_ref *iref;
+			iref = (struct btrfs_extent_inline_ref *)ptr;
+			key.type = btrfs_extent_inline_ref_type(eb, iref);
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+
+		if (exist &&
+		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+		      exist->owner == key.offset) ||
+		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+		      exist->bytenr == key.offset))) {
+			exist = NULL;
+			goto next;
+		}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+			if (key.objectid == key.offset &&
+			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+				struct btrfs_extent_ref_v0 *ref0;
+				ref0 = btrfs_item_ptr(eb, path1->slots[0],
+						struct btrfs_extent_ref_v0);
+				root = find_tree_root(rc, eb, ref0);
+				if (root)
+					cur->root = root;
+				else
+					cur->old_root = 1;
+				break;
+			}
+#else
+		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+			if (key.objectid == key.offset) {
+				/*
+				 * only root blocks of reloc trees use
+				 * backref of this type.
+				 */
+				root = find_reloc_root(rc, cur->bytenr);
+				BUG_ON(!root);
+				cur->root = root;
+				break;
+			}
+
+			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+			rb_node = tree_search(&cache->rb_root, key.offset);
+			if (!rb_node) {
+				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				if (!upper) {
+					kfree(edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				backref_node_init(upper);
+				upper->bytenr = key.offset;
+				upper->owner = 0;
+				upper->level = cur->level + 1;
+				/*
+				 *  backrefs for the upper level block isn't
+				 *  cached, add the block to pending list
+				 */
+				list_add_tail(&edge->list[UPPER], &list);
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+			}
+			list_add(&edge->list[LOWER], &cur->upper);
+			edge->node[UPPER] = upper;
+			edge->node[LOWER] = cur;
+
+			goto next;
+		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+			goto next;
+		}
+
+		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+		root = read_fs_root(rc->extent_root->fs_info, key.offset);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			goto out;
+		}
+
+		if (btrfs_root_level(&root->root_item) == cur->level) {
+			/* tree root */
+			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+			       cur->bytenr);
+			cur->root = root;
+			break;
+		}
+
+		level = cur->level + 1;
+
+		/*
+		 * searching the tree to find upper level blocks
+		 * reference the block.
+		 */
+		path2->search_commit_root = 1;
+		path2->skip_locking = 1;
+		path2->lowest_level = level;
+		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+		path2->lowest_level = 0;
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		eb = path2->nodes[level];
+		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+			cur->bytenr);
+
+		lower = cur;
+		for (; level < BTRFS_MAX_LEVEL; level++) {
+			if (!path2->nodes[level]) {
+				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+				       lower->bytenr);
+				lower->root = root;
+				break;
+			}
+
+			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			eb = path2->nodes[level];
+			rb_node = tree_search(&cache->rb_root, eb->start);
+			if (!rb_node) {
+				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				if (!upper) {
+					kfree(edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				backref_node_init(upper);
+				upper->bytenr = eb->start;
+				upper->owner = btrfs_header_owner(eb);
+				upper->level = lower->level + 1;
+
+				/*
+				 * if we know the block isn't shared
+				 * we can void checking its backrefs.
+				 */
+				if (btrfs_block_can_be_shared(root, eb))
+					upper->checked = 0;
+				else
+					upper->checked = 1;
+
+				/*
+				 * add the block to pending list if we
+				 * need check its backrefs. only block
+				 * at 'cur->level + 1' is added to the
+				 * tail of pending list. this guarantees
+				 * we check backrefs from lower level
+				 * blocks to upper level blocks.
+				 */
+				if (!upper->checked &&
+				    level == cur->level + 1) {
+					list_add_tail(&edge->list[UPPER],
+						      &list);
+				} else
+					INIT_LIST_HEAD(&edge->list[UPPER]);
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				BUG_ON(!upper->checked);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+			}
+			list_add_tail(&edge->list[LOWER], &lower->upper);
+			edge->node[UPPER] = upper;
+			edge->node[LOWER] = lower;
+
+			if (rb_node)
+				break;
+			lower = upper;
+			upper = NULL;
+		}
+		btrfs_release_path(root, path2);
+next:
+		if (ptr < end) {
+			ptr += btrfs_extent_inline_ref_size(key.type);
+			if (ptr >= end) {
+				WARN_ON(ptr > end);
+				ptr = 0;
+				end = 0;
+			}
+		}
+		if (ptr >= end)
+			path1->slots[0]++;
+	}
+	btrfs_release_path(rc->extent_root, path1);
+
+	cur->checked = 1;
+	WARN_ON(exist);
+
+	/* the pending list isn't empty, take the first block to process */
+	if (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		cur = edge->node[UPPER];
+		goto again;
+	}
+
+	/*
+	 * everything goes well, connect backref nodes and insert backref nodes
+	 * into the cache.
+	 */
+	BUG_ON(!node->checked);
+	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+	BUG_ON(rb_node);
+
+	list_for_each_entry(edge, &node->upper, list[LOWER])
+		list_add_tail(&edge->list[UPPER], &list);
+
+	while (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		upper = edge->node[UPPER];
+
+		if (!RB_EMPTY_NODE(&upper->rb_node)) {
+			if (upper->lowest) {
+				list_del_init(&upper->lower);
+				upper->lowest = 0;
+			}
+
+			list_add_tail(&edge->list[UPPER], &upper->lower);
+			continue;
+		}
+
+		BUG_ON(!upper->checked);
+		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+				      &upper->rb_node);
+		BUG_ON(rb_node);
+
+		list_add_tail(&edge->list[UPPER], &upper->lower);
+
+		list_for_each_entry(edge, &upper->upper, list[LOWER])
+			list_add_tail(&edge->list[UPPER], &list);
+	}
+out:
+	btrfs_free_path(path1);
+	btrfs_free_path(path2);
+	if (err) {
+		INIT_LIST_HEAD(&list);
+		upper = node;
+		while (upper) {
+			if (RB_EMPTY_NODE(&upper->rb_node)) {
+				list_splice_tail(&upper->upper, &list);
+				kfree(upper);
+			}
+
+			if (list_empty(&list))
+				break;
+
+			edge = list_entry(list.next, struct backref_edge,
+					  list[LOWER]);
+			upper = edge->node[UPPER];
+			kfree(edge);
+		}
+		return ERR_PTR(err);
+	}
+	return node;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __add_reloc_root(struct btrfs_root *root)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	node = kmalloc(sizeof(*node), GFP_NOFS);
+	BUG_ON(!node);
+
+	node->bytenr = root->node->start;
+	node->data = root;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+			      node->bytenr, &node->rb_node);
+	spin_unlock(&rc->reloc_root_tree.lock);
+	BUG_ON(rb_node);
+
+	list_add_tail(&root->root_list, &rc->reloc_roots);
+	return 0;
+}
+
+/*
+ * helper to update/delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, int del)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node = NULL;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+			      root->commit_root->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+
+	BUG_ON((struct btrfs_root *)node->data != root);
+
+	if (!del) {
+		spin_lock(&rc->reloc_root_tree.lock);
+		node->bytenr = root->node->start;
+		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+				      node->bytenr, &node->rb_node);
+		spin_unlock(&rc->reloc_root_tree.lock);
+		BUG_ON(rb_node);
+	} else {
+		list_del_init(&root->root_list);
+		kfree(node);
+	}
+	return 0;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	int ret;
+
+	if (root->reloc_root) {
+		reloc_root = root->reloc_root;
+		reloc_root->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (!root->fs_info->reloc_ctl ||
+	    !root->fs_info->reloc_ctl->create_reloc_root ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = root->root_key.objectid;
+
+	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+			      BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(ret);
+
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_refs(root_item, 1);
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
+	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
+	root_item->drop_level = 0;
+
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						 &root_key);
+	BUG_ON(IS_ERR(reloc_root));
+	reloc_root->last_trans = trans->transid;
+
+	__add_reloc_root(reloc_root);
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	int del = 0;
+	int ret;
+
+	if (!root->reloc_root)
+		return 0;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (btrfs_root_refs(root_item) == 0) {
+		root->reloc_root = NULL;
+		del = 1;
+	}
+
+	__update_reloc_root(reloc_root, del);
+
+	if (reloc_root->commit_root != reloc_root->node) {
+		btrfs_set_root_node(root_item, reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		reloc_root->commit_root = btrfs_root_node(reloc_root);
+	}
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&reloc_root->root_key, root_item);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < entry->vfs_inode.i_ino)
+			node = node->rb_left;
+		else if (objectid > entry->vfs_inode.i_ino)
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= entry->vfs_inode.i_ino) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			return inode;
+		}
+
+		objectid = entry->vfs_inode.i_ino + 1;
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+	return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+			  struct btrfs_block_group_cache *block_group)
+{
+	if (bytenr >= block_group->key.objectid &&
+	    bytenr < block_group->key.objectid + block_group->key.offset)
+		return 1;
+	return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+			    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+				       bytenr, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+	       btrfs_file_extent_compression(leaf, fi) ||
+	       btrfs_file_extent_encryption(leaf, fi) ||
+	       btrfs_file_extent_other_encoding(leaf, fi));
+
+	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+		ret = 1;
+		goto out;
+	}
+
+	if (new_bytenr)
+		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static int replace_file_extents(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct btrfs_root *root,
+				struct extent_buffer *leaf,
+				struct list_head *inode_list)
+{
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	struct inodevec *ivec = NULL;
+	u64 parent;
+	u64 bytenr;
+	u64 new_bytenr;
+	u64 num_bytes;
+	u64 end;
+	u32 nritems;
+	u32 i;
+	int ret;
+	int first = 1;
+	int dirty = 0;
+
+	if (rc->stage != UPDATE_DATA_PTRS)
+		return 0;
+
+	/* reloc trees always use full backref */
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		parent = leaf->start;
+	else
+		parent = 0;
+
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group))
+			continue;
+
+		/*
+		 * if we are modifying block in fs tree, wait for readpage
+		 * to complete and drop the extent cache
+		 */
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+			if (!ivec || ivec->nr == INODEVEC_SIZE) {
+				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
+				BUG_ON(!ivec);
+				ivec->nr = 0;
+				list_add_tail(&ivec->list, inode_list);
+			}
+			if (first) {
+				inode = find_next_inode(root, key.objectid);
+				if (inode)
+					ivec->inode[ivec->nr++] = inode;
+				first = 0;
+			} else if (inode && inode->i_ino < key.objectid) {
+				inode = find_next_inode(root, key.objectid);
+				if (inode)
+					ivec->inode[ivec->nr++] = inode;
+			}
+			if (inode && inode->i_ino == key.objectid) {
+				end = key.offset +
+				      btrfs_file_extent_num_bytes(leaf, fi);
+				WARN_ON(!IS_ALIGNED(key.offset,
+						    root->sectorsize));
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+						      key.offset, end,
+						      GFP_NOFS);
+				if (!ret)
+					continue;
+
+				btrfs_drop_extent_cache(inode, key.offset, end,
+							1);
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      key.offset, end, GFP_NOFS);
+			}
+		}
+
+		ret = get_new_location(rc->data_inode, &new_bytenr,
+				       bytenr, num_bytes);
+		if (ret > 0)
+			continue;
+		BUG_ON(ret < 0);
+
+		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+		dirty = 1;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+					   num_bytes, parent,
+					   btrfs_header_owner(leaf),
+					   key.objectid, key.offset);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					parent, btrfs_header_owner(leaf),
+					key.objectid, key.offset);
+		BUG_ON(ret);
+	}
+	if (dirty)
+		btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+		     struct btrfs_path *path, int level)
+{
+	struct btrfs_disk_key key1;
+	struct btrfs_disk_key key2;
+	btrfs_node_key(eb, &key1, slot);
+	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+	return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static int replace_path(struct btrfs_trans_handle *trans,
+			struct btrfs_root *dest, struct btrfs_root *src,
+			struct btrfs_path *path, struct btrfs_key *next_key,
+			struct extent_buffer **leaf,
+			int lowest_level, int max_level)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *parent;
+	struct btrfs_key key;
+	u64 old_bytenr;
+	u64 new_bytenr;
+	u64 old_ptr_gen;
+	u64 new_ptr_gen;
+	u64 last_snapshot;
+	u32 blocksize;
+	int level;
+	int ret;
+	int slot;
+
+	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(lowest_level > 1 && leaf);
+
+	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+
+	slot = path->slots[lowest_level];
+	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+	eb = btrfs_lock_root_node(dest);
+	btrfs_set_lock_blocking(eb);
+	level = btrfs_header_level(eb);
+
+	if (level < lowest_level) {
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+		return 0;
+	}
+
+	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+	BUG_ON(ret);
+	btrfs_set_lock_blocking(eb);
+
+	if (next_key) {
+		next_key->objectid = (u64)-1;
+		next_key->type = (u8)-1;
+		next_key->offset = (u64)-1;
+	}
+
+	parent = eb;
+	while (1) {
+		level = btrfs_header_level(parent);
+		BUG_ON(level < lowest_level);
+
+		ret = btrfs_bin_search(parent, &key, level, &slot);
+		if (ret && slot > 0)
+			slot--;
+
+		if (next_key && slot + 1 < btrfs_header_nritems(parent))
+			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+		old_bytenr = btrfs_node_blockptr(parent, slot);
+		blocksize = btrfs_level_size(dest, level - 1);
+		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+		if (level <= max_level) {
+			eb = path->nodes[level];
+			new_bytenr = btrfs_node_blockptr(eb,
+							path->slots[level]);
+			new_ptr_gen = btrfs_node_ptr_generation(eb,
+							path->slots[level]);
+		} else {
+			new_bytenr = 0;
+			new_ptr_gen = 0;
+		}
+
+		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
+			WARN_ON(1);
+			ret = level;
+			break;
+		}
+
+		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+		    memcmp_node_keys(parent, slot, path, level)) {
+			if (level <= lowest_level && !leaf) {
+				ret = 0;
+				break;
+			}
+
+			eb = read_tree_block(dest, old_bytenr, blocksize,
+					     old_ptr_gen);
+			btrfs_tree_lock(eb);
+			ret = btrfs_cow_block(trans, dest, eb, parent,
+					      slot, &eb);
+			BUG_ON(ret);
+			btrfs_set_lock_blocking(eb);
+
+			if (level <= lowest_level) {
+				*leaf = eb;
+				ret = 0;
+				break;
+			}
+
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+
+			parent = eb;
+			continue;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &key,
+				      path->slots[level]);
+		btrfs_release_path(src, path);
+
+		path->lowest_level = level;
+		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+		path->lowest_level = 0;
+		BUG_ON(ret);
+
+		/*
+		 * swap blocks in fs tree and reloc tree.
+		 */
+		btrfs_set_node_blockptr(parent, slot, new_bytenr);
+		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+		btrfs_mark_buffer_dirty(parent);
+
+		btrfs_set_node_blockptr(path->nodes[level],
+					path->slots[level], old_bytenr);
+		btrfs_set_node_ptr_generation(path->nodes[level],
+					      path->slots[level], old_ptr_gen);
+		btrfs_mark_buffer_dirty(path->nodes[level]);
+
+		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0);
+		BUG_ON(ret);
+		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0);
+		BUG_ON(ret);
+
+		btrfs_unlock_up_safe(path, 0);
+
+		ret = level;
+		break;
+	}
+	btrfs_tree_unlock(parent);
+	free_extent_buffer(parent);
+	return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+		       int *level)
+{
+	struct extent_buffer *eb;
+	int i;
+	u64 last_snapshot;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = 0; i < *level; i++) {
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+
+	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] + 1 < nritems) {
+			path->slots[i]++;
+			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+			    last_snapshot)
+				continue;
+
+			*level = i;
+			return 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+	return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+			 int *level)
+{
+	struct extent_buffer *eb = NULL;
+	int i;
+	u64 bytenr;
+	u64 ptr_gen = 0;
+	u64 last_snapshot;
+	u32 blocksize;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = *level; i > 0; i--) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] < nritems) {
+			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+			if (ptr_gen > last_snapshot)
+				break;
+			path->slots[i]++;
+		}
+		if (path->slots[i] >= nritems) {
+			if (i == *level)
+				break;
+			*level = i + 1;
+			return 0;
+		}
+		if (i == 1) {
+			*level = i;
+			return 0;
+		}
+
+		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
+		blocksize = btrfs_level_size(root, i - 1);
+		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		BUG_ON(btrfs_header_level(eb) != i - 1);
+		path->nodes[i - 1] = eb;
+		path->slots[i - 1] = 0;
+	}
+	return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+				   struct btrfs_key *min_key,
+				   struct btrfs_key *max_key)
+{
+	struct inode *inode = NULL;
+	u64 objectid;
+	u64 start, end;
+
+	objectid = min_key->objectid;
+	while (1) {
+		cond_resched();
+		iput(inode);
+
+		if (objectid > max_key->objectid)
+			break;
+
+		inode = find_next_inode(root, objectid);
+		if (!inode)
+			break;
+
+		if (inode->i_ino > max_key->objectid) {
+			iput(inode);
+			break;
+		}
+
+		objectid = inode->i_ino + 1;
+		if (!S_ISREG(inode->i_mode))
+			continue;
+
+		if (unlikely(min_key->objectid == inode->i_ino)) {
+			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+				start = 0;
+			else {
+				start = min_key->offset;
+				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+			}
+		} else {
+			start = 0;
+		}
+
+		if (unlikely(max_key->objectid == inode->i_ino)) {
+			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+				end = (u64)-1;
+			} else {
+				if (max_key->offset == 0)
+					continue;
+				end = max_key->offset;
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+			}
+		} else {
+			end = (u64)-1;
+		}
+
+		/* the lock_extent waits for readpage to complete */
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		btrfs_drop_extent_cache(inode, start, end, 1);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	}
+	return 0;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key)
+
+{
+	while (level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			break;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+			return 0;
+		}
+		level++;
+	}
+	return 1;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+					       struct btrfs_root *root)
+{
+	LIST_HEAD(inode_list);
+	struct btrfs_key key;
+	struct btrfs_key next_key;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf = NULL;
+	unsigned long nr;
+	int level;
+	int max_level;
+	int replaced = 0;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_root_level(root_item);
+		extent_buffer_get(reloc_root->node);
+		path->nodes[level] = reloc_root->node;
+		path->slots[level] = 0;
+	} else {
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+		level = root_item->drop_level;
+		BUG_ON(level == 0);
+		path->lowest_level = level;
+		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_free_path(path);
+			return ret;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+				      path->slots[level]);
+		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+		btrfs_unlock_up_safe(path, 0);
+	}
+
+	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+		trans = btrfs_start_transaction(root, 1);
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, 0);
+		btrfs_release_path(reloc_root, path);
+
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		leaf = path->nodes[0];
+		btrfs_unlock_up_safe(path, 1);
+		ret = replace_file_extents(trans, rc, root, leaf,
+					   &inode_list);
+		if (ret < 0)
+			err = ret;
+		goto out;
+	}
+
+	memset(&next_key, 0, sizeof(next_key));
+
+	while (1) {
+		leaf = NULL;
+		replaced = 0;
+		trans = btrfs_start_transaction(root, 1);
+		max_level = level;
+
+		ret = walk_down_reloc_tree(reloc_root, path, &level);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0)
+			break;
+
+		if (!find_next_key(path, level, &key) &&
+		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+			ret = 0;
+		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
+			ret = replace_path(trans, root, reloc_root,
+					   path, &next_key, &leaf,
+					   level, max_level);
+		} else {
+			ret = replace_path(trans, root, reloc_root,
+					   path, &next_key, NULL,
+					   level, max_level);
+		}
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		if (ret > 0) {
+			level = ret;
+			btrfs_node_key_to_cpu(path->nodes[level], &key,
+					      path->slots[level]);
+			replaced = 1;
+		} else if (leaf) {
+			/*
+			 * no block got replaced, try replacing file extents
+			 */
+			btrfs_item_key_to_cpu(leaf, &key, 0);
+			ret = replace_file_extents(trans, rc, root, leaf,
+						   &inode_list);
+			btrfs_tree_unlock(leaf);
+			free_extent_buffer(leaf);
+			BUG_ON(ret < 0);
+		}
+
+		ret = walk_up_reloc_tree(reloc_root, path, &level);
+		if (ret > 0)
+			break;
+
+		BUG_ON(level == 0);
+		/*
+		 * save the merging progress in the drop_progress.
+		 * this is OK since root refs == 1 in this case.
+		 */
+		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+			       path->slots[level]);
+		root_item->drop_level = level;
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+
+		btrfs_btree_balance_dirty(root, nr);
+
+		if (replaced && rc->stage == UPDATE_DATA_PTRS)
+			invalidate_extent_cache(root, &key, &next_key);
+	}
+
+	/*
+	 * handle the case only one block in the fs tree need to be
+	 * relocated and the block is tree root.
+	 */
+	leaf = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	if (ret < 0)
+		err = ret;
+out:
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		memset(&root_item->drop_progress, 0,
+		       sizeof(root_item->drop_progress));
+		root_item->drop_level = 0;
+		btrfs_set_root_refs(root_item, 0);
+	}
+
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+
+	btrfs_btree_balance_dirty(root, nr);
+
+	/*
+	 * put inodes while we aren't holding the tree locks
+	 */
+	while (!list_empty(&inode_list)) {
+		struct inodevec *ivec;
+		ivec = list_entry(inode_list.next, struct inodevec, list);
+		list_del(&ivec->list);
+		while (ivec->nr > 0) {
+			ivec->nr--;
+			iput(ivec->inode[ivec->nr]);
+		}
+		kfree(ivec);
+	}
+
+	if (replaced && rc->stage == UPDATE_DATA_PTRS)
+		invalidate_extent_cache(root, &key, &next_key);
+
+	return err;
+}
+
+/*
+ * callback for the work threads.
+ * this function merges reloc tree with corresponding fs tree,
+ * and then drops the reloc tree.
+ */
+static void merge_func(struct btrfs_work *work)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_root *reloc_root;
+	struct async_merge *async;
+
+	async = container_of(work, struct async_merge, work);
+	reloc_root = async->root;
+
+	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+		root = read_fs_root(reloc_root->fs_info,
+				    reloc_root->root_key.offset);
+		BUG_ON(IS_ERR(root));
+		BUG_ON(root->reloc_root != reloc_root);
+
+		merge_reloc_root(async->rc, root);
+
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_update_reloc_root(trans, root);
+		btrfs_end_transaction(trans, root);
+	}
+
+	btrfs_drop_dead_root(reloc_root);
+
+	if (atomic_dec_and_test(async->num_pending))
+		complete(async->done);
+
+	kfree(async);
+}
+
+static int merge_reloc_roots(struct reloc_control *rc)
+{
+	struct async_merge *async;
+	struct btrfs_root *root;
+	struct completion done;
+	atomic_t num_pending;
+
+	init_completion(&done);
+	atomic_set(&num_pending, 1);
+
+	while (!list_empty(&rc->reloc_roots)) {
+		root = list_entry(rc->reloc_roots.next,
+				  struct btrfs_root, root_list);
+		list_del_init(&root->root_list);
+
+		async = kmalloc(sizeof(*async), GFP_NOFS);
+		BUG_ON(!async);
+		async->work.func = merge_func;
+		async->work.flags = 0;
+		async->rc = rc;
+		async->root = root;
+		async->done = &done;
+		async->num_pending = &num_pending;
+		atomic_inc(&num_pending);
+		btrfs_queue_worker(&rc->workers, &async->work);
+	}
+
+	if (!atomic_dec_and_test(&num_pending))
+		wait_for_completion(&done);
+
+	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+	return 0;
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	while ((rb_node = rb_first(blocks))) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		rb_erase(rb_node, blocks);
+		kfree(block);
+	}
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *reloc_root)
+{
+	struct btrfs_root *root;
+
+	if (reloc_root->last_trans == trans->transid)
+		return 0;
+
+	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+	BUG_ON(IS_ERR(root));
+	BUG_ON(root->reloc_root != reloc_root);
+
+	return btrfs_record_root_in_trans(trans, root);
+}
+
+/*
+ * select one tree from trees that references the block.
+ * for blocks in refernce counted trees, we preper reloc tree.
+ * if no reloc tree found and reloc_only is true, NULL is returned.
+ */
+static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
+					    struct backref_node *node,
+					    struct backref_edge *edges[],
+					    int *nr, int reloc_only)
+{
+	struct backref_node *next;
+	struct btrfs_root *root;
+	int index;
+	int loop = 0;
+again:
+	index = 0;
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		if (!root) {
+			BUG_ON(!node->old_root);
+			goto skip;
+		}
+
+		/* no other choice for non-refernce counted tree */
+		if (!root->ref_cows) {
+			BUG_ON(reloc_only);
+			break;
+		}
+
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+			record_reloc_root_in_trans(trans, root);
+			break;
+		}
+
+		if (loop) {
+			btrfs_record_root_in_trans(trans, root);
+			break;
+		}
+
+		if (reloc_only || next != node) {
+			if (!root->reloc_root)
+				btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			/*
+			 * if the reloc tree was created in current
+			 * transation, there is no node in backref tree
+			 * corresponds to the root of the reloc tree.
+			 */
+			if (btrfs_root_last_snapshot(&root->root_item) ==
+			    trans->transid - 1)
+				break;
+		}
+skip:
+		root = NULL;
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!root && !loop && !reloc_only) {
+		loop = 1;
+		goto again;
+	}
+
+	if (root)
+		*nr = index;
+	else
+		*nr = 0;
+
+	return root;
+}
+
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
+				   struct backref_node *node)
+{
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int nr;
+	return __select_one_root(trans, node, edges, &nr, 0);
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct backref_node *node,
+				     struct backref_edge *edges[], int *nr)
+{
+	return __select_one_root(trans, node, edges, nr, 1);
+}
+
+static void grab_path_buffers(struct btrfs_path *path,
+			      struct backref_node *node,
+			      struct backref_edge *edges[], int nr)
+{
+	int i = 0;
+	while (1) {
+		drop_node_buffer(node);
+		node->eb = path->nodes[node->level];
+		BUG_ON(!node->eb);
+		if (path->locks[node->level])
+			node->locked = 1;
+		path->nodes[node->level] = NULL;
+		path->locks[node->level] = 0;
+
+		if (i >= nr)
+			break;
+
+		edges[i]->blockptr = node->eb->start;
+		node = edges[i]->node[UPPER];
+		i++;
+	}
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct backref_node *node,
+			 struct btrfs_key *key,
+			 struct btrfs_path *path, int lowest)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	u32 blocksize;
+	u64 bytenr;
+	u64 generation;
+	int nr;
+	int slot;
+	int ret;
+	int err = 0;
+
+	BUG_ON(lowest && node->eb);
+
+	path->lowest_level = node->level + 1;
+	list_for_each_entry(edge, &node->upper, list[LOWER]) {
+		cond_resched();
+		if (node->eb && node->eb->start == edge->blockptr)
+			continue;
+
+		upper = edge->node[UPPER];
+		root = select_reloc_root(trans, upper, edges, &nr);
+		if (!root)
+			continue;
+
+		if (upper->eb && !upper->locked)
+			drop_node_buffer(upper);
+
+		if (!upper->eb) {
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			BUG_ON(ret > 0);
+
+			slot = path->slots[upper->level];
+
+			btrfs_unlock_up_safe(path, upper->level + 1);
+			grab_path_buffers(path, upper, edges, nr);
+
+			btrfs_release_path(NULL, path);
+		} else {
+			ret = btrfs_bin_search(upper->eb, key, upper->level,
+					       &slot);
+			BUG_ON(ret);
+		}
+
+		bytenr = btrfs_node_blockptr(upper->eb, slot);
+		if (!lowest) {
+			if (node->eb->start == bytenr) {
+				btrfs_tree_unlock(upper->eb);
+				upper->locked = 0;
+				continue;
+			}
+		} else {
+			BUG_ON(node->bytenr != bytenr);
+		}
+
+		blocksize = btrfs_level_size(root, node->level);
+		generation = btrfs_node_ptr_generation(upper->eb, slot);
+		eb = read_tree_block(root, bytenr, blocksize, generation);
+		btrfs_tree_lock(eb);
+		btrfs_set_lock_blocking(eb);
+
+		if (!node->eb) {
+			ret = btrfs_cow_block(trans, root, eb, upper->eb,
+					      slot, &eb);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			btrfs_set_lock_blocking(eb);
+			node->eb = eb;
+			node->locked = 1;
+		} else {
+			btrfs_set_node_blockptr(upper->eb, slot,
+						node->eb->start);
+			btrfs_set_node_ptr_generation(upper->eb, slot,
+						      trans->transid);
+			btrfs_mark_buffer_dirty(upper->eb);
+
+			ret = btrfs_inc_extent_ref(trans, root,
+						node->eb->start, blocksize,
+						upper->eb->start,
+						btrfs_header_owner(upper->eb),
+						node->level, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
+			BUG_ON(ret);
+
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		if (!lowest) {
+			btrfs_tree_unlock(upper->eb);
+			upper->locked = 0;
+		}
+	}
+	path->lowest_level = 0;
+	return err;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct backref_node *node,
+			 struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	if (!node->eb || list_empty(&node->upper))
+		return 0;
+
+	btrfs_node_key_to_cpu(node->eb, &key, 0);
+	return do_relocation(trans, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+				struct backref_cache *cache,
+				struct btrfs_path *path)
+{
+	struct backref_node *node;
+	int level;
+	int ret;
+	int err = 0;
+
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		while (!list_empty(&cache->pending[level])) {
+			node = list_entry(cache->pending[level].next,
+					  struct backref_node, lower);
+			BUG_ON(node->level != level);
+
+			ret = link_to_upper(trans, node, path);
+			if (ret < 0)
+				err = ret;
+			/*
+			 * this remove the node from the pending list and
+			 * may add some other nodes to the level + 1
+			 * pending list
+			 */
+			remove_backref_node(cache, node);
+		}
+	}
+	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+	return err;
+}
+
+static void mark_block_processed(struct reloc_control *rc,
+				 struct backref_node *node)
+{
+	u32 blocksize;
+	if (node->level == 0 ||
+	    in_block_group(node->bytenr, rc->block_group)) {
+		blocksize = btrfs_level_size(rc->extent_root, node->level);
+		set_extent_bits(&rc->processed_blocks, node->bytenr,
+				node->bytenr + blocksize - 1, EXTENT_DIRTY,
+				GFP_NOFS);
+	}
+	node->processed = 1;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+				    struct backref_node *node)
+{
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int index = 0;
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed)
+				break;
+
+			mark_block_processed(rc, next);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+}
+
+static int tree_block_processed(u64 bytenr, u32 blocksize,
+				struct reloc_control *rc)
+{
+	if (test_range_bit(&rc->processed_blocks, bytenr,
+			   bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+		return 1;
+	return 0;
+}
+
+/*
+ * check if there are any file extent pointers in the leaf point to
+ * data require processing
+ */
+static int check_file_extents(struct reloc_control *rc,
+			      u64 bytenr, u32 blocksize, u64 ptr_gen)
+{
+	struct btrfs_key found_key;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int i;
+	int ret = 0;
+
+	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
+
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		btrfs_item_key_to_cpu(leaf, &found_key, i);
+		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		if (in_block_group(bytenr, rc->block_group)) {
+			ret = 1;
+			break;
+		}
+	}
+	free_extent_buffer(leaf);
+	return ret;
+}
+
+/*
+ * scan child blocks of a given block to find blocks require processing
+ */
+static int add_child_blocks(struct btrfs_trans_handle *trans,
+			    struct reloc_control *rc,
+			    struct backref_node *node,
+			    struct rb_root *blocks)
+{
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	u64 bytenr;
+	u64 ptr_gen;
+	u32 blocksize;
+	u32 nritems;
+	int i;
+	int err = 0;
+
+	nritems = btrfs_header_nritems(node->eb);
+	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		bytenr = btrfs_node_blockptr(node->eb, i);
+		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+		if (ptr_gen == trans->transid)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+			continue;
+		if (tree_block_processed(bytenr, blocksize, rc))
+			continue;
+
+		readahead_tree_block(rc->extent_root,
+				     bytenr, blocksize, ptr_gen);
+	}
+
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		bytenr = btrfs_node_blockptr(node->eb, i);
+		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+		if (ptr_gen == trans->transid)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+			continue;
+		if (tree_block_processed(bytenr, blocksize, rc))
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
+			continue;
+
+		block = kmalloc(sizeof(*block), GFP_NOFS);
+		if (!block) {
+			err = -ENOMEM;
+			break;
+		}
+		block->bytenr = bytenr;
+		btrfs_node_key_to_cpu(node->eb, &block->key, i);
+		block->level = node->level - 1;
+		block->key_ready = 1;
+		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+		BUG_ON(rb_node);
+	}
+	if (err)
+		free_block_list(blocks);
+	return err;
+}
+
+/*
+ * find adjacent blocks require processing
+ */
+static noinline_for_stack
+int add_adjacent_blocks(struct btrfs_trans_handle *trans,
+			struct reloc_control *rc,
+			struct backref_cache *cache,
+			struct rb_root *blocks, int level,
+			struct backref_node **upper)
+{
+	struct backref_node *node;
+	int ret = 0;
+
+	WARN_ON(!list_empty(&cache->pending[level]));
+
+	if (list_empty(&cache->pending[level + 1]))
+		return 1;
+
+	node = list_entry(cache->pending[level + 1].next,
+			  struct backref_node, lower);
+	if (node->eb)
+		ret = add_child_blocks(trans, rc, node, blocks);
+
+	*upper = node;
+	return ret;
+}
+
+static int get_tree_block_key(struct reloc_control *rc,
+			      struct tree_block *block)
+{
+	struct extent_buffer *eb;
+
+	BUG_ON(block->key_ready);
+	eb = read_tree_block(rc->extent_root, block->bytenr,
+			     block->key.objectid, block->key.offset);
+	WARN_ON(btrfs_header_level(eb) != block->level);
+	if (block->level == 0)
+		btrfs_item_key_to_cpu(eb, &block->key, 0);
+	else
+		btrfs_node_key_to_cpu(eb, &block->key, 0);
+	free_extent_buffer(eb);
+	block->key_ready = 1;
+	return 0;
+}
+
+static int reada_tree_block(struct reloc_control *rc,
+			    struct tree_block *block)
+{
+	BUG_ON(block->key_ready);
+	readahead_tree_block(rc->extent_root, block->bytenr,
+			     block->key.objectid, block->key.offset);
+	return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct backref_node *node,
+				struct btrfs_key *key,
+				struct btrfs_path *path)
+{
+	struct btrfs_root *root;
+	int ret;
+
+	root = select_one_root(trans, node);
+	if (unlikely(!root)) {
+		rc->found_old_snapshot = 1;
+		update_processed_blocks(rc, node);
+		return 0;
+	}
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		ret = do_relocation(trans, node, key, path, 1);
+		if (ret < 0)
+			goto out;
+		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+			ret = replace_file_extents(trans, rc, root,
+						   node->eb, NULL);
+			if (ret < 0)
+				goto out;
+		}
+		drop_node_buffer(node);
+	} else if (!root->ref_cows) {
+		path->lowest_level = node->level;
+		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+		btrfs_release_path(root, path);
+		if (ret < 0)
+			goto out;
+	} else if (root != node->root) {
+		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+	}
+
+	update_processed_blocks(rc, node);
+	ret = 0;
+out:
+	drop_node_buffer(node);
+	return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc, struct rb_root *blocks)
+{
+	struct backref_cache *cache;
+	struct backref_node *node;
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	int level = -1;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	backref_cache_init(cache);
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (level == -1)
+			level = block->level;
+		else
+			BUG_ON(level != block->level);
+		if (!block->key_ready)
+			reada_tree_block(rc, block);
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (!block->key_ready)
+			get_tree_block_key(rc, block);
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+
+		node = build_backref_tree(rc, cache, &block->key,
+					  block->level, block->bytenr);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			goto out;
+		}
+
+		ret = relocate_tree_block(trans, rc, node, &block->key,
+					  path);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		remove_backref_node(cache, node);
+		rb_node = rb_next(rb_node);
+	}
+
+	if (level > 0)
+		goto out;
+
+	free_block_list(blocks);
+
+	/*
+	 * now backrefs of some upper level tree blocks have been cached,
+	 * try relocating blocks referenced by these upper level blocks.
+	 */
+	while (1) {
+		struct backref_node *upper = NULL;
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing)
+			break;
+
+		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
+					  &upper);
+		if (ret < 0)
+			err = ret;
+		if (ret != 0)
+			break;
+
+		rb_node = rb_first(blocks);
+		while (rb_node) {
+			block = rb_entry(rb_node, struct tree_block, rb_node);
+			if (trans->transaction->in_commit ||
+			    trans->transaction->delayed_refs.flushing)
+				goto out;
+			BUG_ON(!block->key_ready);
+			node = build_backref_tree(rc, cache, &block->key,
+						  level, block->bytenr);
+			if (IS_ERR(node)) {
+				err = PTR_ERR(node);
+				goto out;
+			}
+
+			ret = relocate_tree_block(trans, rc, node,
+						  &block->key, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			remove_backref_node(cache, node);
+			rb_node = rb_next(rb_node);
+		}
+		free_block_list(blocks);
+
+		if (upper) {
+			ret = link_to_upper(trans, upper, path);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			remove_backref_node(cache, upper);
+		}
+	}
+out:
+	free_block_list(blocks);
+
+	ret = finish_pending_nodes(trans, cache, path);
+	if (ret < 0)
+		err = ret;
+
+	kfree(cache);
+	btrfs_free_path(path);
+	return err;
+}
+
+static noinline_for_stack
+int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+{
+	u64 page_start;
+	u64 page_end;
+	unsigned long i;
+	unsigned long first_index;
+	unsigned long last_index;
+	unsigned int total_read = 0;
+	unsigned int total_dirty = 0;
+	struct page *page;
+	struct file_ra_state *ra;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int ret = 0;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	mutex_lock(&inode->i_mutex);
+	first_index = start >> PAGE_CACHE_SHIFT;
+	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+	/* make sure the dirty trick played by the caller work */
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    first_index, last_index);
+	if (ret)
+		goto out_unlock;
+
+	file_ra_state_init(ra, inode->i_mapping);
+
+	for (i = first_index ; i <= last_index; i++) {
+		if (total_read % ra->ra_pages == 0) {
+			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+				min(last_index, ra->ra_pages + i - 1));
+		}
+		total_read++;
+again:
+		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+			BUG_ON(1);
+		page = grab_cache_page(inode->i_mapping, i);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				goto out_unlock;
+			}
+		}
+		wait_on_page_writeback(page);
+
+		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
+		if (i == first_index)
+			set_extent_bits(io_tree, page_start, page_end,
+					EXTENT_BOUNDARY, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+		set_page_dirty(page);
+		total_dirty++;
+
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	kfree(ra);
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+	return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
+	u64 end = start + extent_key->offset - 1;
+
+	em = alloc_extent_map(GFP_NOFS);
+	em->start = start;
+	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
+	em->block_start = extent_key->objectid;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	/* setup extent map to cheat btrfs_readpage */
+	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	while (1) {
+		int ret;
+		spin_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
+			break;
+		}
+		btrfs_drop_extent_cache(inode, start, end, 0);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+
+	return relocate_inode_pages(inode, start, extent_key->offset);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int get_ref_objectid_v0(struct reloc_control *rc,
+			       struct btrfs_path *path,
+			       struct btrfs_key *extent_key,
+			       u64 *ref_objectid, int *path_change)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref_v0 *ref0;
+	int ret;
+	int slot;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	while (1) {
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0)
+				return ret;
+			BUG_ON(ret > 0);
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			if (path_change)
+				*path_change = 1;
+		}
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != extent_key->objectid)
+			return -ENOENT;
+
+		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
+			slot++;
+			continue;
+		}
+		ref0 = btrfs_item_ptr(leaf, slot,
+				struct btrfs_extent_ref_v0);
+		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
+		break;
+	}
+	return 0;
+}
+#endif
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+			  struct btrfs_key *extent_key,
+			  struct btrfs_path *path,
+			  struct rb_root *blocks)
+{
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	u32 item_size;
+	int level = -1;
+	int generation;
+
+	eb =  path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+		ei = btrfs_item_ptr(eb, path->slots[0],
+				struct btrfs_extent_item);
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		generation = btrfs_extent_generation(eb, ei);
+		level = btrfs_tree_block_level(eb, bi);
+	} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		u64 ref_owner;
+		int ret;
+
+		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		ret = get_ref_objectid_v0(rc, path, extent_key,
+					  &ref_owner, NULL);
+		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
+		level = (int)ref_owner;
+		/* FIXME: get real generation */
+		generation = 0;
+#else
+		BUG();
+#endif
+	}
+
+	btrfs_release_path(rc->extent_root, path);
+
+	BUG_ON(level == -1);
+
+	block = kmalloc(sizeof(*block), GFP_NOFS);
+	if (!block)
+		return -ENOMEM;
+
+	block->bytenr = extent_key->objectid;
+	block->key.objectid = extent_key->offset;
+	block->key.offset = generation;
+	block->level = level;
+	block->key_ready = 0;
+
+	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+	BUG_ON(rb_node);
+
+	return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+			    u64 bytenr, u32 blocksize,
+			    struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	if (tree_block_processed(bytenr, blocksize, rc))
+		return 0;
+
+	if (tree_search(blocks, bytenr))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = blocksize;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret);
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+	ret = add_tree_block(rc, &key, path, blocks);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to check if the block use full backrefs for pointers in it
+ */
+static int block_use_full_backref(struct reloc_control *rc,
+				  struct extent_buffer *eb)
+{
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	u64 flags;
+	int ret;
+
+	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
+	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = eb->start;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = eb->len;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root,
+				&key, path, 0, 0);
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_extent_item);
+	flags = btrfs_extent_flags(path->nodes[0], ei);
+	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+		ret = 1;
+	else
+		ret = 0;
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
+ * this function scans fs tree to find blocks reference the data extent
+ */
+static int find_data_references(struct reloc_control *rc,
+				struct btrfs_key *extent_key,
+				struct extent_buffer *leaf,
+				struct btrfs_extent_data_ref *ref,
+				struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct btrfs_root *root;
+	struct btrfs_file_extent_item *fi;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	u64 ref_root;
+	u64 ref_objectid;
+	u64 ref_offset;
+	u32 ref_count;
+	u32 nritems;
+	int err = 0;
+	int added = 0;
+	int counted;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ref_root = btrfs_extent_data_ref_root(leaf, ref);
+	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
+	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
+	ref_count = btrfs_extent_data_ref_count(leaf, ref);
+
+	root = read_fs_root(rc->extent_root->fs_info, ref_root);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = ref_objectid;
+	key.offset = ref_offset;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	/*
+	 * the references in tree blocks that use full backrefs
+	 * are not counted in
+	 */
+	if (block_use_full_backref(rc, leaf))
+		counted = 0;
+	else
+		counted = 1;
+	rb_node = tree_search(blocks, leaf->start);
+	if (rb_node) {
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+	}
+
+	while (ref_count > 0) {
+		while (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			if (ret > 0) {
+				WARN_ON(1);
+				goto out;
+			}
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			added = 0;
+
+			if (block_use_full_backref(rc, leaf))
+				counted = 0;
+			else
+				counted = 1;
+			rb_node = tree_search(blocks, leaf->start);
+			if (rb_node) {
+				if (counted)
+					added = 1;
+				else
+					path->slots[0] = nritems;
+			}
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ref_objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY) {
+			WARN_ON(1);
+			break;
+		}
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+		    extent_key->objectid)
+			goto next;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		if (key.offset != ref_offset)
+			goto next;
+
+		if (counted)
+			ref_count--;
+		if (added)
+			goto next;
+
+		if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+			block = kmalloc(sizeof(*block), GFP_NOFS);
+			if (!block) {
+				err = -ENOMEM;
+				break;
+			}
+			block->bytenr = leaf->start;
+			btrfs_item_key_to_cpu(leaf, &block->key, 0);
+			block->level = 0;
+			block->key_ready = 1;
+			rb_node = tree_insert(blocks, block->bytenr,
+					      &block->rb_node);
+			BUG_ON(rb_node);
+		}
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+next:
+		path->slots[0]++;
+
+	}
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * hepler to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+			struct btrfs_key *extent_key,
+			struct btrfs_path *path,
+			struct rb_root *blocks)
+{
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_extent_inline_ref *iref;
+	unsigned long ptr;
+	unsigned long end;
+	u32 blocksize;
+	int ret;
+	int err = 0;
+
+	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
+			       extent_key->offset);
+	BUG_ON(ret < 0);
+	if (ret > 0) {
+		/* the relocated data is fragmented */
+		rc->extents_skipped++;
+		btrfs_release_path(rc->extent_root, path);
+		return 0;
+	}
+
+	blocksize = btrfs_level_size(rc->extent_root, 0);
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
+		ptr = end;
+	else
+#endif
+		ptr += sizeof(struct btrfs_extent_item);
+
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		key.type = btrfs_extent_inline_ref_type(eb, iref);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			BUG();
+		}
+		ptr += btrfs_extent_inline_ref_size(key.type);
+	}
+	WARN_ON(ptr > end);
+
+	while (1) {
+		cond_resched();
+		eb = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			if (ret > 0)
+				break;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+		if (key.objectid != extent_key->objectid)
+			break;
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+#else
+		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+#endif
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = btrfs_item_ptr(eb, path->slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			ret = 0;
+		}
+		if (ret) {
+			err = ret;
+			break;
+		}
+		path->slots[0]++;
+	}
+	btrfs_release_path(rc->extent_root, path);
+	if (err)
+		free_block_list(blocks);
+	return err;
+}
+
+/*
+ * hepler to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct btrfs_trans_handle *trans,
+		     struct reloc_control *rc, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u64 start, end, last;
+	int ret;
+
+	last = rc->block_group->key.objectid + rc->block_group->key.offset;
+	while (1) {
+		cond_resched();
+		if (rc->search_start >= last) {
+			ret = 1;
+			break;
+		}
+
+		key.objectid = rc->search_start;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = 0;
+
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+					0, 0);
+		if (ret < 0)
+			break;
+next:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret != 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid >= last) {
+			ret = 1;
+			break;
+		}
+
+		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+		    key.objectid + key.offset <= rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		ret = find_first_extent_bit(&rc->processed_blocks,
+					    key.objectid, &start, &end,
+					    EXTENT_DIRTY);
+
+		if (ret == 0 && start <= key.objectid) {
+			btrfs_release_path(rc->extent_root, path);
+			rc->search_start = end + 1;
+		} else {
+			rc->search_start = key.objectid + key.offset;
+			return 0;
+		}
+	}
+	btrfs_release_path(rc->extent_root, path);
+	return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+	mutex_lock(&fs_info->trans_mutex);
+	fs_info->reloc_ctl = rc;
+	mutex_unlock(&fs_info->trans_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+	mutex_lock(&fs_info->trans_mutex);
+	fs_info->reloc_ctl = NULL;
+	mutex_unlock(&fs_info->trans_mutex);
+}
+
+static int check_extent_flags(u64 flags)
+{
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+		return 1;
+	return 0;
+}
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+	struct rb_root blocks = RB_ROOT;
+	struct btrfs_key key;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	unsigned long nr;
+	u64 flags;
+	u32 item_size;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	rc->search_start = rc->block_group->key.objectid;
+	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+			  GFP_NOFS);
+
+	rc->create_reloc_root = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	while (1) {
+		trans = btrfs_start_transaction(rc->extent_root, 1);
+
+		ret = find_next_extent(trans, rc, path);
+		if (ret < 0)
+			err = ret;
+		if (ret != 0)
+			break;
+
+		rc->extents_found++;
+
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_extent_item);
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		item_size = btrfs_item_size_nr(path->nodes[0],
+					       path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			flags = btrfs_extent_flags(path->nodes[0], ei);
+			ret = check_extent_flags(flags);
+			BUG_ON(ret);
+
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			u64 ref_owner;
+			int path_change = 0;
+
+			BUG_ON(item_size !=
+			       sizeof(struct btrfs_extent_item_v0));
+			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
+						  &path_change);
+			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
+				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+			else
+				flags = BTRFS_EXTENT_FLAG_DATA;
+
+			if (path_change) {
+				btrfs_release_path(rc->extent_root, path);
+
+				path->search_commit_root = 1;
+				path->skip_locking = 1;
+				ret = btrfs_search_slot(NULL, rc->extent_root,
+							&key, path, 0, 0);
+				if (ret < 0) {
+					err = ret;
+					break;
+				}
+				BUG_ON(ret > 0);
+			}
+#else
+			BUG();
+#endif
+		}
+
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			ret = add_tree_block(rc, &key, path, &blocks);
+		} else if (rc->stage == UPDATE_DATA_PTRS &&
+			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			ret = add_data_references(rc, &key, path, &blocks);
+		} else {
+			btrfs_release_path(rc->extent_root, path);
+			ret = 0;
+		}
+		if (ret < 0) {
+			err = 0;
+			break;
+		}
+
+		if (!RB_EMPTY_ROOT(&blocks)) {
+			ret = relocate_tree_blocks(trans, rc, &blocks);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+		}
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction_throttle(trans, rc->extent_root);
+		trans = NULL;
+		btrfs_btree_balance_dirty(rc->extent_root, nr);
+
+		if (rc->stage == MOVE_DATA_EXTENTS &&
+		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			rc->found_file_extent = 1;
+			ret = relocate_data_extent(rc->data_inode, &key);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+		}
+	}
+	btrfs_free_path(path);
+
+	if (trans) {
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, rc->extent_root);
+		btrfs_btree_balance_dirty(rc->extent_root, nr);
+	}
+
+	rc->create_reloc_root = 0;
+	smp_mb();
+
+	if (rc->extents_found > 0) {
+		trans = btrfs_start_transaction(rc->extent_root, 1);
+		btrfs_commit_transaction(trans, rc->extent_root);
+	}
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	/* get rid of pinned extents */
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 u64 objectid, u64 size)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_item *item;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+	btrfs_set_inode_generation(leaf, item, 1);
+	btrfs_set_inode_size(leaf, item, size);
+	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *group)
+{
+	struct inode *inode = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	unsigned long nr;
+	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	int err = 0;
+
+	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+	if (err)
+		goto out;
+
+	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+	BUG_ON(err);
+
+	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
+	BUG_ON(err);
+
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
+	BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+	err = btrfs_orphan_add(trans, inode);
+out:
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+
+	btrfs_btree_balance_dirty(root, nr);
+	if (err) {
+		if (inode)
+			iput(inode);
+		inode = ERR_PTR(err);
+	}
+	return inode;
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+{
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	struct reloc_control *rc;
+	int ret;
+	int err = 0;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+
+	mapping_tree_init(&rc->reloc_root_tree);
+	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+	INIT_LIST_HEAD(&rc->reloc_roots);
+
+	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
+	BUG_ON(!rc->block_group);
+
+	btrfs_init_workers(&rc->workers, "relocate",
+			   fs_info->thread_pool_size);
+
+	rc->extent_root = extent_root;
+	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+
+	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+	if (IS_ERR(rc->data_inode)) {
+		err = PTR_ERR(rc->data_inode);
+		rc->data_inode = NULL;
+		goto out;
+	}
+
+	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
+	       (unsigned long long)rc->block_group->key.objectid,
+	       (unsigned long long)rc->block_group->flags);
+
+	btrfs_start_delalloc_inodes(fs_info->tree_root);
+	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+
+	while (1) {
+		mutex_lock(&fs_info->cleaner_mutex);
+		btrfs_clean_old_snapshots(fs_info->tree_root);
+		mutex_unlock(&fs_info->cleaner_mutex);
+
+		rc->extents_found = 0;
+		rc->extents_skipped = 0;
+
+		ret = relocate_block_group(rc);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		if (rc->extents_found == 0)
+			break;
+
+		printk(KERN_INFO "btrfs: found %llu extents\n",
+			(unsigned long long)rc->extents_found);
+
+		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+			invalidate_mapping_pages(rc->data_inode->i_mapping,
+						 0, -1);
+			rc->stage = UPDATE_DATA_PTRS;
+		} else if (rc->stage == UPDATE_DATA_PTRS &&
+			   rc->extents_skipped >= rc->extents_found) {
+			iput(rc->data_inode);
+			rc->data_inode = create_reloc_inode(fs_info,
+							    rc->block_group);
+			if (IS_ERR(rc->data_inode)) {
+				err = PTR_ERR(rc->data_inode);
+				rc->data_inode = NULL;
+				break;
+			}
+			rc->stage = MOVE_DATA_EXTENTS;
+			rc->found_file_extent = 0;
+		}
+	}
+
+	filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
+				 rc->block_group->key.objectid,
+				 rc->block_group->key.objectid +
+				 rc->block_group->key.offset - 1);
+
+	WARN_ON(rc->block_group->pinned > 0);
+	WARN_ON(rc->block_group->reserved > 0);
+	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+out:
+	iput(rc->data_inode);
+	btrfs_stop_workers(&rc->workers);
+	btrfs_put_block_group(rc->block_group);
+	kfree(rc);
+	return err;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_root *root)
+{
+	LIST_HEAD(reloc_roots);
+	struct btrfs_key key;
+	struct btrfs_root *fs_root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct reloc_control *rc = NULL;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+					path, 0, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(root->fs_info->tree_root, path);
+
+		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+		    key.type != BTRFS_ROOT_ITEM_KEY)
+			break;
+
+		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+		if (IS_ERR(reloc_root)) {
+			err = PTR_ERR(reloc_root);
+			goto out;
+		}
+
+		list_add(&reloc_root->root_list, &reloc_roots);
+
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			fs_root = read_fs_root(root->fs_info,
+					       reloc_root->root_key.offset);
+			if (IS_ERR(fs_root)) {
+				err = PTR_ERR(fs_root);
+				goto out;
+			}
+		}
+
+		if (key.offset == 0)
+			break;
+
+		key.offset--;
+	}
+	btrfs_release_path(root->fs_info->tree_root, path);
+
+	if (list_empty(&reloc_roots))
+		goto out;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	mapping_tree_init(&rc->reloc_root_tree);
+	INIT_LIST_HEAD(&rc->reloc_roots);
+	btrfs_init_workers(&rc->workers, "relocate",
+			   root->fs_info->thread_pool_size);
+	rc->extent_root = root->fs_info->extent_root;
+
+	set_reloc_control(rc);
+
+	while (!list_empty(&reloc_roots)) {
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del(&reloc_root->root_list);
+
+		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+			list_add_tail(&reloc_root->root_list,
+				      &rc->reloc_roots);
+			continue;
+		}
+
+		fs_root = read_fs_root(root->fs_info,
+				       reloc_root->root_key.offset);
+		BUG_ON(IS_ERR(fs_root));
+
+		__add_reloc_root(reloc_root);
+		fs_root->reloc_root = reloc_root;
+	}
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+out:
+	if (rc) {
+		btrfs_stop_workers(&rc->workers);
+		kfree(rc);
+	}
+	while (!list_empty(&reloc_roots)) {
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del(&reloc_root->root_list);
+		free_extent_buffer(reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		kfree(reloc_root);
+	}
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		/* cleanup orphan inode in data relocation tree */
+		fs_root = read_fs_root(root->fs_info,
+				       BTRFS_DATA_RELOC_TREE_OBJECTID);
+		if (IS_ERR(fs_root))
+			err = PTR_ERR(fs_root);
+	}
+	return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	size_t offset;
+	int ret;
+	u64 disk_bytenr;
+	LIST_HEAD(list);
+
+	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+				       disk_bytenr + len - 1, &list);
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del_init(&sums->list);
+
+		sector_sum = sums->sums;
+		sums->bytenr = ordered->start;
+
+		offset = 0;
+		while (offset < sums->len) {
+			sector_sum->bytenr += ordered->start - disk_bytenr;
+			sector_sum++;
+			offset += root->sectorsize;
+		}
+
+		btrfs_add_ordered_sum(inode, ordered, sums);
+	}
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650d..0ddc6d6 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -111,6 +111,15 @@ out:
 	return ret;
 }
 
+int btrfs_set_root_node(struct btrfs_root_item *item,
+			struct extent_buffer *node)
+{
+	btrfs_set_root_bytenr(item, node->start);
+	btrfs_set_root_level(item, btrfs_header_level(node));
+	btrfs_set_root_generation(item, btrfs_header_generation(node));
+	return 0;
+}
+
 /*
  * copy the data in 'item' into the btree
  */
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
  * offset lower than the latest root.  They need to be queued for deletion to
  * finish what was happening when we crashed.
  */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
-			  struct btrfs_root *latest)
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_root *dead_root;
 	struct btrfs_item *item;
@@ -227,10 +235,7 @@ again:
 			goto err;
 		}
 
-		if (objectid == BTRFS_TREE_RELOC_OBJECTID)
-			ret = btrfs_add_dead_reloc_root(dead_root);
-		else
-			ret = btrfs_add_dead_root(dead_root, latest);
+		ret = btrfs_add_dead_root(dead_root);
 		if (ret)
 			goto err;
 		goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2..e9ef8c3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,7 +52,6 @@
 #include "export.h"
 #include "compression.h"
 
-
 static struct super_operations btrfs_super_ops;
 
 static void btrfs_put_super(struct super_block *sb)
@@ -322,7 +321,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	struct dentry *root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
-	struct btrfs_inode *bi;
+	struct btrfs_key key;
 	int err;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -341,23 +340,15 @@ static int btrfs_fill_super(struct super_block *sb,
 	}
 	sb->s_fs_info = tree_root;
 	disk_super = &tree_root->fs_info->super_copy;
-	inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
-				  tree_root->fs_info->fs_root);
-	bi = BTRFS_I(inode);
-	bi->location.objectid = inode->i_ino;
-	bi->location.offset = 0;
-	bi->root = tree_root->fs_info->fs_root;
-
-	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
-	if (!inode) {
-		err = -ENOMEM;
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
 		goto fail_close;
 	}
-	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
-	}
 
 	root_dentry = d_alloc_root(inode);
 	if (!root_dentry) {
@@ -584,7 +575,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
-		ret = btrfs_cleanup_reloc_trees(root);
+		/* recover relocation */
+		ret = btrfs_recover_relocation(root);
 		WARN_ON(ret);
 
 		ret = btrfs_cleanup_fs_roots(root->fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b1436..2e177d7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
@@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root)
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
+static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
-	u64 running_trans_id = root->fs_info->running_transaction->transid;
-	if (root->ref_cows && root->last_trans < running_trans_id) {
+	if (root->ref_cows && root->last_trans < trans->transid) {
 		WARN_ON(root == root->fs_info->extent_root);
-		if (root->root_item.refs != 0) {
-			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-				   (unsigned long)root->root_key.objectid,
-				   BTRFS_ROOT_TRANS_TAG);
-
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);
-			dirty->latest_root = root;
-			INIT_LIST_HEAD(&dirty->list);
-
-			root->commit_root = btrfs_root_node(root);
-
-			memcpy(dirty->root, root, sizeof(*root));
-			spin_lock_init(&dirty->root->node_lock);
-			spin_lock_init(&dirty->root->list_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-			mutex_init(&dirty->root->log_mutex);
-			INIT_LIST_HEAD(&dirty->root->dead_list);
-			dirty->root->node = root->commit_root;
-			dirty->root->commit_root = NULL;
+		WARN_ON(root->root_item.refs == 0);
+		WARN_ON(root->commit_root != root->node);
+
+		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+			   (unsigned long)root->root_key.objectid,
+			   BTRFS_ROOT_TRANS_TAG);
+		root->last_trans = trans->transid;
+		btrfs_init_reloc_root(trans, root);
+	}
+	return 0;
+}
 
-			spin_lock(&root->list_lock);
-			list_add(&dirty->root->dead_list, &root->dead_list);
-			spin_unlock(&root->list_lock);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (!root->ref_cows)
+		return 0;
 
-			root->dirty_root = dirty;
-		} else {
-			WARN_ON(1);
-		}
-		root->last_trans = running_trans_id;
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (root->last_trans == trans->transid) {
+		mutex_unlock(&root->fs_info->trans_mutex);
+		return 0;
 	}
+
+	record_root_in_trans(trans, root);
+	mutex_unlock(&root->fs_info->trans_mutex);
 	return 0;
 }
 
@@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
-	btrfs_record_root_in_trans(root);
 	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
@@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->delayed_ref_updates = 0;
 
 	root->fs_info->running_transaction->use_count++;
+	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
 }
@@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+#if 0
 /*
  * rate limit against the drop_snapshot code.  This helps to slow down new
  * operations if the drop_snapshot code isn't able to keep up.
@@ -273,6 +265,7 @@ harder:
 			goto harder;
 	}
 }
+#endif
 
 void btrfs_throttle(struct btrfs_root *root)
 {
@@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root)
 	if (!root->fs_info->open_ioctl_trans)
 		wait_current_trans(root);
 	mutex_unlock(&root->fs_info->trans_mutex);
-	throttle_on_drops(root);
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (throttle)
-		throttle_on_drops(root);
-
 	return 0;
 }
 
@@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
 			break;
-		btrfs_set_root_bytenr(&root->root_item,
-				       root->node->start);
-		btrfs_set_root_level(&root->root_item,
-				     btrfs_header_level(root->node));
-		btrfs_set_root_generation(&root->root_item, trans->transid);
 
+		btrfs_set_root_node(&root->root_item, root->node);
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
@@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 		BUG_ON(ret);
 	}
+	free_extent_buffer(root->commit_root);
+	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
 
 /*
  * update all the cowonly tree roots on disk
  */
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
@@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
  * a dirty root struct and adds it into the list of dead roots that need to
  * be deleted
  */
-int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
+int btrfs_add_dead_root(struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
-
-	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-	if (!dirty)
-		return -ENOMEM;
-	dirty->root = root;
-	dirty->latest_root = latest;
-
 	mutex_lock(&root->fs_info->trans_mutex);
-	list_add(&dirty->list, &latest->fs_info->dead_roots);
+	list_add(&root->root_list, &root->fs_info->dead_roots);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return 0;
 }
 
 /*
- * at transaction commit time we need to schedule the old roots for
- * deletion via btrfs_drop_snapshot.  This runs through all the
- * reference counted roots that were modified in the current
- * transaction and puts them into the drop list
+ * update all the cowonly tree roots on disk
  */
-static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
-				    struct radix_tree_root *radix,
-				    struct list_head *list)
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
 	struct btrfs_root *gang[8];
-	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int i;
 	int ret;
 	int err = 0;
-	u32 refs;
 
 	while (1) {
-		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
+		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+						 (void **)gang, 0,
 						 ARRAY_SIZE(gang),
 						 BTRFS_ROOT_TRANS_TAG);
 		if (ret == 0)
 			break;
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
-			radix_tree_tag_clear(radix,
-				     (unsigned long)root->root_key.objectid,
-				     BTRFS_ROOT_TRANS_TAG);
-
-			BUG_ON(!root->ref_tree);
-			dirty = root->dirty_root;
+			radix_tree_tag_clear(&fs_info->fs_roots_radix,
+					(unsigned long)root->root_key.objectid,
+					BTRFS_ROOT_TRANS_TAG);
 
 			btrfs_free_log(trans, root);
-			btrfs_free_reloc_root(trans, root);
-
-			if (root->commit_root == root->node) {
-				WARN_ON(root->node->start !=
-					btrfs_root_bytenr(&root->root_item));
-
-				free_extent_buffer(root->commit_root);
-				root->commit_root = NULL;
-				root->dirty_root = NULL;
-
-				spin_lock(&root->list_lock);
-				list_del_init(&dirty->root->dead_list);
-				spin_unlock(&root->list_lock);
+			btrfs_update_reloc_root(trans, root);
 
-				kfree(dirty->root);
-				kfree(dirty);
-
-				/* make sure to update the root on disk
-				 * so we get any updates to the block used
-				 * counts
-				 */
-				err = btrfs_update_root(trans,
-						root->fs_info->tree_root,
-						&root->root_key,
-						&root->root_item);
+			if (root->commit_root == root->node)
 				continue;
-			}
 
-			memset(&root->root_item.drop_progress, 0,
-			       sizeof(struct btrfs_disk_key));
-			root->root_item.drop_level = 0;
-			root->commit_root = NULL;
-			root->dirty_root = NULL;
-			root->root_key.offset = root->fs_info->generation;
-			btrfs_set_root_bytenr(&root->root_item,
-					      root->node->start);
-			btrfs_set_root_level(&root->root_item,
-					     btrfs_header_level(root->node));
-			btrfs_set_root_generation(&root->root_item,
-						  root->root_key.offset);
-
-			err = btrfs_insert_root(trans, root->fs_info->tree_root,
+			free_extent_buffer(root->commit_root);
+			root->commit_root = btrfs_root_node(root);
+
+			btrfs_set_root_node(&root->root_item, root->node);
+			err = btrfs_update_root(trans, fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
 			if (err)
 				break;
-
-			refs = btrfs_root_refs(&dirty->root->root_item);
-			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
-			err = btrfs_update_root(trans, root->fs_info->tree_root,
-						&dirty->root->root_key,
-						&dirty->root->root_item);
-
-			BUG_ON(err);
-			if (refs == 1) {
-				list_add(&dirty->list, list);
-			} else {
-				WARN_ON(1);
-				free_extent_buffer(dirty->root->node);
-				kfree(dirty->root);
-				kfree(dirty);
-			}
 		}
 	}
 	return err;
@@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
 				TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&info->trans_mutex);
 
-		atomic_dec(&info->throttles);
-		wake_up(&info->transaction_throttle);
-
 		schedule();
 
-		atomic_inc(&info->throttles);
 		mutex_lock(&info->trans_mutex);
 		finish_wait(&info->transaction_wait, &wait);
 	}
@@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
  * all of them
  */
-static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
-				     struct list_head *list)
+int btrfs_drop_dead_root(struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	unsigned long nr;
-	u64 num_bytes;
-	u64 bytes_used;
-	u64 max_useless;
-	int ret = 0;
-	int err;
-
-	while (!list_empty(list)) {
-		struct btrfs_root *root;
-
-		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
-		list_del_init(&dirty->list);
-
-		num_bytes = btrfs_root_used(&dirty->root->root_item);
-		root = dirty->latest_root;
-		atomic_inc(&root->fs_info->throttles);
-
-		while (1) {
-			/*
-			 * we don't want to jump in and create a bunch of
-			 * delayed refs if the transaction is starting to close
-			 */
-			wait_transaction_pre_flush(tree_root->fs_info);
-			trans = btrfs_start_transaction(tree_root, 1);
-
-			/*
-			 * we've joined a transaction, make sure it isn't
-			 * closing right now
-			 */
-			if (trans->transaction->delayed_refs.flushing) {
-				btrfs_end_transaction(trans, tree_root);
-				continue;
-			}
-
-			mutex_lock(&root->fs_info->drop_mutex);
-			ret = btrfs_drop_snapshot(trans, dirty->root);
-			if (ret != -EAGAIN)
-				break;
-			mutex_unlock(&root->fs_info->drop_mutex);
+	int ret;
 
-			err = btrfs_update_root(trans,
-					tree_root,
-					&dirty->root->root_key,
-					&dirty->root->root_item);
-			if (err)
-				ret = err;
-			nr = trans->blocks_used;
-			ret = btrfs_end_transaction(trans, tree_root);
-			BUG_ON(ret);
+	while (1) {
+		/*
+		 * we don't want to jump in and create a bunch of
+		 * delayed refs if the transaction is starting to close
+		 */
+		wait_transaction_pre_flush(tree_root->fs_info);
+		trans = btrfs_start_transaction(tree_root, 1);
 
-			btrfs_btree_balance_dirty(tree_root, nr);
-			cond_resched();
+		/*
+		 * we've joined a transaction, make sure it isn't
+		 * closing right now
+		 */
+		if (trans->transaction->delayed_refs.flushing) {
+			btrfs_end_transaction(trans, tree_root);
+			continue;
 		}
-		BUG_ON(ret);
-		atomic_dec(&root->fs_info->throttles);
-		wake_up(&root->fs_info->transaction_throttle);
 
-		num_bytes -= btrfs_root_used(&dirty->root->root_item);
-		bytes_used = btrfs_root_used(&root->root_item);
-		if (num_bytes) {
-			mutex_lock(&root->fs_info->trans_mutex);
-			btrfs_record_root_in_trans(root);
-			mutex_unlock(&root->fs_info->trans_mutex);
-			btrfs_set_root_used(&root->root_item,
-					    bytes_used - num_bytes);
-		}
+		ret = btrfs_drop_snapshot(trans, root);
+		if (ret != -EAGAIN)
+			break;
 
-		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
-		if (ret) {
-			BUG();
+		ret = btrfs_update_root(trans, tree_root,
+					&root->root_key,
+					&root->root_item);
+		if (ret)
 			break;
-		}
-		mutex_unlock(&root->fs_info->drop_mutex);
-
-		spin_lock(&root->list_lock);
-		list_del_init(&dirty->root->dead_list);
-		if (!list_empty(&root->dead_list)) {
-			struct btrfs_root *oldest;
-			oldest = list_entry(root->dead_list.prev,
-					    struct btrfs_root, dead_list);
-			max_useless = oldest->root_key.offset - 1;
-		} else {
-			max_useless = root->root_key.offset - 1;
-		}
-		spin_unlock(&root->list_lock);
 
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
-		BUG_ON(ret);
-
-		free_extent_buffer(dirty->root->node);
-		kfree(dirty->root);
-		kfree(dirty);
-
 		btrfs_btree_balance_dirty(tree_root, nr);
 		cond_resched();
 	}
+	BUG_ON(ret);
+
+	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	BUG_ON(ret);
+
+	nr = trans->blocks_used;
+	ret = btrfs_end_transaction(trans, tree_root);
+	BUG_ON(ret);
+
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root);
+
+	btrfs_btree_balance_dirty(tree_root, nr);
 	return ret;
 }
 
@@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
-	btrfs_record_root_in_trans(root);
+	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
-	key.offset = trans->transid;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
 	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	btrfs_set_lock_blocking(old);
 
 	btrfs_copy_root(trans, root, old, &tmp, objectid);
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 
-	btrfs_set_root_bytenr(new_root_item, tmp->start);
-	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
-	btrfs_set_root_generation(new_root_item, trans->transid);
+	btrfs_set_root_node(new_root_item, tmp);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
 	btrfs_tree_unlock(tmp);
@@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void update_super_roots(struct btrfs_root *root)
+{
+	struct btrfs_root_item *root_item;
+	struct btrfs_super_block *super;
+
+	super = &root->fs_info->super_copy;
+
+	root_item = &root->fs_info->chunk_root->root_item;
+	super->chunk_root = root_item->bytenr;
+	super->chunk_root_generation = root_item->generation;
+	super->chunk_root_level = root_item->level;
+
+	root_item = &root->fs_info->tree_root->root_item;
+	super->root = root_item->bytenr;
+	super->generation = root_item->generation;
+	super->root_level = root_item->level;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
-	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
-	struct list_head dirty_fs_roots;
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
@@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	INIT_LIST_HEAD(&dirty_fs_roots);
 	if (cur_trans->in_commit) {
 		cur_trans->use_count++;
 		mutex_unlock(&root->fs_info->trans_mutex);
@@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 * with the tree-log code.
 	 */
 	mutex_lock(&root->fs_info->tree_log_mutex);
-	/*
-	 * keep tree reloc code from adding new reloc trees
-	 */
-	mutex_lock(&root->fs_info->tree_reloc_mutex);
-
 
-	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
-			      &dirty_fs_roots);
+	ret = commit_fs_roots(trans, root);
 	BUG_ON(ret);
 
-	/* add_dirty_roots gets rid of all the tree log roots, it is now
+	/* commit_fs_roots gets rid of all the tree log roots, it is now
 	 * safe to free the root of tree log roots
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
-	ret = btrfs_commit_tree_roots(trans, root);
+	ret = commit_cowonly_roots(trans, root);
 	BUG_ON(ret);
 
 	cur_trans = root->fs_info->running_transaction;
 	spin_lock(&root->fs_info->new_trans_lock);
 	root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->new_trans_lock);
-	btrfs_set_super_generation(&root->fs_info->super_copy,
-				   cur_trans->transid);
-	btrfs_set_super_root(&root->fs_info->super_copy,
-			     root->fs_info->tree_root->node->start);
-	btrfs_set_super_root_level(&root->fs_info->super_copy,
-			   btrfs_header_level(root->fs_info->tree_root->node));
-
-	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
-				   chunk_root->node->start);
-	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
-					 btrfs_header_level(chunk_root->node));
-	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
-				btrfs_header_generation(chunk_root->node));
+
+	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
+			    root->fs_info->tree_root->node);
+	free_extent_buffer(root->fs_info->tree_root->commit_root);
+	root->fs_info->tree_root->commit_root =
+				btrfs_root_node(root->fs_info->tree_root);
+
+	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
+			    root->fs_info->chunk_root->node);
+	free_extent_buffer(root->fs_info->chunk_root->commit_root);
+	root->fs_info->chunk_root->commit_root =
+				btrfs_root_node(root->fs_info->chunk_root);
+
+	update_super_roots(root);
 
 	if (!root->fs_info->log_root_recovering) {
 		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
@@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->blocked = 0;
 
-	wake_up(&root->fs_info->transaction_throttle);
 	wake_up(&root->fs_info->transaction_wait);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
@@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	kfree(pinned_copy);
 
-	btrfs_drop_dead_reloc_roots(root);
-	mutex_unlock(&root->fs_info->tree_reloc_mutex);
-
 	/* do the directory inserts of any pending snapshot creations */
 	finish_pending_snapshots(trans, root->fs_info);
 
@@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
-	if (root->fs_info->closing)
-		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
-
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-
-	if (root->fs_info->closing)
-		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
 	return ret;
 }
 
@@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  */
 int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
-	struct list_head dirty_roots;
-	INIT_LIST_HEAD(&dirty_roots);
-again:
-	mutex_lock(&root->fs_info->trans_mutex);
-	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	LIST_HEAD(list);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->trans_mutex);
+	list_splice_init(&fs_info->dead_roots, &list);
+	mutex_unlock(&fs_info->trans_mutex);
 
-	if (!list_empty(&dirty_roots)) {
-		drop_dirty_roots(root, &dirty_roots);
-		goto again;
+	while (!list_empty(&list)) {
+		root = list_entry(list.next, struct btrfs_root, root_list);
+		list_del_init(&root->root_list);
+		btrfs_drop_dead_root(root);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde..961c3ee 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot {
 	struct list_head list;
 };
 
-struct btrfs_dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 					       struct inode *inode)
 {
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
-int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
+int btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_drop_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
-int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212..2b41fc0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ no_copy:
 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 					     u64 objectid)
 {
+	struct btrfs_key key;
 	struct inode *inode;
-	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
-	if (inode->i_state & I_NEW) {
-		BTRFS_I(inode)->root = root;
-		BTRFS_I(inode)->location.objectid = objectid;
-		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-		BTRFS_I(inode)->location.offset = 0;
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
 
-	}
-	if (is_bad_inode(inode)) {
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	if (IS_ERR(inode)) {
+		inode = NULL;
+	} else if (is_bad_inode(inode)) {
 		iput(inode);
 		inode = NULL;
 	}
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		u64 offset;
 		unsigned long dest_offset;
 		struct btrfs_key ins;
 
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 		ins.type = BTRFS_EXTENT_ITEM_KEY;
+		offset = key->offset - btrfs_file_extent_offset(eb, item);
 
 		if (ins.objectid > 0) {
 			u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			if (ret == 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 						ins.objectid, ins.offset,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid);
+						0, root->root_key.objectid,
+						key->objectid, offset);
 			} else {
 				/*
 				 * insert the extent pointer in the extent
 				 * allocation tree
 				 */
-				ret = btrfs_alloc_logged_extent(trans, root,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						&ins);
+				ret = btrfs_alloc_logged_file_extent(trans,
+						root, root->root_key.objectid,
+						key->objectid, offset, &ins);
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
-				ret = btrfs_drop_leaf_ref(trans, root, next);
-				BUG_ON(ret);
-
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
-		if (*level == 0) {
-			ret = btrfs_drop_leaf_ref(trans, root, next);
-			BUG_ON(ret);
-		}
 		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
 		BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
-				if (*level == 0) {
-					ret = btrfs_drop_leaf_ref(trans, root,
-								  next);
-					BUG_ON(ret);
-				}
-
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_reserved_extent(root,
 						path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
-			if (orig_level == 0) {
-				ret = btrfs_drop_leaf_ref(trans, log,
-							  next);
-				BUG_ON(ret);
-			}
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
 			ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
-	btrfs_set_root_bytenr(&log->root_item, log->node->start);
-	btrfs_set_root_generation(&log->root_item, trans->transid);
-	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+	btrfs_set_root_node(&log->root_item, log->node);
 
 	root->log_batch = 0;
 	root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				       ins_keys, ins_sizes, nr);
 	BUG_ON(ret);
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
 						   dst_path->slots[0]);
 
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 			found_type = btrfs_file_extent_type(src, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG ||
 			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
-				u64 ds = btrfs_file_extent_disk_bytenr(src,
-								   extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-								      extent);
-				u64 cs = btrfs_file_extent_offset(src, extent);
-				u64 cl = btrfs_file_extent_num_bytes(src,
-								     extent);;
+				u64 ds, dl, cs, cl;
+				ds = btrfs_file_extent_disk_bytenr(src,
+								extent);
+				/* ds == 0 is a hole */
+				if (ds == 0)
+					continue;
+
+				dl = btrfs_file_extent_disk_num_bytes(src,
+								extent);
+				cs = btrfs_file_extent_offset(src, extent);
+				cl = btrfs_file_extent_num_bytes(src,
+								extent);;
 				if (btrfs_file_extent_compression(src,
 								  extent)) {
 					cs = 0;
 					cl = dl;
 				}
-				/* ds == 0 is a hole */
-				if (ds != 0) {
-					ret = btrfs_inc_extent_ref(trans, log,
-						   ds, dl,
-						   dst_path->nodes[0]->start,
-						   BTRFS_TREE_LOG_OBJECTID,
-						   trans->transid,
-						   ins_keys[i].objectid);
-					BUG_ON(ret);
-					ret = btrfs_lookup_csums_range(
-						   log->fs_info->csum_root,
-						   ds + cs, ds + cs + cl - 1,
-						   &ordered_sums);
-					BUG_ON(ret);
-				}
+
+				ret = btrfs_lookup_csums_range(
+						log->fs_info->csum_root,
+						ds + cs, ds + cs + cl - 1,
+						&ordered_sums);
+				BUG_ON(ret);
 			}
 		}
-		dst_path->slots[0]++;
 	}
 
 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
@@ -3029,9 +3001,7 @@ again:
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
-		mutex_lock(&fs_info->trans_mutex);
-		btrfs_record_root_in_trans(wc.replay_dest);
-		mutex_unlock(&fs_info->trans_mutex);
+		btrfs_record_root_in_trans(trans, wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0..8bc6a88 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1671,8 +1671,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk(KERN_INFO "btrfs relocating chunk %llu\n",
-	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
-- 
cgit v0.10.2


From b36124210248706186a02093427bdff4b3f548e8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 13 May 2009 19:12:15 -0400
Subject: Btrfs: stop avoiding balancing at the end of the transaction.

When the delayed reference code was added, some checks were added
to avoid extra balancing while the delayed references were being flushed.
This made for less efficient btrees, but it reduced the chances of
loops where no forward progress was made because the balances made
more delayed ref updates.

With the new dead root removal code and the mixed back references,
the extent allocation tree is no longer using precise back refs, and
the delayed reference updates don't carry the risk of looping forever
anymore.  So, the balance avoidance is no longer required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2b96027..2f633e7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1052,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (trans->transaction->delayed_refs.flushing &&
-	    btrfs_header_nritems(mid) > 2)
+	if (btrfs_header_nritems(mid) > 2)
 		return 0;
 
 	if (btrfs_header_nritems(mid) < 2)
@@ -2194,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
-	} else if (!trans->transaction->delayed_refs.flushing) {
+	} else {
 		ret = push_nodes_for_insert(trans, root, path, level);
 		c = path->nodes[level];
 		if (!ret && btrfs_header_nritems(c) <
@@ -2869,8 +2868,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
-	    !trans->transaction->delayed_refs.flushing) {
+	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0)
 			return wret;
@@ -3809,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
-		    !trans->transaction->delayed_refs.flushing) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
cgit v0.10.2


From cfbb9308463f6dad1334884db046ccf0f1a77918 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: Btrfs: balance btree more often

With the new back reference code, the cost of a balance has gone down
in terms of the number of back reference updates done.  This commit
makes us more aggressively balance leaves and nodes as they become
less full.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2f633e7..60a45f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1651,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		}
 		b = p->nodes[level];
 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
-		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
 		sret = reada_for_balance(root, p, level);
@@ -3807,7 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
cgit v0.10.2


From 2c943de6ad795a174dcc424c293bb77f15ae3b8c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: Btrfs: reduce mount -o ssd CPU usage

The block allocator in SSD mode will try to find groups of free blocks
that are close together.  This commit makes it loop less on a given
group size before bumping it.

The end result is that we are less likely to fill small holes in the
available free space, but we don't waste as much CPU building the
large cluster used by ssd mode.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0bc9365..28016558 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -652,7 +652,7 @@ again:
 			last = entry;
 			max_extent = 0;
 			total_retries++;
-			if (total_retries % 256 == 0) {
+			if (total_retries % 64 == 0) {
 				if (min_bytes >= (bytes + empty_size)) {
 					ret = -ENOSPC;
 					goto out;
-- 
cgit v0.10.2


From 585ad2c3797dcaa643aeba75b9f072778adf3490 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: Btrfs: fix metadata dirty throttling limits

Once a metadata block has been written, it must be recowed, so the
btrfs dirty balancing call has a check to make sure a fair amount of metadata
was actually dirty before it started writing it back to disk.

A previous commit had changed the dirty tracking for metadata without
updating the btrfs dirty balancing checks.  This commit switches it
to use the correct counter.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f5c6e3..e572cf4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2378,17 +2378,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-	struct extent_io_tree *tree;
 	u64 num_dirty;
-	u64 start = 0;
 	unsigned long thresh = 32 * 1024 * 1024;
-	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
 
-	num_dirty = count_range_bits(tree, &start, (u64)-1,
-				     thresh, EXTENT_DIRTY);
+	num_dirty = root->fs_info->dirty_metadata_bytes;
+
 	if (num_dirty > thresh) {
 		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
-- 
cgit v0.10.2


From d84275c938e1a5e2dc5b89eb9b878e0ddb2c55e0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 15:39:08 -0400
Subject: Btrfs: don't allow WRITE_SYNC bios to starve out regular writes

Btrfs uses dedicated threads to submit bios when checksumming is on,
which allows us to make sure the threads dedicated to checksumming don't get
stuck waiting for requests.  For each btrfs device, there are
two lists of bios.  One list is for WRITE_SYNC bios and the other
is for regular priority bios.

The IO submission threads used to process all of the WRITE_SYNC bios first and
then switch to the regular bios.  This commit makes sure we don't completely
starve the regular bios by rotating between the two lists.

WRITE_SYNC bios are still favored 2:1 over the regular bios, and this tries
to run in batches to avoid seeking.  Benchmarking shows this eliminates
stalls during streaming buffered writes on both multi-device and
single device filesystems.

If the regular bios starve, the system can end up with a large amount of ram
pinned down in writeback pages.  If we are a little more fair between the two
classes, we're able to keep throughput up and make progress on the bulk of
our dirty ram.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bc6a88..9d36181 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -163,6 +163,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
+	int force_reg = 0;
 
 	bdi = blk_get_backing_dev_info(device->bdev);
 	fs_info = device->dev_root->fs_info;
@@ -176,19 +177,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 
 loop:
 	spin_lock(&device->io_lock);
-	num_run = 0;
 
 loop_lock:
+	num_run = 0;
 
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	if (device->pending_sync_bios.head)
+	if (!force_reg && device->pending_sync_bios.head) {
 		pending_bios = &device->pending_sync_bios;
-	else
+		force_reg = 1;
+	} else {
 		pending_bios = &device->pending_bios;
+		force_reg = 0;
+	}
 
 	pending = pending_bios->head;
 	tail = pending_bios->tail;
@@ -228,10 +232,14 @@ loop_lock:
 	while (pending) {
 
 		rmb();
-		if (pending_bios != &device->pending_sync_bios &&
-		    device->pending_sync_bios.head &&
-		    num_run > 16) {
-			cond_resched();
+		/* we want to work on both lists, but do more bios on the
+		 * sync list than the regular list
+		 */
+		if ((num_run > 32 &&
+		    pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head) ||
+		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+		    device->pending_bios.head)) {
 			spin_lock(&device->io_lock);
 			requeue_list(pending_bios, pending, tail);
 			goto loop_lock;
-- 
cgit v0.10.2


From d644d8a1e30b88a93bcfb63cada2ae628462ddba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 15:59:22 -0400
Subject: Btrfs: avoid IO stalls behind congested devices in a multi-device FS

The btrfs IO submission threads try to service a bunch of devices with a small
number of threads.  They do a congestion check to try and avoid waiting
on requests for a busy device.

The checks make sure we've sent a few requests down to a given device just so
that we aren't bouncing between busy devices without actually sending down
any IO.  The counter used to decide if we can switch to the next device
is somewhat overloaded.  It is also being used to decide if we've done
a good batch of requests between the WRITE_SYNC or regular priority lists.
It may get reset to zero often, leaving us hammering on a busy device
instead of moving on to another disk.

This commit adds a new counter for the number of bios sent while
servicing a device.  It doesn't get reset or fiddled with.  On
multi-device filesystems, this fixes IO stalls in streaming
write workloads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d36181..27d5f37 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -161,6 +161,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	int again = 0;
 	unsigned long num_run;
 	unsigned long num_sync_run;
+	unsigned long batch_run = 0;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 	int force_reg = 0;
@@ -257,6 +258,8 @@ loop_lock:
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
+		batch_run++;
+
 		if (bio_sync(cur))
 			num_sync_run++;
 
@@ -273,7 +276,7 @@ loop_lock:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
+		if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct io_context *ioc;
 
-- 
cgit v0.10.2


From 3b30c22f64a6bb297719c60e494af1d26563f584 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 16:42:22 -0400
Subject: Btrfs: Add mount -o nossd

This allows you to turn off the ssd mode via remount.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e9ef8c3..22855a1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_notreelog,
-	Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl,  Opt_compress,
+	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +83,7 @@ static match_table_t tokens = {
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
+	{Opt_nossd, "nossd"},
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
@@ -173,6 +174,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
 			break;
+		case Opt_nossd:
+			printk(KERN_INFO "btrfs: not using ssd allocation scheme\n");
+			btrfs_clear_opt(info->mount_opt, SSD);
+			break;
 		case Opt_nobarrier:
 			printk(KERN_INFO "btrfs: turning off barriers\n");
 			btrfs_set_opt(info->mount_opt, NOBARRIER);
-- 
cgit v0.10.2


From c604480171c510c1beeb81b82418e5bc4de8f1ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 18:35:15 -0400
Subject: Btrfs: avoid allocation clusters that are too spread out

In SSD mode for data, and all the time for metadata the allocator
will try to find a cluster of nearby blocks for allocations.  This
commit adds extra checks to make sure that each free block in the
cluster is close to the last one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 28016558..ac23476 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -645,7 +645,8 @@ again:
 		 * we haven't filled the empty size and the window is
 		 * very large.  reset and try again
 		 */
-		if (next->offset - window_start > (bytes + empty_size) * 2) {
+		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
+		    next->offset - window_start > (bytes + empty_size) * 2) {
 			entry = next;
 			window_start = entry->offset;
 			window_free = entry->bytes;
-- 
cgit v0.10.2


From 451d7585a8bb1b9bec0d676ce3dece1923164e55 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 20:28:34 -0400
Subject: Btrfs: add mount -o ssd_spread to spread allocations out

Some SSDs perform best when reusing block numbers often, while
others perform much better when clustering strictly allocates
big chunks of unused space.

The default mount -o ssd will find rough groupings of blocks
where there are a bunch of free blocks that might have some
allocated blocks mixed in.

mount -o ssd_spread will make sure there are no allocated blocks
mixed in.  It should perform better on lower end SSDs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ce3ab4e..b9d8788 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1100,6 +1100,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
 #define BTRFS_MOUNT_NOTREELOG           (1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
+#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a42419c..3355d7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3607,7 +3607,7 @@ refill_cluster:
 			last_ptr_loop = 0;
 
 			/* allocate a cluster in this block group */
-			ret = btrfs_find_space_cluster(trans,
+			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
 					       offset, num_bytes,
 					       empty_cluster + empty_size);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ac23476..4538e48 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -579,6 +579,7 @@ out:
  * it returns -enospc
  */
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size)
@@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	int ret;
 
 	/* for metadata, allow allocates with more holes */
-	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+	if (btrfs_test_opt(root, SSD_SPREAD)) {
+		min_bytes = bytes + empty_size;
+	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		/*
 		 * we want to do larger allocations when we are
 		 * flushing out the delayed refs, it helps prevent
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ab0bdc0..266fb87 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 22855a1..7f5b288 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl,  Opt_compress,
-	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
+	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +83,7 @@ static match_table_t tokens = {
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
+	{Opt_ssd_spread, "ssd_spread"},
 	{Opt_nossd, "nossd"},
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
@@ -174,9 +175,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
 			break;
+		case Opt_ssd_spread:
+			printk(KERN_INFO "btrfs: use spread ssd "
+			       "allocation scheme\n");
+			btrfs_set_opt(info->mount_opt, SSD);
+			btrfs_set_opt(info->mount_opt, SSD_SPREAD);
+			break;
 		case Opt_nossd:
-			printk(KERN_INFO "btrfs: not using ssd allocation scheme\n");
+			printk(KERN_INFO "btrfs: not using ssd allocation "
+			       "scheme\n");
 			btrfs_clear_opt(info->mount_opt, SSD);
+			btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
 			break;
 		case Opt_nobarrier:
 			printk(KERN_INFO "btrfs: turning off barriers\n");
@@ -429,7 +438,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
 	if (btrfs_test_opt(root, COMPRESS))
 		seq_puts(seq, ",compress");
-	if (btrfs_test_opt(root, SSD))
+	if (btrfs_test_opt(root, SSD_SPREAD))
+		seq_puts(seq, ",ssd_spread");
+	else if (btrfs_test_opt(root, SSD))
 		seq_puts(seq, ",ssd");
 	if (btrfs_test_opt(root, NOTREELOG))
 		seq_puts(seq, ",notreelog");
-- 
cgit v0.10.2


From c289811cc096c57ff35550ee8132793a4f9b5b59 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Jun 2009 09:51:32 -0400
Subject: Btrfs: autodetect SSD devices

During mount, btrfs will check the queue nonrot flag
for all the devices found in the FS.  If they are all
non-rotating, SSD mode is enabled by default.

If the FS was mounted with -o nossd, the non-rotating
flag is ignored.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9d8788..5fa7d7d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1101,6 +1101,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOTREELOG           (1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
+#define BTRFS_MOUNT_NOSSD		(1 << 9)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e572cf4..f4dfbb7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1850,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
+	if (!btrfs_test_opt(tree_root, SSD) &&
+	    !btrfs_test_opt(tree_root, NOSSD) &&
+	    !fs_info->fs_devices->rotating) {
+		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+		       "mode\n");
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+	}
+
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
@@ -1893,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+
 	return tree_root;
 
 fail_trans_kthread:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7f5b288..3427db2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -184,6 +184,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_nossd:
 			printk(KERN_INFO "btrfs: not using ssd allocation "
 			       "scheme\n");
+			btrfs_set_opt(info->mount_opt, NOSSD);
 			btrfs_clear_opt(info->mount_opt, SSD);
 			btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
 			break;
@@ -438,6 +439,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
 	if (btrfs_test_opt(root, COMPRESS))
 		seq_puts(seq, ",compress");
+	if (btrfs_test_opt(root, NOSSD))
+		seq_puts(seq, ",nossd");
 	if (btrfs_test_opt(root, SSD_SPREAD))
 		seq_puts(seq, ",ssd_spread");
 	else if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 27d5f37..3f4a593 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -605,6 +605,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		device->in_fs_metadata = 0;
 		device->mode = flags;
 
+		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+			fs_devices->rotating = 1;
+
 		fs_devices->open_devices++;
 		if (device->writeable) {
 			fs_devices->rw_devices++;
@@ -1473,6 +1476,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->rw_devices++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
+	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+		root->fs_info->fs_devices->rotating = 1;
+
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d..3c1f731 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -107,6 +107,11 @@ struct btrfs_fs_devices {
 	int seeding;
 
 	int opened;
+
+	/* set when we find or add a device that doesn't have the
+	 * nonrot flag set
+	 */
+	int rotating;
 };
 
 struct btrfs_bio_stripe {
-- 
cgit v0.10.2


From 6cbff00f4632c8060b06bfc9585805217f11e12e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2009 10:37:41 +0200
Subject: Btrfs: implement FS_IOC_GETFLAGS/SETFLAGS/GETVERSION

Add support for the standard attributes set via chattr and read via
lsattr.  Currently we store the attributes in the flags value in
the btrfs inode, but I wonder whether we should split it into two so
that we don't have to keep converting between the two formats.

Remove the btrfs_clear_flag/btrfs_set_flag/btrfs_test_flag macros
as they were confusing the existing code and got in the way of the
new additions.

Also add the FS_IOC_GETVERSION ioctl for getting i_generation as it's
trivial.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ecf5f7d..acb4f35 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
-
 #endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627..de1e2fd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode,
 	u32 csum;
 	u32 *cb_sum = &cb->sums;
 
-	if (btrfs_test_flag(inode, NODATASUM))
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
 	for (i = 0; i < cb->nr_pages; i++) {
@@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			if (!btrfs_test_flag(inode, NODATASUM)) {
+			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 				btrfs_lookup_bio_sums(root, inode, comp_bio,
 						      sums);
 			}
@@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_flag(inode, NODATASUM))
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5fa7d7d..4d6e0b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1115,12 +1115,14 @@ struct btrfs_root {
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define BTRFS_INODE_PREALLOC		(1 << 4)
-#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
-					 ~BTRFS_INODE_##flag)
-#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
-					 BTRFS_INODE_##flag)
-#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
-					 BTRFS_INODE_##flag)
+#define BTRFS_INODE_SYNC		(1 << 5)
+#define BTRFS_INODE_IMMUTABLE		(1 << 6)
+#define BTRFS_INODE_APPEND		(1 << 7)
+#define BTRFS_INODE_NODUMP		(1 << 8)
+#define BTRFS_INODE_NOATIME		(1 << 9)
+#define BTRFS_INODE_DIRSYNC		(1 << 10)
+
+
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -2260,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void btrfs_update_iflags(struct inode *inode);
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 917bf10..5b68330 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -368,7 +368,7 @@ again:
 	 * inode has not been flagged as nocompress.  This flag can
 	 * change at any time if we discover bad compression ratios.
 	 */
-	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -469,7 +469,7 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		btrfs_set_flag(inode, NOCOMPRESS);
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -862,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		async_cow->locked_page = locked_page;
 		async_cow->start = start;
 
-		if (btrfs_test_flag(inode, NOCOMPRESS))
+		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
 			cur_end = end;
 		else
 			cur_end = min(end, start + 512 * 1024 - 1);
@@ -1133,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
-	if (btrfs_test_flag(inode, NODATACOW))
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 1, nr_written);
-	else if (btrfs_test_flag(inode, PREALLOC))
+	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
 	else if (!btrfs_test_opt(root, COMPRESS))
@@ -1290,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 
-	skip_sum = btrfs_test_flag(inode, NODATASUM);
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
@@ -1790,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ClearPageChecked(page);
 		goto good;
 	}
-	if (btrfs_test_flag(inode, NODATASUM))
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
@@ -2156,6 +2157,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 	}
+
+	btrfs_update_iflags(inode);
 	return;
 
 make_bad:
@@ -3586,9 +3589,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			btrfs_find_block_group(root, 0, alloc_hint, owner);
 	if ((mode & S_IFREG)) {
 		if (btrfs_test_opt(root, NODATASUM))
-			btrfs_set_flag(inode, NODATASUM);
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
 		if (btrfs_test_opt(root, NODATACOW))
-			btrfs_set_flag(inode, NODATACOW);
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 	}
 
 	key[0].objectid = objectid;
@@ -3642,6 +3645,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	location->offset = 0;
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
+	btrfs_inherit_iflags(inode, dir);
+
 	insert_inode_hash(inode);
 	inode_tree_add(inode);
 	return inode;
@@ -5075,7 +5080,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 out:
 	if (cur_offset > start) {
 		inode->i_ctime = CURRENT_TIME;
-		btrfs_set_flag(inode, PREALLOC);
+		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    cur_offset > i_size_read(inode))
 			btrfs_i_size_write(inode, cur_offset);
@@ -5196,7 +5201,7 @@ static int btrfs_set_page_dirty(struct page *page)
 
 static int btrfs_permission(struct inode *inode, int mask)
 {
-	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
+	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
 	return generic_permission(inode, mask, btrfs_check_acl);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 54dfd45..926332a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,7 +50,172 @@
 #include "volumes.h"
 #include "locking.h"
 
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & ~FS_DIRSYNC_FL;
+	else
+		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ */
+static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+{
+	unsigned int iflags = 0;
+
+	if (flags & BTRFS_INODE_SYNC)
+		iflags |= FS_SYNC_FL;
+	if (flags & BTRFS_INODE_IMMUTABLE)
+		iflags |= FS_IMMUTABLE_FL;
+	if (flags & BTRFS_INODE_APPEND)
+		iflags |= FS_APPEND_FL;
+	if (flags & BTRFS_INODE_NODUMP)
+		iflags |= FS_NODUMP_FL;
+	if (flags & BTRFS_INODE_NOATIME)
+		iflags |= FS_NOATIME_FL;
+	if (flags & BTRFS_INODE_DIRSYNC)
+		iflags |= FS_DIRSYNC_FL;
+
+	return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_update_iflags(struct inode *inode)
+{
+	struct btrfs_inode *ip = BTRFS_I(inode);
+
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+
+	if (ip->flags & BTRFS_INODE_SYNC)
+		inode->i_flags |= S_SYNC;
+	if (ip->flags & BTRFS_INODE_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	if (ip->flags & BTRFS_INODE_APPEND)
+		inode->i_flags |= S_APPEND;
+	if (ip->flags & BTRFS_INODE_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	if (ip->flags & BTRFS_INODE_DIRSYNC)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Unlike extN we don't have any flags we don't want to inherit currently.
+ */
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+	unsigned int flags = BTRFS_I(dir)->flags;
+
+	if (S_ISREG(inode->i_mode))
+		flags &= ~BTRFS_INODE_DIRSYNC;
+	else if (!S_ISDIR(inode->i_mode))
+		flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+
+	BTRFS_I(inode)->flags = flags;
+	btrfs_update_iflags(inode);
+}
+
+static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
+{
+	struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
+
+static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_inode *ip = BTRFS_I(inode);
+	struct btrfs_root *root = ip->root;
+	struct btrfs_trans_handle *trans;
+	unsigned int flags, oldflags;
+	int ret;
+
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL | FS_DIRSYNC_FL))
+		return -EOPNOTSUPP;
 
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
+	mutex_lock(&inode->i_mutex);
+
+	flags = btrfs_mask_flags(inode->i_mode, flags);
+	oldflags = btrfs_flags_to_ioctl(ip->flags);
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+	}
+
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		goto out_unlock;
+
+	if (flags & FS_SYNC_FL)
+		ip->flags |= BTRFS_INODE_SYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_SYNC;
+	if (flags & FS_IMMUTABLE_FL)
+		ip->flags |= BTRFS_INODE_IMMUTABLE;
+	else
+		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
+	if (flags & FS_APPEND_FL)
+		ip->flags |= BTRFS_INODE_APPEND;
+	else
+		ip->flags &= ~BTRFS_INODE_APPEND;
+	if (flags & FS_NODUMP_FL)
+		ip->flags |= BTRFS_INODE_NODUMP;
+	else
+		ip->flags &= ~BTRFS_INODE_NODUMP;
+	if (flags & FS_NOATIME_FL)
+		ip->flags |= BTRFS_INODE_NOATIME;
+	else
+		ip->flags &= ~BTRFS_INODE_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		ip->flags |= BTRFS_INODE_DIRSYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_DIRSYNC;
+
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
+
+	btrfs_update_iflags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	btrfs_end_transaction(trans, root);
+
+	mnt_drop_write(file->f_path.mnt);
+ out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	return put_user(inode->i_generation, arg);
+}
 
 static noinline int create_subvol(struct btrfs_root *root,
 				  struct dentry *dentry,
@@ -1077,6 +1242,12 @@ long btrfs_ioctl(struct file *file, unsigned int
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return btrfs_ioctl_getflags(file, argp);
+	case FS_IOC_SETFLAGS:
+		return btrfs_ioctl_setflags(file, argp);
+	case FS_IOC_GETVERSION:
+		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
-- 
cgit v0.10.2


From 163e783e6a8b1e8bcb4c9084d438091386b589df Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 19 Apr 2009 13:02:41 +0100
Subject: Btrfs: remove crc32c.h and use libcrc32c directly.

There's no need to preserve this abstraction; it used to let us use
hardware crc32c support directly, but libcrc32c is already doing that for us
through the crypto API -- so we're already using the Intel crc32c
acceleration where appropriate.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de..0000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __BTRFS_CRC32C__
-#define __BTRFS_CRC32C__
-#include <linux/crc32c.h>
-
-/*
- * this file used to do more for selecting the HW version of crc32c,
- * perhaps it will one day again soon.
- */
-#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
-#endif
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f4dfbb7..6c54c21 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/crc32c.h>
 #include "compat.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -171,7 +171,7 @@ out:
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-	return btrfs_crc32c(seed, data, len);
+	return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3355d7e..33a65f2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,6 @@
 #include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -625,11 +624,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 2a020b2..db2ff97 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,9 @@
 #ifndef __HASH__
 #define __HASH__
 
-#include "crc32c.h"
+#include <linux/crc32c.h>
 static inline u64 btrfs_name_hash(const char *name, int len)
 {
-	return btrfs_crc32c((u32)~1, name, len);
+	return crc32c((u32)~1, name, len);
 }
 #endif
-- 
cgit v0.10.2


From 524724ed1f224875a117be593540591ed050c73d Mon Sep 17 00:00:00 2001
From: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Date: Wed, 10 Jun 2009 11:13:17 -0400
Subject: Btrfs: fdatasync should skip metadata writeout

In btrfs, fdatasync and fsync are identical, but
fdatasync should skip committing transaction when
inode->i_state is set just I_DIRTY_SYNC and this indicates
only atime or/and mtime updates.
Following patch improves fdatasync throughput.

--file-block-size=4K --file-total-size=16G --file-test-mode=rndwr
--file-fsync-mode=fdatasync run

Results:
-2.6.30-rc8
Test execution summary:
    total time:                          1980.6540s
    total number of events:              10001
    total time taken by event execution: 1192.9804
    per-request statistics:
         min:                            0.0000s
         avg:                            0.1193s
         max:                            15.3720s
         approx.  95 percentile:         0.7257s

Threads fairness:
    events (avg/stddev):           625.0625/151.32
    execution time (avg/stddev):   74.5613/9.46

-2.6.30-rc8-patched
Test execution summary:
    total time:                          1695.9118s
    total number of events:              10000
    total time taken by event execution: 871.3214
    per-request statistics:
         min:                            0.0000s
         avg:                            0.0871s
         max:                            10.4644s
         approx.  95 percentile:         0.4787s

Threads fairness:
    events (avg/stddev):           625.0000/131.86
    execution time (avg/stddev):   54.4576/8.98

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0726a73..126477e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1157,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	root->log_batch++;
 
+	if (datasync && !(inode->i_state & I_DIRTY_PAGES))
+		goto out;
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-- 
cgit v0.10.2


From 7df336ec1266dccbb253bac52c529d3dcc7c22d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 10 Jun 2009 11:36:43 -0400
Subject: Fix btrfs when ACLs are configured out

... otherwise generic_permission() will allow *anything* for all
files you don't own and that have some group permissions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000..6039725 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 	return 0;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask)
-{
-	return 0;
-}
-
 #endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4d6e0b6..03441a9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2301,7 +2301,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
+#ifdef CONFIG_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
+#else
+#define btrfs_check_acl NULL
+#endif
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-- 
cgit v0.10.2


From 58f7f68f228c3aba2ba4468d92e2cec35724ba2e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Jun 2009 09:57:55 -0400
Subject: cifs: add addr= mount option alias for ip=

When you look in /proc/mounts, the address of the server gets displayed
as "addr=". That's really a better option to use anyway since it's more
generic. What if we eventually want to support non-IP transports? It
also makes CIFS option consistent with the NFS option of the same name.

Begin the migration to that option name by adding an alias for ip=
called addr=.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 10151f8..6298dc3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -958,7 +958,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 				}
 				strcpy(vol->password, value);
 			}
-		} else if (strnicmp(data, "ip", 2) == 0) {
+		} else if (!strnicmp(data, "ip", 2) ||
+			   !strnicmp(data, "addr", 4)) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
 			} else if (strnlen(value, INET6_ADDRSTRLEN) <
-- 
cgit v0.10.2


From bc5c6c043d8381676339fb3da59cc4cc5921d368 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 10 Jun 2009 04:48:41 -0400
Subject: ftrace/documentation: fix typo in function grapher name

The function graph tracer is called just "function_graph" (no trailing
"_tracer" needed).

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
LKML-Reference: <1244623722-6325-1-git-send-email-vapier@gentoo.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e6..5ad2ded 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured.
 
 	Function call tracer to trace all kernel functions.
 
-  "function_graph_tracer"
+  "function_graph"
 
 	Similar to the function tracer except that the
 	function tracer probes the functions on their entry
-- 
cgit v0.10.2


From a41f20716975910d9beb90b7efc61107901492b8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 10 Jun 2009 14:22:55 -0400
Subject: ext4: Avoid corrupting the uninitialized bit in the extent during
 truncate

The unitialized bit was not properly getting preserved in in an extent
which is partially truncated because the it was geting set to the
value of the first extent to be removed or truncated as part of the
truncate operation, and if there are multiple extents are getting
removed or modified as part of the truncate operation, it is only the
last extent which will might be partially truncated, and its
uninitalized bit is not necessarily the same as the first extent to be
truncated.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c35a7b..2593f74 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2083,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
-	if (ext4_ext_is_uninitialized(ex))
-		uninitialized = 1;
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
+
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		else
+			uninitialized = 0;
+
 		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
 		path[depth].p_ext = ex;
 
-- 
cgit v0.10.2


From 6ff9a64d2aaa6eae396adc95e9c91c0cbfa6dbe4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 10 Jun 2009 14:28:34 -0400
Subject: tracing: do not translate event helper macros in print format

By moving the macro that creates the print format code above the
defining of the event macro helpers (__get_str, __print_symbolic,
and __get_dynamic_array), we get a little cleaner print format.

Instead of:

  (char *)((void *)REC + REC->__data_loc_name)

we get:

   __get_str(name)

Instead of:

   ({ static const struct trace_print_flags symbols[] = { { HI_SOFTIRQ, "HI" }, {

we get:

   __print_symbolic(REC->vec, { HI_SOFTIRQ, "HI" }, {

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 40ede4d..1867553 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -81,6 +81,87 @@
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
+			       "offset:%u;\tsize:%u;\n",		       \
+			       (unsigned int)offsetof(typeof(field),	       \
+					__data_loc_##item),		       \
+			       (unsigned int)sizeof(field.__data_loc_##item)); \
+	if (!ret)							       \
+		return 0;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef __entry
+#define __entry REC
+
+#undef __print_symbolic
+#undef __get_dynamic_array
+#undef __get_str
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field __attribute__((unused));		\
+	int ret = 0;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
  * Stage 3 of the trace events.
  *
  * Override the macros in <trace/trace_events.h> to include the following:
@@ -179,83 +260,6 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __dynamic_array
-#define __dynamic_array(type, item, len)				       \
-	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
-			       "offset:%u;\tsize:%u;\n",		       \
-			       (unsigned int)offsetof(typeof(field),	       \
-					__data_loc_##item),		       \
-			       (unsigned int)sizeof(field.__data_loc_##item)); \
-	if (!ret)							       \
-		return 0;
-
-#undef __string
-#define __string(item, src) __dynamic_array(char, item, -1)
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field __attribute__((unused));		\
-	int ret = 0;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
-
 #undef __field
 #define __field(type, item)						\
 	ret = trace_define_field(event_call, #type, #item,		\
-- 
cgit v0.10.2


From 61b6bc525a34931bb73e4c95bfe009cd9572a288 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Jun 2009 10:04:58 -0400
Subject: cifs: remove never-used in6_addr option

This option was never used to my knowledge. Remove it before someone
does...

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6298dc3..97f4311 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -62,7 +62,6 @@ struct smb_vol {
 	char *domainname;
 	char *UNC;
 	char *UNCip;
-	char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[16]; /* netbios name of client */
 	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -1320,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
-		} else if (strnicmp(data, "in6_addr", 8) == 0) {
-			if (!value || !*value) {
-				vol->in6_addr = NULL;
-			} else if (strnlen(value, 49) == 48) {
-				vol->in6_addr = value;
-			} else {
-				printk(KERN_WARNING "CIFS: ip v6 address not "
-						    "48 characters long\n");
-				return 1;
-			}
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
-- 
cgit v0.10.2


From e5e9a5206a171b2c467e494aebcdcf70c47289bc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Jun 2009 15:17:02 -0400
Subject: Btrfs: avoid races between super writeout and device list updates

On multi-device filesystems, btrfs writes supers to all of the devices
before considering a sync complete.  There wasn't any additional
locking between super writeout and the device list management code
because device management was done inside a transaction and
super writeout only happened  with no transation writers running.

With the btrfs fsync log and other async transaction updates, this
has been racey for some time.  This adds a mutex to protect
the device list.  The existing volume mutex could not be reused due to
transaction lock ordering requirements.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c54c21..b7ddc77 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device,
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *head = &root->fs_info->fs_devices->devices;
+	struct list_head *head;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
@@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	head = &root->fs_info->fs_devices->devices;
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
@@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		if (ret)
 			total_errors++;
 	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3f4a593..3ab80e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
+		mutex_init(&fs_devices->device_list_mutex);
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
@@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		INIT_LIST_HEAD(&device->dev_alloc_list);
+
+		mutex_lock(&fs_devices->device_list_mutex);
 		list_add(&device->dev_list, &fs_devices->devices);
+		mutex_unlock(&fs_devices->device_list_mutex);
+
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
@@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	INIT_LIST_HEAD(&fs_devices->devices);
 	INIT_LIST_HEAD(&fs_devices->alloc_list);
 	INIT_LIST_HEAD(&fs_devices->list);
+	mutex_init(&fs_devices->device_list_mutex);
 	fs_devices->latest_devid = orig->latest_devid;
 	fs_devices->latest_trans = orig->latest_trans;
 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 
+	mutex_lock(&orig->device_list_mutex);
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
@@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
+	mutex_unlock(&orig->device_list_mutex);
 	return fs_devices;
 error:
+	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
 	return ERR_PTR(-ENOMEM);
 }
@@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
 	mutex_lock(&uuid_mutex);
 again:
+	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
@@ -485,6 +495,7 @@ again:
 		kfree(device->name);
 		kfree(device);
 	}
+	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
@@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
+		mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
 			}
 		}
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 		bdev = NULL;
 		bh = NULL;
 		disk_super = NULL;
@@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_brelse;
 
 	device->in_fs_metadata = 0;
+
+	/*
+	 * the device list mutex makes sure that we don't change
+	 * the device list while someone else is writing out all
+	 * the device supers.
+	 */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_del_init(&device->dev_list);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	device->fs_devices->num_devices--;
 
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
@@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 	seed_devices->opened = 1;
 	INIT_LIST_HEAD(&seed_devices->devices);
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	mutex_init(&seed_devices->device_list_mutex);
 	list_splice_init(&fs_devices->devices, &seed_devices->devices);
 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
+	/*
+	 * we have the volume lock, so we don't need the extra
+	 * device list mutex while reading the list here.
+	 */
 	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
@@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->fs_devices = root->fs_info->fs_devices;
+
+	/*
+	 * we don't want write_supers to jump in here with our device
+	 * half setup
+	 */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
@@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes + 1);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	if (seeding_dev) {
 		ret = init_first_rw_device(trans, root, device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3c1f731..5139a83 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
 	u64 rw_devices;
 	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
-	/* all of the devices in the FS */
+
+	/* all of the devices in the FS, protected by a mutex
+	 * so we can safely walk it to write out the supers without
+	 * worrying about add/remove by the multi-device code
+	 */
+	struct mutex device_list_mutex;
 	struct list_head devices;
 
 	/* devices not currently being allocated */
-- 
cgit v0.10.2


From 4eedeb75e7f15ffdb12d1ad559b565e7505bdbaf Mon Sep 17 00:00:00 2001
From: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Date: Wed, 10 Jun 2009 15:28:55 -0400
Subject: Btrfs: pin buffers during write_dev_supers

write_dev_supers is called in sequence.  First is it called with wait == 0,
which starts IO on all of the super blocks for a given device.  Then it is
called with wait == 1 to make sure they all reach the disk.

It doesn't currently pin the buffers between the two calls, and it also
assumes the buffers won't go away between the two calls, leading to
an oops if the VM manages to free the buffers in the middle of the sync.

This fixes that assumption and updates the code to return an error if things
are not up to date when the wait == 1 run is done.

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b7ddc77..0d50d49 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2020,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	return latest;
 }
 
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1.  When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
 static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb,
 			    int do_barriers, int wait, int max_mirrors)
@@ -2055,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
 			BUG_ON(!bh);
-			brelse(bh);
 			wait_on_buffer(bh);
-			if (buffer_uptodate(bh)) {
-				brelse(bh);
-				continue;
-			}
+			if (!buffer_uptodate(bh))
+				errors++;
+
+			/* drop our reference */
+			brelse(bh);
+
+			/* drop the reference from the wait == 0 run */
+			brelse(bh);
+			continue;
 		} else {
 			btrfs_set_super_bytenr(sb, bytenr);
 
@@ -2071,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device,
 					      BTRFS_CSUM_SIZE);
 			btrfs_csum_final(crc, sb->csum);
 
+			/*
+			 * one reference for us, and we leave it for the
+			 * caller
+			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
-			set_buffer_uptodate(bh);
+			/* one reference for submit_bh */
 			get_bh(bh);
+
+			set_buffer_uptodate(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
@@ -2088,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device,
 				       device->name);
 				set_buffer_uptodate(bh);
 				device->barriers = 0;
+				/* one reference for submit_bh */
 				get_bh(bh);
 				lock_buffer(bh);
 				ret = submit_bh(WRITE_SYNC, bh);
@@ -2096,15 +2118,8 @@ static int write_dev_supers(struct btrfs_device *device,
 			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
-		if (!ret && wait) {
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				errors++;
-		} else if (ret) {
+		if (ret)
 			errors++;
-		}
-		if (wait)
-			brelse(bh);
 	}
 	return errors < i ? 0 : -1;
 }
-- 
cgit v0.10.2


From d9c7d394a8ebacb60097b192939ae9f15235225e Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@novell.com>
Date: Wed, 10 Jun 2009 12:57:06 -0700
Subject: block: prevent possible io_context->refcount overflow

Currently io_context has an atomic_t(32-bit) as refcount.  In the case of
cfq, for each device against whcih a task does I/O, a reference to the
io_context would be taken.  And when there are multiple process sharing
io_contexts(CLONE_IO) would also have a reference to the same io_context.

Theoretically the possible maximum number of processes sharing the same
io_context + the number of disks/cfq_data referring to the same io_context
can overflow the 32-bit counter on a very high-end machine.

Even though it is an improbable case, let us make it atomic_long_t.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..d4ed600 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -35,9 +35,9 @@ int put_io_context(struct io_context *ioc)
 	if (ioc == NULL)
 		return 1;
 
-	BUG_ON(atomic_read(&ioc->refcount) == 0);
+	BUG_ON(atomic_long_read(&ioc->refcount) == 0);
 
-	if (atomic_dec_and_test(&ioc->refcount)) {
+	if (atomic_long_dec_and_test(&ioc->refcount)) {
 		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
@@ -90,7 +90,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 
 	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
 	if (ret) {
-		atomic_set(&ret->refcount, 1);
+		atomic_long_set(&ret->refcount, 1);
 		atomic_set(&ret->nr_tasks, 1);
 		spin_lock_init(&ret->lock);
 		ret->ioprio_changed = 0;
@@ -151,7 +151,7 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node)
 		ret = current_io_context(gfp_flags, node);
 		if (unlikely(!ret))
 			break;
-	} while (!atomic_inc_not_zero(&ret->refcount));
+	} while (!atomic_long_inc_not_zero(&ret->refcount));
 
 	return ret;
 }
@@ -163,8 +163,8 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc)
 	struct io_context *dst = *pdst;
 
 	if (src) {
-		BUG_ON(atomic_read(&src->refcount) == 0);
-		atomic_inc(&src->refcount);
+		BUG_ON(atomic_long_read(&src->refcount) == 0);
+		atomic_long_inc(&src->refcount);
 		put_io_context(dst);
 		*pdst = src;
 	}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 99ac430..ef2f72d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1282,7 +1282,7 @@ static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	if (!cfqd->active_cic) {
 		struct cfq_io_context *cic = RQ_CIC(rq);
 
-		atomic_inc(&cic->ioc->refcount);
+		atomic_long_inc(&cic->ioc->refcount);
 		cfqd->active_cic = cic;
 	}
 }
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..dd05434 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -64,7 +64,7 @@ struct cfq_io_context {
  * and kmalloc'ed. These could be shared between processes.
  */
 struct io_context {
-	atomic_t refcount;
+	atomic_long_t refcount;
 	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
@@ -91,8 +91,8 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 	 * if ref count is zero, don't allow sharing (ioc is going away, it's
 	 * a race).
 	 */
-	if (ioc && atomic_inc_not_zero(&ioc->refcount)) {
-		atomic_inc(&ioc->nr_tasks);
+	if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) {
+		atomic_long_inc(&ioc->refcount);
 		return ioc;
 	}
 
-- 
cgit v0.10.2


From 0e0c62123b517d2b3c26922342c0cc5bb63a93f8 Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Wed, 10 Jun 2009 12:57:07 -0700
Subject: fs/bio.c: add missing __user annotation

As reported by sparse:

fs/bio.c:720:13: warning: incorrect type in assignment (different address spaces)
fs/bio.c:720:13:    expected char *iov_addr
fs/bio.c:720:13:    got void [noderef] <asn:1>*
fs/bio.c:724:36: warning: incorrect type in argument 2 (different address spaces)
fs/bio.c:724:36:    expected void const [noderef] <asn:1>*from
fs/bio.c:724:36:    got char *iov_addr

Signed-off-by: Michal Simek <monstr@monstr.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/fs/bio.c b/fs/bio.c
index ab423a1..533266a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -722,7 +722,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 
 		while (bv_len && iov_idx < iov_count) {
 			unsigned int bytes;
-			char *iov_addr;
+			char __user *iov_addr;
 
 			bytes = min_t(unsigned int,
 				      iov[iov_idx].iov_len - iov_off, bv_len);
-- 
cgit v0.10.2


From 1adbee50fd6fce5af4feb34d2db93cfe4d2066a4 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Wed, 10 Jun 2009 12:57:08 -0700
Subject: ramdisk: remove long-deprecated "ramdisk=" boot-time parameter

The "ramdisk" parameter was removed from the defunct rd.c file quite some
time ago, in favour of the more specific "ramdisk_size" parameter so, for
consistency, the same should be done here.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 5f7e64b..4bf8705 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -407,12 +407,7 @@ static int __init ramdisk_size(char *str)
 	rd_size = simple_strtol(str, NULL, 0);
 	return 1;
 }
-static int __init ramdisk_size2(char *str)
-{
-	return ramdisk_size(str);
-}
-__setup("ramdisk=", ramdisk_size);
-__setup("ramdisk_size=", ramdisk_size2);
+__setup("ramdisk_size=", ramdisk_size);
 #endif
 
 /*
-- 
cgit v0.10.2


From 5e50b9ef975219304cc91d601530994861585bfe Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Wed, 10 Jun 2009 12:57:09 -0700
Subject: floppy: fix hibernation

Based on Ingo Molnar's patch from 2006, this makes the floppy work after
resume from hibernation, at least on my machine.

This fix resets the floppy controller on resume.  It was experimentally
determined to bring the controller back to life - we don't really know why
it works.

floppy_init() does the same thing at boot/modprobe time.

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 90877fe..862b40c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4148,6 +4148,24 @@ static void floppy_device_release(struct device *dev)
 {
 }
 
+static int floppy_resume(struct platform_device *dev)
+{
+	int fdc;
+
+	for (fdc = 0; fdc < N_FDC; fdc++)
+		if (FDCS->address != -1)
+			user_reset_fdc(-1, FD_RESET_ALWAYS, 0);
+
+	return 0;
+}
+
+static struct platform_driver floppy_driver = {
+	.resume = floppy_resume,
+	.driver = {
+		.name = "floppy",
+	},
+};
+
 static struct platform_device floppy_device[N_DRIVE];
 
 static struct kobject *floppy_find(dev_t dev, int *part, void *data)
@@ -4196,10 +4214,14 @@ static int __init floppy_init(void)
 	if (err)
 		goto out_put_disk;
 
+	err = platform_driver_register(&floppy_driver);
+	if (err)
+		goto out_unreg_blkdev;
+
 	floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
 	if (!floppy_queue) {
 		err = -ENOMEM;
-		goto out_unreg_blkdev;
+		goto out_unreg_driver;
 	}
 	blk_queue_max_sectors(floppy_queue, 64);
 
@@ -4346,6 +4368,8 @@ out_flush_work:
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	blk_cleanup_queue(floppy_queue);
+out_unreg_driver:
+	platform_driver_unregister(&floppy_driver);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
@@ -4566,6 +4590,7 @@ static void __exit floppy_module_exit(void)
 
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
+	platform_driver_unregister(&floppy_driver);
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
 		del_timer_sync(&motor_off_timer[drive]);
-- 
cgit v0.10.2


From 0de51088e6a82bc8413d3ca9e28bbca2788b5b53 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Mon, 8 Jun 2009 18:27:54 +0800
Subject: CPUFREQ: Enable acpi-cpufreq driver for VIA/Centaur CPUs

The VIA/Centaur C7, C7-M and Nano CPU's all support ACPI based cpu p-states
using a MSR interface.  The Linux driver just never made use of it, since in
addition to the check for the EST flag it also checked if the vendor is Intel.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
[ Removed the vendor checks entirely  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 752e8c6..ae9b5032 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
 {
 	struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
 
-	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-	    !cpu_has(cpu, X86_FEATURE_EST))
-		return 0;
-
-	return 1;
+	return cpu_has(cpu, X86_FEATURE_EST);
 }
 
 static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-- 
cgit v0.10.2


From 0fea615e526b4b7eff0363ee02d5753e5f924089 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Mon, 8 Jun 2009 18:29:36 +0800
Subject: CPUFREQ: Mark e_powersaver driver as EXPERIMENTAL and DANGEROUS

The e_powersaver driver for VIA's C7 CPU's needs to be marked as
DANGEROUS as it configures the CPU to power states that are out
of specification.

According to Centaur, all systems with C7 and Nano CPU's support
the ACPI p-state method.  Thus, the acpi-cpufreq driver should
be used instead.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c8398..f138c6c 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
 	  If in doubt, say N.
 
 config X86_E_POWERSAVER
-	tristate "VIA C7 Enhanced PowerSaver"
+	tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
 	select CPU_FREQ_TABLE
-	depends on X86_32
+	depends on X86_32 && EXPERIMENTAL
 	help
-	  This adds the CPUFreq driver for VIA C7 processors.
+	  This adds the CPUFreq driver for VIA C7 processors.  However, this driver
+	  does not have any safeguards to prevent operating the CPU out of spec
+	  and is thus considered dangerous.  Please use the regular ACPI cpufreq
+	  driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
 
 	  If in doubt, say N.
 
-- 
cgit v0.10.2


From 511b01bdf64ad8a38414096eab283c7784aebfc4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 00:32:00 +0200
Subject: Revert "x86, bts: reenable ptrace branch trace support"

This reverts commit 7e0bfad24d85de7cf2202a7b0ce51de11a077b21.

A late objection to the ABI has arrived:

   http://lkml.org/lkml/2009/6/10/253

Keep the ABI disabled out of caution, to not create premature
user-space expectations.

While the hw-branch-tracing variant uses and tests the BTS code.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 924e156..8130334 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,6 +506,7 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
+	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
-- 
cgit v0.10.2


From fd0fb038d5a308c7faddd1701be5e70aaffec98b Mon Sep 17 00:00:00 2001
From: Shin Hong <hongshin@gmail.com>
Date: Wed, 10 Jun 2009 20:11:29 -0400
Subject: Btrfs: init worker struct fields before kthread-run

This patch fixes a bug which may result race condition
between btrfs_start_workers() and worker_loop().

btrfs_start_workers() executed in a parent thread writes
on workers->worker and worker_loop() in a child thread
reads workers->worker. However, there is no synchronization
enforcing the order of two operations.

This patch makes btrfs_start_workers() fill workers->worker
before it starts a child thread with worker_loop()

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d6..7f88628 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
+		worker->workers = workers;
 		worker->task = kthread_run(worker_loop, worker,
 					   "btrfs-%s-%d", workers->name,
 					   workers->num_workers + i);
-		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			kfree(worker);
 			ret = PTR_ERR(worker->task);
-- 
cgit v0.10.2


From 66fff22483d8542dfb4d61a28d21277bbde321e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 22:53:37 +0200
Subject: perf_counter: Annotate exit ctx recursion

Ever since Paul fixed it to unclone the context before taking the
ctx->lock this became a false positive, annotate it away.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 51c571e..ae591a1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3879,7 +3879,18 @@ void perf_counter_exit_task(struct task_struct *child)
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
-	mutex_lock(&child_ctx->mutex);
+	/*
+	 * We can recurse on the same lock type through:
+	 *
+	 *   __perf_counter_exit_task()
+	 *     sync_child_counter()
+	 *       fput(parent_counter->filp)
+	 *         perf_release()
+	 *           mutex_lock(&ctx->mutex)
+	 *
+	 * But since its the parent context it won't be the same instance.
+	 */
+	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
-- 
cgit v0.10.2


From ea1900e571d40a3ce60c835c2f21e1fd8c5cb663 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:45:22 +0200
Subject: perf_counter tools: Normalize data using per sample period data

When we use variable period sampling, add the period to the sample
data and use that to normalize the samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index c10553c..919f23c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -350,8 +350,9 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	struct perf_counter_attr *attr = attrs + counter;
 	int track = 1;
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 	if (freq) {
+		attr->sample_type	|= PERF_SAMPLE_PERIOD;
 		attr->freq		= 1;
 		attr->sample_freq	= freq;
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 9a0e31e..f57fd5c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -456,7 +456,7 @@ struct hist_entry {
 	uint64_t	 ip;
 	char		 level;
 
-	uint32_t	 count;
+	uint64_t	 count;
 };
 
 /*
@@ -726,7 +726,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
+		struct symbol *sym, uint64_t ip, char level, uint64_t count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -738,7 +738,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		.sym	= sym,
 		.ip	= ip,
 		.level	= level,
-		.count	= 1,
+		.count	= count,
 	};
 	int cmp;
 
@@ -749,7 +749,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			he->count++;
+			he->count += count;
 			return 0;
 		}
 
@@ -942,15 +942,19 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
 	uint64_t ip = event->ip.ip;
+	uint64_t period = 1;
 	struct map *map = NULL;
 
+	if (event->header.type & PERF_SAMPLE_PERIOD)
+		period = event->ip.period;
+
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
 		event->ip.pid,
 		(void *)(long)ip,
-		(long long)event->ip.period);
+		(long long)period);
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
@@ -1001,13 +1005,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		if (dso)
 			sym = dso->find_symbol(dso, ip);
 
-		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+		if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
 			fprintf(stderr,
 		"problem incrementing symbol count, skipping event\n");
 			return -1;
 		}
 	}
-	total++;
+	total += period;
 
 	return 0;
 }
-- 
cgit v0.10.2


From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:02:22 +0200
Subject: perf_counter: Introduce struct for sample data

For easy extension of the sample data, put it in a structure.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4786ad9..5e0bf399 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record) {
-		addr = 0;
+		struct perf_sample_data data = {
+			.regs = regs,
+			.addr = 0,
+		};
+
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
@@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
 				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-				addr = mfspr(SPRN_SDAR);
+				data.addr = mfspr(SPRN_SDAR);
 		}
-		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+		if (perf_counter_overflow(counter, nmi, &data)) {
 			/*
 			 * Interrupts are coming too fast - throttle them
 			 * by setting the counter to 0, so it will be
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 240ca56..82a23d4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
-	struct cpu_hw_counters;
 	int bit, cpu, loops;
 	u64 ack, status;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1210,7 +1213,7 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1230,12 +1233,16 @@ again:
 
 static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
-	int cpu, idx, handled = 0;
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
 	u64 val;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 282d8cc..d8c0eb4 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_counter_context *ctx, int cpu);
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
-extern int perf_counter_overflow(struct perf_counter *counter,
-				 int nmi, struct pt_regs *regs, u64 addr);
+struct perf_sample_data {
+	struct pt_regs	*regs;
+	u64		addr;
+};
+
+extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+				 struct perf_sample_data *data);
+
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ae591a1..4fe85e8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs, u64 addr)
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data)
 {
 	int ret;
 	u64 sample_type = counter->attr.sample_type;
@@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.size = sizeof(header);
 
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
-	header.misc |= perf_misc_flags(regs);
+	header.misc |= perf_misc_flags(data->regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
-		ip = perf_instruction_pointer(regs);
+		ip = perf_instruction_pointer(data->regs);
 		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
@@ -2460,7 +2460,7 @@ static void perf_counter_output(struct perf_counter *counter,
 	}
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(regs);
+		callchain = perf_callchain(data->regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
@@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		perf_output_put(&handle, time);
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(&handle, addr);
+		perf_output_put(&handle, data->addr);
 
 	if (sample_type & PERF_SAMPLE_ID)
 		perf_output_put(&handle, counter->id);
@@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  * Generic counter overflow handling.
  */
 
-int perf_counter_overflow(struct perf_counter *counter,
-			  int nmi, struct pt_regs *regs, u64 addr)
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+			  struct perf_sample_data *data)
 {
 	int events = atomic_read(&counter->event_limit);
 	int throttle = counter->pmu->unthrottle != NULL;
@@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, regs, addr);
+	perf_counter_output(counter, nmi, data);
 	return ret;
 }
 
@@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
 	struct perf_counter *counter;
-	struct pt_regs *regs;
 	u64 period;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
 	counter->pmu->read(counter);
 
-	regs = get_irq_regs();
+	data.addr = 0;
+	data.regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->attr.exclude_kernel || !regs) &&
+	if ((counter->attr.exclude_kernel || !data.regs) &&
 			!counter->attr.exclude_user)
-		regs = task_pt_regs(current);
+		data.regs = task_pt_regs(current);
 
-	if (regs) {
-		if (perf_counter_overflow(counter, 0, regs, 0))
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs, u64 addr)
 {
+	struct perf_sample_data data = {
+		.regs = regs,
+		.addr = addr,
+	};
+
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, regs, addr))
+	if (perf_counter_overflow(counter, nmi, &data))
 		/* soft-disable the counter */
 		;
 
-- 
cgit v0.10.2


From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:34:59 +0200
Subject: perf_counter: Accurate period data

We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is
incorrect. When we adjust the period, it will only take effect the next
cycle but report it for the current cycle. So when we adjust the period
for every cycle, we're always wrong.

Solve this by keeping track of the last_period.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5e0bf399..4990ce2 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	perf_disable();
 	power_pmu_read(counter);
 	left = counter->hw.sample_period;
+	counter->hw.last_period = left;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
+	counter->hw.last_period = counter->hw.sample_period;
+	atomic64_set(&counter->hw.period_left, counter->hw.last_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		struct perf_sample_data data = {
-			.regs = regs,
-			.addr = 0,
+			.regs	= regs,
+			.addr	= 0,
+			.period	= counter->hw.last_period,
 		};
 
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82a23d4..57ae1be 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
 	}
 
@@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 	/*
@@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 
-		/* counter overflow */
-		handled = 1;
-		inc_irq_stat(apic_perf_irqs);
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
@@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
 	return handled;
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d8c0eb4..5b96647 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -366,6 +366,7 @@ struct hw_perf_counter {
 	};
 	atomic64_t			prev_count;
 	u64				sample_period;
+	u64				last_period;
 	atomic64_t			period_left;
 	u64				interrupts;
 
@@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs	*regs;
-	u64		addr;
+	struct pt_regs		*regs;
+	u64			addr;
+	u64			period;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4fe85e8..8b89b40 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		perf_output_put(&handle, cpu_entry);
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(&handle, counter->hw.sample_period);
+		perf_output_put(&handle, data->period);
 
 	/*
 	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
@@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_add(period, &hwc->period_left);
+		hwc->last_period = period;
 	}
 
 	atomic64_set(&hwc->prev_count, -left);
@@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
-		.regs = regs,
-		.addr = addr,
+		.regs	= regs,
+		.addr	= addr,
+		.period	= counter->hw.last_period,
 	};
 
 	perf_swcounter_update(counter);
-- 
cgit v0.10.2


From 7f72b47cedd1efc301576ff1050ec0a26c57eb48 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Tue, 28 Apr 2009 14:13:21 +1000
Subject: m68knommu: fix system reset for ColdFire 527x family

The sofwtare reset control for the 527x ColdFire family is based on
the same Reset Control Unit as the 528x ColdFire family. So use the
same reset code for both.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index 4496c0a..5fbc96d 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -264,18 +264,18 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	: /* No output */		\
 	: "o" (*(char *)MCF_MBAR) );	\
 })
-#elif defined(CONFIG_M528x)
+#elif defined(CONFIG_M528x) || defined(CONFIG_M527x)
 /*
- * The MCF528x has a bit (SOFTRST) in memory (Reset Control Register RCR),
- * that when set, resets the MCF528x.
+ * Most of the newer ColdFire family members have a proper RESET unit.
+ * Use the software reset control bit in the Reset Control Register (RCR).
  */
 #define HARD_RESET_NOW() \
-({						\
-	unsigned char volatile *reset;		\
-	asm("move.w	#0x2700, %sr");		\
+({									\
+	unsigned char volatile *reset;					\
+	asm("move.w #0x2700, %sr");					\
 	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
-	while(1)				\
-	*reset |= (0x01 << 7);\
+	while (1)							\
+		*reset |= (0x01 << 7);					\
 })
 #elif defined(CONFIG_M523x)
 #define HARD_RESET_NOW() ({		\
-- 
cgit v0.10.2


From 293ca0f75415dd8373c716bf9755569daca7ba5d Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Tue, 28 Apr 2009 16:56:03 +1000
Subject: m68knommu: merge system reset for code ColdFire 523x family

The sofwtare reset control code for the 523x ColdFire family uses the
same Reset unit hardware as the 527x and 528x ColdFire parts. So they
should all use the same code. Merge them.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index 5fbc96d..a0a1ae8 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -264,7 +264,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	: /* No output */		\
 	: "o" (*(char *)MCF_MBAR) );	\
 })
-#elif defined(CONFIG_M528x) || defined(CONFIG_M527x)
+#elif defined(CONFIG_M523x) || defined(CONFIG_M528x) || defined(CONFIG_M527x)
 /*
  * Most of the newer ColdFire family members have a proper RESET unit.
  * Use the software reset control bit in the Reset Control Register (RCR).
@@ -275,16 +275,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	asm("move.w #0x2700, %sr");					\
 	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
 	while (1)							\
-		*reset |= (0x01 << 7);					\
-})
-#elif defined(CONFIG_M523x)
-#define HARD_RESET_NOW() ({		\
-	asm("				\
-	movew #0x2700, %sr;		\
-	movel #0x01000000, %sp;		\
-	moveal #0x40110000, %a0;	\
-	moveb #0x80, (%a0);		\
-	");				\
+		*reset |= 0x80;						\
 })
 #elif defined(CONFIG_M520x)
 	/*
-- 
cgit v0.10.2


From c18e52c7696e39c7cb23cd083bbda5652c338b79 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:03:09 +1000
Subject: m68knommu: add CPU reset code for the 5307 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5307/config.c b/arch/m68knommu/platform/5307/config.c
index 44803bf..39da9e9 100644
--- a/arch/m68knommu/platform/5307/config.c
+++ b/arch/m68knommu/platform/5307/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -22,8 +21,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -119,6 +116,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5307_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -134,7 +142,7 @@ void __init config_BSP(char *commandp, int size)
 	mcf_timerlevel = 6;
 #endif
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5307_cpu_reset;
 
 #ifdef CONFIG_BDM_DISABLE
 	/*
-- 
cgit v0.10.2


From eb7c874d5c81dae6d7b80857e9f0dab1a5d05ebd Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:15:18 +1000
Subject: m68knommu: add CPU reset code for the 5407 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5407/config.c b/arch/m68knommu/platform/5407/config.c
index 0ee8c1a..b41d942 100644
--- a/arch/m68knommu/platform/5407/config.c
+++ b/arch/m68knommu/platform/5407/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,8 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -110,6 +107,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5407_cpu_reset(void)
+{
+	local_irq_disable();
+	/* set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -121,7 +129,7 @@ void __init config_BSP(char *commandp, int size)
 	mcf_timerlevel = 6;
 #endif
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5407_cpu_reset;
 }
 
 /***************************************************************************/
-- 
cgit v0.10.2


From 851377bca613175473b6dafd6c7d1bfbcf24efe2 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:22:05 +1000
Subject: m68knommu: add CPU reset code for the 5206 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5206/config.c b/arch/m68knommu/platform/5206/config.c
index 53a5920..f6f7987 100644
--- a/arch/m68knommu/platform/5206/config.c
+++ b/arch/m68knommu/platform/5206/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,10 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5206_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -109,10 +104,21 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5206_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
-	mach_reset = coldfire_reset;
+	mach_reset = m5206_cpu_reset;
 }
 
 /***************************************************************************/
-- 
cgit v0.10.2


From b61a7260ff70d10ca27edcc74535af0d46a277fa Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:22:56 +1000
Subject: m68knommu: add CPU reset code for the 5206e ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5206e/config.c b/arch/m68knommu/platform/5206e/config.c
index db90254..65887799 100644
--- a/arch/m68knommu/platform/5206e/config.c
+++ b/arch/m68knommu/platform/5206e/config.c
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,10 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5206e_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -109,6 +104,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5206e_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -119,7 +125,7 @@ void __init config_BSP(char *commandp, int size)
 	commandp[size-1] = 0;
 #endif /* CONFIG_NETtel */
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5206e_cpu_reset;
 }
 
 /***************************************************************************/
-- 
cgit v0.10.2


From 6f5aa7ce38dfb9c04eecac1222fc4b2521f02b83 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:31:19 +1000
Subject: m68knommu: add CPU reset code for the 5249 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c
index 9eab19d..93d9988 100644
--- a/arch/m68knommu/platform/5249/config.c
+++ b/arch/m68knommu/platform/5249/config.c
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -20,10 +19,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5249_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -106,10 +101,21 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5249_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
-	mach_reset = coldfire_reset;
+	mach_reset = m5249_cpu_reset;
 }
 
 /***************************************************************************/
-- 
cgit v0.10.2


From 384feb91319766298c6358f23f54873d44c9d8cb Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:03:43 +1000
Subject: m68knommu: add CPU reset code for the 532x ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/m532xsim.h b/arch/m68k/include/asm/m532xsim.h
index ce60345..eb7fd44 100644
--- a/arch/m68k/include/asm/m532xsim.h
+++ b/arch/m68k/include/asm/m532xsim.h
@@ -127,6 +127,18 @@
 
 /*********************************************************************
  *
+ * Reset Controller Module
+ *
+ *********************************************************************/
+
+#define	MCF_RCR			0xFC0A0000
+#define	MCF_RSR			0xFC0A0001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
+/*********************************************************************
+ *
  * Inter-IC (I2C) Module
  *
  *********************************************************************/
diff --git a/arch/m68knommu/platform/532x/config.c b/arch/m68knommu/platform/532x/config.c
index 591f2f8..cdb7619 100644
--- a/arch/m68knommu/platform/532x/config.c
+++ b/arch/m68knommu/platform/532x/config.c
@@ -31,8 +31,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -164,6 +162,14 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+static void m532x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -181,7 +187,7 @@ void __init config_BSP(char *commandp, int size)
 
 	mcf_timervector = 64+32;
 	mcf_profilevector = 64+33;
-	mach_reset = coldfire_reset;
+	mach_reset = m532x_cpu_reset;
 
 #ifdef CONFIG_BDM_DISABLE
 	/*
-- 
cgit v0.10.2


From 25ce4a908abebf8c7d2e89908d36050e1de9cbec Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:39:50 +1000
Subject: m68knommu: move CPU reset code for the 520x ColdFire into its
 platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/m520xsim.h b/arch/m68k/include/asm/m520xsim.h
index 49d016e..83bbcfd 100644
--- a/arch/m68k/include/asm/m520xsim.h
+++ b/arch/m68k/include/asm/m520xsim.h
@@ -59,5 +59,14 @@
 #define MCFPIT_IMR		MCFINTC_IMRL
 #define MCFPIT_IMR_IBIT		(1 << MCFINT_PIT1)
 
+/*
+ *  Reset Controll Unit.
+ */
+#define	MCF_RCR			0xFC0A0000
+#define	MCF_RSR			0xFC0A0001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif  /* m520xsim_h */
diff --git a/arch/m68knommu/platform/520x/config.c b/arch/m68knommu/platform/520x/config.c
index 855fc6a..1c43a8a 100644
--- a/arch/m68knommu/platform/520x/config.c
+++ b/arch/m68knommu/platform/520x/config.c
@@ -14,7 +14,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -23,10 +22,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m520x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -169,9 +164,17 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m520x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
-	mach_reset = coldfire_reset;
+	mach_reset = m520x_cpu_reset;
 	m520x_uarts_init();
 	m520x_fec_init();
 }
-- 
cgit v0.10.2


From 55b33f316d25c1ffb13a65de7f846b950b5ef5a6 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:58:35 +1000
Subject: m68knommu: move CPU reset code for the 523x ColdFire into its
 platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/m523xsim.h b/arch/m68k/include/asm/m523xsim.h
index bf39731..55183b5d 100644
--- a/arch/m68k/include/asm/m523xsim.h
+++ b/arch/m68k/include/asm/m523xsim.h
@@ -41,5 +41,14 @@
 #define	MCFSIM_DACR1		0x50		/* SDRAM base address 1 */
 #define	MCFSIM_DMR1		0x54		/* SDRAM address mask 1 */
 
+/*
+ *  Reset Controll Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif	/* m523xsim_h */
diff --git a/arch/m68knommu/platform/523x/config.c b/arch/m68knommu/platform/523x/config.c
index 74133f2..961fefe 100644
--- a/arch/m68knommu/platform/523x/config.c
+++ b/arch/m68knommu/platform/523x/config.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -24,10 +23,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m523x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -145,13 +140,20 @@ void mcf_autovector(unsigned int vec)
 {
 	/* Everything is auto-vectored on the 523x */
 }
+/***************************************************************************/
+
+static void m523x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
 
 /***************************************************************************/
 
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_disableall();
-	mach_reset = coldfire_reset;
+	mach_reset = m523x_cpu_reset;
 	m523x_uarts_init();
 	m523x_fec_init();
 }
-- 
cgit v0.10.2


From 4c0b008d49e728735f54419e60446480017bfe1b Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:06:45 +1000
Subject: m68knommu: move CPU reset code for the 527x ColdFire into its
 platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/m527xsim.h b/arch/m68k/include/asm/m527xsim.h
index 1f63ab3..95f4f8e 100644
--- a/arch/m68k/include/asm/m527xsim.h
+++ b/arch/m68k/include/asm/m527xsim.h
@@ -70,5 +70,14 @@
 #define UART2_ENABLE_MASK	0x3f00 
 #endif
 
+/*
+ *  Reset Controll Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif	/* m527xsim_h */
diff --git a/arch/m68knommu/platform/527x/config.c b/arch/m68knommu/platform/527x/config.c
index 428b159..f746439 100644
--- a/arch/m68knommu/platform/527x/config.c
+++ b/arch/m68knommu/platform/527x/config.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -24,10 +23,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m527x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -227,10 +222,18 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m527x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_disableall();
-	mach_reset = coldfire_reset;
+	mach_reset = m527x_cpu_reset;
 	m527x_uarts_init();
 	m527x_fec_init();
 }
-- 
cgit v0.10.2


From dd65b1de553ddfd85e8fea9d3aa9db7479ccf2aa Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:15:56 +1000
Subject: m68knommu: move CPU reset code for the 528x ColdFire into its
 platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/m528xsim.h b/arch/m68k/include/asm/m528xsim.h
index 28bf783..d79c49f 100644
--- a/arch/m68k/include/asm/m528xsim.h
+++ b/arch/m68k/include/asm/m528xsim.h
@@ -56,6 +56,14 @@
 #define MCF5282_INTC0_ICR17     (volatile u8 *) (MCF_IPSBAR + 0x0C51)
 
 
+/*
+ *  Reset Control Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
 
 /*********************************************************************
 *
diff --git a/arch/m68knommu/platform/528x/config.c b/arch/m68knommu/platform/528x/config.c
index bee526f..a1d1a61 100644
--- a/arch/m68knommu/platform/528x/config.c
+++ b/arch/m68knommu/platform/528x/config.c
@@ -31,10 +31,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m528x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -171,6 +167,14 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m528x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
+
+/***************************************************************************/
+
 #ifdef CONFIG_WILDFIRE
 void wildfire_halt(void)
 {
@@ -214,6 +218,7 @@ void __init config_BSP(char *commandp, int size)
 
 static int __init init_BSP(void)
 {
+	mach_reset = m528x_cpu_reset;
 	m528x_uarts_init();
 	m528x_fec_init();
 	platform_add_devices(m528x_devices, ARRAY_SIZE(m528x_devices));
-- 
cgit v0.10.2


From 05728aec8b05ec3fa859add8b22ba46432611e9e Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:32:52 +1000
Subject: m68knommu: move CPU reset code for the 5272 ColdFire into its
 platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/platform/5272/config.c b/arch/m68knommu/platform/5272/config.c
index e049245..5f95fcd 100644
--- a/arch/m68knommu/platform/5272/config.c
+++ b/arch/m68knommu/platform/5272/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,8 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -170,6 +167,19 @@ void mcf_settimericr(int timer, int level)
 
 /***************************************************************************/
 
+static void m5272_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to reset, and enabled */
+	__raw_writew(0, MCF_MBAR + MCFSIM_WIRR);
+	__raw_writew(1, MCF_MBAR + MCFSIM_WRRR);
+	__raw_writew(0, MCF_MBAR + MCFSIM_WCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 #if defined (CONFIG_MOD5272)
@@ -194,7 +204,7 @@ void __init config_BSP(char *commandp, int size)
 
 	mcf_timervector = 69;
 	mcf_profilevector = 70;
-	mach_reset = coldfire_reset;
+	mach_reset = m5272_cpu_reset;
 }
 
 /***************************************************************************/
-- 
cgit v0.10.2


From fb29ad7949aeed3d464ba8d2c9772835f456d4c4 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Fri, 1 May 2009 16:09:17 +1000
Subject: m68knommu: remove obsolete reset code

All ColdFire and non-MMU 68k code has custom reset routines.
Remove the obsolete and now un-used reset macros.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index a0a1ae8..3c0718d 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -203,104 +203,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 #include <asm-generic/cmpxchg.h>
 #endif
 
-#if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \
-	defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 )
-#define HARD_RESET_NOW() ({		\
-        local_irq_disable();		\
-        asm("				\
-        moveal #0x10c00000, %a0;	\
-        moveb #0, 0xFFFFF300;		\
-        moveal 0(%a0), %sp;		\
-        moveal 4(%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#endif
-
-#ifdef CONFIG_COLDFIRE
-#if defined(CONFIG_M5272) && defined(CONFIG_NETtel)
-/*
- * Need to account for broken early mask of 5272 silicon. So don't
- * jump through the original start address. Jump strait into the
- * known start of the FLASH code.
- */
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-        jmp 0xf0000400;			\
-        ");				\
-})
-#elif defined(CONFIG_NETtel) || \
-      defined(CONFIG_SECUREEDGEMP3) || defined(CONFIG_CLEOPATRA)
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-	moveal #0x10000044, %a0;	\
-	movel #0xffffffff, (%a0);	\
-	moveal #0x10000001, %a0;	\
-	moveb #0x00, (%a0);		\
-        moveal #0xf0000004, %a0;	\
-        moveal (%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#elif defined(CONFIG_M5272)
-/*
- * Retrieve the boot address in flash using CSBR0 and CSOR0
- * find the reset vector at flash_address + 4 (e.g. 0x400)
- * remap it in the flash's current location (e.g. 0xf0000400)
- * and jump there.
- */ 
-#define HARD_RESET_NOW() ({		\
-	asm("				\
-	movew #0x2700, %%sr;		\
-	move.l	%0+0x40,%%d0;		\
-	and.l	%0+0x44,%%d0;		\
-	andi.l	#0xfffff000,%%d0;	\
-	mov.l	%%d0,%%a0;		\
-	or.l	4(%%a0),%%d0;		\
-	mov.l	%%d0,%%a0;		\
-	jmp (%%a0);"			\
-	: /* No output */		\
-	: "o" (*(char *)MCF_MBAR) );	\
-})
-#elif defined(CONFIG_M523x) || defined(CONFIG_M528x) || defined(CONFIG_M527x)
-/*
- * Most of the newer ColdFire family members have a proper RESET unit.
- * Use the software reset control bit in the Reset Control Register (RCR).
- */
-#define HARD_RESET_NOW() \
-({									\
-	unsigned char volatile *reset;					\
-	asm("move.w #0x2700, %sr");					\
-	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
-	while (1)							\
-		*reset |= 0x80;						\
-})
-#elif defined(CONFIG_M520x)
-	/*
-	 * The MCF5208 has a bit (SOFTRST) in memory (Reset Control Register 
-	 * RCR), that when set, resets the MCF5208.
-	 */
-#define HARD_RESET_NOW() 		\
-({					\
-	unsigned char volatile *reset;	\
-	asm("move.w     #0x2700, %sr");	\
-	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0xA0000));	\
-	while(1)			\
-		*reset |= 0x80;		\
-})
-#else
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-        moveal #0x4, %a0;		\
-        moveal (%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#endif
-#endif
 #define arch_align_stack(x) (x)
 
 
diff --git a/arch/m68knommu/platform/coldfire/vectors.c b/arch/m68knommu/platform/coldfire/vectors.c
index 6cf8946..bdca029 100644
--- a/arch/m68knommu/platform/coldfire/vectors.c
+++ b/arch/m68knommu/platform/coldfire/vectors.c
@@ -96,10 +96,3 @@ void ack_vector(unsigned int irq)
 }
 
 /***************************************************************************/
-
-void coldfire_reset(void)
-{
-	HARD_RESET_NOW();
-}
-
-/***************************************************************************/
-- 
cgit v0.10.2


From 308e610e8c435d7875842b427dbc99de43a4aec9 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 15 May 2009 10:59:37 -0700
Subject: arch/m68knommu: Convert #ifdef DEBUG printk(KERN_DEBUG to pr_debug(

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index 5985f19..5c2bb3e 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -166,15 +166,13 @@ void __init setup_arch(char **cmdline_p)
 	printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)\n");
 #endif
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
-		"BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext,
-		(int) &_sdata, (int) &_edata,
-		(int) &_sbss, (int) &_ebss);
-	printk(KERN_DEBUG "MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ",
-		(int) &_ebss, (int) memory_start,
-		(int) memory_start, (int) memory_end);
-#endif
+	pr_debug("KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
+		 "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext,
+		 (int) &_sdata, (int) &_edata,
+		 (int) &_sbss, (int) &_ebss);
+	pr_debug("MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ",
+		 (int) &_ebss, (int) memory_start,
+		 (int) memory_start, (int) memory_end);
 
 	/* Keep a copy of command line */
 	*cmdline_p = &command_line[0];
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index 7befc0c..b1703c6 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -126,9 +126,7 @@ void __init mem_init(void)
 	unsigned long start_mem = memory_start; /* DAVIDM - these must start at end of kernel */
 	unsigned long end_mem   = memory_end; /* DAVIDM - this must not include kernel stack at top */
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "Mem_init: start=%lx, end=%lx\n", start_mem, end_mem);
-#endif
+	pr_debug("Mem_init: start=%lx, end=%lx\n", start_mem, end_mem);
 
 	end_mem &= PAGE_MASK;
 	high_memory = (void *) end_mem;
-- 
cgit v0.10.2


From 4f308e35a9bde9d6b671e8409157edb9065f117c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Wed, 10 Jun 2009 18:47:47 +0530
Subject: headers_check fix: m68k, swab.h

fix the following 'make headers_check' warnings:

  usr/include/asm-m68k/swab.h:4: include of <linux/types.h> is preferred over <asm/types.h>
  usr/include/asm-m68k/swab.h:10: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/swab.h b/arch/m68k/include/asm/swab.h
index 9e3054e..5b754aa 100644
--- a/arch/m68k/include/asm/swab.h
+++ b/arch/m68k/include/asm/swab.h
@@ -1,7 +1,7 @@
 #ifndef _M68K_SWAB_H
 #define _M68K_SWAB_H
 
-#include <asm/types.h>
+#include <linux/types.h>
 #include <linux/compiler.h>
 
 #define __SWAB_64_THRU_32__
-- 
cgit v0.10.2


From 9d4f94135385b565e2ce4a37f71b53d0fd3664e8 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 11 Jun 2009 13:05:00 +1000
Subject: m68knommu: enumerate INIT_THREAD fields properly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use proper field value setting init INIT_THREAD macro.
Fixes this:

arch/m68knommu/kernel/init_task.c:27: warning: excess elements in array initializer
arch/m68knommu/kernel/init_task.c:27: warning: (near initialization for ‘init_task.thread.fpstate’)

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68k/include/asm/processor_no.h b/arch/m68k/include/asm/processor_no.h
index 91cba18..7a1e0ba 100644
--- a/arch/m68k/include/asm/processor_no.h
+++ b/arch/m68k/include/asm/processor_no.h
@@ -72,10 +72,10 @@ struct thread_struct {
 	unsigned char  fpstate[FPSTATESIZE];  /* floating point state */
 };
 
-#define INIT_THREAD  { \
-	sizeof(init_stack) + (unsigned long) init_stack, 0, \
-	PS_S, __KERNEL_DS, \
-	{0, 0}, 0, {0,}, {0, 0, 0}, {0,}, \
+#define INIT_THREAD  {							\
+	.ksp	= sizeof(init_stack) + (unsigned long) init_stack,	\
+	.sr	= PS_S,							\
+	.fs	= __KERNEL_DS,						\
 }
 
 /*
-- 
cgit v0.10.2


From 193e98401a89707ea374e9c32c1b9a5a9dd87a48 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 23 Apr 2009 11:24:03 +1000
Subject: m68knommu: remove unecessary include of thread_info.h in entry.S

Signed-off-by: Greg Ungerer <gerg@uclinux.org>

diff --git a/arch/m68knommu/kernel/entry.S b/arch/m68knommu/kernel/entry.S
index f4782d2..f56faa5 100644
--- a/arch/m68knommu/kernel/entry.S
+++ b/arch/m68knommu/kernel/entry.S
@@ -26,7 +26,6 @@
 
 #include <linux/sys.h>
 #include <linux/linkage.h>
-#include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/setup.h>
 #include <asm/segment.h>
-- 
cgit v0.10.2


From a50de78dc6d21ee074e9561c800d194bec12128b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 08:43:59 +0000
Subject: sh: clock div4 frequency table offset fix

This patch fixes the per clock offset calculation in
sh_clk_div4_register(). Without this patch the offset
to the frequency table for each clock is incorrect.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index 88fc30d..e604a6f 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -71,8 +71,9 @@ int __init sh_clk_div4_register(struct clk *clks, int nr,
 	int ret = 0;
 	int k;
 
-	k = nr_divs + 1;
-	freq_table = alloc_bootmem(freq_table_size * nr * (nr_divs + 1));
+	freq_table_size *= (nr_divs + 1);
+
+	freq_table = alloc_bootmem(freq_table_size * nr);
 	if (!freq_table)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 2693e2740ddae364a80e6083043ba760b6366b69 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 08:53:54 +0000
Subject: sh: clock div6 helper code

This patch adds div6 clock helper code. The div6 clocks
are simply 6-bit divide-by-n modules where n is 1 to 64.

Needed for vclk on sh7722, sh7723, sh7343 and sh7366.
sh7724 needs this even more for vclk, fclka, fclkb,
irdaclk and spuclk.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 7435e40..026b9da 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -145,4 +145,14 @@ int sh_clk_mstp32_register(struct clk *clks, int nr);
 int sh_clk_div4_register(struct clk *clks, int nr,
 			 struct clk_div_mult_table *table);
 
+#define SH_CLK_DIV6(_name, _parent, _reg, _flags)	\
+{							\
+	.name = _name,					\
+	.parent = _parent,				\
+	.enable_reg = (void __iomem *)_reg,		\
+	.flags = _flags,				\
+}
+
+int sh_clk_div6_register(struct clk *clks, int nr);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index e604a6f..fedc8b8 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -38,6 +38,70 @@ int __init sh_clk_mstp32_register(struct clk *clks, int nr)
 	return ret;
 }
 
+static long sh_clk_div_round_rate(struct clk *clk, unsigned long rate)
+{
+	return clk_rate_table_round(clk, clk->freq_table, rate);
+}
+
+static int sh_clk_div6_divisors[64] = {
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+	17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+	33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+	49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+};
+
+static struct clk_div_mult_table sh_clk_div6_table = {
+	.divisors = sh_clk_div6_divisors,
+	.nr_divisors = ARRAY_SIZE(sh_clk_div6_divisors),
+};
+
+static unsigned long sh_clk_div6_recalc(struct clk *clk)
+{
+	struct clk_div_mult_table *table = &sh_clk_div6_table;
+	unsigned int idx;
+
+	clk_rate_table_build(clk, clk->freq_table, table->nr_divisors,
+			     table, NULL);
+
+	idx = __raw_readl(clk->enable_reg) & 0x003f;
+
+	return clk->freq_table[idx].frequency;
+}
+
+static struct clk_ops sh_clk_div6_clk_ops = {
+	.recalc		= sh_clk_div6_recalc,
+	.round_rate	= sh_clk_div_round_rate,
+};
+
+int __init sh_clk_div6_register(struct clk *clks, int nr)
+{
+	struct clk *clkp;
+	void *freq_table;
+	int nr_divs = sh_clk_div6_table.nr_divisors;
+	int freq_table_size = sizeof(struct cpufreq_frequency_table);
+	int ret = 0;
+	int k;
+
+	freq_table_size *= (nr_divs + 1);
+
+	freq_table = alloc_bootmem(freq_table_size * nr);
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+
+		clkp->ops = &sh_clk_div6_clk_ops;
+		clkp->id = -1;
+		clkp->freq_table = freq_table + (k * freq_table_size);
+		clkp->freq_table[nr_divs].frequency = CPUFREQ_TABLE_END;
+
+		ret = clk_register(clkp);
+	}
+
+	return ret;
+}
+
 static unsigned long sh_clk_div4_recalc(struct clk *clk)
 {
 	struct clk_div_mult_table *table = clk->priv;
@@ -51,14 +115,9 @@ static unsigned long sh_clk_div4_recalc(struct clk *clk)
 	return clk->freq_table[idx].frequency;
 }
 
-static long sh_clk_div4_round_rate(struct clk *clk, unsigned long rate)
-{
-	return clk_rate_table_round(clk, clk->freq_table, rate);
-}
-
 static struct clk_ops sh_clk_div4_clk_ops = {
 	.recalc		= sh_clk_div4_recalc,
-	.round_rate	= sh_clk_div4_round_rate,
+	.round_rate	= sh_clk_div_round_rate,
 };
 
 int __init sh_clk_div4_register(struct clk *clks, int nr,
-- 
cgit v0.10.2


From 0d4fdbb64f472ef31195714993f1263f77cf85ca Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 09:22:02 +0000
Subject: sh: rework mode pin code

This patch reworks the mode pin code to keep the pin
definitions in one place. The mode pins values are now
the value of the bit instead of bit number.

With this patch in place the sh7785 header file contains
mode pin comments. The sh7785 clock code and the sh7785lcr
board code are updated to reflect the new shared mode pins.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index c2894c5..7be56fb 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -330,13 +330,13 @@ static int sh7785lcr_mode_pins(void)
 	 * If you change these dip switches then you will need to
 	 * adjust the values below as well.
 	 */
-	value |= 1 << MODE_PIN_MODE4; /* Clock Mode 16 */
-	value |= 1 << MODE_PIN_MODE5; /* 32-bit Area0 bus width */
-	value |= 1 << MODE_PIN_MODE6; /* 32-bit Area0 bus width */
-	value |= 1 << MODE_PIN_MODE7; /* Area 0 SRAM interface [fixed] */
-	value |= 1 << MODE_PIN_MODE8; /* Little Endian */
-	value |= 1 << MODE_PIN_MODE9; /* Master Mode */
-	value |= 1 << MODE_PIN_MODE14; /* No PLL step-up */
+	value |= MODE_PIN4; /* Clock Mode 16 */
+	value |= MODE_PIN5; /* 32-bit Area0 bus width */
+	value |= MODE_PIN6; /* 32-bit Area0 bus width */
+	value |= MODE_PIN7; /* Area 0 SRAM interface [fixed] */
+	value |= MODE_PIN8; /* Little Endian */
+	value |= MODE_PIN9; /* Master Mode */
+	value |= MODE_PIN14; /* No PLL step-up */
 
 	return value;
 }
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index fb67482..ff7daaf 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -95,6 +95,23 @@ const char *get_cpu_subtype(struct sh_cpuinfo *c);
 extern const struct seq_operations cpuinfo_op;
 
 /* processor boot mode configuration */
+#define MODE_PIN0 (1 << 0)
+#define MODE_PIN1 (1 << 1)
+#define MODE_PIN2 (1 << 2)
+#define MODE_PIN3 (1 << 3)
+#define MODE_PIN4 (1 << 4)
+#define MODE_PIN5 (1 << 5)
+#define MODE_PIN6 (1 << 6)
+#define MODE_PIN7 (1 << 7)
+#define MODE_PIN8 (1 << 8)
+#define MODE_PIN9 (1 << 9)
+#define MODE_PIN10 (1 << 10)
+#define MODE_PIN11 (1 << 11)
+#define MODE_PIN12 (1 << 12)
+#define MODE_PIN13 (1 << 13)
+#define MODE_PIN14 (1 << 14)
+#define MODE_PIN15 (1 << 15)
+
 int generic_mode_pins(void);
 int test_mode_pin(int pin);
 
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7785.h b/arch/sh/include/cpu-sh4/cpu/sh7785.h
index 89afaa6..9dc9d91 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7785.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7785.h
@@ -1,25 +1,26 @@
 #ifndef __ASM_SH7785_H__
 #define __ASM_SH7785_H__
 
-/* Boot Mode Pins, more information in sh7785 manual Rev.1.00, page 1628 */
-enum {
-	MODE_PIN_MODE0,  /* CPG - Initial Pck/Bck Frequency [FRQMR1] */
-	MODE_PIN_MODE1,  /* CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1] */
-	MODE_PIN_MODE2,  /* CPG - Reserved (L: Normal operation) */
-	MODE_PIN_MODE3,  /* CPG - Reserved (L: Normal operation) */
-	MODE_PIN_MODE4,  /* CPG - Initial PLL setting (72x/36x) */
-	MODE_PIN_MODE5,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.8] */
-	MODE_PIN_MODE6,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.9] */
-	MODE_PIN_MODE7,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.3] */
-	MODE_PIN_MODE8,  /* LBSC - Endian Mode (L: Big, H: Little) [BCR.31] */
-	MODE_PIN_MODE9,  /* LBSC - Master/Slave Mode (L: Slave) [BCR.30] */
-	MODE_PIN_MODE10, /* CPG - Clock Input (L: Ext Clk, H: Crystal) */
-	MODE_PIN_MODE11, /* PCI - Pin Mode (LL: PCI host, LH: PCI slave) */
-	MODE_PIN_MODE12, /* PCI - Pin Mode (HL: Local bus, HH: DU) */
-	MODE_PIN_MODE13, /* Boot Address Mode (L: 29-bit, H: 32-bit) */
-	MODE_PIN_MODE14, /* Reserved (H: Normal operation) */
-	MODE_PIN_MPMD,   /* Emulation Mode (L: Emulation mode, H: LSI mode) */
-};
+/* Boot Mode Pins:
+ *
+ * MODE0: CPG - Initial Pck/Bck Frequency [FRQMR1]
+ * MODE1: CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1]
+ * MODE2: CPG - Reserved (L: Normal operation)
+ * MODE3: CPG - Reserved (L: Normal operation)
+ * MODE4: CPG - Initial PLL setting (72x/36x)
+ * MODE5: LBSC - Area0 Memory Type / Bus Width [CS0BCR.8]
+ * MODE6: LBSC - Area0 Memory Type / Bus Width [CS0BCR.9]
+ * MODE7: LBSC - Area0 Memory Type / Bus Width [CS0BCR.3]
+ * MODE8: LBSC - Endian Mode (L: Big, H: Little) [BCR.31]
+ * MODE9: LBSC - Master/Slave Mode (L: Slave) [BCR.30]
+ * MODE10: CPG - Clock Input (L: Ext Clk, H: Crystal)
+ * MODE11: PCI - Pin Mode (LL: PCI host, LH: PCI slave)
+ * MODE12: PCI - Pin Mode (HL: Local bus, HH: DU)
+ * MODE13: Boot Address Mode (L: 29-bit, H: 32-bit)
+ * MODE14: Reserved (H: Normal operation)
+ *
+ * More information in sh7785 manual Rev.1.00, page 1628.
+ */
 
 /* Pin Function Controller:
  * GPIO_FN_xx - GPIO used to select pin function
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index dae20ac..73abfbf 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -32,7 +32,7 @@ static unsigned long pll_recalc(struct clk *clk)
 {
 	int multiplier;
 
-	multiplier = test_mode_pin(MODE_PIN_MODE4) ? 36 : 72;
+	multiplier = test_mode_pin(MODE_PIN4) ? 36 : 72;
 
 	return clk->parent->rate * multiplier;
 }
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 050131e..dd38338 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -429,7 +429,7 @@ int generic_mode_pins(void)
 
 int test_mode_pin(int pin)
 {
-	return sh_mv.mv_mode_pins() & (1 << pin);
+	return sh_mv.mv_mode_pins() & pin;
 }
 
 static const char *cpu_name[] = {
-- 
cgit v0.10.2


From e4218ef506a2f8b0b5bf1a4b6effcc2f8983f86b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 09:45:16 +0000
Subject: sh: sh7723 mode pin V2

This patch is sh7723 mode pin V2. Mode pins and
pin function controller comments are added.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7723.h b/arch/sh/include/cpu-sh4/cpu/sh7723.h
index 9d2f6d7..14c8ca9 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7723.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7723.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7723_H__
 #define __ASM_SH7723_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->3
+ * MD1: CPG - Clock Mode 0->3
+ * MD2: CPG - Reserved (L: Normal operation)
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
cgit v0.10.2


From 36e5b26bdac0ee3791071d240334853ddd7b9790 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 10:47:32 +0000
Subject: sh: sh7724 mode pin comments

This patch adds comments for the sh7724 mode pins
and pin function controller.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h
index 34605c9e3..66fd118 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7724.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7724_H__
 #define __ASM_SH7724_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->7
+ * MD1: CPG - Clock Mode 0->7
+ * MD2: CPG - Clock Mode 0->7
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
cgit v0.10.2


From ed740cb9b7f6eaee7bb45a07ce53ff9e73a92798 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:18:56 +0000
Subject: sh: sh7722 mode pin definitions

This patch adds sh7722 mode pin and pin function
controller comments.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h
index 4b3096f..738ea43 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7722.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7722_H__
 #define __ASM_SH7722_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->3
+ * MD1: CPG - Clock Mode 0->3
+ * MD2: CPG - Reserved (L: Normal operation)
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
cgit v0.10.2


From 0ec80fddf1674579d52b5ff94774ba73107cad31 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:22:50 +0000
Subject: sh: add Migo-R mode pin configuration

This patch adds mode pin configuration and
a machvec structure to Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c
index 1ee1de0..6ed401c 100644
--- a/arch/sh/boards/mach-migor/setup.c
+++ b/arch/sh/boards/mach-migor/setup.c
@@ -584,3 +584,22 @@ static int __init migor_devices_setup(void)
 	return platform_add_devices(migor_devices, ARRAY_SIZE(migor_devices));
 }
 __initcall(migor_devices_setup);
+
+/* Return the board specific boot mode pin configuration */
+static int migor_mode_pins(void)
+{
+	/* MD0=1, MD1=1, MD2=0: Clock Mode 3
+	 * MD3=0: 16-bit Area0 Bus Width
+	 * MD5=1: Little Endian
+	 * TSTMD=1, MD8=0: Test Mode Disabled
+	 */
+	return MODE_PIN0 | MODE_PIN1 | MODE_PIN5;
+}
+
+/*
+ * The Machine Vector
+ */
+static struct sh_machine_vector mv_migor __initmv = {
+	.mv_name		= "Migo-R",
+	.mv_mode_pins		= migor_mode_pins,
+};
-- 
cgit v0.10.2


From c01641b42a88c475fd4b72cff2b10e42262a80fe Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:26:40 +0000
Subject: sh: add AP325RXA mode pin configuration

This patch adds mode pin configuration to ap325rxa.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c
index f2a2964..1c4d83e 100644
--- a/arch/sh/boards/board-ap325rxa.c
+++ b/arch/sh/boards/board-ap325rxa.c
@@ -535,6 +535,18 @@ static int __init ap325rxa_devices_setup(void)
 }
 device_initcall(ap325rxa_devices_setup);
 
+/* Return the board specific boot mode pin configuration */
+static int ap325rxa_mode_pins(void)
+{
+	/* MD0=0, MD1=0, MD2=0: Clock Mode 0
+	 * MD3=0: 16-bit Area0 Bus Width
+	 * MD5=1: Little Endian
+	 * TSTMD=1, MD8=1: Test Mode Disabled
+	 */
+	return MODE_PIN5 | MODE_PIN8;
+}
+
 static struct sh_machine_vector mv_ap325rxa __initmv = {
 	.mv_name = "AP-325RXA",
+	.mv_mode_pins = ap325rxa_mode_pins,
 };
-- 
cgit v0.10.2


From 098dee99d14e8324d3793df442d6078d0c134140 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 05:31:41 +0000
Subject: sh: add enable()/disable()/set_rate() to div6 code

This patch updates the div6 clock helper code to add support
for enable(), disable() and set_rate() callbacks.

Needed by the camera clock enabling board code on Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 026b9da..9fe7d7f 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -119,6 +119,10 @@ long clk_rate_table_round(struct clk *clk,
 			  struct cpufreq_frequency_table *freq_table,
 			  unsigned long rate);
 
+int clk_rate_table_find(struct clk *clk,
+			struct cpufreq_frequency_table *freq_table,
+			unsigned long rate);
+
 #define SH_CLK_MSTP32(_name, _id, _parent, _enable_reg,	\
 	    _enable_bit, _flags)			\
 {							\
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index fedc8b8..275942e 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -68,9 +68,53 @@ static unsigned long sh_clk_div6_recalc(struct clk *clk)
 	return clk->freq_table[idx].frequency;
 }
 
+static int sh_clk_div6_set_rate(struct clk *clk,
+				unsigned long rate, int algo_id)
+{
+	unsigned long value;
+	int idx;
+
+	idx = clk_rate_table_find(clk, clk->freq_table, rate);
+	if (idx < 0)
+		return idx;
+
+	value = __raw_readl(clk->enable_reg);
+	value &= ~0x3f;
+	value |= idx;
+	__raw_writel(value, clk->enable_reg);
+	return 0;
+}
+
+static int sh_clk_div6_enable(struct clk *clk)
+{
+	unsigned long value;
+	int ret;
+
+	ret = sh_clk_div6_set_rate(clk, clk->rate, 0);
+	if (ret == 0) {
+		value = __raw_readl(clk->enable_reg);
+		value &= ~0x100; /* clear stop bit to enable clock */
+		__raw_writel(value, clk->enable_reg);
+	}
+	return ret;
+}
+
+static void sh_clk_div6_disable(struct clk *clk)
+{
+	unsigned long value;
+
+	value = __raw_readl(clk->enable_reg);
+	value |= 0x100; /* stop clock */
+	value |= 0x3f; /* VDIV bits must be non-zero, overwrite divider */
+	__raw_writel(value, clk->enable_reg);
+}
+
 static struct clk_ops sh_clk_div6_clk_ops = {
 	.recalc		= sh_clk_div6_recalc,
 	.round_rate	= sh_clk_div_round_rate,
+	.set_rate	= sh_clk_div6_set_rate,
+	.enable		= sh_clk_div6_enable,
+	.disable	= sh_clk_div6_disable,
 };
 
 int __init sh_clk_div6_register(struct clk *clks, int nr)
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index aa0fd08..f3a46be 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -111,6 +111,25 @@ long clk_rate_table_round(struct clk *clk,
 	return rate_best_fit;
 }
 
+int clk_rate_table_find(struct clk *clk,
+			struct cpufreq_frequency_table *freq_table,
+			unsigned long rate)
+{
+	int i;
+
+	for (i = 0; freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq == rate)
+			return i;
+	}
+
+	return -ENOENT;
+}
+
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-- 
cgit v0.10.2


From c521dc02034df3681394a30b428bf081cfa22253 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:39:59 +0000
Subject: sh: sh7723 clock framework rewrite V2

This patch contains V2 of the sh7723 clock framework
rewrite. The new code makes use of the recently merged
div4, div6 and mstp32 helper code. Both extal and dll
are supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index dec8b7b..d13390b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index afd6fba4..25b2833 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -26,7 +26,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7785)	:= clock-sh7785.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index c090c9a..ccefd7d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7343, SH7722, SH7723 & SH7366 support for the clock framework
+ * SH7343, SH7722 & SH7366 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -176,11 +176,6 @@ static unsigned long module_clk_recalc(struct clk *clk)
 #define STCMASK		0x3f
 #define DIVCALC(div)	(div/2-1)
 #define FRQCRKICK	0x80000000
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
-#define MASTERDIVS	{ 6, 8, 12, 16 }
-#define STCMASK		0x1f
-#define DIVCALC(div)	(div-1)
-#define FRQCRKICK	0x00000000
 #else
 #define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
 #define STCMASK		0x1f
@@ -681,56 +676,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7723)
-	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("l2c0", "sh_clk", 0, 28, 0),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
-	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
-	MSTPCR("intc0", "cpu_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
-	MSTPCR("tmu1", "peripheral_clk", 0, 11, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif3", "bus_clk", 0, 6, 0),
-	MSTPCR("scif4", "bus_clk", 0, 5, 0),
-	MSTPCR("scif5", "bus_clk", 0, 4, 0),
-	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
-	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
-	MSTPCR("meram0", "sh_clk", 0, 0, CLK_ENABLE_ON_INIT),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("rtc0", "r_clk", 1, 8, 0),
-	MSTPCR("atapi0", "sh_clk", 2, 28, 0),
-	MSTPCR("adc0", "peripheral_clk", 2, 28, 0),
-	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
-	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("icb0", "bus_clk", 2, 21, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usb0", "bus_clk", 2, 11, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 10, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("veu1", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
 	/* See Datasheet : Overview -> Block Diagram */
 	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7723.c b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
new file mode 100644
index 0000000..e67c267
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
@@ -0,0 +1,222 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7723.c
+ *
+ * SH7723 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7723 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_IRDA, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x0db4, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x0dbf, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x0dbf, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x0dbf, 0),
+	[DIV4_IRDA] = DIV4("irda_clk", IRDACLKCR, 0, 0x0dbf, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _force_on, _need_cpg, _need_ram) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _force_on * CLK_ENABLE_ON_INIT)
+
+static struct clk mstp_clks[] = {
+	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, 1, 1, 0),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, 1, 1, 0),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, 1, 1, 0),
+	MSTP("l2c0", &div4_clks[DIV4_SH], MSTPCR0, 28, 1, 1, 0),
+	MSTP("ilmem0", &div4_clks[DIV4_I], MSTPCR0, 27, 1, 1, 0),
+	MSTP("fpu0", &div4_clks[DIV4_I], MSTPCR0, 24, 1, 1, 0),
+	MSTP("intc0", &div4_clks[DIV4_I], MSTPCR0, 22, 1, 1, 0),
+	MSTP("dmac0", &div4_clks[DIV4_B], MSTPCR0, 21, 0, 1, 1),
+	MSTP("sh0", &div4_clks[DIV4_SH], MSTPCR0, 20, 0, 1, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0, 1, 0),
+	MSTP("ubc0", &div4_clks[DIV4_I], MSTPCR0, 17, 0, 1, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0, 1, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0, 0, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0, 0, 0),
+	MSTP("dmac1", &div4_clks[DIV4_B], MSTPCR0, 12, 0, 1, 1),
+	MSTP("tmu1", &div4_clks[DIV4_P], MSTPCR0, 11, 0, 1, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0, 1, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 9, 0, 1, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 8, 0, 1, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 7, 0, 1, 0),
+	MSTP("scif3", &div4_clks[DIV4_B], MSTPCR0, 6, 0, 1, 0),
+	MSTP("scif4", &div4_clks[DIV4_B], MSTPCR0, 5, 0, 1, 0),
+	MSTP("scif5", &div4_clks[DIV4_B], MSTPCR0, 4, 0, 1, 0),
+	MSTP("msiof0", &div4_clks[DIV4_B], MSTPCR0, 2, 0, 1, 0),
+	MSTP("msiof1", &div4_clks[DIV4_B], MSTPCR0, 1, 0, 1, 0),
+	MSTP("meram0", &div4_clks[DIV4_SH], MSTPCR0, 0, 1, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0, 1, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 8, 0, 0, 0),
+
+	MSTP("atapi0", &div4_clks[DIV4_SH], MSTPCR2, 28, 0, 1, 0),
+	MSTP("adc0", &div4_clks[DIV4_P], MSTPCR2, 27, 0, 1, 0),
+	MSTP("tpu0", &div4_clks[DIV4_B], MSTPCR2, 25, 0, 1, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0, 1, 0),
+	MSTP("tsif0", &div4_clks[DIV4_B], MSTPCR2, 22, 0, 1, 0),
+	MSTP("icb0", &div4_clks[DIV4_B], MSTPCR2, 21, 0, 1, 1),
+	MSTP("sdhi0", &div4_clks[DIV4_B], MSTPCR2, 18, 0, 1, 0),
+	MSTP("sdhi1", &div4_clks[DIV4_B], MSTPCR2, 17, 0, 1, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0, 0, 0),
+	MSTP("usb0", &div4_clks[DIV4_B], MSTPCR2, 11, 0, 1, 0),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 10, 0, 1, 1),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0, 1, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 6, 1, 1, 1),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0, 1, 1),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0, 1, 1),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0, 1, 1),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, 1, 1, 1),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, 1, 1, 1),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0, 1, 1),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
-- 
cgit v0.10.2


From b621370a3505f8bd42acc41736cae47d5ce8bd06 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 10 Jun 2009 11:31:16 +0000
Subject: sh: sh7724 clock framework rewrite V3

This patch contains V3 of the sh7724 clock framework
rewrite. The new code makes use of the recently merged
div4, div6 and mstp32 helper code. Both extal and fll are
supported as input clocks to the pll. The div6 clocks are
fed through a divide-by-3 block.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d13390b..7b564bd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 25b2833..02a0b17 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -27,7 +27,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index ccefd7d..5e08504 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -130,11 +130,7 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * is quite simple..
  */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-#define STCPLL(frqcr) ((((frqcr >> 24) & 0x3f) + 1) * 2)
-#else
 #define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
-#endif
 
 /*
  * Instead of having two separate multipliers/divisors set, like this:
@@ -145,11 +141,7 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * I created the divisors2 array, which is used to calculate rate like
  *   rate = parent * 2 / divisors2[ divisor ];
 */
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
-#else
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
-#endif
 
 static unsigned long master_clk_recalc(struct clk *clk)
 {
@@ -171,17 +163,10 @@ static unsigned long module_clk_recalc(struct clk *clk)
 	return clk->parent->rate / STCPLL(frqcr);
 }
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-#define MASTERDIVS	{ 12, 16, 24, 30, 32, 36, 48 }
-#define STCMASK		0x3f
-#define DIVCALC(div)	(div/2-1)
-#define FRQCRKICK	0x80000000
-#else
 #define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
 #define STCMASK		0x1f
 #define DIVCALC(div)	(div-1)
 #define FRQCRKICK	0x00000000
-#endif
 
 static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 {
@@ -557,8 +542,7 @@ static struct clk sh7722_r_clock = {
 	.rate = 32768,
 };
 
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-    !defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -575,10 +559,9 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
-#endif /* CONFIG_CPU_SUBTYPE_SH7343, SH7724 */
+#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722) ||\
-    defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722)
 static struct clk sh7722_irda_clock = {
 	.name = "irda_clk",
 	.arch_flags = IrDACLKCR,
@@ -676,61 +659,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-	/* See Datasheet : Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("rs0", "bus_clk", 0, 28, 0),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
-	MSTPCR("l2c0", "sh_clk", 0, 26, 0),
-	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
-	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
-	MSTPCR("tmu1", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif3", "bus_clk", 0, 6, 0),
-	MSTPCR("scif4", "bus_clk", 0, 5, 0),
-	MSTPCR("scif5", "bus_clk", 0, 4, 0),
-	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
-	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
-	MSTPCR("keysc0", "r_clk", 1, 12, 0),
-	MSTPCR("rtc0", "r_clk", 1, 11, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
-	MSTPCR("mmc0", "bus_clk", 2, 29, 0),
-	MSTPCR("eth0", "bus_clk", 2, 28, 0),
-	MSTPCR("atapi0", "bus_clk", 2, 26, 0),
-	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
-	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("usb1", "bus_clk", 2, 21, 0),
-	MSTPCR("usb0", "bus_clk", 2, 20, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 19, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
-	MSTPCR("veu1", "bus_clk", 2, 15, CLK_ENABLE_ON_INIT),
-	MSTPCR("ceu1", "bus_clk", 2, 13, 0),
-	MSTPCR("beu1", "bus_clk", 2, 12, 0),
-	MSTPCR("2ddmac0", "sh_clk", 2, 10, 0),
-	MSTPCR("spu0", "bus_clk", 2, 9, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, 0),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
 	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
 	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
@@ -802,14 +730,11 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-    !defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
 #endif
-/* 7724 should support FSI clock */
-#if defined(CONFIG_CPU_SUBTYPE_SH7722) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722)
 	&sh7722_irda_clock,
 #endif
 	&sh7722_video_clock,
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
new file mode 100644
index 0000000..5d5c9b9
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
@@ -0,0 +1,242 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7724.c
+ *
+ * SH7724 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7724 registers */
+#define FRQCRA		0xa4150000
+#define FRQCRB		0xa4150004
+#define VCLKCR		0xa4150048
+#define FCLKACR		0xa4150008
+#define FCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define SPUCLKCR	0xa415003c
+#define FLLFRQ		0xa4150050
+#define LSTATS		0xa4150060
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The fll multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long fll_recalc(struct clk *clk)
+{
+	unsigned long mult = 0;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(FLLFRQ) & 0x3ff;
+
+	if (__raw_readl(FLLFRQ) & 0x4000)
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops fll_clk_ops = {
+	.recalc		= fll_recalc,
+};
+
+static struct clk fll_clk = {
+	.name           = "fll_clk",
+	.id             = -1,
+	.ops		= &fll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCRA) >> 24) & 0x3f) + 1) * 2;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+/* A fixed divide-by-3 block use by the div6 clocks */
+static unsigned long div3_recalc(struct clk *clk)
+{
+	return clk->parent->rate / 3;
+}
+
+static struct clk_ops div3_clk_ops = {
+	.recalc		= div3_recalc,
+};
+
+static struct clk div3_clk = {
+	.name		= "div3_clk",
+	.id		= -1,
+	.ops		= &div3_clk_ops,
+	.parent		= &pll_clk,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&fll_clk,
+	&pll_clk,
+	&div3_clk,
+};
+
+static int divisors[] = { 2, 0, 4, 6, 8, 12, 16, 0, 24, 32, 36, 48, 0, 72 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+};
+
+enum { DIV4_I, DIV4_SH, DIV4_B, DIV4_P, DIV4_M1, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCRA, 20, 0x2f7d, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCRA, 12, 0x2f7c, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCRA, 8, 0x2f7c, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCRA, 0, 0x2f7c, 0),
+	[DIV4_M1] = DIV4("vpu_clk", FRQCRB, 4, 0x2f7c, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &div3_clk, VCLKCR, 0),
+	SH_CLK_DIV6("fsia_clk", &div3_clk, FCLKACR, 0),
+	SH_CLK_DIV6("fsib_clk", &div3_clk, FCLKBCR, 0),
+	SH_CLK_DIV6("irda_clk", &div3_clk, IRDACLKCR, 0),
+	SH_CLK_DIV6("spu_clk", &div3_clk, SPUCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _force_on, _need_cpg, _need_ram) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _force_on * CLK_ENABLE_ON_INIT)
+
+static struct clk mstp_clks[] = {
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, 1, 1, 0),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, 1, 1, 0),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, 1, 1, 0),
+	MSTP("rs0", &div4_clks[DIV4_B], MSTPCR0, 28, 1, 1, 0),
+	MSTP("ilmem0", &div4_clks[DIV4_I], MSTPCR0, 27, 1, 1, 0),
+	MSTP("l2c0", &div4_clks[DIV4_SH], MSTPCR0, 26, 1, 1, 0),
+	MSTP("fpu0", &div4_clks[DIV4_I], MSTPCR0, 24, 1, 1, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 1, 1, 0),
+	MSTP("dmac0", &div4_clks[DIV4_B], MSTPCR0, 21, 0, 1, 1),
+	MSTP("sh0", &div4_clks[DIV4_SH], MSTPCR0, 20, 0, 1, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0, 1, 0),
+	MSTP("ubc0", &div4_clks[DIV4_I], MSTPCR0, 17, 0, 1, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0, 1, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0, 0, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0, 0, 0),
+	MSTP("dmac1", &div4_clks[DIV4_B], MSTPCR0, 12, 0, 1, 1),
+	MSTP("tmu1", &div4_clks[DIV4_P], MSTPCR0, 10, 0, 1, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 9, 0, 1, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 8, 0, 1, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 7, 0, 1, 0),
+	MSTP("scif3", &div4_clks[DIV4_B], MSTPCR0, 6, 0, 1, 0),
+	MSTP("scif4", &div4_clks[DIV4_B], MSTPCR0, 5, 0, 1, 0),
+	MSTP("scif5", &div4_clks[DIV4_B], MSTPCR0, 4, 0, 1, 0),
+	MSTP("msiof0", &div4_clks[DIV4_B], MSTPCR0, 2, 0, 1, 0),
+	MSTP("msiof1", &div4_clks[DIV4_B], MSTPCR0, 1, 0, 1, 0),
+
+	MSTP("keysc0", &r_clk, MSTPCR1, 12, 0, 0, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 11, 0, 0, 0),
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0, 1, 0),
+	MSTP("i2c1", &div4_clks[DIV4_P], MSTPCR1, 8, 0, 1, 0),
+
+	MSTP("mmc0", &div4_clks[DIV4_B], MSTPCR2, 29, 0, 1, 0),
+	MSTP("eth0", &div4_clks[DIV4_B], MSTPCR2, 28, 0, 1, 0),
+	MSTP("atapi0", &div4_clks[DIV4_B], MSTPCR2, 26, 0, 1, 0),
+	MSTP("tpu0", &div4_clks[DIV4_B], MSTPCR2, 25, 0, 1, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0, 1, 0),
+	MSTP("tsif0", &div4_clks[DIV4_B], MSTPCR2, 22, 0, 1, 0),
+	MSTP("usb1", &div4_clks[DIV4_B], MSTPCR2, 21, 0, 1, 1),
+	MSTP("usb0", &div4_clks[DIV4_B], MSTPCR2, 20, 0, 1, 1),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 19, 0, 1, 1),
+	MSTP("sdhi0", &div4_clks[DIV4_B], MSTPCR2, 18, 0, 1, 0),
+	MSTP("sdhi1", &div4_clks[DIV4_B], MSTPCR2, 17, 0, 1, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 15, 1, 1, 1),
+	MSTP("ceu1", &div4_clks[DIV4_B], MSTPCR2, 13, 0, 1, 1),
+	MSTP("beu1", &div4_clks[DIV4_B], MSTPCR2, 12, 0, 1, 1),
+	MSTP("2ddmac0", &div4_clks[DIV4_SH], MSTPCR2, 10, 0, 1, 1),
+	MSTP("spu0", &div4_clks[DIV4_B], MSTPCR2, 9, 0, 1, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, 1, 1, 1),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0, 1, 1),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0, 1, 1),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0, 1, 1),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, 1, 1, 1),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, 1, 1, 1),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0, 1, 1),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or fll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &fll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
-- 
cgit v0.10.2


From bc49b6eaac6eff86f902a36d846c310e1e0beedf Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:32:11 +0000
Subject: sh: sh7343 clock framework rewrite

This patch rewrites the sh7343 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7b564bd..3980714 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 02a0b17..f4415f9 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -24,7 +24,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7770)	:= clock-sh7770.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7780)	:= clock-sh7780.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7785)	:= clock-sh7785.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7343.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
new file mode 100644
index 0000000..0ee3ee8
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
@@ -0,0 +1,211 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7343.c
+ *
+ * SH7343 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7343 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
+
+static struct clk mstp_clks[] = {
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, CLK_ENABLE_ON_INIT),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, CLK_ENABLE_ON_INIT),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, CLK_ENABLE_ON_INIT),
+	MSTP("uram0", &div4_clks[DIV4_U], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("intc3", &div4_clks[DIV4_P], MSTPCR0, 23, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 0),
+	MSTP("dmac0", &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
+	MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
+	MSTP("scif3", &div4_clks[DIV4_P], MSTPCR0, 4, 0),
+	MSTP("sio0", &div4_clks[DIV4_P], MSTPCR0, 3, 0),
+	MSTP("siof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
+	MSTP("siof1", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+	MSTP("i2c1", &div4_clks[DIV4_P], MSTPCR1, 8, 0),
+
+	MSTP("tpu0", &div4_clks[DIV4_P], MSTPCR2, 25, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0),
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("mmcif0", &div4_clks[DIV4_P], MSTPCR2, 17, 0),
+	MSTP("sim0", &div4_clks[DIV4_P], MSTPCR2, 16, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0),
+	MSTP("tsif0", &div4_clks[DIV4_P], MSTPCR2, 13, 0),
+	MSTP("s3d40", &div4_clks[DIV4_P], MSTPCR2, 12, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, CLK_ENABLE_ON_INIT),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 5e08504..8aaaac2 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7343, SH7722 & SH7366 support for the clock framework
+ * SH7722 & SH7366 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -417,7 +417,6 @@ static struct clk_ops sh7722_frqcr_clk_ops = {
 /*
  * clock ops methods for SIU A/B and IrDA clock
  */
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
 static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 {
 	unsigned long r;
@@ -469,8 +468,6 @@ static struct clk_ops sh7722_siu_clk_ops = {
 	.disable = sh7722_siu_disable,
 };
 
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
-
 static int sh7722_video_enable(struct clk *clk)
 {
 	unsigned long r;
@@ -542,7 +539,6 @@ static struct clk sh7722_r_clock = {
 	.rate = 32768,
 };
 
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -559,7 +555,6 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
 static struct clk sh7722_irda_clock = {
@@ -659,30 +654,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7343)
-	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
-	MSTPCR("scif3", "peripheral_clk", 0, 4, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7366)
 	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
 	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
@@ -730,10 +701,8 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
 	&sh7722_irda_clock,
 #endif
-- 
cgit v0.10.2


From 4ed37394842e5f9c0c4ea087e8716d4458d03bfb Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:35:25 +0000
Subject: sh: sh7366 clock framework rewrite

This patch rewrites the sh7366 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 3980714..dbc10ff 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343 && !CPU_SUBTYPE_SH7366
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index f4415f9..96ea09c 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -28,7 +28,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7343.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7366.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
 # Pinmux setup
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7366.c b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
new file mode 100644
index 0000000..a95ebab
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
@@ -0,0 +1,211 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7366.c
+ *
+ * SH7366 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7366 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fef, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
+
+static struct clk mstp_clks[] = {
+	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, CLK_ENABLE_ON_INIT),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, CLK_ENABLE_ON_INIT),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, CLK_ENABLE_ON_INIT),
+	MSTP("rsmem0", &div4_clks[DIV4_SH], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("intc3", &div4_clks[DIV4_P], MSTPCR0, 23, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 0),
+	MSTP("dmac0", &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
+	MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
+	MSTP("msiof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
+	MSTP("sbr0", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+
+	MSTP("icb0", &div4_clks[DIV4_P], MSTPCR2, 27, 0),
+	MSTP("meram0", &div4_clks[DIV4_P], MSTPCR2, 26, 0),
+	MSTP("dacy1", &div4_clks[DIV4_P], MSTPCR2, 24, 0),
+	MSTP("dacy0", &div4_clks[DIV4_P], MSTPCR2, 23, 0),
+	MSTP("tsif0", &div4_clks[DIV4_P], MSTPCR2, 22, 0),
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("mmcif0", &div4_clks[DIV4_P], MSTPCR2, 17, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 9, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 7, CLK_ENABLE_ON_INIT),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 8aaaac2..6ca7c74 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7722 & SH7366 support for the clock framework
+ * SH7722 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -654,46 +654,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7366)
-	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("rsmem0", "sh_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "cpu_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("intc30", "peripheral_clk", 0, 23, 0),
-	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "bus_clk", 0, 6, 0),
-	MSTPCR("scif2", "bus_clk", 0, 5, 0),
-	MSTPCR("msiof0", "peripheral_clk", 0, 2, 0),
-	MSTPCR("sbr0", "peripheral_clk", 0, 1, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("icb0", "bus_clk", 2, 27, 0),
-	MSTPCR("meram0", "sh_clk", 2, 26, 0),
-	MSTPCR("dacc0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("dacy0", "peripheral_clk", 2, 23, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("mmcif0", "bus_clk", 2, 17, 0),
-	MSTPCR("usb0", "bus_clk", 2, 11, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("veu1", "bus_clk", 2, 7, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 };
 
 static struct clk *sh7722_clocks[] = {
-- 
cgit v0.10.2


From 46e9371c0eaade8391b3969431a72272b9007158 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:37:59 +0000
Subject: sh: sh7722 clock framework rewrite

This patch rewrites the sh7722 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

While at it, now when all SuperH Mobile processors
are converted, fix CONFIG_SH_CLK_CPG_LEGACY to depend
on CONFIG_ARCH_SHMOBILE.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index dbc10ff..b5629cd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343 && !CPU_SUBTYPE_SH7366
+	def_bool y if !CPU_SUBTYPE_SH7785 && !ARCH_SHMOBILE
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 6ca7c74..40f8593 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,719 +1,197 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7722 support for the clock framework
+ * SH7722 clock framework support
  *
- * Copyright (c) 2006-2007 Nomad Global Solutions Inc
- * Based on code for sh7343 by Paul Mundt
+ * Copyright (C) 2009 Magnus Damm
  *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/io.h>
-#include <linux/errno.h>
-#include <linux/stringify.h>
 #include <asm/clock.h>
-#include <asm/freq.h>
-
-#define N  (-1)
-#define NM (-2)
-#define ROUND_NEAREST 0
-#define ROUND_DOWN -1
-#define ROUND_UP   +1
-
-static int adjust_algos[][3] = {
-	{},	/* NO_CHANGE */
-	{ NM, N, 1 },   /* N:1, N:1 */
-	{ 3, 2, 2 },	/* 3:2:2 */
-	{ 5, 2, 2 },    /* 5:2:2 */
-	{ N, 1, 1 },	/* N:1:1 */
-
-	{ N, 1 },	/* N:1 */
 
-	{ N, 1 },	/* N:1 */
-	{ 3, 2 },
-	{ 4, 3 },
-	{ 5, 4 },
-
-	{ N, 1 }
+/* SH7722 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
 };
 
-static unsigned long adjust_pair_of_clocks(unsigned long r1, unsigned long r2,
-			int m1, int m2, int round_flag)
-{
-	unsigned long rem, div;
-	int the_one = 0;
-
-	pr_debug( "Actual values: r1 = %ld\n", r1);
-	pr_debug( "...............r2 = %ld\n", r2);
-
-	if (m1 == m2) {
-		r2 = r1;
-		pr_debug( "setting equal rates: r2 now %ld\n", r2);
-	} else if ((m2 == N  && m1 == 1) ||
-		   (m2 == NM && m1 == N)) { /* N:1 or NM:N */
-		pr_debug( "Setting rates as 1:N (N:N*M)\n");
-		rem = r2 % r1;
-		pr_debug( "...remainder = %ld\n", rem);
-		if (rem) {
-			div = r2 / r1;
-			pr_debug( "...div = %ld\n", div);
-			switch (round_flag) {
-			case ROUND_NEAREST:
-				the_one = rem >= r1/2 ? 1 : 0; break;
-			case ROUND_UP:
-				the_one = 1; break;
-			case ROUND_DOWN:
-				the_one = 0; break;
-			}
-
-			r2 = r1 * (div + the_one);
-			pr_debug( "...setting r2 to %ld\n", r2);
-		}
-	} else if ((m2 == 1  && m1 == N) ||
-		   (m2 == N && m1 == NM)) { /* 1:N or N:NM */
-		pr_debug( "Setting rates as N:1 (N*M:N)\n");
-		rem = r1 % r2;
-		pr_debug( "...remainder = %ld\n", rem);
-		if (rem) {
-			div = r1 / r2;
-			pr_debug( "...div = %ld\n", div);
-			switch (round_flag) {
-			case ROUND_NEAREST:
-				the_one = rem > r2/2 ? 1 : 0; break;
-			case ROUND_UP:
-				the_one = 0; break;
-			case ROUND_DOWN:
-				the_one = 1; break;
-			}
-
-			r2 = r1 / (div + the_one);
-			pr_debug( "...setting r2 to %ld\n", r2);
-		}
-	} else { /* value:value */
-		pr_debug( "Setting rates as %d:%d\n", m1, m2);
-		div = r1 / m1;
-		r2 = div * m2;
-		pr_debug( "...div = %ld\n", div);
-		pr_debug( "...setting r2 to %ld\n", r2);
-	}
-
-	return r2;
-}
-
-static void adjust_clocks(int originate, int *l, unsigned long v[],
-			  int n_in_line)
-{
-	int x;
-
-	pr_debug( "Go down from %d...\n", originate);
-	/* go up recalculation clocks */
-	for (x = originate; x>0; x -- )
-		v[x-1] = adjust_pair_of_clocks(v[x], v[x-1],
-					l[x], l[x-1],
-					ROUND_UP);
-
-	pr_debug( "Go up from %d...\n", originate);
-	/* go down recalculation clocks */
-	for (x = originate; x<n_in_line - 1; x ++ )
-		v[x+1] = adjust_pair_of_clocks(v[x], v[x+1],
-					l[x], l[x+1],
-					ROUND_UP);
-}
-
-
 /*
- * SH7722 uses a common set of multipliers and divisors, so this
- * is quite simple..
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
  */
-
-#define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
-
-/*
- * Instead of having two separate multipliers/divisors set, like this:
- *
- * static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
- * static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
- *
- * I created the divisors2 array, which is used to calculate rate like
- *   rate = parent * 2 / divisors2[ divisor ];
-*/
-static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
-
-static unsigned long master_clk_recalc(struct clk *clk)
-{
-	unsigned frqcr = ctrl_inl(FRQCR);
-
-	return CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
-}
-
-static void master_clk_init(struct clk *clk)
-{
-	clk->parent = NULL;
-	clk->rate = master_clk_recalc(clk);
-}
-
-static unsigned long module_clk_recalc(struct clk *clk)
-{
-	unsigned long frqcr = ctrl_inl(FRQCR);
-
-	return clk->parent->rate / STCPLL(frqcr);
-}
-
-#define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
-#define STCMASK		0x1f
-#define DIVCALC(div)	(div-1)
-#define FRQCRKICK	0x00000000
-
-static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
-{
-	int div = rate / clk->rate;
-	int master_divs[] = MASTERDIVS;
-	int index;
-	unsigned long frqcr;
-
-	for (index = 1; index < ARRAY_SIZE(master_divs); index++)
-		if (div >= master_divs[index - 1] && div < master_divs[index])
-			break;
-
-	if (index >= ARRAY_SIZE(master_divs))
-		index = ARRAY_SIZE(master_divs);
-	div = master_divs[index - 1];
-
-	frqcr = ctrl_inl(FRQCR);
-	frqcr &= ~(STCMASK << 24);
-	frqcr |= (DIVCALC(div) << 24);
-	frqcr |= FRQCRKICK;
-	ctrl_outl(frqcr, FRQCR);
-
-	return 0;
-}
-
-static struct clk_ops sh7722_master_clk_ops = {
-	.init = master_clk_init,
-	.recalc = master_clk_recalc,
-	.set_rate = master_clk_setrate,
-};
-
-static struct clk_ops sh7722_module_clk_ops = {
-       .recalc = module_clk_recalc,
-};
-
-struct frqcr_context {
-	unsigned mask;
-	unsigned shift;
-};
-
-struct frqcr_context sh7722_get_clk_context(const char *name)
-{
-	struct frqcr_context ctx = { 0, };
-
-	if (!strcmp(name, "peripheral_clk")) {
-		ctx.shift = 0;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "sdram_clk")) {
-		ctx.shift = 4;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "bus_clk")) {
-		ctx.shift = 8;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "sh_clk")) {
-		ctx.shift = 12;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "umem_clk")) {
-		ctx.shift = 16;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "cpu_clk")) {
-		ctx.shift = 20;
-		ctx.mask = 7;
-	}
-	return ctx;
-}
-
-/**
- * sh7722_find_div_index - find divisor for setting rate
- *
- * All sh7722 clocks use the same set of multipliers/divisors. This function
- * chooses correct divisor to set the rate of clock with parent clock that
- * generates frequency of 'parent_rate'
- *
- * @parent_rate: rate of parent clock
- * @rate: requested rate to be set
- */
-static int sh7722_find_div_index(unsigned long parent_rate, unsigned rate)
-{
-	unsigned div2 = parent_rate * 2 / rate;
-	int index;
-
-	if (rate > parent_rate)
-		return -EINVAL;
-
-	for (index = 1; index < ARRAY_SIZE(divisors2); index++) {
-		if (div2 > divisors2[index - 1] && div2 <= divisors2[index])
-			break;
-	}
-	if (index >= ARRAY_SIZE(divisors2))
-		index = ARRAY_SIZE(divisors2) - 1;
-	return index;
-}
-
-static unsigned long sh7722_frqcr_recalc(struct clk *clk)
-{
-	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
-	unsigned long frqcr = ctrl_inl(FRQCR);
-	int index;
-
-	index = (frqcr >> ctx.shift) & ctx.mask;
-	return clk->parent->rate * 2 / divisors2[index];
-}
-
-static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
-				 int algo_id)
-{
-	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
-	unsigned long parent_rate = clk->parent->rate;
-	int div;
-	unsigned long frqcr;
-	int err = 0;
-
-	/* pretty invalid */
-	if (parent_rate < rate)
-		return -EINVAL;
-
-	/* look for multiplier/divisor pair */
-	div = sh7722_find_div_index(parent_rate, rate);
-	if (div<0)
-		return div;
-
-	/* calculate new value of clock rate */
-	clk->rate = parent_rate * 2 / divisors2[div];
-	frqcr = ctrl_inl(FRQCR);
-
-	/* FIXME: adjust as algo_id specifies */
-	if (algo_id != NO_CHANGE) {
-		int originator;
-		char *algo_group_1[] = { "cpu_clk", "umem_clk", "sh_clk" };
-		char *algo_group_2[] = { "sh_clk", "bus_clk" };
-		char *algo_group_3[] = { "sh_clk", "sdram_clk" };
-		char *algo_group_4[] = { "bus_clk", "peripheral_clk" };
-		char *algo_group_5[] = { "cpu_clk", "peripheral_clk" };
-		char **algo_current = NULL;
-		/* 3 is the maximum number of clocks in relation */
-		struct clk *ck[3];
-		unsigned long values[3]; /* the same comment as above */
-		int part_length = -1;
-		int i;
-
-		/*
-		 * all the steps below only required if adjustion was
-		 * requested
-		 */
-		if (algo_id == IUS_N1_N1 ||
-		    algo_id == IUS_322 ||
-		    algo_id == IUS_522 ||
-		    algo_id == IUS_N11) {
-			algo_current = algo_group_1;
-			part_length = 3;
-		}
-		if (algo_id == SB_N1) {
-			algo_current = algo_group_2;
-			part_length = 2;
-		}
-		if (algo_id == SB3_N1 ||
-		    algo_id == SB3_32 ||
-		    algo_id == SB3_43 ||
-		    algo_id == SB3_54) {
-			algo_current = algo_group_3;
-			part_length = 2;
-		}
-		if (algo_id == BP_N1) {
-			algo_current = algo_group_4;
-			part_length = 2;
-		}
-		if (algo_id == IP_N1) {
-			algo_current = algo_group_5;
-			part_length = 2;
-		}
-		if (!algo_current)
-			goto incorrect_algo_id;
-
-		originator = -1;
-		for (i = 0; i < part_length; i ++ ) {
-			if (originator >= 0 && !strcmp(clk->name,
-						       algo_current[i]))
-				originator = i;
-			ck[i] = clk_get(NULL, algo_current[i]);
-			values[i] = clk_get_rate(ck[i]);
-		}
-
-		if (originator >= 0)
-			adjust_clocks(originator, adjust_algos[algo_id],
-				      values, part_length);
-
-		for (i = 0; i < part_length; i ++ ) {
-			struct frqcr_context part_ctx;
-			int part_div;
-
-			if (likely(!err)) {
-				part_div = sh7722_find_div_index(parent_rate,
-								rate);
-				if (part_div > 0) {
-					part_ctx = sh7722_get_clk_context(
-								ck[i]->name);
-					frqcr &= ~(part_ctx.mask <<
-						   part_ctx.shift);
-					frqcr |= part_div << part_ctx.shift;
-				} else
-					err = part_div;
-			}
-
-			ck[i]->ops->recalc(ck[i]);
-			clk_put(ck[i]);
-		}
-	}
-
-	/* was there any error during recalculation ? If so, bail out.. */
-	if (unlikely(err!=0))
-		goto out_err;
-
-	/* clear FRQCR bits */
-	frqcr &= ~(ctx.mask << ctx.shift);
-	frqcr |= div << ctx.shift;
-	frqcr |= FRQCRKICK;
-
-	/* ...and perform actual change */
-	ctrl_outl(frqcr, FRQCR);
-	return 0;
-
-incorrect_algo_id:
-	return -EINVAL;
-out_err:
-	return err;
-}
-
-static long sh7722_frqcr_round_rate(struct clk *clk, unsigned long rate)
-{
-	unsigned long parent_rate = clk->parent->rate;
-	int div;
-
-	/* look for multiplier/divisor pair */
-	div = sh7722_find_div_index(parent_rate, rate);
-	if (div < 0)
-		return clk->rate;
-
-	/* calculate new value of clock rate */
-	return parent_rate * 2 / divisors2[div];
-}
-
-static struct clk_ops sh7722_frqcr_clk_ops = {
-	.recalc = sh7722_frqcr_recalc,
-	.set_rate = sh7722_frqcr_set_rate,
-	.round_rate = sh7722_frqcr_round_rate,
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
-/*
- * clock ops methods for SIU A/B and IrDA clock
- */
-static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
-{
-	unsigned long r;
-	int div;
-
-	r = ctrl_inl(clk->arch_flags);
-	div = sh7722_find_div_index(clk->parent->rate, rate);
-	if (div < 0)
-		return div;
-	r = (r & ~0xF) | div;
-	ctrl_outl(r, clk->arch_flags);
-	return 0;
-}
-
-static unsigned long sh7722_siu_recalc(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(clk->arch_flags);
-	return clk->parent->rate * 2 / divisors2[r & 0xF];
-}
-
-static int sh7722_siu_start_stop(struct clk *clk, int enable)
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
 {
-	unsigned long r;
+	unsigned long mult;
 
-	r = ctrl_inl(clk->arch_flags);
-	if (enable)
-		ctrl_outl(r & ~(1 << 8), clk->arch_flags);
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
 	else
-		ctrl_outl(r | (1 << 8), clk->arch_flags);
-	return 0;
-}
+		mult = 0;
 
-static int sh7722_siu_enable(struct clk *clk)
-{
-	return sh7722_siu_start_stop(clk, 1);
-}
-
-static void sh7722_siu_disable(struct clk *clk)
-{
-	sh7722_siu_start_stop(clk, 0);
+	return clk->parent->rate * mult;
 }
 
-static struct clk_ops sh7722_siu_clk_ops = {
-	.recalc = sh7722_siu_recalc,
-	.set_rate = sh7722_siu_set_rate,
-	.enable = sh7722_siu_enable,
-	.disable = sh7722_siu_disable,
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
 };
 
-static int sh7722_video_enable(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(VCLKCR);
-	ctrl_outl( r & ~(1<<8), VCLKCR);
-	return 0;
-}
-
-static void sh7722_video_disable(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(VCLKCR);
-	ctrl_outl( r | (1<<8), VCLKCR);
-}
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
 
-static int sh7722_video_set_rate(struct clk *clk, unsigned long rate,
-				 int algo_id)
+static unsigned long pll_recalc(struct clk *clk)
 {
-	unsigned long r;
+	unsigned long mult = 1;
+	unsigned long div = 1;
 
-	r = ctrl_inl(VCLKCR);
-	r &= ~0x3F;
-	r |= ((clk->parent->rate / rate - 1) & 0x3F);
-	ctrl_outl(r, VCLKCR);
-	return 0;
-}
-
-static unsigned long sh7722_video_recalc(struct clk *clk)
-{
-	unsigned long r;
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
 
-	r = ctrl_inl(VCLKCR);
-	return clk->parent->rate / ((r & 0x3F) + 1);
+	return (clk->parent->rate * mult) / div;
 }
 
-static struct clk_ops sh7722_video_clk_ops = {
-	.recalc = sh7722_video_recalc,
-	.set_rate = sh7722_video_set_rate,
-	.enable = sh7722_video_enable,
-	.disable = sh7722_video_disable,
-};
-/*
- * and at last, clock definitions themselves
- */
-static struct clk sh7722_umem_clock = {
-	.name = "umem_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
 };
 
-static struct clk sh7722_sh_clock = {
-	.name = "sh_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
-static struct clk sh7722_peripheral_clock = {
-	.name = "peripheral_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
 };
 
-static struct clk sh7722_sdram_clock = {
-	.name = "sdram_clk",
-	.ops = &sh7722_frqcr_clk_ops,
-};
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
 
-static struct clk sh7722_r_clock = {
-	.name = "r_clk",
-	.rate = 32768,
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
 };
 
-/*
- * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
- * methods of clk_ops determine which register they should access by
- * examining clk->name field
- */
-static struct clk sh7722_siu_a_clock = {
-	.name = "siu_a_clk",
-	.arch_flags = SCLKACR,
-	.ops = &sh7722_siu_clk_ops,
-};
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_IRDA, DIV4_NR };
 
-static struct clk sh7722_siu_b_clock = {
-	.name = "siu_b_clk",
-	.arch_flags = SCLKBCR,
-	.ops = &sh7722_siu_clk_ops,
-};
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-static struct clk sh7722_irda_clock = {
-	.name = "irda_clk",
-	.arch_flags = IrDACLKCR,
-	.ops = &sh7722_siu_clk_ops,
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fef, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+	[DIV4_IRDA] = DIV4("irda_clk", IRDACLKCR, 0, 0x1fff, 0),
 };
-#endif
 
-static struct clk sh7722_video_clock = {
-	.name = "video_clk",
-	.ops = &sh7722_video_clk_ops,
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
 };
 
-#define MSTPCR_ARCH_FLAGS(reg, bit) (((reg) << 8) | (bit))
-#define MSTPCR_ARCH_FLAGS_REG(value) ((value) >> 8)
-#define MSTPCR_ARCH_FLAGS_BIT(value) ((value) & 0xff)
-
-static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
-{
-	unsigned long bit = MSTPCR_ARCH_FLAGS_BIT(clk->arch_flags);
-	unsigned long reg;
-	unsigned long r;
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
 
-	switch(MSTPCR_ARCH_FLAGS_REG(clk->arch_flags)) {
-	case 0:
-		reg = MSTPCR0;
-		break;
-	case 1:
-		reg = MSTPCR1;
-		break;
-	case 2:
-		reg = MSTPCR2;
-		break;
-	default:
-		return -EINVAL;
-	}
+static struct clk mstp_clks[] = {
+	MSTP("uram0", &div4_clks[DIV4_U], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
 
-	r = ctrl_inl(reg);
-
-	if (enable)
-		r &= ~(1 << bit);
-	else
-		r |= (1 << bit);
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 8, 0),
 
-	ctrl_outl(r, reg);
-	return 0;
-}
-
-static int sh7722_mstpcr_enable(struct clk *clk)
-{
-	return sh7722_mstpcr_start_stop(clk, 1);
-}
-
-static void sh7722_mstpcr_disable(struct clk *clk)
-{
-	sh7722_mstpcr_start_stop(clk, 0);
-}
-
-static struct clk_ops sh7722_mstpcr_clk_ops = {
-	.enable = sh7722_mstpcr_enable,
-	.disable = sh7722_mstpcr_disable,
-	.recalc = followparent_recalc,
-};
-
-#define MSTPCR(_name, _parent, regnr, bitnr, _flags) \
-{						\
-	.name = _name,				\
-	.flags = _flags,			\
-	.arch_flags = MSTPCR_ARCH_FLAGS(regnr, bitnr),	\
-	.ops = (void *)_parent,		\
-}
-
-static struct clk sh7722_mstpcr_clocks[] = {
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("rtc0", "r_clk", 1, 8, 0),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 9, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
-};
-
-static struct clk *sh7722_clocks[] = {
-	&sh7722_umem_clock,
-	&sh7722_sh_clock,
-	&sh7722_peripheral_clock,
-	&sh7722_sdram_clock,
-	&sh7722_siu_a_clock,
-	&sh7722_siu_b_clock,
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	&sh7722_irda_clock,
-#endif
-	&sh7722_video_clock,
-};
-
-/*
- * init in order: master, module, bus, cpu
- */
-struct clk_ops *onchip_ops[] = {
-	&sh7722_master_clk_ops,
-	&sh7722_module_clk_ops,
-	&sh7722_frqcr_clk_ops,
-	&sh7722_frqcr_clk_ops,
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 9, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, CLK_ENABLE_ON_INIT),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
 };
 
-void __init
-arch_init_clk_ops(struct clk_ops **ops, int type)
-{
-	BUG_ON(type < 0 || type >= ARRAY_SIZE(onchip_ops));
-	*ops = onchip_ops[type];
-}
-
 int __init arch_clk_init(void)
 {
-	struct clk *clk;
-	int i;
+	int k, ret = 0;
 
-	cpg_clk_init();
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
 
-	clk = clk_get(NULL, "master_clk");
-	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
-		pr_debug( "Registering clock '%s'\n", sh7722_clocks[i]->name);
-		sh7722_clocks[i]->parent = clk;
-		clk_register(sh7722_clocks[i]);
-	}
-	clk_put(clk);
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
 
-	clk_register(&sh7722_r_clock);
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
 
-	for (i = 0; i < ARRAY_SIZE(sh7722_mstpcr_clocks); i++) {
-		pr_debug( "Registering mstpcr clock '%s'\n",
-			  sh7722_mstpcr_clocks[i].name);
-		clk = clk_get(NULL, (void *) sh7722_mstpcr_clocks[i].ops);
-		sh7722_mstpcr_clocks[i].parent = clk;
-		sh7722_mstpcr_clocks[i].ops = &sh7722_mstpcr_clk_ops;
-		clk_register(&sh7722_mstpcr_clocks[i]);
-		clk_put(clk);
-	}
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
 
-	propagate_rate(&sh7722_r_clock); /* make sure rate gets propagated */
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
-	return 0;
+	return ret;
 }
-- 
cgit v0.10.2


From c5eeff1f8ecbc4bc7c1dd8e97a8610bc4dd3def8 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Mon, 8 Jun 2009 10:05:12 +0000
Subject: sh: sh7724: INTC setting update

This patch follows Rev 0.50 manual

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 000f3b8..585dd85 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -511,46 +511,46 @@ enum {
 	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7,
 	HUDI,
 	DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3,
-	_2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK,
+	_2DG_TRI, _2DG_INI, _2DG_CEI,
 	DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3,
-	VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI,
-	SCIFA_SCIFA0,
-	VPU_VPUI,
-	TPU_TPUI,
-	CEU21I,
-	BEU21I,
-	USB_USI0,
+	VIO_CEU0, VIO_BEU0, VIO_VEU1, VIO_VOU,
+	SCIFA3,
+	VPU,
+	TPU,
+	CEU1,
+	BEU1,
+	USB0, USB1,
 	ATAPI,
 	RTC_ATI, RTC_PRI, RTC_CUI,
 	DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR,
 	DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR,
-	KEYSC_KEYI,
+	KEYSC,
 	SCIF_SCIF0, SCIF_SCIF1, SCIF_SCIF2,
-	VEU3F0I,
+	VEU0,
 	MSIOF_MSIOFI0, MSIOF_MSIOFI1,
 	SPU_SPUI0, SPU_SPUI1,
-	SCIFA_SCIFA1,
-/*	ICB_ICBI, */
+	SCIFA4,
+	ICB,
 	ETHI,
 	I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI,
 	I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI,
-	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2,
-	CMT_CMTI,
-	TSIF_TSIFI,
-/*	ICB_LMBI, */
-	FSI_FSI,
-	SCIFA_SCIFA2,
+	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2, SDHI0_SDHII3,
+	CMT,
+	TSIF,
+	FSI,
+	SCIFA5,
 	TMU0_TUNI0, TMU0_TUNI1, TMU0_TUNI2,
-	IRDA_IRDAI,
+	IRDA,
 	SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2,
-	JPU_JPUI,
-	MMC_MMCI0, MMC_MMCI1, MMC_MMCI2,
-	LCDC_LCDCI,
+	JPU,
+	_2DDMAC,
+	MMC_MMC2I, MMC_MMC3I,
+	LCDC,
 	TMU1_TUNI0, TMU1_TUNI1, TMU1_TUNI2,
 
 	/* interrupt groups */
-	DMAC1A, _2DG, DMAC0A, VIO, RTC,
-	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMC,
+	DMAC1A, _2DG, DMAC0A, VIO, USB, RTC,
+	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMCIF,
 };
 
 static struct intc_vect vectors[] __initdata = {
@@ -567,25 +567,25 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(_2DG_TRI, 0x780),
 	INTC_VECT(_2DG_INI, 0x7A0),
 	INTC_VECT(_2DG_CEI, 0x7C0),
-	INTC_VECT(_2DG_BRK, 0x7E0),
 
 	INTC_VECT(DMAC0A_DEI0, 0x800),
 	INTC_VECT(DMAC0A_DEI1, 0x820),
 	INTC_VECT(DMAC0A_DEI2, 0x840),
 	INTC_VECT(DMAC0A_DEI3, 0x860),
 
-	INTC_VECT(VIO_CEU20I, 0x880),
-	INTC_VECT(VIO_BEU20I, 0x8A0),
-	INTC_VECT(VIO_VEU3F1, 0x8C0),
-	INTC_VECT(VIO_VOUI, 0x8E0),
+	INTC_VECT(VIO_CEU0, 0x880),
+	INTC_VECT(VIO_BEU0, 0x8A0),
+	INTC_VECT(VIO_VEU1, 0x8C0),
+	INTC_VECT(VIO_VOU,  0x8E0),
 
-	INTC_VECT(SCIFA_SCIFA0, 0x900),
-	INTC_VECT(VPU_VPUI, 0x980),
-	INTC_VECT(TPU_TPUI, 0x9A0),
-	INTC_VECT(CEU21I, 0x9E0),
-	INTC_VECT(BEU21I, 0xA00),
-	INTC_VECT(USB_USI0, 0xA20),
-	INTC_VECT(ATAPI, 0xA60),
+	INTC_VECT(SCIFA3, 0x900),
+	INTC_VECT(VPU,    0x980),
+	INTC_VECT(TPU,    0x9A0),
+	INTC_VECT(CEU1,   0x9E0),
+	INTC_VECT(BEU1,   0xA00),
+	INTC_VECT(USB0,   0xA20),
+	INTC_VECT(USB1,   0xA40),
+	INTC_VECT(ATAPI,  0xA60),
 
 	INTC_VECT(RTC_ATI, 0xA80),
 	INTC_VECT(RTC_PRI, 0xAA0),
@@ -599,18 +599,18 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(DMAC0B_DEI5, 0xBA0),
 	INTC_VECT(DMAC0B_DADERR, 0xBC0),
 
-	INTC_VECT(KEYSC_KEYI, 0xBE0),
+	INTC_VECT(KEYSC,      0xBE0),
 	INTC_VECT(SCIF_SCIF0, 0xC00),
 	INTC_VECT(SCIF_SCIF1, 0xC20),
 	INTC_VECT(SCIF_SCIF2, 0xC40),
-	INTC_VECT(VEU3F0I, 0xC60),
+	INTC_VECT(VEU0,       0xC60),
 	INTC_VECT(MSIOF_MSIOFI0, 0xC80),
 	INTC_VECT(MSIOF_MSIOFI1, 0xCA0),
 	INTC_VECT(SPU_SPUI0, 0xCC0),
 	INTC_VECT(SPU_SPUI1, 0xCE0),
-	INTC_VECT(SCIFA_SCIFA1, 0xD00),
+	INTC_VECT(SCIFA4,    0xD00),
 
-/*	INTC_VECT(ICB_ICBI, 0xD20), */
+	INTC_VECT(ICB,  0xD20),
 	INTC_VECT(ETHI, 0xD60),
 
 	INTC_VECT(I2C1_ALI, 0xD80),
@@ -626,30 +626,30 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(SDHI0_SDHII0, 0xE80),
 	INTC_VECT(SDHI0_SDHII1, 0xEA0),
 	INTC_VECT(SDHI0_SDHII2, 0xEC0),
+	INTC_VECT(SDHI0_SDHII3, 0xEE0),
 
-	INTC_VECT(CMT_CMTI, 0xF00),
-	INTC_VECT(TSIF_TSIFI, 0xF20),
-/*	INTC_VECT(ICB_LMBI, 0xF60), */
-	INTC_VECT(FSI_FSI, 0xF80),
-	INTC_VECT(SCIFA_SCIFA2, 0xFA0),
+	INTC_VECT(CMT,    0xF00),
+	INTC_VECT(TSIF,   0xF20),
+	INTC_VECT(FSI,    0xF80),
+	INTC_VECT(SCIFA5, 0xFA0),
 
 	INTC_VECT(TMU0_TUNI0, 0x400),
 	INTC_VECT(TMU0_TUNI1, 0x420),
 	INTC_VECT(TMU0_TUNI2, 0x440),
 
-	INTC_VECT(IRDA_IRDAI, 0x480),
+	INTC_VECT(IRDA,    0x480),
 
 	INTC_VECT(SDHI1_SDHII0, 0x4E0),
 	INTC_VECT(SDHI1_SDHII1, 0x500),
 	INTC_VECT(SDHI1_SDHII2, 0x520),
 
-	INTC_VECT(JPU_JPUI, 0x560),
+	INTC_VECT(JPU, 0x560),
+	INTC_VECT(_2DDMAC, 0x4A0),
 
-	INTC_VECT(MMC_MMCI0, 0x580),
-	INTC_VECT(MMC_MMCI1, 0x5A0),
-	INTC_VECT(MMC_MMCI2, 0x5C0),
+	INTC_VECT(MMC_MMC2I, 0x5A0),
+	INTC_VECT(MMC_MMC3I, 0x5C0),
 
-	INTC_VECT(LCDC_LCDCI, 0xF40),
+	INTC_VECT(LCDC, 0xF40),
 
 	INTC_VECT(TMU1_TUNI0, 0x920),
 	INTC_VECT(TMU1_TUNI1, 0x940),
@@ -658,86 +658,79 @@ static struct intc_vect vectors[] __initdata = {
 
 static struct intc_group groups[] __initdata = {
 	INTC_GROUP(DMAC1A, DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3),
-	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK),
+	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI),
 	INTC_GROUP(DMAC0A, DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3),
-	INTC_GROUP(VIO, VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI),
+	INTC_GROUP(VIO, VIO_CEU0, VIO_BEU0, VIO_VEU1, VIO_VOU),
+	INTC_GROUP(USB, USB0, USB1),
 	INTC_GROUP(RTC, RTC_ATI, RTC_PRI, RTC_CUI),
 	INTC_GROUP(DMAC1B, DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR),
 	INTC_GROUP(DMAC0B, DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR),
 	INTC_GROUP(I2C0, I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI),
 	INTC_GROUP(I2C1, I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI),
-	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2),
+	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2, SDHI0_SDHII3),
 	INTC_GROUP(SDHI1, SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2),
 	INTC_GROUP(SPU, SPU_SPUI0, SPU_SPUI1),
-	INTC_GROUP(MMC, MMC_MMCI0, MMC_MMCI1, MMC_MMCI2),
+	INTC_GROUP(MMCIF, MMC_MMC2I, MMC_MMC3I),
 };
 
-/* FIXMEEEEEEEEEEEEEEEEEEE !!!!! */
-/* very bad manual !! */
 static struct intc_mask_reg mask_registers[] __initdata = {
 	{ 0xa4080080, 0xa40800c0, 8, /* IMR0 / IMCR0 */
 	  { 0, TMU1_TUNI2, TMU1_TUNI1, TMU1_TUNI0,
-	    /*SDHII3?*/0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
+	    0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
 	{ 0xa4080084, 0xa40800c4, 8, /* IMR1 / IMCR1 */
-	  { VIO_VOUI, VIO_VEU3F1, VIO_BEU20I, VIO_CEU20I,
+	  { VIO_VOU, VIO_VEU1, VIO_BEU0, VIO_CEU0,
 	    DMAC0A_DEI3, DMAC0A_DEI2, DMAC0A_DEI1, DMAC0A_DEI0 } },
 	{ 0xa4080088, 0xa40800c8, 8, /* IMR2 / IMCR2 */
-	  { 0, 0, 0, VPU_VPUI, ATAPI, ETHI, 0, /*SCIFA3*/SCIFA_SCIFA0 } },
+	  { 0, 0, 0, VPU, ATAPI, ETHI, 0, SCIFA3 } },
 	{ 0xa408008c, 0xa40800cc, 8, /* IMR3 / IMCR3 */
 	  { DMAC1A_DEI3, DMAC1A_DEI2, DMAC1A_DEI1, DMAC1A_DEI0,
-	    SPU_SPUI1, SPU_SPUI0, BEU21I, IRDA_IRDAI } },
+	    SPU_SPUI1, SPU_SPUI0, BEU1, IRDA } },
 	{ 0xa4080090, 0xa40800d0, 8, /* IMR4 / IMCR4 */
 	  { 0, TMU0_TUNI2, TMU0_TUNI1, TMU0_TUNI0,
-	    JPU_JPUI, 0, 0, LCDC_LCDCI } },
+	    JPU, 0, 0, LCDC } },
 	{ 0xa4080094, 0xa40800d4, 8, /* IMR5 / IMCR5 */
-	  { KEYSC_KEYI, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
-	    VEU3F0I, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
+	  { KEYSC, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
+	    VEU0, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
 	{ 0xa4080098, 0xa40800d8, 8, /* IMR6 / IMCR6 */
-	  { 0, 0, /*ICB_ICBI*/0, /*SCIFA4*/SCIFA_SCIFA1,
-	    CEU21I, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
+	  { 0, 0, ICB, SCIFA4,
+	    CEU1, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
 	{ 0xa408009c, 0xa40800dc, 8, /* IMR7 / IMCR7 */
 	  { I2C0_DTEI, I2C0_WAITI, I2C0_TACKI, I2C0_ALI,
 	    I2C1_DTEI, I2C1_WAITI, I2C1_TACKI, I2C1_ALI } },
 	{ 0xa40800a0, 0xa40800e0, 8, /* IMR8 / IMCR8 */
-	  { /*SDHII3*/0, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
-	    0, 0, /*SCIFA5*/SCIFA_SCIFA2, FSI_FSI } },
+	  { SDHI0_SDHII3, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
+	    0, 0, SCIFA5, FSI } },
 	{ 0xa40800a4, 0xa40800e4, 8, /* IMR9 / IMCR9 */
-	  { 0, 0, 0, CMT_CMTI, 0, /*USB1*/0, USB_USI0, 0 } },
+	  { 0, 0, 0, CMT, 0, USB1, USB0, 0 } },
 	{ 0xa40800a8, 0xa40800e8, 8, /* IMR10 / IMCR10 */
 	  { 0, DMAC1B_DADERR, DMAC1B_DEI5, DMAC1B_DEI4,
-	    0, RTC_ATI, RTC_PRI, RTC_CUI } },
+	    0, RTC_CUI, RTC_PRI, RTC_ATI } },
 	{ 0xa40800ac, 0xa40800ec, 8, /* IMR11 / IMCR11 */
-	  { _2DG_BRK, _2DG_CEI, _2DG_INI, _2DG_TRI,
-	    0, TPU_TPUI, /*ICB_LMBI*/0, TSIF_TSIFI } },
+	  { 0, _2DG_CEI, _2DG_INI, _2DG_TRI,
+	    0, TPU, 0, TSIF } },
 	{ 0xa40800b0, 0xa40800f0, 8, /* IMR12 / IMCR12 */
-	  { 0, 0, 0, 0, 0, 0, 0, 0/*2DDMAC*/ } },
+	  { 0, 0, MMC_MMC3I, MMC_MMC2I, 0, 0, 0, _2DDMAC } },
 	{ 0xa4140044, 0xa4140064, 8, /* INTMSK00 / INTMSKCLR00 */
 	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
 };
 
 static struct intc_prio_reg prio_registers[] __initdata = {
 	{ 0xa4080000, 0, 16, 4, /* IPRA */ { TMU0_TUNI0, TMU0_TUNI1,
-					     TMU0_TUNI2, IRDA_IRDAI } },
-	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU_JPUI, LCDC_LCDCI,
-					     DMAC1A, BEU21I } },
+					     TMU0_TUNI2, IRDA } },
+	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU, LCDC, DMAC1A, BEU1 } },
 	{ 0xa4080008, 0, 16, 4, /* IPRC */ { TMU1_TUNI0, TMU1_TUNI1,
 					     TMU1_TUNI2, SPU } },
-	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMC, 0, ATAPI } },
-	{ 0xa4080010, 0, 16, 4, /* IPRE */
-	  { DMAC0A, /*BEU?VEU?*/VIO, /*SCIFA3*/SCIFA_SCIFA0, /*VPU5F*/
-	    VPU_VPUI } },
-	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC_KEYI, DMAC0B,
-					     USB_USI0, CMT_CMTI } },
+	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMCIF, 0, ATAPI } },
+	{ 0xa4080010, 0, 16, 4, /* IPRE */ { DMAC0A, VIO, SCIFA3, VPU } },
+	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC, DMAC0B, USB, CMT } },
 	{ 0xa4080018, 0, 16, 4, /* IPRG */ { SCIF_SCIF0, SCIF_SCIF1,
-					     SCIF_SCIF2, VEU3F0I } },
+					     SCIF_SCIF2, VEU0 } },
 	{ 0xa408001c, 0, 16, 4, /* IPRH */ { MSIOF_MSIOFI0, MSIOF_MSIOFI1,
 					     I2C1, I2C0 } },
-	{ 0xa4080020, 0, 16, 4, /* IPRI */ { /*SCIFA4*/SCIFA_SCIFA1, /*ICB*/0,
-					     TSIF_TSIFI, _2DG/*ICB?*/ } },
-	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU21I, ETHI, FSI_FSI, SDHI1 } },
-	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, /*ICB?*/0, SDHI0 } },
-	{ 0xa408002c, 0, 16, 4, /* IPRL */ { /*SCIFA5*/SCIFA_SCIFA2, 0,
-					     TPU_TPUI, /*2DDMAC*/0 } },
+	{ 0xa4080020, 0, 16, 4, /* IPRI */ { SCIFA4, ICB, TSIF, _2DG } },
+	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU1, ETHI, FSI, SDHI1 } },
+	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, 0, SDHI0 } },
+	{ 0xa408002c, 0, 16, 4, /* IPRL */ { SCIFA5, 0, TPU, _2DDMAC } },
 	{ 0xa4140010, 0, 32, 4, /* INTPRI00 */
 	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
 };
-- 
cgit v0.10.2


From f168dd00a9440a6f644db73bda47718fd12008e4 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 10 Jun 2009 05:04:32 +0000
Subject: sh: sh7724: Add JPU support

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Acked-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 585dd85..e5ac9eb 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -451,6 +451,35 @@ static struct platform_device tmu5_device = {
 	.num_resources	= ARRAY_SIZE(tmu5_resources),
 };
 
+/* JPU */
+static struct uio_info jpu_platform_data = {
+	.name = "JPU",
+	.version = "0",
+	.irq = 27,
+};
+
+static struct resource jpu_resources[] = {
+	[0] = {
+		.name	= "JPU",
+		.start	= 0xfe980000,
+		.end	= 0xfe9902d3,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device jpu_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &jpu_platform_data,
+	},
+	.resource	= jpu_resources,
+	.num_resources	= ARRAY_SIZE(jpu_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&cmt_device,
 	&tmu0_device,
@@ -466,6 +495,7 @@ static struct platform_device *sh7724_devices[] __initdata = {
 	&vpu_device,
 	&veu0_device,
 	&veu1_device,
+	&jpu_device,
 };
 
 static int __init sh7724_devices_setup(void)
@@ -473,6 +503,7 @@ static int __init sh7724_devices_setup(void)
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
+	platform_resource_setup_memory(&jpu_device,  "jpu",  2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
cgit v0.10.2


From 4c7c99788631bab177bd51e15e893be4689bb085 Mon Sep 17 00:00:00 2001
From: Aoi Shinkai <shinkoi2005@gmail.com>
Date: Wed, 10 Jun 2009 16:15:42 +0000
Subject: sh: Fix sh4a llsc-based cmpxchg()

This fixes up a typo in the ll/sc based cmpxchg code which apparently
wasn't getting a lot of testing due to the swapped old/new pair. With
that fixed up, the ll/sc code also starts using it and provides its own
atomic_add_unless().

Signed-off-by: Aoi Shinkai <shinkoi2005@gmail.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/atomic-llsc.h b/arch/sh/include/asm/atomic-llsc.h
index 4b00b78..b040e1e 100644
--- a/arch/sh/include/asm/atomic-llsc.h
+++ b/arch/sh/include/asm/atomic-llsc.h
@@ -104,4 +104,31 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
 	: "t");
 }
 
+#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+
+/**
+ * atomic_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c, old;
+	c = atomic_read(v);
+	for (;;) {
+		if (unlikely(c == (u)))
+			break;
+		old = atomic_cmpxchg((v), c, c + (a));
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+
+	return c != (u);
+}
+
 #endif /* __ASM_SH_ATOMIC_LLSC_H */
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 6327ffb..978b58e 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -45,7 +45,7 @@
 #define atomic_inc(v) atomic_add(1,(v))
 #define atomic_dec(v) atomic_sub(1,(v))
 
-#ifndef CONFIG_GUSA_RB
+#if !defined(CONFIG_GUSA_RB) && !defined(CONFIG_CPU_SH4A)
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int ret;
@@ -73,7 +73,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 
 	return ret != u;
 }
-#endif
+#endif /* !CONFIG_GUSA_RB && !CONFIG_CPU_SH4A */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
diff --git a/arch/sh/include/asm/cmpxchg-llsc.h b/arch/sh/include/asm/cmpxchg-llsc.h
index 0fac3da..4713666 100644
--- a/arch/sh/include/asm/cmpxchg-llsc.h
+++ b/arch/sh/include/asm/cmpxchg-llsc.h
@@ -55,7 +55,7 @@ __cmpxchg_u32(volatile int *m, unsigned long old, unsigned long new)
 		"mov		%0, %1				\n\t"
 		"cmp/eq		%1, %3				\n\t"
 		"bf		2f				\n\t"
-		"mov		%3, %0				\n\t"
+		"mov		%4, %0				\n\t"
 		"2:						\n\t"
 		"movco.l	%0, @%2				\n\t"
 		"bf		1b				\n\t"
diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h
index 6028356..a28c9f0 100644
--- a/arch/sh/include/asm/spinlock.h
+++ b/arch/sh/include/asm/spinlock.h
@@ -26,7 +26,7 @@
 #define __raw_spin_is_locked(x)		((x)->lock <= 0)
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 #define __raw_spin_unlock_wait(x) \
-	do { cpu_relax(); } while ((x)->lock)
+	do { while (__raw_spin_is_locked(x)) cpu_relax(); } while (0)
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
-- 
cgit v0.10.2


From 75c936aec050113c12f74581f51779ffc5502a51 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 09:33:53 +0300
Subject: sh: Fix sys_pwritev() syscall table entry for sh32.

There was a typo here that had this as sys_writev() instead of
sys_pwritev(), fix this up. sh64 got this right, as did the preadv()
case.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 05202ed..6118f35 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -350,4 +350,4 @@ ENTRY(sys_call_table)
 	.long sys_pipe2
 	.long sys_inotify_init1
 	.long sys_preadv
-	.long sys_writev
+	.long sys_pwritev
-- 
cgit v0.10.2


From 6a1555fdde407dad23b8a119cf5feeb7c6466de9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 09:38:05 +0300
Subject: sh: Wire up sys_rt_tgsigqueueinfo.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h
index 2efb819..6519708 100644
--- a/arch/sh/include/asm/unistd_32.h
+++ b/arch/sh/include/asm/unistd_32.h
@@ -343,8 +343,9 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_rt_tgsigqueueinfo	335
 
-#define NR_syscalls 335
+#define NR_syscalls 336
 
 #ifdef __KERNEL__
 
diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h
index 6eb9d29..8014aea 100644
--- a/arch/sh/include/asm/unistd_64.h
+++ b/arch/sh/include/asm/unistd_64.h
@@ -383,10 +383,11 @@
 #define __NR_inotify_init1	360
 #define __NR_preadv		361
 #define __NR_pwritev		362
+#define __NR_rt_tgsigqueueinfo	363
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 363
+#define NR_syscalls 364
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 6118f35..a9fff9f 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -351,3 +351,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo	/* 335 */
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index a083609..75c1889 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -389,3 +389,4 @@ sys_call_table:
 	.long sys_inotify_init1		/* 360 */
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo
-- 
cgit v0.10.2


From 54ff328b46e58568c4b3350c2fa3223ef862e5a4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 10:33:09 +0300
Subject: sh: Tie sparseirq in to Kconfig.

Now that the dependent patches are merged, we are ready to enable
sparseirq support. This simply adds the Kconfig option, and then converts
from the _cpu to the _node allocation routines to follow the upstream
sparseirq API changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b5629cd..586cd04 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -678,6 +678,18 @@ config GUSA_RB
 	  LLSC, this should be more efficient than the other alternative of
 	  disabling interrupts around the atomic sequence.
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on EXPERIMENTAL
+	help
+	  This enables support for sparse irqs. This is useful in general
+	  as most CPUs have a fairly sparse array of IRQ vectors, which
+	  the irq_desc then maps directly on to. Systems with a high
+	  number of off-chip IRQs will want to treat this as
+	  experimental until they have been independently verified.
+
+	  If you don't know what to do here, say N.
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index 6ad40db..808d99a 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/topology.h>
 
 static inline struct ipr_desc *get_ipr_desc(unsigned int irq)
 {
@@ -59,21 +60,17 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
-#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-#endif
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
-#ifdef CONFIG_SPARSE_IRQ
-		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
+		irq_desc = irq_to_desc_alloc_node(p->irq, numa_node_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n",
 			       p->irq);
 			continue;
 		}
-#endif
 
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index caf0656..d687a9b 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -24,6 +24,7 @@
 #include <linux/sh_intc.h>
 #include <linux/sysdev.h>
 #include <linux/list.h>
+#include <linux/topology.h>
 
 #define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
 	((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
@@ -671,7 +672,7 @@ unsigned int intc_evt2irq(unsigned int vector)
 
 void __init register_intc_controller(struct intc_desc *desc)
 {
-	unsigned int i, k, smp, cpu = smp_processor_id();
+	unsigned int i, k, smp;
 	struct intc_desc_int *d;
 
 	d = alloc_bootmem(sizeof(*d));
@@ -771,19 +772,16 @@ void __init register_intc_controller(struct intc_desc *desc)
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
 		unsigned int irq = evt2irq(vect->vect);
-#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-#endif
+
 		if (!vect->enum_id)
 			continue;
 
-#ifdef CONFIG_SPARSE_IRQ
-		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
+		irq_desc = irq_to_desc_alloc_node(irq, numa_node_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 			continue;
 		}
-#endif
 
 		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
-- 
cgit v0.10.2


From b0fd271d5fba0b2d00888363f3869e3f9b26caa9 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 11 Jun 2009 13:10:16 +0200
Subject: block: add request clone interface (v2)

This patch adds the following 2 interfaces for request-stacking drivers:

  - blk_rq_prep_clone(struct request *clone, struct request *orig,
		      struct bio_set *bs, gfp_t gfp_mask,
		      int (*bio_ctr)(struct bio *, struct bio*, void *),
		      void *data)
      * Clones bios in the original request to the clone request
        (bio_ctr is called for each cloned bios.)
      * Copies attributes of the original request to the clone request.
        The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not
        copied.

  - blk_rq_unprep_clone(struct request *clone)
      * Frees cloned bios from the clone request.

Request stacking drivers (e.g. request-based dm) need to make a clone
request for a submitted request and dispatch it to other devices.

To allocate request for the clone, request stacking drivers may not
be able to use blk_get_request() because the allocation may be done
in an irq-disabled context.
So blk_rq_prep_clone() takes a request allocated by the caller
as an argument.

For each clone bio in the clone request, request stacking drivers
should be able to set up their own completion handler.
So blk_rq_prep_clone() takes a callback function which is called
for each clone bio, and a pointer for private data which is passed
to the callback.

NOTE:
blk_rq_prep_clone() doesn't copy any actual data of the original
request.  Pages are shared between original bios and cloned bios.
So caller must not complete the original request before the clone
request.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blk-core.c b/block/blk-core.c
index 03c5a64..02a9252 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2295,6 +2295,106 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ *     Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+	struct bio *bio;
+
+	while ((bio = rq->bio) != NULL) {
+		rq->bio = bio->bi_next;
+
+		bio_put(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/*
+ * Copy attributes of the original request to the clone request.
+ * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
+ */
+static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+{
+	dst->cpu = src->cpu;
+	dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
+	dst->cmd_type = src->cmd_type;
+	dst->__sector = blk_rq_pos(src);
+	dst->__data_len = blk_rq_bytes(src);
+	dst->nr_phys_segments = src->nr_phys_segments;
+	dst->ioprio = src->ioprio;
+	dst->extra_len = src->extra_len;
+}
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ *           Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
+ *     are not copied, and copying such parts is the caller's responsibility.
+ *     Also, pages which the original bios are pointing to are not copied
+ *     and the cloned bios just point same pages.
+ *     So cloned bios must be completed before original bios, which means
+ *     the caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+		      struct bio_set *bs, gfp_t gfp_mask,
+		      int (*bio_ctr)(struct bio *, struct bio *, void *),
+		      void *data)
+{
+	struct bio *bio, *bio_src;
+
+	if (!bs)
+		bs = fs_bio_set;
+
+	blk_rq_init(NULL, rq);
+
+	__rq_for_each_bio(bio_src, rq_src) {
+		bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+		if (!bio)
+			goto free_and_out;
+
+		__bio_clone(bio, bio_src);
+
+		if (bio_integrity(bio_src) &&
+		    bio_integrity_clone(bio, bio_src, gfp_mask))
+			goto free_and_out;
+
+		if (bio_ctr && bio_ctr(bio, bio_src, data))
+			goto free_and_out;
+
+		if (rq->bio) {
+			rq->biotail->bi_next = bio;
+			rq->biotail = bio;
+		} else
+			rq->bio = rq->biotail = bio;
+	}
+
+	__blk_rq_prep_clone(rq, rq_src);
+
+	return 0;
+
+free_and_out:
+	if (bio)
+		bio_free(bio, bs);
+	blk_rq_unprep_clone(rq);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e740a1..ebdfde8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -765,6 +765,11 @@ extern void blk_insert_request(struct request_queue *, struct request *, int, vo
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
+extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+			     struct bio_set *bs, gfp_t gfp_mask,
+			     int (*bio_ctr)(struct bio *, struct bio *, void *),
+			     void *data);
+extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_plug_device(struct request_queue *);
-- 
cgit v0.10.2


From 85d4198e40c289dd623cecd16601fa613559bed7 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 11 Jun 2009 08:51:10 -0400
Subject: Btrfs: check duplicate backrefs for both data and metadata

lookup_inline_extent_backref only checks for duplicate backref for data
extents. It assumes backrefs for tree block never conflict.

This patch makes lookup_inline_extent_backref check for duplicate backrefs
for both data and tree block, so that we can detect potential bug earlier.
This is a safety check, strictly speaking it is not required.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33a65f2..edc7d20 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1056,8 +1056,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	want = extent_ref_type(parent, owner);
 	if (insert) {
 		extra_size = btrfs_extent_inline_ref_size(want);
-		if (owner >= BTRFS_FIRST_FREE_OBJECTID)
-			path->keep_locks = 1;
+		path->keep_locks = 1;
 	} else
 		extra_size = -1;
 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
@@ -1087,12 +1086,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 #endif
 	BUG_ON(item_size < sizeof(*ei));
 
-	if (owner < BTRFS_FIRST_FREE_OBJECTID && insert &&
-	    item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
-		err = -EAGAIN;
-		goto out;
-	}
-
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
 
@@ -1165,15 +1158,15 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 		 * For simplicity, we just do not add new inline back
 		 * ref if there is any kind of item for this block
 		 */
-		if (owner >= BTRFS_FIRST_FREE_OBJECTID &&
-		    find_next_key(path, &key) == 0 && key.objectid == bytenr) {
+		if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 			err = -EAGAIN;
 			goto out;
 		}
 	}
 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
 out:
-	if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) {
+	if (insert) {
 		path->keep_locks = 0;
 		btrfs_unlock_up_safe(path, 1);
 	}
-- 
cgit v0.10.2


From 067c28adc53807514ac0c6ebb6af3243cbd071fa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 09:30:13 -0400
Subject: Btrfs: fix -o nodatasum printk spelling

It was printing nodatacsum, which was not the correct option name.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3427db2..708ac06 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -159,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			 */
 			break;
 		case Opt_nodatasum:
-			printk(KERN_INFO "btrfs: setting nodatacsum\n");
+			printk(KERN_INFO "btrfs: setting nodatasum\n");
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-- 
cgit v0.10.2


From 9866b7e86a2ce4daa677be750e3ccbfc65d187f5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 11 Jun 2009 16:25:01 +0200
Subject: x86: memtest: use pointers of equal type for comparison

Commit c9690998ef48ffefeccb91c70a7739eebdea57f9 (x86: memtest: remove
64-bit division) introduced following compile warning:

arch/x86/mm/memtest.c: In function 'memtest':
arch/x86/mm/memtest.c:56: warning: comparison of distinct pointer types lacks a cast
arch/x86/mm/memtest.c:58: warning: comparison of distinct pointer types lacks a cast

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c0bedcd..18d244f 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 {
-	u64 *p;
-	void *start, *end;
+	u64 *p, *start, *end;
 	u64 start_bad, last_bad;
 	u64 start_phys_aligned;
-	size_t incr;
+	const size_t incr = sizeof(pattern);
 
-	incr = sizeof(pattern);
 	start_phys_aligned = ALIGN(start_phys, incr);
 	start = __va(start_phys_aligned);
-	end = start + size - (start_phys_aligned - start_phys);
+	end = start + (size - (start_phys_aligned - start_phys)) / incr;
 	start_bad = 0;
 	last_bad = 0;
 
 	for (p = start; p < end; p++)
 		*p = pattern;
+
 	for (p = start; p < end; p++, start_phys_aligned += incr) {
 		if (*p == pattern)
 			continue;
-- 
cgit v0.10.2


From 4da52960fd1ae3ddd14901bc88b608cbeaa4b9a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:54:01 +1000
Subject: perf_counters: powerpc: Add support for POWER7 processors

This adds the back-end for the PMU on POWER7 processors.  POWER7
has 4 fully-programmable counters and two fixed-function counters
(which do respect the freeze conditions, can generate interrupts,
and are writable, unlike PMC5/6 on POWER5+/6).

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36329.189378.17992@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9ba1bb7..a2c6834 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -95,7 +95,8 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
-				   power5-pmu.o power5+-pmu.o power6-pmu.o
+				   power5-pmu.o power5+-pmu.o power6-pmu.o \
+				   power7-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4990ce2..5d12e68 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1181,6 +1181,7 @@ extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
 
 static int init_perf_counters(void)
 {
@@ -1207,6 +1208,9 @@ static int init_perf_counters(void)
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
+	case 0x3f:
+		ppmu = &power7_pmu;
+		break;
 	}
 
 	/*
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
new file mode 100644
index 0000000..dfac48d
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,316 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH	16	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	12	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_COMBINE_SH	11	/* Combined event bit */
+#define PM_COMBINE_MSK	1
+#define PM_COMBINE_MSKS	0x800
+#define PM_L2SEL_SH	8	/* L2 event select */
+#define PM_L2SEL_MSK	7
+#define PM_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTM1SEL_SH	56
+#define MMCR1_TTM2SEL_SH	52
+#define MMCR1_TTM3SEL_SH	48
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_L2SEL_SH		45
+#define MMCR1_L2SEL_MSK		7
+#define MMCR1_PMC1_COMBINE_SH	35
+#define MMCR1_PMC2_COMBINE_SH	34
+#define MMCR1_PMC3_COMBINE_SH	33
+#define MMCR1_PMC4_COMBINE_SH	32
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMC2SEL_SH	16
+#define MMCR1_PMC3SEL_SH	8
+#define MMCR1_PMC4SEL_SH	0
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                                                 [  ><><><><><><>
+ *                                                  NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     15: NC error 0x8000
+ *     12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, sh;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+			return -1;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000;
+		value |= 0x1000;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	2	/* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x200f2, 0x300f2 },		/* PM_INST_DISP */
+	{ 0x200f4, 0x600f4 },		/* PM_RUN_CYC */
+	{ 0x400fa, 0x500fa },		/* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+	int pmc, psel;
+
+	/* this only handles the 4x decode events */
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+		return event - (1 << PM_PMC_SH) + 8;
+	if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+		return event + (1 << PM_PMC_SH) - 8;
+	return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	s64 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_decode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600f4;	/* PM_RUN_CYC */
+				break;
+			case 0x600f4:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;
+				break;
+			case 0x2:	/* PM_PPC_CMPL */
+				alt[j++] = 0x500fa;	/* PM_RUN_INST_CMPL */
+				break;
+			case 0x500fa:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x2;	/* PM_PPC_CMPL */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int unit;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	psel = event & PM_PMCSEL_MSK & ~1;	/* trim off edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	switch (psel >> 4) {
+	case 2:
+		return pmc == 2 || pmc == 4;
+	case 3:
+		if (psel == 0x3c)
+			return pmc == 1;
+		if (psel == 0x3e)
+			return pmc != 2;
+		return 1;
+	case 4:
+	case 5:
+		return unit == 0xd;
+	case 6:
+		if (psel == 0x64)
+			return pmc >= 3;
+	case 8:
+		return unit == 0xd;
+	}
+	return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, combine, l2sel, psel;
+	unsigned int pmc_inuse = 0;
+	int i;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+		l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct or decoded event */
+			--pmc;
+		}
+		if (pmc <= 3) {
+			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+			if (unit == 6)	/* L2 events */
+				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+		}
+		if (power7_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0xc880,		/* LD_REF_L1_LSU */
+	[PERF_COUNT_CACHE_MISSES] = 0x400f0,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068,	/* BRU_FIN */
+	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
+};
+
+struct power_pmu power7_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT + 1,
+	.add_fields = 0x1555ull,
+	.test_adder = 0x3000ull,
+	.compute_mmcr = power7_compute_mmcr,
+	.get_constraint = power7_get_constraint,
+	.get_alternatives = power7_get_alternatives,
+	.disable_pmc = power7_disable_pmc,
+	.n_generic = ARRAY_SIZE(power7_generic_events),
+	.generic_events = power7_generic_events,
+};
-- 
cgit v0.10.2


From 106b506c3a8b74daa5751e83ed3e46438fcf9a52 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:55:42 +1000
Subject: perf_counter: powerpc: Implement generalized cache events for POWER
 processors

This adds tables of event codes for the generalized cache events for
all the currently supported powerpc processors: POWER{4,5,5+,6,7} and
PPC970*, plus powerpc-specific code to use these tables when a
generalized cache event is requested.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36430.933526.742969@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 1c60f0c..cc7c887 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -33,6 +33,9 @@ struct power_pmu {
 	u32	flags;
 	int	n_generic;
 	int	*generic_events;
+	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+			       [PERF_COUNT_HW_CACHE_OP_MAX]
+			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 extern struct power_pmu *ppmu;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5d12e68..bb20238 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -856,6 +856,36 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!ppmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*ppmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*eventp = ev;
+	return 0;
+}
+
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	u64 ev;
@@ -868,13 +898,21 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (counter->attr.type != PERF_TYPE_RAW) {
+	switch (counter->attr.type) {
+	case PERF_TYPE_HARDWARE:
 		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
-	} else {
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(counter->attr.config, &ev);
+		if (err)
+			return ERR_PTR(err);
+		break;
+	case PERF_TYPE_RAW:
 		ev = counter->attr.config;
+		break;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 836fa11..0e94b68 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -543,6 +543,46 @@ static int p4_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8c10,		0x3c10	},
+		[C(OP_WRITE)] = {	0x7c10,		0xc13	},
+		[C(OP_PREFETCH)] = {	0xc35,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0xc34,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x904	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x900	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x330,		0x331	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power4_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 5,
@@ -554,4 +594,5 @@ struct power_pmu power4_pmu = {
 	.disable_pmc = p4_disable_pmc,
 	.n_generic = ARRAY_SIZE(p4_generic_events),
 	.generic_events = p4_generic_events,
+	.cache_events = &power4_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8471e3c..bbf2cbb 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -614,6 +614,46 @@ static int power5p_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	-1		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0xc20e4,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5p_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -623,8 +663,9 @@ struct power_pmu power5p_pmu = {
 	.get_constraint = power5p_get_constraint,
 	.get_alternatives = power5p_get_alternatives,
 	.disable_pmc = power5p_disable_pmc,
+	.limited_pmc_event = power5p_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6,
-	.limited_pmc_event = power5p_limited_pmc_event,
+	.cache_events = &power5p_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 1b44c5f..670cf10 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -556,6 +556,46 @@ static int power5_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x4c1090,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x3c309b	},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x2c4090,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -567,4 +607,5 @@ struct power_pmu power5_pmu = {
 	.disable_pmc = power5_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5_generic_events),
 	.generic_events = power5_generic_events,
+	.cache_events = &power5_cache_events,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index cd4fbe0..4da7078 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -474,6 +474,47 @@ static int power6_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x80082,	0x80080		},
+		[C(OP_WRITE)] = {	0x80086,	0x80088		},
+		[C(OP_PREFETCH)] = {	0x810a4,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x100056 	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0x4008c,	0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x150730,	0x250532	},
+		[C(OP_WRITE)] = {	0x250432,	0x150432	},
+		[C(OP_PREFETCH)] = {	0x810a6,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x20000e	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x420ce		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x430e6,	0x400052	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power6_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -483,8 +524,9 @@ struct power_pmu power6_pmu = {
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
+	.limited_pmc_event = p6_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
-	.limited_pmc_event = p6_limited_pmc_event,
+	.cache_events = &power6_cache_events,
 };
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index dfac48d..060e0de 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -302,6 +302,46 @@ static int power7_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x400f0,	0xc880	},
+		[C(OP_WRITE)] = {	0,		0x300f0	},
+		[C(OP_PREFETCH)] = {	0xd8b8,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x200fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x408a,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x6080,		0x6084	},
+		[C(OP_WRITE)] = {	0x6082,		0x6086	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x300fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x400fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x10068,	0x400f6	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power7_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT + 1,
@@ -313,4 +353,5 @@ struct power_pmu power7_pmu = {
 	.disable_pmc = power7_disable_pmc,
 	.n_generic = ARRAY_SIZE(power7_generic_events),
 	.generic_events = power7_generic_events,
+	.cache_events = &power7_cache_events,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index eed47c4..336adf1 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -427,6 +427,46 @@ static int ppc970_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8810,		0x3810	},
+		[C(OP_WRITE)] = {	0x7810,		0x813	},
+		[C(OP_PREFETCH)] = {	0x731,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0x733,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x704	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x700	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x431,		0x327	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu ppc970_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 2,
@@ -438,4 +478,5 @@ struct power_pmu ppc970_pmu = {
 	.disable_pmc = p970_disable_pmc,
 	.n_generic = ARRAY_SIZE(ppc970_generic_events),
 	.generic_events = ppc970_generic_events,
+	.cache_events = &ppc970_cache_events,
 };
-- 
cgit v0.10.2


From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 11:18:36 +0200
Subject: perf_counter: More paranoia settings

Rename the perf_counter_priv knob to perf_counter_paranoia (because
priv can be read as private, as opposed to privileged) and provide
one more level:

 0 - permissive
 1 - restrict cpu counters to privilidged contexts
 2 - restrict kernel-mode code counting and profiling

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5b96647..386be91 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -648,7 +648,7 @@ struct perf_callchain_entry {
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
-extern int sysctl_perf_counter_priv;
+extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
 extern int sysctl_perf_counter_limit;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8b89b40..63f1987 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -43,7 +43,23 @@ static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
-int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+/*
+ * 0 - not paranoid
+ * 1 - disallow cpu counters to unpriv
+ * 2 - disallow kernel profiling to unpriv
+ */
+int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */
+
+static inline bool perf_paranoid_cpu(void)
+{
+	return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+	return sysctl_perf_counter_paranoid > 1;
+}
+
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
@@ -1385,7 +1401,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
@@ -3618,6 +3634,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
 		return -EFAULT;
 
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0c4bf86..344a659 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -916,9 +916,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_PERF_COUNTERS
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_privileged",
-		.data		= &sysctl_perf_counter_priv,
-		.maxlen		= sizeof(sysctl_perf_counter_priv),
+		.procname	= "perf_counter_paranoid",
+		.data		= &sysctl_perf_counter_paranoid,
+		.maxlen		= sizeof(sysctl_perf_counter_paranoid),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-- 
cgit v0.10.2


From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 11:25:05 +0200
Subject: perf_counter: Rename perf_counter_limit sysctl

Rename perf_counter_limit to perf_counter_max_sample_rate and
prohibit creation of counters with a known higher sample
frequency.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 386be91..95c797c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -650,7 +650,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
-extern int sysctl_perf_counter_limit;
+extern int sysctl_perf_counter_sample_rate;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 63f1987..3b2829d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,11 +44,12 @@ static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
 /*
- * 0 - not paranoid
- * 1 - disallow cpu counters to unpriv
- * 2 - disallow kernel profiling to unpriv
+ * perf counter paranoia level:
+ *  0 - not paranoid
+ *  1 - disallow cpu counters to unpriv
+ *  2 - disallow kernel profiling to unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */
+int sysctl_perf_counter_paranoid __read_mostly;
 
 static inline bool perf_paranoid_cpu(void)
 {
@@ -61,7 +62,11 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_counter_id;
 
@@ -1244,7 +1249,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
 			counter->pmu->unthrottle(counter);
-			interrupts = 2*sysctl_perf_counter_limit/HZ;
+			interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
 		}
 
 		if (!counter->attr.freq || !counter->attr.sample_freq)
@@ -1682,7 +1687,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 	spin_lock_irq(&ctx->lock);
 	if (counter->attr.freq) {
-		if (value > sysctl_perf_counter_limit) {
+		if (value > sysctl_perf_counter_sample_rate) {
 			ret = -EINVAL;
 			goto unlock;
 		}
@@ -2979,7 +2984,8 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 	} else {
 		if (hwc->interrupts != MAX_INTERRUPTS) {
 			hwc->interrupts++;
-			if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) {
+			if (HZ * hwc->interrupts >
+					(u64)sysctl_perf_counter_sample_rate) {
 				hwc->interrupts = MAX_INTERRUPTS;
 				perf_log_throttle(counter, 0);
 				ret = 1;
@@ -3639,6 +3645,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 			return -EACCES;
 	}
 
+	if (attr.freq) {
+		if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+			return -EINVAL;
+	}
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 344a659..9fd4e43 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -932,9 +932,9 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_int_limit",
-		.data		= &sysctl_perf_counter_limit,
-		.maxlen		= sizeof(sysctl_perf_counter_limit),
+		.procname	= "perf_counter_max_sample_rate",
+		.data		= &sysctl_perf_counter_sample_rate,
+		.maxlen		= sizeof(sysctl_perf_counter_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-- 
cgit v0.10.2


From 729ff5e2aaf181f5d3ab849337fce406cd19b1d9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 14:16:15 +0200
Subject: perf_counter tools: Clean up u64 usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A build error slipped in:

 builtin-report.c: In function ‘hist_entry__fprintf’:
 builtin-report.c:711: error: format ‘%12d’ expects type ‘int’, but argument 3 has type ‘uint64_t’

Because we got a bit sloppy with those types. uint64_t really sucks,
because there's no printf format for it. So standardize on __u64
instead - for all types that go to or come from the ABI (which is __u64),
or for values that need to be large enough even on 32-bit.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 3334a8b..b1ed5f7 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -145,7 +145,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -178,19 +178,19 @@ static int load_kernel(void)
 
 struct map {
 	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
 	struct dso	 *dso;
 };
 
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
+static __u64 map__map_ip(struct map *map, __u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
 {
 	return ip;
 }
@@ -249,7 +249,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
@@ -373,7 +373,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
+static struct map *thread__find_map(struct thread *self, __u64 ip)
 {
 	struct map *pos;
 
@@ -414,7 +414,7 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	uint64_t	 ip;
+	__u64	 ip;
 	char		 level;
 
 	uint32_t	 count;
@@ -533,7 +533,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
+	__u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -647,7 +647,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 /*
  * collect histogram counts
  */
-static void hist_hit(struct hist_entry *he, uint64_t ip)
+static void hist_hit(struct hist_entry *he, __u64 ip)
 {
 	unsigned int sym_size, offset;
 	struct symbol *sym = he->sym;
@@ -676,7 +676,7 @@ static void hist_hit(struct hist_entry *he, uint64_t ip)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
+		struct symbol *sym, __u64 ip, char level)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -848,7 +848,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
+	__u64 ip = event->ip.ip;
 	struct map *map = NULL;
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
@@ -1031,7 +1031,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 }
 
 static int
-parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
+parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 {
 	char *line = NULL, *tmp, *tmp2;
 	unsigned int offset;
@@ -1112,7 +1112,7 @@ parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
-	uint64_t start, end, len;
+	__u64 start, end, len;
 	char command[PATH_MAX*2];
 	FILE *file;
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 919f23c..84cd336 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -223,7 +223,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	comm_ev.pid = pid;
 	comm_ev.header.type = PERF_EVENT_COMM;
-	size = ALIGN(size, sizeof(uint64_t));
+	size = ALIGN(size, sizeof(__u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
 	if (!full) {
@@ -304,7 +304,7 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 			size = strlen(execname);
 			execname[size - 1] = '\0'; /* Remove \n */
 			memcpy(mmap_ev.filename, execname, size);
-			size = ALIGN(size, sizeof(uint64_t));
+			size = ALIGN(size, sizeof(__u64));
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f57fd5c..82fa93b 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -146,7 +146,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -193,19 +193,19 @@ static int strcommon(const char *pathname)
 
 struct map {
 	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
 	struct dso	 *dso;
 };
 
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
+static __u64 map__map_ip(struct map *map, __u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
 {
 	return ip;
 }
@@ -288,7 +288,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
@@ -412,7 +412,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
+static struct map *thread__find_map(struct thread *self, __u64 ip)
 {
 	struct map *pos;
 
@@ -453,10 +453,10 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	uint64_t	 ip;
+	__u64		 ip;
 	char		 level;
 
-	uint64_t	 count;
+	__u64		 count;
 };
 
 /*
@@ -572,7 +572,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
+	__u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -684,7 +684,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 {
 	struct sort_entry *se;
 	size_t ret;
@@ -708,7 +708,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
-		ret = fprintf(fp, "%12d ", self->count);
+		ret = fprintf(fp, "%12Ld ", self->count);
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		fprintf(fp, "  ");
@@ -726,7 +726,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level, uint64_t count)
+		struct symbol *sym, __u64 ip, char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -873,7 +873,7 @@ static void output__resort(void)
 	}
 }
 
-static size_t output__fprintf(FILE *fp, uint64_t total_samples)
+static size_t output__fprintf(FILE *fp, __u64 total_samples)
 {
 	struct hist_entry *pos;
 	struct sort_entry *se;
@@ -941,8 +941,8 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
-	uint64_t period = 1;
+	__u64 ip = event->ip.ip;
+	__u64 period = 1;
 	struct map *map = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 8ba2480..309dbc7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -79,8 +79,8 @@ static int			dump_symtab;
  * Symbols
  */
 
-static uint64_t			min_ip;
-static uint64_t			max_ip = -1ll;
+static __u64			min_ip;
+static __u64			max_ip = -1ll;
 
 struct sym_entry {
 	struct rb_node		rb_node;
@@ -372,7 +372,7 @@ out_delete_dso:
 /*
  * Binary search in the histogram table and record the hit:
  */
-static void record_ip(uint64_t ip, int counter)
+static void record_ip(__u64 ip, int counter)
 {
 	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
 
@@ -392,7 +392,7 @@ static void record_ip(uint64_t ip, int counter)
 	samples--;
 }
 
-static void process_event(uint64_t ip, int counter)
+static void process_event(__u64 ip, int counter)
 {
 	samples++;
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 32dd47d..49a55f8 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -9,9 +9,9 @@
 
 const char *sym_hist_filter;
 
-static struct symbol *symbol__new(uint64_t start, uint64_t len,
+static struct symbol *symbol__new(__u64 start, __u64 len,
 				  const char *name, unsigned int priv_size,
-				  uint64_t obj_start, int verbose)
+				  __u64 obj_start, int verbose)
 {
 	size_t namelen = strlen(name) + 1;
 	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
@@ -89,7 +89,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 {
 	struct rb_node **p = &self->syms.rb_node;
 	struct rb_node *parent = NULL;
-	const uint64_t ip = sym->start;
+	const __u64 ip = sym->start;
 	struct symbol *s;
 
 	while (*p != NULL) {
@@ -104,7 +104,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 	rb_insert_color(&sym->rb_node, &self->syms);
 }
 
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip)
 {
 	struct rb_node *n;
 
@@ -523,7 +523,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
-		uint64_t obj_start;
+		__u64 obj_start;
 
 		if (!elf_sym__is_function(&sym))
 			continue;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 4839d68..0d1292b 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -19,7 +19,7 @@ struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
 	unsigned int	 sym_priv_size;
-	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
+	struct symbol    *(*find_symbol)(struct dso *, __u64 ip);
 	char		 name[0];
 };
 
@@ -35,7 +35,7 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 	return ((void *)sym) - self->sym_priv_size;
 }
 
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip);
 
 int dso__load_kernel(struct dso *self, const char *vmlinux,
 		     symbol_filter_t filter, int verbose);
-- 
cgit v0.10.2


From 0b4dcea579a1b6f4d249d61f5bc8adeaa7c895d8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 11:13:35 -0400
Subject: Btrfs: fix oops when btrfs_inherit_iflags called with a NULL dir

This happens during subvol creation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 926332a..eff18f5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -112,7 +112,12 @@ void btrfs_update_iflags(struct inode *inode)
  */
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 {
-	unsigned int flags = BTRFS_I(dir)->flags;
+	unsigned int flags;
+
+	if (!dir)
+		return;
+
+	flags = BTRFS_I(dir)->flags;
 
 	if (S_ISREG(inode->i_mode))
 		flags &= ~BTRFS_INODE_DIRSYNC;
-- 
cgit v0.10.2


From b263c2c8bf13c273485bd99dbbeba79c844409dd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 11:24:47 -0400
Subject: Btrfs: fix extent_buffer leak during tree log replay

During tree log replay, we read in the tree log roots,
process them and then free them.  A recent change
takes an extra reference on the root node of the tree
when the root is read in, and stores that reference
in root->commit_root.

This reference was not being freed, leaving us with
one buffer pinned in ram for each subvol with
a tree log root after a crash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b41fc0..c139222 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3019,6 +3019,7 @@ again:
 		key.offset = found_key.offset - 1;
 		wc.replay_dest->log_root = NULL;
 		free_extent_buffer(log->node);
+		free_extent_buffer(log->commit_root);
 		kfree(log);
 
 		if (found_key.offset == 0)
-- 
cgit v0.10.2


From ace08c3c4403140e5ce82116c8f2acb38f58f61d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:20:38 +0100
Subject: tty:cyclades, load firmware even on Ze

Ze needs firmware to be loaded as well as Zo. Move cyz_load_fw one
level upper to achieve that.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 1fdb9f6..bde2458 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -5043,6 +5043,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
+			nchan = 8;
 
 #ifdef CY_PCI_DEBUG
 			if (mailbox == ZO_V1) {
@@ -5065,15 +5066,11 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			 */
 			if ((mailbox == ZO_V1) || (mailbox == ZO_V2))
 				cy_writel(addr2 + ID_ADDRESS, 0L);
-
-			retval = cyz_load_fw(pdev, addr2, addr0, irq);
-			if (retval)
-				goto err_unmap;
-			/* This must be a Cyclades-8Zo/PCI.  The extendable
-			   version will have a different device_id and will
-			   be allocated its maximum number of ports. */
-			nchan = 8;
 		}
+
+		retval = cyz_load_fw(pdev, addr2, addr0, irq);
+		if (retval)
+			goto err_unmap;
 	}
 
 	if ((cy_next_channel + nchan) > NR_PORTS) {
-- 
cgit v0.10.2


From eca1b592d600bb2b7d24b66e4106de1e751ae510 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:21:30 +0100
Subject: tty: cyclades, don't kill FW

Don't reset the PLX chip after FW load, which effectively kills
the FW, so that user had to boot manually.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index bde2458..6d52439 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -4932,8 +4932,6 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
 			0x00030800UL);
 
-	plx_init(pdev, irq, ctl_addr);
-
 	return 0;
 err_rel:
 	release_firmware(fw);
-- 
cgit v0.10.2


From 65a29f60e121ae5116ac1736b50a237bf2db3225 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:22:27 +0100
Subject: tty: cyclades, remove spurious check in ISR

No need to check if dev_id is NULL, it never is.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 6d52439..ccf68a9 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1737,14 +1737,6 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id)
 {
 	struct cyclades_card *cinfo = dev_id;
 
-	if (unlikely(cinfo == NULL)) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyz_interrupt: spurious interrupt %d\n",
-									irq);
-#endif
-		return IRQ_NONE;	/* spurious interrupt */
-	}
-
 	if (unlikely(!ISZLOADED(*cinfo))) {
 #ifdef CY_DEBUG_INTERRUPTS
 		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
-- 
cgit v0.10.2


From fcc8ac1825d3d0fb81f73bc1a80ebc863168bb56 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:24:17 +0100
Subject: tty: Add carrier processing on close to the tty_port core

Some drivers implement this internally, others miss it out. Push the
behaviour into the core code as that way everyone will do it consistently.

Update the dtr rts method to raise or lower depending upon flags. Having a
single method in this style fits most of the implementations more cleanly than
two funtions.

We need this in place before we tackle the USB side

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index af7c13c..8797b77 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -745,7 +745,7 @@ static int epca_carrier_raised(struct tty_port *port)
 	return 0;
 }
 
-static void epca_raise_dtr_rts(struct tty_port *port)
+static void epca_dtr_rts(struct tty_port *port, int onoff)
 {
 }
 
@@ -925,7 +925,7 @@ static const struct tty_operations pc_ops = {
 
 static const struct tty_port_operations epca_port_ops = {
 	.carrier_raised = epca_carrier_raised,
-	.raise_dtr_rts = epca_raise_dtr_rts,
+	.dtr_rts = epca_dtr_rts,
 };
 
 static int info_open(struct tty_struct *tty, struct file *filp)
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index a59eac5..4d745a8 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -329,7 +329,7 @@ static inline void drop_rts(struct isi_port *port)
 
 /* card->lock MUST NOT be held */
 
-static void isicom_raise_dtr_rts(struct tty_port *port)
+static void isicom_dtr_rts(struct tty_port *port, int on)
 {
 	struct isi_port *ip = container_of(port, struct isi_port, port);
 	struct isi_board *card = ip->card;
@@ -339,10 +339,17 @@ static void isicom_raise_dtr_rts(struct tty_port *port)
 	if (!lock_card(card))
 		return;
 
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0f04, base);
-	InterruptTheCard(base);
-	ip->status |= (ISI_DTR | ISI_RTS);
+	if (on) {
+		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+		outw(0x0f04, base);
+		InterruptTheCard(base);
+		ip->status |= (ISI_DTR | ISI_RTS);
+	} else {
+		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+		outw(0x0C04, base);
+		InterruptTheCard(base);
+		ip->status &= ~(ISI_DTR | ISI_RTS);
+	}
 	unlock_card(card);
 }
 
@@ -1339,7 +1346,7 @@ static const struct tty_operations isicom_ops = {
 
 static const struct tty_port_operations isicom_port_ops = {
 	.carrier_raised		= isicom_carrier_raised,
-	.raise_dtr_rts		= isicom_raise_dtr_rts,
+	.dtr_rts		= isicom_dtr_rts,
 };
 
 static int __devinit reset_card(struct pci_dev *pdev,
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index fff19f7..e18800c 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -1140,14 +1140,14 @@ static int stli_carrier_raised(struct tty_port *port)
 	return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stli_raise_dtr_rts(struct tty_port *port)
+static void stli_dtr_rts(struct tty_port *port, int on)
 {
 	struct stliport *portp = container_of(port, struct stliport, port);
 	struct stlibrd *brdp = stli_brds[portp->brdnr];
-	stli_mkasysigs(&portp->asig, 1, 1);
+	stli_mkasysigs(&portp->asig, on, on);
 	if (stli_cmdwait(brdp, portp, A_SETSIGNALS, &portp->asig,
 		sizeof(asysigs_t), 0) < 0)
-			printk(KERN_WARNING "istallion: dtr raise failed.\n");
+			printk(KERN_WARNING "istallion: dtr set failed.\n");
 }
 
 
@@ -4417,7 +4417,7 @@ static const struct tty_operations stli_ops = {
 
 static const struct tty_port_operations stli_port_ops = {
 	.carrier_raised = stli_carrier_raised,
-	.raise_dtr_rts = stli_raise_dtr_rts,
+	.dtr_rts = stli_dtr_rts,
 };
 
 /*****************************************************************************/
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 13f8871..9533f43 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -547,14 +547,18 @@ static int mxser_carrier_raised(struct tty_port *port)
 	return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0;
 }
 
-static void mxser_raise_dtr_rts(struct tty_port *port)
+static void mxser_dtr_rts(struct tty_port *port, int on)
 {
 	struct mxser_port *mp = container_of(port, struct mxser_port, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&mp->slock, flags);
-	outb(inb(mp->ioaddr + UART_MCR) |
-		UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+	if (on)
+		outb(inb(mp->ioaddr + UART_MCR) |
+			UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+	else
+		outb(inb(mp->ioaddr + UART_MCR)&~(UART_MCR_DTR | UART_MCR_RTS),
+			mp->ioaddr + UART_MCR);
 	spin_unlock_irqrestore(&mp->slock, flags);
 }
 
@@ -2356,7 +2360,7 @@ static const struct tty_operations mxser_ops = {
 
 struct tty_port_operations mxser_port_ops = {
 	.carrier_raised = mxser_carrier_raised,
-	.raise_dtr_rts = mxser_raise_dtr_rts,
+	.dtr_rts = mxser_dtr_rts,
 };
 
 /*
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 19d79fc..77b3648 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -383,7 +383,7 @@ static void async_mode(MGSLPC_INFO *info);
 static void tx_timeout(unsigned long context);
 
 static int carrier_raised(struct tty_port *port);
-static void raise_dtr_rts(struct tty_port *port);
+static void dtr_rts(struct tty_port *port, int onoff);
 
 #if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
@@ -513,7 +513,7 @@ static void ldisc_receive_buf(struct tty_struct *tty,
 
 static const struct tty_port_operations mgslpc_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts
+	.dtr_rts = dtr_rts
 };
 
 static int mgslpc_probe(struct pcmcia_device *link)
@@ -2528,13 +2528,16 @@ static int carrier_raised(struct tty_port *port)
 	return 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int onoff)
 {
 	MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (onoff)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~SerialSignal_RTS + SerialSignal_DTR;
 	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index f59fc5c..7399188 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -872,11 +872,16 @@ static int carrier_raised(struct tty_port *port)
 	return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	struct r_port *info = container_of(port, struct r_port, port);
-	sSetDTR(&info->channel);
-	sSetRTS(&info->channel);
+	if (on) {
+		sSetDTR(&info->channel);
+		sSetRTS(&info->channel);
+	} else {
+		sClrDTR(&info->channel);
+		sClrRTS(&info->channel);
+	}
 }
 
 /*
@@ -2250,7 +2255,7 @@ static const struct tty_operations rocket_ops = {
 
 static const struct tty_port_operations rocket_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /*
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index 2ad813a..53e504f 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -772,11 +772,11 @@ static int stl_carrier_raised(struct tty_port *port)
 	return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stl_raise_dtr_rts(struct tty_port *port)
+static void stl_dtr_rts(struct tty_port *port, int on)
 {
 	struct stlport *portp = container_of(port, struct stlport, port);
 	/* Takes brd_lock internally */
-	stl_setsignals(portp, 1, 1);
+	stl_setsignals(portp, on, on);
 }
 
 /*****************************************************************************/
@@ -2547,7 +2547,7 @@ static const struct tty_operations stl_ops = {
 
 static const struct tty_port_operations stl_port_ops = {
 	.carrier_raised = stl_carrier_raised,
-	.raise_dtr_rts = stl_raise_dtr_rts,
+	.dtr_rts = stl_dtr_rts,
 };
 
 /*****************************************************************************/
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index afd0b26..afded3a 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3247,13 +3247,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	struct mgsl_struct *info = container_of(port, struct mgsl_struct, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->irq_spinlock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	usc_set_serial_signals(info);
 	spin_unlock_irqrestore(&info->irq_spinlock,flags);
 }
@@ -4258,7 +4261,7 @@ static void mgsl_add_device( struct mgsl_struct *info )
 
 static const struct tty_port_operations mgsl_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 5e25649..67986ea 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -3099,13 +3099,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	unsigned long flags;
 	struct slgt_info *info = container_of(port, struct slgt_info, port);
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3419,7 +3422,7 @@ static void add_device(struct slgt_info *info)
 
 static const struct tty_port_operations slgt_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /*
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 26de60e..6f727e3 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -3277,13 +3277,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	SLMP_INFO *info = container_of(port, SLMP_INFO, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3746,7 +3749,7 @@ static void add_device(SLMP_INFO *info)
 
 static const struct tty_port_operations port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /* Allocate and initialize a device instance structure
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 9b8004c..926d4a5 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -137,7 +137,7 @@ int tty_port_carrier_raised(struct tty_port *port)
 EXPORT_SYMBOL(tty_port_carrier_raised);
 
 /**
- *	tty_port_raise_dtr_rts	-	Riase DTR/RTS
+ *	tty_port_raise_dtr_rts	-	Raise DTR/RTS
  *	@port: tty port
  *
  *	Wrapper for the DTR/RTS raise logic. For the moment this is used
@@ -147,12 +147,28 @@ EXPORT_SYMBOL(tty_port_carrier_raised);
 
 void tty_port_raise_dtr_rts(struct tty_port *port)
 {
-	if (port->ops->raise_dtr_rts)
-		port->ops->raise_dtr_rts(port);
+	if (port->ops->dtr_rts)
+		port->ops->dtr_rts(port, 1);
 }
 EXPORT_SYMBOL(tty_port_raise_dtr_rts);
 
 /**
+ *	tty_port_lower_dtr_rts	-	Lower DTR/RTS
+ *	@port: tty port
+ *
+ *	Wrapper for the DTR/RTS raise logic. For the moment this is used
+ *	to hide some internal details. This will eventually become entirely
+ *	internal to the tty port.
+ */
+
+void tty_port_lower_dtr_rts(struct tty_port *port)
+{
+	if (port->ops->dtr_rts)
+		port->ops->dtr_rts(port, 0);
+}
+EXPORT_SYMBOL(tty_port_lower_dtr_rts);
+
+/**
  *	tty_port_block_til_ready	-	Waiting logic for tty open
  *	@port: the tty port being opened
  *	@tty: the tty device being bound
@@ -167,7 +183,7 @@ EXPORT_SYMBOL(tty_port_raise_dtr_rts);
  *		- port flags and counts
  *
  *	The passed tty_port must implement the carrier_raised method if it can
- *	do carrier detect and the raise_dtr_rts method if it supports software
+ *	do carrier detect and the dtr_rts method if it supports software
  *	management of these lines. Note that the dtr/rts raise is done each
  *	iteration as a hangup may have previously dropped them while we wait.
  */
@@ -302,6 +318,9 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 
 	tty_ldisc_flush(tty);
 
+	if (tty->termios->c_cflag & HUPCL)
+		tty_port_lower_dtr_rts(port);
+
 	spin_lock_irqsave(&port->lock, flags);
 	tty->closing = 0;
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fc39db95..9869436 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -185,7 +185,7 @@ struct tty_port;
 struct tty_port_operations {
 	/* Return 1 if the carrier is raised */
 	int (*carrier_raised)(struct tty_port *port);
-	void (*raise_dtr_rts)(struct tty_port *port);
+	void (*dtr_rts)(struct tty_port *port, int raise);
 };
 	
 struct tty_port {
@@ -438,6 +438,7 @@ extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
 extern void tty_port_raise_dtr_rts(struct tty_port *port);
+extern void tty_port_lower_dtr_rts(struct tty_port *port);
 extern void tty_port_hangup(struct tty_port *port);
 extern int tty_port_block_til_ready(struct tty_port *port,
 				struct tty_struct *tty, struct file *filp);
-- 
cgit v0.10.2


From 1ec739be75a6cb961a46ba0b1982d0edb7f27558 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:25:25 +0100
Subject: tty: Implement a drain delay in the tty port

We need this for devices that cannot flush and wait, but which do not order
data and modem events. Without it we will hang up before all the data
clears the hardware. Needed for the USB changes.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 926d4a5..4d08b6d 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -308,6 +308,17 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f
 	if (port->flags & ASYNC_INITIALIZED &&
 			port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, port->closing_wait);
+	if (port->drain_delay) {
+		unsigned int bps = tty_get_baud_rate(tty);
+		long timeout;
+
+		if (bps > 1200)
+			timeout = max_t(long, (HZ * 10 * port->drain_delay) / bps,
+								HZ / 10);
+		else
+			timeout = 2 * HZ;
+		schedule_timeout_interruptible(timeout);
+	}
 	return 1;
 }
 EXPORT_SYMBOL(tty_port_close_start);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 9869436..bed5a3d 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -201,6 +201,9 @@ struct tty_port {
 	unsigned char		*xmit_buf;	/* Optional buffer */
 	int			close_delay;	/* Close port delay */
 	int			closing_wait;	/* Delay for output */
+	int			drain_delay;	/* Set to zero if no pure time
+						   based drain is needed else
+						   set to size of fifo */
 };
 
 /*
-- 
cgit v0.10.2


From 335f8514f200e63d689113d29cb7253a5c282967 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:26:29 +0100
Subject: tty: Bring the usb tty port structure into more use

This allows us to clean stuff up, but is probably also going to cause
some app breakage with buggy apps as we now implement proper POSIX behaviour
for USB ports matching all the other ports. This does also mean other apps
that break on USB will now work properly.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index b7eacad..2bfd6dd 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -93,8 +93,7 @@ static int  belkin_sa_startup(struct usb_serial *serial);
 static void belkin_sa_shutdown(struct usb_serial *serial);
 static int  belkin_sa_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void belkin_sa_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void belkin_sa_close(struct usb_serial_port *port);
 static void belkin_sa_read_int_callback(struct urb *urb);
 static void belkin_sa_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios * old);
@@ -244,8 +243,7 @@ exit:
 } /* belkin_sa_open */
 
 
-static void belkin_sa_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void belkin_sa_close(struct usb_serial_port *port)
 {
 	dbg("%s port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index ab4cc27..2830766 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -262,32 +262,40 @@ error:	kfree(priv);
 	return r;
 }
 
-static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int ch341_carrier_raised(struct usb_serial_port *port)
+{
+	struct ch341_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & CH341_BIT_DCD)
+		return 1;
+	return 0;
+}
+
+static void ch341_dtr_rts(struct usb_serial_port *port, int on)
 {
 	struct ch341_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
 
 	dbg("%s - port %d", __func__, port->number);
+	/* drop DTR and RTS */
+	spin_lock_irqsave(&priv->lock, flags);
+	if (on)
+		priv->line_control |= CH341_BIT_RTS | CH341_BIT_DTR;
+	else
+		priv->line_control &= ~(CH341_BIT_RTS | CH341_BIT_DTR);
+	spin_unlock_irqrestore(&priv->lock, flags);
+	ch341_set_handshake(port->serial->dev, priv->line_control);
+	wake_up_interruptible(&priv->delta_msr_wait);
+}
+
+static void ch341_close(struct usb_serial_port *port)
+{
+	dbg("%s - port %d", __func__, port->number);
 
 	/* shutdown our urbs */
 	dbg("%s - shutting down urbs", __func__);
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
-
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop DTR and RTS */
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			ch341_set_handshake(port->serial->dev, 0);
-		}
-	}
-	wake_up_interruptible(&priv->delta_msr_wait);
 }
 
 
@@ -302,7 +310,6 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
 	dbg("ch341_open()");
 
 	priv->baud_rate = DEFAULT_BAUD_RATE;
-	priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR;
 
 	r = ch341_configure(serial->dev, priv);
 	if (r)
@@ -322,7 +329,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
 	if (r) {
 		dev_err(&port->dev, "%s - failed submitting interrupt urb,"
 			" error %d\n", __func__, r);
-		ch341_close(tty, port, NULL);
+		ch341_close(port);
 		return -EPROTO;
 	}
 
@@ -343,9 +350,6 @@ static void ch341_set_termios(struct tty_struct *tty,
 
 	dbg("ch341_set_termios()");
 
-	if (!tty || !tty->termios)
-		return;
-
 	baud_rate = tty_get_baud_rate(tty);
 
 	priv->baud_rate = baud_rate;
@@ -568,6 +572,8 @@ static struct usb_serial_driver ch341_device = {
 	.usb_driver        = &ch341_driver,
 	.num_ports         = 1,
 	.open              = ch341_open,
+	.dtr_rts	   = ch341_dtr_rts,
+	.carrier_raised	   = ch341_carrier_raised,
 	.close             = ch341_close,
 	.ioctl             = ch341_ioctl,
 	.set_termios       = ch341_set_termios,
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index 19e2404..247b61b 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -169,7 +169,9 @@ static int usb_console_setup(struct console *co, char *options)
 			kfree(tty);
 		}
 	}
-
+	/* So we know not to kill the hardware on a hangup on this
+	   port. We have also bumped the use count by one so it won't go
+	   idle */
 	port->console = 1;
 	retval = 0;
 
@@ -182,7 +184,7 @@ free_tty:
 	kfree(tty);
 reset_open_count:
 	port->port.count = 0;
-goto out;
+	goto out;
 }
 
 static void usb_console_write(struct console *co,
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index e8d5133..d9f586d 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -36,8 +36,7 @@
 static int cp2101_open(struct tty_struct *, struct usb_serial_port *,
 							struct file *);
 static void cp2101_cleanup(struct usb_serial_port *);
-static void cp2101_close(struct tty_struct *, struct usb_serial_port *,
-							struct file*);
+static void cp2101_close(struct usb_serial_port *);
 static void cp2101_get_termios(struct tty_struct *,
 	struct usb_serial_port *port);
 static void cp2101_get_termios_port(struct usb_serial_port *port,
@@ -398,8 +397,7 @@ static void cp2101_cleanup(struct usb_serial_port *port)
 	}
 }
 
-static void cp2101_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp)
+static void cp2101_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index dd501bb..933ba91 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -61,8 +61,7 @@ static int cyberjack_startup(struct usb_serial *serial);
 static void cyberjack_shutdown(struct usb_serial *serial);
 static int  cyberjack_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void cyberjack_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void cyberjack_close(struct usb_serial_port *port);
 static int cyberjack_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
 static int cyberjack_write_room(struct tty_struct *tty);
@@ -185,8 +184,7 @@ static int  cyberjack_open(struct tty_struct *tty,
 	return result;
 }
 
-static void cyberjack_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void cyberjack_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index e568710..669f938 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -174,8 +174,8 @@ static int  cypress_ca42v2_startup(struct usb_serial *serial);
 static void cypress_shutdown(struct usb_serial *serial);
 static int  cypress_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void cypress_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void cypress_close(struct usb_serial_port *port);
+static void cypress_dtr_rts(struct usb_serial_port *port, int on);
 static int  cypress_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static void cypress_send(struct usb_serial_port *port);
@@ -218,6 +218,7 @@ static struct usb_serial_driver cypress_earthmate_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -244,6 +245,7 @@ static struct usb_serial_driver cypress_hidcom_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -270,6 +272,7 @@ static struct usb_serial_driver cypress_ca42v2_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -656,11 +659,7 @@ static int cypress_open(struct tty_struct *tty,
 	priv->rx_flags = 0;
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* raise both lines and set termios */
-	spin_lock_irqsave(&priv->lock, flags);
-	priv->line_control = CONTROL_DTR | CONTROL_RTS;
-	priv->cmd_ctrl = 1;
-	spin_unlock_irqrestore(&priv->lock, flags);
+	/* Set termios */
 	result = cypress_write(tty, port, NULL, 0);
 
 	if (result) {
@@ -694,76 +693,42 @@ static int cypress_open(struct tty_struct *tty,
 							__func__, result);
 		cypress_set_dead(port);
 	}
-
+	port->port.drain_delay = 256;
 	return result;
 } /* cypress_open */
 
+static void cypress_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct cypress_private *priv = usb_get_serial_port_data(port);
+	/* drop dtr and rts */
+	priv = usb_get_serial_port_data(port);
+	spin_lock_irq(&priv->lock);
+	if (on == 0)
+		priv->line_control = 0;
+	else 
+		priv->line_control = CONTROL_DTR | CONTROL_RTS;
+	priv->cmd_ctrl = 1;
+	spin_unlock_irq(&priv->lock);
+	cypress_write(NULL, port, NULL, 0);
+}
 
-static void cypress_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void cypress_close(struct usb_serial_port *port)
 {
 	struct cypress_private *priv = usb_get_serial_port_data(port);
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from buffer */
-	spin_lock_irq(&priv->lock);
-	timeout = CYPRESS_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (cypress_buf_data_avail(priv->buf) == 0
-		|| timeout == 0 || signal_pending(current)
-		/* without mutex, allowed due to harmless failure mode */
-		|| port->serial->disconnected)
-			break;
-		spin_unlock_irq(&priv->lock);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irq(&priv->lock);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-	/* clear out any remaining data in the buffer */
-	cypress_buf_clear(priv->buf);
-	spin_unlock_irq(&priv->lock);
-
 	/* writing is potentially harmful, lock must be taken */
 	mutex_lock(&port->serial->disc_mutex);
 	if (port->serial->disconnected) {
 		mutex_unlock(&port->serial->disc_mutex);
 		return;
 	}
-	/* wait for characters to drain from device */
-	if (tty) {
-		bps = tty_get_baud_rate(tty);
-		if (bps > 1200)
-			timeout = max((HZ * 2560) / bps, HZ / 10);
-		else
-			timeout = 2 * HZ;
-		schedule_timeout_interruptible(timeout);
-	}
-
+	cypress_buf_clear(priv->buf);
 	dbg("%s - stopping urbs", __func__);
 	usb_kill_urb(port->interrupt_in_urb);
 	usb_kill_urb(port->interrupt_out_urb);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop dtr and rts */
-			priv = usb_get_serial_port_data(port);
-			spin_lock_irq(&priv->lock);
-			priv->line_control = 0;
-			priv->cmd_ctrl = 1;
-			spin_unlock_irq(&priv->lock);
-			cypress_write(tty, port, NULL, 0);
-		}
-	}
 
 	if (stats)
 		dev_info(&port->dev, "Statistics: %d Bytes In | %d Bytes Out | %d Commands Issued\n",
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 38ba4ea..30f5140 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -422,7 +422,6 @@ struct digi_port {
 	int dp_throttled;
 	int dp_throttle_restart;
 	wait_queue_head_t dp_flush_wait;
-	int dp_in_close;			/* close in progress */
 	wait_queue_head_t dp_close_wait;	/* wait queue for close */
 	struct work_struct dp_wakeup_work;
 	struct usb_serial_port *dp_port;
@@ -456,8 +455,9 @@ static int digi_write_room(struct tty_struct *tty);
 static int digi_chars_in_buffer(struct tty_struct *tty);
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	struct file *filp);
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-	struct file *filp);
+static void digi_close(struct usb_serial_port *port);
+static int digi_carrier_raised(struct usb_serial_port *port);
+static void digi_dtr_rts(struct usb_serial_port *port, int on);
 static int digi_startup_device(struct usb_serial *serial);
 static int digi_startup(struct usb_serial *serial);
 static void digi_shutdown(struct usb_serial *serial);
@@ -510,6 +510,8 @@ static struct usb_serial_driver digi_acceleport_2_device = {
 	.num_ports =			3,
 	.open =				digi_open,
 	.close =			digi_close,
+	.dtr_rts =			digi_dtr_rts,
+	.carrier_raised =		digi_carrier_raised,
 	.write =			digi_write,
 	.write_room =			digi_write_room,
 	.write_bulk_callback = 		digi_write_bulk_callback,
@@ -1328,6 +1330,19 @@ static int digi_chars_in_buffer(struct tty_struct *tty)
 
 }
 
+static void digi_dtr_rts(struct usb_serial_port *port, int on)
+{
+	/* Adjust DTR and RTS */
+	digi_set_modem_signals(port, on * (TIOCM_DTR|TIOCM_RTS), 1);
+}
+
+static int digi_carrier_raised(struct usb_serial_port *port)
+{
+	struct digi_port *priv = usb_get_serial_port_data(port);
+	if (priv->dp_modem_signals & TIOCM_CD)
+		return 1;
+	return 0;
+}
 
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 				struct file *filp)
@@ -1336,7 +1351,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	unsigned char buf[32];
 	struct digi_port *priv = usb_get_serial_port_data(port);
 	struct ktermios not_termios;
-	unsigned long flags = 0;
 
 	dbg("digi_open: TOP: port=%d, open_count=%d",
 		priv->dp_port_num, port->port.count);
@@ -1345,26 +1359,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	if (digi_startup_device(port->serial) != 0)
 		return -ENXIO;
 
-	spin_lock_irqsave(&priv->dp_port_lock, flags);
-
-	/* don't wait on a close in progress for non-blocking opens */
-	if (priv->dp_in_close && (filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) {
-		spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-		return -EAGAIN;
-	}
-
-	/* wait for a close in progress to finish */
-	while (priv->dp_in_close) {
-		cond_wait_interruptible_timeout_irqrestore(
-			&priv->dp_close_wait, DIGI_RETRY_TIMEOUT,
-			&priv->dp_port_lock, flags);
-		if (signal_pending(current))
-			return -EINTR;
-		spin_lock_irqsave(&priv->dp_port_lock, flags);
-	}
-
-	spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-
 	/* read modem signals automatically whenever they change */
 	buf[0] = DIGI_CMD_READ_INPUT_SIGNALS;
 	buf[1] = priv->dp_port_num;
@@ -1387,16 +1381,11 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 		not_termios.c_iflag = ~tty->termios->c_iflag;
 		digi_set_termios(tty, port, &not_termios);
 	}
-
-	/* set DTR and RTS */
-	digi_set_modem_signals(port, TIOCM_DTR|TIOCM_RTS, 1);
-
 	return 0;
 }
 
 
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static void digi_close(struct usb_serial_port *port)
 {
 	DEFINE_WAIT(wait);
 	int ret;
@@ -1411,28 +1400,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
 	if (port->serial->disconnected)
 		goto exit;
 
-	/* do cleanup only after final close on this port */
-	spin_lock_irq(&priv->dp_port_lock);
-	priv->dp_in_close = 1;
-	spin_unlock_irq(&priv->dp_port_lock);
-
-	/* tell line discipline to process only XON/XOFF */
-	tty->closing = 1;
-
-	/* wait for output to drain */
-	if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-		tty_wait_until_sent(tty, DIGI_CLOSE_TIMEOUT);
-
-	/* flush driver and line discipline buffers */
-	tty_driver_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
 	if (port->serial->dev) {
-		/* wait for transmit idle */
-		if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-			digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
-		/* drop DTR and RTS */
-		digi_set_modem_signals(port, 0, 0);
+		/* FIXME: Transmit idle belongs in the wait_unti_sent path */
+		digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
 
 		/* disable input flow control */
 		buf[0] = DIGI_CMD_SET_INPUT_FLOW_CONTROL;
@@ -1477,11 +1447,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
 		/* shutdown any outstanding bulk writes */
 		usb_kill_urb(port->write_urb);
 	}
-	tty->closing = 0;
 exit:
 	spin_lock_irq(&priv->dp_port_lock);
 	priv->dp_write_urb_in_use = 0;
-	priv->dp_in_close = 0;
 	wake_up_interruptible(&priv->dp_close_wait);
 	spin_unlock_irq(&priv->dp_port_lock);
 	mutex_unlock(&port->serial->disc_mutex);
@@ -1560,7 +1528,6 @@ static int digi_startup(struct usb_serial *serial)
 		priv->dp_throttled = 0;
 		priv->dp_throttle_restart = 0;
 		init_waitqueue_head(&priv->dp_flush_wait);
-		priv->dp_in_close = 0;
 		init_waitqueue_head(&priv->dp_close_wait);
 		INIT_WORK(&priv->dp_wakeup_work, digi_wakeup_write_lock);
 		priv->dp_port = serial->port[i];
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index c709ec4..2b141cc 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -81,8 +81,7 @@ static int debug;
 /* function prototypes for an empeg-car player */
 static int  empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 						struct file *filp);
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-						struct file *filp);
+static void empeg_close(struct usb_serial_port *port);
 static int  empeg_write(struct tty_struct *tty, struct usb_serial_port *port,
 						const unsigned char *buf,
 						int count);
@@ -181,8 +180,7 @@ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static void empeg_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index d9fcdae..d9d8711 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -719,8 +719,8 @@ static int  ftdi_sio_port_probe(struct usb_serial_port *port);
 static int  ftdi_sio_port_remove(struct usb_serial_port *port);
 static int  ftdi_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void ftdi_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void ftdi_close(struct usb_serial_port *port);
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on);
 static int  ftdi_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static int  ftdi_write_room(struct tty_struct *tty);
@@ -758,6 +758,7 @@ static struct usb_serial_driver ftdi_sio_device = {
 	.port_remove =		ftdi_sio_port_remove,
 	.open =			ftdi_open,
 	.close =		ftdi_close,
+	.dtr_rts =		ftdi_dtr_rts,
 	.throttle =		ftdi_throttle,
 	.unthrottle =		ftdi_unthrottle,
 	.write =		ftdi_write,
@@ -1558,6 +1559,30 @@ static int ftdi_open(struct tty_struct *tty,
 } /* ftdi_open */
 
 
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	char buf[1];
+
+	mutex_lock(&port->serial->disc_mutex);
+	if (!port->serial->disconnected) {
+		/* Disable flow control */
+		if (!on && usb_control_msg(port->serial->dev,
+			    usb_sndctrlpipe(port->serial->dev, 0),
+			    FTDI_SIO_SET_FLOW_CTRL_REQUEST,
+			    FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
+			    0, priv->interface, buf, 0,
+			    WDR_TIMEOUT) < 0) {
+			    dev_err(&port->dev, "error from flowcontrol urb\n");
+		}
+		/* drop RTS and DTR */
+		if (on)
+			set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+		else
+			clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+	}
+	mutex_unlock(&port->serial->disc_mutex);
+}
 
 /*
  * usbserial:__serial_close  only calls ftdi_close if the point is open
@@ -1567,31 +1592,12 @@ static int ftdi_open(struct tty_struct *tty,
  *
  */
 
-static void ftdi_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ftdi_close(struct usb_serial_port *port)
 { /* ftdi_close */
-	unsigned int c_cflag = tty->termios->c_cflag;
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	char buf[1];
 
 	dbg("%s", __func__);
 
-	mutex_lock(&port->serial->disc_mutex);
-	if (c_cflag & HUPCL && !port->serial->disconnected) {
-		/* Disable flow control */
-		if (usb_control_msg(port->serial->dev,
-				    usb_sndctrlpipe(port->serial->dev, 0),
-				    FTDI_SIO_SET_FLOW_CTRL_REQUEST,
-				    FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
-				    0, priv->interface, buf, 0,
-				    WDR_TIMEOUT) < 0) {
-			dev_err(&port->dev, "error from flowcontrol urb\n");
-		}
-
-		/* drop RTS and DTR */
-		clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-	} /* Note change no line if hupcl is off */
-	mutex_unlock(&port->serial->disc_mutex);
 
 	/* cancel any scheduled reading */
 	cancel_delayed_work_sync(&priv->rx_work);
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 586d30f..ee25a3f 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -993,8 +993,7 @@ static int garmin_open(struct tty_struct *tty,
 }
 
 
-static void garmin_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void garmin_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct garmin_data *garmin_data_p = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 4cec990..be82ea9 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -184,8 +184,7 @@ int usb_serial_generic_resume(struct usb_serial *serial)
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_resume);
 
-void usb_serial_generic_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+void usb_serial_generic_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 	generic_cleanup(port);
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index fb4a73d..53ef599 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -207,8 +207,7 @@ static void edge_bulk_out_cmd_callback(struct urb *urb);
 /* function prototypes for the usbserial callbacks */
 static int edge_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filp);
-static void edge_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static void edge_close(struct usb_serial_port *port);
 static int edge_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static int edge_write_room(struct tty_struct *tty);
@@ -965,7 +964,7 @@ static int edge_open(struct tty_struct *tty,
 
 	if (!edge_port->txfifo.fifo) {
 		dbg("%s - no memory", __func__);
-		edge_close(tty, port, filp);
+		edge_close(port);
 		return -ENOMEM;
 	}
 
@@ -975,7 +974,7 @@ static int edge_open(struct tty_struct *tty,
 
 	if (!edge_port->write_urb) {
 		dbg("%s - no memory", __func__);
-		edge_close(tty, port, filp);
+		edge_close(port);
 		return -ENOMEM;
 	}
 
@@ -1099,8 +1098,7 @@ static void block_until_tx_empty(struct edgeport_port *edge_port)
  * edge_close
  *	this function is called by the tty driver when a port is closed
  *****************************************************************************/
-static void edge_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
 	struct edgeport_serial *edge_serial;
 	struct edgeport_port *edge_port;
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 513b25e..eabf20e 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2009,8 +2009,7 @@ release_es_lock:
 	return status;
 }
 
-static void edge_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
 	struct edgeport_serial *edge_serial;
 	struct edgeport_port *edge_port;
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index cd62825..c610a99 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -76,8 +76,7 @@ static int initial_wait;
 /* Function prototypes for an ipaq */
 static int  ipaq_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void ipaq_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void ipaq_close(struct usb_serial_port *port);
 static int  ipaq_calc_num_ports(struct usb_serial *serial);
 static int  ipaq_startup(struct usb_serial *serial);
 static void ipaq_shutdown(struct usb_serial *serial);
@@ -714,8 +713,7 @@ error:
 }
 
 
-static void ipaq_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ipaq_close(struct usb_serial_port *port)
 {
 	struct ipaq_private	*priv = usb_get_serial_port_data(port);
 
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index da2a2b4..29ad038 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -302,23 +302,17 @@ static int ipw_open(struct tty_struct *tty,
 	return 0;
 }
 
-static void ipw_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ipw_dtr_rts(struct usb_serial_port *port, int on)
 {
 	struct usb_device *dev = port->serial->dev;
 	int result;
 
-	if (tty_hung_up_p(filp)) {
-		dbg("%s: tty_hung_up_p ...", __func__);
-		return;
-	}
-
 	/*--1: drop the dtr */
 	dbg("%s:dropping dtr", __func__);
 	result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 			 IPW_SIO_SET_PIN,
 			 USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT,
-			 IPW_PIN_CLRDTR,
+			 on ? IPW_PIN_SETDTR : IPW_PIN_CLRDTR,
 			 0,
 			 NULL,
 			 0,
@@ -332,7 +326,7 @@ static void ipw_close(struct tty_struct *tty,
 	result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 			 IPW_SIO_SET_PIN, USB_TYPE_VENDOR |
 			 		USB_RECIP_INTERFACE | USB_DIR_OUT,
-			 IPW_PIN_CLRRTS,
+			 on ? IPW_PIN_SETRTS : IPW_PIN_CLRRTS,
 			 0,
 			 NULL,
 			 0,
@@ -340,7 +334,12 @@ static void ipw_close(struct tty_struct *tty,
 	if (result < 0)
 		dev_err(&port->dev,
 				"dropping rts failed (error = %d)\n", result);
+}
 
+static void ipw_close(struct usb_serial_port *port)
+{
+	struct usb_device *dev = port->serial->dev;
+	int result;
 
 	/*--3: purge */
 	dbg("%s:sending purge", __func__);
@@ -461,6 +460,7 @@ static struct usb_serial_driver ipw_device = {
 	.num_ports =		1,
 	.open =			ipw_open,
 	.close =		ipw_close,
+	.dtr_rts =		ipw_dtr_rts,
 	.port_probe = 		ipw_probe,
 	.port_remove =		ipw_disconnect,
 	.write =		ipw_write,
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 4e2cda9..66009b6 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -88,8 +88,7 @@ static int xbof = -1;
 static int  ir_startup (struct usb_serial *serial);
 static int  ir_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filep);
-static void ir_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filep);
+static void ir_close(struct usb_serial_port *port);
 static int  ir_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static void ir_write_bulk_callback (struct urb *urb);
@@ -346,8 +345,7 @@ static int ir_open(struct tty_struct *tty,
 	return result;
 }
 
-static void ir_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file * filp)
+static void ir_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index 4473d44..bb572ce 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -70,7 +70,6 @@ static void read_rxcmd_callback(struct urb *urb);
 struct iuu_private {
 	spinlock_t lock;	/* store irq state */
 	wait_queue_head_t delta_msr_wait;
-	u8 line_control;
 	u8 line_status;
 	u8 termios_initialized;
 	int tiostatus;		/* store IUART SIGNAL for tiocmget call */
@@ -946,19 +945,10 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud,
 	return status;
 }
 
-static int set_control_lines(struct usb_device *dev, u8 value)
-{
-	return 0;
-}
-
-static void iuu_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void iuu_close(struct usb_serial_port *port)
 {
 	/* iuu_led (port,255,0,0,0); */
 	struct usb_serial *serial;
-	struct iuu_private *priv = usb_get_serial_port_data(port);
-	unsigned long flags;
-	unsigned int c_cflag;
 
 	serial = port->serial;
 	if (!serial)
@@ -968,17 +958,6 @@ static void iuu_close(struct tty_struct *tty,
 
 	iuu_uart_off(port);
 	if (serial->dev) {
-		if (tty) {
-			c_cflag = tty->termios->c_cflag;
-			if (c_cflag & HUPCL) {
-				/* drop DTR and RTS */
-				priv = usb_get_serial_port_data(port);
-				spin_lock_irqsave(&priv->lock, flags);
-				priv->line_control = 0;
-				spin_unlock_irqrestore(&priv->lock, flags);
-				set_control_lines(port->serial->dev, 0);
-			}
-		}
 		/* free writebuf */
 		/* shutdown our urbs */
 		dbg("%s - shutting down urbs", __func__);
@@ -1154,7 +1133,7 @@ static int iuu_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting read urb,"
 			" error %d\n", __func__, result);
-		iuu_close(tty, port, NULL);
+		iuu_close(port);
 		return -EPROTO;
 	} else {
 		dbg("%s - rxcmd OK", __func__);
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index 00daa8f..f1195a9 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -1298,8 +1298,16 @@ static inline void stop_urb(struct urb *urb)
 		usb_kill_urb(urb);
 }
 
-static void keyspan_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void keyspan_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct keyspan_port_private *p_priv = usb_get_serial_port_data(port);
+
+	p_priv->rts_state = on;
+	p_priv->dtr_state = on;
+	keyspan_send_setup(port, 0);
+}
+
+static void keyspan_close(struct usb_serial_port *port)
 {
 	int			i;
 	struct usb_serial	*serial = port->serial;
@@ -1336,7 +1344,6 @@ static void keyspan_close(struct tty_struct *tty,
 			stop_urb(p_priv->out_urbs[i]);
 		}
 	}
-	tty_port_tty_set(&port->port, NULL);
 }
 
 /* download the firmware to a pre-renumeration device */
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index 38b4582..0d4569b 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -38,9 +38,8 @@
 static int  keyspan_open		(struct tty_struct *tty,
 					 struct usb_serial_port *port,
 					 struct file *filp);
-static void keyspan_close		(struct tty_struct *tty,
-					 struct usb_serial_port *port,
-					 struct file *filp);
+static void keyspan_close		(struct usb_serial_port *port);
+static void keyspan_dtr_rts		(struct usb_serial_port *port, int on);
 static int  keyspan_startup		(struct usb_serial *serial);
 static void keyspan_shutdown		(struct usb_serial *serial);
 static int  keyspan_write_room		(struct tty_struct *tty);
@@ -562,6 +561,7 @@ static struct usb_serial_driver keyspan_1port_device = {
 	.num_ports		= 1,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
@@ -582,6 +582,7 @@ static struct usb_serial_driver keyspan_2port_device = {
 	.num_ports		= 2,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
@@ -602,6 +603,7 @@ static struct usb_serial_driver keyspan_4port_device = {
 	.num_ports		= 4,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index bf1ae24..ab769db 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -651,6 +651,35 @@ static int keyspan_pda_chars_in_buffer(struct tty_struct *tty)
 }
 
 
+static void keyspan_pda_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct usb_serial *serial = port->serial;
+
+	if (serial->dev) {
+		if (on)
+			keyspan_pda_set_modem_info(serial, (1<<7) | (1<< 2));
+		else
+			keyspan_pda_set_modem_info(serial, 0);
+	}
+}
+
+static int keyspan_pda_carrier_raised(struct usb_serial_port *port)
+{
+	struct usb_serial *serial = port->serial;
+	unsigned char modembits;
+
+	/* If we can read the modem status and the DCD is low then
+	   carrier is not raised yet */
+	if (keyspan_pda_get_modem_info(serial, &modembits) >= 0) {
+		if (!(modembits & (1>>6)))
+			return 0;
+	}
+	/* Carrier raised, or we failed (eg disconnected) so
+	   progress accordingly */
+	return 1;
+}
+
+
 static int keyspan_pda_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp)
 {
@@ -682,13 +711,6 @@ static int keyspan_pda_open(struct tty_struct *tty,
 	priv->tx_room = room;
 	priv->tx_throttled = room ? 0 : 1;
 
-	/* the normal serial device seems to always turn on DTR and RTS here,
-	   so do the same */
-	if (tty && (tty->termios->c_cflag & CBAUD))
-		keyspan_pda_set_modem_info(serial, (1<<7) | (1<<2));
-	else
-		keyspan_pda_set_modem_info(serial, 0);
-
 	/*Start reading from the device*/
 	port->interrupt_in_urb->dev = serial->dev;
 	rc = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
@@ -700,19 +722,11 @@ static int keyspan_pda_open(struct tty_struct *tty,
 error:
 	return rc;
 }
-
-
-static void keyspan_pda_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void keyspan_pda_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 
 	if (serial->dev) {
-		/* the normal serial device seems to always shut
-		   off DTR and RTS now */
-		if (tty->termios->c_cflag & HUPCL)
-			keyspan_pda_set_modem_info(serial, 0);
-
 		/* shutdown our bulk reads and writes */
 		usb_kill_urb(port->write_urb);
 		usb_kill_urb(port->interrupt_in_urb);
@@ -839,6 +853,8 @@ static struct usb_serial_driver keyspan_pda_device = {
 	.usb_driver = 		&keyspan_pda_driver,
 	.id_table =		id_table_std,
 	.num_ports =		1,
+	.dtr_rts =		keyspan_pda_dtr_rts,
+	.carrier_raised	=	keyspan_pda_carrier_raised,
 	.open =			keyspan_pda_open,
 	.close =		keyspan_pda_close,
 	.write =		keyspan_pda_write,
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index fcd9082..fa817c6 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -76,8 +76,7 @@ static int  klsi_105_startup(struct usb_serial *serial);
 static void klsi_105_shutdown(struct usb_serial *serial);
 static int  klsi_105_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void klsi_105_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void klsi_105_close(struct usb_serial_port *port);
 static int  klsi_105_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
 static void klsi_105_write_bulk_callback(struct urb *urb);
@@ -447,8 +446,7 @@ exit:
 } /* klsi_105_open */
 
 
-static void klsi_105_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void klsi_105_close(struct usb_serial_port *port)
 {
 	struct klsi_105_private *priv = usb_get_serial_port_data(port);
 	int rc;
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index c148544..6b57049 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -72,8 +72,7 @@ static int  kobil_startup(struct usb_serial *serial);
 static void kobil_shutdown(struct usb_serial *serial);
 static int  kobil_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void kobil_close(struct tty_struct *tty, struct usb_serial_port *port,
-			struct file *filp);
+static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
 			 const unsigned char *buf, int count);
 static int  kobil_write_room(struct tty_struct *tty);
@@ -209,7 +208,7 @@ static void kobil_shutdown(struct usb_serial *serial)
 
 	for (i = 0; i < serial->num_ports; ++i) {
 		while (serial->port[i]->port.count > 0)
-			kobil_close(NULL, serial->port[i], NULL);
+			kobil_close(serial->port[i]);
 		kfree(usb_get_serial_port_data(serial->port[i]));
 		usb_set_serial_port_data(serial->port[i], NULL);
 	}
@@ -346,11 +345,11 @@ static int kobil_open(struct tty_struct *tty,
 }
 
 
-static void kobil_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void kobil_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
+	/* FIXME: Add rts/dtr methods */
 	if (port->write_urb) {
 		usb_kill_urb(port->write_urb);
 		usb_free_urb(port->write_urb);
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index 82930a7..8737955 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -95,8 +95,8 @@ static int  mct_u232_startup(struct usb_serial *serial);
 static void mct_u232_shutdown(struct usb_serial *serial);
 static int  mct_u232_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void mct_u232_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void mct_u232_close(struct usb_serial_port *port);
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on);
 static void mct_u232_read_int_callback(struct urb *urb);
 static void mct_u232_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
@@ -140,6 +140,7 @@ static struct usb_serial_driver mct_u232_device = {
 	.num_ports =	     1,
 	.open =		     mct_u232_open,
 	.close =	     mct_u232_close,
+	.dtr_rts =	     mct_u232_dtr_rts,
 	.throttle =	     mct_u232_throttle,
 	.unthrottle =	     mct_u232_unthrottle,
 	.read_int_callback = mct_u232_read_int_callback,
@@ -496,29 +497,29 @@ error:
 	return retval;
 } /* mct_u232_open */
 
-
-static void mct_u232_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on)
 {
-	unsigned int c_cflag;
 	unsigned int control_state;
 	struct mct_u232_private *priv = usb_get_serial_port_data(port);
-	dbg("%s port %d", __func__, port->number);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		mutex_lock(&port->serial->disc_mutex);
-		if (c_cflag & HUPCL && !port->serial->disconnected) {
-			/* drop DTR and RTS */
-			spin_lock_irq(&priv->lock);
+	mutex_lock(&port->serial->disc_mutex);
+	if (!port->serial->disconnected) {
+		/* drop DTR and RTS */
+		spin_lock_irq(&priv->lock);
+		if (on)
+			priv->control_state |= TIOCM_DTR | TIOCM_RTS;
+		else
 			priv->control_state &= ~(TIOCM_DTR | TIOCM_RTS);
-			control_state = priv->control_state;
-			spin_unlock_irq(&priv->lock);
-			mct_u232_set_modem_ctrl(port->serial, control_state);
-		}
-		mutex_unlock(&port->serial->disc_mutex);
+		control_state = priv->control_state;
+		spin_unlock_irq(&priv->lock);
+		mct_u232_set_modem_ctrl(port->serial, control_state);
 	}
+	mutex_unlock(&port->serial->disc_mutex);
+}
 
+static void mct_u232_close(struct usb_serial_port *port)
+{
+	dbg("%s port %d", __func__, port->number);
 
 	if (port->serial->dev) {
 		/* shutdown our urbs */
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 24e3b5d..9e1a013 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -533,8 +533,7 @@ static int mos7720_chars_in_buffer(struct tty_struct *tty)
 	return chars;
 }
 
-static void mos7720_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mos7720_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial;
 	struct moschip_port *mos7720_port;
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 84fb1dc..10b78a3 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -1135,54 +1135,12 @@ static int mos7840_chars_in_buffer(struct tty_struct *tty)
 
 }
 
-/************************************************************************
- *
- * mos7840_block_until_tx_empty
- *
- *	This function will block the close until one of the following:
- *		1. TX count are 0
- *		2. The mos7840 has stopped
- *		3. A timeout of 3 seconds without activity has expired
- *
- ************************************************************************/
-static void mos7840_block_until_tx_empty(struct tty_struct *tty,
-				struct moschip_port *mos7840_port)
-{
-	int timeout = HZ / 10;
-	int wait = 30;
-	int count;
-
-	while (1) {
-
-		count = mos7840_chars_in_buffer(tty);
-
-		/* Check for Buffer status */
-		if (count <= 0)
-			return;
-
-		/* Block the thread for a while */
-		interruptible_sleep_on_timeout(&mos7840_port->wait_chase,
-					       timeout);
-
-		/* No activity.. count down section */
-		wait--;
-		if (wait == 0) {
-			dbg("%s - TIMEOUT", __func__);
-			return;
-		} else {
-			/* Reset timeout value back to seconds */
-			wait = 30;
-		}
-	}
-}
-
 /*****************************************************************************
  * mos7840_close
  *	this function is called by the tty driver when a port is closed
  *****************************************************************************/
 
-static void mos7840_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mos7840_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial;
 	struct moschip_port *mos7840_port;
@@ -1223,10 +1181,6 @@ static void mos7840_close(struct tty_struct *tty,
 		}
 	}
 
-	if (serial->dev)
-		/* flush and block until tx is empty */
-		mos7840_block_until_tx_empty(tty, mos7840_port);
-
 	/* While closing port, shutdown all bulk read, write  *
 	 * and interrupt read if they exists                  */
 	if (serial->dev) {
diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c
index bcdcbb8..f5f3751a8 100644
--- a/drivers/usb/serial/navman.c
+++ b/drivers/usb/serial/navman.c
@@ -98,8 +98,7 @@ static int navman_open(struct tty_struct *tty,
 	return result;
 }
 
-static void navman_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void navman_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index df65397..1104617 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -66,8 +66,7 @@ static int debug;
 /* function prototypes */
 static int  omninet_open(struct tty_struct *tty, struct usb_serial_port *port,
 							struct file *filp);
-static void omninet_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static void omninet_close(struct usb_serial_port *port);
 static void omninet_read_bulk_callback(struct urb *urb);
 static void omninet_write_bulk_callback(struct urb *urb);
 static int  omninet_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -189,8 +188,7 @@ static int omninet_open(struct tty_struct *tty,
 	return result;
 }
 
-static void omninet_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void omninet_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 	usb_kill_urb(port->read_urb);
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index b500ad1..c20480a 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -173,8 +173,7 @@ static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port,
 	return result;
 }
 
-static void opticon_close(struct tty_struct *tty, struct usb_serial_port *port,
-			  struct file *filp)
+static void opticon_close(struct usb_serial_port *port)
 {
 	struct opticon_private *priv = usb_get_serial_data(port->serial);
 
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 7817b82..a16d69f 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -45,8 +45,9 @@
 /* Function prototypes */
 static int  option_open(struct tty_struct *tty, struct usb_serial_port *port,
 							struct file *filp);
-static void option_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static void option_close(struct usb_serial_port *port);
+static void option_dtr_rts(struct usb_serial_port *port, int on);
+
 static int  option_startup(struct usb_serial *serial);
 static void option_shutdown(struct usb_serial *serial);
 static int  option_write_room(struct tty_struct *tty);
@@ -61,7 +62,7 @@ static void option_set_termios(struct tty_struct *tty,
 static int  option_tiocmget(struct tty_struct *tty, struct file *file);
 static int  option_tiocmset(struct tty_struct *tty, struct file *file,
 				unsigned int set, unsigned int clear);
-static int  option_send_setup(struct tty_struct *tty, struct usb_serial_port *port);
+static int  option_send_setup(struct usb_serial_port *port);
 static int  option_suspend(struct usb_serial *serial, pm_message_t message);
 static int  option_resume(struct usb_serial *serial);
 
@@ -551,6 +552,7 @@ static struct usb_serial_driver option_1port_device = {
 	.num_ports         = 1,
 	.open              = option_open,
 	.close             = option_close,
+	.dtr_rts	   = option_dtr_rts,
 	.write             = option_write,
 	.write_room        = option_write_room,
 	.chars_in_buffer   = option_chars_in_buffer,
@@ -630,7 +632,7 @@ static void option_set_termios(struct tty_struct *tty,
 	dbg("%s", __func__);
 	/* Doesn't support option setting */
 	tty_termios_copy_hw(tty->termios, old_termios);
-	option_send_setup(tty, port);
+	option_send_setup(port);
 }
 
 static int option_tiocmget(struct tty_struct *tty, struct file *file)
@@ -669,7 +671,7 @@ static int option_tiocmset(struct tty_struct *tty, struct file *file,
 		portdata->rts_state = 0;
 	if (clear & TIOCM_DTR)
 		portdata->dtr_state = 0;
-	return option_send_setup(tty, port);
+	return option_send_setup(port);
 }
 
 /* Write */
@@ -897,10 +899,6 @@ static int option_open(struct tty_struct *tty,
 
 	dbg("%s", __func__);
 
-	/* Set some sane defaults */
-	portdata->rts_state = 1;
-	portdata->dtr_state = 1;
-
 	/* Reset low level data toggle and start reading from endpoints */
 	for (i = 0; i < N_IN_URB; i++) {
 		urb = portdata->in_urbs[i];
@@ -936,37 +934,43 @@ static int option_open(struct tty_struct *tty,
 				usb_pipeout(urb->pipe), 0); */
 	}
 
-	option_send_setup(tty, port);
+	option_send_setup(port);
 
 	return 0;
 }
 
-static void option_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void option_dtr_rts(struct usb_serial_port *port, int on)
 {
-	int i;
 	struct usb_serial *serial = port->serial;
 	struct option_port_private *portdata;
 
 	dbg("%s", __func__);
 	portdata = usb_get_serial_port_data(port);
+	mutex_lock(&serial->disc_mutex);
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
+	if (serial->dev)
+		option_send_setup(port);
+	mutex_unlock(&serial->disc_mutex);
+}
 
-	portdata->rts_state = 0;
-	portdata->dtr_state = 0;
 
-	if (serial->dev) {
-		mutex_lock(&serial->disc_mutex);
-		if (!serial->disconnected)
-			option_send_setup(tty, port);
-		mutex_unlock(&serial->disc_mutex);
+static void option_close(struct usb_serial_port *port)
+{
+	int i;
+	struct usb_serial *serial = port->serial;
+	struct option_port_private *portdata;
+
+	dbg("%s", __func__);
+	portdata = usb_get_serial_port_data(port);
 
+	if (serial->dev) {
 		/* Stop reading/writing urbs */
 		for (i = 0; i < N_IN_URB; i++)
 			usb_kill_urb(portdata->in_urbs[i]);
 		for (i = 0; i < N_OUT_URB; i++)
 			usb_kill_urb(portdata->out_urbs[i]);
 	}
-	tty_port_tty_set(&port->port, NULL);
 }
 
 /* Helper functions used by option_setup_urbs */
@@ -1032,28 +1036,24 @@ static void option_setup_urbs(struct usb_serial *serial)
  * This is exactly the same as SET_CONTROL_LINE_STATE from the PSTN
  * CDC.
 */
-static int option_send_setup(struct tty_struct *tty,
-						struct usb_serial_port *port)
+static int option_send_setup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct option_port_private *portdata;
 	int ifNum = serial->interface->cur_altsetting->desc.bInterfaceNumber;
+	int val = 0;
 	dbg("%s", __func__);
 
 	portdata = usb_get_serial_port_data(port);
 
-	if (tty) {
-		int val = 0;
-		if (portdata->dtr_state)
-			val |= 0x01;
-		if (portdata->rts_state)
-			val |= 0x02;
+	if (portdata->dtr_state)
+		val |= 0x01;
+	if (portdata->rts_state)
+		val |= 0x02;
 
-		return usb_control_msg(serial->dev,
-			usb_rcvctrlpipe(serial->dev, 0),
-			0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
-	}
-	return 0;
+	return usb_control_msg(serial->dev,
+		usb_rcvctrlpipe(serial->dev, 0),
+		0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
 }
 
 static int option_startup(struct usb_serial *serial)
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index ba551f0..7de5478 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -143,8 +143,7 @@ struct oti6858_control_pkt {
 /* function prototypes */
 static int oti6858_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void oti6858_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void oti6858_close(struct usb_serial_port *port);
 static void oti6858_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
 static int oti6858_ioctl(struct tty_struct *tty, struct file *file,
@@ -622,67 +621,30 @@ static int oti6858_open(struct tty_struct *tty,
 	if (result != 0) {
 		dev_err(&port->dev, "%s(): usb_submit_urb() failed"
 			       " with error %d\n", __func__, result);
-		oti6858_close(tty, port, NULL);
+		oti6858_close(port);
 		return -EPROTO;
 	}
 
 	/* setup termios */
 	if (tty)
 		oti6858_set_termios(tty, port, &tmp_termios);
-
+	port->port.drain_delay = 256;	/* FIXME: check the FIFO length */
 	return 0;
 }
 
-static void oti6858_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void oti6858_close(struct usb_serial_port *port)
 {
 	struct oti6858_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s(port = %d)", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = 30 * HZ;	/* PL2303_CLOSING_WAIT */
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	dbg("%s(): entering wait loop", __func__);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (oti6858_buf_data_avail(priv->buf) == 0
-		|| timeout == 0 || signal_pending(current)
-		|| port->serial->disconnected)
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-	dbg("%s(): after wait loop", __func__);
-
 	/* clear out any remaining data in the buffer */
 	oti6858_buf_clear(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device */
-	/* (this is long enough for the entire 256 byte */
-	/* pl2303 hardware buffer to drain with no flow */
-	/* control for data rates of 1200 bps or more, */
-	/* for lower rates we should really know how much */
-	/* data is in the buffer to compute a delay */
-	/* that is not unnecessarily long) */
-	/* FIXME
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560)/bps,HZ/10);
-	else
-	*/
-		timeout = 2*HZ;
-	schedule_timeout_interruptible(timeout);
-	dbg("%s(): after schedule_timeout_interruptible()", __func__);
+	dbg("%s(): after buf_clear()", __func__);
 
 	/* cancel scheduled setup */
 	cancel_delayed_work(&priv->delayed_setup_work);
@@ -694,15 +656,6 @@ static void oti6858_close(struct tty_struct *tty,
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
-
-	/*
-	if (tty && (tty->termios->c_cflag) & HUPCL) {
-		// drop DTR and RTS
-		spin_lock_irqsave(&priv->lock, flags);
-		priv->pending_setup.control &= ~CONTROL_MASK;
-		spin_unlock_irqrestore(&priv->lock, flags);
-	}
-	*/
 }
 
 static int oti6858_tiocmset(struct tty_struct *tty, struct file *file,
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 751a533..e02dc3d 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -652,69 +652,41 @@ static void pl2303_set_termios(struct tty_struct *tty,
 	kfree(buf);
 }
 
-static void pl2303_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void pl2303_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct pl2303_private *priv = usb_get_serial_port_data(port);
+	unsigned long flags;
+	u8 control;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	/* Change DTR and RTS */
+	if (on)
+		priv->line_control |= (CONTROL_DTR | CONTROL_RTS);
+	else
+		priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS);
+	control = priv->line_control;
+	spin_unlock_irqrestore(&priv->lock, flags);
+	set_control_lines(port->serial->dev, control);
+}
+
+static void pl2303_close(struct usb_serial_port *port)
 {
 	struct pl2303_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = PL2303_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (pl2303_buf_data_avail(priv->buf) == 0 ||
-		    timeout == 0 || signal_pending(current) ||
-		    port->serial->disconnected)
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
 	/* clear out any remaining data in the buffer */
 	pl2303_buf_clear(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device */
-	/* (this is long enough for the entire 256 byte */
-	/* pl2303 hardware buffer to drain with no flow */
-	/* control for data rates of 1200 bps or more, */
-	/* for lower rates we should really know how much */
-	/* data is in the buffer to compute a delay */
-	/* that is not unnecessarily long) */
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560)/bps, HZ/10);
-	else
-		timeout = 2*HZ;
-	schedule_timeout_interruptible(timeout);
-
 	/* shutdown our urbs */
 	dbg("%s - shutting down urbs", __func__);
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop DTR and RTS */
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			set_control_lines(port->serial->dev, 0);
-		}
-	}
 }
 
 static int pl2303_open(struct tty_struct *tty,
@@ -748,7 +720,7 @@ static int pl2303_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting read urb,"
 			" error %d\n", __func__, result);
-		pl2303_close(tty, port, NULL);
+		pl2303_close(port);
 		return -EPROTO;
 	}
 
@@ -758,9 +730,10 @@ static int pl2303_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting interrupt urb,"
 			" error %d\n", __func__, result);
-		pl2303_close(tty, port, NULL);
+		pl2303_close(port);
 		return -EPROTO;
 	}
+	port->port.drain_delay = 256;
 	return 0;
 }
 
@@ -821,6 +794,14 @@ static int pl2303_tiocmget(struct tty_struct *tty, struct file *file)
 	return result;
 }
 
+static int pl2303_carrier_raised(struct usb_serial_port *port)
+{
+	struct pl2303_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & UART_DCD)
+		return 1;
+	return 0;
+}
+
 static int wait_modem_info(struct usb_serial_port *port, unsigned int arg)
 {
 	struct pl2303_private *priv = usb_get_serial_port_data(port);
@@ -1125,6 +1106,8 @@ static struct usb_serial_driver pl2303_device = {
 	.num_ports =		1,
 	.open =			pl2303_open,
 	.close =		pl2303_close,
+	.dtr_rts = 		pl2303_dtr_rts,
+	.carrier_raised =	pl2303_carrier_raised,
 	.write =		pl2303_write,
 	.ioctl =		pl2303_ioctl,
 	.break_ctl =		pl2303_break_ctl,
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 913225c..1319b89 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -240,57 +240,39 @@ struct sierra_port_private {
 	int ri_state;
 };
 
-static int sierra_send_setup(struct tty_struct *tty,
-						struct usb_serial_port *port)
+static int sierra_send_setup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 	__u16 interface = 0;
+	int val = 0;
 
 	dev_dbg(&port->dev, "%s", __func__);
 
 	portdata = usb_get_serial_port_data(port);
 
-	if (tty) {
-		int val = 0;
-		if (portdata->dtr_state)
-			val |= 0x01;
-		if (portdata->rts_state)
-			val |= 0x02;
-
-		/* If composite device then properly report interface */
-		if (serial->num_ports == 1) {
-			interface = sierra_calc_interface(serial);
-
-			/* Control message is sent only to interfaces with
-			 * interrupt_in endpoints
-			 */
-			if (port->interrupt_in_urb) {
-				/* send control message */
-				return usb_control_msg(serial->dev,
-					usb_rcvctrlpipe(serial->dev, 0),
-					0x22, 0x21, val, interface,
-					NULL, 0, USB_CTRL_SET_TIMEOUT);
-			}
-		}
-
-		/* Otherwise the need to do non-composite mapping */
-		else {
-			if (port->bulk_out_endpointAddress == 2)
-				interface = 0;
-			else if (port->bulk_out_endpointAddress == 4)
-				interface = 1;
-			else if (port->bulk_out_endpointAddress == 5)
-				interface = 2;
-
-			return usb_control_msg(serial->dev,
-				usb_rcvctrlpipe(serial->dev, 0),
-				0x22, 0x21, val, interface,
-				NULL, 0, USB_CTRL_SET_TIMEOUT);
-
-		}
+	if (portdata->dtr_state)
+		val |= 0x01;
+	if (portdata->rts_state)
+		val |= 0x02;
+
+	/* If composite device then properly report interface */
+	if (serial->num_ports == 1)
+		interface = sierra_calc_interface(serial);
+
+	/* Otherwise the need to do non-composite mapping */
+	else {
+		if (port->bulk_out_endpointAddress == 2)
+			interface = 0;
+		else if (port->bulk_out_endpointAddress == 4)
+			interface = 1;
+		else if (port->bulk_out_endpointAddress == 5)
+			interface = 2;
 	}
-
+	return usb_control_msg(serial->dev,
+			usb_rcvctrlpipe(serial->dev, 0),
+			0x22, 0x21, val, interface,
+			NULL, 0, USB_CTRL_SET_TIMEOUT);
 	return 0;
 }
 
@@ -299,7 +281,7 @@ static void sierra_set_termios(struct tty_struct *tty,
 {
 	dev_dbg(&port->dev, "%s", __func__);
 	tty_termios_copy_hw(tty->termios, old_termios);
-	sierra_send_setup(tty, port);
+	sierra_send_setup(port);
 }
 
 static int sierra_tiocmget(struct tty_struct *tty, struct file *file)
@@ -338,7 +320,7 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file,
 		portdata->rts_state = 0;
 	if (clear & TIOCM_DTR)
 		portdata->dtr_state = 0;
-	return sierra_send_setup(tty, port);
+	return sierra_send_setup(port);
 }
 
 static void sierra_outdat_callback(struct urb *urb)
@@ -598,7 +580,7 @@ static int sierra_open(struct tty_struct *tty,
 		}
 	}
 
-	sierra_send_setup(tty, port);
+	sierra_send_setup(port);
 
 	/* start up the interrupt endpoint if we have one */
 	if (port->interrupt_in_urb) {
@@ -610,32 +592,38 @@ static int sierra_open(struct tty_struct *tty,
 	return 0;
 }
 
-static void sierra_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void sierra_dtr_rts(struct usb_serial_port *port, int on)
 {
-	int i;
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
-	dev_dbg(&port->dev, "%s", __func__);
 	portdata = usb_get_serial_port_data(port);
-
-	portdata->rts_state = 0;
-	portdata->dtr_state = 0;
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
 
 	if (serial->dev) {
 		mutex_lock(&serial->disc_mutex);
 		if (!serial->disconnected)
-			sierra_send_setup(tty, port);
+			sierra_send_setup(port);
 		mutex_unlock(&serial->disc_mutex);
+	}
+}
+
+static void sierra_close(struct usb_serial_port *port)
+{
+	int i;
+	struct usb_serial *serial = port->serial;
+	struct sierra_port_private *portdata;
 
+	dev_dbg(&port->dev, "%s", __func__);
+	portdata = usb_get_serial_port_data(port);
+
+	if (serial->dev) {
 		/* Stop reading/writing urbs */
 		for (i = 0; i < N_IN_URB; i++)
 			usb_kill_urb(portdata->in_urbs[i]);
 	}
-
 	usb_kill_urb(port->interrupt_in_urb);
-	tty_port_tty_set(&port->port, NULL);
 }
 
 static int sierra_startup(struct usb_serial *serial)
@@ -737,6 +725,7 @@ static struct usb_serial_driver sierra_device = {
 	.probe		   = sierra_probe,
 	.open              = sierra_open,
 	.close             = sierra_close,
+	.dtr_rts	   = sierra_dtr_rts,
 	.write             = sierra_write,
 	.write_room        = sierra_write_room,
 	.set_termios       = sierra_set_termios,
diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c
index 5e7528c..8f7ed8f 100644
--- a/drivers/usb/serial/spcp8x5.c
+++ b/drivers/usb/serial/spcp8x5.c
@@ -446,66 +446,47 @@ static void spcp8x5_set_workMode(struct usb_device *dev, u16 value,
 			"RTSCTS usb_control_msg(enable flowctrl) = %d\n", ret);
 }
 
+static int spcp8x5_carrier_raised(struct usb_serial_port *port)
+{
+	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & MSR_STATUS_LINE_DCD)
+		return 1;
+	return 0;
+}
+
+static void spcp8x5_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
+	unsigned long flags;
+	u8 control;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (on)
+		priv->line_control = MCR_CONTROL_LINE_DTR
+						| MCR_CONTROL_LINE_RTS;
+	else
+		priv->line_control &= ~ (MCR_CONTROL_LINE_DTR
+						| MCR_CONTROL_LINE_RTS);
+	control = priv->line_control;
+	spin_unlock_irqrestore(&priv->lock, flags);
+	spcp8x5_set_ctrlLine(port->serial->dev, control , priv->type);
+}
+
 /* close the serial port. We should wait for data sending to device 1st and
  * then kill all urb. */
-static void spcp8x5_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void spcp8x5_close(struct usb_serial_port *port)
 {
 	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 	int result;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = SPCP8x5_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (ringbuf_avail_data(priv->buf) == 0 ||
-		    timeout == 0 || signal_pending(current))
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-
 	/* clear out any remaining data in the buffer */
 	clear_ringbuf(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device (this is long enough
-	 * for the entire all byte spcp8x5 hardware buffer to drain with no
-	 * flow control for data rates of 1200 bps or more, for lower rates we
-	 * should really know how much data is in the buffer to compute a delay
-	 * that is not unnecessarily long) */
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560) / bps, HZ/10);
-	else
-		timeout = 2*HZ;
-	set_current_state(TASK_INTERRUPTIBLE);
-	schedule_timeout(timeout);
-
-	/* clear control lines */
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			spcp8x5_set_ctrlLine(port->serial->dev, 0 , priv->type);
-		}
-	}
-
 	/* kill urb */
 	if (port->write_urb != NULL) {
 		result = usb_unlink_urb(port->write_urb);
@@ -665,13 +646,6 @@ static int spcp8x5_open(struct tty_struct *tty,
 	if (ret)
 		return ret;
 
-	spin_lock_irqsave(&priv->lock, flags);
-	if (tty && (tty->termios->c_cflag & CBAUD))
-		priv->line_control = MCR_DTR | MCR_RTS;
-	else
-		priv->line_control = 0;
-	spin_unlock_irqrestore(&priv->lock, flags);
-
 	spcp8x5_set_ctrlLine(serial->dev, priv->line_control , priv->type);
 
 	/* Setup termios */
@@ -691,9 +665,10 @@ static int spcp8x5_open(struct tty_struct *tty,
 	port->read_urb->dev = serial->dev;
 	ret = usb_submit_urb(port->read_urb, GFP_KERNEL);
 	if (ret) {
-		spcp8x5_close(tty, port, NULL);
+		spcp8x5_close(port);
 		return -EPROTO;
 	}
+	port->port.drain_delay = 256;
 	return 0;
 }
 
@@ -1033,6 +1008,8 @@ static struct usb_serial_driver spcp8x5_device = {
 	.num_ports		= 1,
 	.open 			= spcp8x5_open,
 	.close 			= spcp8x5_close,
+	.dtr_rts		= spcp8x5_dtr_rts,
+	.carrier_raised		= spcp8x5_carrier_raised,
 	.write 			= spcp8x5_write,
 	.set_termios 		= spcp8x5_set_termios,
 	.ioctl 			= spcp8x5_ioctl,
diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c
index 69879e4..8b07ebc 100644
--- a/drivers/usb/serial/symbolserial.c
+++ b/drivers/usb/serial/symbolserial.c
@@ -152,8 +152,7 @@ static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port,
 	return result;
 }
 
-static void symbol_close(struct tty_struct *tty, struct usb_serial_port *port,
-			  struct file *filp)
+static void symbol_close(struct usb_serial_port *port)
 {
 	struct symbol_private *priv = usb_get_serial_data(port->serial);
 
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 0a64bac..42cb04c 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -100,8 +100,7 @@ static int ti_startup(struct usb_serial *serial);
 static void ti_shutdown(struct usb_serial *serial);
 static int ti_open(struct tty_struct *tty, struct usb_serial_port *port,
 		struct file *file);
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-		struct file *file);
+static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *data, int count);
 static int ti_write_room(struct tty_struct *tty);
@@ -647,8 +646,7 @@ release_lock:
 }
 
 
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *file)
+static void ti_close(struct usb_serial_port *port)
 {
 	struct ti_device *tdev;
 	struct ti_port *tport;
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index f331e2b..1967a7e 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -238,9 +238,11 @@ static int serial_open (struct tty_struct *tty, struct file *filp)
 			goto bailout_interface_put;
 		mutex_unlock(&serial->disc_mutex);
 	}
-
 	mutex_unlock(&port->mutex);
-	return 0;
+	/* Now do the correct tty layer semantics */
+	retval = tty_port_block_til_ready(&port->port, tty, filp);
+	if (retval == 0)
+		return 0;
 
 bailout_interface_put:
 	usb_autopm_put_interface(serial->interface);
@@ -259,64 +261,89 @@ bailout_serial_put:
 	return retval;
 }
 
-static void serial_close(struct tty_struct *tty, struct file *filp)
+/**
+ *	serial_do_down		-	shut down hardware
+ *	@port: port to shut down
+ *
+ *	Shut down a USB port unless it is the console. We never shut down the
+ *	console hardware as it will always be in use.
+ *
+ *	Don't free any resources at this point
+ */
+static void serial_do_down(struct usb_serial_port *port)
 {
-	struct usb_serial_port *port = tty->driver_data;
+	struct usb_serial_driver *drv = port->serial->type;
 	struct usb_serial *serial;
 	struct module *owner;
-	int count;
 
-	if (!port)
+	/* The console is magical, do not hang up the console hardware
+	   or there will be tears */
+	if (port->console)
 		return;
 
-	dbg("%s - port %d", __func__, port->number);
-
 	mutex_lock(&port->mutex);
 	serial = port->serial;
 	owner = serial->type->driver.owner;
 
-	if (port->port.count == 0) {
-		mutex_unlock(&port->mutex);
-		return;
-	}
-
-	if (port->port.count == 1)
-		/* only call the device specific close if this
-		 * port is being closed by the last owner. Ensure we do
-		 * this before we drop the port count. The call is protected
-		 * by the port mutex
-		 */
-		serial->type->close(tty, port, filp);
-
-	if (port->port.count == (port->console ? 2 : 1)) {
-		struct tty_struct *tty = tty_port_tty_get(&port->port);
-		if (tty) {
-			/* We must do this before we drop the port count to
-			   zero. */
-			if (tty->driver_data)
-				tty->driver_data = NULL;
-			tty_port_tty_set(&port->port, NULL);
-			tty_kref_put(tty);
-		}
-	}
+	if (drv->close)
+		drv->close(port);
 
-	--port->port.count;
-	count = port->port.count;
 	mutex_unlock(&port->mutex);
-	put_device(&port->dev);
+}
+
+/**
+ *	serial_do_free		-	free resources post close/hangup
+ *	@port: port to free up
+ *
+ *	Do the resource freeing and refcount dropping for the port. We must
+ *	be careful about ordering and we must avoid freeing up the console.
+ */
 
+static void serial_do_free(struct usb_serial_port *port)
+{
+	struct usb_serial *serial;
+	struct module *owner;
+
+	/* The console is magical, do not hang up the console hardware
+	   or there will be tears */
+	if (port->console)
+		return;
+
+	serial = port->serial;
+	owner = serial->type->driver.owner;
+	put_device(&port->dev);
 	/* Mustn't dereference port any more */
-	if (count == 0) {
-		mutex_lock(&serial->disc_mutex);
-		if (!serial->disconnected)
-			usb_autopm_put_interface(serial->interface);
-		mutex_unlock(&serial->disc_mutex);
-	}
+	mutex_lock(&serial->disc_mutex);
+	if (!serial->disconnected)
+		usb_autopm_put_interface(serial->interface);
+	mutex_unlock(&serial->disc_mutex);
 	usb_serial_put(serial);
-
 	/* Mustn't dereference serial any more */
-	if (count == 0)
-		module_put(owner);
+	module_put(owner);
+}
+
+static void serial_close(struct tty_struct *tty, struct file *filp)
+{
+	struct usb_serial_port *port = tty->driver_data;
+
+	dbg("%s - port %d", __func__, port->number);
+
+
+	if (tty_port_close_start(&port->port, tty, filp) == 0)
+		return;
+
+	serial_do_down(port);		
+	tty_port_close_end(&port->port, tty);
+	tty_port_tty_set(&port->port, NULL);
+	serial_do_free(port);
+}
+
+static void serial_hangup(struct tty_struct *tty)
+{
+	struct usb_serial_port *port = tty->driver_data;
+	serial_do_down(port);
+	tty_port_hangup(&port->port);
+	serial_do_free(port);
 }
 
 static int serial_write(struct tty_struct *tty, const unsigned char *buf,
@@ -648,6 +675,29 @@ static struct usb_serial_driver *search_serial_device(
 	return NULL;
 }
 
+static int serial_carrier_raised(struct tty_port *port)
+{
+	struct usb_serial_port *p = container_of(port, struct usb_serial_port, port);
+	struct usb_serial_driver *drv = p->serial->type;
+	if (drv->carrier_raised)
+		return drv->carrier_raised(p);
+	/* No carrier control - don't block */
+	return 1;	
+}
+
+static void serial_dtr_rts(struct tty_port *port, int on)
+{
+	struct usb_serial_port *p = container_of(port, struct usb_serial_port, port);
+	struct usb_serial_driver *drv = p->serial->type;
+	if (drv->dtr_rts)
+		drv->dtr_rts(p, on);
+}
+
+static const struct tty_port_operations serial_port_ops = {
+	.carrier_raised = serial_carrier_raised,
+	.dtr_rts = serial_dtr_rts,
+};
+
 int usb_serial_probe(struct usb_interface *interface,
 			       const struct usb_device_id *id)
 {
@@ -841,6 +891,7 @@ int usb_serial_probe(struct usb_interface *interface,
 		if (!port)
 			goto probe_error;
 		tty_port_init(&port->port);
+		port->port.ops = &serial_port_ops;
 		port->serial = serial;
 		spin_lock_init(&port->lock);
 		mutex_init(&port->mutex);
@@ -1071,6 +1122,9 @@ void usb_serial_disconnect(struct usb_interface *interface)
 		if (port) {
 			struct tty_struct *tty = tty_port_tty_get(&port->port);
 			if (tty) {
+				/* The hangup will occur asynchronously but
+				   the object refcounts will sort out all the
+				   cleanup */
 				tty_hangup(tty);
 				tty_kref_put(tty);
 			}
@@ -1135,6 +1189,7 @@ static const struct tty_operations serial_ops = {
 	.open =			serial_open,
 	.close =		serial_close,
 	.write =		serial_write,
+	.hangup = 		serial_hangup,
 	.write_room =		serial_write_room,
 	.ioctl =		serial_ioctl,
 	.set_termios =		serial_set_termios,
@@ -1147,6 +1202,7 @@ static const struct tty_operations serial_ops = {
 	.proc_fops =		&serial_proc_fops,
 };
 
+
 struct tty_driver *usb_serial_tty_driver;
 
 static int __init usb_serial_init(void)
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index 5ac414b..b15f1c0 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -38,8 +38,7 @@
 /* function prototypes for a handspring visor */
 static int  visor_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filp);
-static void visor_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static void visor_close(struct usb_serial_port *port);
 static int  visor_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static int  visor_write_room(struct tty_struct *tty);
@@ -324,8 +323,7 @@ exit:
 }
 
 
-static void visor_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void visor_close(struct usb_serial_port *port)
 {
 	struct visor_private *priv = usb_get_serial_port_data(port);
 	unsigned char *transfer_buffer;
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 5335d32..7c7295d 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -147,8 +147,7 @@ static int  whiteheat_attach(struct usb_serial *serial);
 static void whiteheat_shutdown(struct usb_serial *serial);
 static int  whiteheat_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void whiteheat_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void whiteheat_close(struct usb_serial_port *port);
 static int  whiteheat_write(struct tty_struct *tty,
 			struct usb_serial_port *port,
 			const unsigned char *buf, int count);
@@ -712,8 +711,7 @@ exit:
 }
 
 
-static void whiteheat_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void whiteheat_close(struct usb_serial_port *port)
 {
 	struct whiteheat_private *info = usb_get_serial_port_data(port);
 	struct whiteheat_urb_wrap *wrap;
@@ -723,31 +721,7 @@ static void whiteheat_close(struct tty_struct *tty,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	mutex_lock(&port->serial->disc_mutex);
-	/* filp is NULL when called from usb_serial_disconnect */
-	if ((filp && (tty_hung_up_p(filp))) || port->serial->disconnected) {
-		mutex_unlock(&port->serial->disc_mutex);
-		return;
-	}
-	mutex_unlock(&port->serial->disc_mutex);
-
-	tty->closing = 1;
-
-/*
- * Not currently in use; tty_wait_until_sent() calls
- * serial_chars_in_buffer() which deadlocks on the second semaphore
- * acquisition. This should be fixed at some point. Greg's been
- * notified.
-	if ((filp->f_flags & (O_NDELAY | O_NONBLOCK)) == 0) {
-		tty_wait_until_sent(tty, CLOSING_DELAY);
-	}
-*/
-
-	tty_driver_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
 	firm_report_tx_done(port);
-
 	firm_close(port);
 
 	/* shutdown our bulk reads and writes */
@@ -775,10 +749,7 @@ static void whiteheat_close(struct tty_struct *tty,
 	}
 	spin_unlock_irq(&info->lock);
 	mutex_unlock(&info->deathwarrant);
-
 	stop_command_port(port->serial);
-
-	tty->closing = 0;
 }
 
 
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 625e9e4..8cdfed7 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -224,8 +224,7 @@ struct usb_serial_driver {
 	/* Called by console with tty = NULL and by tty */
 	int  (*open)(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-	void (*close)(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+	void (*close)(struct usb_serial_port *port);
 	int  (*write)(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 	/* Called only by the tty layer */
@@ -241,6 +240,10 @@ struct usb_serial_driver {
 	int  (*tiocmget)(struct tty_struct *tty, struct file *file);
 	int  (*tiocmset)(struct tty_struct *tty, struct file *file,
 			 unsigned int set, unsigned int clear);
+	/* Called by the tty layer for port level work. There may or may not
+	   be an attached tty at this point */
+	void (*dtr_rts)(struct usb_serial_port *port, int on);
+	int  (*carrier_raised)(struct usb_serial_port *port);
 	/* USB events */
 	void (*read_int_callback)(struct urb *urb);
 	void (*write_int_callback)(struct urb *urb);
@@ -283,8 +286,7 @@ extern int usb_serial_generic_open(struct tty_struct *tty,
 		struct usb_serial_port *port, struct file *filp);
 extern int usb_serial_generic_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
-extern void usb_serial_generic_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+extern void usb_serial_generic_close(struct usb_serial_port *port);
 extern int usb_serial_generic_resume(struct usb_serial *serial);
 extern int usb_serial_generic_write_room(struct tty_struct *tty);
 extern int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
-- 
cgit v0.10.2


From 739e0285cbb162c8ddd0061fda581ee54a34c19a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:27:50 +0100
Subject: tty: Update cdc_acm

The CDC ACM driver uses the tty layer correctly so needs conversion. Start by
adding and initializing the port structures.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 7a1164d..41d4ca5 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -89,6 +89,9 @@ static DEFINE_MUTEX(open_mutex);
 
 #define ACM_READY(acm)	(acm && acm->dev && acm->used)
 
+static const struct tty_port_operations acm_port_ops = {
+};
+
 #ifdef VERBOSE_DEBUG
 #define verbose	1
 #else
@@ -1082,6 +1085,8 @@ skip_normal_probe:
 	spin_lock_init(&acm->read_lock);
 	mutex_init(&acm->mutex);
 	acm->rx_endpoint = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress);
+	tty_port_init(&acm->port);
+	acm->port.ops = &acm_port_ops;
 
 	buf = usb_buffer_alloc(usb_dev, ctrlsize, GFP_KERNEL, &acm->ctrl_dma);
 	if (!buf) {
diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h
index 1f95e7a..19967cd 100644
--- a/drivers/usb/class/cdc-acm.h
+++ b/drivers/usb/class/cdc-acm.h
@@ -90,7 +90,8 @@ struct acm {
 	struct usb_interface *control;			/* control interface */
 	struct usb_interface *data;			/* data interface */
 	struct tty_struct *tty;				/* the corresponding tty */
-	struct urb *ctrlurb;			/* urbs */
+	struct tty_port port;			 	/* our tty port data */
+	struct urb *ctrlurb;				/* urbs */
 	u8 *ctrl_buffer;				/* buffers of urbs */
 	dma_addr_t ctrl_dma;				/* dma handles of buffers */
 	u8 *country_codes;				/* country codes from device */
-- 
cgit v0.10.2


From 5ba5a5d21204f61ff655b322a90ed91c75194e4b Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Thu, 11 Jun 2009 12:28:37 +0100
Subject: tty: synclink_gt add receive pio mode

Add receive programmed IO mode to reduce receive latency
when using low data rates. The receive FIFO trigger
level of 128 bytes used in DMA mode creates excessive latency
when operating at low data rates. PIO mode is selected when user
application requests data in blocks of less than 128 bytes.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 67986ea..1386625 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -214,6 +214,7 @@ struct slgt_desc
 #define set_desc_next(a,b) (a).next   = cpu_to_le32((unsigned int)(b))
 #define set_desc_count(a,b)(a).count  = cpu_to_le16((unsigned short)(b))
 #define set_desc_eof(a,b)  (a).status = cpu_to_le16((b) ? (le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0))
+#define set_desc_status(a, b) (a).status = cpu_to_le16((unsigned short)(b))
 #define desc_count(a)      (le16_to_cpu((a).count))
 #define desc_status(a)     (le16_to_cpu((a).status))
 #define desc_complete(a)   (le16_to_cpu((a).status) & BIT15)
@@ -297,6 +298,7 @@ struct slgt_info {
 	u32 max_frame_size;       /* as set by device config */
 
 	unsigned int rbuf_fill_level;
+	unsigned int rx_pio;
 	unsigned int if_mode;
 	unsigned int base_clock;
 
@@ -331,6 +333,8 @@ struct slgt_info {
 	struct slgt_desc *rbufs;
 	unsigned int rbuf_current;
 	unsigned int rbuf_index;
+	unsigned int rbuf_fill_index;
+	unsigned short rbuf_fill_count;
 
 	unsigned int tbuf_count;
 	struct slgt_desc *tbufs;
@@ -2110,6 +2114,40 @@ static void ri_change(struct slgt_info *info, unsigned short status)
 	info->pending_bh |= BH_STATUS;
 }
 
+static void isr_rxdata(struct slgt_info *info)
+{
+	unsigned int count = info->rbuf_fill_count;
+	unsigned int i = info->rbuf_fill_index;
+	unsigned short reg;
+
+	while (rd_reg16(info, SSR) & IRQ_RXDATA) {
+		reg = rd_reg16(info, RDR);
+		DBGISR(("isr_rxdata %s RDR=%04X\n", info->device_name, reg));
+		if (desc_complete(info->rbufs[i])) {
+			/* all buffers full */
+			rx_stop(info);
+			info->rx_restart = 1;
+			continue;
+		}
+		info->rbufs[i].buf[count++] = (unsigned char)reg;
+		/* async mode saves status byte to buffer for each data byte */
+		if (info->params.mode == MGSL_MODE_ASYNC)
+			info->rbufs[i].buf[count++] = (unsigned char)(reg >> 8);
+		if (count == info->rbuf_fill_level || (reg & BIT10)) {
+			/* buffer full or end of frame */
+			set_desc_count(info->rbufs[i], count);
+			set_desc_status(info->rbufs[i], BIT15 | (reg >> 8));
+			info->rbuf_fill_count = count = 0;
+			if (++i == info->rbuf_count)
+				i = 0;
+			info->pending_bh |= BH_RECEIVE;
+		}
+	}
+
+	info->rbuf_fill_index = i;
+	info->rbuf_fill_count = count;
+}
+
 static void isr_serial(struct slgt_info *info)
 {
 	unsigned short status = rd_reg16(info, SSR);
@@ -2125,6 +2163,8 @@ static void isr_serial(struct slgt_info *info)
 			if (info->tx_count)
 				isr_txeom(info, status);
 		}
+		if (info->rx_pio && (status & IRQ_RXDATA))
+			isr_rxdata(info);
 		if ((status & IRQ_RXBREAK) && (status & RXBREAK)) {
 			info->icount.brk++;
 			/* process break detection if tty control allows */
@@ -2141,7 +2181,8 @@ static void isr_serial(struct slgt_info *info)
 	} else {
 		if (status & (IRQ_TXIDLE + IRQ_TXUNDER))
 			isr_txeom(info, status);
-
+		if (info->rx_pio && (status & IRQ_RXDATA))
+			isr_rxdata(info);
 		if (status & IRQ_RXIDLE) {
 			if (status & RXIDLE)
 				info->icount.rxidle++;
@@ -2642,6 +2683,10 @@ static int rx_enable(struct slgt_info *info, int enable)
 			return -EINVAL;
 		}
 		info->rbuf_fill_level = rbuf_fill_level;
+		if (rbuf_fill_level < 128)
+			info->rx_pio = 1; /* PIO mode */
+		else
+			info->rx_pio = 0; /* DMA mode */
 		rx_stop(info); /* restart receiver to use new fill level */
 	}
 
@@ -3844,15 +3889,27 @@ static void rx_start(struct slgt_info *info)
 	rdma_reset(info);
 	reset_rbufs(info);
 
-	/* set 1st descriptor address */
-	wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
-
-	if (info->params.mode != MGSL_MODE_ASYNC) {
-		/* enable rx DMA and DMA interrupt */
-		wr_reg32(info, RDCSR, (BIT2 + BIT0));
+	if (info->rx_pio) {
+		/* rx request when rx FIFO not empty */
+		wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) & ~BIT14));
+		slgt_irq_on(info, IRQ_RXDATA);
+		if (info->params.mode == MGSL_MODE_ASYNC) {
+			/* enable saving of rx status */
+			wr_reg32(info, RDCSR, BIT6);
+		}
 	} else {
-		/* enable saving of rx status, rx DMA and DMA interrupt */
-		wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+		/* rx request when rx FIFO half full */
+		wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT14));
+		/* set 1st descriptor address */
+		wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
+
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			/* enable rx DMA and DMA interrupt */
+			wr_reg32(info, RDCSR, (BIT2 + BIT0));
+		} else {
+			/* enable saving of rx status, rx DMA and DMA interrupt */
+			wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+		}
 	}
 
 	slgt_irq_on(info, IRQ_RXOVER);
@@ -4470,6 +4527,8 @@ static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last
 static void reset_rbufs(struct slgt_info *info)
 {
 	free_rbufs(info, 0, info->rbuf_count - 1);
+	info->rbuf_fill_index = 0;
+	info->rbuf_fill_count = 0;
 }
 
 /*
-- 
cgit v0.10.2


From 97e87f8ebe978e881c7325ba490574bd5500b133 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:29:27 +0100
Subject: tty: cyclades, plx9060 casts cleanup

Remove ugly all-over-the-code casts of ctl_addr to 9060 space.
Add an union to the cyclades_card structure, which contains
a pointer to both 9050 and 9060 spaces.

The 9050 space layout is unknown, so let it still as a void
__iomem pointer.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index ccf68a9..2cbf741 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -666,12 +666,10 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1)
 
 #define Z_FPGA_CHECK(card) \
-	((readl(&((struct RUNTIME_9060 __iomem *) \
-		((card).ctl_addr))->init_ctrl) & (1<<17)) != 0)
+	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
 
-#define ISZLOADED(card)	(((ZO_V1 == readl(&((struct RUNTIME_9060 __iomem *) \
-			((card).ctl_addr))->mail_box_0)) || \
-			Z_FPGA_CHECK(card)) && \
+#define ISZLOADED(card)	(((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \
+			|| Z_FPGA_CHECK(card)) && \
 			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
 			((card).base_addr+ID_ADDRESS))->signature)))
 
@@ -1400,14 +1398,12 @@ cyz_fetch_msg(struct cyclades_card *cinfo,
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
-	loc_doorbell = readl(&((struct RUNTIME_9060 __iomem *)
-				  (cinfo->ctl_addr))->loc_doorbell);
+	loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
 	if (loc_doorbell) {
 		*cmd = (char)(0xff & loc_doorbell);
 		*channel = readl(&board_ctrl->fwcmd_channel);
 		*param = (__u32) readl(&board_ctrl->fwcmd_param);
-		cy_writel(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-			  loc_doorbell, 0xffffffff);
+		cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff);
 		return 1;
 	}
 	return 0;
@@ -1431,8 +1427,7 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
 	index = 0;
-	pci_doorbell =
-	    &((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->pci_doorbell;
+	pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
 	while ((readl(pci_doorbell) & 0xff) != 0) {
 		if (index++ == 1000)
 			return (int)(readl(pci_doorbell) & 0xff);
@@ -1635,8 +1630,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 	fw_ver = readl(&board_ctrl->fw_version);
-	hw_ver = readl(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-			mail_box_0);
+	hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 
 	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
 		special_count = 0;
@@ -2394,8 +2388,8 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
 		if (!ISZLOADED(*cinfo)) {
-			if (((ZE_V1 == readl(&((struct RUNTIME_9060 __iomem *)
-					 (cinfo->ctl_addr))->mail_box_0)) &&
+			if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060->
+							mail_box_0)) &&
 					Z_FPGA_CHECK(*cinfo)) &&
 					(ZFIRM_HLT == readl(
 						&firm_id->signature))) {
@@ -2417,6 +2411,7 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 			if (!cinfo->intr_enabled) {
 				struct ZFW_CTRL __iomem *zfw_ctrl;
 				struct BOARD_CTRL __iomem *board_ctrl;
+				u16 intr;
 
 				zfw_ctrl = cinfo->base_addr +
 					(readl(&firm_id->zfwctrl_addr) &
@@ -2425,8 +2420,10 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 				board_ctrl = &zfw_ctrl->board_ctrl;
 
 				/* Enable interrupts on the PLX chip */
-				cy_writew(cinfo->ctl_addr + 0x68,
-					readw(cinfo->ctl_addr + 0x68) | 0x0900);
+				intr = readw(&cinfo->ctl_addr.p9060->
+						intr_ctrl_stat) | 0x0900;
+				cy_writew(&cinfo->ctl_addr.p9060->
+						intr_ctrl_stat, intr);
 				/* Enable interrupts on the FW */
 				retval = cyz_issue_cmd(cinfo, 0,
 						C_CM_IRQ_ENBL, 0L);
@@ -4347,8 +4344,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	spin_lock_init(&cinfo->card_lock);
 
 	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
-		mailbox = readl(&((struct RUNTIME_9060 __iomem *)
-				     cinfo->ctl_addr)->mail_box_0);
+		mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 		nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
@@ -4613,7 +4609,7 @@ static int __init cy_detect_isa(void)
 
 		/* set cy_card */
 		cy_card[j].base_addr = cy_isa_address;
-		cy_card[j].ctl_addr = NULL;
+		cy_card[j].ctl_addr.p9050 = NULL;
 		cy_card[j].irq = (int)cy_isa_irq;
 		cy_card[j].bus_index = 0;
 		cy_card[j].first_line = cy_next_channel;
@@ -5013,7 +5009,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 
 		/* Disable interrupts on the PLX before resetting it */
-		cy_writew(addr0 + 0x68, readw(addr0 + 0x68) & ~0x0900);
+		cy_writew(&ctl_addr->intr_ctrl_stat,
+				readw(&ctl_addr->intr_ctrl_stat) & ~0x0900);
 
 		plx_init(pdev, irq, addr0);
 
@@ -5109,7 +5106,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 	/* set cy_card */
 	cy_card[card_no].base_addr = addr2;
-	cy_card[card_no].ctl_addr = addr0;
+	cy_card[card_no].ctl_addr.p9050 = addr0;
 	cy_card[card_no].irq = irq;
 	cy_card[card_no].bus_index = 1;
 	cy_card[card_no].first_line = cy_next_channel;
@@ -5125,17 +5122,20 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		plx_ver = readb(addr2 + CyPLX_VER) & 0x0f;
 		switch (plx_ver) {
 		case PLX_9050:
-
 			cy_writeb(addr0 + 0x4c, 0x43);
 			break;
 
 		case PLX_9060:
 		case PLX_9080:
 		default:	/* Old boards, use PLX_9060 */
-			plx_init(pdev, irq, addr0);
-			cy_writew(addr0 + 0x68, readw(addr0 + 0x68) | 0x0900);
+		{
+			struct RUNTIME_9060 __iomem *ctl_addr = addr0;
+			plx_init(pdev, irq, ctl_addr);
+			cy_writew(&ctl_addr->intr_ctrl_stat,
+				readw(&ctl_addr->intr_ctrl_stat) | 0x0900);
 			break;
 		}
+		}
 	}
 
 	dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from "
@@ -5168,17 +5168,18 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 	/* non-Z with old PLX */
 	if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
 			PLX_9050)
-		cy_writeb(cinfo->ctl_addr + 0x4c, 0);
+		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
 	else
 #ifndef CONFIG_CYZ_INTR
 		if (!IS_CYC_Z(*cinfo))
 #endif
-		cy_writew(cinfo->ctl_addr + 0x68,
-				readw(cinfo->ctl_addr + 0x68) & ~0x0900);
+		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
+			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
+			~0x0900);
 
 	iounmap(cinfo->base_addr);
-	if (cinfo->ctl_addr)
-		iounmap(cinfo->ctl_addr);
+	if (cinfo->ctl_addr.p9050)
+		iounmap(cinfo->ctl_addr.p9050);
 	if (cinfo->irq
 #ifndef CONFIG_CYZ_INTR
 		&& !IS_CYC_Z(*cinfo)
@@ -5373,8 +5374,8 @@ static void __exit cy_cleanup_module(void)
 			/* clear interrupt */
 			cy_writeb(card->base_addr + Cy_ClrIntr, 0);
 			iounmap(card->base_addr);
-			if (card->ctl_addr)
-				iounmap(card->ctl_addr);
+			if (card->ctl_addr.p9050)
+				iounmap(card->ctl_addr.p9050);
 			if (card->irq
 #ifndef CONFIG_CYZ_INTR
 				&& !IS_CYC_Z(*card)
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 788850b..9ae03d5 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -507,16 +507,19 @@ struct ZFW_CTRL {
 
 /* Per card data structure */
 struct cyclades_card {
-    void __iomem *base_addr;
-    void __iomem *ctl_addr;
-    int irq;
-    unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
-    unsigned int first_line;	/* minor number of first channel on card */
-    unsigned int nports;	/* Number of ports in the card */
-    int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
-    int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
-    spinlock_t card_lock;
-    struct cyclades_port *ports;
+	void __iomem *base_addr;
+	union {
+		void __iomem *p9050;
+		struct RUNTIME_9060 __iomem *p9060;
+	} ctl_addr;
+	int irq;
+	unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
+	unsigned int first_line;	/* minor number of first channel on card */
+	unsigned int nports;	/* Number of ports in the card */
+	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
+	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
+	spinlock_t card_lock;
+	struct cyclades_port *ports;
 };
 
 /***************************************
-- 
cgit v0.10.2


From 101b81590d8df0a74c33cf739886247c0a13f4af Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:30:10 +0100
Subject: tty: cyclades, cache HW version

Store HW version locally to not read it all the time in interrupts
and alike.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 2cbf741..cf191cc 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -668,8 +668,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define Z_FPGA_CHECK(card) \
 	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
 
-#define ISZLOADED(card)	(((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \
-			|| Z_FPGA_CHECK(card)) && \
+#define ISZLOADED(card)	((ZO_V1 == (card).hw_ver || Z_FPGA_CHECK(card)) && \
 			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
 			((card).base_addr+ID_ADDRESS))->signature)))
 
@@ -1393,8 +1392,6 @@ cyz_fetch_msg(struct cyclades_card *cinfo,
 	unsigned long loc_doorbell;
 
 	firm_id = cinfo->base_addr + ID_ADDRESS;
-	if (!ISZLOADED(*cinfo))
-		return -1;
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
@@ -1619,10 +1616,8 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	static struct BOARD_CTRL __iomem *board_ctrl;
 	static struct CH_CTRL __iomem *ch_ctrl;
 	static struct BUF_CTRL __iomem *buf_ctrl;
-	__u32 channel;
+	__u32 channel, param, fw_ver;
 	__u8 cmd;
-	__u32 param;
-	__u32 hw_ver, fw_ver;
 	int special_count;
 	int delta_count;
 
@@ -1630,7 +1625,6 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 	fw_ver = readl(&board_ctrl->fw_version);
-	hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 
 	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
 		special_count = 0;
@@ -2388,11 +2382,9 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
 		if (!ISZLOADED(*cinfo)) {
-			if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060->
-							mail_box_0)) &&
-					Z_FPGA_CHECK(*cinfo)) &&
-					(ZFIRM_HLT == readl(
-						&firm_id->signature))) {
+			if (cinfo->hw_ver == ZE_V1 && Z_FPGA_CHECK(*cinfo) &&
+					readl(&firm_id->signature) ==
+					ZFIRM_HLT) {
 				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
 					"need an external power supply for "
 					"this number of ports.\nFirmware "
@@ -4336,7 +4328,6 @@ static void cy_hangup(struct tty_struct *tty)
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
 	struct cyclades_port *info;
-	u32 uninitialized_var(mailbox);
 	unsigned int nports, port;
 	unsigned short chip_number;
 	int uninitialized_var(index);
@@ -4344,8 +4335,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	spin_lock_init(&cinfo->card_lock);
 
 	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
-		mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0);
-		nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8;
+		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
 					   Z FW is loaded */
@@ -4377,7 +4367,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 
 		if (IS_CYC_Z(*cinfo)) {
 			info->type = PORT_STARTECH;
-			if (mailbox == ZO_V1)
+			if (cinfo->hw_ver == ZO_V1)
 				info->xmit_fifo_size = CYZ_FIFO_SIZE;
 			else
 				info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE;
@@ -4932,7 +4922,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 {
 	void __iomem *addr0 = NULL, *addr2 = NULL;
 	char *card_name = NULL;
-	u32 mailbox;
+	u32 uninitialized_var(mailbox);
 	unsigned int device_id, nchan = 0, card_no, i;
 	unsigned char plx_ver;
 	int retval, irq;
@@ -5014,7 +5004,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 		plx_init(pdev, irq, addr0);
 
-		mailbox = (u32)readl(&ctl_addr->mail_box_0);
+		mailbox = readl(&ctl_addr->mail_box_0);
 
 		addr2 = ioremap_nocache(pci_resource_start(pdev, 2),
 				mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin);
@@ -5026,7 +5016,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		if (mailbox == ZE_V1) {
 			card_name = "Cyclades-Ze";
 
-			readl(&ctl_addr->mail_box_0);
 			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
@@ -5089,6 +5078,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 		cy_card[card_no].num_chips = nchan / 4;
 	} else {
+		cy_card[card_no].hw_ver = mailbox;
+		cy_card[card_no].num_chips = (unsigned int)-1;
 #ifdef CONFIG_CYZ_INTR
 		/* allocate IRQ only if board has an IRQ */
 		if (irq != 0 && irq != 255) {
@@ -5101,7 +5092,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			}
 		}
 #endif				/* CONFIG_CYZ_INTR */
-		cy_card[card_no].num_chips = (unsigned int)-1;
 	}
 
 	/* set cy_card */
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 9ae03d5..a14fe30 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -518,6 +518,7 @@ struct cyclades_card {
 	unsigned int nports;	/* Number of ports in the card */
 	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
 	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
+	u32 hw_ver;
 	spinlock_t card_lock;
 	struct cyclades_port *ports;
 };
-- 
cgit v0.10.2


From 2693f485c22d18474c077f12fd0f797ee8679b33 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:31:06 +0100
Subject: tty: cyclades, convert macros to inlines

Remove ugly macros and add inlines instead of them. This improves
readability and type checking a much.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index cf191cc..c9a503f 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -663,15 +663,6 @@
 static void cy_throttle(struct tty_struct *tty);
 static void cy_send_xchar(struct tty_struct *tty, char ch);
 
-#define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1)
-
-#define Z_FPGA_CHECK(card) \
-	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
-
-#define ISZLOADED(card)	((ZO_V1 == (card).hw_ver || Z_FPGA_CHECK(card)) && \
-			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
-			((card).base_addr+ID_ADDRESS))->signature)))
-
 #ifndef SERIAL_XMIT_SIZE
 #define	SERIAL_XMIT_SIZE	(min(PAGE_SIZE, 4096))
 #endif
@@ -684,8 +675,6 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define DRIVER_VERSION	0x02010203
 #define RAM_SIZE 0x80000
 
-#define Z_FPGA_LOADED(X)	((readl(&(X)->init_ctrl) & (1<<17)) != 0)
-
 enum zblock_type {
 	ZBLOCK_PRG = 0,
 	ZBLOCK_FPGA = 1
@@ -880,6 +869,29 @@ static void cyz_rx_restart(unsigned long);
 static struct timer_list cyz_rx_full_timer[NR_PORTS];
 #endif				/* CONFIG_CYZ_INTR */
 
+static inline bool cy_is_Z(struct cyclades_card *card)
+{
+	return card->num_chips == (unsigned int)-1;
+}
+
+static inline bool __cyz_fpga_loaded(struct RUNTIME_9060 __iomem *ctl_addr)
+{
+	return readl(&ctl_addr->init_ctrl) & (1 << 17);
+}
+
+static inline bool cyz_fpga_loaded(struct cyclades_card *card)
+{
+	return __cyz_fpga_loaded(card->ctl_addr.p9060);
+}
+
+static inline bool cyz_is_loaded(struct cyclades_card *card)
+{
+	struct FIRM_ID __iomem *fw_id = card->base_addr + ID_ADDRESS;
+
+	return (card->hw_ver == ZO_V1 || cyz_fpga_loaded(card)) &&
+			readl(&fw_id->signature) == ZFIRM_ID;
+}
+
 static inline int serial_paranoia_check(struct cyclades_port *info,
 		char *name, const char *routine)
 {
@@ -1417,7 +1429,7 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
 	unsigned int index;
 
 	firm_id = cinfo->base_addr + ID_ADDRESS;
-	if (!ISZLOADED(*cinfo))
+	if (!cyz_is_loaded(cinfo))
 		return -1;
 
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
@@ -1725,7 +1737,7 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id)
 {
 	struct cyclades_card *cinfo = dev_id;
 
-	if (unlikely(!ISZLOADED(*cinfo))) {
+	if (unlikely(!cyz_is_loaded(cinfo))) {
 #ifdef CY_DEBUG_INTERRUPTS
 		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
 				"(IRQ%d).\n", irq);
@@ -1773,9 +1785,9 @@ static void cyz_poll(unsigned long arg)
 	for (card = 0; card < NR_CARDS; card++) {
 		cinfo = &cy_card[card];
 
-		if (!IS_CYC_Z(*cinfo))
+		if (!cy_is_Z(cinfo))
 			continue;
-		if (!ISZLOADED(*cinfo))
+		if (!cyz_is_loaded(cinfo))
 			continue;
 
 		firm_id = cinfo->base_addr + ID_ADDRESS;
@@ -1854,7 +1866,7 @@ static int startup(struct cyclades_port *info)
 
 	set_line_char(info);
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -1911,7 +1923,7 @@ static int startup(struct cyclades_port *info)
 		base_addr = card->base_addr;
 
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return -ENODEV;
 
 		zfw_ctrl = card->base_addr +
@@ -2006,7 +2018,7 @@ static void start_xmit(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2050,7 +2062,7 @@ static void shutdown(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2106,7 +2118,7 @@ static void shutdown(struct cyclades_port *info)
 #endif
 
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return;
 
 		zfw_ctrl = card->base_addr +
@@ -2213,7 +2225,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 #endif
 	info->port.blocked_open++;
 
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = cinfo->bus_index;
@@ -2276,7 +2288,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 
 		base_addr = cinfo->base_addr;
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*cinfo)) {
+		if (!cyz_is_loaded(cinfo)) {
 			__set_current_state(TASK_RUNNING);
 			remove_wait_queue(&info->port.open_wait, &wait);
 			return -EINVAL;
@@ -2377,12 +2389,12 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 	   treat it as absent from the system.  This
 	   will make the user pay attention.
 	 */
-	if (IS_CYC_Z(*info->card)) {
+	if (cy_is_Z(info->card)) {
 		struct cyclades_card *cinfo = info->card;
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
-		if (!ISZLOADED(*cinfo)) {
-			if (cinfo->hw_ver == ZE_V1 && Z_FPGA_CHECK(*cinfo) &&
+		if (!cyz_is_loaded(cinfo)) {
+			if (cinfo->hw_ver == ZE_V1 && cyz_fpga_loaded(cinfo) &&
 					readl(&firm_id->signature) ==
 					ZFIRM_HLT) {
 				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
@@ -2537,7 +2549,7 @@ static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
 #endif
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2582,7 +2594,7 @@ static void cy_flush_buffer(struct tty_struct *tty)
 	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
 	spin_unlock_irqrestore(&card->card_lock, flags);
 
-	if (IS_CYC_Z(*card)) {	/* If it is a Z card, flush the on-board
+	if (cy_is_Z(card)) {	/* If it is a Z card, flush the on-board
 					   buffers as well */
 		spin_lock_irqsave(&card->card_lock, flags);
 		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_TX, 0L);
@@ -2663,7 +2675,7 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 
 	spin_lock_irqsave(&card->card_lock, flags);
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		int channel = info->line - card->first_line;
 		int index = card->bus_index;
 		void __iomem *base_addr = card->base_addr +
@@ -2883,7 +2895,7 @@ static int cy_chars_in_buffer(struct tty_struct *tty)
 	channel = (info->line) - (card->first_line);
 
 #ifdef Z_EXT_CHARS_IN_BUFFER
-	if (!IS_CYC_Z(cy_card[card])) {
+	if (!cy_is_Z(card)) {
 #endif				/* Z_EXT_CHARS_IN_BUFFER */
 #ifdef CY_DEBUG_IO
 		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
@@ -2996,7 +3008,7 @@ static void set_line_char(struct cyclades_port *info)
 	channel = info->line - card->first_line;
 	chip_number = channel / 4;
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 
 		index = card->bus_index;
 
@@ -3221,7 +3233,7 @@ static void set_line_char(struct cyclades_port *info)
 		int retval;
 
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return;
 
 		zfw_ctrl = card->base_addr +
@@ -3438,7 +3450,7 @@ static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
 
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3478,7 +3490,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3504,7 +3516,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 	} else {
 		base_addr = card->base_addr;
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (ISZLOADED(*card)) {
+		if (cyz_is_loaded(card)) {
 			zfw_ctrl = card->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
 			board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3547,7 +3559,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3622,7 +3634,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 		base_addr = card->base_addr;
 
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (ISZLOADED(*card)) {
+		if (cyz_is_loaded(card)) {
 			zfw_ctrl = card->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
 			board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3694,7 +3706,7 @@ static int cy_break(struct tty_struct *tty, int break_state)
 	card = info->card;
 
 	spin_lock_irqsave(&card->card_lock, flags);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		/* Let the transmit ISR take care of this (since it
 		   requires stuffing characters into the output stream).
 		 */
@@ -3763,7 +3775,7 @@ static int set_threshold(struct cyclades_port *info, unsigned long value)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3791,7 +3803,7 @@ static int get_threshold(struct cyclades_port *info,
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3825,7 +3837,7 @@ static int set_timeout(struct cyclades_port *info, unsigned long value)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3848,7 +3860,7 @@ static int get_timeout(struct cyclades_port *info,
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -4102,7 +4114,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch)
 	card = info->card;
 	channel = info->line - card->first_line;
 
-	if (IS_CYC_Z(*card)) {
+	if (cy_is_Z(card)) {
 		if (ch == STOP_CHAR(tty))
 			cyz_issue_cmd(card, channel, C_CM_SENDXOFF, 0L);
 		else if (ch == START_CHAR(tty))
@@ -4135,7 +4147,7 @@ static void cy_throttle(struct tty_struct *tty)
 	card = info->card;
 
 	if (I_IXOFF(tty)) {
-		if (!IS_CYC_Z(*card))
+		if (!cy_is_Z(card))
 			cy_send_xchar(tty, STOP_CHAR(tty));
 		else
 			info->throttle = 1;
@@ -4143,7 +4155,7 @@ static void cy_throttle(struct tty_struct *tty)
 
 	if (tty->termios->c_cflag & CRTSCTS) {
 		channel = info->line - card->first_line;
-		if (!IS_CYC_Z(*card)) {
+		if (!cy_is_Z(card)) {
 			chip = channel >> 2;
 			channel &= 0x03;
 			index = card->bus_index;
@@ -4200,7 +4212,7 @@ static void cy_unthrottle(struct tty_struct *tty)
 	if (tty->termios->c_cflag & CRTSCTS) {
 		card = info->card;
 		channel = info->line - card->first_line;
-		if (!IS_CYC_Z(*card)) {
+		if (!cy_is_Z(card)) {
 			chip = channel >> 2;
 			channel &= 0x03;
 			index = card->bus_index;
@@ -4244,7 +4256,7 @@ static void cy_stop(struct tty_struct *tty)
 
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		index = cinfo->bus_index;
 		chip = channel >> 2;
 		channel &= 0x03;
@@ -4277,7 +4289,7 @@ static void cy_start(struct tty_struct *tty)
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
 	index = cinfo->bus_index;
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		base_addr = cinfo->base_addr + (cy_chip_offset[chip] << index);
@@ -4334,7 +4346,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 
 	spin_lock_init(&cinfo->card_lock);
 
-	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
+	if (cy_is_Z(cinfo)) {	/* Cyclades-Z */
 		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
@@ -4365,7 +4377,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 		init_completion(&info->shutdown_wait);
 		init_waitqueue_head(&info->delta_msr_wait);
 
-		if (IS_CYC_Z(*cinfo)) {
+		if (cy_is_Z(cinfo)) {
 			info->type = PORT_STARTECH;
 			if (cinfo->hw_ver == ZO_V1)
 				info->xmit_fifo_size = CYZ_FIFO_SIZE;
@@ -4408,7 +4420,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	}
 
 #ifndef CONFIG_CYZ_INTR
-	if (IS_CYC_Z(*cinfo) && !timer_pending(&cyz_timerlist)) {
+	if (cy_is_Z(cinfo) && !timer_pending(&cyz_timerlist)) {
 		mod_timer(&cyz_timerlist, jiffies + 1);
 #ifdef CY_PCI_DEBUG
 		printk(KERN_DEBUG "Cyclades-Z polling initialized\n");
@@ -4771,7 +4783,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
 	/* Check whether the firmware is already loaded and running. If
 	   positive, skip this board */
-	if (Z_FPGA_LOADED(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
+	if (__cyz_fpga_loaded(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
 		u32 cntval = readl(base_addr + 0x190);
 
 		udelay(100);
@@ -4790,7 +4802,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
 	mailbox = readl(&ctl_addr->mail_box_0);
 
-	if (mailbox == 0 || Z_FPGA_LOADED(ctl_addr)) {
+	if (mailbox == 0 || __cyz_fpga_loaded(ctl_addr)) {
 		/* stops CPU and set window to beginning of RAM */
 		cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
 		cy_writel(&cust->cpu_stop, 0);
@@ -4806,7 +4818,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 				base_addr);
 		if (retval)
 			goto err_rel;
-		if (!Z_FPGA_LOADED(ctl_addr)) {
+		if (!__cyz_fpga_loaded(ctl_addr)) {
 			dev_err(&pdev->dev, "fw upload successful, but fw is "
 					"not loaded\n");
 			goto err_rel;
@@ -4865,7 +4877,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 				"system before loading the new FW to the "
 				"Cyclades-Z.\n");
 
-			if (Z_FPGA_LOADED(ctl_addr))
+			if (__cyz_fpga_loaded(ctl_addr))
 				plx_init(pdev, irq, ctl_addr);
 
 			retval = -EIO;
@@ -4889,7 +4901,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 			"check the connection between the Z host card and the "
 			"serial expanders.\n");
 
-		if (Z_FPGA_LOADED(ctl_addr))
+		if (__cyz_fpga_loaded(ctl_addr))
 			plx_init(pdev, irq, ctl_addr);
 
 		dev_info(&pdev->dev, "Null number of ports detected. Board "
@@ -5156,12 +5168,12 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 	unsigned int i;
 
 	/* non-Z with old PLX */
-	if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
+	if (!cy_is_Z(cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
 			PLX_9050)
 		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
 	else
 #ifndef CONFIG_CYZ_INTR
-		if (!IS_CYC_Z(*cinfo))
+		if (!cy_is_Z(cinfo))
 #endif
 		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
 			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
@@ -5172,7 +5184,7 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 		iounmap(cinfo->ctl_addr.p9050);
 	if (cinfo->irq
 #ifndef CONFIG_CYZ_INTR
-		&& !IS_CYC_Z(*cinfo)
+		&& !cy_is_Z(cinfo)
 #endif /* CONFIG_CYZ_INTR */
 		)
 		free_irq(cinfo->irq, cinfo);
@@ -5368,7 +5380,7 @@ static void __exit cy_cleanup_module(void)
 				iounmap(card->ctl_addr.p9050);
 			if (card->irq
 #ifndef CONFIG_CYZ_INTR
-				&& !IS_CYC_Z(*card)
+				&& !cy_is_Z(card)
 #endif /* CONFIG_CYZ_INTR */
 				)
 				free_irq(card->irq, card);
-- 
cgit v0.10.2


From 08a951e1693e324bd035b194002a82b9057fd9c5 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:33:40 +0100
Subject: tty: cyclades, remove typedefs

They are unused anyway.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index a14fe30..1fbdea4 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -142,19 +142,6 @@ struct CYZ_BOOT_CTRL {
 
 
 #ifndef DP_WINDOW_SIZE
-/* #include "cyclomz.h" */
-/****************** ****************** *******************/
-/*
- *	The data types defined below are used in all ZFIRM interface
- *	data structures. They accomodate differences between HW
- *	architectures and compilers.
- */
-
-typedef __u64  ucdouble;		/* 64 bits, unsigned */
-typedef __u32  uclong;			/* 32 bits, unsigned */
-typedef __u16  ucshort;		/* 16 bits, unsigned */
-typedef __u8   ucchar;			/* 8 bits, unsigned */
-
 /*
  *	Memory Window Sizes
  */
-- 
cgit v0.10.2


From 963118eef9e6706e2b5356309fb0cdd9c9eba81d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:34:27 +0100
Subject: tty: cyclades, fix nports handling

Set up ports right after FW load so that we won't allocate maximal
(64) ports when we use few.

Also remove reading of nports in irq context, since we know it from
initialisation now.

This also fixes a tty ports unregistration on some fail paths and for
Ze which registered 64 and unregistered real port count.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index c9a503f..6adbc97 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -604,7 +604,6 @@
 
 #define NR_PORTS	256
 
-#define ZE_V1_NPORTS	64
 #define ZO_V1	0
 #define ZO_V2	1
 #define ZE_V1	2
@@ -1777,7 +1776,6 @@ static void cyz_poll(unsigned long arg)
 	struct tty_struct *tty;
 	struct FIRM_ID __iomem *firm_id;
 	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
 	struct BUF_CTRL __iomem *buf_ctrl;
 	unsigned long expires = jiffies + HZ;
 	unsigned int port, card;
@@ -1793,11 +1791,9 @@ static void cyz_poll(unsigned long arg)
 		firm_id = cinfo->base_addr + ID_ADDRESS;
 		zfw_ctrl = cinfo->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &(zfw_ctrl->board_ctrl);
 
 	/* Skip first polling cycle to avoid racing conditions with the FW */
 		if (!cinfo->intr_enabled) {
-			cinfo->nports = (int)readl(&board_ctrl->n_channel);
 			cinfo->intr_enabled = 1;
 			continue;
 		}
@@ -2413,16 +2409,8 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		   interrupts should be enabled as soon as the first open
 		   happens to one of its ports. */
 			if (!cinfo->intr_enabled) {
-				struct ZFW_CTRL __iomem *zfw_ctrl;
-				struct BOARD_CTRL __iomem *board_ctrl;
 				u16 intr;
 
-				zfw_ctrl = cinfo->base_addr +
-					(readl(&firm_id->zfwctrl_addr) &
-					 0xfffff);
-
-				board_ctrl = &zfw_ctrl->board_ctrl;
-
 				/* Enable interrupts on the PLX chip */
 				intr = readw(&cinfo->ctl_addr.p9060->
 						intr_ctrl_stat) | 0x0900;
@@ -2435,8 +2423,6 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 					printk(KERN_ERR "cyc:IRQ enable retval "
 						"was %x\n", retval);
 				}
-				cinfo->nports =
-					(int)readl(&board_ctrl->n_channel);
 				cinfo->intr_enabled = 1;
 			}
 		}
@@ -4340,30 +4326,20 @@ static void cy_hangup(struct tty_struct *tty)
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
 	struct cyclades_port *info;
-	unsigned int nports, port;
+	unsigned int port;
 	unsigned short chip_number;
-	int uninitialized_var(index);
 
 	spin_lock_init(&cinfo->card_lock);
+	cinfo->intr_enabled = 0;
 
-	if (cy_is_Z(cinfo)) {	/* Cyclades-Z */
-		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
-		cinfo->intr_enabled = 0;
-		cinfo->nports = 0;	/* Will be correctly set later, after
-					   Z FW is loaded */
-	} else {
-		index = cinfo->bus_index;
-		nports = cinfo->nports = CyPORTS_PER_CHIP * cinfo->num_chips;
-	}
-
-	cinfo->ports = kzalloc(sizeof(*cinfo->ports) * nports, GFP_KERNEL);
+	cinfo->ports = kcalloc(cinfo->nports, sizeof(*cinfo->ports),
+			GFP_KERNEL);
 	if (cinfo->ports == NULL) {
 		printk(KERN_ERR "Cyclades: cannot allocate ports\n");
-		cinfo->nports = 0;
 		return -ENOMEM;
 	}
 
-	for (port = cinfo->first_line; port < cinfo->first_line + nports;
+	for (port = cinfo->first_line; port < cinfo->first_line + cinfo->nports;
 			port++) {
 		info = &cinfo->ports[port - cinfo->first_line];
 		tty_port_init(&info->port);
@@ -4388,6 +4364,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 				cyz_rx_restart, (unsigned long)info);
 #endif
 		} else {
+			int index = cinfo->bus_index;
 			info->type = PORT_CIRRUS;
 			info->xmit_fifo_size = CyMAX_CHAR_FIFO;
 			info->cor1 = CyPARITY_NONE | Cy_1_STOP | Cy_8_BITS;
@@ -4615,7 +4592,8 @@ static int __init cy_detect_isa(void)
 		cy_card[j].irq = (int)cy_isa_irq;
 		cy_card[j].bus_index = 0;
 		cy_card[j].first_line = cy_next_channel;
-		cy_card[j].num_chips = cy_isa_nchan / 4;
+		cy_card[j].num_chips = cy_isa_nchan / CyPORTS_PER_CHIP;
+		cy_card[j].nports = cy_isa_nchan;
 		if (cy_init_card(&cy_card[j])) {
 			cy_card[j].base_addr = NULL;
 			free_irq(cy_isa_irq, &cy_card[j]);
@@ -4771,7 +4749,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	struct CUSTOM_REG __iomem *cust = base_addr;
 	struct ZFW_CTRL __iomem *pt_zfwctrl;
 	void __iomem *tmp;
-	u32 mailbox, status;
+	u32 mailbox, status, nchan;
 	unsigned int i;
 	int retval;
 
@@ -4892,11 +4870,11 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 			base_addr + ID_ADDRESS, readl(&fid->zfwctrl_addr),
 			base_addr + readl(&fid->zfwctrl_addr));
 
+	nchan = readl(&pt_zfwctrl->board_ctrl.n_channel);
 	dev_info(&pdev->dev, "Cyclades-Z FW loaded: version = %x, ports = %u\n",
-		readl(&pt_zfwctrl->board_ctrl.fw_version),
-		readl(&pt_zfwctrl->board_ctrl.n_channel));
+		readl(&pt_zfwctrl->board_ctrl.fw_version), nchan);
 
-	if (readl(&pt_zfwctrl->board_ctrl.n_channel) == 0) {
+	if (nchan == 0) {
 		dev_warn(&pdev->dev, "no Cyclades-Z ports were found. Please "
 			"check the connection between the Z host card and the "
 			"serial expanders.\n");
@@ -4922,7 +4900,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
 			0x00030800UL);
 
-	return 0;
+	return nchan;
 err_rel:
 	release_firmware(fw);
 err:
@@ -5027,12 +5005,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 		if (mailbox == ZE_V1) {
 			card_name = "Cyclades-Ze";
-
-			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
-			nchan = 8;
-
 #ifdef CY_PCI_DEBUG
 			if (mailbox == ZO_V1) {
 				cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
@@ -5057,8 +5031,9 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 
 		retval = cyz_load_fw(pdev, addr2, addr0, irq);
-		if (retval)
+		if (retval <= 0)
 			goto err_unmap;
+		nchan = retval;
 	}
 
 	if ((cy_next_channel + nchan) > NR_PORTS) {
@@ -5088,7 +5063,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			dev_err(&pdev->dev, "could not allocate IRQ\n");
 			goto err_unmap;
 		}
-		cy_card[card_no].num_chips = nchan / 4;
+		cy_card[card_no].num_chips = nchan / CyPORTS_PER_CHIP;
 	} else {
 		cy_card[card_no].hw_ver = mailbox;
 		cy_card[card_no].num_chips = (unsigned int)-1;
@@ -5112,6 +5087,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 	cy_card[card_no].irq = irq;
 	cy_card[card_no].bus_index = 1;
 	cy_card[card_no].first_line = cy_next_channel;
+	cy_card[card_no].nports = nchan;
 	retval = cy_init_card(&cy_card[card_no]);
 	if (retval)
 		goto err_null;
-- 
cgit v0.10.2


From b39933fbd304021580800796683b8ddaa3dd0a6a Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:35:21 +0100
Subject: tty: cyclades, remove unused variables

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 6adbc97..4560190 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -2963,7 +2963,6 @@ static void set_line_char(struct cyclades_port *info)
 	void __iomem *base_addr;
 	int chip, channel, index;
 	unsigned cflag, iflag;
-	unsigned short chip_number;
 	int baud, baud_rate = 0;
 	int i;
 
@@ -2992,7 +2991,6 @@ static void set_line_char(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	chip_number = channel / 4;
 
 	if (!cy_is_Z(card)) {
 
@@ -3212,9 +3210,7 @@ static void set_line_char(struct cyclades_port *info)
 	} else {
 		struct FIRM_ID __iomem *firm_id;
 		struct ZFW_CTRL __iomem *zfw_ctrl;
-		struct BOARD_CTRL __iomem *board_ctrl;
 		struct CH_CTRL __iomem *ch_ctrl;
-		struct BUF_CTRL __iomem *buf_ctrl;
 		__u32 sw_flow;
 		int retval;
 
@@ -3224,9 +3220,7 @@ static void set_line_char(struct cyclades_port *info)
 
 		zfw_ctrl = card->base_addr +
 			(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &zfw_ctrl->board_ctrl;
 		ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-		buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
 
 		/* baud rate */
 		baud = tty_get_baud_rate(info->port.tty);
-- 
cgit v0.10.2


From 10077d4a6674f535abdbe25cdecb1202af7948f1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:36:09 +0100
Subject: tty: cdc_acm add krefs

Now we have a port structure begin using the fields and kref counts

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 41d4ca5..b4e73aa 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -62,7 +62,7 @@
 #include <linux/tty_flip.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/usb.h>
 #include <linux/usb/cdc.h>
 #include <asm/byteorder.h>
@@ -87,7 +87,7 @@ static struct acm *acm_table[ACM_TTY_MINORS];
 
 static DEFINE_MUTEX(open_mutex);
 
-#define ACM_READY(acm)	(acm && acm->dev && acm->used)
+#define ACM_READY(acm)	(acm && acm->dev && acm->port.count)
 
 static const struct tty_port_operations acm_port_ops = {
 };
@@ -265,6 +265,7 @@ static void acm_ctrl_irq(struct urb *urb)
 {
 	struct acm *acm = urb->context;
 	struct usb_cdc_notification *dr = urb->transfer_buffer;
+	struct tty_struct *tty;
 	unsigned char *data;
 	int newctrl;
 	int retval;
@@ -297,12 +298,16 @@ static void acm_ctrl_irq(struct urb *urb)
 			break;
 
 		case USB_CDC_NOTIFY_SERIAL_STATE:
-
+			tty = tty_port_tty_get(&acm->port);
 			newctrl = get_unaligned_le16(data);
 
-			if (acm->tty && !acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
-				dbg("calling hangup");
-				tty_hangup(acm->tty);
+			if (tty) {
+				if (!acm->clocal &&
+				    (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
+					dbg("calling hangup");
+					tty_hangup(tty);
+				}
+				tty_kref_put(tty);
 			}
 
 			acm->ctrlin = newctrl;
@@ -374,15 +379,14 @@ static void acm_rx_tasklet(unsigned long _acm)
 {
 	struct acm *acm = (void *)_acm;
 	struct acm_rb *buf;
-	struct tty_struct *tty = acm->tty;
+	struct tty_struct *tty;
 	struct acm_ru *rcv;
 	unsigned long flags;
 	unsigned char throttled;
 
 	dbg("Entering acm_rx_tasklet");
 
-	if (!ACM_READY(acm))
-	{
+	if (!ACM_READY(acm)) {
 		dbg("acm_rx_tasklet: ACM not ready");
 		return;
 	}
@@ -390,12 +394,13 @@ static void acm_rx_tasklet(unsigned long _acm)
 	spin_lock_irqsave(&acm->throttle_lock, flags);
 	throttled = acm->throttle;
 	spin_unlock_irqrestore(&acm->throttle_lock, flags);
-	if (throttled)
-	{
+	if (throttled) {
 		dbg("acm_rx_tasklet: throttled");
 		return;
 	}
 
+	tty = tty_port_tty_get(&acm->port);
+
 next_buffer:
 	spin_lock_irqsave(&acm->read_lock, flags);
 	if (list_empty(&acm->filled_read_bufs)) {
@@ -409,20 +414,22 @@ next_buffer:
 
 	dbg("acm_rx_tasklet: procesing buf 0x%p, size = %d", buf, buf->size);
 
-	tty_buffer_request_room(tty, buf->size);
-	spin_lock_irqsave(&acm->throttle_lock, flags);
-	throttled = acm->throttle;
-	spin_unlock_irqrestore(&acm->throttle_lock, flags);
-	if (!throttled)
-		tty_insert_flip_string(tty, buf->base, buf->size);
-	tty_flip_buffer_push(tty);
-
-	if (throttled) {
-		dbg("Throttling noticed");
-		spin_lock_irqsave(&acm->read_lock, flags);
-		list_add(&buf->list, &acm->filled_read_bufs);
-		spin_unlock_irqrestore(&acm->read_lock, flags);
-		return;
+	if (tty) {
+		spin_lock_irqsave(&acm->throttle_lock, flags);
+		throttled = acm->throttle;
+		spin_unlock_irqrestore(&acm->throttle_lock, flags);
+		if (!throttled) {
+			tty_buffer_request_room(tty, buf->size);
+			tty_insert_flip_string(tty, buf->base, buf->size);
+			tty_flip_buffer_push(tty);
+		} else {
+			tty_kref_put(tty);
+			dbg("Throttling noticed");
+			spin_lock_irqsave(&acm->read_lock, flags);
+			list_add(&buf->list, &acm->filled_read_bufs);
+			spin_unlock_irqrestore(&acm->read_lock, flags);
+			return;
+		}
 	}
 
 	spin_lock_irqsave(&acm->read_lock, flags);
@@ -431,6 +438,8 @@ next_buffer:
 	goto next_buffer;
 
 urbs:
+	tty_kref_put(tty);
+
 	while (!list_empty(&acm->spare_read_bufs)) {
 		spin_lock_irqsave(&acm->read_lock, flags);
 		if (list_empty(&acm->spare_read_urbs)) {
@@ -502,11 +511,14 @@ static void acm_write_bulk(struct urb *urb)
 static void acm_softint(struct work_struct *work)
 {
 	struct acm *acm = container_of(work, struct acm, work);
+	struct tty_struct *tty;
 
 	dev_vdbg(&acm->data->dev, "tx work\n");
 	if (!ACM_READY(acm))
 		return;
-	tty_wakeup(acm->tty);
+	tty = tty_port_tty_get(&acm->port);
+	tty_wakeup(tty);
+	tty_kref_put(tty);
 }
 
 static void acm_waker(struct work_struct *waker)
@@ -546,8 +558,9 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 		rv = 0;
 
 	set_bit(TTY_NO_WRITE_SPLIT, &tty->flags);
+
 	tty->driver_data = acm;
-	acm->tty = tty;
+	tty_port_tty_set(&acm->port, tty);
 
 	if (usb_autopm_get_interface(acm->control) < 0)
 		goto early_bail;
@@ -555,11 +568,10 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 		acm->control->needs_remote_wakeup = 1;
 
 	mutex_lock(&acm->mutex);
-	if (acm->used++) {
+	if (acm->port.count++) {
 		usb_autopm_put_interface(acm->control);
 		goto done;
-        }
-
+	}
 
 	acm->ctrlurb->dev = acm->dev;
 	if (usb_submit_urb(acm->ctrlurb, GFP_KERNEL)) {
@@ -570,6 +582,7 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	if (0 > acm_set_control(acm, acm->ctrlout = ACM_CTRL_DTR | ACM_CTRL_RTS) &&
 	    (acm->ctrl_caps & USB_CDC_CAP_LINE))
 		goto full_bailout;
+
 	usb_autopm_put_interface(acm->control);
 
 	INIT_LIST_HEAD(&acm->spare_read_urbs);
@@ -585,7 +598,7 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	acm->throttle = 0;
 
 	tasklet_schedule(&acm->urb_task);
-
+	rv = tty_port_block_til_ready(&acm->port, tty, filp);
 done:
 	mutex_unlock(&acm->mutex);
 err_out:
@@ -596,16 +609,17 @@ full_bailout:
 	usb_kill_urb(acm->ctrlurb);
 bail_out:
 	usb_autopm_put_interface(acm->control);
-	acm->used--;
+	acm->port.count--;
 	mutex_unlock(&acm->mutex);
 early_bail:
 	mutex_unlock(&open_mutex);
+	tty_port_tty_set(&acm->port, NULL);
 	return -EIO;
 }
 
 static void acm_tty_unregister(struct acm *acm)
 {
-	int i,nr;
+	int i, nr;
 
 	nr = acm->rx_buflimit;
 	tty_unregister_device(acm_tty_driver, acm->minor);
@@ -622,37 +636,51 @@ static void acm_tty_unregister(struct acm *acm)
 
 static int acm_tty_chars_in_buffer(struct tty_struct *tty);
 
+static void acm_port_down(struct acm *acm, int drain)
+{
+	int i, nr = acm->rx_buflimit;
+	mutex_lock(&open_mutex);
+	if (acm->dev) {
+		usb_autopm_get_interface(acm->control);
+		acm_set_control(acm, acm->ctrlout = 0);
+		/* try letting the last writes drain naturally */
+		if (drain) {
+			wait_event_interruptible_timeout(acm->drain_wait,
+				(ACM_NW == acm_wb_is_avail(acm)) || !acm->dev,
+					ACM_CLOSE_TIMEOUT * HZ);
+		}
+		usb_kill_urb(acm->ctrlurb);
+		for (i = 0; i < ACM_NW; i++)
+			usb_kill_urb(acm->wb[i].urb);
+		for (i = 0; i < nr; i++)
+			usb_kill_urb(acm->ru[i].urb);
+		acm->control->needs_remote_wakeup = 0;
+		usb_autopm_put_interface(acm->control);
+	}
+	mutex_unlock(&open_mutex);
+}
+
+static void acm_tty_hangup(struct tty_struct *tty)
+{
+	struct acm *acm = tty->driver_data;
+	tty_port_hangup(&acm->port);
+	acm_port_down(acm, 0);
+}
+
 static void acm_tty_close(struct tty_struct *tty, struct file *filp)
 {
 	struct acm *acm = tty->driver_data;
-	int i,nr;
 
-	if (!acm || !acm->used)
+	/* Perform the closing process and see if we need to do the hardware
+	   shutdown */
+	if (tty_port_close_start(&acm->port, tty, filp) == 0)
 		return;
-
-	nr = acm->rx_buflimit;
+	acm_port_down(acm, 0);
+	tty_port_close_end(&acm->port, tty);
 	mutex_lock(&open_mutex);
-	if (!--acm->used) {
-		if (acm->dev) {
-			usb_autopm_get_interface(acm->control);
-			acm_set_control(acm, acm->ctrlout = 0);
-
-			/* try letting the last writes drain naturally */
-			wait_event_interruptible_timeout(acm->drain_wait,
-					(ACM_NW == acm_wb_is_avail(acm))
-						|| !acm->dev,
-					ACM_CLOSE_TIMEOUT * HZ);
-
-			usb_kill_urb(acm->ctrlurb);
-			for (i = 0; i < ACM_NW; i++)
-				usb_kill_urb(acm->wb[i].urb);
-			for (i = 0; i < nr; i++)
-				usb_kill_urb(acm->ru[i].urb);
-			acm->control->needs_remote_wakeup = 0;
-			usb_autopm_put_interface(acm->control);
-		} else
-			acm_tty_unregister(acm);
-	}
+	tty_port_tty_set(&acm->port, NULL);
+	if (!acm->dev)
+		acm_tty_unregister(acm);
 	mutex_unlock(&open_mutex);
 }
 
@@ -885,8 +913,8 @@ static int acm_write_buffers_alloc(struct acm *acm)
 	return 0;
 }
 
-static int acm_probe (struct usb_interface *intf,
-		      const struct usb_device_id *id)
+static int acm_probe(struct usb_interface *intf,
+		     const struct usb_device_id *id)
 {
 	struct usb_cdc_union_desc *union_header = NULL;
 	struct usb_cdc_country_functional_desc *cfd = NULL;
@@ -1232,6 +1260,7 @@ static void acm_disconnect(struct usb_interface *intf)
 {
 	struct acm *acm = usb_get_intfdata(intf);
 	struct usb_device *usb_dev = interface_to_usbdev(intf);
+	struct tty_struct *tty;
 
 	/* sibling interface is already cleaning up */
 	if (!acm)
@@ -1258,16 +1287,18 @@ static void acm_disconnect(struct usb_interface *intf)
 	usb_driver_release_interface(&acm_driver, intf == acm->control ?
 					acm->data : acm->control);
 
-	if (!acm->used) {
+	if (acm->port.count == 0) {
 		acm_tty_unregister(acm);
 		mutex_unlock(&open_mutex);
 		return;
 	}
 
 	mutex_unlock(&open_mutex);
-
-	if (acm->tty)
-		tty_hangup(acm->tty);
+	tty = tty_port_tty_get(&acm->port);
+	if (tty) {
+		tty_hangup(tty);
+		tty_kref_put(tty);
+	}
 }
 
 #ifdef CONFIG_PM
@@ -1302,7 +1333,7 @@ static int acm_suspend(struct usb_interface *intf, pm_message_t message)
 	*/
 	mutex_lock(&acm->mutex);
 
-	if (acm->used)
+	if (acm->port.count)
 		stop_data_traffic(acm);
 
 	mutex_unlock(&acm->mutex);
@@ -1324,7 +1355,7 @@ static int acm_resume(struct usb_interface *intf)
 		return 0;
 
 	mutex_lock(&acm->mutex);
-	if (acm->used) {
+	if (acm->port.count) {
 		rv = usb_submit_urb(acm->ctrlurb, GFP_NOIO);
 		if (rv < 0)
 			goto err_out;
@@ -1434,6 +1465,7 @@ static struct usb_driver acm_driver = {
 static const struct tty_operations acm_ops = {
 	.open =			acm_tty_open,
 	.close =		acm_tty_close,
+	.hangup =		acm_tty_hangup,
 	.write =		acm_tty_write,
 	.write_room =		acm_tty_write_room,
 	.ioctl =		acm_tty_ioctl,
diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h
index 19967cd..4c38564 100644
--- a/drivers/usb/class/cdc-acm.h
+++ b/drivers/usb/class/cdc-acm.h
@@ -89,7 +89,6 @@ struct acm {
 	struct usb_device *dev;				/* the corresponding usb device */
 	struct usb_interface *control;			/* control interface */
 	struct usb_interface *data;			/* data interface */
-	struct tty_struct *tty;				/* the corresponding tty */
 	struct tty_port port;			 	/* our tty port data */
 	struct urb *ctrlurb;				/* urbs */
 	u8 *ctrl_buffer;				/* buffers of urbs */
@@ -121,7 +120,6 @@ struct acm {
 	unsigned int ctrlout;				/* output control lines (DTR, RTS) */
 	unsigned int writesize;				/* max packet size for the output bulk endpoint */
 	unsigned int readsize,ctrlsize;			/* buffer sizes for freeing */
-	unsigned int used;				/* someone has this acm's device open */
 	unsigned int minor;				/* acm minor number */
 	unsigned char throttle;				/* throttled by tty layer */
 	unsigned char clocal;				/* termios CLOCAL */
-- 
cgit v0.10.2


From 6e47e069eb4dffa88ad91ddfc3fd85f32c35654b Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:37:06 +0100
Subject: tty: Clean up the ACM driver to CodingStyle

Or at least most of it. There are further clean ups possible and there are
are also thing checkpatch moans about that would be silly to "fix".

Also note some FIXME points found as the cleanup was done.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index b4e73aa..ddeb691 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -16,7 +16,8 @@
  *	v0.9  - thorough cleaning, URBification, almost a rewrite
  *	v0.10 - some more cleanups
  *	v0.11 - fixed flow control, read error doesn't stop reads
- *	v0.12 - added TIOCM ioctls, added break handling, made struct acm kmalloced
+ *	v0.12 - added TIOCM ioctls, added break handling, made struct acm
+ *		kmalloced
  *	v0.13 - added termios, added hangup
  *	v0.14 - sized down struct acm
  *	v0.15 - fixed flow control again - characters could be lost
@@ -102,13 +103,15 @@ static const struct tty_port_operations acm_port_ops = {
  * Functions for ACM control messages.
  */
 
-static int acm_ctrl_msg(struct acm *acm, int request, int value, void *buf, int len)
+static int acm_ctrl_msg(struct acm *acm, int request, int value,
+							void *buf, int len)
 {
 	int retval = usb_control_msg(acm->dev, usb_sndctrlpipe(acm->dev, 0),
 		request, USB_RT_ACM, value,
 		acm->control->altsetting[0].desc.bInterfaceNumber,
 		buf, len, 5000);
-	dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d", request, value, len, retval);
+	dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d",
+						request, value, len, retval);
 	return retval < 0 ? retval : 0;
 }
 
@@ -153,9 +156,8 @@ static int acm_wb_is_avail(struct acm *acm)
 
 	n = ACM_NW;
 	spin_lock_irqsave(&acm->write_lock, flags);
-	for (i = 0; i < ACM_NW; i++) {
+	for (i = 0; i < ACM_NW; i++)
 		n -= acm->wb[i].use;
-	}
 	spin_unlock_irqrestore(&acm->write_lock, flags);
 	return n;
 }
@@ -186,7 +188,8 @@ static int acm_start_wb(struct acm *acm, struct acm_wb *wb)
 	wb->urb->transfer_buffer_length = wb->len;
 	wb->urb->dev = acm->dev;
 
-	if ((rc = usb_submit_urb(wb->urb, GFP_ATOMIC)) < 0) {
+	rc = usb_submit_urb(wb->urb, GFP_ATOMIC);
+	if (rc < 0) {
 		dbg("usb_submit_urb(write bulk) failed: %d", rc);
 		acm_write_done(acm, wb);
 	}
@@ -291,44 +294,45 @@ static void acm_ctrl_irq(struct urb *urb)
 
 	data = (unsigned char *)(dr + 1);
 	switch (dr->bNotificationType) {
+	case USB_CDC_NOTIFY_NETWORK_CONNECTION:
+		dbg("%s network", dr->wValue ?
+					"connected to" : "disconnected from");
+		break;
 
-		case USB_CDC_NOTIFY_NETWORK_CONNECTION:
+	case USB_CDC_NOTIFY_SERIAL_STATE:
+		tty = tty_port_tty_get(&acm->port);
+		newctrl = get_unaligned_le16(data);
 
-			dbg("%s network", dr->wValue ? "connected to" : "disconnected from");
-			break;
-
-		case USB_CDC_NOTIFY_SERIAL_STATE:
-			tty = tty_port_tty_get(&acm->port);
-			newctrl = get_unaligned_le16(data);
-
-			if (tty) {
-				if (!acm->clocal &&
-				    (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
-					dbg("calling hangup");
-					tty_hangup(tty);
-				}
-				tty_kref_put(tty);
+		if (tty) {
+			if (!acm->clocal &&
+				(acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
+				dbg("calling hangup");
+				tty_hangup(tty);
 			}
+			tty_kref_put(tty);
+		}
 
-			acm->ctrlin = newctrl;
-
-			dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
-				acm->ctrlin & ACM_CTRL_DCD ? '+' : '-',	acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_BRK ? '+' : '-',	acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',	acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
+		acm->ctrlin = newctrl;
 
+		dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
+			acm->ctrlin & ACM_CTRL_DCD ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_BRK ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
 			break;
 
-		default:
-			dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
-				dr->bNotificationType, dr->wIndex,
-				dr->wLength, data[0], data[1]);
-			break;
+	default:
+		dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
+			dr->bNotificationType, dr->wIndex,
+			dr->wLength, data[0], data[1]);
+		break;
 	}
 exit:
 	usb_mark_last_busy(acm->dev);
-	retval = usb_submit_urb (urb, GFP_ATOMIC);
+	retval = usb_submit_urb(urb, GFP_ATOMIC);
 	if (retval)
 		dev_err(&urb->dev->dev, "%s - usb_submit_urb failed with "
 			"result %d", __func__, retval);
@@ -466,10 +470,11 @@ urbs:
 		rcv->urb->transfer_dma = buf->dma;
 		rcv->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 
-		/* This shouldn't kill the driver as unsuccessful URBs are returned to the
-		   free-urbs-pool and resubmited ASAP */
+		/* This shouldn't kill the driver as unsuccessful URBs are
+		   returned to the free-urbs-pool and resubmited ASAP */
 		spin_lock_irqsave(&acm->read_lock, flags);
-		if (acm->susp_count || usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
+		if (acm->susp_count ||
+				usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
 			list_add(&buf->list, &acm->spare_read_bufs);
 			list_add(&rcv->list, &acm->spare_read_urbs);
 			acm->processing = 0;
@@ -588,12 +593,11 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	INIT_LIST_HEAD(&acm->spare_read_urbs);
 	INIT_LIST_HEAD(&acm->spare_read_bufs);
 	INIT_LIST_HEAD(&acm->filled_read_bufs);
-	for (i = 0; i < acm->rx_buflimit; i++) {
+
+	for (i = 0; i < acm->rx_buflimit; i++)
 		list_add(&(acm->ru[i].list), &acm->spare_read_urbs);
-	}
-	for (i = 0; i < acm->rx_buflimit; i++) {
+	for (i = 0; i < acm->rx_buflimit; i++)
 		list_add(&(acm->rb[i].list), &acm->spare_read_bufs);
-	}
 
 	acm->throttle = 0;
 
@@ -684,7 +688,8 @@ static void acm_tty_close(struct tty_struct *tty, struct file *filp)
 	mutex_unlock(&open_mutex);
 }
 
-static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+static int acm_tty_write(struct tty_struct *tty,
+					const unsigned char *buf, int count)
 {
 	struct acm *acm = tty->driver_data;
 	int stat;
@@ -700,7 +705,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
 		return 0;
 
 	spin_lock_irqsave(&acm->write_lock, flags);
-	if ((wbn = acm_wb_alloc(acm)) < 0) {
+	wbn = acm_wb_alloc(acm);
+	if (wbn < 0) {
 		spin_unlock_irqrestore(&acm->write_lock, flags);
 		return 0;
 	}
@@ -712,7 +718,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
 	wb->len = count;
 	spin_unlock_irqrestore(&acm->write_lock, flags);
 
-	if ((stat = acm_write_start(acm, wbn)) < 0)
+	stat = acm_write_start(acm, wbn);
+	if (stat < 0)
 		return stat;
 	return count;
 }
@@ -798,8 +805,10 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
 		return -EINVAL;
 
 	newctrl = acm->ctrlout;
-	set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
-	clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+	set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+					(set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+	clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+					(clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
 
 	newctrl = (newctrl & ~clear) | set;
 
@@ -808,7 +817,8 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
 	return acm_set_control(acm, acm->ctrlout = newctrl);
 }
 
-static int acm_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg)
+static int acm_tty_ioctl(struct tty_struct *tty, struct file *file,
+					unsigned int cmd, unsigned long arg)
 {
 	struct acm *acm = tty->driver_data;
 
@@ -830,7 +840,8 @@ static const __u8 acm_tty_size[] = {
 	5, 6, 7, 8
 };
 
-static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios_old)
+static void acm_tty_set_termios(struct tty_struct *tty,
+						struct ktermios *termios_old)
 {
 	struct acm *acm = tty->driver_data;
 	struct ktermios *termios = tty->termios;
@@ -840,19 +851,23 @@ static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios
 	if (!ACM_READY(acm))
 		return;
 
+	/* FIXME: Needs to support the tty_baud interface */
+	/* FIXME: Broken on sparc */
 	newline.dwDTERate = cpu_to_le32p(acm_tty_speed +
 		(termios->c_cflag & CBAUD & ~CBAUDEX) + (termios->c_cflag & CBAUDEX ? 15 : 0));
 	newline.bCharFormat = termios->c_cflag & CSTOPB ? 2 : 0;
 	newline.bParityType = termios->c_cflag & PARENB ?
-		(termios->c_cflag & PARODD ? 1 : 2) + (termios->c_cflag & CMSPAR ? 2 : 0) : 0;
+				(termios->c_cflag & PARODD ? 1 : 2) +
+				(termios->c_cflag & CMSPAR ? 2 : 0) : 0;
 	newline.bDataBits = acm_tty_size[(termios->c_cflag & CSIZE) >> 4];
-
+	/* FIXME: Needs to clear unsupported bits in the termios */
 	acm->clocal = ((termios->c_cflag & CLOCAL) != 0);
 
 	if (!newline.dwDTERate) {
 		newline.dwDTERate = acm->line.dwDTERate;
 		newctrl &= ~ACM_CTRL_DTR;
-	} else  newctrl |=  ACM_CTRL_DTR;
+	} else
+		newctrl |=  ACM_CTRL_DTR;
 
 	if (newctrl != acm->ctrlout)
 		acm_set_control(acm, acm->ctrlout = newctrl);
@@ -877,9 +892,8 @@ static void acm_write_buffers_free(struct acm *acm)
 	struct acm_wb *wb;
 	struct usb_device *usb_dev = interface_to_usbdev(acm->control);
 
-	for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++) {
+	for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++)
 		usb_buffer_free(usb_dev, acm->writesize, wb->buf, wb->dmah);
-	}
 }
 
 static void acm_read_buffers_free(struct acm *acm)
@@ -888,7 +902,8 @@ static void acm_read_buffers_free(struct acm *acm)
 	int i, n = acm->rx_buflimit;
 
 	for (i = 0; i < n; i++)
-		usb_buffer_free(usb_dev, acm->readsize, acm->rb[i].base, acm->rb[i].dma);
+		usb_buffer_free(usb_dev, acm->readsize,
+					acm->rb[i].base, acm->rb[i].dma);
 }
 
 /* Little helper: write buffers allocate */
@@ -928,7 +943,7 @@ static int acm_probe(struct usb_interface *intf,
 	struct usb_device *usb_dev = interface_to_usbdev(intf);
 	struct acm *acm;
 	int minor;
-	int ctrlsize,readsize;
+	int ctrlsize, readsize;
 	u8 *buf;
 	u8 ac_management_function = 0;
 	u8 call_management_function = 0;
@@ -948,7 +963,7 @@ static int acm_probe(struct usb_interface *intf,
 		control_interface = usb_ifnum_to_if(usb_dev, 0);
 		goto skip_normal_probe;
 	}
-	
+
 	/* normal probing*/
 	if (!buffer) {
 		dev_err(&intf->dev, "Weird descriptor references\n");
@@ -956,8 +971,10 @@ static int acm_probe(struct usb_interface *intf,
 	}
 
 	if (!buflen) {
-		if (intf->cur_altsetting->endpoint->extralen && intf->cur_altsetting->endpoint->extra) {
-			dev_dbg(&intf->dev,"Seeking extra descriptors on endpoint\n");
+		if (intf->cur_altsetting->endpoint->extralen &&
+				intf->cur_altsetting->endpoint->extra) {
+			dev_dbg(&intf->dev,
+				"Seeking extra descriptors on endpoint\n");
 			buflen = intf->cur_altsetting->endpoint->extralen;
 			buffer = intf->cur_altsetting->endpoint->extra;
 		} else {
@@ -968,47 +985,43 @@ static int acm_probe(struct usb_interface *intf,
 	}
 
 	while (buflen > 0) {
-		if (buffer [1] != USB_DT_CS_INTERFACE) {
+		if (buffer[1] != USB_DT_CS_INTERFACE) {
 			dev_err(&intf->dev, "skipping garbage\n");
 			goto next_desc;
 		}
 
-		switch (buffer [2]) {
-			case USB_CDC_UNION_TYPE: /* we've found it */
-				if (union_header) {
-					dev_err(&intf->dev, "More than one "
-						"union descriptor, "
-						"skipping ...\n");
-					goto next_desc;
-				}
-				union_header = (struct usb_cdc_union_desc *)
-							buffer;
-				break;
-			case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
-				cfd = (struct usb_cdc_country_functional_desc *)buffer;
-				break;
-			case USB_CDC_HEADER_TYPE: /* maybe check version */ 
-				break; /* for now we ignore it */ 
-			case USB_CDC_ACM_TYPE:
-				ac_management_function = buffer[3];
-				break;
-			case USB_CDC_CALL_MANAGEMENT_TYPE:
-				call_management_function = buffer[3];
-				call_interface_num = buffer[4];
-				if ((call_management_function & 3) != 3)
-					dev_err(&intf->dev, "This device "
-						"cannot do calls on its own. "
-						"It is no modem.\n");
-				break;
-			default:
-				/* there are LOTS more CDC descriptors that
-				 * could legitimately be found here.
-				 */
-				dev_dbg(&intf->dev, "Ignoring descriptor: "
-						"type %02x, length %d\n",
-						buffer[2], buffer[0]);
-				break;
+		switch (buffer[2]) {
+		case USB_CDC_UNION_TYPE: /* we've found it */
+			if (union_header) {
+				dev_err(&intf->dev, "More than one "
+					"union descriptor, skipping ...\n");
+				goto next_desc;
 			}
+			union_header = (struct usb_cdc_union_desc *)buffer;
+			break;
+		case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
+			cfd = (struct usb_cdc_country_functional_desc *)buffer;
+			break;
+		case USB_CDC_HEADER_TYPE: /* maybe check version */
+			break; /* for now we ignore it */
+		case USB_CDC_ACM_TYPE:
+			ac_management_function = buffer[3];
+			break;
+		case USB_CDC_CALL_MANAGEMENT_TYPE:
+			call_management_function = buffer[3];
+			call_interface_num = buffer[4];
+			if ((call_management_function & 3) != 3)
+				dev_err(&intf->dev, "This device cannot do calls on its own. It is not a modem.\n");
+			break;
+		default:
+			/* there are LOTS more CDC descriptors that
+			 * could legitimately be found here.
+			 */
+			dev_dbg(&intf->dev, "Ignoring descriptor: "
+					"type %02x, length %d\n",
+					buffer[2], buffer[0]);
+			break;
+		}
 next_desc:
 		buflen -= buffer[0];
 		buffer += buffer[0];
@@ -1016,33 +1029,36 @@ next_desc:
 
 	if (!union_header) {
 		if (call_interface_num > 0) {
-			dev_dbg(&intf->dev,"No union descriptor, using call management descriptor\n");
+			dev_dbg(&intf->dev, "No union descriptor, using call management descriptor\n");
 			data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = call_interface_num));
 			control_interface = intf;
 		} else {
-			dev_dbg(&intf->dev,"No union descriptor, giving up\n");
+			dev_dbg(&intf->dev,
+					"No union descriptor, giving up\n");
 			return -ENODEV;
 		}
 	} else {
 		control_interface = usb_ifnum_to_if(usb_dev, union_header->bMasterInterface0);
 		data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = union_header->bSlaveInterface0));
 		if (!control_interface || !data_interface) {
-			dev_dbg(&intf->dev,"no interfaces\n");
+			dev_dbg(&intf->dev, "no interfaces\n");
 			return -ENODEV;
 		}
 	}
-	
+
 	if (data_interface_num != call_interface_num)
-		dev_dbg(&intf->dev,"Separate call control interface. That is not fully supported.\n");
+		dev_dbg(&intf->dev, "Separate call control interface. That is not fully supported.\n");
 
 skip_normal_probe:
 
 	/*workaround for switched interfaces */
-	if (data_interface->cur_altsetting->desc.bInterfaceClass != CDC_DATA_INTERFACE_TYPE) {
-		if (control_interface->cur_altsetting->desc.bInterfaceClass == CDC_DATA_INTERFACE_TYPE) {
+	if (data_interface->cur_altsetting->desc.bInterfaceClass
+						!= CDC_DATA_INTERFACE_TYPE) {
+		if (control_interface->cur_altsetting->desc.bInterfaceClass
+						== CDC_DATA_INTERFACE_TYPE) {
 			struct usb_interface *t;
-			dev_dbg(&intf->dev,"Your device has switched interfaces.\n");
-
+			dev_dbg(&intf->dev,
+				"Your device has switched interfaces.\n");
 			t = control_interface;
 			control_interface = data_interface;
 			data_interface = t;
@@ -1054,9 +1070,9 @@ skip_normal_probe:
 	/* Accept probe requests only for the control interface */
 	if (intf != control_interface)
 		return -ENODEV;
-	
+
 	if (usb_interface_claimed(data_interface)) { /* valid in this context */
-		dev_dbg(&intf->dev,"The data interface isn't available\n");
+		dev_dbg(&intf->dev, "The data interface isn't available\n");
 		return -EBUSY;
 	}
 
@@ -1073,8 +1089,8 @@ skip_normal_probe:
 	if (!usb_endpoint_dir_in(epread)) {
 		/* descriptors are swapped */
 		struct usb_endpoint_descriptor *t;
-		dev_dbg(&intf->dev,"The data interface has switched endpoints\n");
-		
+		dev_dbg(&intf->dev,
+			"The data interface has switched endpoints\n");
 		t = epread;
 		epread = epwrite;
 		epwrite = t;
@@ -1087,13 +1103,15 @@ skip_normal_probe:
 		return -ENODEV;
 	}
 
-	if (!(acm = kzalloc(sizeof(struct acm), GFP_KERNEL))) {
+	acm = kzalloc(sizeof(struct acm), GFP_KERNEL);
+	if (acm == NULL) {
 		dev_dbg(&intf->dev, "out of memory (acm kzalloc)\n");
 		goto alloc_fail;
 	}
 
 	ctrlsize = le16_to_cpu(epctrl->wMaxPacketSize);
-	readsize = le16_to_cpu(epread->wMaxPacketSize)* ( quirks == SINGLE_RX_URB ? 1 : 2);
+	readsize = le16_to_cpu(epread->wMaxPacketSize) *
+				(quirks == SINGLE_RX_URB ? 1 : 2);
 	acm->writesize = le16_to_cpu(epwrite->wMaxPacketSize) * 20;
 	acm->control = control_interface;
 	acm->data = data_interface;
@@ -1136,8 +1154,10 @@ skip_normal_probe:
 	for (i = 0; i < num_rx_buf; i++) {
 		struct acm_ru *rcv = &(acm->ru[i]);
 
-		if (!(rcv->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-			dev_dbg(&intf->dev, "out of memory (read urbs usb_alloc_urb)\n");
+		rcv->urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (rcv->urb == NULL) {
+			dev_dbg(&intf->dev,
+				"out of memory (read urbs usb_alloc_urb)\n");
 			goto alloc_fail7;
 		}
 
@@ -1150,26 +1170,29 @@ skip_normal_probe:
 		rb->base = usb_buffer_alloc(acm->dev, readsize,
 				GFP_KERNEL, &rb->dma);
 		if (!rb->base) {
-			dev_dbg(&intf->dev, "out of memory (read bufs usb_buffer_alloc)\n");
+			dev_dbg(&intf->dev,
+				"out of memory (read bufs usb_buffer_alloc)\n");
 			goto alloc_fail7;
 		}
 	}
-	for(i = 0; i < ACM_NW; i++)
-	{
+	for (i = 0; i < ACM_NW; i++) {
 		struct acm_wb *snd = &(acm->wb[i]);
 
-		if (!(snd->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-			dev_dbg(&intf->dev, "out of memory (write urbs usb_alloc_urb)");
+		snd->urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (snd->urb == NULL) {
+			dev_dbg(&intf->dev,
+				"out of memory (write urbs usb_alloc_urb)");
 			goto alloc_fail7;
 		}
 
-		usb_fill_bulk_urb(snd->urb, usb_dev, usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
-				NULL, acm->writesize, acm_write_bulk, snd);
+		usb_fill_bulk_urb(snd->urb, usb_dev,
+			usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
+			NULL, acm->writesize, acm_write_bulk, snd);
 		snd->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 		snd->instance = acm;
 	}
 
-	usb_set_intfdata (intf, acm);
+	usb_set_intfdata(intf, acm);
 
 	i = device_create_file(&intf->dev, &dev_attr_bmCapabilities);
 	if (i < 0)
@@ -1180,7 +1203,8 @@ skip_normal_probe:
 		if (!acm->country_codes)
 			goto skip_countries;
 		acm->country_code_size = cfd->bLength - 4;
-		memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0, cfd->bLength - 4);
+		memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0,
+							cfd->bLength - 4);
 		acm->country_rel_date = cfd->iCountryCodeRelDate;
 
 		i = device_create_file(&intf->dev, &dev_attr_wCountryCodes);
@@ -1189,7 +1213,8 @@ skip_normal_probe:
 			goto skip_countries;
 		}
 
-		i = device_create_file(&intf->dev, &dev_attr_iCountryCodeRelDate);
+		i = device_create_file(&intf->dev,
+						&dev_attr_iCountryCodeRelDate);
 		if (i < 0) {
 			kfree(acm->country_codes);
 			goto skip_countries;
@@ -1197,8 +1222,10 @@ skip_normal_probe:
 	}
 
 skip_countries:
-	usb_fill_int_urb(acm->ctrlurb, usb_dev, usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
-			 acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm, epctrl->bInterval);
+	usb_fill_int_urb(acm->ctrlurb, usb_dev,
+			usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
+			acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm,
+			epctrl->bInterval);
 	acm->ctrlurb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 	acm->ctrlurb->transfer_dma = acm->ctrl_dma;
 
@@ -1245,7 +1272,7 @@ static void stop_data_traffic(struct acm *acm)
 	tasklet_disable(&acm->urb_task);
 
 	usb_kill_urb(acm->ctrlurb);
-	for(i = 0; i < ACM_NW; i++)
+	for (i = 0; i < ACM_NW; i++)
 		usb_kill_urb(acm->wb[i].urb);
 	for (i = 0; i < acm->rx_buflimit; i++)
 		usb_kill_urb(acm->ru[i].urb);
@@ -1267,7 +1294,7 @@ static void acm_disconnect(struct usb_interface *intf)
 		return;
 
 	mutex_lock(&open_mutex);
-	if (acm->country_codes){
+	if (acm->country_codes) {
 		device_remove_file(&acm->control->dev,
 				&dev_attr_wCountryCodes);
 		device_remove_file(&acm->control->dev,
@@ -1281,7 +1308,8 @@ static void acm_disconnect(struct usb_interface *intf)
 	stop_data_traffic(acm);
 
 	acm_write_buffers_free(acm);
-	usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma);
+	usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer,
+								acm->ctrl_dma);
 	acm_read_buffers_free(acm);
 
 	usb_driver_release_interface(&acm_driver, intf == acm->control ?
@@ -1434,7 +1462,7 @@ static struct usb_device_id acm_ids[] = {
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
 		USB_CDC_ACM_PROTO_AT_GSM) },
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
-		USB_CDC_ACM_PROTO_AT_3G	) },
+		USB_CDC_ACM_PROTO_AT_3G) },
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
 		USB_CDC_ACM_PROTO_AT_CDMA) },
 
@@ -1442,7 +1470,7 @@ static struct usb_device_id acm_ids[] = {
 	{ }
 };
 
-MODULE_DEVICE_TABLE (usb, acm_ids);
+MODULE_DEVICE_TABLE(usb, acm_ids);
 
 static struct usb_driver acm_driver = {
 	.name =		"cdc_acm",
@@ -1497,7 +1525,8 @@ static int __init acm_init(void)
 	acm_tty_driver->subtype = SERIAL_TYPE_NORMAL,
 	acm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
 	acm_tty_driver->init_termios = tty_std_termios;
-	acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD |
+								HUPCL | CLOCAL;
 	tty_set_operations(acm_tty_driver, &acm_ops);
 
 	retval = tty_register_driver(acm_tty_driver);
@@ -1529,7 +1558,7 @@ static void __exit acm_exit(void)
 module_init(acm_init);
 module_exit(acm_exit);
 
-MODULE_AUTHOR( DRIVER_AUTHOR );
-MODULE_DESCRIPTION( DRIVER_DESC );
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_CHARDEV_MAJOR(ACM_TTY_MAJOR);
-- 
cgit v0.10.2


From 70beaed22cbe12979e55d99b370e147e2e168562 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:39:12 +0100
Subject: serial: refactor ASYNC_ flags

Define ASYNCB_* flags which are bit numbers of the ASYNC_* flags.
This is useful for {test,set,clear}_bit.

Also convert each ASYNC_% to be (1 << ASYNCB_%) and define masks
with the macros, not constants.

Tested with:
#include "PATH_TO_KERNEL/include/linux/serial.h"
static struct {
        unsigned int new, old;
} as[] = {
        { ASYNC_HUP_NOTIFY, 0x0001 },
        { ASYNC_FOURPORT, 0x0002 },
...
	{ ASYNC_BOOT_ONLYMCA, 0x00400000 },
        { ASYNC_INTERNAL_FLAGS, 0xFFC00000 }
};
...
        for (a = 0; a < ARRAY_SIZE(as); a++)
                if (as[a].old != as[a].new)
                        printf("%.8x != %.8x\n", as[a].old, as[a].new);

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/serial.h b/include/linux/serial.h
index 9136cc5..e5bb75a 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -96,54 +96,76 @@ struct serial_uart_config {
 
 /*
  * Definitions for async_struct (and serial_struct) flags field
+ *
+ * Define ASYNCB_* for convenient use with {test,set,clear}_bit.
  */
-#define ASYNC_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes 
-				   on the callout port */
-#define ASYNC_FOURPORT  0x0002	/* Set OU1, OUT2 per AST Fourport settings */
-#define ASYNC_SAK	0x0004	/* Secure Attention Key (Orange book) */
-#define ASYNC_SPLIT_TERMIOS 0x0008 /* Separate termios for dialin/callout */
-
-#define ASYNC_SPD_MASK	0x1030
-#define ASYNC_SPD_HI	0x0010	/* Use 56000 instead of 38400 bps */
-
-#define ASYNC_SPD_VHI	0x0020  /* Use 115200 instead of 38400 bps */
-#define ASYNC_SPD_CUST	0x0030  /* Use user-specified divisor */
-
-#define ASYNC_SKIP_TEST	0x0040 /* Skip UART test during autoconfiguration */
-#define ASYNC_AUTO_IRQ  0x0080 /* Do automatic IRQ during autoconfiguration */
-#define ASYNC_SESSION_LOCKOUT 0x0100 /* Lock out cua opens based on session */
-#define ASYNC_PGRP_LOCKOUT    0x0200 /* Lock out cua opens based on pgrp */
-#define ASYNC_CALLOUT_NOHUP   0x0400 /* Don't do hangups for cua device */
-
-#define ASYNC_HARDPPS_CD	0x0800	/* Call hardpps when CD goes high  */
-
-#define ASYNC_SPD_SHI	0x1000	/* Use 230400 instead of 38400 bps */
-#define ASYNC_SPD_WARP	0x1010	/* Use 460800 instead of 38400 bps */
-
-#define ASYNC_LOW_LATENCY 0x2000 /* Request low latency behaviour */
-
-#define ASYNC_BUGGY_UART  0x4000 /* This is a buggy UART, skip some safety
-				  * checks.  Note: can be dangerous! */
-
-#define ASYNC_AUTOPROBE	 0x8000 /* Port was autoprobed by PCI or PNP code */
-
-#define ASYNC_FLAGS	0x7FFF	/* Possible legal async flags */
-#define ASYNC_USR_MASK	0x3430	/* Legal flags that non-privileged
-				 * users can set or reset */
-
-/* Internal flags used only by kernel/chr_drv/serial.c */
-#define ASYNC_INITIALIZED	0x80000000 /* Serial port was initialized */
-#define ASYNC_NORMAL_ACTIVE	0x20000000 /* Normal device is active */
-#define ASYNC_BOOT_AUTOCONF	0x10000000 /* Autoconfigure port on bootup */
-#define ASYNC_CLOSING		0x08000000 /* Serial port is closing */
-#define ASYNC_CTS_FLOW		0x04000000 /* Do CTS flow control */
-#define ASYNC_CHECK_CD		0x02000000 /* i.e., CLOCAL */
-#define ASYNC_SHARE_IRQ		0x01000000 /* for multifunction cards
-					     --- no longer used */
-#define ASYNC_CONS_FLOW		0x00800000 /* flow control for console  */
-
-#define ASYNC_BOOT_ONLYMCA	0x00400000 /* Probe only if MCA bus */
-#define ASYNC_INTERNAL_FLAGS	0xFFC00000 /* Internal flags */
+#define ASYNCB_HUP_NOTIFY	 0 /* Notify getty on hangups and closes
+				    * on the callout port */
+#define ASYNCB_FOURPORT		 1 /* Set OU1, OUT2 per AST Fourport settings */
+#define ASYNCB_SAK		 2 /* Secure Attention Key (Orange book) */
+#define ASYNCB_SPLIT_TERMIOS	 3 /* Separate termios for dialin/callout */
+#define ASYNCB_SPD_HI		 4 /* Use 56000 instead of 38400 bps */
+#define ASYNCB_SPD_VHI		 5 /* Use 115200 instead of 38400 bps */
+#define ASYNCB_SKIP_TEST	 6 /* Skip UART test during autoconfiguration */
+#define ASYNCB_AUTO_IRQ		 7 /* Do automatic IRQ during
+				    * autoconfiguration */
+#define ASYNCB_SESSION_LOCKOUT	 8 /* Lock out cua opens based on session */
+#define ASYNCB_PGRP_LOCKOUT	 9 /* Lock out cua opens based on pgrp */
+#define ASYNCB_CALLOUT_NOHUP	10 /* Don't do hangups for cua device */
+#define ASYNCB_HARDPPS_CD	11 /* Call hardpps when CD goes high  */
+#define ASYNCB_SPD_SHI		12 /* Use 230400 instead of 38400 bps */
+#define ASYNCB_LOW_LATENCY	13 /* Request low latency behaviour */
+#define ASYNCB_BUGGY_UART	14 /* This is a buggy UART, skip some safety
+				    * checks.  Note: can be dangerous! */
+#define ASYNCB_AUTOPROBE	15 /* Port was autoprobed by PCI or PNP code */
+#define ASYNCB_LAST_USER	15
+
+/* Internal flags used only by kernel */
+#define ASYNCB_INITIALIZED	31 /* Serial port was initialized */
+#define ASYNCB_NORMAL_ACTIVE	29 /* Normal device is active */
+#define ASYNCB_BOOT_AUTOCONF	28 /* Autoconfigure port on bootup */
+#define ASYNCB_CLOSING		27 /* Serial port is closing */
+#define ASYNCB_CTS_FLOW		26 /* Do CTS flow control */
+#define ASYNCB_CHECK_CD		25 /* i.e., CLOCAL */
+#define ASYNCB_SHARE_IRQ	24 /* for multifunction cards, no longer used */
+#define ASYNCB_CONS_FLOW	23 /* flow control for console  */
+#define ASYNCB_BOOT_ONLYMCA	22 /* Probe only if MCA bus */
+#define ASYNCB_FIRST_KERNEL	22
+
+#define ASYNC_HUP_NOTIFY	(1U << ASYNCB_HUP_NOTIFY)
+#define ASYNC_FOURPORT		(1U << ASYNCB_FOURPORT)
+#define ASYNC_SAK		(1U << ASYNCB_SAK)
+#define ASYNC_SPLIT_TERMIOS	(1U << ASYNCB_SPLIT_TERMIOS)
+#define ASYNC_SPD_HI		(1U << ASYNCB_SPD_HI)
+#define ASYNC_SPD_VHI		(1U << ASYNCB_SPD_VHI)
+#define ASYNC_SKIP_TEST		(1U << ASYNCB_SKIP_TEST)
+#define ASYNC_AUTO_IRQ		(1U << ASYNCB_AUTO_IRQ)
+#define ASYNC_SESSION_LOCKOUT	(1U << ASYNCB_SESSION_LOCKOUT)
+#define ASYNC_PGRP_LOCKOUT	(1U << ASYNCB_PGRP_LOCKOUT)
+#define ASYNC_CALLOUT_NOHUP	(1U << ASYNCB_CALLOUT_NOHUP)
+#define ASYNC_HARDPPS_CD	(1U << ASYNCB_HARDPPS_CD)
+#define ASYNC_SPD_SHI		(1U << ASYNCB_SPD_SHI)
+#define ASYNC_LOW_LATENCY	(1U << ASYNCB_LOW_LATENCY)
+#define ASYNC_BUGGY_UART	(1U << ASYNCB_BUGGY_UART)
+#define ASYNC_AUTOPROBE		(1U << ASYNCB_AUTOPROBE)
+
+#define ASYNC_FLAGS		((1U << ASYNCB_LAST_USER) - 1)
+#define ASYNC_USR_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI| \
+		ASYNC_CALLOUT_NOHUP|ASYNC_SPD_SHI|ASYNC_LOW_LATENCY)
+#define ASYNC_SPD_CUST		(ASYNC_SPD_HI|ASYNC_SPD_VHI)
+#define ASYNC_SPD_WARP		(ASYNC_SPD_HI|ASYNC_SPD_SHI)
+#define ASYNC_SPD_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI)
+
+#define ASYNC_INITIALIZED	(1U << ASYNCB_INITIALIZED)
+#define ASYNC_NORMAL_ACTIVE	(1U << ASYNCB_NORMAL_ACTIVE)
+#define ASYNC_BOOT_AUTOCONF	(1U << ASYNCB_BOOT_AUTOCONF)
+#define ASYNC_CLOSING		(1U << ASYNCB_CLOSING)
+#define ASYNC_CTS_FLOW		(1U << ASYNCB_CTS_FLOW)
+#define ASYNC_CHECK_CD		(1U << ASYNCB_CHECK_CD)
+#define ASYNC_SHARE_IRQ		(1U << ASYNCB_SHARE_IRQ)
+#define ASYNC_CONS_FLOW		(1U << ASYNCB_CONS_FLOW)
+#define ASYNC_BOOT_ONLYMCA	(1U << ASYNCB_BOOT_ONLYMCA)
+#define ASYNC_INTERNAL_FLAGS	(~((1U << ASYNCB_FIRST_KERNEL) - 1))
 
 /*
  * Multiport serial configuration structure --- external structure
-- 
cgit v0.10.2


From a391ad0f09014856bbc4eeea309593eba977b3b0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:40:17 +0100
Subject: rocket: fix test_bit parameters

Switch from ASYNC_* to ASYNCB_*, because {test,set}_bit expect
bit number, not mask.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 7399188..63d5b62 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -939,7 +939,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	if (!test_bit(ASYNC_INITIALIZED, &port->flags)) {
+	if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) {
 		cp = &info->channel;
 		sSetRxTrigger(cp, TRIG_1);
 		if (sGetChanStatus(cp) & CD_ACT)
@@ -963,7 +963,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 		sEnRxFIFO(cp);
 		sEnTransmit(cp);
 
-		set_bit(ASYNC_INITIALIZED, &info->port.flags);
+		set_bit(ASYNCB_INITIALIZED, &info->port.flags);
 
 		/*
 		 * Set up the tty->alt_speed kludge
@@ -1646,7 +1646,7 @@ static int rp_write(struct tty_struct *tty,
 	/*  Write remaining data into the port's xmit_buf */
 	while (1) {
 		/* Hung up ? */
-		if (!test_bit(ASYNC_NORMAL_ACTIVE, &info->port.flags))
+		if (!test_bit(ASYNCB_NORMAL_ACTIVE, &info->port.flags))
 			goto end;
 		c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1);
 		c = min(c, XMIT_BUF_SIZE - info->xmit_head);
-- 
cgit v0.10.2


From c3301a5c04800bcf8afc8a815bf9e570a4e25a08 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:41:05 +0100
Subject: epca: fix test_bit parameters

Switch from ASYNC_* to ASYNCB_*, because test_bit expects
bit number, not mask.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 8797b77..710ee93 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -1518,7 +1518,7 @@ static void doevent(int crd)
 		if (event & MODEMCHG_IND) {
 			/* A modem signal change has been indicated */
 			ch->imodem = mstat;
-			if (test_bit(ASYNC_CHECK_CD, &ch->port.flags)) {
+			if (test_bit(ASYNCB_CHECK_CD, &ch->port.flags)) {
 				/* We are now receiving dcd */
 				if (mstat & ch->dcd)
 					wake_up_interruptible(&ch->port.open_wait);
@@ -1765,9 +1765,9 @@ static void epcaparam(struct tty_struct *tty, struct channel *ch)
 		 * that the driver will wait on carrier detect.
 		 */
 		if (ts->c_cflag & CLOCAL)
-			clear_bit(ASYNC_CHECK_CD, &ch->port.flags);
+			clear_bit(ASYNCB_CHECK_CD, &ch->port.flags);
 		else
-			set_bit(ASYNC_CHECK_CD, &ch->port.flags);
+			set_bit(ASYNCB_CHECK_CD, &ch->port.flags);
 		mval = ch->m_dtr | ch->m_rts;
 	} /* End CBAUD not detected */
 	iflag = termios2digi_i(ch, ts->c_iflag);
@@ -2244,7 +2244,8 @@ static void do_softint(struct work_struct *work)
 			if (test_and_clear_bit(EPCA_EVENT_HANGUP, &ch->event)) {
 				tty_hangup(tty);
 				wake_up_interruptible(&ch->port.open_wait);
-				clear_bit(ASYNC_NORMAL_ACTIVE, &ch->port.flags);
+				clear_bit(ASYNCB_NORMAL_ACTIVE,
+						&ch->port.flags);
 			}
 		}
 		tty_kref_put(tty);
-- 
cgit v0.10.2


From 70fd8fdecc4430ffcede7704dd812d4054d1faf9 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Thu, 11 Jun 2009 12:41:57 +0100
Subject: 8250_pci: add the OXCB950 chip to the 8250 PCI driver.

This adds support for the following serial controller chip:
Oxford Semiconductor OXCB950 for PCI Cardbus interface
http://www.transdimension.com/products/serial/OXCB950.html

on this card:
ExSys EX-1370 1 port high-speed serial card for ExpressCard/34 slot

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 938bc1b..e371a9c 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -2776,6 +2776,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 	{	PCI_VENDOR_ID_OXSEMI, 0x950a,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_2_1130000 },
+	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_C950,
+		PCI_VENDOR_ID_OXSEMI, PCI_SUBDEVICE_ID_OXSEMI_C950, 0, 0,
+		pbn_b0_1_921600 },
 	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_4_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0f71812..d7d1c41 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1996,10 +1996,12 @@
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_U	0xC118
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU	0xC11C
 #define PCI_DEVICE_ID_OXSEMI_16PCI954	0x9501
+#define PCI_DEVICE_ID_OXSEMI_C950	0x950B
 #define PCI_DEVICE_ID_OXSEMI_16PCI95N	0x9511
 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP	0x9513
 #define PCI_DEVICE_ID_OXSEMI_16PCI952	0x9521
 #define PCI_DEVICE_ID_OXSEMI_16PCI952PP	0x9523
+#define PCI_SUBDEVICE_ID_OXSEMI_C950	0x0001
 
 #define PCI_VENDOR_ID_CHELSIO		0x1425
 
-- 
cgit v0.10.2


From 5b0ed5263cb089500052f8c1ab6e0706bebf0d83 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 11 Jun 2009 12:42:49 +0100
Subject: x86: fix ktermios-termio conversion

The legacy TCSETA{,W,F} ioctls failed to set the termio->c_line field
on x86. This adds a missing get_user.

The same ioctls also fail to report faulting user pointers, which
we keep ignoring.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f729563..c4ee805 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
 	SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
 	SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
 	SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+	get_user(termios->c_line, &termio->c_line);
 	return copy_from_user(termios->c_cc, termio->c_cc, NCC);
 }
 
-- 
cgit v0.10.2


From 38db89799bdf11625a831c5af33938dcb11908b6 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:44:17 +0100
Subject: tty: throttling race fix

The tty throttling code can race due to the lock drops. It takes very high
loads but this has been observed and verified by Rob Duncan.

The basic problem is that on an SMP box we can go

	CPU #1				CPU #2
	need to throttle ?
	suppose we should		buffer space cleared
					are we throttled
					yes ? - unthrottle
	call throttle method

This changeet take the termios lock to protect against this. The termios
lock isn't the initial obvious candidate but many implementations of throttle
methods already need to poke around their own termios structures (and nobody
really locks them against a racing change of flow control).

This does mean that anyone who is setting tty->low_latency = 1 and then
calling tty_flip_buffer_push from their unthrottle method is going to end up
collapsing in a pile of locks. However we've removed all the known bogus
users of low_latency = 1 and such use isn't safe anyway for other reasons so
catching it would be an improvement.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 6f4c7d0..2401dbc 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -97,14 +97,19 @@ EXPORT_SYMBOL(tty_driver_flush_buffer);
  *	@tty: terminal
  *
  *	Indicate that a tty should stop transmitting data down the stack.
+ *	Takes the termios mutex to protect against parallel throttle/unthrottle
+ *	and also to ensure the driver can consistently reference its own
+ *	termios data at this point when implementing software flow control.
  */
 
 void tty_throttle(struct tty_struct *tty)
 {
+	mutex_lock(&tty->termios_mutex);
 	/* check TTY_THROTTLED first so it indicates our state */
 	if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
 	    tty->ops->throttle)
 		tty->ops->throttle(tty);
+	mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_throttle);
 
@@ -113,13 +118,21 @@ EXPORT_SYMBOL(tty_throttle);
  *	@tty: terminal
  *
  *	Indicate that a tty may continue transmitting data down the stack.
+ *	Takes the termios mutex to protect against parallel throttle/unthrottle
+ *	and also to ensure the driver can consistently reference its own
+ *	termios data at this point when implementing software flow control.
+ *
+ *	Drivers should however remember that the stack can issue a throttle,
+ *	then change flow control method, then unthrottle.
  */
 
 void tty_unthrottle(struct tty_struct *tty)
 {
+	mutex_lock(&tty->termios_mutex);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) &&
 	    tty->ops->unthrottle)
 		tty->ops->unthrottle(tty);
+	mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_unthrottle);
 
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index bcba84e..3566129 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -127,7 +127,8 @@
  * 	the line discipline are close to full, and it should somehow
  * 	signal that no more characters should be sent to the tty.
  *
- *	Optional: Always invoke via tty_throttle();
+ *	Optional: Always invoke via tty_throttle(), called under the
+ *	termios lock.
  * 
  * void (*unthrottle)(struct tty_struct * tty);
  *
@@ -135,7 +136,8 @@
  * 	that characters can now be sent to the tty without fear of
  * 	overrunning the input buffers of the line disciplines.
  * 
- *	Optional: Always invoke via tty_unthrottle();
+ *	Optional: Always invoke via tty_unthrottle(), called under the
+ *	termios lock.
  *
  * void (*stop)(struct tty_struct *tty);
  *
-- 
cgit v0.10.2


From 4b6ae89ef40d0d5d4c85098c1d0a012333a68729 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:45:08 +0100
Subject: serial: update maintainers file

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abdd..1979167 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -159,7 +159,8 @@ F:	drivers/net/r8169.c
 8250/16?50 (AND CLONE UARTS) SERIAL DRIVER
 L:	linux-serial@vger.kernel.org
 W:	http://serial.sourceforge.net
-S:	Orphan
+M:	alan@lxorguk.ukuu.org.uk
+S:	Odd Fixes
 F:	drivers/serial/8250*
 F:	include/linux/serial_8250.h
 
-- 
cgit v0.10.2


From 620df3c0a5b70656c4de6049825de214f108218e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:45:53 +0100
Subject: pty: Fix a comment

We fixed the globals, so now fix the comment

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 31038a0..da2cb8c 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -30,7 +30,6 @@
 
 #include <asm/system.h>
 
-/* These are global because they are accessed in tty_io.c */
 #ifdef CONFIG_UNIX98_PTYS
 static struct tty_driver *ptm_driver;
 static struct tty_driver *pts_driver;
-- 
cgit v0.10.2


From 5f0878acba7db24323f5ba4055ec9a96895bb150 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:46:41 +0100
Subject: tty: Fix oops when scanning the polling list for kgdb

Costantino Leandro found a bug in tty_find_polling_driver and provided a
patch that fixed the crash but not the underlying bug. This fixes the
underlying bug where the list walk corrupts the values it is using on a
match but then reuses them if the open fails.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 66b99a2..6c81739 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -295,7 +295,7 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
 	struct tty_driver *p, *res = NULL;
 	int tty_line = 0;
 	int len;
-	char *str;
+	char *str, *stp;
 
 	for (str = name; *str; str++)
 		if ((*str >= '0' && *str <= '9') || *str == ',')
@@ -311,13 +311,14 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
 	list_for_each_entry(p, &tty_drivers, tty_drivers) {
 		if (strncmp(name, p->name, len) != 0)
 			continue;
-		if (*str == ',')
-			str++;
-		if (*str == '\0')
-			str = NULL;
+		stp = str;
+		if (*stp == ',')
+			stp++;
+		if (*stp == '\0')
+			stp = NULL;
 
 		if (tty_line >= 0 && tty_line <= p->num && p->ops &&
-		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, str)) {
+		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) {
 			res = tty_driver_kref_get(p);
 			*line = tty_line;
 			break;
-- 
cgit v0.10.2


From e8b70e7d3e86319a8b2aaabde3866833d92cd80f Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:48:02 +0100
Subject: tty: Extract various bits of ldisc code

Before trying to tackle the ldisc bugs the code needs to be a good deal
more readable, so do the simple extractions of routines first.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6c81739..be49d07 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2481,6 +2481,24 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int
 	return tty->ops->tiocmset(tty, file, set, clear);
 }
 
+struct tty_struct *tty_pair_get_tty(struct tty_struct *tty)
+{
+	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+	    tty->driver->subtype == PTY_TYPE_MASTER)
+		tty = tty->link;
+	return tty;
+}
+EXPORT_SYMBOL(tty_pair_get_tty);
+
+struct tty_struct *tty_pair_get_pty(struct tty_struct *tty)
+{
+	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+	    tty->driver->subtype == PTY_TYPE_MASTER)
+	    return tty;
+	return tty->link;
+}
+EXPORT_SYMBOL(tty_pair_get_pty);
+
 /*
  * Split this up, as gcc can choke on it otherwise..
  */
@@ -2496,11 +2514,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 
-	real_tty = tty;
-	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
-	    tty->driver->subtype == PTY_TYPE_MASTER)
-		real_tty = tty->link;
-
+	real_tty = tty_pair_get_tty(tty);
 
 	/*
 	 * Factor out some common prep work
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index f78f5b0..e3c6416 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -444,6 +444,50 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 }
 
 /**
+ *	tty_ldisc_halt		-	shutdown the line discipline
+ *	@tty: tty device
+ *
+ *	Shut down the line discipline and work queue for this tty device.
+ *	The TTY_LDISC flag being cleared ensures no further references can
+ *	be obtained while the delayed work queue halt ensures that no more
+ *	data is fed to the ldisc.
+ *
+ *	In order to wait for any existing references to complete see 
+ *	tty_ldisc_wait_idle.
+ */
+
+static void tty_ldisc_halt(struct tty_struct *tty)
+{
+	clear_bit(TTY_LDISC, &tty->flags);
+	cancel_delayed_work(&tty->buf.work);
+	/*
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate
+	 */
+	flush_scheduled_work();
+}
+
+/**
+ *	tty_ldisc_wait_idle	-	wait for the ldisc to become idle
+ *	@tty: tty to wait for
+ *
+ *	Wait for the line discipline to become idle. The discipline must
+ *	have been halted for this to guarantee it remains idle.
+ *
+ */
+
+static void tty_ldisc_wait_idle(struct tty_struct *tty)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&tty_ldisc_lock, flags);
+	while (tty->ldisc.refcount) {
+		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
+		spin_lock_irqsave(&tty_ldisc_lock, flags);
+	}
+	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+}
+
+/**
  *	tty_set_ldisc		-	set line discipline
  *	@tty: the terminal to set
  *	@ldisc: the line discipline
@@ -636,6 +680,21 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	return 0;
 }
 
+static void tty_ldisc_reinit(struct tty_struct *tty)
+{
+	struct tty_ldisc ld;
+
+	if (tty->ldisc.ops->close)
+		(tty->ldisc.ops->close)(tty);
+	tty_ldisc_put(tty->ldisc.ops);
+	/*
+	 *	Switch the line discipline back
+	 */
+	WARN_ON(tty_ldisc_get(N_TTY, &ld));
+	tty_ldisc_assign(tty, &ld);
+	tty_set_termios_ldisc(tty, N_TTY);
+}
+
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
@@ -647,58 +706,34 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 
 void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-	unsigned long flags;
-	struct tty_ldisc ld;
+
 	/*
 	 * Prevent flush_to_ldisc() from rescheduling the work for later.  Then
 	 * kill any delayed work. As this is the final close it does not
 	 * race with the set_ldisc code path.
 	 */
-	clear_bit(TTY_LDISC, &tty->flags);
-	cancel_delayed_work(&tty->buf.work);
 
-	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 */
-
-	flush_scheduled_work();
+	tty_ldisc_halt(tty);
 
 	/*
 	 * Wait for any short term users (we know they are just driver
 	 * side waiters as the file is closing so user count on the file
 	 * side is zero.
 	 */
-	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	while (tty->ldisc.refcount) {
-		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
-		spin_lock_irqsave(&tty_ldisc_lock, flags);
-	}
-	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+
+	tty_ldisc_wait_idle(tty);
+
 	/*
 	 * Shutdown the current line discipline, and reset it to N_TTY.
 	 *
 	 * FIXME: this MUST get fixed for the new reflocking
 	 */
-	if (tty->ldisc.ops->close)
-		(tty->ldisc.ops->close)(tty);
-	tty_ldisc_put(tty->ldisc.ops);
 
-	/*
-	 *	Switch the line discipline back
-	 */
-	WARN_ON(tty_ldisc_get(N_TTY, &ld));
-	tty_ldisc_assign(tty, &ld);
-	tty_set_termios_ldisc(tty, N_TTY);
+	tty_ldisc_reinit(tty);
 	if (o_tty) {
 		/* FIXME: could o_tty be in setldisc here ? */
 		clear_bit(TTY_LDISC, &o_tty->flags);
-		if (o_tty->ldisc.ops->close)
-			(o_tty->ldisc.ops->close)(o_tty);
-		tty_ldisc_put(o_tty->ldisc.ops);
-		WARN_ON(tty_ldisc_get(N_TTY, &ld));
-		tty_ldisc_assign(o_tty, &ld);
-		tty_set_termios_ldisc(o_tty, N_TTY);
+		tty_ldisc_reinit(o_tty);
 	}
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bed5a3d..f9c13c8 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -428,6 +428,9 @@ extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
 extern void tty_release_dev(struct file *filp);
 extern int tty_init_termios(struct tty_struct *tty);
 
+extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty);
+extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
+
 extern struct mutex tty_mutex;
 
 extern void tty_write_unlock(struct tty_struct *tty);
-- 
cgit v0.10.2


From c65c9bc3efa5589f691276bb9db689119a711222 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:50:12 +0100
Subject: tty: rewrite the ldisc locking

There are several pretty much unfixable races in the old ldisc code, especially
with respect to pty behaviour and also to hangup. It's easier to rewrite the
code than simply try and patch it up.

This patch
- splits the ldisc from the tty (so we will be able to refcount it more cleanly
  later)
- introduces a mutex lock for ldisc changing on an active device
- fixes the complete mess that hangup caused
- implements hopefully correct setldisc/close/hangup locking

There are still some problems around pty pairs that have always been there but
at least it is now possible to understand the code and fix further problems.

This fixes the following known bugs
- hang up can leak ldisc references
- hang up may not call open/close on ldisc in a matched way
- pty/tty pairs can deadlock during an ldisc change
- reading the ldisc proc files can cause every ldisc to be loaded

and probably a few other of the mysterious ldisc race reports.

I'm sure it also adds the odd new one.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index af761dc..6880151 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -277,8 +277,8 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 	/* FIXME: why is this needed. Note don't use ldisc_ref here as the
 	   open path is before the ldisc is referencable */
 
-	if (tty->ldisc.ops->flush_buffer)
-		tty->ldisc.ops->flush_buffer(tty);
+	if (tty->ldisc->ops->flush_buffer)
+		tty->ldisc->ops->flush_buffer(tty);
 	tty_driver_flush_buffer(tty);
 
 	return 0;
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 4560190..f3366d3 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -5200,7 +5200,7 @@ static int cyclades_proc_show(struct seq_file *m, void *v)
 					(cur_jifs - info->idle_stats.recv_idle)/
 					HZ, info->idle_stats.overruns,
 					/* FIXME: double check locking */
-					(long)info->port.tty->ldisc.ops->num);
+					(long)info->port.tty->ldisc->ops->num);
 			else
 				seq_printf(m, "%3d %8lu %10lu %8lu "
 					"%10lu %8lu %9lu %6ld\n",
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 710ee93..abef1f7 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -2114,8 +2114,8 @@ static int pc_ioctl(struct tty_struct *tty, struct file *file,
 			tty_wait_until_sent(tty, 0);
 		} else {
 			/* ldisc lock already held in ioctl */
-			if (tty->ldisc.ops->flush_buffer)
-				tty->ldisc.ops->flush_buffer(tty);
+			if (tty->ldisc->ops->flush_buffer)
+				tty->ldisc->ops->flush_buffer(tty);
 		}
 		unlock_kernel();
 		/* Fall Thru */
diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c
index 0061e18..0d10b89 100644
--- a/drivers/char/ip2/i2lib.c
+++ b/drivers/char/ip2/i2lib.c
@@ -868,11 +868,11 @@ i2Input(i2ChanStrPtr pCh)
 		amountToMove = count;
 	}
 	// Move the first block
-	pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+	pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
 		 &(pCh->Ibuf[stripIndex]), NULL, amountToMove );
 	// If we needed to wrap, do the second data move
 	if (count > amountToMove) {
-		pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+		pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
 		 pCh->Ibuf, NULL, count - amountToMove );
 	}
 	// Bump and wrap the stripIndex all at once by the amount of data read. This
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index afd9247..517271c 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -1315,8 +1315,8 @@ static inline void  isig(int sig, struct tty_struct *tty, int flush)
 	if (tty->pgrp)
 		kill_pgrp(tty->pgrp, sig, 1);
 	if (flush || !L_NOFLSH(tty)) {
-		if ( tty->ldisc.ops->flush_buffer )  
-			tty->ldisc.ops->flush_buffer(tty);
+		if ( tty->ldisc->ops->flush_buffer )  
+			tty->ldisc->ops->flush_buffer(tty);
 		i2InputFlush( tty->driver_data );
 	}
 }
diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index bacb3e2..461ece5 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -342,8 +342,8 @@ static int n_hdlc_tty_open (struct tty_struct *tty)
 #endif
 	
 	/* Flush any pending characters in the driver and discipline. */
-	if (tty->ldisc.ops->flush_buffer)
-		tty->ldisc.ops->flush_buffer(tty);
+	if (tty->ldisc->ops->flush_buffer)
+		tty->ldisc->ops->flush_buffer(tty);
 
 	tty_driver_flush_buffer(tty);
 		
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index da2cb8c..5acd29e 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -110,7 +110,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf,
 	c = to->receive_room;
 	if (c > count)
 		c = count;
-	to->ldisc.ops->receive_buf(to, buf, NULL, c);
+	to->ldisc->ops->receive_buf(to, buf, NULL, c);
 
 	return c;
 }
@@ -148,11 +148,11 @@ static int pty_chars_in_buffer(struct tty_struct *tty)
 	int count;
 
 	/* We should get the line discipline lock for "tty->link" */
-	if (!to || !to->ldisc.ops->chars_in_buffer)
+	if (!to || !to->ldisc->ops->chars_in_buffer)
 		return 0;
 
 	/* The ldisc must report 0 if no characters available to be read */
-	count = to->ldisc.ops->chars_in_buffer(to);
+	count = to->ldisc->ops->chars_in_buffer(to);
 
 	if (tty->driver->subtype == PTY_TYPE_SLAVE)
 		return count;
@@ -186,8 +186,8 @@ static void pty_flush_buffer(struct tty_struct *tty)
 	if (!to)
 		return;
 
-	if (to->ldisc.ops->flush_buffer)
-		to->ldisc.ops->flush_buffer(to);
+	if (to->ldisc->ops->flush_buffer)
+		to->ldisc->ops->flush_buffer(to);
 
 	if (to->packet) {
 		spin_lock_irqsave(&tty->ctrl_lock, flags);
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index cb8ca56..f97b9e8 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -327,7 +327,7 @@ int paste_selection(struct tty_struct *tty)
 		}
 		count = sel_buffer_lth - pasted;
 		count = min(count, tty->receive_room);
-		tty->ldisc.ops->receive_buf(tty, sel_buffer + pasted,
+		tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted,
 								NULL, count);
 		pasted += count;
 	}
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index be49d07..2f44b0b 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -492,22 +492,6 @@ void tty_ldisc_flush(struct tty_struct *tty)
 EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 
 /**
- *	tty_reset_termios	-	reset terminal state
- *	@tty: tty to reset
- *
- *	Restore a terminal to the driver default state
- */
-
-static void tty_reset_termios(struct tty_struct *tty)
-{
-	mutex_lock(&tty->termios_mutex);
-	*tty->termios = tty->driver->init_termios;
-	tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
-	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
-	mutex_unlock(&tty->termios_mutex);
-}
-
-/**
  *	do_tty_hangup		-	actual handler for hangup events
  *	@work: tty device
  *
@@ -536,7 +520,6 @@ static void do_tty_hangup(struct work_struct *work)
 	struct file *cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
-	struct tty_ldisc *ld;
 	int    closecount = 0, n;
 	unsigned long flags;
 	int refs = 0;
@@ -567,40 +550,8 @@ static void do_tty_hangup(struct work_struct *work)
 		filp->f_op = &hung_up_tty_fops;
 	}
 	file_list_unlock();
-	/*
-	 * FIXME! What are the locking issues here? This may me overdoing
-	 * things... This question is especially important now that we've
-	 * removed the irqlock.
-	 */
-	ld = tty_ldisc_ref(tty);
-	if (ld != NULL) {
-		/* We may have no line discipline at this point */
-		if (ld->ops->flush_buffer)
-			ld->ops->flush_buffer(tty);
-		tty_driver_flush_buffer(tty);
-		if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
-		    ld->ops->write_wakeup)
-			ld->ops->write_wakeup(tty);
-		if (ld->ops->hangup)
-			ld->ops->hangup(tty);
-	}
-	/*
-	 * FIXME: Once we trust the LDISC code better we can wait here for
-	 * ldisc completion and fix the driver call race
-	 */
-	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
-	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
-	/*
-	 * Shutdown the current line discipline, and reset it to
-	 * N_TTY.
-	 */
-	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS)
-		tty_reset_termios(tty);
-	/* Defer ldisc switch */
-	/* tty_deferred_ldisc_switch(N_TTY);
 
-	  This should get done automatically when the port closes and
-	  tty_release is called */
+	tty_ldisc_hangup(tty);
 
 	read_lock(&tasklist_lock);
 	if (tty->session) {
@@ -629,12 +580,15 @@ static void do_tty_hangup(struct work_struct *work)
 	read_unlock(&tasklist_lock);
 
 	spin_lock_irqsave(&tty->ctrl_lock, flags);
-	tty->flags = 0;
+	clear_bit(TTY_THROTTLED, &tty->flags);
+	clear_bit(TTY_PUSH, &tty->flags);
+	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
 	put_pid(tty->session);
 	put_pid(tty->pgrp);
 	tty->session = NULL;
 	tty->pgrp = NULL;
 	tty->ctrl_status = 0;
+	set_bit(TTY_HUPPED, &tty->flags);
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 
 	/* Account for the p->signal references we killed */
@@ -660,10 +614,7 @@ static void do_tty_hangup(struct work_struct *work)
 	 * can't yet guarantee all that.
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
-	if (ld) {
-		tty_ldisc_enable(tty);
-		tty_ldisc_deref(ld);
-	}
+	tty_ldisc_enable(tty);
 	unlock_kernel();
 	if (f)
 		fput(f);
@@ -2570,7 +2521,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGSID:
 		return tiocgsid(tty, real_tty, p);
 	case TIOCGETD:
-		return put_user(tty->ldisc.ops->num, (int __user *)p);
+		return put_user(tty->ldisc->ops->num, (int __user *)p);
 	case TIOCSETD:
 		return tiocsetd(tty, p);
 	/*
@@ -2785,6 +2736,7 @@ void initialize_tty_struct(struct tty_struct *tty,
 	tty->buf.head = tty->buf.tail = NULL;
 	tty_buffer_init(tty);
 	mutex_init(&tty->termios_mutex);
+	mutex_init(&tty->ldisc_mutex);
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index e3c6416..a58a19a 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -115,19 +115,22 @@ EXPORT_SYMBOL(tty_unregister_ldisc);
 /**
  *	tty_ldisc_try_get	-	try and reference an ldisc
  *	@disc: ldisc number
- *	@ld: tty ldisc structure to complete
  *
  *	Attempt to open and lock a line discipline into place. Return
- *	the line discipline refcounted and assigned in ld. On an error
- *	report the error code back
+ *	the line discipline refcounted or an error.
  */
 
-static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_try_get(int disc)
 {
 	unsigned long flags;
+	struct tty_ldisc *ld;
 	struct tty_ldisc_ops *ldops;
 	int err = -EINVAL;
 	
+	ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL);
+	if (ld == NULL)
+		return ERR_PTR(-ENOMEM);
+
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
 	ld->ops = NULL;
 	ldops = tty_ldiscs[disc];
@@ -140,17 +143,19 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
 			/* lock it */
 			ldops->refcount++;
 			ld->ops = ldops;
+			ld->refcount = 0;
 			err = 0;
 		}
 	}
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-	return err;
+	if (err)
+		return ERR_PTR(err);
+	return ld;
 }
 
 /**
  *	tty_ldisc_get		-	take a reference to an ldisc
  *	@disc: ldisc number
- *	@ld: tty line discipline structure to use
  *
  *	Takes a reference to a line discipline. Deals with refcounts and
  *	module locking counts. Returns NULL if the discipline is not available.
@@ -161,44 +166,46 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
  *		takes tty_ldisc_lock to guard against ldisc races
  */
 
-static int tty_ldisc_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_get(int disc)
 {
-	int err;
+	struct tty_ldisc *ld;
 
 	if (disc < N_TTY || disc >= NR_LDISCS)
-		return -EINVAL;
-	err = tty_ldisc_try_get(disc, ld);
-	if (err < 0) {
+		return ERR_PTR(-EINVAL);
+	ld = tty_ldisc_try_get(disc);
+	if (IS_ERR(ld)) {
 		request_module("tty-ldisc-%d", disc);
-		err = tty_ldisc_try_get(disc, ld);
+		ld = tty_ldisc_try_get(disc);
 	}
-	return err;
+	return ld;
 }
 
 /**
  *	tty_ldisc_put		-	drop ldisc reference
- *	@disc: ldisc number
+ *	@ld: ldisc
  *
  *	Drop a reference to a line discipline. Manage refcounts and
- *	module usage counts
+ *	module usage counts. Free the ldisc once the recount hits zero.
  *
  *	Locking:
  *		takes tty_ldisc_lock to guard against ldisc races
  */
 
-static void tty_ldisc_put(struct tty_ldisc_ops *ld)
+static void tty_ldisc_put(struct tty_ldisc *ld)
 {
 	unsigned long flags;
-	int disc = ld->num;
+	int disc = ld->ops->num;
+	struct tty_ldisc_ops *ldo;
 
 	BUG_ON(disc < N_TTY || disc >= NR_LDISCS);
 
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	ld = tty_ldiscs[disc];
-	BUG_ON(ld->refcount == 0);
-	ld->refcount--;
-	module_put(ld->owner);
+	ldo = tty_ldiscs[disc];
+	BUG_ON(ldo->refcount == 0);
+	ldo->refcount--;
+	module_put(ldo->owner);
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+	kfree(ld);
 }
 
 static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
@@ -219,12 +226,13 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v)
 static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 {
 	int i = *(loff_t *)v;
-	struct tty_ldisc ld;
+	struct tty_ldisc *ld;
 	
-	if (tty_ldisc_get(i, &ld) < 0)
+	ld = tty_ldisc_try_get(i);
+	if (IS_ERR(ld))
 		return 0;
-	seq_printf(m, "%-10s %2d\n", ld.ops->name ? ld.ops->name : "???", i);
-	tty_ldisc_put(ld.ops);
+	seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i);
+	tty_ldisc_put(ld);
 	return 0;
 }
 
@@ -263,8 +271,7 @@ const struct file_operations tty_ldiscs_proc_fops = {
 
 static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
 {
-	ld->refcount = 0;
-	tty->ldisc = *ld;
+	tty->ldisc = ld;
 }
 
 /**
@@ -286,7 +293,7 @@ static int tty_ldisc_try(struct tty_struct *tty)
 	int ret = 0;
 
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	ld = &tty->ldisc;
+	ld = tty->ldisc;
 	if (test_bit(TTY_LDISC, &tty->flags)) {
 		ld->refcount++;
 		ret = 1;
@@ -315,8 +322,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 {
 	/* wait_event is a macro */
 	wait_event(tty_ldisc_wait, tty_ldisc_try(tty));
-	WARN_ON(tty->ldisc.refcount == 0);
-	return &tty->ldisc;
+	WARN_ON(tty->ldisc->refcount == 0);
+	return tty->ldisc;
 }
 
 EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
@@ -335,7 +342,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
 {
 	if (tty_ldisc_try(tty))
-		return &tty->ldisc;
+		return tty->ldisc;
 	return NULL;
 }
 
@@ -407,6 +414,39 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
 	mutex_unlock(&tty->termios_mutex);
 }
 
+/**
+ *	tty_ldisc_open		-	open a line discipline
+ *	@tty: tty we are opening the ldisc on
+ *	@ld: discipline to open
+ *
+ *	A helper opening method. Also a convenient debugging and check
+ *	point.
+ */
+
+static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+	WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags));
+	if (ld->ops->open)
+		return ld->ops->open(tty);
+	return 0;
+}
+
+/**
+ *	tty_ldisc_close		-	close a line discipline
+ *	@tty: tty we are opening the ldisc on
+ *	@ld: discipline to close
+ *
+ *	A helper close method. Also a convenient debugging and check
+ *	point.
+ */
+
+static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+	WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags));
+	clear_bit(TTY_LDISC_OPEN, &tty->flags);
+	if (ld->ops->close)
+		ld->ops->close(tty);
+}
 
 /**
  *	tty_ldisc_restore	-	helper for tty ldisc change
@@ -420,31 +460,32 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
 static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 {
 	char buf[64];
-	struct tty_ldisc new_ldisc;
+	struct tty_ldisc *new_ldisc;
+	int r;
 
 	/* There is an outstanding reference here so this is safe */
-	tty_ldisc_get(old->ops->num, old);
+	old = tty_ldisc_get(old->ops->num);
+	WARN_ON(IS_ERR(old));
 	tty_ldisc_assign(tty, old);
 	tty_set_termios_ldisc(tty, old->ops->num);
-	if (old->ops->open && (old->ops->open(tty) < 0)) {
-		tty_ldisc_put(old->ops);
+	if (tty_ldisc_open(tty, old) < 0) {
+		tty_ldisc_put(old);
 		/* This driver is always present */
-		if (tty_ldisc_get(N_TTY, &new_ldisc) < 0)
+		new_ldisc =tty_ldisc_get(N_TTY);
+		if (IS_ERR(new_ldisc))
 			panic("n_tty: get");
-		tty_ldisc_assign(tty, &new_ldisc);
+		tty_ldisc_assign(tty, new_ldisc);
 		tty_set_termios_ldisc(tty, N_TTY);
-		if (new_ldisc.ops->open) {
-			int r = new_ldisc.ops->open(tty);
-				if (r < 0)
-				panic("Couldn't open N_TTY ldisc for "
-				      "%s --- error %d.",
-				      tty_name(tty, buf), r);
-		}
+		r = tty_ldisc_open(tty, new_ldisc);
+		if (r < 0)
+			panic("Couldn't open N_TTY ldisc for "
+			      "%s --- error %d.",
+			      tty_name(tty, buf), r);
 	}
 }
 
 /**
- *	tty_ldisc_halt		-	shutdown the line discipline
+ *	tty_ldisc_halt		-	shut down the line discipline
  *	@tty: tty device
  *
  *	Shut down the line discipline and work queue for this tty device.
@@ -456,14 +497,10 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
  *	tty_ldisc_wait_idle.
  */
 
-static void tty_ldisc_halt(struct tty_struct *tty)
+static int tty_ldisc_halt(struct tty_struct *tty)
 {
 	clear_bit(TTY_LDISC, &tty->flags);
-	cancel_delayed_work(&tty->buf.work);
-	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 */
-	flush_scheduled_work();
+	return cancel_delayed_work(&tty->buf.work);
 }
 
 /**
@@ -473,18 +510,22 @@ static void tty_ldisc_halt(struct tty_struct *tty)
  *	Wait for the line discipline to become idle. The discipline must
  *	have been halted for this to guarantee it remains idle.
  *
+ *	tty_ldisc_lock protects the ref counts currently.
  */
 
-static void tty_ldisc_wait_idle(struct tty_struct *tty)
+static int tty_ldisc_wait_idle(struct tty_struct *tty)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	while (tty->ldisc.refcount) {
+	while (tty->ldisc->refcount) {
 		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
+		if (wait_event_timeout(tty_ldisc_wait,
+				tty->ldisc->refcount == 0, 5 * HZ) == 0)
+			return -EBUSY;
 		spin_lock_irqsave(&tty_ldisc_lock, flags);
 	}
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+	return 0;
 }
 
 /**
@@ -493,39 +534,64 @@ static void tty_ldisc_wait_idle(struct tty_struct *tty)
  *	@ldisc: the line discipline
  *
  *	Set the discipline of a tty line. Must be called from a process
- *	context.
+ *	context. The ldisc change logic has to protect itself against any
+ *	overlapping ldisc change (including on the other end of pty pairs),
+ *	the close of one side of a tty/pty pair, and eventually hangup.
  *
- *	Locking: takes tty_ldisc_lock.
- *		 called functions take termios_mutex
+ *	Locking: takes tty_ldisc_lock, termios_mutex
  */
 
 int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 {
 	int retval;
-	struct tty_ldisc o_ldisc, new_ldisc;
-	int work;
-	unsigned long flags;
+	struct tty_ldisc *o_ldisc, *new_ldisc;
+	int work, o_work = 0;
 	struct tty_struct *o_tty;
 
-restart:
-	/* This is a bit ugly for now but means we can break the 'ldisc
-	   is part of the tty struct' assumption later */
-	retval = tty_ldisc_get(ldisc, &new_ldisc);
-	if (retval)
-		return retval;
+	new_ldisc = tty_ldisc_get(ldisc);
+	if (IS_ERR(new_ldisc))
+		return PTR_ERR(new_ldisc);
 
 	/*
-	 *	Problem: What do we do if this blocks ?
+	 *	We need to look at the tty locking here for pty/tty pairs
+	 *	when both sides try to change in parallel.
 	 */
 
-	tty_wait_until_sent(tty, 0);
+	o_tty = tty->link;	/* o_tty is the pty side or NULL */
+
 
-	if (tty->ldisc.ops->num == ldisc) {
-		tty_ldisc_put(new_ldisc.ops);
+	/*
+	 *	Check the no-op case
+	 */
+
+	if (tty->ldisc->ops->num == ldisc) {
+		tty_ldisc_put(new_ldisc);
 		return 0;
 	}
 
 	/*
+	 *	Problem: What do we do if this blocks ?
+	 *	We could deadlock here
+	 */
+
+	tty_wait_until_sent(tty, 0);
+
+	mutex_lock(&tty->ldisc_mutex);
+
+	/*
+	 *	We could be midstream of another ldisc change which has
+	 *	dropped the lock during processing. If so we need to wait.
+	 */
+
+	while (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
+		mutex_unlock(&tty->ldisc_mutex);
+		wait_event(tty_ldisc_wait,
+			test_bit(TTY_LDISC_CHANGING, &tty->flags) == 0);
+		mutex_lock(&tty->ldisc_mutex);
+	}
+	set_bit(TTY_LDISC_CHANGING, &tty->flags);
+		
+	/*
 	 *	No more input please, we are switching. The new ldisc
 	 *	will update this value in the ldisc open function
 	 */
@@ -533,8 +599,6 @@ restart:
 	tty->receive_room = 0;
 
 	o_ldisc = tty->ldisc;
-	o_tty = tty->link;
-
 	/*
 	 *	Make sure we don't change while someone holds a
 	 *	reference to the line discipline. The TTY_LDISC bit
@@ -545,108 +609,181 @@ restart:
 	 *	with a userspace app continually trying to use the tty in
 	 *	parallel to the change and re-referencing the tty.
 	 */
-	clear_bit(TTY_LDISC, &tty->flags);
-	if (o_tty)
-		clear_bit(TTY_LDISC, &o_tty->flags);
 
-	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	if (tty->ldisc.refcount || (o_tty && o_tty->ldisc.refcount)) {
-		if (tty->ldisc.refcount) {
-			/* Free the new ldisc we grabbed. Must drop the lock
-			   first. */
-			spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-			tty_ldisc_put(o_ldisc.ops);
-			/*
-			 * There are several reasons we may be busy, including
-			 * random momentary I/O traffic. We must therefore
-			 * retry. We could distinguish between blocking ops
-			 * and retries if we made tty_ldisc_wait() smarter.
-			 * That is up for discussion.
-			 */
-			if (wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0)
-				return -ERESTARTSYS;
-			goto restart;
-		}
-		if (o_tty && o_tty->ldisc.refcount) {
-			spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-			tty_ldisc_put(o_tty->ldisc.ops);
-			if (wait_event_interruptible(tty_ldisc_wait, o_tty->ldisc.refcount == 0) < 0)
-				return -ERESTARTSYS;
-			goto restart;
-		}
-	}
-	/*
-	 *	If the TTY_LDISC bit is set, then we are racing against
-	 *	another ldisc change
-	 */
-	if (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
-		struct tty_ldisc *ld;
-		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		tty_ldisc_put(new_ldisc.ops);
-		ld = tty_ldisc_ref_wait(tty);
-		tty_ldisc_deref(ld);
-		goto restart;
-	}
-	/*
-	 *	This flag is used to avoid two parallel ldisc changes. Once
-	 *	open and close are fine grained locked this may work better
-	 *	as a mutex shared with the open/close/hup paths
-	 */
-	set_bit(TTY_LDISC_CHANGING, &tty->flags);
+	work = tty_ldisc_halt(tty);
 	if (o_tty)
-		set_bit(TTY_LDISC_CHANGING, &o_tty->flags);
-	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-	
-	/*
-	 *	From this point on we know nobody has an ldisc
-	 *	usage reference, nor can they obtain one until
-	 *	we say so later on.
-	 */
+		o_work = tty_ldisc_halt(o_tty);
 
-	work = cancel_delayed_work(&tty->buf.work);
 	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 * MUST NOT hold locks here.
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate.
+	 * We must drop the mutex here in case a hangup is also in process.
 	 */
+
+	mutex_unlock(&tty->ldisc_mutex);
+
 	flush_scheduled_work();
+
+	/* Let any existing reference holders finish */
+	retval = tty_ldisc_wait_idle(tty);
+	if (retval < 0) {
+		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+		tty_ldisc_put(new_ldisc);
+		return retval;
+	}
+
+	mutex_lock(&tty->ldisc_mutex);
+	if (test_bit(TTY_HUPPED, &tty->flags)) {
+		/* We were raced by the hangup method. It will have stomped
+		   the ldisc data and closed the ldisc down */
+		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+		mutex_unlock(&tty->ldisc_mutex);
+		tty_ldisc_put(new_ldisc);
+		return -EIO;
+	}
+
 	/* Shutdown the current discipline. */
-	if (o_ldisc.ops->close)
-		(o_ldisc.ops->close)(tty);
+	tty_ldisc_close(tty, o_ldisc);
 
 	/* Now set up the new line discipline. */
-	tty_ldisc_assign(tty, &new_ldisc);
+	tty_ldisc_assign(tty, new_ldisc);
 	tty_set_termios_ldisc(tty, ldisc);
-	if (new_ldisc.ops->open)
-		retval = (new_ldisc.ops->open)(tty);
+
+	retval = tty_ldisc_open(tty, new_ldisc);
 	if (retval < 0) {
-		tty_ldisc_put(new_ldisc.ops);
-		tty_ldisc_restore(tty, &o_ldisc);
+		/* Back to the old one or N_TTY if we can't */
+		tty_ldisc_put(new_ldisc);
+		tty_ldisc_restore(tty, o_ldisc);
 	}
+
 	/* At this point we hold a reference to the new ldisc and a
 	   a reference to the old ldisc. If we ended up flipping back
 	   to the existing ldisc we have two references to it */
 
-	if (tty->ldisc.ops->num != o_ldisc.ops->num && tty->ops->set_ldisc)
+	if (tty->ldisc->ops->num != o_ldisc->ops->num && tty->ops->set_ldisc)
 		tty->ops->set_ldisc(tty);
 
-	tty_ldisc_put(o_ldisc.ops);
+	tty_ldisc_put(o_ldisc);
 
 	/*
-	 *	Allow ldisc referencing to occur as soon as the driver
-	 *	ldisc callback completes.
+	 *	Allow ldisc referencing to occur again
 	 */
 
 	tty_ldisc_enable(tty);
 	if (o_tty)
 		tty_ldisc_enable(o_tty);
 
-	/* Restart it in case no characters kick it off. Safe if
+	/* Restart the work queue in case no characters kick it off. Safe if
 	   already running */
 	if (work)
 		schedule_delayed_work(&tty->buf.work, 1);
+	if (o_work)
+		schedule_delayed_work(&o_tty->buf.work, 1);
+	mutex_unlock(&tty->ldisc_mutex);
 	return retval;
 }
 
+/**
+ *	tty_reset_termios	-	reset terminal state
+ *	@tty: tty to reset
+ *
+ *	Restore a terminal to the driver default state.
+ */
+
+static void tty_reset_termios(struct tty_struct *tty)
+{
+	mutex_lock(&tty->termios_mutex);
+	*tty->termios = tty->driver->init_termios;
+	tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
+	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
+	mutex_unlock(&tty->termios_mutex);
+}
+
+
+/**
+ *	tty_ldisc_reinit	-	reinitialise the tty ldisc
+ *	@tty: tty to reinit
+ *
+ *	Switch the tty back to N_TTY line discipline and leave the
+ *	ldisc state closed
+ */
+
+static void tty_ldisc_reinit(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld;
+
+	tty_ldisc_close(tty, tty->ldisc);
+	tty_ldisc_put(tty->ldisc);
+	tty->ldisc = NULL;
+	/*
+	 *	Switch the line discipline back
+	 */
+	ld = tty_ldisc_get(N_TTY);
+	BUG_ON(IS_ERR(ld));
+	tty_ldisc_assign(tty, ld);
+	tty_set_termios_ldisc(tty, N_TTY);
+}
+
+/**
+ *	tty_ldisc_hangup		-	hangup ldisc reset
+ *	@tty: tty being hung up
+ *
+ *	Some tty devices reset their termios when they receive a hangup
+ *	event. In that situation we must also switch back to N_TTY properly
+ *	before we reset the termios data.
+ *
+ *	Locking: We can take the ldisc mutex as the rest of the code is
+ *	careful to allow for this.
+ *
+ *	In the pty pair case this occurs in the close() path of the
+ *	tty itself so we must be careful about locking rules.
+ */
+
+void tty_ldisc_hangup(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld;
+
+	/*
+	 * FIXME! What are the locking issues here? This may me overdoing
+	 * things... This question is especially important now that we've
+	 * removed the irqlock.
+	 */
+	ld = tty_ldisc_ref(tty);
+	if (ld != NULL) {
+		/* We may have no line discipline at this point */
+		if (ld->ops->flush_buffer)
+			ld->ops->flush_buffer(tty);
+		tty_driver_flush_buffer(tty);
+		if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
+		    ld->ops->write_wakeup)
+			ld->ops->write_wakeup(tty);
+		if (ld->ops->hangup)
+			ld->ops->hangup(tty);
+		tty_ldisc_deref(ld);
+	}
+	/*
+	 * FIXME: Once we trust the LDISC code better we can wait here for
+	 * ldisc completion and fix the driver call race
+	 */
+	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+	/*
+	 * Shutdown the current line discipline, and reset it to
+	 * N_TTY.
+	 */
+	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) {
+		/* Avoid racing set_ldisc */
+		mutex_lock(&tty->ldisc_mutex);
+		/* Switch back to N_TTY */
+		tty_ldisc_reinit(tty);
+		/* At this point we have a closed ldisc and we want to
+		   reopen it. We could defer this to the next open but
+		   it means auditing a lot of other paths so this is a FIXME */
+		WARN_ON(tty_ldisc_open(tty, tty->ldisc));
+		tty_ldisc_enable(tty);
+		mutex_unlock(&tty->ldisc_mutex);
+		tty_reset_termios(tty);
+	}
+}
 
 /**
  *	tty_ldisc_setup			-	open line discipline
@@ -654,24 +791,23 @@ restart:
  *	@o_tty: pair tty for pty/tty pairs
  *
  *	Called during the initial open of a tty/pty pair in order to set up the
- *	line discplines and bind them to the tty.
+ *	line disciplines and bind them to the tty. This has no locking issues
+ *	as the device isn't yet active.
  */
 
 int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-	struct tty_ldisc *ld = &tty->ldisc;
+	struct tty_ldisc *ld = tty->ldisc;
 	int retval;
 
-	if (ld->ops->open) {
-		retval = (ld->ops->open)(tty);
-		if (retval)
-			return retval;
-	}
-	if (o_tty && o_tty->ldisc.ops->open) {
-		retval = (o_tty->ldisc.ops->open)(o_tty);
+	retval = tty_ldisc_open(tty, ld);
+	if (retval)
+		return retval;
+
+	if (o_tty) {
+		retval = tty_ldisc_open(o_tty, o_tty->ldisc);
 		if (retval) {
-			if (ld->ops->close)
-				(ld->ops->close)(tty);
+			tty_ldisc_close(tty, ld);
 			return retval;
 		}
 		tty_ldisc_enable(o_tty);
@@ -679,34 +815,18 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	tty_ldisc_enable(tty);
 	return 0;
 }
-
-static void tty_ldisc_reinit(struct tty_struct *tty)
-{
-	struct tty_ldisc ld;
-
-	if (tty->ldisc.ops->close)
-		(tty->ldisc.ops->close)(tty);
-	tty_ldisc_put(tty->ldisc.ops);
-	/*
-	 *	Switch the line discipline back
-	 */
-	WARN_ON(tty_ldisc_get(N_TTY, &ld));
-	tty_ldisc_assign(tty, &ld);
-	tty_set_termios_ldisc(tty, N_TTY);
-}
-
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
  *	@o_tty: pair tty for pty/tty pairs
  *
  *	Called during the final close of a tty/pty pair in order to shut down the
- *	line discpline layer.
+ *	line discpline layer. On exit the ldisc assigned is N_TTY and the
+ *	ldisc has not been opened.
  */
 
 void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-
 	/*
 	 * Prevent flush_to_ldisc() from rescheduling the work for later.  Then
 	 * kill any delayed work. As this is the final close it does not
@@ -714,6 +834,7 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 */
 
 	tty_ldisc_halt(tty);
+	flush_scheduled_work();
 
 	/*
 	 * Wait for any short term users (we know they are just driver
@@ -730,11 +851,9 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 */
 
 	tty_ldisc_reinit(tty);
-	if (o_tty) {
-		/* FIXME: could o_tty be in setldisc here ? */
-		clear_bit(TTY_LDISC, &o_tty->flags);
-		tty_ldisc_reinit(o_tty);
-	}
+	/* This will need doing differently if we need to lock */
+	if (o_tty)
+		tty_ldisc_release(o_tty, NULL);
 }
 
 /**
@@ -747,10 +866,10 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 
 void tty_ldisc_init(struct tty_struct *tty)
 {
-	struct tty_ldisc ld;
-	if (tty_ldisc_get(N_TTY, &ld) < 0)
+	struct tty_ldisc *ld = tty_ldisc_get(N_TTY);
+	if (IS_ERR(ld))
 		panic("n_tty: init_tty");
-	tty_ldisc_assign(tty, &ld);
+	tty_ldisc_assign(tty, ld);
 }
 
 void tty_ldisc_begin(void)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f9c13c8..1488d8c 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -226,8 +226,11 @@ struct tty_struct {
 	struct tty_driver *driver;
 	const struct tty_operations *ops;
 	int index;
-	/* The ldisc objects are protected by tty_ldisc_lock at the moment */
-	struct tty_ldisc ldisc;
+
+	/* Protects ldisc changes: Lock tty not pty */
+	struct mutex ldisc_mutex;
+	struct tty_ldisc *ldisc;
+
 	struct mutex termios_mutex;
 	spinlock_t ctrl_lock;
 	/* Termios values are protected by the termios mutex */
@@ -314,6 +317,7 @@ struct tty_struct {
 #define TTY_CLOSING 		7	/* ->close() in progress */
 #define TTY_LDISC 		9	/* Line discipline attached */
 #define TTY_LDISC_CHANGING 	10	/* Line discipline changing */
+#define TTY_LDISC_OPEN	 	11	/* Line discipline is open */
 #define TTY_HW_COOK_OUT 	14	/* Hardware can do output cooking */
 #define TTY_HW_COOK_IN 		15	/* Hardware can do input cooking */
 #define TTY_PTY_LOCK 		16	/* pty private */
@@ -406,6 +410,7 @@ extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b);
 extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
 extern void tty_ldisc_deref(struct tty_ldisc *);
 extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
+extern void tty_ldisc_hangup(struct tty_struct *tty);
 extern const struct file_operations tty_ldiscs_proc_fops;
 
 extern void tty_wakeup(struct tty_struct *tty);
-- 
cgit v0.10.2


From f2c4c65c8350d885d74dcdea7061dcecc1223c36 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:50:58 +0100
Subject: tty: Move ldisc_flush

We have a tty_ldisc file now so put tty_ldisc_flush in the right place

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 2f44b0b..939e198 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -471,27 +471,6 @@ void tty_wakeup(struct tty_struct *tty)
 EXPORT_SYMBOL_GPL(tty_wakeup);
 
 /**
- *	tty_ldisc_flush	-	flush line discipline queue
- *	@tty: tty
- *
- *	Flush the line discipline queue (if any) for this tty. If there
- *	is no line discipline active this is a no-op.
- */
-
-void tty_ldisc_flush(struct tty_struct *tty)
-{
-	struct tty_ldisc *ld = tty_ldisc_ref(tty);
-	if (ld) {
-		if (ld->ops->flush_buffer)
-			ld->ops->flush_buffer(tty);
-		tty_ldisc_deref(ld);
-	}
-	tty_buffer_flush(tty);
-}
-
-EXPORT_SYMBOL_GPL(tty_ldisc_flush);
-
-/**
  *	do_tty_hangup		-	actual handler for hangup events
  *	@work: tty device
  *
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index a58a19a..01c70c0 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -396,6 +396,27 @@ void tty_ldisc_enable(struct tty_struct *tty)
 }
 
 /**
+ *	tty_ldisc_flush	-	flush line discipline queue
+ *	@tty: tty
+ *
+ *	Flush the line discipline queue (if any) for this tty. If there
+ *	is no line discipline active this is a no-op.
+ */
+
+void tty_ldisc_flush(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld = tty_ldisc_ref(tty);
+	if (ld) {
+		if (ld->ops->flush_buffer)
+			ld->ops->flush_buffer(tty);
+		tty_ldisc_deref(ld);
+	}
+	tty_buffer_flush(tty);
+}
+
+EXPORT_SYMBOL_GPL(tty_ldisc_flush);
+
+/**
  *	tty_set_termios_ldisc		-	set ldisc field
  *	@tty: tty structure
  *	@num: line discipline number
-- 
cgit v0.10.2


From 852e99d22f2231d232c45216b027565e3bae7add Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:51:41 +0100
Subject: tty: bring ldisc into CodingStyle

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index 01c70c0..39c8f86 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -126,7 +126,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc)
 	struct tty_ldisc *ld;
 	struct tty_ldisc_ops *ldops;
 	int err = -EINVAL;
-	
+
 	ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL);
 	if (ld == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -208,12 +208,12 @@ static void tty_ldisc_put(struct tty_ldisc *ld)
 	kfree(ld);
 }
 
-static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
+static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
 {
 	return (*pos < NR_LDISCS) ? pos : NULL;
 }
 
-static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static void *tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
 	return (*pos < NR_LDISCS) ? pos : NULL;
@@ -227,7 +227,7 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 {
 	int i = *(loff_t *)v;
 	struct tty_ldisc *ld;
-	
+
 	ld = tty_ldisc_try_get(i);
 	if (IS_ERR(ld))
 		return 0;
@@ -325,7 +325,6 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 	WARN_ON(tty->ldisc->refcount == 0);
 	return tty->ldisc;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 
 /**
@@ -345,7 +344,6 @@ struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
 		return tty->ldisc;
 	return NULL;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref);
 
 /**
@@ -373,7 +371,6 @@ void tty_ldisc_deref(struct tty_ldisc *ld)
 		wake_up(&tty_ldisc_wait);
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_deref);
 
 /**
@@ -413,7 +410,6 @@ void tty_ldisc_flush(struct tty_struct *tty)
 	}
 	tty_buffer_flush(tty);
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 
 /**
@@ -492,7 +488,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 	if (tty_ldisc_open(tty, old) < 0) {
 		tty_ldisc_put(old);
 		/* This driver is always present */
-		new_ldisc =tty_ldisc_get(N_TTY);
+		new_ldisc = tty_ldisc_get(N_TTY);
 		if (IS_ERR(new_ldisc))
 			panic("n_tty: get");
 		tty_ldisc_assign(tty, new_ldisc);
@@ -514,7 +510,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
  *	be obtained while the delayed work queue halt ensures that no more
  *	data is fed to the ldisc.
  *
- *	In order to wait for any existing references to complete see 
+ *	In order to wait for any existing references to complete see
  *	tty_ldisc_wait_idle.
  */
 
@@ -611,7 +607,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 		mutex_lock(&tty->ldisc_mutex);
 	}
 	set_bit(TTY_LDISC_CHANGING, &tty->flags);
-		
+
 	/*
 	 *	No more input please, we are switching. The new ldisc
 	 *	will update this value in the ldisc open function
@@ -841,8 +837,8 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
  *	@tty: tty being shut down
  *	@o_tty: pair tty for pty/tty pairs
  *
- *	Called during the final close of a tty/pty pair in order to shut down the
- *	line discpline layer. On exit the ldisc assigned is N_TTY and the
+ *	Called during the final close of a tty/pty pair in order to shut down
+ *	the line discpline layer. On exit the ldisc assigned is N_TTY and the
  *	ldisc has not been opened.
  */
 
-- 
cgit v0.10.2


From 5fcf62b0f1f24ee25931636216f28bc87448a60f Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:52:26 +0100
Subject: tty: iuu_phoenix: fix locking.

Bring in the relevant bits of the 0.9 vendor driver.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index bb572ce..637d0dd 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -650,32 +650,33 @@ static int iuu_bulk_write(struct usb_serial_port *port)
 	unsigned long flags;
 	int result;
 	int i;
+	int buf_len;
 	char *buf_ptr = port->write_urb->transfer_buffer;
 	dbg("%s - enter", __func__);
 
+	spin_lock_irqsave(&priv->lock, flags);
 	*buf_ptr++ = IUU_UART_ESC;
 	*buf_ptr++ = IUU_UART_TX;
 	*buf_ptr++ = priv->writelen;
 
-	memcpy(buf_ptr, priv->writebuf,
-	       priv->writelen);
+	memcpy(buf_ptr, priv->writebuf, priv->writelen);
+	buf_len = priv->writelen;
+	priv->writelen = 0;
+	spin_unlock_irqrestore(&priv->lock, flags);
 	if (debug == 1) {
-		for (i = 0; i < priv->writelen; i++)
+		for (i = 0; i < buf_len; i++)
 			sprintf(priv->dbgbuf + i*2 ,
 				"%02X", priv->writebuf[i]);
-		priv->dbgbuf[priv->writelen+i*2] = 0;
+		priv->dbgbuf[buf_len+i*2] = 0;
 		dbg("%s - writing %i chars : %s", __func__,
-		    priv->writelen, priv->dbgbuf);
+		    buf_len, priv->dbgbuf);
 	}
 	usb_fill_bulk_urb(port->write_urb, port->serial->dev,
 			  usb_sndbulkpipe(port->serial->dev,
 					  port->bulk_out_endpointAddress),
-			  port->write_urb->transfer_buffer, priv->writelen + 3,
+			  port->write_urb->transfer_buffer, buf_len + 3,
 			  iuu_rxcmd, port);
 	result = usb_submit_urb(port->write_urb, GFP_ATOMIC);
-	spin_lock_irqsave(&priv->lock, flags);
-	priv->writelen = 0;
-	spin_unlock_irqrestore(&priv->lock, flags);
 	usb_serial_port_softint(port);
 	return result;
 }
@@ -769,14 +770,10 @@ static int iuu_uart_write(struct tty_struct *tty, struct usb_serial_port *port,
 		return -ENOMEM;
 
 	spin_lock_irqsave(&priv->lock, flags);
-	if (priv->writelen > 0) {
-		/* buffer already filled but not commited */
-		spin_unlock_irqrestore(&priv->lock, flags);
-		return 0;
-	}
+
 	/* fill the buffer */
-	memcpy(priv->writebuf, buf, count);
-	priv->writelen = count;
+	memcpy(priv->writebuf + priv->writelen, buf, count);
+	priv->writelen += count;
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return count;
-- 
cgit v0.10.2


From cc3447d179d8a5e16807e52b77d7f4c095ffedb7 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:53:24 +0100
Subject: tty: iuu_phoenix: Fix stopbit when uart goes on.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index 637d0dd..ef6d12f 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -815,7 +815,7 @@ static int iuu_uart_on(struct usb_serial_port *port)
 	buf[0] = IUU_UART_ENABLE;
 	buf[1] = (u8) ((IUU_BAUD_9600 >> 8) & 0x00FF);
 	buf[2] = (u8) (0x00FF & IUU_BAUD_9600);
-	buf[3] = (u8) (0x0F0 & IUU_TWO_STOP_BITS) | (0x07 & IUU_PARITY_EVEN);
+	buf[3] = (u8) (0x0F0 & IUU_ONE_STOP_BIT) | (0x07 & IUU_PARITY_EVEN);
 
 	status = bulk_immediate(port, buf, 4);
 	if (status != IUU_OPERATION_OK) {
-- 
cgit v0.10.2


From 96dab77ebf3868cc8723ac95e048e8a9c1dccf22 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:54:20 +0100
Subject: tty: iuu_phoenix: set termios.

set_termios can now be used for setting the parity and the stopbits. This is
needed to use with cards which use a different parity then the parity used at
start (even).

If the iuu_uart_baud function return an error, we will return the old_termios
instead of the new one.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>

This was then revamped to use the various helpers, not copy non-hardware
bits any to add mark/space parity and csize reporting

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index ef6d12f..fa86f6e 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -942,6 +942,55 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud,
 	return status;
 }
 
+static void iuu_set_termios(struct tty_struct *tty,
+		struct usb_serial_port *port, struct ktermios *old_termios)
+{
+	const u32 supported_mask = CMSPAR|PARENB|PARODD;
+
+	unsigned int cflag = tty->termios->c_cflag;
+	int status;
+	u32 actual;
+	u32 parity;
+	int csize = CS7;
+	int baud = 9600;	/* Fixed for the moment */
+	u32 newval = cflag & supported_mask;
+
+	/* compute the parity parameter */
+	parity = 0;
+	if (cflag & CMSPAR) {	/* Using mark space */
+		if (cflag & PARODD)
+			parity |= IUU_PARITY_SPACE;
+		else
+			parity |= IUU_PARITY_MARK;
+	} else if (!(cflag & PARENB)) {
+		parity |= IUU_PARITY_NONE;
+		csize = CS8;
+	} else if (cflag & PARODD)
+		parity |= IUU_PARITY_ODD;
+	else
+		parity |= IUU_PARITY_EVEN;
+
+	parity |= (cflag & CSTOPB ? IUU_TWO_STOP_BITS : IUU_ONE_STOP_BIT);
+
+	/* set it */
+	status = iuu_uart_baud(port,
+			(clockmode == 2) ? 16457 : 9600 * boost / 100,
+			&actual, parity);
+
+	/* set the termios value to the real one, so the user now what has
+	 * changed. We support few fields so its easies to copy the old hw
+	 * settings back over and then adjust them
+	 */
+ 	if (old_termios)
+ 		tty_termios_copy_hw(tty->termios, old_termios);
+	if (status != 0)	/* Set failed - return old bits */
+		return;
+	/* Re-encode speed, parity and csize */
+	tty_encode_baud_rate(tty, baud, baud);
+	tty->termios->c_cflag &= ~(supported_mask|CSIZE);
+	tty->termios->c_cflag |= newval | csize;
+}
+
 static void iuu_close(struct usb_serial_port *port)
 {
 	/* iuu_led (port,255,0,0,0); */
@@ -1151,6 +1200,7 @@ static struct usb_serial_driver iuu_device = {
 	.read_bulk_callback = iuu_uart_read_callback,
 	.tiocmget = iuu_tiocmget,
 	.tiocmset = iuu_tiocmset,
+	.set_termios = iuu_set_termios,
 	.attach = iuu_startup,
 	.shutdown = iuu_shutdown,
 };
-- 
cgit v0.10.2


From 9bb41699ad5c74519dc054bfe469a8074799c863 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:55:01 +0100
Subject: tty: iuu_phoenix: update version number.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index fa86f6e..76a3cc3 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -40,7 +40,7 @@ static int debug;
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.5"
+#define DRIVER_VERSION "v0.10"
 #define DRIVER_DESC "Infinity USB Unlimited Phoenix driver"
 
 static struct usb_device_id id_table[] = {
-- 
cgit v0.10.2


From 13858d36903990441a89eb342247b71436808eeb Mon Sep 17 00:00:00 2001
From: "Alexander Y. Fomichev" <git.user@gmail.com>
Date: Thu, 11 Jun 2009 12:56:00 +0100
Subject: jsm: correctly support multiple 4/8-port boards

If there are more then one 4/8-port board jsm_uart_port_init
allocate a line numbers of the second and further boards
from range of previous one.

This patch fixed the problem.

Signed-off-by: Alexander Y. Fomichev <git.user@gmail.com>

[printks fixed to add jsm: ]

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/jsm/jsm.h b/drivers/serial/jsm/jsm.h
index c0a3e27..4e5f3bd 100644
--- a/drivers/serial/jsm/jsm.h
+++ b/drivers/serial/jsm/jsm.h
@@ -61,6 +61,7 @@ enum {
 	if ((DBG_##nlevel & jsm_debug))			\
 	dev_printk(KERN_##klevel, pdev->dev, fmt, ## args)
 
+#define	MAXLINES	256
 #define MAXPORTS	8
 #define MAX_STOPS_SENT	5
 
diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c
index 31496dc0..107ce2e 100644
--- a/drivers/serial/jsm/jsm_tty.c
+++ b/drivers/serial/jsm/jsm_tty.c
@@ -33,6 +33,8 @@
 
 #include "jsm.h"
 
+static DECLARE_BITMAP(linemap, MAXLINES);
+
 static void jsm_carrier(struct jsm_channel *ch);
 
 static inline int jsm_get_mstat(struct jsm_channel *ch)
@@ -433,6 +435,7 @@ int __devinit jsm_tty_init(struct jsm_board *brd)
 int __devinit jsm_uart_port_init(struct jsm_board *brd)
 {
 	int i;
+	unsigned int line;
 	struct jsm_channel *ch;
 
 	if (!brd)
@@ -459,9 +462,15 @@ int __devinit jsm_uart_port_init(struct jsm_board *brd)
 		brd->channels[i]->uart_port.membase = brd->re_map_membase;
 		brd->channels[i]->uart_port.fifosize = 16;
 		brd->channels[i]->uart_port.ops = &jsm_ops;
-		brd->channels[i]->uart_port.line = brd->channels[i]->ch_portnum + brd->boardnum * 2;
+		line = find_first_zero_bit(linemap, MAXLINES);
+		if (line >= MAXLINES) {
+			printk(KERN_INFO "jsm: linemap is full, added device failed\n");
+			continue;
+		} else
+			set_bit((int)line, linemap);
+		brd->channels[i]->uart_port.line = line;
 		if (uart_add_one_port (&jsm_uart_driver, &brd->channels[i]->uart_port))
-			printk(KERN_INFO "Added device failed\n");
+			printk(KERN_INFO "jsm: add device failed\n");
 		else
 			printk(KERN_INFO "Added device \n");
 	}
@@ -494,6 +503,7 @@ int jsm_remove_uart_port(struct jsm_board *brd)
 
 		ch = brd->channels[i];
 
+		clear_bit((int)(ch->uart_port.line), linemap);
 		uart_remove_one_port(&jsm_uart_driver, &brd->channels[i]->uart_port);
 	}
 
-- 
cgit v0.10.2


From aba6593bf77371e71331ba76dacc98b47760cba3 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@linux.vnet.ibm.com>
Date: Thu, 11 Jun 2009 13:02:59 +0100
Subject: icom: fixing a if clause spaghetti

adapter->version can only be ADAPTER_V2 or ADAPTER_V1. So,
that OR operand in the "if" clause is non-sense and can be removed.

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index a461b3b..fd5fd23 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -408,7 +408,7 @@ static void load_code(struct icom_port *icom_port)
 	release_firmware(fw);
 
 	/* Set Hardware level */
-	if ((icom_port->adapter->version | ADAPTER_V2) == ADAPTER_V2)
+	if (icom_port->adapter->version == ADAPTER_V2)
 		writeb(V2_HARDWARE, &(icom_port->dram->misc_flags));
 
 	/* Start the processor in Adapter */
@@ -861,7 +861,7 @@ static irqreturn_t icom_interrupt(int irq, void *dev_id)
 	/* find icom_port for this interrupt */
 	icom_adapter = (struct icom_adapter *) dev_id;
 
-	if ((icom_adapter->version | ADAPTER_V2) == ADAPTER_V2) {
+	if (icom_adapter->version == ADAPTER_V2) {
 		int_reg = icom_adapter->base_addr + 0x8024;
 
 		adapter_interrupts = readl(int_reg);
-- 
cgit v0.10.2


From c481c707fe4b07783d9a2499a9bbbb94497e9b18 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:04:27 +0100
Subject: tty: remove buffer special casing

Long long ago a 4K kmalloc allocated two pages so the tty layer used the
page allocator, except on some machines where the page size was huge. This was
removed from the core tty layer with the tty buffer re-implementation but not
from tty_audit or the n_tty ldisc.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index f6f0e4e..b4b12b4 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -76,19 +76,12 @@
 static inline unsigned char *alloc_buf(void)
 {
 	gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		return kmalloc(N_TTY_BUF_SIZE, prio);
-	else
-		return (unsigned char *)__get_free_page(prio);
+	return kmalloc(N_TTY_BUF_SIZE, prio);
 }
 
 static inline void free_buf(unsigned char *buf)
 {
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		kfree(buf);
-	else
-		free_page((unsigned long) buf);
+	kfree(buf);
 }
 
 static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 55ba6f1..ac16fbe 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -29,10 +29,7 @@ static struct tty_audit_buf *tty_audit_buf_alloc(int major, int minor,
 	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
 	if (!buf)
 		goto err;
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
-	else
-		buf->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+	buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
 	if (!buf->data)
 		goto err_buf;
 	atomic_set(&buf->count, 1);
@@ -52,10 +49,7 @@ err:
 static void tty_audit_buf_free(struct tty_audit_buf *buf)
 {
 	WARN_ON(buf->valid != 0);
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		kfree(buf->data);
-	else
-		free_page((unsigned long)buf->data);
+	kfree(buf->data);
 	kfree(buf);
 }
 
-- 
cgit v0.10.2


From 0b4068a1287b02018d1b3159e7be6f27f3e3e68c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:05:49 +0100
Subject: tty: simplify buffer allocator cleanups

Having cleaned up the allocators we might as well remove the inline helpers
for some of it

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index b4b12b4..94a5d50 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -73,17 +73,6 @@
 #define ECHO_OP_SET_CANON_COL 0x81
 #define ECHO_OP_ERASE_TAB 0x82
 
-static inline unsigned char *alloc_buf(void)
-{
-	gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-	return kmalloc(N_TTY_BUF_SIZE, prio);
-}
-
-static inline void free_buf(unsigned char *buf)
-{
-	kfree(buf);
-}
-
 static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
 			       unsigned char __user *ptr)
 {
@@ -1551,11 +1540,11 @@ static void n_tty_close(struct tty_struct *tty)
 {
 	n_tty_flush_buffer(tty);
 	if (tty->read_buf) {
-		free_buf(tty->read_buf);
+		kfree(tty->read_buf);
 		tty->read_buf = NULL;
 	}
 	if (tty->echo_buf) {
-		free_buf(tty->echo_buf);
+		kfree(tty->echo_buf);
 		tty->echo_buf = NULL;
 	}
 }
@@ -1577,17 +1566,16 @@ static int n_tty_open(struct tty_struct *tty)
 
 	/* These are ugly. Currently a malloc failure here can panic */
 	if (!tty->read_buf) {
-		tty->read_buf = alloc_buf();
+		tty->read_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
 		if (!tty->read_buf)
 			return -ENOMEM;
 	}
 	if (!tty->echo_buf) {
-		tty->echo_buf = alloc_buf();
+		tty->echo_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
+
 		if (!tty->echo_buf)
 			return -ENOMEM;
 	}
-	memset(tty->read_buf, 0, N_TTY_BUF_SIZE);
-	memset(tty->echo_buf, 0, N_TTY_BUF_SIZE);
 	reset_buffer_flags(tty);
 	tty->column = 0;
 	n_tty_set_termios(tty, NULL);
-- 
cgit v0.10.2


From 73e0d48b8c28fb39a0bb6713c875e9919a9af546 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Thu, 11 Jun 2009 13:06:31 +0100
Subject: parport_pc: Fix subscription bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes array subscription bugs in the parport_pc driver.

drivers/parport/parport_pc.c: In function ‘parport_irq_probe’:
drivers/parport/parport_pc.c:1589: warning: array subscript is above array bounds
drivers/parport/parport_pc.c: In function ‘parport_pc_probe_port’:
drivers/parport/parport_pc.c:1579: warning: array subscript is above array bounds

The patch also fixes a few other array bugs, which the compiler was
unable to find. Coding style violations are also fixed.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 4e63cc9..24984c4 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1246,17 +1246,17 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 		       (cr1 & 0x08 ) ? "Standard mode only (SPP)" : modes[cr4 & 0x03], 
 		       (cr4 & 0x40) ? "1.7" : "1.9");
 	}
-		
+
 	/* Heuristics !  BIOS setup for this mainboard device limits
 	   the choices to standard settings, i.e. io-address and IRQ
 	   are related, however DMA can be 1 or 3, assume DMA_A=DMA1,
 	   DMA_C=DMA3 (this is true e.g. for TYAN 1564D Tomcat IV) */
-	if(cr23*4 >=0x100) { /* if active */
-		while((superios[i].io!= 0) && (i<NR_SUPERIOS))
+	if (cr23 * 4 >= 0x100) { /* if active */
+		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
 			i++;
-		if(i==NR_SUPERIOS)
+		if (i == NR_SUPERIOS) {
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		else {
+		} else {
 			int d;
 			switch (cr23*4) {
 				case 0x3bc:
@@ -1332,12 +1332,12 @@ static void __devinit show_parconfig_winbond(int io, int key)
 		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n", modes[crf0 & 0x07]);
 	}
 
-	if(cr30 & 0x01) { /* the settings can be interrogated later ... */
-		while((superios[i].io!= 0) && (i<NR_SUPERIOS))
+	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
+		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
 			i++;
-		if(i==NR_SUPERIOS) 
+		if (i == NR_SUPERIOS) {
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		else {
+		} else {
 			superios[i].io = (cr60<<8)|cr61;
 			superios[i].irq = cr70&0x0f;
 			superios[i].dma = (((cr74 & 0x07) > 3) ?
@@ -1575,24 +1575,26 @@ static void __devinit detect_and_report_it87(void)
 
 static int get_superio_dma (struct parport *p)
 {
-	int i=0;
-	while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
+	int i = 0;
+
+	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
 		i++;
-	if (i!=NR_SUPERIOS)
+	if (i != NR_SUPERIOS)
 		return superios[i].dma;
 	return PARPORT_DMA_NONE;
 }
 
 static int get_superio_irq (struct parport *p)
 {
-	int i=0;
-        while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
+	int i = 0;
+
+        while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
                 i++;
-        if (i!=NR_SUPERIOS)
+        if (i != NR_SUPERIOS)
                 return superios[i].irq;
         return PARPORT_IRQ_NONE;
 }
-	
+
 
 /* --- Mode detection ------------------------------------- */
 
-- 
cgit v0.10.2


From 3aeda9bc95d064308a70664e6f4a158c96cc1918 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:07:29 +0100
Subject: parport_pc: Coding style

Michael's patch fixed some of the coding style so the style is now
inconsistent. Sort the rest out

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 24984c4..edf83e9 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1,5 +1,5 @@
 /* Low-level parallel-port routines for 8255-based PC-style hardware.
- * 
+ *
  * Authors: Phil Blundell <philb@gnu.org>
  *          Tim Waugh <tim@cyberelk.demon.co.uk>
  *	    Jose Renau <renau@acm.org>
@@ -11,7 +11,7 @@
  * Cleaned up include files - Russell King <linux@arm.uk.linux.org>
  * DMA support - Bert De Jonghe <bert@sophis.be>
  * Many ECP bugs fixed.  Fred Barnes & Jamie Lokier, 1999
- * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G. 
+ * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G.
  * Various hacks, Fred Barnes, 04/2001
  * Updated probing logic - Adam Belay <ambx1@neo.rr.com>
  */
@@ -56,10 +56,10 @@
 #include <linux/pnp.h>
 #include <linux/platform_device.h>
 #include <linux/sysctl.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
 
-#include <asm/io.h>
 #include <asm/dma.h>
-#include <asm/uaccess.h>
 
 #include <linux/parport.h>
 #include <linux/parport_pc.h>
@@ -82,7 +82,7 @@
 #define ECR_TST 06
 #define ECR_CNF 07
 #define ECR_MODE_MASK 0xe0
-#define ECR_WRITE(p,v) frob_econtrol((p),0xff,(v))
+#define ECR_WRITE(p, v) frob_econtrol((p), 0xff, (v))
 
 #undef DEBUG
 
@@ -109,27 +109,27 @@ static int pci_registered_parport;
 static int pnp_registered_parport;
 
 /* frob_control, but for ECR */
-static void frob_econtrol (struct parport *pb, unsigned char m,
+static void frob_econtrol(struct parport *pb, unsigned char m,
 			   unsigned char v)
 {
 	unsigned char ectr = 0;
 
 	if (m != 0xff)
-		ectr = inb (ECONTROL (pb));
+		ectr = inb(ECONTROL(pb));
 
-	DPRINTK (KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
+	DPRINTK(KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
 		m, v, ectr, (ectr & ~m) ^ v);
 
-	outb ((ectr & ~m) ^ v, ECONTROL (pb));
+	outb((ectr & ~m) ^ v, ECONTROL(pb));
 }
 
-static __inline__ void frob_set_mode (struct parport *p, int mode)
+static inline void frob_set_mode(struct parport *p, int mode)
 {
-	frob_econtrol (p, ECR_MODE_MASK, mode << 5);
+	frob_econtrol(p, ECR_MODE_MASK, mode << 5);
 }
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-/* Safely change the mode bits in the ECR 
+/* Safely change the mode bits in the ECR
    Returns:
 	    0    : Success
 	   -EBUSY: Could not drain FIFO in some finite amount of time,
@@ -141,17 +141,18 @@ static int change_mode(struct parport *p, int m)
 	unsigned char oecr;
 	int mode;
 
-	DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n",m);
+	DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n", m);
 
 	if (!priv->ecr) {
-		printk (KERN_DEBUG "change_mode: but there's no ECR!\n");
+		printk(KERN_DEBUG "change_mode: but there's no ECR!\n");
 		return 0;
 	}
 
 	/* Bits <7:5> contain the mode. */
-	oecr = inb (ECONTROL (p));
+	oecr = inb(ECONTROL(p));
 	mode = (oecr >> 5) & 0x7;
-	if (mode == m) return 0;
+	if (mode == m)
+		return 0;
 
 	if (mode >= 2 && !(priv->ctr & 0x20)) {
 		/* This mode resets the FIFO, so we may
@@ -163,19 +164,21 @@ static int change_mode(struct parport *p, int m)
 		case ECR_ECP: /* ECP Parallel Port mode */
 			/* Busy wait for 200us */
 			for (counter = 0; counter < 40; counter++) {
-				if (inb (ECONTROL (p)) & 0x01)
+				if (inb(ECONTROL(p)) & 0x01)
 					break;
-				if (signal_pending (current)) break;
-				udelay (5);
+				if (signal_pending(current))
+					break;
+				udelay(5);
 			}
 
 			/* Poll slowly. */
-			while (!(inb (ECONTROL (p)) & 0x01)) {
-				if (time_after_eq (jiffies, expire))
+			while (!(inb(ECONTROL(p)) & 0x01)) {
+				if (time_after_eq(jiffies, expire))
 					/* The FIFO is stuck. */
 					return -EBUSY;
-				schedule_timeout_interruptible(msecs_to_jiffies(10));
-				if (signal_pending (current))
+				schedule_timeout_interruptible(
+							msecs_to_jiffies(10));
+				if (signal_pending(current))
 					break;
 			}
 		}
@@ -185,20 +188,20 @@ static int change_mode(struct parport *p, int m)
 		/* We have to go through mode 001 */
 		oecr &= ~(7 << 5);
 		oecr |= ECR_PS2 << 5;
-		ECR_WRITE (p, oecr);
+		ECR_WRITE(p, oecr);
 	}
 
 	/* Set the mode. */
 	oecr &= ~(7 << 5);
 	oecr |= m << 5;
-	ECR_WRITE (p, oecr);
+	ECR_WRITE(p, oecr);
 	return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
 /* Find FIFO lossage; FIFO is reset */
 #if 0
-static int get_fifo_residue (struct parport *p)
+static int get_fifo_residue(struct parport *p)
 {
 	int residue;
 	int cnfga;
@@ -206,26 +209,26 @@ static int get_fifo_residue (struct parport *p)
 
 	/* Adjust for the contents of the FIFO. */
 	for (residue = priv->fifo_depth; ; residue--) {
-		if (inb (ECONTROL (p)) & 0x2)
+		if (inb(ECONTROL(p)) & 0x2)
 				/* Full up. */
 			break;
 
-		outb (0, FIFO (p));
+		outb(0, FIFO(p));
 	}
 
-	printk (KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
+	printk(KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
 		residue);
 
 	/* Reset the FIFO. */
-	frob_set_mode (p, ECR_PS2);
+	frob_set_mode(p, ECR_PS2);
 
 	/* Now change to config mode and clean up. FIXME */
-	frob_set_mode (p, ECR_CNF);
-	cnfga = inb (CONFIGA (p));
-	printk (KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
+	frob_set_mode(p, ECR_CNF);
+	cnfga = inb(CONFIGA(p));
+	printk(KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
 
 	if (!(cnfga & (1<<2))) {
-		printk (KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
+		printk(KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
 		residue++;
 	}
 
@@ -233,9 +236,11 @@ static int get_fifo_residue (struct parport *p)
 	 * PWord != 1 byte. */
 
 	/* Back to PS2 mode. */
-	frob_set_mode (p, ECR_PS2);
+	frob_set_mode(p, ECR_PS2);
 
-	DPRINTK (KERN_DEBUG "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n", inb (ECONTROL (p)));
+	DPRINTK(KERN_DEBUG
+	     "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n",
+							inb(ECONTROL(p)));
 	return residue;
 }
 #endif  /*  0 */
@@ -257,8 +262,8 @@ static int clear_epp_timeout(struct parport *pb)
 	/* To clear timeout some chips require double read */
 	parport_pc_read_status(pb);
 	r = parport_pc_read_status(pb);
-	outb (r | 0x01, STATUS (pb)); /* Some reset by writing 1 */
-	outb (r & 0xfe, STATUS (pb)); /* Others by writing 0 */
+	outb(r | 0x01, STATUS(pb)); /* Some reset by writing 1 */
+	outb(r & 0xfe, STATUS(pb)); /* Others by writing 0 */
 	r = parport_pc_read_status(pb);
 
 	return !(r & 0x01);
@@ -272,7 +277,8 @@ static int clear_epp_timeout(struct parport *pb)
  * of these are in parport_pc.h.
  */
 
-static void parport_pc_init_state(struct pardevice *dev, struct parport_state *s)
+static void parport_pc_init_state(struct pardevice *dev,
+						struct parport_state *s)
 {
 	s->u.pc.ctr = 0xc;
 	if (dev->irq_func &&
@@ -289,22 +295,23 @@ static void parport_pc_save_state(struct parport *p, struct parport_state *s)
 	const struct parport_pc_private *priv = p->physport->private_data;
 	s->u.pc.ctr = priv->ctr;
 	if (priv->ecr)
-		s->u.pc.ecr = inb (ECONTROL (p));
+		s->u.pc.ecr = inb(ECONTROL(p));
 }
 
-static void parport_pc_restore_state(struct parport *p, struct parport_state *s)
+static void parport_pc_restore_state(struct parport *p,
+						struct parport_state *s)
 {
 	struct parport_pc_private *priv = p->physport->private_data;
 	register unsigned char c = s->u.pc.ctr & priv->ctr_writable;
-	outb (c, CONTROL (p));
+	outb(c, CONTROL(p));
 	priv->ctr = c;
 	if (priv->ecr)
-		ECR_WRITE (p, s->u.pc.ecr);
+		ECR_WRITE(p, s->u.pc.ecr);
 }
 
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
-					size_t length, int flags)
+static size_t parport_pc_epp_read_data(struct parport *port, void *buf,
+				       size_t length, int flags)
 {
 	size_t got = 0;
 
@@ -316,54 +323,52 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
 		 *  nFault is 0 if there is at least 1 byte in the Warp's FIFO
 		 *  pError is 1 if there are 16 bytes in the Warp's FIFO
 		 */
-		status = inb (STATUS (port));
+		status = inb(STATUS(port));
 
-		while (!(status & 0x08) && (got < length)) {
-			if ((left >= 16) && (status & 0x20) && !(status & 0x08)) {
+		while (!(status & 0x08) && got < length) {
+			if (left >= 16 && (status & 0x20) && !(status & 0x08)) {
 				/* can grab 16 bytes from warp fifo */
-				if (!((long)buf & 0x03)) {
-					insl (EPPDATA (port), buf, 4);
-				} else {
-					insb (EPPDATA (port), buf, 16);
-				}
+				if (!((long)buf & 0x03))
+					insl(EPPDATA(port), buf, 4);
+				else
+					insb(EPPDATA(port), buf, 16);
 				buf += 16;
 				got += 16;
 				left -= 16;
 			} else {
 				/* grab single byte from the warp fifo */
-				*((char *)buf) = inb (EPPDATA (port));
+				*((char *)buf) = inb(EPPDATA(port));
 				buf++;
 				got++;
 				left--;
 			}
-			status = inb (STATUS (port));
+			status = inb(STATUS(port));
 			if (status & 0x01) {
 				/* EPP timeout should never occur... */
-				printk (KERN_DEBUG "%s: EPP timeout occurred while talking to "
-					"w91284pic (should not have done)\n", port->name);
-				clear_epp_timeout (port);
+				printk(KERN_DEBUG
+"%s: EPP timeout occurred while talking to w91284pic (should not have done)\n", port->name);
+				clear_epp_timeout(port);
 			}
 		}
 		return got;
 	}
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03)) {
-			insl (EPPDATA (port), buf, (length >> 2));
-		} else {
-			insb (EPPDATA (port), buf, length);
-		}
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (!(((long)buf | length) & 0x03))
+			insl(EPPDATA(port), buf, (length >> 2));
+		else
+			insb(EPPDATA(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; got < length; got++) {
-		*((char*)buf) = inb (EPPDATA(port));
+		*((char *)buf) = inb(EPPDATA(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
+		if (inb(STATUS(port)) & 0x01) {
 			/* EPP timeout */
-			clear_epp_timeout (port);
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -371,28 +376,27 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
 	return got;
 }
 
-static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
-					 size_t length, int flags)
+static size_t parport_pc_epp_write_data(struct parport *port, const void *buf,
+					size_t length, int flags)
 {
 	size_t written = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03)) {
-			outsl (EPPDATA (port), buf, (length >> 2));
-		} else {
-			outsb (EPPDATA (port), buf, length);
-		}
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (!(((long)buf | length) & 0x03))
+			outsl(EPPDATA(port), buf, (length >> 2));
+		else
+			outsb(EPPDATA(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; written < length; written++) {
-		outb (*((char*)buf), EPPDATA(port));
+		outb(*((char *)buf), EPPDATA(port));
 		buf++;
-		if (inb (STATUS(port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -400,24 +404,24 @@ static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
 	return written;
 }
 
-static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
+static size_t parport_pc_epp_read_addr(struct parport *port, void *buf,
 					size_t length, int flags)
 {
 	size_t got = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		insb (EPPADDR (port), buf, length);
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		insb(EPPADDR(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; got < length; got++) {
-		*((char*)buf) = inb (EPPADDR (port));
+		*((char *)buf) = inb(EPPADDR(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -425,25 +429,25 @@ static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
 	return got;
 }
 
-static size_t parport_pc_epp_write_addr (struct parport *port,
+static size_t parport_pc_epp_write_addr(struct parport *port,
 					 const void *buf, size_t length,
 					 int flags)
 {
 	size_t written = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		outsb (EPPADDR (port), buf, length);
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		outsb(EPPADDR(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; written < length; written++) {
-		outb (*((char*)buf), EPPADDR (port));
+		outb(*((char *)buf), EPPADDR(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -451,74 +455,74 @@ static size_t parport_pc_epp_write_addr (struct parport *port,
 	return written;
 }
 
-static size_t parport_pc_ecpepp_read_data (struct parport *port, void *buf,
-					   size_t length, int flags)
+static size_t parport_pc_ecpepp_read_data(struct parport *port, void *buf,
+					  size_t length, int flags)
 {
 	size_t got;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_data_reverse (port);
-	parport_pc_write_control (port, 0x4);
-	got = parport_pc_epp_read_data (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_data_reverse(port);
+	parport_pc_write_control(port, 0x4);
+	got = parport_pc_epp_read_data(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return got;
 }
 
-static size_t parport_pc_ecpepp_write_data (struct parport *port,
-					    const void *buf, size_t length,
-					    int flags)
+static size_t parport_pc_ecpepp_write_data(struct parport *port,
+					   const void *buf, size_t length,
+					   int flags)
 {
 	size_t written;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_write_control (port, 0x4);
-	parport_pc_data_forward (port);
-	written = parport_pc_epp_write_data (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_write_control(port, 0x4);
+	parport_pc_data_forward(port);
+	written = parport_pc_epp_write_data(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return written;
 }
 
-static size_t parport_pc_ecpepp_read_addr (struct parport *port, void *buf,
-					   size_t length, int flags)
+static size_t parport_pc_ecpepp_read_addr(struct parport *port, void *buf,
+					  size_t length, int flags)
 {
 	size_t got;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_data_reverse (port);
-	parport_pc_write_control (port, 0x4);
-	got = parport_pc_epp_read_addr (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_data_reverse(port);
+	parport_pc_write_control(port, 0x4);
+	got = parport_pc_epp_read_addr(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return got;
 }
 
-static size_t parport_pc_ecpepp_write_addr (struct parport *port,
+static size_t parport_pc_ecpepp_write_addr(struct parport *port,
 					    const void *buf, size_t length,
 					    int flags)
 {
 	size_t written;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_write_control (port, 0x4);
-	parport_pc_data_forward (port);
-	written = parport_pc_epp_write_addr (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_write_control(port, 0x4);
+	parport_pc_data_forward(port);
+	written = parport_pc_epp_write_addr(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return written;
 }
 #endif /* IEEE 1284 support */
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-static size_t parport_pc_fifo_write_block_pio (struct parport *port,
+static size_t parport_pc_fifo_write_block_pio(struct parport *port,
 					       const void *buf, size_t length)
 {
 	int ret = 0;
 	const unsigned char *bufp = buf;
 	size_t left = length;
 	unsigned long expire = jiffies + port->physport->cad->timeout;
-	const int fifo = FIFO (port);
+	const int fifo = FIFO(port);
 	int poll_for = 8; /* 80 usecs */
 	const struct parport_pc_private *priv = port->physport->private_data;
 	const int fifo_depth = priv->fifo_depth;
@@ -526,25 +530,25 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 	port = port->physport;
 
 	/* We don't want to be interrupted every character. */
-	parport_pc_disable_irq (port);
+	parport_pc_disable_irq(port);
 	/* set nErrIntrEn and serviceIntr */
-	frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+	frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
 	/* Forward mode. */
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
 
 	while (left) {
 		unsigned char byte;
-		unsigned char ecrval = inb (ECONTROL (port));
+		unsigned char ecrval = inb(ECONTROL(port));
 		int i = 0;
 
-		if (need_resched() && time_before (jiffies, expire))
+		if (need_resched() && time_before(jiffies, expire))
 			/* Can't yield the port. */
-			schedule ();
+			schedule();
 
 		/* Anyone else waiting for the port? */
 		if (port->waithead) {
-			printk (KERN_DEBUG "Somebody wants the port\n");
+			printk(KERN_DEBUG "Somebody wants the port\n");
 			break;
 		}
 
@@ -552,21 +556,22 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 			/* FIFO is full. Wait for interrupt. */
 
 			/* Clear serviceIntr */
-			ECR_WRITE (port, ecrval & ~(1<<2));
-		false_alarm:
-			ret = parport_wait_event (port, HZ);
-			if (ret < 0) break;
+			ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+			ret = parport_wait_event(port, HZ);
+			if (ret < 0)
+				break;
 			ret = 0;
-			if (!time_before (jiffies, expire)) {
+			if (!time_before(jiffies, expire)) {
 				/* Timed out. */
-				printk (KERN_DEBUG "FIFO write timed out\n");
+				printk(KERN_DEBUG "FIFO write timed out\n");
 				break;
 			}
-			ecrval = inb (ECONTROL (port));
+			ecrval = inb(ECONTROL(port));
 			if (!(ecrval & (1<<2))) {
 				if (need_resched() &&
-				    time_before (jiffies, expire))
-					schedule ();
+				    time_before(jiffies, expire))
+					schedule();
 
 				goto false_alarm;
 			}
@@ -577,38 +582,38 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 		/* Can't fail now. */
 		expire = jiffies + port->cad->timeout;
 
-	poll:
-		if (signal_pending (current))
+poll:
+		if (signal_pending(current))
 			break;
 
 		if (ecrval & 0x01) {
 			/* FIFO is empty. Blast it full. */
 			const int n = left < fifo_depth ? left : fifo_depth;
-			outsb (fifo, bufp, n);
+			outsb(fifo, bufp, n);
 			bufp += n;
 			left -= n;
 
 			/* Adjust the poll time. */
-			if (i < (poll_for - 2)) poll_for--;
+			if (i < (poll_for - 2))
+				poll_for--;
 			continue;
 		} else if (i++ < poll_for) {
-			udelay (10);
-			ecrval = inb (ECONTROL (port));
+			udelay(10);
+			ecrval = inb(ECONTROL(port));
 			goto poll;
 		}
 
-		/* Half-full (call me an optimist) */
+		/* Half-full(call me an optimist) */
 		byte = *bufp++;
-		outb (byte, fifo);
+		outb(byte, fifo);
 		left--;
-        }
-
-dump_parport_state ("leave fifo_write_block_pio", port);
+	}
+	dump_parport_state("leave fifo_write_block_pio", port);
 	return length - left;
 }
 
 #ifdef HAS_DMA
-static size_t parport_pc_fifo_write_block_dma (struct parport *port,
+static size_t parport_pc_fifo_write_block_dma(struct parport *port,
 					       const void *buf, size_t length)
 {
 	int ret = 0;
@@ -621,7 +626,7 @@ static size_t parport_pc_fifo_write_block_dma (struct parport *port,
 	unsigned long start = (unsigned long) buf;
 	unsigned long end = (unsigned long) buf + length - 1;
 
-dump_parport_state ("enter fifo_write_block_dma", port);
+dump_parport_state("enter fifo_write_block_dma", port);
 	if (end < MAX_DMA_ADDRESS) {
 		/* If it would cross a 64k boundary, cap it at the end. */
 		if ((start ^ end) & ~0xffffUL)
@@ -629,8 +634,9 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
 		dma_addr = dma_handle = dma_map_single(dev, (void *)buf, length,
 						       DMA_TO_DEVICE);
-        } else {
-		/* above 16 MB we use a bounce buffer as ISA-DMA is not possible */
+	} else {
+		/* above 16 MB we use a bounce buffer as ISA-DMA
+		   is not possible */
 		maxlen   = PAGE_SIZE;          /* sizeof(priv->dma_buf) */
 		dma_addr = priv->dma_handle;
 		dma_handle = 0;
@@ -639,12 +645,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 	port = port->physport;
 
 	/* We don't want to be interrupted every character. */
-	parport_pc_disable_irq (port);
+	parport_pc_disable_irq(port);
 	/* set nErrIntrEn and serviceIntr */
-	frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+	frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
 	/* Forward mode. */
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
 
 	while (left) {
 		unsigned long expire = jiffies + port->physport->cad->timeout;
@@ -665,10 +671,10 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 		set_dma_count(port->dma, count);
 
 		/* Set DMA mode */
-		frob_econtrol (port, 1<<3, 1<<3);
+		frob_econtrol(port, 1<<3, 1<<3);
 
 		/* Clear serviceIntr */
-		frob_econtrol (port, 1<<2, 0);
+		frob_econtrol(port, 1<<2, 0);
 
 		enable_dma(port->dma);
 		release_dma_lock(dmaflag);
@@ -676,20 +682,22 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 		/* assume DMA will be successful */
 		left -= count;
 		buf  += count;
-		if (dma_handle) dma_addr += count;
+		if (dma_handle)
+			dma_addr += count;
 
 		/* Wait for interrupt. */
-	false_alarm:
-		ret = parport_wait_event (port, HZ);
-		if (ret < 0) break;
+false_alarm:
+		ret = parport_wait_event(port, HZ);
+		if (ret < 0)
+			break;
 		ret = 0;
-		if (!time_before (jiffies, expire)) {
+		if (!time_before(jiffies, expire)) {
 			/* Timed out. */
-			printk (KERN_DEBUG "DMA write timed out\n");
+			printk(KERN_DEBUG "DMA write timed out\n");
 			break;
 		}
 		/* Is serviceIntr set? */
-		if (!(inb (ECONTROL (port)) & (1<<2))) {
+		if (!(inb(ECONTROL(port)) & (1<<2))) {
 			cond_resched();
 
 			goto false_alarm;
@@ -705,14 +713,15 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
 		/* Anyone else waiting for the port? */
 		if (port->waithead) {
-			printk (KERN_DEBUG "Somebody wants the port\n");
+			printk(KERN_DEBUG "Somebody wants the port\n");
 			break;
 		}
 
 		/* update for possible DMA residue ! */
 		buf  -= count;
 		left += count;
-		if (dma_handle) dma_addr -= count;
+		if (dma_handle)
+			dma_addr -= count;
 	}
 
 	/* Maybe got here through break, so adjust for DMA residue! */
@@ -723,12 +732,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 	release_dma_lock(dmaflag);
 
 	/* Turn off DMA mode */
-	frob_econtrol (port, 1<<3, 0);
+	frob_econtrol(port, 1<<3, 0);
 
 	if (dma_handle)
 		dma_unmap_single(dev, dma_handle, length, DMA_TO_DEVICE);
 
-dump_parport_state ("leave fifo_write_block_dma", port);
+dump_parport_state("leave fifo_write_block_dma", port);
 	return length - left;
 }
 #endif
@@ -738,13 +747,13 @@ static inline size_t parport_pc_fifo_write_block(struct parport *port,
 {
 #ifdef HAS_DMA
 	if (port->dma != PARPORT_DMA_NONE)
-		return parport_pc_fifo_write_block_dma (port, buf, length);
+		return parport_pc_fifo_write_block_dma(port, buf, length);
 #endif
-	return parport_pc_fifo_write_block_pio (port, buf, length);
+	return parport_pc_fifo_write_block_pio(port, buf, length);
 }
 
 /* Parallel Port FIFO mode (ECP chipsets) */
-static size_t parport_pc_compat_write_block_pio (struct parport *port,
+static size_t parport_pc_compat_write_block_pio(struct parport *port,
 						 const void *buf, size_t length,
 						 int flags)
 {
@@ -756,14 +765,16 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_write_compat (port, buf,
+		return parport_ieee1284_write_compat(port, buf,
 						      length, flags);
 
 	/* Set up parallel port FIFO mode.*/
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
-	parport_pc_frob_control (port, PARPORT_CONTROL_STROBE, 0);
-	r = change_mode (port, ECR_PPF); /* Parallel port FIFO */
-	if (r)  printk (KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n", port->name);
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
+	parport_pc_frob_control(port, PARPORT_CONTROL_STROBE, 0);
+	r = change_mode(port, ECR_PPF); /* Parallel port FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n",
+								port->name);
 
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
@@ -775,40 +786,39 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 	 * the FIFO is empty, so allow 4 seconds for each position
 	 * in the fifo.
 	 */
-        expire = jiffies + (priv->fifo_depth * HZ * 4);
+	expire = jiffies + (priv->fifo_depth * HZ * 4);
 	do {
 		/* Wait for the FIFO to empty */
-		r = change_mode (port, ECR_PS2);
-		if (r != -EBUSY) {
+		r = change_mode(port, ECR_PS2);
+		if (r != -EBUSY)
 			break;
-		}
-	} while (time_before (jiffies, expire));
+	} while (time_before(jiffies, expire));
 	if (r == -EBUSY) {
 
-		printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+		printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
 		/* Prevent further data transfer. */
-		frob_set_mode (port, ECR_TST);
+		frob_set_mode(port, ECR_TST);
 
 		/* Adjust for the contents of the FIFO. */
 		for (written -= priv->fifo_depth; ; written++) {
-			if (inb (ECONTROL (port)) & 0x2) {
+			if (inb(ECONTROL(port)) & 0x2) {
 				/* Full up. */
 				break;
 			}
-			outb (0, FIFO (port));
+			outb(0, FIFO(port));
 		}
 
 		/* Reset the FIFO and return to PS2 mode. */
-		frob_set_mode (port, ECR_PS2);
+		frob_set_mode(port, ECR_PS2);
 	}
 
-	r = parport_wait_peripheral (port,
+	r = parport_wait_peripheral(port,
 				     PARPORT_STATUS_BUSY,
 				     PARPORT_STATUS_BUSY);
 	if (r)
-		printk (KERN_DEBUG
-			"%s: BUSY timeout (%d) in compat_write_block_pio\n", 
+		printk(KERN_DEBUG
+			"%s: BUSY timeout (%d) in compat_write_block_pio\n",
 			port->name, r);
 
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
@@ -818,7 +828,7 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 
 /* ECP */
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_ecp_write_block_pio (struct parport *port,
+static size_t parport_pc_ecp_write_block_pio(struct parport *port,
 					      const void *buf, size_t length,
 					      int flags)
 {
@@ -830,36 +840,38 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_ecp_write_data (port, buf,
+		return parport_ieee1284_ecp_write_data(port, buf,
 							length, flags);
 
 	/* Switch to forward mode if necessary. */
 	if (port->physport->ieee1284.phase != IEEE1284_PH_FWD_IDLE) {
 		/* Event 47: Set nInit high. */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT
 				      | PARPORT_CONTROL_AUTOFD,
 				      PARPORT_CONTROL_INIT
 				      | PARPORT_CONTROL_AUTOFD);
 
 		/* Event 49: PError goes high. */
-		r = parport_wait_peripheral (port,
+		r = parport_wait_peripheral(port,
 					     PARPORT_STATUS_PAPEROUT,
 					     PARPORT_STATUS_PAPEROUT);
 		if (r) {
-			printk (KERN_DEBUG "%s: PError timeout (%d) "
+			printk(KERN_DEBUG "%s: PError timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 		}
 	}
 
 	/* Set up ECP parallel port mode.*/
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
-	parport_pc_frob_control (port,
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_STROBE |
 				 PARPORT_CONTROL_AUTOFD,
 				 0);
-	r = change_mode (port, ECR_ECP); /* ECP FIFO */
-	if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+	r = change_mode(port, ECR_ECP); /* ECP FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+								port->name);
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
 	/* Write the data to the FIFO. */
@@ -873,55 +885,54 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 	expire = jiffies + (priv->fifo_depth * (HZ * 4));
 	do {
 		/* Wait for the FIFO to empty */
-		r = change_mode (port, ECR_PS2);
-		if (r != -EBUSY) {
+		r = change_mode(port, ECR_PS2);
+		if (r != -EBUSY)
 			break;
-		}
-	} while (time_before (jiffies, expire));
+	} while (time_before(jiffies, expire));
 	if (r == -EBUSY) {
 
-		printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+		printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
 		/* Prevent further data transfer. */
-		frob_set_mode (port, ECR_TST);
+		frob_set_mode(port, ECR_TST);
 
 		/* Adjust for the contents of the FIFO. */
 		for (written -= priv->fifo_depth; ; written++) {
-			if (inb (ECONTROL (port)) & 0x2) {
+			if (inb(ECONTROL(port)) & 0x2) {
 				/* Full up. */
 				break;
 			}
-			outb (0, FIFO (port));
+			outb(0, FIFO(port));
 		}
 
 		/* Reset the FIFO and return to PS2 mode. */
-		frob_set_mode (port, ECR_PS2);
+		frob_set_mode(port, ECR_PS2);
 
 		/* Host transfer recovery. */
-		parport_pc_data_reverse (port); /* Must be in PS2 mode */
-		udelay (5);
-		parport_frob_control (port, PARPORT_CONTROL_INIT, 0);
-		r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
+		parport_pc_data_reverse(port); /* Must be in PS2 mode */
+		udelay(5);
+		parport_frob_control(port, PARPORT_CONTROL_INIT, 0);
+		r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
 		if (r)
-			printk (KERN_DEBUG "%s: PE,1 timeout (%d) "
+			printk(KERN_DEBUG "%s: PE,1 timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT,
 				      PARPORT_CONTROL_INIT);
-		r = parport_wait_peripheral (port,
+		r = parport_wait_peripheral(port,
 					     PARPORT_STATUS_PAPEROUT,
 					     PARPORT_STATUS_PAPEROUT);
-                if (r)
-                        printk (KERN_DEBUG "%s: PE,2 timeout (%d) "
+		if (r)
+			printk(KERN_DEBUG "%s: PE,2 timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 	}
 
-	r = parport_wait_peripheral (port,
-				     PARPORT_STATUS_BUSY, 
+	r = parport_wait_peripheral(port,
+				     PARPORT_STATUS_BUSY,
 				     PARPORT_STATUS_BUSY);
-	if(r)
-		printk (KERN_DEBUG
+	if (r)
+		printk(KERN_DEBUG
 			"%s: BUSY timeout (%d) in ecp_write_block_pio\n",
 			port->name, r);
 
@@ -931,7 +942,7 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 }
 
 #if 0
-static size_t parport_pc_ecp_read_block_pio (struct parport *port,
+static size_t parport_pc_ecp_read_block_pio(struct parport *port,
 					     void *buf, size_t length,
 					     int flags)
 {
@@ -944,13 +955,13 @@ static size_t parport_pc_ecp_read_block_pio (struct parport *port,
 	char *bufp = buf;
 
 	port = port->physport;
-DPRINTK (KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
-dump_parport_state ("enter fcn", port);
+DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
+dump_parport_state("enter fcn", port);
 
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_ecp_read_data (port, buf,
+		return parport_ieee1284_ecp_read_data(port, buf,
 						       length, flags);
 
 	if (port->ieee1284.mode == IEEE1284_MODE_ECPRLE) {
@@ -966,173 +977,177 @@ dump_parport_state ("enter fcn", port);
 	 * go through software emulation.  Otherwise we may have to throw
 	 * away data. */
 	if (length < fifofull)
-		return parport_ieee1284_ecp_read_data (port, buf,
+		return parport_ieee1284_ecp_read_data(port, buf,
 						       length, flags);
 
 	if (port->ieee1284.phase != IEEE1284_PH_REV_IDLE) {
 		/* change to reverse-idle phase (must be in forward-idle) */
 
 		/* Event 38: Set nAutoFd low (also make sure nStrobe is high) */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_AUTOFD
 				      | PARPORT_CONTROL_STROBE,
 				      PARPORT_CONTROL_AUTOFD);
-		parport_pc_data_reverse (port); /* Must be in PS2 mode */
-		udelay (5);
+		parport_pc_data_reverse(port); /* Must be in PS2 mode */
+		udelay(5);
 		/* Event 39: Set nInit low to initiate bus reversal */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT,
 				      0);
 		/* Event 40: Wait for  nAckReverse (PError) to go low */
-		r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
-                if (r) {
-                        printk (KERN_DEBUG "%s: PE timeout Event 40 (%d) "
+		r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
+		if (r) {
+			printk(KERN_DEBUG "%s: PE timeout Event 40 (%d) "
 				"in ecp_read_block_pio\n", port->name, r);
 			return 0;
 		}
 	}
 
 	/* Set up ECP FIFO mode.*/
-/*	parport_pc_frob_control (port,
+/*	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_STROBE |
 				 PARPORT_CONTROL_AUTOFD,
 				 PARPORT_CONTROL_AUTOFD); */
-	r = change_mode (port, ECR_ECP); /* ECP FIFO */
-	if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+	r = change_mode(port, ECR_ECP); /* ECP FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+								port->name);
 
 	port->ieee1284.phase = IEEE1284_PH_REV_DATA;
 
 	/* the first byte must be collected manually */
-dump_parport_state ("pre 43", port);
+	dump_parport_state("pre 43", port);
 	/* Event 43: Wait for nAck to go low */
-	r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, 0);
+	r = parport_wait_peripheral(port, PARPORT_STATUS_ACK, 0);
 	if (r) {
 		/* timed out while reading -- no data */
-		printk (KERN_DEBUG "PIO read timed out (initial byte)\n");
+		printk(KERN_DEBUG "PIO read timed out (initial byte)\n");
 		goto out_no_data;
 	}
 	/* read byte */
-	*bufp++ = inb (DATA (port));
+	*bufp++ = inb(DATA(port));
 	left--;
-dump_parport_state ("43-44", port);
+	dump_parport_state("43-44", port);
 	/* Event 44: nAutoFd (HostAck) goes high to acknowledge */
-	parport_pc_frob_control (port,
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_AUTOFD,
 				 0);
-dump_parport_state ("pre 45", port);
+	dump_parport_state("pre 45", port);
 	/* Event 45: Wait for nAck to go high */
-/*	r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, PARPORT_STATUS_ACK); */
-dump_parport_state ("post 45", port);
-r = 0;
+	/* r = parport_wait_peripheral(port, PARPORT_STATUS_ACK,
+						PARPORT_STATUS_ACK); */
+	dump_parport_state("post 45", port);
+	r = 0;
 	if (r) {
 		/* timed out while waiting for peripheral to respond to ack */
-		printk (KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
+		printk(KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
 
 		/* keep hold of the byte we've got already */
 		goto out_no_data;
 	}
 	/* Event 46: nAutoFd (HostAck) goes low to accept more data */
-	parport_pc_frob_control (port,
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_AUTOFD,
 				 PARPORT_CONTROL_AUTOFD);
 
 
-dump_parport_state ("rev idle", port);
+	dump_parport_state("rev idle", port);
 	/* Do the transfer. */
 	while (left > fifofull) {
 		int ret;
 		unsigned long expire = jiffies + port->cad->timeout;
-		unsigned char ecrval = inb (ECONTROL (port));
+		unsigned char ecrval = inb(ECONTROL(port));
 
-		if (need_resched() && time_before (jiffies, expire))
+		if (need_resched() && time_before(jiffies, expire))
 			/* Can't yield the port. */
-			schedule ();
+			schedule();
 
 		/* At this point, the FIFO may already be full. In
-                 * that case ECP is already holding back the
-                 * peripheral (assuming proper design) with a delayed
-                 * handshake.  Work fast to avoid a peripheral
-                 * timeout.  */
+		 * that case ECP is already holding back the
+		 * peripheral (assuming proper design) with a delayed
+		 * handshake.  Work fast to avoid a peripheral
+		 * timeout.  */
 
 		if (ecrval & 0x01) {
 			/* FIFO is empty. Wait for interrupt. */
-dump_parport_state ("FIFO empty", port);
+			dump_parport_state("FIFO empty", port);
 
 			/* Anyone else waiting for the port? */
 			if (port->waithead) {
-				printk (KERN_DEBUG "Somebody wants the port\n");
+				printk(KERN_DEBUG "Somebody wants the port\n");
 				break;
 			}
 
 			/* Clear serviceIntr */
-			ECR_WRITE (port, ecrval & ~(1<<2));
-		false_alarm:
-dump_parport_state ("waiting", port);
-			ret = parport_wait_event (port, HZ);
-DPRINTK (KERN_DEBUG "parport_wait_event returned %d\n", ret);
+			ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+			dump_parport_state("waiting", port);
+			ret = parport_wait_event(port, HZ);
+			DPRINTK(KERN_DEBUG "parport_wait_event returned %d\n",
+									ret);
 			if (ret < 0)
 				break;
 			ret = 0;
-			if (!time_before (jiffies, expire)) {
+			if (!time_before(jiffies, expire)) {
 				/* Timed out. */
-dump_parport_state ("timeout", port);
-				printk (KERN_DEBUG "PIO read timed out\n");
+				dump_parport_state("timeout", port);
+				printk(KERN_DEBUG "PIO read timed out\n");
 				break;
 			}
-			ecrval = inb (ECONTROL (port));
+			ecrval = inb(ECONTROL(port));
 			if (!(ecrval & (1<<2))) {
 				if (need_resched() &&
-				    time_before (jiffies, expire)) {
-					schedule ();
+				    time_before(jiffies, expire)) {
+					schedule();
 				}
 				goto false_alarm;
 			}
 
 			/* Depending on how the FIFO threshold was
-                         * set, how long interrupt service took, and
-                         * how fast the peripheral is, we might be
-                         * lucky and have a just filled FIFO. */
+			 * set, how long interrupt service took, and
+			 * how fast the peripheral is, we might be
+			 * lucky and have a just filled FIFO. */
 			continue;
 		}
 
 		if (ecrval & 0x02) {
 			/* FIFO is full. */
-dump_parport_state ("FIFO full", port);
-			insb (fifo, bufp, fifo_depth);
+dump_parport_state("FIFO full", port);
+			insb(fifo, bufp, fifo_depth);
 			bufp += fifo_depth;
 			left -= fifo_depth;
 			continue;
 		}
 
-DPRINTK (KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
+DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 
 		/* FIFO not filled.  We will cycle this loop for a while
-                 * and either the peripheral will fill it faster,
-                 * tripping a fast empty with insb, or we empty it. */
-		*bufp++ = inb (fifo);
+		 * and either the peripheral will fill it faster,
+		 * tripping a fast empty with insb, or we empty it. */
+		*bufp++ = inb(fifo);
 		left--;
 	}
 
 	/* scoop up anything left in the FIFO */
-	while (left && !(inb (ECONTROL (port) & 0x01))) {
-		*bufp++ = inb (fifo);
+	while (left && !(inb(ECONTROL(port) & 0x01))) {
+		*bufp++ = inb(fifo);
 		left--;
 	}
 
 	port->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-dump_parport_state ("rev idle2", port);
+dump_parport_state("rev idle2", port);
 
 out_no_data:
 
 	/* Go to forward idle mode to shut the peripheral up (event 47). */
-	parport_frob_control (port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
+	parport_frob_control(port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
 
 	/* event 49: PError goes high */
-	r = parport_wait_peripheral (port,
+	r = parport_wait_peripheral(port,
 				     PARPORT_STATUS_PAPEROUT,
 				     PARPORT_STATUS_PAPEROUT);
 	if (r) {
-		printk (KERN_DEBUG
+		printk(KERN_DEBUG
 			"%s: PE timeout FWDIDLE (%d) in ecp_read_block_pio\n",
 			port->name, r);
 	}
@@ -1141,14 +1156,14 @@ out_no_data:
 
 	/* Finish up. */
 	{
-		int lost = get_fifo_residue (port);
+		int lost = get_fifo_residue(port);
 		if (lost)
 			/* Shouldn't happen with compliant peripherals. */
-			printk (KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
+			printk(KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
 				port->name, lost);
 	}
 
-dump_parport_state ("fwd idle", port);
+dump_parport_state("fwd idle", port);
 	return length - left;
 }
 #endif  /*  0  */
@@ -1164,8 +1179,7 @@ dump_parport_state ("fwd idle", port);
 
 /* GCC is not inlining extern inline function later overwriten to non-inline,
    so we use outlined_ variants here.  */
-static const struct parport_operations parport_pc_ops =
-{
+static const struct parport_operations parport_pc_ops = {
 	.write_data	= parport_pc_write_data,
 	.read_data	= parport_pc_read_data,
 
@@ -1205,46 +1219,52 @@ static const struct parport_operations parport_pc_ops =
 /* Super-IO chipset detection, Winbond, SMSC */
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
-	int cr1,cr4,cra,cr23,cr26,cr27,i=0;
-	static const char *const modes[]={
+	int cr1, cr4, cra, cr23, cr26, cr27, i = 0;
+	static const char *const modes[] = {
 		"SPP and Bidirectional (PS/2)",
 		"EPP and SPP",
 		"ECP",
 		"ECP and EPP" };
 
-	outb(key,io);
-	outb(key,io);
-	outb(1,io);
-	cr1=inb(io+1);
-	outb(4,io);
-	cr4=inb(io+1);
-	outb(0x0a,io);
-	cra=inb(io+1);
-	outb(0x23,io);
-	cr23=inb(io+1);
-	outb(0x26,io);
-	cr26=inb(io+1);
-	outb(0x27,io);
-	cr27=inb(io+1);
-	outb(0xaa,io);
+	outb(key, io);
+	outb(key, io);
+	outb(1, io);
+	cr1 = inb(io + 1);
+	outb(4, io);
+	cr4 = inb(io + 1);
+	outb(0x0a, io);
+	cra = inb(io + 1);
+	outb(0x23, io);
+	cr23 = inb(io + 1);
+	outb(0x26, io);
+	cr26 = inb(io + 1);
+	outb(0x27, io);
+	cr27 = inb(io + 1);
+	outb(0xaa, io);
 
 	if (verbose_probing) {
-		printk (KERN_INFO "SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
+		printk(KERN_INFO
+			"SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
 			"A=0x%2x, 23=0x%02x, 26=0x%02x, 27=0x%02x\n",
-			cr1,cr4,cra,cr23,cr26,cr27);
-		
+			cr1, cr4, cra, cr23, cr26, cr27);
+
 		/* The documentation calls DMA and IRQ-Lines by letters, so
 		   the board maker can/will wire them
 		   appropriately/randomly...  G=reserved H=IDE-irq, */
-		printk (KERN_INFO "SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, "
-			"fifo threshold=%d\n", cr23*4,
-			(cr27 &0x0f) ? 'A'-1+(cr27 &0x0f): '-',
-			(cr26 &0x0f) ? 'A'-1+(cr26 &0x0f): '-', cra & 0x0f);
+		printk(KERN_INFO
+	"SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, fifo threshold=%d\n",
+				cr23 * 4,
+				(cr27 & 0x0f) ? 'A' - 1 + (cr27 & 0x0f) : '-',
+				(cr26 & 0x0f) ? 'A' - 1 + (cr26 & 0x0f) : '-',
+				cra & 0x0f);
 		printk(KERN_INFO "SMSC LPT Config: enabled=%s power=%s\n",
-		       (cr23*4 >=0x100) ?"yes":"no", (cr1 & 4) ? "yes" : "no");
-		printk(KERN_INFO "SMSC LPT Config: Port mode=%s, EPP version =%s\n",
-		       (cr1 & 0x08 ) ? "Standard mode only (SPP)" : modes[cr4 & 0x03], 
-		       (cr4 & 0x40) ? "1.7" : "1.9");
+		       (cr23 * 4 >= 0x100) ? "yes" : "no",
+		       (cr1 & 4) ? "yes" : "no");
+		printk(KERN_INFO
+			"SMSC LPT Config: Port mode=%s, EPP version =%s\n",
+				(cr1 & 0x08) ? "Standard mode only (SPP)"
+					      : modes[cr4 & 0x03],
+				(cr4 & 0x40) ? "1.7" : "1.9");
 	}
 
 	/* Heuristics !  BIOS setup for this mainboard device limits
@@ -1258,32 +1278,32 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
 		} else {
 			int d;
-			switch (cr23*4) {
-				case 0x3bc:
-					superios[i].io = 0x3bc;
-					superios[i].irq = 7;
-					break;
-				case 0x378:
-					superios[i].io = 0x378;
-					superios[i].irq = 7;
-					break;
-				case 0x278:
-					superios[i].io = 0x278;
-					superios[i].irq = 5;
+			switch (cr23 * 4) {
+			case 0x3bc:
+				superios[i].io = 0x3bc;
+				superios[i].irq = 7;
+				break;
+			case 0x378:
+				superios[i].io = 0x378;
+				superios[i].irq = 7;
+				break;
+			case 0x278:
+				superios[i].io = 0x278;
+				superios[i].irq = 5;
 			}
-			d=(cr26 &0x0f);
-			if((d==1) || (d==3)) 
-				superios[i].dma= d;
+			d = (cr26 & 0x0f);
+			if (d == 1 || d == 3)
+				superios[i].dma = d;
 			else
-				superios[i].dma= PARPORT_DMA_NONE;
+				superios[i].dma = PARPORT_DMA_NONE;
 		}
- 	}
+	}
 }
 
 
 static void __devinit show_parconfig_winbond(int io, int key)
 {
-	int cr30,cr60,cr61,cr70,cr74,crf0,i=0;
+	int cr30, cr60, cr61, cr70, cr74, crf0, i = 0;
 	static const char *const modes[] = {
 		"Standard (SPP) and Bidirectional(PS/2)", /* 0 */
 		"EPP-1.9 and SPP",
@@ -1296,40 +1316,43 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	static char *const irqtypes[] = {
 		"pulsed low, high-Z",
 		"follows nACK" };
-		
+
 	/* The registers are called compatible-PnP because the
-           register layout is modelled after ISA-PnP, the access
-           method is just another ... */
-	outb(key,io);
-	outb(key,io);
-	outb(0x07,io);   /* Register 7: Select Logical Device */
-	outb(0x01,io+1); /* LD1 is Parallel Port */
-	outb(0x30,io);
-	cr30=inb(io+1);
-	outb(0x60,io);
-	cr60=inb(io+1);
-	outb(0x61,io);
-	cr61=inb(io+1);
-	outb(0x70,io);
-	cr70=inb(io+1);
-	outb(0x74,io);
-	cr74=inb(io+1);
-	outb(0xf0,io);
-	crf0=inb(io+1);
-	outb(0xaa,io);
+	   register layout is modelled after ISA-PnP, the access
+	   method is just another ... */
+	outb(key, io);
+	outb(key, io);
+	outb(0x07, io);   /* Register 7: Select Logical Device */
+	outb(0x01, io + 1); /* LD1 is Parallel Port */
+	outb(0x30, io);
+	cr30 = inb(io + 1);
+	outb(0x60, io);
+	cr60 = inb(io + 1);
+	outb(0x61, io);
+	cr61 = inb(io + 1);
+	outb(0x70, io);
+	cr70 = inb(io + 1);
+	outb(0x74, io);
+	cr74 = inb(io + 1);
+	outb(0xf0, io);
+	crf0 = inb(io + 1);
+	outb(0xaa, io);
 
 	if (verbose_probing) {
-		printk(KERN_INFO "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x "
-		       "70=%02x 74=%02x, f0=%02x\n", cr30,cr60,cr61,cr70,cr74,crf0);
-		printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ", 
-		       (cr30 & 0x01) ? "yes":"no", cr60,cr61,cr70&0x0f );
+		printk(KERN_INFO
+    "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x 70=%02x 74=%02x, f0=%02x\n",
+					cr30, cr60, cr61, cr70, cr74, crf0);
+		printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ",
+		       (cr30 & 0x01) ? "yes" : "no", cr60, cr61, cr70 & 0x0f);
 		if ((cr74 & 0x07) > 3)
 			printk("dma=none\n");
 		else
-			printk("dma=%d\n",cr74 & 0x07);
-		printk(KERN_INFO "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
-		       irqtypes[crf0>>7], (crf0>>3)&0x0f);
-		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n", modes[crf0 & 0x07]);
+			printk("dma=%d\n", cr74 & 0x07);
+		printk(KERN_INFO
+		    "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
+					irqtypes[crf0>>7], (crf0>>3)&0x0f);
+		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n",
+					modes[crf0 & 0x07]);
 	}
 
 	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
@@ -1346,60 +1369,82 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	}
 }
 
-static void __devinit decode_winbond(int efer, int key, int devid, int devrev, int oldid)
+static void __devinit decode_winbond(int efer, int key, int devid,
+							int devrev, int oldid)
 {
 	const char *type = "unknown";
-	int id,progif=2;
+	int id, progif = 2;
 
 	if (devid == devrev)
 		/* simple heuristics, we happened to read some
-                   non-winbond register */
+		   non-winbond register */
 		return;
 
-	id=(devid<<8) | devrev;
+	id = (devid << 8) | devrev;
 
 	/* Values are from public data sheets pdf files, I can just
-           confirm 83977TF is correct :-) */
-	if      (id == 0x9771) type="83977F/AF";
-	else if (id == 0x9773) type="83977TF / SMSC 97w33x/97w34x";
-	else if (id == 0x9774) type="83977ATF";
-	else if ((id & ~0x0f) == 0x5270) type="83977CTF / SMSC 97w36x";
-	else if ((id & ~0x0f) == 0x52f0) type="83977EF / SMSC 97w35x";
-	else if ((id & ~0x0f) == 0x5210) type="83627";
-	else if ((id & ~0x0f) == 0x6010) type="83697HF";
-	else if ((oldid &0x0f ) == 0x0a) { type="83877F"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0b) { type="83877AF"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0c) { type="83877TF"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0d) { type="83877ATF"; progif=1;}
-	else progif=0;
+	   confirm 83977TF is correct :-) */
+	if (id == 0x9771)
+		type = "83977F/AF";
+	else if (id == 0x9773)
+		type = "83977TF / SMSC 97w33x/97w34x";
+	else if (id == 0x9774)
+		type = "83977ATF";
+	else if ((id & ~0x0f) == 0x5270)
+		type = "83977CTF / SMSC 97w36x";
+	else if ((id & ~0x0f) == 0x52f0)
+		type = "83977EF / SMSC 97w35x";
+	else if ((id & ~0x0f) == 0x5210)
+		type = "83627";
+	else if ((id & ~0x0f) == 0x6010)
+		type = "83697HF";
+	else if ((oldid & 0x0f) == 0x0a) {
+		type = "83877F";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0b) {
+		type = "83877AF";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0c) {
+		type = "83877TF";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0d) {
+		type = "83877ATF";
+		progif = 1;
+	} else
+		progif = 0;
 
 	if (verbose_probing)
 		printk(KERN_INFO "Winbond chip at EFER=0x%x key=0x%02x "
-		       "devid=%02x devrev=%02x oldid=%02x type=%s\n", 
+		       "devid=%02x devrev=%02x oldid=%02x type=%s\n",
 		       efer, key, devid, devrev, oldid, type);
 
 	if (progif == 2)
-		show_parconfig_winbond(efer,key);
+		show_parconfig_winbond(efer, key);
 }
 
 static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
 {
-        const char *type = "unknown";
+	const char *type = "unknown";
 	void (*func)(int io, int key);
-        int id;
+	int id;
 
-        if (devid == devrev)
+	if (devid == devrev)
 		/* simple heuristics, we happened to read some
-                   non-smsc register */
+		   non-smsc register */
 		return;
 
-	func=NULL;
-        id=(devid<<8) | devrev;
+	func = NULL;
+	id = (devid << 8) | devrev;
 
-	if	(id==0x0302) {type="37c669"; func=show_parconfig_smsc37c669;}
-	else if	(id==0x6582) type="37c665IR";
-	else if	(devid==0x65) type="37c665GT";
-	else if	(devid==0x66) type="37c666GT";
+	if (id == 0x0302) {
+		type = "37c669";
+		func = show_parconfig_smsc37c669;
+	} else if (id == 0x6582)
+		type = "37c665IR";
+	else if	(devid == 0x65)
+		type = "37c665GT";
+	else if	(devid == 0x66)
+		type = "37c666GT";
 
 	if (verbose_probing)
 		printk(KERN_INFO "SMSC chip at EFER=0x%x "
@@ -1407,138 +1452,138 @@ static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
 		       efer, key, devid, devrev, type);
 
 	if (func)
-		func(efer,key);
+		func(efer, key);
 }
 
 
 static void __devinit winbond_check(int io, int key)
 {
-	int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+	int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without key */
-	outb(0x20,io);
-	x_devid=inb(io+1);
-	outb(0x21,io);
-	x_devrev=inb(io+1);
-	outb(0x09,io);
-	x_oldid=inb(io+1);
-
-	outb(key,io);
-	outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-	outb(0x20,io);    /* Write EFIR, extended function index register */
-	devid=inb(io+1);  /* Read EFDR, extended function data register */
-	outb(0x21,io);
-	devrev=inb(io+1);
-	outb(0x09,io);
-	oldid=inb(io+1);
-	outb(0xaa,io);    /* Magic Seal */
+	outb(0x20, io);
+	x_devid = inb(io + 1);
+	outb(0x21, io);
+	x_devrev = inb(io + 1);
+	outb(0x09, io);
+	x_oldid = inb(io + 1);
+
+	outb(key, io);
+	outb(key, io);     /* Write Magic Sequence to EFER, extended
+			      funtion enable register */
+	outb(0x20, io);    /* Write EFIR, extended function index register */
+	devid = inb(io + 1);  /* Read EFDR, extended function data register */
+	outb(0x21, io);
+	devrev = inb(io + 1);
+	outb(0x09, io);
+	oldid = inb(io + 1);
+	outb(0xaa, io);    /* Magic Seal */
 
 	if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
 		goto out; /* protection against false positives */
 
-	decode_winbond(io,key,devid,devrev,oldid);
+	decode_winbond(io, key, devid, devrev, oldid);
 out:
 	release_region(io, 3);
 }
 
-static void __devinit winbond_check2(int io,int key)
+static void __devinit winbond_check2(int io, int key)
 {
-        int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+	int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without the key */
-	outb(0x20,io+2);
-	x_devid=inb(io+2);
-	outb(0x21,io+1);
-	x_devrev=inb(io+2);
-	outb(0x09,io+1);
-	x_oldid=inb(io+2);
-
-        outb(key,io);     /* Write Magic Byte to EFER, extended
-                             funtion enable register */
-        outb(0x20,io+2);  /* Write EFIR, extended function index register */
-        devid=inb(io+2);  /* Read EFDR, extended function data register */
-        outb(0x21,io+1);
-        devrev=inb(io+2);
-        outb(0x09,io+1);
-        oldid=inb(io+2);
-        outb(0xaa,io);    /* Magic Seal */
-
-	if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
+	outb(0x20, io + 2);
+	x_devid = inb(io + 2);
+	outb(0x21, io + 1);
+	x_devrev = inb(io + 2);
+	outb(0x09, io + 1);
+	x_oldid = inb(io + 2);
+
+	outb(key, io);     /* Write Magic Byte to EFER, extended
+			      funtion enable register */
+	outb(0x20, io + 2);  /* Write EFIR, extended function index register */
+	devid = inb(io + 2);  /* Read EFDR, extended function data register */
+	outb(0x21, io + 1);
+	devrev = inb(io + 2);
+	outb(0x09, io + 1);
+	oldid = inb(io + 2);
+	outb(0xaa, io);    /* Magic Seal */
+
+	if (x_devid == devid && x_devrev == devrev && x_oldid == oldid)
 		goto out; /* protection against false positives */
 
-	decode_winbond(io,key,devid,devrev,oldid);
+	decode_winbond(io, key, devid, devrev, oldid);
 out:
 	release_region(io, 3);
 }
 
 static void __devinit smsc_check(int io, int key)
 {
-        int id,rev,oldid,oldrev,x_id,x_rev,x_oldid,x_oldrev;
+	int id, rev, oldid, oldrev, x_id, x_rev, x_oldid, x_oldrev;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without the key */
-	outb(0x0d,io);
-	x_oldid=inb(io+1);
-	outb(0x0e,io);
-	x_oldrev=inb(io+1);
-	outb(0x20,io);
-	x_id=inb(io+1);
-	outb(0x21,io);
-	x_rev=inb(io+1);
-
-        outb(key,io);
-        outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-        outb(0x0d,io);    /* Write EFIR, extended function index register */
-        oldid=inb(io+1);  /* Read EFDR, extended function data register */
-        outb(0x0e,io);
-        oldrev=inb(io+1);
-	outb(0x20,io);
-	id=inb(io+1);
-	outb(0x21,io);
-	rev=inb(io+1);
-        outb(0xaa,io);    /* Magic Seal */
-
-	if ((x_id == id) && (x_oldrev == oldrev) &&
-	    (x_oldid == oldid) && (x_rev == rev))
+	outb(0x0d, io);
+	x_oldid = inb(io + 1);
+	outb(0x0e, io);
+	x_oldrev = inb(io + 1);
+	outb(0x20, io);
+	x_id = inb(io + 1);
+	outb(0x21, io);
+	x_rev = inb(io + 1);
+
+	outb(key, io);
+	outb(key, io);     /* Write Magic Sequence to EFER, extended
+			      funtion enable register */
+	outb(0x0d, io);    /* Write EFIR, extended function index register */
+	oldid = inb(io + 1);  /* Read EFDR, extended function data register */
+	outb(0x0e, io);
+	oldrev = inb(io + 1);
+	outb(0x20, io);
+	id = inb(io + 1);
+	outb(0x21, io);
+	rev = inb(io + 1);
+	outb(0xaa, io);    /* Magic Seal */
+
+	if (x_id == id && x_oldrev == oldrev &&
+	    x_oldid == oldid && x_rev == rev)
 		goto out; /* protection against false positives */
 
-        decode_smsc(io,key,oldid,oldrev);
+	decode_smsc(io, key, oldid, oldrev);
 out:
 	release_region(io, 3);
 }
 
 
-static void __devinit detect_and_report_winbond (void)
-{ 
+static void __devinit detect_and_report_winbond(void)
+{
 	if (verbose_probing)
 		printk(KERN_DEBUG "Winbond Super-IO detection, now testing ports 3F0,370,250,4E,2E ...\n");
-	winbond_check(0x3f0,0x87);
-	winbond_check(0x370,0x87);
-	winbond_check(0x2e ,0x87);
-	winbond_check(0x4e ,0x87);
-	winbond_check(0x3f0,0x86);
-	winbond_check2(0x250,0x88); 
-	winbond_check2(0x250,0x89);
+	winbond_check(0x3f0, 0x87);
+	winbond_check(0x370, 0x87);
+	winbond_check(0x2e , 0x87);
+	winbond_check(0x4e , 0x87);
+	winbond_check(0x3f0, 0x86);
+	winbond_check2(0x250, 0x88);
+	winbond_check2(0x250, 0x89);
 }
 
-static void __devinit detect_and_report_smsc (void)
+static void __devinit detect_and_report_smsc(void)
 {
 	if (verbose_probing)
 		printk(KERN_DEBUG "SMSC Super-IO detection, now testing Ports 2F0, 370 ...\n");
-	smsc_check(0x3f0,0x55);
-	smsc_check(0x370,0x55);
-	smsc_check(0x3f0,0x44);
-	smsc_check(0x370,0x44);
+	smsc_check(0x3f0, 0x55);
+	smsc_check(0x370, 0x55);
+	smsc_check(0x3f0, 0x44);
+	smsc_check(0x370, 0x44);
 }
 
 static void __devinit detect_and_report_it87(void)
@@ -1573,7 +1618,7 @@ static void __devinit detect_and_report_it87(void)
 }
 #endif /* CONFIG_PARPORT_PC_SUPERIO */
 
-static int get_superio_dma (struct parport *p)
+static int get_superio_dma(struct parport *p)
 {
 	int i = 0;
 
@@ -1584,15 +1629,15 @@ static int get_superio_dma (struct parport *p)
 	return PARPORT_DMA_NONE;
 }
 
-static int get_superio_irq (struct parport *p)
+static int get_superio_irq(struct parport *p)
 {
 	int i = 0;
 
-        while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-                i++;
-        if (i != NR_SUPERIOS)
-                return superios[i].irq;
-        return PARPORT_IRQ_NONE;
+	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
+		i++;
+	if (i != NR_SUPERIOS)
+		return superios[i].irq;
+	return PARPORT_IRQ_NONE;
 }
 
 
@@ -1600,9 +1645,9 @@ static int get_superio_irq (struct parport *p)
 
 /*
  * Checks for port existence, all ports support SPP MODE
- * Returns: 
+ * Returns:
  *         0           :  No parallel port at this address
- *  PARPORT_MODE_PCSPP :  SPP port detected 
+ *  PARPORT_MODE_PCSPP :  SPP port detected
  *                        (if the user specified an ioport himself,
  *                         this shall always be the case!)
  *
@@ -1612,7 +1657,7 @@ static int parport_SPP_supported(struct parport *pb)
 	unsigned char r, w;
 
 	/*
-	 * first clear an eventually pending EPP timeout 
+	 * first clear an eventually pending EPP timeout
 	 * I (sailer@ife.ee.ethz.ch) have an SMSC chipset
 	 * that does not even respond to SPP cycles if an EPP
 	 * timeout is pending
@@ -1621,19 +1666,19 @@ static int parport_SPP_supported(struct parport *pb)
 
 	/* Do a simple read-write test to make sure the port exists. */
 	w = 0xc;
-	outb (w, CONTROL (pb));
+	outb(w, CONTROL(pb));
 
 	/* Is there a control register that we can read from?  Some
 	 * ports don't allow reads, so read_control just returns a
 	 * software copy. Some ports _do_ allow reads, so bypass the
 	 * software copy here.  In addition, some bits aren't
 	 * writable. */
-	r = inb (CONTROL (pb));
+	r = inb(CONTROL(pb));
 	if ((r & 0xf) == w) {
 		w = 0xe;
-		outb (w, CONTROL (pb));
-		r = inb (CONTROL (pb));
-		outb (0xc, CONTROL (pb));
+		outb(w, CONTROL(pb));
+		r = inb(CONTROL(pb));
+		outb(0xc, CONTROL(pb));
 		if ((r & 0xf) == w)
 			return PARPORT_MODE_PCSPP;
 	}
@@ -1641,18 +1686,18 @@ static int parport_SPP_supported(struct parport *pb)
 	if (user_specified)
 		/* That didn't work, but the user thinks there's a
 		 * port here. */
-		printk (KERN_INFO "parport 0x%lx (WARNING): CTR: "
+		printk(KERN_INFO "parport 0x%lx (WARNING): CTR: "
 			"wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
 
 	/* Try the data register.  The data lines aren't tri-stated at
 	 * this stage, so we expect back what we wrote. */
 	w = 0xaa;
-	parport_pc_write_data (pb, w);
-	r = parport_pc_read_data (pb);
+	parport_pc_write_data(pb, w);
+	r = parport_pc_read_data(pb);
 	if (r == w) {
 		w = 0x55;
-		parport_pc_write_data (pb, w);
-		r = parport_pc_read_data (pb);
+		parport_pc_write_data(pb, w);
+		r = parport_pc_read_data(pb);
 		if (r == w)
 			return PARPORT_MODE_PCSPP;
 	}
@@ -1660,9 +1705,9 @@ static int parport_SPP_supported(struct parport *pb)
 	if (user_specified) {
 		/* Didn't work, but the user is convinced this is the
 		 * place. */
-		printk (KERN_INFO "parport 0x%lx (WARNING): DATA: "
+		printk(KERN_INFO "parport 0x%lx (WARNING): DATA: "
 			"wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
-		printk (KERN_INFO "parport 0x%lx: You gave this address, "
+		printk(KERN_INFO "parport 0x%lx: You gave this address, "
 			"but there is probably no parallel port there!\n",
 			pb->base);
 	}
@@ -1693,33 +1738,33 @@ static int parport_ECR_present(struct parport *pb)
 	struct parport_pc_private *priv = pb->private_data;
 	unsigned char r = 0xc;
 
-	outb (r, CONTROL (pb));
-	if ((inb (ECONTROL (pb)) & 0x3) == (r & 0x3)) {
-		outb (r ^ 0x2, CONTROL (pb)); /* Toggle bit 1 */
+	outb(r, CONTROL(pb));
+	if ((inb(ECONTROL(pb)) & 0x3) == (r & 0x3)) {
+		outb(r ^ 0x2, CONTROL(pb)); /* Toggle bit 1 */
 
-		r = inb (CONTROL (pb));
-		if ((inb (ECONTROL (pb)) & 0x2) == (r & 0x2))
+		r = inb(CONTROL(pb));
+		if ((inb(ECONTROL(pb)) & 0x2) == (r & 0x2))
 			goto no_reg; /* Sure that no ECR register exists */
 	}
-	
-	if ((inb (ECONTROL (pb)) & 0x3 ) != 0x1)
+
+	if ((inb(ECONTROL(pb)) & 0x3) != 0x1)
 		goto no_reg;
 
-	ECR_WRITE (pb, 0x34);
-	if (inb (ECONTROL (pb)) != 0x35)
+	ECR_WRITE(pb, 0x34);
+	if (inb(ECONTROL(pb)) != 0x35)
 		goto no_reg;
 
 	priv->ecr = 1;
-	outb (0xc, CONTROL (pb));
-	
+	outb(0xc, CONTROL(pb));
+
 	/* Go to mode 000 */
-	frob_set_mode (pb, ECR_SPP);
+	frob_set_mode(pb, ECR_SPP);
 
 	return 1;
 
  no_reg:
-	outb (0xc, CONTROL (pb));
-	return 0; 
+	outb(0xc, CONTROL(pb));
+	return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
@@ -1729,7 +1774,7 @@ static int parport_ECR_present(struct parport *pb)
  * allows us to read data from the data lines.  In theory we would get back
  * 0xff but any peripheral attached to the port may drag some or all of the
  * lines down to zero.  So if we get back anything that isn't the contents
- * of the data register we deem PS/2 support to be present. 
+ * of the data register we deem PS/2 support to be present.
  *
  * Some SPP ports have "half PS/2" ability - you can't turn off the line
  * drivers, but an external peripheral with sufficiently beefy drivers of
@@ -1737,26 +1782,28 @@ static int parport_ECR_present(struct parport *pb)
  * where they can then be read back as normal.  Ports with this property
  * and the right type of device attached are likely to fail the SPP test,
  * (as they will appear to have stuck bits) and so the fact that they might
- * be misdetected here is rather academic. 
+ * be misdetected here is rather academic.
  */
 
 static int parport_PS2_supported(struct parport *pb)
 {
 	int ok = 0;
-  
+
 	clear_epp_timeout(pb);
 
 	/* try to tri-state the buffer */
-	parport_pc_data_reverse (pb);
-	
+	parport_pc_data_reverse(pb);
+
 	parport_pc_write_data(pb, 0x55);
-	if (parport_pc_read_data(pb) != 0x55) ok++;
+	if (parport_pc_read_data(pb) != 0x55)
+		ok++;
 
 	parport_pc_write_data(pb, 0xaa);
-	if (parport_pc_read_data(pb) != 0xaa) ok++;
+	if (parport_pc_read_data(pb) != 0xaa)
+		ok++;
 
 	/* cancel input mode */
-	parport_pc_data_forward (pb);
+	parport_pc_data_forward(pb);
 
 	if (ok) {
 		pb->modes |= PARPORT_MODE_TRISTATE;
@@ -1775,68 +1822,68 @@ static int parport_ECP_supported(struct parport *pb)
 	int config, configb;
 	int pword;
 	struct parport_pc_private *priv = pb->private_data;
-	/* Translate ECP intrLine to ISA irq value */	
-	static const int intrline[]= { 0, 7, 9, 10, 11, 14, 15, 5 }; 
+	/* Translate ECP intrLine to ISA irq value */
+	static const int intrline[] = { 0, 7, 9, 10, 11, 14, 15, 5 };
 
 	/* If there is no ECR, we have no hope of supporting ECP. */
 	if (!priv->ecr)
 		return 0;
 
 	/* Find out FIFO depth */
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, ECR_TST << 5); /* TEST FIFO */
-	for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02); i++)
-		outb (0xaa, FIFO (pb));
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, ECR_TST << 5); /* TEST FIFO */
+	for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02); i++)
+		outb(0xaa, FIFO(pb));
 
 	/*
 	 * Using LGS chipset it uses ECR register, but
 	 * it doesn't support ECP or FIFO MODE
 	 */
 	if (i == 1024) {
-		ECR_WRITE (pb, ECR_SPP << 5);
+		ECR_WRITE(pb, ECR_SPP << 5);
 		return 0;
 	}
 
 	priv->fifo_depth = i;
 	if (verbose_probing)
-		printk (KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
+		printk(KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
 
 	/* Find out writeIntrThreshold */
-	frob_econtrol (pb, 1<<2, 1<<2);
-	frob_econtrol (pb, 1<<2, 0);
+	frob_econtrol(pb, 1<<2, 1<<2);
+	frob_econtrol(pb, 1<<2, 0);
 	for (i = 1; i <= priv->fifo_depth; i++) {
-		inb (FIFO (pb));
-		udelay (50);
-		if (inb (ECONTROL (pb)) & (1<<2))
+		inb(FIFO(pb));
+		udelay(50);
+		if (inb(ECONTROL(pb)) & (1<<2))
 			break;
 	}
 
 	if (i <= priv->fifo_depth) {
 		if (verbose_probing)
-			printk (KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
+			printk(KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
 				pb->base, i);
 	} else
 		/* Number of bytes we know we can write if we get an
-                   interrupt. */
+		   interrupt. */
 		i = 0;
 
 	priv->writeIntrThreshold = i;
 
 	/* Find out readIntrThreshold */
-	frob_set_mode (pb, ECR_PS2); /* Reset FIFO and enable PS2 */
-	parport_pc_data_reverse (pb); /* Must be in PS2 mode */
-	frob_set_mode (pb, ECR_TST); /* Test FIFO */
-	frob_econtrol (pb, 1<<2, 1<<2);
-	frob_econtrol (pb, 1<<2, 0);
+	frob_set_mode(pb, ECR_PS2); /* Reset FIFO and enable PS2 */
+	parport_pc_data_reverse(pb); /* Must be in PS2 mode */
+	frob_set_mode(pb, ECR_TST); /* Test FIFO */
+	frob_econtrol(pb, 1<<2, 1<<2);
+	frob_econtrol(pb, 1<<2, 0);
 	for (i = 1; i <= priv->fifo_depth; i++) {
-		outb (0xaa, FIFO (pb));
-		if (inb (ECONTROL (pb)) & (1<<2))
+		outb(0xaa, FIFO(pb));
+		if (inb(ECONTROL(pb)) & (1<<2))
 			break;
 	}
 
 	if (i <= priv->fifo_depth) {
 		if (verbose_probing)
-			printk (KERN_INFO "0x%lx: readIntrThreshold is %d\n",
+			printk(KERN_INFO "0x%lx: readIntrThreshold is %d\n",
 				pb->base, i);
 	} else
 		/* Number of bytes we can read if we get an interrupt. */
@@ -1844,23 +1891,23 @@ static int parport_ECP_supported(struct parport *pb)
 
 	priv->readIntrThreshold = i;
 
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, 0xf4); /* Configuration mode */
-	config = inb (CONFIGA (pb));
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, 0xf4); /* Configuration mode */
+	config = inb(CONFIGA(pb));
 	pword = (config >> 4) & 0x7;
 	switch (pword) {
 	case 0:
 		pword = 2;
-		printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+		printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
 			pb->base);
 		break;
 	case 2:
 		pword = 4;
-		printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+		printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
 			pb->base);
 		break;
 	default:
-		printk (KERN_WARNING "0x%lx: Unknown implementation ID\n",
+		printk(KERN_WARNING "0x%lx: Unknown implementation ID\n",
 			pb->base);
 		/* Assume 1 */
 	case 1:
@@ -1869,28 +1916,29 @@ static int parport_ECP_supported(struct parport *pb)
 	priv->pword = pword;
 
 	if (verbose_probing) {
-		printk (KERN_DEBUG "0x%lx: PWord is %d bits\n", pb->base, 8 * pword);
-		
-		printk (KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
+		printk(KERN_DEBUG "0x%lx: PWord is %d bits\n",
+			pb->base, 8 * pword);
+
+		printk(KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
 			config & 0x80 ? "Level" : "Pulses");
 
-		configb = inb (CONFIGB (pb));
-		printk (KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
+		configb = inb(CONFIGB(pb));
+		printk(KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
 			pb->base, config, configb);
-		printk (KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
-		if ((configb >>3) & 0x07)
-			printk("%d",intrline[(configb >>3) & 0x07]);
+		printk(KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
+		if ((configb >> 3) & 0x07)
+			printk("%d", intrline[(configb >> 3) & 0x07]);
 		else
 			printk("<none or set by other means>");
-		printk (" dma=");
-		if( (configb & 0x03 ) == 0x00)
+		printk(" dma=");
+		if ((configb & 0x03) == 0x00)
 			printk("<none or set by other means>\n");
 		else
-			printk("%d\n",configb & 0x07);
+			printk("%d\n", configb & 0x07);
 	}
 
 	/* Go back to mode 000 */
-	frob_set_mode (pb, ECR_SPP);
+	frob_set_mode(pb, ECR_SPP);
 
 	return 1;
 }
@@ -1905,10 +1953,10 @@ static int parport_ECPPS2_supported(struct parport *pb)
 	if (!priv->ecr)
 		return 0;
 
-	oecr = inb (ECONTROL (pb));
-	ECR_WRITE (pb, ECR_PS2 << 5);
+	oecr = inb(ECONTROL(pb));
+	ECR_WRITE(pb, ECR_PS2 << 5);
 	result = parport_PS2_supported(pb);
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 	return result;
 }
 
@@ -1932,16 +1980,15 @@ static int parport_EPP_supported(struct parport *pb)
 	 */
 
 	/* If EPP timeout bit clear then EPP available */
-	if (!clear_epp_timeout(pb)) {
+	if (!clear_epp_timeout(pb))
 		return 0;  /* No way to clear timeout */
-	}
 
 	/* Check for Intel bug. */
 	if (priv->ecr) {
 		unsigned char i;
 		for (i = 0x00; i < 0x80; i += 0x20) {
-			ECR_WRITE (pb, i);
-			if (clear_epp_timeout (pb)) {
+			ECR_WRITE(pb, i);
+			if (clear_epp_timeout(pb)) {
 				/* Phony EPP in ECP. */
 				return 0;
 			}
@@ -1965,17 +2012,16 @@ static int parport_ECPEPP_supported(struct parport *pb)
 	int result;
 	unsigned char oecr;
 
-	if (!priv->ecr) {
+	if (!priv->ecr)
 		return 0;
-	}
 
-	oecr = inb (ECONTROL (pb));
+	oecr = inb(ECONTROL(pb));
 	/* Search for SMC style EPP+ECP mode */
-	ECR_WRITE (pb, 0x80);
-	outb (0x04, CONTROL (pb));
+	ECR_WRITE(pb, 0x80);
+	outb(0x04, CONTROL(pb));
 	result = parport_EPP_supported(pb);
 
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 
 	if (result) {
 		/* Set up access functions to use ECP+EPP hardware. */
@@ -1993,11 +2039,25 @@ static int parport_ECPEPP_supported(struct parport *pb)
 /* Don't bother probing for modes we know we won't use. */
 static int __devinit parport_PS2_supported(struct parport *pb) { return 0; }
 #ifdef CONFIG_PARPORT_PC_FIFO
-static int parport_ECP_supported(struct parport *pb) { return 0; }
+static int parport_ECP_supported(struct parport *pb)
+{
+	return 0;
+}
 #endif
-static int __devinit parport_EPP_supported(struct parport *pb) { return 0; }
-static int __devinit parport_ECPEPP_supported(struct parport *pb){return 0;}
-static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
+static int __devinit parport_EPP_supported(struct parport *pb)
+{
+	return 0;
+}
+
+static int __devinit parport_ECPEPP_supported(struct parport *pb)
+{
+	return 0;
+}
+
+static int __devinit parport_ECPPS2_supported(struct parport *pb)
+{
+	return 0;
+}
 
 #endif /* No IEEE 1284 support */
 
@@ -2007,17 +2067,17 @@ static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
 static int programmable_irq_support(struct parport *pb)
 {
 	int irq, intrLine;
-	unsigned char oecr = inb (ECONTROL (pb));
+	unsigned char oecr = inb(ECONTROL(pb));
 	static const int lookup[8] = {
 		PARPORT_IRQ_NONE, 7, 9, 10, 11, 14, 15, 5
 	};
 
-	ECR_WRITE (pb, ECR_CNF << 5); /* Configuration MODE */
+	ECR_WRITE(pb, ECR_CNF << 5); /* Configuration MODE */
 
-	intrLine = (inb (CONFIGB (pb)) >> 3) & 0x07;
+	intrLine = (inb(CONFIGB(pb)) >> 3) & 0x07;
 	irq = lookup[intrLine];
 
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 	return irq;
 }
 
@@ -2027,17 +2087,17 @@ static int irq_probe_ECP(struct parport *pb)
 	unsigned long irqs;
 
 	irqs = probe_irq_on();
-		
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, (ECR_TST << 5) | 0x04);
-	ECR_WRITE (pb, ECR_TST << 5);
+
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, (ECR_TST << 5) | 0x04);
+	ECR_WRITE(pb, ECR_TST << 5);
 
 	/* If Full FIFO sure that writeIntrThreshold is generated */
-	for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02) ; i++) 
-		outb (0xaa, FIFO (pb));
-		
+	for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02) ; i++)
+		outb(0xaa, FIFO(pb));
+
 	pb->irq = probe_irq_off(irqs);
-	ECR_WRITE (pb, ECR_SPP << 5);
+	ECR_WRITE(pb, ECR_SPP << 5);
 
 	if (pb->irq <= 0)
 		pb->irq = PARPORT_IRQ_NONE;
@@ -2047,7 +2107,7 @@ static int irq_probe_ECP(struct parport *pb)
 
 /*
  * This detection seems that only works in National Semiconductors
- * This doesn't work in SMC, LGS, and Winbond 
+ * This doesn't work in SMC, LGS, and Winbond
  */
 static int irq_probe_EPP(struct parport *pb)
 {
@@ -2058,16 +2118,16 @@ static int irq_probe_EPP(struct parport *pb)
 	unsigned char oecr;
 
 	if (pb->modes & PARPORT_MODE_PCECR)
-		oecr = inb (ECONTROL (pb));
+		oecr = inb(ECONTROL(pb));
 
 	irqs = probe_irq_on();
 
 	if (pb->modes & PARPORT_MODE_PCECR)
-		frob_econtrol (pb, 0x10, 0x10);
-	
+		frob_econtrol(pb, 0x10, 0x10);
+
 	clear_epp_timeout(pb);
-	parport_pc_frob_control (pb, 0x20, 0x20);
-	parport_pc_frob_control (pb, 0x10, 0x10);
+	parport_pc_frob_control(pb, 0x20, 0x20);
+	parport_pc_frob_control(pb, 0x10, 0x10);
 	clear_epp_timeout(pb);
 
 	/* Device isn't expecting an EPP read
@@ -2076,9 +2136,9 @@ static int irq_probe_EPP(struct parport *pb)
 	parport_pc_read_epp(pb);
 	udelay(20);
 
-	pb->irq = probe_irq_off (irqs);
+	pb->irq = probe_irq_off(irqs);
 	if (pb->modes & PARPORT_MODE_PCECR)
-		ECR_WRITE (pb, oecr);
+		ECR_WRITE(pb, oecr);
 	parport_pc_write_control(pb, 0xc);
 
 	if (pb->irq <= 0)
@@ -2135,28 +2195,28 @@ static int parport_irq_probe(struct parport *pb)
 /* --- DMA detection -------------------------------------- */
 
 /* Only if chipset conforms to ECP ISA Interface Standard */
-static int programmable_dma_support (struct parport *p)
+static int programmable_dma_support(struct parport *p)
 {
-	unsigned char oecr = inb (ECONTROL (p));
+	unsigned char oecr = inb(ECONTROL(p));
 	int dma;
 
-	frob_set_mode (p, ECR_CNF);
-	
-	dma = inb (CONFIGB(p)) & 0x07;
+	frob_set_mode(p, ECR_CNF);
+
+	dma = inb(CONFIGB(p)) & 0x07;
 	/* 000: Indicates jumpered 8-bit DMA if read-only.
 	   100: Indicates jumpered 16-bit DMA if read-only. */
 	if ((dma & 0x03) == 0)
 		dma = PARPORT_DMA_NONE;
 
-	ECR_WRITE (p, oecr);
+	ECR_WRITE(p, oecr);
 	return dma;
 }
 
-static int parport_dma_probe (struct parport *p)
+static int parport_dma_probe(struct parport *p)
 {
 	const struct parport_pc_private *priv = p->private_data;
-	if (priv->ecr)
-		p->dma = programmable_dma_support(p); /* ask ECP chipset first */
+	if (priv->ecr)		/* ask ECP chipset first */
+		p->dma = programmable_dma_support(p);
 	if (p->dma == PARPORT_DMA_NONE) {
 		/* ask known Super-IO chips proper, although these
 		   claim ECP compatible, some don't report their DMA
@@ -2214,7 +2274,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	if (!base_res)
 		goto out4;
 
-	memcpy(ops, &parport_pc_ops, sizeof (struct parport_operations));
+	memcpy(ops, &parport_pc_ops, sizeof(struct parport_operations));
 	priv->ctr = 0xc;
 	priv->ctr_writable = ~0x10;
 	priv->ecr = 0;
@@ -2241,7 +2301,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 			if (!parport_EPP_supported(p))
 				parport_ECPEPP_supported(p);
 	}
-	if (!parport_SPP_supported (p))
+	if (!parport_SPP_supported(p))
 		/* No port. */
 		goto out5;
 	if (priv->ecr)
@@ -2249,7 +2309,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	else
 		parport_PS2_supported(p);
 
-	p->size = (p->modes & PARPORT_MODE_EPP)?8:3;
+	p->size = (p->modes & PARPORT_MODE_EPP) ? 8 : 3;
 
 	printk(KERN_INFO "%s: PC-style at 0x%lx", p->name, p->base);
 	if (p->base_hi && priv->ecr)
@@ -2273,7 +2333,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		}
 	}
 	if (p->dma == PARPORT_DMA_AUTO) /* To use DMA, giving the irq
-                                           is mandatory (see above) */
+					   is mandatory (see above) */
 		p->dma = PARPORT_DMA_NONE;
 
 #ifdef CONFIG_PARPORT_PC_FIFO
@@ -2290,16 +2350,23 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		if (p->dma != PARPORT_DMA_NONE) {
 			printk(", dma %d", p->dma);
 			p->modes |= PARPORT_MODE_DMA;
-		}
-		else printk(", using FIFO");
-	}
-	else
+		} else
+			printk(", using FIFO");
+	} else
 		/* We can't use the DMA channel after all. */
 		p->dma = PARPORT_DMA_NONE;
 #endif /* Allowed to use FIFO/DMA */
 
 	printk(" [");
-#define printmode(x) {if(p->modes&PARPORT_MODE_##x){printk("%s%s",f?",":"",#x);f++;}}
+
+#define printmode(x) \
+	{\
+		if (p->modes & PARPORT_MODE_##x) {\
+			printk("%s%s", f ? "," : "", #x);\
+			f++;\
+		} \
+	}
+
 	{
 		int f = 0;
 		printmode(PCSPP);
@@ -2311,10 +2378,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	}
 #undef printmode
 #ifndef CONFIG_PARPORT_1284
-	printk ("(,...)");
+	printk("(,...)");
 #endif /* CONFIG_PARPORT_1284 */
 	printk("]\n");
-	if (probedirq != PARPORT_IRQ_NONE) 
+	if (probedirq != PARPORT_IRQ_NONE)
 		printk(KERN_INFO "%s: irq %d detected\n", p->name, probedirq);
 
 	/* If No ECP release the ports grabbed above. */
@@ -2330,7 +2397,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	if (p->irq != PARPORT_IRQ_NONE) {
 		if (request_irq(p->irq, parport_irq_handler,
 				 irqflags, p->name, p)) {
-			printk (KERN_WARNING "%s: irq %d in use, "
+			printk(KERN_WARNING "%s: irq %d in use, "
 				"resorting to polled operation\n",
 				p->name, p->irq);
 			p->irq = PARPORT_IRQ_NONE;
@@ -2340,8 +2407,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 #ifdef CONFIG_PARPORT_PC_FIFO
 #ifdef HAS_DMA
 		if (p->dma != PARPORT_DMA_NONE) {
-			if (request_dma (p->dma, p->name)) {
-				printk (KERN_WARNING "%s: dma %d in use, "
+			if (request_dma(p->dma, p->name)) {
+				printk(KERN_WARNING "%s: dma %d in use, "
 					"resorting to PIO operation\n",
 					p->name, p->dma);
 				p->dma = PARPORT_DMA_NONE;
@@ -2351,8 +2418,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 						       PAGE_SIZE,
 						       &priv->dma_handle,
 						       GFP_KERNEL);
-				if (! priv->dma_buf) {
-					printk (KERN_WARNING "%s: "
+				if (!priv->dma_buf) {
+					printk(KERN_WARNING "%s: "
 						"cannot get buffer for DMA, "
 						"resorting to PIO operation\n",
 						p->name);
@@ -2371,10 +2438,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		 * Put the ECP detected port in PS2 mode.
 		 * Do this also for ports that have ECR but don't do ECP.
 		 */
-		ECR_WRITE (p, 0x34);
+		ECR_WRITE(p, 0x34);
 
 	parport_pc_write_data(p, 0);
-	parport_pc_data_forward (p);
+	parport_pc_data_forward(p);
 
 	/* Now that we've told the sharing engine about the port, and
 	   found out its characteristics, let the high-level drivers
@@ -2382,7 +2449,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	spin_lock(&ports_lock);
 	list_add(&priv->list, &ports_list);
 	spin_unlock(&ports_lock);
-	parport_announce_port (p);
+	parport_announce_port(p);
 
 	return p;
 
@@ -2395,18 +2462,17 @@ out5:
 out4:
 	parport_put_port(p);
 out3:
-	kfree (priv);
+	kfree(priv);
 out2:
-	kfree (ops);
+	kfree(ops);
 out1:
 	if (pdev)
 		platform_device_unregister(pdev);
 	return NULL;
 }
+EXPORT_SYMBOL(parport_pc_probe_port);
 
-EXPORT_SYMBOL (parport_pc_probe_port);
-
-void parport_pc_unregister_port (struct parport *p)
+void parport_pc_unregister_port(struct parport *p)
 {
 	struct parport_pc_private *priv = p->private_data;
 	struct parport_operations *ops = p->ops;
@@ -2432,17 +2498,16 @@ void parport_pc_unregister_port (struct parport *p)
 				    priv->dma_buf,
 				    priv->dma_handle);
 #endif
-	kfree (p->private_data);
+	kfree(p->private_data);
 	parport_put_port(p);
-	kfree (ops); /* hope no-one cached it */
+	kfree(ops); /* hope no-one cached it */
 }
-
-EXPORT_SYMBOL (parport_pc_unregister_port);
+EXPORT_SYMBOL(parport_pc_unregister_port);
 
 #ifdef CONFIG_PCI
 
 /* ITE support maintained by Rich Liu <richliu@poorman.org> */
-static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_ite_8872_probe(struct pci_dev *pdev, int autoirq,
 					 int autodma,
 					 const struct parport_pc_via_data *via)
 {
@@ -2454,73 +2519,74 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 	int irq;
 	int i;
 
-	DPRINTK (KERN_DEBUG "sio_ite_8872_probe()\n");
-	
-	// make sure which one chip
-	for(i = 0; i < 5; i++) {
+	DPRINTK(KERN_DEBUG "sio_ite_8872_probe()\n");
+
+	/* make sure which one chip */
+	for (i = 0; i < 5; i++) {
 		base_res = request_region(inta_addr[i], 32, "it887x");
 		if (base_res) {
 			int test;
-			pci_write_config_dword (pdev, 0x60,
+			pci_write_config_dword(pdev, 0x60,
 						0xe5000000 | inta_addr[i]);
-			pci_write_config_dword (pdev, 0x78,
+			pci_write_config_dword(pdev, 0x78,
 						0x00000000 | inta_addr[i]);
-			test = inb (inta_addr[i]);
-			if (test != 0xff) break;
+			test = inb(inta_addr[i]);
+			if (test != 0xff)
+				break;
 			release_region(inta_addr[i], 0x8);
 		}
 	}
-	if(i >= 5) {
-		printk (KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
+	if (i >= 5) {
+		printk(KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
 		return 0;
 	}
 
-	type = inb (inta_addr[i] + 0x18);
+	type = inb(inta_addr[i] + 0x18);
 	type &= 0x0f;
 
 	switch (type) {
 	case 0x2:
-		printk (KERN_INFO "parport_pc: ITE8871 found (1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8871 found (1P)\n");
 		ite8872set = 0x64200000;
 		break;
 	case 0xa:
-		printk (KERN_INFO "parport_pc: ITE8875 found (1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8875 found (1P)\n");
 		ite8872set = 0x64200000;
 		break;
 	case 0xe:
-		printk (KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
 		ite8872set = 0x64e00000;
 		break;
 	case 0x6:
-		printk (KERN_INFO "parport_pc: ITE8873 found (1S)\n");
+		printk(KERN_INFO "parport_pc: ITE8873 found (1S)\n");
 		return 0;
 	case 0x8:
-		DPRINTK (KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
+		DPRINTK(KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
 		return 0;
 	default:
-		printk (KERN_INFO "parport_pc: unknown ITE887x\n");
-		printk (KERN_INFO "parport_pc: please mail 'lspci -nvv' "
+		printk(KERN_INFO "parport_pc: unknown ITE887x\n");
+		printk(KERN_INFO "parport_pc: please mail 'lspci -nvv' "
 			"output to Rich.Liu@ite.com.tw\n");
 		return 0;
 	}
 
-	pci_read_config_byte (pdev, 0x3c, &ite8872_irq);
-	pci_read_config_dword (pdev, 0x1c, &ite8872_lpt);
+	pci_read_config_byte(pdev, 0x3c, &ite8872_irq);
+	pci_read_config_dword(pdev, 0x1c, &ite8872_lpt);
 	ite8872_lpt &= 0x0000ff00;
-	pci_read_config_dword (pdev, 0x20, &ite8872_lpthi);
+	pci_read_config_dword(pdev, 0x20, &ite8872_lpthi);
 	ite8872_lpthi &= 0x0000ff00;
-	pci_write_config_dword (pdev, 0x6c, 0xe3000000 | ite8872_lpt);
-	pci_write_config_dword (pdev, 0x70, 0xe3000000 | ite8872_lpthi);
-	pci_write_config_dword (pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
-	// SET SPP&EPP , Parallel Port NO DMA , Enable All Function
-	// SET Parallel IRQ
-	pci_write_config_dword (pdev, 0x9c,
+	pci_write_config_dword(pdev, 0x6c, 0xe3000000 | ite8872_lpt);
+	pci_write_config_dword(pdev, 0x70, 0xe3000000 | ite8872_lpthi);
+	pci_write_config_dword(pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
+	/* SET SPP&EPP , Parallel Port NO DMA , Enable All Function */
+	/* SET Parallel IRQ */
+	pci_write_config_dword(pdev, 0x9c,
 				ite8872set | (ite8872_irq * 0x11111));
 
-	DPRINTK (KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
-	DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
+	DPRINTK(KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
+	DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
 		 ite8872_lpt);
-	DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
+	DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
 		 ite8872_lpthi);
 
 	/* Let the user (or defaults) steer us away from interrupts */
@@ -2532,14 +2598,14 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 	 * Release the resource so that parport_pc_probe_port can get it.
 	 */
 	release_resource(base_res);
-	if (parport_pc_probe_port (ite8872_lpt, ite8872_lpthi,
+	if (parport_pc_probe_port(ite8872_lpt, ite8872_lpthi,
 				   irq, PARPORT_DMA_NONE, &pdev->dev, 0)) {
-		printk (KERN_INFO
+		printk(KERN_INFO
 			"parport_pc: ITE 8872 parallel port: io=0x%X",
-			ite8872_lpt);
+								ite8872_lpt);
 		if (irq != PARPORT_IRQ_NONE)
-			printk (", irq=%d", irq);
-		printk ("\n");
+			printk(", irq=%d", irq);
+		printk("\n");
 		return 1;
 	}
 
@@ -2548,7 +2614,7 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 
 /* VIA 8231 support by Pavel Fedin <sonic_amiga@rambler.ru>
    based on VIA 686a support code by Jeff Garzik <jgarzik@pobox.com> */
-static int __devinitdata parport_init_mode = 0;
+static int __devinitdata parport_init_mode;
 
 /* Data for two known VIA chips */
 static struct parport_pc_via_data via_686a_data __devinitdata = {
@@ -2570,7 +2636,7 @@ static struct parport_pc_via_data via_8231_data __devinitdata = {
 	0xF6
 };
 
-static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_via_probe(struct pci_dev *pdev, int autoirq,
 				    int autodma,
 				    const struct parport_pc_via_data *via)
 {
@@ -2582,38 +2648,38 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 	printk(KERN_DEBUG "parport_pc: VIA 686A/8231 detected\n");
 
-	switch(parport_init_mode)
-	{
+	switch (parport_init_mode) {
 	case 1:
-	    printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_SPP;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_SPP;
+		break;
 	case 2:
-	    printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_SPP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_SPP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		break;
 	case 3:
-	    printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_EPP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    have_epp = 1;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_EPP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		have_epp = 1;
+		break;
 	case 4:
-	    printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_ECP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_ECP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		break;
 	case 5:
-	    printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_ECP;
-	    ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
-	    have_epp = 1;
-	    break;
-	 default:
-	    printk(KERN_DEBUG "parport_pc: probing current configuration\n");
-	    siofunc = VIA_FUNCTION_PROBE;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_ECP;
+		ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
+		have_epp = 1;
+		break;
+	default:
+		printk(KERN_DEBUG
+			"parport_pc: probing current configuration\n");
+		siofunc = VIA_FUNCTION_PROBE;
+		break;
 	}
 	/*
 	 * unlock super i/o configuration
@@ -2624,38 +2690,36 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 	/* Bits 1-0: Parallel Port Mode / Enable */
 	outb(via->viacfg_function, VIA_CONFIG_INDEX);
-	tmp = inb (VIA_CONFIG_DATA);
+	tmp = inb(VIA_CONFIG_DATA);
 	/* Bit 5: EPP+ECP enable; bit 7: PS/2 bidirectional port enable */
 	outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-	tmp2 = inb (VIA_CONFIG_DATA);
-	if (siofunc == VIA_FUNCTION_PROBE)
-	{
-	    siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
-	    ppcontrol = tmp2;
+	tmp2 = inb(VIA_CONFIG_DATA);
+	if (siofunc == VIA_FUNCTION_PROBE) {
+		siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
+		ppcontrol = tmp2;
+	} else {
+		tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
+		tmp |= siofunc;
+		outb(via->viacfg_function, VIA_CONFIG_INDEX);
+		outb(tmp, VIA_CONFIG_DATA);
+		tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
+		tmp2 |= ppcontrol;
+		outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
+		outb(tmp2, VIA_CONFIG_DATA);
 	}
-	else
-	{
-	    tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
-	    tmp |= siofunc;
-	    outb(via->viacfg_function, VIA_CONFIG_INDEX);
-	    outb(tmp, VIA_CONFIG_DATA);
-	    tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
-	    tmp2 |= ppcontrol;
-	    outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-	    outb(tmp2, VIA_CONFIG_DATA);
-	}
-	
+
 	/* Parallel Port I/O Base Address, bits 9-2 */
 	outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
 	port1 = inb(VIA_CONFIG_DATA) << 2;
-	
-	printk (KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",port1);
-	if ((port1 == 0x3BC) && have_epp)
-	{
-	    outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
-	    outb((0x378 >> 2), VIA_CONFIG_DATA);
-	    printk(KERN_DEBUG "parport_pc: Parallel port base changed to 0x378\n");
-	    port1 = 0x378;
+
+	printk(KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",
+									port1);
+	if (port1 == 0x3BC && have_epp) {
+		outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
+		outb((0x378 >> 2), VIA_CONFIG_DATA);
+		printk(KERN_DEBUG
+			"parport_pc: Parallel port base changed to 0x378\n");
+		port1 = 0x378;
 	}
 
 	/*
@@ -2669,36 +2733,39 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 		printk(KERN_INFO "parport_pc: VIA parallel port disabled in BIOS\n");
 		return 0;
 	}
-	
+
 	/* Bits 7-4: PnP Routing for Parallel Port IRQ */
 	pci_read_config_byte(pdev, via->via_pci_parport_irq_reg, &tmp);
 	irq = ((tmp & VIA_IRQCONTROL_PARALLEL) >> 4);
 
-	if (siofunc == VIA_FUNCTION_PARPORT_ECP)
-	{
-	    /* Bits 3-2: PnP Routing for Parallel Port DMA */
-	    pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
-	    dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
-	}
-	else
-	    /* if ECP not enabled, DMA is not enabled, assumed bogus 'dma' value */
-	    dma = PARPORT_DMA_NONE;
+	if (siofunc == VIA_FUNCTION_PARPORT_ECP) {
+		/* Bits 3-2: PnP Routing for Parallel Port DMA */
+		pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
+		dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
+	} else
+		/* if ECP not enabled, DMA is not enabled, assumed
+		   bogus 'dma' value */
+		dma = PARPORT_DMA_NONE;
 
 	/* Let the user (or defaults) steer us away from interrupts and DMA */
 	if (autoirq == PARPORT_IRQ_NONE) {
-	    irq = PARPORT_IRQ_NONE;
-	    dma = PARPORT_DMA_NONE;
+		irq = PARPORT_IRQ_NONE;
+		dma = PARPORT_DMA_NONE;
 	}
 	if (autodma == PARPORT_DMA_NONE)
-	    dma = PARPORT_DMA_NONE;
+		dma = PARPORT_DMA_NONE;
 
 	switch (port1) {
-	case 0x3bc: port2 = 0x7bc; break;
-	case 0x378: port2 = 0x778; break;
-	case 0x278: port2 = 0x678; break;
+	case 0x3bc:
+		port2 = 0x7bc; break;
+	case 0x378:
+		port2 = 0x778; break;
+	case 0x278:
+		port2 = 0x678; break;
 	default:
-		printk(KERN_INFO "parport_pc: Weird VIA parport base 0x%X, ignoring\n",
-			port1);
+		printk(KERN_INFO
+			"parport_pc: Weird VIA parport base 0x%X, ignoring\n",
+									port1);
 		return 0;
 	}
 
@@ -2716,17 +2783,17 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 	}
 
 	/* finally, do the probe with values obtained */
-	if (parport_pc_probe_port (port1, port2, irq, dma, &pdev->dev, 0)) {
-		printk (KERN_INFO
+	if (parport_pc_probe_port(port1, port2, irq, dma, &pdev->dev, 0)) {
+		printk(KERN_INFO
 			"parport_pc: VIA parallel port: io=0x%X", port1);
 		if (irq != PARPORT_IRQ_NONE)
-			printk (", irq=%d", irq);
+			printk(", irq=%d", irq);
 		if (dma != PARPORT_DMA_NONE)
-			printk (", dma=%d", dma);
-		printk ("\n");
+			printk(", dma=%d", dma);
+		printk("\n");
 		return 1;
 	}
-	
+
 	printk(KERN_WARNING "parport_pc: Strange, can't probe VIA parallel port: io=0x%X, irq=%d, dma=%d\n",
 		port1, irq, dma);
 	return 0;
@@ -2734,8 +2801,8 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 
 enum parport_pc_sio_types {
-	sio_via_686a = 0,	/* Via VT82C686A motherboard Super I/O */
-	sio_via_8231,		/* Via VT8231 south bridge integrated Super IO */
+	sio_via_686a = 0,   /* Via VT82C686A motherboard Super I/O */
+	sio_via_8231,	    /* Via VT8231 south bridge integrated Super IO */
 	sio_ite_8872,
 	last_sio
 };
@@ -2806,15 +2873,15 @@ enum parport_pc_pci_cards {
 };
 
 
-/* each element directly indexed from enum list, above 
+/* each element directly indexed from enum list, above
  * (but offset by last_sio) */
 static struct parport_pc_pci {
 	int numports;
 	struct { /* BAR (base address registers) numbers in the config
-                    space header */
+		    space header */
 		int lo;
-		int hi; /* -1 if not there, >6 for offset-method (max
-                           BAR is 6) */
+		int hi;
+		/* -1 if not there, >6 for offset-method (max BAR is 6) */
 	} addr[4];
 
 	/* If set, this is called immediately after pci_enable_device.
@@ -2859,7 +2926,7 @@ static struct parport_pc_pci {
 	/* timedia_4018  */             { 2, { { 0, 1 }, { 2, 3 }, } },
 	/* timedia_9018a */             { 2, { { 0, 1 }, { 2, 3 }, } },
 					/* SYBA uses fixed offsets in
-                                           a 1K io window */
+					   a 1K io window */
 	/* syba_2p_epp AP138B */	{ 2, { { 0, 0x078 }, { 0, 0x178 }, } },
 	/* syba_1p_ecp W83787 */	{ 1, { { 0, 0x078 }, } },
 	/* titan_010l */		{ 1, { { 3, -1 }, } },
@@ -2875,11 +2942,14 @@ static struct parport_pc_pci {
 	/* oxsemi_pcie_pport */		{ 1, { { 0, 1 }, } },
 	/* aks_0100 */                  { 1, { { 0, -1 }, } },
 	/* mobility_pp */		{ 1, { { 0, 1 }, } },
-	/* netmos_9705 */               { 1, { { 0, -1 }, } }, /* untested */
-        /* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-        /* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-	/* netmos_9805 */               { 1, { { 0, -1 }, } }, /* untested */
-	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */
+
+	/* The netmos entries below are untested */
+	/* netmos_9705 */               { 1, { { 0, -1 }, } },
+	/* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} },
+	/* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} },
+	/* netmos_9805 */               { 1, { { 0, -1 }, } },
+	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } },
+
 	/* quatech_sppxp100 */		{ 1, { { 0, 1 }, } },
 };
 
@@ -2908,7 +2978,7 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	{ PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_BOCA_IOPPAR,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, boca_ioppar },
 	{ PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
-	  PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0,0, plx_9050 },
+	  PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0, 0, plx_9050 },
 	/* PCI_VENDOR_ID_TIMEDIA/SUNIX has many differing cards ...*/
 	{ 0x1409, 0x7168, 0x1409, 0x4078, 0, 0, timedia_4078a },
 	{ 0x1409, 0x7168, 0x1409, 0x4079, 0, 0, timedia_4079h },
@@ -2942,7 +3012,8 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	{ 0x9710, 0x9805, 0x1000, 0x0010, 0, 0, titan_1284p1 },
 	{ 0x9710, 0x9815, 0x1000, 0x0020, 0, 0, titan_1284p2 },
 	/* PCI_VENDOR_ID_AVLAB/Intek21 has another bunch of cards ...*/
-	{ 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p}, /* AFAVLAB_TK9902 */
+	/* AFAVLAB_TK9902 */
+	{ 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p},
 	{ 0x14db, 0x2121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_2p},
 	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI952PP,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_952 },
@@ -2985,14 +3056,14 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 },
 	{ 0, } /* terminate list */
 };
-MODULE_DEVICE_TABLE(pci,parport_pc_pci_tbl);
+MODULE_DEVICE_TABLE(pci, parport_pc_pci_tbl);
 
 struct pci_parport_data {
 	int num;
 	struct parport *ports[2];
 };
 
-static int parport_pc_pci_probe (struct pci_dev *dev,
+static int parport_pc_pci_probe(struct pci_dev *dev,
 					   const struct pci_device_id *id)
 {
 	int err, count, n, i = id->driver_data;
@@ -3005,7 +3076,8 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 	/* This is a PCI card */
 	i -= last_sio;
 	count = 0;
-	if ((err = pci_enable_device (dev)) != 0)
+	err = pci_enable_device(dev);
+	if (err)
 		return err;
 
 	data = kmalloc(sizeof(struct pci_parport_data), GFP_KERNEL);
@@ -3013,7 +3085,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 		return -ENOMEM;
 
 	if (cards[i].preinit_hook &&
-	    cards[i].preinit_hook (dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
+	    cards[i].preinit_hook(dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
 		kfree(data);
 		return -ENODEV;
 	}
@@ -3023,25 +3095,25 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 		int hi = cards[i].addr[n].hi;
 		int irq;
 		unsigned long io_lo, io_hi;
-		io_lo = pci_resource_start (dev, lo);
+		io_lo = pci_resource_start(dev, lo);
 		io_hi = 0;
 		if ((hi >= 0) && (hi <= 6))
-			io_hi = pci_resource_start (dev, hi);
+			io_hi = pci_resource_start(dev, hi);
 		else if (hi > 6)
 			io_lo += hi; /* Reinterpret the meaning of
-                                        "hi" as an offset (see SYBA
-                                        def.) */
+					"hi" as an offset (see SYBA
+					def.) */
 		/* TODO: test if sharing interrupts works */
 		irq = dev->irq;
 		if (irq == IRQ_NONE) {
-			printk (KERN_DEBUG
+			printk(KERN_DEBUG
 	"PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx)\n",
 				parport_pc_pci_tbl[i + last_sio].vendor,
 				parport_pc_pci_tbl[i + last_sio].device,
 				io_lo, io_hi);
 			irq = PARPORT_IRQ_NONE;
 		} else {
-			printk (KERN_DEBUG
+			printk(KERN_DEBUG
 	"PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx), IRQ %d\n",
 				parport_pc_pci_tbl[i + last_sio].vendor,
 				parport_pc_pci_tbl[i + last_sio].device,
@@ -3058,7 +3130,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 	data->num = count;
 
 	if (cards[i].postinit_hook)
-		cards[i].postinit_hook (dev, count == 0);
+		cards[i].postinit_hook(dev, count == 0);
 
 	if (count) {
 		pci_set_drvdata(dev, data);
@@ -3092,7 +3164,7 @@ static struct pci_driver parport_pc_pci_driver = {
 	.remove		= __devexit_p(parport_pc_pci_remove),
 };
 
-static int __init parport_pc_init_superio (int autoirq, int autodma)
+static int __init parport_pc_init_superio(int autoirq, int autodma)
 {
 	const struct pci_device_id *id;
 	struct pci_dev *pdev = NULL;
@@ -3103,8 +3175,9 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 		if (id == NULL || id->driver_data >= last_sio)
 			continue;
 
-		if (parport_pc_superio_info[id->driver_data].probe
-			(pdev, autoirq, autodma,parport_pc_superio_info[id->driver_data].via)) {
+		if (parport_pc_superio_info[id->driver_data].probe(
+			pdev, autoirq, autodma,
+			parport_pc_superio_info[id->driver_data].via)) {
 			ret++;
 		}
 	}
@@ -3113,7 +3186,10 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 }
 #else
 static struct pci_driver parport_pc_pci_driver;
-static int __init parport_pc_init_superio(int autoirq, int autodma) {return 0;}
+static int __init parport_pc_init_superio(int autoirq, int autodma)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI */
 
 #ifdef CONFIG_PNP
@@ -3126,44 +3202,45 @@ static const struct pnp_device_id parport_pc_pnp_tbl[] = {
 	{ }
 };
 
-MODULE_DEVICE_TABLE(pnp,parport_pc_pnp_tbl);
+MODULE_DEVICE_TABLE(pnp, parport_pc_pnp_tbl);
 
-static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id)
+static int parport_pc_pnp_probe(struct pnp_dev *dev,
+						const struct pnp_device_id *id)
 {
 	struct parport *pdata;
 	unsigned long io_lo, io_hi;
 	int dma, irq;
 
-	if (pnp_port_valid(dev,0) &&
-		!(pnp_port_flags(dev,0) & IORESOURCE_DISABLED)) {
-		io_lo = pnp_port_start(dev,0);
+	if (pnp_port_valid(dev, 0) &&
+		!(pnp_port_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		io_lo = pnp_port_start(dev, 0);
 	} else
 		return -EINVAL;
 
-	if (pnp_port_valid(dev,1) &&
-		!(pnp_port_flags(dev,1) & IORESOURCE_DISABLED)) {
-		io_hi = pnp_port_start(dev,1);
+	if (pnp_port_valid(dev, 1) &&
+		!(pnp_port_flags(dev, 1) & IORESOURCE_DISABLED)) {
+		io_hi = pnp_port_start(dev, 1);
 	} else
 		io_hi = 0;
 
-	if (pnp_irq_valid(dev,0) &&
-		!(pnp_irq_flags(dev,0) & IORESOURCE_DISABLED)) {
-		irq = pnp_irq(dev,0);
+	if (pnp_irq_valid(dev, 0) &&
+		!(pnp_irq_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		irq = pnp_irq(dev, 0);
 	} else
 		irq = PARPORT_IRQ_NONE;
 
-	if (pnp_dma_valid(dev,0) &&
-		!(pnp_dma_flags(dev,0) & IORESOURCE_DISABLED)) {
-		dma = pnp_dma(dev,0);
+	if (pnp_dma_valid(dev, 0) &&
+		!(pnp_dma_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		dma = pnp_dma(dev, 0);
 	} else
 		dma = PARPORT_DMA_NONE;
 
 	dev_info(&dev->dev, "reported by %s\n", dev->protocol->name);
-	if (!(pdata = parport_pc_probe_port(io_lo, io_hi,
-					irq, dma, &dev->dev, 0)))
+	pdata = parport_pc_probe_port(io_lo, io_hi, irq, dma, &dev->dev, 0);
+	if (pdata == NULL)
 		return -ENODEV;
 
-	pnp_set_drvdata(dev,pdata);
+	pnp_set_drvdata(dev, pdata);
 	return 0;
 }
 
@@ -3205,7 +3282,7 @@ static struct platform_driver parport_pc_platform_driver = {
 
 /* This is called by parport_pc_find_nonpci_ports (in asm/parport.h) */
 static int __devinit __attribute__((unused))
-parport_pc_find_isa_ports (int autoirq, int autodma)
+parport_pc_find_isa_ports(int autoirq, int autodma)
 {
 	int count = 0;
 
@@ -3229,7 +3306,7 @@ parport_pc_find_isa_ports (int autoirq, int autodma)
  * autoirq is PARPORT_IRQ_NONE, PARPORT_IRQ_AUTO, or PARPORT_IRQ_PROBEONLY
  * autodma is PARPORT_DMA_NONE or PARPORT_DMA_AUTO
  */
-static void __init parport_pc_find_ports (int autoirq, int autodma)
+static void __init parport_pc_find_ports(int autoirq, int autodma)
 {
 	int count = 0, err;
 
@@ -3263,11 +3340,18 @@ static void __init parport_pc_find_ports (int autoirq, int autodma)
  *	syntax and keep in mind that code below is a cleaned up version.
  */
 
-static int __initdata io[PARPORT_PC_MAX_PORTS+1] = { [0 ... PARPORT_PC_MAX_PORTS] = 0 };
-static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] =
-	{ [0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO };
-static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE };
-static int __initdata irqval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY };
+static int __initdata io[PARPORT_PC_MAX_PORTS+1] = {
+	[0 ... PARPORT_PC_MAX_PORTS] = 0
+};
+static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] = {
+	[0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO
+};
+static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = {
+	[0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE
+};
+static int __initdata irqval[PARPORT_PC_MAX_PORTS] = {
+	[0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY
+};
 
 static int __init parport_parse_param(const char *s, int *val,
 				int automatic, int none, int nofifo)
@@ -3308,18 +3392,19 @@ static int __init parport_parse_dma(const char *dmastr, int *val)
 #ifdef CONFIG_PCI
 static int __init parport_init_mode_setup(char *str)
 {
-	printk(KERN_DEBUG "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
-
-	if (!strcmp (str, "spp"))
-		parport_init_mode=1;
-	if (!strcmp (str, "ps2"))
-		parport_init_mode=2;
-	if (!strcmp (str, "epp"))
-		parport_init_mode=3;
-	if (!strcmp (str, "ecp"))
-		parport_init_mode=4;
-	if (!strcmp (str, "ecpepp"))
-		parport_init_mode=5;
+	printk(KERN_DEBUG
+	     "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
+
+	if (!strcmp(str, "spp"))
+		parport_init_mode = 1;
+	if (!strcmp(str, "ps2"))
+		parport_init_mode = 2;
+	if (!strcmp(str, "epp"))
+		parport_init_mode = 3;
+	if (!strcmp(str, "ecp"))
+		parport_init_mode = 4;
+	if (!strcmp(str, "ecpepp"))
+		parport_init_mode = 5;
 	return 1;
 }
 #endif
@@ -3343,7 +3428,8 @@ module_param(verbose_probing, int, 0644);
 #endif
 #ifdef CONFIG_PCI
 static char *init_mode;
-MODULE_PARM_DESC(init_mode, "Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
+MODULE_PARM_DESC(init_mode,
+	"Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
 module_param(init_mode, charp, 0);
 #endif
 
@@ -3374,7 +3460,7 @@ static int __init parse_parport_params(void)
 				irqval[0] = val;
 				break;
 			default:
-				printk (KERN_WARNING
+				printk(KERN_WARNING
 					"parport_pc: irq specified "
 					"without base address.  Use 'io=' "
 					"to specify one\n");
@@ -3387,7 +3473,7 @@ static int __init parse_parport_params(void)
 				dmaval[0] = val;
 				break;
 			default:
-				printk (KERN_WARNING
+				printk(KERN_WARNING
 					"parport_pc: dma specified "
 					"without base address.  Use 'io=' "
 					"to specify one\n");
@@ -3398,7 +3484,7 @@ static int __init parse_parport_params(void)
 
 #else
 
-static int parport_setup_ptr __initdata = 0;
+static int parport_setup_ptr __initdata;
 
 /*
  * Acceptable parameters:
@@ -3409,7 +3495,7 @@ static int parport_setup_ptr __initdata = 0;
  *
  * IRQ/DMA may be numeric or 'auto' or 'none'
  */
-static int __init parport_setup (char *str)
+static int __init parport_setup(char *str)
 {
 	char *endptr;
 	char *sep;
@@ -3421,15 +3507,15 @@ static int __init parport_setup (char *str)
 		return 1;
 	}
 
-	if (!strncmp (str, "auto", 4)) {
+	if (!strncmp(str, "auto", 4)) {
 		irqval[0] = PARPORT_IRQ_AUTO;
 		dmaval[0] = PARPORT_DMA_AUTO;
 		return 1;
 	}
 
-	val = simple_strtoul (str, &endptr, 0);
+	val = simple_strtoul(str, &endptr, 0);
 	if (endptr == str) {
-		printk (KERN_WARNING "parport=%s not understood\n", str);
+		printk(KERN_WARNING "parport=%s not understood\n", str);
 		return 1;
 	}
 
@@ -3463,7 +3549,7 @@ static int __init parse_parport_params(void)
 	return io[0] == PARPORT_DISABLE;
 }
 
-__setup ("parport=", parport_setup);
+__setup("parport=", parport_setup);
 
 /*
  * Acceptable parameters:
@@ -3471,7 +3557,7 @@ __setup ("parport=", parport_setup);
  * parport_init_mode=[spp|ps2|epp|ecp|ecpepp]
  */
 #ifdef CONFIG_PCI
-__setup("parport_init_mode=",parport_init_mode_setup);
+__setup("parport_init_mode=", parport_init_mode_setup);
 #endif
 #endif
 
@@ -3495,13 +3581,13 @@ static int __init parport_pc_init(void)
 		for (i = 0; i < PARPORT_PC_MAX_PORTS; i++) {
 			if (!io[i])
 				break;
-			if ((io_hi[i]) == PARPORT_IOHI_AUTO)
-			       io_hi[i] = 0x400 + io[i];
+			if (io_hi[i] == PARPORT_IOHI_AUTO)
+				io_hi[i] = 0x400 + io[i];
 			parport_pc_probe_port(io[i], io_hi[i],
-					  irqval[i], dmaval[i], NULL, 0);
+					irqval[i], dmaval[i], NULL, 0);
 		}
 	} else
-		parport_pc_find_ports (irqval[0], dmaval[0]);
+		parport_pc_find_ports(irqval[0], dmaval[0]);
 
 	return 0;
 }
@@ -3509,9 +3595,9 @@ static int __init parport_pc_init(void)
 static void __exit parport_pc_exit(void)
 {
 	if (pci_registered_parport)
-		pci_unregister_driver (&parport_pc_pci_driver);
+		pci_unregister_driver(&parport_pc_pci_driver);
 	if (pnp_registered_parport)
-		pnp_unregister_driver (&parport_pc_pnp_driver);
+		pnp_unregister_driver(&parport_pc_pnp_driver);
 	platform_driver_unregister(&parport_pc_platform_driver);
 
 	while (!list_empty(&ports_list)) {
-- 
cgit v0.10.2


From 181bf1e815a2ad2a9a8b30ef6e92583d1530b255 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:08:10 +0100
Subject: parport_pc: clean up the modified while loops using for

And tidy up a few bits coding style detectors missed

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index edf83e9..151bf5b 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -626,7 +626,7 @@ static size_t parport_pc_fifo_write_block_dma(struct parport *port,
 	unsigned long start = (unsigned long) buf;
 	unsigned long end = (unsigned long) buf + length - 1;
 
-dump_parport_state("enter fifo_write_block_dma", port);
+	dump_parport_state("enter fifo_write_block_dma", port);
 	if (end < MAX_DMA_ADDRESS) {
 		/* If it would cross a 64k boundary, cap it at the end. */
 		if ((start ^ end) & ~0xffffUL)
@@ -737,7 +737,7 @@ false_alarm:
 	if (dma_handle)
 		dma_unmap_single(dev, dma_handle, length, DMA_TO_DEVICE);
 
-dump_parport_state("leave fifo_write_block_dma", port);
+	dump_parport_state("leave fifo_write_block_dma", port);
 	return length - left;
 }
 #endif
@@ -955,8 +955,8 @@ static size_t parport_pc_ecp_read_block_pio(struct parport *port,
 	char *bufp = buf;
 
 	port = port->physport;
-DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
-dump_parport_state("enter fcn", port);
+	DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
+	dump_parport_state("enter fcn", port);
 
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
@@ -1112,14 +1112,15 @@ false_alarm:
 
 		if (ecrval & 0x02) {
 			/* FIFO is full. */
-dump_parport_state("FIFO full", port);
+			dump_parport_state("FIFO full", port);
 			insb(fifo, bufp, fifo_depth);
 			bufp += fifo_depth;
 			left -= fifo_depth;
 			continue;
 		}
 
-DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
+		DPRINTK(KERN_DEBUG
+		  "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 
 		/* FIFO not filled.  We will cycle this loop for a while
 		 * and either the peripheral will fill it faster,
@@ -1135,7 +1136,7 @@ DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 	}
 
 	port->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-dump_parport_state("rev idle2", port);
+	dump_parport_state("rev idle2", port);
 
 out_no_data:
 
@@ -1163,7 +1164,7 @@ out_no_data:
 				port->name, lost);
 	}
 
-dump_parport_state("fwd idle", port);
+	dump_parport_state("fwd idle", port);
 	return length - left;
 }
 #endif  /*  0  */
@@ -1216,10 +1217,23 @@ static const struct parport_operations parport_pc_ops = {
 };
 
 #ifdef CONFIG_PARPORT_PC_SUPERIO
+
+static struct superio_struct *find_free_superio(void)
+{
+	int i;
+	for (i = 0; i < NR_SUPERIOS; i++)
+		if (superios[i].io == 0)
+			return &superios[i];
+	return NULL;
+}
+
+
 /* Super-IO chipset detection, Winbond, SMSC */
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
-	int cr1, cr4, cra, cr23, cr26, cr27, i = 0;
+	int cr1, cr4, cra, cr23, cr26, cr27;
+	struct superio_struct *s;
+
 	static const char *const modes[] = {
 		"SPP and Bidirectional (PS/2)",
 		"EPP and SPP",
@@ -1272,30 +1286,29 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 	   are related, however DMA can be 1 or 3, assume DMA_A=DMA1,
 	   DMA_C=DMA3 (this is true e.g. for TYAN 1564D Tomcat IV) */
 	if (cr23 * 4 >= 0x100) { /* if active */
-		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
-			i++;
-		if (i == NR_SUPERIOS) {
+		s = find_free_superio();
+		if (s == NULL)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		} else {
+		else {
 			int d;
 			switch (cr23 * 4) {
 			case 0x3bc:
-				superios[i].io = 0x3bc;
-				superios[i].irq = 7;
+				s->io = 0x3bc;
+				s->irq = 7;
 				break;
 			case 0x378:
-				superios[i].io = 0x378;
-				superios[i].irq = 7;
+				s->io = 0x378;
+				s->irq = 7;
 				break;
 			case 0x278:
-				superios[i].io = 0x278;
-				superios[i].irq = 5;
+				s->io = 0x278;
+				s->irq = 5;
 			}
 			d = (cr26 & 0x0f);
 			if (d == 1 || d == 3)
-				superios[i].dma = d;
+				s->dma = d;
 			else
-				superios[i].dma = PARPORT_DMA_NONE;
+				s->dma = PARPORT_DMA_NONE;
 		}
 	}
 }
@@ -1303,7 +1316,8 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 
 static void __devinit show_parconfig_winbond(int io, int key)
 {
-	int cr30, cr60, cr61, cr70, cr74, crf0, i = 0;
+	int cr30, cr60, cr61, cr70, cr74, crf0;
+	struct superio_struct *s;
 	static const char *const modes[] = {
 		"Standard (SPP) and Bidirectional(PS/2)", /* 0 */
 		"EPP-1.9 and SPP",
@@ -1356,14 +1370,13 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	}
 
 	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
-		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
-			i++;
-		if (i == NR_SUPERIOS) {
+		s = find_free_superio();
+		if (s == NULL)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		} else {
-			superios[i].io = (cr60<<8)|cr61;
-			superios[i].irq = cr70&0x0f;
-			superios[i].dma = (((cr74 & 0x07) > 3) ?
+		else {
+			s->io = (cr60 << 8) | cr61;
+			s->irq = cr70 & 0x0f;
+			s->dma = (((cr74 & 0x07) > 3) ?
 					   PARPORT_DMA_NONE : (cr74 & 0x07));
 		}
 	}
@@ -1618,25 +1631,28 @@ static void __devinit detect_and_report_it87(void)
 }
 #endif /* CONFIG_PARPORT_PC_SUPERIO */
 
-static int get_superio_dma(struct parport *p)
+static struct superio_struct *find_superio(struct parport *p)
 {
-	int i = 0;
+	int i;
+	for (i = 0; i < NR_SUPERIOS; i++)
+		if (superios[i].io != p->base)
+			return &superios[i];
+	return NULL;
+}
 
-	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-		i++;
-	if (i != NR_SUPERIOS)
-		return superios[i].dma;
+static int get_superio_dma(struct parport *p)
+{
+	struct superio_struct *s = find_superio(p);
+	if (s)
+		return s->dma;
 	return PARPORT_DMA_NONE;
 }
 
 static int get_superio_irq(struct parport *p)
 {
-	int i = 0;
-
-	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-		i++;
-	if (i != NR_SUPERIOS)
-		return superios[i].irq;
+	struct superio_struct *s = find_superio(p);
+	if (s)
+		return s->irq;
 	return PARPORT_IRQ_NONE;
 }
 
-- 
cgit v0.10.2


From 257a6e8cc7f9274f0af090494a3f1ee06548b5bd Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@linux.vnet.ibm.com>
Date: Thu, 11 Jun 2009 13:20:09 +0100
Subject: icom: fix compile errors when defining ICOM_TRACE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As it is, defining ICOM_TRACE produces some compile errors, as
"parameter name omitted" and "redefinition of ‘trace’"

This patch removes the wrong trace definition.

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index fd5fd23..9f2891c 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -137,7 +137,12 @@ static LIST_HEAD(icom_adapter_head);
 static spinlock_t icom_lock;
 
 #ifdef ICOM_TRACE
-static inline void trace(struct icom_port *, char *, unsigned long) {};
+static inline void trace(struct icom_port *icom_port, char *trace_pt,
+			unsigned long trace_data)
+{
+	dev_info(&icom_port->adapter->pci_dev->dev, ":%d:%s - %lx\n",
+	icom_port->port, trace_pt, trace_data);
+}
 #else
 static inline void trace(struct icom_port *icom_port, char *trace_pt, unsigned long trace_data) {};
 #endif
@@ -1647,15 +1652,6 @@ static void __exit icom_exit(void)
 module_init(icom_init);
 module_exit(icom_exit);
 
-#ifdef ICOM_TRACE
-static inline void trace(struct icom_port *icom_port, char *trace_pt,
-		  unsigned long trace_data)
-{
-	dev_info(&icom_port->adapter->pci_dev->dev, ":%d:%s - %lx\n",
-		 icom_port->port, trace_pt, trace_data);
-}
-#endif
-
 MODULE_AUTHOR("Michael Anderson <mjanders@us.ibm.com>");
 MODULE_DESCRIPTION("IBM iSeries Serial IOA driver");
 MODULE_SUPPORTED_DEVICE
-- 
cgit v0.10.2


From 08e0992f60ad44025a8a8b8a821838ca4a562686 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Thu, 11 Jun 2009 13:21:24 +0100
Subject: serial: add support for the TI AR7 internal UART

This patch adds support for the TI AR7 internal UART.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index a0127e9..fb867a9 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -287,6 +287,13 @@ static const struct serial8250_config uart_config[] = {
 		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
 		.flags		= UART_CAP_FIFO,
 	},
+	[PORT_AR7] = {
+		.name		= "AR7",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
 };
 
 #if defined (CONFIG_SERIAL_8250_AU1X00)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 57a97e5..48766ea 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -41,7 +41,8 @@
 #define PORT_XSCALE	15
 #define PORT_RM9000	16	/* PMC-Sierra RM9xxx internal UART */
 #define PORT_OCTEON	17	/* Cavium OCTEON internal UART */
-#define PORT_MAX_8250	17	/* max port ID */
+#define PORT_AR7	18	/* Texas Instruments AR7 internal UART */
+#define PORT_MAX_8250	18	/* max port ID */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
-- 
cgit v0.10.2


From b5c6794fe4256fd63664aa185c468647c28bfd4a Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 13:23:02 +0100
Subject: Blackfin SPORT UART: fix typo in sport_set_termios prototype

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index 529c0ff..fd5cb95 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -419,7 +419,7 @@ static void sport_shutdown(struct uart_port *port)
 }
 
 static void sport_set_termios(struct uart_port *port,
-		struct termios *termios, struct termios *old)
+		struct ktermios *termios, struct ktermios *old)
 {
 	pr_debug("%s enter, c_cflag:%08x\n", __func__, termios->c_cflag);
 	uart_update_timeout(port, CS8 ,port->uartclk);
-- 
cgit v0.10.2


From a19e8b205915b2925aca75b2d2bf0c3104c8be14 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Thu, 11 Jun 2009 13:23:42 +0100
Subject: Blackfin SPORT UART: fix data misses while using transmit frame sync

SPORT transmit frame sync (TFS) isn't used as an electrical signal during
normal SPORT UART emulation.  However, it is useful in EIA RS-485
emulation as RS-485 Transceiver Driver Enable DE strobe.

This patch configures:
TFS to be active high in order to drive an DE strobe of
an eventually connected RS-485 Transceiver.

Late frame sync mode (LATFS) gating the entire TX shift cycle.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index fd5cb95..6687ccd 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -149,7 +149,7 @@ static int sport_uart_setup(struct sport_uart_port *up, int sclk, int baud_rate)
 	int tclkdiv, tfsdiv, rclkdiv;
 
 	/* Set TCR1 and TCR2 */
-	SPORT_PUT_TCR1(up, (LTFS | ITFS | TFSR | TLSBIT | ITCLK));
+	SPORT_PUT_TCR1(up, (LATFS | ITFS | TFSR | TLSBIT | ITCLK));
 	SPORT_PUT_TCR2(up, 10);
 	pr_debug("%s TCR1:%x, TCR2:%x\n", __func__, SPORT_GET_TCR1(up), SPORT_GET_TCR2(up));
 
-- 
cgit v0.10.2


From 4328e3e5ef1ae3427a4f6863aa65916a68ec2dd9 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 13:37:11 +0100
Subject: Blackfin SPORT UART: rewrite inline assembly

Hopefuly the new version is easier to read, but in the process it declares
proper clobber lists and better constraints so that GCC can do a better
job at allocating free registers.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index 6687ccd..34b4ae0 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -101,15 +101,16 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 {
 	pr_debug("%s value:%x\n", __func__, value);
 	/* Place a Start and Stop bit */
-	__asm__ volatile (
-		"R2 = b#01111111100;\n\t"
-		"R3 = b#10000000001;\n\t"
-		"%0 <<= 2;\n\t"
-		"%0 = %0 & R2;\n\t"
-		"%0 = %0 | R3;\n\t"
-		:"=r"(value)
-		:"0"(value)
-		:"R2", "R3");
+	__asm__ __volatile__ (
+		"R2 = b#01111111100;"
+		"R3 = b#10000000001;"
+		"%0 <<= 2;"
+		"%0 = %0 & R2;"
+		"%0 = %0 | R3;"
+		: "=d"(value)
+		: "d"(value)
+		: "ASTAT", "R2", "R3"
+	);
 	pr_debug("%s value:%x\n", __func__, value);
 
 	SPORT_PUT_TX(up, value);
@@ -118,27 +119,30 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 static inline unsigned int rx_one_byte(struct sport_uart_port *up)
 {
 	unsigned int value, extract;
+	u32 tmp_mask1, tmp_mask2, tmp_shift, tmp;
 
 	value = SPORT_GET_RX32(up);
 	pr_debug("%s value:%x\n", __func__, value);
 
 	/* Extract 8 bits data */
-	__asm__ volatile (
-		"R5 = 0;\n\t"
-		"P0 = 8;\n\t"
-		"R1 = 0x1801(Z);\n\t"
-		"R3 = 0x0300(Z);\n\t"
-		"R4 = 0;\n\t"
-		"LSETUP(loop_s, loop_e) LC0 = P0;\nloop_s:\t"
-		"R2 = extract(%1, R1.L)(Z);\n\t"
-		"R2 <<= R4;\n\t"
-		"R5 = R5 | R2;\n\t"
-		"R1 = R1 - R3;\nloop_e:\t"
-		"R4 += 1;\n\t"
-		"%0 = R5;\n\t"
-		:"=r"(extract)
-		:"r"(value)
-		:"P0", "R1", "R2","R3","R4", "R5");
+	__asm__ __volatile__ (
+		"%[extr] = 0;"
+		"%[mask1] = 0x1801(Z);"
+		"%[mask2] = 0x0300(Z);"
+		"%[shift] = 0;"
+		"LSETUP(.Lloop_s, .Lloop_e) LC0 = %[lc];"
+		".Lloop_s:"
+		"%[tmp] = extract(%[val], %[mask1].L)(Z);"
+		"%[tmp] <<= %[shift];"
+		"%[extr] = %[extr] | %[tmp];"
+		"%[mask1] = %[mask1] - %[mask2];"
+		".Lloop_e:"
+		"%[shift] += 1;"
+		: [val]"=d"(value), [extr]"=d"(extract), [shift]"=d"(tmp_shift), [tmp]"=d"(tmp),
+		  [mask1]"=d"(tmp_mask1), [mask2]"=d"(tmp_mask2)
+		: "d"(value), [lc]"a"(8)
+		: "ASTAT", "LB0", "LC0", "LT0"
+	);
 
 	pr_debug("	extract:%x\n", extract);
 	return extract;
-- 
cgit v0.10.2


From 8516c568f25666a012ec4c859e640a76fc9b6ec0 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:38:16 +0100
Subject: Blackfin Serial Driver: fix error while transferring large files

Ignore receiving data if new position is in the same line of current
buffer tail and is small.  This should decrease overruns.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index d86123e..65a4c07 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -477,6 +477,15 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 
 	spin_lock_irqsave(&uart->port.lock, flags);
 
+	/* 2D DMA RX buffer ring is used. Because curr_y_count and
+	 * curr_x_count can't be read as an atomic operation,
+	 * curr_y_count should be read before curr_x_count. When
+	 * curr_x_count is read, curr_y_count may already indicate
+	 * next buffer line. But, the position calculated here is
+	 * still indicate the old line. The wrong position data may
+	 * be smaller than current buffer tail, which cause garbages
+	 * are received if it is not prohibit.
+	 */
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
 	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
@@ -487,7 +496,11 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 		x_pos = 0;
 
 	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT + x_pos;
-	if (pos != uart->rx_dma_buf.tail) {
+	/* Ignore receiving data if new position is in the same line of
+	 * current buffer tail and small.
+	 */
+	if (pos > uart->rx_dma_buf.tail ||
+		uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
 		uart->rx_dma_buf.head = pos;
 		bfin_serial_dma_rx_chars(uart);
 		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
@@ -532,11 +545,25 @@ static irqreturn_t bfin_serial_dma_rx_int(int irq, void *dev_id)
 {
 	struct bfin_serial_port *uart = dev_id;
 	unsigned short irqstat;
+	int pos;
 
 	spin_lock(&uart->port.lock);
 	irqstat = get_dma_curr_irqstat(uart->rx_dma_channel);
 	clear_dma_irqstat(uart->rx_dma_channel);
-	bfin_serial_dma_rx_chars(uart);
+
+	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
+	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+		uart->rx_dma_nrows = 0;
+
+	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT;
+	if (pos > uart->rx_dma_buf.tail ||
+		uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
+		uart->rx_dma_buf.head = pos;
+		bfin_serial_dma_rx_chars(uart);
+		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
+	}
+
 	spin_unlock(&uart->port.lock);
 
 	return IRQ_HANDLED;
-- 
cgit v0.10.2


From 7de7c55bf54dede2bd2262349fc7b558bcc8e413 Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Thu, 11 Jun 2009 13:38:57 +0100
Subject: Blackfin Serial Driver: fix baudrate for early_printk

Since we already setup the early console UART in
arch/blackfin/kernel/early_printk.c, and common functions which are
enabled from the .setup will override the proper settings later, don't
fill in these structures.  Otherwise we get mangled baudrate settings when
using early_printk.

Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 65a4c07..dfae22d 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -1272,12 +1272,17 @@ static __init void early_serial_write(struct console *con, const char *s,
 	}
 }
 
+/*
+ * This should have a .setup or .early_setup in it, but then things get called
+ * without the command line options, and the baud rate gets messed up - so
+ * don't let the common infrastructure play with things. (see calls to setup
+ * & earlysetup in ./kernel/printk.c:register_console()
+ */
 static struct __initdata console bfin_early_serial_console = {
 	.name = "early_BFuart",
 	.write = early_serial_write,
 	.device = uart_console_device,
 	.flags = CON_PRINTBUFFER,
-	.setup = bfin_serial_console_setup,
 	.index = -1,
 	.data  = &bfin_serial_reg,
 };
-- 
cgit v0.10.2


From f9d36da9cdc2504cd9bb6034cfaba0673ce2d6df Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Thu, 11 Jun 2009 13:42:17 +0100
Subject: Blackfin Serial Driver: fix missing new lines when under load

Add a SSYNC() into bfin_serial_dma_tx_chars() to ensure DMA registers are
written with new data otherwise we might miss a byte or two when the
system is under load.  PIO mode is OK though.

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index dfae22d..676efda 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -415,6 +415,7 @@ static void bfin_serial_dma_tx_chars(struct bfin_serial_port *uart)
 	set_dma_start_addr(uart->tx_dma_channel, (unsigned long)(xmit->buf+xmit->tail));
 	set_dma_x_count(uart->tx_dma_channel, uart->tx_count);
 	set_dma_x_modify(uart->tx_dma_channel, 1);
+	SSYNC();
 	enable_dma(uart->tx_dma_channel);
 
 	UART_SET_IER(uart, ETBEI);
-- 
cgit v0.10.2


From 35ff69357949cfff5c3d8e3038b77146872e3bd3 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:42:57 +0100
Subject: Blackfin Serial Driver: handle irregular DMA register status in auto
 start mode

This bug is caused by irregular behavior of DMA register CURR_X_COUNT
and CURR_Y_COUNT when an auto restart uart rx DMA run to last byte in
DMA buffer, trigger the interrupt and stay at this possiton. The status
of current x and y is 0:7 instead of 512:8 or 0:8. The driver doesn't
take care of this case when calculating the position.

URL: http://blackfin.uclinux.org/gf/tracker/5063
Reported-by: Tomasz Motylewski <t.motylewski@bfad.de>

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 676efda..854e96d 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -490,7 +490,7 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
 	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
-	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
 		uart->rx_dma_nrows = 0;
 	x_pos = DMA_RX_XCOUNT - x_pos;
 	if (x_pos == DMA_RX_XCOUNT)
@@ -546,15 +546,16 @@ static irqreturn_t bfin_serial_dma_rx_int(int irq, void *dev_id)
 {
 	struct bfin_serial_port *uart = dev_id;
 	unsigned short irqstat;
-	int pos;
+	int x_pos, pos;
 
 	spin_lock(&uart->port.lock);
 	irqstat = get_dma_curr_irqstat(uart->rx_dma_channel);
 	clear_dma_irqstat(uart->rx_dma_channel);
 
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
+	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
-	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
 		uart->rx_dma_nrows = 0;
 
 	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT;
-- 
cgit v0.10.2


From 0efa4f2c944fabffc81918cc86d4d17dba39a021 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:45:07 +0100
Subject: Blackfin Serial Driver: annotate anomalies 05000215 and 05000099

Add some comments for how these anomalies are addressed:

05000215 - UART TX Interrupt Masked Erroneously
We always clear ETBEI within last UART TX interrupt to end
a string.  It is always set when starting a new tx transfer.

05000099 - UART Line Status Register (UART_LSR) Bits Are Not Updated at
the Same Time
This anomaly affects driver only in POLL code where multi bits of
UART_LSR are checked. It doesn't affect current bfin_5xx.c driver.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 854e96d..ab583ef 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -330,6 +330,11 @@ static void bfin_serial_tx_chars(struct bfin_serial_port *uart)
 		/* Clear TFI bit */
 		UART_PUT_LSR(uart, TFI);
 #endif
+		/* Anomaly notes:
+		 *  05000215 -	we always clear ETBEI within last UART TX
+		 *		interrupt to end a string. It is always set
+		 *		when start a new tx.
+		 */
 		UART_CLEAR_IER(uart, ETBEI);
 		return;
 	}
@@ -528,6 +533,11 @@ static irqreturn_t bfin_serial_dma_tx_int(int irq, void *dev_id)
 	if (!(get_dma_curr_irqstat(uart->tx_dma_channel)&DMA_RUN)) {
 		disable_dma(uart->tx_dma_channel);
 		clear_dma_irqstat(uart->tx_dma_channel);
+		/* Anomaly notes:
+		 *  05000215 -	we always clear ETBEI within last UART TX
+		 *		interrupt to end a string. It is always set
+		 *		when start a new tx.
+		 */
 		UART_CLEAR_IER(uart, ETBEI);
 		xmit->tail = (xmit->tail + uart->tx_count) & (UART_XMIT_SIZE - 1);
 		uart->port.icount.tx += uart->tx_count;
@@ -969,6 +979,10 @@ static void bfin_serial_reset_irda(struct uart_port *port)
 }
 
 #ifdef CONFIG_CONSOLE_POLL
+/* Anomaly notes:
+ *  05000099 -  Because we only use THRE in poll_put and DR in poll_get,
+ *		losing other bits of UART_LSR is not a problem here.
+ */
 static void bfin_serial_poll_put_char(struct uart_port *port, unsigned char chr)
 {
 	struct bfin_serial_port *uart = (struct bfin_serial_port *)port;
-- 
cgit v0.10.2


From 84507794a9d5d4decd6bc9c111480076dba0d301 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:50:20 +0100
Subject: Blackfin Serial Driver: handle anomaly 05000231

05000231 - UART STB Bit Incorrectly Affects Receiver Setting
For processors affected by this, we cannot safely allow CSTOPB to be set
as the UART will then be unable to properly clock in bytes.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index ab583ef..64603f5 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -828,8 +828,16 @@ bfin_serial_set_termios(struct uart_port *port, struct ktermios *termios,
 			__func__);
 	}
 
-	if (termios->c_cflag & CSTOPB)
-		lcr |= STB;
+	/* Anomaly notes:
+	 *  05000231 -  STOP bit is always set to 1 whatever the user is set.
+	 */
+	if (termios->c_cflag & CSTOPB) {
+		if (ANOMALY_05000231)
+			printk(KERN_WARNING "STOP bits other than 1 is not "
+				"supported in case of anomaly 05000231.\n");
+		else
+			lcr |= STB;
+	}
 	if (termios->c_cflag & PARENB)
 		lcr |= PEN;
 	if (!(termios->c_cflag & PARODD))
-- 
cgit v0.10.2


From 2860b7911137eabb01c159abefb506e538ff3cb7 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:51:33 +0100
Subject: Blackfin Serial Driver: disable dma rx interrupt only rather than all
 irqs

The UART RX handling code isn't exactly speeding, so don't go disabling
all interrupts when processing the buffer.  Just disable the relevant DMA
interrupt.  This greatly improves latency of the system when utilizing the
UART.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 64603f5..e2f6b1b 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -479,9 +479,9 @@ static void bfin_serial_dma_rx_chars(struct bfin_serial_port *uart)
 void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 {
 	int x_pos, pos;
-	unsigned long flags;
 
-	spin_lock_irqsave(&uart->port.lock, flags);
+	dma_disable_irq(uart->rx_dma_channel);
+	spin_lock_bh(&uart->port.lock);
 
 	/* 2D DMA RX buffer ring is used. Because curr_y_count and
 	 * curr_x_count can't be read as an atomic operation,
@@ -512,7 +512,8 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
 	}
 
-	spin_unlock_irqrestore(&uart->port.lock, flags);
+	spin_unlock_bh(&uart->port.lock);
+	dma_enable_irq(uart->rx_dma_channel);
 
 	mod_timer(&(uart->rx_dma_timer), jiffies + DMA_RX_FLUSH_JIFFIES);
 }
-- 
cgit v0.10.2


From b7c7cbc898e8a97829f33ad3bcd1b5e91690d8f4 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:52:31 +0100
Subject: USB: CP210X Add device IDs

Signed-off-by: Craig Shelley <craig@microtron.org.uk>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index d9f586d..ac4944a 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -57,13 +57,16 @@ static int debug;
 static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x0471, 0x066A) }, /* AKTAKOM ACE-1001 cable */
 	{ USB_DEVICE(0x0489, 0xE000) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */
+	{ USB_DEVICE(0x0745, 0x1000) }, /* CipherLab USB CCD Barcode Scanner 1000 */
 	{ USB_DEVICE(0x08e6, 0x5501) }, /* Gemalto Prox-PU/CU contactless smartcard reader */
+	{ USB_DEVICE(0x08FD, 0x000A) }, /* Digianswer A/S , ZigBee/802.15.4 MAC Device */
 	{ USB_DEVICE(0x0FCF, 0x1003) }, /* Dynastream ANT development board */
 	{ USB_DEVICE(0x0FCF, 0x1004) }, /* Dynastream ANT2USB */
 	{ USB_DEVICE(0x0FCF, 0x1006) }, /* Dynastream ANT development board */
 	{ USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */
 	{ USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */
 	{ USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */
+	{ USB_DEVICE(0x10C4, 0x0F91) }, /* Vstabi */
 	{ USB_DEVICE(0x10C4, 0x800A) }, /* SPORTident BSM7-D-USB main station */
 	{ USB_DEVICE(0x10C4, 0x803B) }, /* Pololu USB-serial converter */
 	{ USB_DEVICE(0x10C4, 0x8053) }, /* Enfora EDG1228 */
@@ -84,10 +87,12 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x10C4, 0x81C8) }, /* Lipowsky Industrie Elektronik GmbH, Baby-JTAG */
 	{ USB_DEVICE(0x10C4, 0x81E2) }, /* Lipowsky Industrie Elektronik GmbH, Baby-LIN */
 	{ USB_DEVICE(0x10C4, 0x81E7) }, /* Aerocomm Radio */
+	{ USB_DEVICE(0x10C4, 0x81F2) }, /* C1007 HF band RFID controller */
 	{ USB_DEVICE(0x10C4, 0x8218) }, /* Lipowsky Industrie Elektronik GmbH, HARP-1 */
 	{ USB_DEVICE(0x10C4, 0x822B) }, /* Modem EDGE(GSM) Comander 2 */
 	{ USB_DEVICE(0x10C4, 0x826B) }, /* Cygnal Integrated Products, Inc., Fasttrax GPS demostration module */
 	{ USB_DEVICE(0x10c4, 0x8293) }, /* Telegesys ETRX2USB */
+	{ USB_DEVICE(0x10C4, 0x82F9) }, /* Procyon AVS */
 	{ USB_DEVICE(0x10C4, 0x8341) }, /* Siemens MC35PU GPRS Modem */
 	{ USB_DEVICE(0x10C4, 0x83A8) }, /* Amber Wireless AMB2560 */
 	{ USB_DEVICE(0x10C4, 0x846E) }, /* BEI USB Sensor Interface (VCP) */
@@ -98,7 +103,9 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x10C4, 0xF003) }, /* Elan Digital Systems USBpulse100 */
 	{ USB_DEVICE(0x10C4, 0xF004) }, /* Elan Digital Systems USBcount50 */
 	{ USB_DEVICE(0x10C5, 0xEA61) }, /* Silicon Labs MobiData GPRS USB Modem */
+	{ USB_DEVICE(0x10CE, 0xEA6A) }, /* Silicon Labs MobiData GPRS USB Modem 100EU */
 	{ USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */
+	{ USB_DEVICE(0x1555, 0x0004) }, /* Owen AC4 USB-RS485 Converter */
 	{ USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */
 	{ USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */
 	{ USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
-- 
cgit v0.10.2


From 93ef1f1fbce37f14666e4856ff933d4a1b735d02 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:53:30 +0100
Subject: USB: CP210X Use official request code definitions

The CP210X driver was developed without official device specifications.
This has lead to an incorrect assumption that all GET request codes are
equal to the corresponding SET request code +1.
This patch removes this incorrect assumption, and uses request code
definitions based on the updated GPL driver from SiLabs.
This modification is needed before extended functionality such as GPIO
on CP2103 can be supported.

Signed-off-by: Craig Shelley <craig@microtron.org.uk>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index ac4944a..b50f27f 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -27,7 +27,7 @@
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.08"
+#define DRIVER_VERSION "v0.09"
 #define DRIVER_DESC "Silicon Labs CP2101/CP2102 RS232 serial adaptor driver"
 
 /*
@@ -144,23 +144,40 @@ static struct usb_serial_driver cp2101_device = {
 #define REQTYPE_HOST_TO_DEVICE	0x41
 #define REQTYPE_DEVICE_TO_HOST	0xc1
 
-/* Config SET requests. To GET, add 1 to the request number */
-#define CP2101_UART 		0x00	/* Enable / Disable */
-#define CP2101_BAUDRATE		0x01	/* (BAUD_RATE_GEN_FREQ / baudrate) */
-#define CP2101_BITS		0x03	/* 0x(0)(databits)(parity)(stopbits) */
-#define CP2101_BREAK		0x05	/* On / Off */
-#define CP2101_CONTROL		0x07	/* Flow control line states */
-#define CP2101_MODEMCTL		0x13	/* Modem controls */
-#define CP2101_CONFIG_6		0x19	/* 6 bytes of config data ??? */
-
-/* CP2101_UART */
+/* Config request codes */
+#define CP210X_IFC_ENABLE	0x00
+#define CP210X_SET_BAUDDIV	0x01
+#define CP210X_GET_BAUDDIV	0x02
+#define CP210X_SET_LINE_CTL	0x03
+#define CP210X_GET_LINE_CTL	0x04
+#define CP210X_SET_BREAK	0x05
+#define CP210X_IMM_CHAR		0x06
+#define CP210X_SET_MHS		0x07
+#define CP210X_GET_MDMSTS	0x08
+#define CP210X_SET_XON		0x09
+#define CP210X_SET_XOFF		0x0A
+#define CP210X_SET_EVENTMASK	0x0B
+#define CP210X_GET_EVENTMASK	0x0C
+#define CP210X_SET_CHAR		0x0D
+#define CP210X_GET_CHARS	0x0E
+#define CP210X_GET_PROPS	0x0F
+#define CP210X_GET_COMM_STATUS	0x10
+#define CP210X_RESET		0x11
+#define CP210X_PURGE		0x12
+#define CP210X_SET_FLOW		0x13
+#define CP210X_GET_FLOW		0x14
+#define CP210X_EMBED_EVENTS	0x15
+#define CP210X_GET_EVENTSTATE	0x16
+#define CP210X_SET_CHARS	0x19
+
+/* CP210X_IFC_ENABLE */
 #define UART_ENABLE		0x0001
 #define UART_DISABLE		0x0000
 
-/* CP2101_BAUDRATE */
+/* CP210X_(SET|GET)_BAUDDIV */
 #define BAUD_RATE_GEN_FREQ	0x384000
 
-/* CP2101_BITS */
+/* CP210X_(SET|GET)_LINE_CTL */
 #define BITS_DATA_MASK		0X0f00
 #define BITS_DATA_5		0X0500
 #define BITS_DATA_6		0X0600
@@ -180,11 +197,11 @@ static struct usb_serial_driver cp2101_device = {
 #define BITS_STOP_1_5		0x0001
 #define BITS_STOP_2		0x0002
 
-/* CP2101_BREAK */
+/* CP210X_SET_BREAK */
 #define BREAK_ON		0x0000
 #define BREAK_OFF		0x0001
 
-/* CP2101_CONTROL */
+/* CP210X_(SET_MHS|GET_MDMSTS) */
 #define CONTROL_DTR		0x0001
 #define CONTROL_RTS		0x0002
 #define CONTROL_CTS		0x0010
@@ -217,9 +234,6 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
 		return -ENOMEM;
 	}
 
-	/* For get requests, the request number must be incremented */
-	request++;
-
 	/* Issue the request, attempting to read 'size' bytes */
 	result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 				request, REQTYPE_DEVICE_TO_HOST, 0x0000,
@@ -357,7 +371,7 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	if (cp2101_set_config_single(port, CP2101_UART, UART_ENABLE)) {
+	if (cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
 		dev_err(&port->dev, "%s - Unable to enable UART\n",
 				__func__);
 		return -EPROTO;
@@ -415,7 +429,7 @@ static void cp2101_close(struct usb_serial_port *port)
 
 	mutex_lock(&port->serial->disc_mutex);
 	if (!port->serial->disconnected)
-		cp2101_set_config_single(port, CP2101_UART, UART_DISABLE);
+		cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
 	mutex_unlock(&port->serial->disc_mutex);
 }
 
@@ -456,7 +470,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP2101_BAUDRATE, &baud, 2);
+	cp2101_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
 	/* Convert to baudrate */
 	if (baud)
 		baud = cp2101_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
@@ -466,7 +480,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	cflag = *cflagp;
 
-	cp2101_get_config(port, CP2101_BITS, &bits, 2);
+	cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 	cflag &= ~CSIZE;
 	switch (bits & BITS_DATA_MASK) {
 	case BITS_DATA_5:
@@ -491,14 +505,14 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown number of data bits, using 8", __func__);
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -521,20 +535,20 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_PARITY_SPACE:
 		dbg("%s - parity = SPACE (not supported, disabling parity)",
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown parity mode, disabling parity", __func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -547,7 +561,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - stop bits = 1.5 (not supported, using 1 stop bit)",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_STOP_2:
 		dbg("%s - stop bits = 2", __func__);
@@ -557,11 +571,11 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - Unknown number of stop bits, using 1 stop bit",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
-	cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+	cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 	if (modem_ctl[0] & 0x0008) {
 		dbg("%s - flow control = CRTSCTS", __func__);
 		cflag |= CRTSCTS;
@@ -594,7 +608,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 	if (baud != tty_termios_baud_rate(old_termios) && baud != 0) {
 		dbg("%s - Setting baud rate to %d baud", __func__,
 				baud);
-		if (cp2101_set_config_single(port, CP2101_BAUDRATE,
+		if (cp2101_set_config_single(port, CP210X_SET_BAUDDIV,
 					((BAUD_RATE_GEN_FREQ + baud/2) / baud))) {
 			dbg("Baud rate requested not supported by device\n");
 			baud = tty_termios_baud_rate(old_termios);
@@ -605,7 +619,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 
 	/* If the number of data bits is to be updated */
 	if ((cflag & CSIZE) != (old_cflag & CSIZE)) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_DATA_MASK;
 		switch (cflag & CSIZE) {
 		case CS5:
@@ -635,13 +649,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				bits |= BITS_DATA_8;
 				break;
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of data bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD))) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_PARITY_MASK;
 		if (cflag & PARENB) {
 			if (cflag & PARODD) {
@@ -652,13 +666,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				dbg("%s - parity = EVEN", __func__);
 			}
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Parity mode not supported "
 					"by device\n");
 	}
 
 	if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_STOP_MASK;
 		if (cflag & CSTOPB) {
 			bits |= BITS_STOP_2;
@@ -667,13 +681,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			bits |= BITS_STOP_1;
 			dbg("%s - stop bits = 1", __func__);
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of stop bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & CRTSCTS) != (old_cflag & CRTSCTS)) {
-		cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+		cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 		dbg("%s - read modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
@@ -693,7 +707,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 		dbg("%s - write modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
-		cp2101_set_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+		cp2101_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
 	}
 
 }
@@ -731,7 +745,7 @@ static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
 
 	dbg("%s - control = 0x%.4x", __func__, control);
 
-	return cp2101_set_config(port, CP2101_CONTROL, &control, 2);
+	return cp2101_set_config(port, CP210X_SET_MHS, &control, 2);
 }
 
 static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
@@ -742,7 +756,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP2101_CONTROL, &control, 1);
+	cp2101_get_config(port, CP210X_GET_MDMSTS, &control, 1);
 
 	result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0)
 		|((control & CONTROL_RTS) ? TIOCM_RTS : 0)
@@ -768,7 +782,7 @@ static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
 		state = BREAK_ON;
 	dbg("%s - turning break %s", __func__,
 			state == BREAK_OFF ? "off" : "on");
-	cp2101_set_config(port, CP2101_BREAK, &state, 2);
+	cp2101_set_config(port, CP210X_SET_BREAK, &state, 2);
 }
 
 static int cp2101_startup(struct usb_serial *serial)
-- 
cgit v0.10.2


From 4cc27bd6d7d6750dba33b4ccb4585c00b8fca7d2 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:54:40 +0100
Subject: USB: CP210X Replace CP2101 with CP210x

This patch replaces the string "CP2101" with "CP210x" within cp210x.c
This is to reduce confusion about the fact that the driver is actually
compatible with CP2101, CP2102 and CP2103 devices.

Signed-off-by: Craig Shelley <craig@microtron.org.uk>

(Fixed some collisions merging)

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index b50f27f..16a154d 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -1,5 +1,5 @@
 /*
- * Silicon Laboratories CP2101/CP2102 USB to RS232 serial adaptor driver
+ * Silicon Laboratories CP210x USB to RS232 serial adaptor driver
  *
  * Copyright (C) 2005 Craig Shelley (craig@microtron.org.uk)
  *
@@ -28,29 +28,29 @@
  * Version Information
  */
 #define DRIVER_VERSION "v0.09"
-#define DRIVER_DESC "Silicon Labs CP2101/CP2102 RS232 serial adaptor driver"
+#define DRIVER_DESC "Silicon Labs CP210x RS232 serial adaptor driver"
 
 /*
  * Function Prototypes
  */
-static int cp2101_open(struct tty_struct *, struct usb_serial_port *,
+static int cp210x_open(struct tty_struct *, struct usb_serial_port *,
 							struct file *);
-static void cp2101_cleanup(struct usb_serial_port *);
-static void cp2101_close(struct usb_serial_port *);
-static void cp2101_get_termios(struct tty_struct *,
+static void cp210x_cleanup(struct usb_serial_port *);
+static void cp210x_close(struct usb_serial_port *);
+static void cp210x_get_termios(struct tty_struct *,
 	struct usb_serial_port *port);
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
 	unsigned int *cflagp, unsigned int *baudp);
-static void cp2101_set_termios(struct tty_struct *, struct usb_serial_port *,
+static void cp210x_set_termios(struct tty_struct *, struct usb_serial_port *,
 							struct ktermios*);
-static int cp2101_tiocmget(struct tty_struct *, struct file *);
-static int cp2101_tiocmset(struct tty_struct *, struct file *,
+static int cp210x_tiocmget(struct tty_struct *, struct file *);
+static int cp210x_tiocmset(struct tty_struct *, struct file *,
 		unsigned int, unsigned int);
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *,
 		unsigned int, unsigned int);
-static void cp2101_break_ctl(struct tty_struct *, int);
-static int cp2101_startup(struct usb_serial *);
-static void cp2101_shutdown(struct usb_serial *);
+static void cp210x_break_ctl(struct tty_struct *, int);
+static int cp210x_startup(struct usb_serial *);
+static void cp210x_shutdown(struct usb_serial *);
 
 static int debug;
 
@@ -114,30 +114,30 @@ static struct usb_device_id id_table [] = {
 
 MODULE_DEVICE_TABLE(usb, id_table);
 
-static struct usb_driver cp2101_driver = {
-	.name		= "cp2101",
+static struct usb_driver cp210x_driver = {
+	.name		= "cp210x",
 	.probe		= usb_serial_probe,
 	.disconnect	= usb_serial_disconnect,
 	.id_table	= id_table,
 	.no_dynamic_id	= 	1,
 };
 
-static struct usb_serial_driver cp2101_device = {
+static struct usb_serial_driver cp210x_device = {
 	.driver = {
 		.owner =	THIS_MODULE,
-		.name = 	"cp2101",
+		.name = 	"cp210x",
 	},
-	.usb_driver		= &cp2101_driver,
+	.usb_driver		= &cp210x_driver,
 	.id_table		= id_table,
 	.num_ports		= 1,
-	.open			= cp2101_open,
-	.close			= cp2101_close,
-	.break_ctl		= cp2101_break_ctl,
-	.set_termios		= cp2101_set_termios,
-	.tiocmget 		= cp2101_tiocmget,
-	.tiocmset		= cp2101_tiocmset,
-	.attach			= cp2101_startup,
-	.shutdown		= cp2101_shutdown,
+	.open			= cp210x_open,
+	.close			= cp210x_close,
+	.break_ctl		= cp210x_break_ctl,
+	.set_termios		= cp210x_set_termios,
+	.tiocmget 		= cp210x_tiocmget,
+	.tiocmset		= cp210x_tiocmset,
+	.attach			= cp210x_startup,
+	.shutdown		= cp210x_shutdown,
 };
 
 /* Config request types */
@@ -212,13 +212,13 @@ static struct usb_serial_driver cp2101_device = {
 #define CONTROL_WRITE_RTS	0x0200
 
 /*
- * cp2101_get_config
- * Reads from the CP2101 configuration registers
+ * cp210x_get_config
+ * Reads from the CP210x configuration registers
  * 'size' is specified in bytes.
  * 'data' is a pointer to a pre-allocated array of integers large
  * enough to hold 'size' bytes (with 4 bytes to each integer)
  */
-static int cp2101_get_config(struct usb_serial_port *port, u8 request,
+static int cp210x_get_config(struct usb_serial_port *port, u8 request,
 		unsigned int *data, int size)
 {
 	struct usb_serial *serial = port->serial;
@@ -256,12 +256,12 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config
- * Writes to the CP2101 configuration registers
+ * cp210x_set_config
+ * Writes to the CP210x configuration registers
  * Values less than 16 bits wide are sent directly
  * 'size' is specified in bytes.
  */
-static int cp2101_set_config(struct usb_serial_port *port, u8 request,
+static int cp210x_set_config(struct usb_serial_port *port, u8 request,
 		unsigned int *data, int size)
 {
 	struct usb_serial *serial = port->serial;
@@ -312,21 +312,21 @@ static int cp2101_set_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config_single
- * Convenience function for calling cp2101_set_config on single data values
+ * cp210x_set_config_single
+ * Convenience function for calling cp210x_set_config on single data values
  * without requiring an integer pointer
  */
-static inline int cp2101_set_config_single(struct usb_serial_port *port,
+static inline int cp210x_set_config_single(struct usb_serial_port *port,
 		u8 request, unsigned int data)
 {
-	return cp2101_set_config(port, request, &data, 2);
+	return cp210x_set_config(port, request, &data, 2);
 }
 
 /*
- * cp2101_quantise_baudrate
+ * cp210x_quantise_baudrate
  * Quantises the baud rate as per AN205 Table 1
  */
-static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
+static unsigned int cp210x_quantise_baudrate(unsigned int baud) {
 	if      (baud <= 56)       baud = 0;
 	else if (baud <= 300)      baud = 300;
 	else if (baud <= 600)      baud = 600;
@@ -363,7 +363,7 @@ static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
 	return baud;
 }
 
-static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
+static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port,
 				struct file *filp)
 {
 	struct usb_serial *serial = port->serial;
@@ -371,7 +371,7 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	if (cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
+	if (cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
 		dev_err(&port->dev, "%s - Unable to enable UART\n",
 				__func__);
 		return -EPROTO;
@@ -393,17 +393,17 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 	}
 
 	/* Configure the termios structure */
-	cp2101_get_termios(tty, port);
+	cp210x_get_termios(tty, port);
 
 	/* Set the DTR and RTS pins low */
-	cp2101_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
+	cp210x_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
 			: port,
 		NULL, TIOCM_DTR | TIOCM_RTS, 0);
 
 	return 0;
 }
 
-static void cp2101_cleanup(struct usb_serial_port *port)
+static void cp210x_cleanup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 
@@ -418,7 +418,7 @@ static void cp2101_cleanup(struct usb_serial_port *port)
 	}
 }
 
-static void cp2101_close(struct usb_serial_port *port)
+static void cp210x_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
@@ -429,23 +429,23 @@ static void cp2101_close(struct usb_serial_port *port)
 
 	mutex_lock(&port->serial->disc_mutex);
 	if (!port->serial->disconnected)
-		cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
+		cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
 	mutex_unlock(&port->serial->disc_mutex);
 }
 
 /*
- * cp2101_get_termios
+ * cp210x_get_termios
  * Reads the baud rate, data bits, parity, stop bits and flow control mode
  * from the device, corrects any unsupported values, and configures the
  * termios structure to reflect the state of the device
  */
-static void cp2101_get_termios(struct tty_struct *tty,
+static void cp210x_get_termios(struct tty_struct *tty,
 	struct usb_serial_port *port)
 {
 	unsigned int baud;
 
 	if (tty) {
-		cp2101_get_termios_port(tty->driver_data,
+		cp210x_get_termios_port(tty->driver_data,
 			&tty->termios->c_cflag, &baud);
 		tty_encode_baud_rate(tty, baud, baud);
 	}
@@ -453,15 +453,15 @@ static void cp2101_get_termios(struct tty_struct *tty,
 	else {
 		unsigned int cflag;
 		cflag = 0;
-		cp2101_get_termios_port(port, &cflag, &baud);
+		cp210x_get_termios_port(port, &cflag, &baud);
 	}
 }
 
 /*
- * cp2101_get_termios_port
- * This is the heart of cp2101_get_termios which always uses a &usb_serial_port.
+ * cp210x_get_termios_port
+ * This is the heart of cp210x_get_termios which always uses a &usb_serial_port.
  */
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
 	unsigned int *cflagp, unsigned int *baudp)
 {
 	unsigned int cflag, modem_ctl[4];
@@ -470,17 +470,17 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
+	cp210x_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
 	/* Convert to baudrate */
 	if (baud)
-		baud = cp2101_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
+		baud = cp210x_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
 
 	dbg("%s - baud rate = %d", __func__, baud);
 	*baudp = baud;
 
 	cflag = *cflagp;
 
-	cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+	cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 	cflag &= ~CSIZE;
 	switch (bits & BITS_DATA_MASK) {
 	case BITS_DATA_5:
@@ -505,14 +505,14 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown number of data bits, using 8", __func__);
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -535,20 +535,20 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_PARITY_SPACE:
 		dbg("%s - parity = SPACE (not supported, disabling parity)",
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown parity mode, disabling parity", __func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -561,7 +561,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - stop bits = 1.5 (not supported, using 1 stop bit)",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_STOP_2:
 		dbg("%s - stop bits = 2", __func__);
@@ -571,11 +571,11 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - Unknown number of stop bits, using 1 stop bit",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
-	cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
+	cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 	if (modem_ctl[0] & 0x0008) {
 		dbg("%s - flow control = CRTSCTS", __func__);
 		cflag |= CRTSCTS;
@@ -587,7 +587,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 	*cflagp = cflag;
 }
 
-static void cp2101_set_termios(struct tty_struct *tty,
+static void cp210x_set_termios(struct tty_struct *tty,
 		struct usb_serial_port *port, struct ktermios *old_termios)
 {
 	unsigned int cflag, old_cflag;
@@ -602,13 +602,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 	tty->termios->c_cflag &= ~CMSPAR;
 	cflag = tty->termios->c_cflag;
 	old_cflag = old_termios->c_cflag;
-	baud = cp2101_quantise_baudrate(tty_get_baud_rate(tty));
+	baud = cp210x_quantise_baudrate(tty_get_baud_rate(tty));
 
 	/* If the baud rate is to be updated*/
 	if (baud != tty_termios_baud_rate(old_termios) && baud != 0) {
 		dbg("%s - Setting baud rate to %d baud", __func__,
 				baud);
-		if (cp2101_set_config_single(port, CP210X_SET_BAUDDIV,
+		if (cp210x_set_config_single(port, CP210X_SET_BAUDDIV,
 					((BAUD_RATE_GEN_FREQ + baud/2) / baud))) {
 			dbg("Baud rate requested not supported by device\n");
 			baud = tty_termios_baud_rate(old_termios);
@@ -619,7 +619,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 
 	/* If the number of data bits is to be updated */
 	if ((cflag & CSIZE) != (old_cflag & CSIZE)) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_DATA_MASK;
 		switch (cflag & CSIZE) {
 		case CS5:
@@ -643,19 +643,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			dbg("%s - data bits = 9", __func__);
 			break;*/
 		default:
-			dbg("cp2101 driver does not "
+			dbg("cp210x driver does not "
 					"support the number of bits requested,"
 					" using 8 bit mode\n");
 				bits |= BITS_DATA_8;
 				break;
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of data bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD))) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_PARITY_MASK;
 		if (cflag & PARENB) {
 			if (cflag & PARODD) {
@@ -666,13 +666,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				dbg("%s - parity = EVEN", __func__);
 			}
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Parity mode not supported "
 					"by device\n");
 	}
 
 	if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_STOP_MASK;
 		if (cflag & CSTOPB) {
 			bits |= BITS_STOP_2;
@@ -681,13 +681,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			bits |= BITS_STOP_1;
 			dbg("%s - stop bits = 1", __func__);
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of stop bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & CRTSCTS) != (old_cflag & CRTSCTS)) {
-		cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
+		cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 		dbg("%s - read modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
@@ -707,19 +707,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
 		dbg("%s - write modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
-		cp2101_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
+		cp210x_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
 	}
 
 }
 
-static int cp2101_tiocmset (struct tty_struct *tty, struct file *file,
+static int cp210x_tiocmset (struct tty_struct *tty, struct file *file,
 		unsigned int set, unsigned int clear)
 {
 	struct usb_serial_port *port = tty->driver_data;
-	return cp2101_tiocmset_port(port, file, set, clear);
+	return cp210x_tiocmset_port(port, file, set, clear);
 }
 
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *file,
 		unsigned int set, unsigned int clear)
 {
 	unsigned int control = 0;
@@ -745,10 +745,10 @@ static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
 
 	dbg("%s - control = 0x%.4x", __func__, control);
 
-	return cp2101_set_config(port, CP210X_SET_MHS, &control, 2);
+	return cp210x_set_config(port, CP210X_SET_MHS, &control, 2);
 }
 
-static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
+static int cp210x_tiocmget (struct tty_struct *tty, struct file *file)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned int control;
@@ -756,7 +756,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP210X_GET_MDMSTS, &control, 1);
+	cp210x_get_config(port, CP210X_GET_MDMSTS, &control, 1);
 
 	result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0)
 		|((control & CONTROL_RTS) ? TIOCM_RTS : 0)
@@ -770,7 +770,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 	return result;
 }
 
-static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
+static void cp210x_break_ctl (struct tty_struct *tty, int break_state)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned int state;
@@ -782,17 +782,17 @@ static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
 		state = BREAK_ON;
 	dbg("%s - turning break %s", __func__,
 			state == BREAK_OFF ? "off" : "on");
-	cp2101_set_config(port, CP210X_SET_BREAK, &state, 2);
+	cp210x_set_config(port, CP210X_SET_BREAK, &state, 2);
 }
 
-static int cp2101_startup(struct usb_serial *serial)
+static int cp210x_startup(struct usb_serial *serial)
 {
-	/* CP2101 buffers behave strangely unless device is reset */
+	/* cp210x buffers behave strangely unless device is reset */
 	usb_reset_device(serial->dev);
 	return 0;
 }
 
-static void cp2101_shutdown(struct usb_serial *serial)
+static void cp210x_shutdown(struct usb_serial *serial)
 {
 	int i;
 
@@ -800,21 +800,21 @@ static void cp2101_shutdown(struct usb_serial *serial)
 
 	/* Stop reads and writes on all ports */
 	for (i = 0; i < serial->num_ports; ++i)
-		cp2101_cleanup(serial->port[i]);
+		cp210x_cleanup(serial->port[i]);
 }
 
-static int __init cp2101_init(void)
+static int __init cp210x_init(void)
 {
 	int retval;
 
-	retval = usb_serial_register(&cp2101_device);
+	retval = usb_serial_register(&cp210x_device);
 	if (retval)
 		return retval; /* Failed to register */
 
-	retval = usb_register(&cp2101_driver);
+	retval = usb_register(&cp210x_driver);
 	if (retval) {
 		/* Failed to register */
-		usb_serial_deregister(&cp2101_device);
+		usb_serial_deregister(&cp210x_device);
 		return retval;
 	}
 
@@ -824,14 +824,14 @@ static int __init cp2101_init(void)
 	return 0;
 }
 
-static void __exit cp2101_exit(void)
+static void __exit cp210x_exit(void)
 {
-	usb_deregister(&cp2101_driver);
-	usb_serial_deregister(&cp2101_device);
+	usb_deregister(&cp210x_driver);
+	usb_serial_deregister(&cp210x_device);
 }
 
-module_init(cp2101_init);
-module_exit(cp2101_exit);
+module_init(cp210x_init);
+module_exit(cp210x_exit);
 
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_VERSION(DRIVER_VERSION);
-- 
cgit v0.10.2


From 557aaa7ffab639d0190b935a041b16ae44606342 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:55:34 +0100
Subject: ft232: support the ASYNC_LOW_LATENCY flag

This allows users to use the standard setserial command with this FT232
feature as well as obscure chip specific interfaces we have now. We keep
track of and respect the sysfs value for non-low-latency cases. In theory we
could do smart stuff with VTIME and the like but this seems of questionable
worth.

Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=9120
Signed-off-by: Alan Cox <alan@linux.intel.com)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index d9d8711..21c053c 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -89,6 +89,7 @@ struct ftdi_private {
 	int force_rtscts;	/* if non-zero, force RTS-CTS to always
 				   be enabled */
 
+	unsigned int latency;		/* latency setting in use */
 	spinlock_t tx_lock;	/* spinlock for transmit state */
 	unsigned long tx_bytes;
 	unsigned long tx_outstanding_bytes;
@@ -1038,7 +1039,54 @@ static int change_speed(struct tty_struct *tty, struct usb_serial_port *port)
 	return rv;
 }
 
+static int write_latency_timer(struct usb_serial_port *port)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	struct usb_device *udev = port->serial->dev;
+	char buf[1];
+	int rv = 0;
+	int l = priv->latency;
+
+	if (priv->flags & ASYNC_LOW_LATENCY)
+		l = 1;
+
+	dbg("%s: setting latency timer = %i", __func__, l);
+
+	rv = usb_control_msg(udev,
+			     usb_sndctrlpipe(udev, 0),
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
+			     l, priv->interface,
+			     buf, 0, WDR_TIMEOUT);
 
+	if (rv < 0)
+		dev_err(&port->dev, "Unable to write latency timer: %i\n", rv);
+	return rv;
+}
+
+static int read_latency_timer(struct usb_serial_port *port)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	struct usb_device *udev = port->serial->dev;
+	unsigned short latency = 0;
+	int rv = 0;
+
+
+	dbg("%s", __func__);
+
+	rv = usb_control_msg(udev,
+			     usb_rcvctrlpipe(udev, 0),
+			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
+			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
+			     0, priv->interface,
+			     (char *) &latency, 1, WDR_TIMEOUT);
+
+	if (rv < 0) {
+		dev_err(&port->dev, "Unable to read latency timer: %i\n", rv);
+		return -EIO;
+	}
+	return latency;
+}
 
 static int get_serial_info(struct usb_serial_port *port,
 				struct serial_struct __user *retinfo)
@@ -1098,6 +1146,7 @@ static int set_serial_info(struct tty_struct *tty,
 	priv->custom_divisor = new_serial.custom_divisor;
 
 	tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+	write_latency_timer(port);
 
 check_and_exit:
 	if ((old_priv.flags & ASYNC_SPD_MASK) !=
@@ -1193,27 +1242,13 @@ static ssize_t show_latency_timer(struct device *dev,
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	struct usb_device *udev = port->serial->dev;
-	unsigned short latency = 0;
-	int rv = 0;
-
-
-	dbg("%s", __func__);
-
-	rv = usb_control_msg(udev,
-			     usb_rcvctrlpipe(udev, 0),
-			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
-			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
-			     0, priv->interface,
-			     (char *) &latency, 1, WDR_TIMEOUT);
-
-	if (rv < 0) {
-		dev_err(dev, "Unable to read latency timer: %i\n", rv);
-		return -EIO;
-	}
-	return sprintf(buf, "%i\n", latency);
+	if (priv->flags & ASYNC_LOW_LATENCY)
+		return sprintf(buf, "1\n");
+	else
+		return sprintf(buf, "%i\n", priv->latency);
 }
 
+
 /* Write a new value of the latency timer, in units of milliseconds. */
 static ssize_t store_latency_timer(struct device *dev,
 			struct device_attribute *attr, const char *valbuf,
@@ -1221,25 +1256,13 @@ static ssize_t store_latency_timer(struct device *dev,
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	struct usb_device *udev = port->serial->dev;
-	char buf[1];
 	int v = simple_strtoul(valbuf, NULL, 10);
 	int rv = 0;
 
-	dbg("%s: setting latency timer = %i", __func__, v);
-
-	rv = usb_control_msg(udev,
-			     usb_sndctrlpipe(udev, 0),
-			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
-			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
-			     v, priv->interface,
-			     buf, 0, WDR_TIMEOUT);
-
-	if (rv < 0) {
-		dev_err(dev, "Unable to write latency timer: %i\n", rv);
+	priv->latency = v;
+	rv = write_latency_timer(port);
+	if (rv < 0)
 		return -EIO;
-	}
-
 	return count;
 }
 
@@ -1393,6 +1416,7 @@ static int ftdi_sio_port_probe(struct usb_serial_port *port)
 	usb_set_serial_port_data(port, priv);
 
 	ftdi_determine_type(port);
+	read_latency_timer(port);
 	create_sysfs_attrs(port);
 	return 0;
 }
@@ -1515,6 +1539,8 @@ static int ftdi_open(struct tty_struct *tty,
 	if (tty)
 		tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
+	write_latency_timer(port);
+
 	/* No error checking for this (will get errors later anyway) */
 	/* See ftdi_sio.h for description of what is reset */
 	usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
-- 
cgit v0.10.2


From 7f8d09eae26a8108406583192996561665b36371 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:56:16 +0100
Subject: tty: fix bluetooth scribbling on low latency flags

Bluetooth shouldn't be doing this as most drivers don't support the flag,
furthermore it shouldn't be needed with newer buffering. This becomes rather
more visible as the locking fixes make the abuse of low_latency visible as
spew on the users console/dmesg.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 6880151..4895f0e 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -463,7 +463,6 @@ static int hci_uart_tty_ioctl(struct tty_struct *tty, struct file * file,
 				clear_bit(HCI_UART_PROTO_SET, &hu->flags);
 				return err;
 			}
-			tty->low_latency = 1;
 		} else
 			return -EBUSY;
 		break;
-- 
cgit v0.10.2


From 7e9cd3a617414cfe74342659ceeb4e92975c1efa Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:57:01 +0100
Subject: ftdi_sio: don't override modem bits

The new open/close logic handles DTR and friends, so don't do it in our own
open routine as well.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 21c053c..683304d 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1556,11 +1556,6 @@ static int ftdi_open(struct tty_struct *tty,
 	if (tty)
 		ftdi_set_termios(tty, port, tty->termios);
 
-	/* FIXME: Flow control might be enabled, so it should be checked -
-	   we have no control of defaults! */
-	/* Turn on RTS and DTR since we are not flow controlling by default */
-	set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-
 	/* Not throttled */
 	spin_lock_irqsave(&priv->rx_lock, flags);
 	priv->rx_flags &= ~(THROTTLED | ACTUALLY_THROTTLED);
-- 
cgit v0.10.2


From 0b91421857414f525690ee452c0b0acb6ab1dba3 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 14:01:45 +0100
Subject: tty: bfin_jtag_comm: emulate a TTY over the Blackfin EMUDAT/JTAG
 interface

The Blackfin JTAG interface has a 4 byte generic data field (EMUDAT).  With
a little creative thinking, we can turn this into a TTY device.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 735bbe2..02ecfd5 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -97,6 +97,19 @@ config DEVKMEM
 	  kind of kernel debugging operations.
 	  When in doubt, say "N".
 
+config BFIN_JTAG_COMM
+	tristate "Blackfin JTAG Communication"
+	depends on BLACKFIN
+	help
+	  Add support for emulating a TTY device over the Blackfin JTAG.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called bfin_jtag_comm.
+
+config BFIN_JTAG_COMM_CONSOLE
+	bool "Console on Blackfin JTAG"
+	depends on BFIN_JTAG_COMM=y
+
 config SERIAL_NONSTANDARD
 	bool "Non-standard serial port support"
 	depends on HAS_IOMEM
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 9caf5b5..189efcf 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_LEGACY_PTYS)	+= pty.o
 obj-$(CONFIG_UNIX98_PTYS)	+= pty.o
 obj-y				+= misc.o
 obj-$(CONFIG_VT)		+= vt_ioctl.o vc_screen.o selection.o keyboard.o
+obj-$(CONFIG_BFIN_JTAG_COMM)	+= bfin_jtag_comm.o
 obj-$(CONFIG_CONSOLE_TRANSLATIONS) += consolemap.o consolemap_deftbl.o
 obj-$(CONFIG_HW_CONSOLE)	+= vt.o defkeymap.o
 obj-$(CONFIG_AUDIT)		+= tty_audit.o
diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c
new file mode 100644
index 0000000..44c113d
--- /dev/null
+++ b/drivers/char/bfin_jtag_comm.c
@@ -0,0 +1,365 @@
+/*
+ * TTY over Blackfin JTAG Communication
+ *
+ * Copyright 2008-2009 Analog Devices Inc.
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/circ_buf.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+#include <asm/atomic.h>
+
+/* See the Debug/Emulation chapter in the HRM */
+#define EMUDOF   0x00000001	/* EMUDAT_OUT full & valid */
+#define EMUDIF   0x00000002	/* EMUDAT_IN full & valid */
+#define EMUDOOVF 0x00000004	/* EMUDAT_OUT overflow */
+#define EMUDIOVF 0x00000008	/* EMUDAT_IN overflow */
+
+#define DRV_NAME "bfin-jtag-comm"
+#define DEV_NAME "ttyBFJC"
+
+#define pr_init(fmt, args...) ({ static const __initdata char __fmt[] = fmt; printk(__fmt, ## args); })
+#define debug(fmt, args...) pr_debug(DRV_NAME ": " fmt, ## args)
+
+static inline uint32_t bfin_write_emudat(uint32_t emudat)
+{
+	__asm__ __volatile__("emudat = %0;" : : "d"(emudat));
+	return emudat;
+}
+
+static inline uint32_t bfin_read_emudat(void)
+{
+	uint32_t emudat;
+	__asm__ __volatile__("%0 = emudat;" : "=d"(emudat));
+	return emudat;
+}
+
+static inline uint32_t bfin_write_emudat_chars(char a, char b, char c, char d)
+{
+	return bfin_write_emudat((a << 0) | (b << 8) | (c << 16) | (d << 24));
+}
+
+#define CIRC_SIZE 2048	/* see comment in tty_io.c:do_tty_write() */
+#define CIRC_MASK (CIRC_SIZE - 1)
+#define circ_empty(circ)     ((circ)->head == (circ)->tail)
+#define circ_free(circ)      CIRC_SPACE((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_cnt(circ)       CIRC_CNT((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_byte(circ, idx) ((circ)->buf[(idx) & CIRC_MASK])
+
+static struct tty_driver *bfin_jc_driver;
+static struct task_struct *bfin_jc_kthread;
+static struct tty_struct * volatile bfin_jc_tty;
+static unsigned long bfin_jc_count;
+static DEFINE_MUTEX(bfin_jc_tty_mutex);
+static volatile struct circ_buf bfin_jc_write_buf;
+
+static int
+bfin_jc_emudat_manager(void *arg)
+{
+	uint32_t inbound_len = 0, outbound_len = 0;
+
+	while (!kthread_should_stop()) {
+		/* no one left to give data to, so sleep */
+		if (bfin_jc_tty == NULL && circ_empty(&bfin_jc_write_buf)) {
+			debug("waiting for readers\n");
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
+		/* no data available, so just chill */
+		if (!(bfin_read_DBGSTAT() & EMUDIF) && circ_empty(&bfin_jc_write_buf)) {
+			debug("waiting for data (in_len = %i) (circ: %i %i)\n",
+				inbound_len, bfin_jc_write_buf.tail, bfin_jc_write_buf.head);
+			if (inbound_len)
+				schedule();
+			else
+				schedule_timeout_interruptible(HZ);
+			continue;
+		}
+
+		/* if incoming data is ready, eat it */
+		if (bfin_read_DBGSTAT() & EMUDIF) {
+			struct tty_struct *tty;
+			mutex_lock(&bfin_jc_tty_mutex);
+			tty = (struct tty_struct *)bfin_jc_tty;
+			if (tty != NULL) {
+				uint32_t emudat = bfin_read_emudat();
+				if (inbound_len == 0) {
+					debug("incoming length: 0x%08x\n", emudat);
+					inbound_len = emudat;
+				} else {
+					size_t num_chars = (4 <= inbound_len ? 4 : inbound_len);
+					debug("  incoming data: 0x%08x (pushing %zu)\n", emudat, num_chars);
+					inbound_len -= num_chars;
+					tty_insert_flip_string(tty, (unsigned char *)&emudat, num_chars);
+					tty_flip_buffer_push(tty);
+				}
+			}
+			mutex_unlock(&bfin_jc_tty_mutex);
+		}
+
+		/* if outgoing data is ready, post it */
+		if (!(bfin_read_DBGSTAT() & EMUDOF) && !circ_empty(&bfin_jc_write_buf)) {
+			if (outbound_len == 0) {
+				outbound_len = circ_cnt(&bfin_jc_write_buf);
+				bfin_write_emudat(outbound_len);
+				debug("outgoing length: 0x%08x\n", outbound_len);
+			} else {
+				struct tty_struct *tty;
+				int tail = bfin_jc_write_buf.tail;
+				size_t ate = (4 <= outbound_len ? 4 : outbound_len);
+				uint32_t emudat =
+				bfin_write_emudat_chars(
+					circ_byte(&bfin_jc_write_buf, tail + 0),
+					circ_byte(&bfin_jc_write_buf, tail + 1),
+					circ_byte(&bfin_jc_write_buf, tail + 2),
+					circ_byte(&bfin_jc_write_buf, tail + 3)
+				);
+				bfin_jc_write_buf.tail += ate;
+				outbound_len -= ate;
+				mutex_lock(&bfin_jc_tty_mutex);
+				tty = (struct tty_struct *)bfin_jc_tty;
+				if (tty)
+					tty_wakeup(tty);
+				mutex_unlock(&bfin_jc_tty_mutex);
+				debug("  outgoing data: 0x%08x (pushing %zu)\n", emudat, ate);
+			}
+		}
+	}
+
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int
+bfin_jc_open(struct tty_struct *tty, struct file *filp)
+{
+	mutex_lock(&bfin_jc_tty_mutex);
+	debug("open %lu\n", bfin_jc_count);
+	++bfin_jc_count;
+	bfin_jc_tty = tty;
+	wake_up_process(bfin_jc_kthread);
+	mutex_unlock(&bfin_jc_tty_mutex);
+	return 0;
+}
+
+static void
+bfin_jc_close(struct tty_struct *tty, struct file *filp)
+{
+	mutex_lock(&bfin_jc_tty_mutex);
+	debug("close %lu\n", bfin_jc_count);
+	if (--bfin_jc_count == 0)
+		bfin_jc_tty = NULL;
+	wake_up_process(bfin_jc_kthread);
+	mutex_unlock(&bfin_jc_tty_mutex);
+}
+
+/* XXX: we dont handle the put_char() case where we must handle count = 1 */
+static int
+bfin_jc_circ_write(const unsigned char *buf, int count)
+{
+	int i;
+	count = min(count, circ_free(&bfin_jc_write_buf));
+	debug("going to write chunk of %i bytes\n", count);
+	for (i = 0; i < count; ++i)
+		circ_byte(&bfin_jc_write_buf, bfin_jc_write_buf.head + i) = buf[i];
+	bfin_jc_write_buf.head += i;
+	return i;
+}
+
+#ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE
+# define acquire_console_sem()
+# define release_console_sem()
+#endif
+static int
+bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+	int i;
+	acquire_console_sem();
+	i = bfin_jc_circ_write(buf, count);
+	release_console_sem();
+	wake_up_process(bfin_jc_kthread);
+	return i;
+}
+
+static void
+bfin_jc_flush_chars(struct tty_struct *tty)
+{
+	wake_up_process(bfin_jc_kthread);
+}
+
+static int
+bfin_jc_write_room(struct tty_struct *tty)
+{
+	return circ_free(&bfin_jc_write_buf);
+}
+
+static int
+bfin_jc_chars_in_buffer(struct tty_struct *tty)
+{
+	return circ_cnt(&bfin_jc_write_buf);
+}
+
+static void
+bfin_jc_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	unsigned long expire = jiffies + timeout;
+	while (!circ_empty(&bfin_jc_write_buf)) {
+		if (signal_pending(current))
+			break;
+		if (time_after(jiffies, expire))
+			break;
+	}
+}
+
+static struct tty_operations bfin_jc_ops = {
+	.open            = bfin_jc_open,
+	.close           = bfin_jc_close,
+	.write           = bfin_jc_write,
+	/*.put_char        = bfin_jc_put_char,*/
+	.flush_chars     = bfin_jc_flush_chars,
+	.write_room      = bfin_jc_write_room,
+	.chars_in_buffer = bfin_jc_chars_in_buffer,
+	.wait_until_sent = bfin_jc_wait_until_sent,
+};
+
+static int __init bfin_jc_init(void)
+{
+	int ret;
+
+	bfin_jc_kthread = kthread_create(bfin_jc_emudat_manager, NULL, DRV_NAME);
+	if (IS_ERR(bfin_jc_kthread))
+		return PTR_ERR(bfin_jc_kthread);
+
+	ret = -ENOMEM;
+
+	bfin_jc_write_buf.head = bfin_jc_write_buf.tail = 0;
+	bfin_jc_write_buf.buf = kmalloc(CIRC_SIZE, GFP_KERNEL);
+	if (!bfin_jc_write_buf.buf)
+		goto err;
+
+	bfin_jc_driver = alloc_tty_driver(1);
+	if (!bfin_jc_driver)
+		goto err;
+
+	bfin_jc_driver->owner        = THIS_MODULE;
+	bfin_jc_driver->driver_name  = DRV_NAME;
+	bfin_jc_driver->name         = DEV_NAME;
+	bfin_jc_driver->type         = TTY_DRIVER_TYPE_SERIAL;
+	bfin_jc_driver->subtype      = SERIAL_TYPE_NORMAL;
+	bfin_jc_driver->init_termios = tty_std_termios;
+	tty_set_operations(bfin_jc_driver, &bfin_jc_ops);
+
+	ret = tty_register_driver(bfin_jc_driver);
+	if (ret)
+		goto err;
+
+	pr_init(KERN_INFO DRV_NAME ": initialized\n");
+
+	return 0;
+
+ err:
+	put_tty_driver(bfin_jc_driver);
+	kfree(bfin_jc_write_buf.buf);
+	kthread_stop(bfin_jc_kthread);
+	return ret;
+}
+module_init(bfin_jc_init);
+
+static void __exit bfin_jc_exit(void)
+{
+	kthread_stop(bfin_jc_kthread);
+	kfree(bfin_jc_write_buf.buf);
+	tty_unregister_driver(bfin_jc_driver);
+	put_tty_driver(bfin_jc_driver);
+}
+module_exit(bfin_jc_exit);
+
+#if defined(CONFIG_BFIN_JTAG_COMM_CONSOLE) || defined(CONFIG_EARLY_PRINTK)
+static void
+bfin_jc_straight_buffer_write(const char *buf, unsigned count)
+{
+	unsigned ate = 0;
+	while (bfin_read_DBGSTAT() & EMUDOF)
+		continue;
+	bfin_write_emudat(count);
+	while (ate < count) {
+		while (bfin_read_DBGSTAT() & EMUDOF)
+			continue;
+		bfin_write_emudat_chars(buf[ate], buf[ate+1], buf[ate+2], buf[ate+3]);
+		ate += 4;
+	}
+}
+#endif
+
+#ifdef CONFIG_BFIN_JTAG_COMM_CONSOLE
+static void
+bfin_jc_console_write(struct console *co, const char *buf, unsigned count)
+{
+	if (bfin_jc_kthread == NULL)
+		bfin_jc_straight_buffer_write(buf, count);
+	else
+		bfin_jc_circ_write(buf, count);
+}
+
+static struct tty_driver *
+bfin_jc_console_device(struct console *co, int *index)
+{
+	*index = co->index;
+	return bfin_jc_driver;
+}
+
+static struct console bfin_jc_console = {
+	.name    = DEV_NAME,
+	.write   = bfin_jc_console_write,
+	.device  = bfin_jc_console_device,
+	.flags   = CON_ANYTIME | CON_PRINTBUFFER,
+	.index   = -1,
+};
+
+static int __init bfin_jc_console_init(void)
+{
+	register_console(&bfin_jc_console);
+	return 0;
+}
+console_initcall(bfin_jc_console_init);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+static void __init
+bfin_jc_early_write(struct console *co, const char *buf, unsigned int count)
+{
+	bfin_jc_straight_buffer_write(buf, count);
+}
+
+static struct __initdata console bfin_jc_early_console = {
+	.name   = "early_BFJC",
+	.write   = bfin_jc_early_write,
+	.flags   = CON_ANYTIME | CON_PRINTBUFFER,
+	.index   = -1,
+};
+
+struct console * __init
+bfin_jc_early_init(unsigned int port, unsigned int cflag)
+{
+	return &bfin_jc_early_console;
+}
+#endif
+
+MODULE_AUTHOR("Mike Frysinger <vapier@gentoo.org>");
+MODULE_DESCRIPTION("TTY over Blackfin JTAG Communication");
+MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 26a2e20f4a966bebc312836aeb4423fbfd4e33e2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:03:13 +0100
Subject: tty: Untangle termios and mm mutex dependencies

Although this doesn't cause any problems it could potentially do so for
future mmap using devices. No real work is needed to sort it out so untangle
it before it causes problems

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 2401dbc..8116bb1c 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -626,9 +626,25 @@ static int set_termios(struct tty_struct *tty, void __user *arg, int opt)
 	return 0;
 }
 
+static void copy_termios(struct tty_struct *tty, struct ktermios *kterm)
+{
+	mutex_lock(&tty->termios_mutex);
+	memcpy(kterm, tty->termios, sizeof(struct ktermios));
+	mutex_unlock(&tty->termios_mutex);
+}
+
+static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm)
+{
+	mutex_lock(&tty->termios_mutex);
+	memcpy(kterm, tty->termios_locked, sizeof(struct ktermios));
+	mutex_unlock(&tty->termios_mutex);
+}
+
 static int get_termio(struct tty_struct *tty, struct termio __user *termio)
 {
-	if (kernel_termios_to_user_termio(termio, tty->termios))
+	struct ktermios kterm;
+	copy_termios(tty, &kterm);
+	if (kernel_termios_to_user_termio(termio, &kterm))
 		return -EFAULT;
 	return 0;
 }
@@ -930,6 +946,8 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 	struct tty_struct *real_tty;
 	void __user *p = (void __user *)arg;
 	int ret = 0;
+	struct ktermios kterm;
+	struct termiox ktermx;
 
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_MASTER)
@@ -965,23 +983,20 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termios(real_tty, p, TERMIOS_OLD);
 #ifndef TCGETS2
 	case TCGETS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 #else
 	case TCGETS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TCGETS2:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios2 __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TCSETSF2:
 		return set_termios(real_tty, p,  TERMIOS_FLUSH | TERMIOS_WAIT);
@@ -1000,34 +1015,36 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termios(real_tty, p, TERMIOS_TERMIO);
 #ifndef TCGETS2
 	case TIOCGLCKTRMIOS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios_locked))
+		copy_termios_locked(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSLCKTRMIOS:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
-		mutex_lock(&real_tty->termios_mutex);
-		if (user_termios_to_kernel_termios(real_tty->termios_locked,
+		copy_termios_locked(real_tty, &kterm);
+		if (user_termios_to_kernel_termios(&kterm,
 					       (struct termios __user *) arg))
-			ret = -EFAULT;
+			return -EFAULT;
+		mutex_lock(&real_tty->termios_mutex);
+		memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
 		mutex_unlock(&real_tty->termios_mutex);
-		return ret;
+		return 0;
 #else
 	case TIOCGLCKTRMIOS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios_locked))
+		copy_termios_locked(real_tty, &kterm);
+		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSLCKTRMIOS:
 		if (!capable(CAP_SYS_ADMIN))
-			ret = -EPERM;
-		mutex_lock(&real_tty->termios_mutex);
-		if (user_termios_to_kernel_termios_1(real_tty->termios_locked,
+			return -EPERM;
+		copy_termios_locked(real_tty, &kterm);
+		if (user_termios_to_kernel_termios_1(&kterm,
 					       (struct termios __user *) arg))
-			ret = -EFAULT;
+			return -EFAULT;
+		mutex_lock(&real_tty->termios_mutex);
+		memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
 		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 #endif
@@ -1036,9 +1053,10 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		if (real_tty->termiox == NULL)
 			return -EINVAL;
 		mutex_lock(&real_tty->termios_mutex);
-		if (copy_to_user(p, real_tty->termiox, sizeof(struct termiox)))
-			ret = -EFAULT;
+		memcpy(&ktermx, real_tty->termiox, sizeof(struct termiox));
 		mutex_unlock(&real_tty->termios_mutex);
+		if (copy_to_user(p, &ktermx, sizeof(struct termiox)))
+			ret = -EFAULT;
 		return ret;
 	case TCSETX:
 		return set_termiox(real_tty, p, 0);
@@ -1048,10 +1066,9 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termiox(real_tty, p, TERMIOS_FLUSH);
 #endif		
 	case TIOCGSOFTCAR:
-		mutex_lock(&real_tty->termios_mutex);
-		ret = put_user(C_CLOCAL(real_tty) ? 1 : 0,
+		copy_termios(real_tty, &kterm);
+		ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0,
 						(int __user *)arg);
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSSOFTCAR:
 		if (get_user(arg, (unsigned int __user *) arg))
-- 
cgit v0.10.2


From 93d5581e20600593ec3236921b6620225fb76034 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:03:55 +0100
Subject: devpts: unregister the file system on error

Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13429

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb9..9b1d285 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -557,8 +557,10 @@ static int __init init_devpts_fs(void)
 	int err = register_filesystem(&devpts_fs_type);
 	if (!err) {
 		devpts_mnt = kern_mount(&devpts_fs_type);
-		if (IS_ERR(devpts_mnt))
+		if (IS_ERR(devpts_mnt)) {
 			err = PTR_ERR(devpts_mnt);
+			unregister_filesystem(&devpts_fs_type);
+		}
 	}
 	return err;
 }
-- 
cgit v0.10.2


From e960bf73ddd73714bcfbadb1717e53ecda9924cb Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:04:57 +0100
Subject: tty: Add URL for ttydev queue

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 1979167..84285b5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -71,7 +71,7 @@ P: Person
 M: Mail patches to
 L: Mailing list that is relevant to this area
 W: Web-page with status/info
-T: SCM tree type and location.  Type is one of: git, hg, quilt.
+T: SCM tree type and location.  Type is one of: git, hg, quilt, stgit.
 S: Status, one of the following:
 
 	Supported:	Someone is actually paid to look after this.
@@ -5630,6 +5630,7 @@ P:	Alan Cox
 M:	alan@lxorguk.ukuu.org.uk
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
+T:	stgit http://zeniv.linux.org.uk/~alan/ttydev/
 
 TULIP NETWORK DRIVERS
 P:	Grant Grundler
-- 
cgit v0.10.2


From 34aec591847c696339189b070cce2a11f901cfea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Richard=20R=C3=B6jfors?=
 <richard.rojfors.ext@mocean-labs.com>
Date: Thu, 11 Jun 2009 14:05:39 +0100
Subject: serial: Added Timberdale UART driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Driver for the UART found in the Timberdale FPGA

Signed-off-by: Richard Röjfors <richard.rojfors.ext@mocean-labs.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 343e3a3..3391ef0 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1433,4 +1433,11 @@ config SPORT_BAUD_RATE
 	default 19200 if (SERIAL_SPORT_BAUD_RATE_19200)
 	default 9600 if (SERIAL_SPORT_BAUD_RATE_9600)
 
+config SERIAL_TIMBERDALE
+	tristate "Support for timberdale UART"
+	depends on MFD_TIMBERDALE
+	select SERIAL_CORE
+	---help---
+	Add support for UART controller on timberdale.
+
 endmenu
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index d438eb2..45a8658 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -77,3 +77,4 @@ obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o
 obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
+obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
new file mode 100644
index 0000000..30ba3c4
--- /dev/null
+++ b/drivers/serial/timbuart.c
@@ -0,0 +1,520 @@
+/*
+ * timbuart.c timberdale FPGA UART driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/serial_core.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/ioport.h>
+
+#include "timbuart.h"
+
+struct timbuart_port {
+	struct uart_port	port;
+	struct tasklet_struct	tasklet;
+	int			usedma;
+	u8			last_ier;
+	struct platform_device  *dev;
+};
+
+static int baudrates[] = {9600, 19200, 38400, 57600, 115200, 230400, 460800,
+	921600, 1843200, 3250000};
+
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier);
+
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid);
+
+static void timbuart_stop_rx(struct uart_port *port)
+{
+	/* spin lock held by upper layer, disable all RX interrupts */
+	u8 ier = ioread8(port->membase + TIMBUART_IER) & ~RXFLAGS;
+	iowrite8(ier, port->membase + TIMBUART_IER);
+}
+
+static void timbuart_stop_tx(struct uart_port *port)
+{
+	/* spinlock held by upper layer, disable TX interrupt */
+	u8 ier = ioread8(port->membase + TIMBUART_IER) & ~TXBAE;
+	iowrite8(ier, port->membase + TIMBUART_IER);
+}
+
+static void timbuart_start_tx(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+
+	/* do not transfer anything here -> fire off the tasklet */
+	tasklet_schedule(&uart->tasklet);
+}
+
+static void timbuart_flush_buffer(struct uart_port *port)
+{
+	u8 ctl = ioread8(port->membase + TIMBUART_CTRL) | TIMBUART_CTRL_FLSHTX;
+
+	iowrite8(ctl, port->membase + TIMBUART_CTRL);
+	iowrite8(TXBF, port->membase + TIMBUART_ISR);
+}
+
+static void timbuart_rx_chars(struct uart_port *port)
+{
+	struct tty_struct *tty = port->info->port.tty;
+
+	while (ioread8(port->membase + TIMBUART_ISR) & RXDP) {
+		u8 ch = ioread8(port->membase + TIMBUART_RXFIFO);
+		port->icount.rx++;
+		tty_insert_flip_char(tty, ch, TTY_NORMAL);
+	}
+
+	spin_unlock(&port->lock);
+	tty_flip_buffer_push(port->info->port.tty);
+	spin_lock(&port->lock);
+
+	dev_dbg(port->dev, "%s - total read %d bytes\n",
+		__func__, port->icount.rx);
+}
+
+static void timbuart_tx_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+
+	while (!(ioread8(port->membase + TIMBUART_ISR) & TXBF) &&
+		!uart_circ_empty(xmit)) {
+		iowrite8(xmit->buf[xmit->tail],
+			port->membase + TIMBUART_TXFIFO);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	dev_dbg(port->dev,
+		"%s - total written %d bytes, CTL: %x, RTS: %x, baud: %x\n",
+		 __func__,
+		port->icount.tx,
+		ioread8(port->membase + TIMBUART_CTRL),
+		port->mctrl & TIOCM_RTS,
+		ioread8(port->membase + TIMBUART_BAUDRATE));
+}
+
+static void timbuart_handle_tx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+	struct circ_buf *xmit = &port->info->xmit;
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		return;
+
+	if (port->x_char)
+		return;
+
+	if (isr & TXFLAGS) {
+		timbuart_tx_chars(port);
+		/* clear all TX interrupts */
+		iowrite8(TXFLAGS, port->membase + TIMBUART_ISR);
+
+		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+			uart_write_wakeup(port);
+	} else
+		/* Re-enable any tx interrupt */
+		*ier |= uart->last_ier & TXFLAGS;
+
+	/* enable interrupts if there are chars in the transmit buffer,
+	 * Or if we delivered some bytes and want the almost empty interrupt
+	 * we wake up the upper layer later when we got the interrupt
+	 * to give it some time to go out...
+	 */
+	if (!uart_circ_empty(xmit))
+		*ier |= TXBAE;
+
+	dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+void timbuart_handle_rx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+	if (isr & RXFLAGS) {
+		/* Some RX status is set */
+		if (isr & RXBF) {
+			u8 ctl = ioread8(port->membase + TIMBUART_CTRL) |
+				TIMBUART_CTRL_FLSHRX;
+			iowrite8(ctl, port->membase + TIMBUART_CTRL);
+			port->icount.overrun++;
+		} else if (isr & (RXDP))
+			timbuart_rx_chars(port);
+
+		/* ack all RX interrupts */
+		iowrite8(RXFLAGS, port->membase + TIMBUART_ISR);
+	}
+
+	/* always have the RX interrupts enabled */
+	*ier |= RXBAF | RXBF | RXTT;
+
+	dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+void timbuart_tasklet(unsigned long arg)
+{
+	struct timbuart_port *uart = (struct timbuart_port *)arg;
+	u8 isr, ier = 0;
+
+	spin_lock(&uart->port.lock);
+
+	isr = ioread8(uart->port.membase + TIMBUART_ISR);
+	dev_dbg(uart->port.dev, "%s ISR: %x\n", __func__, isr);
+
+	if (!uart->usedma)
+		timbuart_handle_tx_port(&uart->port, isr, &ier);
+
+	timbuart_mctrl_check(&uart->port, isr, &ier);
+
+	if (!uart->usedma)
+		timbuart_handle_rx_port(&uart->port, isr, &ier);
+
+	iowrite8(ier, uart->port.membase + TIMBUART_IER);
+
+	spin_unlock(&uart->port.lock);
+	dev_dbg(uart->port.dev, "%s leaving\n", __func__);
+}
+
+static unsigned int timbuart_tx_empty(struct uart_port *port)
+{
+	u8 isr = ioread8(port->membase + TIMBUART_ISR);
+
+	return (isr & TXBAE) ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int timbuart_get_mctrl(struct uart_port *port)
+{
+	u8 cts = ioread8(port->membase + TIMBUART_CTRL);
+	dev_dbg(port->dev, "%s - cts %x\n", __func__, cts);
+
+	if (cts & TIMBUART_CTRL_CTS)
+		return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR;
+	else
+		return TIOCM_DSR | TIOCM_CAR;
+}
+
+static void timbuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	dev_dbg(port->dev, "%s - %x\n", __func__, mctrl);
+
+	if (mctrl & TIOCM_RTS)
+		iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL);
+	else
+		iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL);
+}
+
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier)
+{
+	unsigned int cts;
+
+	if (isr & CTS_DELTA) {
+		/* ack */
+		iowrite8(CTS_DELTA, port->membase + TIMBUART_ISR);
+		cts = timbuart_get_mctrl(port);
+		uart_handle_cts_change(port, cts & TIOCM_CTS);
+		wake_up_interruptible(&port->info->delta_msr_wait);
+	}
+
+	*ier |= CTS_DELTA;
+}
+
+static void timbuart_enable_ms(struct uart_port *port)
+{
+	/* N/A */
+}
+
+static void timbuart_break_ctl(struct uart_port *port, int ctl)
+{
+	/* N/A */
+}
+
+static int timbuart_startup(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+
+	dev_dbg(port->dev, "%s\n", __func__);
+
+	iowrite8(TIMBUART_CTRL_FLSHRX, port->membase + TIMBUART_CTRL);
+	iowrite8(0xff, port->membase + TIMBUART_ISR);
+	/* Enable all but TX interrupts */
+	iowrite8(RXBAF | RXBF | RXTT | CTS_DELTA,
+		port->membase + TIMBUART_IER);
+
+	return request_irq(port->irq, timbuart_handleinterrupt, IRQF_SHARED,
+		"timb-uart", uart);
+}
+
+static void timbuart_shutdown(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+	dev_dbg(port->dev, "%s\n", __func__);
+	free_irq(port->irq, uart);
+	iowrite8(0, port->membase + TIMBUART_IER);
+}
+
+static int get_bindex(int baud)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(baudrates); i++)
+		if (baud == baudrates[i])
+			return i;
+
+	return -1;
+}
+
+static void timbuart_set_termios(struct uart_port *port,
+	struct ktermios *termios,
+	struct ktermios *old)
+{
+	unsigned int baud;
+	short bindex;
+	unsigned long flags;
+
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+	bindex = get_bindex(baud);
+	dev_dbg(port->dev, "%s - bindex %d\n", __func__, bindex);
+
+	if (bindex < 0) {
+		printk(KERN_ALERT "timbuart: Unsupported baud rate\n");
+	} else {
+		spin_lock_irqsave(&port->lock, flags);
+		iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
+		uart_update_timeout(port, termios->c_cflag, baud);
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+}
+
+static const char *timbuart_type(struct uart_port *port)
+{
+	return port->type == PORT_UNKNOWN ? "timbuart" : NULL;
+}
+
+/* We do not request/release mappings of the registers here,
+ * currently it's done in the proble function.
+ */
+static void timbuart_release_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size =
+		resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0));
+
+	if (port->flags & UPF_IOREMAP) {
+		iounmap(port->membase);
+		port->membase = NULL;
+	}
+
+	release_mem_region(port->mapbase, size);
+}
+
+static int timbuart_request_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size =
+		resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0));
+
+	if (!request_mem_region(port->mapbase, size, "timb-uart"))
+		return -EBUSY;
+
+	if (port->flags & UPF_IOREMAP) {
+		port->membase = ioremap(port->mapbase, size);
+		if (port->membase == NULL) {
+			release_mem_region(port->mapbase, size);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid)
+{
+	struct timbuart_port *uart = (struct timbuart_port *)devid;
+
+	if (ioread8(uart->port.membase + TIMBUART_IPR)) {
+		uart->last_ier = ioread8(uart->port.membase + TIMBUART_IER);
+
+		/* disable interrupts, the tasklet enables them again */
+		iowrite8(0, uart->port.membase + TIMBUART_IER);
+
+		/* fire off bottom half */
+		tasklet_schedule(&uart->tasklet);
+
+		return IRQ_HANDLED;
+	} else
+		return IRQ_NONE;
+}
+
+/*
+ * Configure/autoconfigure the port.
+ */
+static void timbuart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_TIMBUART;
+		timbuart_request_port(port);
+	}
+}
+
+static int timbuart_verify_port(struct uart_port *port,
+	struct serial_struct *ser)
+{
+	/* we don't want the core code to modify any port params */
+	return -EINVAL;
+}
+
+static struct uart_ops timbuart_ops = {
+	.tx_empty = timbuart_tx_empty,
+	.set_mctrl = timbuart_set_mctrl,
+	.get_mctrl = timbuart_get_mctrl,
+	.stop_tx = timbuart_stop_tx,
+	.start_tx = timbuart_start_tx,
+	.flush_buffer = timbuart_flush_buffer,
+	.stop_rx = timbuart_stop_rx,
+	.enable_ms = timbuart_enable_ms,
+	.break_ctl = timbuart_break_ctl,
+	.startup = timbuart_startup,
+	.shutdown = timbuart_shutdown,
+	.set_termios = timbuart_set_termios,
+	.type = timbuart_type,
+	.release_port = timbuart_release_port,
+	.request_port = timbuart_request_port,
+	.config_port = timbuart_config_port,
+	.verify_port = timbuart_verify_port
+};
+
+static struct uart_driver timbuart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "timberdale_uart",
+	.dev_name = "ttyTU",
+	.major = TIMBUART_MAJOR,
+	.minor = TIMBUART_MINOR,
+	.nr = 1
+};
+
+static int timbuart_probe(struct platform_device *dev)
+{
+	int err;
+	struct timbuart_port *uart;
+	struct resource *iomem;
+
+	dev_dbg(&dev->dev, "%s\n", __func__);
+
+	uart = kzalloc(sizeof(*uart), GFP_KERNEL);
+	if (!uart) {
+		err = -EINVAL;
+		goto err_mem;
+	}
+
+	uart->usedma = 0;
+
+	uart->port.uartclk = 3250000 * 16;
+	uart->port.fifosize  = TIMBUART_FIFO_SIZE;
+	uart->port.regshift  = 2;
+	uart->port.iotype  = UPIO_MEM;
+	uart->port.ops = &timbuart_ops;
+	uart->port.irq = 0;
+	uart->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP;
+	uart->port.line  = 0;
+	uart->port.dev	= &dev->dev;
+
+	iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!iomem) {
+		err = -ENOMEM;
+		goto err_register;
+	}
+	uart->port.mapbase = iomem->start;
+	uart->port.membase = NULL;
+
+	uart->port.irq = platform_get_irq(dev, 0);
+	if (uart->port.irq < 0) {
+		err = -EINVAL;
+		goto err_register;
+	}
+
+	tasklet_init(&uart->tasklet, timbuart_tasklet, (unsigned long)uart);
+
+	err = uart_register_driver(&timbuart_driver);
+	if (err)
+		goto err_register;
+
+	err = uart_add_one_port(&timbuart_driver, &uart->port);
+	if (err)
+		goto err_add_port;
+
+	platform_set_drvdata(dev, uart);
+
+	return 0;
+
+err_add_port:
+	uart_unregister_driver(&timbuart_driver);
+err_register:
+	kfree(uart);
+err_mem:
+	printk(KERN_ERR "timberdale: Failed to register Timberdale UART: %d\n",
+		err);
+
+	return err;
+}
+
+static int timbuart_remove(struct platform_device *dev)
+{
+	struct timbuart_port *uart = platform_get_drvdata(dev);
+
+	tasklet_kill(&uart->tasklet);
+	uart_remove_one_port(&timbuart_driver, &uart->port);
+	uart_unregister_driver(&timbuart_driver);
+	kfree(uart);
+
+	return 0;
+}
+
+static struct platform_driver timbuart_platform_driver = {
+	.driver = {
+		.name	= "timb-uart",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= timbuart_probe,
+	.remove		= timbuart_remove,
+};
+
+/*--------------------------------------------------------------------------*/
+
+static int __init timbuart_init(void)
+{
+	return platform_driver_register(&timbuart_platform_driver);
+}
+
+static void __exit timbuart_exit(void)
+{
+	platform_driver_unregister(&timbuart_platform_driver);
+}
+
+module_init(timbuart_init);
+module_exit(timbuart_exit);
+
+MODULE_DESCRIPTION("Timberdale UART driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:timb-uart");
+
diff --git a/drivers/serial/timbuart.h b/drivers/serial/timbuart.h
new file mode 100644
index 0000000..7e56676
--- /dev/null
+++ b/drivers/serial/timbuart.h
@@ -0,0 +1,58 @@
+/*
+ * timbuart.c timberdale FPGA GPIO driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#ifndef _TIMBUART_H
+#define _TIMBUART_H
+
+#define TIMBUART_FIFO_SIZE	2048
+
+#define TIMBUART_RXFIFO		0x08
+#define TIMBUART_TXFIFO		0x0c
+#define TIMBUART_IER		0x10
+#define TIMBUART_IPR		0x14
+#define TIMBUART_ISR		0x18
+#define TIMBUART_CTRL		0x1c
+#define TIMBUART_BAUDRATE	0x20
+
+#define TIMBUART_CTRL_RTS	0x01
+#define TIMBUART_CTRL_CTS	0x02
+#define TIMBUART_CTRL_FLSHTX	0x40
+#define TIMBUART_CTRL_FLSHRX	0x80
+
+#define TXBF		0x01
+#define TXBAE		0x02
+#define CTS_DELTA	0x04
+#define RXDP		0x08
+#define RXBAF		0x10
+#define RXBF		0x20
+#define RXTT		0x40
+#define RXBNAE		0x80
+#define TXBE		0x100
+
+#define RXFLAGS (RXDP | RXBAF | RXBF | RXTT | RXBNAE)
+#define TXFLAGS (TXBF | TXBAE)
+
+#define TIMBUART_MAJOR 204
+#define TIMBUART_MINOR 192
+
+#endif /* _TIMBUART_H */
+
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 48766ea..6fd80c4 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -168,6 +168,9 @@
 /* MAX3100 */
 #define PORT_MAX3100    86
 
+/* Timberdale UART */
+#define PORT_TIMBUART	87
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v0.10.2


From 7d55deaf50182c47c1e805dc8cc85f2769f0673e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:27:13 +0100
Subject: timbuart: Fix the termios logic

The driver only handles speeds but it fails to return the current values
for the hardware features it does not support.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
index 30ba3c4..ac9e5d5 100644
--- a/drivers/serial/timbuart.c
+++ b/drivers/serial/timbuart.c
@@ -278,7 +278,7 @@ static int get_bindex(int baud)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(baudrates); i++)
-		if (baud == baudrates[i])
+		if (baud <= baudrates[i])
 			return i;
 
 	return -1;
@@ -296,14 +296,20 @@ static void timbuart_set_termios(struct uart_port *port,
 	bindex = get_bindex(baud);
 	dev_dbg(port->dev, "%s - bindex %d\n", __func__, bindex);
 
-	if (bindex < 0) {
-		printk(KERN_ALERT "timbuart: Unsupported baud rate\n");
-	} else {
-		spin_lock_irqsave(&port->lock, flags);
-		iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
-		uart_update_timeout(port, termios->c_cflag, baud);
-		spin_unlock_irqrestore(&port->lock, flags);
-	}
+	if (bindex < 0)
+		bindex = 0;
+	baud = baudrates[bindex];
+
+	/* The serial layer calls into this once with old = NULL when setting
+	   up initially */
+	if (old)
+		tty_termios_copy_hw(termios, old);
+	tty_termios_encode_baud_rate(termios, baud, baud);
+
+	spin_lock_irqsave(&port->lock, flags);
+	iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
+	uart_update_timeout(port, termios->c_cflag, baud);
+	spin_unlock_irqrestore(&port->lock, flags);
 }
 
 static const char *timbuart_type(struct uart_port *port)
-- 
cgit v0.10.2


From 00b040deca907a113f5bef67a6cc7a4f65a5ace9 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:29:29 +0100
Subject: tty: resolve some sierra breakage

The various merges into the sierra driver inadvertently undid
commit 212b8f0c3f5a2280bfa1d6ab13a6fe98552becaa by Elina Pasheva
<epasheva@sierrawireless.com>. Put it back so the OBEX port works again.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 1319b89..222a623 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -257,8 +257,19 @@ static int sierra_send_setup(struct usb_serial_port *port)
 		val |= 0x02;
 
 	/* If composite device then properly report interface */
-	if (serial->num_ports == 1)
+	if (serial->num_ports == 1) {
 		interface = sierra_calc_interface(serial);
+		/* Control message is sent only to interfaces with
+		 * interrupt_in endpoints
+		 */
+		if (port->interrupt_in_urb) {
+			/* send control message */
+			return usb_control_msg(serial->dev,
+				usb_rcvctrlpipe(serial->dev, 0),
+				0x22, 0x21, val, interface,
+				NULL, 0, USB_CTRL_SET_TIMEOUT);
+		}
+	}
 
 	/* Otherwise the need to do non-composite mapping */
 	else {
@@ -268,11 +279,11 @@ static int sierra_send_setup(struct usb_serial_port *port)
 			interface = 1;
 		else if (port->bulk_out_endpointAddress == 5)
 			interface = 2;
-	}
-	return usb_control_msg(serial->dev,
+		return usb_control_msg(serial->dev,
 			usb_rcvctrlpipe(serial->dev, 0),
 			0x22, 0x21, val, interface,
 			NULL, 0, USB_CTRL_SET_TIMEOUT);
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From b9a44bc19f48fd82b8f411500a9bb0ea4153d23c Mon Sep 17 00:00:00 2001
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Thu, 11 Jun 2009 14:30:21 +0100
Subject: sierra: driver urb handling improvements

[Folded from eight patches into one as the original set according to the
 author "All of the patches need to be applied to obtain a working product"
 so keeping them split seems unhelpful

 Merge fixes done versus other conflicting changes and moved the
 spin_lock_init from open to setup time -- Alan]

Summary of the changes and code re-organization in this patch:

- The memory for urbs is allocated and urbs are submitted only for the active
  interfaces (instead of pre-allocating these for all interfaces). This will
  save memory especially in the case of using composite devices.
- The code has been re-organized and functionality has been extracted from
  sierra_startup(), sierra_shutdown(), sierra_open(), sierra_close() and added
  in helper functions sierra_release_urb(), sierra_stop_rx_urbs(),
  sierra_submit_rx_urbs() and sierra_setup_urb()

- Added function sierra_release_urb() to free an urb and its transfer
buffer.
- Removed unecessary include file reference and comment.
- Added function sierra_stop_rx_urbs() that takes care of the release of
receive and interrupt urbs. This function is to be called by sierra_close()
whenever an interface is de-activated.
- Added new function sierra_submit_rx_urbs() that handles the submission of
receive urbs and interrupt urbs (if any) during the interface activation.
This function is to be called by sierra_open(). Added a second parameter to
pass the memory allocation (as suggested by Oliver Neukum) so that this
function can be used in post_reset() and resume().
- Added new function sierra_setup_urb() that contains the functionality to
allocate an urb, fill bulk urb using the supplied memory allocation flag
and release urb upon error. Added parameter so that the caller pass the
memory allocation flag for flexibility.
- Moved sierra_close() before sierra_open() to resolve dependencies
introduced by the code reorganization.
- Modified sierra_close() to call sierra_stop_rx_urbs() and
sierra_release_urb() functions added in previous patch.
- Modified sierra_open() to call sierra_setup_urb() and sierra_submit_rx_urbs()
functions; note urbs are allocated and submitted for each activated interface.
- Modified sierra_startup() so that allocation of urbs happens whenever an
interface is activated (urb allocation is moved to sierra_open()).
- Modified sierra_shutdown() so that urbs are freed whenever an interface is
de-activated (urb freeing moved to sierra_close() as shown in previous patch
from the series)
- Removed unecessary data structure from sierra_port_private_data
- Suppress an entry in logs by not re-submitting an urb when usb_submit_urb()
returns -EPERM, as this shows that usb_kill_urb() is running (as suggested by
Oliver Neukum)

Signed-off-by: Elina Pasheva <epasheva@sierrawireless.com>
Signed-off-by: Alan Cox <alan.cox@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 222a623..88ec7bf 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -26,12 +26,10 @@
 #include <linux/module.h>
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
-#include <linux/usb/ch9.h>
 
 #define SWIMS_USB_REQUEST_SetPower	0x00
 #define SWIMS_USB_REQUEST_SetNmea	0x07
 
-/* per port private data */
 #define N_IN_URB	4
 #define N_OUT_URB	4
 #define IN_BUFLEN	4096
@@ -229,7 +227,6 @@ struct sierra_port_private {
 
 	/* Input endpoints and buffers for this port */
 	struct urb *in_urbs[N_IN_URB];
-	char *in_buffer[N_IN_URB];
 
 	/* Settings for the port */
 	int rts_state;	/* Handshaking pins (outputs) */
@@ -334,6 +331,17 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file,
 	return sierra_send_setup(port);
 }
 
+static void sierra_release_urb(struct urb *urb)
+{
+	struct usb_serial_port *port;
+	if (urb) {
+		port =  urb->context;
+		dev_dbg(&port->dev, "%s: %p\n", __func__, urb);
+		kfree(urb->transfer_buffer);
+		usb_free_urb(urb);
+	}
+}
+
 static void sierra_outdat_callback(struct urb *urb)
 {
 	struct usb_serial_port *port = urb->context;
@@ -458,7 +466,7 @@ static void sierra_indat_callback(struct urb *urb)
 				" received", __func__);
 
 		/* Resubmit urb so we continue receiving */
-		if (port->port.count && status != -ESHUTDOWN) {
+		if (port->port.count && status != -ESHUTDOWN && status != -EPERM) {
 			err = usb_submit_urb(urb, GFP_ATOMIC);
 			if (err)
 				dev_err(&port->dev, "resubmit read urb failed."
@@ -550,100 +558,184 @@ static int sierra_write_room(struct tty_struct *tty)
 	return 2048;
 }
 
-static int sierra_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void sierra_stop_rx_urbs(struct usb_serial_port *port)
 {
-	struct sierra_port_private *portdata;
-	struct usb_serial *serial = port->serial;
 	int i;
-	struct urb *urb;
-	int result;
+	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-	portdata = usb_get_serial_port_data(port);
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++)
+		usb_kill_urb(portdata->in_urbs[i]);
 
-	dev_dbg(&port->dev, "%s", __func__);
+	usb_kill_urb(port->interrupt_in_urb);
+}
 
-	/* Set some sane defaults */
-	portdata->rts_state = 1;
-	portdata->dtr_state = 1;
+static int sierra_submit_rx_urbs(struct usb_serial_port *port, gfp_t mem_flags)
+{
+	int ok_cnt;
+	int err = -EINVAL;
+	int i;
+	struct urb *urb;
+	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-	/* Reset low level data toggle and start reading from endpoints */
-	for (i = 0; i < N_IN_URB; i++) {
+	ok_cnt = 0;
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
 		urb = portdata->in_urbs[i];
 		if (!urb)
 			continue;
-		if (urb->dev != serial->dev) {
-			dev_dbg(&port->dev, "%s: dev %p != %p",
-				 __func__, urb->dev, serial->dev);
-			continue;
+		err = usb_submit_urb(urb, mem_flags);
+		if (err) {
+			dev_err(&port->dev, "%s: submit urb failed: %d\n",
+				__func__, err);
+		} else {
+			ok_cnt++;
 		}
+	}
 
-		/*
-		 * make sure endpoint data toggle is synchronized with the
-		 * device
-		 */
-		usb_clear_halt(urb->dev, urb->pipe);
-
-		result = usb_submit_urb(urb, GFP_KERNEL);
-		if (result) {
-			dev_err(&port->dev, "submit urb %d failed (%d) %d\n",
-				i, result, urb->transfer_buffer_length);
+	if (ok_cnt && port->interrupt_in_urb) {
+		err = usb_submit_urb(port->interrupt_in_urb, mem_flags);
+		if (err) {
+			dev_err(&port->dev, "%s: submit intr urb failed: %d\n",
+				__func__, err);
 		}
 	}
 
-	sierra_send_setup(port);
+	if (ok_cnt > 0) /* at least one rx urb submitted */
+		return 0;
+	else
+		return err;
+}
+
+static struct urb *sierra_setup_urb(struct usb_serial *serial, int endpoint,
+					int dir, void *ctx, int len,
+					gfp_t mem_flags,
+					usb_complete_t callback)
+{
+	struct urb	*urb;
+	u8		*buf;
+
+	if (endpoint == -1)
+		return NULL;
 
-	/* start up the interrupt endpoint if we have one */
-	if (port->interrupt_in_urb) {
-		result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
-		if (result)
-			dev_err(&port->dev, "submit irq_in urb failed %d\n",
-				result);
+	urb = usb_alloc_urb(0, mem_flags);
+	if (urb == NULL) {
+		dev_dbg(&serial->dev->dev, "%s: alloc for endpoint %d failed\n",
+			__func__, endpoint);
+		return NULL;
 	}
-	return 0;
+
+	buf = kmalloc(len, mem_flags);
+	if (buf) {
+		/* Fill URB using supplied data */
+		usb_fill_bulk_urb(urb, serial->dev,
+			usb_sndbulkpipe(serial->dev, endpoint) | dir,
+			buf, len, callback, ctx);
+
+		/* debug */
+		dev_dbg(&serial->dev->dev, "%s %c u : %p d:%p\n", __func__,
+				dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+	} else {
+		dev_dbg(&serial->dev->dev, "%s %c u:%p d:%p\n", __func__,
+				dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+
+		sierra_release_urb(urb);
+		urb = NULL;
+	}
+
+	return urb;
 }
 
-static void sierra_dtr_rts(struct usb_serial_port *port, int on)
+static void sierra_close(struct usb_serial_port *port)
 {
+	int i;
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
+	dev_dbg(&port->dev, "%s\n", __func__);
 	portdata = usb_get_serial_port_data(port);
-	portdata->rts_state = on;
-	portdata->dtr_state = on;
+
+	portdata->rts_state = 0;
+	portdata->dtr_state = 0;
 
 	if (serial->dev) {
 		mutex_lock(&serial->disc_mutex);
 		if (!serial->disconnected)
 			sierra_send_setup(port);
 		mutex_unlock(&serial->disc_mutex);
+
+		/* Stop reading urbs */
+		sierra_stop_rx_urbs(port);
+		/* .. and release them */
+		for (i = 0; i < N_IN_URB; i++) {
+			sierra_release_urb(portdata->in_urbs[i]);
+			portdata->in_urbs[i] = NULL;
+		}
 	}
 }
 
-static void sierra_close(struct usb_serial_port *port)
+static int sierra_open(struct tty_struct *tty,
+			struct usb_serial_port *port, struct file *filp)
 {
+	struct sierra_port_private *portdata;
+	struct usb_serial *serial = port->serial;
 	int i;
+	int err;
+	int endpoint;
+	struct urb *urb;
+
+	portdata = usb_get_serial_port_data(port);
+
+	dev_dbg(&port->dev, "%s", __func__);
+
+	/* Set some sane defaults */
+	portdata->rts_state = 1;
+	portdata->dtr_state = 1;
+
+
+	endpoint = port->bulk_in_endpointAddress;
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
+		urb = sierra_setup_urb(serial, endpoint, USB_DIR_IN, port,
+					IN_BUFLEN, GFP_KERNEL,
+					sierra_indat_callback);
+		portdata->in_urbs[i] = urb;
+	}
+	/* clear halt condition */
+	usb_clear_halt(serial->dev,
+			usb_sndbulkpipe(serial->dev, endpoint) | USB_DIR_IN);
+
+	err = sierra_submit_rx_urbs(port, GFP_KERNEL);
+	if (err) {
+		/* get rid of everything as in close */
+		sierra_close(port);
+		return err;
+	}
+	sierra_send_setup(port);
+
+	return 0;
+}
+
+
+static void sierra_dtr_rts(struct usb_serial_port *port, int on)
+{
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
-	dev_dbg(&port->dev, "%s", __func__);
 	portdata = usb_get_serial_port_data(port);
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
 
 	if (serial->dev) {
-		/* Stop reading/writing urbs */
-		for (i = 0; i < N_IN_URB; i++)
-			usb_kill_urb(portdata->in_urbs[i]);
+		mutex_lock(&serial->disc_mutex);
+		if (!serial->disconnected)
+			sierra_send_setup(port);
+		mutex_unlock(&serial->disc_mutex);
 	}
-	usb_kill_urb(port->interrupt_in_urb);
 }
 
 static int sierra_startup(struct usb_serial *serial)
 {
 	struct usb_serial_port *port;
 	struct sierra_port_private *portdata;
-	struct urb *urb;
 	int i;
-	int j;
 
 	dev_dbg(&serial->dev->dev, "%s", __func__);
 
@@ -665,34 +757,8 @@ static int sierra_startup(struct usb_serial *serial)
 			return -ENOMEM;
 		}
 		spin_lock_init(&portdata->lock);
-		for (j = 0; j < N_IN_URB; j++) {
-			portdata->in_buffer[j] = kmalloc(IN_BUFLEN, GFP_KERNEL);
-			if (!portdata->in_buffer[j]) {
-				for (--j; j >= 0; j--)
-					kfree(portdata->in_buffer[j]);
-				kfree(portdata);
-				return -ENOMEM;
-			}
-		}
-
+		/* Set the port private data pointer */
 		usb_set_serial_port_data(port, portdata);
-
-		/* initialize the in urbs */
-		for (j = 0; j < N_IN_URB; ++j) {
-			urb = usb_alloc_urb(0, GFP_KERNEL);
-			if (urb == NULL) {
-				dev_dbg(&port->dev, "%s: alloc for in "
-					"port failed.", __func__);
-				continue;
-			}
-			/* Fill URB using supplied data. */
-			usb_fill_bulk_urb(urb, serial->dev,
-					  usb_rcvbulkpipe(serial->dev,
-						port->bulk_in_endpointAddress),
-					  portdata->in_buffer[j], IN_BUFLEN,
-					  sierra_indat_callback, port);
-			portdata->in_urbs[j] = urb;
-		}
 	}
 
 	return 0;
@@ -700,7 +766,7 @@ static int sierra_startup(struct usb_serial *serial)
 
 static void sierra_shutdown(struct usb_serial *serial)
 {
-	int i, j;
+	int i;
 	struct usb_serial_port *port;
 	struct sierra_port_private *portdata;
 
@@ -713,12 +779,6 @@ static void sierra_shutdown(struct usb_serial *serial)
 		portdata = usb_get_serial_port_data(port);
 		if (!portdata)
 			continue;
-
-		for (j = 0; j < N_IN_URB; j++) {
-			usb_kill_urb(portdata->in_urbs[j]);
-			usb_free_urb(portdata->in_urbs[j]);
-			kfree(portdata->in_buffer[j]);
-		}
 		kfree(portdata);
 		usb_set_serial_port_data(port, NULL);
 	}
-- 
cgit v0.10.2


From 4db2299da213d1ba8cf7f4c0a197ae7ba49db5cb Mon Sep 17 00:00:00 2001
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Thu, 11 Jun 2009 14:32:01 +0100
Subject: sierra: driver interface blacklisting

Interface blacklisting is necessary for non-serial interfaces that are handled
by a different driver. The interface blacklisting is implemented in sierra
driver per device. Each device in need of a blacklist has a static information
array kept in the driver. This array contains the interface numbers that are
blacklisted. The pointer for each blacklist array and the length
of that blacklist are 'bundled' in data structure sierra_iface_info. A pointer
to this information is set in id_table when the device is added to the id_table.

The following is summary of changes we have made to sierra.c driver in
this patch dealing with interface blacklisting support:
- Added data structure sierra_iface_info and function is_blacklisted()
to support blacklisting
- Modified sierra_probe() to handle blacklisted interfaces accordingly
- Improved comments in id_table
- Added new device in id_table with blacklist interface support

Signed-off-by: Elina Pasheva <epasheva@sierrawireless.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 88ec7bf..17ac34f 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -37,6 +37,12 @@
 static int debug;
 static int nmea;
 
+/* Used in interface blacklisting */
+struct sierra_iface_info {
+	const u32 infolen;	/* number of interface numbers on blacklist */
+	const u8  *ifaceinfo;	/* pointer to the array holding the numbers */
+};
+
 static int sierra_set_power_state(struct usb_device *udev, __u16 swiState)
 {
 	int result;
@@ -83,6 +89,23 @@ static int sierra_calc_num_ports(struct usb_serial *serial)
 	return result;
 }
 
+static int is_blacklisted(const u8 ifnum,
+				const struct sierra_iface_info *blacklist)
+{
+	const u8  *info;
+	int i;
+
+	if (blacklist) {
+		info = blacklist->ifaceinfo;
+
+		for (i = 0; i < blacklist->infolen; i++) {
+			if (info[i] == ifnum)
+				return 1;
+		}
+	}
+	return 0;
+}
+
 static int sierra_calc_interface(struct usb_serial *serial)
 {
 	int interface;
@@ -151,9 +174,25 @@ static int sierra_probe(struct usb_serial *serial,
 	 */
 	usb_set_serial_data(serial, (void *)num_ports);
 
+	/* ifnum could have changed - by calling usb_set_interface */
+	ifnum = sierra_calc_interface(serial);
+
+	if (is_blacklisted(ifnum,
+				(struct sierra_iface_info *)id->driver_info)) {
+		dev_dbg(&serial->dev->dev,
+			"Ignoring blacklisted interface #%d\n", ifnum);
+		return -ENODEV;
+	}
+
 	return result;
 }
 
+static const u8 direct_ip_non_serial_ifaces[] = { 7, 8, 9, 10, 11 };
+static const struct sierra_iface_info direct_ip_interface_blacklist = {
+	.infolen = ARRAY_SIZE(direct_ip_non_serial_ifaces),
+	.ifaceinfo = direct_ip_non_serial_ifaces,
+};
+
 static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x0017) },	/* Sierra Wireless EM5625 */
 	{ USB_DEVICE(0x1199, 0x0018) },	/* Sierra Wireless MC5720 */
@@ -186,9 +225,11 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x6833) },	/* Sierra Wireless MC8781 */
 	{ USB_DEVICE(0x1199, 0x683A) },	/* Sierra Wireless MC8785 */
 	{ USB_DEVICE(0x1199, 0x683B) },	/* Sierra Wireless MC8785 Composite */
-	{ USB_DEVICE(0x1199, 0x683C) },	/* Sierra Wireless MC8790 */
-	{ USB_DEVICE(0x1199, 0x683D) },	/* Sierra Wireless MC8790 */
-	{ USB_DEVICE(0x1199, 0x683E) },	/* Sierra Wireless MC8790 */
+	/* Sierra Wireless MC8790, MC8791, MC8792 Composite */
+	{ USB_DEVICE(0x1199, 0x683C) },
+	{ USB_DEVICE(0x1199, 0x683D) },	/* Sierra Wireless MC8791 Composite */
+	/* Sierra Wireless MC8790, MC8791, MC8792 */
+	{ USB_DEVICE(0x1199, 0x683E) },
 	{ USB_DEVICE(0x1199, 0x6850) },	/* Sierra Wireless AirCard 880 */
 	{ USB_DEVICE(0x1199, 0x6851) },	/* Sierra Wireless AirCard 881 */
 	{ USB_DEVICE(0x1199, 0x6852) },	/* Sierra Wireless AirCard 880 E */
@@ -209,6 +250,10 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */
 	{ USB_DEVICE(0x0F3D, 0x0112) }, /* Airprime/Sierra PC 5220 */
 
+	{ USB_DEVICE(0x1199, 0x68A3), 	/* Sierra Wireless Direct IP modems */
+	  .driver_info = (kernel_ulong_t)&direct_ip_interface_blacklist
+	},
+
 	{ }
 };
 MODULE_DEVICE_TABLE(usb, id_table);
-- 
cgit v0.10.2


From 5fc5b42a3bb564f0b6e03f0f1b522ed9100250ad Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 14:32:42 +0100
Subject: tty: remove sleep_on

Use wait_event instead of sleep_on in tty_block_til_ready.

Wait for ASYNC_CLOSING flag being 0.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 4d08b6d..931af10 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -198,7 +198,8 @@ int tty_port_block_til_ready(struct tty_port *port,
 
 	/* block if port is in the process of being closed */
 	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&port->close_wait);
+		wait_event_interruptible(port->close_wait,
+				!(port->flags & ASYNC_CLOSING));
 		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
-- 
cgit v0.10.2


From 3e3b5c087799e536871c8261b05bc28e4783c8da Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 14:33:37 +0100
Subject: tty: use prepare/finish_wait

Use prepare_to_wait and finish_wait instead of add_wait_queue and
remove_wait_queue.

This avoids us setting a task state.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 931af10..62dadfc 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -222,7 +222,6 @@ int tty_port_block_til_ready(struct tty_port *port,
 	   before the next open may complete */
 
 	retval = 0;
-	add_wait_queue(&port->open_wait, &wait);
 
 	/* The port lock protects the port counts */
 	spin_lock_irqsave(&port->lock, flags);
@@ -236,7 +235,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 		if (tty->termios->c_cflag & CBAUD)
 			tty_port_raise_dtr_rts(port);
 
-		set_current_state(TASK_INTERRUPTIBLE);
+		prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
 		/* Check for a hangup or uninitialised port. Return accordingly */
 		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
 			if (port->flags & ASYNC_HUP_NOTIFY)
@@ -257,8 +256,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 		}
 		schedule();
 	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->open_wait, &wait);
+	finish_wait(&port->open_wait, &wait);
 
 	/* Update counts. A parallel hangup will have set count to zero and
 	   we must not mess that up further */
-- 
cgit v0.10.2


From d3810cd4d7064b109574374f73c41b11b481dbf2 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:35:01 +0100
Subject: imx: serial: fix whitespaces (no changes in functionality)

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 5f0be40..76c8fa1 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -302,8 +302,7 @@ static inline void imx_transmit_buffer(struct imx_port *sport)
 		/* send xmit->buf[xmit->tail]
 		 * out the port here */
 		writel(xmit->buf[xmit->tail], sport->port.membase + URTX0);
-		xmit->tail = (xmit->tail + 1) &
-		         (UART_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		sport->port.icount.tx++;
 		if (uart_circ_empty(xmit))
 			break;
@@ -395,8 +394,7 @@ static irqreturn_t imx_rxint(int irq, void *dev_id)
 				continue;
 		}
 
-		if (uart_handle_sysrq_char
-		            (&sport->port, (unsigned char)rx))
+		if (uart_handle_sysrq_char(&sport->port, (unsigned char)rx))
 			continue;
 
 		if (rx & (URXD_PRERR | URXD_OVRRUN | URXD_FRMERR) ) {
@@ -471,26 +469,26 @@ static unsigned int imx_tx_empty(struct uart_port *port)
  */
 static unsigned int imx_get_mctrl(struct uart_port *port)
 {
-        struct imx_port *sport = (struct imx_port *)port;
-        unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
+	struct imx_port *sport = (struct imx_port *)port;
+	unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
 
-        if (readl(sport->port.membase + USR1) & USR1_RTSS)
-                tmp |= TIOCM_CTS;
+	if (readl(sport->port.membase + USR1) & USR1_RTSS)
+		tmp |= TIOCM_CTS;
 
-        if (readl(sport->port.membase + UCR2) & UCR2_CTS)
-                tmp |= TIOCM_RTS;
+	if (readl(sport->port.membase + UCR2) & UCR2_CTS)
+		tmp |= TIOCM_RTS;
 
-        return tmp;
+	return tmp;
 }
 
 static void imx_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
-        struct imx_port *sport = (struct imx_port *)port;
+	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
 	temp = readl(sport->port.membase + UCR2) & ~UCR2_CTS;
 
-        if (mctrl & TIOCM_RTS)
+	if (mctrl & TIOCM_RTS)
 		temp |= UCR2_CTS;
 
 	writel(temp, sport->port.membase + UCR2);
@@ -1072,22 +1070,22 @@ static struct uart_driver imx_reg = {
 
 static int serial_imx_suspend(struct platform_device *dev, pm_message_t state)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+	struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_suspend_port(&imx_reg, &sport->port);
+	if (sport)
+		uart_suspend_port(&imx_reg, &sport->port);
 
-        return 0;
+	return 0;
 }
 
 static int serial_imx_resume(struct platform_device *dev)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+	struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_resume_port(&imx_reg, &sport->port);
+	if (sport)
+		uart_resume_port(&imx_reg, &sport->port);
 
-        return 0;
+	return 0;
 }
 
 static int serial_imx_probe(struct platform_device *pdev)
@@ -1143,7 +1141,7 @@ static int serial_imx_probe(struct platform_device *pdev)
 	imx_ports[pdev->id] = sport;
 
 	pdata = pdev->dev.platform_data;
-	if(pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
+	if (pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
 		sport->have_rtscts = 1;
 
 	if (pdata->init) {
@@ -1193,13 +1191,13 @@ static int serial_imx_remove(struct platform_device *pdev)
 }
 
 static struct platform_driver serial_imx_driver = {
-        .probe          = serial_imx_probe,
-        .remove         = serial_imx_remove,
+	.probe		= serial_imx_probe,
+	.remove		= serial_imx_remove,
 
 	.suspend	= serial_imx_suspend,
 	.resume		= serial_imx_resume,
 	.driver		= {
-	        .name	= "imx-uart",
+		.name	= "imx-uart",
 		.owner	= THIS_MODULE,
 	},
 };
-- 
cgit v0.10.2


From 26bbb3ff1ff6163d6a233055766e26af8054a212 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Thu, 11 Jun 2009 14:36:29 +0100
Subject: imx: serial: fix one bit field type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"have_rtscts" is assigned 1, while it is declared
int:1, two's complement, which can hold 0 and -1
only. The code works, as the upper bits are cut
off, and tests are done against 0 only.

Nonetheless, correctly declaring the bit field
as unsigned int:1 renders the code more robust.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 76c8fa1..6b8f12f4 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -211,7 +211,7 @@ struct imx_port {
 	struct timer_list	timer;
 	unsigned int		old_status;
 	int			txirq,rxirq,rtsirq;
-	int			have_rtscts:1;
+	unsigned int		have_rtscts:1;
 	struct clk		*clk;
 };
 
-- 
cgit v0.10.2


From 977757311e50dc5d832c9fef34e7555411f7ccd8 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:37:19 +0100
Subject: imx: serial: notify higher layers in case xmit IRQ was not called

upper layers, namely line discipline, need to be notified
when transmission of more data is possible. For spurious
cases, where IRQ handling does not supply notification
for sure, it is given additionally here, when data has just
been transmitted and space in the buffer will most probably
be available.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 6b8f12f4..49f2e12 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -308,6 +308,9 @@ static inline void imx_transmit_buffer(struct imx_port *sport)
 			break;
 	}
 
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&sport->port);
+
 	if (uart_circ_empty(xmit))
 		imx_stop_tx(&sport->port);
 }
-- 
cgit v0.10.2


From 2e1463922a35584c863f71d4021e1e71f76eaed0 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:38:38 +0100
Subject: imx: serial: be sure to stop xmit upon shutdown

needed to avoid continued transmission by hardware
while software already shuts down, which might
cause dangling characters to show up in hardware
queues when restarting the device.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 49f2e12..e6c2ba2 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -634,6 +634,10 @@ static void imx_shutdown(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	temp = readl(sport->port.membase + UCR2);
+	temp &= ~(UCR2_TXEN);
+	writel(temp, sport->port.membase + UCR2);
+
 	/*
 	 * Stop our timer.
 	 */
-- 
cgit v0.10.2


From 9f322ad064f9210e7d472dfe77e702274d5c9dba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Thu, 11 Jun 2009 14:39:21 +0100
Subject: imx: serial: handle initialisation failure correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

correctly de-initialise device when setting up failed,
call to pdata->exit() was missing.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index e6c2ba2..cbd4f32 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -1157,10 +1157,15 @@ static int serial_imx_probe(struct platform_device *pdev)
 			goto clkput;
 	}
 
-	uart_add_one_port(&imx_reg, &sport->port);
+	ret = uart_add_one_port(&imx_reg, &sport->port);
+	if (ret)
+		goto deinit;
 	platform_set_drvdata(pdev, &sport->port);
 
 	return 0;
+deinit:
+	if (pdata->exit)
+		pdata->exit(pdev);
 clkput:
 	clk_put(sport->clk);
 	clk_disable(sport->clk);
-- 
cgit v0.10.2


From 8759ef32d992fc6c0bcbe40fca7aa302190918a5 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:51:15 +0100
Subject: lib: isolate rational fractions helper function

Provide a helper function to determine optimum numerator
denominator value pairs taking into account restricted
register size. Useful especially with PLL and other clock
configurations.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/rational.h b/include/linux/rational.h
new file mode 100644
index 0000000..4f532fc
--- /dev/null
+++ b/include/linux/rational.h
@@ -0,0 +1,19 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers,
+ * e.g. when calculating optimum numerator/denominator pairs for
+ * pll configuration taking into account restricted register size
+ */
+
+#ifndef _LINUX_RATIONAL_H
+#define _LINUX_RATIONAL_H
+
+void rational_best_approximation(
+	unsigned long given_numerator, unsigned long given_denominator,
+	unsigned long max_numerator, unsigned long max_denominator,
+	unsigned long *best_numerator, unsigned long *best_denominator);
+
+#endif /* _LINUX_RATIONAL_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 8ade0a7..9960be0 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -10,6 +10,9 @@ menu "Library routines"
 config BITREVERSE
 	tristate
 
+config RATIONAL
+	boolean
+
 config GENERIC_FIND_FIRST_BIT
 	bool
 
diff --git a/lib/Makefile b/lib/Makefile
index 33a40e4..1f6edef 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -50,6 +50,7 @@ ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
 endif
 
 obj-$(CONFIG_BITREVERSE) += bitrev.o
+obj-$(CONFIG_RATIONAL)	+= rational.o
 obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
 obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
diff --git a/lib/rational.c b/lib/rational.c
new file mode 100644
index 0000000..b3c099b
--- /dev/null
+++ b/lib/rational.c
@@ -0,0 +1,62 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers
+ */
+
+#include <linux/rational.h>
+
+/*
+ * calculate best rational approximation for a given fraction
+ * taking into account restricted register size, e.g. to find
+ * appropriate values for a pll with 5 bit denominator and
+ * 8 bit numerator register fields, trying to set up with a
+ * frequency ratio of 3.1415, one would say:
+ *
+ * rational_best_approximation(31415, 10000,
+ *		(1 << 8) - 1, (1 << 5) - 1, &n, &d);
+ *
+ * you may look at given_numerator as a fixed point number,
+ * with the fractional part size described in given_denominator.
+ *
+ * for theoretical background, see:
+ * http://en.wikipedia.org/wiki/Continued_fraction
+ */
+
+void rational_best_approximation(
+	unsigned long given_numerator, unsigned long given_denominator,
+	unsigned long max_numerator, unsigned long max_denominator,
+	unsigned long *best_numerator, unsigned long *best_denominator)
+{
+	unsigned long n, d, n0, d0, n1, d1;
+	n = given_numerator;
+	d = given_denominator;
+	n0 = d1 = 0;
+	n1 = d0 = 1;
+	for (;;) {
+		unsigned long t, a;
+		if ((n1 > max_numerator) || (d1 > max_denominator)) {
+			n1 = n0;
+			d1 = d0;
+			break;
+		}
+		if (d == 0)
+			break;
+		t = d;
+		a = n / d;
+		d = n % d;
+		n = t;
+		t = n0 + a * n1;
+		n0 = n1;
+		n1 = t;
+		t = d0 + a * d1;
+		d0 = d1;
+		d1 = t;
+	}
+	*best_numerator = n1;
+	*best_denominator = d1;
+}
+
+EXPORT_SYMBOL(rational_best_approximation);
-- 
cgit v0.10.2


From 534fca068ec8063ec8b67626b3eb34ba6ec86967 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:52:23 +0100
Subject: imx: serial: use rational library function

for calculation of numerator and denominator
used in baud rate setting, use generic library function
for optimum settings.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 3391ef0..641e800 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -833,6 +833,7 @@ config SERIAL_IMX
 	bool "IMX serial port support"
 	depends on ARM && (ARCH_IMX || ARCH_MXC)
 	select SERIAL_CORE
+	select RATIONAL
 	help
 	  If you have a machine based on a Motorola IMX CPU you
 	  can enable its onboard serial port by enabling this option.
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index cbd4f32..0de81f7 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -41,6 +41,7 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/clk.h>
+#include <linux/rational.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -670,7 +671,8 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	unsigned long flags;
 	unsigned int ucr2, old_ucr1, old_txrxen, baud, quot;
 	unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
-	unsigned int div, num, denom, ufcr;
+	unsigned int div, ufcr;
+	unsigned long num, denom;
 
 	/*
 	 * If we don't support modem control lines, don't allow
@@ -772,32 +774,20 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	if (!div)
 		div = 1;
 
-	num = baud;
-	denom = port->uartclk / div / 16;
+	rational_best_approximation(16 * div * baud, sport->port.uartclk,
+		1 << 16, 1 << 16, &num, &denom);
 
-	/* shift num and denom right until they fit into 16 bits */
-	while (num > 0x10000 || denom > 0x10000) {
-		num >>= 1;
-		denom >>= 1;
-	}
-	if (num > 0)
-		num -= 1;
-	if (denom > 0)
-		denom -= 1;
-
-	writel(num, sport->port.membase + UBIR);
-	writel(denom, sport->port.membase + UBMR);
-
-	if (div == 7)
-		div = 6; /* 6 in RFDIV means divide by 7 */
-	else
-		div = 6 - div;
+	num -= 1;
+	denom -= 1;
 
 	ufcr = readl(sport->port.membase + UFCR);
 	ufcr = (ufcr & (~UFCR_RFDIV)) |
 	    (div << 7);
 	writel(ufcr, sport->port.membase + UFCR);
 
+	writel(num, sport->port.membase + UBIR);
+	writel(denom, sport->port.membase + UBMR);
+
 #ifdef ONEMS
 	writel(sport->port.uartclk / div / 1000, sport->port.membase + ONEMS);
 #endif
-- 
cgit v0.10.2


From b6e4913834cd032082e5c76dfade61050212dc98 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:53:18 +0100
Subject: imx: serial: add IrDA support to serial driver

Using the iMX serial driver with an IrDA device
needs extra peripheral settings and specific
timing depending on the transmitter circuitry used.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/plat-mxc/include/mach/imx-uart.h b/arch/arm/plat-mxc/include/mach/imx-uart.h
index 599217b..f9bd17d 100644
--- a/arch/arm/plat-mxc/include/mach/imx-uart.h
+++ b/arch/arm/plat-mxc/include/mach/imx-uart.h
@@ -20,11 +20,16 @@
 #define ASMARM_ARCH_UART_H
 
 #define IMXUART_HAVE_RTSCTS (1<<0)
+#define IMXUART_IRDA        (1<<1)
 
 struct imxuart_platform_data {
 	int (*init)(struct platform_device *pdev);
 	int (*exit)(struct platform_device *pdev);
 	unsigned int flags;
+	void (*irda_enable)(int enable);
+	unsigned int irda_inv_rx:1;
+	unsigned int irda_inv_tx:1;
+	unsigned short transceiver_delay;
 };
 
 #endif
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 0de81f7..8c79e8c 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -8,6 +8,9 @@
  *  Author: Sascha Hauer <sascha@saschahauer.de>
  *  Copyright (C) 2004 Pengutronix
  *
+ *  Copyright (C) 2009 emlix GmbH
+ *  Author: Fabian Godehardt (added IrDA support for iMX)
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -41,6 +44,7 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/clk.h>
+#include <linux/delay.h>
 #include <linux/rational.h>
 
 #include <asm/io.h>
@@ -149,6 +153,7 @@
 #define  UCR4_DREN  	 (1<<0)  /* Recv data ready interrupt enable */
 #define  UFCR_RXTL_SHF   0       /* Receiver trigger level shift */
 #define  UFCR_RFDIV      (7<<7)  /* Reference freq divider mask */
+#define  UFCR_RFDIV_REG(x)	(((x) < 7 ? 6 - (x) : 6) << 7)
 #define  UFCR_TXTL_SHF   10      /* Transmitter trigger level shift */
 #define  USR1_PARITYERR  (1<<15) /* Parity error interrupt flag */
 #define  USR1_RTSS  	 (1<<14) /* RTS pin status */
@@ -213,9 +218,19 @@ struct imx_port {
 	unsigned int		old_status;
 	int			txirq,rxirq,rtsirq;
 	unsigned int		have_rtscts:1;
+	unsigned int		use_irda:1;
+	unsigned int		irda_inv_rx:1;
+	unsigned int		irda_inv_tx:1;
+	unsigned short		trcv_delay; /* transceiver delay */
 	struct clk		*clk;
 };
 
+#ifdef CONFIG_IRDA
+#define USE_IRDA(sport)	((sport)->use_irda)
+#else
+#define USE_IRDA(sport)	(0)
+#endif
+
 /*
  * Handle any change of modem status signal since we were last called.
  */
@@ -269,6 +284,48 @@ static void imx_stop_tx(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	if (USE_IRDA(sport)) {
+		/* half duplex - wait for end of transmission */
+		int n = 256;
+		while ((--n > 0) &&
+		      !(readl(sport->port.membase + USR2) & USR2_TXDC)) {
+			udelay(5);
+			barrier();
+		}
+		/*
+		 * irda transceiver - wait a bit more to avoid
+		 * cutoff, hardware dependent
+		 */
+		udelay(sport->trcv_delay);
+
+		/*
+		 * half duplex - reactivate receive mode,
+		 * flush receive pipe echo crap
+		 */
+		if (readl(sport->port.membase + USR2) & USR2_TXDC) {
+			temp = readl(sport->port.membase + UCR1);
+			temp &= ~(UCR1_TXMPTYEN | UCR1_TRDYEN);
+			writel(temp, sport->port.membase + UCR1);
+
+			temp = readl(sport->port.membase + UCR4);
+			temp &= ~(UCR4_TCEN);
+			writel(temp, sport->port.membase + UCR4);
+
+			while (readl(sport->port.membase + URXD0) &
+			       URXD_CHARRDY)
+				barrier();
+
+			temp = readl(sport->port.membase + UCR1);
+			temp |= UCR1_RRDYEN;
+			writel(temp, sport->port.membase + UCR1);
+
+			temp = readl(sport->port.membase + UCR4);
+			temp |= UCR4_DREN;
+			writel(temp, sport->port.membase + UCR4);
+		}
+		return;
+	}
+
 	temp = readl(sport->port.membase + UCR1);
 	writel(temp & ~UCR1_TXMPTYEN, sport->port.membase + UCR1);
 }
@@ -324,9 +381,30 @@ static void imx_start_tx(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	if (USE_IRDA(sport)) {
+		/* half duplex in IrDA mode; have to disable receive mode */
+		temp = readl(sport->port.membase + UCR4);
+		temp &= ~(UCR4_DREN);
+		writel(temp, sport->port.membase + UCR4);
+
+		temp = readl(sport->port.membase + UCR1);
+		temp &= ~(UCR1_RRDYEN);
+		writel(temp, sport->port.membase + UCR1);
+	}
+
 	temp = readl(sport->port.membase + UCR1);
 	writel(temp | UCR1_TXMPTYEN, sport->port.membase + UCR1);
 
+	if (USE_IRDA(sport)) {
+		temp = readl(sport->port.membase + UCR1);
+		temp |= UCR1_TRDYEN;
+		writel(temp, sport->port.membase + UCR1);
+
+		temp = readl(sport->port.membase + UCR4);
+		temp |= UCR4_TCEN;
+		writel(temp, sport->port.membase + UCR4);
+	}
+
 	if (readl(sport->port.membase + UTS) & UTS_TXEMPTY)
 		imx_transmit_buffer(sport);
 }
@@ -536,12 +614,7 @@ static int imx_setup_ufcr(struct imx_port *sport, unsigned int mode)
 	if(!ufcr_rfdiv)
 		ufcr_rfdiv = 1;
 
-	if(ufcr_rfdiv >= 7)
-		ufcr_rfdiv = 6;
-	else
-		ufcr_rfdiv = 6 - ufcr_rfdiv;
-
-	val |= UFCR_RFDIV & (ufcr_rfdiv << 7);
+	val |= UFCR_RFDIV_REG(ufcr_rfdiv);
 
 	writel(val, sport->port.membase + UFCR);
 
@@ -560,8 +633,24 @@ static int imx_startup(struct uart_port *port)
 	 * requesting IRQs
 	 */
 	temp = readl(sport->port.membase + UCR4);
+
+	if (USE_IRDA(sport))
+		temp |= UCR4_IRSC;
+
 	writel(temp & ~UCR4_DREN, sport->port.membase + UCR4);
 
+	if (USE_IRDA(sport)) {
+		/* reset fifo's and state machines */
+		int i = 100;
+		temp = readl(sport->port.membase + UCR2);
+		temp &= ~UCR2_SRST;
+		writel(temp, sport->port.membase + UCR2);
+		while (!(readl(sport->port.membase + UCR2) & UCR2_SRST) &&
+		    (--i > 0)) {
+			udelay(1);
+		}
+	}
+
 	/*
 	 * Allocate the IRQ(s) i.MX1 has three interrupts whereas later
 	 * chips only have one interrupt.
@@ -577,12 +666,16 @@ static int imx_startup(struct uart_port *port)
 		if (retval)
 			goto error_out2;
 
-		retval = request_irq(sport->rtsirq, imx_rtsint,
-			     (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
-			       IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
-				DRIVER_NAME, sport);
-		if (retval)
-			goto error_out3;
+		/* do not use RTS IRQ on IrDA */
+		if (!USE_IRDA(sport)) {
+			retval = request_irq(sport->rtsirq, imx_rtsint,
+				     (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
+				       IRQF_TRIGGER_FALLING |
+				       IRQF_TRIGGER_RISING,
+					DRIVER_NAME, sport);
+			if (retval)
+				goto error_out3;
+		}
 	} else {
 		retval = request_irq(sport->port.irq, imx_int, 0,
 				DRIVER_NAME, sport);
@@ -599,18 +692,49 @@ static int imx_startup(struct uart_port *port)
 
 	temp = readl(sport->port.membase + UCR1);
 	temp |= UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN;
+
+	if (USE_IRDA(sport)) {
+		temp |= UCR1_IREN;
+		temp &= ~(UCR1_RTSDEN);
+	}
+
 	writel(temp, sport->port.membase + UCR1);
 
 	temp = readl(sport->port.membase + UCR2);
 	temp |= (UCR2_RXEN | UCR2_TXEN);
 	writel(temp, sport->port.membase + UCR2);
 
+	if (USE_IRDA(sport)) {
+		/* clear RX-FIFO */
+		int i = 64;
+		while ((--i > 0) &&
+			(readl(sport->port.membase + URXD0) & URXD_CHARRDY)) {
+			barrier();
+		}
+	}
+
 #if defined CONFIG_ARCH_MX2 || defined CONFIG_ARCH_MX3
 	temp = readl(sport->port.membase + UCR3);
 	temp |= UCR3_RXDMUXSEL;
 	writel(temp, sport->port.membase + UCR3);
 #endif
 
+	if (USE_IRDA(sport)) {
+		temp = readl(sport->port.membase + UCR4);
+		if (sport->irda_inv_rx)
+			temp |= UCR4_INVR;
+		else
+			temp &= ~(UCR4_INVR);
+		writel(temp | UCR4_DREN, sport->port.membase + UCR4);
+
+		temp = readl(sport->port.membase + UCR3);
+		if (sport->irda_inv_tx)
+			temp |= UCR3_INVT;
+		else
+			temp &= ~(UCR3_INVT);
+		writel(temp, sport->port.membase + UCR3);
+	}
+
 	/*
 	 * Enable modem status interrupts
 	 */
@@ -618,6 +742,16 @@ static int imx_startup(struct uart_port *port)
 	imx_enable_ms(&sport->port);
 	spin_unlock_irqrestore(&sport->port.lock,flags);
 
+	if (USE_IRDA(sport)) {
+		struct imxuart_platform_data *pdata;
+		pdata = sport->port.dev->platform_data;
+		sport->irda_inv_rx = pdata->irda_inv_rx;
+		sport->irda_inv_tx = pdata->irda_inv_tx;
+		sport->trcv_delay = pdata->transceiver_delay;
+		if (pdata->irda_enable)
+			pdata->irda_enable(1);
+	}
+
 	return 0;
 
 error_out3:
@@ -639,6 +773,13 @@ static void imx_shutdown(struct uart_port *port)
 	temp &= ~(UCR2_TXEN);
 	writel(temp, sport->port.membase + UCR2);
 
+	if (USE_IRDA(sport)) {
+		struct imxuart_platform_data *pdata;
+		pdata = sport->port.dev->platform_data;
+		if (pdata->irda_enable)
+			pdata->irda_enable(0);
+	}
+
 	/*
 	 * Stop our timer.
 	 */
@@ -648,7 +789,8 @@ static void imx_shutdown(struct uart_port *port)
 	 * Free the interrupts
 	 */
 	if (sport->txirq > 0) {
-		free_irq(sport->rtsirq, sport);
+		if (!USE_IRDA(sport))
+			free_irq(sport->rtsirq, sport);
 		free_irq(sport->txirq, sport);
 		free_irq(sport->rxirq, sport);
 	} else
@@ -660,6 +802,9 @@ static void imx_shutdown(struct uart_port *port)
 
 	temp = readl(sport->port.membase + UCR1);
 	temp &= ~(UCR1_TXMPTYEN | UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN);
+	if (USE_IRDA(sport))
+		temp &= ~(UCR1_IREN);
+
 	writel(temp, sport->port.membase + UCR1);
 }
 
@@ -768,11 +913,19 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 			sport->port.membase + UCR2);
 	old_txrxen &= (UCR2_TXEN | UCR2_RXEN);
 
-	div = sport->port.uartclk / (baud * 16);
-	if (div > 7)
-		div = 7;
-	if (!div)
+	if (USE_IRDA(sport)) {
+		/*
+		 * use maximum available submodule frequency to
+		 * avoid missing short pulses due to low sampling rate
+		 */
 		div = 1;
+	} else {
+		div = sport->port.uartclk / (baud * 16);
+		if (div > 7)
+			div = 7;
+		if (!div)
+			div = 1;
+	}
 
 	rational_best_approximation(16 * div * baud, sport->port.uartclk,
 		1 << 16, 1 << 16, &num, &denom);
@@ -781,8 +934,7 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	denom -= 1;
 
 	ufcr = readl(sport->port.membase + UFCR);
-	ufcr = (ufcr & (~UFCR_RFDIV)) |
-	    (div << 7);
+	ufcr = (ufcr & (~UFCR_RFDIV)) | UFCR_RFDIV_REG(div);
 	writel(ufcr, sport->port.membase + UFCR);
 
 	writel(num, sport->port.membase + UBIR);
@@ -1141,6 +1293,11 @@ static int serial_imx_probe(struct platform_device *pdev)
 	if (pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
 		sport->have_rtscts = 1;
 
+#ifdef CONFIG_IRDA
+	if (pdata && (pdata->flags & IMXUART_IRDA))
+		sport->use_irda = 1;
+#endif
+
 	if (pdata->init) {
 		ret = pdata->init(pdev);
 		if (ret)
-- 
cgit v0.10.2


From d7f8d437bda0ec409e26cffb846bc28a40603ee3 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:55:22 +0100
Subject: imx: serial: use tty_encode_baud_rate to set true rate

real baud rate may be different from the one requested.
for upper layers, set the nearest value to the real rate
in favour of the rate previously requested.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 8c79e8c..7b5d1de 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -818,6 +818,7 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
 	unsigned int div, ufcr;
 	unsigned long num, denom;
+	uint64_t tdiv64;
 
 	/*
 	 * If we don't support modem control lines, don't allow
@@ -930,6 +931,12 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	rational_best_approximation(16 * div * baud, sport->port.uartclk,
 		1 << 16, 1 << 16, &num, &denom);
 
+	tdiv64 = sport->port.uartclk;
+	tdiv64 *= num;
+	do_div(tdiv64, denom * 16 * div);
+	tty_encode_baud_rate(sport->port.info->port.tty,
+		(speed_t)tdiv64, (speed_t)tdiv64);
+
 	num -= 1;
 	denom -= 1;
 
-- 
cgit v0.10.2


From f0e8527726b9e56649b9eafde3bc0fbc4dd2dd47 Mon Sep 17 00:00:00 2001
From: Dirk Eibach <eibach@gdsys.de>
Date: Thu, 11 Jun 2009 14:56:44 +0100
Subject: moxa: prevent opening unavailable ports

In moxa.c there are 32 minor numbers reserved for each device. The
number of ports actually available per device is stored in
moxa_board_conf->numPorts. This number is not considered in moxa_open().
Opening a port that is not available results in a kernel oops.
This patch adds a test to moxa_open() that prevents opening unavailable
ports.

Signed-off-by: Dirk Eibach <eibach@gdsys.de>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 4a4cab7..65b6ff2 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -1184,6 +1184,11 @@ static int moxa_open(struct tty_struct *tty, struct file *filp)
 		return -ENODEV;
 	}
 
+	if (port % MAX_PORTS_PER_BOARD >= brd->numPorts) {
+		mutex_unlock(&moxa_openlock);
+		return -ENODEV;
+	}
+
 	ch = &brd->ports[port % MAX_PORTS_PER_BOARD];
 	ch->port.count++;
 	tty->driver_data = ch;
-- 
cgit v0.10.2


From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 13:19:29 +0200
Subject: perf_counter: Rename enums

Rename the perf enums to be in the 'perf_' namespace and strictly
enumerate the ABI bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 95c797c..d5911b0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -24,24 +24,21 @@
 /*
  * attr.type
  */
-enum perf_event_types {
+enum perf_type_id {
 	PERF_TYPE_HARDWARE		= 0,
 	PERF_TYPE_SOFTWARE		= 1,
 	PERF_TYPE_TRACEPOINT		= 2,
 	PERF_TYPE_HW_CACHE		= 3,
+	PERF_TYPE_RAW			= 4,
 
-	/*
-	 * available TYPE space, raw is the max value.
-	 */
-
-	PERF_TYPE_RAW			= 128,
+	PERF_TYPE_MAX,			/* non ABI */
 };
 
 /*
  * Generalized performance counter event types, used by the attr.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum attr_ids {
+enum perf_hw_id {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
@@ -53,7 +50,7 @@ enum attr_ids {
 	PERF_COUNT_BRANCH_MISSES	= 5,
 	PERF_COUNT_BUS_CYCLES		= 6,
 
-	PERF_HW_EVENTS_MAX		= 7,
+	PERF_HW_EVENTS_MAX,		/* non ABI */
 };
 
 /*
@@ -63,30 +60,30 @@ enum attr_ids {
  *       { read, write, prefetch } x
  *       { accesses, misses }
  */
-enum hw_cache_id {
-	PERF_COUNT_HW_CACHE_L1D,
-	PERF_COUNT_HW_CACHE_L1I,
-	PERF_COUNT_HW_CACHE_L2,
-	PERF_COUNT_HW_CACHE_DTLB,
-	PERF_COUNT_HW_CACHE_ITLB,
-	PERF_COUNT_HW_CACHE_BPU,
-
-	PERF_COUNT_HW_CACHE_MAX,
+enum perf_hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D		= 0,
+	PERF_COUNT_HW_CACHE_L1I		= 1,
+	PERF_COUNT_HW_CACHE_L2		= 2,
+	PERF_COUNT_HW_CACHE_DTLB	= 3,
+	PERF_COUNT_HW_CACHE_ITLB	= 4,
+	PERF_COUNT_HW_CACHE_BPU		= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,	/* non ABI */
 };
 
-enum hw_cache_op_id {
-	PERF_COUNT_HW_CACHE_OP_READ,
-	PERF_COUNT_HW_CACHE_OP_WRITE,
-	PERF_COUNT_HW_CACHE_OP_PREFETCH,
+enum perf_hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ	= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,
 
-	PERF_COUNT_HW_CACHE_OP_MAX,
+	PERF_COUNT_HW_CACHE_OP_MAX,	/* non ABI */
 };
 
-enum hw_cache_op_result_id {
-	PERF_COUNT_HW_CACHE_RESULT_ACCESS,
-	PERF_COUNT_HW_CACHE_RESULT_MISS,
+enum perf_hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
 
-	PERF_COUNT_HW_CACHE_RESULT_MAX,
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non ABI */
 };
 
 /*
@@ -95,7 +92,7 @@ enum hw_cache_op_result_id {
  * physical and sw events of the kernel (and allow the profiling of them as
  * well):
  */
-enum sw_event_ids {
+enum perf_sw_ids {
 	PERF_COUNT_CPU_CLOCK		= 0,
 	PERF_COUNT_TASK_CLOCK		= 1,
 	PERF_COUNT_PAGE_FAULTS		= 2,
@@ -104,7 +101,7 @@ enum sw_event_ids {
 	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
 	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 
-	PERF_SW_EVENTS_MAX		= 7,
+	PERF_SW_EVENTS_MAX,		/* non ABI */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b2829d..c02535b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3162,7 +3162,7 @@ static int perf_swcounter_is_counting(struct perf_counter *counter)
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-				enum perf_event_types type,
+				enum perf_type_id type,
 				u32 event, struct pt_regs *regs)
 {
 	if (!perf_swcounter_is_counting(counter))
@@ -3194,7 +3194,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum perf_event_types type, u32 event,
+				     enum perf_type_id type, u32 event,
 				     u64 nr, int nmi, struct pt_regs *regs,
 				     u64 addr)
 {
@@ -3225,7 +3225,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 	return &cpuctx->recursion[0];
 }
 
-static void __perf_swcounter_event(enum perf_event_types type, u32 event,
+static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 				   u64 nr, int nmi, struct pt_regs *regs,
 				   u64 addr)
 {
-- 
cgit v0.10.2


From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:06:28 +0200
Subject: perf_counter: Standardize event names

Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 0e94b68..73956f0 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int p4_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x1001,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8c10, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c10, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x330,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x331,  /* PM_BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index bbf2cbb..5f8b774 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5p_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x1c10a8, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 670cf10..d54723a 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4c1090, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 4da7078..0cd406ee 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power6_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0x1e,
-	[PERF_COUNT_INSTRUCTIONS] = 2,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x280030, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x30000c, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x410a0,  /* BR_PRED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x400052, /* BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 336adf1..46a2064 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int ppc970_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 1,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 1,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8810, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3810, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x431,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x327,  /* PM_GRP_BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index ac0e112..5beffc8 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ good_area:
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 57ae1be..572fb43 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
  */
 static const u64 intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
-  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
 static u64 intel_pmu_event_map(int event)
@@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids
  */
 static const u64 amd_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
-  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
 };
 
 static u64 amd_pmu_event_map(int event)
@@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6f9df2b..5c6d816 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,11 +1142,11 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d5911b0..887df88 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -42,15 +42,15 @@ enum perf_hw_id {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CYCLES		= 0,
-	PERF_COUNT_INSTRUCTIONS		= 1,
-	PERF_COUNT_CACHE_REFERENCES	= 2,
-	PERF_COUNT_CACHE_MISSES		= 3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_BRANCH_MISSES	= 5,
-	PERF_COUNT_BUS_CYCLES		= 6,
-
-	PERF_HW_EVENTS_MAX,		/* non ABI */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+
+	PERF_COUNT_HW_MAX,		/* non ABI */
 };
 
 /*
@@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id {
  * well):
  */
 enum perf_sw_ids {
-	PERF_COUNT_CPU_CLOCK		= 0,
-	PERF_COUNT_TASK_CLOCK		= 1,
-	PERF_COUNT_PAGE_FAULTS		= 2,
-	PERF_COUNT_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
-
-	PERF_SW_EVENTS_MAX,		/* non ABI */
+	PERF_COUNT_SW_CPU_CLOCK		= 0,
+	PERF_COUNT_SW_TASK_CLOCK	= 1,
+	PERF_COUNT_SW_PAGE_FAULTS	= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
+
+	PERF_COUNT_SW_MAX,		/* non ABI */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c02535b..8859b97 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	int do_switch = 1;
 
 	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
 
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
@@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu)
 	struct perf_counter_context *ctx;
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
-				 PERF_COUNT_CPU_MIGRATIONS,
+				 PERF_COUNT_SW_CPU_MIGRATIONS,
 				 1, 1, NULL, 0);
 
 	ctx = perf_pin_task_context(task);
 	if (ctx) {
 		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
-					 PERF_COUNT_CPU_MIGRATIONS,
+					 PERF_COUNT_SW_CPU_MIGRATIONS,
 					 1, 1, NULL, 0);
 		perf_unpin_context(ctx);
 	}
@@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * events.
 	 */
 	switch (counter->attr.config) {
-	case PERF_COUNT_CPU_CLOCK:
+	case PERF_COUNT_SW_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
 		break;
-	case PERF_COUNT_TASK_CLOCK:
+	case PERF_COUNT_SW_TASK_CLOCK:
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 			pmu = &perf_ops_cpu_clock;
 
 		break;
-	case PERF_COUNT_PAGE_FAULTS:
-	case PERF_COUNT_PAGE_FAULTS_MIN:
-	case PERF_COUNT_PAGE_FAULTS_MAJ:
-	case PERF_COUNT_CONTEXT_SWITCHES:
-	case PERF_COUNT_CPU_MIGRATIONS:
+	case PERF_COUNT_SW_PAGE_FAULTS:
+	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+	case PERF_COUNT_SW_CONTEXT_SWITCHES:
+	case PERF_COUNT_SW_CPU_MIGRATIONS:
 		pmu = &perf_ops_generic;
 		break;
 	}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 84cd336..29259e7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -378,12 +378,12 @@ try_again:
 		 * is always available even if no PMU support:
 		 */
 		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 			if (verbose)
 				warning(" ... trying to fall back to cpu-clock-ticks\n");
 			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_CPU_CLOCK;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
 			goto try_again;
 		}
 		printf("\n");
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6404906..c43e4a9 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -46,15 +46,16 @@
 
 static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
-
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS	},
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES	},
+
 };
 
 static int			system_wide			=  0;
@@ -120,10 +121,10 @@ static inline int nsec_counter(int counter)
 	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
 		return 0;
 
-	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
+	if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
 		return 1;
 
-	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+	if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
 		return 1;
 
 	return 0;
@@ -176,10 +177,10 @@ static void read_counter(int counter)
 	 * Save the full runtime - to allow normalization during printout:
 	 */
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
 		runtime_nsecs = count[0];
 	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
-		attrs[counter].config == PERF_COUNT_CPU_CYCLES)
+		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
 		runtime_cycles = count[0];
 }
 
@@ -206,7 +207,7 @@ static void print_counter(int counter)
 		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
 		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
+			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
 
 			if (walltime_nsecs)
 				fprintf(stderr, " # %11.3f CPU utilization factor",
@@ -220,7 +221,7 @@ static void print_counter(int counter)
 				(double)count[0]/runtime_nsecs*1000.0);
 		if (runtime_cycles &&
 			attrs[counter].type == PERF_TYPE_HARDWARE &&
-				attrs[counter].config == PERF_COUNT_INSTRUCTIONS) {
+				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
 
 			fprintf(stderr, " # %1.3f per cycle",
 				(double)count[0] / (double)runtime_cycles);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 309dbc7..fe338d3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -562,13 +562,13 @@ try_again:
 		 * is always available even if no PMU support:
 		 */
 		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 			if (verbose)
 				warning(" ... trying to fall back to cpu-clock-ticks\n");
 
 			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_CPU_CLOCK;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
 			goto try_again;
 		}
 		printf("\n");
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index d325076..860e116 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -99,13 +99,13 @@ enum hw_event_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CYCLES		= 0,
-	PERF_COUNT_INSTRUCTIONS		= 1,
-	PERF_COUNT_CACHE_REFERENCES	= 2,
-	PERF_COUNT_CACHE_MISSES		= 3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_BRANCH_MISSES	= 5,
-	PERF_COUNT_BUS_CYCLES		= 6,
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES	= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES	= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
 };
 
 These are standardized types of events that work relatively uniformly
@@ -130,13 +130,13 @@ software events, selected by 'event_id':
  * well):
  */
 enum sw_event_ids {
-	PERF_COUNT_CPU_CLOCK		= 0,
-	PERF_COUNT_TASK_CLOCK		= 1,
-	PERF_COUNT_PAGE_FAULTS		= 2,
-	PERF_COUNT_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
+	PERF_COUNT_SW_CPU_CLOCK		= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
 };
 
 Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f18a9a0..9d5f1ca 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -22,26 +22,26 @@ struct event_symbol {
 #define CR(x, y) .type = PERF_TYPE_##x, .config = y
 
 static struct event_symbol event_symbols[] = {
-  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
-  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
-  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
-  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
-  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
-  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
-  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
-
-  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
-  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
+  { C(HARDWARE, HW_CPU_CYCLES),		"cpu-cycles",		},
+  { C(HARDWARE, HW_CPU_CYCLES),		"cycles",		},
+  { C(HARDWARE, HW_INSTRUCTIONS),	"instructions",		},
+  { C(HARDWARE, HW_CACHE_REFERENCES),	"cache-references",	},
+  { C(HARDWARE, HW_CACHE_MISSES),	"cache-misses",		},
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions",	},
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches",		},
+  { C(HARDWARE, HW_BRANCH_MISSES),	"branch-misses",	},
+  { C(HARDWARE, HW_BUS_CYCLES),		"bus-cycles",		},
+
+  { C(SOFTWARE, SW_CPU_CLOCK),		"cpu-clock",		},
+  { C(SOFTWARE, SW_TASK_CLOCK),		"task-clock",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS),	"page-faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS),	"faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS_MIN),	"minor-faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS_MAJ),	"major-faults",		},
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"context-switches",	},
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"cs",			},
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"cpu-migrations",	},
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"migrations",		},
 };
 
 #define __PERF_COUNTER_FIELD(config, name) \
@@ -107,7 +107,7 @@ char *event_name(int counter)
 
 	switch (type) {
 	case PERF_TYPE_HARDWARE:
-		if (config < PERF_HW_EVENTS_MAX)
+		if (config < PERF_COUNT_HW_MAX)
 			return hw_event_names[config];
 		return "unknown-hardware";
 
@@ -136,7 +136,7 @@ char *event_name(int counter)
 	}
 
 	case PERF_TYPE_SOFTWARE:
-		if (config < PERF_SW_EVENTS_MAX)
+		if (config < PERF_COUNT_SW_MAX)
 			return sw_event_names[config];
 		return "unknown-software";
 
-- 
cgit v0.10.2


From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:19:11 +0200
Subject: perf_counter: Rename L2 to LL cache

The top (fastest) and last level (biggest) caches are the most
interesting ones, performance wise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
[ Fixed the Nehalem LL table to LLC Reference/Miss events ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 73956f0..07bd308 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc34,		0	},
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 5f8b774..41e5d2d 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0		},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d54723a..05600b6 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0x3c309b	},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 0cd406ee..46f74be 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0x4008c,	0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x150730,	0x250532	},
 		[C(OP_WRITE)] = {	0x250432,	0x150432	},
 		[C(OP_PREFETCH)] = {	0x810a6,	0		},
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 060e0de..b3f7d12 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0x408a,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x6080,		0x6084	},
 		[C(OP_WRITE)] = {	0x6082,		0x6086	},
 		[C(OP_PREFETCH)] = {	0,		0	},
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 46a2064..ba0a357 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -445,7 +445,7 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0x733,		0	},
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 572fb43..895c82e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
 		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
@@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
-		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
 	},
  },
  [ C(DTLB) ] = {
@@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
 		[ C(RESULT_MISS)   ] = 0,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 887df88..20cf5af 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -56,14 +56,14 @@ enum perf_hw_id {
 /*
  * Generalized hardware cache counters:
  *
- *       { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
  *       { read, write, prefetch } x
  *       { accesses, misses }
  */
 enum perf_hw_cache_id {
 	PERF_COUNT_HW_CACHE_L1D		= 0,
 	PERF_COUNT_HW_CACHE_L1I		= 1,
-	PERF_COUNT_HW_CACHE_L2		= 2,
+	PERF_COUNT_HW_CACHE_LL		= 2,
 	PERF_COUNT_HW_CACHE_DTLB	= 3,
 	PERF_COUNT_HW_CACHE_ITLB	= 4,
 	PERF_COUNT_HW_CACHE_BPU		= 5,
-- 
cgit v0.10.2


From a308444ceb576d3089f9ca0dfd097eba6f1e623f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 14:44:26 +0200
Subject: perf_counter: Better align code

Whitespace and comment bits. Also update copyrights.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 20cf5af..1fa1a26 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -1,12 +1,13 @@
 /*
  *  Performance counters:
  *
- *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
- *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
  *
  *  Data type definitions, declarations, prototypes.
  *
- *  Started by: Thomas Gleixner and Ingo Molnar
+ *    Started by: Thomas Gleixner and Ingo Molnar
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -25,18 +26,19 @@
  * attr.type
  */
 enum perf_type_id {
-	PERF_TYPE_HARDWARE		= 0,
-	PERF_TYPE_SOFTWARE		= 1,
-	PERF_TYPE_TRACEPOINT		= 2,
-	PERF_TYPE_HW_CACHE		= 3,
-	PERF_TYPE_RAW			= 4,
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
 
-	PERF_TYPE_MAX,			/* non ABI */
+	PERF_TYPE_MAX,				/* non-ABI */
 };
 
 /*
- * Generalized performance counter event types, used by the attr.event_id
- * parameter of the sys_perf_counter_open() syscall:
+ * Generalized performance counter event types, used by the
+ * attr.event_id parameter of the sys_perf_counter_open()
+ * syscall:
  */
 enum perf_hw_id {
 	/*
@@ -50,7 +52,7 @@ enum perf_hw_id {
 	PERF_COUNT_HW_BRANCH_MISSES		= 5,
 	PERF_COUNT_HW_BUS_CYCLES		= 6,
 
-	PERF_COUNT_HW_MAX,		/* non ABI */
+	PERF_COUNT_HW_MAX,			/* non-ABI */
 };
 
 /*
@@ -61,29 +63,29 @@ enum perf_hw_id {
  *       { accesses, misses }
  */
 enum perf_hw_cache_id {
-	PERF_COUNT_HW_CACHE_L1D		= 0,
-	PERF_COUNT_HW_CACHE_L1I		= 1,
-	PERF_COUNT_HW_CACHE_LL		= 2,
-	PERF_COUNT_HW_CACHE_DTLB	= 3,
-	PERF_COUNT_HW_CACHE_ITLB	= 4,
-	PERF_COUNT_HW_CACHE_BPU		= 5,
-
-	PERF_COUNT_HW_CACHE_MAX,	/* non ABI */
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
 };
 
 enum perf_hw_cache_op_id {
-	PERF_COUNT_HW_CACHE_OP_READ	= 0,
-	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
-	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
 
-	PERF_COUNT_HW_CACHE_OP_MAX,	/* non ABI */
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
 };
 
 enum perf_hw_cache_op_result_id {
 	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
 	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
 
-	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non ABI */
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
 };
 
 /*
@@ -93,15 +95,15 @@ enum perf_hw_cache_op_result_id {
  * well):
  */
 enum perf_sw_ids {
-	PERF_COUNT_SW_CPU_CLOCK		= 0,
-	PERF_COUNT_SW_TASK_CLOCK	= 1,
-	PERF_COUNT_SW_PAGE_FAULTS	= 2,
-	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
-
-	PERF_COUNT_SW_MAX,		/* non ABI */
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
 
 /*
@@ -109,15 +111,15 @@ enum perf_sw_ids {
  * in the overflow packets.
  */
 enum perf_counter_sample_format {
-	PERF_SAMPLE_IP			= 1U << 0,
-	PERF_SAMPLE_TID			= 1U << 1,
-	PERF_SAMPLE_TIME		= 1U << 2,
-	PERF_SAMPLE_ADDR		= 1U << 3,
-	PERF_SAMPLE_GROUP		= 1U << 4,
-	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
-	PERF_SAMPLE_ID			= 1U << 6,
-	PERF_SAMPLE_CPU			= 1U << 7,
-	PERF_SAMPLE_PERIOD		= 1U << 8,
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
 };
 
 /*
@@ -126,9 +128,9 @@ enum perf_counter_sample_format {
  * in increasing order of bit value, after the counter value.
  */
 enum perf_counter_read_format {
-	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1U << 0,
-	PERF_FORMAT_TOTAL_TIME_RUNNING	=  1U << 1,
-	PERF_FORMAT_ID			=  1U << 2,
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
 };
 
 /*
@@ -229,12 +231,12 @@ struct perf_counter_mmap_page {
 	__u64   data_head;		/* head in the data section */
 };
 
-#define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
-#define PERF_EVENT_MISC_CPUMODE_UNKNOWN	(0 << 0)
-#define PERF_EVENT_MISC_KERNEL		(1 << 0)
-#define PERF_EVENT_MISC_USER		(2 << 0)
-#define PERF_EVENT_MISC_HYPERVISOR	(3 << 0)
-#define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
+#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0)
+#define PERF_EVENT_MISC_KERNEL			(1 << 0)
+#define PERF_EVENT_MISC_USER			(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
+#define PERF_EVENT_MISC_OVERFLOW		(1 << 2)
 
 struct perf_event_header {
 	__u32	type;
@@ -351,14 +353,14 @@ struct hw_perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
 	union {
 		struct { /* hardware */
-			u64				config;
-			unsigned long			config_base;
-			unsigned long			counter_base;
-			int				idx;
+			u64		config;
+			unsigned long	config_base;
+			unsigned long	counter_base;
+			int		idx;
 		};
 		union { /* software */
-			atomic64_t			count;
-			struct hrtimer			hrtimer;
+			atomic64_t	count;
+			struct hrtimer	hrtimer;
 		};
 	};
 	atomic64_t			prev_count;
@@ -523,37 +525,37 @@ struct perf_counter_context {
 	 * Protect the states of the counters in the list,
 	 * nr_active, and the list:
 	 */
-	spinlock_t		lock;
+	spinlock_t			lock;
 	/*
 	 * Protect the list of counters.  Locking either mutex or lock
 	 * is sufficient to ensure the list doesn't change; to change
 	 * the list you need to lock both the mutex and the spinlock.
 	 */
-	struct mutex		mutex;
+	struct mutex			mutex;
 
-	struct list_head	counter_list;
-	struct list_head	event_list;
-	int			nr_counters;
-	int			nr_active;
-	int			is_active;
-	atomic_t		refcount;
-	struct task_struct	*task;
+	struct list_head		counter_list;
+	struct list_head		event_list;
+	int				nr_counters;
+	int				nr_active;
+	int				is_active;
+	atomic_t			refcount;
+	struct task_struct		*task;
 
 	/*
 	 * Context clock, runs when context enabled.
 	 */
-	u64			time;
-	u64			timestamp;
+	u64				time;
+	u64				timestamp;
 
 	/*
 	 * These fields let us detect when two contexts have both
 	 * been cloned (inherited) from a common ancestor.
 	 */
-	struct perf_counter_context *parent_ctx;
-	u64			parent_gen;
-	u64			generation;
-	int			pin_count;
-	struct rcu_head		rcu_head;
+	struct perf_counter_context	*parent_ctx;
+	u64				parent_gen;
+	u64				generation;
+	int				pin_count;
+	struct rcu_head			rcu_head;
 };
 
 /**
@@ -604,9 +606,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs		*regs;
-	u64			addr;
-	u64			period;
+	struct pt_regs			*regs;
+	u64				addr;
+	u64				period;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -636,11 +638,14 @@ extern void perf_counter_fork(struct task_struct *tsk);
 
 extern void perf_counter_task_migration(struct task_struct *task, int cpu);
 
-#define MAX_STACK_DEPTH		255
+#define MAX_STACK_DEPTH			255
 
 struct perf_callchain_entry {
-	u16	nr, hv, kernel, user;
-	u64	ip[MAX_STACK_DEPTH];
+	u16				nr;
+	u16				hv;
+	u16				kernel;
+	u16				user;
+	u64				ip[MAX_STACK_DEPTH];
 };
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
-- 
cgit v0.10.2


From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:57:55 +0200
Subject: perf_counter: Add counter->id to the throttle event

So as to be able to distuinguish between multiple counters.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1fa1a26..6e13395 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -286,6 +286,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
+	 *	u64				id;
 	 * };
 	 */
 	PERF_EVENT_THROTTLE		= 5,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8859b97..ef5d8a5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2950,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 	struct {
 		struct perf_event_header	header;
 		u64				time;
+		u64				id;
 	} throttle_event = {
 		.header = {
 			.type = PERF_EVENT_THROTTLE + 1,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time = sched_clock(),
+		.time	= sched_clock(),
+		.id	= counter->id,
 	};
 
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
-- 
cgit v0.10.2


From 8dc8e5e8bc0ce00b0f656bf972f67cd8a72759e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 16:13:24 +0200
Subject: perf_counter: Turn off by default

Perfcounters were enabled by default to help testing - but now that we
are submitting it upstream, make it default-disabled.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/init/Kconfig b/init/Kconfig
index 8158f1f..aef16f9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -941,7 +941,6 @@ menu "Performance Counters"
 config PERF_COUNTERS
 	bool "Kernel Performance Counters"
 	depends on HAVE_PERF_COUNTERS
-	default y
 	select ANON_INODES
 	help
 	  Enable kernel support for performance counter hardware.
-- 
cgit v0.10.2


From d2f11bf7fc630e27dfcede367399dddf40521878 Mon Sep 17 00:00:00 2001
From: Justin Chen <jchen@hpdst41.cup.hp.com>
Date: Thu, 11 Jun 2009 13:04:59 +0100
Subject: FRV: bitops: Change the bitmap index from int to unsigned long

Change the index to unsigned long in all bitops for [frv]

Signed-off-by: Justin Chen <justin.chen@hp.com>
Reviewed-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/include/asm/bitops.h b/arch/frv/include/asm/bitops.h
index 287f6f6..50ae91b 100644
--- a/arch/frv/include/asm/bitops.h
+++ b/arch/frv/include/asm/bitops.h
@@ -112,7 +112,7 @@ extern unsigned long atomic_test_and_XOR_mask(unsigned long mask, volatile unsig
 #define atomic_clear_mask(mask, v)	atomic_test_and_ANDNOT_mask((mask), (v))
 #define atomic_set_mask(mask, v)	atomic_test_and_OR_mask((mask), (v))
 
-static inline int test_and_clear_bit(int nr, volatile void *addr)
+static inline int test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -120,7 +120,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr)
 	return (atomic_test_and_ANDNOT_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_set_bit(int nr, volatile void *addr)
+static inline int test_and_set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -128,7 +128,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr)
 	return (atomic_test_and_OR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_change_bit(int nr, volatile void *addr)
+static inline int test_and_change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -136,22 +136,22 @@ static inline int test_and_change_bit(int nr, volatile void *addr)
 	return (atomic_test_and_XOR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline void clear_bit(int nr, volatile void *addr)
+static inline void clear_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_clear_bit(nr, addr);
 }
 
-static inline void set_bit(int nr, volatile void *addr)
+static inline void set_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_set_bit(nr, addr);
 }
 
-static inline void change_bit(int nr, volatile void * addr)
+static inline void change_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_change_bit(nr, addr);
 }
 
-static inline void __clear_bit(int nr, volatile void * addr)
+static inline void __clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -161,7 +161,7 @@ static inline void __clear_bit(int nr, volatile void * addr)
 	*a &= ~mask;
 }
 
-static inline void __set_bit(int nr, volatile void * addr)
+static inline void __set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -171,7 +171,7 @@ static inline void __set_bit(int nr, volatile void * addr)
 	*a |= mask;
 }
 
-static inline void __change_bit(int nr, volatile void *addr)
+static inline void __change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -181,7 +181,7 @@ static inline void __change_bit(int nr, volatile void *addr)
 	*a ^= mask;
 }
 
-static inline int __test_and_clear_bit(int nr, volatile void * addr)
+static inline int __test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -193,7 +193,7 @@ static inline int __test_and_clear_bit(int nr, volatile void * addr)
 	return retval;
 }
 
-static inline int __test_and_set_bit(int nr, volatile void * addr)
+static inline int __test_and_set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -205,7 +205,7 @@ static inline int __test_and_set_bit(int nr, volatile void * addr)
 	return retval;
 }
 
-static inline int __test_and_change_bit(int nr, volatile void * addr)
+static inline int __test_and_change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -220,12 +220,13 @@ static inline int __test_and_change_bit(int nr, volatile void * addr)
 /*
  * This routine doesn't need to be atomic.
  */
-static inline int __constant_test_bit(int nr, const volatile void * addr)
+static inline int
+__constant_test_bit(unsigned long nr, const volatile void *addr)
 {
 	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
 }
 
-static inline int __test_bit(int nr, const volatile void * addr)
+static inline int __test_bit(unsigned long nr, const volatile void *addr)
 {
 	int 	* a = (int *) addr;
 	int	mask;
-- 
cgit v0.10.2


From db5c444eeb781788a0db36a2682b2417cf71f764 Mon Sep 17 00:00:00 2001
From: Stoyan Gaydarov <stoyboyker@gmail.com>
Date: Thu, 11 Jun 2009 13:05:04 +0100
Subject: FRV: BUG to BUG_ON changes

Change some BUG()'s to BUG_ON()'s.

Signed-off-by: Stoyan Gaydarov <stoyboyker@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 585d9b4..cc685e6 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -87,8 +87,7 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev,
 				       dma_addr_t dma_handle,
 				       size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-                BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long)bus_to_virt(dma_handle),
 			    (unsigned long)bus_to_virt(dma_handle) + size);
@@ -105,9 +104,7 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
 				   int nelems, int direction)
 {
 	int i;
-
-	if (direction == PCI_DMA_NONE)
-                BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 
 	for (i = 0; i < nelems; i++)
 		frv_cache_wback_inv(sg_dma_address(&sg[i]),
diff --git a/arch/frv/kernel/uaccess.c b/arch/frv/kernel/uaccess.c
index 9fb771a..374f88d 100644
--- a/arch/frv/kernel/uaccess.c
+++ b/arch/frv/kernel/uaccess.c
@@ -23,8 +23,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	char *p, ch;
 	long err = -EFAULT;
 
-	if (count < 0)
-		BUG();
+	BUG_ON(count < 0);
 
 	p = dst;
 
@@ -76,8 +75,7 @@ long strnlen_user(const char __user *src, long count)
 	long err = 0;
 	char ch;
 
-	if (count < 0)
-		BUG();
+	BUG_ON(count < 0);
 
 #ifndef CONFIG_MMU
 	if ((unsigned long) src < memory_start)
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 52ff9ae..4e1ba0b 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -116,8 +116,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 			  enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -151,8 +150,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		frv_cache_wback_inv(sg_dma_address(&sg[i]),
 				    sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]));
 
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	return nents;
 }
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index 3ddedeb..45954f0 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -48,8 +48,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 			  enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -81,8 +80,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 	void *vaddr;
 	int i;
 
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	dampr2 = __get_DAMPR(2);
 
-- 
cgit v0.10.2


From aa1913c0214a53568731617c0afbbfa3f59513fb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:09 +0100
Subject: FRV: Remove in-kernel strace code

Remove in-kernel strace code from the FRV arch as it's not really needed any
more.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index 5e7d401..6b15e5d 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -306,387 +306,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	return ret;
 }
 
-int __nongprelbss kstrace;
-
-static const struct {
-	const char	*name;
-	unsigned	argmask;
-} __syscall_name_table[NR_syscalls] = {
-	[0]	= { "restart_syscall"			},
-	[1]	= { "exit",		0x000001	},
-	[2]	= { "fork",		0xffffff	},
-	[3]	= { "read",		0x000141	},
-	[4]	= { "write",		0x000141	},
-	[5]	= { "open",		0x000235	},
-	[6]	= { "close",		0x000001	},
-	[7]	= { "waitpid",		0x000141	},
-	[8]	= { "creat",		0x000025	},
-	[9]	= { "link",		0x000055	},
-	[10]	= { "unlink",		0x000005	},
-	[11]	= { "execve",		0x000445	},
-	[12]	= { "chdir",		0x000005	},
-	[13]	= { "time",		0x000004	},
-	[14]	= { "mknod",		0x000325	},
-	[15]	= { "chmod",		0x000025	},
-	[16]	= { "lchown",		0x000025	},
-	[17]	= { "break" },
-	[18]	= { "oldstat",		0x000045	},
-	[19]	= { "lseek",		0x000131	},
-	[20]	= { "getpid",		0xffffff	},
-	[21]	= { "mount",		0x043555	},
-	[22]	= { "umount",		0x000005	},
-	[23]	= { "setuid",		0x000001	},
-	[24]	= { "getuid",		0xffffff	},
-	[25]	= { "stime",		0x000004	},
-	[26]	= { "ptrace",		0x004413	},
-	[27]	= { "alarm",		0x000001	},
-	[28]	= { "oldfstat",		0x000041	},
-	[29]	= { "pause",		0xffffff	},
-	[30]	= { "utime",		0x000045	},
-	[31]	= { "stty" },
-	[32]	= { "gtty" },
-	[33]	= { "access",		0x000025	},
-	[34]	= { "nice",		0x000001	},
-	[35]	= { "ftime" },
-	[36]	= { "sync",		0xffffff	},
-	[37]	= { "kill",		0x000011	},
-	[38]	= { "rename",		0x000055	},
-	[39]	= { "mkdir",		0x000025	},
-	[40]	= { "rmdir",		0x000005	},
-	[41]	= { "dup",		0x000001	},
-	[42]	= { "pipe",		0x000004	},
-	[43]	= { "times",		0x000004	},
-	[44]	= { "prof" },
-	[45]	= { "brk",		0x000004	},
-	[46]	= { "setgid",		0x000001	},
-	[47]	= { "getgid",		0xffffff	},
-	[48]	= { "signal",		0x000041	},
-	[49]	= { "geteuid",		0xffffff	},
-	[50]	= { "getegid",		0xffffff	},
-	[51]	= { "acct",		0x000005	},
-	[52]	= { "umount2",		0x000035	},
-	[53]	= { "lock" },
-	[54]	= { "ioctl",		0x000331	},
-	[55]	= { "fcntl",		0x000331	},
-	[56]	= { "mpx" },
-	[57]	= { "setpgid",		0x000011	},
-	[58]	= { "ulimit" },
-	[60]	= { "umask",		0x000002	},
-	[61]	= { "chroot",		0x000005	},
-	[62]	= { "ustat",		0x000043	},
-	[63]	= { "dup2",		0x000011	},
-	[64]	= { "getppid",		0xffffff	},
-	[65]	= { "getpgrp",		0xffffff	},
-	[66]	= { "setsid",		0xffffff	},
-	[67]	= { "sigaction" },
-	[68]	= { "sgetmask" },
-	[69]	= { "ssetmask" },
-	[70]	= { "setreuid" },
-	[71]	= { "setregid" },
-	[72]	= { "sigsuspend" },
-	[73]	= { "sigpending" },
-	[74]	= { "sethostname" },
-	[75]	= { "setrlimit" },
-	[76]	= { "getrlimit" },
-	[77]	= { "getrusage" },
-	[78]	= { "gettimeofday" },
-	[79]	= { "settimeofday" },
-	[80]	= { "getgroups" },
-	[81]	= { "setgroups" },
-	[82]	= { "select" },
-	[83]	= { "symlink" },
-	[84]	= { "oldlstat" },
-	[85]	= { "readlink" },
-	[86]	= { "uselib" },
-	[87]	= { "swapon" },
-	[88]	= { "reboot" },
-	[89]	= { "readdir" },
-	[91]	= { "munmap",		0x000034	},
-	[92]	= { "truncate" },
-	[93]	= { "ftruncate" },
-	[94]	= { "fchmod" },
-	[95]	= { "fchown" },
-	[96]	= { "getpriority" },
-	[97]	= { "setpriority" },
-	[99]	= { "statfs" },
-	[100]	= { "fstatfs" },
-	[102]	= { "socketcall" },
-	[103]	= { "syslog" },
-	[104]	= { "setitimer" },
-	[105]	= { "getitimer" },
-	[106]	= { "stat" },
-	[107]	= { "lstat" },
-	[108]	= { "fstat" },
-	[111]	= { "vhangup" },
-	[114]	= { "wait4" },
-	[115]	= { "swapoff" },
-	[116]	= { "sysinfo" },
-	[117]	= { "ipc" },
-	[118]	= { "fsync" },
-	[119]	= { "sigreturn" },
-	[120]	= { "clone" },
-	[121]	= { "setdomainname" },
-	[122]	= { "uname" },
-	[123]	= { "modify_ldt" },
-	[123]	= { "cacheflush" },
-	[124]	= { "adjtimex" },
-	[125]	= { "mprotect" },
-	[126]	= { "sigprocmask" },
-	[127]	= { "create_module" },
-	[128]	= { "init_module" },
-	[129]	= { "delete_module" },
-	[130]	= { "get_kernel_syms" },
-	[131]	= { "quotactl" },
-	[132]	= { "getpgid" },
-	[133]	= { "fchdir" },
-	[134]	= { "bdflush" },
-	[135]	= { "sysfs" },
-	[136]	= { "personality" },
-	[137]	= { "afs_syscall" },
-	[138]	= { "setfsuid" },
-	[139]	= { "setfsgid" },
-	[140]	= { "_llseek",			0x014331	},
-	[141]	= { "getdents" },
-	[142]	= { "_newselect",		0x000141	},
-	[143]	= { "flock" },
-	[144]	= { "msync" },
-	[145]	= { "readv" },
-	[146]	= { "writev" },
-	[147]	= { "getsid",			0x000001	},
-	[148]	= { "fdatasync",		0x000001	},
-	[149]	= { "_sysctl",			0x000004	},
-	[150]	= { "mlock" },
-	[151]	= { "munlock" },
-	[152]	= { "mlockall" },
-	[153]	= { "munlockall" },
-	[154]	= { "sched_setparam" },
-	[155]	= { "sched_getparam" },
-	[156]	= { "sched_setscheduler" },
-	[157]	= { "sched_getscheduler" },
-	[158]	= { "sched_yield" },
-	[159]	= { "sched_get_priority_max" },
-	[160]	= { "sched_get_priority_min" },
-	[161]	= { "sched_rr_get_interval" },
-	[162]	= { "nanosleep",		0x000044	},
-	[163]	= { "mremap" },
-	[164]	= { "setresuid" },
-	[165]	= { "getresuid" },
-	[166]	= { "vm86" },
-	[167]	= { "query_module" },
-	[168]	= { "poll" },
-	[169]	= { "nfsservctl" },
-	[170]	= { "setresgid" },
-	[171]	= { "getresgid" },
-	[172]	= { "prctl",			0x333331	},
-	[173]	= { "rt_sigreturn",		0xffffff	},
-	[174]	= { "rt_sigaction",		0x001441	},
-	[175]	= { "rt_sigprocmask",		0x001441	},
-	[176]	= { "rt_sigpending",		0x000014	},
-	[177]	= { "rt_sigtimedwait",		0x001444	},
-	[178]	= { "rt_sigqueueinfo",		0x000411	},
-	[179]	= { "rt_sigsuspend",		0x000014	},
-	[180]	= { "pread",			0x003341	},
-	[181]	= { "pwrite",			0x003341	},
-	[182]	= { "chown",			0x000115	},
-	[183]	= { "getcwd" },
-	[184]	= { "capget" },
-	[185]	= { "capset" },
-	[186]	= { "sigaltstack" },
-	[187]	= { "sendfile" },
-	[188]	= { "getpmsg" },
-	[189]	= { "putpmsg" },
-	[190]	= { "vfork",			0xffffff	},
-	[191]	= { "ugetrlimit" },
-	[192]	= { "mmap2",			0x313314	},
-	[193]	= { "truncate64" },
-	[194]	= { "ftruncate64" },
-	[195]	= { "stat64",			0x000045	},
-	[196]	= { "lstat64",			0x000045	},
-	[197]	= { "fstat64",			0x000041	},
-	[198]	= { "lchown32" },
-	[199]	= { "getuid32",			0xffffff	},
-	[200]	= { "getgid32",			0xffffff	},
-	[201]	= { "geteuid32",		0xffffff	},
-	[202]	= { "getegid32",		0xffffff	},
-	[203]	= { "setreuid32" },
-	[204]	= { "setregid32" },
-	[205]	= { "getgroups32" },
-	[206]	= { "setgroups32" },
-	[207]	= { "fchown32" },
-	[208]	= { "setresuid32" },
-	[209]	= { "getresuid32" },
-	[210]	= { "setresgid32" },
-	[211]	= { "getresgid32" },
-	[212]	= { "chown32" },
-	[213]	= { "setuid32" },
-	[214]	= { "setgid32" },
-	[215]	= { "setfsuid32" },
-	[216]	= { "setfsgid32" },
-	[217]	= { "pivot_root" },
-	[218]	= { "mincore" },
-	[219]	= { "madvise" },
-	[220]	= { "getdents64" },
-	[221]	= { "fcntl64" },
-	[223]	= { "security" },
-	[224]	= { "gettid" },
-	[225]	= { "readahead" },
-	[226]	= { "setxattr" },
-	[227]	= { "lsetxattr" },
-	[228]	= { "fsetxattr" },
-	[229]	= { "getxattr" },
-	[230]	= { "lgetxattr" },
-	[231]	= { "fgetxattr" },
-	[232]	= { "listxattr" },
-	[233]	= { "llistxattr" },
-	[234]	= { "flistxattr" },
-	[235]	= { "removexattr" },
-	[236]	= { "lremovexattr" },
-	[237]	= { "fremovexattr" },
-	[238]	= { "tkill" },
-	[239]	= { "sendfile64" },
-	[240]	= { "futex" },
-	[241]	= { "sched_setaffinity" },
-	[242]	= { "sched_getaffinity" },
-	[243]	= { "set_thread_area" },
-	[244]	= { "get_thread_area" },
-	[245]	= { "io_setup" },
-	[246]	= { "io_destroy" },
-	[247]	= { "io_getevents" },
-	[248]	= { "io_submit" },
-	[249]	= { "io_cancel" },
-	[250]	= { "fadvise64" },
-	[252]	= { "exit_group",		0x000001	},
-	[253]	= { "lookup_dcookie" },
-	[254]	= { "epoll_create" },
-	[255]	= { "epoll_ctl" },
-	[256]	= { "epoll_wait" },
-	[257]	= { "remap_file_pages" },
-	[258]	= { "set_tid_address" },
-	[259]	= { "timer_create" },
-	[260]	= { "timer_settime" },
-	[261]	= { "timer_gettime" },
-	[262]	= { "timer_getoverrun" },
-	[263]	= { "timer_delete" },
-	[264]	= { "clock_settime" },
-	[265]	= { "clock_gettime" },
-	[266]	= { "clock_getres" },
-	[267]	= { "clock_nanosleep" },
-	[268]	= { "statfs64" },
-	[269]	= { "fstatfs64" },
-	[270]	= { "tgkill" },
-	[271]	= { "utimes" },
-	[272]	= { "fadvise64_64" },
-	[273]	= { "vserver" },
-	[274]	= { "mbind" },
-	[275]	= { "get_mempolicy" },
-	[276]	= { "set_mempolicy" },
-	[277]	= { "mq_open" },
-	[278]	= { "mq_unlink" },
-	[279]	= { "mq_timedsend" },
-	[280]	= { "mq_timedreceive" },
-	[281]	= { "mq_notify" },
-	[282]	= { "mq_getsetattr" },
-	[283]	= { "sys_kexec_load" },
-};
-
 asmlinkage void do_syscall_trace(int leaving)
 {
-#if 0
-	unsigned long *argp;
-	const char *name;
-	unsigned argmask;
-	char buffer[16];
-
-	if (!kstrace)
-		return;
-
-	if (!current->mm)
-		return;
-
-	if (__frame->gr7 == __NR_close)
-		return;
-
-#if 0
-	if (__frame->gr7 != __NR_mmap2 &&
-	    __frame->gr7 != __NR_vfork &&
-	    __frame->gr7 != __NR_execve &&
-	    __frame->gr7 != __NR_exit)
-		return;
-#endif
-
-	argmask = 0;
-	name = NULL;
-	if (__frame->gr7 < NR_syscalls) {
-		name = __syscall_name_table[__frame->gr7].name;
-		argmask = __syscall_name_table[__frame->gr7].argmask;
-	}
-	if (!name) {
-		sprintf(buffer, "sys_%lx", __frame->gr7);
-		name = buffer;
-	}
-
-	if (!leaving) {
-		if (!argmask) {
-			printk(KERN_CRIT "[%d] %s(%lx,%lx,%lx,%lx,%lx,%lx)\n",
-			       current->pid,
-			       name,
-			       __frame->gr8,
-			       __frame->gr9,
-			       __frame->gr10,
-			       __frame->gr11,
-			       __frame->gr12,
-			       __frame->gr13);
-		}
-		else if (argmask == 0xffffff) {
-			printk(KERN_CRIT "[%d] %s()\n",
-			       current->pid,
-			       name);
-		}
-		else {
-			printk(KERN_CRIT "[%d] %s(",
-			       current->pid,
-			       name);
-
-			argp = &__frame->gr8;
-
-			do {
-				switch (argmask & 0xf) {
-				case 1:
-					printk("%ld", (long) *argp);
-					break;
-				case 2:
-					printk("%lo", *argp);
-					break;
-				case 3:
-					printk("%lx", *argp);
-					break;
-				case 4:
-					printk("%p", (void *) *argp);
-					break;
-				case 5:
-					printk("\"%s\"", (char *) *argp);
-					break;
-				}
-
-				argp++;
-				argmask >>= 4;
-				if (argmask)
-					printk(",");
-
-			} while (argmask);
-
-			printk(")\n");
-		}
-	}
-	else {
-		if ((int)__frame->gr8 > -4096 && (int)__frame->gr8 < 4096)
-			printk(KERN_CRIT "[%d] %s() = %ld\n", current->pid, name, __frame->gr8);
-		else
-			printk(KERN_CRIT "[%d] %s() = %lx\n", current->pid, name, __frame->gr8);
-	}
-	return;
-#endif
-
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 
-- 
cgit v0.10.2


From b7bab880c795ec620041ef0295cbbbc5a726f414 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:14 +0100
Subject: FRV: Implement TIF_NOTIFY_RESUME

Implement the TIF_NOTIFY_RESUME thread flag, making it call do_notify_resume()
which then clears it.  This will be made use of later by tracehooks in the
new-style ptrace implementation

Also discard TIF_IRET as that's not used by FRV.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index bb53ab7..e8a5ed7 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -109,20 +109,20 @@ register struct thread_info *__current_thread_info asm("gr15");
  * - other flags in MSW
  */
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
-#define TIF_SIGPENDING		1	/* signal pending */
-#define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_SINGLESTEP		3	/* restore singlestep on return to user mode */
-#define TIF_IRET		4	/* return with iret */
+#define TIF_NOTIFY_RESUME	1	/* callback before returning to user */
+#define TIF_SIGPENDING		2	/* signal pending */
+#define TIF_NEED_RESCHED	3	/* rescheduling necessary */
+#define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17	/* OOM killer killed process */
 #define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
-#define _TIF_IRET		(1 << TIF_IRET)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 3bdb368..7ae290a 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -564,4 +564,9 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal();
 
+	/* deal with notification on about to resume userspace execution */
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+	}
+
 } /* end do_notify_resume() */
-- 
cgit v0.10.2


From 24ceb7e8a6c5dd6e32ac3d43a2424542c97989f5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:19 +0100
Subject: FRV: Don't turn on TIF_SYSCALL_TRACE unconditionally in syscall
 prologue

Don't turn on TIF_SYSCALL_TRACE unconditionally in syscall prologue in FRV's
entry.S.  This was originally for debugging stuff and should have been removed
a long time ago.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 1da523b..268dfbd 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -886,7 +886,6 @@ system_call:
 	bnc		icc0,#0,__syscall_badsys
 
 	ldi		@(gr15,#TI_FLAGS),gr4
-	ori		gr4,#_TIF_SYSCALL_TRACE,gr4
 	andicc		gr4,#_TIF_SYSCALL_TRACE,gr0,icc0
 	bne		icc0,#0,__syscall_trace_entry
 
-- 
cgit v0.10.2


From 4a3b98932270f5d69f2c081924e356325ed704d9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:24 +0100
Subject: FRV: Implement new-style ptrace

Implement the new-style ptrace for FRV, including adding appropriate
tracehooks.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 9d1552a..8a5bd7a 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -6,6 +6,7 @@ config FRV
 	bool
 	default y
 	select HAVE_IDE
+	select HAVE_ARCH_TRACEHOOK
 
 config ZONE_DMA
 	bool
diff --git a/arch/frv/include/asm/elf.h b/arch/frv/include/asm/elf.h
index 7279ec07d..7bbf6e4 100644
--- a/arch/frv/include/asm/elf.h
+++ b/arch/frv/include/asm/elf.h
@@ -116,6 +116,7 @@ do {											\
 } while(0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_FDPIC_CORE_EFLAGS	EF_FRV_FDPIC
 #define ELF_EXEC_PAGESIZE	16384
 
diff --git a/arch/frv/include/asm/ptrace.h b/arch/frv/include/asm/ptrace.h
index cf69340..a54b535 100644
--- a/arch/frv/include/asm/ptrace.h
+++ b/arch/frv/include/asm/ptrace.h
@@ -65,6 +65,8 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
+struct task_struct;
+
 /*
  * we dedicate GR28 to keeping a pointer to the current exception frame
  * - gr28 is destroyed on entry to the kernel from userspace
@@ -73,11 +75,18 @@ register struct pt_regs *__frame asm("gr28");
 
 #define user_mode(regs)			(!((regs)->psr & PSR_S))
 #define instruction_pointer(regs)	((regs)->pc)
+#define user_stack_pointer(regs)	((regs)->sp)
 
 extern unsigned long user_stack(const struct pt_regs *);
 extern void show_regs(struct pt_regs *);
 #define profile_pc(regs) ((regs)->pc)
-#endif
+
+#define task_pt_regs(task) ((task)->thread.frame0)
+
+#define arch_has_single_step()	(1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
 
 #endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
 #endif /* _ASM_PTRACE_H */
diff --git a/arch/frv/include/asm/syscall.h b/arch/frv/include/asm/syscall.h
new file mode 100644
index 0000000..70689eb
--- /dev/null
+++ b/arch/frv/include/asm/syscall.h
@@ -0,0 +1,123 @@
+/* syscall parameter access functions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_SYSCALL_H
+#define _ASM_SYSCALL_H
+
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+/*
+ * Get the system call number or -1
+ */
+static inline long syscall_get_nr(struct task_struct *task,
+				  struct pt_regs *regs)
+{
+	return regs->syscallno;
+}
+
+/*
+ * Restore the clobbered GR8 register
+ * (1st syscall arg was overwritten with syscall return or error)
+ */
+static inline void syscall_rollback(struct task_struct *task,
+				    struct pt_regs *regs)
+{
+	regs->gr8 = regs->orig_gr8;
+}
+
+/*
+ * See if the syscall return value is an error, returning it if it is and 0 if
+ * not
+ */
+static inline long syscall_get_error(struct task_struct *task,
+				     struct pt_regs *regs)
+{
+	return IS_ERR_VALUE(regs->gr8) ? regs->gr8 : 0;
+}
+
+/*
+ * Get the syscall return value
+ */
+static inline long syscall_get_return_value(struct task_struct *task,
+					    struct pt_regs *regs)
+{
+	return regs->gr8;
+}
+
+/*
+ * Set the syscall return value
+ */
+static inline void syscall_set_return_value(struct task_struct *task,
+					    struct pt_regs *regs,
+					    int error, long val)
+{
+	if (error)
+		regs->gr8 = -error;
+	else
+		regs->gr8 = val;
+}
+
+/*
+ * Retrieve the system call arguments
+ */
+static inline void syscall_get_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 unsigned long *args)
+{
+	/*
+	 * Do this simply for now. If we need to start supporting
+	 * fetching arguments from arbitrary indices, this will need some
+	 * extra logic. Presently there are no in-tree users that depend
+	 * on this behaviour.
+	 */
+	BUG_ON(i);
+
+	/* Argument pattern is: GR8, GR9, GR10, GR11, GR12, GR13 */
+	switch (n) {
+	case 6: args[5] = regs->gr13;
+	case 5: args[4] = regs->gr12;
+	case 4: args[3] = regs->gr11;
+	case 3: args[2] = regs->gr10;
+	case 2: args[1] = regs->gr9;
+	case 1:	args[0] = regs->gr8;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Alter the system call arguments
+ */
+static inline void syscall_set_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 const unsigned long *args)
+{
+	/* Same note as above applies */
+	BUG_ON(i);
+
+	switch (n) {
+	case 6: regs->gr13 = args[5];
+	case 5: regs->gr12 = args[4];
+	case 4: regs->gr11 = args[3];
+	case 3: regs->gr10 = args[2];
+	case 2: regs->gr9  = args[1];
+	case 1: regs->gr8  = args[0];
+		break;
+	default:
+		BUG();
+	}
+}
+
+#endif /* _ASM_SYSCALL_H */
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 268dfbd..356e0e3 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1149,11 +1149,10 @@ __entry_work_notifysig:
 	# perform syscall entry tracing
 __syscall_trace_entry:
 	LEDS		0x6320
-	setlos.p	#0,gr8
-	call		do_syscall_trace
+	call		syscall_trace_entry
 
-	ldi		@(gr28,#REG_SYSCALLNO),gr7
-	lddi		@(gr28,#REG_GR(8)) ,gr8
+	lddi.p		@(gr28,#REG_GR(8)) ,gr8
+	ori		gr8,#0,gr7		; syscall_trace_entry() returned new syscallno
 	lddi		@(gr28,#REG_GR(10)),gr10
 	lddi.p		@(gr28,#REG_GR(12)),gr12
 
@@ -1168,11 +1167,10 @@ __syscall_exit_work:
 	beq		icc0,#1,__entry_work_pending
 
 	movsg		psr,gr23
-	andi		gr23,#~PSR_PIL,gr23	; could let do_syscall_trace() call schedule()
+	andi		gr23,#~PSR_PIL,gr23	; could let syscall_trace_exit() call schedule()
 	movgs		gr23,psr
 
-	setlos.p	#1,gr8
-	call		do_syscall_trace
+	call		syscall_trace_exit
 	bra		__entry_resume_userspace
 
 __syscall_badsys:
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index 6b15e5d..60eeed3 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -19,6 +19,9 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/signal.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -33,6 +36,169 @@
  */
 
 /*
+ * retrieve the contents of FRV userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	const struct user_int_regs *iregs = &target->thread.user->i;
+	int ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  iregs, 0, sizeof(*iregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*iregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct user_int_regs *iregs = &target->thread.user->i;
+	unsigned int offs_gr0, offs_gr1;
+	int ret;
+
+	/* not allowed to set PSR or __status */
+	if (pos < offsetof(struct user_int_regs, psr) + sizeof(long) &&
+	    pos + count > offsetof(struct user_int_regs, psr))
+		return -EIO;
+
+	if (pos < offsetof(struct user_int_regs, __status) + sizeof(long) &&
+	    pos + count > offsetof(struct user_int_regs, __status))
+		return -EIO;
+
+	/* set the control regs */
+	offs_gr0 = offsetof(struct user_int_regs, gr[0]);
+	offs_gr1 = offsetof(struct user_int_regs, gr[1]);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 iregs, 0, offs_gr0);
+	if (ret < 0)
+		return ret;
+
+	/* skip GR0/TBR */
+	ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					offs_gr0, offs_gr1);
+	if (ret < 0)
+		return ret;
+
+	/* set the general regs */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &iregs->gr[1], offs_gr1, sizeof(*iregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					sizeof(*iregs), -1);
+}
+
+/*
+ * retrieve the contents of FRV userspace FP/Media registers
+ */
+static int fpmregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	const struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+	int ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace FP/Media registers
+ */
+static int fpmregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+	int ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
+}
+
+/*
+ * determine if the FP/Media registers have actually been used
+ */
+static int fpmregs_active(struct task_struct *target,
+			  const struct user_regset *regset)
+{
+	return tsk_used_math(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the FRV under Linux
+ */
+enum frv_regset {
+	REGSET_GENERAL,
+	REGSET_FPMEDIA,
+};
+
+static const struct user_regset frv_regsets[] = {
+	/*
+	 * General register format is:
+	 *	PSR, ISR, CCR, CCCR, LR, LCR, PC, (STATUS), SYSCALLNO, ORIG_G8
+	 *	GNER0-1, IACC0, TBR, GR1-63
+	 */
+	[REGSET_GENERAL] = {
+		.core_note_type	= NT_PRSTATUS,
+		.n		= ELF_NGREG,
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= genregs_get,
+		.set		= genregs_set,
+	},
+	/*
+	 * FPU/Media register format is:
+	 *	FR0-63, FNER0-1, MSR0-1, ACC0-7, ACCG0-8, FSR
+	 */
+	[REGSET_FPMEDIA] = {
+		.core_note_type	= NT_PRFPREG,
+		.n		= sizeof(struct user_fpmedia_regs) / sizeof(long),
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= fpmregs_get,
+		.set		= fpmregs_set,
+		.active		= fpmregs_active,
+	},
+};
+
+static const struct user_regset_view user_frv_native_view = {
+	.name		= "frv",
+	.e_machine	= EM_FRV,
+	.regsets	= frv_regsets,
+	.n		= ARRAY_SIZE(frv_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_frv_native_view;
+}
+
+/*
  * Get contents of register REGNO in task TASK.
  */
 static inline long get_reg(struct task_struct *task, int regno)
@@ -69,40 +235,23 @@ static inline int put_reg(struct task_struct *task, int regno,
 }
 
 /*
- * check that an address falls within the bounds of the target process's memory
- * mappings
- */
-static inline int is_user_addr_valid(struct task_struct *child,
-				     unsigned long start, unsigned long len)
-{
-#ifdef CONFIG_MMU
-	if (start >= PAGE_OFFSET || len > PAGE_OFFSET - start)
-		return -EIO;
-	return 0;
-#else
-	struct vm_area_struct *vma;
-
-	vma = find_vma(child->mm, start);
-	if (vma && start >= vma->vm_start && start + len <= vma->vm_end)
-		return 0;
-
-	return -EIO;
-#endif
-}
-
-/*
  * Called by kernel/ptrace.c when detaching..
  *
  * Control h/w single stepping
  */
-void ptrace_disable(struct task_struct *child)
+void user_enable_single_step(struct task_struct *child)
+{
+	child->thread.frame0->__status |= REG__STATUS_STEP;
+}
+
+void user_disable_single_step(struct task_struct *child)
 {
 	child->thread.frame0->__status &= ~REG__STATUS_STEP;
 }
 
-void ptrace_enable(struct task_struct *child)
+void ptrace_disable(struct task_struct *child)
 {
-	child->thread.frame0->__status |= REG__STATUS_STEP;
+	user_disable_single_step(child);
 }
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
@@ -111,15 +260,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	int ret;
 
 	switch (request) {
-		/* when I and D space are separate, these will need to be fixed. */
-	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA:
-		ret = -EIO;
-		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-			break;
-		ret = generic_ptrace_peekdata(child, addr, data);
-		break;
-
 		/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
 		tmp = 0;
@@ -163,15 +303,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 	}
 
-		/* when I and D space are separate, this will have to be fixed. */
-	case PTRACE_POKETEXT: /* write the word at location addr. */
-	case PTRACE_POKEDATA:
-		ret = -EIO;
-		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-			break;
-		ret = generic_ptrace_pokedata(child, addr, data);
-		break;
-
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
 		ret = -EIO;
 		if ((addr & 3) || addr < 0)
@@ -179,7 +310,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 		ret = 0;
 		switch (addr >> 2) {
-		case 0 ... PT__END-1:
+		case 0 ... PT__END - 1:
 			ret = put_reg(child, addr >> 2, data);
 			break;
 
@@ -189,95 +320,29 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		}
 		break;
 
-	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-	case PTRACE_CONT: /* restart after signal. */
-		ret = -EIO;
-		if (!valid_signal(data))
-			break;
-		if (request == PTRACE_SYSCALL)
-			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		else
-			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		child->exit_code = data;
-		ptrace_disable(child);
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-		/* make the child exit.  Best I can do is send it a sigkill.
-		 * perhaps it should be put in the status that it wants to
-		 * exit.
-		 */
-	case PTRACE_KILL:
-		ret = 0;
-		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
-			break;
-		child->exit_code = SIGKILL;
-		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-		ptrace_disable(child);
-		wake_up_process(child);
-		break;
-
-	case PTRACE_SINGLESTEP:  /* set the trap flag. */
-		ret = -EIO;
-		if (!valid_signal(data))
-			break;
-		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		ptrace_enable(child);
-		child->exit_code = data;
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-	case PTRACE_DETACH:	/* detach a process that was attached. */
-		ret = ptrace_detach(child, data);
-		break;
-
-	case PTRACE_GETREGS: { /* Get all integer regs from the child. */
-		int i;
-		for (i = 0; i < PT__GPEND; i++) {
-			tmp = get_reg(child, i);
-			if (put_user(tmp, (unsigned long *) data)) {
-				ret = -EFAULT;
-				break;
-			}
-			data += sizeof(long);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_SETREGS: { /* Set all integer regs in the child. */
-		int i;
-		for (i = 0; i < PT__GPEND; i++) {
-			if (get_user(tmp, (unsigned long *) data)) {
-				ret = -EFAULT;
-				break;
-			}
-			put_reg(child, i, tmp);
-			data += sizeof(long);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_GETFPREGS: { /* Get the child FP/Media state. */
-		ret = 0;
-		if (copy_to_user((void *) data,
-				 &child->thread.user->f,
-				 sizeof(child->thread.user->f)))
-			ret = -EFAULT;
-		break;
-	}
-
-	case PTRACE_SETFPREGS: { /* Set the child FP/Media state. */
-		ret = 0;
-		if (copy_from_user(&child->thread.user->f,
-				   (void *) data,
-				   sizeof(child->thread.user->f)))
-			ret = -EFAULT;
-		break;
-	}
+	case PTRACE_GETREGS:	/* Get all integer regs from the child. */
+		return copy_regset_to_user(child, &user_frv_native_view,
+					   REGSET_GENERAL,
+					   0, sizeof(child->thread.user->i),
+					   (void __user *)data);
+
+	case PTRACE_SETREGS:	/* Set all integer regs in the child. */
+		return copy_regset_from_user(child, &user_frv_native_view,
+					     REGSET_GENERAL,
+					     0, sizeof(child->thread.user->i),
+					     (const void __user *)data);
+
+	case PTRACE_GETFPREGS:	/* Get the child FP/Media state. */
+		return copy_regset_to_user(child, &user_frv_native_view,
+					   REGSET_FPMEDIA,
+					   0, sizeof(child->thread.user->f),
+					   (void __user *)data);
+
+	case PTRACE_SETFPREGS:	/* Set the child FP/Media state. */
+		return copy_regset_from_user(child, &user_frv_native_view,
+					     REGSET_FPMEDIA,
+					     0, sizeof(child->thread.user->f),
+					     (const void __user *)data);
 
 	case PTRACE_GETFDPIC:
 		tmp = 0;
@@ -300,35 +365,36 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	default:
-		ret = -EIO;
+		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
 	return ret;
 }
 
-asmlinkage void do_syscall_trace(int leaving)
+/*
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
+ */
+asmlinkage unsigned long syscall_trace_entry(void)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-
-	/* we need to indicate entry or exit to strace */
-	if (leaving)
-		__frame->__status |= REG__STATUS_SYSC_EXIT;
-	else
-		__frame->__status |= REG__STATUS_SYSC_ENTRY;
+	__frame->__status |= REG__STATUS_SYSC_ENTRY;
+	if (tracehook_report_syscall_entry(__frame)) {
+		/* tracing decided this syscall should not happen, so
+		 * We'll return a bogus call number to get an ENOSYS
+		 * error, but leave the original number in
+		 * __frame->syscallno
+		 */
+		return ULONG_MAX;
+	}
 
-	ptrace_notify(SIGTRAP);
+	return __frame->syscallno;
+}
 
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(void)
+{
+	__frame->__status |= REG__STATUS_SYSC_EXIT;
+	tracehook_report_syscall_exit(__frame, 0);
 }
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 7ae290a..4a7a62c 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -21,6 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/personality.h>
 #include <linux/freezer.h>
+#include <linux/tracehook.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -516,6 +517,9 @@ static void do_signal(void)
 			 * clear the TIF_RESTORE_SIGMASK flag */
 			if (test_thread_flag(TIF_RESTORE_SIGMASK))
 				clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+			tracehook_signal_handler(signr, &info, &ka, __frame,
+						 test_thread_flag(TIF_SINGLESTEP));
 		}
 
 		return;
@@ -567,6 +571,7 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	/* deal with notification on about to resume userspace execution */
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(__frame);
 	}
 
 } /* end do_notify_resume() */
-- 
cgit v0.10.2


From fd4f683d045e053abb093f80d81afce30ceadad2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:08:32 +0100
Subject: MN10300: Don't set the dirty bit in the DTLB entries in the TLB-miss
 handler

Remove the special handling for the Data TLB entry dirty bit in the TLB-miss
handler.  As the code stands, all that it does is to cause us to take a second
data address exception to set the dirty bit.  Instead, we can just let
pte_mkdirty() set the bit.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/mn10300/mm/tlb-mn10300.S b/arch/mn10300/mm/tlb-mn10300.S
index 7892080..7095147 100644
--- a/arch/mn10300/mm/tlb-mn10300.S
+++ b/arch/mn10300/mm/tlb-mn10300.S
@@ -165,24 +165,6 @@ ENTRY(itlb_aerror)
 ENTRY(dtlb_aerror)
 	and	~EPSW_NMID,epsw
 	add	-4,sp
-	mov	d1,(sp)
-
-	movhu	(MMUFCR_DFC),d1			# is it the initial valid write
-						# to this page?
-	and	MMUFCR_xFC_INITWR,d1
- 	beq	dtlb_pagefault			# jump if not
-
-	mov	(DPTEL),d1			# set the dirty bit
-						# (don't replace with BSET!)
-	or	_PAGE_DIRTY,d1
-	mov	d1,(DPTEL)
-	mov	(sp),d1
-	add	4,sp
- 	rti
-
-	ALIGN
-dtlb_pagefault:
-	mov	(sp),d1
 	SAVE_ALL
 	add	-4,sp				# need to pass three params
 
-- 
cgit v0.10.2


From 5d289964e1f1e8a2ec4289274bf15bce6a4f8ab8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:08:37 +0100
Subject: MN10300: Add utrace/tracehooks support

Add utrace/tracehooks support to MN10300.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 3559267..89faaca 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -8,6 +8,7 @@ mainmenu "Linux Kernel Configuration"
 config MN10300
 	def_bool y
 	select HAVE_OPROFILE
+	select HAVE_ARCH_TRACEHOOK
 
 config AM33
 	def_bool y
diff --git a/arch/mn10300/include/asm/elf.h b/arch/mn10300/include/asm/elf.h
index bf09f8b..4910546 100644
--- a/arch/mn10300/include/asm/elf.h
+++ b/arch/mn10300/include/asm/elf.h
@@ -34,7 +34,7 @@
  */
 typedef unsigned long elf_greg_t;
 
-#define ELF_NGREG (sizeof (struct pt_regs) / sizeof(elf_greg_t))
+#define ELF_NGREG ((sizeof(struct pt_regs) / sizeof(elf_greg_t)) - 1)
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
 #define ELF_NFPREG 32
@@ -76,6 +76,7 @@ do {									\
 } while (0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE	4096
 
 /*
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index 7323927..f7d4b0d 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -143,13 +143,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 unsigned long get_wchan(struct task_struct *p);
 
-#define task_pt_regs(task)						\
-({									\
-       struct pt_regs *__regs__;					\
-       __regs__ = (struct pt_regs *) (KSTK_TOP(task_stack_page(task)) - 8); \
-       __regs__ - 1;							\
-})
-
+#define task_pt_regs(task) ((task)->thread.uregs)
 #define KSTK_EIP(task) (task_pt_regs(task)->pc)
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
diff --git a/arch/mn10300/include/asm/ptrace.h b/arch/mn10300/include/asm/ptrace.h
index 7b06cc6..921942e 100644
--- a/arch/mn10300/include/asm/ptrace.h
+++ b/arch/mn10300/include/asm/ptrace.h
@@ -91,9 +91,17 @@ extern struct pt_regs *__frame; /* current frame pointer */
 #if defined(__KERNEL__)
 
 #if !defined(__ASSEMBLY__)
+struct task_struct;
+
 #define user_mode(regs)			(((regs)->epsw & EPSW_nSL) == EPSW_nSL)
 #define instruction_pointer(regs)	((regs)->pc)
+#define user_stack_pointer(regs)	((regs)->sp)
 extern void show_regs(struct pt_regs *);
+
+#define arch_has_single_step()	(1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
 #endif  /*  !__ASSEMBLY  */
 
 #define profile_pc(regs) ((regs)->pc)
diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S
index 3dc3e46..7408a27 100644
--- a/arch/mn10300/kernel/entry.S
+++ b/arch/mn10300/kernel/entry.S
@@ -76,7 +76,7 @@ ENTRY(system_call)
 	cmp	nr_syscalls,d0
 	bcc	syscall_badsys
 	btst	_TIF_SYSCALL_TRACE,(TI_flags,a2)
-	bne	syscall_trace_entry
+	bne	syscall_entry_trace
 syscall_call:
 	add	d0,d0,a1
 	add	a1,a1
@@ -104,11 +104,10 @@ restore_all:
 syscall_exit_work:
 	btst	_TIF_SYSCALL_TRACE,d2
 	beq	work_pending
-	__sti				# could let do_syscall_trace() call
+	__sti				# could let syscall_trace_exit() call
 					# schedule() instead
 	mov	fp,d0
-	mov	1,d1
-	call	do_syscall_trace[],0	# do_syscall_trace(regs,entryexit)
+	call	syscall_trace_exit[],0	# do_syscall_trace(regs)
 	jmp	resume_userspace
 
 	ALIGN
@@ -138,13 +137,11 @@ work_notifysig:
 	jmp	resume_userspace
 
 	# perform syscall entry tracing
-syscall_trace_entry:
+syscall_entry_trace:
 	mov	-ENOSYS,d0
 	mov	d0,(REG_D0,fp)
 	mov	fp,d0
-	clr	d1
-	call	do_syscall_trace[],0
-	mov	(REG_ORIG_D0,fp),d0
+	call	syscall_trace_entry[],0	# returns the syscall number to actually use
 	mov	(REG_D1,fp),d1
 	cmp	nr_syscalls,d0
 	bcs	syscall_call
diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c
index d6d6cdc..e143339 100644
--- a/arch/mn10300/kernel/ptrace.c
+++ b/arch/mn10300/kernel/ptrace.c
@@ -17,6 +17,9 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -64,12 +67,6 @@ static inline int get_stack_long(struct task_struct *task, int offset)
 		((unsigned long) task->thread.uregs + offset);
 }
 
-/*
- * this routine will put a word on the processes privileged stack.
- * the offset is how far from the base addr as stored in the TSS.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
 static inline
 int put_stack_long(struct task_struct *task, int offset, unsigned long data)
 {
@@ -80,94 +77,233 @@ int put_stack_long(struct task_struct *task, int offset, unsigned long data)
 	return 0;
 }
 
-static inline unsigned long get_fpregs(struct fpu_state_struct *buf,
-				       struct task_struct *tsk)
+/*
+ * retrieve the contents of MN10300 userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
 {
-	return __copy_to_user(buf, &tsk->thread.fpu_state,
-			      sizeof(struct fpu_state_struct));
+	const struct pt_regs *regs = task_pt_regs(target);
+	int ret;
+
+	/* we need to skip regs->next */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  regs, 0, PT_ORIG_D0 * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+				  NR_PTREGS * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					NR_PTREGS * sizeof(long), -1);
 }
 
-static inline unsigned long set_fpregs(struct task_struct *tsk,
-				       struct fpu_state_struct *buf)
+/*
+ * update the contents of the MN10300 userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
 {
-	return __copy_from_user(&tsk->thread.fpu_state, buf,
-				sizeof(struct fpu_state_struct));
+	struct pt_regs *regs = task_pt_regs(target);
+	unsigned long tmp;
+	int ret;
+
+	/* we need to skip regs->next */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 regs, 0, PT_ORIG_D0 * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+				 PT_EPSW * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	/* we need to mask off changes to EPSW */
+	tmp = regs->epsw;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &tmp, PT_EPSW * sizeof(long),
+				 PT_PC * sizeof(long));
+	tmp &= EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N | EPSW_FLAG_Z;
+	tmp |= regs->epsw & ~(EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N |
+			      EPSW_FLAG_Z);
+	regs->epsw = tmp;
+
+	if (ret < 0)
+		return ret;
+
+	/* and finally load the PC */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->pc, PT_PC * sizeof(long),
+				 NR_PTREGS * sizeof(long));
+
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					 NR_PTREGS * sizeof(long), -1);
 }
 
-static inline void fpsave_init(struct task_struct *task)
+/*
+ * retrieve the contents of MN10300 userspace FPU registers
+ */
+static int fpuregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
 {
-	memset(&task->thread.fpu_state, 0, sizeof(struct fpu_state_struct));
+	const struct fpu_state_struct *fpregs = &target->thread.fpu_state;
+	int ret;
+
+	unlazy_fpu(target);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
 }
 
 /*
- * make sure the single step bit is not set
+ * update the contents of the MN10300 userspace FPU registers
  */
-void ptrace_disable(struct task_struct *child)
+static int fpuregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct fpu_state_struct fpu_state = target->thread.fpu_state;
+	int ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &fpu_state, 0, sizeof(fpu_state));
+	if (ret < 0)
+		return ret;
+
+	fpu_kill_state(target);
+	target->thread.fpu_state = fpu_state;
+	set_using_fpu(target);
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					 sizeof(fpu_state), -1);
+}
+
+/*
+ * determine if the FPU registers have actually been used
+ */
+static int fpuregs_active(struct task_struct *target,
+			  const struct user_regset *regset)
+{
+	return is_using_fpu(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the MN10300 under Linux
+ */
+enum mn10300_regset {
+	REGSET_GENERAL,
+	REGSET_FPU,
+};
+
+static const struct user_regset mn10300_regsets[] = {
+	/*
+	 * General register format is:
+	 *	A3, A2, D3, D2, MCVF, MCRL, MCRH, MDRQ
+	 *	E1, E0, E7...E2, SP, LAR, LIR, MDR
+	 *	A1, A0, D1, D0, ORIG_D0, EPSW, PC
+	 */
+	[REGSET_GENERAL] = {
+		.core_note_type	= NT_PRSTATUS,
+		.n		= ELF_NGREG,
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= genregs_get,
+		.set		= genregs_set,
+	},
+	/*
+	 * FPU register format is:
+	 *	FS0-31, FPCR
+	 */
+	[REGSET_FPU] = {
+		.core_note_type	= NT_PRFPREG,
+		.n		= sizeof(struct fpu_state_struct) / sizeof(long),
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= fpuregs_get,
+		.set		= fpuregs_set,
+		.active		= fpuregs_active,
+	},
+};
+
+static const struct user_regset_view user_mn10300_native_view = {
+	.name		= "mn10300",
+	.e_machine	= EM_MN10300,
+	.regsets	= mn10300_regsets,
+	.n		= ARRAY_SIZE(mn10300_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_mn10300_native_view;
+}
+
+/*
+ * set the single-step bit
+ */
+void user_enable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
 	struct user *dummy = NULL;
 	long tmp;
 
 	tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-	tmp &= ~EPSW_T;
+	tmp |= EPSW_T;
 	put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
 /*
- * set the single step bit
+ * make sure the single-step bit is not set
  */
-void ptrace_enable(struct task_struct *child)
+void user_disable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
 	struct user *dummy = NULL;
 	long tmp;
 
 	tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-	tmp |= EPSW_T;
+	tmp &= ~EPSW_T;
 	put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
+void ptrace_disable(struct task_struct *child)
+{
+	user_disable_single_step(child);
+}
+
 /*
  * handle the arch-specific side of process tracing
  */
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct fpu_state_struct fpu_state;
-	int i, ret;
+	unsigned long tmp;
+	int ret;
 
 	switch (request) {
-	/* read the word at location addr. */
-	case PTRACE_PEEKTEXT: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, (unsigned long *) data);
-		break;
-	}
-
-	/* read the word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, (unsigned long *) data);
-		break;
-	}
-
 	/* read the word at location addr in the USER area. */
-	case PTRACE_PEEKUSR: {
-		unsigned long tmp;
-
+	case PTRACE_PEEKUSR:
 		ret = -EIO;
 		if ((addr & 3) || addr < 0 ||
 		    addr > sizeof(struct user) - 3)
@@ -179,17 +315,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					     ptrace_regid_to_frame[addr]);
 		ret = put_user(tmp, (unsigned long *) data);
 		break;
-	}
-
-	/* write the word at location addr. */
-	case PTRACE_POKETEXT:
-	case PTRACE_POKEDATA:
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) ==
-		    sizeof(data))
-			ret = 0;
-		else
-			ret = -EIO;
-		break;
 
 		/* write the word at location addr in the USER area */
 	case PTRACE_POKEUSR:
@@ -204,132 +329,32 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					     data);
 		break;
 
-		/* continue and stop at next (return from) syscall */
-	case PTRACE_SYSCALL:
-		/* restart after signal. */
-	case PTRACE_CONT:
-		ret = -EIO;
-		if ((unsigned long) data > _NSIG)
-			break;
-		if (request == PTRACE_SYSCALL)
-			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		else
-			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		child->exit_code = data;
-		ptrace_disable(child);
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-		/*
-		 * make the child exit
-		 * - the best I can do is send it a sigkill
-		 * - perhaps it should be put in the status that it wants to
-		 *   exit
-		 */
-	case PTRACE_KILL:
-		ret = 0;
-		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
-			break;
-		child->exit_code = SIGKILL;
-		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-		ptrace_disable(child);
-		wake_up_process(child);
-		break;
-
-	case PTRACE_SINGLESTEP: /* set the trap flag. */
-#ifndef CONFIG_MN10300_USING_JTAG
-		ret = -EIO;
-		if ((unsigned long) data > _NSIG)
-			break;
-		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		ptrace_enable(child);
-		child->exit_code = data;
-		wake_up_process(child);
-		ret = 0;
-#else
-		ret = -EINVAL;
-#endif
-		break;
-
-	case PTRACE_DETACH:	/* detach a process that was attached. */
-		ret = ptrace_detach(child, data);
-		break;
-
-		/* Get all gp regs from the child. */
-	case PTRACE_GETREGS: {
-		unsigned long tmp;
-
-		if (!access_ok(VERIFY_WRITE, (unsigned *) data, NR_PTREGS << 2)) {
-			ret = -EIO;
-			break;
-		}
-
-		for (i = 0; i < NR_PTREGS << 2; i += 4) {
-			tmp = get_stack_long(child, ptrace_regid_to_frame[i]);
-			__put_user(tmp, (unsigned long *) data);
-			data += sizeof(tmp);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-		unsigned long tmp;
-
-		if (!access_ok(VERIFY_READ, (unsigned long *)data,
-			       sizeof(struct pt_regs))) {
-			ret = -EIO;
-			break;
-		}
-
-		for (i = 0; i < NR_PTREGS << 2; i += 4) {
-			__get_user(tmp, (unsigned long *) data);
-			put_stack_long(child, ptrace_regid_to_frame[i], tmp);
-			data += sizeof(tmp);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_GETFPREGS: { /* Get the child FPU state. */
-		if (is_using_fpu(child)) {
-			unlazy_fpu(child);
-			fpu_state = child->thread.fpu_state;
-		} else {
-			memset(&fpu_state, 0, sizeof(fpu_state));
-		}
-
-		ret = -EIO;
-		if (copy_to_user((void *) data, &fpu_state,
-				 sizeof(fpu_state)) == 0)
-			ret = 0;
-		break;
-	}
-
-	case PTRACE_SETFPREGS: { /* Set the child FPU state. */
-		ret = -EFAULT;
-		if (copy_from_user(&fpu_state, (const void *) data,
-				   sizeof(fpu_state)) == 0) {
-			fpu_kill_state(child);
-			child->thread.fpu_state = fpu_state;
-			set_using_fpu(child);
-			ret = 0;
-		}
-		break;
-	}
-
-	case PTRACE_SETOPTIONS: {
-		if (data & PTRACE_O_TRACESYSGOOD)
-			child->ptrace |= PT_TRACESYSGOOD;
-		else
-			child->ptrace &= ~PT_TRACESYSGOOD;
-		ret = 0;
-		break;
-	}
+	case PTRACE_GETREGS:	/* Get all integer regs from the child. */
+		return copy_regset_to_user(child, &user_mn10300_native_view,
+					   REGSET_GENERAL,
+					   0, NR_PTREGS * sizeof(long),
+					   (void __user *)data);
+
+	case PTRACE_SETREGS:	/* Set all integer regs in the child. */
+		return copy_regset_from_user(child, &user_mn10300_native_view,
+					     REGSET_GENERAL,
+					     0, NR_PTREGS * sizeof(long),
+					     (const void __user *)data);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_mn10300_native_view,
+					   REGSET_FPU,
+					   0, sizeof(struct fpu_state_struct),
+					   (void __user *)data);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(child, &user_mn10300_native_view,
+					     REGSET_FPU,
+					     0, sizeof(struct fpu_state_struct),
+					     (const void __user *)data);
 
 	default:
-		ret = -EIO;
+		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
 
@@ -337,43 +362,26 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 }
 
 /*
- * notification of system call entry/exit
- * - triggered by current->work.syscall_trace
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
  */
-asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
+asmlinkage unsigned long syscall_trace_entry(struct pt_regs *regs)
 {
-#if 0
-	/* just in case... */
-	printk(KERN_DEBUG "[%d] syscall_%lu(%lx,%lx,%lx,%lx) = %lx\n",
-	       current->pid,
-	       regs->orig_d0,
-	       regs->a0,
-	       regs->d1,
-	       regs->a3,
-	       regs->a2,
-	       regs->d0);
-	return;
-#endif
-
-	if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    !test_thread_flag(TIF_SINGLESTEP))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
+	if (tracehook_report_syscall_entry(regs))
+		/* tracing decided this syscall should not happen, so
+		 * We'll return a bogus call number to get an ENOSYS
+		 * error, but leave the original number in
+		 * regs->orig_d0
+		 */
+		return ULONG_MAX;
 
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	ptrace_notify(SIGTRAP |
-		      ((current->ptrace & PT_TRACESYSGOOD) &&
-		       !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0));
+	return regs->orig_d0;
+}
 
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(struct pt_regs *regs)
+{
+	tracehook_report_syscall_exit(regs, 0);
 }
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 841ca99..9f7572a 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/tty.h>
 #include <linux/personality.h>
 #include <linux/suspend.h>
+#include <linux/tracehook.h>
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
@@ -511,6 +512,9 @@ static void do_signal(struct pt_regs *regs)
 			 * clear the TIF_RESTORE_SIGMASK flag */
 			if (test_thread_flag(TIF_RESTORE_SIGMASK))
 				clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+			tracehook_signal_handler(signr, &info, &ka, regs,
+						 test_thread_flag(TIF_SINGLESTEP));
 		}
 
 		return;
@@ -561,4 +565,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(__frame);
+	}
 }
-- 
cgit v0.10.2


From 3c7b4e6b8be4c16f1e6e5c558e33b7ff0db2dfaf Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:39 +0100
Subject: kmemleak: Add the base support

This patch adds the base support for the kernel memory leak
detector. It traces the memory allocation/freeing in a way similar to
the Boehm's conservative garbage collector, the difference being that
the unreferenced objects are not freed but only shown in
/sys/kernel/debug/kmemleak. Enabling this feature introduces an
overhead to memory allocations.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
new file mode 100644
index 0000000..7796aed
--- /dev/null
+++ b/include/linux/kmemleak.h
@@ -0,0 +1,96 @@
+/*
+ * include/linux/kmemleak.h
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __KMEMLEAK_H
+#define __KMEMLEAK_H
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+
+extern void kmemleak_init(void);
+extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+			   gfp_t gfp);
+extern void kmemleak_free(const void *ptr);
+extern void kmemleak_padding(const void *ptr, unsigned long offset,
+			     size_t size);
+extern void kmemleak_not_leak(const void *ptr);
+extern void kmemleak_ignore(const void *ptr);
+extern void kmemleak_scan_area(const void *ptr, unsigned long offset,
+			       size_t length, gfp_t gfp);
+extern void kmemleak_no_scan(const void *ptr);
+
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+					    int min_count, unsigned long flags,
+					    gfp_t gfp)
+{
+	if (!(flags & SLAB_NOLEAKTRACE))
+		kmemleak_alloc(ptr, size, min_count, gfp);
+}
+
+static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+{
+	if (!(flags & SLAB_NOLEAKTRACE))
+		kmemleak_free(ptr);
+}
+
+static inline void kmemleak_erase(void **ptr)
+{
+	*ptr = NULL;
+}
+
+#else
+
+static inline void kmemleak_init(void)
+{
+}
+static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+				  gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+					    int min_count, unsigned long flags,
+					    gfp_t gfp)
+{
+}
+static inline void kmemleak_free(const void *ptr)
+{
+}
+static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+{
+}
+static inline void kmemleak_not_leak(const void *ptr)
+{
+}
+static inline void kmemleak_ignore(const void *ptr)
+{
+}
+static inline void kmemleak_scan_area(const void *ptr, unsigned long offset,
+				      size_t length, gfp_t gfp)
+{
+}
+static inline void kmemleak_erase(void **ptr)
+{
+}
+static inline void kmemleak_no_scan(const void *ptr)
+{
+}
+
+#endif	/* CONFIG_DEBUG_KMEMLEAK */
+
+#endif	/* __KMEMLEAK_H */
diff --git a/init/main.c b/init/main.c
index bb7dc57..9d759d6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -56,6 +56,7 @@
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
+#include <linux/kmemleak.h>
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
@@ -603,6 +604,7 @@ asmlinkage void __init start_kernel(void)
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
+	prio_tree_init();
 	pidhash_init();
 	init_timers();
 	hrtimers_init();
@@ -654,6 +656,7 @@ asmlinkage void __init start_kernel(void)
 	cpu_hotplug_init();
 	kmem_cache_init();
 	kmemtrace_init();
+	kmemleak_init();
 	debug_objects_mem_init();
 	idr_init_cache();
 	setup_per_cpu_pageset();
@@ -663,7 +666,6 @@ asmlinkage void __init start_kernel(void)
 	calibrate_delay();
 	pidmap_init();
 	pgtable_cache_init();
-	prio_tree_init();
 	anon_vma_init();
 #ifdef CONFIG_X86
 	if (efi_enabled)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 0000000..58ec86c
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1498 @@
+/*
+ * mm/kmemleak.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * For more information on the algorithm and kmemleak usage, please see
+ * Documentation/kmemleak.txt.
+ *
+ * Notes on locking
+ * ----------------
+ *
+ * The following locks and mutexes are used by kmemleak:
+ *
+ * - kmemleak_lock (rwlock): protects the object_list modifications and
+ *   accesses to the object_tree_root. The object_list is the main list
+ *   holding the metadata (struct kmemleak_object) for the allocated memory
+ *   blocks. The object_tree_root is a priority search tree used to look-up
+ *   metadata based on a pointer to the corresponding memory block.  The
+ *   kmemleak_object structures are added to the object_list and
+ *   object_tree_root in the create_object() function called from the
+ *   kmemleak_alloc() callback and removed in delete_object() called from the
+ *   kmemleak_free() callback
+ * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
+ *   the metadata (e.g. count) are protected by this lock. Note that some
+ *   members of this structure may be protected by other means (atomic or
+ *   kmemleak_lock). This lock is also held when scanning the corresponding
+ *   memory block to avoid the kernel freeing it via the kmemleak_free()
+ *   callback. This is less heavyweight than holding a global lock like
+ *   kmemleak_lock during scanning
+ * - scan_mutex (mutex): ensures that only one thread may scan the memory for
+ *   unreferenced objects at a time. The gray_list contains the objects which
+ *   are already referenced or marked as false positives and need to be
+ *   scanned. This list is only modified during a scanning episode when the
+ *   scan_mutex is held. At the end of a scan, the gray_list is always empty.
+ *   Note that the kmemleak_object.use_count is incremented when an object is
+ *   added to the gray_list and therefore cannot be freed
+ * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs
+ *   file together with modifications to the memory scanning parameters
+ *   including the scan_thread pointer
+ *
+ * The kmemleak_object structures have a use_count incremented or decremented
+ * using the get_object()/put_object() functions. When the use_count becomes
+ * 0, this count can no longer be incremented and put_object() schedules the
+ * kmemleak_object freeing via an RCU callback. All calls to the get_object()
+ * function must be protected by rcu_read_lock() to avoid accessing a freed
+ * structure.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/prio_tree.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/stacktrace.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/atomic.h>
+
+#include <linux/kmemleak.h>
+
+/*
+ * Kmemleak configuration and common defines.
+ */
+#define MAX_TRACE		16	/* stack trace length */
+#define REPORTS_NR		50	/* maximum number of reported leaks */
+#define MSECS_MIN_AGE		5000	/* minimum object age for reporting */
+#define MSECS_SCAN_YIELD	10	/* CPU yielding period */
+#define SECS_FIRST_SCAN		60	/* delay before the first scan */
+#define SECS_SCAN_WAIT		600	/* subsequent auto scanning delay */
+
+#define BYTES_PER_POINTER	sizeof(void *)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+	struct hlist_node node;
+	unsigned long offset;
+	size_t length;
+};
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * tree_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+	spinlock_t lock;
+	unsigned long flags;		/* object status flags */
+	struct list_head object_list;
+	struct list_head gray_list;
+	struct prio_tree_node tree_node;
+	struct rcu_head rcu;		/* object_list lockless traversal */
+	/* object usage count; object freed when use_count == 0 */
+	atomic_t use_count;
+	unsigned long pointer;
+	size_t size;
+	/* minimum number of a pointers found before it is considered leak */
+	int min_count;
+	/* the total number of pointers found pointing to this object */
+	int count;
+	/* memory ranges to be scanned inside an object (empty for all) */
+	struct hlist_head area_list;
+	unsigned long trace[MAX_TRACE];
+	unsigned int trace_len;
+	unsigned long jiffies;		/* creation timestamp */
+	pid_t pid;			/* pid of the current task */
+	char comm[TASK_COMM_LEN];	/* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED	(1 << 0)
+/* flag set after the first reporting of an unreference object */
+#define OBJECT_REPORTED		(1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN		(1 << 2)
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* prio search tree for object boundaries */
+static struct prio_tree_root object_tree_root;
+/* rw_lock protecting the access to object_list and prio_tree_root */
+static DEFINE_RWLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
+/* set in the late_initcall if there were no errors */
+static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
+/* enables or disables early logging of the memory operations */
+static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
+/* set if a fata kmemleak error has occurred */
+static atomic_t kmemleak_error = ATOMIC_INIT(0);
+
+/* minimum and maximum address that may be valid pointers */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+/* used for yielding the CPU to other tasks during scanning */
+static unsigned long next_scan_yield;
+static struct task_struct *scan_thread;
+static unsigned long jiffies_scan_yield;
+static unsigned long jiffies_min_age;
+/* delay between automatic memory scannings */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan;
+/* mutex protecting the memory scanning */
+static DEFINE_MUTEX(scan_mutex);
+/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
+static DEFINE_MUTEX(kmemleak_mutex);
+
+/* number of leaks reported (for limitation purposes) */
+static int reported_leaks;
+
+/*
+ * Early object allocation/freeing logging. Kkmemleak is initialized after the
+ * kernel allocator. However, both the kernel allocator and kmemleak may
+ * allocate memory blocks which need to be tracked. Kkmemleak defines an
+ * arbitrary buffer to hold the allocation/freeing information before it is
+ * fully initialized.
+ */
+
+/* kmemleak operation type for early logging */
+enum {
+	KMEMLEAK_ALLOC,
+	KMEMLEAK_FREE,
+	KMEMLEAK_NOT_LEAK,
+	KMEMLEAK_IGNORE,
+	KMEMLEAK_SCAN_AREA,
+	KMEMLEAK_NO_SCAN
+};
+
+/*
+ * Structure holding the information passed to kmemleak callbacks during the
+ * early logging.
+ */
+struct early_log {
+	int op_type;			/* kmemleak operation type */
+	const void *ptr;		/* allocated/freed memory block */
+	size_t size;			/* memory block size */
+	int min_count;			/* minimum reference count */
+	unsigned long offset;		/* scan area offset */
+	size_t length;			/* scan area length */
+};
+
+/* early logging buffer and current position */
+static struct early_log early_log[200];
+static int crt_early_log;
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace.
+ */
+#define kmemleak_warn(x...)	do {	\
+	pr_warning(x);			\
+	dump_stack();			\
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occured and cannot be
+ * recovered from. Kkmemleak will be disabled and further allocation/freeing
+ * tracing no longer available.
+ */
+#define kmemleak_panic(x...)	do {	\
+	kmemleak_warn(x);		\
+	kmemleak_disable();		\
+} while (0)
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray  - not orphan, not marked as false positive (min_count == 0) or
+ *		sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ *		(min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static int color_white(const struct kmemleak_object *object)
+{
+	return object->count != -1 && object->count < object->min_count;
+}
+
+static int color_gray(const struct kmemleak_object *object)
+{
+	return object->min_count != -1 && object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered referenced if their color is gray and they have not
+ * been deleted.
+ */
+static int referenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not be deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static int unreferenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+		time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
+}
+
+/*
+ * Printing of the (un)referenced objects information, either to the seq file
+ * or to the kernel log. The print_referenced/print_unreferenced functions
+ * must be called with the object->lock held.
+ */
+#define print_helper(seq, x...)	do {	\
+	struct seq_file *s = (seq);	\
+	if (s)				\
+		seq_printf(s, x);	\
+	else				\
+		pr_info(x);		\
+} while (0)
+
+static void print_referenced(struct kmemleak_object *object)
+{
+	pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n",
+		object->pointer, object->size);
+}
+
+static void print_unreferenced(struct seq_file *seq,
+			       struct kmemleak_object *object)
+{
+	int i;
+
+	print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n",
+		     object->pointer, object->size);
+	print_helper(seq, "  comm \"%s\", pid %d, jiffies %lu\n",
+		     object->comm, object->pid, object->jiffies);
+	print_helper(seq, "  backtrace:\n");
+
+	for (i = 0; i < object->trace_len; i++) {
+		void *ptr = (void *)object->trace[i];
+		print_helper(seq, "    [<%p>] %pS\n", ptr, ptr);
+	}
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases when kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+	struct stack_trace trace;
+
+	trace.nr_entries = object->trace_len;
+	trace.entries = object->trace;
+
+	pr_notice("kmemleak: Object 0x%08lx (size %zu):\n",
+		  object->tree_node.start, object->size);
+	pr_notice("  comm \"%s\", pid %d, jiffies %lu\n",
+		  object->comm, object->pid, object->jiffies);
+	pr_notice("  min_count = %d\n", object->min_count);
+	pr_notice("  count = %d\n", object->count);
+	pr_notice("  backtrace:\n");
+	print_stack_trace(&trace, 4);
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	struct prio_tree_node *node;
+	struct prio_tree_iter iter;
+	struct kmemleak_object *object;
+
+	prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
+	node = prio_tree_next(&iter);
+	if (node) {
+		object = prio_tree_entry(node, struct kmemleak_object,
+					 tree_node);
+		if (!alias && object->pointer != ptr) {
+			kmemleak_warn("kmemleak: Found object by alias");
+			object = NULL;
+		}
+	} else
+		object = NULL;
+
+	return object;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * RCU callback to free a kmemleak_object.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+	struct hlist_node *elem, *tmp;
+	struct kmemleak_scan_area *area;
+	struct kmemleak_object *object =
+		container_of(rcu, struct kmemleak_object, rcu);
+
+	/*
+	 * Once use_count is 0 (guaranteed by put_object), there is no other
+	 * code accessing this object, hence no need for locking.
+	 */
+	hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
+		hlist_del(elem);
+		kmem_cache_free(scan_area_cache, area);
+	}
+	kmem_cache_free(object_cache, object);
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
+ * delete_object() path, the delayed RCU freeing ensures that there is no
+ * recursive call to the kernel allocator. Lock-less RCU object_list traversal
+ * is also possible.
+ */
+static void put_object(struct kmemleak_object *object)
+{
+	if (!atomic_dec_and_test(&object->use_count))
+		return;
+
+	/* should only get here after delete_object was called */
+	WARN_ON(object->flags & OBJECT_ALLOCATED);
+
+	call_rcu(&object->rcu, free_object_rcu);
+}
+
+/*
+ * Look up an object in the prio search tree and increase its use_count.
+ */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+	unsigned long flags;
+	struct kmemleak_object *object = NULL;
+
+	rcu_read_lock();
+	read_lock_irqsave(&kmemleak_lock, flags);
+	if (ptr >= min_addr && ptr < max_addr)
+		object = lookup_object(ptr, alias);
+	read_unlock_irqrestore(&kmemleak_lock, flags);
+
+	/* check whether the object is still available */
+	if (object && !get_object(object))
+		object = NULL;
+	rcu_read_unlock();
+
+	return object;
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root.
+ */
+static void create_object(unsigned long ptr, size_t size, int min_count,
+			  gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct prio_tree_node *node;
+	struct stack_trace trace;
+
+	object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK);
+	if (!object) {
+		kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object "
+			       "structure\n");
+		return;
+	}
+
+	INIT_LIST_HEAD(&object->object_list);
+	INIT_LIST_HEAD(&object->gray_list);
+	INIT_HLIST_HEAD(&object->area_list);
+	spin_lock_init(&object->lock);
+	atomic_set(&object->use_count, 1);
+	object->flags = OBJECT_ALLOCATED;
+	object->pointer = ptr;
+	object->size = size;
+	object->min_count = min_count;
+	object->count = -1;			/* no color initially */
+	object->jiffies = jiffies;
+
+	/* task information */
+	if (in_irq()) {
+		object->pid = 0;
+		strncpy(object->comm, "hardirq", sizeof(object->comm));
+	} else if (in_softirq()) {
+		object->pid = 0;
+		strncpy(object->comm, "softirq", sizeof(object->comm));
+	} else {
+		object->pid = current->pid;
+		/*
+		 * There is a small chance of a race with set_task_comm(),
+		 * however using get_task_comm() here may cause locking
+		 * dependency issues with current->alloc_lock. In the worst
+		 * case, the command line is not correct.
+		 */
+		strncpy(object->comm, current->comm, sizeof(object->comm));
+	}
+
+	/* kernel backtrace */
+	trace.max_entries = MAX_TRACE;
+	trace.nr_entries = 0;
+	trace.entries = object->trace;
+	trace.skip = 1;
+	save_stack_trace(&trace);
+	object->trace_len = trace.nr_entries;
+
+	INIT_PRIO_TREE_NODE(&object->tree_node);
+	object->tree_node.start = ptr;
+	object->tree_node.last = ptr + size - 1;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	min_addr = min(min_addr, ptr);
+	max_addr = max(max_addr, ptr + size);
+	node = prio_tree_insert(&object_tree_root, &object->tree_node);
+	/*
+	 * The code calling the kernel does not yet have the pointer to the
+	 * memory block to be able to free it.  However, we still hold the
+	 * kmemleak_lock here in case parts of the kernel started freeing
+	 * random memory blocks.
+	 */
+	if (node != &object->tree_node) {
+		unsigned long flags;
+
+		kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object "
+			       "search tree (already existing)\n", ptr);
+		object = lookup_object(ptr, 1);
+		spin_lock_irqsave(&object->lock, flags);
+		dump_object_info(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+
+		goto out;
+	}
+	list_add_tail_rcu(&object->object_list, &object_list);
+out:
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * Remove the metadata (struct kmemleak_object) for a memory block from the
+ * object_list and object_tree_root and decrement its use_count.
+ */
+static void delete_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	object = lookup_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n",
+			      ptr);
+		write_unlock_irqrestore(&kmemleak_lock, flags);
+		return;
+	}
+	prio_tree_remove(&object_tree_root, &object->tree_node);
+	list_del_rcu(&object->object_list);
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+
+	WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+	WARN_ON(atomic_read(&object->use_count) < 1);
+
+	/*
+	 * Locking here also ensures that the corresponding memory block
+	 * cannot be freed when it is being scanned.
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_REPORTED)
+		print_referenced(object);
+	object->flags &= ~OBJECT_ALLOCATED;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Make a object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = 0;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = -1;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ */
+static void add_scan_area(unsigned long ptr, unsigned long offset,
+			  size_t length, gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct kmemleak_scan_area *area;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Adding scan area to unknown "
+			      "object at 0x%08lx\n", ptr);
+		return;
+	}
+
+	area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK);
+	if (!area) {
+		kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
+		goto out;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	if (offset + length > object->size) {
+		kmemleak_warn("kmemleak: Scan area larger than object "
+			      "0x%08lx\n", ptr);
+		dump_object_info(object);
+		kmem_cache_free(scan_area_cache, area);
+		goto out_unlock;
+	}
+
+	INIT_HLIST_NODE(&area->node);
+	area->offset = offset;
+	area->length = length;
+
+	hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+	spin_unlock_irqrestore(&object->lock, flags);
+out:
+	put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * pointer. Such object will not be scanned by kmemleak but references to it
+ * are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Not scanning unknown object at "
+			      "0x%08lx\n", ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->flags |= OBJECT_NO_SCAN;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Log an early kmemleak_* call to the early_log buffer. These calls will be
+ * processed later once kmemleak is fully initialized.
+ */
+static void log_early(int op_type, const void *ptr, size_t size,
+		      int min_count, unsigned long offset, size_t length)
+{
+	unsigned long flags;
+	struct early_log *log;
+
+	if (crt_early_log >= ARRAY_SIZE(early_log)) {
+		kmemleak_panic("kmemleak: Early log buffer exceeded\n");
+		return;
+	}
+
+	/*
+	 * There is no need for locking since the kernel is still in UP mode
+	 * at this stage. Disabling the IRQs is enough.
+	 */
+	local_irq_save(flags);
+	log = &early_log[crt_early_log];
+	log->op_type = op_type;
+	log->ptr = ptr;
+	log->size = size;
+	log->min_count = min_count;
+	log->offset = offset;
+	log->length = length;
+	crt_early_log++;
+	local_irq_restore(flags);
+}
+
+/*
+ * Memory allocation function callback. This function is called from the
+ * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
+ * vmalloc etc.).
+ */
+void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		create_object((unsigned long)ptr, size, min_count, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/*
+ * Memory freeing function callback. This function is called from the kernel
+ * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void kmemleak_free(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		delete_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/*
+ * Mark an already allocated memory block as a false positive. This will cause
+ * the block to no longer be reported as leak and always be scanned.
+ */
+void kmemleak_not_leak(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_gray_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/*
+ * Ignore a memory block. This is usually done when it is known that the
+ * corresponding block is not a leak and does not contain any references to
+ * other allocated memory blocks.
+ */
+void kmemleak_ignore(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_black_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/*
+ * Limit the range to be scanned in an allocated memory block.
+ */
+void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
+			gfp_t gfp)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		add_scan_area((unsigned long)ptr, offset, length, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/*
+ * Inform kmemleak not to scan the given memory block.
+ */
+void kmemleak_no_scan(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		object_no_scan((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/*
+ * Yield the CPU so that other tasks get a chance to run.  The yielding is
+ * rate-limited to avoid excessive number of calls to the schedule() function
+ * during memory scanning.
+ */
+static void scan_yield(void)
+{
+	might_sleep();
+
+	if (time_is_before_eq_jiffies(next_scan_yield)) {
+		schedule();
+		next_scan_yield = jiffies + jiffies_scan_yield;
+	}
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptable. This
+ * function checks whether such interrupt condition occured.
+ */
+static int scan_should_stop(void)
+{
+	if (!atomic_read(&kmemleak_enabled))
+		return 1;
+
+	/*
+	 * This function may be called from either process or kthread context,
+	 * hence the need to check for both stop conditions.
+	 */
+	if (current->mm)
+		return signal_pending(current);
+	else
+		return kthread_should_stop();
+
+	return 0;
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ */
+static void scan_block(void *_start, void *_end,
+		       struct kmemleak_object *scanned)
+{
+	unsigned long *ptr;
+	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+
+	for (ptr = start; ptr < end; ptr++) {
+		unsigned long flags;
+		unsigned long pointer = *ptr;
+		struct kmemleak_object *object;
+
+		if (scan_should_stop())
+			break;
+
+		/*
+		 * When scanning a memory block with a corresponding
+		 * kmemleak_object, the CPU yielding is handled in the calling
+		 * code since it holds the object->lock to avoid the block
+		 * freeing.
+		 */
+		if (!scanned)
+			scan_yield();
+
+		object = find_and_get_object(pointer, 1);
+		if (!object)
+			continue;
+		if (object == scanned) {
+			/* self referenced, ignore */
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Avoid the lockdep recursive warning on object->lock being
+		 * previously acquired in scan_object(). These locks are
+		 * enclosed by scan_mutex.
+		 */
+		spin_lock_irqsave_nested(&object->lock, flags,
+					 SINGLE_DEPTH_NESTING);
+		if (!color_white(object)) {
+			/* non-orphan, ignored or new */
+			spin_unlock_irqrestore(&object->lock, flags);
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Increase the object's reference count (number of pointers
+		 * to the memory block). If this count reaches the required
+		 * minimum, the object's color will become gray and it will be
+		 * added to the gray_list.
+		 */
+		object->count++;
+		if (color_gray(object))
+			list_add_tail(&object->gray_list, &gray_list);
+		else
+			put_object(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+	}
+}
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+	struct kmemleak_scan_area *area;
+	struct hlist_node *elem;
+	unsigned long flags;
+
+	/*
+	 * Once the object->lock is aquired, the corresponding memory block
+	 * cannot be freed (the same lock is aquired in delete_object).
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_NO_SCAN)
+		goto out;
+	if (!(object->flags & OBJECT_ALLOCATED))
+		/* already freed object */
+		goto out;
+	if (hlist_empty(&object->area_list))
+		scan_block((void *)object->pointer,
+			   (void *)(object->pointer + object->size), object);
+	else
+		hlist_for_each_entry(area, elem, &object->area_list, node)
+			scan_block((void *)(object->pointer + area->offset),
+				   (void *)(object->pointer + area->offset
+					    + area->length), object);
+out:
+	spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ */
+static void kmemleak_scan(void)
+{
+	unsigned long flags;
+	struct kmemleak_object *object, *tmp;
+	struct task_struct *task;
+	int i;
+
+	/* prepare the kmemleak_object's */
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+		/*
+		 * With a few exceptions there should be a maximum of
+		 * 1 reference to any object at this point.
+		 */
+		if (atomic_read(&object->use_count) > 1) {
+			pr_debug("kmemleak: object->use_count = %d\n",
+				 atomic_read(&object->use_count));
+			dump_object_info(object);
+		}
+#endif
+		/* reset the reference count (whiten the object) */
+		object->count = 0;
+		if (color_gray(object) && get_object(object))
+			list_add_tail(&object->gray_list, &gray_list);
+
+		spin_unlock_irqrestore(&object->lock, flags);
+	}
+	rcu_read_unlock();
+
+	/* data/bss scanning */
+	scan_block(_sdata, _edata, NULL);
+	scan_block(__bss_start, __bss_stop, NULL);
+
+#ifdef CONFIG_SMP
+	/* per-cpu sections scanning */
+	for_each_possible_cpu(i)
+		scan_block(__per_cpu_start + per_cpu_offset(i),
+			   __per_cpu_end + per_cpu_offset(i), NULL);
+#endif
+
+	/*
+	 * Struct page scanning for each node. The code below is not yet safe
+	 * with MEMORY_HOTPLUG.
+	 */
+	for_each_online_node(i) {
+		pg_data_t *pgdat = NODE_DATA(i);
+		unsigned long start_pfn = pgdat->node_start_pfn;
+		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+		unsigned long pfn;
+
+		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+			struct page *page;
+
+			if (!pfn_valid(pfn))
+				continue;
+			page = pfn_to_page(pfn);
+			/* only scan if page is in use */
+			if (page_count(page) == 0)
+				continue;
+			scan_block(page, page + 1, NULL);
+		}
+	}
+
+	/*
+	 * Scanning the task stacks may introduce false negatives and it is
+	 * not enabled by default.
+	 */
+	if (kmemleak_stack_scan) {
+		read_lock(&tasklist_lock);
+		for_each_process(task)
+			scan_block(task_stack_page(task),
+				   task_stack_page(task) + THREAD_SIZE, NULL);
+		read_unlock(&tasklist_lock);
+	}
+
+	/*
+	 * Scan the objects already referenced from the sections scanned
+	 * above. More objects will be referenced and, if there are no memory
+	 * leaks, all the objects will be scanned. The list traversal is safe
+	 * for both tail additions and removals from inside the loop. The
+	 * kmemleak objects cannot be freed from outside the loop because their
+	 * use_count was increased.
+	 */
+	object = list_entry(gray_list.next, typeof(*object), gray_list);
+	while (&object->gray_list != &gray_list) {
+		scan_yield();
+
+		/* may add new objects to the list */
+		if (!scan_should_stop())
+			scan_object(object);
+
+		tmp = list_entry(object->gray_list.next, typeof(*object),
+				 gray_list);
+
+		/* remove the object from the list and release it */
+		list_del(&object->gray_list);
+		put_object(object);
+
+		object = tmp;
+	}
+	WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Thread function performing automatic memory scanning. Unreferenced objects
+ * at the end of a memory scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+	static int first_run = 1;
+
+	pr_info("kmemleak: Automatic memory scanning thread started\n");
+
+	/*
+	 * Wait before the first scan to allow the system to fully initialize.
+	 */
+	if (first_run) {
+		first_run = 0;
+		ssleep(SECS_FIRST_SCAN);
+	}
+
+	while (!kthread_should_stop()) {
+		struct kmemleak_object *object;
+		signed long timeout = jiffies_scan_wait;
+
+		mutex_lock(&scan_mutex);
+
+		kmemleak_scan();
+		reported_leaks = 0;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(object, &object_list, object_list) {
+			unsigned long flags;
+
+			if (reported_leaks >= REPORTS_NR)
+				break;
+			spin_lock_irqsave(&object->lock, flags);
+			if (!(object->flags & OBJECT_REPORTED) &&
+			    unreferenced_object(object)) {
+				print_unreferenced(NULL, object);
+				object->flags |= OBJECT_REPORTED;
+				reported_leaks++;
+			} else if ((object->flags & OBJECT_REPORTED) &&
+				   referenced_object(object)) {
+				print_referenced(object);
+				object->flags &= ~OBJECT_REPORTED;
+			}
+			spin_unlock_irqrestore(&object->lock, flags);
+		}
+		rcu_read_unlock();
+
+		mutex_unlock(&scan_mutex);
+		/* wait before the next scan */
+		while (timeout && !kthread_should_stop())
+			timeout = schedule_timeout_interruptible(timeout);
+	}
+
+	pr_info("kmemleak: Automatic memory scanning thread ended\n");
+
+	return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread. This function must be called
+ * with the kmemleak_mutex held.
+ */
+void start_scan_thread(void)
+{
+	if (scan_thread)
+		return;
+	scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
+	if (IS_ERR(scan_thread)) {
+		pr_warning("kmemleak: Failed to create the scan thread\n");
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Stop the automatic memory scanning thread. This function must be called
+ * with the kmemleak_mutex held.
+ */
+void stop_scan_thread(void)
+{
+	if (scan_thread) {
+		kthread_stop(scan_thread);
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented. The function triggers
+ * a memory scanning when the pos argument points to the first position.
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct kmemleak_object *object;
+	loff_t n = *pos;
+
+	if (!n) {
+		kmemleak_scan();
+		reported_leaks = 0;
+	}
+	if (reported_leaks >= REPORTS_NR)
+		return NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		if (n-- > 0)
+			continue;
+		if (get_object(object))
+			goto out;
+	}
+	object = NULL;
+out:
+	rcu_read_unlock();
+	return object;
+}
+
+/*
+ * Return the next object in the object_list. The function decrements the
+ * use_count of the previous object and increases that of the next one.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmemleak_object *prev_obj = v;
+	struct kmemleak_object *next_obj = NULL;
+	struct list_head *n = &prev_obj->object_list;
+
+	++(*pos);
+	if (reported_leaks >= REPORTS_NR)
+		goto out;
+
+	rcu_read_lock();
+	list_for_each_continue_rcu(n, &object_list) {
+		next_obj = list_entry(n, struct kmemleak_object, object_list);
+		if (get_object(next_obj))
+			break;
+	}
+	rcu_read_unlock();
+out:
+	put_object(prev_obj);
+	return next_obj;
+}
+
+/*
+ * Decrement the use_count of the last object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+	if (v)
+		put_object(v);
+}
+
+/*
+ * Print the information for an unreferenced object to the seq file.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+	struct kmemleak_object *object = v;
+	unsigned long flags;
+
+	spin_lock_irqsave(&object->lock, flags);
+	if (!unreferenced_object(object))
+		goto out;
+	print_unreferenced(seq, object);
+	reported_leaks++;
+out:
+	spin_unlock_irqrestore(&object->lock, flags);
+	return 0;
+}
+
+static const struct seq_operations kmemleak_seq_ops = {
+	.start = kmemleak_seq_start,
+	.next  = kmemleak_seq_next,
+	.stop  = kmemleak_seq_stop,
+	.show  = kmemleak_seq_show,
+};
+
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (!atomic_read(&kmemleak_enabled))
+		return -EBUSY;
+
+	ret = mutex_lock_interruptible(&kmemleak_mutex);
+	if (ret < 0)
+		goto out;
+	if (file->f_mode & FMODE_READ) {
+		ret = mutex_lock_interruptible(&scan_mutex);
+		if (ret < 0)
+			goto kmemleak_unlock;
+		ret = seq_open(file, &kmemleak_seq_ops);
+		if (ret < 0)
+			goto scan_unlock;
+	}
+	return ret;
+
+scan_unlock:
+	mutex_unlock(&scan_mutex);
+kmemleak_unlock:
+	mutex_unlock(&kmemleak_mutex);
+out:
+	return ret;
+}
+
+static int kmemleak_release(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (file->f_mode & FMODE_READ) {
+		seq_release(inode, file);
+		mutex_unlock(&scan_mutex);
+	}
+	mutex_unlock(&kmemleak_mutex);
+
+	return ret;
+}
+
+/*
+ * File write operation to configure kmemleak at run-time. The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ *   off	- disable kmemleak (irreversible)
+ *   stack=on	- enable the task stacks scanning
+ *   stack=off	- disable the tasks stacks scanning
+ *   scan=on	- start the automatic memory scanning thread
+ *   scan=off	- stop the automatic memory scanning thread
+ *   scan=...	- set the automatic memory scanning period in seconds (0 to
+ *		  disable it)
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+			      size_t size, loff_t *ppos)
+{
+	char buf[64];
+	int buf_size;
+
+	if (!atomic_read(&kmemleak_enabled))
+		return -EBUSY;
+
+	buf_size = min(size, (sizeof(buf) - 1));
+	if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+		return -EFAULT;
+	buf[buf_size] = 0;
+
+	if (strncmp(buf, "off", 3) == 0)
+		kmemleak_disable();
+	else if (strncmp(buf, "stack=on", 8) == 0)
+		kmemleak_stack_scan = 1;
+	else if (strncmp(buf, "stack=off", 9) == 0)
+		kmemleak_stack_scan = 0;
+	else if (strncmp(buf, "scan=on", 7) == 0)
+		start_scan_thread();
+	else if (strncmp(buf, "scan=off", 8) == 0)
+		stop_scan_thread();
+	else if (strncmp(buf, "scan=", 5) == 0) {
+		unsigned long secs;
+		int err;
+
+		err = strict_strtoul(buf + 5, 0, &secs);
+		if (err < 0)
+			return err;
+		stop_scan_thread();
+		if (secs) {
+			jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+			start_scan_thread();
+		}
+	} else
+		return -EINVAL;
+
+	/* ignore the rest of the buffer, only one command at a time */
+	*ppos += size;
+	return size;
+}
+
+static const struct file_operations kmemleak_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kmemleak_open,
+	.read		= seq_read,
+	.write		= kmemleak_write,
+	.llseek		= seq_lseek,
+	.release	= kmemleak_release,
+};
+
+/*
+ * Perform the freeing of the kmemleak internal objects after waiting for any
+ * current memory scan to complete.
+ */
+static int kmemleak_cleanup_thread(void *arg)
+{
+	struct kmemleak_object *object;
+
+	mutex_lock(&kmemleak_mutex);
+	stop_scan_thread();
+	mutex_unlock(&kmemleak_mutex);
+
+	mutex_lock(&scan_mutex);
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list)
+		delete_object(object->pointer);
+	rcu_read_unlock();
+	mutex_unlock(&scan_mutex);
+
+	return 0;
+}
+
+/*
+ * Start the clean-up thread.
+ */
+static void kmemleak_cleanup(void)
+{
+	struct task_struct *cleanup_thread;
+
+	cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
+				     "kmemleak-clean");
+	if (IS_ERR(cleanup_thread))
+		pr_warning("kmemleak: Failed to create the clean-up thread\n");
+}
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically check whether it was already invoked */
+	if (atomic_cmpxchg(&kmemleak_error, 0, 1))
+		return;
+
+	/* stop any memory operation tracing */
+	atomic_set(&kmemleak_early_log, 0);
+	atomic_set(&kmemleak_enabled, 0);
+
+	/* check whether it is too early for a kernel thread */
+	if (atomic_read(&kmemleak_initialized))
+		kmemleak_cleanup();
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ */
+static int kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (strcmp(str, "off") == 0)
+		kmemleak_disable();
+	else if (strcmp(str, "on") != 0)
+		return -EINVAL;
+	return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Kkmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+	int i;
+	unsigned long flags;
+
+	jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
+	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+	INIT_PRIO_TREE_ROOT(&object_tree_root);
+
+	/* the kernel is still in UP mode, so disabling the IRQs is enough */
+	local_irq_save(flags);
+	if (!atomic_read(&kmemleak_error)) {
+		atomic_set(&kmemleak_enabled, 1);
+		atomic_set(&kmemleak_early_log, 0);
+	}
+	local_irq_restore(flags);
+
+	/*
+	 * This is the point where tracking allocations is safe. Automatic
+	 * scanning is started during the late initcall. Add the early logged
+	 * callbacks to the kmemleak infrastructure.
+	 */
+	for (i = 0; i < crt_early_log; i++) {
+		struct early_log *log = &early_log[i];
+
+		switch (log->op_type) {
+		case KMEMLEAK_ALLOC:
+			kmemleak_alloc(log->ptr, log->size, log->min_count,
+				       GFP_KERNEL);
+			break;
+		case KMEMLEAK_FREE:
+			kmemleak_free(log->ptr);
+			break;
+		case KMEMLEAK_NOT_LEAK:
+			kmemleak_not_leak(log->ptr);
+			break;
+		case KMEMLEAK_IGNORE:
+			kmemleak_ignore(log->ptr);
+			break;
+		case KMEMLEAK_SCAN_AREA:
+			kmemleak_scan_area(log->ptr, log->offset, log->length,
+					   GFP_KERNEL);
+			break;
+		case KMEMLEAK_NO_SCAN:
+			kmemleak_no_scan(log->ptr);
+			break;
+		default:
+			WARN_ON(1);
+		}
+	}
+}
+
+/*
+ * Late initialization function.
+ */
+static int __init kmemleak_late_init(void)
+{
+	struct dentry *dentry;
+
+	atomic_set(&kmemleak_initialized, 1);
+
+	if (atomic_read(&kmemleak_error)) {
+		/*
+		 * Some error occured and kmemleak was disabled. There is a
+		 * small chance that kmemleak_disable() was called immediately
+		 * after setting kmemleak_initialized and we may end up with
+		 * two clean-up threads but serialized by scan_mutex.
+		 */
+		kmemleak_cleanup();
+		return -ENOMEM;
+	}
+
+	dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+				     &kmemleak_fops);
+	if (!dentry)
+		pr_warning("kmemleak: Failed to create the debugfs kmemleak "
+			   "file\n");
+	mutex_lock(&kmemleak_mutex);
+	start_scan_thread();
+	mutex_unlock(&kmemleak_mutex);
+
+	pr_info("Kernel memory leak detector initialized\n");
+
+	return 0;
+}
+late_initcall(kmemleak_late_init);
-- 
cgit v0.10.2


From 04f70336c80c43a15e617b36c2043dfa0ad6ed0f Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:39 +0100
Subject: kmemleak: Add documentation on the memory leak detector

This patch adds the Documentation/kmemleak.txt file with some
information about how kmemleak works.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4a3c220..04a44cc 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1077,6 +1077,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			Configure the RouterBoard 532 series on-chip
 			Ethernet adapter MAC address.
 
+	kmemleak=	[KNL] Boot-time kmemleak enable/disable
+			Valid arguments: on, off
+			Default: on
+
 	kstack=N	[X86] Print N words from the kernel stack
 			in oops dumps.
 
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
new file mode 100644
index 0000000..0112da3
--- /dev/null
+++ b/Documentation/kmemleak.txt
@@ -0,0 +1,142 @@
+Kernel Memory Leak Detector
+===========================
+
+Introduction
+------------
+
+Kmemleak provides a way of detecting possible kernel memory leaks in a
+way similar to a tracing garbage collector
+(http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
+with the difference that the orphan objects are not freed but only
+reported via /sys/kernel/debug/kmemleak. A similar method is used by the
+Valgrind tool (memcheck --leak-check) to detect the memory leaks in
+user-space applications.
+
+Usage
+-----
+
+CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
+thread scans the memory every 10 minutes (by default) and prints any new
+unreferenced objects found. To trigger an intermediate scan and display
+all the possible memory leaks:
+
+  # mount -t debugfs nodev /sys/kernel/debug/
+  # cat /sys/kernel/debug/kmemleak
+
+Note that the orphan objects are listed in the order they were allocated
+and one object at the beginning of the list may cause other subsequent
+objects to be reported as orphan.
+
+Memory scanning parameters can be modified at run-time by writing to the
+/sys/kernel/debug/kmemleak file. The following parameters are supported:
+
+  off		- disable kmemleak (irreversible)
+  stack=on	- enable the task stacks scanning
+  stack=off	- disable the tasks stacks scanning
+  scan=on	- start the automatic memory scanning thread
+  scan=off	- stop the automatic memory scanning thread
+  scan=<secs>	- set the automatic memory scanning period in seconds (0
+		  to disable it)
+
+Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
+the kernel command line.
+
+Basic Algorithm
+---------------
+
+The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and
+friends are traced and the pointers, together with additional
+information like size and stack trace, are stored in a prio search tree.
+The corresponding freeing function calls are tracked and the pointers
+removed from the kmemleak data structures.
+
+An allocated block of memory is considered orphan if no pointer to its
+start address or to any location inside the block can be found by
+scanning the memory (including saved registers). This means that there
+might be no way for the kernel to pass the address of the allocated
+block to a freeing function and therefore the block is considered a
+memory leak.
+
+The scanning algorithm steps:
+
+  1. mark all objects as white (remaining white objects will later be
+     considered orphan)
+  2. scan the memory starting with the data section and stacks, checking
+     the values against the addresses stored in the prio search tree. If
+     a pointer to a white object is found, the object is added to the
+     gray list
+  3. scan the gray objects for matching addresses (some white objects
+     can become gray and added at the end of the gray list) until the
+     gray set is finished
+  4. the remaining white objects are considered orphan and reported via
+     /sys/kernel/debug/kmemleak
+
+Some allocated memory blocks have pointers stored in the kernel's
+internal data structures and they cannot be detected as orphans. To
+avoid this, kmemleak can also store the number of values pointing to an
+address inside the block address range that need to be found so that the
+block is not considered a leak. One example is __vmalloc().
+
+Kmemleak API
+------------
+
+See the include/linux/kmemleak.h header for the functions prototype.
+
+kmemleak_init		 - initialize kmemleak
+kmemleak_alloc		 - notify of a memory block allocation
+kmemleak_free		 - notify of a memory block freeing
+kmemleak_not_leak	 - mark an object as not a leak
+kmemleak_ignore		 - do not scan or report an object as leak
+kmemleak_scan_area	 - add scan areas inside a memory block
+kmemleak_no_scan	 - do not scan a memory block
+kmemleak_erase		 - erase an old value in a pointer variable
+kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness
+kmemleak_free_recursive	 - as kmemleak_free but checks the recursiveness
+
+Dealing with false positives/negatives
+--------------------------------------
+
+The false negatives are real memory leaks (orphan objects) but not
+reported by kmemleak because values found during the memory scanning
+point to such objects. To reduce the number of false negatives, kmemleak
+provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and
+kmemleak_erase functions (see above). The task stacks also increase the
+amount of false negatives and their scanning is not enabled by default.
+
+The false positives are objects wrongly reported as being memory leaks
+(orphan). For objects known not to be leaks, kmemleak provides the
+kmemleak_not_leak function. The kmemleak_ignore could also be used if
+the memory block is known not to contain other pointers and it will no
+longer be scanned.
+
+Some of the reported leaks are only transient, especially on SMP
+systems, because of pointers temporarily stored in CPU registers or
+stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing
+the minimum age of an object to be reported as a memory leak.
+
+Limitations and Drawbacks
+-------------------------
+
+The main drawback is the reduced performance of memory allocation and
+freeing. To avoid other penalties, the memory scanning is only performed
+when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is
+intended for debugging purposes where the performance might not be the
+most important requirement.
+
+To keep the algorithm simple, kmemleak scans for values pointing to any
+address inside a block's address range. This may lead to an increased
+number of false negatives. However, it is likely that a real memory leak
+will eventually become visible.
+
+Another source of false negatives is the data stored in non-pointer
+values. In a future version, kmemleak could only scan the pointer
+members in the allocated structures. This feature would solve many of
+the false negative cases described above.
+
+The tool can report false positives. These are cases where an allocated
+block doesn't need to be freed (some cases in the init_call functions),
+the pointer is calculated by other methods than the usual container_of
+macro or the pointer is stored in a location not scanned by kmemleak.
+
+Page allocations and ioremap are not tracked. Only the ARM and x86
+architectures are currently supported.
-- 
cgit v0.10.2


From d5cff635290aec9ad7e6ee546aa4fae895361cbb Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:40 +0100
Subject: kmemleak: Add the slab memory allocation/freeing hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from
the slab allocator. The patch also adds the SLAB_NOLEAKTRACE flag to
avoid recursive calls to kmemleak when it allocates its own data
structures.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 24c5602..4880306 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -62,6 +62,8 @@
 # define SLAB_DEBUG_OBJECTS	0x00000000UL
 #endif
 
+#define SLAB_NOLEAKTRACE	0x00800000UL	/* Avoid kmemleak tracing */
+
 /* The following flags affect the page allocator grouping pages by mobility */
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL		/* Objects are reclaimable */
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
diff --git a/mm/slab.c b/mm/slab.c
index f85831d..859067f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,6 +107,7 @@
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
 #include	<linux/nodemask.h>
+#include	<linux/kmemleak.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
 #include	<linux/fault-inject.h>
@@ -178,13 +179,13 @@
 			 SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
-			 SLAB_DEBUG_OBJECTS)
+			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
-			 SLAB_DEBUG_OBJECTS)
+			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
 #endif
 
 /*
@@ -964,6 +965,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 	struct array_cache *nc = NULL;
 
 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	/*
+	 * The array_cache structures contain pointers to free object.
+	 * However, when such objects are allocated or transfered to another
+	 * cache the pointers are not cleared and they could be counted as
+	 * valid references during a kmemleak scan. Therefore, kmemleak must
+	 * not scan such objects.
+	 */
+	kmemleak_no_scan(nc);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -2621,6 +2630,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
 					      local_flags, nodeid);
+		/*
+		 * If the first object in the slab is leaked (it's allocated
+		 * but no one has a reference to it), we want to make sure
+		 * kmemleak does not treat the ->s_mem pointer as a reference
+		 * to the object. Otherwise we will not report the leak.
+		 */
+		kmemleak_scan_area(slabp, offsetof(struct slab, list),
+				   sizeof(struct list_head), local_flags);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -3141,6 +3158,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 		STATS_INC_ALLOCMISS(cachep);
 		objp = cache_alloc_refill(cachep, flags);
 	}
+	/*
+	 * To avoid a false negative, if an object that is in one of the
+	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
+	 * treat the array pointers as a reference to the object.
+	 */
+	kmemleak_erase(&ac->entry[ac->avail]);
 	return objp;
 }
 
@@ -3360,6 +3383,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
   out:
 	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
+				 flags);
 
 	if (unlikely((flags & __GFP_ZERO) && ptr))
 		memset(ptr, 0, obj_size(cachep));
@@ -3415,6 +3440,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	objp = __do_cache_alloc(cachep, flags);
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
+	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
+				 flags);
 	prefetchw(objp);
 
 	if (unlikely((flags & __GFP_ZERO) && objp))
@@ -3530,6 +3557,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
+	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
 	/*
-- 
cgit v0.10.2


From 4374e616d28e65265a5b433ceece275449f3d2e3 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:17 +0100
Subject: kmemleak: Add the slob memory allocation/freeing hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from the
slob allocator.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/slob.c b/mm/slob.c
index 9b1737b..12f2614 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -67,6 +67,7 @@
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <linux/kmemtrace.h>
+#include <linux/kmemleak.h>
 #include <asm/atomic.h>
 
 /*
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 				   size, PAGE_SIZE << order, gfp, node);
 	}
 
+	kmemleak_alloc(ret, size, 1, gfp);
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
@@ -521,6 +523,7 @@ void kfree(const void *block)
 
 	if (unlikely(ZERO_OR_NULL_PTR(block)))
 		return;
+	kmemleak_free(block);
 
 	sp = slob_page(block);
 	if (is_slob_page(sp)) {
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 	} else if (flags & SLAB_PANIC)
 		panic("Cannot create slab cache %s\n", name);
 
+	kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
 	return c;
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
 void kmem_cache_destroy(struct kmem_cache *c)
 {
+	kmemleak_free(c);
 	slob_free(c, sizeof(struct kmem_cache));
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 	if (c->ctor)
 		c->ctor(b);
 
+	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
 	return b;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head)
 
 void kmem_cache_free(struct kmem_cache *c, void *b)
 {
+	kmemleak_free_recursive(b, c->flags);
 	if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
 		struct slob_rcu *slob_rcu;
 		slob_rcu = b + (c->size - sizeof(struct slob_rcu));
-- 
cgit v0.10.2


From 06f22f13f3cc2eff00db09f053218e5d4b757bc8 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:18 +0100
Subject: kmemleak: Add the slub memory allocation/freeing hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from the
slub allocator.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6..6674a79 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/kmemleak.h>
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/debugobjects.h>
@@ -143,7 +144,7 @@
  * Set of flags that will prevent slab merging
  */
 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-		SLAB_TRACE | SLAB_DESTROY_BY_RCU)
+		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
 
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 		SLAB_CACHE_DMA)
@@ -1617,6 +1618,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, objsize);
 
+	kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
 	return object;
 }
 
@@ -1746,6 +1748,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
 	struct kmem_cache_cpu *c;
 	unsigned long flags;
 
+	kmemleak_free_recursive(x, s->flags);
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
 	debug_check_no_locks_freed(object, c->objsize);
-- 
cgit v0.10.2


From 89219d37a2377c44fde7bff0bf0623453c05329a Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:19 +0100
Subject: kmemleak: Add the vmalloc memory allocation/freeing hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from
vmalloc/vfree.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..b7db935 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,6 +25,7 @@
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
+#include <linux/kmemleak.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -1327,6 +1328,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
 void vfree(const void *addr)
 {
 	BUG_ON(in_interrupt());
+
+	kmemleak_free(addr);
+
 	__vunmap(addr, 1);
 }
 EXPORT_SYMBOL(vfree);
@@ -1439,8 +1443,17 @@ fail:
 
 void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 {
-	return __vmalloc_area_node(area, gfp_mask, prot, -1,
-					__builtin_return_address(0));
+	void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
+					 __builtin_return_address(0));
+
+	/*
+	 * A ref_count = 3 is needed because the vm_struct and vmap_area
+	 * structures allocated in the __get_vm_area_node() function contain
+	 * references to the virtual address of the vmalloc'ed block.
+	 */
+	kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
+
+	return addr;
 }
 
 /**
@@ -1459,6 +1472,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 						int node, void *caller)
 {
 	struct vm_struct *area;
+	void *addr;
+	unsigned long real_size = size;
 
 	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > num_physpages)
@@ -1470,7 +1485,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 	if (!area)
 		return NULL;
 
-	return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+
+	/*
+	 * A ref_count = 3 is needed because the vm_struct and vmap_area
+	 * structures allocated in the __get_vm_area_node() function contain
+	 * references to the virtual address of the vmalloc'ed block.
+	 */
+	kmemleak_alloc(addr, real_size, 3, gfp_mask);
+
+	return addr;
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
-- 
cgit v0.10.2


From dbb1f81ca67a56c6cfce4c94d07c76378fd4af9e Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:19 +0100
Subject: kmemleak: Add kmemleak_alloc callback from alloc_large_system_hash

The alloc_large_system_hash function is called from various places in
the kernel and it contains pointers to other allocated structures. It
therefore needs to be traced by kmemleak.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 474c7e9..17d5f53 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,6 +46,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <linux/kmemleak.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4546,6 +4547,16 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (_hash_mask)
 		*_hash_mask = (1 << log2qty) - 1;
 
+	/*
+	 * If hashdist is set, the table allocation is done with __vmalloc()
+	 * which invokes the kmemleak_alloc() callback. This function may also
+	 * be called before the slab and kmemleak are initialised when
+	 * kmemleak simply buffers the request to be executed later
+	 * (GFP_ATOMIC flag ignored in this case).
+	 */
+	if (!hashdist)
+		kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+
 	return table;
 }
 
-- 
cgit v0.10.2


From 4f2294b6dc88d99295230d97fef2c9863cec44c3 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:20 +0100
Subject: kmemleak: Add modules support

This patch handles the kmemleak operations needed for modules loading so
that memory allocations from inside a module are properly tracked.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/kernel/module.c b/kernel/module.c
index 2383e60..5cd55ab 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,6 +53,7 @@
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/percpu.h>
+#include <linux/kmemleak.h>
 
 #if 0
 #define DEBUGP printk
@@ -430,6 +431,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 	unsigned long extra;
 	unsigned int i;
 	void *ptr;
+	int cpu;
 
 	if (align > PAGE_SIZE) {
 		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -459,6 +461,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 			if (!split_block(i, size))
 				return NULL;
 
+		/* add the per-cpu scanning areas */
+		for_each_possible_cpu(cpu)
+			kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
+				       GFP_KERNEL);
+
 		/* Mark allocated */
 		pcpu_size[i] = -pcpu_size[i];
 		return ptr;
@@ -473,6 +480,7 @@ static void percpu_modfree(void *freeme)
 {
 	unsigned int i;
 	void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+	int cpu;
 
 	/* First entry is core kernel percpu data. */
 	for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -484,6 +492,10 @@ static void percpu_modfree(void *freeme)
 	BUG();
 
  free:
+	/* remove the per-cpu scanning areas */
+	for_each_possible_cpu(cpu)
+		kmemleak_free(freeme + per_cpu_offset(cpu));
+
 	/* Merge with previous? */
 	if (pcpu_size[i-1] >= 0) {
 		pcpu_size[i-1] += pcpu_size[i];
@@ -1876,6 +1888,36 @@ static void *module_alloc_update_bounds(unsigned long size)
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs, char *secstrings)
+{
+	unsigned int i;
+
+	/* only scan the sections containing data */
+	kmemleak_scan_area(mod->module_core, (unsigned long)mod -
+			   (unsigned long)mod->module_core,
+			   sizeof(struct module), GFP_KERNEL);
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+		if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
+		    && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+			continue;
+
+		kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
+				   (unsigned long)mod->module_core,
+				   sechdrs[i].sh_size, GFP_KERNEL);
+	}
+}
+#else
+static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+					Elf_Shdr *sechdrs, char *secstrings)
+{
+}
+#endif
+
 /* Allocate and load the module: note that size of section 0 is always
    zero, and we rely on this for optional sections. */
 static noinline struct module *load_module(void __user *umod,
@@ -2046,6 +2088,12 @@ static noinline struct module *load_module(void __user *umod,
 
 	/* Do the allocs. */
 	ptr = module_alloc_update_bounds(mod->core_size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. Just mark it as not being a
+	 * leak.
+	 */
+	kmemleak_not_leak(ptr);
 	if (!ptr) {
 		err = -ENOMEM;
 		goto free_percpu;
@@ -2054,6 +2102,13 @@ static noinline struct module *load_module(void __user *umod,
 	mod->module_core = ptr;
 
 	ptr = module_alloc_update_bounds(mod->init_size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. This block doesn't need to be
+	 * scanned as it contains data and code that will be freed
+	 * after the module is initialized.
+	 */
+	kmemleak_ignore(ptr);
 	if (!ptr && mod->init_size) {
 		err = -ENOMEM;
 		goto free_core;
@@ -2084,6 +2139,7 @@ static noinline struct module *load_module(void __user *umod,
 	}
 	/* Module has been moved. */
 	mod = (void *)sechdrs[modindex].sh_addr;
+	kmemleak_load_module(mod, hdr, sechdrs, secstrings);
 
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
 	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
-- 
cgit v0.10.2


From 2e1483c995bbd0fa6cbd055ad76088a520799ba4 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:13 +0100
Subject: kmemleak: Remove some of the kmemleak false positives

There are allocations for which the main pointer cannot be found but
they are not memory leaks. This patch fixes some of them. For more
information on false positives, see Documentation/kmemleak.txt.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4..961c1a7 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -104,6 +104,7 @@
 #include <linux/io.h>
 #include <asm/system.h>
 #include <linux/uaccess.h>
+#include <linux/kmemleak.h>
 
 #define MAX_NR_CON_DRIVER 16
 
@@ -2880,6 +2881,12 @@ static int __init con_init(void)
 	 */
 	for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
 		vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+		/*
+		 * Kmemleak does not track the memory allocated via
+		 * alloc_bootmem() but this block contains pointers to
+		 * other blocks allocated via kmalloc.
+		 */
+		kmemleak_alloc(vc, sizeof(struct vc_data), 1, GFP_ATOMIC);
 		INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
 		visual_init(vc, currcons, 1);
 		vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc1..d250f80 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
+#include <linux/kmemleak.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -492,6 +493,11 @@ void __init bdev_cache_init(void)
 	bd_mnt = kern_mount(&bd_type);
 	if (IS_ERR(bd_mnt))
 		panic("Cannot create bdev pseudo-fs");
+	/*
+	 * This vfsmount structure is only used to obtain the
+	 * blockdev_superblock, so tell kmemleak not to report it.
+	 */
+	kmemleak_not_leak(bd_mnt);
 	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
 }
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1581ff2..26fd9d1 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -86,7 +86,12 @@ struct percpu_data {
 	void *ptrs[1];
 };
 
+/* pointer disguising messes up the kmemleak objects tracking */
+#ifndef CONFIG_DEBUG_KMEMLEAK
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+#else
+#define __percpu_disguise(pdata) (struct percpu_data *)(pdata)
+#endif
 
 #define per_cpu_ptr(ptr, cpu)						\
 ({									\
-- 
cgit v0.10.2


From 3bba00d7bdd57cb7aa739b751fa0a1fbbb04dc18 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:13 +0100
Subject: kmemleak: Enable the building of the memory leak detector

This patch adds the Kconfig.debug and Makefile entries needed for
building kmemleak into the kernel.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6cdcf38..5cc3506 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -336,6 +336,28 @@ config SLUB_STATS
 	  out which slabs are relevant to a particular load.
 	  Try running: slabinfo -DA
 
+config DEBUG_KMEMLEAK
+	bool "Kernel memory leak detector"
+	depends on DEBUG_KERNEL && EXPERIMENTAL && (X86 || ARM) && \
+		!MEMORY_HOTPLUG
+	select DEBUG_SLAB if SLAB
+	select SLUB_DEBUG if SLUB
+	select DEBUG_FS if SYSFS
+	select STACKTRACE if STACKTRACE_SUPPORT
+	select KALLSYMS
+	help
+	  Say Y here if you want to enable the memory leak
+	  detector. The memory allocation/freeing is traced in a way
+	  similar to the Boehm's conservative garbage collector, the
+	  difference being that the orphan objects are not freed but
+	  only shown in /sys/kernel/debug/kmemleak. Enabling this
+	  feature will introduce an overhead to memory
+	  allocations. See Documentation/kmemleak.txt for more
+	  details.
+
+	  In order to access the kmemleak file, debugfs needs to be
+	  mounted (usually at /sys/kernel/debug).
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68..cb84a02 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,3 +38,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
-- 
cgit v0.10.2


From 0822ee4ac1ae6af5a953f97f75553738834b10b9 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:14 +0100
Subject: kmemleak: Simple testing module for kmemleak

This patch adds a loadable module that deliberately leaks memory. It
is used for testing various memory leaking scenarios.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5cc3506..116a350 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -358,6 +358,16 @@ config DEBUG_KMEMLEAK
 	  In order to access the kmemleak file, debugfs needs to be
 	  mounted (usually at /sys/kernel/debug).
 
+config DEBUG_KMEMLEAK_TEST
+	tristate "Simple test for the kernel memory leak detector"
+	depends on DEBUG_KMEMLEAK
+	help
+	  Say Y or M here to build a test for the kernel memory leak
+	  detector. This option enables a module that explicitly leaks
+	  memory.
+
+	  If unsure, say N.
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
diff --git a/mm/Makefile b/mm/Makefile
index cb84a02..e89acb0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,3 +39,4 @@ endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 0000000..d5292fc
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
+/*
+ * mm/kmemleak-test.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/fdtable.h>
+
+#include <linux/kmemleak.h>
+
+struct test_node {
+	long header[25];
+	struct list_head list;
+	long footer[25];
+};
+
+static LIST_HEAD(test_list);
+static DEFINE_PER_CPU(void *, test_pointer);
+
+/*
+ * Some very simple testing. This function needs to be extended for
+ * proper testing.
+ */
+static int __init kmemleak_test_init(void)
+{
+	struct test_node *elem;
+	int i;
+
+	printk(KERN_INFO "Kmemleak testing\n");
+
+	/* make some orphan objects */
+	pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+#ifndef CONFIG_MODULES
+	pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+		kmem_cache_alloc(files_cachep, GFP_KERNEL));
+	pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+		kmem_cache_alloc(files_cachep, GFP_KERNEL));
+#endif
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+
+	/*
+	 * Add elements to a list. They should only appear as orphan
+	 * after the module is removed.
+	 */
+	for (i = 0; i < 10; i++) {
+		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		if (!elem)
+			return -ENOMEM;
+		memset(elem, 0, sizeof(*elem));
+		INIT_LIST_HEAD(&elem->list);
+
+		list_add_tail(&elem->list, &test_list);
+	}
+
+	for_each_possible_cpu(i) {
+		per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
+		pr_info("kmemleak: kmalloc(129) = %p\n",
+			per_cpu(test_pointer, i));
+	}
+
+	return 0;
+}
+module_init(kmemleak_test_init);
+
+static void __exit kmemleak_test_exit(void)
+{
+	struct test_node *elem, *tmp;
+
+	/*
+	 * Remove the list elements without actually freeing the
+	 * memory.
+	 */
+	list_for_each_entry_safe(elem, tmp, &test_list, list)
+		list_del(&elem->list);
+}
+module_exit(kmemleak_test_exit);
+
+MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 3aa27bbe7a6536d1ec859d3a97caf3319b5081b7 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:14 +0100
Subject: kmemleak: Add the corresponding MAINTAINERS entry

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abdd..e5af306 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3359,6 +3359,12 @@ F:	Documentation/trace/kmemtrace.txt
 F:	include/trace/kmemtrace.h
 F:	kernel/trace/kmemtrace.c
 
+KMEMLEAK
+P:	Catalin Marinas
+M:	catalin.marinas@arm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 KPROBES
 P:	Ananth N Mavinakayanahalli
 M:	ananth@in.ibm.com
-- 
cgit v0.10.2


From 441c7e0a2ed38827b48b907bd1fa29faba2017a3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 20:05:53 +0300
Subject: bootmem: use slab if bootmem is no longer available

As a preparation for initializing the slab allocator early, make sure the
bootmem allocator does not crash and burn if someone calls it after slab is up;
otherwise we'd need a flag day for switching to early slab.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf9271..457269c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
 #ifdef CONFIG_HAVE_ARCH_BOOTMEM
 	bootmem_data_t *p_bdata;
 
-- 
cgit v0.10.2


From c91c4773b334d4d3a6d44626dc2a558ad97b86f3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:10:28 +0300
Subject: bootmem: fix slab fallback on numa

If the user requested bootmem allocation on a specific node, we should use
kzalloc_node() for the fallback allocation.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 457269c..282df0a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -665,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
 }
 
@@ -696,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
@@ -748,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	return ___alloc_bootmem_node(pgdat->bdata, size, align,
 				goal, ARCH_LOW_ADDRESS_LIMIT);
 }
-- 
cgit v0.10.2


From 83b519e8b9572c319c8e0c615ee5dd7272856090 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 19:40:04 +0300
Subject: slab: setup allocators earlier in the boot sequence

This patch makes kmalloc() available earlier in the boot sequence so we can get
rid of some bootmem allocations. The bulk of the changes are due to
kmem_cache_init() being called with interrupts disabled which requires some
changes to allocator boostrap code.

Note: 32-bit x86 does WP protect test in mem_init() so we must setup traps
before we call mem_init() during boot as reported by Ingo Molnar:

  We have a hard crash in the WP-protect code:

  [    0.000000] Checking if this processor honours the WP bit even in supervisor mode...BUG: Int 14: CR2 ffcff000
  [    0.000000]      EDI 00000188  ESI 00000ac7  EBP c17eaf9c  ESP c17eaf8c
  [    0.000000]      EBX 000014e0  EDX 0000000e  ECX 01856067  EAX 00000001
  [    0.000000]      err 00000003  EIP c10135b1   CS 00000060  flg 00010002
  [    0.000000] Stack: c17eafa8 c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc
  [    0.000000]        00099800 c17bb000 c17eafec c17f1668 000001c5 c17f1322 c166e039 c1822bf0
  [    0.000000]        c166e033 c153a014 c18237cc 00020800 c17eaff8 c17f106a 00020800 01ba5003
  [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
  [    0.000000] Call Trace:
  [    0.000000]  [<c15357c2>] ? printk+0x14/0x16
  [    0.000000]  [<c10135b1>] ? do_test_wp_bit+0x19/0x23
  [    0.000000]  [<c17fd410>] ? test_wp_bit+0x26/0x64
  [    0.000000]  [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
  [    0.000000]  [<c17f1668>] ? start_kernel+0x164/0x2f7
  [    0.000000]  [<c17f1322>] ? unknown_bootoption+0x0/0x19c
  [    0.000000]  [<c17f106a>] ? __init_begin+0x6a/0x6f

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/init/main.c b/init/main.c
index bb7dc57..0ab82a4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -574,6 +574,28 @@ asmlinkage void __init start_kernel(void)
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	/*
+	 * These use large bootmem allocations and must precede
+	 * kmem_cache_init()
+	 */
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	sort_main_extable();
+	trap_init();
+	/*
+	 * Set up kernel memory allocators
+	 */
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -585,25 +607,15 @@ asmlinkage void __init start_kernel(void)
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
 		local_irq_disable();
 	}
-	sort_main_extable();
-	trap_init();
 	rcu_init();
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/mm/slab.c b/mm/slab.c
index f85831d..2bd611f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -315,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
 /*
@@ -958,12 +958,12 @@ static void __cpuinit start_cpu_timer(int cpu)
 }
 
 static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount)
+					    int batchcount, gfp_t gfp)
 {
 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	nc = kmalloc_node(memsize, gfp, node);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -1003,7 +1003,7 @@ static int transfer_objects(struct array_cache *to,
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	return (struct array_cache **)BAD_ALIEN_MAGIC;
 }
@@ -1034,7 +1034,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	struct array_cache **ac_ptr;
 	int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1042,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
 
 	if (limit > 1)
 		limit = 12;
-	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+	ac_ptr = kmalloc_node(memsize, gfp, node);
 	if (ac_ptr) {
 		for_each_node(i) {
 			if (i == node || !node_online(i)) {
 				ac_ptr[i] = NULL;
 				continue;
 			}
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 			if (!ac_ptr[i]) {
 				for (i--; i >= 0; i--)
 					kfree(ac_ptr[i]);
@@ -1282,20 +1282,20 @@ static int __cpuinit cpuup_prepare(long cpu)
 		struct array_cache **alien = NULL;
 
 		nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+					cachep->batchcount, GFP_KERNEL);
 		if (!nc)
 			goto bad;
 		if (cachep->shared) {
 			shared = alloc_arraycache(node,
 				cachep->shared * cachep->batchcount,
-				0xbaadf00d);
+				0xbaadf00d, GFP_KERNEL);
 			if (!shared) {
 				kfree(nc);
 				goto bad;
 			}
 		}
 		if (use_alien_caches) {
-			alien = alloc_alien_cache(node, cachep->limit);
+			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
 			if (!alien) {
 				kfree(shared);
 				kfree(nc);
@@ -1399,10 +1399,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 {
 	struct kmem_list3 *ptr;
 
-	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
 	BUG_ON(!ptr);
 
-	local_irq_disable();
 	memcpy(ptr, list, sizeof(struct kmem_list3));
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1410,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
 }
 
 /*
@@ -1575,9 +1573,8 @@ void __init kmem_cache_init(void)
 	{
 		struct array_cache *ptr;
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, cpu_cache_get(&cache_cache),
 		       sizeof(struct arraycache_init));
@@ -1587,11 +1584,9 @@ void __init kmem_cache_init(void)
 		spin_lock_init(&ptr->lock);
 
 		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
 		       != &initarray_generic.cache);
 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1598,6 @@ void __init kmem_cache_init(void)
 
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
 		    ptr;
-		local_irq_enable();
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
@@ -1627,7 +1621,7 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			if (enable_cpucache(cachep))
+			if (enable_cpucache(cachep, GFP_NOWAIT))
 				BUG();
 		mutex_unlock(&cache_chain_mutex);
 	}
@@ -2064,10 +2058,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (g_cpucache_up == FULL)
-		return enable_cpucache(cachep);
+		return enable_cpucache(cachep, gfp);
 
 	if (g_cpucache_up == NONE) {
 		/*
@@ -2089,7 +2083,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 			g_cpucache_up = PARTIAL_AC;
 	} else {
 		cachep->array[smp_processor_id()] =
-			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+			kmalloc(sizeof(struct arraycache_init), gfp);
 
 		if (g_cpucache_up == PARTIAL_AC) {
 			set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2147,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
+	gfp_t gfp;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2163,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * cpu_online_mask as well.  Please see cpuup_callback
 	 */
-	get_online_cpus();
-	mutex_lock(&cache_chain_mutex);
+	if (slab_is_available()) {
+		get_online_cpus();
+		mutex_lock(&cache_chain_mutex);
+	}
 
 	list_for_each_entry(pc, &cache_chain, next) {
 		char tmp;
@@ -2278,8 +2275,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	align = ralign;
 
+	if (slab_is_available())
+		gfp = GFP_KERNEL;
+	else
+		gfp = GFP_NOWAIT;
+
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, gfp);
 	if (!cachep)
 		goto oops;
 
@@ -2382,7 +2384,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->ctor = ctor;
 	cachep->name = name;
 
-	if (setup_cpu_cache(cachep)) {
+	if (setup_cpu_cache(cachep, gfp)) {
 		__kmem_cache_destroy(cachep);
 		cachep = NULL;
 		goto oops;
@@ -2394,8 +2396,10 @@ oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
-	mutex_unlock(&cache_chain_mutex);
-	put_online_cpus();
+	if (slab_is_available()) {
+		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
+	}
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -3802,7 +3806,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
 /*
  * This initializes kmem_list3 or resizes various caches for all nodes.
  */
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int node;
 	struct kmem_list3 *l3;
@@ -3812,7 +3816,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 
                 if (use_alien_caches) {
-                        new_alien = alloc_alien_cache(node, cachep->limit);
+                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
                         if (!new_alien)
                                 goto fail;
                 }
@@ -3821,7 +3825,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		if (cachep->shared) {
 			new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
-					0xbaadf00d);
+					0xbaadf00d, gfp);
 			if (!new_shared) {
 				free_alien_cache(new_alien);
 				goto fail;
@@ -3850,7 +3854,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			free_alien_cache(new_alien);
 			continue;
 		}
-		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
 		if (!l3) {
 			free_alien_cache(new_alien);
 			kfree(new_shared);
@@ -3906,18 +3910,18 @@ static void do_ccupdate_local(void *info)
 
 /* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-				int batchcount, int shared)
+				int batchcount, int shared, gfp_t gfp)
 {
 	struct ccupdate_struct *new;
 	int i;
 
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), gfp);
 	if (!new)
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
-						batchcount);
+						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new->new[i]);
@@ -3944,11 +3948,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		kfree(ccold);
 	}
 	kfree(new);
-	return alloc_kmemlist(cachep);
+	return alloc_kmemlist(cachep, gfp);
 }
 
 /* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int err;
 	int limit, shared;
@@ -3994,7 +3998,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 		       cachep->name, -err);
@@ -4300,7 +4304,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-						       batchcount, shared);
+						       batchcount, shared,
+						       GFP_KERNEL);
 			}
 			break;
 		}
diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6..c1815a6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
-- 
cgit v0.10.2


From 43ebdac42f16037263b52a5aeedcd1bfa4a9bb29 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 May 2009 15:01:35 +0300
Subject: vmalloc: use kzalloc() instead of alloc_bootmem()

We can call vmalloc_init() after kmem_cache_init() and use kzalloc() instead of
the bootmem allocator when initializing vmalloc data structures.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/init/main.c b/init/main.c
index 0ab82a4..6d38f96 100644
--- a/init/main.c
+++ b/init/main.c
@@ -587,7 +587,6 @@ asmlinkage void __init start_kernel(void)
 	 * kmem_cache_init()
 	 */
 	pidhash_init();
-	vmalloc_init();
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
@@ -596,6 +595,7 @@ asmlinkage void __init start_kernel(void)
 	 */
 	mem_init();
 	kmem_cache_init();
+	vmalloc_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..3235138 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
-#include <linux/bootmem.h>
 #include <linux/pfn.h>
 
 #include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)
 
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
-		va = alloc_bootmem(sizeof(struct vmap_area));
+		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
 		va->flags = tmp->flags | VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
-- 
cgit v0.10.2


From 444f478f65c7ca4606f9965b31feed13fe2bc9fa Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 18:29:06 +0300
Subject: init: introduce mm_init()

As suggested by Christoph Lameter, introduce mm_init() now that we initialize
all the kernel memory allocations together.

Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/init/main.c b/init/main.c
index 6d38f96..7917695 100644
--- a/init/main.c
+++ b/init/main.c
@@ -533,6 +533,16 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+/*
+ * Set up kernel memory allocators
+ */
+static void __init mm_init(void)
+{
+	mem_init();
+	kmem_cache_init();
+	vmalloc_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -590,12 +600,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
-	/*
-	 * Set up kernel memory allocators
-	 */
-	mem_init();
-	kmem_cache_init();
-	vmalloc_init();
+	mm_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
-- 
cgit v0.10.2


From 36b7b6d465489c4754c4fd66fcec6086eba87896 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 23:42:36 +0300
Subject: sched: use kzalloc() instead of the bootmem allocator

Now that kmem_cache_init() happens before sched_init(), we should use kzalloc()
and not the bootmem allocator.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/kernel/sched.c b/kernel/sched.c
index 14c447a..a9ff953 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7782,21 +7781,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
 	if (cpupri_init(&rd->cpupri, false) != 0)
@@ -9123,7 +9119,7 @@ void __init sched_init(void)
 	 * we use alloc_bootmem().
 	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
-- 
cgit v0.10.2


From a5f4f52e82114e85aa1a066bd1a450acc19a464d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 23:53:37 +0300
Subject: vt: use kzalloc() instead of the bootmem allocator

Now that kmem_cache_init() happens before console_init(), we should use
kzalloc() and not the bootmem allocator.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4..c796a86 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
-#include <linux/bootmem.h>
 #include <linux/pm.h>
 #include <linux/font.h>
 #include <linux/bitops.h>
@@ -2875,14 +2874,11 @@ static int __init con_init(void)
 		mod_timer(&console_timer, jiffies + blankinterval);
 	}
 
-	/*
-	 * kmalloc is not running yet - we use the bootmem allocator.
-	 */
 	for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
-		vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+		vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
 		INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
 		visual_init(vc, currcons, 1);
-		vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
+		vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
 		vc->vc_kmalloced = 0;
 		vc_init(vc, vc->vc_rows, vc->vc_cols,
 			currcons || !vc->vc_sw->con_save_screen);
-- 
cgit v0.10.2


From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 25 May 2009 15:10:58 +0300
Subject: x86: remove some alloc_bootmem_cpumask_var calling

Now that we set up the slab allocator earlier, we can get rid of some
alloc_bootmem_cpumask_var() calls in boot code.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac..139201a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -185,8 +185,8 @@ int __init arch_early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_bootmem_cpumask_var(&cfg[i].domain);
-		alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+		alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
+		alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index eedbb8e..1e50c34f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
  * Returns true if successful (or not required).
  */
 static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
-								bool boot)
+							bool boot)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (boot) {
-		alloc_bootmem_cpumask_var(&desc->affinity);
+	gfp_t gfp = GFP_ATOMIC;
 
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-		alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
-		return true;
-	}
+	if (boot)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
 		return false;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
 		free_cpumask_var(desc->affinity);
 		return false;
 	}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026facc..d5a7e17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {
 
 int __init cpuset_init_early(void)
 {
-	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
 
 	top_cpuset.mems_generation = cpuset_mems_generation++;
 	return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e04..28cf26a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
 	buffer_bytes = prof_len*sizeof(atomic_t);
-	if (!slab_is_available()) {
-		prof_buffer = alloc_bootmem(buffer_bytes);
-		alloc_bootmem_cpumask_var(&prof_cpu_mask);
-		cpumask_copy(prof_cpu_mask, cpu_possible_mask);
-		return 0;
-	}
 
 	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 		return -ENOMEM;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index eb23aaa..7bb4142 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
  */
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
-	if (likely(slab_is_available()))
-		*mask = kmalloc_node(cpumask_size(), flags, node);
-	else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-		printk(KERN_ERR
-			"=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
-		*mask = NULL;
-	}
+	*mask = kmalloc_node(cpumask_size(), flags, node);
+
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 	if (!*mask) {
 		printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
-- 
cgit v0.10.2


From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 28 May 2009 18:14:40 -0700
Subject: irq/cpumask: make memoryless node zero happy

Don't hardcode to node zero for early boot IRQ setup memory allocations.

[ penberg@cs.helsinki.fi: minor cleanups ]
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 139201a..94605e7 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
 	struct irq_cfg *cfg;
 	struct irq_desc *desc;
 	int count;
+	int node;
 	int i;
 
 	cfg = irq_cfgx;
 	count = ARRAY_SIZE(irq_cfgx);
+	node= cpu_to_node(boot_cpu_id);
 
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
-		alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
+		alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+		alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a600184..e161999 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -150,6 +150,7 @@ int __init early_irq_init(void)
 {
 	struct irq_desc *desc;
 	int legacy_count;
+	int node;
 	int i;
 
 	init_irq_default_affinity();
@@ -160,20 +161,20 @@ int __init early_irq_init(void)
 
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ 	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
 
 	/* allocate based on nr_cpu_ids */
-	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT, node);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-		alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], node, true);
 		init_desc_masks(&desc[i]);
 		irq_desc_ptrs[i] = desc + i;
 	}
-- 
cgit v0.10.2


From 959982fee4e635c61780e989c3e34267143fcc02 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 28 May 2009 18:15:16 -0700
Subject: memcg: don't use bootmem allocator in setup code

The bootmem allocator is no longer available for page_cgroup_init() because we
set up the kernel slab allocator much earlier now.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c..3dd4a90 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,6 +47,8 @@ static int __init alloc_node_page_cgroup(int nid)
 	struct page_cgroup *base, *pc;
 	unsigned long table_size;
 	unsigned long start_pfn, nr_pages, index;
+	struct page *page;
+	unsigned int order;
 
 	start_pfn = NODE_DATA(nid)->node_start_pfn;
 	nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -55,11 +57,13 @@ static int __init alloc_node_page_cgroup(int nid)
 		return 0;
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
-
-	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
-	if (!base)
+	order = get_order(table_size);
+	page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
+	if (!page)
+		page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
+	if (!page)
 		return -ENOMEM;
+	base = page_address(page);
 	for (index = 0; index < nr_pages; index++) {
 		pc = base + index;
 		__init_page_cgroup(pc, start_pfn + index);
-- 
cgit v0.10.2


From 4bdddf8ff9bbb8aa7b4d7847586202bd25842c90 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:35:27 +0300
Subject: sched: use alloc_cpumask_var() instead of alloc_bootmem_cpumask_var()

Slab is initialized when sched_init() runs now so lets use alloc_cpumask_var().

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/kernel/sched.c b/kernel/sched.c
index a9ff953..12cc09c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9310,13 +9310,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	scheduler_running = 1;
-- 
cgit v0.10.2


From 0fb530291621c8b8a1b16abeeab05d9262489f71 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:41:22 +0300
Subject: sched: use slab in cpupri_init()

Lets not use the bootmem allocator in cpupri_init() as slab is already up when
it is run.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/kernel/sched.c b/kernel/sched.c
index 12cc09c..dcf2dc2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7795,7 +7795,7 @@ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a..7deffc9 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  */
 int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
 	int i;
 
+	if (bootmem)
+		gfp = GFP_NOWAIT;
+
 	memset(cp, 0, sizeof(*cp));
 
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		if (bootmem)
-			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		if (!zalloc_cpumask_var(&vec->mask, gfp))
 			goto cleanup;
 	}
 
-- 
cgit v0.10.2


From 22fb4e71e646695c7e0f379ada66b372c2d1aa1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 14:46:49 +0300
Subject: irq: use kcalloc() instead of the bootmem allocator

Fixes the following problem:

[    0.000000] Experimental hierarchical RCU init done.
[    0.000000] NR_IRQS:4352 nr_irqs:256
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
[    0.000000] Hardware name: To Be Filled By O.E.M.
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #59709
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff823f8c8e>] ? alloc_arch_preferred_bootmem+0x40/0x7e
[    0.000000]  [<ffffffff81067168>] warn_slowpath_common+0x88/0xcb
[    0.000000]  [<ffffffff810671d2>] warn_slowpath_null+0x27/0x3d
[    0.000000]  [<ffffffff823f8c8e>] alloc_arch_preferred_bootmem+0x40/0x7e
[    0.000000]  [<ffffffff823f9307>] ___alloc_bootmem_nopanic+0x4e/0xec
[    0.000000]  [<ffffffff823f93c5>] ___alloc_bootmem+0x20/0x61
[    0.000000]  [<ffffffff823f962e>] __alloc_bootmem+0x1e/0x34
[    0.000000]  [<ffffffff823f757c>] early_irq_init+0x6d/0x118
[    0.000000]  [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[    0.000000]  [<ffffffff823e0cf7>] start_kernel+0x192/0x394
[    0.000000]  [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[    0.000000]  [<ffffffff823e02ad>] x86_64_start_reservations+0xb4/0xcf
[    0.000000]  [<ffffffff823e0000>] ? __init_begin+0x0/0x140
[    0.000000]  [<ffffffff823e0420>] x86_64_start_kernel+0x158/0x17b
[    0.000000] ---[ end trace a7919e7f17c0a725 ]---
[    0.000000] Fast TSC calibration using PIT
[    0.000000] Detected 2002.510 MHz processor.
[    0.004000] Console: colour VGA+ 80x25

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e161999..1045785 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -164,7 +164,7 @@ int __init early_irq_init(void)
  	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
-- 
cgit v0.10.2


From b8ec757390282e21d349bf6b602a8cb182da0429 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 19:25:37 +0300
Subject: vgacon: use slab allocator instead of the bootmem allocator

Slab is initialized before the console subsystem so use the slab allocator in
vgacon_scrollback_startup().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>

diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 38e86b8..59d7d5e 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -180,7 +180,7 @@ static inline void vga_set_mem_top(struct vc_data *c)
 }
 
 #ifdef CONFIG_VGACON_SOFT_SCROLLBACK
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 /* software scrollback */
 static void *vgacon_scrollback;
 static int vgacon_scrollback_tail;
@@ -210,8 +210,7 @@ static void vgacon_scrollback_init(int pitch)
  */
 static void __init_refok vgacon_scrollback_startup(void)
 {
-	vgacon_scrollback = alloc_bootmem(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE
-					  * 1024);
+	vgacon_scrollback = kcalloc(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE, 1024, GFP_NOWAIT);
 	vgacon_scrollback_init(vga_video_num_columns * 2);
 }
 
-- 
cgit v0.10.2


From b415c49a864dab8ee90713833d642dd461eccae9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 11 Jun 2009 13:12:55 +0100
Subject: slow_work_thread() should do the exclusive wait

slow_work_thread() sleeps on slow_work_thread_wq without WQ_FLAG_EXCLUSIVE,
this means that slow_work_enqueue()->__wake_up(nr_exclusive => 1) wakes up all
kslowd threads.  This is not what we want, so we change slow_work_thread() to
use prepare_to_wait_exclusive() instead.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d1913..521ed20 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -372,8 +372,8 @@ static int slow_work_thread(void *_data)
 		vsmax *= atomic_read(&slow_work_thread_count);
 		vsmax /= 100;
 
-		prepare_to_wait(&slow_work_thread_wq, &wait,
-				TASK_INTERRUPTIBLE);
+		prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
+					  TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
 		    !slow_work_threads_should_exit &&
 		    !slow_work_available(vsmax) &&
-- 
cgit v0.10.2


From 90586523eb4b349806887c62ee70685a49415124 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:20 -0400
Subject: fsnotify: unified filesystem notification backend

fsnotify is a backend for filesystem notification.  fsnotify does
not provide any userspace interface but does provide the basis
needed for other notification schemes such as dnotify.  fsnotify
can be extended to be the backend for inotify or the upcoming
fanotify.  fsnotify provides a mechanism for "groups" to register for
some set of filesystem events and to then deliver those events to
those groups for processing.

fsnotify has a number of benefits, the first being actually shrinking the size
of an inode.  Before fsnotify to support both dnotify and inotify an inode had

        unsigned long           i_dnotify_mask; /* Directory notify events */
        struct dnotify_struct   *i_dnotify; /* for directory notifications */
        struct list_head        inotify_watches; /* watches on this inode */
        struct mutex            inotify_mutex;  /* protects the watches list

But with fsnotify this same functionallity (and more) is done with just

        __u32                   i_fsnotify_mask; /* all events for this inode */
        struct hlist_head       i_fsnotify_mark_entries; /* marks on this inode */

That's right, inotify, dnotify, and fanotify all in 64 bits.  We used that
much space just in inotify_watches alone, before this patch set.

fsnotify object lifetime and locking is MUCH better than what we have today.
inotify locking is incredibly complex.  See 8f7b0ba1c8539 as an example of
what's been busted since inception.  inotify needs to know internal semantics
of superblock destruction and unmounting to function.  The inode pinning and
vfs contortions are horrible.

no fsnotify implementers do allocation under locks.  This means things like
f04b30de3 which (due to an overabundance of caution) changes GFP_KERNEL to
GFP_NOFS can be reverted.  There are no longer any allocation rules when using
or implementing your own fsnotify listener.

fsnotify paves the way for fanotify.  In brief fanotify is a notification
mechanism that delivers the lisener both an 'event' and an open file descriptor
to the object in question.  This means that fanotify is pathname agnostic.
Some on lkml may not care for the original companies or users that pushed for
TALPA, but fanotify was designed with flexibility and input for other users in
mind.  The readahead group expressed interest in fanotify as it could be used
to profile disk access on boot without breaking the audit system.  The desktop
search groups have also expressed interest in fanotify as it solves a number
of the race conditions and problems present with managing inotify when more
than a limited number of specific files are of interest.  fanotify can provide
for a userspace access control system which makes it a clean interface for AV
vendors to hook without trying to do binary patching on the syscall table,
LSM, and everywhere else they do their things today.  With this patch series
fanotify can be implemented in less than 1200 lines of easy to review code.
Almost all of which is the socket based user interface.

This patch series builds fsnotify to the point that it can implement
dnotify and inotify_user.  Patches exist and will be sent soon after
acceptance to finish the in kernel inotify conversion (audit) and implement
fanotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7..31dac7e 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,15 @@
+config FSNOTIFY
+	bool "Filesystem notification backend"
+	default y
+	---help---
+	   fsnotify is a backend for filesystem notification.  fsnotify does
+	   not provide any userspace interface but does provide the basis
+	   needed for other notification schemes such as dnotify, inotify,
+	   and fanotify.
+
+	   Say Y here to enable fsnotify suport.
+
+	   If unsure, say Y.
+
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b60..db5467b 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o
+
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 0000000..56bee0f
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+/*
+ * This is the main call to fsnotify.  The VFS calls into hook specific functions
+ * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
+ * out to all of the registered fsnotify_group.  Those groups can then use the
+ * notification event in whatever means they feel necessary.
+ */
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *event = NULL;
+	int idx;
+
+	if (list_empty(&fsnotify_groups))
+		return;
+
+	if (!(mask & fsnotify_mask))
+		return;
+
+	/*
+	 * SRCU!!  the groups list is very very much read only and the path is
+	 * very hot.  The VAST majority of events are not going to need to do
+	 * anything other than walk the list so it's crazy to pre-allocate.
+	 */
+	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
+		if (mask & group->mask) {
+			if (!event) {
+				event = fsnotify_create_event(to_tell, mask, data, data_is);
+				/* shit, we OOM'd and now we can't tell, maybe
+				 * someday someone else will want to do something
+				 * here */
+				if (!event)
+					break;
+			}
+			group->ops->handle_event(group, event);
+		}
+	}
+	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	/*
+	 * fsnotify_create_event() took a reference so the event can't be cleaned
+	 * up while we are still trying to add it to lists, drop that one.
+	 */
+	if (event)
+		fsnotify_put_event(event);
+}
+EXPORT_SYMBOL_GPL(fsnotify);
+
+static __init int fsnotify_init(void)
+{
+	return init_srcu_struct(&fsnotify_grp_srcu);
+}
+subsys_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 0000000..c6a8bd4
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,15 @@
+#ifndef __FS_NOTIFY_FSNOTIFY_H_
+#define __FS_NOTIFY_FSNOTIFY_H_
+
+#include <linux/list.h>
+#include <linux/fsnotify.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+/* protects reads of fsnotify_groups */
+extern struct srcu_struct fsnotify_grp_srcu;
+/* all groups which receive fsnotify events */
+extern struct list_head fsnotify_groups;
+/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
+extern __u32 fsnotify_mask;
+#endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 0000000..c681295
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/rculist.h>
+#include <linux/wait.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+#include <asm/atomic.h>
+
+/* protects writes to fsnotify_groups and fsnotify_mask */
+static DEFINE_MUTEX(fsnotify_grp_mutex);
+/* protects reads while running the fsnotify_groups list */
+struct srcu_struct fsnotify_grp_srcu;
+/* all groups registered to receive filesystem notifications */
+LIST_HEAD(fsnotify_groups);
+/* bitwise OR of all events (FS_*) interesting to some group on this system */
+__u32 fsnotify_mask;
+
+/*
+ * When a new group registers or changes it's set of interesting events
+ * this function updates the fsnotify_mask to contain all interesting events
+ */
+void fsnotify_recalc_global_mask(void)
+{
+	struct fsnotify_group *group;
+	__u32 mask = 0;
+	int idx;
+
+	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
+		mask |= group->mask;
+	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	fsnotify_mask = mask;
+}
+
+/*
+ * Take a reference to a group so things found under the fsnotify_grp_mutex
+ * can't get freed under us
+ */
+static void fsnotify_get_group(struct fsnotify_group *group)
+{
+	atomic_inc(&group->refcnt);
+}
+
+/*
+ * Final freeing of a group
+ */
+static void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+	if (group->ops->free_group_priv)
+		group->ops->free_group_priv(group);
+
+	kfree(group);
+}
+
+/*
+ * Remove this group from the global list of groups that will get events
+ * this can be done even if there are still references and things still using
+ * this group.  This just stops the group from getting new events.
+ */
+static void __fsnotify_evict_group(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+
+	if (group->on_group_list)
+		list_del_rcu(&group->group_list);
+	group->on_group_list = 0;
+}
+
+/*
+ * Called when a group is no longer interested in getting events.  This can be
+ * used if a group is misbehaving or if for some reason a group should no longer
+ * get any filesystem events.
+ */
+void fsnotify_evict_group(struct fsnotify_group *group)
+{
+	mutex_lock(&fsnotify_grp_mutex);
+	__fsnotify_evict_group(group);
+	mutex_unlock(&fsnotify_grp_mutex);
+}
+
+/*
+ * Drop a reference to a group.  Free it if it's through.
+ */
+void fsnotify_put_group(struct fsnotify_group *group)
+{
+	if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
+		return;
+
+	/*
+	 * OK, now we know that there's no other users *and* we hold mutex,
+	 * so no new references will appear
+	 */
+	__fsnotify_evict_group(group);
+
+	/*
+	 * now it's off the list, so the only thing we might care about is
+	 * srcu access....
+	 */
+	mutex_unlock(&fsnotify_grp_mutex);
+	synchronize_srcu(&fsnotify_grp_srcu);
+
+	/* and now it is really dead. _Nothing_ could be seeing it */
+	fsnotify_recalc_global_mask();
+	fsnotify_destroy_group(group);
+}
+
+/*
+ * Simply run the fsnotify_groups list and find a group which matches
+ * the given parameters.  If a group is found we take a reference to that
+ * group.
+ */
+static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
+						  const struct fsnotify_ops *ops)
+{
+	struct fsnotify_group *group_iter;
+	struct fsnotify_group *group = NULL;
+
+	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+
+	list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
+		if (group_iter->group_num == group_num) {
+			if ((group_iter->mask == mask) &&
+			    (group_iter->ops == ops)) {
+				fsnotify_get_group(group_iter);
+				group = group_iter;
+			} else
+				group = ERR_PTR(-EEXIST);
+		}
+	}
+	return group;
+}
+
+/*
+ * Either finds an existing group which matches the group_num, mask, and ops or
+ * creates a new group and adds it to the global group list.  In either case we
+ * take a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
+					     const struct fsnotify_ops *ops)
+{
+	struct fsnotify_group *group, *tgroup;
+
+	/* very low use, simpler locking if we just always alloc */
+	group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&group->refcnt, 1);
+
+	group->on_group_list = 0;
+	group->group_num = group_num;
+	group->mask = mask;
+
+	group->ops = ops;
+
+	mutex_lock(&fsnotify_grp_mutex);
+	tgroup = fsnotify_find_group(group_num, mask, ops);
+	if (tgroup) {
+		/* group already exists */
+		mutex_unlock(&fsnotify_grp_mutex);
+		/* destroy the new one we made */
+		fsnotify_put_group(group);
+		return tgroup;
+	}
+
+	/* group not found, add a new one */
+	list_add_rcu(&group->group_list, &fsnotify_groups);
+	group->on_group_list = 1;
+
+	mutex_unlock(&fsnotify_grp_mutex);
+
+	if (mask)
+		fsnotify_recalc_global_mask();
+
+	return group;
+}
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f..40b1cf9 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 
 static atomic_t inotify_cookie;
 
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
  */
 static int __init inotify_setup(void)
 {
+	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+
+	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
 	atomic_set(&inotify_cookie, 0);
 
 	return 0;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 0000000..b8e9a87
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+static struct kmem_cache *fsnotify_event_cachep;
+
+void fsnotify_get_event(struct fsnotify_event *event)
+{
+	atomic_inc(&event->refcnt);
+}
+
+void fsnotify_put_event(struct fsnotify_event *event)
+{
+	if (!event)
+		return;
+
+	if (atomic_dec_and_test(&event->refcnt)) {
+		if (event->data_type == FSNOTIFY_EVENT_PATH)
+			path_put(&event->path);
+
+		kmem_cache_free(fsnotify_event_cachep, event);
+	}
+}
+
+/*
+ * Allocate a new event which will be sent to each group's handle_event function
+ * if the group was interested in this particular event.
+ */
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+					     void *data, int data_type)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	atomic_set(&event->refcnt, 1);
+
+	spin_lock_init(&event->lock);
+
+	event->path.dentry = NULL;
+	event->path.mnt = NULL;
+	event->inode = NULL;
+
+	event->to_tell = to_tell;
+
+	switch (data_type) {
+	case FSNOTIFY_EVENT_FILE: {
+		struct file *file = data;
+		struct path *path = &file->f_path;
+		event->path.dentry = path->dentry;
+		event->path.mnt = path->mnt;
+		path_get(&event->path);
+		event->data_type = FSNOTIFY_EVENT_PATH;
+		break;
+	}
+	case FSNOTIFY_EVENT_PATH: {
+		struct path *path = data;
+		event->path.dentry = path->dentry;
+		event->path.mnt = path->mnt;
+		path_get(&event->path);
+		event->data_type = FSNOTIFY_EVENT_PATH;
+		break;
+	}
+	case FSNOTIFY_EVENT_INODE:
+		event->inode = data;
+		event->data_type = FSNOTIFY_EVENT_INODE;
+		break;
+	case FSNOTIFY_EVENT_NONE:
+		event->inode = NULL;
+		event->path.dentry = NULL;
+		event->path.mnt = NULL;
+		break;
+	default:
+		BUG();
+	}
+
+	event->mask = mask;
+
+	return event;
+}
+
+__init int fsnotify_notification_init(void)
+{
+	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
+
+	return 0;
+}
+subsys_initcall(fsnotify_notification_init);
+
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 00fbd5b..6c9ebef 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -13,6 +13,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/audit.h>
 
 /*
@@ -35,6 +36,16 @@ static inline void fsnotify_d_move(struct dentry *entry)
 }
 
 /*
+ * fsnotify_link_count - inode's link count changed
+ */
+static inline void fsnotify_link_count(struct inode *inode)
+{
+	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE);
+}
+
+/*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
@@ -43,28 +54,47 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 {
 	struct inode *source = moved->d_inode;
 	u32 cookie = inotify_get_cookie();
+	__u32 old_dir_mask = 0;
+	__u32 new_dir_mask = 0;
 
-	if (old_dir == new_dir)
+	if (old_dir == new_dir) {
 		inode_dir_notify(old_dir, DN_RENAME);
-	else {
+		old_dir_mask = FS_DN_RENAME;
+	} else {
 		inode_dir_notify(old_dir, DN_DELETE);
+		old_dir_mask = FS_DELETE;
 		inode_dir_notify(new_dir, DN_CREATE);
+		new_dir_mask = FS_CREATE;
 	}
 
-	if (isdir)
+	if (isdir) {
 		isdir = IN_ISDIR;
+		old_dir_mask |= FS_IN_ISDIR;
+		new_dir_mask |= FS_IN_ISDIR;
+	}
+
+	old_dir_mask |= FS_MOVED_FROM;
+	new_dir_mask |= FS_MOVED_TO;
+
 	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
 				  source);
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
 				  source);
 
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE);
+
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
 		inotify_inode_is_dead(target);
+
+		/* this is really a link_count change not a removal */
+		fsnotify_link_count(target);
 	}
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -74,10 +104,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
  */
 static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 {
+	__u32 mask = FS_DELETE;
+
 	if (isdir)
-		isdir = IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
-	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 }
 
 /*
@@ -87,14 +119,8 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
-}
 
-/*
- * fsnotify_link_count - inode's link count changed
- */
-static inline void fsnotify_link_count(struct inode *inode)
-{
-	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -106,6 +132,8 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
+
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -120,6 +148,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 				  inode);
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
+
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -127,10 +157,14 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
  */
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
+	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
+	struct inode *d_inode = dentry->d_inode;
+
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
-				  dentry->d_name.name, dentry->d_inode);
+	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
+
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -139,14 +173,16 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 static inline void fsnotify_access(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_ACCESS;
+	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_ACCESS);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -155,14 +191,16 @@ static inline void fsnotify_access(struct dentry *dentry)
 static inline void fsnotify_modify(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_MODIFY;
+	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_MODIFY);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -171,13 +209,15 @@ static inline void fsnotify_modify(struct dentry *dentry)
 static inline void fsnotify_open(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_OPEN;
+	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -189,13 +229,15 @@ static inline void fsnotify_close(struct file *file)
 	struct inode *inode = dentry->d_inode;
 	const char *name = dentry->d_name.name;
 	fmode_t mode = file->f_mode;
-	u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
+	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
 }
 
 /*
@@ -204,13 +246,15 @@ static inline void fsnotify_close(struct file *file)
 static inline void fsnotify_xattr(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_ATTRIB;
+	__u32 mask = FS_ATTRIB;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -221,34 +265,34 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 {
 	struct inode *inode = dentry->d_inode;
 	int dn_mask = 0;
-	u32 in_mask = 0;
+	__u32 in_mask = 0;
 
 	if (ia_valid & ATTR_UID) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 	if (ia_valid & ATTR_GID) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 	if (ia_valid & ATTR_SIZE) {
-		in_mask |= IN_MODIFY;
+		in_mask |= FS_MODIFY;
 		dn_mask |= DN_MODIFY;
 	}
 	/* both times implies a utime(s) call */
 	if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
 	{
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	} else if (ia_valid & ATTR_ATIME) {
-		in_mask |= IN_ACCESS;
+		in_mask |= FS_ACCESS;
 		dn_mask |= DN_ACCESS;
 	} else if (ia_valid & ATTR_MTIME) {
-		in_mask |= IN_MODIFY;
+		in_mask |= FS_MODIFY;
 		dn_mask |= DN_MODIFY;
 	}
 	if (ia_valid & ATTR_MODE) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 
@@ -256,14 +300,15 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		dnotify_parent(dentry, dn_mask);
 	if (in_mask) {
 		if (S_ISDIR(inode->i_mode))
-			in_mask |= IN_ISDIR;
+			in_mask |= FS_IN_ISDIR;
 		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
 		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
 						  dentry->d_name.name);
+		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
 
-#ifdef CONFIG_INOTIFY	/* inotify helpers */
+#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY)	/* notify helpers */
 
 /*
  * fsnotify_oldname_init - save off the old filename before we change it
@@ -281,7 +326,7 @@ static inline void fsnotify_oldname_free(const char *old_name)
 	kfree(old_name);
 }
 
-#else	/* CONFIG_INOTIFY */
+#else	/* CONFIG_INOTIFY || CONFIG_FSNOTIFY */
 
 static inline const char *fsnotify_oldname_init(const char *name)
 {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
new file mode 100644
index 0000000..1a55718
--- /dev/null
+++ b/include/linux/fsnotify_backend.h
@@ -0,0 +1,177 @@
+/*
+ * Filesystem access notification for Linux
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ */
+
+#ifndef __LINUX_FSNOTIFY_BACKEND_H
+#define __LINUX_FSNOTIFY_BACKEND_H
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h> /* struct inode */
+#include <linux/list.h>
+#include <linux/path.h> /* struct path */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
+ * convert between them.  dnotify only needs conversion at watch creation
+ * so no perf loss there.  fanotify isn't defined yet, so it can use the
+ * wholes if it needs more events.
+ */
+#define FS_ACCESS		0x00000001	/* File was accessed */
+#define FS_MODIFY		0x00000002	/* File was modified */
+#define FS_ATTRIB		0x00000004	/* Metadata changed */
+#define FS_CLOSE_WRITE		0x00000008	/* Writtable file was closed */
+#define FS_CLOSE_NOWRITE	0x00000010	/* Unwrittable file closed */
+#define FS_OPEN			0x00000020	/* File was opened */
+#define FS_MOVED_FROM		0x00000040	/* File was moved from X */
+#define FS_MOVED_TO		0x00000080	/* File was moved to Y */
+#define FS_CREATE		0x00000100	/* Subfile was created */
+#define FS_DELETE		0x00000200	/* Subfile was deleted */
+#define FS_DELETE_SELF		0x00000400	/* Self was deleted */
+#define FS_MOVE_SELF		0x00000800	/* Self was moved */
+
+#define FS_UNMOUNT		0x00002000	/* inode on umount fs */
+#define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+#define FS_IN_IGNORED		0x00008000	/* last inotify event here */
+
+#define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
+#define FS_IN_ONESHOT		0x80000000	/* only send event once */
+
+#define FS_DN_RENAME		0x10000000	/* file renamed */
+#define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
+
+struct fsnotify_group;
+struct fsnotify_event;
+
+/*
+ * Each group much define these ops.  The fsnotify infrastructure will call
+ * these operations for each relevant group.
+ *
+ * handle_event - main call for a group to handle an fs event
+ * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ */
+struct fsnotify_ops {
+	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+	void (*free_group_priv)(struct fsnotify_group *group);
+};
+
+/*
+ * A group is a "thing" that wants to receive notification about filesystem
+ * events.  The mask holds the subset of event types this group cares about.
+ * refcnt on a group is up to the implementor and at any moment if it goes 0
+ * everything will be cleaned up.
+ */
+struct fsnotify_group {
+	/*
+	 * global list of all groups receiving events from fsnotify.
+	 * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
+	 * or fsnotify_grp_srcu depending on write vs read.
+	 */
+	struct list_head group_list;
+
+	/*
+	 * Defines all of the event types in which this group is interested.
+	 * This mask is a bitwise OR of the FS_* events from above.  Each time
+	 * this mask changes for a group (if it changes) the correct functions
+	 * must be called to update the global structures which indicate global
+	 * interest in event types.
+	 */
+	__u32 mask;
+
+	/*
+	 * How the refcnt is used is up to each group.  When the refcnt hits 0
+	 * fsnotify will clean up all of the resources associated with this group.
+	 * As an example, the dnotify group will always have a refcnt=1 and that
+	 * will never change.  Inotify, on the other hand, has a group per
+	 * inotify_init() and the refcnt will hit 0 only when that fd has been
+	 * closed.
+	 */
+	atomic_t refcnt;		/* things with interest in this group */
+	unsigned int group_num;		/* simply prevents accidental group collision */
+
+	const struct fsnotify_ops *ops;	/* how this group handles things */
+
+	/* prevents double list_del of group_list.  protected by global fsnotify_gr_mutex */
+	bool on_group_list;
+
+	/* groups can define private fields here or use the void *private */
+	union {
+		void *private;
+	};
+};
+
+/*
+ * all of the information about the original object we want to now send to
+ * a group.  If you want to carry more info from the accessing task to the
+ * listener this structure is where you need to be adding fields.
+ */
+struct fsnotify_event {
+	spinlock_t lock;	/* protection for the associated event_holder and private_list */
+	/* to_tell may ONLY be dereferenced during handle_event(). */
+	struct inode *to_tell;	/* either the inode the event happened to or its parent */
+	/*
+	 * depending on the event type we should have either a path or inode
+	 * We hold a reference on path, but NOT on inode.  Since we have the ref on
+	 * the path, it may be dereferenced at any point during this object's
+	 * lifetime.  That reference is dropped when this object's refcnt hits
+	 * 0.  If this event contains an inode instead of a path, the inode may
+	 * ONLY be used during handle_event().
+	 */
+	union {
+		struct path path;
+		struct inode *inode;
+	};
+/* when calling fsnotify tell it if the data is a path or inode */
+#define FSNOTIFY_EVENT_NONE	0
+#define FSNOTIFY_EVENT_PATH	1
+#define FSNOTIFY_EVENT_INODE	2
+#define FSNOTIFY_EVENT_FILE	3
+	int data_type;		/* which of the above union we have */
+	atomic_t refcnt;	/* how many groups still are using/need to send this event */
+	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
+};
+
+#ifdef CONFIG_FSNOTIFY
+
+/* called from the vfs helpers */
+
+/* main fsnotify call to send events */
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+
+
+/* called from fsnotify listeners, such as fanotify or dnotify */
+
+/* must call when a group changes its ->mask */
+extern void fsnotify_recalc_global_mask(void);
+/* get a reference to an existing or create a new group */
+extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
+						    __u32 mask,
+						    const struct fsnotify_ops *ops);
+/* drop reference on a group from fsnotify_obtain_group */
+extern void fsnotify_put_group(struct fsnotify_group *group);
+
+/* take a reference to an event */
+extern void fsnotify_get_event(struct fsnotify_event *event);
+extern void fsnotify_put_event(struct fsnotify_event *event);
+/* find private data previously attached to an event */
+extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
+									struct fsnotify_event *event);
+
+/* put here because inotify does some weird stuff when destroying watches */
+extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+						    void *data, int data_is);
+#else
+
+static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+{}
+#endif	/* CONFIG_FSNOTIFY */
+
+#endif	/* __KERNEL __ */
+
+#endif	/* __LINUX_FSNOTIFY_BACKEND_H */
-- 
cgit v0.10.2


From 3be25f49b9d6a97eae9bcb96d3292072b7658bd8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:26 -0400
Subject: fsnotify: add marks to inodes so groups can interpret how to handle
 those inodes

This patch creates a way for fsnotify groups to attach marks to inodes.
These marks have little meaning to the generic fsnotify infrastructure
and thus their meaning should be interpreted by the group that attached
them to the inode's list.

dnotify and inotify  will make use of these markings to indicate which
inodes are of interest to their respective groups.  But this implementation
has the useful property that in the future other listeners could actually
use the marks for the exact opposite reason, aka to indicate which inodes
it had NO interest in.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/inode.c b/fs/inode.c
index bca0c61..54c63ce 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
 
@@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 
+#ifdef CONFIG_FSNOTIFY
+	inode->i_fsnotify_mask = 0;
+#endif
+
 	return inode;
 
 out_free_security:
@@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode)
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
+	fsnotify_inode_delete(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
@@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index db5467b..0922cc8 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 56bee0f..d565462 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,6 +26,15 @@
 #include "fsnotify.h"
 
 /*
+ * Clear all of the marks on an inode when it is being evicted from core
+ */
+void __fsnotify_inode_delete(struct inode *inode)
+{
+	fsnotify_clear_marks_by_inode(inode);
+}
+EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
+
+/*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
  * out to all of the registered fsnotify_group.  Those groups can then use the
@@ -43,6 +52,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 	if (!(mask & fsnotify_mask))
 		return;
 
+	if (!(mask & to_tell->i_fsnotify_mask))
+		return;
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
 	 * very hot.  The VAST majority of events are not going to need to do
@@ -51,6 +62,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
 		if (mask & group->mask) {
+			if (!group->ops->should_send_event(group, to_tell, mask))
+				continue;
 			if (!event) {
 				event = fsnotify_create_event(to_tell, mask, data, data_is);
 				/* shit, we OOM'd and now we can't tell, maybe
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index c6a8bd4..8ebcbe8 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,4 +12,9 @@ extern struct srcu_struct fsnotify_grp_srcu;
 extern struct list_head fsnotify_groups;
 /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
 extern __u32 fsnotify_mask;
+
+/* final kfree of a group */
+extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
+/* run the list of all marks associated with inode and flag them to be freed */
+extern void fsnotify_clear_marks_by_inode(struct inode *inode);
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index c681295..a29d2fa 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -55,6 +55,29 @@ void fsnotify_recalc_global_mask(void)
 }
 
 /*
+ * Update the group->mask by running all of the marks associated with this
+ * group and finding the bitwise | of all of the mark->mask.  If we change
+ * the group->mask we need to update the global mask of events interesting
+ * to the system.
+ */
+void fsnotify_recalc_group_mask(struct fsnotify_group *group)
+{
+	__u32 mask = 0;
+	__u32 old_mask = group->mask;
+	struct fsnotify_mark_entry *entry;
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry(entry, &group->mark_entries, g_list)
+		mask |= entry->mask;
+	spin_unlock(&group->mark_lock);
+
+	group->mask = mask;
+
+	if (old_mask != mask)
+		fsnotify_recalc_global_mask();
+}
+
+/*
  * Take a reference to a group so things found under the fsnotify_grp_mutex
  * can't get freed under us
  */
@@ -66,7 +89,7 @@ static void fsnotify_get_group(struct fsnotify_group *group)
 /*
  * Final freeing of a group
  */
-static void fsnotify_destroy_group(struct fsnotify_group *group)
+void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
@@ -75,6 +98,24 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
 }
 
 /*
+ * Trying to get rid of a group.  We need to first get rid of any outstanding
+ * allocations and then free the group.  Remember that fsnotify_clear_marks_by_group
+ * could miss marks that are being freed by inode and those marks could still
+ * hold a reference to this group (via group->num_marks)  If we get into that
+ * situtation, the fsnotify_final_destroy_group will get called when that final
+ * mark is freed.
+ */
+static void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+	/* clear all inode mark entries for this group */
+	fsnotify_clear_marks_by_group(group);
+
+	/* past the point of no return, matches the initial value of 1 */
+	if (atomic_dec_and_test(&group->num_marks))
+		fsnotify_final_destroy_group(group);
+}
+
+/*
  * Remove this group from the global list of groups that will get events
  * this can be done even if there are still references and things still using
  * this group.  This just stops the group from getting new events.
@@ -173,6 +214,10 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	group->group_num = group_num;
 	group->mask = mask;
 
+	spin_lock_init(&group->mark_lock);
+	atomic_set(&group->num_marks, 0);
+	INIT_LIST_HEAD(&group->mark_entries);
+
 	group->ops = ops;
 
 	mutex_lock(&fsnotify_grp_mutex);
@@ -188,6 +233,8 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	/* group not found, add a new one */
 	list_add_rcu(&group->group_list, &fsnotify_groups);
 	group->on_group_list = 1;
+	/* being on the fsnotify_groups list holds one num_marks */
+	atomic_inc(&group->num_marks);
 
 	mutex_unlock(&fsnotify_grp_mutex);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 0000000..cdc1541
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,329 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The mark->refcnt tells how many "things" in the kernel currently are
+ * referencing this object.  The object typically will live inside the kernel
+ * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
+ * which can find this object holding the appropriete locks, can take a reference
+ * and the object itself is guarenteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 spinlocks involved with fsnotify inode marks and they MUST
+ * be taken in order as follows:
+ *
+ * entry->lock
+ * group->mark_lock
+ * inode->i_lock
+ *
+ * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
+ * that lock to dereference either of these things (they could be NULL even with
+ * the lock)
+ *
+ * group->mark_lock protects the mark_entries list anchored inside a given group
+ * and each entry is hooked via the g_list.  It also sorta protects the
+ * free_g_list, which when used is anchored by a private list on the stack of the
+ * task which held the group->mark_lock.
+ *
+ * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
+ * given inode and each entry is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark form the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using i_free_list;  At this point we no
+ * longer fear anything finding the mark using the inode's list of marks.
+ *
+ * We can safely and locklessly run the private list on the stack of everything
+ * we just unattached from the original inode.  For each mark on the private list
+ * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
+ * we see the group and inode are not NULL we take those locks.  Now holding all
+ * 3 locks we can completely remove the mark from other tasks finding it in the
+ * future.  Remember, 10 things might already be referencing this mark, but they
+ * better be holding a ref.  We drop our reference we took before we unhooked it
+ * from the inode.  When the ref hits 0 we can free the mark.
+ *
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
+{
+	atomic_inc(&entry->refcnt);
+}
+
+void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
+{
+	if (atomic_dec_and_test(&entry->refcnt))
+		entry->free_mark(entry);
+}
+
+/*
+ * Recalculate the mask of events relevant to a given inode locked.
+ */
+static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry;
+	struct hlist_node *pos;
+	__u32 new_mask = 0;
+
+	assert_spin_locked(&inode->i_lock);
+
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
+		new_mask |= entry->mask;
+	inode->i_fsnotify_mask = new_mask;
+}
+
+/*
+ * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this inode.
+ */
+void fsnotify_recalc_inode_mask(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	fsnotify_recalc_inode_mask_locked(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the entry->lock
+ */
+void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
+{
+	struct fsnotify_group *group;
+	struct inode *inode;
+
+	spin_lock(&entry->lock);
+
+	group = entry->group;
+	inode = entry->inode;
+
+	BUG_ON(group && !inode);
+	BUG_ON(!group && inode);
+
+	/* if !group something else already marked this to die */
+	if (!group) {
+		spin_unlock(&entry->lock);
+		return;
+	}
+
+	/* 1 from caller and 1 for being on i_list/g_list */
+	BUG_ON(atomic_read(&entry->refcnt) < 2);
+
+	spin_lock(&group->mark_lock);
+	spin_lock(&inode->i_lock);
+
+	hlist_del_init(&entry->i_list);
+	entry->inode = NULL;
+
+	list_del_init(&entry->g_list);
+	entry->group = NULL;
+
+	fsnotify_put_mark(entry); /* for i_list and g_list */
+
+	/*
+	 * this mark is now off the inode->i_fsnotify_mark_entries list and we
+	 * hold the inode->i_lock, so this is the perfect time to update the
+	 * inode->i_fsnotify_mask
+	 */
+	fsnotify_recalc_inode_mask_locked(inode);
+
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&entry->lock);
+
+	/*
+	 * Some groups like to know that marks are being freed.  This is a
+	 * callback to the group function to let it know that this entry
+	 * is being freed.
+	 */
+	group->ops->freeing_mark(entry, group);
+
+	/*
+	 * it's possible that this group tried to destroy itself, but this
+	 * this mark was simultaneously being freed by inode.  If that's the
+	 * case, we finish freeing the group here.
+	 */
+	if (unlikely(atomic_dec_and_test(&group->num_marks)))
+		fsnotify_final_destroy_group(group);
+}
+
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	struct fsnotify_mark_entry *lentry, *entry;
+	LIST_HEAD(free_list);
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
+		list_add(&entry->free_g_list, &free_list);
+		list_del_init(&entry->g_list);
+		fsnotify_get_mark(entry);
+	}
+	spin_unlock(&group->mark_lock);
+
+	list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+}
+
+/*
+ * Given an inode, destroy all of the marks associated with that inode.
+ */
+void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry, *lentry;
+	struct hlist_node *pos, *n;
+	LIST_HEAD(free_list);
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
+		list_add(&entry->free_i_list, &free_list);
+		hlist_del_init(&entry->i_list);
+		fsnotify_get_mark(entry);
+	}
+	spin_unlock(&inode->i_lock);
+
+	list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+}
+
+/*
+ * given a group and inode, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
+						     struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry;
+	struct hlist_node *pos;
+
+	assert_spin_locked(&inode->i_lock);
+
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+		if (entry->group == group) {
+			fsnotify_get_mark(entry);
+			return entry;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Nothing fancy, just initialize lists and locks and counters.
+ */
+void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
+			void (*free_mark)(struct fsnotify_mark_entry *entry))
+
+{
+	spin_lock_init(&entry->lock);
+	atomic_set(&entry->refcnt, 1);
+	INIT_HLIST_NODE(&entry->i_list);
+	entry->group = NULL;
+	entry->mask = 0;
+	entry->inode = NULL;
+	entry->free_mark = free_mark;
+}
+
+/*
+ * Attach an initialized mark entry to a given group and inode.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group and for which inodes.
+ */
+int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
+		      struct fsnotify_group *group, struct inode *inode)
+{
+	struct fsnotify_mark_entry *lentry;
+	int ret = 0;
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * entry->lock
+	 * group->mark_lock
+	 * inode->i_lock
+	 */
+	spin_lock(&entry->lock);
+	spin_lock(&group->mark_lock);
+	spin_lock(&inode->i_lock);
+
+	entry->group = group;
+	entry->inode = inode;
+
+	lentry = fsnotify_find_mark_entry(group, inode);
+	if (!lentry) {
+		hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
+		list_add(&entry->g_list, &group->mark_entries);
+
+		fsnotify_get_mark(entry); /* for i_list and g_list */
+
+		atomic_inc(&group->num_marks);
+
+		fsnotify_recalc_inode_mask_locked(inode);
+	}
+
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&entry->lock);
+
+	if (lentry) {
+		ret = -EEXIST;
+		fsnotify_put_mark(lentry);
+	}
+
+	return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83d6b43..275b086 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -755,6 +755,11 @@ struct inode {
 
 	__u32			i_generation;
 
+#ifdef CONFIG_FSNOTIFY
+	__u32			i_fsnotify_mask; /* all events this inode cares about */
+	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
+#endif
+
 #ifdef CONFIG_DNOTIFY
 	unsigned long		i_dnotify_mask; /* Directory notify events */
 	struct dnotify_struct	*i_dnotify; /* for directory notifications */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 6c9ebef..3856eb6 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -100,6 +100,14 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 }
 
 /*
+ * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed
+ */
+static inline void fsnotify_inode_delete(struct inode *inode)
+{
+	__fsnotify_inode_delete(inode);
+}
+
+/*
  * fsnotify_nameremove - a filename was removed from a directory
  */
 static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
@@ -121,6 +129,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_is_dead(inode);
 
 	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
+	__fsnotify_inode_delete(inode);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1a55718..cad5c4d 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -48,17 +48,25 @@
 
 struct fsnotify_group;
 struct fsnotify_event;
+struct fsnotify_mark_entry;
 
 /*
  * Each group much define these ops.  The fsnotify infrastructure will call
  * these operations for each relevant group.
  *
+ * should_send_event - given a group, inode, and mask this function determines
+ *		if the group is interested in this event.
  * handle_event - main call for a group to handle an fs event
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ * freeing-mark - this means that a mark has been flagged to die when everything
+ *		finishes using it.  The function is supplied with what must be a
+ *		valid group and inode to use to clean up.
  */
 struct fsnotify_ops {
+	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
+	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
 };
 
 /*
@@ -97,7 +105,14 @@ struct fsnotify_group {
 
 	const struct fsnotify_ops *ops;	/* how this group handles things */
 
-	/* prevents double list_del of group_list.  protected by global fsnotify_gr_mutex */
+	/* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
+	spinlock_t mark_lock;		/* protect mark_entries list */
+	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
+					 * past the point of no return when freeing
+					 * a group */
+	struct list_head mark_entries;	/* all inode mark entries for this group */
+
+	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_group_list;
 
 	/* groups can define private fields here or use the void *private */
@@ -137,12 +152,38 @@ struct fsnotify_event {
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
 };
 
+/*
+ * a mark is simply an entry attached to an in core inode which allows an
+ * fsnotify listener to indicate they are either no longer interested in events
+ * of a type matching mask or only interested in those events.
+ *
+ * these are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access).  Some fsnotify users
+ * (such as dnotify) will flush these when the open fd is closed and not at
+ * inode eviction or modification.
+ */
+struct fsnotify_mark_entry {
+	__u32 mask;			/* mask this mark entry is for */
+	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
+	 * in kernel that found and may be using this mark. */
+	atomic_t refcnt;		/* active things looking at this mark */
+	struct inode *inode;		/* inode this entry is associated with */
+	struct fsnotify_group *group;	/* group this mark entry is for */
+	struct hlist_node i_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct list_head g_list;	/* list of mark_entries by group->i_fsnotify_mark_entries */
+	spinlock_t lock;		/* protect group, inode, and killme */
+	struct list_head free_i_list;	/* tmp list used when freeing this mark */
+	struct list_head free_g_list;	/* tmp list used when freeing this mark */
+	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
+};
+
 #ifdef CONFIG_FSNOTIFY
 
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void __fsnotify_inode_delete(struct inode *inode);
 
 
 /* called from fsnotify listeners, such as fanotify or dnotify */
@@ -153,6 +194,8 @@ extern void fsnotify_recalc_global_mask(void);
 extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
 						    __u32 mask,
 						    const struct fsnotify_ops *ops);
+/* run all marks associated with this group and update group->mask */
+extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_obtain_group */
 extern void fsnotify_put_group(struct fsnotify_group *group);
 
@@ -163,6 +206,22 @@ extern void fsnotify_put_event(struct fsnotify_event *event);
 extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
 									struct fsnotify_event *event);
 
+/* functions used to manipulate the marks attached to inodes */
+
+/* run all marks associated with an inode and update inode->i_fsnotify_mask */
+extern void fsnotify_recalc_inode_mask(struct inode *inode);
+extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry));
+/* find (and take a reference) to a mark associated with group and inode */
+extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
+/* attach the mark to both the group and the inode */
+extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode);
+/* given a mark, flag it to be freed when all references are dropped */
+extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
+/* run all the marks in a group, and flag them to be freed */
+extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
+extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
+extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
+
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 						    void *data, int data_is);
@@ -170,6 +229,10 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 {}
+
+static inline void __fsnotify_inode_delete(struct inode *inode)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
cgit v0.10.2


From c28f7e56e9d95fb531dc3be8df2e7f52bee76d21 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:29 -0400
Subject: fsnotify: parent event notification

inotify and dnotify both use a similar parent notification mechanism.  We
add a generic parent notification mechanism to fsnotify for both of these
to use.  This new machanism also adds the dentry flag optimization which
exists for inotify to dnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index d565462..7fc7600 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -35,6 +35,97 @@ void __fsnotify_inode_delete(struct inode *inode)
 EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
 
 /*
+ * Given an inode, first check if we care what happens to our children.  Inotify
+ * and dnotify both tell their parents about events.  If we care about any event
+ * on a child we run all of our children and set a dentry flag saying that the
+ * parent cares.  Thus when an event happens on a child it can quickly tell if
+ * if there is a need to find a parent and send the event to the parent.
+ */
+void __fsnotify_update_child_dentry_flags(struct inode *inode)
+{
+	struct dentry *alias;
+	int watched;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	/* determine if the children should tell inode about their events */
+	watched = fsnotify_inode_watches_children(inode);
+
+	spin_lock(&dcache_lock);
+	/* run all of the dentries associated with this inode.  Since this is a
+	 * directory, there damn well better only be one item on this list */
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		/* run all of the children of the original inode and fix their
+		 * d_flags to indicate parental interest (their parent is the
+		 * original inode) */
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode)
+				continue;
+
+			spin_lock(&child->d_lock);
+			if (watched)
+				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+			else
+				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/* Notify this dentry's parent about a child's events. */
+void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+{
+	struct dentry *parent;
+	struct inode *p_inode;
+	bool send = false;
+	bool should_update_children = false;
+
+	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
+		return;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	p_inode = parent->d_inode;
+
+	if (fsnotify_inode_watches_children(p_inode)) {
+		if (p_inode->i_fsnotify_mask & mask) {
+			dget(parent);
+			send = true;
+		}
+	} else {
+		/*
+		 * The parent doesn't care about events on it's children but
+		 * at least one child thought it did.  We need to run all the
+		 * children and update their d_flags to let them know p_inode
+		 * doesn't care about them any more.
+		 */
+		dget(parent);
+		should_update_children = true;
+	}
+
+	spin_unlock(&dentry->d_lock);
+
+	if (send) {
+		/* we are notifying a parent so come up with the new mask which
+		 * specifies these are events which came from a child. */
+		mask |= FS_EVENT_ON_CHILD;
+
+		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+		dput(parent);
+	}
+
+	if (unlikely(should_update_children)) {
+		__fsnotify_update_child_dentry_flags(p_inode);
+		dput(parent);
+	}
+}
+EXPORT_SYMBOL_GPL(__fsnotify_parent);
+
+/*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
  * out to all of the registered fsnotify_group.  Those groups can then use the
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 8ebcbe8..83b8ec0 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -17,4 +17,9 @@ extern __u32 fsnotify_mask;
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/*
+ * update the dentry->d_flags of all of inode's children to indicate if inode cares
+ * about events that happen to its children.
+ */
+extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index cdc1541..a395348 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -131,6 +131,8 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	fsnotify_recalc_inode_mask_locked(inode);
 	spin_unlock(&inode->i_lock);
+
+	__fsnotify_update_child_dentry_flags(inode);
 }
 
 /*
@@ -190,6 +192,19 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	group->ops->freeing_mark(entry, group);
 
 	/*
+	 * __fsnotify_update_child_dentry_flags(inode);
+	 *
+	 * I really want to call that, but we can't, we have no idea if the inode
+	 * still exists the second we drop the entry->lock.
+	 *
+	 * The next time an event arrive to this inode from one of it's children
+	 * __fsnotify_parent will see that the inode doesn't care about it's
+	 * children and will update all of these flags then.  So really this
+	 * is just a lazy update (and could be a perf win...)
+	 */
+
+
+	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode.  If that's the
 	 * case, we finish freeing the group here.
@@ -323,6 +338,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	if (lentry) {
 		ret = -EEXIST;
 		fsnotify_put_mark(lentry);
+	} else {
+		__fsnotify_update_child_dentry_flags(inode);
 	}
 
 	return ret;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1515636..9797800 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -180,10 +180,12 @@ d_iput:		no		no		no       yes
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
 #define DCACHE_UNHASHED		0x0010	
 
-#define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+#define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched by inotify */
 
 #define DCACHE_COOKIE		0x0040	/* For use by dcookie subsystem */
 
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x0080 /* Parent inode is watched by some fsnotify listener */
+
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 3856eb6..6a662ed 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -23,15 +23,31 @@
 static inline void fsnotify_d_instantiate(struct dentry *entry,
 						struct inode *inode)
 {
+	__fsnotify_d_instantiate(entry, inode);
+
 	inotify_d_instantiate(entry, inode);
 }
 
+/* Notify this dentry's parent about a child's events. */
+static inline void fsnotify_parent(struct dentry *dentry, __u32 mask)
+{
+	__fsnotify_parent(dentry, mask);
+
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+}
+
 /*
  * fsnotify_d_move - entry has been moved
  * Called with dcache_lock and entry->d_lock held.
  */
 static inline void fsnotify_d_move(struct dentry *entry)
 {
+	/*
+	 * On move we need to update entry->d_flags to indicate if the new parent
+	 * cares about events from this entry.
+	 */
+	__fsnotify_update_dcache_flags(entry);
+
 	inotify_d_move(entry);
 }
 
@@ -117,7 +133,8 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 	if (isdir)
 		mask |= FS_IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+
+	fsnotify_parent(dentry, mask);
 }
 
 /*
@@ -188,9 +205,9 @@ static inline void fsnotify_access(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_ACCESS);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -206,9 +223,9 @@ static inline void fsnotify_modify(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_MODIFY);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -223,9 +240,9 @@ static inline void fsnotify_open(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -236,16 +253,15 @@ static inline void fsnotify_close(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	const char *name = dentry->d_name.name;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
 }
 
@@ -260,9 +276,9 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -311,8 +327,8 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		if (S_ISDIR(inode->i_mode))
 			in_mask |= FS_IN_ISDIR;
 		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
-		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
-						  dentry->d_name.name);
+
+		fsnotify_parent(dentry, in_mask);
 		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index cad5c4d..13d2dd5 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -46,6 +46,17 @@
 #define FS_DN_RENAME		0x10000000	/* file renamed */
 #define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
 
+/* This inode cares about things that happen to its children.  Always set for
+ * dnotify and inotify. */
+#define FS_EVENT_ON_CHILD	0x08000000
+
+/* This is a list of all events that may get sent to a parernt based on fs event
+ * happening to inodes inside that directory */
+#define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
+				   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
+				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
+				   FS_DELETE)
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
@@ -183,8 +194,52 @@ struct fsnotify_mark_entry {
 
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 
+static inline int fsnotify_inode_watches_children(struct inode *inode)
+{
+	/* FS_EVENT_ON_CHILD is set if the inode may care */
+	if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
+		return 0;
+	/* this inode might care about child events, does it care about the
+	 * specific set of events that can happen on a child? */
+	return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
+}
+
+/*
+ * Update the dentry with a flag indicating the interest of its parent to receive
+ * filesystem events when those events happens to this dentry->d_inode.
+ */
+static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	assert_spin_locked(&dcache_lock);
+	assert_spin_locked(&dentry->d_lock);
+
+	parent = dentry->d_parent;
+	if (fsnotify_inode_watches_children(parent->d_inode))
+		dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+	else
+		dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+}
+
+/*
+ * fsnotify_d_instantiate - instantiate a dentry for inode
+ * Called with dcache_lock held.
+ */
+static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
+{
+	if (!inode)
+		return;
+
+	assert_spin_locked(&dcache_lock);
+
+	spin_lock(&dentry->d_lock);
+	__fsnotify_update_dcache_flags(dentry);
+	spin_unlock(&dentry->d_lock);
+}
 
 /* called from fsnotify listeners, such as fanotify or dnotify */
 
@@ -230,9 +285,18 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 {}
 
+static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+{}
+
 static inline void __fsnotify_inode_delete(struct inode *inode)
 {}
 
+static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
+{}
+
+static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
cgit v0.10.2


From 3c5119c05d624f95f4967d16b38c9624b816bdb9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:33 -0400
Subject: dnotify: reimplement dnotify using fsnotify

Reimplement dnotify using fsnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/MAINTAINERS b/MAINTAINERS
index ccdb575..96e0c8c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1802,10 +1802,10 @@ F:	drivers/char/epca*
 F:	drivers/char/digi*
 
 DIRECTORY NOTIFICATION (DNOTIFY)
-P:	Stephen Rothwell
-M:	sfr@canb.auug.org.au
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org
-S:	Supported
+S:	Maintained
 F:	Documentation/filesystems/dnotify.txt
 F:	fs/notify/dnotify/
 F:	include/linux/dnotify.h
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5d..904ff8d 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
 config DNOTIFY
 	bool "Dnotify support"
+	depends on FSNOTIFY
 	default y
 	help
 	  Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cd..d9d80f5 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
  *
  * Copyright (C) 2000,2001,2002 Stephen Rothwell
  *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * dnotify was largly rewritten to use the new fsnotify infrastructure
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,178 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
 
 int dir_notify_enable __read_mostly = 1;
 
-static struct kmem_cache *dn_cache __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct fsnotify_group *dnotify_group __read_mostly;
+static DEFINE_MUTEX(dnotify_mark_mutex);
+
+/*
+ * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * is being watched by dnotify.  If multiple userspace applications are watching
+ * the same directory with dnotify their information is chained in dn
+ */
+struct dnotify_mark_entry {
+	struct fsnotify_mark_entry fsn_entry;
+	struct dnotify_struct *dn;
+};
 
-static void redo_inode_mask(struct inode *inode)
+/*
+ * When a process starts or stops watching an inode the set of events which
+ * dnotify cares about for that inode may change.  This function runs the
+ * list of everything receiving dnotify events about this directory and calculates
+ * the set of all those events.  After it updates what dnotify is interested in
+ * it calls the fsnotify function so it can update the set of all events relevant
+ * to this inode.
+ */
+static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
 {
-	unsigned long new_mask;
+	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
+	struct dnotify_mark_entry *dnentry  = container_of(entry,
+							   struct dnotify_mark_entry,
+							   fsn_entry);
+
+	assert_spin_locked(&entry->lock);
 
+	old_mask = entry->mask;
 	new_mask = 0;
-	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
-		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
-	inode->i_dnotify_mask = new_mask;
+	for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
+		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
+	entry->mask = new_mask;
+
+	if (old_mask == new_mask)
+		return;
+
+	if (entry->inode)
+		fsnotify_recalc_inode_mask(entry->inode);
 }
 
+/*
+ * Mains fsnotify call where events are delivered to dnotify.
+ * Find the dnotify mark on the relevant inode, run the list of dnotify structs
+ * on that mark and determine which of them has expressed interest in receiving
+ * events of this type.  When found send the correct process and signal and
+ * destroy the dnotify struct if it was not registered to receive multiple
+ * events.
+ */
+static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_event *event)
+{
+	struct fsnotify_mark_entry *entry = NULL;
+	struct dnotify_mark_entry *dnentry;
+	struct inode *to_tell;
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct fown_struct *fown;
+
+	to_tell = event->to_tell;
+
+	spin_lock(&to_tell->i_lock);
+	entry = fsnotify_find_mark_entry(group, to_tell);
+	spin_unlock(&to_tell->i_lock);
+
+	/* unlikely since we alreay passed dnotify_should_send_event() */
+	if (unlikely(!entry))
+		return 0;
+	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+
+	spin_lock(&entry->lock);
+	prev = &dnentry->dn;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_mask & event->mask) == 0) {
+			prev = &dn->dn_next;
+			continue;
+		}
+		fown = &dn->dn_filp->f_owner;
+		send_sigio(fown, dn->dn_fd, POLL_MSG);
+		if (dn->dn_mask & FS_DN_MULTISHOT)
+			prev = &dn->dn_next;
+		else {
+			*prev = dn->dn_next;
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(entry);
+		}
+	}
+
+	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry);
+
+	return 0;
+}
+
+/*
+ * Given an inode and mask determine if dnotify would be interested in sending
+ * userspace notification for that pair.
+ */
+static bool dnotify_should_send_event(struct fsnotify_group *group,
+				      struct inode *inode, __u32 mask)
+{
+	struct fsnotify_mark_entry *entry;
+	bool send;
+
+	/* !dir_notify_enable should never get here, don't waste time checking
+	if (!dir_notify_enable)
+		return 0; */
+
+	/* not a dir, dnotify doesn't care */
+	if (!S_ISDIR(inode->i_mode))
+		return false;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+
+	/* no mark means no dnotify watch */
+	if (!entry)
+		return false;
+
+	spin_lock(&entry->lock);
+	send = (mask & entry->mask) ? true : false;
+	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
+
+	return send;
+}
+
+static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry,
+				 struct fsnotify_group *group)
+{
+	/* dnotify doesn't care than an inode is on the way out */
+}
+
+static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+	struct dnotify_mark_entry *dnentry = container_of(entry,
+							  struct dnotify_mark_entry,
+							  fsn_entry);
+
+	BUG_ON(dnentry->dn);
+
+	kmem_cache_free(dnotify_mark_entry_cache, dnentry);
+}
+
+static struct fsnotify_ops dnotify_fsnotify_ops = {
+	.handle_event = dnotify_handle_event,
+	.should_send_event = dnotify_should_send_event,
+	.free_group_priv = NULL,
+	.freeing_mark = dnotify_freeing_mark,
+};
+
+/*
+ * Called every time a file is closed.  Looks first for a dnotify mark on the
+ * inode.  If one is found run all of the ->dn entries attached to that
+ * mark for one relevant to this process closing the file and remove that
+ * dnotify_struct.  If that was the last dnotify_struct also remove the
+ * fsnotify_mark_entry.
+ */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
+	struct fsnotify_mark_entry *entry;
+	struct dnotify_mark_entry *dnentry;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
@@ -46,145 +203,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	inode = filp->f_path.dentry->d_inode;
 	if (!S_ISDIR(inode->i_mode))
 		return;
+
 	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
+	entry = fsnotify_find_mark_entry(dnotify_group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!entry)
+		return;
+	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+
+	mutex_lock(&dnotify_mark_mutex);
+
+	spin_lock(&entry->lock);
+	prev = &dnentry->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
 			*prev = dn->dn_next;
-			redo_inode_mask(inode);
-			kmem_cache_free(dn_cache, dn);
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(entry);
 			break;
 		}
 		prev = &dn->dn_next;
 	}
-	spin_unlock(&inode->i_lock);
+
+	spin_unlock(&entry->lock);
+
+	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	if (dnentry->dn == NULL)
+		fsnotify_destroy_mark_by_entry(entry);
+
+	fsnotify_recalc_group_mask(dnotify_group);
+
+	mutex_unlock(&dnotify_mark_mutex);
+
+	fsnotify_put_mark(entry);
+}
+
+/* this conversion is done only at watch creation */
+static __u32 convert_arg(unsigned long arg)
+{
+	__u32 new_mask = FS_EVENT_ON_CHILD;
+
+	if (arg & DN_MULTISHOT)
+		new_mask |= FS_DN_MULTISHOT;
+	if (arg & DN_DELETE)
+		new_mask |= (FS_DELETE | FS_MOVED_FROM);
+	if (arg & DN_MODIFY)
+		new_mask |= FS_MODIFY;
+	if (arg & DN_ACCESS)
+		new_mask |= FS_ACCESS;
+	if (arg & DN_ATTRIB)
+		new_mask |= FS_ATTRIB;
+	if (arg & DN_RENAME)
+		new_mask |= FS_DN_RENAME;
+	if (arg & DN_CREATE)
+		new_mask |= (FS_CREATE | FS_MOVED_TO);
+
+	return new_mask;
 }
 
+/*
+ * If multiple processes watch the same inode with dnotify there is only one
+ * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * onto that mark.  This function either attaches the new dnotify_struct onto
+ * that list, or it |= the mask onto an existing dnofiy_struct.
+ */
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
+{
+	struct dnotify_struct *odn;
+
+	odn = dnentry->dn;
+	while (odn != NULL) {
+		/* adding more events to existing dnofiy_struct? */
+		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+			odn->dn_fd = fd;
+			odn->dn_mask |= mask;
+			return -EEXIST;
+		}
+		odn = odn->dn_next;
+	}
+
+	dn->dn_mask = mask;
+	dn->dn_fd = fd;
+	dn->dn_filp = filp;
+	dn->dn_owner = id;
+	dn->dn_next = dnentry->dn;
+	dnentry->dn = dn;
+
+	return 0;
+}
+
+/*
+ * When a process calls fcntl to attach a dnotify watch to a directory it ends
+ * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
+ * attached to the fsnotify_mark.
+ */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
+	struct dnotify_mark_entry *new_dnentry, *dnentry;
+	struct fsnotify_mark_entry *new_entry, *entry;
 	struct dnotify_struct *dn;
-	struct dnotify_struct *odn;
-	struct dnotify_struct **prev;
 	struct inode *inode;
 	fl_owner_t id = current->files;
 	struct file *f;
-	int error = 0;
+	int destroy = 0, error = 0;
+	__u32 mask;
+
+	/* we use these to tell if we need to kfree */
+	new_entry = NULL;
+	dn = NULL;
 
+	if (!dir_notify_enable) {
+		error = -EINVAL;
+		goto out_err;
+	}
+
+	/* a 0 mask means we are explicitly removing the watch */
 	if ((arg & ~DN_MULTISHOT) == 0) {
 		dnotify_flush(filp, id);
-		return 0;
+		error = 0;
+		goto out_err;
 	}
-	if (!dir_notify_enable)
-		return -EINVAL;
+
+	/* dnotify only works on directories */
 	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return -ENOTDIR;
-	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
-	if (dn == NULL)
-		return -ENOMEM;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((odn = *prev) != NULL) {
-		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
-			odn->dn_fd = fd;
-			odn->dn_mask |= arg;
-			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-			goto out_free;
-		}
-		prev = &odn->dn_next;
+	if (!S_ISDIR(inode->i_mode)) {
+		error = -ENOTDIR;
+		goto out_err;
 	}
 
-	rcu_read_lock();
-	f = fcheck(fd);
-	rcu_read_unlock();
-	/* we'd lost the race with close(), sod off silently */
-	/* note that inode->i_lock prevents reordering problems
-	 * between accesses to descriptor table and ->i_dnotify */
-	if (f != filp)
-		goto out_free;
+	/* expect most fcntl to add new rather than augment old */
+	dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
+	if (!dn) {
+		error = -ENOMEM;
+		goto out_err;
+	}
 
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	if (error)
-		goto out_free;
+	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
+	new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
+	if (!new_dnentry) {
+		error = -ENOMEM;
+		goto out_err;
+	}
 
-	dn->dn_mask = arg;
-	dn->dn_fd = fd;
-	dn->dn_filp = filp;
-	dn->dn_owner = id;
-	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-	dn->dn_next = inode->i_dnotify;
-	inode->i_dnotify = dn;
-	spin_unlock(&inode->i_lock);
-	return 0;
+	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
+	mask = convert_arg(arg);
 
-out_free:
-	spin_unlock(&inode->i_lock);
-	kmem_cache_free(dn_cache, dn);
-	return error;
-}
+	/* set up the new_entry and new_dnentry */
+	new_entry = &new_dnentry->fsn_entry;
+	fsnotify_init_mark(new_entry, dnotify_free_mark);
+	new_entry->mask = mask;
+	new_dnentry->dn = NULL;
 
-void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	struct dnotify_struct *	dn;
-	struct dnotify_struct **prev;
-	struct fown_struct *	fown;
-	int			changed = 0;
+	/* this is needed to prevent the fcntl/close race described below */
+	mutex_lock(&dnotify_mark_mutex);
 
+	/* add the new_entry or find an old one. */
 	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_mask & event) == 0) {
-			prev = &dn->dn_next;
-			continue;
-		}
-		fown = &dn->dn_filp->f_owner;
-		send_sigio(fown, dn->dn_fd, POLL_MSG);
-		if (dn->dn_mask & DN_MULTISHOT)
-			prev = &dn->dn_next;
-		else {
-			*prev = dn->dn_next;
-			changed = 1;
-			kmem_cache_free(dn_cache, dn);
-		}
-	}
-	if (changed)
-		redo_inode_mask(inode);
+	entry = fsnotify_find_mark_entry(dnotify_group, inode);
 	spin_unlock(&inode->i_lock);
-}
-
-EXPORT_SYMBOL(__inode_dir_notify);
+	if (entry) {
+		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+		spin_lock(&entry->lock);
+	} else {
+		fsnotify_add_mark(new_entry, dnotify_group, inode);
+		spin_lock(&new_entry->lock);
+		entry = new_entry;
+		dnentry = new_dnentry;
+		/* we used new_entry, so don't free it */
+		new_entry = NULL;
+	}
 
-/*
- * This is hopelessly wrong, but unfixable without API changes.  At
- * least it doesn't oops the kernel...
- *
- * To safely access ->d_parent we need to keep d_move away from it.  Use the
- * dentry's d_lock for this.
- */
-void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-	struct dentry *parent;
+	rcu_read_lock();
+	f = fcheck(fd);
+	rcu_read_unlock();
 
-	if (!dir_notify_enable)
-		return;
+	/* if (f != filp) means that we lost a race and another task/thread
+	 * actually closed the fd we are still playing with before we grabbed
+	 * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
+	 * only time we clean up the mark entries we need to get our mark off
+	 * the list. */
+	if (f != filp) {
+		/* if we added ourselves, shoot ourselves, it's possible that
+		 * the flush actually did shoot this entry.  That's fine too
+		 * since multiple calls to destroy_mark is perfectly safe, if
+		 * we found a dnentry already attached to the inode, just sod
+		 * off silently as the flush at close time dealt with it.
+		 */
+		if (dnentry == new_dnentry)
+			destroy = 1;
+		goto out;
+	}
 
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		__inode_dir_notify(parent->d_inode, event);
-		dput(parent);
-	} else {
-		spin_unlock(&dentry->d_lock);
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+	if (error) {
+		/* if we added, we must shoot */
+		if (dnentry == new_dnentry)
+			destroy = 1;
+		goto out;
 	}
+
+	error = attach_dn(dn, dnentry, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dnentry, so don't free it */
+	if (!error)
+		dn = NULL;
+	/* -EEXIST means that we didn't add this new dn and used an old one.
+	 * that isn't an error (and the unused dn should be freed) */
+	else if (error == -EEXIST)
+		error = 0;
+
+	dnotify_recalc_inode_mask(entry);
+out:
+	spin_unlock(&entry->lock);
+
+	if (destroy)
+		fsnotify_destroy_mark_by_entry(entry);
+
+	fsnotify_recalc_group_mask(dnotify_group);
+
+	mutex_unlock(&dnotify_mark_mutex);
+	fsnotify_put_mark(entry);
+out_err:
+	if (new_entry)
+		fsnotify_put_mark(new_entry);
+	if (dn)
+		kmem_cache_free(dnotify_struct_cache, dn);
+	return error;
 }
-EXPORT_SYMBOL_GPL(dnotify_parent);
 
 static int __init dnotify_init(void)
 {
-	dn_cache = kmem_cache_create("dnotify_cache",
-		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
+	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
+	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+
+	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
+					      0, &dnotify_fsnotify_ops);
+	if (IS_ERR(dnotify_group))
+		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
 }
 
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index 102a902..ecc0628 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -10,7 +10,7 @@
 
 struct dnotify_struct {
 	struct dnotify_struct *	dn_next;
-	unsigned long		dn_mask;
+	__u32			dn_mask;
 	int			dn_fd;
 	struct file *		dn_filp;
 	fl_owner_t		dn_owner;
@@ -21,23 +21,18 @@ struct dnotify_struct {
 
 #ifdef CONFIG_DNOTIFY
 
-extern void __inode_dir_notify(struct inode *, unsigned long);
+#define DNOTIFY_ALL_EVENTS (FS_DELETE | FS_DELETE_CHILD |\
+			    FS_MODIFY | FS_MODIFY_CHILD |\
+			    FS_ACCESS | FS_ACCESS_CHILD |\
+			    FS_ATTRIB | FS_ATTRIB_CHILD |\
+			    FS_CREATE | FS_DN_RENAME |\
+			    FS_MOVED_FROM | FS_MOVED_TO)
+
 extern void dnotify_flush(struct file *, fl_owner_t);
 extern int fcntl_dirnotify(int, struct file *, unsigned long);
-extern void dnotify_parent(struct dentry *, unsigned long);
-
-static inline void inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	if (inode->i_dnotify_mask & (event))
-		__inode_dir_notify(inode, event);
-}
 
 #else
 
-static inline void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-}
-
 static inline void dnotify_flush(struct file *filp, fl_owner_t id)
 {
 }
@@ -47,14 +42,6 @@ static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	return -EINVAL;
 }
 
-static inline void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-}
-
-static inline void inode_dir_notify(struct inode *inode, unsigned long event)
-{
-}
-
 #endif /* CONFIG_DNOTIFY */
 
 #endif /* __KERNEL __ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 275b086..323b5ce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -760,11 +760,6 @@ struct inode {
 	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
 #endif
 
-#ifdef CONFIG_DNOTIFY
-	unsigned long		i_dnotify_mask; /* Directory notify events */
-	struct dnotify_struct	*i_dnotify; /* for directory notifications */
-#endif
-
 #ifdef CONFIG_INOTIFY
 	struct list_head	inotify_watches; /* watches on this inode */
 	struct mutex		inotify_mutex;	/* protects the watches list */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 6a662ed..db12d9d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -74,13 +74,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	__u32 new_dir_mask = 0;
 
 	if (old_dir == new_dir) {
-		inode_dir_notify(old_dir, DN_RENAME);
 		old_dir_mask = FS_DN_RENAME;
-	} else {
-		inode_dir_notify(old_dir, DN_DELETE);
-		old_dir_mask = FS_DELETE;
-		inode_dir_notify(new_dir, DN_CREATE);
-		new_dir_mask = FS_CREATE;
 	}
 
 	if (isdir) {
@@ -132,7 +126,6 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 
 	if (isdir)
 		mask |= FS_IN_ISDIR;
-	dnotify_parent(dentry, DN_DELETE);
 
 	fsnotify_parent(dentry, mask);
 }
@@ -154,7 +147,6 @@ static inline void fsnotify_inoderemove(struct inode *inode)
  */
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
-	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
@@ -169,7 +161,6 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry)
 {
-	inode_dir_notify(dir, DN_CREATE);
 	inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name,
 				  inode);
 	fsnotify_link_count(inode);
@@ -186,7 +177,6 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
 	struct inode *d_inode = dentry->d_inode;
 
-	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
@@ -204,7 +194,6 @@ static inline void fsnotify_access(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	dnotify_parent(dentry, DN_ACCESS);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
@@ -222,7 +211,6 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	dnotify_parent(dentry, DN_MODIFY);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
@@ -289,47 +277,33 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 {
 	struct inode *inode = dentry->d_inode;
-	int dn_mask = 0;
-	__u32 in_mask = 0;
+	__u32 mask = 0;
+
+	if (ia_valid & ATTR_UID)
+		mask |= FS_ATTRIB;
+	if (ia_valid & ATTR_GID)
+		mask |= FS_ATTRIB;
+	if (ia_valid & ATTR_SIZE)
+		mask |= FS_MODIFY;
 
-	if (ia_valid & ATTR_UID) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
-	if (ia_valid & ATTR_GID) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		in_mask |= FS_MODIFY;
-		dn_mask |= DN_MODIFY;
-	}
 	/* both times implies a utime(s) call */
 	if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
-	{
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	} else if (ia_valid & ATTR_ATIME) {
-		in_mask |= FS_ACCESS;
-		dn_mask |= DN_ACCESS;
-	} else if (ia_valid & ATTR_MTIME) {
-		in_mask |= FS_MODIFY;
-		dn_mask |= DN_MODIFY;
-	}
-	if (ia_valid & ATTR_MODE) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
+		mask |= FS_ATTRIB;
+	else if (ia_valid & ATTR_ATIME)
+		mask |= FS_ACCESS;
+	else if (ia_valid & ATTR_MTIME)
+		mask |= FS_MODIFY;
+
+	if (ia_valid & ATTR_MODE)
+		mask |= FS_ATTRIB;
 
-	if (dn_mask)
-		dnotify_parent(dentry, dn_mask);
-	if (in_mask) {
+	if (mask) {
 		if (S_ISDIR(inode->i_mode))
-			in_mask |= FS_IN_ISDIR;
-		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
+			mask |= FS_IN_ISDIR;
+		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
-		fsnotify_parent(dentry, in_mask);
-		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
+		fsnotify_parent(dentry, mask);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 13d2dd5..9ea800e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -57,6 +57,9 @@
 				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
 				   FS_DELETE)
 
+/* listeners that hard code group numbers near the top */
+#define DNOTIFY_GROUP_NUM	UINT_MAX
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
-- 
cgit v0.10.2


From a2d8bc6cb4a3024661baf877242f123787d0c054 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:37 -0400
Subject: fsnotify: generic notification queue and waitq

inotify needs to do asyc notification in which event information is stored
on a queue until the listener is ready to receive it.  This patch
implements a generic notification queue for inotify (and later fanotify) to
store events to be sent at a later time.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 83b8ec0..4dc2408 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -13,8 +13,12 @@ extern struct list_head fsnotify_groups;
 /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
 extern __u32 fsnotify_mask;
 
+/* destroy all events sitting in this groups notification queue */
+extern void fsnotify_flush_notify(struct fsnotify_group *group);
+
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
+
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
 /*
@@ -22,4 +26,9 @@ extern void fsnotify_clear_marks_by_inode(struct inode *inode);
  * about events that happen to its children.
  */
 extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+
+/* allocate and destroy and event holder to attach events to notification/access queues */
+extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
+extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index a29d2fa..0e16771 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -91,6 +91,9 @@ static void fsnotify_get_group(struct fsnotify_group *group)
  */
 void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
+	/* clear the notification queue of all events */
+	fsnotify_flush_notify(group);
+
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
 
@@ -214,6 +217,12 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	group->group_num = group_num;
 	group->mask = mask;
 
+	mutex_init(&group->notification_mutex);
+	INIT_LIST_HEAD(&group->notification_list);
+	init_waitqueue_head(&group->notification_waitq);
+	group->q_len = 0;
+	group->max_events = UINT_MAX;
+
 	spin_lock_init(&group->mark_lock);
 	atomic_set(&group->num_marks, 0);
 	INIT_LIST_HEAD(&group->mark_entries);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8e9a87..dddecc7 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -16,6 +16,21 @@
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * Basic idea behind the notification queue: An fsnotify group (like inotify)
+ * sends the userspace notification about events asyncronously some time after
+ * the event happened.  When inotify gets an event it will need to add that
+ * event to the group notify queue.  Since a single event might need to be on
+ * multiple group's notification queues we can't add the event directly to each
+ * queue and instead add a small "event_holder" to each queue.  This event_holder
+ * has a pointer back to the original event.  Since the majority of events are
+ * going to end up on one, and only one, notification queue we embed one
+ * event_holder into each event.  This means we have a single allocation instead
+ * of always needing two.  If the embedded event_holder is already in use by
+ * another group a new event_holder (from fsnotify_event_holder_cachep) will be
+ * allocated and used.
+ */
+
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -33,6 +48,21 @@
 #include "fsnotify.h"
 
 static struct kmem_cache *fsnotify_event_cachep;
+static struct kmem_cache *fsnotify_event_holder_cachep;
+/*
+ * This is a magic event we send when the q is too full.  Since it doesn't
+ * hold real event information we just keep one system wide and use it any time
+ * it is needed.  It's refcnt is set 1 at kernel init time and will never
+ * get set to 0 so it will never get 'freed'
+ */
+static struct fsnotify_event q_overflow_event;
+
+/* return true if the notify queue is empty, false otherwise */
+bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	return list_empty(&group->notification_list) ? true : false;
+}
 
 void fsnotify_get_event(struct fsnotify_event *event)
 {
@@ -52,19 +82,176 @@ void fsnotify_put_event(struct fsnotify_event *event)
 	}
 }
 
+struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
+{
+	return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
+}
+
+void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
+{
+	kmem_cache_free(fsnotify_event_holder_cachep, holder);
+}
+
+/*
+ * check if 2 events contain the same information.
+ */
+static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	if ((old->mask == new->mask) &&
+	    (old->to_tell == new->to_tell) &&
+	    (old->data_type == new->data_type)) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_INODE):
+			if (old->inode == new->inode)
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+		case (FSNOTIFY_EVENT_NONE):
+			return true;
+		};
+	}
+	return false;
+}
+
 /*
- * Allocate a new event which will be sent to each group's handle_event function
- * if the group was interested in this particular event.
+ * Add an event to the group notification queue.  The group can later pull this
+ * event off the queue to deal with.  If the event is successfully added to the
+ * group's notification queue, a reference is taken on event.
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type)
+int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *holder = NULL;
+	struct list_head *list = &group->notification_list;
+	struct fsnotify_event_holder *last_holder;
+	struct fsnotify_event *last_event;
+
+	/*
+	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
+	 * Check if we expect to be able to use that holder.  If not alloc a new
+	 * holder.
+	 * For the overflow event it's possible that something will use the in
+	 * event holder before we get the lock so we may need to jump back and
+	 * alloc a new holder, this can't happen for most events...
+	 */
+	if (!list_empty(&event->holder.event_list)) {
+alloc_holder:
+		holder = fsnotify_alloc_event_holder();
+		if (!holder)
+			return -ENOMEM;
+	}
+
+	mutex_lock(&group->notification_mutex);
+
+	if (group->q_len >= group->max_events)
+		event = &q_overflow_event;
+
+	spin_lock(&event->lock);
+
+	if (list_empty(&event->holder.event_list)) {
+		if (unlikely(holder))
+			fsnotify_destroy_event_holder(holder);
+		holder = &event->holder;
+	} else if (unlikely(!holder)) {
+		/* between the time we checked above and got the lock the in
+		 * event holder was used, go back and get a new one */
+		spin_unlock(&event->lock);
+		mutex_unlock(&group->notification_mutex);
+		goto alloc_holder;
+	}
+
+	if (!list_empty(list)) {
+		last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
+		last_event = last_holder->event;
+		if (event_compare(last_event, event)) {
+			spin_unlock(&event->lock);
+			mutex_unlock(&group->notification_mutex);
+			if (holder != &event->holder)
+				fsnotify_destroy_event_holder(holder);
+			return 0;
+		}
+	}
+
+	group->q_len++;
+	holder->event = event;
+
+	fsnotify_get_event(event);
+	list_add_tail(&holder->event_list, list);
+	spin_unlock(&event->lock);
+	mutex_unlock(&group->notification_mutex);
+
+	wake_up(&group->notification_waitq);
+	return 0;
+}
+
+/*
+ * Remove and return the first event from the notification list.  There is a
+ * reference held on this event since it was on the list.  It is the responsibility
+ * of the caller to drop this reference.
+ */
+struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
+	struct fsnotify_event_holder *holder;
 
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
-	if (!event)
-		return NULL;
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
+	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+
+	event = holder->event;
+
+	spin_lock(&event->lock);
+	holder->event = NULL;
+	list_del_init(&holder->event_list);
+	spin_unlock(&event->lock);
+
+	/* event == holder means we are referenced through the in event holder */
+	if (holder != &event->holder)
+		fsnotify_destroy_event_holder(holder);
+
+	group->q_len--;
+
+	return event;
+}
+
+/*
+ * This will not remove the event, that must be done with fsnotify_remove_notify_event()
+ */
+struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+	struct fsnotify_event_holder *holder;
+
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+	event = holder->event;
+
+	return event;
+}
+
+/*
+ * Called when a group is being torn down to clean up any outstanding
+ * event notifications.
+ */
+void fsnotify_flush_notify(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		event = fsnotify_remove_notify_event(group);
+		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+	}
+	mutex_unlock(&group->notification_mutex);
+}
+
+static void initialize_event(struct fsnotify_event *event)
+{
+	event->holder.event = NULL;
+	INIT_LIST_HEAD(&event->holder.event_list);
 	atomic_set(&event->refcnt, 1);
 
 	spin_lock_init(&event->lock);
@@ -72,7 +259,32 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 	event->path.dentry = NULL;
 	event->path.mnt = NULL;
 	event->inode = NULL;
+	event->data_type = FSNOTIFY_EVENT_NONE;
 
+	event->to_tell = NULL;
+}
+
+/*
+ * fsnotify_create_event - Allocate a new event which will be sent to each
+ * group's handle_event function if the group was interested in this
+ * particular event.
+ *
+ * @to_tell the inode which is supposed to receive the event (sometimes a
+ *	parent of the inode to which the event happened.
+ * @mask what actually happened.
+ * @data pointer to the object which was actually affected
+ * @data_type flag indication if the data is a file, path, inode, nothing...
+ */
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+					     void *data, int data_type)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	initialize_event(event);
 	event->to_tell = to_tell;
 
 	switch (data_type) {
@@ -114,6 +326,10 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 __init int fsnotify_notification_init(void)
 {
 	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
+	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
+
+	initialize_event(&q_overflow_event);
+	q_overflow_event.mask = FS_Q_OVERFLOW;
 
 	return 0;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9ea800e..15f8f82 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -119,6 +119,13 @@ struct fsnotify_group {
 
 	const struct fsnotify_ops *ops;	/* how this group handles things */
 
+	/* needed to send notification to userspace */
+	struct mutex notification_mutex;	/* protect the notification_list */
+	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace */
+	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
+	unsigned int q_len;			/* events on the queue */
+	unsigned int max_events;		/* maximum events allowed on the list */
+
 	/* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
 	spinlock_t mark_lock;		/* protect mark_entries list */
 	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
@@ -136,11 +143,32 @@ struct fsnotify_group {
 };
 
 /*
+ * A single event can be queued in multiple group->notification_lists.
+ *
+ * each group->notification_list will point to an event_holder which in turns points
+ * to the actual event that needs to be sent to userspace.
+ *
+ * Seemed cheaper to create a refcnt'd event and a small holder for every group
+ * than create a different event for every group
+ *
+ */
+struct fsnotify_event_holder {
+	struct fsnotify_event *event;
+	struct list_head event_list;
+};
+
+/*
  * all of the information about the original object we want to now send to
  * a group.  If you want to carry more info from the accessing task to the
  * listener this structure is where you need to be adding fields.
  */
 struct fsnotify_event {
+	/*
+	 * If we create an event we are also likely going to need a holder
+	 * to link to a group.  So embed one holder in the event.  Means only
+	 * one allocation for the common case where we only have one group
+	 */
+	struct fsnotify_event_holder holder;
 	spinlock_t lock;	/* protection for the associated event_holder and private_list */
 	/* to_tell may ONLY be dereferenced during handle_event(). */
 	struct inode *to_tell;	/* either the inode the event happened to or its parent */
@@ -264,6 +292,15 @@ extern void fsnotify_put_event(struct fsnotify_event *event);
 extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
 									struct fsnotify_event *event);
 
+/* attach the event to the group notification queue */
+extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event);
+/* true if the group notification queue is empty */
+extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
+/* return, but do not dequeue the first event on the notification queue */
+extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group);
+/* reutnr AND dequeue the first event on the notification queue */
+extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group);
+
 /* functions used to manipulate the marks attached to inodes */
 
 /* run all marks associated with an inode and update inode->i_fsnotify_mask */
-- 
cgit v0.10.2


From 62ffe5dfba056f7ba81d710fee9f28c58a42fdd6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:43 -0400
Subject: fsnotify: include pathnames with entries when possible

When inotify wants to send events to a directory about a child it includes
the name of the original file.  This patch collects that filename and makes
it available for notification.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7fc7600..675129f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -114,7 +114,8 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+			 dentry->d_name.name);
 		dput(parent);
 	}
 
@@ -131,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent);
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
@@ -156,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is);
+				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index dddecc7..c69b18b 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -78,6 +78,7 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		if (event->data_type == FSNOTIFY_EVENT_PATH)
 			path_put(&event->path);
 
+		kfree(event->file_name);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
 }
@@ -262,6 +263,9 @@ static void initialize_event(struct fsnotify_event *event)
 	event->data_type = FSNOTIFY_EVENT_NONE;
 
 	event->to_tell = NULL;
+
+	event->file_name = NULL;
+	event->name_len = 0;
 }
 
 /*
@@ -274,9 +278,10 @@ static void initialize_event(struct fsnotify_event *event)
  * @mask what actually happened.
  * @data pointer to the object which was actually affected
  * @data_type flag indication if the data is a file, path, inode, nothing...
+ * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type)
+					     void *data, int data_type, const char *name)
 {
 	struct fsnotify_event *event;
 
@@ -285,6 +290,15 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		return NULL;
 
 	initialize_event(event);
+
+	if (name) {
+		event->file_name = kstrdup(name, GFP_KERNEL);
+		if (!event->file_name) {
+			kmem_cache_free(fsnotify_event_cachep, event);
+			return NULL;
+		}
+		event->name_len = strlen(event->file_name);
+	}
 	event->to_tell = to_tell;
 
 	switch (data_type) {
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index db12d9d..180740e 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
 
-	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -91,8 +91,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
 				  source);
 
-	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE);
-	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE);
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name);
 
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
@@ -104,7 +104,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
-		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -138,7 +138,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
 
-	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL);
 	__fsnotify_inode_delete(inode);
 }
 
@@ -151,7 +151,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
 }
 
 /*
@@ -166,7 +166,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
 
-	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name);
 }
 
 /*
@@ -180,7 +180,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
 }
 
 /*
@@ -197,7 +197,7 @@ static inline void fsnotify_access(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -214,7 +214,7 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -231,7 +231,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -250,7 +250,7 @@ static inline void fsnotify_close(struct file *file)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL);
 }
 
 /*
@@ -267,7 +267,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -303,7 +303,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 		fsnotify_parent(dentry, mask);
-		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 15f8f82..52692f4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -192,6 +192,9 @@ struct fsnotify_event {
 	int data_type;		/* which of the above union we have */
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
+
+	char *file_name;
+	size_t name_len;
 };
 
 /*
@@ -224,7 +227,7 @@ struct fsnotify_mark_entry {
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
-extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name);
 extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 
@@ -319,10 +322,12 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-						    void *data, int data_is);
+						    void *data, int data_is, const char *name);
+
 #else
 
-static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+			    const char *name);
 {}
 
 static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
-- 
cgit v0.10.2


From 47882c6f51e8ef41fbbe2bbb746a1ea3228dd7ca Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:47 -0400
Subject: fsnotify: add correlations between events

As part of the standard inotify events it includes a correlation cookie
between two dentry move operations.  This patch includes the same behaviour
in fsnotify events.  It is needed so that inotify userspace can be
implemented on top of fsnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 675129f..f11d75f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -115,7 +115,7 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 		mask |= FS_EVENT_ON_CHILD;
 
 		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-			 dentry->d_name.name);
+			 dentry->d_name.name, 0);
 		dput(parent);
 	}
 
@@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent);
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name)
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
@@ -157,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name);
+				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c69b18b..346f6e5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/namei.h>
@@ -56,6 +57,17 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
  * get set to 0 so it will never get 'freed'
  */
 static struct fsnotify_event q_overflow_event;
+static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
+
+/**
+ * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
+ * Called from fsnotify_move, which is inlined into filesystem modules.
+ */
+u32 fsnotify_get_cookie(void)
+{
+	return atomic_inc_return(&fsnotify_sync_cookie);
+}
+EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
@@ -266,6 +278,8 @@ static void initialize_event(struct fsnotify_event *event)
 
 	event->file_name = NULL;
 	event->name_len = 0;
+
+	event->sync_cookie = 0;
 }
 
 /*
@@ -280,8 +294,8 @@ static void initialize_event(struct fsnotify_event *event)
  * @data_type flag indication if the data is a file, path, inode, nothing...
  * @name the filename, if available
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type, const char *name)
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
+					     int data_type, const char *name, u32 cookie)
 {
 	struct fsnotify_event *event;
 
@@ -299,6 +313,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		}
 		event->name_len = strlen(event->file_name);
 	}
+
+	event->sync_cookie = cookie;
 	event->to_tell = to_tell;
 
 	switch (data_type) {
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 180740e..c25b39d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
 
-	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -69,7 +69,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 				 int isdir, struct inode *target, struct dentry *moved)
 {
 	struct inode *source = moved->d_inode;
-	u32 cookie = inotify_get_cookie();
+	u32 in_cookie = inotify_get_cookie();
+	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = 0;
 	__u32 new_dir_mask = 0;
 
@@ -86,13 +87,13 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	old_dir_mask |= FS_MOVED_FROM;
 	new_dir_mask |= FS_MOVED_TO;
 
-	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
+	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
 				  source);
-	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
+	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
 				  source);
 
-	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name);
-	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name);
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie);
 
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
@@ -104,7 +105,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
-		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -138,7 +139,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
 
-	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	__fsnotify_inode_delete(inode);
 }
 
@@ -151,7 +152,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
 
 /*
@@ -166,7 +167,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
 
-	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name);
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0);
 }
 
 /*
@@ -180,7 +181,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
 
 /*
@@ -197,7 +198,7 @@ static inline void fsnotify_access(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -214,7 +215,7 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -231,7 +232,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -250,7 +251,7 @@ static inline void fsnotify_close(struct file *file)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
@@ -267,7 +268,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -303,7 +304,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 		fsnotify_parent(dentry, mask);
-		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 52692f4..b78b557 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -193,6 +193,7 @@ struct fsnotify_event {
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
 
+	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
 	char *file_name;
 	size_t name_len;
 };
@@ -227,9 +228,11 @@ struct fsnotify_mark_entry {
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
-extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name);
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+		     const char *name, u32 cookie);
 extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
+extern u32 fsnotify_get_cookie(void);
 
 static inline int fsnotify_inode_watches_children(struct inode *inode)
 {
@@ -322,12 +325,13 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-						    void *data, int data_is, const char *name);
+						    void *data, int data_is, const char *name,
+						    u32 cookie);
 
 #else
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-			    const char *name);
+			    const char *name, u32 cookie)
 {}
 
 static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
@@ -342,6 +346,11 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
 static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
 {}
 
+static inline u32 fsnotify_get_cookie(void)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
cgit v0.10.2


From e4aff117368cfdd3567ee41844d216d079b55173 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:50 -0400
Subject: fsnotify: allow groups to add private data to events

inotify needs per group information attached to events.  This patch allows
groups to attach private information and implements a callback so that
information can be freed when an event is being destroyed.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d9d80f5..12f9e6b 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -183,6 +183,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 	.should_send_event = dnotify_should_send_event,
 	.free_group_priv = NULL,
 	.freeing_mark = dnotify_freeing_mark,
+	.free_event_priv = NULL,
 };
 
 /*
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 346f6e5..959b73e 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -90,6 +90,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		if (event->data_type == FSNOTIFY_EVENT_PATH)
 			path_put(&event->path);
 
+		BUG_ON(!list_empty(&event->private_data_list));
+
 		kfree(event->file_name);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
@@ -106,7 +108,29 @@ void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
 }
 
 /*
- * check if 2 events contain the same information.
+ * Find the private data that the group previously attached to this event when
+ * the group added the event to the notification queue (fsnotify_add_notify_event)
+ */
+struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_event_private_data *lpriv;
+	struct fsnotify_event_private_data *priv = NULL;
+
+	assert_spin_locked(&event->lock);
+
+	list_for_each_entry(lpriv, &event->private_data_list, event_list) {
+		if (lpriv->group == group) {
+			priv = lpriv;
+			list_del(&priv->event_list);
+			break;
+		}
+	}
+	return priv;
+}
+
+/*
+ * Check if 2 events contain the same information.  We do not compare private data
+ * but at this moment that isn't a problem for any know fsnotify listeners.
  */
 static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
 {
@@ -134,13 +158,17 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
  * event off the queue to deal with.  If the event is successfully added to the
  * group's notification queue, a reference is taken on event.
  */
-int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event)
+int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+			      struct fsnotify_event_private_data *priv)
 {
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
 
+	/* easy to tell if priv was attached to the event */
+	INIT_LIST_HEAD(&priv->event_list);
+
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
 	 * Check if we expect to be able to use that holder.  If not alloc a new
@@ -158,8 +186,11 @@ alloc_holder:
 
 	mutex_lock(&group->notification_mutex);
 
-	if (group->q_len >= group->max_events)
+	if (group->q_len >= group->max_events) {
 		event = &q_overflow_event;
+		/* sorry, no private data on the overflow event */
+		priv = NULL;
+	}
 
 	spin_lock(&event->lock);
 
@@ -183,7 +214,7 @@ alloc_holder:
 			mutex_unlock(&group->notification_mutex);
 			if (holder != &event->holder)
 				fsnotify_destroy_event_holder(holder);
-			return 0;
+			return -EEXIST;
 		}
 	}
 
@@ -192,6 +223,8 @@ alloc_holder:
 
 	fsnotify_get_event(event);
 	list_add_tail(&holder->event_list, list);
+	if (priv)
+		list_add_tail(&priv->event_list, &event->private_data_list);
 	spin_unlock(&event->lock);
 	mutex_unlock(&group->notification_mutex);
 
@@ -252,10 +285,19 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 void fsnotify_flush_notify(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
+	struct fsnotify_event_private_data *priv;
 
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_notify_event(group);
+		/* if they don't implement free_event_priv they better not have attached any */
+		if (group->ops->free_event_priv) {
+			spin_lock(&event->lock);
+			priv = fsnotify_remove_priv_from_event(group, event);
+			spin_unlock(&event->lock);
+			if (priv)
+				group->ops->free_event_priv(priv);
+		}
 		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
 	}
 	mutex_unlock(&group->notification_mutex);
@@ -274,6 +316,8 @@ static void initialize_event(struct fsnotify_event *event)
 	event->inode = NULL;
 	event->data_type = FSNOTIFY_EVENT_NONE;
 
+	INIT_LIST_HEAD(&event->private_data_list);
+
 	event->to_tell = NULL;
 
 	event->file_name = NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b78b557..efdf9e4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -63,6 +63,7 @@
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
+struct fsnotify_event_private_data;
 
 /*
  * Each group much define these ops.  The fsnotify infrastructure will call
@@ -81,6 +82,7 @@ struct fsnotify_ops {
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
 };
 
 /*
@@ -158,6 +160,15 @@ struct fsnotify_event_holder {
 };
 
 /*
+ * Inotify needs to tack data onto an event.  This struct lets us later find the
+ * correct private data of the correct group.
+ */
+struct fsnotify_event_private_data {
+	struct fsnotify_group *group;
+	struct list_head event_list;
+};
+
+/*
  * all of the information about the original object we want to now send to
  * a group.  If you want to carry more info from the accessing task to the
  * listener this structure is where you need to be adding fields.
@@ -196,6 +207,8 @@ struct fsnotify_event {
 	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
 	char *file_name;
 	size_t name_len;
+
+	struct list_head private_data_list;	/* groups can store private data here */
 };
 
 /*
@@ -294,17 +307,18 @@ extern void fsnotify_put_group(struct fsnotify_group *group);
 /* take a reference to an event */
 extern void fsnotify_get_event(struct fsnotify_event *event);
 extern void fsnotify_put_event(struct fsnotify_event *event);
-/* find private data previously attached to an event */
-extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
-									struct fsnotify_event *event);
+/* find private data previously attached to an event and unlink it */
+extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group,
+									   struct fsnotify_event *event);
 
 /* attach the event to the group notification queue */
-extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event);
+extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+				     struct fsnotify_event_private_data *priv);
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group);
-/* reutnr AND dequeue the first event on the notification queue */
+/* return AND dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group);
 
 /* functions used to manipulate the marks attached to inodes */
-- 
cgit v0.10.2


From 1ef5f13c6c8acd3fd10db9f1743f3b4cf30a4abb Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:54 -0400
Subject: fsnotify: fsnotify marks on inodes pin them in core

This patch pins any inodes with an fsnotify mark in core.  The idea is that
as soon as the mark is removed from the inode->fsnotify_mark_entries list
the inode will be iput.  In reality is doesn't quite work exactly this way.
The igrab will happen when the mark is added to an inode, but the iput will
happen when the inode pointer is NULL'd inside the mark.

It's possible that 2 racing things will try to remove the mark from
different directions.  One may try to remove the mark because of an
explicit request and one might try to remove it because the inode was
deleted.  It's possible that the removal because of inode deletion will
remove the mark from the inode's list, but the removal by explicit request
will actually set entry->inode == NULL; and call the iput.  This is safe.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a395348..282150f 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -204,6 +204,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	 */
 
 
+	iput(inode);
+
 	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode.  If that's the
@@ -306,6 +308,10 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	struct fsnotify_mark_entry *lentry;
 	int ret = 0;
 
+	inode = igrab(inode);
+	if (unlikely(!inode))
+		return -EINVAL;
+
 	/*
 	 * LOCKING ORDER!!!!
 	 * entry->lock
@@ -337,6 +343,7 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 
 	if (lentry) {
 		ret = -EEXIST;
+		iput(inode);
 		fsnotify_put_mark(lentry);
 	} else {
 		__fsnotify_update_child_dentry_flags(inode);
-- 
cgit v0.10.2


From 164bc6195139047faaf5ada1278332e99494803b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:58 -0400
Subject: fsnotify: handle filesystem unmounts with fsnotify marks

When an fs is unmounted with an fsnotify mark entry attached to one of its
inodes we need to destroy that mark entry and we also (like inotify) send
an unmount event.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/fs/inode.c b/fs/inode.c
index 54c63ce..ca33701 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -407,6 +407,7 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
+	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 282150f..0a499d2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -89,6 +89,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
@@ -351,3 +352,74 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 
 	return ret;
 }
+
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
+ * We temporarily drop inode_lock, however, and CAN block.
+ */
+void fsnotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inode *need_iput_tmp;
+
+		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+			continue;
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		need_iput_tmp = need_iput;
+		need_iput = NULL;
+
+		/* In case fsnotify_inode_delete() drops a reference. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;
+
+		/* In case the dropping of a reference would nuke next_i. */
+		if ((&next_i->i_sb_list != list) &&
+		    atomic_read(&next_i->i_count) &&
+		    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
+			__iget(next_i);
+			need_iput = next_i;
+		}
+
+		/*
+		 * We can safely drop inode_lock here because we hold
+		 * references on both inode and next_i.  Also no new inodes
+		 * will be added since the umount has begun.  Finally,
+		 * iprune_mutex keeps shrink_icache_memory() away.
+		 */
+		spin_unlock(&inode_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* for each watch, send FS_UNMOUNT and then remove it */
+		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+		fsnotify_inode_delete(inode);
+
+		iput(inode);
+
+		spin_lock(&inode_lock);
+	}
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index efdf9e4..d2c0ee3 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -336,6 +336,7 @@ extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
 extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
+extern void fsnotify_unmount_inodes(struct list_head *list);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
@@ -365,6 +366,9 @@ static inline u32 fsnotify_get_cookie(void)
 	return 0;
 }
 
+static inline void fsnotify_unmount_inodes(struct list_head *list)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
cgit v0.10.2


From 63c882a05416e18de6fb59f7dd6da48f3bbe8273 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:02:01 -0400
Subject: inotify: reimplement inotify using fsnotify

Reimplement inotify_user using fsnotify.  This should be feature for feature
exactly the same as the original inotify_user.  This does not make any changes
to the in kernel inotify feature used by audit.  Those patches (and the eventual
removal of in kernel inotify) will come after the new inotify_user proves to be
working correctly.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>

diff --git a/MAINTAINERS b/MAINTAINERS
index 96e0c8c..e697b67 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2858,6 +2858,8 @@ P:	John McCutchan
 M:	john@johnmccutchan.com
 P:	Robert Love
 M:	rlove@rlove.org
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	Documentation/filesystems/inotify.txt
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 4467928..5356884 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
 config INOTIFY
 	bool "Inotify file change notification support"
-	default y
+	default n
 	---help---
-	  Say Y here to enable inotify support.  Inotify is a file change
-	  notification system and a replacement for dnotify.  Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
+	  Say Y here to enable legacy in kernel inotify support.  Inotify is a
+	  file change notification system.  It is a replacement for dnotify.
+	  This option only provides the legacy inotify in kernel API.  There
+	  are no in tree kernel users of this interface since it is deprecated.
+	  You only need this if you are loading an out of tree kernel module
+	  that uses inotify.
 
 	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
-	  If unsure, say Y.
+	  If unsure, say N.
 
 config INOTIFY_USER
 	bool "Inotify support for userspace"
-	depends on INOTIFY
+	depends on FSNOTIFY
 	default y
 	---help---
 	  Say Y here to enable inotify support for userspace, including the
 	  associated system calls.  Inotify allows monitoring of both files and
 	  directories via a single open fd.  Events are read from the file
 	  descriptor, which is also select()- and poll()-able.
+	  Inotify fixes numerous shortcomings in dnotify and introduces several
+	  new features including multiple file events, one-shot support, and
+	  unmount notification.
 
 	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3b..9438281 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 0000000..ea2605a
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,21 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/slab.h> /* struct kmem_cache */
+
+extern struct kmem_cache *event_priv_cachep;
+
+struct inotify_event_private_data {
+	struct fsnotify_event_private_data fsnotify_event_priv_data;
+	int wd;
+};
+
+struct inotify_inode_mark_entry {
+	/* fsnotify_mark_entry MUST be the first thing */
+	struct fsnotify_mark_entry fsn_entry;
+	int wd;
+};
+
+extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+
+extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 0000000..160da54
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,137 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/path.h> /* struct path */
+#include <linux/slab.h> /* kmem_* */
+#include <linux/types.h>
+
+#include "inotify.h"
+
+static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *ientry;
+	struct inode *to_tell;
+	struct inotify_event_private_data *event_priv;
+	struct fsnotify_event_private_data *fsn_event_priv;
+	int wd, ret;
+
+	to_tell = event->to_tell;
+
+	spin_lock(&to_tell->i_lock);
+	entry = fsnotify_find_mark_entry(group, to_tell);
+	spin_unlock(&to_tell->i_lock);
+	/* race with watch removal?  We already passes should_send */
+	if (unlikely(!entry))
+		return 0;
+	ientry = container_of(entry, struct inotify_inode_mark_entry,
+			      fsn_entry);
+	wd = ientry->wd;
+
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	if (unlikely(!event_priv))
+		return -ENOMEM;
+
+	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+
+	fsn_event_priv->group = group;
+	event_priv->wd = wd;
+
+	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
+	/* EEXIST is not an error */
+	if (ret == -EEXIST)
+		ret = 0;
+
+	/* did event_priv get attached? */
+	if (list_empty(&fsn_event_priv->event_list))
+		inotify_free_event_priv(fsn_event_priv);
+
+	/*
+	 * If we hold the entry until after the event is on the queue
+	 * IN_IGNORED won't be able to pass this event in the queue
+	 */
+	fsnotify_put_mark(entry);
+
+	return ret;
+}
+
+static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	inotify_destroy_mark_entry(entry, group);
+}
+
+static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+{
+	struct fsnotify_mark_entry *entry;
+	bool send;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!entry)
+		return false;
+
+	send = (entry->mask & mask);
+
+	/* find took a reference */
+	fsnotify_put_mark(entry);
+
+	return send;
+}
+
+static int idr_callback(int id, void *p, void *data)
+{
+	BUG();
+	return 0;
+}
+
+static void inotify_free_group_priv(struct fsnotify_group *group)
+{
+	/* ideally the idr is empty and we won't hit the BUG in teh callback */
+	idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
+	idr_remove_all(&group->inotify_data.idr);
+	idr_destroy(&group->inotify_data.idr);
+}
+
+void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+{
+	struct inotify_event_private_data *event_priv;
+
+
+	event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
+				  fsnotify_event_priv_data);
+
+	kmem_cache_free(event_priv_cachep, event_priv);
+}
+
+const struct fsnotify_ops inotify_fsnotify_ops = {
+	.handle_event = inotify_handle_event,
+	.should_send_event = inotify_should_send_event,
+	.free_group_priv = inotify_free_group_priv,
+	.free_event_priv = inotify_free_event_priv,
+	.freeing_mark = inotify_freeing_mark,
+};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319..982a412 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
  * Copyright (C) 2005 John McCutchan
  * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,48 @@
  * General Public License for more details.
  */
 
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/list.h>
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/idr.h>
+#include <linux/init.h> /* module_init */
 #include <linux/inotify.h>
+#include <linux/kernel.h> /* roundup() */
+#include <linux/magic.h> /* superblock magic number */
+#include <linux/mount.h> /* mntget */
+#include <linux/namei.h> /* LOOKUP_FOLLOW */
+#include <linux/path.h> /* struct path */
+#include <linux/sched.h> /* struct user */
+#include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
-#include <linux/magic.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
 
-#include <asm/ioctls.h>
+#include "inotify.h"
 
-static struct kmem_cache *watch_cachep __read_mostly;
-static struct kmem_cache *event_cachep __read_mostly;
+#include <asm/ioctls.h>
 
 static struct vfsmount *inotify_mnt __read_mostly;
 
+/* this just sits here and wastes global memory.  used to just pad userspace messages with zeros */
+static struct inotify_event nul_inotify_event;
+
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 static int inotify_max_queued_events __read_mostly;
+int inotify_max_user_watches __read_mostly;
 
-/*
- * Lock ordering:
- *
- * inotify_dev->up_mutex (ensures we don't re-add the same watch)
- * 	inode->inotify_mutex (protects inode's watch list)
- * 		inotify_handle->mutex (protects inotify_handle's watch list)
- * 			inotify_dev->ev_mutex (protects device's event queue)
- */
+static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *event_priv_cachep __read_mostly;
+static struct fsnotify_event *inotify_ignored_event;
 
 /*
- * Lifetimes of the main data structures:
- *
- * inotify_device: Lifetime is managed by reference count, from
- * sys_inotify_init() until release.  Additional references can bump the count
- * via get_inotify_dev() and drop the count via put_inotify_dev().
- *
- * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
- * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
- * first event, or to inotify_destroy().
+ * When inotify registers a new group it increments this and uses that
+ * value as an offset to set the fsnotify group "name" and priority.
  */
-
-/*
- * struct inotify_device - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
-	struct mutex		ev_mutex;	/* protects event queue */
-	struct mutex		up_mutex;	/* synchronizes watch updates */
-	struct list_head 	events;		/* list of queued events */
-	struct user_struct	*user;		/* user who opened this dev */
-	struct inotify_handle	*ih;		/* inotify handle */
-	struct fasync_struct    *fa;            /* async notification */
-	atomic_t		count;		/* reference count */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
-};
-
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->ev_mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_user_watch - our version of an inotify_watch, we add
- * a reference to the associated inotify_device.
- */
-struct inotify_user_watch {
-	struct inotify_device	*dev;	/* associated device */
-	struct inotify_watch	wdata;	/* inotify watch data */
-};
+static atomic_t inotify_grp_num;
 
 #ifdef CONFIG_SYSCTL
 
@@ -149,280 +106,36 @@ ctl_table inotify_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static inline void get_inotify_dev(struct inotify_device *dev)
-{
-	atomic_inc(&dev->count);
-}
-
-static inline void put_inotify_dev(struct inotify_device *dev)
-{
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		kfree(dev);
-	}
-}
-
-/*
- * free_inotify_user_watch - cleans up the watch and its references
- */
-static void free_inotify_user_watch(struct inotify_watch *w)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	atomic_dec(&dev->user->inotify_watches);
-	put_inotify_dev(dev);
-	kmem_cache_free(watch_cachep, watch);
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_NOFS);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_get_last_event - return the last event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_last_event(struct inotify_device *dev)
+static inline __u32 inotify_arg_to_mask(u32 arg)
 {
-	if (list_empty(&dev->events))
-		return NULL;
-	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
-}
+	__u32 mask;
 
-/*
- * inotify_dev_queue_event - event handler registered with core inotify, adds
- * a new event to the given device
- *
- * Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name,
-				    struct inode *ignored)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-	struct inotify_kernel_event *kevent, *last;
+	/* everything should accept their own ignored and cares about children */
+	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
 
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
+	/* mask off the flags used to open the fd */
+	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
 
-	mutex_lock(&dev->ev_mutex);
-
-	/* we can safely put the watch as we don't reference it while
-	 * generating the event
-	 */
-	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
-		put_inotify_watch(w); /* final put */
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_last_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			goto out;
-		if (name && lastname && !strcmp(lastname, name))
-			goto out;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		goto out;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		goto out;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-	kill_fasync(&dev->fa, SIGIO, POLL_IN);
-
-out:
-	mutex_unlock(&dev->ev_mutex);
-}
-
-/*
- * remove_kevent - cleans up the given kevent
- *
- * Caller must hold dev->ev_mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-}
-
-/*
- * free_kevent - frees the given kevent.
- */
-static void free_kevent(struct inotify_kernel_event *kevent)
-{
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->ev_mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
-		free_kevent(kevent);
-	}
-}
-
-/*
- * find_inode - resolve a user-given path to a specific inode
- */
-static int find_inode(const char __user *dirname, struct path *path,
-		      unsigned flags)
-{
-	int error;
-
-	error = user_path_at(AT_FDCWD, dirname, flags, path);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
-	if (error)
-		path_put(path);
-	return error;
+	return mask;
 }
 
-/*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->up_mutex.
- */
-static int create_watch(struct inotify_device *dev, struct inode *inode,
-			u32 mask)
+static inline u32 inotify_mask_to_arg(__u32 mask)
 {
-	struct inotify_user_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return -ENOSPC;
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return -ENOMEM;
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	inotify_init_watch(&watch->wdata);
-	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
-	if (ret < 0)
-		free_inotify_user_watch(&watch->wdata);
-
-	return ret;
+	return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
+		       IN_Q_OVERFLOW);
 }
 
-/* Device Interface */
-
+/* intofiy userspace file descriptor functions */
 static unsigned int inotify_poll(struct file *file, poll_table *wait)
 {
-	struct inotify_device *dev = file->private_data;
+	struct fsnotify_group *group = file->private_data;
 	int ret = 0;
 
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->ev_mutex);
-	if (!list_empty(&dev->events))
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
 		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->ev_mutex);
+	mutex_unlock(&group->notification_mutex);
 
 	return ret;
 }
@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
  * enough to fit in "count". Return an error pointer if
  * not large enough.
  *
- * Called with the device ev_mutex held.
+ * Called with the group->notification_mutex held.
  */
-static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
-						  size_t count)
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
 {
 	size_t event_size = sizeof(struct inotify_event);
-	struct inotify_kernel_event *kevent;
+	struct fsnotify_event *event;
 
-	if (list_empty(&dev->events))
+	if (fsnotify_notify_queue_is_empty(group))
 		return NULL;
 
-	kevent = inotify_dev_get_event(dev);
-	if (kevent->name)
-		event_size += kevent->event.len;
+	event = fsnotify_peek_notify_event(group);
+
+	event_size += roundup(event->name_len, event_size);
 
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
-	remove_kevent(dev, kevent);
-	return kevent;
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	fsnotify_remove_notify_event(group);
+
+	return event;
 }
 
 /*
@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
  * We already checked that the event size is smaller than the
  * buffer we had in "get_one_event()" above.
  */
-static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
 				  char __user *buf)
 {
+	struct inotify_event inotify_event;
+	struct fsnotify_event_private_data *fsn_priv;
+	struct inotify_event_private_data *priv;
 	size_t event_size = sizeof(struct inotify_event);
+	size_t name_len;
+
+	/* we get the inotify watch descriptor from the event private data */
+	spin_lock(&event->lock);
+	fsn_priv = fsnotify_remove_priv_from_event(group, event);
+	spin_unlock(&event->lock);
+
+	if (!fsn_priv)
+		inotify_event.wd = -1;
+	else {
+		priv = container_of(fsn_priv, struct inotify_event_private_data,
+				    fsnotify_event_priv_data);
+		inotify_event.wd = priv->wd;
+		inotify_free_event_priv(fsn_priv);
+	}
+
+	/* round up event->name_len so it is a multiple of event_size */
+	name_len = roundup(event->name_len, event_size);
+	inotify_event.len = name_len;
+
+	inotify_event.mask = inotify_mask_to_arg(event->mask);
+	inotify_event.cookie = event->sync_cookie;
 
-	if (copy_to_user(buf, &kevent->event, event_size))
+	/* send the main event */
+	if (copy_to_user(buf, &inotify_event, event_size))
 		return -EFAULT;
 
-	if (kevent->name) {
-		buf += event_size;
+	buf += event_size;
 
-		if (copy_to_user(buf, kevent->name, kevent->event.len))
+	/*
+	 * fsnotify only stores the pathname, so here we have to send the pathname
+	 * and then pad that pathname out to a multiple of sizeof(inotify_event)
+	 * with zeros.  I get my zeros from the nul_inotify_event.
+	 */
+	if (name_len) {
+		unsigned int len_to_zero = name_len - event->name_len;
+		/* copy the path name */
+		if (copy_to_user(buf, event->file_name, event->name_len))
 			return -EFAULT;
+		buf += event->name_len;
 
-		event_size += kevent->event.len;
+		/* fill userspace with 0's from nul_inotify_event */
+		if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+			return -EFAULT;
+		buf += len_to_zero;
+		event_size += name_len;
 	}
+
 	return event_size;
 }
 
 static ssize_t inotify_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *pos)
 {
-	struct inotify_device *dev;
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
 	char __user *start;
 	int ret;
 	DEFINE_WAIT(wait);
 
 	start = buf;
-	dev = file->private_data;
+	group = file->private_data;
 
 	while (1) {
-		struct inotify_kernel_event *kevent;
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
 
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-
-		mutex_lock(&dev->ev_mutex);
-		kevent = get_one_event(dev, count);
-		mutex_unlock(&dev->ev_mutex);
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
 
 		if (kevent) {
 			ret = PTR_ERR(kevent);
 			if (IS_ERR(kevent))
 				break;
-			ret = copy_event_to_user(kevent, buf);
-			free_kevent(kevent);
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
 			if (ret < 0)
 				break;
 			buf += ret;
@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 		schedule();
 	}
 
-	finish_wait(&dev->wq, &wait);
+	finish_wait(&group->notification_waitq, &wait);
 	if (start != buf && ret != -EFAULT)
 		ret = buf - start;
 	return ret;
@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
 static int inotify_fasync(int fd, struct file *file, int on)
 {
-	struct inotify_device *dev = file->private_data;
+	struct fsnotify_group *group = file->private_data;
 
-	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
+	return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
 }
 
 static int inotify_release(struct inode *ignored, struct file *file)
 {
-	struct inotify_device *dev = file->private_data;
-
-	inotify_destroy(dev->ih);
+	struct fsnotify_group *group = file->private_data;
 
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->ev_mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->ev_mutex);
+	fsnotify_clear_marks_by_group(group);
 
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
+	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
+	fsnotify_put_group(group);
 
 	return 0;
 }
@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
 static long inotify_ioctl(struct file *file, unsigned int cmd,
 			  unsigned long arg)
 {
-	struct inotify_device *dev;
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	struct fsnotify_event *event;
 	void __user *p;
 	int ret = -ENOTTY;
+	size_t send_len = 0;
 
-	dev = file->private_data;
+	group = file->private_data;
 	p = (void __user *) arg;
 
 	switch (cmd) {
 	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list) {
+			event = holder->event;
+			send_len += sizeof(struct inotify_event);
+			send_len += roundup(event->name_len,
+					     sizeof(struct inotify_event));
+		}
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
 		break;
 	}
 
@@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 }
 
 static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.fasync         = inotify_fasync,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
+	.poll		= inotify_poll,
+	.read		= inotify_read,
+	.fasync		= inotify_fasync,
+	.release	= inotify_release,
+	.unlocked_ioctl	= inotify_ioctl,
 	.compat_ioctl	= inotify_ioctl,
 };
 
-static const struct inotify_operations inotify_user_ops = {
-	.handle_event	= inotify_dev_queue_event,
-	.destroy_watch	= free_inotify_user_watch,
-};
 
+/*
+ * find_inode - resolve a user-given path to a specific inode
+ */
+static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+{
+	int error;
+
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error)
+		path_put(path);
+	return error;
+}
+
+/*
+ * When, for whatever reason, inotify is done with a mark (or what used to be a
+ * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
+ * for the given wd.
+ *
+ * There is a bit of recursion here.  The loop looks like:
+ * 	inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
+ *	inotify_freeing_mark -> inotify_destory_mark_entry -> restart
+ * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
+ * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
+ * test below will not call back to fsnotify again.  But even if that test wasn't
+ * there this would still be safe since fsnotify_destroy_mark_by_entry() is
+ * safe from recursion.
+ */
+void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	struct inotify_inode_mark_entry *ientry;
+	struct inotify_event_private_data *event_priv;
+	struct fsnotify_event_private_data *fsn_event_priv;
+	struct fsnotify_group *egroup;
+	struct idr *idr;
+
+	spin_lock(&entry->lock);
+	egroup = entry->group;
+
+	/* if egroup we aren't really done and something might still send events
+	 * for this inode, on the callback we'll send the IN_IGNORED */
+	if (egroup) {
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark_by_entry(entry);
+		return;
+	}
+	spin_unlock(&entry->lock);
+
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	if (unlikely(!event_priv))
+		goto skip_send_ignore;
+
+	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+
+	fsn_event_priv->group = group;
+	event_priv->wd = ientry->wd;
+
+	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+
+	/* did the private data get added? */
+	if (list_empty(&fsn_event_priv->event_list))
+		inotify_free_event_priv(fsn_event_priv);
+
+skip_send_ignore:
+
+	/* remove this entry from the idr */
+	spin_lock(&group->inotify_data.idr_lock);
+	idr = &group->inotify_data.idr;
+	idr_remove(idr, ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+
+	/* removed from idr, drop that reference */
+	fsnotify_put_mark(entry);
+}
+
+/* ding dong the mark is dead */
+static void inotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+	struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
+
+	kmem_cache_free(inotify_inode_mark_cachep, ientry);
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	struct fsnotify_mark_entry *entry = NULL;
+	struct inotify_inode_mark_entry *ientry;
+	int ret = 0;
+	int add = (arg & IN_MASK_ADD);
+	__u32 mask;
+	__u32 old_mask, new_mask;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask = inotify_arg_to_mask(arg);
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!ientry))
+		return -ENOMEM;
+	/* we set the mask at the end after attaching it */
+	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
+	ientry->wd = 0;
+
+find_entry:
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+	if (entry) {
+		kmem_cache_free(inotify_inode_mark_cachep, ientry);
+		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	} else {
+		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
+			ret = -ENOSPC;
+			goto out_err;
+		}
+
+		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
+		if (ret == -EEXIST)
+			goto find_entry;
+		else if (ret)
+			goto out_err;
+
+		entry = &ientry->fsn_entry;
+retry:
+		ret = -ENOMEM;
+		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+			goto out_err;
+
+		spin_lock(&group->inotify_data.idr_lock);
+		/* if entry is added to the idr we keep the reference obtained
+		 * through fsnotify_mark_add.  remember to drop this reference
+		 * when entry is removed from idr */
+		ret = idr_get_new_above(&group->inotify_data.idr, entry,
+					++group->inotify_data.last_wd,
+					&ientry->wd);
+		spin_unlock(&group->inotify_data.idr_lock);
+		if (ret) {
+			if (ret == -EAGAIN)
+				goto retry;
+			goto out_err;
+		}
+		atomic_inc(&group->inotify_data.user->inotify_watches);
+	}
+
+	spin_lock(&entry->lock);
+
+	old_mask = entry->mask;
+	if (add) {
+		entry->mask |= mask;
+		new_mask = entry->mask;
+	} else {
+		entry->mask = mask;
+		new_mask = entry->mask;
+	}
+
+	spin_unlock(&entry->lock);
+
+	if (old_mask != new_mask) {
+		/* more bits in old than in new? */
+		int dropped = (old_mask & ~new_mask);
+		/* more bits in this entry than the inode's mask? */
+		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+		/* more bits in this entry than the group? */
+		int do_group = (new_mask & ~group->mask);
+
+		/* update the inode with this new entry */
+		if (dropped || do_inode)
+			fsnotify_recalc_inode_mask(inode);
+
+		/* update the group mask with the new mask */
+		if (dropped || do_group)
+			fsnotify_recalc_group_mask(group);
+	}
+
+	return ientry->wd;
+
+out_err:
+	/* see this isn't supposed to happen, just kill the watch */
+	if (entry) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+	return ret;
+}
+
+static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
+{
+	struct fsnotify_group *group;
+	unsigned int grp_num;
+
+	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+	grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
+	group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return group;
+
+	group->max_events = max_events;
+
+	spin_lock_init(&group->inotify_data.idr_lock);
+	idr_init(&group->inotify_data.idr);
+	group->inotify_data.last_wd = 0;
+	group->inotify_data.user = user;
+	group->inotify_data.fa = NULL;
+
+	return group;
+}
+
+
+/* inotify syscalls */
 SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
-	struct inotify_device *dev;
-	struct inotify_handle *ih;
+	struct fsnotify_group *group;
 	struct user_struct *user;
 	struct file *filp;
 	int fd, ret;
@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 		goto out_free_uid;
 	}
 
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
+	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+	group = inotify_new_group(user, inotify_max_queued_events);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
 		goto out_free_uid;
 	}
 
-	ih = inotify_init(&inotify_user_ops);
-	if (IS_ERR(ih)) {
-		ret = PTR_ERR(ih);
-		goto out_free_dev;
-	}
-	dev->ih = ih;
-	dev->fa = NULL;
-
 	filp->f_op = &inotify_fops;
 	filp->f_path.mnt = mntget(inotify_mnt);
 	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
 	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
 	filp->f_mode = FMODE_READ;
 	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = dev;
-
-	INIT_LIST_HEAD(&dev->events);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->ev_mutex);
-	mutex_init(&dev->up_mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
+	filp->private_data = group;
+
 	atomic_inc(&user->inotify_devs);
+
 	fd_install(fd, filp);
 
 	return fd;
-out_free_dev:
-	kfree(dev);
+
 out_free_uid:
 	free_uid(user);
 	put_filp(filp);
@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init)
 SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 		u32, mask)
 {
+	struct fsnotify_group *group;
 	struct inode *inode;
-	struct inotify_device *dev;
 	struct path path;
 	struct file *filp;
 	int ret, fput_needed;
@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	if (mask & IN_ONLYDIR)
 		flags |= LOOKUP_DIRECTORY;
 
-	ret = find_inode(pathname, &path, flags);
-	if (unlikely(ret))
+	ret = inotify_find_inode(pathname, &path, flags);
+	if (ret)
 		goto fput_and_out;
 
-	/* inode held in place by reference to path; dev by fget on fd */
+	/* inode held in place by reference to path; group by fget on fd */
 	inode = path.dentry->d_inode;
-	dev = filp->private_data;
+	group = filp->private_data;
 
-	mutex_lock(&dev->up_mutex);
-	ret = inotify_find_update_watch(dev->ih, inode, mask);
-	if (ret == -ENOENT)
-		ret = create_watch(dev, inode, mask);
-	mutex_unlock(&dev->up_mutex);
+	/* create/update an inode mark */
+	ret = inotify_update_watch(group, inode, mask);
+	if (unlikely(ret))
+		goto path_put_and_out;
 
+path_put_and_out:
 	path_put(&path);
 fput_and_out:
 	fput_light(filp, fput_needed);
@@ -720,9 +672,10 @@ fput_and_out:
 
 SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
+	struct fsnotify_group *group;
+	struct fsnotify_mark_entry *entry;
 	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
+	int ret = 0, fput_needed;
 
 	filp = fget_light(fd, &fput_needed);
 	if (unlikely(!filp))
@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 		goto out;
 	}
 
-	dev = filp->private_data;
+	group = filp->private_data;
 
-	/* we free our watch data when we get IN_IGNORED */
-	ret = inotify_rm_wd(dev->ih, wd);
+	spin_lock(&group->inotify_data.idr_lock);
+	entry = idr_find(&group->inotify_data.idr, wd);
+	if (unlikely(!entry)) {
+		spin_unlock(&group->inotify_data.idr_lock);
+		ret = -EINVAL;
+		goto out;
+	}
+	fsnotify_get_mark(entry);
+	spin_unlock(&group->inotify_data.idr_lock);
+
+	inotify_destroy_mark_entry(entry, group);
+	fsnotify_put_mark(entry);
 
 out:
 	fput_light(filp, fput_needed);
@@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
 }
 
 static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
+    .name	= "inotifyfs",
+    .get_sb	= inotify_get_sb,
+    .kill_sb	= kill_anon_super,
 };
 
 /*
@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void)
 	if (IS_ERR(inotify_mnt))
 		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
 
+	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
+	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
+	inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+	if (!inotify_ignored_event)
+		panic("unable to allocate the inotify ignored event\n");
+
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
 	inotify_max_user_watches = 8192;
 
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_user_watch),
-					 0, SLAB_PANIC, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL);
-
 	return 0;
 }
-
 module_init(inotify_user_setup);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index d2c0ee3..44848aa 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -9,6 +9,7 @@
 
 #ifdef __KERNEL__
 
+#include <linux/idr.h> /* inotify uses this */
 #include <linux/fs.h> /* struct inode */
 #include <linux/list.h>
 #include <linux/path.h> /* struct path */
@@ -59,6 +60,7 @@
 
 /* listeners that hard code group numbers near the top */
 #define DNOTIFY_GROUP_NUM	UINT_MAX
+#define INOTIFY_GROUP_NUM	(DNOTIFY_GROUP_NUM-1)
 
 struct fsnotify_group;
 struct fsnotify_event;
@@ -141,6 +143,15 @@ struct fsnotify_group {
 	/* groups can define private fields here or use the void *private */
 	union {
 		void *private;
+#ifdef CONFIG_INOTIFY_USER
+		struct inotify_group_private_data {
+			spinlock_t	idr_lock;
+			struct idr      idr;
+			u32             last_wd;
+			struct fasync_struct    *fa;    /* async notification */
+			struct user_struct      *user;
+		} inotify_data;
+#endif
 	};
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index d4e9671..5de1c17c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -302,7 +302,8 @@ config AUDITSYSCALL
 
 config AUDIT_TREE
 	def_bool y
-	depends on AUDITSYSCALL && INOTIFY
+	depends on AUDITSYSCALL
+	select INOTIFY
 
 menu "RCU Subsystem"
 
-- 
cgit v0.10.2


From ff52cc2158b32b3b979ca7802b1fd7c70f36e13c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: fsnotify: move events should indicate the event was on a child

fsnotify tells its listeners explicitly when an event happened on the given
inode verses on the child of the given inode.  (see __fsnotify_parent)
However, the semantics of fsnotify_move() are such that we deliver events
directly to the two parent directories in question (old_dir and new_dir)
directly without using the __fsnotify_parent() call.  fsnotify should be
adding FS_EVENT_ON_CHILD for the notifications to these parents.

Signed-off-by: Eric Paris <eparis@redhat.com>

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index c25b39d..936f9aa 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -71,12 +71,11 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	struct inode *source = moved->d_inode;
 	u32 in_cookie = inotify_get_cookie();
 	u32 fs_cookie = fsnotify_get_cookie();
-	__u32 old_dir_mask = 0;
-	__u32 new_dir_mask = 0;
+	__u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
+	__u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
 
-	if (old_dir == new_dir) {
-		old_dir_mask = FS_DN_RENAME;
-	}
+	if (old_dir == new_dir)
+		old_dir_mask |= FS_DN_RENAME;
 
 	if (isdir) {
 		isdir = IN_ISDIR;
@@ -84,9 +83,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		new_dir_mask |= FS_IN_ISDIR;
 	}
 
-	old_dir_mask |= FS_MOVED_FROM;
-	new_dir_mask |= FS_MOVED_TO;
-
 	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
 				  source);
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
-- 
cgit v0.10.2


From 5ac697b793a3c45005c568df692518da6e690390 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: dnotify: do not use ?true:false when assigning to a bool

dnotify_should send event assigned a bool using ?true:false when computing
a bit operation.  This is poitless and the bool type does this for us.

Signed-off-by: Eric Paris <eparis@redhat.com>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 12f9e6b..5134e89 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,7 +154,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 		return false;
 
 	spin_lock(&entry->lock);
-	send = (mask & entry->mask) ? true : false;
+	send = (mask & entry->mask);
 	spin_unlock(&entry->lock);
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
 
-- 
cgit v0.10.2


From ce61856bd2aadb064f595e5c0444376a2b117c41 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: dnotify: do not bother to lock entry->lock when reading mask

entry->lock is needed to make sure entry->mask does not change while
manipulating it.  In dnotify_should_send_event() we don't care if we get an
old or a new mask value out of this entry so there is no point it taking
the lock.

Signed-off-by: Eric Paris <eparis@redhat.com>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5134e89..ec459b6 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -153,9 +153,8 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	if (!entry)
 		return false;
 
-	spin_lock(&entry->lock);
 	send = (mask & entry->mask);
-	spin_unlock(&entry->lock);
+
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
 
 	return send;
-- 
cgit v0.10.2


From e42e27736de80045f925564ea27a1d32957219e7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: inotify/dnotify: should_send_event shouldn't match on
 FS_EVENT_ON_CHILD

inotify and dnotify will both indicate that they want any event which came
from a child inode.  The fix is to mask off FS_EVENT_ON_CHILD when deciding
if inotify or dnotify is interested in a given event.

Signed-off-by: Eric Paris <eparis@redhat.com>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ec459b6..98a7516 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -153,6 +153,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	if (!entry)
 		return false;
 
+	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (mask & entry->mask);
 
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f11d75f..ec2f7bd 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -137,14 +137,16 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
 	int idx;
+	/* global tests shouldn't care about events on child only the specific event */
+	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
 	if (list_empty(&fsnotify_groups))
 		return;
 
-	if (!(mask & fsnotify_mask))
+	if (!(test_mask & fsnotify_mask))
 		return;
 
-	if (!(mask & to_tell->i_fsnotify_mask))
+	if (!(test_mask & to_tell->i_fsnotify_mask))
 		return;
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
@@ -153,7 +155,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	 */
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
-		if (mask & group->mask) {
+		if (test_mask & group->mask) {
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 160da54..7ef75b8 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -95,6 +95,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	if (!entry)
 		return false;
 
+	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (entry->mask & mask);
 
 	/* find took a reference */
-- 
cgit v0.10.2


From a092ee20fd33d2df0990dcbf2235afc181612818 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:48 -0400
Subject: fsnotify: allow groups to set freeing_mark to null

Most fsnotify listeners (all but inotify) do not care about marks being
freed.  Allow groups to set freeing_mark to null and do not call any
function if it is set that way.

Signed-off-by: Eric Paris <eparis@redhat.com>

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 98a7516..828a889 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -161,12 +161,6 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	return send;
 }
 
-static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry,
-				 struct fsnotify_group *group)
-{
-	/* dnotify doesn't care than an inode is on the way out */
-}
-
 static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
 {
 	struct dnotify_mark_entry *dnentry = container_of(entry,
@@ -182,7 +176,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 	.handle_event = dnotify_handle_event,
 	.should_send_event = dnotify_should_send_event,
 	.free_group_priv = NULL,
-	.freeing_mark = dnotify_freeing_mark,
+	.freeing_mark = NULL,
 	.free_event_priv = NULL,
 };
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0a499d2..c8a07c6 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -190,7 +190,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	 * callback to the group function to let it know that this entry
 	 * is being freed.
 	 */
-	group->ops->freeing_mark(entry, group);
+	if (group->ops->freeing_mark)
+		group->ops->freeing_mark(entry, group);
 
 	/*
 	 * __fsnotify_update_child_dentry_flags(inode);
-- 
cgit v0.10.2


From 73422811d290c628b4ddbf6830e5cd6fa42e84f1 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Sun, 10 May 2009 16:05:39 -0400
Subject: reiserfs: allow exposing privroot w/ xattrs enabled

This patch adds an -oexpose_privroot option to allow access to the privroot.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d3..6d2668f 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
-	int ret = 0;
-#ifdef CONFIG_REISERFS_FS_XATTR
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	ret = (dir == dir->d_parent && privroot->d_inode &&
-	       deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
-#endif
-	return ret;
+	if (reiserfs_expose_privroot(dir->d_sb))
+		return 0;
+	return (dir == dir->d_parent && privroot->d_inode &&
+	        deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
 int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9..9dbdcfb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -898,6 +898,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		{"conv",.setmask = 1 << REISERFS_CONVERT},
 		{"attrs",.setmask = 1 << REISERFS_ATTRS},
 		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
+		{"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
 #ifdef CONFIG_REISERFS_FS_XATTR
 		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
 		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0..f3d47d8 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		s->s_root->d_op = &xattr_lookup_poison_ops;
+		if (!reiserfs_expose_privroot(s))
+			s->s_root->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 6473650..dab68bb 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -453,6 +453,7 @@ enum reiserfs_mount_options {
 	REISERFS_ATTRS,
 	REISERFS_XATTRS_USER,
 	REISERFS_POSIXACL,
+	REISERFS_EXPOSE_PRIVROOT,
 	REISERFS_BARRIER_NONE,
 	REISERFS_BARRIER_FLUSH,
 
@@ -490,6 +491,7 @@ enum reiserfs_mount_options {
 #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
 #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
 #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
 #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
 #define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
 #define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
-- 
cgit v0.10.2


From 4e44b6852e03c915618ca6776b6697b436246b00 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:08:56 -0400
Subject: Get rid of path_lookup in autofs4

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0..f71dac9 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 	return 0;
 }
 
-/*
- * Walk down the mount stack looking for an autofs mount that
- * has the requested device number (aka. new_encode_dev(sb->s_dev).
- */
-static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
+static int find_autofs_mount(const char *pathname,
+			     struct path *res,
+			     int test(struct path *path, void *data),
+			     void *data)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-	struct super_block *sb;
-	dev_t s_dev;
-	unsigned int err;
-
+	struct path path;
+	int err = kern_path(pathname, 0, &path);
+	if (err)
+		return err;
 	err = -ENOENT;
-
-	/* Lookup the dentry name at the base of our mount point */
-	dentry = d_lookup(nd->path.dentry, &nd->last);
-	if (!dentry)
-		goto out;
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dentry;
-
-	/* And follow the mount stack looking for our autofs mount */
-	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
-		inode = nd->path.dentry->d_inode;
-		if (!inode)
-			break;
-
-		sb = inode->i_sb;
-		s_dev = new_encode_dev(sb->s_dev);
-		if (devno == s_dev) {
-			if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
+	while (path.dentry == path.mnt->mnt_root) {
+		if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+			if (test(&path, data)) {
+				path_get(&path);
+				if (!err) /* already found some */
+					path_put(res);
+				*res = path;
 				err = 0;
-				break;
 			}
 		}
+		if (!follow_up(&path.mnt, &path.dentry))
+			break;
 	}
-out:
+	path_put(&path);
 	return err;
 }
 
-/*
- * Walk down the mount stack looking for an autofs mount that
- * has the requested mount type (ie. indirect, direct or offset).
- */
-static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
+static int test_by_dev(struct path *path, void *p)
 {
-	struct dentry *dentry;
-	struct autofs_info *ino;
-	unsigned int err;
-
-	err = -ENOENT;
-
-	/* Lookup the dentry name at the base of our mount point */
-	dentry = d_lookup(nd->path.dentry, &nd->last);
-	if (!dentry)
-		goto out;
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dentry;
+	return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
+}
 
-	/* And follow the mount stack looking for our autofs mount */
-	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
-		ino = autofs4_dentry_ino(nd->path.dentry);
-		if (ino && ino->sbi->type & type) {
-			err = 0;
-			break;
-		}
-	}
-out:
-	return err;
+static int test_by_type(struct path *path, void *p)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
 static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
  * Open a file descriptor on the autofs mount point corresponding
  * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
  */
-static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
+static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 {
-	struct file *filp;
-	struct nameidata nd;
 	int err, fd;
 
 	fd = get_unused_fd();
 	if (likely(fd >= 0)) {
-		/* Get nameidata of the parent directory */
-		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		struct file *filp;
+		struct path path;
+
+		err = find_autofs_mount(name, &path, test_by_dev, &devid);
 		if (err)
 			goto out;
 
 		/*
-		 * Search down, within the parent, looking for an
-		 * autofs super block that has the device number
+		 * Find autofs super block that has the device number
 		 * corresponding to the autofs fs we want to open.
 		 */
-		err = autofs_dev_ioctl_find_super(&nd, devid);
-		if (err) {
-			path_put(&nd.path);
-			goto out;
-		}
 
-		filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
+		filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
 				   current_cred());
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
@@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->openmount.devid;
+	devid = new_decode_dev(param->openmount.devid);
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 				      struct autofs_dev_ioctl *param)
 {
 	struct autofs_info *ino;
-	struct nameidata nd;
-	const char *path;
+	struct path path;
 	dev_t devid;
 	int err = -ENOENT;
 
@@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		goto out;
 	}
 
-	path = param->path;
-	devid = new_encode_dev(sbi->sb->s_dev);
+	devid = sbi->sb->s_dev;
 
 	param->requester.uid = param->requester.gid = -1;
 
-	/* Get nameidata of the parent directory */
-	err = path_lookup(path, LOOKUP_PARENT, &nd);
+	err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
 	if (err)
 		goto out;
 
-	err = autofs_dev_ioctl_find_super(&nd, devid);
-	if (err)
-		goto out_release;
-
-	ino = autofs4_dentry_ino(nd.path.dentry);
+	ino = autofs4_dentry_ino(path.dentry);
 	if (ino) {
 		err = 0;
-		autofs4_expire_wait(nd.path.dentry);
+		autofs4_expire_wait(path.dentry);
 		spin_lock(&sbi->fs_lock);
 		param->requester.uid = ino->uid;
 		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
-
-out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return err;
 }
@@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 					 struct autofs_sb_info *sbi,
 					 struct autofs_dev_ioctl *param)
 {
-	struct nameidata nd;
-	const char *path;
+	struct path path;
+	const char *name;
 	unsigned int type;
 	unsigned int devid, magic;
 	int err = -ENOENT;
@@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		goto out;
 	}
 
-	path = param->path;
+	name = param->path;
 	type = param->ismountpoint.in.type;
 
 	param->ismountpoint.out.devid = devid = 0;
 	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (autofs_type_any(type)) {
-			struct super_block *sb;
-
-			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
-			if (err)
-				goto out;
-
-			sb = nd.path.dentry->d_sb;
-			devid = new_encode_dev(sb->s_dev);
-		} else {
-			struct autofs_info *ino;
-
-			err = path_lookup(path, LOOKUP_PARENT, &nd);
-			if (err)
-				goto out;
-
-			err = autofs_dev_ioctl_find_sbi_type(&nd, type);
-			if (err)
-				goto out_release;
-
-			ino = autofs4_dentry_ino(nd.path.dentry);
-			devid = autofs4_get_dev(ino->sbi);
-		}
-
+		if (autofs_type_any(type))
+			err = kern_path(name, LOOKUP_FOLLOW, &path);
+		else
+			err = find_autofs_mount(name, &path, test_by_type, &type);
+		if (err)
+			goto out;
+		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
 		err = 0;
-		if (nd.path.dentry->d_inode &&
-		    nd.path.mnt->mnt_root == nd.path.dentry) {
+		if (path.dentry->d_inode &&
+		    path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t dev = autofs4_get_dev(sbi);
+		dev_t dev = sbi->sb->s_dev;
 
-		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		err = find_autofs_mount(name, &path, test_by_dev, &dev);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, dev);
-		if (err)
-			goto out_release;
-
-		devid = dev;
+		devid = new_encode_dev(dev);
 
-		err = have_submounts(nd.path.dentry);
+		err = have_submounts(path.dentry);
 
-		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
-			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
-				struct inode *inode = nd.path.dentry->d_inode;
-				magic = inode->i_sb->s_magic;
-			}
+		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
+			if (follow_down(&path.mnt, &path.dentry))
+				magic = path.mnt->mnt_sb->s_magic;
 		}
 	}
 
 	param->ismountpoint.out.devid = devid;
 	param->ismountpoint.out.magic = magic;
-
-out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return err;
 }
-- 
cgit v0.10.2


From 9b4a9b14a793bc69b505ed916051f6f32db13bb8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:44:16 -0400
Subject: Preparations to caching root in path_walk()

Split do_path_lookup(), opencode the call from do_filp_open()
do_filp_open() is the only caller of do_path_lookup() that
cares about root afterwards (it keeps resolving symlinks on
O_CREAT path after it'd done LOOKUP_PARENT walk).  So when
we start caching fs->root in path_walk(), it'll need a different
treatment.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index c82805d..895733e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1017,9 +1017,7 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return link_path_walk(name, nd);
 }
 
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
-				unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
@@ -1063,17 +1061,25 @@ static int do_path_lookup(int dfd, const char *name,
 
 		fput_light(file, fput_needed);
 	}
+	return 0;
 
-	retval = path_walk(name, nd);
+fput_fail:
+	fput_light(file, fput_needed);
+out_fail:
+	return retval;
+}
+
+/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_init(dfd, name, flags, nd);
+	if (!retval)
+		retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
-out_fail:
 	return retval;
-
-fput_fail:
-	fput_light(file, fput_needed);
-	goto out_fail;
 }
 
 int path_lookup(const char *name, unsigned int flags,
@@ -1676,9 +1682,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
+	error = path_walk(pathname, &nd);
+	if (error)
+		return ERR_PTR(error);
+	if (unlikely(!audit_dummy_context()))
+		audit_inode(pathname, nd.path.dentry);
 
 	/*
 	 * We have the parent and last component. First of all, check
-- 
cgit v0.10.2


From 2a737871108de9ba8930f7650d549f1383767f8b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:49:53 -0400
Subject: Cache root in nameidata

New field: nd->root.  When pathname resolution wants to know the root,
check if nd->root.mnt is non-NULL; use nd->root if it is, otherwise
copy current->fs->root there.  After path_walk() is finished, we check
if we'd got a cached value in nd->root and drop it.  Before calling
path_walk() we should either set nd->root.mnt to NULL *or* copy (and
pin down) some path to nd->root.  In the latter case we won't be
looking at current->fs->root at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 895733e..88baaf2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
 	return result;
 }
 
+static __always_inline void set_root(struct nameidata *nd)
+{
+	if (!nd->root.mnt) {
+		struct fs_struct *fs = current->fs;
+		read_lock(&fs->lock);
+		nd->root = fs->root;
+		path_get(&nd->root);
+		read_unlock(&fs->lock);
+	}
+}
+
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
 	int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
-		struct fs_struct *fs = current->fs;
-
+		set_root(nd);
 		path_put(&nd->path);
-
-		read_lock(&fs->lock);
-		nd->path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
+		nd->path = nd->root;
+		path_get(&nd->root);
 	}
 
 	res = link_path_walk(link, nd);
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
-	struct fs_struct *fs = current->fs;
+	set_root(nd);
 
 	while(1) {
 		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
-                read_lock(&fs->lock);
-		if (nd->path.dentry == fs->root.dentry &&
-		    nd->path.mnt == fs->root.mnt) {
-                        read_unlock(&fs->lock);
+		if (nd->path.dentry == nd->root.dentry &&
+		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
-                read_unlock(&fs->lock);
 		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -1022,18 +1026,18 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
-	struct fs_struct *fs = current->fs;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 	nd->depth = 0;
+	nd->root.mnt = NULL;
 
 	if (*name=='/') {
-		read_lock(&fs->lock);
-		nd->path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
+		set_root(nd);
+		nd->path = nd->root;
+		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
 		read_lock(&fs->lock);
 		nd->path = fs->pwd;
 		path_get(&fs->pwd);
@@ -1079,6 +1083,10 @@ static int do_path_lookup(int dfd, const char *name,
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
+	if (nd->root.mnt) {
+		path_put(&nd->root);
+		nd->root.mnt = NULL;
+	}
 	return retval;
 }
 
@@ -1115,6 +1123,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->last_type = LAST_ROOT;
 	nd->flags = flags;
 	nd->depth = 0;
+	nd->root.mnt = NULL;
 
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
@@ -1125,8 +1134,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	return retval;
+	if (nd->root.mnt) {
+		path_put(&nd->root);
+		nd->root.mnt = NULL;
+	}
 
+	return retval;
 }
 
 /**
@@ -1817,6 +1830,8 @@ exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
 exit_parent:
+	if (nd.root.mnt)
+		path_put(&nd.root);
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 518098f..325dd3a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -18,6 +18,7 @@ enum { MAX_NESTED_LINKS = 8 };
 struct nameidata {
 	struct path	path;
 	struct qstr	last;
+	struct path	root;
 	unsigned int	flags;
 	int		last_type;
 	unsigned	depth;
-- 
cgit v0.10.2


From 5b857119538daac7118c1364d7ff3613f12b84d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:53:49 -0400
Subject: Make vfs_path_lookup() use starting point as root

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 88baaf2..4379ef9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1123,21 +1123,20 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->last_type = LAST_ROOT;
 	nd->flags = flags;
 	nd->depth = 0;
-	nd->root.mnt = NULL;
 
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
 	path_get(&nd->path);
+	nd->root = nd->path;
+	path_get(&nd->root);
 
 	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	if (nd->root.mnt) {
-		path_put(&nd->root);
-		nd->root.mnt = NULL;
-	}
+	path_put(&nd->root);
+	nd->root.mnt = NULL;
 
 	return retval;
 }
-- 
cgit v0.10.2


From dd5cae6e9772ecc62fd374f7a8ec10eb51c96c4d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 12:21:18 -0400
Subject: Don't bother with check_mnt() in do_add_mount() on shrinkable ones

These guys are what we add as submounts; checks for "is that attached in
our namespace" are simply irrelevant for those and counterproductive for
use of private vfsmount trees a-la what NFS folks want.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494..88a904d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1698,7 +1698,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!check_mnt(path->mnt))
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
-- 
cgit v0.10.2


From 55430e2ecee574e729c12d4063b3ecabfa98fa82 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:04:46 -0400
Subject: nfsd struct path use: exp_get_by_name()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b22..3f6d51b 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
 	return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
 
-static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
-				   struct dentry *dentry,
-				   struct cache_req *reqp)
+static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
+				     struct cache_req *reqp)
 {
 	struct svc_export *exp, key;
 	int err;
@@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
 		return ERR_PTR(-ENOENT);
 
 	key.ex_client = clp;
-	key.ex_path.mnt = mnt;
-	key.ex_path.dentry = dentry;
+	key.ex_path = *path;
 
 	exp = svc_export_lookup(&key);
 	if (exp == NULL)
@@ -877,20 +875,19 @@ static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt,
 				     struct dentry *dentry,
 				     struct cache_req *reqp)
 {
+	struct path path = {.mnt = mnt, .dentry = dentry};
 	svc_export *exp;
 
-	dget(dentry);
-	exp = exp_get_by_name(clp, mnt, dentry, reqp);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
-		struct dentry *parent;
+	dget(path.dentry);
+	exp = exp_get_by_name(clp, &path, reqp);
 
-		parent = dget_parent(dentry);
-		dput(dentry);
-		dentry = parent;
-		exp = exp_get_by_name(clp, mnt, dentry, reqp);
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) {
+		struct dentry *parent = dget_parent(path.dentry);
+		dput(path.dentry);
+		path.dentry = parent;
+		exp = exp_get_by_name(clp, &path, reqp);
 	}
-	dput(dentry);
+	dput(path.dentry);
 	return exp;
 }
 
@@ -1018,7 +1015,7 @@ exp_export(struct nfsctl_export *nxp)
 		goto out_put_clp;
 	err = -EINVAL;
 
-	exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL);
+	exp = exp_get_by_name(clp, &path, NULL);
 
 	memset(&new, 0, sizeof(new));
 
@@ -1135,7 +1132,7 @@ exp_unexport(struct nfsctl_export *nxp)
 		goto out_domain;
 
 	err = -EINVAL;
-	exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL);
+	exp = exp_get_by_name(dom, &path, NULL);
 	path_put(&path);
 	if (IS_ERR(exp))
 		goto out_domain;
@@ -1207,7 +1204,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
 	if (IS_ERR(ek))
 		return ERR_CAST(ek);
 
-	exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp);
+	exp = exp_get_by_name(clp, &ek->ek_path, reqp);
 	cache_put(&ek->h, &svc_expkey_cache);
 
 	if (IS_ERR(exp))
@@ -1251,12 +1248,13 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
 	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	if (rqstp->rq_client == NULL)
 		goto gss;
 
 	/* First try the auth_unix client: */
-	exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
+	exp = exp_get_by_name(rqstp->rq_client, &path,
 						&rqstp->rq_chandle);
 	if (PTR_ERR(exp) == -ENOENT)
 		goto gss;
@@ -1269,7 +1267,7 @@ gss:
 	/* Otherwise, try falling back on gss client */
 	if (rqstp->rq_gssclient == NULL)
 		return exp;
-	gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, &path,
 						&rqstp->rq_chandle);
 	if (PTR_ERR(gssexp) == -ENOENT)
 		return exp;
-- 
cgit v0.10.2


From 5bf3bd2b5cb68ba43c91f5bd0ac043543fba2558 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:14:32 -0400
Subject: switch exp_parent() to struct path

... and lose the always-NULL last argument (non-NULL case had been
split off a while ago).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 3f6d51b..5149dab 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -871,23 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
 /*
  * Find the export entry for a given dentry.
  */
-static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt,
-				     struct dentry *dentry,
-				     struct cache_req *reqp)
+static struct svc_export *exp_parent(svc_client *clp, struct path *path)
 {
-	struct path path = {.mnt = mnt, .dentry = dentry};
-	svc_export *exp;
-
-	dget(path.dentry);
-	exp = exp_get_by_name(clp, &path, reqp);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) {
-		struct dentry *parent = dget_parent(path.dentry);
-		dput(path.dentry);
-		path.dentry = parent;
-		exp = exp_get_by_name(clp, &path, reqp);
+	struct dentry *saved = dget(path->dentry);
+	svc_export *exp = exp_get_by_name(clp, path, NULL);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = exp_get_by_name(clp, path, NULL);
 	}
-	dput(path.dentry);
+	dput(path->dentry);
+	path->dentry = saved;
 	return exp;
 }
 
@@ -1174,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
 	dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
 		 name, path.dentry, clp->name,
 		 inode->i_sb->s_id, inode->i_ino);
-	exp = exp_parent(clp, path.mnt, path.dentry, NULL);
+	exp = exp_parent(clp, &path);
 	if (IS_ERR(exp)) {
 		err = PTR_ERR(exp);
 		goto out;
-- 
cgit v0.10.2


From 91c9fa8f75877c0c1e455c23e8f8206c91c8f77f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:42:05 -0400
Subject: switch rqst_exp_get_by_name()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5149dab..84f5e5c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1240,18 +1240,15 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
  * use exp_get_by_name() or exp_find().
  */
 struct svc_export *
-rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
-		struct dentry *dentry)
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
 {
 	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
-	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	if (rqstp->rq_client == NULL)
 		goto gss;
 
 	/* First try the auth_unix client: */
-	exp = exp_get_by_name(rqstp->rq_client, &path,
-						&rqstp->rq_chandle);
+	exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
 	if (PTR_ERR(exp) == -ENOENT)
 		goto gss;
 	if (IS_ERR(exp))
@@ -1263,8 +1260,7 @@ gss:
 	/* Otherwise, try falling back on gss client */
 	if (rqstp->rq_gssclient == NULL)
 		return exp;
-	gssexp = exp_get_by_name(rqstp->rq_gssclient, &path,
-						&rqstp->rq_chandle);
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
 	if (PTR_ERR(gssexp) == -ENOENT)
 		return exp;
 	if (!IS_ERR(exp))
@@ -1307,9 +1303,10 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
 	struct svc_export *exp;
+	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	dget(dentry);
-	exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+	exp = rqst_exp_get_by_name(rqstp, &path);
 
 	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
 		struct dentry *parent;
@@ -1317,7 +1314,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		parent = dget_parent(dentry);
 		dput(dentry);
 		dentry = parent;
-		exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+		exp = rqst_exp_get_by_name(rqstp, &path);
 	}
 	dput(dentry);
 	return exp;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index bd584bc..d84c4ea 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -101,36 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 {
 	struct svc_export *exp = *expp, *exp2 = NULL;
 	struct dentry *dentry = *dpp;
-	struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-	struct dentry *mounts = dget(dentry);
+	struct path path = {.mnt = mntget(exp->ex_path.mnt),
+			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
+	while (follow_down(&path.mnt, &path.dentry) &&
+	       d_mountpoint(path.dentry))
+		;
 
-	exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
+	exp2 = rqst_exp_get_by_name(rqstp, &path);
 	if (IS_ERR(exp2)) {
 		if (PTR_ERR(exp2) != -ENOENT)
 			err = PTR_ERR(exp2);
-		dput(mounts);
-		mntput(mnt);
+		path_put(&path);
 		goto out;
 	}
 	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
 		/*
-		 * This is subtle: dentry is *not* under mnt at this point.
-		 * The only reason we are safe is that original mnt is pinned
-		 * down by exp, so we should dput before putting exp.
+		 * This is subtle: path.dentry is *not* on path.mnt
+		 * at this point.  The only reason we are safe is that
+		 * original mnt is pinned down by exp, so we should
+		 * put path *before* putting exp
 		 */
-		dput(dentry);
-		*dpp = mounts;
-		exp_put(exp);
+		*dpp = path.dentry;
+		path.dentry = dentry;
 		*expp = exp2;
-	} else {
-		exp_put(exp2);
-		dput(mounts);
+		exp2 = exp;
 	}
-	mntput(mnt);
+	path_put(&path);
+	exp_put(exp2);
 out:
 	return err;
 }
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index bcd0201..98f6fd5 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -125,8 +125,7 @@ void			nfsd_export_flush(void);
 void			exp_readlock(void);
 void			exp_readunlock(void);
 struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
-					     struct vfsmount *,
-					     struct dentry *);
+					     struct path *);
 struct svc_export *	rqst_exp_parent(struct svc_rqst *,
 					struct vfsmount *mnt,
 					struct dentry *dentry);
-- 
cgit v0.10.2


From e64c390ca0b60fd2119331ef1fa888d7ea27e424 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:00:46 -0400
Subject: switch rqst_exp_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 84f5e5c..8b1f8ef 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1299,24 +1299,19 @@ gss:
 }
 
 struct svc_export *
-rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
-		struct dentry *dentry)
+rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
 {
-	struct svc_export *exp;
-	struct path path = {.mnt = mnt, .dentry = dentry};
-
-	dget(dentry);
-	exp = rqst_exp_get_by_name(rqstp, &path);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
-		struct dentry *parent;
+	struct dentry *saved = dget(path->dentry);
+	struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
 
-		parent = dget_parent(dentry);
-		dput(dentry);
-		dentry = parent;
-		exp = rqst_exp_get_by_name(rqstp, &path);
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = rqst_exp_get_by_name(rqstp, path);
 	}
-	dput(dentry);
+	dput(path->dentry);
+	path->dentry = saved;
 	return exp;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d84c4ea..9f1ea31 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -169,28 +169,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			/* checking mountpoint crossing is very different when stepping up */
 			struct svc_export *exp2 = NULL;
 			struct dentry *dp;
-			struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-			dentry = dget(dparent);
-			while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
+			struct path path = {.mnt = mntget(exp->ex_path.mnt),
+					    .dentry = dget(dparent)};
+
+			while (path.dentry == path.mnt->mnt_root &&
+			       follow_up(&path.mnt, &path.dentry))
 				;
-			dp = dget_parent(dentry);
-			dput(dentry);
-			dentry = dp;
+			dp = dget_parent(path.dentry);
+			dput(path.dentry);
+			path.dentry = dp;
 
-			exp2 = rqst_exp_parent(rqstp, mnt, dentry);
+			exp2 = rqst_exp_parent(rqstp, &path);
 			if (PTR_ERR(exp2) == -ENOENT) {
-				dput(dentry);
 				dentry = dget(dparent);
 			} else if (IS_ERR(exp2)) {
 				host_err = PTR_ERR(exp2);
-				dput(dentry);
-				mntput(mnt);
+				path_put(&path);
 				goto out_nfserr;
 			} else {
+				dentry = dget(path.dentry);
 				exp_put(exp);
 				exp = exp2;
 			}
-			mntput(mnt);
+			path_put(&path);
 		}
 	} else {
 		fh_lock(fhp);
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 98f6fd5..a6d9ef2 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -127,8 +127,7 @@ void			exp_readunlock(void);
 struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
 					     struct path *);
 struct svc_export *	rqst_exp_parent(struct svc_rqst *,
-					struct vfsmount *mnt,
-					struct dentry *dentry);
+					struct path *);
 int			exp_rootfh(struct auth_domain *, 
 					char *path, struct knfsd_fh *, int maxsize);
 __be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
-- 
cgit v0.10.2


From bab77ebf51e3902f608ecf08c9d34a0a52ac35a9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:26:48 -0400
Subject: switch follow_up() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index f71dac9..6704075 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname,
 				err = 0;
 			}
 		}
-		if (!follow_up(&path.mnt, &path.dentry))
+		if (!follow_up(&path))
 			break;
 	}
 	path_put(&path);
diff --git a/fs/namei.c b/fs/namei.c
index 4379ef9..8c1f48a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -675,23 +675,23 @@ loop:
 	return err;
 }
 
-int follow_up(struct vfsmount **mnt, struct dentry **dentry)
+int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
 	spin_lock(&vfsmount_lock);
-	parent=(*mnt)->mnt_parent;
-	if (parent == *mnt) {
+	parent = path->mnt->mnt_parent;
+	if (parent == path->mnt) {
 		spin_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
-	mountpoint=dget((*mnt)->mnt_mountpoint);
+	mountpoint = dget(path->mnt->mnt_mountpoint);
 	spin_unlock(&vfsmount_lock);
-	dput(*dentry);
-	*dentry = mountpoint;
-	mntput(*mnt);
-	*mnt = parent;
+	dput(path->dentry);
+	path->dentry = mountpoint;
+	mntput(path->mnt);
+	path->mnt = parent;
 	return 1;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9f1ea31..7b2b3f7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -173,7 +173,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 					    .dentry = dget(dparent)};
 
 			while (path.dentry == path.mnt->mnt_root &&
-			       follow_up(&path.mnt, &path.dentry))
+			       follow_up(&path))
 				;
 			dp = dget_parent(path.dentry);
 			dput(path.dentry);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 325dd3a..9cd5a717b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -79,7 +79,7 @@ extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_noperm(const char *, struct dentry *);
 
 extern int follow_down(struct vfsmount **, struct dentry **);
-extern int follow_up(struct vfsmount **, struct dentry **);
+extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
-- 
cgit v0.10.2


From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:28:19 -0400
Subject: Switch collect_mounts() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 88a904d..c859622 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1253,11 +1253,11 @@ Enomem:
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 323b5ce..03fb210 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1800,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
-extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
+extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
 
 extern int vfs_statfs(struct dentry *, struct kstatfs *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e73517..1f6396d7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(path.mnt, path.dentry);
+		root_mnt = collect_mounts(&path);
 		path_put(&path);
 		if (!root_mnt)
 			goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = kern_path(tree->pathname, 0, &path);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(path.mnt, path.dentry);
+	mnt = collect_mounts(&path);
 	path_put(&path);
 	if (!mnt) {
 		err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
 	err = kern_path(new, 0, &path);
 	if (err)
 		return err;
-	tagged = collect_mounts(path.mnt, path.dentry);
+	tagged = collect_mounts(&path);
 	path_put(&path);
 	if (!tagged)
 		return -ENOMEM;
-- 
cgit v0.10.2


From 9393bd07cf218ca51d0e627653f906a9d76a9131 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 13:58:15 -0400
Subject: switch follow_down()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d0..c52be53 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
 		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path.mnt, &nd->path.dentry))
+		       follow_down(&nd->path))
 			;
 		err = 0;
 	default:
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8d..2316e94 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 		}
 		path.mnt = mnt;
 		path_get(&path);
-		if (!follow_down(&path.mnt, &path.dentry)) {
+		if (!follow_down(&path)) {
 			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(path.dentry) &&
-		       follow_down(&path.mnt, &path.dentry))
+		while (d_mountpoint(path.dentry) && follow_down(&path));
 			;
 		umount_ok = may_umount(path.mnt);
 		path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c..8f7cdde 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static inline int autofs4_follow_mount(struct path *path)
 {
 	int res = 0;
 
-	while (d_mountpoint(*dentry)) {
-		int followed = follow_down(mnt, dentry);
+	while (d_mountpoint(path->dentry)) {
+		int followed = follow_down(path);
 		if (!followed)
 			break;
 		res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 6704075..f3da2eb 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -562,7 +562,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		err = have_submounts(path.dentry);
 
 		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path.mnt, &path.dentry))
+			if (follow_down(&path))
 				magic = path.mnt->mnt_sb->s_magic;
 		}
 	}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f..aa39ae8 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct dentry *top = dentry;
+	struct path path = {.mnt = mnt, .dentry = dentry};
 	int status = 1;
 
 	DPRINTK("dentry %p %.*s",
 		dentry, (int)dentry->d_name.len, dentry->d_name.name);
 
-	mntget(mnt);
-	dget(dentry);
+	path_get(&path);
 
-	if (!follow_down(&mnt, &dentry))
+	if (!follow_down(&path))
 		goto done;
 
-	if (is_autofs4_dentry(dentry)) {
-		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	if (is_autofs4_dentry(path.dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
 		if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		 * Otherwise it's an offset mount and we need to check
 		 * if we can umount its mount, if there is one.
 		 */
-		if (!d_mountpoint(dentry)) {
+		if (!d_mountpoint(path.dentry)) {
 			status = 0;
 			goto done;
 		}
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	status = 0;
 done:
 	DPRINTK("returning = %d", status);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path);
 	return status;
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0..b96a3c5 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		nd->flags);
 	/*
 	 * For an expire of a covered direct or offset mount we need
-	 * to beeak out of follow_down() at the autofs mount trigger
+	 * to break out of follow_down() at the autofs mount trigger
 	 * (d_mounted--), so we can see the expiring flag, and manage
 	 * the blocking and following here until the expire is completed.
 	 */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		if (ino->flags & AUTOFS_INF_EXPIRING) {
 			spin_unlock(&sbi->fs_lock);
 			/* Follow down to our covering mount. */
-			if (!follow_down(&nd->path.mnt, &nd->path.dentry))
+			if (!follow_down(&nd->path))
 				goto done;
 			goto follow;
 		}
@@ -230,8 +230,7 @@ follow:
 	 * to follow it.
 	 */
 	if (d_mountpoint(dentry)) {
-		if (!autofs4_follow_mount(&nd->path.mnt,
-					  &nd->path.dentry)) {
+		if (!autofs4_follow_mount(&nd->path)) {
 			status = -ENOENT;
 			goto out_error;
 		}
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d6275..3bb11be 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
 		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path.mnt, &nd->path.dentry))
+		       follow_down(&nd->path))
 			;
 		err = 0;
 	default:
diff --git a/fs/namei.c b/fs/namei.c
index 8c1f48a..4d49a3e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -731,16 +731,16 @@ static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 /* no need for dcache_lock, as serialization is taken care in
  * namespace.c
  */
-int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+int follow_down(struct path *path)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(*mnt, *dentry);
+	mounted = lookup_mnt(path->mnt, path->dentry);
 	if (mounted) {
-		dput(*dentry);
-		mntput(*mnt);
-		*mnt = mounted;
-		*dentry = dget(mounted->mnt_root);
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
 		return 1;
 	}
 	return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index c859622..ba5237b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1601,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1695,7 +1695,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288e..f01caec 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -154,7 +154,7 @@ out_err:
 	goto out;
 out_follow:
 	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path.mnt, &nd->path.dentry))
+	       follow_down(&nd->path))
 		;
 	err = 0;
 	goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7b2b3f7..99f8357 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -105,8 +105,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (follow_down(&path.mnt, &path.dentry) &&
-	       d_mountpoint(path.dentry))
+	while (d_mountpoint(path.dentry) && follow_down(&path))
 		;
 
 	exp2 = rqst_exp_get_by_name(rqstp, &path);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9cd5a717b..d870ae2 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -78,7 +78,7 @@ extern void release_open_intent(struct nameidata *);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_noperm(const char *, struct dentry *);
 
-extern int follow_down(struct vfsmount **, struct dentry **);
+extern int follow_down(struct path *);
 extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
-- 
cgit v0.10.2


From 79ed0226198c628133530b179a90dbf42b1c2eba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 13:59:41 -0400
Subject: switch follow_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 4d49a3e..c006bc6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -715,16 +715,16 @@ static int __follow_mount(struct path *path)
 	return res;
 }
 
-static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static void follow_mount(struct path *path)
 {
-	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
-		dput(*dentry);
-		mntput(*mnt);
-		*mnt = mounted;
-		*dentry = dget(mounted->mnt_root);
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
 	}
 }
 
@@ -779,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 		mntput(nd->path.mnt);
 		nd->path.mnt = parent;
 	}
-	follow_mount(&nd->path.mnt, &nd->path.dentry);
+	follow_mount(&nd->path);
 }
 
 /*
-- 
cgit v0.10.2


From 1c755af4df75996b0dd4b7e6cacaf9d57a6ef2ef Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 14:06:57 -0400
Subject: switch lookup_mnt()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index c006bc6..527119a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -702,7 +702,7 @@ static int __follow_mount(struct path *path)
 {
 	int res = 0;
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
+		struct vfsmount *mounted = lookup_mnt(path);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -718,7 +718,7 @@ static int __follow_mount(struct path *path)
 static void follow_mount(struct path *path)
 {
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
+		struct vfsmount *mounted = lookup_mnt(path);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -735,7 +735,7 @@ int follow_down(struct path *path)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(path->mnt, path->dentry);
+	mounted = lookup_mnt(path);
 	if (mounted) {
 		dput(path->dentry);
 		mntput(path->mnt);
diff --git a/fs/namespace.c b/fs/namespace.c
index ba5237b..b94ad3d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -442,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
  * lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9797800..72ce2ae 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -370,7 +370,7 @@ static inline int d_mountpoint(struct dentry *dentry)
 	return dentry->d_mounted;
 }
 
-extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct vfsmount *lookup_mnt(struct path *);
 extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
-- 
cgit v0.10.2


From 3174c21b74b56c6a53fddd41a30fd6f757a32bd0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 13:19:18 -0400
Subject: Move junk from proc_fs.h to fs/proc/internal.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db961..753ca37 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
 	struct list_head lh;
 };
 void pde_users_dec(struct proc_dir_entry *pde);
+
+extern spinlock_t proc_subdir_lock;
+
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
+int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
+unsigned long task_vsize(struct mm_struct *);
+int task_statm(struct mm_struct *, int *, int *, int *, int *);
+void task_mem(struct seq_file *, struct mm_struct *);
+
+struct proc_dir_entry *de_get(struct proc_dir_entry *de);
+void de_put(struct proc_dir_entry *de);
+
+extern struct vfsmount *proc_mnt;
+int proc_fill_super(struct super_block *);
+struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+
+/*
+ * These are generic /proc routines that use the internal
+ * "struct proc_dir_entry" tree to traverse the filesystem.
+ *
+ * The /proc root directory has extended versions to take care
+ * of the /proc/<pid> subdirectories.
+ */
+int proc_readdir(struct file *, void *, filldir_t);
+struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5..fc6c302 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
 #include <linux/string.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 
 #ifndef HAVE_ARCH_DEVTREE_FIXUPS
 static inline void set_node_proc_entry(struct device_node *np,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index fbfa3d4..e6e77d3 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -93,20 +93,9 @@ struct vmcore {
 
 #ifdef CONFIG_PROC_FS
 
-extern spinlock_t proc_subdir_lock;
-
 extern void proc_root_init(void);
 
 void proc_flush_task(struct task_struct *task);
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
-unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
-void task_mem(struct seq_file *, struct mm_struct *);
-void clear_refs_smap(struct mm_struct *mm);
-
-struct proc_dir_entry *de_get(struct proc_dir_entry *de);
-void de_put(struct proc_dir_entry *de);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 						struct proc_dir_entry *parent);
@@ -116,20 +105,7 @@ struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
 				void *data);
 extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
 
-extern struct vfsmount *proc_mnt;
 struct pid_namespace;
-extern int proc_fill_super(struct super_block *);
-extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
-
-/*
- * These are generic /proc routines that use the internal
- * "struct proc_dir_entry" tree to traverse the filesystem.
- *
- * The /proc root directory has extended versions to take care
- * of the /proc/<pid> subdirectories.
- */
-extern int proc_readdir(struct file *, void *, filldir_t);
-extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
 extern int pid_ns_prepare_proc(struct pid_namespace *ns);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
-- 
cgit v0.10.2


From d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:54 +1000
Subject: fs: mnt_want_write speedup

This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark yes, but it exercises some important paths in the mm.

Before:
 avg = 501.9
 std = 14.7773

After:
 avg = 462.286
 std = 5.46106

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replaces it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It results in about 900 bytes smaller code too. It
does increase the size of a vfsmount, however.

It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.

[AV: minor cleanup]

Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d..22ae06a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count.  Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption.  However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow.  Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe.  AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f9..ac49c1f 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
+#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
-- 
cgit v0.10.2


From 96029c4e09ccbd73a6d0ed2b29e80bf2586ad7ef Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:55 +1000
Subject: fs: introduce mnt_clone_write

This patch speeds up lmbench lat_mmap test by about another 2% after the
first patch.

Before:
 avg = 462.286
 std = 5.46106

After:
 avg = 453.12
 std = 9.58257

(50 runs of each, stddev gives a reasonable confidence)

It does this by introducing mnt_clone_write, which avoids some heavyweight
operations of mnt_want_write if called on a vfsmount which we know already
has a write count; and mnt_want_write_file, which can call mnt_clone_write
if the file is open for write.

After these two patches, mnt_want_write and mnt_drop_write go from 7% on
the profile down to 1.3% (including mnt_clone_write).

[AV: mnt_want_write_file() should take file alone and derive mnt from it;
not only all callers have that form, but that's the only mnt about which
we know that it's already held for write if file is opened for write]

Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe..3d66dbc 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
 		file_take_write(file);
-		error = mnt_want_write(mnt);
+		error = mnt_clone_write(mnt);
 		WARN_ON(error);
 	}
 	return error;
diff --git a/fs/inode.c b/fs/inode.c
index ca33701..a88baeb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1422,7 +1422,7 @@ void file_update_time(struct file *file)
 	if (IS_NOCMTIME(inode))
 		return;
 
-	err = mnt_want_write(file->f_path.mnt);
+	err = mnt_want_write_file(file);
 	if (err)
 		return;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 22ae06a..120b8a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -265,6 +265,46 @@ out:
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+	/* superblock may be r/o */
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+	if (!(file->f_mode & FMODE_WRITE))
+		return mnt_want_write(file->f_path.mnt);
+	else
+		return mnt_clone_write(file->f_path.mnt);
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
+/**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
diff --git a/fs/open.c b/fs/open.c
index bdfbf03..7200e23 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 
 	audit_inode(NULL, dentry);
 
-	err = mnt_want_write(file->f_path.mnt);
+	err = mnt_want_write_file(file);
 	if (err)
 		goto out_putf;
 	mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 	if (!file)
 		goto out;
 
-	error = mnt_want_write(file->f_path.mnt);
+	error = mnt_want_write_file(file);
 	if (error)
 		goto out_fput;
 	dentry = file->f_path.dentry;
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9..1c3d0af 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write(f->f_path.mnt);
+	error = mnt_want_write_file(f);
 	if (!error) {
 		error = setxattr(dentry, name, value, size, flags);
 		mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write(f->f_path.mnt);
+	error = mnt_want_write_file(f);
 	if (!error) {
 		error = removexattr(dentry, name);
 		mnt_drop_write(f->f_path.mnt);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index ac49c1f..5d52753 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -88,7 +88,11 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt)
 	return mnt;
 }
 
+struct file; /* forward dec */
+
 extern int mnt_want_write(struct vfsmount *mnt);
+extern int mnt_want_write_file(struct file *file);
+extern int mnt_clone_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
-- 
cgit v0.10.2


From 864d7c4c068f23642efe91b33be3a84afe5f71e0 Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:56 +1000
Subject: fs: move mark_files_ro into file_table.c

This function walks the s_files lock, and operates primarily on the
files in a superblock, so it better belongs here (eg. see also
fs_may_remount_ro).

[AV: ... and it shouldn't be static after that move]

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file_table.c b/fs/file_table.c
index 3d66dbc..334ce39 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -399,6 +399,44 @@ too_bad:
 	return 0;
 }
 
+/**
+ *	mark_files_ro - mark all files read-only
+ *	@sb: superblock in question
+ *
+ *	All files are marked read-only.  We don't care about pending
+ *	delete files so this should be used in 'force' mode only.
+ */
+void mark_files_ro(struct super_block *sb)
+{
+	struct file *f;
+
+retry:
+	file_list_lock();
+	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+		struct vfsmount *mnt;
+		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+		       continue;
+		if (!file_count(f))
+			continue;
+		if (!(f->f_mode & FMODE_WRITE))
+			continue;
+		f->f_mode &= ~FMODE_WRITE;
+		if (file_check_writeable(f) != 0)
+			continue;
+		file_release_write(f);
+		mnt = mntget(f->f_path.mnt);
+		file_list_unlock();
+		/*
+		 * This can sleep, so we can't hold
+		 * the file_list_lock() spinlock.
+		 */
+		mnt_drop_write(mnt);
+		mntput(mnt);
+		goto retry;
+	}
+	file_list_unlock();
+}
+
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4f..6d4ef20 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -66,3 +66,8 @@ extern void __init mnt_init(void);
  * fs_struct.c
  */
 extern void chroot_fs_refs(struct path *, struct path *);
+
+/*
+ * file_table.c
+ */
+extern void mark_files_ro(struct super_block *);
diff --git a/fs/super.c b/fs/super.c
index 1943fdf..c170551 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -616,45 +616,6 @@ out:
 }
 
 /**
- *	mark_files_ro - mark all files read-only
- *	@sb: superblock in question
- *
- *	All files are marked read-only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only.
- */
-
-static void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-		       continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
-/**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
  *	@flags:	numeric part of options
-- 
cgit v0.10.2


From 876a9f76abbcb775f8d21cbc99fa161f9e5937f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Apr 2009 18:05:55 +0200
Subject: remove s_async_list

Remove the unused s_async_list in the superblock, a leftover of the
broken async inode deletion code that leaked into mainline.  Having this
in the middle of the sync/unmount path is not helpful for the following
cleanups.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index c170551..3d9f117 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,7 +38,6 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
-#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -72,7 +71,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
-		INIT_LIST_HEAD(&s->s_async_list);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -342,11 +340,6 @@ void generic_shutdown_super(struct super_block *sb)
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/*
-		 * wait for asynchronous fs operations to finish before going further
-		 */
-		async_synchronize_full_domain(&sb->s_async_list);
-
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
 		lock_kernel();
@@ -517,7 +510,6 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 03fb210..36bcff7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1372,11 +1372,6 @@ struct super_block {
 	 * generic_show_options()
 	 */
 	char *s_options;
-
-	/*
-	 * storage for asynchronous operations
-	 */
-	struct list_head s_async_list;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
-- 
cgit v0.10.2


From 5a3e5cb8e08bd876e2542c1451c9a93dab1b0e39 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:48 +0200
Subject: vfs: Fix sys_sync() and fsync_super() reliability (version 4)

So far, do_sync() called:
  sync_inodes(0);
  sync_supers();
  sync_filesystems(0);
  sync_filesystems(1);
  sync_inodes(1);

This ordering makes it kind of hard for filesystems as sync_inodes(0) need not
submit all the IO (for example it skips inodes with I_SYNC set) so e.g. forcing
transaction to disk in ->sync_fs() is not really enough. Therefore sys_sync has
not been completely reliable on some filesystems (ext3, ext4, reiserfs, ocfs2
and others are hit by this) when racing e.g. with background writeback. A
similar problem hits also other filesystems (e.g. ext2) because of
write_supers() being called before the sync_inodes(1).

Change the ordering of calls in do_sync() - this requires a new function
sync_blockdevs() to preserve the property that block devices are always synced
after write_super() / sync_fs() call.

The same issue is fixed in __fsync_super() function used on umount /
remount read-only.

[AV: build fixes]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/internal.h b/fs/internal.h
index 6d4ef20..343a537 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -71,3 +71,12 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+
+/*
+ * super.c
+ */
+#ifdef CONFIG_BLOCK
+extern void sync_blockdevs(void);
+#else
+static inline void sync_blockdevs(void) { }
+#endif
diff --git a/fs/super.c b/fs/super.c
index 3d9f117..18d159d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -293,6 +293,7 @@ void __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
+	sync_inodes_sb(sb, 1);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
@@ -300,7 +301,6 @@ void __fsync_super(struct super_block *sb)
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, 1);
 	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
 }
 
 /*
@@ -522,6 +522,33 @@ restart:
 	mutex_unlock(&mutex);
 }
 
+#ifdef CONFIG_BLOCK
+/*
+ *  Sync all block devices underlying some superblock
+ */
+void sync_blockdevs(void)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_bdev)
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			sync_blockdev(sb->s_bdev);
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+#endif
+
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65f..631fd5a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -26,10 +27,11 @@ static void do_sync(unsigned long wait)
 	wakeup_pdflush(0);
 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
 	vfs_dq_sync(NULL);
+	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
 	sync_supers();		/* Write the superblocks */
 	sync_filesystems(0);	/* Start syncing the filesystems */
 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	sync_blockdevs();
 	if (!wait)
 		printk("Emergency Sync complete\n");
 	if (unlikely(laptop_mode))
-- 
cgit v0.10.2


From bfe881255c74800147523b59c85328a1a826ba21 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:49 +0200
Subject: vfs: Call ->sync_fs() even if s_dirt is 0 (version 4)

sync_filesystems() has a condition that if wait == 0 and s_dirt == 0, then
->sync_fs() isn't called. This does not really make much sence since s_dirt is
generally used by a filesystem to mean that ->write_super() needs to be called.
But ->sync_fs() does different things. I even suspect that some filesystems
(btrfs?) sets s_dirt just to fool this logic.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index 18d159d..fae91ba 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -510,7 +510,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		if (sb->s_root && (wait || sb->s_dirt))
+		if (sb->s_root)
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
-- 
cgit v0.10.2


From 429479f031322a0cc5c921ffb2321a51718dc875 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:50 +0200
Subject: vfs: Make __fsync_super() a static function (version 4)

__fsync_super() does the same thing as fsync_super(). So change the only
caller to use fsync_super() and make __fsync_super() static. This removes
unnecessarily duplicated call to sync_blockdev() and prepares ground
for the changes to __fsync_super() in the following patches.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 931f6b8..fe47f72 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -241,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		sb->s_frozen = SB_FREEZE_WRITE;
 		smp_wmb();
 
-		__fsync_super(sb);
+		fsync_super(sb);
 
 		sb->s_frozen = SB_FREEZE_TRANS;
 		smp_wmb();
diff --git a/fs/super.c b/fs/super.c
index fae91ba..8dbe1ea 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL(unlock_super);
  * device.  Takes the superblock lock.  Requires a second blkdev
  * flush by the caller to complete the operation.
  */
-void __fsync_super(struct super_block *sb)
+static int __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
@@ -300,7 +300,7 @@ void __fsync_super(struct super_block *sb)
 	unlock_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
+	return sync_blockdev(sb->s_bdev);
 }
 
 /*
@@ -310,8 +310,7 @@ void __fsync_super(struct super_block *sb)
  */
 int fsync_super(struct super_block *sb)
 {
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
+	return __fsync_super(sb);
 }
 EXPORT_SYMBOL_GPL(fsync_super);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 36bcff7..41a9907 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2078,7 +2078,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
-extern void __fsync_super(struct super_block *sb);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
-- 
cgit v0.10.2


From 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:51 +0200
Subject: vfs: Make sys_sync() use fsync_super() (version 4)

It is unnecessarily fragile to have two places (fsync_super() and do_sync())
doing data integrity sync of the filesystem. Alter __fsync_super() to
accommodate needs of both callers and use it. So after this patch
__fsync_super() is the only place where we gather all the calls needed to
properly send all data on a filesystem to disk.

Nice bonus is that we get a complete livelock avoidance and write_supers()
is now only used for periodic writeback of superblocks.

sync_blockdevs() introduced a couple of patches ago is gone now.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe47f72..4b6a3b9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
 
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	if (!bdev)
+		return 0;
+	if (!wait)
+		return filemap_flush(bdev->bd_inode->i_mapping);
+	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+
 /*
  * Write out and wait upon all the dirty data associated with a block
  * device via its mapping.  Does not take the superblock lock.
  */
 int sync_blockdev(struct block_device *bdev)
 {
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
+	return __sync_blockdev(bdev, 1);
 }
 EXPORT_SYMBOL(sync_blockdev);
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff..e0fb2e7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -679,55 +679,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 }
 
 /**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last.  This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep.  So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode().  This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root) {
-			sync_inodes_sb(sb, wait);
-			sync_blockdev(sb->s_bdev);
-		}
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
-void sync_inodes(int wait)
-{
-	__sync_inodes(0);
-
-	if (wait)
-		__sync_inodes(1);
-}
-
-/**
  * write_inode_now	-	write an inode to disk
  * @inode: inode to write to disk
  * @sync: whether the write should be synchronous or not
diff --git a/fs/internal.h b/fs/internal.h
index 343a537..dbec3cc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 	return sb == blockdev_superblock;
 }
 
+extern int __sync_blockdev(struct block_device *bdev, int wait);
+
 #else
 static inline void bdev_cache_init(void)
 {
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 {
 	return 0;
 }
+
+static inline int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
-
-/*
- * super.c
- */
-#ifdef CONFIG_BLOCK
-extern void sync_blockdevs(void);
-#else
-static inline void sync_blockdevs(void) { }
-#endif
diff --git a/fs/super.c b/fs/super.c
index 8dbe1ea..c8ce5ed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
 /*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.  Requires a second blkdev
- * flush by the caller to complete the operation.
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
  */
-static int __fsync_super(struct super_block *sb)
+static int __fsync_super(struct super_block *sb, int wait)
 {
-	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
-	sync_inodes_sb(sb, 1);
+	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
 	unlock_super(sb);
 	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	return sync_blockdev(sb->s_bdev);
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
 }
 
 /*
@@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb)
  */
 int fsync_super(struct super_block *sb)
 {
-	return __fsync_super(sb);
+	int ret;
+
+	ret = __fsync_super(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __fsync_super(sb, 1);
 }
 EXPORT_SYMBOL_GPL(fsync_super);
 
@@ -469,20 +474,18 @@ restart:
 }
 
 /*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
  *
  * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync_fs
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
  * is used only here.  We set it against all filesystems and then clear it as
  * we sync them.  So redirtied filesystems are skipped.
  *
  * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
+ * calls sync_filesystems as well, process B will set all the s_need_sync
  * flags again, which will cause process A to resync everything.  Fix that with
  * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
  */
 void sync_filesystems(int wait)
 {
@@ -492,25 +495,23 @@ void sync_filesystems(int wait)
 	mutex_lock(&mutex);		/* Could be down_interruptible */
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_op->sync_fs)
-			continue;
 		if (sb->s_flags & MS_RDONLY)
 			continue;
-		sb->s_need_sync_fs = 1;
+		sb->s_need_sync = 1;
 	}
 
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync_fs)
+		if (!sb->s_need_sync)
 			continue;
-		sb->s_need_sync_fs = 0;
+		sb->s_need_sync = 0;
 		if (sb->s_flags & MS_RDONLY)
 			continue;	/* hm.  Was remounted r/o meanwhile */
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
 		if (sb->s_root)
-			sb->s_op->sync_fs(sb, wait);
+			__fsync_super(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
@@ -521,33 +522,6 @@ restart:
 	mutex_unlock(&mutex);
 }
 
-#ifdef CONFIG_BLOCK
-/*
- *  Sync all block devices underlying some superblock
- */
-void sync_blockdevs(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_bdev)
-			continue;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			sync_blockdev(sb->s_bdev);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-#endif
-
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index 631fd5a..be0798c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -18,35 +18,24 @@
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
-/*
- * sync everything.  Start out by waking pdflush, because that writes back
- * all queues in parallel.
- */
-static void do_sync(unsigned long wait)
+SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	vfs_dq_sync(NULL);
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_blockdevs();
-	if (!wait)
-		printk("Emergency Sync complete\n");
+	sync_filesystems(0);
+	sync_filesystems(1);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
-}
-
-SYSCALL_DEFINE0(sync)
-{
-	do_sync(1);
 	return 0;
 }
 
 static void do_sync_work(struct work_struct *work)
 {
-	do_sync(0);
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	sync_filesystems(0);
+	sync_filesystems(0);
+	printk("Emergency Sync complete\n");
 	kfree(work);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 41a9907..f00df65 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1321,7 +1321,7 @@ struct super_block {
 	struct rw_semaphore	s_umount;
 	struct mutex		s_lock;
 	int			s_count;
-	int			s_need_sync_fs;
+	int			s_need_sync;
 	atomic_t		s_active;
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9344547..3224820 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,7 +79,6 @@ struct writeback_control {
 void writeback_inodes(struct writeback_control *wbc);
 int inode_wait(void *);
 void sync_inodes_sb(struct super_block *, int wait);
-void sync_inodes(int wait);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
-- 
cgit v0.10.2


From c15c54f5f056ee4819da9fde59a5f2cd45445f23 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:52 +0200
Subject: vfs: Move syncing code from super.c to sync.c (version 4)

Move sync_filesystems(), __fsync_super(), fsync_super() from
super.c to sync.c where it fits better.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index c8ce5ed..f822c74 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -283,42 +283,6 @@ void unlock_super(struct super_block * sb)
 EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
-/*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
- */
-static int __fsync_super(struct super_block *sb, int wait)
-{
-	vfs_dq_sync(sb);
-	sync_inodes_sb(sb, wait);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, wait);
-	return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	int ret;
-
-	ret = __fsync_super(sb, 0);
-	if (ret < 0)
-		return ret;
-	return __fsync_super(sb, 1);
-}
-EXPORT_SYMBOL_GPL(fsync_super);
-
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
@@ -473,55 +437,6 @@ restart:
 	spin_unlock(&sb_lock);
 }
 
-/*
- * Sync all the data for all the filesystems (called by sys_sync() and
- * emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
- */
-void sync_filesystems(int wait)
-{
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_flags & MS_RDONLY)
-			continue;
-		sb->s_need_sync = 1;
-	}
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync)
-			continue;
-		sb->s_need_sync = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm.  Was remounted r/o meanwhile */
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			__fsync_super(sb, wait);
-		up_read(&sb->s_umount);
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
-}
-
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index be0798c..d5fa7b799 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -18,6 +18,91 @@
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
+/*
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
+ */
+static int __fsync_super(struct super_block *sb, int wait)
+{
+	vfs_dq_sync(sb);
+	sync_inodes_sb(sb, wait);
+	lock_super(sb);
+	if (sb->s_dirt && sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_super(struct super_block *sb)
+{
+	int ret;
+
+	ret = __fsync_super(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __fsync_super(sb, 1);
+}
+EXPORT_SYMBOL_GPL(fsync_super);
+
+/*
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
+ *
+ * This operation is careful to avoid the livelock which could easily happen
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
+ * is used only here.  We set it against all filesystems and then clear it as
+ * we sync them.  So redirtied filesystems are skipped.
+ *
+ * But if process A is currently running sync_filesystems and then process B
+ * calls sync_filesystems as well, process B will set all the s_need_sync
+ * flags again, which will cause process A to resync everything.  Fix that with
+ * a local mutex.
+ */
+static void sync_filesystems(int wait)
+{
+	struct super_block *sb;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);		/* Could be down_interruptible */
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_flags & MS_RDONLY)
+			continue;
+		sb->s_need_sync = 1;
+	}
+
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_need_sync)
+			continue;
+		sb->s_need_sync = 0;
+		if (sb->s_flags & MS_RDONLY)
+			continue;	/* hm.  Was remounted r/o meanwhile */
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			__fsync_super(sb, wait);
+		up_read(&sb->s_umount);
+		/* restart only when sb is no longer on the list */
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	mutex_unlock(&mutex);
+}
+
 SYSCALL_DEFINE0(sync)
 {
 	sync_filesystems(0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f00df65..d3f7159 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int fsync_super(struct super_block *);
 extern int fsync_no_super(struct block_device *);
 #else
 static inline void bd_forget(struct inode *inode) {}
@@ -1959,6 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	return 0;
 }
 #endif
+extern int fsync_super(struct super_block *);
 extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
@@ -2077,7 +2077,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
-extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
-- 
cgit v0.10.2


From 60b0680fa236ac4e17ce31a50048c9d75f9ec831 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:53 +0200
Subject: vfs: Rename fsync_super() to sync_filesystem() (version 4)

Rename the function so that it better describe what it really does. Also
remove the unnecessary include of buffer_head.h.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4b6a3b9..3a6d4fb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
 	if (sb) {
-		int res = fsync_super(sb);
+		int res = sync_filesystem(sb);
 		drop_super(sb);
 		return res;
 	}
@@ -246,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		sb->s_frozen = SB_FREEZE_WRITE;
 		smp_wmb();
 
-		fsync_super(sb);
+		sync_filesystem(sb);
 
 		sb->s_frozen = SB_FREEZE_TRANS;
 		smp_wmb();
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e96234..dafd484 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
 	/* make sure all pages pinned by operations on behalf of the netfs are
 	 * written to disc */
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = fsync_super(cache->mnt->mnt_sb);
+	ret = sync_filesystem(cache->mnt->mnt_sb);
 	cachefiles_end_secure(cache, saved_cred);
 
 	if (ret == -EIO)
diff --git a/fs/super.c b/fs/super.c
index f822c74..721236e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>		/* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -304,7 +303,7 @@ void generic_shutdown_super(struct super_block *sb)
 
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
-		fsync_super(sb);
+		sync_filesystem(sb);
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
@@ -543,7 +542,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
-	fsync_super(sb);
+	sync_filesystem(sb);
 
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
diff --git a/fs/sync.c b/fs/sync.c
index d5fa7b799..8aa870a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -25,7 +25,7 @@
  * case write_inode() functions do sync_dirty_buffer() and thus effectively
  * write one block at a time.
  */
-static int __fsync_super(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait)
 {
 	vfs_dq_sync(sb);
 	sync_inodes_sb(sb, wait);
@@ -43,16 +43,16 @@ static int __fsync_super(struct super_block *sb, int wait)
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int fsync_super(struct super_block *sb)
+int sync_filesystem(struct super_block *sb)
 {
 	int ret;
 
-	ret = __fsync_super(sb, 0);
+	ret = __sync_filesystem(sb, 0);
 	if (ret < 0)
 		return ret;
-	return __fsync_super(sb, 1);
+	return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(fsync_super);
+EXPORT_SYMBOL_GPL(sync_filesystem);
 
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
@@ -92,7 +92,7 @@ restart:
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
 		if (sb->s_root)
-			__fsync_super(sb, wait);
+			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d3f7159..fb1822b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1958,7 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	return 0;
 }
 #endif
-extern int fsync_super(struct super_block *);
+extern int sync_filesystem(struct super_block *);
 extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
-- 
cgit v0.10.2


From 850b201b087f5525a0a7278551c2bcd0423c3b26 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Apr 2009 16:43:54 +0200
Subject: quota: cleanup dquota sync functions (version 4)

Currently the VFS calls vfs_dq_sync to sync out disk quotas for a given
superblock.  This is a small wrapper around sync_dquots which for the
case of a non-NULL superblock is a small wrapper around quota_sync_sb.

Just make quota_sync_sb global (rename it to sync_quota_sb) and call it
directly.  Also call it directly for those cases in quota.c that have a
superblock and leave sync_dquots purely an iterator over sync_quota_sb and
remove it's superblock argument.

To make this nicer move the check for the lack of a quota_sync method
from the callers into sync_quota_sb.

[folded build fix from Alexander Beregalov <a.beregalov@gmail.com>]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a46..95c5b42 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
 	return error;
 }
 
-static void quota_sync_sb(struct super_block *sb, int type)
+#ifdef CONFIG_QUOTA
+void sync_quota_sb(struct super_block *sb, int type)
 {
 	int cnt;
 
+	if (!sb->s_qcop->quota_sync)
+		return;
+
 	sb->s_qcop->quota_sync(sb, type);
 
 	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	}
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 }
+#endif
 
-void sync_dquots(struct super_block *sb, int type)
+static void sync_dquots(int type)
 {
+	struct super_block *sb;
 	int cnt;
 
-	if (sb) {
-		if (sb->s_qcop->quota_sync)
-			quota_sync_sb(sb, type);
-		return;
-	}
-
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		if (sb->s_root && sb->s_qcop->quota_sync)
-			quota_sync_sb(sb, type);
+		if (sb->s_root)
+			sync_quota_sb(sb, type);
 		up_read(&sb->s_umount);
 		spin_lock(&sb_lock);
 		if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 			return sb->s_qcop->set_dqblk(sb, type, id, &idq);
 		}
 		case Q_SYNC:
-			sync_dquots(sb, type);
+			if (sb)
+				sync_quota_sb(sb, type);
+			else
+				sync_dquots(type);
 			return 0;
 
 		case Q_XQUOTAON:
diff --git a/fs/sync.c b/fs/sync.c
index 8aa870a..d90ab77 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,7 +27,7 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	vfs_dq_sync(sb);
+	sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 36353d9..047310f 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -20,7 +20,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
 /*
  * declaration of quota_function calls in kernel.
  */
-void sync_dquots(struct super_block *sb, int type);
+void sync_quota_sb(struct super_block *sb, int type);
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
@@ -253,12 +253,7 @@ static inline void vfs_dq_free_inode(struct inode *inode)
 		inode->i_sb->dq_op->free_inode(inode, 1);
 }
 
-/* The following two functions cannot be called inside a transaction */
-static inline void vfs_dq_sync(struct super_block *sb)
-{
-	sync_dquots(sb, -1);
-}
-
+/* Cannot be called inside a transaction */
 static inline int vfs_dq_off(struct super_block *sb, int remount)
 {
 	int ret = -ENOSYS;
@@ -334,7 +329,7 @@ static inline void vfs_dq_free_inode(struct inode *inode)
 {
 }
 
-static inline void vfs_dq_sync(struct super_block *sb)
+static inline void sync_quota_sb(struct super_block *sb, int type)
 {
 }
 
-- 
cgit v0.10.2


From c3f8a40c1cd5591b882497d1d00d43d0e5bb4698 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:55 +0200
Subject: quota: Introduce writeout_quota_sb() (version 4)

Introduce this function which just writes all the quota structures but
avoids all the syncing and cache pruning work to expose quota structures
to userspace. Use this function from __sync_filesystem when wait == 0.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/sync.c b/fs/sync.c
index d90ab77..4487b55 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,7 +27,11 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	sync_quota_sb(sb, -1);
+	/* Avoid doing twice syncing and cache pruning for quota sync */
+	if (!wait)
+		writeout_quota_sb(sb, -1);
+	else
+		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 047310f..7bc4575 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -21,6 +21,11 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
  * declaration of quota_function calls in kernel.
  */
 void sync_quota_sb(struct super_block *sb, int type);
+static inline void writeout_quota_sb(struct super_block *sb, int type)
+{
+	if (sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, type);
+}
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
@@ -333,6 +338,10 @@ static inline void sync_quota_sb(struct super_block *sb, int type)
 {
 }
 
+static inline void writeout_quota_sb(struct super_block *sb, int type)
+{
+}
+
 static inline int vfs_dq_off(struct super_block *sb, int remount)
 {
 	return 0;
-- 
cgit v0.10.2


From 59d697b70285c348c01cfc2695c3469ba71d7539 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:41 -0400
Subject: btrfs: remove ->write_super and stop maintaining ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b68330..8612b3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2322,7 +2322,6 @@ err:
 	btrfs_update_inode(trans, root, dir);
 	btrfs_drop_nlink(inode);
 	ret = btrfs_update_inode(trans, root, inode);
-	dir->i_sb->s_dirt = 1;
 out:
 	return ret;
 }
@@ -2806,7 +2805,6 @@ error:
 				      pending_del_nr);
 	}
 	btrfs_free_path(path);
-	inode->i_sb->s_dirt = 1;
 	return ret;
 }
 
@@ -3768,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		init_special_inode(inode, inode->i_mode, rdev);
 		btrfs_update_inode(trans, root, inode);
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3833,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3880,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (err)
 		drop_inode = 1;
 
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, dir);
 	err = btrfs_update_inode(trans, root, inode);
 
@@ -3962,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 
@@ -4991,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 	if (drop_inode)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 708ac06..52d8452 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -397,7 +397,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	sb->s_dirt = 0;
 	if (!wait) {
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
@@ -408,7 +407,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
-	sb->s_dirt = 0;
 	return ret;
 }
 
@@ -454,11 +452,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	return 0;
 }
 
-static void btrfs_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
 static int btrfs_test_super(struct super_block *s, void *data)
 {
 	struct btrfs_fs_devices *test_fs_devices = data;
@@ -689,7 +682,6 @@ static int btrfs_unfreeze(struct super_block *sb)
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
 	.put_super	= btrfs_put_super,
-	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
 	.write_inode	= btrfs_write_inode,
-- 
cgit v0.10.2


From ca41f7b918294c2a17780e057568413dcbfc6d49 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:42 -0400
Subject: ext3: remove ->write_super and stop maintaining ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202d..27967f9 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ do_more:
 		count = overflow;
 		goto do_more;
 	}
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ allocated:
 	if (!fatal)
 		fatal = err;
 
-	sb->s_dirt = 1;
 	if (fatal)
 		goto out;
 
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60..b399912 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (!fatal)
 		fatal = err;
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ got:
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	sb->s_dirt = 1;
 
 	inode->i_uid = current_fsuid();
 	if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa243..b0248c6 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle,
 				ext3_update_dynamic_rev(sb);
 				EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-				sb->s_dirt = 1;
 				handle->h_sync = 1;
 				err = ext3_journal_dirty_metadata(handle,
 						EXT3_SB(sb)->s_sbh);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf38..8a0b263 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-	sb->s_dirt = 1;
 
 exit_journal:
 	unlock_super(sb);
@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	}
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c70d52..1efd958 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext3_unfreeze(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
 static int ext3_freeze(struct super_block *sb);
 
 /*
@@ -761,7 +760,6 @@ static const struct super_operations ext3_sops = {
 	.dirty_inode	= ext3_dirty_inode,
 	.delete_inode	= ext3_delete_inode,
 	.put_super	= ext3_put_super,
-	.write_super	= ext3_write_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
@@ -1785,7 +1783,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 #else
 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-		sb->s_dirt = 1;
 	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -2265,7 +2262,6 @@ static int ext3_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
 		ext3_commit_super(sb, es, 1);
@@ -2308,7 +2304,6 @@ static int ext3_create_journal(struct super_block * sb,
 	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
 
 	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;
 
 	/* Make sure we flush the recovery flag to disk. */
 	ext3_commit_super(sb, es, 1);
@@ -2354,7 +2349,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
@@ -2413,29 +2407,14 @@ int ext3_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT3_SB(sb)->s_journal;
-	sb->s_dirt = 0;
 	ret = ext3_journal_force_commit(journal);
 	return ret;
 }
 
-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
- */
-static void ext3_write_super (struct super_block * sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;
 
-	sb->s_dirt = 0;
 	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
 		if (wait)
 			log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2451,7 +2430,6 @@ static int ext3_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT3_SB(sb)->s_journal;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be8..545e37c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,
 
 	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
 		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-		sb->s_dirt = 1;
 		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	}
 }
-- 
cgit v0.10.2


From b7d245de25d1f0bb75a0d04194b647762b30d3db Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:43 -0400
Subject: gfs2: remove ->write_super and stop maintaining ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index aa62cf5..f2e449c 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -764,7 +764,6 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	gfs2_log_unlock(sdp);
 
-	sdp->sd_vfs->s_dirt = 0;
 	up_write(&sdp->sd_log_flush_lock);
 
 	kfree(ai);
@@ -823,7 +822,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	log_refund(sdp, tr);
 	buf_lo_incore_commit(sdp, tr);
 
-	sdp->sd_vfs->s_dirt = 1;
 	up_read(&sdp->sd_log_flush_lock);
 
 	gfs2_log_lock(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 40bcc37..0a68013 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -788,17 +788,6 @@ restart:
 }
 
 /**
- * gfs2_write_super
- * @sb: the superblock
- *
- */
-
-static void gfs2_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
-/**
  * gfs2_sync_fs - sync the filesystem
  * @sb: the superblock
  *
@@ -807,7 +796,6 @@ static void gfs2_write_super(struct super_block *sb)
 
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
-	sb->s_dirt = 0;
 	if (wait && sb->s_fs_info)
 		gfs2_log_flush(sb->s_fs_info, NULL);
 	return 0;
@@ -1324,7 +1312,6 @@ const struct super_operations gfs2_super_ops = {
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
 	.put_super		= gfs2_put_super,
-	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs 		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
-- 
cgit v0.10.2


From 94cb993f2ee99f3a9318e7b4dbb383497c4bedea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:44 -0400
Subject: ocfs2: remove ->write_super and stop maintaining ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c6163f..3eb076c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -126,7 +126,6 @@ static int ocfs2_get_sector(struct super_block *sb,
 			    struct buffer_head **bh,
 			    int block,
 			    int sect_size);
-static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +140,6 @@ static const struct super_operations ocfs2_sops = {
 	.clear_inode	= ocfs2_clear_inode,
 	.delete_inode	= ocfs2_delete_inode,
 	.sync_fs	= ocfs2_sync_fs,
-	.write_super	= ocfs2_write_super,
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options   = ocfs2_show_options,
@@ -365,24 +363,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
 	.llseek =	generic_file_llseek,
 };
 
-/*
- * write_super and sync_fs ripped right out of ext3.
- */
-static void ocfs2_write_super(struct super_block *sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ocfs2_sync_fs(struct super_block *sb, int wait)
 {
 	int status;
 	tid_t target;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	sb->s_dirt = 0;
-
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
-- 
cgit v0.10.2


From 517bfae28353e996160518add4d00033d3886e61 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:45 -0400
Subject: qnx4: remove ->write_super

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f3..95c12fc 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -70,14 +70,6 @@ static void qnx4_delete_inode(struct inode *inode)
 	unlock_kernel();
 }
 
-static void qnx4_write_super(struct super_block *sb)
-{
-	lock_kernel();
-	QNX4DEBUG(("qnx4: write_super\n"));
-	sb->s_dirt = 0;
-	unlock_kernel();
-}
-
 static int qnx4_write_inode(struct inode *inode, int unused)
 {
 	struct qnx4_inode_entry *raw_inode;
@@ -138,7 +130,6 @@ static const struct super_operations qnx4_sops =
 #ifdef CONFIG_QNX4FS_RW
 	.write_inode	= qnx4_write_inode,
 	.delete_inode	= qnx4_delete_inode,
-	.write_super	= qnx4_write_super,
 #endif
 };
 
-- 
cgit v0.10.2


From 8c85e125124a473d6f3e9bb187b0b84207f81d91 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Apr 2009 18:00:26 +0200
Subject: remove ->write_super call in generic_shutdown_super

We just did a full fs writeout using sync_filesystem before, and if
that's not enough for the filesystem it can perform it's own writeout
in ->put_super, which many filesystems already do.

Move a call to foofs_write_super into every foofs_put_super for now to
guarantee identical behaviour until it's cleaned up by the individual
filesystem maintainers.

Exceptions:

 - affs already has identical copy & pasted code at the beginning of
   affs_put_super so no need to do it twice.
 - xfs does the right thing without it and I have changes pending for
   the xfs tree touching this are so I don't really need conflicts
   here..

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d..4cf3d26 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
 
+static void bfs_write_super(struct super_block *s);
 void dump_imap(const char *prefix, struct super_block *s);
 
 struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -216,6 +217,9 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
+	if (s->s_dirt)
+		bfs_write_super(s);
+
 	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e..3cdb761 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -258,6 +258,9 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	if (sb->s_dirt)
+		exofs_write_super(sb);
+
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e3c748f..932a2bc 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -114,6 +114,9 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
+	if (sb->s_dirt)
+		ext2_write_super(sb);
+
 	ext2_xattr_put_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f016707..c7b8f8d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -576,6 +576,9 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	if (sb->s_dirt)
+		ext4_write_super(sb);
+
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a..4978621 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,6 +451,9 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	if (sb->s_dirt)
+		fat_write_super(sb);
+
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb74..e071e6d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -65,6 +65,8 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	if (sb->s_dirt)
+		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a6402..40bdab7 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -199,6 +199,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+	if (sb->s_dirt)
+		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c..5059e96 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -174,6 +174,9 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	if (sb->s_dirt)
+		jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03..7901d8c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,6 +65,7 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
+static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 static int test_exclusive_mount(struct file_system_type *fs_type,
 				struct block_device *bdev, int flags);
@@ -315,6 +316,9 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
+	if (sb->s_dirt)
+		nilfs_write_super(sb);
+
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9dbdcfb..1b52daa 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -468,6 +468,9 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	if (s->s_dirt)
+		reiserfs_write_super(s);
+
 	/* change file system state to current state if it was mounted with read-write permissions */
 	if (!(s->s_flags & MS_RDONLY)) {
 		if (!journal_begin(&th, s, 10)) {
diff --git a/fs/super.c b/fs/super.c
index 721236e..d9a29d5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -311,8 +311,6 @@ void generic_shutdown_super(struct super_block *sb)
 		invalidate_inodes(sb);
 		lock_kernel();
 
-		if (sop->write_super && sb->s_dirt)
-			sop->write_super(sb);
 		if (sop->put_super)
 			sop->put_super(sb);
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48..cd80316 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -72,6 +72,9 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	if (sb->s_dirt)
+		sysv_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		/* XXX ext2 also updates the state here */
 		mark_buffer_dirty(sbi->s_bh1);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6035929..74afb9f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1152,6 +1152,9 @@ static void ufs_put_super(struct super_block *sb)
 		
 	UFSD("ENTER\n");
 
+	if (sb->s_dirt)
+		ufs_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY))
 		ufs_put_super_internal(sb);
 	
-- 
cgit v0.10.2


From f3da392e9ff14b9f388e74319e6d195848991c07 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:32:03 +0400
Subject: dcache: extrace and use d_unlinked()

d_unlinked() will be used in middle-term to ban checkpointing when opened
but unlinked file is detected, and in long term, to detect such situation
and special case on it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6..9e5cd3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
 
 	spin_lock(&vfsmount_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+	if (d_unlinked(dentry) &&
 		(prepend(&end, &buflen, " (deleted)", 10) != 0))
 			goto Elong;
 
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 
 	spin_lock(&dcache_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+	if (d_unlinked(dentry) &&
 		(prepend(&end, &buflen, "//deleted", 9) != 0))
 			goto Elong;
 	if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	read_unlock(&current->fs->lock);
 
 	error = -ENOENT;
-	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
+	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		struct path tmp = root;
 		char * cwd;
diff --git a/fs/namespace.c b/fs/namespace.c
index 120b8a6..7e537f0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1384,7 +1384,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1566,7 +1566,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+	if (d_unlinked(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -2129,9 +2129,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+	if (d_unlinked(new.dentry))
 		goto out2;
-	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+	if (d_unlinked(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 72ce2ae..30b93b2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -353,6 +353,11 @@ static inline int d_unhashed(struct dentry *dentry)
 	return (dentry->d_flags & DCACHE_UNHASHED);
 }
 
+static inline int d_unlinked(struct dentry *dentry)
+{
+	return d_unhashed(dentry) && !IS_ROOT(dentry);
+}
+
 static inline struct dentry *dget_parent(struct dentry *dentry)
 {
 	struct dentry *ret;
-- 
cgit v0.10.2


From e5004753388dcf5e1b8a52ac0ab807d232340fbb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 16:08:56 +0200
Subject: cleanup sync_supers

Merge the write_super helper into sync_super and move the check for
->write_super earlier so that we can avoid grabbing a reference to
a superblock that doesn't have it.

While we're at it also add a little comment documenting sync_supers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index d9a29d5..cb19fff 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -399,16 +399,14 @@ void drop_super(struct super_block *sb)
 
 EXPORT_SYMBOL(drop_super);
 
-static inline void write_super(struct super_block *sb)
-{
-	lock_super(sb);
-	if (sb->s_root && sb->s_dirt)
-		if (sb->s_op->write_super)
-			sb->s_op->write_super(sb);
-	unlock_super(sb);
-}
-
-/*
+/**
+ * sync_supers - helper for periodic superblock writeback
+ *
+ * Call the write_super method if present on all dirty superblocks in
+ * the system.  This is for the periodic writeback used by most older
+ * filesystems.  For data integrity superblock writeback use
+ * sync_filesystems() instead.
+ *
  * Note: check the dirty flag before waiting, so we don't
  * hold up the sync while mounting a device. (The newly
  * mounted device won't need syncing.)
@@ -420,12 +418,17 @@ void sync_supers(void)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_dirt) {
+		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
+
 			down_read(&sb->s_umount);
-			write_super(sb);
+			lock_super(sb);
+			if (sb->s_root && sb->s_dirt)
+				sb->s_op->write_super(sb);
+			unlock_super(sb);
 			up_read(&sb->s_umount);
+
 			spin_lock(&sb_lock);
 			if (__put_super_and_need_restart(sb))
 				goto restart;
-- 
cgit v0.10.2


From 5af7926ff33b68b3ba46531471c6e0564b285efc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 15:41:25 +0200
Subject: enforce ->sync_fs is only called for rw superblock

Make sure a superblock really is writeable by checking MS_RDONLY
under s_umount.  sync_filesystems needed some re-arragement for
that, but all but one sync_filesystem caller had the correct locking
already so that we could add that check there.  cachefiles grew
s_umount locking.

I've also added a WARN_ON to sync_filesystem to assert this for
future callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 52d8452..9f179d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -394,9 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
-	if (sb->s_flags & MS_RDONLY)
-		return 0;
-
 	if (!wait) {
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index dafd484..431accd 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
 	/* make sure all pages pinned by operations on behalf of the netfs are
 	 * written to disc */
 	cachefiles_begin_secure(cache, &saved_cred);
+	down_read(&cache->mnt->mnt_sb->s_umount);
 	ret = sync_filesystem(cache->mnt->mnt_sb);
+	up_read(&cache->mnt->mnt_sb->s_umount);
 	cachefiles_end_secure(cache, saved_cred);
 
 	if (ret == -EIO)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1b52daa..3da0401 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -64,18 +64,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
-	if (!(s->s_flags & MS_RDONLY)) {
-		struct reiserfs_transaction_handle th;
-		reiserfs_write_lock(s);
-		if (!journal_begin(&th, s, 1))
-			if (!journal_end_sync(&th, s, 1))
-				reiserfs_flush_old_commits(s);
-		s->s_dirt = 0;	/* Even if it's not true.
-				 * We'll loop forever in sync_supers otherwise */
-		reiserfs_write_unlock(s);
-	} else {
-		s->s_dirt = 0;
-	}
+	struct reiserfs_transaction_handle th;
+
+	reiserfs_write_lock(s);
+	if (!journal_begin(&th, s, 1))
+		if (!journal_end_sync(&th, s, 1))
+			reiserfs_flush_old_commits(s);
+	s->s_dirt = 0;	/* Even if it's not true.
+			 * We'll loop forever in sync_supers otherwise */
+	reiserfs_write_unlock(s);
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 4487b55..89c37f7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -51,6 +51,18 @@ int sync_filesystem(struct super_block *sb)
 {
 	int ret;
 
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	/*
+	 * No point in syncing out anything if the filesystem is read-only.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
 	ret = __sync_filesystem(sb, 0);
 	if (ret < 0)
 		return ret;
@@ -79,25 +91,22 @@ static void sync_filesystems(int wait)
 
 	mutex_lock(&mutex);		/* Could be down_interruptible */
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_flags & MS_RDONLY)
-			continue;
+	list_for_each_entry(sb, &super_blocks, s_list)
 		sb->s_need_sync = 1;
-	}
 
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (!sb->s_need_sync)
 			continue;
 		sb->s_need_sync = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm.  Was remounted r/o meanwhile */
 		sb->s_count++;
 		spin_unlock(&sb_lock);
+
 		down_read(&sb->s_umount);
-		if (sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
+
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
 		if (__put_super_and_need_restart(sb))
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a75..84f3c7f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -447,9 +447,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	if (!wait)
 		return 0;
 
-	if (sb->s_flags & MS_RDONLY)
-		return 0;
-
 	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
-- 
cgit v0.10.2


From 443b94baaa16771e98b29ca7c24f1e305738ffca Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 May 2009 23:48:50 -0400
Subject: Make sure that all callers of remount hold s_umount exclusive

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index cb19fff..49f670c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -579,7 +579,7 @@ static void do_emergency_remount(struct work_struct *work)
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		sb->s_count++;
 		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
+		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
 			/*
 			 * ->remount_fs needs lock_kernel().
@@ -590,7 +590,8 @@ static void do_emergency_remount(struct work_struct *work)
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
 			unlock_kernel();
 		}
-		drop_super(sb);
+		up_write(&sb->s_umount);
+		put_super(sb);
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
-- 
cgit v0.10.2


From 62c6943b4b1e818aea60c11c5a68a50785b83119 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 May 2009 03:12:29 -0400
Subject: Trim a bit of crap from fs.h

do_remount_sb() is fs/internal.h fodder, fsync_no_super() is long gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/internal.h b/fs/internal.h
index dbec3cc..d55ef56 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -78,3 +78,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+
+/*
+ * super.c
+ */
+extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb1822b..e7833ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int fsync_no_super(struct block_device *);
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@ -2079,8 +2078,6 @@ extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
-extern int do_remount_sb(struct super_block *sb, int flags,
-			 void *data, int force);
 #ifdef CONFIG_BLOCK
 extern sector_t bmap(struct inode *, sector_t);
 #endif
-- 
cgit v0.10.2


From a9e220f8322e2b0e0b8903fe00265461cffad3f0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 May 2009 22:10:44 -0400
Subject: No need to do lock_super() for exclusion in generic_shutdown_super()

We can't run into contention on it.  All other callers of lock_super()
either hold s_umount (and we have it exclusive) or hold an active
reference to superblock in question, which prevents the call of
generic_shutdown_super() while the reference is held.  So we can
replace lock_super(s) with get_fs_excl() in generic_shutdown_super()
(and corresponding change for unlock_super(), of course).

Since ext4 expects s_lock held for its put_super, take lock_super()
into it.  The rest of filesystems do not care at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7b8f8d..0d3034c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -576,6 +576,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_super(sb);
 	if (sb->s_dirt)
 		ext4_write_super(sb);
 
@@ -645,7 +646,6 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_super(sb);
 	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
diff --git a/fs/super.c b/fs/super.c
index 49f670c..54fd331 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -304,7 +304,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
 		sync_filesystem(sb);
-		lock_super(sb);
+		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
 		/* bad name - it should be evict_inodes() */
@@ -322,7 +322,7 @@ void generic_shutdown_super(struct super_block *sb)
 		}
 
 		unlock_kernel();
-		unlock_super(sb);
+		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
-- 
cgit v0.10.2


From 6cfd0148425e528b859b26e436b01f23f6926224 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 15:40:36 +0200
Subject: push BKL down into ->put_super

Move BKL into ->put_super from the only caller.  A couple of
filesystems had trivial enough ->put_super (only kfree and NULLing of
s_fs_info + stuff in there) to not get any locking: coda, cramfs, efs,
hugetlbfs, omfs, qnx4, shmem, all others got the full treatment.  Most
of them probably don't need it, but I'd rather sort that out individually.
Preferably after all the other BKL pushdowns in that area.

[AV: original used to move lock_super() down as well; these changes are
removed since we don't do lock_super() at all in generic_shutdown_super()
now]
[AV: fuse, btrfs and xfs are known to need no damn BKL, exempt]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becc..0ec5aaf 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb)
 	int i;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
+	lock_kernel();
+
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
 	kfree(asb);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183..d738646 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -29,6 +29,8 @@ affs_put_super(struct super_block *sb)
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	pr_debug("AFFS: put_super()\n");
 
+	lock_kernel();
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1);
 		secs_to_datestamp(get_seconds(),
@@ -42,7 +44,8 @@ affs_put_super(struct super_block *sb)
 	affs_brelse(sbi->s_root_bh);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	return;
+
+	unlock_kernel();
 }
 
 static void
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5..ad0514d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
+	lock_kernel();
+
 	afs_put_volume(as->volume);
 
+	unlock_kernel();
+
 	_leave("");
 }
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d..9367b62 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts)
 static void
 befs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	kfree(BEFS_SB(sb)->mount_opts.iocharset);
 	BEFS_SB(sb)->mount_opts.iocharset = NULL;
 
@@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb)
 
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-	return;
+
+	unlock_kernel();
 }
 
 /* Allocate private field of the superblock, fill it.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 4cf3d26..3a9a136 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -217,6 +217,8 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
+	lock_kernel();
+
 	if (s->s_dirt)
 		bfs_write_super(s);
 
@@ -225,6 +227,8 @@ static void bfs_put_super(struct super_block *s)
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a10a59..0d92114 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
 		cFYI(1, ("Empty cifs superblock info passed to unmount"));
 		return;
 	}
+
+	lock_kernel();
+
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
 
 	unload_nls(cifs_sb->local_nls);
 	kfree(cifs_sb);
-	return;
+
+	unlock_kernel();
 }
 
 static int
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7..12d6496 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/key.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
 {
 	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
 
+	lock_kernel();
+
 	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
 	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 	ecryptfs_set_superblock_private(sb, NULL);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 3cdb761..cd1f8b1 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -258,6 +258,8 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		exofs_write_super(sb);
 
@@ -274,6 +276,8 @@ static void exofs_put_super(struct super_block *sb)
 	osduld_put_device(sbi->s_dev);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 /*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 932a2bc..a44963d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -114,6 +114,8 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		ext2_write_super(sb);
 
@@ -138,7 +140,7 @@ static void ext2_put_super (struct super_block * sb)
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1efd958..546b8d7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -398,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_kernel();
+
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -446,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
+
+	unlock_kernel();
 }
 
 static struct kmem_cache *ext3_inode_cachep;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0d3034c..1d4180b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -577,6 +577,7 @@ static void ext4_put_super(struct super_block *sb)
 	int i, err;
 
 	lock_super(sb);
+	lock_kernel();
 	if (sb->s_dirt)
 		ext4_write_super(sb);
 
@@ -646,7 +647,6 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4978621..2b88c93a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,6 +451,8 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
@@ -470,6 +472,8 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda8..cdbd165 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
+	lock_kernel();
+
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a68013..c8930b3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -719,6 +719,8 @@ static void gfs2_put_super(struct super_block *sb)
 	int error;
 	struct gfs2_jdesc *jd;
 
+	lock_kernel();
+
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -785,6 +787,8 @@ restart:
 
 	/*  At this point, we're through participating in the lockspace  */
 	gfs2_sys_fs_del(sdp);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index e071e6d..9f5eaa0 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -65,11 +65,15 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
+
+	unlock_kernel();
 }
 
 /*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 40bdab7..9b292dc 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -199,6 +199,9 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+
+	lock_kernel();
+
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
@@ -220,6 +223,8 @@ static void hfsplus_put_super(struct super_block *sb)
 		unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965..437a32e 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -99,11 +99,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+
+	lock_kernel();
+
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe96..068b34b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
 static void isofs_put_super(struct super_block *sb)
 {
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
+
 #ifdef CONFIG_JOLIET
+	lock_kernel();
+
 	if (sbi->s_nls_iocharset) {
 		unload_nls(sbi->s_nls_iocharset);
 		sbi->s_nls_iocharset = NULL;
 	}
+
+	unlock_kernel();
 #endif
 
 	kfree(sbi);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 5059e96..37b1212 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -174,6 +174,8 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		jffs2_write_super(sb);
 
@@ -195,6 +197,8 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
+	unlock_kernel();
+
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d9b0e92..3eb13ad 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -183,6 +183,9 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
+
+	lock_kernel();
+
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +198,8 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 enum {
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2..7eb5397 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb)
 	int i;
 	struct minix_sb_info *sbi = minix_sb(sb);
 
+	lock_kernel();
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
 			sbi->s_ms->s_state = sbi->s_mount_state;
@@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache * minix_inode_cachep;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e..b99ce20 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
+	lock_kernel();
+
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7901d8c..7262e84 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -316,6 +316,8 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		nilfs_write_super(sb);
 
@@ -333,6 +335,8 @@ static void nilfs_put_super(struct super_block *sb)
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6aa7c47..a9ec4e1 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2246,6 +2246,9 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering.");
+
+	lock_kernel();
+
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2444,7 +2447,8 @@ static void ntfs_put_super(struct super_block *sb)
 	}
 	sb->s_fs_info = NULL;
 	kfree(vol);
-	return;
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3eb076c..0273759 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1536,9 +1536,13 @@ static void ocfs2_put_super(struct super_block *sb)
 {
 	mlog_entry("(0x%p)\n", sb);
 
+	lock_kernel();
+
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
 
+	unlock_kernel();
+
 	mlog_exit_void();
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3da0401..90dcb7b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,6 +465,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	lock_kernel();
+
 	if (s->s_dirt)
 		reiserfs_write_super(s);
 
@@ -500,7 +502,7 @@ static void reiserfs_put_super(struct super_block *s)
 	kfree(s->s_fs_info);
 	s->s_fs_info = NULL;
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbf..1402d2d 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
 {
 	struct smb_sb_info *server = SMB_SB(sb);
 
+	lock_kernel();
+
 	smb_lock_server(server);
 	server->state = CONN_INVALID;
 	smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
 	smb_unlock_server(server);
 	put_pid(server->conn_pid);
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624..3b52770 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
+
+	unlock_kernel();
 }
 
 
diff --git a/fs/super.c b/fs/super.c
index 54fd331..bdd7158 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -309,7 +309,6 @@ void generic_shutdown_super(struct super_block *sb)
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
-		lock_kernel();
 
 		if (sop->put_super)
 			sop->put_super(sb);
@@ -320,8 +319,6 @@ void generic_shutdown_super(struct super_block *sb)
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
 		}
-
-		unlock_kernel();
 		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index cd80316..a818986 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -72,6 +72,8 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		sysv_write_super(sb);
 
@@ -87,6 +89,8 @@ static void sysv_put_super(struct super_block *sb)
 		brelse(sbi->s_bh2);
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 84f3c7f..522c3fd 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1684,6 +1684,9 @@ static void ubifs_put_super(struct super_block *sb)
 
 	ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
 		  c->vi.vol_id);
+
+	lock_kernel();
+
 	/*
 	 * The following asserts are only valid if there has not been a failure
 	 * of the media. For example, there will be dirty inodes if we failed
@@ -1750,6 +1753,8 @@ static void ubifs_put_super(struct super_block *sb)
 	ubi_close_volume(c->ubi);
 	mutex_unlock(&c->umount_mutex);
 	kfree(c);
+
+	unlock_kernel();
 }
 
 static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 0ba4410..04802cc 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -2062,6 +2062,9 @@ static void udf_put_super(struct super_block *sb)
 	struct udf_sb_info *sbi;
 
 	sbi = UDF_SB(sb);
+
+	lock_kernel();
+
 	if (sbi->s_vat_inode)
 		iput(sbi->s_vat_inode);
 	if (sbi->s_partitions)
@@ -2077,6 +2080,8 @@ static void udf_put_super(struct super_block *sb)
 	kfree(sbi->s_partmaps);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 74afb9f..2b4d2b6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -594,6 +594,9 @@ static void ufs_put_super_internal(struct super_block *sb)
 
 	
 	UFSD("ENTER\n");
+
+	lock_kernel();
+
 	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +624,9 @@ static void ufs_put_super_internal(struct super_block *sb)
 		brelse (sbi->s_ucg[i]);
 	kfree (sbi->s_ucg);
 	kfree (base);
+
+	unlock_kernel();
+
 	UFSD("EXIT\n");
 }
 
-- 
cgit v0.10.2


From bbd6851a3213a525128473e978b692ab6ac11aba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 6 May 2009 10:43:07 -0400
Subject: Push lock_super() into the ->remount_fs() of filesystems that care
 about it

Note that since we can't run into contention between remount_fs and write_super
(due to exclusion on s_umount), we have to care only about filesystems that
touch lock_super() on their own.  Out of those ext3, ext4, hpfs, sysv and ufs
do need it; fat doesn't since its ->remount_fs() only accesses assign-once
data (basically, it's "we have no atime on directories and only have atime on
files for vfat; force nodiratime and possibly noatime into *flags").

[folded a build fix from hch]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 546b8d7..e213a26 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2491,6 +2491,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #endif
 
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -2598,6 +2599,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2614,6 +2616,7 @@ restore_opts:
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1d4180b..a9c6834 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3421,6 +3421,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -3554,6 +3555,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
 	return 0;
 
 restore_opts:
@@ -3573,6 +3575,7 @@ restore_opts:
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 437a32e..f68193c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -398,6 +398,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
+	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -430,9 +431,11 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	replace_mount_options(s, new_opts);
 
+	unlock_super(s);
 	return 0;
 
 out_err:
+	unlock_super(s);
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/super.c b/fs/super.c
index bdd7158..2a49fed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -556,9 +556,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
-		lock_super(sb);
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		unlock_super(sb);
 		if (retval)
 			return retval;
 	}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index a818986..e0a39f1 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -61,10 +61,12 @@ clean:
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	lock_super(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	if (!(*flags & MS_RDONLY))
 		sb->s_dirt = 1;
+	unlock_super(sb);
 	return 0;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2b4d2b6..a5ecabf 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1181,6 +1181,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 	
+	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
 	usb1 = ubh_get_usb_first(uspi);
@@ -1193,17 +1194,21 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
 	new_mount_opt = 0;
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
-	if (!ufs_parse_options (data, &new_mount_opt))
+	if (!ufs_parse_options (data, &new_mount_opt)) {
+		unlock_super(sb);
 		return -EINVAL;
+	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
 		new_mount_opt |= ufstype;
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
+		unlock_super(sb);
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
+		unlock_super(sb);
 		return 0;
 	}
 	
@@ -1228,6 +1233,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 #ifndef CONFIG_UFS_FS_WRITE
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
+		unlock_super(sb);
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1236,16 +1242,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
+			unlock_super(sb);
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
+			unlock_super(sb);
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
 #endif
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
+	unlock_super(sb);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 6fac98dd218653c6aff8a0f56305c424930cea2a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:31:17 -0400
Subject: Push BKL into do_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 42ee059..9a3334a 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -371,8 +371,6 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
 	int retval = -EINVAL;
 	char *name;
 
-	lock_kernel();
-
 	name = getname(path);
 	retval = PTR_ERR(name);
 	if (IS_ERR(name))
@@ -392,7 +390,6 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
 	}
 	putname(name);
  out:
-	unlock_kernel();
 	return retval;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index bb2a9b2..6aefb77 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 		}
 	}
 
-	lock_kernel();
 	retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
 			flags, (void*)data_page);
-	unlock_kernel();
 
  out4:
 	free_page(data_page);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7e537f0..4740f7b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (retval)
 		goto dput_out;
 
+	lock_kernel();
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
@@ -1933,6 +1934,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
+	unlock_kernel();
 dput_out:
 	path_put(&path);
 	return retval;
@@ -2046,10 +2048,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
-	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
-	unlock_kernel();
 	free_page(data_page);
 
 out3:
-- 
cgit v0.10.2


From 7f78d4cd4c5d01864943c22b79df1b6bde923129 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:34:06 -0400
Subject: Push BKL down beyond VFS-only parts of do_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 4740f7b..b94325f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1515,8 +1515,11 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else
+	else {
+		lock_kernel();
 		err = do_remount_sb(sb, flags, data, 0);
+		unlock_kernel();
+	}
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
@@ -1630,7 +1633,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
+	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1921,7 +1926,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (retval)
 		goto dput_out;
 
-	lock_kernel();
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
@@ -1934,7 +1938,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
-	unlock_kernel();
 dput_out:
 	path_put(&path);
 	return retval;
-- 
cgit v0.10.2


From 4aa98cf768b6f2ea4b204620d949a665959214f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:36:58 -0400
Subject: Push BKL down into do_remount_sb()

[folded fix from Jiri Slaby]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index b94325f..2dd333b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1060,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
 		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			lock_kernel();
+		if (!(sb->s_flags & MS_RDONLY))
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-			unlock_kernel();
-		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1515,11 +1512,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else {
-		lock_kernel();
+	else
 		err = do_remount_sb(sb, flags, data, 0);
-		unlock_kernel();
-	}
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
diff --git a/fs/super.c b/fs/super.c
index 2a49fed..a64f362 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -542,25 +542,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	lock_kernel();
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
 		if (force)
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
+		else if (!fs_may_remount_ro(sb)) {
+			unlock_kernel();
 			return -EBUSY;
+		}
 		retval = vfs_dq_off(sb, 1);
-		if (retval < 0 && retval != -ENOSYS)
+		if (retval < 0 && retval != -ENOSYS) {
+			unlock_kernel();
 			return -EBUSY;
+		}
 	}
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval)
+		if (retval) {
+			unlock_kernel();
 			return retval;
+		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	unlock_kernel();
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
 	return 0;
@@ -581,9 +589,7 @@ static void do_emergency_remount(struct work_struct *work)
 			 *
 			 * What lock protects sb->s_flags??
 			 */
-			lock_kernel();
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
-			unlock_kernel();
 		}
 		up_write(&sb->s_umount);
 		put_super(sb);
-- 
cgit v0.10.2


From 01ba687577647beef6c5f2ea59bfb56fac9fcde2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 May 2009 23:34:27 +0200
Subject: jffs2: move jffs2_write_super to super.c

jffs2_write_super is only called from super.c and doesn't use any
functionality from fs.c.  So move it over to super.c and make it
static there.

[should go in through the vfs tree as it is a requirement for the
 next patch]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d..237b27a 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -402,21 +402,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void jffs2_write_super (struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	sb->s_dirt = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
-}
-
-
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5..2228380 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 37b1212..a80a50e 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -53,6 +53,20 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
+static void jffs2_write_super(struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	sb->s_dirt = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+	jffs2_garbage_collect_trigger(c);
+	jffs2_erase_pending_blocks(c, 0);
+	jffs2_flush_wbuf_gc(c, 0);
+}
+
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-- 
cgit v0.10.2


From ebc1ac164560a241d9bf1b7519062910c3f90a01 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 May 2009 23:35:03 +0200
Subject: ->write_super lock_super pushdown

Push down lock_super into ->write_super instances and remove it from the
caller.

Following filesystem don't need ->s_lock in ->write_super and are skipped:

 * bfs, nilfs2 - no other uses of s_lock and have internal locks in
	->write_super
 * ext2 - uses BKL in ext2_write_super and has internal calls without s_lock
 * reiserfs - no other uses of s_lock as has reiserfs_write_lock (BKL) in
 	->write_super
 * xfs - no other uses of s_lock and uses internal lock (buffer lock on
	superblock buffer) to serialize ->write_super.  Also xfs_fs_write_super
	is superflous and will go away in the next merge window

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/affs/super.c b/fs/affs/super.c
index d738646..280d361 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -54,6 +54,7 @@ affs_write_super(struct super_block *sb)
 	int clean = 2;
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 
+	lock_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		//	if (sbi->s_bitmap[i].bm_bh) {
 		//		if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
@@ -66,6 +67,7 @@ affs_write_super(struct super_block *sb)
 		sb->s_dirt = !clean;	/* redo until bitmap synced */
 	} else
 		sb->s_dirt = 0;
+	unlock_super(sb);
 
 	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index cd1f8b1..49e16af 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,6 +214,7 @@ static void exofs_write_super(struct super_block *sb)
 		return;
 	}
 
+	lock_super(sb);
 	lock_kernel();
 	sbi = sb->s_fs_info;
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,6 +247,7 @@ out:
 	if (or)
 		osd_end_request(or);
 	unlock_kernel();
+	unlock_super(sb);
 	kfree(fscb);
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9c6834..c17200a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -579,7 +579,7 @@ static void ext4_put_super(struct super_block *sb)
 	lock_super(sb);
 	lock_kernel();
 	if (sb->s_dirt)
-		ext4_write_super(sb);
+		ext4_commit_super(sb, 1);
 
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
@@ -3336,7 +3336,9 @@ int ext4_force_commit(struct super_block *sb)
 
 static void ext4_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	ext4_commit_super(sb, 1);
+	unlock_super(sb);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b88c93a..2292cbf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -441,10 +441,12 @@ static void fat_clear_inode(struct inode *inode)
 
 static void fat_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY))
 		fat_clusters_flush(sb);
+	unlock_super(sb);
 }
 
 static void fat_put_super(struct super_block *sb)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 9f5eaa0..3aac417 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -49,11 +49,13 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		return;
+
 	/* sync everything to the buffers */
-	hfs_mdb_commit(sb);
+	if (!(sb->s_flags & MS_RDONLY))
+		hfs_mdb_commit(sb);
+	unlock_super(sb);
 }
 
 /*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9b292dc..1aab8aa 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -157,10 +157,12 @@ static void hfsplus_write_super(struct super_block *sb)
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
+
+	lock_super(sb);
 	sb->s_dirt = 0;
 	if (sb->s_flags & MS_RDONLY)
 		/* warn? */
-		return;
+		goto out;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +194,8 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
+ out:
+	unlock_super(sb);
 }
 
 static void hfsplus_put_super(struct super_block *sb)
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index a80a50e..f7bfd3a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -56,15 +56,18 @@ static void jffs2_i_init_once(void *foo)
 static void jffs2_write_super(struct super_block *sb)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	lock_super(sb);
 	sb->s_dirt = 0;
 
-	if (sb->s_flags & MS_RDONLY)
-		return;
+	if (!(sb->s_flags & MS_RDONLY)) {
+		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+		jffs2_garbage_collect_trigger(c);
+		jffs2_erase_pending_blocks(c, 0);
+		jffs2_flush_wbuf_gc(c, 0);
+	}
 
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
+	unlock_super(sb);
 }
 
 static int jffs2_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/super.c b/fs/super.c
index a64f362..1905f4a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -420,10 +420,8 @@ restart:
 			spin_unlock(&sb_lock);
 
 			down_read(&sb->s_umount);
-			lock_super(sb);
 			if (sb->s_root && sb->s_dirt)
 				sb->s_op->write_super(sb);
-			unlock_super(sb);
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
diff --git a/fs/sync.c b/fs/sync.c
index 89c37f7..e9d56f6 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,10 +33,8 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	else
 		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -164,10 +162,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 
 	/* sync the superblock to buffers */
 	sb = inode->i_sb;
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 
 	/* .. finally sync the buffers to disk */
 	err = sync_blockdev(sb->s_bdev);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index e0a39f1..a3f45fc 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -37,6 +37,7 @@ static void sysv_write_super(struct super_block *sb)
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
+	lock_super(sb);
 	lock_kernel();
 	if (sb->s_flags & MS_RDONLY)
 		goto clean;
@@ -56,6 +57,7 @@ static void sysv_write_super(struct super_block *sb)
 clean:
 	sb->s_dirt = 0;
 	unlock_kernel();
+	unlock_super(sb);
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index a5ecabf..c97210e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1131,6 +1131,7 @@ static void ufs_write_super(struct super_block *sb)
 	struct ufs_super_block_third * usb3;
 	unsigned flags;
 
+	lock_super(sb);
 	lock_kernel();
 	UFSD("ENTER\n");
 	flags = UFS_SB(sb)->s_flags;
@@ -1150,6 +1151,7 @@ static void ufs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 	UFSD("EXIT\n");
 	unlock_kernel();
+	unlock_super(sb);
 }
 
 static void ufs_put_super(struct super_block *sb)
-- 
cgit v0.10.2


From 9fd5746fd3d7838bf6ff991d50f1257057d1156f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 21 May 2009 16:01:00 -0400
Subject: fs: Remove i_cindex from struct inode

The only user of the i_cindex element in the inode structure is used
is by the firewire drivers.  As part of an attempt to slim down the
inode structure to save memory --- since a typical Linux system will
have hundreds of thousands if not millions of inodes cached, a
reduction in the size inode has high leverage.

The firewire driver does not need i_cindex in any fast path, so it's
simple enough to calculate when it is needed, instead of wasting space
in the inode structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: krh@redhat.com
Cc: stefanr@s5r6.in-berlin.de
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index 823a629..2cd00b5 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -1789,12 +1789,13 @@ static int dv1394_open(struct inode *inode, struct file *file)
 	} else {
 		/* look up the card by ID */
 		unsigned long flags;
+		int idx = ieee1394_file_to_instance(file);
 
 		spin_lock_irqsave(&dv1394_cards_lock, flags);
 		if (!list_empty(&dv1394_cards)) {
 			struct video_card *p;
 			list_for_each_entry(p, &dv1394_cards, list) {
-				if ((p->id) == ieee1394_file_to_instance(file)) {
+				if ((p->id) == idx) {
 					video = p;
 					break;
 				}
@@ -1803,7 +1804,7 @@ static int dv1394_open(struct inode *inode, struct file *file)
 		spin_unlock_irqrestore(&dv1394_cards_lock, flags);
 
 		if (!video) {
-			debug_printk("dv1394: OHCI card %d not found", ieee1394_file_to_instance(file));
+			debug_printk("dv1394: OHCI card %d not found", idx);
 			return -ENODEV;
 		}
 
diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index 21d50f7..28b9f58 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -5,6 +5,7 @@
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/types.h>
+#include <linux/cdev.h>
 #include <asm/atomic.h>
 
 #include "hosts.h"
@@ -155,7 +156,10 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
  */
 static inline unsigned char ieee1394_file_to_instance(struct file *file)
 {
-	return file->f_path.dentry->d_inode->i_cindex;
+	int idx = cdev_index(file->f_path.dentry->d_inode);
+	if (idx < 0)
+		idx = 0;
+	return idx;
 }
 
 extern int hpsb_disable_irm;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f7122..b7c9d51 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 		p = inode->i_cdev;
 		if (!p) {
 			inode->i_cdev = p = new;
-			inode->i_cindex = idx;
 			list_add(&inode->i_devices, &p->list);
 			new = NULL;
 		} else if (!cdev_get(p))
@@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 	return ret;
 }
 
+int cdev_index(struct inode *inode)
+{
+	int idx;
+	struct kobject *kobj;
+
+	kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
+	if (!kobj)
+		return -1;
+	kobject_put(kobj);
+	return idx;
+}
+
 void cd_forget(struct inode *inode)
 {
 	spin_lock(&cdev_lock);
@@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_index);
 EXPORT_SYMBOL(register_chrdev);
 EXPORT_SYMBOL(unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index fb45919..f389e31 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -28,6 +28,8 @@ int cdev_add(struct cdev *, dev_t, unsigned);
 
 void cdev_del(struct cdev *);
 
+int cdev_index(struct inode *inode);
+
 void cd_forget(struct inode *);
 
 extern struct backing_dev_info directly_mappable_cdev_bdi;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7833ef..bcd6370 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -751,7 +751,6 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
 	};
-	int			i_cindex;
 
 	__u32			i_generation;
 
-- 
cgit v0.10.2


From 28ad0c118b0ed98b042d362acfe0017591921138 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 21 May 2009 16:01:02 -0400
Subject: fs: Rearrange inode structure elements to avoid waste due to padding

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcd6370..d883aa1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -729,8 +729,8 @@ struct inode {
 	struct timespec		i_atime;
 	struct timespec		i_mtime;
 	struct timespec		i_ctime;
-	unsigned int		i_blkbits;
 	blkcnt_t		i_blocks;
+	unsigned int		i_blkbits;
 	unsigned short          i_bytes;
 	umode_t			i_mode;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
-- 
cgit v0.10.2


From 13205fb9260c2377438599ef0773c6a3eaeb0b07 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 25 May 2009 09:30:45 +0200
Subject: ntfs: remove old debug check for dirty data in ntfs_put_super()

This should not trigger anymore, so kill it.

Acked-by: Anton Altaparmakov <aia21@cam.ac.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a9ec4e1..7a7b0d3 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2376,39 +2376,12 @@ static void ntfs_put_super(struct super_block *sb)
 		vol->mftmirr_ino = NULL;
 	}
 	/*
-	 * If any dirty inodes are left, throw away all mft data page cache
-	 * pages to allow a clean umount.  This should never happen any more
-	 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.  If it does,
-	 * happen anyway, we want to know...
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (sb_has_dirty_inodes(sb)) {
-		const char *s1, *s2;
-
-		mutex_lock(&vol->mft_ino->i_mutex);
-		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		mutex_unlock(&vol->mft_ino->i_mutex);
-		write_inode_now(vol->mft_ino, 1);
-		if (sb_has_dirty_inodes(sb)) {
-			static const char *_s1 = "inodes";
-			static const char *_s2 = "";
-			s1 = _s1;
-			s2 = _s2;
-		} else {
-			static const char *_s1 = "mft pages";
-			static const char *_s2 = "They have been thrown "
-					"away.  ";
-			s1 = _s1;
-			s2 = _s2;
-		}
-		ntfs_error(sb, "Dirty %s found at umount time.  %sYou should "
-				"run chkdsk.  Please email "
-				"linux-ntfs-dev@lists.sourceforge.net and say "
-				"that you saw this message.  Thank you.", s1,
-				s2);
-	}
 #endif /* NTFS_RW */
 
 	iput(vol->mft_ino);
-- 
cgit v0.10.2


From f95022161d23ee661a48af8f280472209f513a67 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Jun 2009 12:26:23 +0200
Subject: xfs: remove ->write_super and stop maintaining ->s_dirt

the write_super method is used for

 (1) writing back the superblock periodically from pdflush
 (2) called just before ->sync_fs for data integerity syncs

We don't need (1) because we have our own peridoc writeout through xfssyncd,
and we don't need (2) because xfs_fs_sync_fs performs a proper synchronous
superblock writeout after all other data and metadata has been written out.

Also remove ->s_dirt tracking as it's only used to decide when too call
->write_super.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb68526..08d6bd9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1104,15 +1104,6 @@ xfs_fs_put_super(
 	kfree(mp);
 }
 
-STATIC void
-xfs_fs_write_super(
-	struct super_block	*sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		xfs_sync_fsdata(XFS_M(sb), 0);
-	sb->s_dirt = 0;
-}
-
 STATIC int
 xfs_fs_sync_super(
 	struct super_block	*sb,
@@ -1137,7 +1128,6 @@ xfs_fs_sync_super(
 		error = xfs_quiesce_data(mp);
 	else
 		error = xfs_sync_fsdata(mp, 0);
-	sb->s_dirt = 0;
 
 	if (unlikely(laptop_mode)) {
 		int	prev_sync_seq = mp->m_sync_seq;
@@ -1443,7 +1433,6 @@ xfs_fs_fill_super(
 
 	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
 
-	sb->s_dirt = 1;
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1533,7 +1522,6 @@ static struct super_operations xfs_super_operations = {
 	.write_inode		= xfs_fs_write_inode,
 	.clear_inode		= xfs_fs_clear_inode,
 	.put_super		= xfs_fs_put_super,
-	.write_super		= xfs_fs_write_super,
 	.sync_fs		= xfs_fs_sync_super,
 	.freeze_fs		= xfs_fs_freeze,
 	.statfs			= xfs_fs_statfs,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b82..bcc39d3 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
 		xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
 				  offsetof(xfs_dsb_t, sb_frextents) +
 				  sizeof(sbp->sb_frextents) - 1);
-
-	tp->t_mountp->m_super->s_dirt = 1;
 }
 
 /*
-- 
cgit v0.10.2


From 8688b8635266cf98f00c6b0350ea2dbe7c42c321 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 26 May 2009 05:45:04 -0400
Subject: linux/magic.h: move cramfs magic out of cramfs_fs.h

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h
index 3be4e5a..6fc2bed 100644
--- a/include/linux/cramfs_fs.h
+++ b/include/linux/cramfs_fs.h
@@ -2,9 +2,8 @@
 #define __CRAMFS_H
 
 #include <linux/types.h>
+#include <linux/magic.h>
 
-#define CRAMFS_MAGIC		0x28cd3d45	/* some random number */
-#define CRAMFS_MAGIC_WEND	0x453dcd28	/* magic number with the wrong endianess */
 #define CRAMFS_SIGNATURE	"Compressed ROMFS"
 
 /*
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 927138c..1923327 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -6,6 +6,8 @@
 #define AFS_SUPER_MAGIC                0x5346414F
 #define AUTOFS_SUPER_MAGIC	0x0187
 #define CODA_SUPER_MAGIC	0x73757245
+#define CRAMFS_MAGIC		0x28cd3d45	/* some random number */
+#define CRAMFS_MAGIC_WEND	0x453dcd28	/* magic number with the wrong endianess */
 #define DEBUGFS_MAGIC          0x64626720
 #define SYSFS_MAGIC		0x62656572
 #define SECURITYFS_MAGIC	0x73636673
-- 
cgit v0.10.2


From 545b9fd3d737afc0bb5203b1e79194a471605acd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 2 Jun 2009 12:07:47 +0200
Subject: fs: remove incorrect I_NEW warnings

Some filesystems can call in to sync an inode that is still in the
I_NEW state (eg. ext family, when mounted with -osync). This is OK
because the filesystem has sole access to the new inode, so it can
modify i_state without races (because no other thread should be
modifying it, by definition of I_NEW). Ie. a false positive, so
remove the warnings.

The races are described here 7ef0d7377cb287e08f3ae94cebc919448e1f5dff,
which is also where the warnings were introduced.

Reported-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e0fb2e7..efcedb6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -289,7 +289,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
-	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -314,7 +313,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
-	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
-- 
cgit v0.10.2


From 4195f73d1329e49727bcceb028e58cb38376c2b0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 28 May 2009 09:01:15 +0200
Subject: fs: block_dump missing dentry locking

I think the block_dump output in __mark_inode_dirty is missing dentry locking.
Surely the i_dentry list can change any time, so we may not even *get* a
dentry there. If we do get one by chance, then it would appear to be able to
go away or get renamed at any time...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index efcedb6..40308e9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
 	clear_bit(BDI_pdflush, &bdi->state);
 }
 
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
 /**
  *	__mark_inode_dirty -	internal function
  *	@inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
-	if (unlikely(block_dump)) {
-		struct dentry *dentry = NULL;
-		const char *name = "?";
-
-		if (!list_empty(&inode->i_dentry)) {
-			dentry = list_entry(inode->i_dentry.next,
-					    struct dentry, d_alias);
-			if (dentry && dentry->d_name.name)
-				name = (const char *) dentry->d_name.name;
-		}
-
-		if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
-			printk(KERN_DEBUG
-			       "%s(%d): dirtied inode %lu (%s) on %s\n",
-			       current->comm, task_pid_nr(current), inode->i_ino,
-			       name, inode->i_sb->s_id);
-	}
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-- 
cgit v0.10.2


From 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Mon Sep 17 00:00:00 2001
From: Alessio Igor Bogani <abogani@texware.it>
Date: Tue, 12 May 2009 15:10:54 +0200
Subject: Push BKL down into ->remount_fs()

[xfs, btrfs, capifs, shmem don't need BKL, exempt]

Signed-off-by: Alessio Igor Bogani <abogani@texware.it>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index dff5760..ffe75e8 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <asm/byteorder.h>
 #include "usb.h"
 #include "hcd.h"
@@ -265,9 +266,13 @@ static int remount(struct super_block *sb, int *flags, char *data)
 		return -EINVAL;
 	}
 
+	lock_kernel();
+
 	if (usbfs_mount && usbfs_mount->mnt_sb)
 		update_sb(usbfs_mount->mnt_sb);
 
+	unlock_kernel();
+
 	return 0;
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 280d361..c481493 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -512,6 +513,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
+	lock_kernel();
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -519,8 +521,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	sbi->s_uid   = uid;
 	sbi->s_gid   = gid;
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		sb->s_dirt = 1;
 		while (sb->s_dirt)
@@ -529,6 +533,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	} else
 		res = affs_init_bitmap(sb, flags);
 
+	unlock_kernel();
 	return res;
 }
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a44963d..f8cbdf5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1162,6 +1162,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
+	lock_kernel();
+
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1197,12 +1199,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-		    !(sbi->s_mount_state & EXT2_VALID_FS))
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			unlock_kernel();
 			return 0;
+		}
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
@@ -1229,12 +1235,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 			sb->s_flags &= ~MS_RDONLY;
 	}
 	ext2_sync_super(sb, es);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e213a26..26aa64d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2490,6 +2490,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -2600,6 +2602,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2617,6 +2620,7 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c17200a..012c425 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3422,6 +3422,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3558,6 +3560,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 
 restore_opts:
@@ -3578,6 +3581,7 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f68193c..f2feaa0 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -398,6 +399,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
+	lock_kernel();
 	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
@@ -432,10 +434,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	replace_mount_options(s, new_opts);
 
 	unlock_super(s);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	unlock_super(s);
+	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 237b27a..3451a81 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
+	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -399,6 +401,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3eb13ad..09b1b6e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -375,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
+	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
+	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
+			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc)
+		if (rc) {
+			unlock_kernel();
 			return rc;
+		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -398,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		return jfs_mount_rw(sb, 1);
+		ret = jfs_mount_rw(sb, 1);
+		unlock_kernel();
+		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
+		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc)
+			if (rc) {
+				unlock_kernel();
 				return rc;
+			}
 			JFS_SBI(sb)->flag = flag;
-			return jfs_mount_rw(sb, 1);
+			ret = jfs_mount_rw(sb, 1);
+			unlock_kernel();
+			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d6778..26127b6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	if (data == NULL)
 		return -ENOMEM;
 
+	lock_kernel();
 	/* fill out struct with values from existing mount */
 	data->flags = nfss->flags;
 	data->rsize = nfss->rsize;
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	error = nfs_compare_remount_data(nfss, data);
 out:
 	kfree(data);
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7262e84..11151ea 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -906,6 +906,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct nilfs_mount_options old_opts;
 	int err;
 
+	lock_kernel();
+
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -985,6 +987,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		up(&sb->s_bdev->bd_mount_sem);
 	}
  out:
+	unlock_kernel();
 	return 0;
 
  rw_remount_failed:
@@ -993,6 +996,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 7a7b0d3..abaaa1c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering with remount options string: %s", opt);
+
+	lock_kernel();
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
 	*flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt))
+	if (!parse_options(vol, opt)) {
+		unlock_kernel();
 		return -EINVAL;
+	}
+	unlock_kernel();
 	ntfs_debug("Done.");
 	return 0;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0273759..201b40a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
+#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -581,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
+	lock_kernel();
+
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
 		ret = -EINVAL;
 		goto out;
@@ -684,6 +687,7 @@ unlock_osb:
 			ocfs2_set_journal_params(osb);
 	}
 out:
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 90dcb7b..2969773 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -1196,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
+	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1318,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	kfree(new_opts);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 1905f4a..83b47416 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
-	lock_kernel();
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
@@ -566,7 +565,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-	unlock_kernel();
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
 	return 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 522c3fd..3589eab 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
+#include <linux/smp_lock.h>
 #include "ubifs.h"
 
 /*
@@ -1770,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
+	lock_kernel();
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
-		if (err)
+		if (err) {
+			unlock_kernel();
 			return err;
+		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		ubifs_remount_ro(c);
@@ -1795,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	ubifs_assert(c->lst.taken_empty_lebs > 0);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 04802cc..6832135 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	if (!udf_parse_options(options, &uopt, true))
 		return -EINVAL;
 
+	lock_kernel();
 	sbi->s_flags = uopt.flags;
 	sbi->s_uid   = uopt.uid;
 	sbi->s_gid   = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 			*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c97210e..6560dda 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	struct ufs_super_block_first * usb1;
 	va_list args;
 	
+	lock_kernel();
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
@@ -1182,7 +1183,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	struct ufs_super_block_third * usb3;
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
-	
+
+	lock_kernel();
 	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
@@ -1198,6 +1200,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
 	if (!ufs_parse_options (data, &new_mount_opt)) {
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1205,12 +1208,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		unlock_super(sb);
+		unlock_kernel();
 		return 0;
 	}
 	
@@ -1236,6 +1241,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1245,11 +1251,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
@@ -1257,6 +1265,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bf..3fb789f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
 #include <linux/namei.h>
+#include <linux/smp_lock.h>
 
 #include <asm/atomic.h>
 
@@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
 
+	lock_kernel();
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 
@@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v0.10.2


From d5aacad548db1ff547adf35d0a77eb2a8ed4fe14 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 14:56:44 -0400
Subject: New helper - simple_fsync()

writes associated buffers, then does sync_inode() to write
the inode itself (and to make it clean).  Depends on
->write_inode() honouring the second argument.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/libfs.c b/fs/libfs.c
index 80046dd..ddfa899 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/uaccess.h>
 
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
+int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0, /* metadata-only; caller takes care of data */
+	};
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode(inode, &wbc);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+EXPORT_SYMBOL(simple_fsync);
+
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d883aa1..ede84fa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2345,6 +2345,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 			loff_t *ppos, const void *from, size_t available);
 
+extern int simple_fsync(struct file *, struct dentry *, int);
+
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_page(struct address_space *,
 				struct page *, struct page *);
-- 
cgit v0.10.2


From 79d25767583e4e086f8309bfd1f502660a64fe7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 09:30:08 -0400
Subject: Sanitize qnx4 fsync handling

* have directory operations use mark_buffer_dirty_inode(),
  so that sync_mapping_buffers() would get those.
* make qnx4_write_inode() honour its last argument.
* get rid of insane copies of very ancient "walk the indirect blocks"
  in qnx4/fsync - they never matched the actual fs layout and, fortunately,
  never'd been called.  Again, all this junk is not needed; ->fsync()
  should just do sync_mapping_buffers + sync_inode (and if we implement
  block allocation for qnx4, we'll need to use mark_buffer_dirty_inode()
  for extent blocks)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe..e4d408c 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_QNX4FS_FS) += qnx4.o
 
-qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o
+qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffef..ff6c1ba 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -84,7 +84,7 @@ const struct file_operations qnx4_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b..e7033ea 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -29,7 +29,7 @@ const struct file_operations qnx4_file_operations =
 #ifdef CONFIG_QNX4FS_RW
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
-	.fsync		= qnx4_sync_file,
+	.fsync		= simple_fsync,
 #endif
 };
 
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b195..0000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* 
- * QNX4 file system, Linux implementation.
- * 
- * Version : 0.1
- * 
- * Using parts of the xiafs filesystem.
- * 
- * History :
- * 
- * 24-03-1998 by Richard Frowijn : first release.
- */
-
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-
-#include <asm/system.h>
-
-/*
- * The functions for qnx4 fs file synchronization.
- */
-
-#ifdef CONFIG_QNX4FS_RW
-
-static int sync_block(struct inode *inode, unsigned short *block, int wait)
-{
-	struct buffer_head *bh;
-	unsigned short tmp;
-
-	if (!*block)
-		return 0;
-	tmp = *block;
-	bh = sb_find_get_block(inode->i_sb, *block);
-	if (!bh)
-		return 0;
-	if (*block != tmp) {
-		brelse(bh);
-		return 1;
-	}
-	if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
-		brelse(bh);
-		return -1;
-	}
-	if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
-		brelse(bh);
-		return 0;
-	}
-	ll_rw_block(WRITE, 1, &bh);
-	atomic_dec(&bh->b_count);
-	return 0;
-}
-
-#ifdef WTF
-static int sync_iblock(struct inode *inode, unsigned short *iblock,
-		       struct buffer_head **bh, int wait)
-{
-	int rc;
-	unsigned short tmp;
-
-	*bh = NULL;
-	tmp = *iblock;
-	if (!tmp)
-		return 0;
-	rc = sync_block(inode, iblock, wait);
-	if (rc)
-		return rc;
-	*bh = sb_bread(inode->i_sb, tmp);
-	if (tmp != *iblock) {
-		brelse(*bh);
-		*bh = NULL;
-		return 1;
-	}
-	if (!*bh)
-		return -1;
-	return 0;
-}
-#endif
-
-static int sync_direct(struct inode *inode, int wait)
-{
-	int i;
-	int rc, err = 0;
-
-	for (i = 0; i < 7; i++) {
-		rc = sync_block(inode,
-				(unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	return err;
-}
-
-#ifdef WTF
-static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
-{
-	int i;
-	struct buffer_head *ind_bh;
-	int rc, err = 0;
-
-	rc = sync_iblock(inode, iblock, &ind_bh, wait);
-	if (rc || !ind_bh)
-		return rc;
-
-	for (i = 0; i < 512; i++) {
-		rc = sync_block(inode,
-				((unsigned short *) ind_bh->b_data) + i,
-				wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	brelse(ind_bh);
-	return err;
-}
-
-static int sync_dindirect(struct inode *inode, unsigned short *diblock,
-			  int wait)
-{
-	int i;
-	struct buffer_head *dind_bh;
-	int rc, err = 0;
-
-	rc = sync_iblock(inode, diblock, &dind_bh, wait);
-	if (rc || !dind_bh)
-		return rc;
-
-	for (i = 0; i < 512; i++) {
-		rc = sync_indirect(inode,
-				((unsigned short *) dind_bh->b_data) + i,
-				   wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	brelse(dind_bh);
-	return err;
-}
-#endif
-
-int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
-{
-        struct inode *inode = dentry->d_inode;
-	int wait, err = 0;
-        
-        (void) file;
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	      S_ISLNK(inode->i_mode)))
-		return -EINVAL;
-
-	lock_kernel();
-	for (wait = 0; wait <= 1; wait++) {
-		err |= sync_direct(inode, wait);
-	}
-	err |= qnx4_sync_inode(inode);
-	unlock_kernel();
-	return (err < 0) ? -EIO : 0;
-}
-
-#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 95c12fc..4071286 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -24,6 +24,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include <linux/vfs.h>
 #include <asm/uaccess.h>
 
@@ -34,31 +35,6 @@ static const struct super_operations qnx4_sops;
 
 #ifdef CONFIG_QNX4FS_RW
 
-int qnx4_sync_inode(struct inode *inode)
-{
-	int err = 0;
-# if 0
-	struct buffer_head *bh;
-
-   	bh = qnx4_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
-			printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
-				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
-		}
-	        brelse (bh);
-	} else if (!bh) {
-		err = -1;
-	}
-# endif
-
-	return err;
-}
-
 static void qnx4_delete_inode(struct inode *inode)
 {
 	QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,7 +46,7 @@ static void qnx4_delete_inode(struct inode *inode)
 	unlock_kernel();
 }
 
-static int qnx4_write_inode(struct inode *inode, int unused)
+static int qnx4_write_inode(struct inode *inode, int do_sync)
 {
 	struct qnx4_inode_entry *raw_inode;
 	int block, ino;
@@ -107,6 +83,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
 	raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
 	raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
 	mark_buffer_dirty(bh);
+	if (do_sync) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk("qnx4: IO error syncing inode [%s:%08x]\n",
+					inode->i_sb->s_id, ino);
+			brelse(bh);
+			unlock_kernel();
+			return -EIO;
+		}
+	}
 	brelse(bh);
 	unlock_kernel();
 	return 0;
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3..123270c 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -187,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
 	de->di_status = 0;
 	memset(de->di_fname, 0, sizeof de->di_fname);
 	de->di_mode = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	clear_nlink(inode);
 	mark_inode_dirty(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
 	de->di_status = 0;
 	memset(de->di_fname, 0, sizeof de->di_fname);
 	de->di_mode = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
 	inode->i_ctime = dir->i_ctime;
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index 787d19e..acbaec3 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -126,8 +126,6 @@ extern void qnx4_truncate(struct inode *inode);
 extern void qnx4_free_inode(struct inode *inode);
 extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
 extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
-extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int);
-extern int qnx4_sync_inode(struct inode *inode);
 
 static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
 {
-- 
cgit v0.10.2


From 964f5369667b342994fe3f384e9ba41d404ee796 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 09:47:13 -0400
Subject: fs/qnx4: sanitize includes

fs-internal parts of qnx4_fs.h taken to fs/qnx4/qnx4.h, includes adjusted,
qnx4_fs.h doesn't need unifdef anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6..e1cd061 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
  * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
  */
 
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include "qnx4.h"
 
 #if 0
 int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ff6c1ba..003c68f 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
-
+#include "qnx4.h"
 
 static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index e7033ea..09b170a 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
  * 27-06-1998 by Frank Denis : file overwriting.
  */
 
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
+#include "qnx4.h"
 
 /*
  * We have mostly NULL's here: the current defaults are ok for
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 4071286..681df5f 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,20 +13,15 @@
  */
 
 #include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/vfs.h>
-#include <asm/uaccess.h>
+#include <linux/statfs.h>
+#include "qnx4.h"
 
 #define QNX4_VERSION  4
 #define QNX4_BMNAME   ".bitmap"
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 123270c..5972ed2 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include "qnx4.h"
 
 
 /*
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 0000000..9efc089
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
+#include <linux/fs.h>
+#include <linux/qnx4_fs.h>
+
+#define QNX4_DEBUG 0
+
+#if QNX4_DEBUG
+#define QNX4DEBUG(X) printk X
+#else
+#define QNX4DEBUG(X) (void) 0
+#endif
+
+struct qnx4_sb_info {
+	struct buffer_head	*sb_buf;	/* superblock buffer */
+	struct qnx4_super_block	*sb;		/* our superblock */
+	unsigned int		Version;	/* may be useful */
+	struct qnx4_inode_entry	*BitMap;	/* useful */
+};
+
+struct qnx4_inode_info {
+	struct qnx4_inode_entry raw;
+	loff_t mmu_private;
+	struct inode vfs_inode;
+};
+
+extern struct inode *qnx4_iget(struct super_block *, unsigned long);
+extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
+extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
+
+extern struct buffer_head *qnx4_bread(struct inode *, int, int);
+
+extern const struct inode_operations qnx4_file_inode_operations;
+extern const struct inode_operations qnx4_dir_inode_operations;
+extern const struct file_operations qnx4_file_operations;
+extern const struct file_operations qnx4_dir_operations;
+extern int qnx4_is_free(struct super_block *sb, long block);
+extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
+extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
+extern void qnx4_truncate(struct inode *inode);
+extern void qnx4_free_inode(struct inode *inode);
+extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
+extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
+
+static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
+{
+	return container_of(inode, struct qnx4_inode_info, vfs_inode);
+}
+
+static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
+{
+	return &qnx4_i(inode)->raw;
+}
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c..d94d9ee 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
  * 30-06-1998 by Frank DENIS : ugly filler.
  */
 
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/smp_lock.h>
-#include <asm/uaccess.h>
+#include "qnx4.h"
 
 #ifdef CONFIG_QNX4FS_RW
 
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 3f0eaa3..b3afd22 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -135,6 +135,7 @@ header-y += posix_types.h
 header-y += ppdev.h
 header-y += prctl.h
 header-y += qnxtypes.h
+header-y += qnx4_fs.h
 header-y += radeonfb.h
 header-y += raw.h
 header-y += resource.h
@@ -308,7 +309,6 @@ unifdef-y += poll.h
 unifdef-y += ppp_defs.h
 unifdef-y += ppp-comp.h
 unifdef-y += ptrace.h
-unifdef-y += qnx4_fs.h
 unifdef-y += quota.h
 unifdef-y += random.h
 unifdef-y += irqnr.h
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index acbaec3..8b9aee1 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -85,63 +85,4 @@ struct qnx4_super_block {
 	struct qnx4_inode_entry AltBoot;
 };
 
-#ifdef __KERNEL__
-
-#define QNX4_DEBUG 0
-
-#if QNX4_DEBUG
-#define QNX4DEBUG(X) printk X
-#else
-#define QNX4DEBUG(X) (void) 0
-#endif
-
-struct qnx4_sb_info {
-	struct buffer_head	*sb_buf;	/* superblock buffer */
-	struct qnx4_super_block	*sb;		/* our superblock */
-	unsigned int		Version;	/* may be useful */
-	struct qnx4_inode_entry	*BitMap;	/* useful */
-};
-
-struct qnx4_inode_info {
-	struct qnx4_inode_entry raw;
-	loff_t mmu_private;
-	struct inode vfs_inode;
-};
-
-extern struct inode *qnx4_iget(struct super_block *, unsigned long);
-extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
-extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
-extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
-
-extern struct buffer_head *qnx4_bread(struct inode *, int, int);
-
-extern const struct inode_operations qnx4_file_inode_operations;
-extern const struct inode_operations qnx4_dir_inode_operations;
-extern const struct file_operations qnx4_file_operations;
-extern const struct file_operations qnx4_dir_operations;
-extern int qnx4_is_free(struct super_block *sb, long block);
-extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
-extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
-extern void qnx4_truncate(struct inode *inode);
-extern void qnx4_free_inode(struct inode *inode);
-extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
-extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
-
-static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
-{
-	return container_of(inode, struct qnx4_inode_info, vfs_inode);
-}
-
-static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
-{
-	return &qnx4_i(inode)->raw;
-}
-
-#endif				/* __KERNEL__ */
-
 #endif
-- 
cgit v0.10.2


From b522412aeabadbb302fd4338eaabf09d10e2d29c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 13:44:36 -0400
Subject: Sanitize ->fsync() for FAT

* mark directory data blocks as assoc. metadata
* add new inode to deal with FAT, mark FAT blocks as assoc. metadata of that
* now ->fsync() is trivial both for files and directories

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603..f350029 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
-	.fsync		= file_fsync,
+	.fsync		= fat_file_fsync,
 };
 
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
 			de++;
 			nr_slots--;
 		}
-		mark_buffer_dirty(bh);
+		mark_buffer_dirty_inode(bh, dir);
 		if (IS_DIRSYNC(dir))
 			err = sync_dirty_buffer(bh);
 		brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 		de--;
 		nr_slots--;
 	}
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	if (IS_DIRSYNC(dir))
 		err = sync_dirty_buffer(bh);
 	brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
 		}
 		memset(bhs[n]->b_data, 0, sb->s_blocksize);
 		set_buffer_uptodate(bhs[n]);
-		mark_buffer_dirty(bhs[n]);
+		mark_buffer_dirty_inode(bhs[n], dir);
 
 		n++;
 		blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	de[0].size = de[1].size = 0;
 	memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
 	set_buffer_uptodate(bhs[0]);
-	mark_buffer_dirty(bhs[0]);
+	mark_buffer_dirty_inode(bhs[0], dir);
 
 	err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
 	if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
 			slots += copy;
 			size -= copy;
 			set_buffer_uptodate(bhs[n]);
-			mark_buffer_dirty(bhs[n]);
+			mark_buffer_dirty_inode(bhs[n], dir);
 			if (!size)
 				break;
 			n++;
@@ -1293,7 +1293,7 @@ found:
 		for (i = 0; i < long_bhs; i++) {
 			int copy = min_t(int, sb->s_blocksize - offset, size);
 			memcpy(bhs[i]->b_data + offset, slots, copy);
-			mark_buffer_dirty(bhs[i]);
+			mark_buffer_dirty_inode(bhs[i], dir);
 			offset = 0;
 			slots += copy;
 			size -= copy;
@@ -1304,7 +1304,7 @@ found:
 			/* Fill the short name slot. */
 			int copy = min_t(int, sb->s_blocksize - offset, size);
 			memcpy(bhs[i]->b_data + offset, slots, copy);
-			mark_buffer_dirty(bhs[i]);
+			mark_buffer_dirty_inode(bhs[i], dir);
 			if (IS_DIRSYNC(dir))
 				err = sync_dirty_buffer(bhs[i]);
 		}
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d6..e4d8852 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -74,6 +74,7 @@ struct msdos_sb_info {
 
 	int fatent_shift;
 	struct fatent_operations *fatent_ops;
+	struct inode *fat_inode;
 
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +252,7 @@ struct fat_entry {
 	} u;
 	int nr_bhs;
 	struct buffer_head *bhs[2];
+	struct inode *fat_inode;
 };
 
 static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +261,7 @@ static inline void fatent_init(struct fat_entry *fatent)
 	fatent->entry = 0;
 	fatent->u.ent32_p = NULL;
 	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
 }
 
 static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +278,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
 		brelse(fatent->bhs[i]);
 	fatent->nr_bhs = 0;
 	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
 }
 
 extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +300,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
 extern void fat_truncate(struct inode *inode);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
+extern int fat_file_fsync(struct file *file, struct dentry *dentry,
+			  int datasync);
 
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea4..618f530 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	struct buffer_head **bhs = fatent->bhs;
 
 	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
+
 	bhs[0] = sb_bread(sb, blocknr);
 	if (!bhs[0])
 		goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 
 	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
 		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
 	}
 	spin_unlock(&fat12_entry_lock);
 
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 	if (fatent->nr_bhs == 2)
-		mark_buffer_dirty(fatent->bhs[1]);
+		mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
 }
 
 static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
 		new = EOF_FAT16;
 
 	*fatent->u.ent16_p = cpu_to_le16(new);
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 
 static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
 	WARN_ON(new & 0xf0000000);
 	new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
 	*fatent->u.ent32_p = cpu_to_le32(new);
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 
 static int fat12_ent_next(struct fat_entry *fatent)
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
 			}
 			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 			set_buffer_uptodate(c_bh);
-			mark_buffer_dirty(c_bh);
+			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
 			if (sb->s_flags & MS_SYNCHRONOUS)
 				err = sync_dirty_buffer(c_bh);
 			brelse(c_bh);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9..e955a56 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -133,6 +133,18 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int res, err;
+
+	res = simple_fsync(filp, dentry, datasync);
+	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+
+	return res ? res : err;
+}
+
+
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -142,7 +154,7 @@ const struct file_operations fat_file_operations = {
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
-	.fsync		= file_fsync,
+	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2292cbf..476f80b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -458,6 +458,8 @@ static void fat_put_super(struct super_block *sb)
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
+	iput(sbi->fat_inode);
+
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
@@ -1183,7 +1185,7 @@ static int fat_read_root(struct inode *inode)
 int fat_fill_super(struct super_block *sb, void *data, int silent,
 		   const struct inode_operations *fs_dir_inode_ops, int isvfat)
 {
-	struct inode *root_inode = NULL;
+	struct inode *root_inode = NULL, *fat_inode = NULL;
 	struct buffer_head *bh;
 	struct fat_boot_sector *b;
 	struct msdos_sb_info *sbi;
@@ -1423,6 +1425,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	error = -ENOMEM;
+	fat_inode = new_inode(sb);
+	if (!fat_inode)
+		goto out_fail;
+	MSDOS_I(fat_inode)->i_pos = 0;
+	sbi->fat_inode = fat_inode;
 	root_inode = new_inode(sb);
 	if (!root_inode)
 		goto out_fail;
@@ -1448,6 +1455,8 @@ out_invalid:
 		       " on dev %s.\n", sb->s_id);
 
 out_fail:
+	if (fat_inode)
+		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
 	if (sbi->nls_io)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361..20f5228 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		int start = MSDOS_I(new_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		if (IS_DIRSYNC(new_dir)) {
 			err = sync_dirty_buffer(dotdot_bh);
 			if (err)
@@ -586,7 +586,7 @@ error_dotdot:
 		int start = MSDOS_I(old_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		corrupt |= sync_dirty_buffer(dotdot_bh);
 	}
 error_inode:
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3..b50ecbe 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		int start = MSDOS_I(new_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		if (IS_DIRSYNC(new_dir)) {
 			err = sync_dirty_buffer(dotdot_bh);
 			if (err)
@@ -1009,7 +1009,7 @@ error_dotdot:
 		int start = MSDOS_I(old_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		corrupt |= sync_dirty_buffer(dotdot_bh);
 	}
 error_inode:
-- 
cgit v0.10.2


From e1740a462ecb2eae213be15857b577cc6f6bb8b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:14:02 -0400
Subject: switch ext2 to simple_fsync()

kill ext2_sync_file() (along with ext2/fsync.c), get rid of
ext2_update_inode() - it's an alias of ext2_write_inode().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43..f42af45 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72..0035004 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042..b2bbf45 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
 
-/* fsync.c */
-extern int ext2_sync_file (struct file *, struct dentry *, int);
-
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
 extern void ext2_free_inode (struct inode *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed071..2b9e47d 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
 #endif
 
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93..0000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- *  linux/fs/ext2/fsync.c
- *
- *  Copyright (C) 1993  Stephen Tweedie (sct@dcs.ed.ac.uk)
- *  from
- *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
- *                      Laboratoire MASI - Institut Blaise Pascal
- *                      Universite Pierre et Marie Curie (Paris VI)
- *  from
- *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
- * 
- *  ext2fs fsync primitive
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- * 
- *  Removed unnecessary code duplication for little endian machines
- *  and excessive __inline__s. 
- *        Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include "ext2.h"
-#include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
-
-
-/*
- *	File may be NULL when we are called. Perhaps we shouldn't
- *	even pass file to fsync ?
- */
-
-int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ext2_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf6788..29ed682 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
-static int ext2_update_inode(struct inode * inode, int do_sync);
-
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
 	mark_inode_dirty(inode);
-	ext2_update_inode(inode, inode_needs_sync(inode));
+	ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1337,7 +1335,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-static int ext2_update_inode(struct inode * inode, int do_sync)
+int ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
 	return err;
 }
 
-int ext2_write_inode(struct inode *inode, int wait)
-{
-	return ext2_update_inode(inode, wait);
-}
-
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
-- 
cgit v0.10.2


From 0d7916d7e985da52cdd2989c900485e17b035972 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:21:06 -0400
Subject: switch minix to simple_fsync()

* get minix_write_inode() to honour the second argument
* now we can use simple_fsync() for minixfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4..e5f2064 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f6..3eec3e6 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
  *  minix regular file handling primitives
  */
 
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
  * We have mostly NULLs here: the current defaults are OK for
  * the minix filesystem.
  */
-int minix_sync_file(struct file *, struct dentry *, int);
-
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
-
-int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	
-	err |= minix_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7eb5397..f91a236 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -556,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static struct buffer_head *minix_update_inode(struct inode *inode)
-{
-	if (INODE_VERSION(inode) == MINIX_V1)
-		return V1_minix_update_inode(inode);
-	else
-		return V2_minix_update_inode(inode);
-}
-
-static int minix_write_inode(struct inode * inode, int wait)
-{
-	brelse(minix_update_inode(inode));
-	return 0;
-}
-
-int minix_sync_inode(struct inode * inode)
+static int minix_write_inode(struct inode *inode, int wait)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	bh = minix_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		bh = V1_minix_update_inode(inode);
+	else
+		bh = V2_minix_update_inode(inode);
+	if (!bh)
+		return -EIO;
+	if (wait && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
+			err = -EIO;
 		}
 	}
-	else if (!bh)
-		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b19..cb7fdd1 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
-extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
-extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
-- 
cgit v0.10.2


From 05459ca81ac3064cb040d983342bc453cccec458 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:29:45 -0400
Subject: repair sysv_write_inode(), switch sysv to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f6552..c779807 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t);
 const struct file_operations sysv_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21..96340c0 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
 	.truncate	= sysv_truncate,
 	.getattr	= sysv_getattr,
 };
-
-int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	
-	err |= sysv_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index a3f45fc..425c976 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -247,7 +247,7 @@ bad_inode:
 	return ERR_PTR(-EIO);
 }
 
-static struct buffer_head * sysv_update_inode(struct inode * inode)
+int sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -255,19 +255,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	struct sysv_inode * raw_inode;
 	struct sysv_inode_info * si;
 	unsigned int ino, block;
+	int err = 0;
 
 	ino = inode->i_ino;
 	if (!ino || ino > sbi->s_ninodes) {
 		printk("Bad inode number on dev %s: %d is out of range\n",
 		       inode->i_sb->s_id, ino);
-		return NULL;
+		return -EIO;
 	}
 	raw_inode = sysv_raw_inode(sb, ino, &bh);
 	if (!raw_inode) {
 		printk("unable to read i-node block\n");
-		return NULL;
+		return -EIO;
 	}
 
+	lock_kernel();
 	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
 	raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
 	raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -283,38 +285,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	for (block = 0; block < 10+1+1+1; block++)
 		write3byte(sbi, (u8 *)&si->i_data[block],
 			&raw_inode->i_data[3*block]);
+	unlock_kernel();
 	mark_buffer_dirty(bh);
-	return bh;
-}
-
-int sysv_write_inode(struct inode * inode, int wait)
-{
-	struct buffer_head *bh;
-	lock_kernel();
-	bh = sysv_update_inode(inode);
+	if (wait) {
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                        printk ("IO error syncing sysv inode [%s:%08x]\n",
+                                sb->s_id, ino);
+                        err = -EIO;
+                }
+        }
 	brelse(bh);
-	unlock_kernel();
 	return 0;
 }
 
-int sysv_sync_inode(struct inode * inode)
+int sysv_sync_inode(struct inode *inode)
 {
-        int err = 0;
-        struct buffer_head *bh;
-
-        bh = sysv_update_inode(inode);
-        if (bh && buffer_dirty(bh)) {
-                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                        printk ("IO error syncing sysv inode [%s:%08lx]\n",
-                                inode->i_sb->s_id, inode->i_ino);
-                        err = -1;
-                }
-        }
-        else if (!bh)
-                err = -1;
-        brelse (bh);
-        return err;
+	return sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a31..53786eb 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, int);
 extern int sysv_sync_inode(struct inode *);
-extern int sysv_sync_file(struct file *, struct dentry *, int);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int sysv_init_icache(void);
-- 
cgit v0.10.2


From a932801543fe74050ebee07fde082234c46b624f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:35:18 -0400
Subject: switch ufs to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b79..6f671f1 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
-	.fsync		= ufs_sync_file,
+	.fsync		= simple_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a16..73655c6 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
  */
 
 #include <linux/fs.h>
-#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
 
 #include "ufs_fs.h"
 #include "ufs.h"
 
-
-int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ufs_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
-
 /*
  * We have mostly NULL's here: the current defaults are ok for
  * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
-	.fsync		= ufs_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd..644e77e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 extern const struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
 extern const struct address_space_operations ufs_aops;
-extern int ufs_sync_file(struct file *, struct dentry *, int);
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
-- 
cgit v0.10.2


From 90de066443a8632bb42fed0a8216313d7da07aba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:40:27 -0400
Subject: switch udf to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f..eb880f6 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_UDF_FS) += udf.o
 
 udf-objs     := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
-		partition.o super.o truncate.o symlink.o fsync.o \
+		partition.o super.o truncate.o symlink.o \
 		directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5..61d9a76 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
 	.read			= generic_read_dir,
 	.readdir		= udf_readdir,
 	.ioctl			= udf_ioctl,
-	.fsync			= udf_fsync_file,
+	.fsync			= simple_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b..7464305 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
 	.write			= do_sync_write,
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
-	.fsync			= udf_fsync_file,
+	.fsync			= simple_fsync,
 	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b..0000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * fsync.c
- *
- * PURPOSE
- *  Fsync handling routines for the OSTA-UDF(tm) filesystem.
- *
- * COPYRIGHT
- *  This file is distributed under the terms of the GNU General Public
- *  License (GPL). Copies of the GPL can be obtained from:
- *      ftp://prep.ai.mit.edu/pub/gnu/GPL
- *  Each contributing author retains all rights to their own work.
- *
- *  (C) 1999-2001 Ben Fennema
- *  (C) 1999-2000 Stelias Computing Inc
- *
- * HISTORY
- *
- *  05/22/99 blf  Created.
- */
-
-#include "udfdecl.h"
-
-#include <linux/fs.h>
-
-static int udf_fsync_inode(struct inode *, int);
-
-/*
- *	File may be NULL when we are called. Perhaps we shouldn't
- *	even pass file to fsync ?
- */
-
-int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-
-	return udf_fsync_inode(inode, datasync);
-}
-
-static int udf_fsync_inode(struct inode *inode, int datasync)
-{
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= udf_sync_inode(inode);
-
-	return err ? -EIO : 0;
-}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b7..8d46f42 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
 			 uint32_t, int *);
 
-/* fsync.c */
-extern int udf_fsync_file(struct file *, struct dentry *, int);
-
 /* directory.c */
 extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
 						struct udf_fileident_bh *,
-- 
cgit v0.10.2


From bea6b64c277f0824cdaea6190209b26a164419d6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:44:50 -0400
Subject: switch omfs to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b233..d17e774 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
 #include <linux/mpage.h>
 #include "omfs.h"
 
-static int omfs_sync_file(struct file *file, struct dentry *dentry,
-		int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	err |= omfs_sync_inode(inode);
-	return err ? -EIO : 0;
-}
-
 static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
 {
 	return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = omfs_sync_file,
+	.fsync = simple_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
-- 
cgit v0.10.2


From ffdc9064f8b4fa9db37a7d5180f41cce2ea2b7ad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 00:44:42 -0400
Subject: repair adfs ->write_inode(), switch to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85db..a6665f3 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -53,6 +53,7 @@ struct adfs_dir_ops {
 	int	(*update)(struct adfs_dir *dir, struct object_info *obj);
 	int	(*create)(struct adfs_dir *dir, struct object_info *obj);
 	int	(*remove)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*sync)(struct adfs_dir *dir);
 	void	(*free)(struct adfs_dir *dir);
 };
 
@@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
-extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+			   int wait);
 
 /* file.c */
 extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf..4d40734 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -83,7 +83,7 @@ out:
 }
 
 int
-adfs_dir_update(struct super_block *sb, struct object_info *obj)
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
 	ret = ops->update(&dir, obj);
 	write_unlock(&adfs_dir_lock);
 
+	if (wait) {
+		int err = ops->sync(&dir);
+		if (!ret)
+			ret = err;
+	}
+
 	ops->free(&dir);
 out:
 #endif
@@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 };
 
 static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df21..31df6ad 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -437,6 +437,22 @@ bad_dir:
 #endif
 }
 
+static int
+adfs_f_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_f_free(struct adfs_dir *dir)
 {
@@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
 	.setpos		= adfs_f_setpos,
 	.getnext	= adfs_f_getnext,
 	.update		= adfs_f_update,
+	.sync		= adfs_f_sync,
 	.free		= adfs_f_free
 };
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e..139e0f3 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -161,6 +161,22 @@ out:
 	return ret;
 }
 
+static int
+adfs_fplus_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_fplus_free(struct adfs_dir *dir)
 {
@@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
 	.read		= adfs_fplus_read,
 	.setpos		= adfs_fplus_setpos,
 	.getnext	= adfs_fplus_getnext,
+	.sync		= adfs_fplus_sync,
 	.free		= adfs_fplus_free
 };
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c..8224d54 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200..05b3a67 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -376,7 +376,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int unused)
+int adfs_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused)
 	obj.attr	= ADFS_I(inode)->attr;
 	obj.size	= inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj);
+	ret = adfs_dir_update(sb, &obj, wait);
 	unlock_kernel();
 	return ret;
 }
-- 
cgit v0.10.2


From 224c886643e52e6b4c1143489cd0b289b6c03976 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 00:46:40 -0400
Subject: Fix adfs GET_FRAG_ID() on big-endian

Missing conversion to host-endian before doing shifts

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fb..568081b 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
 #define GET_FRAG_ID(_map,_start,_idmask)				\
 	({								\
 		unsigned char *_m = _map + (_start >> 3);		\
-		u32 _frag = get_unaligned((u32 *)_m);			\
+		u32 _frag = get_unaligned_le32(_m);			\
 		_frag >>= (_start & 7);					\
 		_frag & _idmask;					\
 	})
-- 
cgit v0.10.2


From 4427f0c36e22e2cd6696b2fe7643e9756a14b3d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 01:15:58 -0400
Subject: repair bfs_write_inode(), switch bfs to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b62..54bd07d 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 	.llseek		= generic_file_llseek,
 };
 
@@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 		inode->i_nlink = 1;
 	}
 	de->ino = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
 	inode->i_ctime = dir->i_ctime;
@@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		inode_dec_link_count(new_inode);
 	}
-	mark_buffer_dirty(old_bh);
+	mark_buffer_dirty_inode(old_bh, old_dir);
 	error = 0;
 
 end_rename:
@@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
 				for (i = 0; i < BFS_NAMELEN; i++)
 					de->name[i] =
 						(i < namelen) ? name[i] : 0;
-				mark_buffer_dirty(bh);
+				mark_buffer_dirty_inode(bh, dir);
 				brelse(bh);
 				return 0;
 			}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 3a9a136..d1d9d90 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -98,14 +98,15 @@ error:
 	return ERR_PTR(-EIO);
 }
 
-static int bfs_write_inode(struct inode *inode, int unused)
+static int bfs_write_inode(struct inode *inode, int wait)
 {
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
         unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
 	int block, off;
-	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+	int err = 0;
 
         dprintf("ino=%08x\n", ino);
 
@@ -146,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
 	brelse(bh);
 	mutex_unlock(&info->bfs_lock);
-	return 0;
+	return err;
 }
 
 static void bfs_delete_inode(struct inode *inode)
-- 
cgit v0.10.2


From c475879556a8602bbe2faa9a06f6e5fcc8c05bb2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 01:22:00 -0400
Subject: sanitize ->fsync() for affs

unfortunately, for affs (especially for affs directories) we have
no real way to keep track of metadata ownership.  So we have to
do more or less what file_fsync() does, but we do *not* need to
call write_super() there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3..e511dc6 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void		affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
+int		affs_file_fsync(struct file *, struct dentry *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904..8ca8f3a 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= affs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= affs_file_fsync,
 };
 
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4..184e55c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= affs_file_open,
 	.release	= affs_file_release,
-	.fsync		= file_fsync,
+	.fsync		= affs_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
 	}
 	affs_free_prealloc(inode);
 }
+
+int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	int ret, err;
+
+	ret = write_inode_now(inode, 0);
+	err = sync_blockdev(inode->i_sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
-- 
cgit v0.10.2


From e28964365faf3b9961695eb62b48cbc9f2a2a245 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:15 +0200
Subject: affs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.  Factor out common code
between affs_put_super, affs_write_super and the new affs_sync_fs into
a helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/affs/super.c b/fs/affs/super.c
index c481493..104fdcb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -25,6 +25,19 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
+affs_commit_super(struct super_block *sb, int clean)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	struct buffer_head *bh = sbi->s_root_bh;
+	struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
+
+	tail->bm_flag = cpu_to_be32(clean);
+	secs_to_datestamp(get_seconds(), &tail->disk_change);
+	affs_fix_checksum(sb, bh);
+	mark_buffer_dirty(bh);
+}
+
+static void
 affs_put_super(struct super_block *sb)
 {
 	struct affs_sb_info *sbi = AFFS_SB(sb);
@@ -32,13 +45,8 @@ affs_put_super(struct super_block *sb)
 
 	lock_kernel();
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1);
-		secs_to_datestamp(get_seconds(),
-				  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
-		affs_fix_checksum(sb, sbi->s_root_bh);
-		mark_buffer_dirty(sbi->s_root_bh);
-	}
+	if (!(sb->s_flags & MS_RDONLY))
+		affs_commit_super(sb, 1);
 
 	kfree(sbi->s_prefix);
 	affs_free_bitmap(sb);
@@ -53,18 +61,13 @@ static void
 affs_write_super(struct super_block *sb)
 {
 	int clean = 2;
-	struct affs_sb_info *sbi = AFFS_SB(sb);
 
 	lock_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		//	if (sbi->s_bitmap[i].bm_bh) {
 		//		if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
 		//			clean = 0;
-		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean);
-		secs_to_datestamp(get_seconds(),
-				  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
-		affs_fix_checksum(sb, sbi->s_root_bh);
-		mark_buffer_dirty(sbi->s_root_bh);
+		affs_commit_super(sb, clean);
 		sb->s_dirt = !clean;	/* redo until bitmap synced */
 	} else
 		sb->s_dirt = 0;
@@ -73,6 +76,16 @@ affs_write_super(struct super_block *sb)
 	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
 
+static int
+affs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	affs_commit_super(sb, 2);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+	return 0;
+}
+
 static struct kmem_cache * affs_inode_cachep;
 
 static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -130,6 +143,7 @@ static const struct super_operations affs_sops = {
 	.clear_inode	= affs_clear_inode,
 	.put_super	= affs_put_super,
 	.write_super	= affs_write_super,
+	.sync_fs	= affs_sync_fs,
 	.statfs		= affs_statfs,
 	.remount_fs	= affs_remount,
 	.show_options	= generic_show_options,
-- 
cgit v0.10.2


From 561e47ce7244168788d4ecef9a2271df204b3c89 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:38 +0200
Subject: bfs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d1d9d90..6f60336 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -216,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
+static int bfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct bfs_sb_info *info = BFS_SB(sb);
+
+	mutex_lock(&info->bfs_lock);
+	mark_buffer_dirty(info->si_sbh);
+	sb->s_dirt = 0;
+	mutex_unlock(&info->bfs_lock);
+
+	return 0;
+}
+
+static void bfs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		bfs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
+}
+
 static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
@@ -254,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static void bfs_write_super(struct super_block *s)
-{
-	struct bfs_sb_info *info = BFS_SB(s);
-
-	mutex_lock(&info->bfs_lock);
-	if (!(s->s_flags & MS_RDONLY))
-		mark_buffer_dirty(info->si_sbh);
-	s->s_dirt = 0;
-	mutex_unlock(&info->bfs_lock);
-}
-
 static struct kmem_cache *bfs_inode_cachep;
 
 static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -312,6 +321,7 @@ static const struct super_operations bfs_sops = {
 	.delete_inode	= bfs_delete_inode,
 	.put_super	= bfs_put_super,
 	.write_super	= bfs_write_super,
+	.sync_fs	= bfs_sync_fs,
 	.statfs		= bfs_statfs,
 };
 
-- 
cgit v0.10.2


From 80e09fb942d38beb19dcffbeb14d496beeb0a989 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:58 +0200
Subject: exofs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 49e16af..8216c5b 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -200,18 +200,18 @@ static const struct export_operations exofs_export_ops;
 /*
  * Write the superblock to the OSD
  */
-static void exofs_write_super(struct super_block *sb)
+static int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct osd_request *or;
 	struct osd_obj_id obj;
-	int ret;
+	int ret = -ENOMEM;
 
 	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
 	if (!fscb) {
 		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return;
+		return -ENOMEM;
 	}
 
 	lock_super(sb);
@@ -249,6 +249,15 @@ out:
 	unlock_kernel();
 	unlock_super(sb);
 	kfree(fscb);
+	return ret;
+}
+
+static void exofs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		exofs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 /*
@@ -493,6 +502,7 @@ static const struct super_operations exofs_sops = {
 	.delete_inode   = exofs_delete_inode,
 	.put_super      = exofs_put_super,
 	.write_super    = exofs_write_super,
+	.sync_fs	= exofs_sync_fs,
 	.statfs         = exofs_statfs,
 };
 
-- 
cgit v0.10.2


From 40f31dd47e7c3d15af1f9845eda0fa0c4c33f32f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:17 +0200
Subject: ext2: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f8cbdf5..4589996 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -309,6 +310,7 @@ static const struct super_operations ext2_sops = {
 	.delete_inode	= ext2_delete_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
+	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
@@ -1132,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * set s_state to EXT2_VALID_FS after some corrections.
  */
 
-void ext2_write_super (struct super_block * sb)
+static int ext2_sync_fs(struct super_block *sb, int wait)
 {
-	struct ext2_super_block * es;
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
 	lock_kernel();
-	if (!(sb->s_flags & MS_RDONLY)) {
-		es = EXT2_SB(sb)->s_es;
-
-		if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
-			ext2_debug ("setting valid to 0\n");
-			es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
-			es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
-			es->s_mtime = cpu_to_le32(get_seconds());
-			ext2_sync_super(sb, es);
-		} else
-			ext2_commit_super (sb, es);
+	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+		ext2_debug("setting valid to 0\n");
+		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+		es->s_free_blocks_count =
+			cpu_to_le32(ext2_count_free_blocks(sb));
+		es->s_free_inodes_count =
+			cpu_to_le32(ext2_count_free_inodes(sb));
+		es->s_mtime = cpu_to_le32(get_seconds());
+		ext2_sync_super(sb, es);
+	} else {
+		ext2_commit_super(sb, es);
 	}
 	sb->s_dirt = 0;
 	unlock_kernel();
+
+	return 0;
+}
+
+
+void ext2_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ext2_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int ext2_remount (struct super_block * sb, int * flags, char * data)
-- 
cgit v0.10.2


From f83d6d46e7adf241a064a4a425e5cd8a8fd8925f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:35 +0200
Subject: fat: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 476f80b..51a5ecf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -449,6 +449,16 @@ static void fat_write_super(struct super_block *sb)
 	unlock_super(sb);
 }
 
+static int fat_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	fat_clusters_flush(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
+}
+
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -643,6 +653,7 @@ static const struct super_operations fat_sops = {
 	.delete_inode	= fat_delete_inode,
 	.put_super	= fat_put_super,
 	.write_super	= fat_write_super,
+	.sync_fs	= fat_sync_fs,
 	.statfs		= fat_statfs,
 	.clear_inode	= fat_clear_inode,
 	.remount_fs	= fat_remount,
-- 
cgit v0.10.2


From 58bc5bbb873eb5d86126a3fd3ff02aaa69ec15d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:54 +0200
Subject: hfs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3aac417..6f833dc 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -58,6 +58,16 @@ static void hfs_write_super(struct super_block *sb)
 	unlock_super(sb);
 }
 
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	hfs_mdb_commit(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
+}
+
 /*
  * hfs_put_super()
  *
@@ -172,6 +182,7 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode	= hfs_clear_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
+	.sync_fs	= hfs_sync_fs,
 	.statfs		= hfs_statfs,
 	.remount_fs     = hfs_remount,
 	.show_options	= hfs_show_options,
-- 
cgit v0.10.2


From 7fbc6df0e7a561a313f49faa77829d5de45a97f8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:05:12 +0200
Subject: hfsplus: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1aab8aa..9fc3af0c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -152,7 +152,7 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static void hfsplus_write_super(struct super_block *sb)
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -160,9 +160,6 @@ static void hfsplus_write_super(struct super_block *sb)
 
 	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		/* warn? */
-		goto out;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -194,8 +191,16 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
- out:
 	unlock_super(sb);
+	return 0;
+}
+
+static void hfsplus_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		hfsplus_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -290,6 +295,7 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode	= hfsplus_clear_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
+	.sync_fs	= hfsplus_sync_fs,
 	.statfs		= hfsplus_statfs,
 	.remount_fs	= hfsplus_remount,
 	.show_options	= hfsplus_show_options,
-- 
cgit v0.10.2


From ad43ffdeea0a7bd3e6036c4aeec2e6699aef8ac7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:07:45 +0200
Subject: sysv: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 425c976..4799234 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -31,16 +31,13 @@
 #include <asm/byteorder.h>
 #include "sysv.h"
 
-/* This is only called on sync() and umount(), when s_dirt=1. */
-static void sysv_write_super(struct super_block *sb)
+static int sysv_sync_fs(struct super_block *sb, int wait)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
 	lock_super(sb);
 	lock_kernel();
-	if (sb->s_flags & MS_RDONLY)
-		goto clean;
 
 	/*
 	 * If we are going to write out the super block,
@@ -54,10 +51,19 @@ static void sysv_write_super(struct super_block *sb)
 		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
 		mark_buffer_dirty(sbi->s_bh2);
 	}
-clean:
-	sb->s_dirt = 0;
+
 	unlock_kernel();
 	unlock_super(sb);
+
+	return 0;
+}
+
+static void sysv_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		sysv_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
@@ -345,6 +351,7 @@ const struct super_operations sysv_sops = {
 	.delete_inode	= sysv_delete_inode,
 	.put_super	= sysv_put_super,
 	.write_super	= sysv_write_super,
+	.sync_fs	= sysv_sync_fs,
 	.remount_fs	= sysv_remount,
 	.statfs		= sysv_statfs,
 };
-- 
cgit v0.10.2


From 8c8006564a58d0ea912bf7f2d0a758d97e4b464f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:05 +0200
Subject: ufs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6560dda..5faed79 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1125,7 +1125,7 @@ failed_nomem:
 	return -ENOMEM;
 }
 
-static void ufs_write_super(struct super_block *sb)
+static int ufs_sync_fs(struct super_block *sb, int wait)
 {
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -1134,25 +1134,36 @@ static void ufs_write_super(struct super_block *sb)
 
 	lock_super(sb);
 	lock_kernel();
+
 	UFSD("ENTER\n");
+
 	flags = UFS_SB(sb)->s_flags;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	usb3 = ubh_get_usb_third(uspi);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
-		if ((flags & UFS_ST_MASK) == UFS_ST_SUN 
-		  || (flags & UFS_ST_MASK) == UFS_ST_SUNOS
-		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-			ufs_set_fs_state(sb, usb1, usb3,
-					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ufs_put_cstotal(sb);
-	}
+	usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+	if ((flags & UFS_ST_MASK) == UFS_ST_SUN  ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		ufs_set_fs_state(sb, usb1, usb3,
+				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
+	ufs_put_cstotal(sb);
 	sb->s_dirt = 0;
+
 	UFSD("EXIT\n");
 	unlock_kernel();
 	unlock_super(sb);
+
+	return 0;
+}
+
+static void ufs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ufs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void ufs_put_super(struct super_block *sb)
@@ -1381,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
 	.delete_inode	= ufs_delete_inode,
 	.put_super	= ufs_put_super,
 	.write_super	= ufs_write_super,
+	.sync_fs	= ufs_sync_fs,
 	.statfs		= ufs_statfs,
 	.remount_fs	= ufs_remount,
 	.show_options   = ufs_show_options,
-- 
cgit v0.10.2


From d579ed00aa96a7f7486978540a0d7cecaff742ae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:21 +0200
Subject: jffs2: call jffs2_write_super from jffs2_sync_fs

The call to ->write_super from __sync_filesystem will go away, so make
sure jffs2 performs the same actions from inside ->sync_fs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f7bfd3a..07a22ca 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -74,6 +74,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
+	jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
-- 
cgit v0.10.2


From d731e06323cb705003e4172ec209e469be4c18e1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:36 +0200
Subject: nilfs2: call nilfs2_write_super from nilfs2_sync_fs

The call to ->write_super from __sync_filesystem will go away, so make
sure nilfs2 performs the same actions from inside ->sync_fs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 11151ea..122dc1e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -391,6 +391,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	int err = 0;
 
+	nilfs_write_super(sb);
+
 	/* This function is called when super block should be written back */
 	if (wait)
 		err = nilfs_construct_segment(sb);
-- 
cgit v0.10.2


From 0c95ee190e1dea60c55c834d14695341085c9b7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:54 +0200
Subject: remove the call to ->write_super in __sync_filesystem

Now that all filesystems provide ->sync_fs methods we can change
__sync_filesystem to only call ->sync_fs.

This gives us a clear separation between periodic writeouts which
are driven by ->write_super and data integrity syncs that go
through ->sync_fs. (modulo file_fsync which is also going away)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/sync.c b/fs/sync.c
index e9d56f6..dd20002 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,8 +33,6 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	else
 		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
-- 
cgit v0.10.2


From 81fc20bd0e75ba6357bce2403767d7c2585d8f28 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:28 +0900
Subject: nilfs2: remove meaningless EBUSY case from nilfs_get_sb function

The following EBUSY case in nilfs_get_sb() is meaningless.  Indeed,
this error code is never returned to the caller.

    if (!s->s_root) {
          ...
    } else if (!(s->s_flags & MS_RDONLY)) {
        err = -EBUSY;
    }

This simply removes the else case.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 122dc1e..1c505d0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1186,8 +1186,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 		s->s_flags |= MS_ACTIVE;
 		need_to_close = 0;
-	} else if (!(s->s_flags & MS_RDONLY)) {
-		err = -EBUSY;
 	}
 
 	up(&sd.bdev->bd_mount_sem);
-- 
cgit v0.10.2


From 33c8e57c86d1bd1548c12a4f7c4bceb94b862cca Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:29 +0900
Subject: nilfs2: get rid of sget use for acquiring nilfs object

This will change the way to obtain nilfs object in nilfs_get_sb()
function.

Previously, a preliminary sget() call was performed, and the nilfs
object was acquired from a super block instance found by the sget()
call.

This patch, instead, instroduces a new dedicated function
find_or_create_nilfs(); as the name implies, the function finds an
existent nilfs object from a global list or creates a new one if no
object is found on the device.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1c505d0..3c9833e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1061,13 +1061,6 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data)
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
 	struct nilfs_super_data *sd = data;
-
-	return s->s_bdev == sd->bdev;
-}
-
-static int nilfs_test_bdev_super2(struct super_block *s, void *data)
-{
-	struct nilfs_super_data *sd = data;
 	int ret;
 
 	if (s->s_bdev != sd->bdev)
@@ -1096,8 +1089,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	     const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	struct nilfs_super_data sd;
-	struct super_block *s, *s2;
-	struct the_nilfs *nilfs = NULL;
+	struct super_block *s;
+	struct the_nilfs *nilfs;
 	int err, need_to_close = 1;
 
 	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1118,11 +1111,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
+	nilfs = find_or_create_nilfs(sd.bdev);
+	if (!nilfs) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
 	down(&sd.bdev->bd_mount_sem);
 	if (!sd.cno &&
 	    (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
@@ -1131,51 +1125,22 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	/*
-	 * Phase-1: search any existent instance and get the_nilfs
+	 * Search specified snapshot or R/W mode super_block
 	 */
-	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-	if (IS_ERR(s))
-		goto error_s;
-
-	if (!s->s_root) {
-		err = -ENOMEM;
-		nilfs = alloc_nilfs(sd.bdev);
-		if (!nilfs)
-			goto cancel_new;
-	} else {
-		struct nilfs_sb_info *sbi = NILFS_SB(s);
+	if (!sd.cno)
+		/* trying to get the latest checkpoint.  */
+		sd.cno = nilfs_last_cno(nilfs);
 
-		/*
-		 * s_umount protects super_block from unmount process;
-		 * It covers pointers of nilfs_sb_info and the_nilfs.
-		 */
-		nilfs = sbi->s_nilfs;
-		get_nilfs(nilfs);
-		up_write(&s->s_umount);
-
-		/*
-		 * Phase-2: search specified snapshot or R/W mode super_block
-		 */
-		if (!sd.cno)
-			/* trying to get the latest checkpoint.  */
-			sd.cno = nilfs_last_cno(nilfs);
-
-		s2 = sget(fs_type, nilfs_test_bdev_super2,
-			  nilfs_set_bdev_super, &sd);
-		deactivate_super(s);
-		/*
-		 * Although deactivate_super() invokes close_bdev_exclusive() at
-		 * kill_block_super().  Here, s is an existent mount; we need
-		 * one more close_bdev_exclusive() call.
-		 */
-		s = s2;
-		if (IS_ERR(s))
-			goto error_s;
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+	if (IS_ERR(s)) {
+		err = PTR_ERR(s);
+		goto failed_unlock;
 	}
 
 	if (!s->s_root) {
 		char b[BDEVNAME_SIZE];
 
+		/* New superblock instance created */
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(sd.bdev));
@@ -1195,15 +1160,9 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	simple_set_mnt(mnt, s);
 	return 0;
 
- error_s:
-	up(&sd.bdev->bd_mount_sem);
-	if (nilfs)
-		put_nilfs(nilfs);
-	close_bdev_exclusive(sd.bdev, flags);
-	return PTR_ERR(s);
-
  failed_unlock:
 	up(&sd.bdev->bd_mount_sem);
+	put_nilfs(nilfs);
  failed:
 	close_bdev_exclusive(sd.bdev, flags);
 
@@ -1212,8 +1171,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
  cancel_new:
 	/* Abandoning the newly allocated superblock */
 	up(&sd.bdev->bd_mount_sem);
-	if (nilfs)
-		put_nilfs(nilfs);
+	put_nilfs(nilfs);
 	up_write(&s->s_umount);
 	deactivate_super(s);
 	/*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a91f15b..45dbf6a 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,6 +35,10 @@
 #include "seglist.h"
 #include "segbuf.h"
 
+
+static LIST_HEAD(nilfs_objects);
+static DEFINE_SPINLOCK(nilfs_lock);
+
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
 			    sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -55,7 +59,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
  * Return Value: On success, pointer to the_nilfs is returned.
  * On error, NULL is returned.
  */
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
 	struct the_nilfs *nilfs;
 
@@ -69,6 +73,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	mutex_init(&nilfs->ns_writer_mutex);
+	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_gc_inodes_h = NULL;
@@ -78,6 +83,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 }
 
 /**
+ * find_or_create_nilfs - find or create nilfs object
+ * @bdev: block device to which the_nilfs is related
+ *
+ * find_nilfs() looks up an existent nilfs object created on the
+ * device and gets the reference count of the object.  If no nilfs object
+ * is found on the device, a new nilfs object is allocated.
+ *
+ * Return Value: On success, pointer to the nilfs object is returned.
+ * On error, NULL is returned.
+ */
+struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
+{
+	struct the_nilfs *nilfs, *new = NULL;
+
+ retry:
+	spin_lock(&nilfs_lock);
+	list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
+		if (nilfs->ns_bdev == bdev) {
+			get_nilfs(nilfs);
+			spin_unlock(&nilfs_lock);
+			if (new)
+				put_nilfs(new);
+			return nilfs; /* existing object */
+		}
+	}
+	if (new) {
+		list_add_tail(&new->ns_list, &nilfs_objects);
+		spin_unlock(&nilfs_lock);
+		return new; /* new object */
+	}
+	spin_unlock(&nilfs_lock);
+
+	new = alloc_nilfs(bdev);
+	if (new)
+		goto retry;
+	return NULL; /* insufficient memory */
+}
+
+/**
  * put_nilfs - release a reference to the_nilfs
  * @nilfs: the_nilfs structure to be released
  *
@@ -86,13 +130,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
  */
 void put_nilfs(struct the_nilfs *nilfs)
 {
-	if (!atomic_dec_and_test(&nilfs->ns_count))
+	spin_lock(&nilfs_lock);
+	if (!atomic_dec_and_test(&nilfs->ns_count)) {
+		spin_unlock(&nilfs_lock);
 		return;
+	}
+	list_del_init(&nilfs->ns_list);
+	spin_unlock(&nilfs_lock);
+
 	/*
-	 * Increment of ns_count never occur below because the caller
+	 * Increment of ns_count never occurs below because the caller
 	 * of get_nilfs() holds at least one reference to the_nilfs.
 	 * Thus its exclusion control is not required here.
 	 */
+
 	might_sleep();
 	if (nilfs_loaded(nilfs)) {
 		nilfs_mdt_clear(nilfs->ns_sufile);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe587..116caf9 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,6 +43,7 @@ enum {
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
  * @ns_count: reference count
+ * @ns_list: list head for nilfs_list
  * @ns_bdev: block device
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
@@ -88,6 +89,7 @@ enum {
 struct the_nilfs {
 	unsigned long		ns_flags;
 	atomic_t		ns_count;
+	struct list_head	ns_list;
 
 	struct block_device    *ns_bdev;
 	struct backing_dev_info *ns_bdi;
@@ -191,7 +193,7 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 #define NILFS_ALTSB_FREQ	60  /* spare superblock */
 
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *);
+struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
-- 
cgit v0.10.2


From 3f82ff55168e92859119bf348e9e0bd6714d2fea Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:30 +0900
Subject: nilfs2: get rid of sget use for checking if current mount is present

This stops using sget() for checking if an r/w-mount or an r/o-mount
exists on the device.  This elimination uses a back pointer to the
current mount added to nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3c9833e..5a8c5e4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,8 +67,6 @@ MODULE_LICENSE("GPL");
 
 static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
-static int test_exclusive_mount(struct file_system_type *fs_type,
-				struct block_device *bdev, int flags);
 
 /**
  * nilfs_error() - report failure condition on a filesystem
@@ -329,6 +327,10 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_commit_super(sbi, 1);
 		up_write(&nilfs->ns_sem);
 	}
+	down_write(&nilfs->ns_sem);
+	if (nilfs->ns_current == sbi)
+		nilfs->ns_current = NULL;
+	up_write(&nilfs->ns_sem);
 
 	nilfs_detach_checkpoint(sbi);
 	put_nilfs(sbi->s_nilfs);
@@ -880,6 +882,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_root;
 	}
 
+	down_write(&nilfs->ns_sem);
+	if (!nilfs_test_opt(sbi, SNAPSHOT))
+		nilfs->ns_current = sbi;
+	up_write(&nilfs->ns_sem);
+
 	return 0;
 
  failed_root:
@@ -958,14 +965,16 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * by fsck since we originally mounted the partition.)
 		 */
 		down(&sb->s_bdev->bd_mount_sem);
-		/* Check existing RW-mount */
-		if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
+		down_read(&nilfs->ns_sem);
+		if (nilfs->ns_current && nilfs->ns_current != sbi) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because a RW-mount exists.\n",
+			       "remount because an RW-mount exists.\n",
 			       sb->s_id);
+			up_read(&nilfs->ns_sem);
 			err = -EBUSY;
 			goto rw_remount_failed;
 		}
+		up_read(&nilfs->ns_sem);
 		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because the current RO-mount is not "
@@ -984,6 +993,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
+		nilfs->ns_current = sbi;
 		up_write(&nilfs->ns_sem);
 
 		up(&sb->s_bdev->bd_mount_sem);
@@ -1118,10 +1128,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	down(&sd.bdev->bd_mount_sem);
-	if (!sd.cno &&
-	    (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
-		err = (err < 0) ? : -EBUSY;
-		goto failed_unlock;
+
+	if (!sd.cno) {
+		/*
+		 * Check if an exclusive mount exists or not.
+		 * Snapshot mounts coexist with a current mount
+		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
+		 * ro-mount are mutually exclusive.
+		 */
+		down_read(&nilfs->ns_sem);
+		if (nilfs->ns_current &&
+		    ((nilfs->ns_current->s_super->s_flags ^ flags)
+		     & MS_RDONLY)) {
+			up_read(&nilfs->ns_sem);
+			err = -EBUSY;
+			goto failed_unlock;
+		}
+		up_read(&nilfs->ns_sem);
 	}
 
 	/*
@@ -1182,57 +1205,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	return err;
 }
 
-static int nilfs_test_bdev_super3(struct super_block *s, void *data)
-{
-	struct nilfs_super_data *sd = data;
-	int ret;
-
-	if (s->s_bdev != sd->bdev)
-		return 0;
-	if (down_read_trylock(&s->s_umount)) {
-		ret = (s->s_flags & MS_RDONLY) && s->s_root &&
-			nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
-		up_read(&s->s_umount);
-		if (ret)
-			return 0; /* ignore snapshot mounts */
-	}
-	return !((sd->flags ^ s->s_flags) & MS_RDONLY);
-}
-
-static int __false_bdev_super(struct super_block *s, void *data)
-{
-#if 0 /* XXX: workaround for lock debug. This is not good idea */
-	up_write(&s->s_umount);
-#endif
-	return -EFAULT;
-}
-
-/**
- * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
- * fs_type: filesystem type
- * bdev: block device
- * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
- * res: pointer to an integer to store result
- *
- * This function must be called within a section protected by bd_mount_mutex.
- */
-static int test_exclusive_mount(struct file_system_type *fs_type,
-				struct block_device *bdev, int flags)
-{
-	struct super_block *s;
-	struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
-
-	s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
-	if (IS_ERR(s)) {
-		if (PTR_ERR(s) != -EFAULT)
-			return PTR_ERR(s);
-		return 0;  /* Not found */
-	}
-	up_write(&s->s_umount);
-	deactivate_super(s);
-	return 1;  /* Found */
-}
-
 struct file_system_type nilfs_fs_type = {
 	.owner    = THIS_MODULE,
 	.name     = "nilfs2",
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 116caf9..99f7e29 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -50,6 +50,7 @@ enum {
  * @ns_sem: semaphore for shared states
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
  * @ns_sbwtime: previous write time of super blocks
@@ -98,6 +99,8 @@ struct the_nilfs {
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
+	struct nilfs_sb_info   *ns_current;
+
 	/*
 	 * used for
 	 * - loading the latest checkpoint exclusively.
-- 
cgit v0.10.2


From 6dd4740662405a68bb229ac2b9e0aeaaf2188bf2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:31 +0900
Subject: nilfs2: simplify remaining sget() use

This simplifies the test function passed on the remaining sget()
callsite in nilfs.

Instead of checking mount type (i.e. ro-mount/rw-mount/snapshot mount)
in the test function passed to sget(), this patch first looks up the
nilfs_sb_info struct which the given mount type matches, and then
acquires the super block instance holding the nilfs_sb_info.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4f..0776ccc 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
 	struct list_head s_list;	/* list head for nilfs->ns_supers */
+	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5a8c5e4..1d1b6e1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -336,7 +336,7 @@ static void nilfs_put_super(struct super_block *sb)
 	put_nilfs(sbi->s_nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	kfree(sbi);
+	nilfs_put_sbinfo(sbi);
 
 	unlock_kernel();
 }
@@ -785,6 +785,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	get_nilfs(nilfs);
 	sbi->s_nilfs = nilfs;
 	sbi->s_super = sb;
+	atomic_set(&sbi->s_count, 1);
 
 	err = init_nilfs(nilfs, sbi, (char *)data);
 	if (err)
@@ -902,7 +903,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
  failed_sbi:
 	put_nilfs(nilfs);
 	sb->s_fs_info = NULL;
-	kfree(sbi);
+	nilfs_put_sbinfo(sbi);
 	return err;
 }
 
@@ -1014,6 +1015,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 struct nilfs_super_data {
 	struct block_device *bdev;
+	struct nilfs_sb_info *sbi;
 	__u64 cno;
 	int flags;
 };
@@ -1071,27 +1073,8 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data)
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
 	struct nilfs_super_data *sd = data;
-	int ret;
-
-	if (s->s_bdev != sd->bdev)
-		return 0;
-
-	if (!((s->s_flags | sd->flags) & MS_RDONLY))
-		return 1; /* Reuse an old R/W-mode super_block */
-
-	if (s->s_flags & sd->flags & MS_RDONLY) {
-		if (down_read_trylock(&s->s_umount)) {
-			ret = s->s_root &&
-				(sd->cno == NILFS_SB(s)->s_snapshot_cno);
-			up_read(&s->s_umount);
-			/*
-			 * This path is locked with sb_lock by sget().
-			 * So, drop_super() causes deadlock.
-			 */
-			return ret;
-		}
-	}
-	return 0;
+
+	return sd->sbi && s->s_fs_info == (void *)sd->sbi;
 }
 
 static int
@@ -1112,7 +1095,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 * much more information than normal filesystems to identify mount
 	 * instance.  For snapshot mounts, not only a mount type (ro-mount
 	 * or rw-mount) but also a checkpoint number is required.
-	 * The results are passed in sget() using nilfs_super_data.
 	 */
 	sd.cno = 0;
 	sd.flags = flags;
@@ -1148,13 +1130,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	/*
-	 * Search specified snapshot or R/W mode super_block
+	 * Find existing nilfs_sb_info struct
 	 */
+	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
+
 	if (!sd.cno)
 		/* trying to get the latest checkpoint.  */
 		sd.cno = nilfs_last_cno(nilfs);
 
+	/*
+	 * Get super block instance holding the nilfs_sb_info struct.
+	 * A new instance is allocated if no existing mount is present or
+	 * existing instance has been unmounted.
+	 */
 	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+	if (sd.sbi)
+		nilfs_put_sbinfo(sd.sbi);
+
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
 		goto failed_unlock;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 45dbf6a..221953b 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -664,6 +664,56 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
 	return ret;
 }
 
+/**
+ * nilfs_find_sbinfo - find existing nilfs_sb_info structure
+ * @nilfs: nilfs object
+ * @rw_mount: mount type (non-zero value for read/write mount)
+ * @cno: checkpoint number (zero for read-only mount)
+ *
+ * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
+ * @rw_mount and @cno (in case of snapshots) matched.  If no instance
+ * was found, NULL is returned.  Although the super block instance can
+ * be unmounted after this function returns, the nilfs_sb_info struct
+ * is kept on memory until nilfs_put_sbinfo() is called.
+ */
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
+					int rw_mount, __u64 cno)
+{
+	struct nilfs_sb_info *sbi;
+
+	down_read(&nilfs->ns_sem);
+	/*
+	 * The SNAPSHOT flag and sb->s_flags are supposed to be
+	 * protected with nilfs->ns_sem.
+	 */
+	sbi = nilfs->ns_current;
+	if (rw_mount) {
+		if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
+			goto found; /* read/write mount */
+		else
+			goto out;
+	} else if (cno == 0) {
+		if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
+			goto found; /* read-only mount */
+		else
+			goto out;
+	}
+
+	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
+		if (nilfs_test_opt(sbi, SNAPSHOT) &&
+		    sbi->s_snapshot_cno == cno)
+			goto found; /* snapshot mount */
+	}
+ out:
+	up_read(&nilfs->ns_sem);
+	return NULL;
+
+ found:
+	atomic_inc(&sbi->s_count);
+	up_read(&nilfs->ns_sem);
+	return sbi;
+}
+
 int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 				int snapshot_mount)
 {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 99f7e29..be4c040 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -201,6 +201,7 @@ void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -243,6 +244,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	mutex_unlock(&nilfs->ns_writer_mutex);
 }
 
+static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
+{
+	if (!atomic_dec_and_test(&sbi->s_count))
+		kfree(sbi);
+}
+
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
 			sector_t *seg_start, sector_t *seg_end)
-- 
cgit v0.10.2


From e59399d0102c1813cec48db5cebe1750313f88a0 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:32 +0900
Subject: nilfs2: correct exclusion control in nilfs_remount function

nilfs_remount() changes mount state of a superblock instance.  Even
though nilfs accesses other superblock instances during mount or
remount, the mount state was not properly protected in
nilfs_remount().

Moreover, nilfs_remount() has a lock order reversal problem;
nilfs_get_sb() holds:

  1. bdev->bd_mount_sem
  2. sb->s_umount  (sget acquires)

and nilfs_remount() holds:

  1. sb->s_umount  (locked by the caller in vfs)
  2. bdev->bd_mount_sem

To avoid these problems, this patch divides a semaphore protecting
super block instances from nilfs->ns_sem, and applies it to the mount
state protection in nilfs_remount().

With this change, bd_mount_sem use is removed from nilfs_remount() and
the lock order reversal will be resolved.  And the new rw-semaphore,
nilfs->ns_super_sem will properly protect the mount state except the
modification from nilfs_error function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1d1b6e1..f02762f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -327,10 +327,10 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_commit_super(sbi, 1);
 		up_write(&nilfs->ns_sem);
 	}
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	if (nilfs->ns_current == sbi)
 		nilfs->ns_current = NULL;
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	nilfs_detach_checkpoint(sbi);
 	put_nilfs(sbi->s_nilfs);
@@ -408,9 +408,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	struct buffer_head *bh_cp;
 	int err;
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_add(&sbi->s_list, &nilfs->ns_supers);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	sbi->s_ifile = nilfs_mdt_new(
 		nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -448,9 +448,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	return err;
 }
@@ -462,9 +462,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
 	nilfs_mdt_clear(sbi->s_ifile);
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 }
 
 static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -883,10 +883,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_root;
 	}
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
 		nilfs->ns_current = sbi;
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	return 0;
 
@@ -918,6 +918,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 	lock_kernel();
 
+	down_write(&nilfs->ns_super_sem);
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -965,24 +966,20 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
-		down(&sb->s_bdev->bd_mount_sem);
-		down_read(&nilfs->ns_sem);
 		if (nilfs->ns_current && nilfs->ns_current != sbi) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because an RW-mount exists.\n",
 			       sb->s_id);
-			up_read(&nilfs->ns_sem);
 			err = -EBUSY;
-			goto rw_remount_failed;
+			goto restore_opts;
 		}
-		up_read(&nilfs->ns_sem);
 		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because the current RO-mount is not "
 			       "the latest one.\n",
 			       sb->s_id);
 			err = -EINVAL;
-			goto rw_remount_failed;
+			goto restore_opts;
 		}
 		sb->s_flags &= ~MS_RDONLY;
 		nilfs_clear_opt(sbi, SNAPSHOT);
@@ -990,25 +987,24 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 		err = nilfs_attach_segment_constructor(sbi);
 		if (err)
-			goto rw_remount_failed;
+			goto restore_opts;
 
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
-		nilfs->ns_current = sbi;
 		up_write(&nilfs->ns_sem);
 
-		up(&sb->s_bdev->bd_mount_sem);
+		nilfs->ns_current = sbi;
 	}
  out:
+	up_write(&nilfs->ns_super_sem);
 	unlock_kernel();
 	return 0;
 
- rw_remount_failed:
-	up(&sb->s_bdev->bd_mount_sem);
  restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	up_write(&nilfs->ns_super_sem);
 	unlock_kernel();
 	return err;
 }
@@ -1118,15 +1114,15 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
 		 * ro-mount are mutually exclusive.
 		 */
-		down_read(&nilfs->ns_sem);
+		down_read(&nilfs->ns_super_sem);
 		if (nilfs->ns_current &&
 		    ((nilfs->ns_current->s_super->s_flags ^ flags)
 		     & MS_RDONLY)) {
-			up_read(&nilfs->ns_sem);
+			up_read(&nilfs->ns_super_sem);
 			err = -EBUSY;
 			goto failed_unlock;
 		}
-		up_read(&nilfs->ns_sem);
+		up_read(&nilfs->ns_super_sem);
 	}
 
 	/*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 221953b..06e8dfd 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -72,6 +72,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_writer_refcount, -1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
+	init_rwsem(&nilfs->ns_super_sem);
 	mutex_init(&nilfs->ns_writer_mutex);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
@@ -681,10 +682,10 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
 {
 	struct nilfs_sb_info *sbi;
 
-	down_read(&nilfs->ns_sem);
+	down_read(&nilfs->ns_super_sem);
 	/*
 	 * The SNAPSHOT flag and sb->s_flags are supposed to be
-	 * protected with nilfs->ns_sem.
+	 * protected with nilfs->ns_super_sem.
 	 */
 	sbi = nilfs->ns_current;
 	if (rw_mount) {
@@ -705,12 +706,12 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
 			goto found; /* snapshot mount */
 	}
  out:
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return NULL;
 
  found:
 	atomic_inc(&sbi->s_count);
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return sbi;
 }
 
@@ -720,7 +721,7 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 	struct nilfs_sb_info *sbi;
 	int ret = 0;
 
-	down_read(&nilfs->ns_sem);
+	down_read(&nilfs->ns_super_sem);
 	if (cno == 0 || cno > nilfs->ns_cno)
 		goto out_unlock;
 
@@ -737,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 		ret++;
 
  out_unlock:
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return ret;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be4c040..d0cf4fb 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -48,6 +48,7 @@ enum {
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
+ * @ns_super_sem: semaphore for global operations across super block instances
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
  * @ns_current: back pointer to current mount
@@ -96,10 +97,15 @@ struct the_nilfs {
 	struct backing_dev_info *ns_bdi;
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
+	struct rw_semaphore	ns_super_sem;
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
+	/*
+	 * components protected by ns_super_sem
+	 */
 	struct nilfs_sb_info   *ns_current;
+	struct list_head	ns_supers;
 
 	/*
 	 * used for
@@ -113,7 +119,6 @@ struct the_nilfs {
 	time_t			ns_sbwtime[2];
 	unsigned		ns_sbsize;
 	unsigned		ns_mount_state;
-	struct list_head	ns_supers;
 
 	/*
 	 * Following fields are dedicated to a writable FS-instance.
-- 
cgit v0.10.2


From aa7dfb8954ccf49e026ba13d12991a4eb7defb96 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:33 +0900
Subject: nilfs2: get rid of bd_mount_sem use from nilfs

This will remove every bd_mount_sem use in nilfs.

The intended exclusion control was replaced by the previous patch
("nilfs2: correct exclusion control in nilfs_remount function") for
nilfs_remount(), and this patch will replace remains with a new mutex
that this inserts in nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cd..cadd36b 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 	case NILFS_CHECKPOINT:
 		/*
 		 * Check for protecting existing snapshot mounts:
-		 * bd_mount_sem is used to make this operation atomic and
+		 * ns_mount_mutex is used to make this operation atomic and
 		 * exclusive with a new mount job.  Though it doesn't cover
 		 * umount, it's enough for the purpose.
 		 */
-		down(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_lock(&nilfs->ns_mount_mutex);
 		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
 			/* Current implementation does not have to protect
 			   plain read-only mounts since they are exclusive
@@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 			ret = -EBUSY;
 		} else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
-		up(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_unlock(&nilfs->ns_mount_mutex);
 		return ret;
 	case NILFS_SNAPSHOT:
 		return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f02762f..1777a34 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -764,7 +764,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
  * @silent: silent mode flag
  * @nilfs: the_nilfs struct
  *
- * This function is called exclusively by bd_mount_mutex.
+ * This function is called exclusively by nilfs->ns_mount_mutex.
  * So, the recovery process is protected from other simultaneous mounts.
  */
 static int
@@ -1105,7 +1105,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	down(&sd.bdev->bd_mount_sem);
+	mutex_lock(&nilfs->ns_mount_mutex);
 
 	if (!sd.cno) {
 		/*
@@ -1164,7 +1164,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		need_to_close = 0;
 	}
 
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	if (need_to_close)
 		close_bdev_exclusive(sd.bdev, flags);
@@ -1172,7 +1172,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	return 0;
 
  failed_unlock:
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
  failed:
 	close_bdev_exclusive(sd.bdev, flags);
@@ -1181,14 +1181,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
  cancel_new:
 	/* Abandoning the newly allocated superblock */
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	up_write(&s->s_umount);
 	deactivate_super(s);
 	/*
 	 * deactivate_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
-	 * put_nilfs() and unlocking bd_mount_sem need the block device.
+	 * put_nilfs() needs the block device.
 	 */
 	return err;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 06e8dfd..e4e5c78 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -73,6 +73,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	init_rwsem(&nilfs->ns_super_sem);
+	mutex_init(&nilfs->ns_mount_mutex);
 	mutex_init(&nilfs->ns_writer_mutex);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index d0cf4fb..e8adbff 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -49,6 +49,7 @@ enum {
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
  * @ns_super_sem: semaphore for global operations across super block instances
+ * @ns_mount_mutex: mutex protecting mount process of nilfs
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
  * @ns_current: back pointer to current mount
@@ -98,6 +99,7 @@ struct the_nilfs {
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
 	struct rw_semaphore	ns_super_sem;
+	struct mutex		ns_mount_mutex;
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
-- 
cgit v0.10.2


From e14112d1bd5e193166b54be19119cf6440470560 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 12 Jun 2009 10:14:22 +1000
Subject: perfcounters: remove powerpc definitions of perf_counter_do_pending

Commit 925d519ab82b6dd7aca9420d809ee83819c08db2 ("perf_counter:
unify and fix delayed counter wakeup") added global definitions.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 20a44d0..5351237 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -156,8 +156,6 @@ static inline void clear_perf_counter_pending(void)
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
 
-extern void perf_counter_do_pending(void);
-
 #else
 
 static inline unsigned long test_perf_counter_pending(void)
@@ -167,7 +165,6 @@ static inline unsigned long test_perf_counter_pending(void)
 
 static inline void set_perf_counter_pending(void) {}
 static inline void clear_perf_counter_pending(void) {}
-static inline void perf_counter_do_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
 #endif	/* __KERNEL__ */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index feff792..844d3f8 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,6 +53,7 @@
 #include <linux/bootmem.h>
 #include <linux/pci.h>
 #include <linux/debugfs.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-- 
cgit v0.10.2


From 12274e96b42884f043dfaa87eb1e5e10281bfac3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 11 Jun 2009 15:07:48 -0700
Subject: x86: use zalloc_cpumask_var in arch_early_irq_init

So we make sure MAXSMP gets a cleared cpumask

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 94605e7..ef8d929 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -187,8 +187,8 @@ int __init arch_early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
-		alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
-- 
cgit v0.10.2


From aee74f3bb3f4fb5dbeae8c1947c6d8ebdc19ee01 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 11 Jun 2009 15:09:00 -0700
Subject: kvm: remove the duplicated cpumask_clear

zalloc_cpumask_var already cleared it.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e211945..7645543 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2604,7 +2604,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 		r = -ENOMEM;
 		goto out_free_0;
 	}
-	cpumask_clear(cpus_hardware_enabled);
 
 	r = kvm_arch_hardware_setup();
 	if (r < 0)
-- 
cgit v0.10.2


From 8ebf975608aaebd7feb33d77f07ba21a6380e086 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 11 Jun 2009 20:00:41 -0700
Subject: block: fix kernel-doc in recent block/ changes

Fix kernel-doc warnings in recently changed block/ source code.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/block/blk-core.c b/block/blk-core.c
index d17d71c..f6452f6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -884,9 +884,10 @@ EXPORT_SYMBOL(blk_get_request);
 
 /**
  * blk_make_request - given a bio, allocate a corresponding struct request.
- *
+ * @q: target request queue
  * @bio:  The bio describing the memory mappings that will be submitted for IO.
  *        It may be a chained-bio properly constructed by block/bio layer.
+ * @gfp_mask: gfp flags to be used for memory allocation
  *
  * blk_make_request is the parallel of generic_make_request for BLOCK_PC
  * type commands. Where the struct request needs to be farther initialized by
@@ -1872,14 +1873,14 @@ EXPORT_SYMBOL(blk_fetch_request);
 
 /**
  * blk_update_request - Special helper function for request stacking drivers
- * @rq:	      the request being processed
+ * @req:      the request being processed
  * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete @rq
+ * @nr_bytes: number of bytes to complete @req
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
- *     the request structure even if @rq doesn't have leftover.
- *     If @rq has leftover, sets it up for the next range of segments.
+ *     Ends I/O on a number of bytes attached to @req, but doesn't complete
+ *     the request structure even if @req doesn't have leftover.
+ *     If @req has leftover, sets it up for the next range of segments.
  *
  *     This special helper function is only for request stacking drivers
  *     (e.g. request-based dm) so that they can handle partial completion.
@@ -2145,7 +2146,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
 /**
  * blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Completely finish @rq.
@@ -2166,7 +2167,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_all);
 /**
  * blk_end_request_cur - Helper function to finish the current request chunk.
  * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.
@@ -2203,7 +2204,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 /**
  * __blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Completely finish @rq.  Must be called with queue lock held.
@@ -2224,7 +2225,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request_all);
 /**
  * __blk_end_request_cur - Helper function to finish the current request chunk.
  * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.  Must
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1c4df9b..d71cedc 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -343,7 +343,7 @@ EXPORT_SYMBOL(blk_queue_physical_block_size);
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q:	the request queue for the device
- * @alignment:	alignment offset in bytes
+ * @offset: alignment offset in bytes
  *
  * Description:
  *   Some devices are naturally misaligned to compensate for things like
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
 /**
  * blk_queue_io_min - set minimum request size for the queue
  * @q:	the request queue for the device
- * @io_min:  smallest I/O size in bytes
+ * @min:  smallest I/O size in bytes
  *
  * Description:
  *   Some devices have an internal block size bigger than the reported
@@ -385,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_io_min);
 /**
  * blk_queue_io_opt - set optimal request size for the queue
  * @q:	the request queue for the device
- * @io_opt:  optimal request size in bytes
+ * @opt:  optimal request size in bytes
  *
  * Description:
  *   Drivers can call this function to set the preferred I/O request
-- 
cgit v0.10.2